Skip to content

Commit

Permalink
Closes #1465: Integrate the pre-registration text mining for the UKRN…
Browse files Browse the repository at this point in the history
… Pilot 6

Introducing the pilot6.sql script wrapped with a workflow.xml file.

The wf expects DocumentText avro records at input and produces raw text coming from the madis script at output.
  • Loading branch information
marekhorst committed Sep 4, 2024
1 parent c04b086 commit bca5e4c
Show file tree
Hide file tree
Showing 3 changed files with 139 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
## This is a classpath-based import file (this header is required)
madis classpath eu/dnetlib/iis/3rdparty/scripts/madis
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
--For testing...
-- attach database "../06.Biomedical06/mydata.db" as d1;
-- create table mydata as select * from (setschema 'docid,text' select * from mydata where
-- docid ='PMC2931525' or docid ='PMC2933899' or
-- docid ='PMC3737084' or docid ='PMC3737070' );
-- output 'pubs.txt' select jdict('id', docid, 'text', text) from mydata;
--
--cp pubs.txt pubs.json
--cat pubs.json | python ~/Desktop/openAIRE/madis2/src/mexec.py -f pilot6.sql -d test01.db > results_v1.json

--cat pubs_empty.json | python ~/Desktop/openAIRE/madis2/src/mexec.py -f pilot6.sql -d test01.db > results_v2.json


-- Pre-registration text mining for the UKRN Pilot 6 (madIS script).
--
-- Input  (stdin) : one JSON record per row, the '$.id' and '$.text' paths are read.
-- Output (stdout): one JSON dictionary per match with the keys
--                  query, documentId, prev, middle, next
--                  where query is 'a' for registry URL matches and 'b' for
--                  registration identifier matches.

-- Load the input records into a (docid, text) table.
create temp table mydata as select * from (setschema 'docid,text' select jsonpath(c1,'$.id', '$.text') from stdinput());

-- Registry host names and paths, alternated into a single regular expression.
-- Corrections applied to this list:
--   * cris.nih.go.kr/cris : the path separator was written as a bare backslash
--     (an escaped letter c), so the intended URL could never match
--   * www.umin.ac.jp      : the host was misspelled with two w characters
--   * rctportal.niph.go.jp was listed twice, the redundant alternative was dropped
hidden var 'urls' from "www\.animalstudyregistry\.org|aspredicted\.org|www\.anzctr\.org\.au|ensaiosclinicos\.gov\.br|www\.chictr\.org\.cn|cris\.nih\.go\.kr\/cris|euclinicaltrials\.eu|ctri\.nic\.in|clinicaltrials\.gov|rpcec\.sld\.cu|www\.onderzoekmetmensen\.nl|www\.clinicaltrialsregister\.eu|drks\.de|trialsearch\.who\.int|inplasy\.com|www\.crd\.york\.ac\.uk\/prospero|www\.isrctn\.com|itmctr\.ccebtcm\.org\.cn|www\.irct\.ir|rctportal\.niph\.go\.jp|www\.clinicaltrials\.jp|lbctr\.moph\.gov\.lb|osf\.io/search\?resourceType=Registration|pactr\.samrc\.ac\.za|ensayosclinicos-repec\.ins\.gob\.pe|preclinicaltrials\.eu|www\.researchregistry\.com|www\.slctr\.lk|thaiclinicaltrials\.org|www\.umin\.ac\.jp";

-- Registration identifier formats, alternated into a single regular expression.
-- Correction applied to this list: the NL and NTR identifiers used to be joined
-- by a literal space-semicolon-space sequence, which required both identifiers to
-- appear together in exactly that form. They are now two separate alternatives.
-- NOTE(review): the match window below is a single token wide, so the optional
-- whitespace inside these patterns presumably never matches - confirm against
-- the textwindow2s tokenisation rules.
hidden var 'regexstatements' from
"10\.17590\/asr\.\d+|AsPredicted\s{0,1}#\d+|ACTRN:{0,1}\s{0,1}\d+p{0,1}|RBR-\d+[a-z0-9]+|ChiCTR\s{0,1}-{0,1}(?:TRC){0,1}-{0,1}\d+|KCT\s{0,1}\d{7}|EUCT\s{0,1}\d{4}-\d+-\d{2}-\d{2}|CTRI/\d{4}/\d{2}/\d+|NCT\s{0,1}\d+|RPCEC\s{0,1}\d+|NL\s{0,1}\d+|NTR\s{0,1}\d+|\d{4}[-–]\d+-\d+|DRKS\s{0,1}\d+|U\d{4}-\d{4}-\d{4}|INPLASY\s{0,1}\d+|CRD\s{0,1}\d+|ISRCTN\s{0,1}\d+|ITMCTR\s{0,1}\d+|IRCT\s{0,1}\d+|JMA-IIA\s{0,1}\d+|JapicCTI\s{0,1}-{0,1}\d+|jRCTs{0,1}\s{0,1}\d+|LBCTR\s{0,1}\d+|PACTR\s{0,1}\d+|PER-\d+-\d+|PCTE\s{0,1}\d+|researchregistry\s{0,1}\d+|SLCTR/\d{4}/\d+|TCTR\s{0,1}\d+|UMIN\s{0,1}\d+";

-- Emit every match together with its context: newlines in the text are replaced
-- with spaces, then textwindow2s is called with 10, 1, 10 (previous, middle and
-- next window sizes) and the pattern. The first branch reports URL matches as
-- query 'a', the second one reports identifier matches as query 'b'.
select jdict('query', 'a', 'documentId', docid, 'prev', prev, 'middle', middle, 'next', next)
from ( select docid, prev, middle, next
    from (setschema 'docid,prev,middle,next' select docid, textwindow2s(regexpr("\n",text," "), 10, 1, 10, var('urls'))
        from (select docid, text from mydata))
)
union all
select jdict('query', 'b', 'documentId', docid, 'prev', prev, 'middle', middle, 'next', next)
from ( select docid, prev, middle, next
    from (setschema 'docid,prev,middle,next' select docid, textwindow2s(regexpr("\n",text," "), 10, 1, 10, var('regexstatements'))
        from (select docid, text from mydata))
);
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
<?xml version="1.0"?>
<!-- Note that documentation placed in comments in this file uses the
"markdown" syntax (along with its way of dividing text into sections). -->
<workflow-app xmlns="uri:oozie:workflow:0.4" name="referenceextraction_ukrn_pilot6_main">
<!-- # UKRN Pilot 6 pre-registration mining
Two-step workflow: `generate-schema` produces the DocumentText schema,
then `main` streams the input records through the madis script
`scripts/pilot6.sql` and writes its textual output. -->

<!-- # Workflow parameters: both are declared without a default value -->
<parameters>
<property>
<name>input_document_text</name>
<description>input document text</description>
</property>
<property>
<name>output</name>
<description>pre-registration mining output</description>
</property>
</parameters>

<!-- # Settings shared by all actions: cluster endpoints and job queues -->
<global>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>mapreduce.job.queuename</name>
<value>${queueName}</value>
</property>
<property>
<name>oozie.launcher.mapred.job.queue.name</name>
<value>${oozieLauncherQueueName}</value>
</property>
</configuration>
</global>


<start to="generate-schema" />

<!-- # Schema generation
Runs AvroSchemaGenerator for the DocumentText class. The output is captured
so that the `main` action can read it through `wf:actionData`. -->
<action name="generate-schema">
<java>
<main-class>eu.dnetlib.iis.common.javamapreduce.hack.AvroSchemaGenerator</main-class>
<arg>eu.dnetlib.iis.metadataextraction.schemas.DocumentText</arg>
<capture-output />
</java>
<ok to="main" />
<error to="fail" />
</action>

<!-- # Mining
Map-only streaming job: each mapper runs the madis executor with the
pilot6.sql script over the input records. -->
<action name="main">
<map-reduce>
<!-- Remove the output of any previous run before the job starts -->
<prepare>
<delete path="${nameNode}${output}"/>
</prepare>
<streaming>
<mapper>scripts/madis/mexec.py -f scripts/pilot6.sql</mapper>
</streaming>
<configuration>
<!-- # Standard settings for our framework -->
<property>
<name>mapred.output.format.class</name>
<value>org.apache.hadoop.mapred.TextOutputFormat</value>
</property>
<property>
<name>mapred.input.format.class</name>
<value>com.cloudera.science.avro.streaming.AvroAsJSONInputFormat</value>
</property>
<!-- # Custom settings for this workflow node -->
<!-- We do not use any reducers, so we set their number to 0 -->
<property>
<name>mapreduce.job.reduces</name>
<value>0</value>
</property>

<!-- INPUT -->
<property>
<name>mapreduce.input.fileinputformat.inputdir</name>
<value>${input_document_text}</value>
</property>

<!-- Schema of the input records, taken from the output captured
by the `generate-schema` action -->
<property>
<name>input.schema.literal</name>
<value>${wf:actionData('generate-schema')['eu.dnetlib.iis.metadataextraction.schemas.DocumentText']}</value>
</property>

<!-- OUTPUT -->
<property>
<name>mapreduce.output.fileoutputformat.outputdir</name>
<value>${output}</value>
</property>

<!-- this one is required due to the large amount of time taken
by process storing plaintexts into the database
(the value is presumably in milliseconds, i.e. 2 hours: confirm
against the Hadoop documentation of this property) -->
<property>
<name>mapreduce.task.timeout</name>
<value>7200000</value>
</property>
</configuration>
</map-reduce>
<ok to="end"/>
<error to="fail"/>
</action>

<!-- # Failure handling: both actions route their errors here -->
<kill name="fail">
<message>Unfortunately, the process failed -- error message:
[${wf:errorMessage(wf:lastErrorNode())}]
</message>
</kill>

<end name="end"/>
</workflow-app>

0 comments on commit bca5e4c

Please sign in to comment.