Closes #1465: Integrate the pre-registration text mining for the UKRN…

… Pilot 6. Introduces the pilot6.sql script wrapped with a workflow.xml file. The workflow expects DocumentText Avro records as input and outputs the raw text produced by the madis script.
openaire · Sep 4, 2024 · bca5e4c · bca5e4c
1 parent c04b086
commit bca5e4c
Show file tree

Hide file tree

Showing 3 changed files with 139 additions and 0 deletions.
diff --git a/...s/eu/dnetlib/iis/wf/referenceextraction/ukrn/pilot6/main/oozie_app/lib/scripts/import.txt b/...s/eu/dnetlib/iis/wf/referenceextraction/ukrn/pilot6/main/oozie_app/lib/scripts/import.txt
@@ -0,0 +1,2 @@
+## This is a classpath-based import file (this header is required)
+madis classpath eu/dnetlib/iis/3rdparty/scripts/madis
diff --git a/...s/eu/dnetlib/iis/wf/referenceextraction/ukrn/pilot6/main/oozie_app/lib/scripts/pilot6.sql b/...s/eu/dnetlib/iis/wf/referenceextraction/ukrn/pilot6/main/oozie_app/lib/scripts/pilot6.sql
@@ -0,0 +1,31 @@
+--For testing...
+-- attach database "../06.Biomedical06/mydata.db" as d1;
+-- create table mydata as select * from (setschema 'docid,text' select * from mydata where
+--              docid ='PMC2931525' or  docid ='PMC2933899' or
+--              docid ='PMC3737084' or  docid ='PMC3737070' );
+-- output 'pubs.txt' select jdict('id', docid, 'text', text) from mydata;
+--
+--cp pubs.txt pubs.json
+--cat pubs.json | python ~/Desktop/openAIRE/madis2/src/mexec.py -f pilot6.sql -d test01.db > results_v1.json
+
+--cat pubs_empty.json | python ~/Desktop/openAIRE/madis2/src/mexec.py -f pilot6.sql -d test01.db > results_v2.json
+
+
+-- Load the streamed input into mydata: column c1 of every stdinput() row is read
+-- with jsonpath, and its '$.id' and '$.text' values become the docid and text columns.
+create temp table mydata as
+select docid, text
+from (setschema 'docid,text' select jsonpath(c1, '$.id', '$.text') from stdinput());
+
+hidden var 'urls' from "www\.animalstudyregistry\.org|aspredicted\.org|www\.anzctr\.org\.au|ensaiosclinicos\.gov\.br|www\.chictr\.org\.cn|cris\.nih\.go\.kr\/cris|euclinicaltrials\.eu|ctri\.nic\.in|clinicaltrials\.gov|rpcec\.sld\.cu|www\.onderzoekmetmensen\.nl|www\.clinicaltrialsregister\.eu|drks\.de|trialsearch\.who\.int|inplasy\.com|www\.crd\.york\.ac\.uk\/prospero|www\.isrctn\.com|itmctr\.ccebtcm\.org\.cn|www\.irct\.ir|rctportal\.niph\.go\.jp|www\.clinicaltrials\.jp|lbctr\.moph\.gov\.lb|osf\.io/search\?resourceType=Registration|pactr\.samrc\.ac\.za|ensayosclinicos-repec\.ins\.gob\.pe|preclinicaltrials\.eu|www\.researchregistry\.com|www\.slctr\.lk|thaiclinicaltrials\.org|www\.umin\.ac\.jp";
+
+hidden var 'regexstatements' from
+"10\.17590\/asr\.\d+|AsPredicted\s{0,1}#\d+|ACTRN:{0,1}\s{0,1}\d+p{0,1}|RBR-\d+[a-z0-9]+|ChiCTR\s{0,1}-{0,1}(?:TRC){0,1}-{0,1}\d+|KCT\s{0,1}\d{7}|EUCT\s{0,1}\d{4}-\d+-\d{2}-\d{2}|CTRI/\d{4}/\d{2}/\d+|NCT\s{0,1}\d+|RPCEC\s{0,1}\d+|NL\s{0,1}\d+|NTR\s{0,1}\d+|\d{4}[-–]\d+-\d+|DRKS\s{0,1}\d+|U\d{4}-\d{4}-\d{4}|INPLASY\s{0,1}\d+|CRD\s{0,1}\d+|ISRCTN\s{0,1}\d+|ITMCTR\s{0,1}\d+|IRCT\s{0,1}\d+|JMA-IIA\s{0,1}\d+|JapicCTI\s{0,1}-{0,1}\d+|jRCTs{0,1}\s{0,1}\d+|LBCTR\s{0,1}\d+|PACTR\s{0,1}\d+|PER-\d+-\d+|PCTE\s{0,1}\d+|researchregistry\s{0,1}\d+|SLCTR/\d{4}/\d+|TCTR\s{0,1}\d+|UMIN\s{0,1}\d+";
+-- Pattern variables defined above; both are read through var() by the final query.
+--   'urls'            - one regex alternating registry host/URL patterns (query 'a').
+--                       The CRIS entry must use an escaped slash ("\/cris"): written
+--                       as "\cris" it contains no slash and cannot match
+--                       "cris.nih.go.kr/cris".
+--   'regexstatements' - one regex alternating registration-ID patterns (query 'b').
+--                       The Dutch NL... and NTR... identifiers are separate
+--                       alternatives; joining them with a literal " ; " would
+--                       require both identifiers to appear back to back.
+
+-- Emit one JSON dict per textwindow2s result row. Rows tagged query 'a' come from
+-- the var('urls') pattern, rows tagged query 'b' from var('regexstatements').
+-- Both branches call regexpr("\n", text, " ") on the document text and pass
+-- 10, 1, 10 as the window arguments; the result columns are named
+-- docid, prev, middle, next via setschema.
+select jdict('query', 'a', 'documentId', docid, 'prev', prev, 'middle', middle, 'next', next)
+from (
+    setschema 'docid,prev,middle,next' select docid, textwindow2s(regexpr("\n", text, " "), 10, 1, 10, var('urls'))
+    from mydata
+)
+union all
+select jdict('query', 'b', 'documentId', docid, 'prev', prev, 'middle', middle, 'next', next)
+from (
+    setschema 'docid,prev,middle,next' select docid, textwindow2s(regexpr("\n", text, " "), 10, 1, 10, var('regexstatements'))
+    from mydata
+);
diff --git a/...n/resources/eu/dnetlib/iis/wf/referenceextraction/ukrn/pilot6/main/oozie_app/workflow.xml b/...n/resources/eu/dnetlib/iis/wf/referenceextraction/ukrn/pilot6/main/oozie_app/workflow.xml
@@ -0,0 +1,106 @@
+<?xml version="1.0"?>
+<!-- Note that documentation placed in comments in this file uses the
+"markdown" syntax (along with its way of dividing text into sections). -->
+<workflow-app xmlns="uri:oozie:workflow:0.4" name="referenceextraction_ukrn_pilot6_main">
+
+    <!-- Workflow parameters. Both are declared without a default value:
+         input_document_text is used as the input directory of the "main" action
+         and output as its output directory. -->
+    <parameters>
+        <property>
+            <name>input_document_text</name>
+            <description>input document text</description>
+        </property>
+        <property>
+            <name>output</name>
+            <description>pre-registration mining output</description>
+        </property>
+    </parameters>
+
+    <!-- Global settings: cluster endpoints (jobTracker, nameNode) plus the queue
+         names used for the MapReduce job (queueName) and for the Oozie launcher
+         (oozieLauncherQueueName). These variables are not declared in this file. -->
+    <global>
+        <job-tracker>${jobTracker}</job-tracker>
+        <name-node>${nameNode}</name-node>
+        <configuration>
+            <property>
+                <name>mapreduce.job.queuename</name>
+                <value>${queueName}</value>
+            </property>
+            <property>
+                <name>oozie.launcher.mapred.job.queue.name</name>
+                <value>${oozieLauncherQueueName}</value>
+            </property>
+        </configuration>
+    </global>
+
+
+    <start to="generate-schema" />
+
+    <!-- Runs AvroSchemaGenerator for the DocumentText class and captures its
+         output, which the "main" action reads back through
+         wf:actionData('generate-schema') as its input schema. -->
+    <action name="generate-schema">
+        <java>
+            <main-class>eu.dnetlib.iis.common.javamapreduce.hack.AvroSchemaGenerator</main-class>
+            <arg>eu.dnetlib.iis.metadataextraction.schemas.DocumentText</arg>
+            <capture-output />
+        </java>
+        <ok to="main" />
+        <error to="fail" />
+    </action>
+
+    <!-- Map-only streaming job: the mapper runs madis (mexec.py) with pilot6.sql
+         over the records found under ${input_document_text} and writes text
+         output to ${output}. The output path is deleted first, so any previous
+         content there is removed before the job starts. -->
+    <action name="main">
+        <map-reduce>
+            <prepare>
+                <delete path="${nameNode}${output}"/>
+            </prepare>
+            <streaming>
+                <mapper>scripts/madis/mexec.py -f scripts/pilot6.sql</mapper>
+            </streaming>
+            <configuration>
+                <!-- # Standard settings for our framework -->
+                <property>
+                    <name>mapred.output.format.class</name>
+                    <value>org.apache.hadoop.mapred.TextOutputFormat</value>
+                </property>
+                <property>
+                    <name>mapred.input.format.class</name>
+                    <value>com.cloudera.science.avro.streaming.AvroAsJSONInputFormat</value>
+                </property>
+                <!-- # Custom settings for this workflow node -->
+                <!-- We do not use any reducers, so we set their number to 0 -->
+                <property>
+                    <name>mapreduce.job.reduces</name>
+                    <value>0</value>
+                </property>
+
+                <!-- INPUT -->
+                <property>
+                    <name>mapreduce.input.fileinputformat.inputdir</name>
+                    <value>${input_document_text}</value>
+                </property>
+
+                <!-- Input schema: the DocumentText schema captured by the
+                     "generate-schema" action -->
+                <property>
+                    <name>input.schema.literal</name>
+                    <value>${wf:actionData('generate-schema')['eu.dnetlib.iis.metadataextraction.schemas.DocumentText']}</value>
+                </property>
+
+                <!-- OUTPUT -->
+                <property>
+                    <name>mapreduce.output.fileoutputformat.outputdir</name>
+                    <value>${output}</value>
+                </property>          
+
+                <!-- this one is required due to the large amount of time taken
+                    by the process storing plaintexts into the database
+                    (mapreduce.task.timeout is given in milliseconds: 7200000 ms = 2 hours) -->
+                <property>
+                    <name>mapreduce.task.timeout</name>
+                    <value>7200000</value>
+                </property>
+            </configuration>
+        </map-reduce>
+        <ok to="end"/>
+        <error to="fail"/>
+    </action>
+
+    <!-- Failure target of both actions: kills the workflow and reports the
+         error message of the last node that failed. -->
+    <kill name="fail">
+        <message>Unfortunately, the process failed -- error message:
+                    [${wf:errorMessage(wf:lastErrorNode())}]
+                </message>
+    </kill>
+
+    <end name="end"/>
+</workflow-app>
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		## This is a classpath-based import file (this header is required)
		madis classpath eu/dnetlib/iis/3rdparty/scripts/madis