-
Notifications
You must be signed in to change notification settings - Fork 11
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Closes #1465: Integrate the pre-registration text mining for the UKRN…
… Pilot 6 Introducing the pilot6.sql script wrapped with a workflow.xml file. The wf expects DocumentText avro records at input and produces raw text coming from the madis script at output.
- Loading branch information
1 parent
c04b086
commit bca5e4c
Showing
3 changed files
with
139 additions
and
0 deletions.
There are no files selected for viewing
2 changes: 2 additions & 0 deletions
2
...s/eu/dnetlib/iis/wf/referenceextraction/ukrn/pilot6/main/oozie_app/lib/scripts/import.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
## This is a classpath-based import file (this header is required) | ||
madis classpath eu/dnetlib/iis/3rdparty/scripts/madis |
31 changes: 31 additions & 0 deletions
31
...s/eu/dnetlib/iis/wf/referenceextraction/ukrn/pilot6/main/oozie_app/lib/scripts/pilot6.sql
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
--For testing... | ||
-- attach database "../06.Biomedical06/mydata.db" as d1; | ||
-- create table mydata as select * from (setschema 'docid,text' select * from mydata where | ||
-- docid ='PMC2931525' or docid ='PMC2933899' or | ||
-- docid ='PMC3737084' or docid ='PMC3737070' ); | ||
-- output 'pubs.txt' select jdict('id', docid, 'text', text) from mydata; | ||
-- | ||
--cp pubs.txt pubs.json | ||
--cat pubs.json | python ~/Desktop/openAIRE/madis2/src/mexec.py -f pilot6.sql -d test01.db > results_v1.json | ||
|
||
--cat pubs_empty.json | python ~/Desktop/openAIRE/madis2/src/mexec.py -f pilot6.sql -d test01.db > results_v2.json | ||
|
||
|
||
-- Load the records arriving on standard input into the temp table mydata: | ||
-- the '$.id' and '$.text' JSON paths of each input row (column c1) become the columns docid and text. | ||
create temp table mydata as select * from (setschema 'docid,text' select jsonpath(c1,'$.id', '$.text') from stdinput()); | ||
|
||
-- Regular expression (one alternative per registry) of the pre-registration registry URL fragments | ||
-- that query 'a' looks for. Dots are escaped so they match literally, and the path separator of the | ||
-- Korean CRIS entry is written as an escaped slash, like the one in the PROSPERO entry. | ||
hidden var 'urls' from "www\.animalstudyregistry\.org|aspredicted\.org|www\.anzctr\.org\.au|ensaiosclinicos\.gov\.br|www\.chictr\.org\.cn|cris\.nih\.go\.kr\/cris|euclinicaltrials\.eu|ctri\.nic\.in|clinicaltrials\.gov|rpcec\.sld\.cu|www\.onderzoekmetmensen\.nl|www\.clinicaltrialsregister\.eu|drks\.de|trialsearch\.who\.int|inplasy\.com|www\.crd\.york\.ac\.uk\/prospero|www\.isrctn\.com|itmctr\.ccebtcm\.org\.cn|www\.irct\.ir|rctportal\.niph\.go\.jp|www\.clinicaltrials\.jp|lbctr\.moph\.gov\.lb|osf\.io/search\?resourceType=Registration|pactr\.samrc\.ac\.za|ensayosclinicos-repec\.ins\.gob\.pe|preclinicaltrials\.eu|www\.researchregistry\.com|www\.slctr\.lk|thaiclinicaltrials\.org|www\.umin\.ac\.jp"; | ||
|
||
-- Regular expression (one alternative per format) of the registration identifier formats that | ||
-- query 'b' looks for. Each format is its own alternative, including the separate NL and NTR forms. | ||
hidden var 'regexstatements' from | ||
"10\.17590\/asr\.\d+|AsPredicted\s{0,1}#\d+|ACTRN:{0,1}\s{0,1}\d+p{0,1}|RBR-\d+[a-z0-9]+|ChiCTR\s{0,1}-{0,1}(?:TRC){0,1}-{0,1}\d+|KCT\s{0,1}\d{7}|EUCT\s{0,1}\d{4}-\d+-\d{2}-\d{2}|CTRI/\d{4}/\d{2}/\d+|NCT\s{0,1}\d+|RPCEC\s{0,1}\d+|NL\s{0,1}\d+|NTR\s{0,1}\d+|\d{4}[-–]\d+-\d+|DRKS\s{0,1}\d+|U\d{4}-\d{4}-\d{4}|INPLASY\s{0,1}\d+|CRD\s{0,1}\d+|ISRCTN\s{0,1}\d+|ITMCTR\s{0,1}\d+|IRCT\s{0,1}\d+|JMA-IIA\s{0,1}\d+|JapicCTI\s{0,1}-{0,1}\d+|jRCTs{0,1}\s{0,1}\d+|LBCTR\s{0,1}\d+|PACTR\s{0,1}\d+|PER-\d+-\d+|PCTE\s{0,1}\d+|researchregistry\s{0,1}\d+|SLCTR/\d{4}/\d+|TCTR\s{0,1}\d+|UMIN\s{0,1}\d+"; | ||
|
||
-- Final output: one JSON dictionary (query, documentId, prev, middle, next) per row produced by | ||
-- textwindow2s over the text of every document in mydata, with newlines replaced by spaces first. | ||
-- Rows tagged query 'a' come from the 'urls' pattern and rows tagged query 'b' from the | ||
-- 'regexstatements' pattern. The two result sets are concatenated without de-duplication (union all). | ||
-- NOTE(review): the 10, 1, 10 arguments are presumably the prev / middle / next window sizes in words - | ||
-- confirm against the madis textwindow2s documentation. | ||
select jdict('query', 'a', 'documentId', docid, 'prev', prev, 'middle', middle, 'next', next) | ||
from ( select docid, prev, middle, next | ||
from (setschema 'docid,prev,middle,next' select docid, textwindow2s(regexpr("\n",text," "), 10, 1, 10, var('urls')) | ||
from (select docid, text from mydata)) | ||
) | ||
union all | ||
select jdict('query', 'b', 'documentId', docid, 'prev', prev, 'middle', middle, 'next', next) | ||
from ( select docid, prev, middle, next | ||
from (setschema 'docid,prev,middle,next' select docid, textwindow2s(regexpr("\n",text," "), 10, 1, 10, var('regexstatements')) | ||
from (select docid, text from mydata)) | ||
); |
106 changes: 106 additions & 0 deletions
106
...n/resources/eu/dnetlib/iis/wf/referenceextraction/ukrn/pilot6/main/oozie_app/workflow.xml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,106 @@ | ||
<?xml version="1.0"?> | ||
<!-- Note that documentation placed in comments in this file uses the | ||
"markdown" syntax (along with its way of dividing text into sections). --> | ||
<workflow-app xmlns="uri:oozie:workflow:0.4" name="referenceextraction_ukrn_pilot6_main"> | ||
|
||
<!-- Workflow parameters: the input location of the document text records and the output location. Neither declares a default value. -->
<parameters> | ||
<property> | ||
<name>input_document_text</name> | ||
<description>input document text</description> | ||
</property> | ||
<property> | ||
<name>output</name> | ||
<description>pre-registration mining output</description> | ||
</property> | ||
</parameters> | ||
|
||
<!-- Settings shared by the actions below: job tracker, name node, and the queue names used for the MapReduce job and for the Oozie launcher. -->
<global> | ||
<job-tracker>${jobTracker}</job-tracker> | ||
<name-node>${nameNode}</name-node> | ||
<configuration> | ||
<property> | ||
<name>mapreduce.job.queuename</name> | ||
<value>${queueName}</value> | ||
</property> | ||
<property> | ||
<name>oozie.launcher.mapred.job.queue.name</name> | ||
<value>${oozieLauncherQueueName}</value> | ||
</property> | ||
</configuration> | ||
</global> | ||
|
||
|
||
<!-- Entry point: the workflow begins with the schema generation step. -->
<start to="generate-schema" /> | ||
|
||
<!-- Runs AvroSchemaGenerator for the DocumentText class and captures its output, which the "main" action reads through wf:actionData('generate-schema'). -->
<action name="generate-schema"> | ||
<java> | ||
<main-class>eu.dnetlib.iis.common.javamapreduce.hack.AvroSchemaGenerator</main-class> | ||
<arg>eu.dnetlib.iis.metadataextraction.schemas.DocumentText</arg> | ||
<capture-output /> | ||
</java> | ||
<ok to="main" /> | ||
<error to="fail" /> | ||
</action> | ||
|
||
<!-- Map-only streaming job: feeds the input records (Avro read as JSON) to scripts/pilot6.sql, executed by the madis mexec.py script, and writes the result as text. The output directory is deleted beforehand. -->
<action name="main"> | ||
<map-reduce> | ||
<prepare> | ||
<delete path="${nameNode}${output}"/> | ||
</prepare> | ||
<streaming> | ||
<mapper>scripts/madis/mexec.py -f scripts/pilot6.sql</mapper> | ||
</streaming> | ||
<configuration> | ||
<!-- # Standard settings for our framework --> | ||
<property> | ||
<name>mapred.output.format.class</name> | ||
<value>org.apache.hadoop.mapred.TextOutputFormat</value> | ||
</property> | ||
<property> | ||
<name>mapred.input.format.class</name> | ||
<value>com.cloudera.science.avro.streaming.AvroAsJSONInputFormat</value> | ||
</property> | ||
<!-- # Custom settings for this workflow node --> | ||
<!-- We do not use any reducers, so we set their number to 0 --> | ||
<property> | ||
<name>mapreduce.job.reduces</name> | ||
<value>0</value> | ||
</property> | ||
|
||
<!-- INPUT --> | ||
<property> | ||
<name>mapreduce.input.fileinputformat.inputdir</name> | ||
<value>${input_document_text}</value> | ||
</property> | ||
|
||
<property> | ||
<name>input.schema.literal</name> | ||
<value>${wf:actionData('generate-schema')['eu.dnetlib.iis.metadataextraction.schemas.DocumentText']}</value> | ||
</property> | ||
|
||
<!-- OUTPUT --> | ||
<property> | ||
<name>mapreduce.output.fileoutputformat.outputdir</name> | ||
<value>${output}</value> | ||
</property> | ||
|
||
<!-- this one is required due to the large amount of time taken | ||
by the process storing plaintexts into the database (Hadoop reads this value as milliseconds, so 7200000 is 2 hours) --> | ||
<property> | ||
<name>mapreduce.task.timeout</name> | ||
<value>7200000</value> | ||
</property> | ||
</configuration> | ||
</map-reduce> | ||
<ok to="end"/> | ||
<error to="fail"/> | ||
</action> | ||
|
||
<!-- Failure node reached from the error transitions of both actions: kills the workflow and reports the error message of the last node that failed. -->
<kill name="fail"> | ||
<message>Unfortunately, the process failed -- error message: | ||
[${wf:errorMessage(wf:lastErrorNode())}] | ||
</message> | ||
</kill> | ||
|
||
<!-- Successful end of the workflow. -->
<end name="end"/> | ||
</workflow-app> |