From f88411f4ab728088e43aca5d1b578f40fdfb0cae Mon Sep 17 00:00:00 2001 From: Marek Horst Date: Fri, 12 Jan 2024 18:43:45 +0100 Subject: [PATCH] Closes #1444: Integrate the Irish Research Council (IRC) projects mining Integrating unidentified IRC project mining. Supplementing integration tests suite with the IRC case. --- .../main_sqlite/oozie_app/lib/scripts/projects.sql | 8 +++++++- .../project/data/document_text.json | 2 ++ .../project/data/document_to_project.json | 10 ++++++++-- .../wf/referenceextraction/project/data/project.json | 3 ++- .../project/data/report_funder.json | 7 ++++++- 5 files changed, 25 insertions(+), 5 deletions(-) diff --git a/iis-wf/iis-wf-referenceextraction/src/main/resources/eu/dnetlib/iis/wf/referenceextraction/project/main_sqlite/oozie_app/lib/scripts/projects.sql b/iis-wf/iis-wf-referenceextraction/src/main/resources/eu/dnetlib/iis/wf/referenceextraction/project/main_sqlite/oozie_app/lib/scripts/projects.sql index ab709e71e..7a5085eb7 100644 --- a/iis-wf/iis-wf-referenceextraction/src/main/resources/eu/dnetlib/iis/wf/referenceextraction/project/main_sqlite/oozie_app/lib/scripts/projects.sql +++ b/iis-wf/iis-wf-referenceextraction/src/main/resources/eu/dnetlib/iis/wf/referenceextraction/project/main_sqlite/oozie_app/lib/scripts/projects.sql @@ -23,6 +23,7 @@ hidden var 'sshrc_unidentified' from (select id from grants where fundingclass1= hidden var 'nrc_unidentified' from (select id from grants where fundingclass1="NRC" and grantid="unidentified" limit 1); hidden var 'inca_unidentified' from (select id from grants where fundingclass1="INCa" and grantid="unidentified" limit 1); hidden var 'hfri_unidentified' from (select id from grants where fundingclass1="HFRI" and grantid="unidentified" limit 1); +hidden var 'irc_unidentified' from (select id from grants where fundingclass1="IRC" and grantid="unidentified" limit 1); create temp table pubs as setschema 'c1,c2' select jsonpath(c1, '$.id', '$.text') from stdinput(); @@ -55,6 +56,9 @@ create temp table matched_undefined_miur_only as select distinct docid, var('miu select c1 as docid, textwindow2s(c2,10,1,10, '\b(?:RBSI\d{2}\w{4})\b') from (setschema 'c1,c2' select * from pubs where c2 is not null)) where var('miur_unidentified') and (regexprmatches('\b(?:RBSI\d{2}\w{4})\b', middle)); +create temp table matched_undefined_irc_only as select distinct docid, var('irc_unidentified') as id, prev,middle,next from (setschema 'docid,prev,middle,next' +select c1 as docid, textwindow2s(keywords(comprspaces(lower(regexpr("\n",c2," ")))),10,3,10, 'irish research council') from (setschema 'c1,c2' select * from pubs where c2 is not null)) +where var('irc_unidentified'); @@ -372,4 +376,6 @@ select jdict('documentId', docid, 'projectId', id, 'confidenceLevel', 0.8, 'text union all select jdict('documentId', docid, 'projectId', id, 'confidenceLevel', 0.8, 'textsnippet', prev||" "||middle||" "||next) from matched_undefined_gsri union all -select jdict('documentId', docid, 'projectId', id, 'confidenceLevel', 0.8, 'textsnippet', prev||" << "||middle||" >> "||next) from (select * from hfri_unidentified_only group by docid); \ No newline at end of file +select jdict('documentId', docid, 'projectId', id, 'confidenceLevel', 0.8, 'textsnippet', prev||" "||middle||" "||next) from matched_undefined_irc_only +union all +select jdict('documentId', docid, 'projectId', id, 'confidenceLevel', 0.8, 'textsnippet', prev||" << "||middle||" >> "||next) from (select * from hfri_unidentified_only group by docid); diff --git a/iis-wf/iis-wf-referenceextraction/src/test/resources/eu/dnetlib/iis/wf/referenceextraction/project/data/document_text.json b/iis-wf/iis-wf-referenceextraction/src/test/resources/eu/dnetlib/iis/wf/referenceextraction/project/data/document_text.json index eb77c950c..e8f6c5650 100644 --- a/iis-wf/iis-wf-referenceextraction/src/test/resources/eu/dnetlib/iis/wf/referenceextraction/project/data/document_text.json +++ b/iis-wf/iis-wf-referenceextraction/src/test/resources/eu/dnetlib/iis/wf/referenceextraction/project/data/document_text.json @@ -57,3 +57,5 @@ {"text":"This material is based upon work supported by the National Science Foundation under Grant No. ATM-0513463.", "id":"50|od_______212::0f31511cdbd148bf5446b52f49ba8544"} {"text":"Acknowledgements This work was partially supported by Science Foundation Ireland Grant 04/IN1/I478 and Science Foundation Ireland Grant 03/RPT1/I382.", "id":"50|doi_________::9c4ddd5d830294ab76d7e7919a379f3b"} {"text":"Acknowledgments: This work was supported by the Hellenic Foundation for Research and Innovation (HFRI - Project No: 789)", "id":"50|arXiv_______::54002047659adf031293eabfbfe9938b"} +{"text":"This work was initially funded by a Government of Ireland Postgraduate Grant from the Irish Research Council, and subsequently supported by EPSRC Hubs for Robotics and AI in Hazardous Environments", "id":"50|06cdd3ff4700::d5035b3bb468e4ea7b4d82073634d138"} + diff --git a/iis-wf/iis-wf-referenceextraction/src/test/resources/eu/dnetlib/iis/wf/referenceextraction/project/data/document_to_project.json b/iis-wf/iis-wf-referenceextraction/src/test/resources/eu/dnetlib/iis/wf/referenceextraction/project/data/document_to_project.json index 206bfaf25..4cf8da12e 100644 --- a/iis-wf/iis-wf-referenceextraction/src/test/resources/eu/dnetlib/iis/wf/referenceextraction/project/data/document_to_project.json +++ b/iis-wf/iis-wf-referenceextraction/src/test/resources/eu/dnetlib/iis/wf/referenceextraction/project/data/document_to_project.json @@ -386,7 +386,7 @@ "documentId": "50|core_ac_uk__::2fea53c390909c2640b2d1a94d53c0a7", "projectId": "40|inca________::1e5e62235d094afd01cd56e65112fc63", "confidenceLevel": 0.8, - "textsnippet": "This work was supported by INCA (plbio 2010-216 and INCa-DGOS-Inserm 6046) " + "textsnippet": "This work was supported by INCA (plbio 2010-216 and INCa-DGOS-Inserm 6046)" } { "documentId": "50|arXiv_______::a343cdcd534d696dd93c7ee9d78b9be7", @@ -454,4 +454,10 @@ "projectId": "40|hfri________::644d89adeca811786cf72d7967ec9813", "confidenceLevel": 0.8, "textsnippet": "acknowledgments work supported hellenic foundation research innovation hfri project 789" -} \ No newline at end of file +} +{ + "documentId": "50|06cdd3ff4700::d5035b3bb468e4ea7b4d82073634d138", + "projectId": "40|501100002081::1e5e62235d094afd01cd56e65112fc63", + "confidenceLevel": 0.8, + "textsnippet": "funded by a government of ireland postgraduate grant from the irish research council and subsequently supported by epsrc hubs for robotics and ai" +} diff --git a/iis-wf/iis-wf-referenceextraction/src/test/resources/eu/dnetlib/iis/wf/referenceextraction/project/data/project.json b/iis-wf/iis-wf-referenceextraction/src/test/resources/eu/dnetlib/iis/wf/referenceextraction/project/data/project.json index 17f8a0905..bdcca50ec 100644 --- a/iis-wf/iis-wf-referenceextraction/src/test/resources/eu/dnetlib/iis/wf/referenceextraction/project/data/project.json +++ b/iis-wf/iis-wf-referenceextraction/src/test/resources/eu/dnetlib/iis/wf/referenceextraction/project/data/project.json @@ -73,4 +73,5 @@ {"id": "40|irb_hr______::37ca9ece55928656726557c7c0a36a1a", "projectGrantId": "IP-2013-11-1021", "projectAcronym": null, "fundingClass": "HRZZ::", "jsonextrainfo": "{}"} {"id": "40|nhmrc_______::019492919738381cbee98a17ae1dae25", "projectGrantId": "1056888", "projectAcronym": null, "fundingClass": "NHMRC::NHMRC Partnerships", "jsonextrainfo": "{}"} {"id": "40|hfri________::644d89adeca811786cf72d7967ec9813", "projectGrantId": "789", "projectAcronym": null, "fundingClass": "HFRI::", "jsonextrainfo": "{}"} -{"id": "40|hfri________::cb5d92ce46b051859d1d9655e0ae7b46", "projectGrantId": "unidentified", "projectAcronym": null, "fundingClass": "HFRI::", "jsonextrainfo": "{}"} \ No newline at end of file +{"id": "40|hfri________::cb5d92ce46b051859d1d9655e0ae7b46", "projectGrantId": "unidentified", "projectAcronym": null, "fundingClass": "HFRI::", "jsonextrainfo": "{}"} +{"id": "40|501100002081::1e5e62235d094afd01cd56e65112fc63", "projectGrantId": "unidentified", "projectAcronym": null, "fundingClass": "IRC::", "jsonextrainfo": "{}"} diff --git a/iis-wf/iis-wf-referenceextraction/src/test/resources/eu/dnetlib/iis/wf/referenceextraction/project/data/report_funder.json b/iis-wf/iis-wf-referenceextraction/src/test/resources/eu/dnetlib/iis/wf/referenceextraction/project/data/report_funder.json index c2fa14d6b..c0de82c5e 100644 --- a/iis-wf/iis-wf-referenceextraction/src/test/resources/eu/dnetlib/iis/wf/referenceextraction/project/data/report_funder.json +++ b/iis-wf/iis-wf-referenceextraction/src/test/resources/eu/dnetlib/iis/wf/referenceextraction/project/data/report_funder.json @@ -78,6 +78,11 @@ "type": "COUNTER", "value": "1" } +{ + "key": "processing.referenceExtraction.project.references.byfunder.irc", + "type": "COUNTER", + "value": "1" +} { "key": "processing.referenceExtraction.project.references.byfunder.mestd", "type": "COUNTER", @@ -176,5 +181,5 @@ { "key": "processing.referenceExtraction.project.references.total", "type": "COUNTER", - "value": "76" + "value": "77" } \ No newline at end of file