Big Bang change

kermitt2 · Oct 20, 2023 · e42a127 · e42a127
1 parent 35f7762
commit e42a127
Show file tree

Hide file tree

Showing 129 changed files with 6,134 additions and 2,920 deletions.
diff --git a/.dockerignore b/.dockerignore
@@ -1,2 +1,3 @@
-/.git
-/data
+.git
+data
+logs/*
diff --git a/.gitignore b/.gitignore
@@ -19,11 +19,11 @@ data/db
 scripts/dump.json
 scripts/dump.json.gz
 scripts/node_modules
-lookup/build
+build
 .gradle
 *.mdb
 **/node_modules/**
 **/out
 **/package-lock.json
-lookup/data/db/*
-pubmed-glutton/build/*
+data/db/*
+pubmed-glutton/*
diff --git a/Dockerfile b/Dockerfile
diff --git a/Readme.md b/Readme.md
diff --git a/lookup/build.gradle → build.gradle b/lookup/build.gradle → build.gradle
@@ -28,7 +28,7 @@ version '0.3-SNAPSHOT'
 sourceCompatibility = 1.8
 
 // The main class of the application
-mainClassName = 'com.scienceminer.lookup.web.LookupServiceApplication'
+mainClassName = 'com.scienceminer.glutton.web.LookupServiceApplication'
 tasks.run.workingDir = rootProject.rootDir
 
 repositories {
@@ -82,6 +82,7 @@ dependencies {
 
 //    compile group: 'org.apache.httpcomponents', name: 'httpmime', version: '4.5.3'
     implementation 'org.apache.commons:commons-collections4:4.1'
+    implementation 'commons-beanutils:commons-beanutils:1.9.4'
     implementation 'commons-io:commons-io:2.6'
     implementation group: 'org.apache.commons', name: 'commons-compress', version: '1.20'
 
@@ -90,6 +91,8 @@ dependencies {
     implementation "com.fasterxml.jackson.core:jackson-core:2.9.10"
     implementation "com.fasterxml.jackson.core:jackson-databind:2.9.10"
     implementation "com.fasterxml.jackson.module:jackson-module-afterburner:2.9.10"
+
+    implementation group: 'com.opencsv', name: 'opencsv', version: '5.0'
 }
 
 distributions {

diff --git a/config/glutton.yml b/config/glutton.yml
@@ -1,14 +1,9 @@
-version: 0.2
+version: 0.3
 
-# where the metadata are stored, it takes more than 200GB for all Crossref, Unpaywall, PubMed and ISTEX mappings 
+# where the compiled metadata are stored, it takes more than 200GB for all Crossref, Unpaywall, PubMed and ISTEX mappings, 
+# this is a single location for all compiled resources  
 #storage: data/db
-storage: /media/lopez/T5/data/db
-
-# Crossref fields to be ignored when storing metadata, reference field in particular take a lot of space
-ignoreCrossRefFields: 
-#  - reference
-#  - abstract
-  - indexed
+storage: /media/lopez/T51/data/db
 
 # batch size for preparing the data
 loadingBatchSize: 10000
@@ -37,6 +32,12 @@ proxy:
     port: 
 
 crossref:
+  # Crossref fields to be ignored when storing metadata; the reference field in particular takes a lot of space
+  ignoreCrossrefFields: 
+    - reference
+    - abstract
+    - indexed
+
   # a directory where the crossref incremental update files (gap or daily update) will be located
   # to be changed according to your storage
   dumpPath: /media/lopez/data2/crossref
@@ -54,6 +55,25 @@ crossref:
   #token: "yourmysteriouscrossrefmetadataplusauthorizationtokentobeputhere"
   token:
 
+# the following is used only for pubmed related enrichments and extractions
+pubmed:
+  # path to the medline downloaded resources, to be changed according to your storage
+  pubmedDirectory: /media/lopez/data2/medline_2023
+  # path to PMC mapping data
+  pmcDirectory: data/pmc
+  # elasticsearch index for pubmed, used to create extraction based on MeSH terms
+  #index: pubmed
+  # path to the working pubmed databases, to be changed according to your storage 
+  #dbDirectory: /media/lopez/T51/data2/db
+
+hal: 
+  # OAI PMH endpoint for harvesting HAL metadata
+  api: "http://api.archives-ouvertes.fr/oai/hal"
+
+dblp:
+  # URL of the DBLP metadata dump
+  dump: "https://dblp.uni-trier.de/xml/dblp.xml.gz"
+
 unpaywall:
   dumpPath: 
   # a directory where the unpaywall update data feed change files will be located
@@ -96,13 +116,3 @@ logging:
     archivedFileCount: 5
     timeZone: UTC
 
-# the following is used only for pubmed related enrichments and extractions
-pubmed:
-  # path to the medline resources, to be changed according to your storage
-  pubmedDirectory: /media/lopez/data/biblio/medline2021/
-  # path to PMC mapping data
-  pmcDirectory: data/pmc
-  # elasticsearch index for pubmed, used to create extraction based on MeSH terms
-  index: pubmed
-  # path to the working pubmed databases, to be changed according to your storage 
-  dbDirectory: /media/lopez/T5/data2/db
diff --git a/doc/API.md b/doc/API.md
@@ -0,0 +1,134 @@
+## REST API
+
+The service can be queried based on a strong identifier, like DOI, PMID, etc., as follows:
+
+- match record by DOI
+    - `GET host:port/service/lookup?doi=DOI`
+    - `GET host:port/service/lookup/doi/{DOI}`
+
+- match record by PMID
+    - `GET host:port/service/lookup?pmid=PMID`
+    - `GET host:port/service/lookup/pmid/{PMID}`
+
+- match record by PMC ID
+    - `GET host:port/service/lookup?pmc=PMC`
+    - `GET host:port/service/lookup/pmc/{PMC}`
+
+- match record by ISTEX ID
+    - `GET host:port/service/lookup?istexid=ISTEXID`
+    - `GET host:port/service/lookup/istexid/{ISTEXID}`
+
+- match record by PII ID
+    - `GET host:port/service/lookup?pii=PII`
+    - `GET host:port/service/lookup/pii/{PII}`   
+
+The service can be queried with various metadata like article title (`atitle`), first author last name (`firstAuthor`), journal title (`jtitle`), volume (`volume`), first page (`firstPage`) and publication year (`year`):
+
+- match record by article title and first author lastname
+    - `GET host:port/service/lookup?atitle=ARTICLE_TITLE&firstAuthor=FIRST_AUTHOR_SURNAME`
+
+- match record by journal title or abbreviated title, volume and first page
+    - `GET host:port/service/lookup?jtitle=JOURNAL_TITLE&volume=VOLUME&firstPage=FIRST_PAGE`
+
+- match record by journal title or abbreviated title, volume, first page, and first author lastname
+    - `GET host:port/service/lookup?jtitle=JOURNAL_TITLE&volume=VOLUME&firstPage=FIRST_PAGE&firstAuthor=FIRST_AUTHOR_SURNAME`
+
+It's possible to query the service based on a raw citation string (`biblio`):
+
+- match record by raw citation string 
+    - `GET host:port/service/lookup?biblio=BIBLIO_STRING`
+    - `POST host:port/service/lookup/biblio` with `ContentType=text/plain` 
+
+Any combination of these metadata and the full raw citation string is possible, for instance:
+
+    - `GET host:port/service/lookup?biblio=BIBLIO_STRING&atitle=ARTICLE_TITLE&firstAuthor=FIRST_AUTHOR_SURNAME`
+
+or:
+
+    - `GET host:port/service/lookup?jtitle=JOURNAL_TITLE&volume=VOLUME&firstPage=FIRST_PAGE&firstAuthor=FIRST_AUTHOR_SURNAME&atitle=ARTICLE_TITLE`
+
+or:
+
+    - `GET host:port/service/lookup?biblio=BIBLIO_STRING&atitle=ARTICLE_TITLE&firstAuthor=FIRST_AUTHOR_SURNAME&year=YYYY`
+
+It is also possible to combine a strong identifier with validation metadata. In this case, if the DOI appears to conflict with the provided metadata, no result will be returned, as a way to detect invalid DOIs with post-validation:
+
+    - `GET host:port/service/lookup?doi=DOI&atitle=ARTICLE_TITLE&firstAuthor=FIRST_AUTHOR_SURNAME`
+
+biblio-glutton will make the best use of all the parameters sent to retrieve a record in the fastest way, and will apply a matching threshold to avoid false positives. It is advised to send __as much metadata as possible__ to try to optimize the DOI matching in terms of speed and accuracy, and when possible a full raw bibliographical string.
+
+The more metadata are available in the query, the better. The original raw bibliographical string is also exploited, when available, to control the bibliographical record matching.
+
+For convenience, in case you are only interested in the Open Access URL for a bibliographical object, the Open Access resolver API returns the OA PDF link (URL) only via an identifier:
+
+- return the best Open Access URL if available
+    - `GET host:port/service/oa?doi=DOI` return the best Open Access PDF url for a given DOI
+    - `GET host:port/service/oa?pmid=PMID` return the best Open Access PDF url for a given PMID
+    - `GET host:port/service/oa?pmc=PMC` return the best Open Access PDF url for a given PMC ID
+    - `GET host:port/service/oa?pii=PII` return the best Open Access PDF url for a given PII ID
+
+- return the best Open Access URL and ISTEX PDF URL if available
+    - `GET host:port/service/oa_istex?doi=DOI` return the best Open Access PDF url and ISTEX PDF url for a given DOI
+    - `GET host:port/service/oa_istex?pmid=PMID` return the best Open Access PDF url and ISTEX PDF url for a given PMID
+    - `GET host:port/service/oa_istex?pmc=PMC` return the best Open Access PDF url and ISTEX PDF url for a given PMC ID
+    - `GET host:port/service/oa_istex?pii=PII` return the best Open Access PDF url and ISTEX PDF url for a given PII ID
+
+## cURL examples
+
+To illustrate the usage of the API, we provide some cURL example queries:
+
+Bibliographical metadata lookup by DOI:
+
+```sh
+curl http://localhost:8080/service/lookup?doi=10.1484/J.QUAESTIO.1.103624
+```
+
+Matching with title and first author lastname:
+
+```sh
+curl "http://localhost:8080/service/lookup?atitle=Naturalizing+Intentionality+between+Philosophy+and+Brain+Science.+A+Survey+of+Methodological+and+Metaphysical+Issues&firstAuthor=Pecere"
+
+curl "http://localhost:8080/service/lookup?atitle=Naturalizing+Intentionality+between+Philosophy+and+Brain+Science&firstAuthor=Pecere"
+```
+
+Matching with raw bibliographical reference string:
+
+```sh
+curl "http://localhost:8080/service/lookup?biblio=Baltz,+R.,+Domon,+C.,+Pillay,+D.T.N.+and+Steinmetz,+A.+(1992)+Characterization+of+a+pollen-specific+cDNA+from+sunflower+encoding+a+zinc+finger+protein.+Plant+J.+2:+713-721"
+```
+
+Bibliographical metadata lookup by PMID (note that only the number is expected):
+
+```sh
+curl http://localhost:8080/service/lookup?pmid=1605817
+```
+
+Bibliographical metadata lookup by PMC ID (note that the `PMC` prefix in the identifier is expected):
+
+```sh
+curl http://localhost:8080/service/lookup?pmc=PMC1017419
+```
+
+Bibliographical metadata lookup by PII ID:
+
+```sh
+curl http://localhost:8080/service/lookup?pii=
+```
+
+Bibliographical metadata lookup by ISTEX ID:
+
+```sh
+curl http://localhost:8080/service/lookup?istexid=E6CF7ECC9B002E3EA3EC590E7CC8DDBF38655723
+```
+
+Open Access resolver by DOI:
+
+```sh
+curl "http://localhost:8080/service/oa?doi=10.1038/nature12373"
+```
+
+Combination of Open Access resolver and ISTEX identifier by DOI:
+
+```sh
+curl "http://localhost:8080/service/oa_istex?doi=10.1038/nature12373"
+```
-Original file line number
+Diff line change
@@ -1,2 +1,3 @@
-    /.git
-    /data
+    .git
+    data
+    logs/*