Skip to content

Commit

Permalink
WIP names: support mapping affiliation IDs
Browse files Browse the repository at this point in the history
  • Loading branch information
slint committed Nov 25, 2024
1 parent ca50491 commit e9c124b
Show file tree
Hide file tree
Showing 2 changed files with 59 additions and 22 deletions.
77 changes: 57 additions & 20 deletions invenio_vocabularies/contrib/names/datastreams.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import regex as re
from flask import current_app
from invenio_access.permissions import system_identity
from werkzeug.utils import cached_property

from invenio_vocabularies.contrib.names.s3client import S3OrcidClient

Expand Down Expand Up @@ -150,12 +151,14 @@ def __init__(
names_exclude_regex=DEFAULT_NAMES_EXCLUDE_REGEX,
affiliation_relation_schemes=None,
org_scheme_mappping=None,
org_ids_mapping=None,
**kwargs,
) -> None:
"""Constructor."""
self._names_exclude_regex = names_exclude_regex
self._affiliation_relation_schemes = affiliation_relation_schemes
self._org_scheme_mappping = org_scheme_mappping
self._org_ids_mapping = org_ids_mapping
super().__init__()

@property
Expand All @@ -173,6 +176,21 @@ def org_scheme_mappping(self):
"FUNDREF": "fundref",
}

@cached_property
def org_ids_mapping(self):
    """Mapping of ORCiD org IDs to affiliation IDs.

    When the ``VOCABULARIES_ORCID_ORG_IDS_MAPPING_PATH`` application config
    value is set, the mapping is read from that CSV file, where every row is
    ``org_scheme,org_id,affiliation_id``. Otherwise the mapping given to the
    constructor is used, falling back to an empty dict.

    :returns: dict keyed by ``(org_scheme, org_id)`` tuples.
    :raises ValueError: if a non-blank CSV row does not have exactly three
        columns.
    """
    org_ids_mapping_file = current_app.config.get(
        "VOCABULARIES_ORCID_ORG_IDS_MAPPING_PATH"
    )
    if not org_ids_mapping_file:
        return self._org_ids_mapping or {}

    mapping = {}
    # The csv module expects files opened with ``newline=""``; the explicit
    # encoding keeps parsing independent of the host locale.
    with open(org_ids_mapping_file, newline="", encoding="utf-8") as fin:
        reader = csv.reader(fin)
        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines (e.g. a trailing
                # newline); these carry no mapping and must not abort the load.
                continue
            if len(row) != 3:
                raise ValueError(
                    f"{org_ids_mapping_file}:{reader.line_num}: expected 3 "
                    f"columns (org_scheme, org_id, affiliation_id), "
                    f"got {len(row)}"
                )
            org_scheme, org_id, aff_id = row
            mapping[(org_scheme, org_id)] = aff_id
    return mapping

def apply(self, stream_entry, **kwargs):
"""Applies the transformation to the stream entry."""
record = stream_entry.entry
Expand Down Expand Up @@ -209,39 +227,58 @@ def _extract_affiliations(self, record):
"""Extract affiliations from the ORCiD record."""
result = []
try:
employments = (
record.get("activities-summary", {})
.get("employments", {})
.get("affiliation-group")
)
employments = record.get("activities-summary", {}).get("employments", [])
# Single values might be originally a dict
if isinstance(employments, dict):
employments = [employments]

# Remove the "affiliation-group/employment-summary" nesting
employments = [
employment.get("affiliation-group", {}).get("employment-summary", {})
for employment in employments
]

history = set()
for employment in employments:
terminated = employment["employment-summary"].get("end-date")
org = employment["employment-summary"]["organization"]
if org["name"] not in history and not terminated:
history.add(org["name"])
aff = {"name": org["name"]}
terminated = employment.get("end-date")
org = employment["organization"]

if org.get("disambiguated-organization"):
dis_org = org["disambiguated-organization"]
org_id = dis_org.get("disambiguated-organization-identifier")
org_scheme = dis_org.get("disambiguation-source")
aff_scheme = self.org_scheme_mappping.get(org_scheme)
if terminated or org["name"] in history:
continue

if org_id and aff_scheme in self.affiliation_relation_schemes:
if aff_scheme == "ror":
org_id = org_id.split("/")[-1]
history.add(org["name"])
aff = {"name": org["name"]}

aff["id"] = org_id
# Extract the org ID, to link to the affiliation vocabulary
aff_id = self._extract_affiliation_id(org)
if aff_id:
aff["id"] = aff_id

result.append(aff)
result.append(aff)
except Exception:
pass
return result

def _extract_affiliation_id(self, org):
    """Extract the affiliation ID from an ORCiD organization.

    The ID is taken directly from the disambiguated organization when its
    scheme (translated through ``org_scheme_mappping``) is one of
    ``affiliation_relation_schemes``; otherwise it is looked up in
    ``org_ids_mapping`` by ``(disambiguation-source, identifier)``.

    :param org: ORCiD ``organization`` dict; only its
        ``disambiguated-organization`` entry is read.
    :returns: the affiliation ID, or ``None`` when none can be determined.
    """
    dis_org = org.get("disambiguated-organization")
    if not dis_org:
        return None

    org_id = dis_org.get("disambiguated-organization-identifier")
    org_scheme = dis_org.get("disambiguation-source")
    aff_scheme = self.org_scheme_mappping.get(org_scheme)

    if org_id and aff_scheme in self.affiliation_relation_schemes:
        if aff_scheme == "ror":
            # Keep only the last path segment of the identifier. Strip any
            # trailing slash first, otherwise ".../<id>/" would yield "".
            org_id = org_id.rstrip("/").split("/")[-1]
        # Never hand back an empty string as an ID.
        return org_id or None

    # Scheme is not directly linkable: fall back to the explicit mapping.
    return self.org_ids_mapping.get((org_scheme, org_id))


class NamesServiceWriter(ServiceWriter):
"""Names service writer."""
Expand Down
4 changes: 2 additions & 2 deletions tests/contrib/names/test_names_datastreams.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,8 +100,8 @@ def expected_from_xml():
<common:organization>
<common:name>European Southern Observatory</common:name>
<common:disambiguated-organization>
<common:disambiguated-organization-identifier>54249</common:disambiguated-organization-identifier>
<common:disambiguation-source>RINGGOLD</common:disambiguation-source>
<common:disambiguated-organization-identifier>grid.424907.c</common:disambiguated-organization-identifier>
<common:disambiguation-source>GRID</common:disambiguation-source>
</common:disambiguated-organization>
</common:organization>
</employment:employment-summary>
Expand Down

0 comments on commit e9c124b

Please sign in to comment.