Skip to content

Commit

Permalink
facets: integrate combined_subjects / fix nested subject faceting
Browse files Browse the repository at this point in the history
- closes #798
  • Loading branch information
fenekku committed Nov 29, 2023
1 parent c3da317 commit 83ccf45
Show file tree
Hide file tree
Showing 14 changed files with 6,616 additions and 10 deletions.
14 changes: 14 additions & 0 deletions invenio_rdm_records/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,11 @@ def always_valid(identifier):
"field": "subjects.subject",
},
},
# subject_nested is deprecated and should be removed.
# subject_combined requires a prior change to the indexed documents, so it is
# unclear whether a direct replacement is right. Keeping subject_nested around
# until v13 might be better. On the flip side, subject_nested is an incorrect
# facet...
"subject_nested": {
"facet": facets.subject_nested,
"ui": {
Expand All @@ -183,6 +188,15 @@ def always_valid(identifier):
},
},
},
"subject_combined": {
"facet": facets.subject_combined,
"ui": {
"field": "subjects.scheme",
"childAgg": {
"field": "subjects.subject",
},
},
},
}

RDM_SEARCH_SORT_BY_VERIFIED = False
Expand Down
6 changes: 4 additions & 2 deletions invenio_rdm_records/records/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@

from . import models
from .dumpers import (
CombinedSubjectsDumperExt,
EDTFDumperExt,
EDTFListDumperExt,
GrantTokensDumperExt,
Expand Down Expand Up @@ -118,6 +119,7 @@ class CommonFieldsMixin:
EDTFDumperExt("metadata.publication_date"),
EDTFListDumperExt("metadata.dates", "date"),
RelationDumperExt("relations"),
CombinedSubjectsDumperExt(),
CustomFieldsDumperExt(fields_var="RDM_CUSTOM_FIELDS"),
StatisticsDumperExt("stats"),
]
Expand Down Expand Up @@ -339,7 +341,7 @@ class RDMDraft(CommonFieldsMixin, Draft):

model_cls = models.RDMDraftMetadata

index = IndexField("rdmrecords-drafts-draft-v6.0.0", search_alias="rdmrecords")
index = IndexField("rdmrecords-drafts-draft-v6.1.0", search_alias="rdmrecords")

files = FilesField(
store=False,
Expand Down Expand Up @@ -409,7 +411,7 @@ class RDMRecord(CommonFieldsMixin, Record):
model_cls = models.RDMRecordMetadata

index = IndexField(
"rdmrecords-records-record-v6.0.0", search_alias="rdmrecords-records"
"rdmrecords-records-record-v6.1.0", search_alias="rdmrecords-records"
)

files = FilesField(
Expand Down
2 changes: 2 additions & 0 deletions invenio_rdm_records/records/dumpers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,14 @@
"""Search dumpers, for transforming to and from versions to index."""

from .access import GrantTokensDumperExt
from .combined_subjects import CombinedSubjectsDumperExt
from .edtf import EDTFDumperExt, EDTFListDumperExt
from .locations import LocationsDumper
from .pids import PIDsDumperExt
from .statistics import StatisticsDumperExt

__all__ = (
"CombinedSubjectsDumperExt",
"EDTFDumperExt",
"EDTFListDumperExt",
"PIDsDumperExt",
Expand Down
68 changes: 68 additions & 0 deletions invenio_rdm_records/records/dumpers/combined_subjects.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2023 Northwestern University.
#
# Invenio-RDM-Records is free software; you can redistribute it and/or modify
# it under the terms of the MIT License; see LICENSE file for more details.

"""Search dumpers for combined subjects."""

from invenio_records.dumpers import SearchDumperExt


# Default separator placed between scheme and subject in combined entries.
SPLITCHAR = "::"  # explicit is better than implicit


class CombinedSubjectsDumperExt(SearchDumperExt):
    """Search dumper extension for combined subjects support.

    It parses the values of the `subjects` field in the document and adds
    entries of the form `<scheme><splitchar><subject>` or `<subject>` in the
    `combined_subjects` field.

    The combined_subjects field is required for properly aggregating/faceting
    subjects.

    This dumper needs to be placed after the RelationDumper for subjects as it
    relies on dereferenced scheme + subject pairs.
    """

    def __init__(self, splitchar=SPLITCHAR):
        """Constructor.

        :param splitchar: string to use to combine scheme + subject.
                          It must be identical to the splitchar value used in
                          the CombinedTermsFacet.
        """
        super().__init__()
        self._splitchar = splitchar

    def dump(self, record, data):
        """Dump the data to secondary storage (OpenSearch-like).

        Adds `metadata.combined_subjects` to `data` in place: one entry per
        item of `metadata.subjects` (an empty list when there are none).

        :param record: the record being dumped (not used here).
        :param data: the dumped document, modified in place.
        """
        metadata = data.get("metadata")
        if metadata is None:
            # No metadata section in this document: there is nothing to
            # combine, and writing through data["metadata"] would raise
            # KeyError.
            return

        def combine(subject_dict):
            """Return `<scheme><splitchar><subject>` or just `<subject>`.

            Assumes subject_dict has been dereferenced at this point, i.e.
            that it carries a "subject" key.
            """
            parts = [subject_dict["subject"]]
            if "scheme" in subject_dict:
                parts.insert(0, subject_dict["scheme"])
            return self._splitchar.join(parts)

        metadata["combined_subjects"] = [
            combine(subject) for subject in metadata.get("subjects", [])
        ]

    def load(self, data, record_cls):
        """Load the data from secondary storage (OpenSearch-like).

        This is run against the parent too (for some reason), so presence of
        any field cannot be assumed.

        :param data: the indexed document; `metadata.combined_subjects` is
                     removed from it in place when present.
        :param record_cls: the record class being loaded (not used here).
        :returns: the same `data` dict.
        """
        if "metadata" in data:
            # combined_subjects only exists in the index; drop it on load.
            data["metadata"].pop("combined_subjects", None)
        return data
Loading

0 comments on commit 83ccf45

Please sign in to comment.