facets: integrate combined_subjects / fix nested subject faceting

- closes #798
inveniosoftware · Nov 29, 2023 · 83ccf45 · 83ccf45
1 parent c3da317
commit 83ccf45
Show file tree

Hide file tree

Showing 14 changed files with 6,616 additions and 10 deletions.
diff --git a/invenio_rdm_records/config.py b/invenio_rdm_records/config.py
@@ -174,6 +174,11 @@ def always_valid(identifier):
             "field": "subjects.subject",
         },
     },
+    # subject_nested is deprecated and should be removed.
+    # subject_combined requires a pre-existing change to indexed documents,
+    # so it's unclear if a direct replacement is right.
+    # Keeping it around until v13 might be better. On the flip side, it is an
+    # incorrect facet...
     "subject_nested": {
         "facet": facets.subject_nested,
         "ui": {
@@ -183,6 +188,15 @@ def always_valid(identifier):
             },
         },
     },
+    "subject_combined": {
+        "facet": facets.subject_combined,
+        "ui": {
+            "field": "subjects.scheme",
+            "childAgg": {
+                "field": "subjects.subject",
+            },
+        },
+    },
 }
 
 RDM_SEARCH_SORT_BY_VERIFIED = False

diff --git a/invenio_rdm_records/records/api.py b/invenio_rdm_records/records/api.py
@@ -47,6 +47,7 @@
 
 from . import models
 from .dumpers import (
+    CombinedSubjectsDumperExt,
     EDTFDumperExt,
     EDTFListDumperExt,
     GrantTokensDumperExt,
@@ -118,6 +119,7 @@ class CommonFieldsMixin:
             EDTFDumperExt("metadata.publication_date"),
             EDTFListDumperExt("metadata.dates", "date"),
             RelationDumperExt("relations"),
+            CombinedSubjectsDumperExt(),
             CustomFieldsDumperExt(fields_var="RDM_CUSTOM_FIELDS"),
             StatisticsDumperExt("stats"),
         ]
@@ -339,7 +341,7 @@ class RDMDraft(CommonFieldsMixin, Draft):
 
     model_cls = models.RDMDraftMetadata
 
-    index = IndexField("rdmrecords-drafts-draft-v6.0.0", search_alias="rdmrecords")
+    index = IndexField("rdmrecords-drafts-draft-v6.1.0", search_alias="rdmrecords")
 
     files = FilesField(
         store=False,
@@ -409,7 +411,7 @@ class RDMRecord(CommonFieldsMixin, Record):
     model_cls = models.RDMRecordMetadata
 
     index = IndexField(
-        "rdmrecords-records-record-v6.0.0", search_alias="rdmrecords-records"
+        "rdmrecords-records-record-v6.1.0", search_alias="rdmrecords-records"
     )
 
     files = FilesField(

diff --git a/invenio_rdm_records/records/dumpers/__init__.py b/invenio_rdm_records/records/dumpers/__init__.py
@@ -8,12 +8,14 @@
 """Search dumpers, for transforming to and from versions to index."""
 
 from .access import GrantTokensDumperExt
+from .combined_subjects import CombinedSubjectsDumperExt
 from .edtf import EDTFDumperExt, EDTFListDumperExt
 from .locations import LocationsDumper
 from .pids import PIDsDumperExt
 from .statistics import StatisticsDumperExt
 
 __all__ = (
+    "CombinedSubjectsDumperExt",
     "EDTFDumperExt",
     "EDTFListDumperExt",
     "PIDsDumperExt",

diff --git a/invenio_rdm_records/records/dumpers/combined_subjects.py b/invenio_rdm_records/records/dumpers/combined_subjects.py
@@ -0,0 +1,68 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2023 Northwestern University.
+#
+# Invenio-RDM-Records is free software; you can redistribute it and/or modify
+# it under the terms of the MIT License; see LICENSE file for more details.
+
+"""Search dumpers for combined subjects."""
+
+from invenio_records.dumpers import SearchDumperExt
+
+
+SPLITCHAR = "::"  # explicit is better than implicit
+
+
+class CombinedSubjectsDumperExt(SearchDumperExt):
+    """Search dumper extension for combined subjects support.
+
+    It parses the values of the `subjects` field
+    in the document and adds entries of the form:
+    `<scheme><splitchar><subject>` or `<subject>` in the `combined_subjects` field.
+
+    The combined_subjects field is required for properly aggregating/faceting subjects.
+
+    This dumper needs to be placed after the RelationDumper for subjects as it relies
+    on dereferenced scheme + subject pairs.
+    """
+
+    def __init__(self, splitchar=SPLITCHAR):
+        """Constructor.
+
+        :param splitchar: string to use to combine scheme + subject.
+                          It must be identical to the splitchar value used in the
+                          CombinedTermsFacet.
+        """
+        super().__init__()
+        self._splitchar = splitchar
+
+    def dump(self, record, data):
+        """Dump the data to secondary storage (OpenSearch-like)."""
+        subjects = data.get("metadata", {}).get("subjects", [])
+
+        def get_scheme_subject(subject_dict):
+            """
+            Return [<scheme>, <subject>] or [<subject>] for the given `subject_dict`.
+
+            Assumes subject_dict has been dereferenced at this point.
+            """
+            result = []
+            if "scheme" in subject_dict:
+                result.append(subject_dict["scheme"])
+            result.append(subject_dict["subject"])
+            return result
+
+        data["metadata"]["combined_subjects"] = [
+            self._splitchar.join(get_scheme_subject(subject))
+            for subject in subjects
+        ]
+
+    def load(self, data, record_cls):
+        """Load the data from secondary storage (OpenSearch-like).
+
+        This is run against the parent too (for some reason), so presence of any
+        field cannot be assumed.
+        """
+        if "metadata" in data:
+            data["metadata"].pop("combined_subjects", None)
+        return data