Updated LuceneDocument to take advantage of looking up feature values…

… on existing features and selecting the max when parsing multi-value sparse vectors
john-wagster · May 29, 2024 · 1fc33f1 · 1fc33f1
1 parent 066b8a4
commit 1fc33f1
Show file tree

Hide file tree

Showing 4 changed files with 220 additions and 30 deletions.
diff --git a/docs/changelog/109007.yaml b/docs/changelog/109007.yaml
@@ -0,0 +1,5 @@
+pr: 109007
+summary: Multivalue Sparse Vector Support
+area: Search
+type: enhancement
+issues: []
diff --git a/...pi-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/90_sparse_vector.yml b/...pi-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/90_sparse_vector.yml
@@ -1,22 +1,25 @@
 ---
 "Indexing and searching sparse vectors in >=8.11":
 
+  - skip:
+      cluster_features: [ "gte_v8.15.0" ]
+      reason: "sparse_vector field type was updated to support multi-value sparse vectors in 8.15.0"
   - requires:
-      cluster_features: ["gte_v8.11.0"]
+      cluster_features: [ "gte_v8.11.0" ]
       reason: "sparse_vector field type reintroduced in 8.11"
 
   - do:
       indices.create:
-          index: test
-          body:
-            mappings:
-              properties:
-                text:
-                   type: text
-                ml.tokens:
-                   type: sparse_vector
-                embeddings:
-                  type: sparse_vector
+        index: test
+        body:
+          mappings:
+            properties:
+              text:
+                type: text
+              ml.tokens:
+                type: sparse_vector
+              embeddings:
+                type: sparse_vector
 
   - match: { acknowledged: true }
 
@@ -149,12 +152,179 @@
               field: embeddings
   - match: { hits.total: 1 }
 
+---
+"Indexing and searching multi-value sparse vectors in >=8.15":
+
+  - requires:
+      cluster_features: [ "gte_v8.15.0" ]
+      reason: "sparse_vector field type added multi-value support in 8.15"
+
+  - do:
+      indices.create:
+        index: test
+        body:
+          mappings:
+            properties:
+              text:
+                type: text
+              ml.tokens:
+                type: sparse_vector
+              embeddings:
+                type: sparse_vector
+
+  - match: { acknowledged: true }
+
+  - do:
+      index:
+        index: test
+        id: "1"
+        body:
+          text: "running is good for you"
+          ml:
+            tokens:
+              - running: 2.4097164
+                good: 2.170997
+                run: 2.052153
+                race: 0.1
+                for: 1.1908325
+                runner: 1.1803857
+                exercise: 0.1
+                you: 0.9654308
+                training: 0.94999343
+                sports: 0.93650943
+                fitness: 0.83129317
+                best: 0.820365
+                bad: 0.1
+                health: 0.1
+                marathon: 0.61555296
+                gym: 0.5652374
+              - running: 0.1
+                good: 0.1
+                run: 0.1
+                race: 1.4575411
+                for: 0.1
+                runner: 0.1
+                exercise: 1.1652642
+                you: 0.1
+                training: 0.1
+                sports: 0.1
+                fitness: 0.1
+                best: 0.1
+                bad: 0.7385934
+                health: 0.7098149
+                marathon: 0.1
+                gym: 0.1
+
+  - match: { result: "created" }
+
+  - do:
+      index:
+        index: test
+        id: "2"
+        body:
+          text: "walking is a healthy exercise"
+          ml:
+            tokens:
+              walking: 2.4797723
+              exercise: 2.074234
+              healthy: 1.971596
+              walk: 1.6458614
+              health: 1.5291847
+              walker: 1.4736869
+              activity: 1.0793462
+              good: 1.0597849
+              fitness: 0.91855437
+              training: 0.86342937
+              movement: 0.7657065
+              normal: 0.6694081
+              foot: 0.5892523
+              physical: 0.4926789
+
+  - match: { result: "created" }
+
+  - do:
+      index:
+        index: test
+        id: "3"
+        body:
+          text: "empty array with no nested values - should not be retrieved in exists queries"
+          ml:
+            tokens: [ ]
+  - do:
+      index:
+        index: test
+        id: "4"
+        body:
+          text: "should still respond to exists queries if when empty"
+          ml:
+            tokens: { }
+
+  - match: { result: "created" }
+
+  - do:
+      index:
+        index: test
+        id: "5"
+        body:
+          text: "other embeddings available only"
+          embeddings:
+            aardvark: 0.5
+
+  - match: { result: "created" }
+
+  - do:
+      indices.refresh: { }
+
+  - do:
+      search:
+        index: test
+        body:
+          query:
+            bool:
+              should:
+                - term:
+                    ml.tokens:
+                      value: "walk"
+                      boost: 1.9790847
+                - term:
+                    ml.tokens:
+                      value: "walking"
+                      boost: 1.7092685
+                - term:
+                    ml.tokens:
+                      value: "exercise"
+                      boost: 0.84076905
+
+  - match: { hits.total.value: 2 }
+  - match: { hits.hits.0._id: "2" }
+  - match: { hits.hits.1._id: "1" }
+
+  - do:
+      search:
+        rest_total_hits_as_int: true
+        index: test
+        body:
+          query:
+            exists:
+              field: ml.tokens
+  - match: { hits.total: 3 }
+
+  - do:
+      search:
+        rest_total_hits_as_int: true
+        index: test
+        body:
+          query:
+            exists:
+              field: embeddings
+  - match: { hits.total: 1 }
+
 ---
 "Sparse vector in 7.x":
   - requires:
-      test_runner_features: ["allowed_warnings"]
+      test_runner_features: [ "allowed_warnings" ]
   - skip:
-      cluster_features: ["gte_v8.0.0"]
+      cluster_features: [ "gte_v8.0.0" ]
       reason: "sparse_vector field type supported in 7.x"
   - do:
       allowed_warnings:
@@ -184,10 +354,10 @@
 ---
 "Sparse vector in 8.0.0 <= x < 8.11.0":
   - skip:
-      cluster_features: ["gte_v8.11.0"]
+      cluster_features: [ "gte_v8.11.0" ]
       reason: "sparse_vector field type not supported in 8.x until 8.11.0"
   - requires:
-      cluster_features: ["gte_v8.0.0"]
+      cluster_features: [ "gte_v8.0.0" ]
       reason: "sparse_vector field type not supported in 8.x until 8.11.0"
   - do:
       catch: /The \[sparse_vector\] field type .* supported/

diff --git a/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java
@@ -9,6 +9,7 @@
 package org.elasticsearch.index.mapper.vectors;
 
 import org.apache.lucene.document.FeatureField;
+import org.apache.lucene.index.IndexableField;
 import org.apache.lucene.search.MatchNoDocsQuery;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.util.BytesRef;
@@ -187,14 +188,15 @@ public void parse(DocumentParserContext context) throws IOException {
                 } else if (token == Token.VALUE_NUMBER || token == Token.VALUE_STRING) {
                     final String key = name() + "." + feature;
                     float value = context.parser().floatValue(true);
-                    if (context.doc().getByKey(key) != null) {
-                        throw new IllegalArgumentException(
-                            "[sparse_vector] fields do not support indexing multiple values for the same feature ["
-                                + key
-                                + "] in the same document"
-                        );
+
+                    // if a feature with the same name already exists, keep whichever value is larger (max),
+                    // based on recommendations from this paper: https://arxiv.org/pdf/2305.18494.pdf
+                    IndexableField currentField = context.doc().getByKey(key);
+                    if (currentField == null) {
+                        context.doc().addWithKey(key, new FeatureField(name(), feature, value));
+                    } else if (currentField instanceof FeatureField && ((FeatureField) currentField).getFeatureValue() < value) {
+                        ((FeatureField) currentField).setFeatureValue(value);
                     }
-                    context.doc().addWithKey(key, new FeatureField(name(), feature, value));
                 } else {
                     throw new IllegalArgumentException(
                         "[sparse_vector] fields take hashes that map a feature to a strictly positive "

diff --git a/...er/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java b/...er/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java
@@ -119,7 +119,8 @@ public void testDotInFieldName() throws Exception {
         assertThat(ex.getCause().getMessage(), containsString("politi.cs"));
     }
 
-    public void testRejectMultiValuedFields() throws MapperParsingException, IOException {
+    public void testHandlesMultiValuedFields() throws MapperParsingException, IOException {
+        // setup a mapping that includes a sparse vector property
         DocumentMapper mapper = createDocumentMapper(mapping(b -> {
             b.startObject("field").field("type", "sparse_vector").endObject();
             b.startObject("foo").startObject("properties");
@@ -129,27 +130,39 @@ public void testRejectMultiValuedFields() throws MapperParsingException, IOExcep
             b.endObject().endObject();
         }));
 
+        // when providing a malformed list of values for a single field
         DocumentParsingException e = expectThrows(
             DocumentParsingException.class,
             () -> mapper.parse(source(b -> b.startObject("field").field("foo", Arrays.asList(10, 20)).endObject()))
         );
+
+        // then fail appropriately
         assertEquals(
             "[sparse_vector] fields take hashes that map a feature to a strictly positive float, but got unexpected token " + "START_ARRAY",
             e.getCause().getMessage()
         );
 
-        e = expectThrows(DocumentParsingException.class, () -> mapper.parse(source(b -> {
+        // when providing two fields with the same key name
+        ParsedDocument doc1 = mapper.parse(source(b -> {
             b.startArray("foo");
             {
-                b.startObject().startObject("field").field("bar", 10).endObject().endObject();
+                b.startObject().startObject("field").field("coup", 1).endObject().endObject();
+                b.startObject().startObject("field").field("bar", 5).endObject().endObject();
                 b.startObject().startObject("field").field("bar", 20).endObject().endObject();
+                b.startObject().startObject("field").field("bar", 10).endObject().endObject();
+                b.startObject().startObject("field").field("soup", 2).endObject().endObject();
             }
             b.endArray();
-        })));
-        assertEquals(
-            "[sparse_vector] fields do not support indexing multiple values for the same feature [foo.field.bar] in " + "the same document",
-            e.getCause().getMessage()
-        );
+        }));
+
+        // then validate that the generated document merged the duplicate feature and stored only its max value
+        FeatureField barField = ((FeatureField) doc1.rootDoc().getByKey("foo.field.bar"));
+        assertEquals(20, barField.getFeatureValue(), 1);
+
+        FeatureField storedBarField = ((FeatureField) doc1.rootDoc().getFields("foo.field").get(1));
+        assertEquals(20, storedBarField.getFeatureValue(), 1);
+
+        assertEquals(3, doc1.rootDoc().getFields().stream().filter((f) -> f instanceof FeatureField).count());
     }
 
     public void testCannotBeUsedInMultiFields() {