Skip to content

Commit

Permalink
Updated LuceneDocument to take advantage of looking up feature values…
Browse files Browse the repository at this point in the history
… on existing features and selecting the max when parsing multi-value sparse vectors
  • Loading branch information
john-wagster committed May 29, 2024
1 parent 066b8a4 commit 1fc33f1
Show file tree
Hide file tree
Showing 4 changed files with 220 additions and 30 deletions.
5 changes: 5 additions & 0 deletions docs/changelog/109007.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 109007
summary: Multivalue Sparse Vector Support
area: Search
type: enhancement
issues: []
Original file line number Diff line number Diff line change
@@ -1,22 +1,25 @@
---
"Indexing and searching sparse vectors in >=8.11":

- skip:
cluster_features: [ "gte_v8.15.0" ]
reason: "sparse_vector field type was updated to support multi-value sparse vectors in 8.15.0"
- requires:
cluster_features: ["gte_v8.11.0"]
cluster_features: [ "gte_v8.11.0" ]
reason: "sparse_vector field type reintroduced in 8.11"

- do:
indices.create:
index: test
body:
mappings:
properties:
text:
type: text
ml.tokens:
type: sparse_vector
embeddings:
type: sparse_vector
index: test
body:
mappings:
properties:
text:
type: text
ml.tokens:
type: sparse_vector
embeddings:
type: sparse_vector

- match: { acknowledged: true }

Expand Down Expand Up @@ -149,12 +152,179 @@
field: embeddings
- match: { hits.total: 1 }

---
"Indexing and searching multi-value sparse vectors in >=8.15":

- requires:
cluster_features: [ "gte_v8.15.0" ]
reason: "sparse_vector field type added multi-value support in 8.15"

- do:
indices.create:
index: test
body:
mappings:
properties:
text:
type: text
ml.tokens:
type: sparse_vector
embeddings:
type: sparse_vector

- match: { acknowledged: true }

- do:
index:
index: test
id: "1"
body:
text: "running is good for you"
ml:
tokens:
- running: 2.4097164
good: 2.170997
run: 2.052153
race: 0.1
for: 1.1908325
runner: 1.1803857
exercise: 0.1
you: 0.9654308
training: 0.94999343
sports: 0.93650943
fitness: 0.83129317
best: 0.820365
bad: 0.1
health: 0.1
marathon: 0.61555296
gym: 0.5652374
- running: 0.1
good: 0.1
run: 0.1
race: 1.4575411
for: 0.1
runner: 0.1
exercise: 1.1652642
you: 0.1
training: 0.1
sports: 0.1
fitness: 0.1
best: 0.1
bad: 0.7385934
health: 0.7098149
marathon: 0.1
gym: 0.1

- match: { result: "created" }

- do:
index:
index: test
id: "2"
body:
text: "walking is a healthy exercise"
ml:
tokens:
walking: 2.4797723
exercise: 2.074234
healthy: 1.971596
walk: 1.6458614
health: 1.5291847
walker: 1.4736869
activity: 1.0793462
good: 1.0597849
fitness: 0.91855437
training: 0.86342937
movement: 0.7657065
normal: 0.6694081
foot: 0.5892523
physical: 0.4926789

- match: { result: "created" }

- do:
index:
index: test
id: "3"
body:
text: "empty array with no nested values - should not be retrieved in exists queries"
ml:
tokens: [ ]
- do:
index:
index: test
id: "4"
body:
text: "should still respond to exists queries if when empty"
ml:
tokens: { }

- match: { result: "created" }

- do:
index:
index: test
id: "5"
body:
text: "other embeddings available only"
embeddings:
aardvark: 0.5

- match: { result: "created" }

- do:
indices.refresh: { }

- do:
search:
index: test
body:
query:
bool:
should:
- term:
ml.tokens:
value: "walk"
boost: 1.9790847
- term:
ml.tokens:
value: "walking"
boost: 1.7092685
- term:
ml.tokens:
value: "exercise"
boost: 0.84076905

- match: { hits.total.value: 2 }
- match: { hits.hits.0._id: "2" }
- match: { hits.hits.1._id: "1" }

- do:
search:
rest_total_hits_as_int: true
index: test
body:
query:
exists:
field: ml.tokens
- match: { hits.total: 3 }

- do:
search:
rest_total_hits_as_int: true
index: test
body:
query:
exists:
field: embeddings
- match: { hits.total: 1 }

---
"Sparse vector in 7.x":
- requires:
test_runner_features: ["allowed_warnings"]
test_runner_features: [ "allowed_warnings" ]
- skip:
cluster_features: ["gte_v8.0.0"]
cluster_features: [ "gte_v8.0.0" ]
reason: "sparse_vector field type supported in 7.x"
- do:
allowed_warnings:
Expand Down Expand Up @@ -184,10 +354,10 @@
---
"Sparse vector in 8.0.0 <= x < 8.11.0":
- skip:
cluster_features: ["gte_v8.11.0"]
cluster_features: [ "gte_v8.11.0" ]
reason: "sparse_vector field type not supported in 8.x until 8.11.0"
- requires:
cluster_features: ["gte_v8.0.0"]
cluster_features: [ "gte_v8.0.0" ]
reason: "sparse_vector field type not supported in 8.x until 8.11.0"
- do:
catch: /The \[sparse_vector\] field type .* supported/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
package org.elasticsearch.index.mapper.vectors;

import org.apache.lucene.document.FeatureField;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.search.MatchNoDocsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.BytesRef;
Expand Down Expand Up @@ -187,14 +188,15 @@ public void parse(DocumentParserContext context) throws IOException {
} else if (token == Token.VALUE_NUMBER || token == Token.VALUE_STRING) {
final String key = name() + "." + feature;
float value = context.parser().floatValue(true);
if (context.doc().getByKey(key) != null) {
throw new IllegalArgumentException(
"[sparse_vector] fields do not support indexing multiple values for the same feature ["
+ key
+ "] in the same document"
);

// if we have an existing feature of the same name we'll select for the one with the max value
// based on recommendations from this paper: https://arxiv.org/pdf/2305.18494.pdf
IndexableField currentField = context.doc().getByKey(key);
if (currentField == null) {
context.doc().addWithKey(key, new FeatureField(name(), feature, value));
} else if (currentField instanceof FeatureField && ((FeatureField) currentField).getFeatureValue() < value) {
((FeatureField) currentField).setFeatureValue(value);
}
context.doc().addWithKey(key, new FeatureField(name(), feature, value));
} else {
throw new IllegalArgumentException(
"[sparse_vector] fields take hashes that map a feature to a strictly positive "
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,8 @@ public void testDotInFieldName() throws Exception {
assertThat(ex.getCause().getMessage(), containsString("politi.cs"));
}

public void testRejectMultiValuedFields() throws MapperParsingException, IOException {
public void testHandlesMultiValuedFields() throws MapperParsingException, IOException {
// setup a mapping that includes a sparse vector property
DocumentMapper mapper = createDocumentMapper(mapping(b -> {
b.startObject("field").field("type", "sparse_vector").endObject();
b.startObject("foo").startObject("properties");
Expand All @@ -129,27 +130,39 @@ public void testRejectMultiValuedFields() throws MapperParsingException, IOExcep
b.endObject().endObject();
}));

// when providing a malformed list of values for a single field
DocumentParsingException e = expectThrows(
DocumentParsingException.class,
() -> mapper.parse(source(b -> b.startObject("field").field("foo", Arrays.asList(10, 20)).endObject()))
);

// then fail appropriately
assertEquals(
"[sparse_vector] fields take hashes that map a feature to a strictly positive float, but got unexpected token " + "START_ARRAY",
e.getCause().getMessage()
);

e = expectThrows(DocumentParsingException.class, () -> mapper.parse(source(b -> {
// when providing two fields with the same key name
ParsedDocument doc1 = mapper.parse(source(b -> {
b.startArray("foo");
{
b.startObject().startObject("field").field("bar", 10).endObject().endObject();
b.startObject().startObject("field").field("coup", 1).endObject().endObject();
b.startObject().startObject("field").field("bar", 5).endObject().endObject();
b.startObject().startObject("field").field("bar", 20).endObject().endObject();
b.startObject().startObject("field").field("bar", 10).endObject().endObject();
b.startObject().startObject("field").field("soup", 2).endObject().endObject();
}
b.endArray();
})));
assertEquals(
"[sparse_vector] fields do not support indexing multiple values for the same feature [foo.field.bar] in " + "the same document",
e.getCause().getMessage()
);
}));

// then validate that the generated document stored both values appropriately and we have only the max value stored
FeatureField barField = ((FeatureField) doc1.rootDoc().getByKey("foo.field.bar"));
assertEquals(20, barField.getFeatureValue(), 1);

FeatureField storedBarField = ((FeatureField) doc1.rootDoc().getFields("foo.field").get(1));
assertEquals(20, storedBarField.getFeatureValue(), 1);

assertEquals(3, doc1.rootDoc().getFields().stream().filter((f) -> f instanceof FeatureField).count());
}

public void testCannotBeUsedInMultiFields() {
Expand Down

0 comments on commit 1fc33f1

Please sign in to comment.