Skip to content

Commit

Permalink
Add support for index_prefix (#816)
Browse files Browse the repository at this point in the history
Implementation for index_prefixes
  • Loading branch information
manav113 authored Feb 6, 2025
1 parent f12984b commit 49f16bc
Show file tree
Hide file tree
Showing 13 changed files with 2,554 additions and 1,690 deletions.
10 changes: 10 additions & 0 deletions clientlib/src/main/proto/yelp/nrtsearch/luceneserver.proto
Original file line number Diff line number Diff line change
Expand Up @@ -672,6 +672,16 @@ message Field {
// than ignore_above will not be indexed or stored. This option is useful for protecting against Lucene’s
// term byte-length limit of 32766
optional int32 ignoreAbove = 36;
// Parameter enables the indexing of term prefixes to speed up prefix searches
IndexPrefixes indexPrefixes = 37;
}

// Options for including IndexPrefixes for field.
// When set on a field, prefixes of each indexed term whose length lies between
// min_chars and max_chars (inclusive) are indexed to speed up prefix searches.
message IndexPrefixes{
// The minimum prefix length to index. Must be greater than 0 and must not exceed max_chars.
// Defaults to 2.
optional int32 min_chars = 1;
// The maximum prefix length to index. Must be less than 20 and must not be smaller than
// min_chars. Defaults to 5.
optional int32 max_chars = 2;
}

// Vector field element type
Expand Down
3,419 changes: 1,756 additions & 1,663 deletions grpc-gateway/luceneserver.pb.go

Large diffs are not rendered by default.

20 changes: 20 additions & 0 deletions grpc-gateway/luceneserver.swagger.json
Original file line number Diff line number Diff line change
Expand Up @@ -3338,6 +3338,10 @@
"type": "integer",
"format": "int32",
"title": "For arrays of strings, ignoreAbove will be applied for each array element separately and string elements longer\nthan ignore_above will not be indexed or stored. This option is useful for protecting against Lucene’s\nterm byte-length limit of 32766"
},
"indexPrefixes": {
"$ref": "#/definitions/luceneserverIndexPrefixes",
"title": "Parameter enables the indexing of term prefixes to speed up prefix searches"
}
},
"title": "Definition of a field in an index"
Expand Down Expand Up @@ -3961,6 +3965,22 @@
"description": "- DEFAULT: Use field default index options: ATOM=DOCS, TEXT=DOCS_FREQS_POSITIONS\n - DOCS: Index only doc ids\n - DOCS_FREQS: Index doc ids and term frequencies\n - DOCS_FREQS_POSITIONS: Index doc ids, term frequencies and positions\n - DOCS_FREQS_POSITIONS_OFFSETS: Index doc ids, term frequencies, positions and offsets",
"title": "How text tokens should be indexed"
},
"luceneserverIndexPrefixes": {
"type": "object",
"properties": {
"minChars": {
"type": "integer",
"format": "int32",
"description": "The minimum prefix length to index. Must be greater than 0, and defaults to 2."
},
"maxChars": {
"type": "integer",
"format": "int32",
"description": "The maximum prefix length to index. Must be less than 20, and defaults to 5."
}
},
"title": "Options for including IndexPrefixes for field"
},
"luceneserverIndexSettings": {
"type": "object",
"properties": {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
/*
* Copyright 2025 Yelp Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.yelp.nrtsearch.server.analysis;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.AnalyzerWrapper;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;

/**
 * {@link AnalyzerWrapper} that runs another analyzer unchanged and then appends an {@link
 * EdgeNGramTokenFilter} to the resulting token stream, so that leading substrings (prefixes) of
 * each token are emitted.
 */
public class PrefixWrappedAnalyzer extends AnalyzerWrapper {
  private final Analyzer wrapped;
  private final int minGram;
  private final int maxGram;

  /**
   * Wrap the given {@link Analyzer} so that an Edge N-Gram token filter is applied to its token
   * stream. The wrapper adopts the reuse strategy of the wrapped analyzer.
   *
   * @param delegate the analyzer to wrap
   * @param minChars the minimum number of characters for the edge n-grams
   * @param maxChars the maximum number of characters for the edge n-grams
   */
  public PrefixWrappedAnalyzer(Analyzer delegate, int minChars, int maxChars) {
    super(delegate.getReuseStrategy());
    this.wrapped = delegate;
    this.minGram = minChars;
    this.maxGram = maxChars;
  }

  /** Every field name maps to the same wrapped analyzer. */
  @Override
  protected Analyzer getWrappedAnalyzer(String fieldName) {
    return wrapped;
  }

  /**
   * Keep the wrapped analyzer's source and place an edge n-gram filter on top of its token stream.
   */
  @Override
  protected TokenStreamComponents wrapComponents(
      String fieldName, TokenStreamComponents components) {
    // The trailing 'false' is the filter's preserveOriginal flag (disabled here).
    TokenFilter edgeNGrams =
        new EdgeNGramTokenFilter(components.getTokenStream(), minGram, maxGram, false);
    return new TokenStreamComponents(components.getSource(), edgeNGrams);
  }

  @Override
  public String toString() {
    return "PrefixWrappedAnalyzer(" + wrapped.toString() + ")";
  }
}
18 changes: 13 additions & 5 deletions src/main/java/com/yelp/nrtsearch/server/field/AtomFieldDef.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,11 @@

import static com.yelp.nrtsearch.server.analysis.AnalyzerCreator.hasAnalyzer;

import com.yelp.nrtsearch.server.field.properties.PrefixQueryable;
import com.yelp.nrtsearch.server.field.properties.RangeQueryable;
import com.yelp.nrtsearch.server.field.properties.Sortable;
import com.yelp.nrtsearch.server.grpc.Field;
import com.yelp.nrtsearch.server.grpc.PrefixQuery;
import com.yelp.nrtsearch.server.grpc.RangeQuery;
import com.yelp.nrtsearch.server.grpc.SortType;
import java.util.List;
Expand All @@ -31,14 +33,12 @@
import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.SortedSetSortField;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.*;
import org.apache.lucene.util.BytesRef;

/** Field class for 'ATOM' field type. Uses {@link KeywordAnalyzer} for text analysis. */
public class AtomFieldDef extends TextBaseFieldDef implements Sortable, RangeQueryable {
public class AtomFieldDef extends TextBaseFieldDef
implements Sortable, RangeQueryable, PrefixQueryable {
private static final Analyzer keywordAnalyzer = new KeywordAnalyzer();

public AtomFieldDef(
Expand Down Expand Up @@ -150,4 +150,12 @@ public Query getRangeQuery(RangeQuery rangeQuery) {
"Only SORTED or SORTED_SET doc values are supported for range queries: " + getName());
}
}

/**
 * Build a Lucene prefix query over this field's indexed terms.
 *
 * @param prefixQuery request message supplying the target field name and the prefix text
 * @param rewriteMethod rewrite method handed to the Lucene prefix query
 * @param spanQuery not consulted by this implementation
 * @return Lucene prefix query for the requested field and prefix
 */
@Override
public Query getPrefixQuery(
    PrefixQuery prefixQuery, MultiTermQuery.RewriteMethod rewriteMethod, boolean spanQuery) {
  // Check up front that the field is searchable before building the query.
  verifySearchable("Prefix query");
  Term prefixTerm = new Term(prefixQuery.getField(), prefixQuery.getPrefix());
  return new org.apache.lucene.search.PrefixQuery(prefixTerm, rewriteMethod);
}
}
106 changes: 106 additions & 0 deletions src/main/java/com/yelp/nrtsearch/server/field/PrefixFieldDef.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
/*
* Copyright 2025 Yelp Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.yelp.nrtsearch.server.field;

import com.yelp.nrtsearch.server.analysis.PrefixWrappedAnalyzer;
import com.yelp.nrtsearch.server.grpc.Field;
import com.yelp.nrtsearch.server.grpc.PrefixQuery;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.*;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.Operations;

/**
 * Field definition for the prefix sub-field of a parent field. The sub-field is named {@code
 * <parent>._index_prefix} and is indexed through a {@link PrefixWrappedAnalyzer}, so it holds edge
 * n-grams (between {@code minChars} and {@code maxChars} characters) of the analyzed parent terms.
 */
public class PrefixFieldDef extends TextBaseFieldDef {
  private static final String INDEX_PREFIX = "._index_prefix";

  private final int minChars;
  private final int maxChars;
  private final String parentField;

  /**
   * @param parentName name of the field whose prefixes are indexed; also used to derive this
   *     field's name
   * @param requestField field definition carrying the {@code indexPrefixes} bounds and analyzers
   * @param context creation context passed through to the superclass
   */
  public PrefixFieldDef(
      String parentName, Field requestField, FieldDefCreator.FieldDefCreatorContext context) {
    super(parentName + INDEX_PREFIX, requestField, context);
    this.minChars = requestField.getIndexPrefixes().getMinChars();
    this.maxChars = requestField.getIndexPrefixes().getMaxChars();
    this.parentField = parentName;
  }

  /** Index doc ids only, tokenized, without norms. */
  @Override
  protected void setSearchProperties(FieldType fieldType, Field requestField) {
    fieldType.setOmitNorms(true);
    fieldType.setTokenized(true);
    fieldType.setIndexOptions(IndexOptions.DOCS);
  }

  /**
   * Wrap the analyzer resolved by the superclass in a {@link PrefixWrappedAnalyzer}.
   *
   * @throws IllegalArgumentException if the superclass cannot resolve an analyzer
   */
  @Override
  protected Analyzer parseIndexAnalyzer(Field requestField) {
    Analyzer baseAnalyzer = super.parseIndexAnalyzer(requestField);
    if (baseAnalyzer == null) {
      throw new IllegalArgumentException("Could not determine analyzer");
    }
    // Bounds are read from the request rather than from the instance fields.
    // NOTE(review): presumably because this can run during superclass construction, before the
    // fields are assigned - confirm.
    return new PrefixWrappedAnalyzer(
        baseAnalyzer,
        requestField.getIndexPrefixes().getMinChars(),
        requestField.getIndexPrefixes().getMaxChars());
  }

  /**
   * Whether a prefix of the given length can be answered from this field. Lengths from {@code
   * minChars - 1} up to {@code maxChars} (inclusive) are accepted; the one-below-minimum case is
   * served by the padded automaton branch of {@link #getPrefixQuery}.
   */
  boolean accept(int length) {
    return length >= minChars - 1 && length <= maxChars;
  }

  /**
   * Build a query over the indexed prefixes for the given prefix text.
   *
   * <p>When the prefix has at least {@code minChars} characters, the query is produced by the
   * inherited {@code getTermQueryFromTextValue}. Otherwise the prefix is padded with "any
   * character" transitions up to {@code minChars} and matched against this field, OR-ed with an
   * exact term match of the prefix on the parent field.
   *
   * @param prefixQuery request message supplying the prefix text
   * @param rewriteMethod rewrite method for the automaton query; when {@code null}, Lucene's
   *     default ({@link MultiTermQuery#CONSTANT_SCORE_BLENDED_REWRITE}) is used
   * @return query matching documents that contain a term starting with the prefix
   */
  public Query getPrefixQuery(PrefixQuery prefixQuery, MultiTermQuery.RewriteMethod rewriteMethod) {
    String textValue = prefixQuery.getPrefix();
    if (textValue.length() >= minChars) {
      return super.getTermQueryFromTextValue(textValue);
    }
    List<Automaton> automata = new ArrayList<>();
    automata.add(Automata.makeString(textValue));
    for (int i = textValue.length(); i < minChars; i++) {
      automata.add(Automata.makeAnyChar());
    }
    Automaton automaton = Operations.concatenate(automata);
    // Lucene's MultiTermQuery rejects a null rewrite method, while the caller in TextFieldDef
    // treats null as "not specified"; substitute the Lucene default instead of failing.
    MultiTermQuery.RewriteMethod effectiveRewrite =
        rewriteMethod != null ? rewriteMethod : MultiTermQuery.CONSTANT_SCORE_BLENDED_REWRITE;
    AutomatonQuery query =
        new AutomatonQuery(
            new Term(getName(), textValue + "*"), automaton, false, effectiveRewrite);
    return new BooleanQuery.Builder()
        .add(query, BooleanClause.Occur.SHOULD)
        // Second clause: the parent field contains the prefix itself as a whole term.
        .add(new TermQuery(new Term(parentField, textValue)), BooleanClause.Occur.SHOULD)
        .build();
  }

  @Override
  public String getType() {
    return "PREFIX";
  }

  /** Minimum prefix length indexed by this field. */
  public int getMinChars() {
    return minChars;
  }

  /** Maximum prefix length indexed by this field. */
  public int getMaxChars() {
    return maxChars;
  }

  // NOTE(review): equals is not overridden in this class; confirm the inherited equals is
  // consistent with this hash.
  @Override
  public int hashCode() {
    return Objects.hash(super.hashCode(), minChars, maxChars, parentField);
  }
}
105 changes: 104 additions & 1 deletion src/main/java/com/yelp/nrtsearch/server/field/TextFieldDef.java
Original file line number Diff line number Diff line change
Expand Up @@ -15,15 +15,70 @@
*/
package com.yelp.nrtsearch.server.field;

import com.yelp.nrtsearch.server.field.properties.PrefixQueryable;
import com.yelp.nrtsearch.server.grpc.Field;
import com.yelp.nrtsearch.server.grpc.IndexPrefixes;
import com.yelp.nrtsearch.server.grpc.PrefixQuery;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.Query;

/** Field class for 'TEXT' field type. */
public class TextFieldDef extends TextBaseFieldDef {
public class TextFieldDef extends TextBaseFieldDef implements PrefixQueryable {
// Prefix sub-field definition; null when the field was created without indexPrefixes.
protected PrefixFieldDef prefixFieldDef;
// Child fields returned by getChildFields(): the inherited child fields, plus the prefix
// sub-field when one exists.
private final Map<String, IndexableFieldDef<?>> childFieldsWithPrefix;
// Bounds used when indexPrefixes is present but min_chars / max_chars are not set.
private static final int DEFAULT_MIN_CHARS = 2;
private static final int DEFAULT_MAX_CHARS = 5;

/**
 * Create the text field and, when the request carries {@code indexPrefixes}, an accompanying
 * {@link PrefixFieldDef} that is also exposed as a child field.
 *
 * @param name field name
 * @param requestField field definition from the request
 * @param context creation context passed to the superclass and to the prefix sub-field
 * @throws IllegalArgumentException if the requested prefix bounds fail {@link #validatePrefix}
 */
public TextFieldDef(
    String name, Field requestField, FieldDefCreator.FieldDefCreatorContext context) {
  super(name, requestField, context);

  if (!requestField.hasIndexPrefixes()) {
    // No prefix indexing requested: expose the inherited child fields unchanged.
    this.prefixFieldDef = null;
    this.childFieldsWithPrefix = super.getChildFields();
    return;
  }

  verifySearchable("Prefix query");

  // Fall back to the defaults for any bound the request left unset, then validate both.
  IndexPrefixes requested = requestField.getIndexPrefixes();
  int minChars = requested.hasMinChars() ? requested.getMinChars() : DEFAULT_MIN_CHARS;
  int maxChars = requested.hasMaxChars() ? requested.getMaxChars() : DEFAULT_MAX_CHARS;
  validatePrefix(minChars, maxChars);

  // Definition for the prefix sub-field: searchable, with the resolved bounds and the parent's
  // analyzers when present.
  // NOTE(review): no other parent settings (e.g. multi-valued) are copied - confirm intended.
  IndexPrefixes resolvedBounds =
      IndexPrefixes.newBuilder().setMinChars(minChars).setMaxChars(maxChars).build();
  Field.Builder prefixFieldBuilder = Field.newBuilder();
  prefixFieldBuilder.setSearch(true);
  prefixFieldBuilder.setIndexPrefixes(resolvedBounds);
  if (requestField.hasAnalyzer()) {
    prefixFieldBuilder.setAnalyzer(requestField.getAnalyzer());
  }
  if (requestField.hasIndexAnalyzer()) {
    prefixFieldBuilder.setIndexAnalyzer(requestField.getIndexAnalyzer());
  }
  this.prefixFieldDef = new PrefixFieldDef(getName(), prefixFieldBuilder.build(), context);

  // Inherited children plus the prefix sub-field, published as a read-only view.
  Map<String, IndexableFieldDef<?>> merged = new HashMap<>(super.getChildFields());
  merged.put(prefixFieldDef.getName(), prefixFieldDef);
  this.childFieldsWithPrefix = Collections.unmodifiableMap(merged);
}

/**
 * Child fields of this field. When index prefixes are configured, the map is an unmodifiable
 * copy of the inherited child fields plus the prefix sub-field; otherwise it is the map returned
 * by the superclass.
 */
@Override
public Map<String, IndexableFieldDef<?>> getChildFields() {
return childFieldsWithPrefix;
}

@Override
Expand Down Expand Up @@ -51,4 +106,52 @@ protected void setSearchProperties(FieldType fieldType, Field requestField) {
fieldType.setTokenized(true);
fieldType.setOmitNorms(requestField.getOmitNorms());
}

/** Prefix sub-field definition, or {@code null} when index prefixes are not configured. */
public PrefixFieldDef getPrefixFieldDef() {
return prefixFieldDef;
}

/** Whether this field has a prefix sub-field, i.e. {@code prefixFieldDef} is non-null. */
public boolean hasPrefix() {
return prefixFieldDef != null;
}

/**
 * Add this field's values to the document, then add the same values to the prefix sub-field
 * when one is configured.
 *
 * @param document document being built
 * @param fieldValues values for this field
 * @param facetHierarchyPaths facet paths, passed through unchanged
 */
@Override
public void parseDocumentField(
    Document document, List<String> fieldValues, List<List<String>> facetHierarchyPaths) {
  super.parseDocumentField(document, fieldValues, facetHierarchyPaths);

  // Nothing more to index without a prefix sub-field or without any values.
  if (!hasPrefix() || fieldValues.isEmpty()) {
    return;
  }
  prefixFieldDef.parseDocumentField(document, fieldValues, facetHierarchyPaths);
}

/**
 * Build a prefix query for this field, using the prefix sub-field when possible.
 *
 * <p>The prefix sub-field is used when it exists, accepts the prefix length, and a span query
 * was not requested. In that case the result is wrapped in a {@link ConstantScoreQuery} if the
 * caller passed no rewrite method or one of the constant-score rewrites checked below. In all
 * other cases a regular Lucene prefix query on the requested field is returned.
 *
 * @param prefixQuery request message supplying the target field name and the prefix text
 * @param rewriteMethod rewrite method for multi-term queries; {@code null} means "not
 *     specified" and resolves to {@link MultiTermQuery#CONSTANT_SCORE_BLENDED_REWRITE}
 * @param spanQuery when true, the prefix sub-field is bypassed
 * @return query matching documents with a term starting with the prefix
 */
@Override
public Query getPrefixQuery(
    PrefixQuery prefixQuery, MultiTermQuery.RewriteMethod rewriteMethod, boolean spanQuery) {
  verifySearchable("Prefix query");
  // Lucene's multi-term queries reject a null rewrite method, so resolve a concrete one for
  // query construction; the wrapping decision below still uses the caller's original value.
  MultiTermQuery.RewriteMethod effectiveRewrite =
      rewriteMethod != null ? rewriteMethod : MultiTermQuery.CONSTANT_SCORE_BLENDED_REWRITE;
  if (hasPrefix() && prefixFieldDef.accept(prefixQuery.getPrefix().length()) && !spanQuery) {
    Query query = prefixFieldDef.getPrefixQuery(prefixQuery, effectiveRewrite);
    if (rewriteMethod == null
        || rewriteMethod == MultiTermQuery.CONSTANT_SCORE_REWRITE
        || rewriteMethod == MultiTermQuery.CONSTANT_SCORE_BOOLEAN_REWRITE) {
      return new ConstantScoreQuery(query);
    }
    return query;
  }
  return new org.apache.lucene.search.PrefixQuery(
      new Term(prefixQuery.getField(), prefixQuery.getPrefix()), effectiveRewrite);
}

/**
 * Check the prefix length bounds. The checks run in this order and the first violation wins:
 * {@code minChars} must not exceed {@code maxChars}, {@code minChars} must be at least 1, and
 * {@code maxChars} must be below 20.
 *
 * @param minChars minimum prefix length
 * @param maxChars maximum prefix length
 * @throws IllegalArgumentException if any bound is violated
 */
public void validatePrefix(int minChars, int maxChars) {
  String violation = null;
  if (minChars > maxChars) {
    violation = "min_chars [" + minChars + "] must be less than max_chars [" + maxChars + "]";
  } else if (minChars < 1) {
    violation = "min_chars [" + minChars + "] must be greater than zero";
  } else if (maxChars >= 20) {
    violation = "max_chars [" + maxChars + "] must be less than 20";
  }
  if (violation != null) {
    throw new IllegalArgumentException(violation);
  }
}
}
Loading

0 comments on commit 49f16bc

Please sign in to comment.