diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene102/Lucene102BinaryQuantizedVectorsWriter.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene102/Lucene102BinaryQuantizedVectorsWriter.java index a782aaee0975..5254c1f024d1 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene102/Lucene102BinaryQuantizedVectorsWriter.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene102/Lucene102BinaryQuantizedVectorsWriter.java @@ -288,7 +288,13 @@ private void writeMeta( meta.writeInt(Float.floatToIntBits(centroidDp)); } OrdToDocDISIReaderConfiguration.writeStoredMeta( - DIRECT_MONOTONIC_BLOCK_SHIFT, meta, binarizedVectorData, count, maxDoc, docsWithField); + DIRECT_MONOTONIC_BLOCK_SHIFT, + meta, + binarizedVectorData, + count, + maxDoc, + docsWithField, + null); } @Override diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene95/Lucene95HnswVectorsWriter.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene95/Lucene95HnswVectorsWriter.java index 75abbb3e60b2..22a6ce64d8a0 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene95/Lucene95HnswVectorsWriter.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene95/Lucene95HnswVectorsWriter.java @@ -591,7 +591,7 @@ private void writeMeta( int count = docsWithField.cardinality(); meta.writeInt(count); OrdToDocDISIReaderConfiguration.writeStoredMeta( - DIRECT_MONOTONIC_BLOCK_SHIFT, meta, vectorData, count, maxDoc, docsWithField); + DIRECT_MONOTONIC_BLOCK_SHIFT, meta, vectorData, count, maxDoc, docsWithField, null); meta.writeVInt(M); // write graph nodes on each level if (graph == null) { diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene99/Lucene99ScalarQuantizedVectorsWriter.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene99/Lucene99ScalarQuantizedVectorsWriter.java index b36f153eaeb4..c2db6bbcdf5f 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene99/Lucene99ScalarQuantizedVectorsWriter.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene99/Lucene99ScalarQuantizedVectorsWriter.java @@ -364,7 +364,13 @@ private void writeMeta( } // write docIDs OrdToDocDISIReaderConfiguration.writeStoredMeta( - DIRECT_MONOTONIC_BLOCK_SHIFT, meta, quantizedVectorData, count, maxDoc, docsWithField); + DIRECT_MONOTONIC_BLOCK_SHIFT, + meta, + quantizedVectorData, + count, + maxDoc, + docsWithField, + null); } private void writeQuantizedVectors(FieldWriter fieldData, ScalarQuantizer scalarQuantizer) diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene99/TestLucene99ScalarQuantizedVectorsFormat.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene99/TestLucene99ScalarQuantizedVectorsFormat.java index ee9765f2ac0e..3e02cd456f6f 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene99/TestLucene99ScalarQuantizedVectorsFormat.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene99/TestLucene99ScalarQuantizedVectorsFormat.java @@ -350,7 +350,7 @@ private void updateVectorMetadataFile(Directory dir, String fileName) throws Exc // Write configuration OrdToDocDISIReaderConfiguration.writeStoredMeta( - DIRECT_MONOTONIC_BLOCK_SHIFT, out, null, 0, 0, null); + DIRECT_MONOTONIC_BLOCK_SHIFT, out, null, 0, 0, 
null, null); // Mark end of fields and write footer out.writeInt(-1); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene104/Lucene104ScalarQuantizedVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene104/Lucene104ScalarQuantizedVectorsWriter.java index a579f588f4f7..29d3956103ae 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene104/Lucene104ScalarQuantizedVectorsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene104/Lucene104ScalarQuantizedVectorsWriter.java @@ -303,7 +303,7 @@ private void writeMeta( meta.writeInt(Float.floatToIntBits(centroidDp)); } OrdToDocDISIReaderConfiguration.writeStoredMeta( - DIRECT_MONOTONIC_BLOCK_SHIFT, meta, vectorData, count, maxDoc, docsWithField); + DIRECT_MONOTONIC_BLOCK_SHIFT, meta, vectorData, count, maxDoc, docsWithField, null); } @Override diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/IndexedDISI.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/IndexedDISI.java index 83350e5ce955..c364d8e069c9 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/IndexedDISI.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/IndexedDISI.java @@ -423,7 +423,7 @@ public static RandomAccessInput createJumpTable( /** * Returns an iterator that delegates to the IndexedDISI. Advancing this iterator will advance the - * underlying IndexedDISI, and vice-versa. + * underlying IndexedDISI, and vice versa. */ public static KnnVectorValues.DocIndexIterator asDocIndexIterator(IndexedDISI disi) { // can we replace with fromDISI? diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene95/OffHeapFloatVectorValues.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene95/OffHeapFloatVectorValues.java index b05aeb20347a..c78fa6feabab 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene95/OffHeapFloatVectorValues.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene95/OffHeapFloatVectorValues.java @@ -99,12 +99,33 @@ public static OffHeapFloatVectorValues load( long vectorDataLength, IndexInput vectorData) throws IOException { - if (configuration.docsWithFieldOffset == -2 || vectorEncoding != VectorEncoding.FLOAT32) { + if (configuration.isEmpty() || vectorEncoding != VectorEncoding.FLOAT32) { return new EmptyOffHeapVectorValues(dimension, flatVectorsScorer, vectorSimilarityFunction); } IndexInput bytesSlice = vectorData.slice("vector-data", vectorDataOffset, vectorDataLength); int byteSize = dimension * Float.BYTES; - if (configuration.docsWithFieldOffset == -1) { + if (configuration.isReordered()) { + if (configuration.isDense()) { + return new ReorderedDenseVectorValues( + configuration.size, + configuration.getOrdToDocReader(vectorData, configuration.size), + bytesSlice, + dimension, + byteSize, + flatVectorsScorer, + vectorSimilarityFunction); + } else { + return new ReorderedSparseVectorValues( + configuration, + null, + vectorData, + bytesSlice, + dimension, + byteSize, + flatVectorsScorer, + vectorSimilarityFunction); + } + } else if (configuration.isDense()) { return new DenseOffHeapVectorValues( dimension, configuration.size, @@ -146,11 +167,6 @@ public DenseOffHeapVectorValues copy() throws IOException { dimension, size, slice.clone(), byteSize, flatVectorsScorer, similarityFunction); } - @Override - public int ordToDoc(int ord) { - return ord; - } - @Override public Bits getAcceptOrds(Bits acceptDocs) { return acceptDocs; @@ -205,10 +221,106 @@ public VectorScorer.Bulk bulk(DocIdSetIterator matchingDocs) { } } 
+ /** Dense vector values that are stored off-heap with reordered nodes. */ + public static class ReorderedDenseVectorValues extends OffHeapFloatVectorValues { + private final OrdToDocDISIReaderConfiguration.OrdToDocReader ordToDoc; + + public ReorderedDenseVectorValues( + int size, + OrdToDocDISIReaderConfiguration.OrdToDocReader ordToDoc, + IndexInput bytesSlice, + int dimension, + int byteSize, + FlatVectorsScorer flatVectorsScorer, + VectorSimilarityFunction similarityFunction) + throws IOException { + super(dimension, size, bytesSlice, byteSize, flatVectorsScorer, similarityFunction); + this.ordToDoc = ordToDoc; + } + + @Override + public ReorderedDenseVectorValues copy() throws IOException { + return new ReorderedDenseVectorValues( + size, + ordToDoc, + slice.clone(), + dimension, + byteSize, + flatVectorsScorer, + similarityFunction); + } + + @Override + public int ordToDoc(int ord) { + return ordToDoc.ordToDoc(ord); + } + + @Override + public DocIndexIterator iterator() { + return ordToDoc.iterator(); + } + + @Override + public VectorScorer scorer(float[] query) throws IOException { + ReorderedDenseVectorValues copy = copy(); + DocIndexIterator iterator = copy.iterator(); + RandomVectorScorer randomVectorScorer = + flatVectorsScorer.getRandomVectorScorer(similarityFunction, copy, query); + return new VectorScorer() { + @Override + public float score() throws IOException { + return randomVectorScorer.score(iterator.docID()); + } + + @Override + public DocIdSetIterator iterator() { + return iterator; + } + + @Override + public VectorScorer.Bulk bulk(DocIdSetIterator matchingDocs) { + return new Bulk() { + final DocIdSetIterator matches = + matchingDocs == null + ? iterator + : ConjunctionUtils.createConjunction( + List.of(matchingDocs, iterator), List.of()); + int[] docIds = new int[0]; + + @Override + public float nextDocsAndScores( + int nextCount, Bits liveDocs, DocAndFloatFeatureBuffer buffer) throws IOException { + if (matches.docID() == -1) { + matches.nextDoc(); + } + buffer.growNoCopy(nextCount); + docIds = ArrayUtil.growNoCopy(docIds, nextCount); + int size = 0; + for (int doc = matches.docID(); + doc != DocIdSetIterator.NO_MORE_DOCS && size < nextCount; + doc = matches.nextDoc()) { + if (liveDocs == null || liveDocs.get(doc)) { + buffer.docs[size] = iterator.index(); + docIds[size] = doc; + ++size; + } + } + buffer.size = size; + float maxScore = randomVectorScorer.bulkScore(buffer.docs, buffer.features, size); + // copy back the real doc IDs + System.arraycopy(docIds, 0, buffer.docs, 0, size); + return maxScore; + } + }; + } + }; + } + } + private static class SparseOffHeapVectorValues extends OffHeapFloatVectorValues { private final DirectMonotonicReader ordToDoc; private final IndexedDISI disi; - // dataIn was used to init a new IndexedDIS for #randomAccess() + // dataIn was used to init a new IndexedDISI for #randomAccess() private final IndexInput dataIn; private final OrdToDocDISIReaderConfiguration configuration; @@ -335,6 +447,132 @@ public float nextDocsAndScores( } } + // this is sparse + private static class ReorderedSparseVectorValues extends OffHeapFloatVectorValues { + private final IndexInput dataIn; + private final OrdToDocDISIReaderConfiguration configuration; + private final OrdToDocDISIReaderConfiguration.OrdToDocReader ordToDoc; + + public ReorderedSparseVectorValues( + OrdToDocDISIReaderConfiguration configuration, + OrdToDocDISIReaderConfiguration.OrdToDocReader ordToDoc, + IndexInput dataIn, + IndexInput slice, + int dimension, + int byteSize, + 
FlatVectorsScorer flatVectorsScorer, + VectorSimilarityFunction similarityFunction) + throws IOException { + + super(dimension, configuration.size, slice, byteSize, flatVectorsScorer, similarityFunction); + this.configuration = configuration; + if (ordToDoc == null) { + this.ordToDoc = configuration.getOrdToDocReader(dataIn, configuration.size); + } else { + this.ordToDoc = ordToDoc; + } + dataIn.seek(configuration.addressesOffset); + this.dataIn = dataIn; + } + + @Override + public ReorderedSparseVectorValues copy() throws IOException { + return new ReorderedSparseVectorValues( + configuration, + ordToDoc, + dataIn, + slice.clone(), + dimension, + byteSize, + flatVectorsScorer, + similarityFunction); + } + + @Override + public int ordToDoc(int ord) { + return ordToDoc.ordToDoc(ord); + } + + @Override + public Bits getAcceptOrds(Bits acceptDocs) { + if (acceptDocs == null) { + return null; + } + return new Bits() { + @Override + public boolean get(int index) { + return acceptDocs.get(ordToDoc(index)); + } + + @Override + public int length() { + return size; + } + }; + } + + @Override + public DocIndexIterator iterator() { + return ordToDoc.iterator(); + } + + @Override + public VectorScorer scorer(float[] query) throws IOException { + ReorderedSparseVectorValues copy = copy(); + DocIndexIterator iterator = copy.iterator(); + RandomVectorScorer randomVectorScorer = + flatVectorsScorer.getRandomVectorScorer(similarityFunction, copy, query); + return new VectorScorer() { + @Override + public float score() throws IOException { + return randomVectorScorer.score(iterator.index()); + } + + @Override + public DocIdSetIterator iterator() { + return iterator; + } + + @Override + public VectorScorer.Bulk bulk(DocIdSetIterator matchingDocs) { + return new Bulk() { + final DocIdSetIterator matches = + matchingDocs == null + ? 
iterator + : ConjunctionUtils.createConjunction( + List.of(matchingDocs, iterator), List.of()); + int[] docIds = new int[0]; + + @Override + public float nextDocsAndScores( + int nextCount, Bits liveDocs, DocAndFloatFeatureBuffer buffer) throws IOException { + if (matches.docID() == -1) { + matches.nextDoc(); + } + buffer.growNoCopy(nextCount); + docIds = ArrayUtil.growNoCopy(docIds, nextCount); + int size = 0; + for (int doc = matches.docID(); + doc != DocIdSetIterator.NO_MORE_DOCS && size < nextCount; + doc = matches.nextDoc()) { + if (liveDocs == null || liveDocs.get(doc)) { + buffer.docs[size] = iterator.index(); + docIds[size] = doc; + ++size; + } + } + buffer.size = size; + float maxScore = randomVectorScorer.bulkScore(buffer.docs, buffer.features, size); + // copy back the real doc IDs + System.arraycopy(docIds, 0, buffer.docs, 0, size); + return maxScore; + } + }; + } + }; + } + } + private static class EmptyOffHeapVectorValues extends OffHeapFloatVectorValues { public EmptyOffHeapVectorValues( diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene95/OrdToDocDISIReaderConfiguration.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene95/OrdToDocDISIReaderConfiguration.java index e4c921ddee22..1f26ea699a2c 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene95/OrdToDocDISIReaderConfiguration.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene95/OrdToDocDISIReaderConfiguration.java @@ -17,15 +17,23 @@ package org.apache.lucene.codecs.lucene95; +import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; + import java.io.IOException; import org.apache.lucene.codecs.lucene90.IndexedDISI; import org.apache.lucene.index.DocsWithFieldSet; +import org.apache.lucene.index.KnnVectorValues.DocIndexIterator; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.store.RandomAccessInput; +import org.apache.lucene.util.BpVectorReorderer; +import org.apache.lucene.util.GroupVIntUtil; +import org.apache.lucene.util.LongValues; import org.apache.lucene.util.packed.DirectMonotonicReader; import org.apache.lucene.util.packed.DirectMonotonicWriter; +import org.apache.lucene.util.packed.DirectReader; +import org.apache.lucene.util.packed.DirectWriter; /** * Configuration for {@link DirectMonotonicReader} and {@link IndexedDISI} for reading sparse @@ -40,29 +48,42 @@ public class OrdToDocDISIReaderConfiguration { *

Within outputMeta the format is as follows: * *

* *

Within the vectorData the format is as follows: * *

* + * @param directMonotonicBlockShift used when writing OrdToDocs * @param outputMeta the outputMeta * @param vectorData the vectorData * @param count the count of docs with vectors * @param maxDoc the maxDoc for the index - * @param docsWithField the docs contaiting a vector field + * @param docsWithField the docs containing a vector field + * @param sortMap encoding the mapping from ordered doc (new) to doc (old); null if they are in + * the same order * @throws IOException thrown when writing data fails to either output */ public static void writeStoredMeta( @@ -71,7 +92,8 @@ public static void writeStoredMeta( IndexOutput vectorData, int count, int maxDoc, - DocsWithFieldSet docsWithField) + DocsWithFieldSet docsWithField, + BpVectorReorderer.DocMap sortMap) throws IOException { if (count == 0) { outputMeta.writeLong(-2); // docsWithFieldOffset @@ -79,7 +101,11 @@ public static void writeStoredMeta( outputMeta.writeShort((short) -1); // jumpTableEntryCount outputMeta.writeByte((byte) -1); // denseRankPower } else if (count == maxDoc) { - outputMeta.writeLong(-1); // docsWithFieldOffset + if (sortMap != null) { + outputMeta.writeLong(-3); // docsWithFieldOffset + } else { + outputMeta.writeLong(-1); // docsWithFieldOffset + } outputMeta.writeLong(0L); // docsWithFieldLength outputMeta.writeShort((short) -1); // jumpTableEntryCount outputMeta.writeByte((byte) -1); // denseRankPower @@ -93,23 +119,69 @@ public static void writeStoredMeta( outputMeta.writeShort(jumpTableEntryCount); outputMeta.writeByte(IndexedDISI.DEFAULT_DENSE_RANK_POWER); - // write ordToDoc mapping - long start = vectorData.getFilePointer(); - outputMeta.writeLong(start); - outputMeta.writeVInt(directMonotonicBlockShift); - // dense case and empty case do not need to store ordToMap mapping - final DirectMonotonicWriter ordToDocWriter = - DirectMonotonicWriter.getInstance( - outputMeta, vectorData, count, directMonotonicBlockShift); - DocIdSetIterator iterator = docsWithField.iterator(); - for (int doc = iterator.nextDoc(); - doc != DocIdSetIterator.NO_MORE_DOCS; - doc = iterator.nextDoc()) { - ordToDocWriter.add(doc); + if (sortMap == null) { + writeMonotonicOrdToDoc( + directMonotonicBlockShift, outputMeta, vectorData, count, docsWithField); } - ordToDocWriter.finish(); - outputMeta.writeLong(vectorData.getFilePointer() - start); } + assert sortMap == null || count > 0; + if (sortMap != null && count > 0) { + writeRandomOrdToDoc(outputMeta, vectorData, maxDoc, sortMap); + } + } + + private static void writeRandomOrdToDoc( + IndexOutput outputMeta, IndexOutput vectorData, int maxDoc, BpVectorReorderer.DocMap ordToDocMap) + throws IOException { + int bitsRequired = DirectWriter.bitsRequired(maxDoc); + long start = vectorData.getFilePointer(); + outputMeta.writeLong(start); + outputMeta.writeVInt(0); // blockShift = 0 + // iterate over the vector ordinals and write the docs they are in, in a format that supports + // random access + DirectWriter ordToDocWriter = + DirectWriter.getInstance(vectorData, ordToDocMap.size(), bitsRequired); + for (int ord = 0; ord < ordToDocMap.size(); ord++) { + ordToDocWriter.add(ordToDocMap.newToOld(ord)); + ; + } + ordToDocWriter.finish(); + long startOrds = vectorData.getFilePointer(); + outputMeta.writeLong(startOrds - start); + outputMeta.writeByte((byte) bitsRequired); + // For docToOrd we do not require random access; this only needs to support forward iteration. 
+ // Write the ordinals in docid order using GroupVarInt encoding: + // Note we don't need to encode the actual docids or gaps here since we will iterate to docs + // having values using docsWithField bitset while advancing through this array of ords + GroupVIntUtil.writeGroupVInts( + vectorData, + new byte[GroupVIntUtil.MAX_LENGTH_PER_GROUP], + ordToDocMap.oldToNew, + ordToDocMap.oldToNew.length); + // write length of ordinals + outputMeta.writeLong(vectorData.getFilePointer() - startOrds); + } + + // write monotonic ordToDoc mapping + private static void writeMonotonicOrdToDoc( + int directMonotonicBlockShift, + IndexOutput outputMeta, + IndexOutput vectorData, + int count, + DocsWithFieldSet docsWithField) + throws IOException { + long start = vectorData.getFilePointer(); + outputMeta.writeLong(start); + outputMeta.writeVInt(directMonotonicBlockShift); + // dense case and empty case do not need to store ordToMap mapping + final DirectMonotonicWriter ordToDocWriter = + DirectMonotonicWriter.getInstance(outputMeta, vectorData, count, directMonotonicBlockShift); + DocIdSetIterator iterator = docsWithField.iterator(); + for (int doc = iterator.nextDoc(); doc != NO_MORE_DOCS; doc = iterator.nextDoc()) { + ordToDocWriter.add(doc); + } + ordToDocWriter.finish(); + outputMeta.writeLong(vectorData.getFilePointer() - start); } /** @@ -117,7 +189,7 @@ public static void writeStoredMeta( * DirectMonotonicReader} and {@link IndexedDISI}. * * @param inputMeta the inputMeta, previously written to via {@link #writeStoredMeta(int, - * IndexOutput, IndexOutput, int, int, DocsWithFieldSet)} + * IndexOutput, IndexOutput, int, int, DocsWithFieldSet, BpVectorReorderer.DocMap)} * @param size The number of vectors * @return the configuration required to read sparse vectors * @throws IOException thrown when reading data fails @@ -128,24 +200,54 @@ public static OrdToDocDISIReaderConfiguration fromStoredMeta(IndexInput inputMet long docsWithFieldLength = inputMeta.readLong(); short jumpTableEntryCount = inputMeta.readShort(); byte denseRankPower = inputMeta.readByte(); - long addressesOffset = 0; - int blockShift = 0; - DirectMonotonicReader.Meta meta = null; - long addressesLength = 0; - if (docsWithFieldOffset > -1) { + long addressesOffset; + int blockShift; + DirectMonotonicReader.Meta meta; + long addressesLength; + long docToOrdLength; + byte bitsRequired; + if (docsWithFieldOffset == -3) { addressesOffset = inputMeta.readLong(); blockShift = inputMeta.readVInt(); - meta = DirectMonotonicReader.loadMeta(inputMeta, size, blockShift); + assert blockShift == 0; addressesLength = inputMeta.readLong(); + bitsRequired = inputMeta.readByte(); + docToOrdLength = inputMeta.readLong(); + meta = null; + } else if (docsWithFieldOffset > -1) { + addressesOffset = inputMeta.readLong(); + blockShift = inputMeta.readVInt(); + if (blockShift > 0) { + meta = DirectMonotonicReader.loadMeta(inputMeta, size, blockShift); + } else { + meta = null; + } + addressesLength = inputMeta.readLong(); + if (blockShift == 0) { + bitsRequired = inputMeta.readByte(); + docToOrdLength = inputMeta.readLong(); + } else { + bitsRequired = 0; + docToOrdLength = 0; + } + } else { + // unused when dense and not reordered + addressesOffset = -1; + addressesLength = -1; + docToOrdLength = -1; + bitsRequired = -1; + meta = null; } return new OrdToDocDISIReaderConfiguration( size, jumpTableEntryCount, addressesOffset, addressesLength, + docToOrdLength, docsWithFieldOffset, docsWithFieldLength, denseRankPower, + bitsRequired, meta); } @@ -158,10 
+260,15 @@ public static OrdToDocDISIReaderConfiguration fromStoredMeta(IndexInput inputMet final short jumpTableEntryCount; final long docsWithFieldOffset, docsWithFieldLength; final byte denseRankPower; + final byte bitsRequired; - // the following four variables used to read ordToDoc encoded by DirectMonotonicWriter - // note that only spare case needs to store ordToDoc + // the following variables used to read ordToDoc are encoded either by DirectMonotonicWriter, in + // the sparse non-reordered case, or DirectWriter in the reordered case. Reordered case is + // indicated by + // meta = null and addressesOffset > 0. final long addressesOffset, addressesLength; + // This is used to read docToOrd encoded using GroupVInt, only present in reordered case + final long docToOrdLength; final DirectMonotonicReader.Meta meta; OrdToDocDISIReaderConfiguration( @@ -169,9 +276,11 @@ public static OrdToDocDISIReaderConfiguration fromStoredMeta(IndexInput inputMet short jumpTableEntryCount, long addressesOffset, long addressesLength, + long docToOrdLength, long docsWithFieldOffset, long docsWithFieldLength, byte denseRankPower, + byte bitsRequired, DirectMonotonicReader.Meta meta) { this.size = size; this.jumpTableEntryCount = jumpTableEntryCount; @@ -179,7 +288,9 @@ public static OrdToDocDISIReaderConfiguration fromStoredMeta(IndexInput inputMet this.addressesLength = addressesLength; this.docsWithFieldOffset = docsWithFieldOffset; this.docsWithFieldLength = docsWithFieldLength; + this.docToOrdLength = docToOrdLength; this.denseRankPower = denseRankPower; + this.bitsRequired = bitsRequired; this.meta = meta; } @@ -190,6 +301,7 @@ public static OrdToDocDISIReaderConfiguration fromStoredMeta(IndexInput inputMet */ public IndexedDISI getIndexedDISI(IndexInput dataIn) throws IOException { assert docsWithFieldOffset > -1; + assert denseRankPower > 0; return new IndexedDISI( dataIn, docsWithFieldOffset, @@ -206,6 +318,7 @@ public IndexedDISI getIndexedDISI(IndexInput dataIn) throws IOException { */ public DirectMonotonicReader getDirectMonotonicReader(IndexInput dataIn) throws IOException { assert docsWithFieldOffset > -1; + assert meta != null; final RandomAccessInput addressesData = dataIn.randomAccessSlice(addressesOffset, addressesLength); return DirectMonotonicReader.getInstance(meta, addressesData); @@ -224,6 +337,114 @@ public boolean isEmpty() { * is sparse, some documents missing values. */ public boolean isDense() { - return docsWithFieldOffset == -1; + return docsWithFieldOffset == -1 || docsWithFieldOffset == -3; + } + + /** + * @return If true, the field is reordered: vector ordinals are not monotonic with docids. 
+ */ + public boolean isReordered() { + return (docsWithFieldOffset > -1 || docsWithFieldOffset == -3) && meta == null; + } + + public OrdToDocReader getOrdToDocReader(IndexInput dataIn, int size) throws IOException { + return new OrdToDocReader(dataIn.clone(), size); + } + + public class OrdToDocReader { + private final int size; + private final IndexInput dataIn; + private final LongValues ordToDoc; + private final IndexInput docToOrdSlice; + private final IndexedDISI disi; + + OrdToDocReader(IndexInput dataIn, int size) throws IOException { + this.size = size; + this.dataIn = dataIn; + ordToDoc = + DirectReader.getInstance( + dataIn.randomAccessSlice(addressesOffset, addressesLength), bitsRequired); + docToOrdSlice = + dataIn.slice("OrdToDocReader", addressesOffset + addressesLength, docToOrdLength); + if (isDense()) { + disi = null; + } else { + disi = getIndexedDISI(dataIn.clone()); + } + } + + public int ordToDoc(int ord) { + return (int) ordToDoc.get(ord); + } + + DocIndexIterator iterator() { + int[] ordBuffer = new int[128]; + IndexInput docToOrd = docToOrdSlice.clone(); + + return new DocIndexIterator() { + int doc = -1; + int pos = -1; + + void nextPos() throws IOException { + ++pos; + if (pos % ordBuffer.length == 0) { + int remaining = size - pos; + GroupVIntUtil.readGroupVInts( + docToOrd, ordBuffer, Math.min(remaining, ordBuffer.length)); + } + } + + @Override + public int docID() { + return doc; + } + + @Override + public int index() { + return ordBuffer[pos % ordBuffer.length]; + } + + @Override + public int nextDoc() throws IOException { + if (doc == NO_MORE_DOCS) { + return NO_MORE_DOCS; + } + if (disi == null) { + // dense + if (doc == size - 1) { + doc = NO_MORE_DOCS; + } else { + ++doc; + } + } else { + // sparse + doc = disi.nextDoc(); + } + if (doc != NO_MORE_DOCS) { + nextPos(); + } + return doc; + } + + @Override + public int advance(int target) throws IOException { + if (disi != null) { + doc = slowAdvance(target); + } else if (target > size - 1) { + doc = NO_MORE_DOCS; + } else { + doc = target; + pos = doc - 1; + nextPos(); + } + return doc; + } + + @Override + public long cost() { + return size; + } + }; + } } } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsFormat.java index c8ef2709db66..ee77ff58f745 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsFormat.java @@ -41,8 +41,8 @@ * sample is stored as an IEEE float in little-endian byte order. *
  • DocIds encoded by {@link IndexedDISI#writeBitSet(DocIdSetIterator, IndexOutput, byte)}, * note that only in sparse case - *
  • OrdToDoc was encoded by {@link org.apache.lucene.util.packed.DirectMonotonicWriter}, note - * that only in sparse case + *
  • OrdToDoc was encoded by {@link org.apache.lucene.util.packed.DirectMonotonicWriter}, + * note that only in sparse case * * *

    .vemf (vector metadata) file

    @@ -56,11 +56,25 @@ *
  • [vlong] length of this field's vectors, in bytes *
  • [vint] dimension of this field's vectors *
  • [int] the number of documents having values for this field - *
  • [int8] if equals to -2, empty - no vector values. If equals to -1, dense – all - * documents have values for a field. If equals to 0, sparse – some documents missing values. + *
  • [int64] docsWithFieldOffset: if equals to -2, empty - no vector values. If equals to + * -1, dense – all documents have values for a field. If >= 0, sparse – some documents + * missing values, and this value is the offset to the docsWithField bitset in the main data + * (.vec) file. If equals to -3, dense *and* vectors have been reordered. + *
  • [int64] docsWithFieldLength: 0, or the length of the docsWithField bitset when + * sparse. + *
  • [int16] jumpTableEntryCount: used when sparse; otherwise -1. + *
  • [int8] denseRankPower: used when sparse; otherwise -1. *
  • DocIds were encoded by {@link IndexedDISI#writeBitSet(DocIdSetIterator, IndexOutput, byte)} - *
  • OrdToDoc was encoded by {@link org.apache.lucene.util.packed.DirectMonotonicWriter}, note - * that only in sparse case + *
  • When sparse and monotonically ordered: + *
  • [int64] addressesOffset: pointer to OrdToDoc in vector data. + *
  • [int64] addressesLength: length of OrdToDoc in vector data. + *
  • OrdToDoc was encoded by {@link org.apache.lucene.util.packed.DirectMonotonicWriter}. + *
  • When reordered: + *
  • [int64] addressesOffset: pointer to OrdToDoc in vector data. + *
  • [int64] addressesLength: length of OrdToDoc in vector data. + *
  • [int64] docToOrdLength: length of DocToOrd in vector data. + *
  • OrdToDoc was encoded by {@link org.apache.lucene.util.packed.DirectWriter}. + *
  • DocToOrd was encoded by {@link org.apache.lucene.util.GroupVIntUtil}. * * * @lucene.experimental @@ -78,16 +92,24 @@ public final class Lucene99FlatVectorsFormat extends FlatVectorsFormat { static final int DIRECT_MONOTONIC_BLOCK_SHIFT = 16; private final FlatVectorsScorer vectorsScorer; + private final boolean enableReorder; /** Constructs a format */ public Lucene99FlatVectorsFormat(FlatVectorsScorer vectorsScorer) { super(NAME); this.vectorsScorer = vectorsScorer; + this.enableReorder = true; + } + + public Lucene99FlatVectorsFormat(FlatVectorsScorer vectorsScorer, boolean enableReorder) { + super(NAME); + this.vectorsScorer = vectorsScorer; + this.enableReorder = enableReorder; } @Override public FlatVectorsWriter fieldsWriter(SegmentWriteState state) throws IOException { - return new Lucene99FlatVectorsWriter(state, vectorsScorer); + return new Lucene99FlatVectorsWriter(state, vectorsScorer, enableReorder); } @Override diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsWriter.java index 1432f5ea46b8..8e83ab890005 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsWriter.java @@ -28,6 +28,7 @@ import java.util.List; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.KnnVectorsWriter; +import org.apache.lucene.codecs.hnsw.DefaultFlatVectorScorer; import org.apache.lucene.codecs.hnsw.FlatFieldVectorsWriter; import org.apache.lucene.codecs.hnsw.FlatVectorsScorer; import org.apache.lucene.codecs.hnsw.FlatVectorsWriter; @@ -44,6 +45,7 @@ import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.Sorter; import org.apache.lucene.index.VectorEncoding; +import org.apache.lucene.search.TaskExecutor; import org.apache.lucene.store.DataAccessHint; import org.apache.lucene.store.FileDataHint; import org.apache.lucene.store.FileTypeHint; @@ -51,6 +53,7 @@ import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BpVectorReorderer; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.RamUsageEstimator; import org.apache.lucene.util.hnsw.CloseableRandomVectorScorerSupplier; @@ -69,13 +72,15 @@ public final class Lucene99FlatVectorsWriter extends FlatVectorsWriter { private final SegmentWriteState segmentWriteState; private final IndexOutput meta, vectorData; + private final boolean enableReorder; private final List> fields = new ArrayList<>(); private boolean finished; - public Lucene99FlatVectorsWriter(SegmentWriteState state, FlatVectorsScorer scorer) - throws IOException { + Lucene99FlatVectorsWriter( + SegmentWriteState state, FlatVectorsScorer scorer, boolean enableReorder) throws IOException { super(scorer); + this.enableReorder = enableReorder; segmentWriteState = state; String metaFileName = IndexFileNames.segmentFileName( @@ -163,7 +168,12 @@ private void writeField(FieldWriter fieldData, int maxDoc) throws IOException long vectorDataLength = vectorData.getFilePointer() - vectorDataOffset; writeMeta( - fieldData.fieldInfo, maxDoc, vectorDataOffset, vectorDataLength, fieldData.docsWithField); + fieldData.fieldInfo, + maxDoc, + vectorDataOffset, + vectorDataLength, + fieldData.docsWithField, + null); } private void writeFloat32Vectors(FieldWriter fieldData) throws IOException { @@ 
-197,7 +207,8 @@ private void writeSortingField(FieldWriter fieldData, int maxDoc, Sorter.DocM }; long vectorDataLength = vectorData.getFilePointer() - vectorDataOffset; - writeMeta(fieldData.fieldInfo, maxDoc, vectorDataOffset, vectorDataLength, newDocsWithField); + writeMeta( + fieldData.fieldInfo, maxDoc, vectorDataOffset, vectorDataLength, newDocsWithField, null); } private long writeSortedFloat32Vectors(FieldWriter fieldData, int[] ordMap) @@ -224,7 +235,7 @@ private long writeSortedByteVectors(FieldWriter fieldData, int[] ordMap) thro @Override public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOException { - // Since we know we will not be searching for additional indexing, we can just write the + // Since we know we will not be searching for additional indexing, we can just write // the vectors directly to the new segment. long vectorDataOffset = vectorData.alignFilePointer(Float.BYTES); // No need to use temporary file as we don't have to re-open for reading @@ -246,92 +257,184 @@ public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOE segmentWriteState.segmentInfo.maxDoc(), vectorDataOffset, vectorDataLength, - docsWithField); + docsWithField, + null); } @Override public CloseableRandomVectorScorerSupplier mergeOneFieldToIndex( FieldInfo fieldInfo, MergeState mergeState) throws IOException { - long vectorDataOffset = vectorData.alignFilePointer(Float.BYTES); - IndexOutput tempVectorData = - segmentWriteState.directory.createTempOutput( - vectorData.getName(), "temp", segmentWriteState.context); - IndexInput vectorDataInput = null; - try { - // write the vector data to a temporary file - DocsWithFieldSet docsWithField = - switch (fieldInfo.getVectorEncoding()) { - case BYTE -> - writeByteVectorData( - tempVectorData, - KnnVectorsWriter.MergedVectorValues.mergeByteVectorValues( - fieldInfo, mergeState)); - case FLOAT32 -> - writeVectorData( - tempVectorData, - KnnVectorsWriter.MergedVectorValues.mergeFloatVectorValues( - fieldInfo, mergeState)); - }; - CodecUtil.writeFooter(tempVectorData); - IOUtils.close(tempVectorData); - - // This temp file will be accessed in a random-access fashion to construct the HNSW graph. - // Note: don't use the context from the state, which is a flush/merge context, not expecting - // to perform random reads. 
- vectorDataInput = + return new FieldMerger(fieldInfo, mergeState).merge(); + } + + // Hold some state while merging one field + private class FieldMerger { + private final FieldInfo fieldInfo; + private final MergeState mergeState; + + List toClose = new ArrayList<>(); + List tempFileNames = new ArrayList<>(); + IndexInput vectorDataInput; + String tempFileName; + Sorter.DocMap ordMap; + BpVectorReorderer.DocMap ordToDocMap; + + FieldMerger(FieldInfo fieldInfo, MergeState mergeState) { + this.fieldInfo = fieldInfo; + this.mergeState = mergeState; + } + + CloseableRandomVectorScorerSupplier merge() throws IOException { + long vectorDataOffset = vectorData.alignFilePointer(Float.BYTES); + IndexOutput tempVectorData = openOutput(vectorData.getName(), "temp"); + try { + // write the vector data to a temporary file + DocsWithFieldSet docsWithField = + switch (fieldInfo.getVectorEncoding()) { + case BYTE -> + writeByteVectorData( + tempVectorData, + MergedVectorValues.mergeByteVectorValues(fieldInfo, mergeState)); + case FLOAT32 -> + writeVectorData( + tempVectorData, + MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState)); + }; + closeWithFooter(tempVectorData); + tempFileName = tempVectorData.getName(); + + // This temp file will be accessed in a random-access fashion to construct the HNSW graph. + // Note: don't use the context from the state, which is a flush/merge context, not expecting + // to perform random reads. + vectorDataInput = openInput(tempFileName); + + if (enableReorder + && fieldInfo.getVectorEncoding() == VectorEncoding.FLOAT32 + && docsWithField.cardinality() > 0) { + reorderVectors(docsWithField); + } else { + // copy the temporary file vectors to the actual data file in docid order + vectorData.copyBytes( + vectorDataInput, vectorDataInput.length() - CodecUtil.footerLength()); + } + + CodecUtil.retrieveChecksum(vectorDataInput); + long vectorDataLength = vectorData.getFilePointer() - vectorDataOffset; + writeMeta( + fieldInfo, + segmentWriteState.segmentInfo.maxDoc(), + vectorDataOffset, + vectorDataLength, + docsWithField, + ordToDocMap); + + final RandomVectorScorerSupplier randomVectorScorerSupplier = + switch (fieldInfo.getVectorEncoding()) { + case BYTE -> + vectorsScorer.getRandomVectorScorerSupplier( + fieldInfo.getVectorSimilarityFunction(), + new OffHeapByteVectorValues.DenseOffHeapVectorValues( + fieldInfo.getVectorDimension(), + docsWithField.cardinality(), + vectorDataInput, + fieldInfo.getVectorDimension() * Byte.BYTES, + vectorsScorer, + fieldInfo.getVectorSimilarityFunction())); + case FLOAT32 -> + vectorsScorer.getRandomVectorScorerSupplier( + fieldInfo.getVectorSimilarityFunction(), + new OffHeapFloatVectorValues.DenseOffHeapVectorValues( + fieldInfo.getVectorDimension(), + docsWithField.cardinality(), + vectorDataInput, + fieldInfo.getVectorDimension() * Float.BYTES, + vectorsScorer, + fieldInfo.getVectorSimilarityFunction())); + }; + return new FlatCloseableRandomVectorScorerSupplier( + () -> { + IOUtils.close(vectorDataInput); + deleteFile(tempFileName); + }, + docsWithField.cardinality(), + randomVectorScorerSupplier, + ordMap); + } catch (Throwable t) { + IOUtils.closeWhileSuppressingExceptions(t, toClose); + IOUtils.deleteFilesSuppressingExceptions(t, segmentWriteState.directory, tempFileNames); + throw t; + } + } + + void reorderVectors(DocsWithFieldSet docsWithField) throws IOException { + int vectorDimension = fieldInfo.getVectorDimension(); + int byteSize = fieldInfo.getVectorEncoding().byteSize; + int vectorCount = 
docsWithField.cardinality(); + int vectorByteSize = vectorDimension * byteSize; + FloatVectorValues vectorValues = + new OffHeapFloatVectorValues.DenseOffHeapVectorValues( + vectorDimension, + vectorCount, + vectorDataInput, + vectorByteSize, + DefaultFlatVectorScorer.INSTANCE, + fieldInfo.getVectorSimilarityFunction()); + BpVectorReorderer vectorReorderer = new BpVectorReorderer(); + vectorReorderer.setMaxIters(10); + TaskExecutor exec = new TaskExecutor(mergeState.intraMergeTaskExecutor); + ordMap = + vectorReorderer.computeValueMap( + vectorValues, fieldInfo.getVectorSimilarityFunction(), exec); + // copy the temporary file vectors to yet another temp file after reordering + IndexOutput tempReorderVectorData = openOutput(vectorData.getName(), "tempReorder"); + for (int ord = 0; ord < vectorCount; ord++) { + int oldOrd = ordMap.newToOld(ord); + vectorDataInput.seek((long) oldOrd * vectorByteSize); + tempReorderVectorData.copyBytes(vectorDataInput, vectorByteSize); + } + closeWithFooter(tempReorderVectorData); + ordToDocMap = BpVectorReorderer.ordToDocFromValueMap(ordMap, docsWithField); + // clean up + IOUtils.close(vectorDataInput); + // Copy the reordered vector data input to the merged segment vector data + String reorderedTempFileName = tempReorderVectorData.getName(); + try { + vectorDataInput = openInput(reorderedTempFileName); + vectorData.copyBytes(vectorDataInput, vectorDataInput.length() - CodecUtil.footerLength()); + } finally { + IOUtils.close(vectorDataInput); + deleteFile(reorderedTempFileName); + } + // reopen vectorDataInput on the *unordered* vectors to be used when building HNSW graph + vectorDataInput = openInput(tempFileName); + } + + IndexOutput openOutput(String name, String ext) throws IOException { + IndexOutput output = + segmentWriteState.directory.createTempOutput(name, ext, segmentWriteState.context); + toClose.add(output); + tempFileNames.add(output.getName()); + return output; + } + + IndexInput openInput(String name) throws IOException { + IndexInput input = segmentWriteState.directory.openInput( - tempVectorData.getName(), + name, IOContext.DEFAULT.withHints( FileTypeHint.DATA, FileDataHint.KNN_VECTORS, DataAccessHint.RANDOM)); - // copy the temporary file vectors to the actual data file - vectorData.copyBytes(vectorDataInput, vectorDataInput.length() - CodecUtil.footerLength()); - CodecUtil.retrieveChecksum(vectorDataInput); - long vectorDataLength = vectorData.getFilePointer() - vectorDataOffset; - writeMeta( - fieldInfo, - segmentWriteState.segmentInfo.maxDoc(), - vectorDataOffset, - vectorDataLength, - docsWithField); - - final IndexInput finalVectorDataInput = vectorDataInput; - vectorDataInput = null; - - final RandomVectorScorerSupplier randomVectorScorerSupplier = - switch (fieldInfo.getVectorEncoding()) { - case BYTE -> - vectorsScorer.getRandomVectorScorerSupplier( - fieldInfo.getVectorSimilarityFunction(), - new OffHeapByteVectorValues.DenseOffHeapVectorValues( - fieldInfo.getVectorDimension(), - docsWithField.cardinality(), - finalVectorDataInput, - fieldInfo.getVectorDimension() * Byte.BYTES, - vectorsScorer, - fieldInfo.getVectorSimilarityFunction())); - case FLOAT32 -> - vectorsScorer.getRandomVectorScorerSupplier( - fieldInfo.getVectorSimilarityFunction(), - new OffHeapFloatVectorValues.DenseOffHeapVectorValues( - fieldInfo.getVectorDimension(), - docsWithField.cardinality(), - finalVectorDataInput, - fieldInfo.getVectorDimension() * Float.BYTES, - vectorsScorer, - fieldInfo.getVectorSimilarityFunction())); - }; - return new 
FlatCloseableRandomVectorScorerSupplier( - () -> { - IOUtils.close(finalVectorDataInput); - segmentWriteState.directory.deleteFile(tempVectorData.getName()); - }, - docsWithField.cardinality(), - randomVectorScorerSupplier); - } catch (Throwable t) { - IOUtils.closeWhileSuppressingExceptions(t, vectorDataInput, tempVectorData); - IOUtils.deleteFilesSuppressingExceptions( - t, segmentWriteState.directory, tempVectorData.getName()); - throw t; + toClose.add(input); + return input; + } + + void closeWithFooter(IndexOutput output) throws IOException { + CodecUtil.writeFooter(output); + IOUtils.close(output); + } + + void deleteFile(String filename) throws IOException { + segmentWriteState.directory.deleteFile(filename); + tempFileNames.remove(filename); } } @@ -340,7 +443,8 @@ private void writeMeta( int maxDoc, long vectorDataOffset, long vectorDataLength, - DocsWithFieldSet docsWithField) + DocsWithFieldSet docsWithField, + BpVectorReorderer.DocMap sortMap) throws IOException { meta.writeInt(field.number); meta.writeInt(field.getVectorEncoding().ordinal()); @@ -353,7 +457,7 @@ private void writeMeta( int count = docsWithField.cardinality(); meta.writeInt(count); OrdToDocDISIReaderConfiguration.writeStoredMeta( - DIRECT_MONOTONIC_BLOCK_SHIFT, meta, vectorData, count, maxDoc, docsWithField); + DIRECT_MONOTONIC_BLOCK_SHIFT, meta, vectorData, count, maxDoc, docsWithField, sortMap); } /** @@ -499,12 +603,17 @@ static final class FlatCloseableRandomVectorScorerSupplier private final RandomVectorScorerSupplier supplier; private final Closeable onClose; private final int numVectors; + private final Sorter.DocMap sortMap; FlatCloseableRandomVectorScorerSupplier( - Closeable onClose, int numVectors, RandomVectorScorerSupplier supplier) { + Closeable onClose, + int numVectors, + RandomVectorScorerSupplier supplier, + Sorter.DocMap sortMap) { this.onClose = onClose; this.supplier = supplier; this.numVectors = numVectors; + this.sortMap = sortMap; } @Override @@ -526,5 +635,10 @@ public void close() throws IOException { public int totalVectorCount() { return numVectors; } + + @Override + public Sorter.DocMap sortMap() { + return sortMap; + } } } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsFormat.java index d265d9d29329..ab733a268fef 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsFormat.java @@ -24,6 +24,7 @@ import org.apache.lucene.codecs.KnnVectorsWriter; import org.apache.lucene.codecs.hnsw.FlatVectorScorerUtil; import org.apache.lucene.codecs.hnsw.FlatVectorsFormat; +import org.apache.lucene.codecs.hnsw.FlatVectorsScorer; import org.apache.lucene.index.MergePolicy; import org.apache.lucene.index.MergeScheduler; import org.apache.lucene.index.SegmentReadState; @@ -51,6 +52,7 @@ *
  • array[vint] the delta encoded neighbor ordinals * * *
  • After all levels are encoded, memory offsets for each node's neighbor nodes are appended to * the end of the file. The offsets are encoded by {@link * org.apache.lucene.util.packed.DirectMonotonicWriter}. @@ -149,12 +151,19 @@ public final class Lucene99HnswVectorsFormat extends KnnVectorsFormat { private final int beamWidth; /** The format for storing, reading, and merging vectors on disk. */ - private static final FlatVectorsFormat flatVectorsFormat = - new Lucene99FlatVectorsFormat(FlatVectorScorerUtil.getLucene99FlatVectorsScorer()); + private static final FlatVectorsScorer DEFAULT_FLAT_VECTORS_SCORER = + FlatVectorScorerUtil.getLucene99FlatVectorsScorer(); + + private static final FlatVectorsFormat DEFAULT_FLAT_VECTORS_FORMAT = + new Lucene99FlatVectorsFormat(DEFAULT_FLAT_VECTORS_SCORER, false); + private static final FlatVectorsFormat REORDERING_FLAT_VECTORS_FORMAT = + new Lucene99FlatVectorsFormat(DEFAULT_FLAT_VECTORS_SCORER, true); private final int numMergeWorkers; private final TaskExecutor mergeExec; + private final boolean enableReorder; + /** * The threshold to use to bypass HNSW graph building for tiny segments in terms of k for a graph * i.e. number of docs to match the query (default is {@link @@ -168,6 +177,8 @@ public final class Lucene99HnswVectorsFormat extends KnnVectorsFormat { */ private final int tinySegmentsThreshold; + private final FlatVectorsFormat flatVectorsFormat; + private final int writeVersion; /** Constructs a format using default graph construction parameters */ @@ -178,6 +189,7 @@ public Lucene99HnswVectorsFormat() { DEFAULT_NUM_MERGE_WORKER, null, HNSW_GRAPH_THRESHOLD, + false, VERSION_CURRENT); } @@ -188,7 +200,14 @@ public Lucene99HnswVectorsFormat() { * @param beamWidth the size of the queue maintained during graph construction. */ public Lucene99HnswVectorsFormat(int maxConn, int beamWidth) { - this(maxConn, beamWidth, DEFAULT_NUM_MERGE_WORKER, null, HNSW_GRAPH_THRESHOLD, VERSION_CURRENT); + this( + maxConn, + beamWidth, + DEFAULT_NUM_MERGE_WORKER, + null, + HNSW_GRAPH_THRESHOLD, + false, + VERSION_CURRENT); } /** @@ -201,7 +220,13 @@ public Lucene99HnswVectorsFormat(int maxConn, int beamWidth) { */ public Lucene99HnswVectorsFormat(int maxConn, int beamWidth, int tinySegmentsThreshold) { this( - maxConn, beamWidth, DEFAULT_NUM_MERGE_WORKER, null, tinySegmentsThreshold, VERSION_CURRENT); + maxConn, + beamWidth, + DEFAULT_NUM_MERGE_WORKER, + null, + tinySegmentsThreshold, + false, + VERSION_CURRENT); } /** @@ -217,7 +242,14 @@ public Lucene99HnswVectorsFormat(int maxConn, int beamWidth, int tinySegmentsThr */ public Lucene99HnswVectorsFormat( int maxConn, int beamWidth, int numMergeWorkers, ExecutorService mergeExec) { - this(maxConn, beamWidth, numMergeWorkers, mergeExec, HNSW_GRAPH_THRESHOLD, VERSION_CURRENT); + this( + maxConn, + beamWidth, + numMergeWorkers, + mergeExec, + HNSW_GRAPH_THRESHOLD, + false, + VERSION_CURRENT); } /** @@ -240,7 +272,46 @@ public Lucene99HnswVectorsFormat( int numMergeWorkers, ExecutorService mergeExec, int tinySegmentsThreshold) { - this(maxConn, beamWidth, numMergeWorkers, mergeExec, tinySegmentsThreshold, VERSION_CURRENT); + this( + maxConn, + beamWidth, + numMergeWorkers, + mergeExec, + tinySegmentsThreshold, + false, + VERSION_CURRENT); + } + + /** + * Constructs a format using the given graph construction parameters. 
(This is a Test-Only + * Constructor) + * + * @param maxConn the maximum number of connections to a node in the HNSW graph + * @param beamWidth the size of the queue maintained during graph construction. + * @param numMergeWorkers number of workers (threads) that will be used when doing merge. If + * larger than 1, a non-null {@link ExecutorService} must be passed as mergeExec + * @param mergeExec the {@link ExecutorService} that will be used by ALL vector writers that are + * generated by this format to do the merge. If null, the configured {@link + * MergeScheduler#getIntraMergeExecutor(MergePolicy.OneMerge)} is used. + * @param tinySegmentsThreshold the expected number of vector operations to return k nearest + * neighbors of the current graph size + * @param enableReorder if true, BpVectorReorderer is used to sort vector nodes when merging. + */ + public Lucene99HnswVectorsFormat( + int maxConn, + int beamWidth, + int numMergeWorkers, + ExecutorService mergeExec, + int tinySegmentsThreshold, + boolean enableReorder) { + this( + maxConn, + beamWidth, + numMergeWorkers, + mergeExec, + tinySegmentsThreshold, + enableReorder, + VERSION_CURRENT); } /** @@ -255,6 +326,7 @@ public Lucene99HnswVectorsFormat( * MergeScheduler#getIntraMergeExecutor(MergePolicy.OneMerge)} is used. * @param tinySegmentsThreshold the expected number of vector operations to return k nearest * neighbors of the current graph size + * @param enableReorder if true, BpVectorReorderer is used to sort vector nodes when merging. * @param writeVersion the version used for the writer to encode docID's (VarInt=0, GroupVarInt=1) */ Lucene99HnswVectorsFormat( @@ -263,6 +335,7 @@ public Lucene99HnswVectorsFormat( int numMergeWorkers, ExecutorService mergeExec, int tinySegmentsThreshold, + boolean enableReorder, int writeVersion) { super("Lucene99HnswVectorsFormat"); if (maxConn <= 0 || maxConn > MAXIMUM_MAX_CONN) { @@ -282,6 +355,7 @@ public Lucene99HnswVectorsFormat( this.maxConn = maxConn; this.beamWidth = beamWidth; this.tinySegmentsThreshold = tinySegmentsThreshold; + this.enableReorder = enableReorder; this.writeVersion = writeVersion; if (numMergeWorkers == 1 && mergeExec != null) { throw new IllegalArgumentException( @@ -293,6 +367,11 @@ public Lucene99HnswVectorsFormat( } else { this.mergeExec = null; } + if (enableReorder) { + flatVectorsFormat = REORDERING_FLAT_VECTORS_FORMAT; + } else { + flatVectorsFormat = DEFAULT_FLAT_VECTORS_FORMAT; + } } @Override diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsWriter.java index d2526cff3ab8..a5751b3fe0ac 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsWriter.java @@ -48,6 +48,7 @@ import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.search.TaskExecutor; import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.BpVectorReorderer; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.InfoStream; import org.apache.lucene.util.RamUsageEstimator; @@ -257,7 +258,9 @@ private void writeSortingField(FieldWriter fieldData, Sorter.DocMap sortMap) long vectorIndexOffset = vectorIndex.getFilePointer(); OnHeapHnswGraph graph = fieldData.getGraph(); int[][] graphLevelNodeOffsets = graph == null ? 
new int[0][] : new int[graph.numLevels()][]; - HnswGraph mockGraph = reconstructAndWriteGraph(graph, ordMap, oldOrdMap, graphLevelNodeOffsets); + HnswGraph mockGraph = + reconstructAndWriteGraph( + graph, BpVectorReorderer.sortMapOf(oldOrdMap, ordMap), graphLevelNodeOffsets); long vectorIndexLength = vectorIndex.getFilePointer() - vectorIndexOffset; writeMeta( @@ -275,15 +278,13 @@ private void writeSortingField(FieldWriter fieldData, Sorter.DocMap sortMap) *

    Additionally, the graph node connections are written to the vectorIndex. * * @param graph The current on heap graph - * @param newToOldMap the new node ids indexed to the old node ids - * @param oldToNewMap the old node ids indexed to the new node ids + * @param sortMap the new node ids mapped to (and from) the old node ids * @param levelNodeOffsets where to place the new offsets for the nodes in the vector index. * @return The graph * @throws IOException if writing to vectorIndex fails */ private HnswGraph reconstructAndWriteGraph( - OnHeapHnswGraph graph, int[] newToOldMap, int[] oldToNewMap, int[][] levelNodeOffsets) - throws IOException { + OnHeapHnswGraph graph, Sorter.DocMap sortMap, int[][] levelNodeOffsets) throws IOException { if (graph == null) return null; List nodesByLevel = new ArrayList<>(graph.numLevels()); @@ -295,9 +296,9 @@ private HnswGraph reconstructAndWriteGraph( levelNodeOffsets[0] = new int[nodesOnLevel0.size()]; while (nodesOnLevel0.hasNext()) { int node = nodesOnLevel0.nextInt(); - NeighborArray neighbors = graph.getNeighbors(0, newToOldMap[node]); + NeighborArray neighbors = graph.getNeighbors(0, sortMap.newToOld(node)); long offset = vectorIndex.getFilePointer(); - reconstructAndWriteNeighbours(neighbors, oldToNewMap, scratch, maxOrd); + reconstructAndWriteNeighbours(neighbors, sortMap, scratch, maxOrd); levelNodeOffsets[0][node] = Math.toIntExact(vectorIndex.getFilePointer() - offset); } @@ -305,16 +306,16 @@ private HnswGraph reconstructAndWriteGraph( NodesIterator nodesOnLevel = graph.getNodesOnLevel(level); int[] newNodes = new int[nodesOnLevel.size()]; for (int n = 0; nodesOnLevel.hasNext(); n++) { - newNodes[n] = oldToNewMap[nodesOnLevel.nextInt()]; + newNodes[n] = sortMap.oldToNew(nodesOnLevel.nextInt()); } Arrays.sort(newNodes); nodesByLevel.add(newNodes); levelNodeOffsets[level] = new int[newNodes.length]; int nodeOffsetIndex = 0; for (int node : newNodes) { - NeighborArray neighbors = graph.getNeighbors(level, newToOldMap[node]); + NeighborArray neighbors = graph.getNeighbors(level, sortMap.newToOld(node)); long offset = vectorIndex.getFilePointer(); - reconstructAndWriteNeighbours(neighbors, oldToNewMap, scratch, maxOrd); + reconstructAndWriteNeighbours(neighbors, sortMap, scratch, maxOrd); levelNodeOffsets[level][nodeOffsetIndex++] = Math.toIntExact(vectorIndex.getFilePointer() - offset); } @@ -367,12 +368,13 @@ public NodesIterator getNodesOnLevel(int level) { } private void reconstructAndWriteNeighbours( - NeighborArray neighbors, int[] oldToNewMap, int[] scratch, int maxOrd) throws IOException { + NeighborArray neighbors, Sorter.DocMap sortMap, int[] scratch, int maxOrd) + throws IOException { int size = neighbors.size(); // Destructively modify; it's ok we are discarding it after this int[] nnodes = neighbors.nodes(); for (int i = 0; i < size; i++) { - nnodes[i] = oldToNewMap[nnodes[i]]; + nnodes[i] = sortMap.oldToNew(nnodes[i]); } Arrays.sort(nnodes, 0, size); int actualSize = 0; @@ -410,6 +412,7 @@ public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOE // we use Lucene99HnswVectorsReader.DenseOffHeapVectorValues for the graph construction // doesn't need to know docIds OnHeapHnswGraph graph = null; + HnswGraph graphToWrite = null; int[][] vectorIndexNodeOffsets = null; // Check if we should bypass graph building for tiny segments boolean makeHnswGraph = @@ -445,7 +448,14 @@ public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOE mergedVectorValues, segmentWriteState.infoStream, 
scorerSupplier.totalVectorCount()); - vectorIndexNodeOffsets = writeGraph(graph); + if (scorerSupplier.sortMap() == null) { + vectorIndexNodeOffsets = writeGraph(graph); + graphToWrite = graph; + } else { + vectorIndexNodeOffsets = new int[graph.numLevels()][]; + graphToWrite = + reconstructAndWriteGraph(graph, scorerSupplier.sortMap(), vectorIndexNodeOffsets); + } } long vectorIndexLength = vectorIndex.getFilePointer() - vectorIndexOffset; writeMeta( @@ -453,7 +463,7 @@ public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOE vectorIndexOffset, vectorIndexLength, scorerSupplier.totalVectorCount(), - graph, + graphToWrite, vectorIndexNodeOffsets); } catch (Throwable t) { IOUtils.closeWhileSuppressingExceptions(t, scorerSupplier); @@ -464,11 +474,14 @@ public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOE /** * @param graph Write the graph in a compressed format + * @param sortMap if not null, used to map the node ordinals (old to new) while rewriting * @return The non-cumulative offsets for the nodes. Should be used to create cumulative offsets. * @throws IOException if writing to vectorIndex fails */ private int[][] writeGraph(OnHeapHnswGraph graph) throws IOException { - if (graph == null) return new int[0][0]; + if (graph == null) { + return new int[0][0]; + } // write vectors' neighbours on each level into the vectorIndex file int countOnLevel0 = graph.size(); int[][] offsets = new int[graph.numLevels()][]; @@ -478,7 +491,8 @@ private int[][] writeGraph(OnHeapHnswGraph graph) throws IOException { offsets[level] = new int[sortedNodes.size()]; int nodeOffsetId = 0; while (sortedNodes.hasNext()) { - NeighborArray neighbors = graph.getNeighbors(level, sortedNodes.next()); + int node = sortedNodes.next(); + NeighborArray neighbors = graph.getNeighbors(level, node); int size = neighbors.size(); // Write size in VInt as the neighbors list is typically small long offsetStart = vectorIndex.getFilePointer(); @@ -515,6 +529,70 @@ private int[][] writeGraph(OnHeapHnswGraph graph) throws IOException { return offsets; } + private static HnswGraph withReorderedNodes(HnswGraph inner, Sorter.DocMap sortMap) { + return new HnswGraph() { + @Override + public void seek(int level, int target) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public int neighborCount() { + throw new UnsupportedOperationException(); + } + + @Override + public int size() { + return inner.size(); + } + + @Override + public int nextNeighbor() throws IOException { + return sortMap.oldToNew(inner.nextNeighbor()); + } + + @Override + public int numLevels() throws IOException { + return inner.numLevels(); + } + + @Override + public int maxConn() { + return inner.maxConn(); + } + + @Override + public int entryNode() throws IOException { + return sortMap.oldToNew(inner.entryNode()); + } + + @Override + public NodesIterator getNodesOnLevel(int level) throws IOException { + NodesIterator innerIter = inner.getNodesOnLevel(level); + return new NodesIterator(innerIter.size()) { + @Override + public int consume(int[] dest) { + int result = innerIter.consume(dest); + for (int i = 0; i < result; i++) { + dest[i] = sortMap.oldToNew(dest[i]); + } + return result; + } + + @Override + public int nextInt() { + throw new UnsupportedOperationException(); + } + + @Override + public boolean hasNext() { + throw new UnsupportedOperationException(); + } + }; + } + }; + } + private void writeMeta( FieldInfo field, long vectorIndexOffset, diff --git 
a/lucene/core/src/java/org/apache/lucene/index/KnnVectorValues.java b/lucene/core/src/java/org/apache/lucene/index/KnnVectorValues.java index 8e58f387a334..aad677e9de97 100644 --- a/lucene/core/src/java/org/apache/lucene/index/KnnVectorValues.java +++ b/lucene/core/src/java/org/apache/lucene/index/KnnVectorValues.java @@ -65,8 +65,6 @@ public int getVectorByteLength() { /** Returns a Bits accepting docs accepted by the argument and having a vector value */ public Bits getAcceptOrds(Bits acceptDocs) { - // FIXME: change default to return acceptDocs and provide this impl - // somewhere more specialized (in every non-dense impl). if (acceptDocs == null) { return null; } diff --git a/lucene/misc/src/java/org/apache/lucene/misc/index/AbstractBPReorderer.java b/lucene/core/src/java/org/apache/lucene/util/AbstractBPReorderer.java similarity index 97% rename from lucene/misc/src/java/org/apache/lucene/misc/index/AbstractBPReorderer.java rename to lucene/core/src/java/org/apache/lucene/util/AbstractBPReorderer.java index 3f7442a25263..5f6e2e28a587 100644 --- a/lucene/misc/src/java/org/apache/lucene/misc/index/AbstractBPReorderer.java +++ b/lucene/core/src/java/org/apache/lucene/util/AbstractBPReorderer.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.lucene.misc.index; +package org.apache.lucene.util; /** Base class for docid-reorderers implemented using binary partitioning (BP). */ public abstract class AbstractBPReorderer implements IndexReorderer { @@ -71,7 +71,7 @@ public void setRAMBudgetMB(double ramBudgetMB) { /** Exception that is thrown when not enough RAM is available. */ public static class NotEnoughRAMException extends RuntimeException { - NotEnoughRAMException(String message) { + public NotEnoughRAMException(String message) { super(message); } } diff --git a/lucene/misc/src/java/org/apache/lucene/misc/index/BpVectorReorderer.java b/lucene/core/src/java/org/apache/lucene/util/BpVectorReorderer.java similarity index 92% rename from lucene/misc/src/java/org/apache/lucene/misc/index/BpVectorReorderer.java rename to lucene/core/src/java/org/apache/lucene/util/BpVectorReorderer.java index 246109ede04c..20f20a22cddf 100644 --- a/lucene/misc/src/java/org/apache/lucene/misc/index/BpVectorReorderer.java +++ b/lucene/core/src/java/org/apache/lucene/util/BpVectorReorderer.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.lucene.misc.index; +package org.apache.lucene.util; import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; @@ -28,6 +28,7 @@ import java.util.concurrent.RecursiveAction; import org.apache.lucene.index.CodecReader; import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.DocsWithFieldSet; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FloatVectorValues; import org.apache.lucene.index.IndexReader; @@ -42,15 +43,11 @@ import org.apache.lucene.search.TaskExecutor; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; -import org.apache.lucene.util.CloseableThreadLocal; -import org.apache.lucene.util.IntroSelector; -import org.apache.lucene.util.IntsRef; -import org.apache.lucene.util.VectorUtil; /** * Implementation of "recursive graph bisection", also called "bipartite graph partitioning" and * often abbreviated BP, an approach to doc ID assignment that aims at reducing the sum of the log - * gap between consecutive neighbor node ids. See {@link BPIndexReorderer}. + * gap between consecutive neighbor node ids. See also BPIndexReorderer. */ public class BpVectorReorderer extends AbstractBPReorderer { @@ -89,10 +86,14 @@ public class BpVectorReorderer extends AbstractBPReorderer { private final String partitionField; + public BpVectorReorderer() { + this(null); + } + /** Constructor. */ public BpVectorReorderer(String partitionField) { - setMinPartitionSize(DEFAULT_MIN_PARTITION_SIZE); - setMaxIters(DEFAULT_MAX_ITERS); + setMinPartitionSize(AbstractBPReorderer.DEFAULT_MIN_PARTITION_SIZE); + setMaxIters(AbstractBPReorderer.DEFAULT_MAX_ITERS); // 10% of the available heap size by default setRAMBudgetMB(Runtime.getRuntime().totalMemory() / 1024d / 1024d / 10d); this.partitionField = partitionField; @@ -117,12 +118,17 @@ private static class PerThreadState { } } - private static class DocMap extends Sorter.DocMap { + public static final class DocMap extends Sorter.DocMap { + + public final int[] oldToNew; + public final int[] newToOld; - private final int[] newToOld; - private final int[] oldToNew; + public DocMap(int[] oldToNew, int[] newToOld) { + this.oldToNew = oldToNew; + this.newToOld = newToOld; + } - public DocMap(int[] newToOld) { + DocMap(int[] newToOld) { this.newToOld = newToOld; oldToNew = new int[newToOld.length]; for (int i = 0; i < newToOld.length; ++i) { @@ -146,6 +152,10 @@ public int newToOld(int docID) { } } + public static Sorter.DocMap sortMapOf(int[] oldToNew, int[] newToOld) { + return new DocMap(oldToNew, newToOld); + } + private abstract class BaseRecursiveAction extends RecursiveAction { protected final TaskExecutor executor; @@ -577,6 +587,10 @@ private float computeBias(float[] vector, float[] leftCentroid, float[] rightCen @Override public Sorter.DocMap computeDocMap(CodecReader reader, Directory tempDir, Executor executor) throws IOException { + if (partitionField == null) { + throw new IllegalStateException( + "initialized with null field name; cannot partition using a reader"); + } TaskExecutor taskExecutor; if (executor == null) { taskExecutor = null; @@ -588,12 +602,15 @@ public Sorter.DocMap computeDocMap(CodecReader reader, Directory tempDir, Execut return null; } FloatVectorValues floats = reader.getFloatVectorValues(partitionField); + if (floats == null) { + return null; + } Sorter.DocMap valueMap = computeValueMap(floats, vectorScore, taskExecutor); return valueMapToDocMap(valueMap, floats, reader.maxDoc()); } /** Expert: Compute the {@link 
DocMap} that holds the new vector ordinal numbering. */ - Sorter.DocMap computeValueMap( + public Sorter.DocMap computeValueMap( FloatVectorValues vectors, VectorSimilarityFunction vectorScore, TaskExecutor executor) { if (docRAMRequirements(vectors.size()) >= ramBudgetMB * 1024 * 1024) { throw new NotEnoughRAMException( @@ -611,9 +628,9 @@ Sorter.DocMap computeValueMap( */ private int[] computePermutation( FloatVectorValues vectors, VectorSimilarityFunction vectorScore, TaskExecutor executor) { - final int size = vectors.size(); + int size = vectors.size(); int[] sortedIds = new int[size]; - for (int i = 0; i < size; ++i) { + for (int i = 0; i < size; i++) { sortedIds[i] = i; } try (CloseableThreadLocal threadLocal = @@ -623,7 +640,7 @@ protected PerThreadState initialValue() { return new PerThreadState(vectors); } }) { - IntsRef ids = new IntsRef(sortedIds, 0, sortedIds.length); + IntsRef ids = new IntsRef(sortedIds, 0, size); new ReorderTask(ids, new float[size], threadLocal, executor, 0, vectorScore).compute(); } return sortedIds; @@ -768,23 +785,27 @@ private static Sorter.DocMap valueMapToDocMap( ++docid; ++nextNullDoc; } + return new DocMap(oldToNew, newToOld); + } - return new Sorter.DocMap() { - - @Override - public int size() { - return newToOld.length; - } - - @Override - public int oldToNew(int docID) { - return oldToNew[docID]; - } - - @Override - public int newToOld(int docID) { - return newToOld[docID]; - } - }; + // returns a map where new->old maps ord->doc and + // old->new maps old ords (which are in doc order) to new ords + public static DocMap ordToDocFromValueMap( + Sorter.DocMap valueMap, DocsWithFieldSet docsWithField) throws IOException { + // valueMap maps old/new ords; values maps old docs/old ords + // we want old docs/new ords map. docs with no value map to -1 + int[] newToOld = new int[valueMap.size()]; + int[] oldToNew = new int[valueMap.size()]; + DocIdSetIterator it = docsWithField.iterator(); + for (int ord = 0, nextDoc = it.nextDoc(); + nextDoc != NO_MORE_DOCS; + nextDoc = it.nextDoc(), ord++) { + int newOrd = valueMap.oldToNew(ord); + newToOld[newOrd] = nextDoc; + // this will hold ords in docid order, but it is not doc->ord because there may be gaps in the + // doc sequence + oldToNew[ord] = newOrd; + } + return new DocMap(oldToNew, newToOld); } } diff --git a/lucene/misc/src/java/org/apache/lucene/misc/index/IndexReorderer.java b/lucene/core/src/java/org/apache/lucene/util/IndexReorderer.java similarity index 92% rename from lucene/misc/src/java/org/apache/lucene/misc/index/IndexReorderer.java rename to lucene/core/src/java/org/apache/lucene/util/IndexReorderer.java index 1fdba7a4eb03..694a5ac5285c 100644 --- a/lucene/misc/src/java/org/apache/lucene/misc/index/IndexReorderer.java +++ b/lucene/core/src/java/org/apache/lucene/util/IndexReorderer.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.lucene.misc.index; +package org.apache.lucene.util; import java.io.IOException; import java.util.concurrent.Executor; @@ -22,7 +22,7 @@ import org.apache.lucene.index.Sorter; import org.apache.lucene.store.Directory; -/** Interface for docid-reordering expected by {@link BPReorderingMergePolicy}. */ +/** Interface for docid-reordering expected by BPReorderingMergePolicy. */ public interface IndexReorderer { /** * Returns a mapping from old to new docids. 
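To make the newly public surface of BpVectorReorderer above easier to follow, here is a minimal usage sketch. It is not part of the patch: the class and variable names are invented, and it assumes only the members shown in the diff (the no-arg constructor, computeValueMap, ordToDocFromValueMap, and the public DocMap).

import java.io.IOException;
import java.util.List;
import org.apache.lucene.index.DocsWithFieldSet;
import org.apache.lucene.index.FloatVectorValues;
import org.apache.lucene.index.Sorter;
import org.apache.lucene.index.VectorSimilarityFunction;
import org.apache.lucene.util.BpVectorReorderer;

class BpReorderSketch {
  // Returns a DocMap whose newToOld maps new ordinal -> docid and whose oldToNew maps
  // old ordinal -> new ordinal, mirroring the comment on ordToDocFromValueMap above.
  static BpVectorReorderer.DocMap reorder(List<float[]> vectors, DocsWithFieldSet docsWithField)
      throws IOException {
    BpVectorReorderer reorderer = new BpVectorReorderer(); // no-arg constructor added by this change
    // Permute vector ordinals so that similar vectors end up with nearby ordinals; null = no executor.
    Sorter.DocMap valueMap =
        reorderer.computeValueMap(
            FloatVectorValues.fromFloats(vectors, vectors.get(0).length),
            VectorSimilarityFunction.EUCLIDEAN,
            null);
    // Combine the ordinal permutation with the set of docs that actually carry a vector.
    return BpVectorReorderer.ordToDocFromValueMap(valueMap, docsWithField);
  }
}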
diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/CloseableRandomVectorScorerSupplier.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/CloseableRandomVectorScorerSupplier.java index 8c66147f1a8b..bb515d7e2885 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/CloseableRandomVectorScorerSupplier.java +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/CloseableRandomVectorScorerSupplier.java @@ -18,6 +18,7 @@ package org.apache.lucene.util.hnsw; import java.io.Closeable; +import org.apache.lucene.index.Sorter; /** * A supplier that creates {@link UpdateableRandomVectorScorer} from an ordinal. Caller should be @@ -28,4 +29,14 @@ */ public interface CloseableRandomVectorScorerSupplier extends Closeable, RandomVectorScorerSupplier { int totalVectorCount(); + + /** + * If the vectors were reordered, this encodes the mapping from old (before reordering) node + * ordinals to new (reordered) ones. If no reordering was done, this will be null. Note that the + * RandomVectorScorer(s) returned from this will be based on the old ordinals; the reordering is + * only indicative. + */ + default Sorter.DocMap sortMap() { + return null; + } } diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/MergingHnswGraphBuilder.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/MergingHnswGraphBuilder.java index 08366927b247..cd093b192aaa 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/MergingHnswGraphBuilder.java +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/MergingHnswGraphBuilder.java @@ -20,7 +20,7 @@ import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; import java.io.IOException; -import org.apache.lucene.internal.hppc.IntCursor; +import java.util.Arrays; import org.apache.lucene.internal.hppc.IntHashSet; import org.apache.lucene.util.BitSet; @@ -145,8 +145,10 @@ private void updateGraph(HnswGraph gS, int[] ordMapS) throws IOException { IntHashSet j = UpdateGraphsUtils.computeJoinSet(gS); // for nodes that in the join set, add them directly to the graph - for (IntCursor node : j) { - addGraphNode(ordMapS[node.value]); + int[] nodes = j.toArray(); + Arrays.sort(nodes); + for (int node : nodes) { + addGraphNode(ordMapS[node]); } // for each node outside of j set: diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene104/TestLucene104ScalarQuantizedVectorsFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene104/TestLucene104ScalarQuantizedVectorsFormat.java index 32d5b07ad3e0..8088f01b9774 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene104/TestLucene104ScalarQuantizedVectorsFormat.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene104/TestLucene104ScalarQuantizedVectorsFormat.java @@ -365,7 +365,7 @@ private void updateVectorMetadataFile(Directory dir, String fileName) throws Exc // Write configuration OrdToDocDISIReaderConfiguration.writeStoredMeta( - DIRECT_MONOTONIC_BLOCK_SHIFT, out, null, 0, 0, null); + DIRECT_MONOTONIC_BLOCK_SHIFT, out, null, 0, 0, null, null); // Mark end of fields and write footer out.writeInt(-1); diff --git a/lucene/core/src/test/org/apache/lucene/index/TestSortingCodecReader.java b/lucene/core/src/test/org/apache/lucene/index/TestSortingCodecReader.java index c03afd92be69..71c8386e70d4 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestSortingCodecReader.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestSortingCodecReader.java @@ -28,6 +28,7 @@ import org.apache.lucene.codecs.KnnVectorsReader; import 
org.apache.lucene.codecs.TermVectorsReader; import org.apache.lucene.codecs.hnsw.HnswGraphProvider; +import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat; import org.apache.lucene.document.BinaryDocValuesField; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; @@ -53,6 +54,7 @@ import org.apache.lucene.tests.analysis.MockAnalyzer; import org.apache.lucene.tests.index.RandomIndexWriter; import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.tests.util.TestUtil; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.hnsw.HnswGraph; @@ -222,9 +224,11 @@ public void testSortOnAddIndicesRandom() throws IOException { new Sort( new SortField("sorted_binary_sort_field", SortField.Type.STRING, false), new SortField("alt_id", SortField.Type.INT)))); + IndexWriterConfig cfg = newIndexWriterConfig(); + cfg.setCodec(TestUtil.alwaysKnnVectorsFormat(new Lucene99HnswVectorsFormat(8, 32))); + cfg.setIndexSort(indexSort); try (Directory sortDir = newDirectory()) { - try (IndexWriter writer = - new IndexWriter(sortDir, newIndexWriterConfig().setIndexSort(indexSort))) { + try (IndexWriter writer = new IndexWriter(sortDir, cfg)) { try (DirectoryReader reader = DirectoryReader.open(dir)) { List readers = new ArrayList<>(); for (LeafReaderContext ctx : reader.leaves()) { diff --git a/lucene/core/src/test/org/apache/lucene/search/BaseKnnVectorQueryTestCase.java b/lucene/core/src/test/org/apache/lucene/search/BaseKnnVectorQueryTestCase.java index 0f9a6c84578b..be95bdc8e9d9 100644 --- a/lucene/core/src/test/org/apache/lucene/search/BaseKnnVectorQueryTestCase.java +++ b/lucene/core/src/test/org/apache/lucene/search/BaseKnnVectorQueryTestCase.java @@ -611,8 +611,7 @@ public void testRandomWithFilter() throws IOException { try (Directory d = newDirectoryForTest()) { // Always use the default kNN format to have predictable behavior around when it hits // visitedLimit. This is fine since the test targets AbstractKnnVectorQuery logic, not the kNN - // format - // implementation. + // format implementation. IndexWriterConfig iwc = configStandardCodec(); RandomIndexWriter w = new RandomIndexWriter(random(), d, iwc); for (int i = 0; i < numDocs; i++) { @@ -659,7 +658,9 @@ public void testRandomWithFilter() throws IOException { assertEquals(1, fieldDoc.fields.length); int tag = (int) fieldDoc.fields[0]; - assertTrue(lower <= tag && tag <= numDocs); + assertTrue( + "tag=" + tag + " lower=" + lower + " numDocs=" + numDocs, + lower <= tag && tag <= numDocs); } // Test a filter with cost slightly more than k, and check we use exact search as k // results are not retrieved from approximate search diff --git a/lucene/misc/src/test/org/apache/lucene/misc/index/TestBpVectorReorderer.java b/lucene/core/src/test/org/apache/lucene/util/TestBpVectorReorderer.java similarity index 70% rename from lucene/misc/src/test/org/apache/lucene/misc/index/TestBpVectorReorderer.java rename to lucene/core/src/test/org/apache/lucene/util/TestBpVectorReorderer.java index 653c519729a9..f33272f0b305 100644 --- a/lucene/misc/src/test/org/apache/lucene/misc/index/TestBpVectorReorderer.java +++ b/lucene/core/src/test/org/apache/lucene/util/TestBpVectorReorderer.java @@ -15,20 +15,25 @@ * limitations under the License. 
*/ -package org.apache.lucene.misc.index; +package org.apache.lucene.util; import java.io.IOException; import java.nio.file.Path; import java.util.ArrayList; import java.util.Collections; import java.util.List; +import java.util.Random; import java.util.concurrent.Executor; import java.util.concurrent.ForkJoinPool; import java.util.concurrent.ForkJoinWorkerThread; +import org.apache.lucene.codecs.hnsw.HnswGraphProvider; import org.apache.lucene.codecs.lucene104.Lucene104HnswScalarQuantizedVectorsFormat; +import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat; import org.apache.lucene.document.Document; import org.apache.lucene.document.KnnFloatVectorField; +import org.apache.lucene.document.NumericDocValuesField; import org.apache.lucene.document.StoredField; +import org.apache.lucene.index.CodecReader; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.FloatVectorValues; import org.apache.lucene.index.IndexReader; @@ -36,15 +41,19 @@ import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.LeafReader; +import org.apache.lucene.index.LogDocMergePolicy; +import org.apache.lucene.index.SerialMergeScheduler; import org.apache.lucene.index.Sorter; import org.apache.lucene.index.StoredFields; import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.Sort; +import org.apache.lucene.search.SortField; import org.apache.lucene.search.TaskExecutor; import org.apache.lucene.store.Directory; import org.apache.lucene.tests.util.LuceneTestCase; import org.apache.lucene.tests.util.TestUtil; -import org.apache.lucene.util.VectorUtil; +import org.apache.lucene.util.hnsw.HnswGraph; /** Tests reordering vector values using Binary Partitioning */ public class TestBpVectorReorderer extends LuceneTestCase { @@ -126,6 +135,7 @@ public void testEuclideanLinear() { } public void testQuantizedIndex() throws Exception { + // FIXME test with an executor sometimes doTestQuantizedIndex(null); } @@ -217,6 +227,10 @@ private static List shuffleVectors(List vectors) { private static List randomLinearVectors() { int n = random().nextInt(100) + 10; + return randomLinearVectors(n); + } + + private static List randomLinearVectors(int n) { List vectors = new ArrayList<>(); float b = random().nextFloat(); float m = random().nextFloat(); @@ -310,13 +324,25 @@ && angularDifference(t0min, t0max) < angularDifference(t0min, t1max)) && angularDifference(t1min, t1max) < angularDifference(t1min, t0max))); } + // Disable skipping HNSW graph creation for small segments and pass through reordering flag. 
+ private IndexWriterConfig createIndexWriterConfig(boolean enableReorder) { + IndexWriterConfig cfg = new IndexWriterConfig(); + cfg.setCodec( + TestUtil.alwaysKnnVectorsFormat( + new Lucene99HnswVectorsFormat(8, 32, 1, null, 0, enableReorder))); + cfg.setMergePolicy(new LogDocMergePolicy()); + cfg.setMergeScheduler(new SerialMergeScheduler()); + return cfg; + } + public void testIndexReorderDense() throws Exception { List vectors = shuffleVectors(randomLinearVectors()); Path tmpdir = createTempDir(); try (Directory dir = newFSDirectory(tmpdir)) { // create an index with a single leaf - try (IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig())) { + IndexWriterConfig cfg = createIndexWriterConfig(false); + try (IndexWriter writer = new IndexWriter(dir, cfg)) { int id = 0; for (float[] vector : vectors) { Document doc = new Document(); @@ -384,6 +410,7 @@ public void testIndexReorderDense() throws Exception { } } + // test the reordering utility (BpVectorReorderer.main) public void testIndexReorderSparse() throws Exception { List vectors = shuffleVectors(randomLinearVectors()); // compute the expected ordering @@ -394,7 +421,8 @@ public void testIndexReorderSparse() throws Exception { int maxDoc = 0; try (Directory dir = newFSDirectory(tmpdir)) { // create an index with a single leaf - try (IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig())) { + IndexWriterConfig cfg = createIndexWriterConfig(false); + try (IndexWriter writer = new IndexWriter(dir, cfg)) { for (float[] vector : vectors) { Document doc = new Document(); if (random().nextBoolean()) { @@ -431,6 +459,139 @@ public void testIndexReorderSparse() throws Exception { } } + // Tests of reordering in the codec + public void testReorderDenseCodec() throws Exception { + doTestReorderCodec(false); + } + + public void testReorderSparseCodec() throws Exception { + doTestReorderCodec(true); + } + + private void addDocuments(List vectors, boolean indexIsSparse, long seed, IndexWriter writer) throws IOException { + int id = 0; + boolean didCommit = false; + Random random = new Random(seed); + for (float[] vector : vectors) { + Document doc = new Document(); + doc.add(new KnnFloatVectorField("f", vector, VectorSimilarityFunction.EUCLIDEAN)); + doc.add(new NumericDocValuesField("id", id)); + doc.add(new StoredField("id", id)); + writer.addDocument(doc); + if (indexIsSparse && random.nextBoolean()) { + for (int i = 0; i < random.nextInt(3); i++) { + // insert some gaps -- docs with no vectors + // give them the same numeric id as their "neighbor" so that + // they sort together + Document gapDoc = new Document(); + gapDoc.add(new NumericDocValuesField("id", id)); + writer.addDocument(gapDoc); + } + } + ++id; + if (didCommit == false && id > vectors.size() / 2) { + // make two segments to induce a merge + writer.commit(); + } + } + writer.forceMerge(1); + } + + private void doTestReorderCodec(boolean indexIsSparse) throws Exception { + // must be big enough to trigger some reordering + int numVectors = BpVectorReorderer.DEFAULT_MIN_PARTITION_SIZE * 4 + random().nextInt(32); + List vectors = shuffleVectors(randomLinearVectors(numVectors)); + // use default settings to match codec + reorderer = new BpVectorReorderer(); + + // compute the expected ordering + Sorter.DocMap expected = + reorderer.computeValueMap( + FloatVectorValues.fromFloats(vectors, 2), VectorSimilarityFunction.EUCLIDEAN, null); + + // index without reordering in order to get the expected HNSW graph + // record a seed so we can re-generate the same 
index below + long seed = random().nextLong(); + Path tmpdir = createTempDir(); + List> expectedGraph = new ArrayList<>(); + try (Directory dir = newFSDirectory(tmpdir)) { + try (IndexWriter writer = new IndexWriter(dir, createIndexWriterConfig(false))) { + addDocuments(vectors, indexIsSparse, seed, writer); + } + try (IndexReader reader = DirectoryReader.open(dir)) { + LeafReader leafReader = getOnlyLeafReader(reader); + HnswGraph hnsw = + ((HnswGraphProvider) ((CodecReader) leafReader).getVectorReader()).getGraph("f"); + for (int ord = 0; ord < hnsw.size(); ord++) { + List neighbors = new ArrayList<>(); + hnsw.seek(0, ord); + int nbr; + while ((nbr = hnsw.nextNeighbor()) != DocIdSetIterator.NO_MORE_DOCS) { + neighbors.add(nbr); + } + expectedGraph.add(neighbors); + } + } + } + + tmpdir = createTempDir(); + try (Directory dir = newFSDirectory(tmpdir)) { + IndexWriterConfig cfg = createIndexWriterConfig(true); + // Configure an index sort sometimes + if (random().nextBoolean()) { + cfg.setIndexSort(new Sort(new SortField("id", SortField.Type.INT))); + } + try (IndexWriter writer = new IndexWriter(dir, cfg)) { + addDocuments(vectors, indexIsSparse, seed, writer); + } + + // Verify the ordering produced when merging is the same, + // that the values iterator returns the correct vector values + // and that the ordToDoc mapping is correct + + try (IndexReader reader = DirectoryReader.open(dir)) { + LeafReader leafReader = getOnlyLeafReader(reader); + FloatVectorValues values = leafReader.getFloatVectorValues("f"); + StoredFields storedFields = reader.storedFields(); + KnnVectorValues.DocIndexIterator it = values.iterator(); + while (it.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { + int docId = it.docID(); + int oldOrd = Integer.parseInt(storedFields.document(docId).get("id")); + int newOrd = it.index(); + assertEquals(expected.oldToNew(oldOrd), newOrd); + assertEquals(expected.newToOld(newOrd), oldOrd); + assertEquals(docId, values.ordToDoc(newOrd)); + float[] actualVector = values.vectorValue(newOrd); + float[] expectedVector = vectors.get(oldOrd); + assertArrayEquals( + "values differ at index " + oldOrd + "->" + newOrd + " docid=" + docId, + expectedVector, + actualVector, + 0); + } + // Verify that we produce the same graph, numbered according to the reordering. 
+ HnswGraph hnsw = + ((HnswGraphProvider) ((CodecReader) leafReader).getVectorReader()).getGraph("f"); + assertEquals(expectedGraph.size(), hnsw.size()); + for (int newOrd = 0; newOrd < hnsw.size(); newOrd++) { + hnsw.seek(0, newOrd); + List neighbors = new ArrayList<>(); + int nbr; + while ((nbr = hnsw.nextNeighbor()) != DocIdSetIterator.NO_MORE_DOCS) { + // map neighbor ords back to original ords + neighbors.add(expected.newToOld(nbr)); + } + // we may now get nodes out of order + Collections.sort(neighbors); + assertEquals( + "neighbors of " + newOrd + " (was " + expected.newToOld(newOrd) + ")", + expectedGraph.get(expected.newToOld(newOrd)), + neighbors); + } + } + } + } + static double angularDifference(double a, double b) { return angle2pi(b - a); } diff --git a/lucene/misc/src/java/org/apache/lucene/misc/index/BPIndexReorderer.java b/lucene/misc/src/java/org/apache/lucene/misc/index/BPIndexReorderer.java index 4d2bab216b9b..08afcaed66b4 100644 --- a/lucene/misc/src/java/org/apache/lucene/misc/index/BPIndexReorderer.java +++ b/lucene/misc/src/java/org/apache/lucene/misc/index/BPIndexReorderer.java @@ -48,6 +48,7 @@ import org.apache.lucene.store.IndexOutput; import org.apache.lucene.store.RandomAccessInput; import org.apache.lucene.store.TrackingDirectoryWrapper; +import org.apache.lucene.util.AbstractBPReorderer; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BitSet; import org.apache.lucene.util.BytesRef; diff --git a/lucene/misc/src/java/org/apache/lucene/misc/index/BPReorderingMergePolicy.java b/lucene/misc/src/java/org/apache/lucene/misc/index/BPReorderingMergePolicy.java index 51af1662a860..77e198127d5c 100644 --- a/lucene/misc/src/java/org/apache/lucene/misc/index/BPReorderingMergePolicy.java +++ b/lucene/misc/src/java/org/apache/lucene/misc/index/BPReorderingMergePolicy.java @@ -27,8 +27,9 @@ import org.apache.lucene.index.SegmentCommitInfo; import org.apache.lucene.index.SegmentInfos; import org.apache.lucene.index.Sorter; -import org.apache.lucene.misc.index.AbstractBPReorderer.NotEnoughRAMException; import org.apache.lucene.store.Directory; +import org.apache.lucene.util.AbstractBPReorderer.NotEnoughRAMException; +import org.apache.lucene.util.IndexReorderer; import org.apache.lucene.util.SetOnce; /** diff --git a/lucene/misc/src/test/org/apache/lucene/misc/index/TestBPReorderingMergePolicy.java b/lucene/misc/src/test/org/apache/lucene/misc/index/TestBPReorderingMergePolicy.java index 302ce0d751cc..fa78214596fe 100644 --- a/lucene/misc/src/test/org/apache/lucene/misc/index/TestBPReorderingMergePolicy.java +++ b/lucene/misc/src/test/org/apache/lucene/misc/index/TestBPReorderingMergePolicy.java @@ -35,6 +35,8 @@ import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.store.Directory; import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.util.AbstractBPReorderer; +import org.apache.lucene.util.BpVectorReorderer; import org.apache.lucene.util.IOUtils; import org.junit.Before;
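Finally, a usage sketch of how an application might opt into merge-time vector reordering, mirroring createIndexWriterConfig(true) in TestBpVectorReorderer above. This is not part of the patch; the six-argument Lucene99HnswVectorsFormat constructor (trailing boolean enabling reordering) is assumed to behave as exercised by the test, and the directory, field name, and fifth argument value are illustrative only.

import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.KnnFloatVectorField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.VectorSimilarityFunction;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.tests.util.TestUtil;

class VectorReorderOnMergeSketch {
  public static void main(String[] args) throws Exception {
    IndexWriterConfig cfg = new IndexWriterConfig();
    // Same arguments as the test above: maxConn=8, beamWidth=32, numMergeWorkers=1, no merge
    // executor, 0 copied from the test (its meaning is not shown in this diff), and true to
    // reorder vector ordinals with BpVectorReorderer when segments are merged.
    cfg.setCodec(
        TestUtil.alwaysKnnVectorsFormat(new Lucene99HnswVectorsFormat(8, 32, 1, null, 0, true)));
    try (Directory dir = new ByteBuffersDirectory();
        IndexWriter writer = new IndexWriter(dir, cfg)) {
      Document doc = new Document();
      doc.add(
          new KnnFloatVectorField(
              "f", new float[] {0.1f, 0.2f}, VectorSimilarityFunction.EUCLIDEAN));
      writer.addDocument(doc);
      writer.forceMerge(1); // reordering, if triggered, happens while merging segments
    }
  }
}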