diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 3bf3d34f4826..b5dbf7e3efaa 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -150,6 +150,8 @@ Optimizations * GITHUB#15160: Increased the size used for blocks of postings from 128 to 256. This gives a noticeable speedup to many queries. (Adrien Grand) +* GITHUB#15198: Optimize ForUtil.expand8 using the JDK Vector API. (Ramakrishna Chilaka) + * GITHUB#14863: Perform scoring for 4, 7, 8 bit quantized vectors off-heap. (Kaival Parikh) Bug Fixes diff --git a/lucene/core/src/generated/checksums/generateForUtil.json b/lucene/core/src/generated/checksums/generateForUtil.json index e7970804e68d..424079d09820 100644 --- a/lucene/core/src/generated/checksums/generateForUtil.json +++ b/lucene/core/src/generated/checksums/generateForUtil.json @@ -1,4 +1,4 @@ { - "lucene/core/src/java/org/apache/lucene/codecs/lucene104/ForUtil.java": "5dda079c68e6060217f29010618c7fd807583056", - "lucene/core/src/java/org/apache/lucene/codecs/lucene104/gen_ForUtil.py": "4692fed62d9f79554647c5423b96b9e60c9f30eb" + "lucene/core/src/java/org/apache/lucene/codecs/lucene104/ForUtil.java": "bf1168dbc05311c2e49b652391e01cb01d3f9133", + "lucene/core/src/java/org/apache/lucene/codecs/lucene104/gen_ForUtil.py": "e87e420e633601f6f751b6777d7c094ebc66c3e7" } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene104/ForUtil.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene104/ForUtil.java index 3a2ad7396c70..523680e7d87c 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene104/ForUtil.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene104/ForUtil.java @@ -21,6 +21,7 @@ import java.io.IOException; import org.apache.lucene.internal.vectorization.PostingDecodingUtil; import org.apache.lucene.store.DataOutput; +import org.apache.lucene.util.VectorUtil; /** * Inspired from https://fulmicoton.com/posts/bitpacking/ Encodes multiple integers in one to get @@ -55,13 +56,7 @@ static int mask8(int 
bitsPerValue) { } static void expand8(int[] arr) { - for (int i = 0; i < 64; ++i) { - int l = arr[i]; - arr[i] = (l >>> 24) & 0xFF; - arr[64 + i] = (l >>> 16) & 0xFF; - arr[128 + i] = (l >>> 8) & 0xFF; - arr[192 + i] = l & 0xFF; - } + VectorUtil.expand8(arr); } static void collapse8(int[] arr) { diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene104/gen_ForUtil.py b/lucene/core/src/java/org/apache/lucene/codecs/lucene104/gen_ForUtil.py index eb73e4e4e899..cacec01e0a0d 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene104/gen_ForUtil.py +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene104/gen_ForUtil.py @@ -45,6 +45,7 @@ import java.io.IOException; import org.apache.lucene.internal.vectorization.PostingDecodingUtil; import org.apache.lucene.store.DataOutput; +import org.apache.lucene.util.VectorUtil; /** * Inspired from https://fulmicoton.com/posts/bitpacking/ @@ -80,13 +81,7 @@ } static void expand8(int[] arr) { - for (int i = 0; i < 64; ++i) { - int l = arr[i]; - arr[i] = (l >>> 24) & 0xFF; - arr[64 + i] = (l >>> 16) & 0xFF; - arr[128 + i] = (l >>> 8) & 0xFF; - arr[192 + i] = l & 0xFF; - } + VectorUtil.expand8(arr); } static void collapse8(int[] arr) { diff --git a/lucene/core/src/java/org/apache/lucene/internal/vectorization/DefaultVectorUtilSupport.java b/lucene/core/src/java/org/apache/lucene/internal/vectorization/DefaultVectorUtilSupport.java index 7f08c673a7f1..3ec646288cdd 100644 --- a/lucene/core/src/java/org/apache/lucene/internal/vectorization/DefaultVectorUtilSupport.java +++ b/lucene/core/src/java/org/apache/lucene/internal/vectorization/DefaultVectorUtilSupport.java @@ -415,4 +415,16 @@ public float[] l2normalize(float[] v, boolean throwOnZero) { } return v; } + + @Override + public void expand8(int[] arr) { + // BLOCK_SIZE is 256 + for (int i = 0; i < 64; ++i) { + int l = arr[i]; + arr[i] = (l >>> 24) & 0xFF; + arr[64 + i] = (l >>> 16) & 0xFF; + arr[128 + i] = (l >>> 8) & 0xFF; + arr[192 + i] = l & 0xFF; + } + } 
} diff --git a/lucene/core/src/java/org/apache/lucene/internal/vectorization/VectorUtilSupport.java b/lucene/core/src/java/org/apache/lucene/internal/vectorization/VectorUtilSupport.java index 7242a2501a19..f92a0b653caa 100644 --- a/lucene/core/src/java/org/apache/lucene/internal/vectorization/VectorUtilSupport.java +++ b/lucene/core/src/java/org/apache/lucene/internal/vectorization/VectorUtilSupport.java @@ -144,4 +144,12 @@ float recalculateScalarQuantizationOffset( int filterByScore(int[] docBuffer, double[] scoreBuffer, double minScoreInclusive, int upTo); float[] l2normalize(float[] v, boolean throwOnZero); + + /** + * Unpacks, in place, the four bytes of each of the first 64 ints of {@code arr} into 256 ints. + * For each {@code i} in [0, 64), bits 31-24 of {@code arr[i]} stay at index {@code i}, and bits + * 23-16, 15-8 and 7-0 move to indices {@code 64 + i}, {@code 128 + i} and {@code 192 + i}. + * {@code arr} must hold at least 256 ints. Vectorization helps because the block size is 256. + */ + void expand8(int[] arr); } diff --git a/lucene/core/src/java/org/apache/lucene/util/VectorUtil.java b/lucene/core/src/java/org/apache/lucene/util/VectorUtil.java index db1f6fee083b..791521c63551 100644 --- a/lucene/core/src/java/org/apache/lucene/util/VectorUtil.java +++ b/lucene/core/src/java/org/apache/lucene/util/VectorUtil.java @@ -484,4 +484,8 @@ public static int filterByScore( } return IMPL.filterByScore(docBuffer, scoreBuffer, minScoreInclusive, upTo); } + + public static void expand8(int[] arr) { + IMPL.expand8(arr); + } } diff --git a/lucene/core/src/java24/org/apache/lucene/internal/vectorization/PanamaVectorUtilSupport.java b/lucene/core/src/java24/org/apache/lucene/internal/vectorization/PanamaVectorUtilSupport.java index ba612f750040..d2e104f92f70 100644 --- a/lucene/core/src/java24/org/apache/lucene/internal/vectorization/PanamaVectorUtilSupport.java +++ b/lucene/core/src/java24/org/apache/lucene/internal/vectorization/PanamaVectorUtilSupport.java @@ -1409,4 +1409,29 @@ private void
l2normalizeBody(float[] v, float invNorm, int limit) { FloatVector.fromArray(FLOAT_SPECIES, v, i).mul(invNormVector).intoArray(v, i); } } + + private static final boolean EXPAND_8_VECTOR_OPTIMIZATION = INT_SPECIES.length() >= 4; + + @Override + public void expand8(int[] arr) { + // BLOCK_SIZE is 256 + if (EXPAND_8_VECTOR_OPTIMIZATION) { + for (int i = 0; i < 64; i += INT_SPECIES.length()) { + IntVector v = IntVector.fromArray(INT_SPECIES, arr, i); + + v.lanewise(LSHR, 24).intoArray(arr, i); + v.lanewise(LSHR, 16).and(0xFF).intoArray(arr, 64 + i); + v.lanewise(LSHR, 8).and(0xFF).intoArray(arr, 128 + i); + v.and(0xFF).intoArray(arr, 192 + i); + } + } else { + for (int i = 0; i < 64; ++i) { + int l = arr[i]; + arr[i] = (l >>> 24) & 0xFF; + arr[64 + i] = (l >>> 16) & 0xFF; + arr[128 + i] = (l >>> 8) & 0xFF; + arr[192 + i] = l & 0xFF; + } + } + } }