apache · RamakrishnaChilaka · Sep 21, 2025 · Sep 18, 2025 · Sep 20, 2025 · Sep 20, 2025
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
@@ -150,6 +150,8 @@ Optimizations
 * GITHUB#15160: Increased the size used for blocks of postings from 128 to 256.
   This gives a noticeable speedup to many queries. (Adrien Grand)
 
+* GITHUB#15198: Optimize ForUtil.expand8 using the JDK Vector API. (Ramakrishna Chilaka)
+
 * GITHUB#14863: Perform scoring for 4, 7, 8 bit quantized vectors off-heap. (Kaival Parikh)
 
 Bug Fixes

diff --git a/lucene/core/src/generated/checksums/generateForUtil.json b/lucene/core/src/generated/checksums/generateForUtil.json
@@ -1,4 +1,4 @@
 {
-    "lucene/core/src/java/org/apache/lucene/codecs/lucene104/ForUtil.java": "5dda079c68e6060217f29010618c7fd807583056",
-    "lucene/core/src/java/org/apache/lucene/codecs/lucene104/gen_ForUtil.py": "4692fed62d9f79554647c5423b96b9e60c9f30eb"
+    "lucene/core/src/java/org/apache/lucene/codecs/lucene104/ForUtil.java": "bf1168dbc05311c2e49b652391e01cb01d3f9133",
+    "lucene/core/src/java/org/apache/lucene/codecs/lucene104/gen_ForUtil.py": "e87e420e633601f6f751b6777d7c094ebc66c3e7"
 }
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene104/ForUtil.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene104/ForUtil.java
@@ -21,6 +21,7 @@
 import java.io.IOException;
 import org.apache.lucene.internal.vectorization.PostingDecodingUtil;
 import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.util.VectorUtil;
 
 /**
  * Inspired from https://fulmicoton.com/posts/bitpacking/ Encodes multiple integers in one to get
@@ -55,13 +56,7 @@ static int mask8(int bitsPerValue) {
   }
 
   static void expand8(int[] arr) {
-    for (int i = 0; i < 64; ++i) {
-      int l = arr[i];
-      arr[i] = (l >>> 24) & 0xFF;
-      arr[64 + i] = (l >>> 16) & 0xFF;
-      arr[128 + i] = (l >>> 8) & 0xFF;
-      arr[192 + i] = l & 0xFF;
-    }
+    VectorUtil.expand8(arr);
   }
 
   static void collapse8(int[] arr) {

diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene104/gen_ForUtil.py b/lucene/core/src/java/org/apache/lucene/codecs/lucene104/gen_ForUtil.py
@@ -45,6 +45,7 @@
 import java.io.IOException;
 import org.apache.lucene.internal.vectorization.PostingDecodingUtil;
 import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.util.VectorUtil;
 
 /**
  * Inspired from https://fulmicoton.com/posts/bitpacking/
@@ -80,13 +81,7 @@
   }
 
   static void expand8(int[] arr) {
-    for (int i = 0; i < 64; ++i) {
-      int l = arr[i];
-      arr[i] = (l >>> 24) & 0xFF;
-      arr[64 + i] = (l >>> 16) & 0xFF;
-      arr[128 + i] = (l >>> 8) & 0xFF;
-      arr[192 + i] = l & 0xFF;
-    }
+    VectorUtil.expand8(arr);
   }
 
   static void collapse8(int[] arr) {

diff --git a/lucene/core/src/java/org/apache/lucene/internal/vectorization/DefaultVectorUtilSupport.java b/lucene/core/src/java/org/apache/lucene/internal/vectorization/DefaultVectorUtilSupport.java
@@ -415,4 +415,16 @@ public float[] l2normalize(float[] v, boolean throwOnZero) {
     }
     return v;
   }
+
+  @Override
+  public void expand8(int[] arr) {
+    // In place: each of the 64 packed ints becomes 4 byte values, 256 in total (BLOCK_SIZE).
+    for (int i = 0; i < 64; ++i) {
+      int l = arr[i]; // read first: arr[i] itself is overwritten on the next line
+      arr[i] = (l >>> 24) & 0xFF; // most significant byte
+      arr[64 + i] = (l >>> 16) & 0xFF;
+      arr[128 + i] = (l >>> 8) & 0xFF;
+      arr[192 + i] = l & 0xFF; // least significant byte
+    }
+  }
 }
diff --git a/lucene/core/src/java/org/apache/lucene/internal/vectorization/VectorUtilSupport.java b/lucene/core/src/java/org/apache/lucene/internal/vectorization/VectorUtilSupport.java
@@ -144,4 +144,12 @@ float recalculateScalarQuantizationOffset(
   int filterByScore(int[] docBuffer, double[] scoreBuffer, double minScoreInclusive, int upTo);
 
   float[] l2normalize(float[] v, boolean throwOnZero);
+
+  /**
+   * Expands the first 64 ints of {@code arr} in place into 256 values, one per byte of each int.
+   * The original {@code arr[i]} is split, most significant byte first, into slots i, 64 + i,
+   * 128 + i and 192 + i. {@code arr} must therefore hold 256 elements (the block size); that
+   * fixed size is what makes a vectorized implementation beneficial.
+   */
+  void expand8(int[] arr);
 }
diff --git a/lucene/core/src/java/org/apache/lucene/util/VectorUtil.java b/lucene/core/src/java/org/apache/lucene/util/VectorUtil.java
@@ -484,4 +484,8 @@ public static int filterByScore(
     }
     return IMPL.filterByScore(docBuffer, scoreBuffer, minScoreInclusive, upTo);
   }
+
+  public static void expand8(int[] arr) {
+    IMPL.expand8(arr); // arr is rewritten in place by the implementation behind IMPL
+  }
 }
diff --git a/lucene/core/src/java24/org/apache/lucene/internal/vectorization/PanamaVectorUtilSupport.java b/lucene/core/src/java24/org/apache/lucene/internal/vectorization/PanamaVectorUtilSupport.java
@@ -1409,4 +1409,29 @@ private void l2normalizeBody(float[] v, float invNorm, int limit) {
       FloatVector.fromArray(FLOAT_SPECIES, v, i).mul(invNormVector).intoArray(v, i);
     }
   }
+
+  private static final boolean EXPAND_8_VECTOR_OPTIMIZATION = INT_SPECIES.length() >= 4;
+
+  @Override
+  public void expand8(int[] arr) {
+    // In place: the 64 packed ints become 256 byte values (BLOCK_SIZE is 256).
+    if (EXPAND_8_VECTOR_OPTIMIZATION) {
+      for (int i = 0; i < 64; i += INT_SPECIES.length()) { // NOTE(review): lanes must divide 64
+        IntVector v = IntVector.fromArray(INT_SPECIES, arr, i);
+        // v holds the packed ints, so overwriting arr[i..] below does not lose input.
+        v.lanewise(LSHR, 24).intoArray(arr, i); // top byte; >>> 24 already clears the rest
+        v.lanewise(LSHR, 16).and(0xFF).intoArray(arr, 64 + i);
+        v.lanewise(LSHR, 8).and(0xFF).intoArray(arr, 128 + i);
+        v.and(0xFF).intoArray(arr, 192 + i); // lowest byte
+      }
+    } else { // fewer than 4 int lanes: plain scalar loop
+      for (int i = 0; i < 64; ++i) {
+        int l = arr[i]; // read first: arr[i] itself is overwritten on the next line
+        arr[i] = (l >>> 24) & 0xFF;
+        arr[64 + i] = (l >>> 16) & 0xFF;
+        arr[128 + i] = (l >>> 8) & 0xFF;
+        arr[192 + i] = l & 0xFF;
+      }
+    }
+  }
 }