From 7577d1e024d4ecff40a6877222317ca555c7dae5 Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Wed, 29 Oct 2025 15:15:30 +0000 Subject: [PATCH 01/86] Add jvector-4.0.0-rc.5 dependency --- lucene/licenses/agrona-1.20.0.jar.sha1 | 1 + lucene/licenses/agrona-LICENSE-ASL.txt | 201 ++++++++ lucene/licenses/agrona-NOTICE.txt | 6 + lucene/licenses/commons-math3-3.6.1.jar.sha1 | 1 + lucene/licenses/commons-math3-LICENSE-ASL.txt | 456 ++++++++++++++++++ lucene/licenses/commons-math3-NOTICE.txt | 4 + lucene/licenses/jvector-4.0.0-rc.5.jar.sha1 | 1 + lucene/licenses/jvector-LICENSE-ASL.txt | 202 ++++++++ lucene/licenses/jvector-NOTICE.txt | 6 + lucene/licenses/snakeyaml-2.4.jar.sha1 | 1 + lucene/licenses/snakeyaml-LICENSE-ASL.txt | 176 +++++++ lucene/licenses/snakeyaml-NOTICE.txt | 4 + lucene/sandbox/build.gradle | 13 + versions.lock | 274 ++++++++++- 14 files changed, 1342 insertions(+), 4 deletions(-) create mode 100644 lucene/licenses/agrona-1.20.0.jar.sha1 create mode 100644 lucene/licenses/agrona-LICENSE-ASL.txt create mode 100644 lucene/licenses/agrona-NOTICE.txt create mode 100644 lucene/licenses/commons-math3-3.6.1.jar.sha1 create mode 100644 lucene/licenses/commons-math3-LICENSE-ASL.txt create mode 100644 lucene/licenses/commons-math3-NOTICE.txt create mode 100644 lucene/licenses/jvector-4.0.0-rc.5.jar.sha1 create mode 100644 lucene/licenses/jvector-LICENSE-ASL.txt create mode 100644 lucene/licenses/jvector-NOTICE.txt create mode 100644 lucene/licenses/snakeyaml-2.4.jar.sha1 create mode 100644 lucene/licenses/snakeyaml-LICENSE-ASL.txt create mode 100644 lucene/licenses/snakeyaml-NOTICE.txt diff --git a/lucene/licenses/agrona-1.20.0.jar.sha1 b/lucene/licenses/agrona-1.20.0.jar.sha1 new file mode 100644 index 000000000000..badef8d6e169 --- /dev/null +++ b/lucene/licenses/agrona-1.20.0.jar.sha1 @@ -0,0 +1 @@ +00580b67864f7739bf7778162f418ada69fa3037 diff --git a/lucene/licenses/agrona-LICENSE-ASL.txt b/lucene/licenses/agrona-LICENSE-ASL.txt new file mode 100644 index 000000000000..91d486281cdf --- /dev/null +++ b/lucene/licenses/agrona-LICENSE-ASL.txt @@ -0,0 +1,201 @@ +Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "{}" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright {yyyy} {name of copyright owner} + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/lucene/licenses/agrona-NOTICE.txt b/lucene/licenses/agrona-NOTICE.txt new file mode 100644 index 000000000000..795926439ada --- /dev/null +++ b/lucene/licenses/agrona-NOTICE.txt @@ -0,0 +1,6 @@ +This product includes software developed by the Agrona project. +https://github.com/real-logic/agrona + +Copyright © 2014-2023 Real Logic Limited + +Licensed under the Apache License, Version 2.0. 
diff --git a/lucene/licenses/commons-math3-3.6.1.jar.sha1 b/lucene/licenses/commons-math3-3.6.1.jar.sha1 new file mode 100644 index 000000000000..ed9a549757f5 --- /dev/null +++ b/lucene/licenses/commons-math3-3.6.1.jar.sha1 @@ -0,0 +1 @@ +e4ba98f1d4b3c80ec46392f25e094a6a2e58fcbf diff --git a/lucene/licenses/commons-math3-LICENSE-ASL.txt b/lucene/licenses/commons-math3-LICENSE-ASL.txt new file mode 100644 index 000000000000..a08b1c749765 --- /dev/null +++ b/lucene/licenses/commons-math3-LICENSE-ASL.txt @@ -0,0 +1,456 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. 
The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + +Apache Commons Math includes the following code provided to the ASF under the +Apache License 2.0: + + - The inverse error function implementation in the Erf class is based on CUDA + code developed by Mike Giles, Oxford-Man Institute of Quantitative Finance, + and published in GPU Computing Gems, volume 2, 2010 (grant received on + March 23th 2013) + - The LinearConstraint, LinearObjectiveFunction, LinearOptimizer, + RelationShip, SimplexSolver and SimplexTableau classes in package + org.apache.commons.math3.optimization.linear include software developed by + Benjamin McCann (http://www.benmccann.com) and distributed with + the following copyright: Copyright 2009 Google Inc. (grant received on + March 16th 2009) + - The class "org.apache.commons.math3.exception.util.LocalizedFormatsTest" which + is an adapted version of "OrekitMessagesTest" test class for the Orekit library + - The "org.apache.commons.math3.analysis.interpolation.HermiteInterpolator" + has been imported from the Orekit space flight dynamics library. + +=============================================================================== + + + +APACHE COMMONS MATH DERIVATIVE WORKS: + +The Apache commons-math library includes a number of subcomponents +whose implementation is derived from original sources written +in C or Fortran. License terms of the original sources +are reproduced below. + +=============================================================================== +For the lmder, lmpar and qrsolv Fortran routine from minpack and translated in +the LevenbergMarquardtOptimizer class in package +org.apache.commons.math3.optimization.general +Original source copyright and license statement: + +Minpack Copyright Notice (1999) University of Chicago. All rights reserved + +Redistribution and use in source and binary forms, with or +without modification, are permitted provided that the +following conditions are met: + +1. Redistributions of source code must retain the above +copyright notice, this list of conditions and the following +disclaimer. + +2. Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following +disclaimer in the documentation and/or other materials +provided with the distribution. + +3. 
The end-user documentation included with the +redistribution, if any, must include the following +acknowledgment: + + "This product includes software developed by the + University of Chicago, as Operator of Argonne National + Laboratory. + +Alternately, this acknowledgment may appear in the software +itself, if and wherever such third-party acknowledgments +normally appear. + +4. WARRANTY DISCLAIMER. THE SOFTWARE IS SUPPLIED "AS IS" +WITHOUT WARRANTY OF ANY KIND. THE COPYRIGHT HOLDER, THE +UNITED STATES, THE UNITED STATES DEPARTMENT OF ENERGY, AND +THEIR EMPLOYEES: (1) DISCLAIM ANY WARRANTIES, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO ANY IMPLIED WARRANTIES +OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE +OR NON-INFRINGEMENT, (2) DO NOT ASSUME ANY LEGAL LIABILITY +OR RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR +USEFULNESS OF THE SOFTWARE, (3) DO NOT REPRESENT THAT USE OF +THE SOFTWARE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS, (4) +DO NOT WARRANT THAT THE SOFTWARE WILL FUNCTION +UNINTERRUPTED, THAT IT IS ERROR-FREE OR THAT ANY ERRORS WILL +BE CORRECTED. + +5. LIMITATION OF LIABILITY. IN NO EVENT WILL THE COPYRIGHT +HOLDER, THE UNITED STATES, THE UNITED STATES DEPARTMENT OF +ENERGY, OR THEIR EMPLOYEES: BE LIABLE FOR ANY INDIRECT, +INCIDENTAL, CONSEQUENTIAL, SPECIAL OR PUNITIVE DAMAGES OF +ANY KIND OR NATURE, INCLUDING BUT NOT LIMITED TO LOSS OF +PROFITS OR LOSS OF DATA, FOR ANY REASON WHATSOEVER, WHETHER +SUCH LIABILITY IS ASSERTED ON THE BASIS OF CONTRACT, TORT +(INCLUDING NEGLIGENCE OR STRICT LIABILITY), OR OTHERWISE, +EVEN IF ANY OF SAID PARTIES HAS BEEN WARNED OF THE +POSSIBILITY OF SUCH LOSS OR DAMAGES. +=============================================================================== + +Copyright and license statement for the odex Fortran routine developed by +E. Hairer and G. Wanner and translated in GraggBulirschStoerIntegrator class +in package org.apache.commons.math3.ode.nonstiff: + + +Copyright (c) 2004, Ernst Hairer + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +- Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+=============================================================================== + +Copyright and license statement for the original Mersenne twister C +routines translated in MersenneTwister class in package +org.apache.commons.math3.random: + + Copyright (C) 1997 - 2002, Makoto Matsumoto and Takuji Nishimura, + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + 3. The names of its contributors may not be used to endorse or promote + products derived from this software without specific prior written + permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +=============================================================================== + +The initial code for shuffling an array (originally in class +"org.apache.commons.math3.random.RandomDataGenerator", now replaced by +a method in class "org.apache.commons.math3.util.MathArrays") was +inspired from the algorithm description provided in +"Algorithms", by Ian Craw and John Pulham (University of Aberdeen 1999). +The textbook (containing a proof that the shuffle is uniformly random) is +available here: + http://citeseerx.ist.psu.edu/viewdoc/download;?doi=10.1.1.173.1898&rep=rep1&type=pdf + +=============================================================================== +License statement for the direction numbers in the resource files for Sobol sequences. + +----------------------------------------------------------------------------- +Licence pertaining to sobol.cc and the accompanying sets of direction numbers + +----------------------------------------------------------------------------- +Copyright (c) 2008, Frances Y. Kuo and Stephen Joe +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ + * Neither the names of the copyright holders nor the names of the + University of New South Wales and the University of Waikato + and its contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +=============================================================================== + +The initial commit of package "org.apache.commons.math3.ml.neuralnet" is +an adapted version of code developed in the context of the Data Processing +and Analysis Consortium (DPAC) of the "Gaia" project of the European Space +Agency (ESA). +=============================================================================== + +The initial commit of the class "org.apache.commons.math3.special.BesselJ" is +an adapted version of code translated from the netlib Fortran program, rjbesl +http://www.netlib.org/specfun/rjbesl by R.J. Cody at Argonne National +Laboratory (USA). There is no license or copyright statement included with the +original Fortran sources. +=============================================================================== + + +The BracketFinder (package org.apache.commons.math3.optimization.univariate) +and PowellOptimizer (package org.apache.commons.math3.optimization.general) +classes are based on the Python code in module "optimize.py" (version 0.5) +developed by Travis E. Oliphant for the SciPy library (http://www.scipy.org/) +Copyright © 2003-2009 SciPy Developers. + +SciPy license +Copyright © 2001, 2002 Enthought, Inc. +All rights reserved. + +Copyright © 2003-2013 SciPy Developers. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Enthought nor the names of the SciPy Developers may + be used to endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS” AND ANY +EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +=============================================================================== diff --git a/lucene/licenses/commons-math3-NOTICE.txt b/lucene/licenses/commons-math3-NOTICE.txt new file mode 100644 index 000000000000..5e2a2f91d48a --- /dev/null +++ b/lucene/licenses/commons-math3-NOTICE.txt @@ -0,0 +1,4 @@ +This product includes software developed by the Apache Commons Math project. +https://commons.apache.org/proper/commons-math/ + +Licensed under the Apache License, Version 2.0. diff --git a/lucene/licenses/jvector-4.0.0-rc.5.jar.sha1 b/lucene/licenses/jvector-4.0.0-rc.5.jar.sha1 new file mode 100644 index 000000000000..ae9459b0c93d --- /dev/null +++ b/lucene/licenses/jvector-4.0.0-rc.5.jar.sha1 @@ -0,0 +1 @@ +799740d5484d589c579ba0b9a65ec887ec542123 diff --git a/lucene/licenses/jvector-LICENSE-ASL.txt b/lucene/licenses/jvector-LICENSE-ASL.txt new file mode 100644 index 000000000000..d64569567334 --- /dev/null +++ b/lucene/licenses/jvector-LICENSE-ASL.txt @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. 
For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/lucene/licenses/jvector-NOTICE.txt b/lucene/licenses/jvector-NOTICE.txt new file mode 100644 index 000000000000..0542e27d7ef7 --- /dev/null +++ b/lucene/licenses/jvector-NOTICE.txt @@ -0,0 +1,6 @@ +This product includes software developed by the JVector project. +https://github.com/jbellis/jvector + +Copyright © 2023 Jonathan Ellis + +Licensed under the Apache License, Version 2.0. 
diff --git a/lucene/licenses/snakeyaml-2.4.jar.sha1 b/lucene/licenses/snakeyaml-2.4.jar.sha1 new file mode 100644 index 000000000000..8739f8c17629 --- /dev/null +++ b/lucene/licenses/snakeyaml-2.4.jar.sha1 @@ -0,0 +1 @@ +e0666b825b796f85521f02360e77f4c92c5a7a07 diff --git a/lucene/licenses/snakeyaml-LICENSE-ASL.txt b/lucene/licenses/snakeyaml-LICENSE-ASL.txt new file mode 100644 index 000000000000..d9a10c0d8e86 --- /dev/null +++ b/lucene/licenses/snakeyaml-LICENSE-ASL.txt @@ -0,0 +1,176 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." 
+ + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS diff --git a/lucene/licenses/snakeyaml-NOTICE.txt b/lucene/licenses/snakeyaml-NOTICE.txt new file mode 100644 index 000000000000..c1e6931cc149 --- /dev/null +++ b/lucene/licenses/snakeyaml-NOTICE.txt @@ -0,0 +1,4 @@ +This product includes software developed by the SnakeYAML project. +https://bitbucket.org/snakeyaml/snakeyaml + +Licensed under the Apache License, Version 2.0. 
diff --git a/lucene/sandbox/build.gradle b/lucene/sandbox/build.gradle index daf952f84a8d..6040c651f887 100644 --- a/lucene/sandbox/build.gradle +++ b/lucene/sandbox/build.gradle @@ -16,12 +16,25 @@ */ +plugins { + id 'java-library' +} description = 'Various third party contributions and new ideas' +java { + modularity.inferModulePath = true +} + dependencies { moduleApi project(':lucene:core') moduleApi project(':lucene:queries') moduleApi project(':lucene:facet') moduleTestImplementation project(':lucene:test-framework') + + moduleImplementation('io.github.jbellis:jvector:4.0.0-rc.5') { + exclude group: 'org.slf4j', module: 'slf4j-api' + } + + moduleImplementation 'org.slf4j:slf4j-api:2.0.17' } diff --git a/versions.lock b/versions.lock index ba7fa170cddf..8dfeb4db3f8e 100644 --- a/versions.lock +++ b/versions.lock @@ -6,14 +6,16 @@ "com.ibm.icu:icu4j:78.1" : "47ea4550,refs=6", "commons-codec:commons-codec:1.20.0" : "e6288df0,refs=6", "commons-io:commons-io:2.20.0" : "5ce8cdc6,refs=2", + "io.github.jbellis:jvector:4.0.0-rc.5" : "9f877bb0,refs=7", "io.sgr:s2-geometry-library-java:1.0.0" : "cbc357ab,refs=4", "junit:junit:4.13.2" : "fa9ef26b,refs=4", "net.sf.jopt-simple:jopt-simple:5.0.4" : "85a1e4c6,refs=2", "net.sourceforge.nekohtml:nekohtml:1.9.22" : "5ce8cdc6,refs=2", + "org.agrona:agrona:1.20.0" : "9f877bb0,refs=7", "org.antlr:antlr4-runtime:4.13.2" : "d9953130,refs=4", "org.apache.commons:commons-compress:1.28.0" : "5ce8cdc6,refs=2", "org.apache.commons:commons-lang3:3.18.0" : "5ce8cdc6,refs=2", - "org.apache.commons:commons-math3:3.6.1" : "85a1e4c6,refs=2", + "org.apache.commons:commons-math3:3.6.1" : "dd26014b,refs=8", "org.apache.opennlp:opennlp-tools:2.5.6.1" : "2f760bab,refs=4", "org.carrot2:morfologik-fsa:2.1.9" : "79af844b,refs=4", "org.carrot2:morfologik-polish:2.1.9" : "fe494320,refs=3", @@ -21,7 +23,8 @@ "org.hamcrest:hamcrest:3.0" : "fa9ef26b,refs=4", "org.locationtech.spatial4j:spatial4j:0.8" : "cbc357ab,refs=4", "org.openjdk.jmh:jmh-core:1.37" : "85a1e4c6,refs=2", - "org.slf4j:slf4j-api:2.0.17" : "2f760bab,refs=4", + "org.slf4j:slf4j-api:2.0.17" : "07f0efc6,refs=10", + "org.yaml:snakeyaml:2.4" : "9f877bb0,refs=7", "ua.net.nlp:morfologik-ukrainian-search:4.9.1" : "fe494320,refs=3", "xerces:xercesImpl:2.12.2" : "5ce8cdc6,refs=2" }, @@ -48,16 +51,18 @@ "commons-io:commons-io:2.20.0" : "6f16ff86,refs=2", "io.github.eisop:dataflow-errorprone:3.41.0-eisop1" : "90685606,refs=39", "io.github.java-diff-utils:java-diff-utils:4.12" : "90685606,refs=39", + "io.github.jbellis:jvector:4.0.0-rc.5" : "43dd284b,refs=10", "io.sgr:s2-geometry-library-java:1.0.0" : "1d5a4b2b,refs=4", "javax.inject:javax.inject:1" : "90685606,refs=39", "junit:junit:4.13.2" : "129da9bf,refs=76", "net.bytebuddy:byte-buddy:1.17.7" : "b7ba1646,refs=2", "net.sf.jopt-simple:jopt-simple:5.0.4" : "152d9f78,refs=3", "net.sourceforge.nekohtml:nekohtml:1.9.22" : "6f16ff86,refs=2", + "org.agrona:agrona:1.20.0" : "43dd284b,refs=10", "org.antlr:antlr4-runtime:4.13.2" : "6fbc4021,refs=5", "org.apache.commons:commons-compress:1.28.0" : "6f16ff86,refs=2", "org.apache.commons:commons-lang3:3.18.0" : "6f16ff86,refs=2", - "org.apache.commons:commons-math3:3.6.1" : "152d9f78,refs=3", + "org.apache.commons:commons-math3:3.6.1" : "f0656784,refs=12", "org.apache.opennlp:opennlp-tools:2.5.6.1" : "b91715f0,refs=6", "org.assertj:assertj-core:3.27.6" : "b7ba1646,refs=2", "org.carrot2:morfologik-fsa:2.1.9" : "e077a675,refs=8", @@ -71,12 +76,55 @@ "org.openjdk.jmh:jmh-core:1.37" : "152d9f78,refs=3", 
"org.openjdk.jmh:jmh-generator-annprocess:1.37" : "ecaf1d73,refs=1", "org.pcollections:pcollections:4.0.1" : "90685606,refs=39", - "org.slf4j:slf4j-api:2.0.17" : "b91715f0,refs=6", + "org.slf4j:slf4j-api:2.0.17" : "736bb8da,refs=15", + "org.yaml:snakeyaml:2.4" : "43dd284b,refs=10", "ua.net.nlp:morfologik-ukrainian-search:4.9.1" : "cb00cecf,refs=5", "xerces:xercesImpl:2.12.2" : "6f16ff86,refs=2" } }, "because" : { + "07f0efc6" : [ + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:benchmark" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:benchmark-jmh" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:demo" + }, + { + "configuration" : "compileClasspath", + "projectPath" : ":lucene:luke" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:luke" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:queryparser" + }, + { + "configuration" : "compileClasspath", + "projectPath" : ":lucene:sandbox" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:sandbox" + }, + { + "configuration" : "compileClasspath", + "projectPath" : ":lucene:analysis:opennlp" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:analysis:opennlp" + } + ], "129da9bf" : [ { "configuration" : "testCompileClasspath", @@ -443,6 +491,48 @@ "projectPath" : ":lucene:analysis:opennlp" } ], + "43dd284b" : [ + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:benchmark" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:benchmark-jmh" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:demo" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:highlighter" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:luke" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:memory" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:monitor" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:queryparser" + }, + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:sandbox" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:sandbox" + } + ], "47ea4550" : [ { "configuration" : "compileClasspath", @@ -511,6 +601,68 @@ "projectPath" : ":lucene:queries" } ], + "736bb8da" : [ + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:analysis.tests" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:analysis.tests" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:benchmark" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:benchmark-jmh" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:demo" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:highlighter" + }, + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:luke" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:luke" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:memory" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:monitor" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:queryparser" + }, + { + "configuration" : "testCompileClasspath", + "projectPath" : 
":lucene:sandbox" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:sandbox" + }, + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:analysis:opennlp" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:analysis:opennlp" + } + ], "79af844b" : [ { "configuration" : "compileClasspath", @@ -731,6 +883,36 @@ "projectPath" : ":lucene:analysis:phonetic" } ], + "9f877bb0" : [ + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:benchmark" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:benchmark-jmh" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:demo" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:luke" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:queryparser" + }, + { + "configuration" : "compileClasspath", + "projectPath" : ":lucene:sandbox" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:sandbox" + } + ], "b7ba1646" : [ { "configuration" : "testCompileClasspath", @@ -825,6 +1007,40 @@ "projectPath" : ":lucene:expressions" } ], + "dd26014b" : [ + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:benchmark" + }, + { + "configuration" : "compileClasspath", + "projectPath" : ":lucene:benchmark-jmh" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:benchmark-jmh" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:demo" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:luke" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:queryparser" + }, + { + "configuration" : "compileClasspath", + "projectPath" : ":lucene:sandbox" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:sandbox" + } + ], "e077a675" : [ { "configuration" : "testCompileClasspath", @@ -891,6 +1107,56 @@ "projectPath" : ":lucene:benchmark-jmh" } ], + "f0656784" : [ + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:benchmark" + }, + { + "configuration" : "annotationProcessor", + "projectPath" : ":lucene:benchmark-jmh" + }, + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:benchmark-jmh" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:benchmark-jmh" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:demo" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:highlighter" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:luke" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:memory" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:monitor" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:queryparser" + }, + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:sandbox" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:sandbox" + } + ], "fa9ef26b" : [ { "configuration" : "compileClasspath", From 7843b82f6cdc1f3075a2ff5f489039f20c2c7c0a Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Wed, 29 Oct 2025 15:19:11 +0000 Subject: [PATCH 02/86] [build-fails] Checkout opensearch jvector codec --- lucene/sandbox/src/java/module-info.java | 6 +- .../jvector/ForceMergesOnlyMergePolicy.java | 99 ++ .../codecs/jvector/GraphNodeIdToDocMap.java | 150 ++ 
.../jvector/JVectorFloatVectorValues.java | 121 ++ .../sandbox/codecs/jvector/JVectorFormat.java | 196 +++ .../codecs/jvector/JVectorIndexWriter.java | 105 ++ .../codecs/jvector/JVectorKnnCollector.java | 67 + .../jvector/JVectorKnnFloatVectorQuery.java | 83 + .../jvector/JVectorRandomAccessReader.java | 174 ++ .../sandbox/codecs/jvector/JVectorReader.java | 382 ++++ .../codecs/jvector/JVectorVectorScorer.java | 38 + .../sandbox/codecs/jvector/JVectorWriter.java | 1097 ++++++++++++ .../sandbox/codecs/jvector/package-info.java | 23 + .../org.apache.lucene.codecs.KnnVectorsFormat | 1 + .../codecs/jvector/KNNJVectorTests.java | 1557 +++++++++++++++++ 15 files changed, 4098 insertions(+), 1 deletion(-) create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/ForceMergesOnlyMergePolicy.java create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/GraphNodeIdToDocMap.java create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFloatVectorValues.java create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFormat.java create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorIndexWriter.java create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorKnnCollector.java create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorKnnFloatVectorQuery.java create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorRandomAccessReader.java create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorVectorScorer.java create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/package-info.java create mode 100644 lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/jvector/KNNJVectorTests.java diff --git a/lucene/sandbox/src/java/module-info.java b/lucene/sandbox/src/java/module-info.java index ee9be3227de2..ea49d9e2b26a 100644 --- a/lucene/sandbox/src/java/module-info.java +++ b/lucene/sandbox/src/java/module-info.java @@ -16,13 +16,16 @@ */ /** Various third party contributions and new ideas */ +@SuppressWarnings("requires-automatic") module org.apache.lucene.sandbox { requires org.apache.lucene.core; requires org.apache.lucene.queries; requires org.apache.lucene.facet; + requires jvector; exports org.apache.lucene.payloads; exports org.apache.lucene.sandbox.codecs.faiss; + exports org.apache.lucene.sandbox.codecs.jvector; exports org.apache.lucene.sandbox.codecs.idversion; exports org.apache.lucene.sandbox.codecs.quantization; exports org.apache.lucene.sandbox.document; @@ -41,5 +44,6 @@ provides org.apache.lucene.codecs.PostingsFormat with org.apache.lucene.sandbox.codecs.idversion.IDVersionPostingsFormat; provides org.apache.lucene.codecs.KnnVectorsFormat with - org.apache.lucene.sandbox.codecs.faiss.FaissKnnVectorsFormat; + org.apache.lucene.sandbox.codecs.faiss.FaissKnnVectorsFormat, + org.apache.lucene.sandbox.codecs.jvector.JVectorFormat; } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/ForceMergesOnlyMergePolicy.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/ForceMergesOnlyMergePolicy.java new file mode 100644 index 
000000000000..8357a5fcdb46
--- /dev/null
+++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/ForceMergesOnlyMergePolicy.java
@@ -0,0 +1,99 @@
+/*
+ * Copyright OpenSearch Contributors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+package org.opensearch.knn.index.codec.jvector;
+
+import org.apache.lucene.index.MergePolicy;
+import org.apache.lucene.index.MergeTrigger;
+import org.apache.lucene.index.SegmentCommitInfo;
+import org.apache.lucene.index.SegmentInfos;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * A merge policy that only merges segments if they are forced.
+ * This is useful for testing and benchmarking purposes. Since it can be used for benchmarks, it is placed in the common
+ * codec module.
+ */
+public class ForceMergesOnlyMergePolicy extends MergePolicy {
+    private final boolean useCompoundFile;
+
+    public ForceMergesOnlyMergePolicy() {
+        this(false);
+    }
+
+    public ForceMergesOnlyMergePolicy(boolean useCompoundFile) {
+        super();
+        this.useCompoundFile = useCompoundFile;
+    }
+
+    @Override
+    public MergeSpecification findMerges(MergeTrigger mergeTrigger, SegmentInfos segmentInfos, MergeContext mergeContext)
+        throws IOException {
+        return null;
+    }
+
+    @Override
+    public MergeSpecification findForcedMerges(
+        SegmentInfos segmentInfos,
+        int maxSegmentCount,
+        Map<SegmentCommitInfo, Boolean> segmentsToMerge,
+        MergeContext mergeContext
+    ) throws IOException {
+        // If the segments are already merged (e.g. there's only 1 segment), or
+        // there are no segments eligible for a forced merge, there is nothing to do
+        if (isMerged(segmentInfos, maxSegmentCount, segmentsToMerge, mergeContext)) {
+            return null;
+        }
+
+        final List<SegmentCommitInfo> segments = segmentInfos.asList();
+        MergeSpecification spec = new MergeSpecification();
+
+        final OneMerge merge = new OneMerge(segments);
+        spec.add(merge);
+        return spec;
+    }
+
+    @Override
+    public boolean useCompoundFile(SegmentInfos segmentInfos, SegmentCommitInfo newSegment, MergeContext mergeContext) throws IOException {
+        return useCompoundFile;
+    }
+
+    @Override
+    public MergeSpecification findForcedDeletesMerges(SegmentInfos segmentInfos, MergeContext mergeContext) throws IOException {
+        return null;
+    }
+
+    /**
+     * Returns true if the number of segments eligible for merging is less than or equal to the
+     * specified {@code maxNumSegments}.
+ */ + protected boolean isMerged( + SegmentInfos infos, + int maxNumSegments, + Map segmentsToMerge, + MergeContext mergeContext + ) throws IOException { + final int numSegments = infos.size(); + int numToMerge = 0; + SegmentCommitInfo mergeInfo = null; + boolean segmentIsOriginal = false; + for (int i = 0; i < numSegments && numToMerge <= maxNumSegments; i++) { + final SegmentCommitInfo info = infos.info(i); + final Boolean isOriginal = segmentsToMerge.get(info); + if (isOriginal != null) { + segmentIsOriginal = isOriginal; + numToMerge++; + mergeInfo = info; + } + } + + return numToMerge <= maxNumSegments && (numToMerge != 1 || !segmentIsOriginal || isMerged(infos, mergeInfo, mergeContext)); + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/GraphNodeIdToDocMap.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/GraphNodeIdToDocMap.java new file mode 100644 index 000000000000..7fff91e12062 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/GraphNodeIdToDocMap.java @@ -0,0 +1,150 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.knn.index.codec.jvector; + +import lombok.extern.log4j.Log4j2; +import org.apache.lucene.index.Sorter; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; + +import java.io.IOException; +import java.util.Arrays; + +/** + * This class represents the mapping from the Lucene document IDs to the jVector ordinals. + * This mapping is necessary because the jVector ordinals can be different from the Lucene document IDs and when lucene documentIDs change after a merge, + * we need to update this mapping to reflect the new document IDs. + * This requires us to know the previous mapping from the previous merge and the new mapping from the current merge. + *
+ * Which means that we also need to persist this mapping to disk to be available across merges. + */ +@Log4j2 +public class GraphNodeIdToDocMap { + private static final int VERSION = 1; + private int[] graphNodeIdsToDocIds; + private int[] docIdsToGraphNodeIds; + + /** + * Constructor that reads the mapping from the index input + * + * @param in The index input + * @throws IOException if an I/O error occurs + */ + public GraphNodeIdToDocMap(IndexInput in) throws IOException { + final int version = in.readInt(); // Read the version + if (version != VERSION) { + throw new IOException("Unsupported version: " + version); + } + int size = in.readVInt(); + int maxDocId = in.readVInt(); + + graphNodeIdsToDocIds = new int[size]; + docIdsToGraphNodeIds = new int[maxDocId]; + for (int ord = 0; ord < size; ord++) { + final int docId = in.readVInt(); + graphNodeIdsToDocIds[ord] = docId; + docIdsToGraphNodeIds[docId] = ord; + } + } + + /** + * Constructor that creates a new mapping between ordinals and docIds + * + * @param graphNodeIdsToDocIds The mapping from ordinals to docIds + */ + public GraphNodeIdToDocMap(int[] graphNodeIdsToDocIds) { + if (graphNodeIdsToDocIds.length == 0) { + this.graphNodeIdsToDocIds = new int[0]; + this.docIdsToGraphNodeIds = new int[0]; + return; + } + this.graphNodeIdsToDocIds = new int[graphNodeIdsToDocIds.length]; + System.arraycopy(graphNodeIdsToDocIds, 0, this.graphNodeIdsToDocIds, 0, graphNodeIdsToDocIds.length); + final int maxDocId = Arrays.stream(graphNodeIdsToDocIds).max().getAsInt(); + final int maxDocs = maxDocId + 1; + // We are going to assume that the number of ordinals is roughly the same as the number of documents in the segment, therefore, + // the mapping will not be sparse. + if (maxDocs < graphNodeIdsToDocIds.length) { + throw new IllegalStateException("Max docs " + maxDocs + " is less than the number of ordinals " + graphNodeIdsToDocIds.length); + } + if (maxDocId > graphNodeIdsToDocIds.length) { + log.warn( + "Max doc id {} is greater than the number of ordinals {}, this implies a lot of deleted documents. Or that some documents are missing vectors. Wasting a lot of memory", + maxDocId, + graphNodeIdsToDocIds.length + ); + } + this.docIdsToGraphNodeIds = new int[maxDocs]; + Arrays.fill(this.docIdsToGraphNodeIds, -1); // -1 means no mapping to ordinal + for (int ord = 0; ord < graphNodeIdsToDocIds.length; ord++) { + this.docIdsToGraphNodeIds[graphNodeIdsToDocIds[ord]] = ord; + } + } + + /** + * Updates the mapping from the Lucene document IDs to the jVector ordinals based on the sort operation. 
(during flush) + * + * @param sortMap The sort map + */ + public void update(Sorter.DocMap sortMap) { + final int[] newGraphNodeIdsToDocIds = new int[graphNodeIdsToDocIds.length]; + final int maxNewDocId = Arrays.stream(graphNodeIdsToDocIds).map(sortMap::oldToNew).max().getAsInt(); + final int maxDocs = maxNewDocId + 1; + if (maxDocs < graphNodeIdsToDocIds.length) { + throw new IllegalStateException("Max docs " + maxDocs + " is less than the number of ordinals " + graphNodeIdsToDocIds.length); + } + final int[] newDocIdsToOrdinals = new int[maxDocs]; + Arrays.fill(newDocIdsToOrdinals, -1); + for (int oldDocId = 0; oldDocId < docIdsToGraphNodeIds.length; oldDocId++) { + if (docIdsToGraphNodeIds[oldDocId] == -1) { + continue; + } + final int newDocId = sortMap.oldToNew(oldDocId); + final int oldOrd = docIdsToGraphNodeIds[oldDocId]; + newDocIdsToOrdinals[newDocId] = oldOrd; + newGraphNodeIdsToDocIds[oldOrd] = newDocId; + } + this.docIdsToGraphNodeIds = newDocIdsToOrdinals; + this.graphNodeIdsToDocIds = newGraphNodeIdsToDocIds; + } + + /** + * Returns the jVector node id for the given Lucene document ID + * + * @param luceneDocId The Lucene document ID + * @return The jVector ordinal + */ + public int getJVectorNodeId(int luceneDocId) { + return docIdsToGraphNodeIds[luceneDocId]; + } + + /** + * Returns the Lucene document ID for the given jVector node id + * + * @param graphNodeId The jVector ordinal + * @return The Lucene document ID + *
+ * NOTE: This method is useful when, for example, we want to remap acceptedDocs bitmap from Lucene to jVector ordinal bitmap filter + */ + public int getLuceneDocId(int graphNodeId) { + return graphNodeIdsToDocIds[graphNodeId]; + } + + /** + * Writes the mapping to the index output + * + * @param out The index output + * @throws IOException if an I/O error occurs + */ + public void toOutput(IndexOutput out) throws IOException { + out.writeInt(VERSION); + out.writeVInt(graphNodeIdsToDocIds.length); + out.writeVInt(docIdsToGraphNodeIds.length); + for (int ord = 0; ord < graphNodeIdsToDocIds.length; ord++) { + out.writeVInt(graphNodeIdsToDocIds[ord]); + } + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFloatVectorValues.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFloatVectorValues.java new file mode 100644 index 000000000000..ce3008a79c29 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFloatVectorValues.java @@ -0,0 +1,121 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.knn.index.codec.jvector; + +import io.github.jbellis.jvector.graph.disk.OnDiskGraphIndex; +import io.github.jbellis.jvector.util.Bits; +import io.github.jbellis.jvector.vector.VectorSimilarityFunction; +import io.github.jbellis.jvector.vector.VectorizationProvider; +import io.github.jbellis.jvector.vector.types.VectorFloat; +import io.github.jbellis.jvector.vector.types.VectorTypeSupport; +import org.apache.lucene.index.FloatVectorValues; +import org.apache.lucene.search.VectorScorer; + +import java.io.IOException; + +public class JVectorFloatVectorValues extends FloatVectorValues { + private static final VectorTypeSupport VECTOR_TYPE_SUPPORT = VectorizationProvider.getInstance().getVectorTypeSupport(); + + private final OnDiskGraphIndex.View view; + private final VectorSimilarityFunction similarityFunction; + private final int dimension; + private final int size; + private final GraphNodeIdToDocMap graphNodeIdToDocMap; + + public JVectorFloatVectorValues( + OnDiskGraphIndex onDiskGraphIndex, + VectorSimilarityFunction similarityFunction, + GraphNodeIdToDocMap graphNodeIdToDocMap + ) throws IOException { + this.view = onDiskGraphIndex.getView(); + this.dimension = view.dimension(); + this.size = view.size(); + this.similarityFunction = similarityFunction; + this.graphNodeIdToDocMap = graphNodeIdToDocMap; + } + + @Override + public int dimension() { + return dimension; + } + + @Override + public int size() { + return size; + } + + // This allows us to access the vector without copying it to float[] + public VectorFloat vectorFloatValue(int ord) { + return view.getVector(ord); + } + + public DocIndexIterator iterator() { + return new DocIndexIterator() { + private int docId = -1; + private final Bits liveNodes = view.liveNodes(); + + @Override + public long cost() { + return size(); + } + + @Override + public int index() { + return graphNodeIdToDocMap.getJVectorNodeId(docId); + } + + @Override + public int docID() { + return docId; + } + + @Override + public int nextDoc() throws IOException { + // Advance to the next node docId starts from -1 which is why we need to increment docId by 1 "size" + // times + while (docId < size - 1) { + docId++; + if (liveNodes.get(docId)) { + return docId; + } + } + docId = NO_MORE_DOCS; + + return docId; + } + + @Override + public int advance(int target) throws IOException { + return slowAdvance(target); + 
} + }; + } + + @Override + public float[] vectorValue(int i) throws IOException { + try { + final VectorFloat vector = vectorFloatValue(i); + return (float[]) vector.get(); + } catch (Throwable e) { + throw new RuntimeException(e); + } + } + + public VectorFloat vectorValueObject(int i) throws IOException { + return vectorFloatValue(i); + } + + @Override + public FloatVectorValues copy() throws IOException { + return this; + } + + @Override + public VectorScorer scorer(float[] query) throws IOException { + return new JVectorVectorScorer(this, VECTOR_TYPE_SUPPORT.createFloatVector(query), similarityFunction); + } + +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFormat.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFormat.java new file mode 100644 index 000000000000..5d25622d3df6 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFormat.java @@ -0,0 +1,196 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.knn.index.codec.jvector; + +import lombok.extern.log4j.Log4j2; +import org.apache.lucene.codecs.KnnVectorsFormat; +import org.apache.lucene.codecs.KnnVectorsReader; +import org.apache.lucene.codecs.KnnVectorsWriter; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.SegmentWriteState; +import org.opensearch.knn.common.KNNConstants; + +import java.io.IOException; +import java.util.concurrent.ForkJoinPool; +import java.util.concurrent.ForkJoinWorkerThread; +import java.util.function.Function; + +@Log4j2 +public class JVectorFormat extends KnnVectorsFormat { + public static final String NAME = "JVectorFormat"; + public static final String META_CODEC_NAME = "JVectorVectorsFormatMeta"; + public static final String VECTOR_INDEX_CODEC_NAME = "JVectorVectorsFormatIndex"; + public static final String NEIGHBORS_SCORE_CACHE_CODEC_NAME = "JVectorVectorsFormatNeighborsScoreCache"; + public static final String JVECTOR_FILES_SUFFIX = "jvector"; + public static final String META_EXTENSION = "meta-" + JVECTOR_FILES_SUFFIX; + public static final String VECTOR_INDEX_EXTENSION = "data-" + JVECTOR_FILES_SUFFIX; + public static final String NEIGHBORS_SCORE_CACHE_EXTENSION = "neighbors-score-cache-" + JVECTOR_FILES_SUFFIX; + + public static final int VERSION_START = 0; + public static final int VERSION_CURRENT = VERSION_START; + public static final int DEFAULT_MAX_CONN = 32; + public static final int DEFAULT_BEAM_WIDTH = 100; + // Unfortunately, this can't be managed yet by the OpenSearch ThreadPool because it's not supporting {@link ForkJoinPool} types + public static final ForkJoinPool SIMD_POOL_MERGE = getPhysicalCoreExecutor(); + public static final ForkJoinPool SIMD_POOL_FLUSH = getPhysicalCoreExecutor(); + + private final int maxConn; + private final int beamWidth; + private final Function numberOfSubspacesPerVectorSupplier; // as a function of the original dimension + private final int minBatchSizeForQuantization; + private final float alpha; + private final float neighborOverflow; + private final boolean hierarchyEnabled; + + public JVectorFormat() { + this( + NAME, + DEFAULT_MAX_CONN, + DEFAULT_BEAM_WIDTH, + KNNConstants.DEFAULT_NEIGHBOR_OVERFLOW_VALUE.floatValue(), + KNNConstants.DEFAULT_ALPHA_VALUE.floatValue(), + JVectorFormat::getDefaultNumberOfSubspacesPerVector, + KNNConstants.DEFAULT_MINIMUM_BATCH_SIZE_FOR_QUANTIZATION, + KNNConstants.DEFAULT_HIERARCHY_ENABLED + ); + } + + public 
JVectorFormat(int minBatchSizeForQuantization) { + this( + NAME, + DEFAULT_MAX_CONN, + DEFAULT_BEAM_WIDTH, + KNNConstants.DEFAULT_NEIGHBOR_OVERFLOW_VALUE.floatValue(), + KNNConstants.DEFAULT_ALPHA_VALUE.floatValue(), + JVectorFormat::getDefaultNumberOfSubspacesPerVector, + minBatchSizeForQuantization, + KNNConstants.DEFAULT_HIERARCHY_ENABLED + ); + } + + public JVectorFormat( + int maxConn, + int beamWidth, + float neighborOverflow, + float alpha, + Function numberOfSubspacesPerVectorSupplier, + int minBatchSizeForQuantization, + boolean hierarchyEnabled + ) { + this( + NAME, + maxConn, + beamWidth, + neighborOverflow, + alpha, + numberOfSubspacesPerVectorSupplier, + minBatchSizeForQuantization, + hierarchyEnabled + ); + } + + public JVectorFormat( + String name, + int maxConn, + int beamWidth, + float neighborOverflow, + float alpha, + Function numberOfSubspacesPerVectorSupplier, + int minBatchSizeForQuantization, + boolean hierarchyEnabled + ) { + super(name); + this.maxConn = maxConn; + this.beamWidth = beamWidth; + this.numberOfSubspacesPerVectorSupplier = numberOfSubspacesPerVectorSupplier; + this.minBatchSizeForQuantization = minBatchSizeForQuantization; + this.alpha = alpha; + this.neighborOverflow = neighborOverflow; + this.hierarchyEnabled = hierarchyEnabled; + } + + @Override + public KnnVectorsWriter fieldsWriter(SegmentWriteState state) throws IOException { + return new JVectorWriter( + state, + maxConn, + beamWidth, + neighborOverflow, + alpha, + numberOfSubspacesPerVectorSupplier, + minBatchSizeForQuantization, + hierarchyEnabled + ); + } + + @Override + public KnnVectorsReader fieldsReader(SegmentReadState state) throws IOException { + return new JVectorReader(state); + } + + @Override + public int getMaxDimensions(String s) { + // Not a hard limit, but a reasonable default + return 8192; + } + + /** + * This method returns the default number of subspaces per vector for a given original dimension. + * Should be used as a default value for the number of subspaces per vector in case no value is provided. + * + * @param originalDimension original vector dimension + * @return default number of subspaces per vector + */ + public static int getDefaultNumberOfSubspacesPerVector(int originalDimension) { + // the idea here is that higher dimensions compress well, but not so well that we should use fewer bits + // than a lower-dimension vector, which is what you could get with cutoff points to switch between (e.g.) + // D*0.5 and D*0.25. Thus, the following ensures that bytes per vector is strictly increasing with D. 
+ int compressedBytes; + if (originalDimension <= 32) { + // We are compressing from 4-byte floats to single-byte codebook indexes, + // so this represents compression of 4x + // * GloVe-25 needs 25 BPV to achieve good recall + compressedBytes = originalDimension; + } else if (originalDimension <= 64) { + // * GloVe-50 performs fine at 25 + compressedBytes = 32; + } else if (originalDimension <= 200) { + // * GloVe-100 and -200 perform well at 50 and 100 BPV, respectively + compressedBytes = (int) (originalDimension * 0.5); + } else if (originalDimension <= 400) { + // * NYTimes-256 actually performs fine at 64 BPV but we'll be conservative + // since we don't want BPV to decrease + compressedBytes = 100; + } else if (originalDimension <= 768) { + // allow BPV to increase linearly up to 192 + compressedBytes = (int) (originalDimension * 0.25); + } else if (originalDimension <= 1536) { + // * ada002 vectors have good recall even at 192 BPV = compression of 32x + compressedBytes = 192; + } else { + // We have not tested recall with larger vectors than this, let's let it increase linearly + compressedBytes = (int) (originalDimension * 0.125); + } + return compressedBytes; + } + + public static ForkJoinPool getPhysicalCoreExecutor() { + final int estimatedPhysicalCoreCount = Integer.getInteger( + "jvector.physical_core_count", + Math.max(1, Runtime.getRuntime().availableProcessors() / 2) + ); + assert estimatedPhysicalCoreCount > 0 && estimatedPhysicalCoreCount <= Runtime.getRuntime().availableProcessors() + : "Invalid core count: " + estimatedPhysicalCoreCount; + final ForkJoinPool.ForkJoinWorkerThreadFactory factory = pool -> { + ForkJoinWorkerThread thread = ForkJoinPool.defaultForkJoinWorkerThreadFactory.newThread(pool); + thread.setPriority(Thread.NORM_PRIORITY - 2); + return thread; + }; + + log.info("Creating SIMD ForkJoinPool with {} physical cores for JVector SIMD operations", estimatedPhysicalCoreCount); + return new ForkJoinPool(estimatedPhysicalCoreCount, factory, null, true); + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorIndexWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorIndexWriter.java new file mode 100644 index 000000000000..b01b4c8db1bb --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorIndexWriter.java @@ -0,0 +1,105 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.knn.index.codec.jvector; + +import io.github.jbellis.jvector.disk.IndexWriter; +import lombok.extern.log4j.Log4j2; +import org.apache.lucene.store.IndexOutput; + +import java.io.IOException; + +/** + * JVectorRandomAccessWriter is a wrapper around IndexOutput that implements RandomAccessWriter. + * Note: This is not thread safe! 
+ */ +@Log4j2 +public class JVectorIndexWriter implements IndexWriter { + private final IndexOutput indexOutputDelegate; + + public JVectorIndexWriter(IndexOutput indexOutputDelegate) { + this.indexOutputDelegate = indexOutputDelegate; + } + + @Override + public long position() throws IOException { + return indexOutputDelegate.getFilePointer(); + } + + @Override + public void close() throws IOException { + indexOutputDelegate.close(); + } + + @Override + public void write(int b) throws IOException { + indexOutputDelegate.writeByte((byte) b); + } + + @Override + public void write(byte[] b) throws IOException { + indexOutputDelegate.writeBytes(b, 0, b.length); + } + + @Override + public void write(byte[] b, int off, int len) throws IOException { + indexOutputDelegate.writeBytes(b, off, len); + } + + @Override + public void writeBoolean(boolean v) throws IOException { + indexOutputDelegate.writeByte((byte) (v ? 1 : 0)); + } + + @Override + public void writeByte(int v) throws IOException { + indexOutputDelegate.writeByte((byte) v); + } + + @Override + public void writeShort(int v) throws IOException { + indexOutputDelegate.writeShort((short) v); + } + + @Override + public void writeChar(int v) throws IOException { + throw new UnsupportedOperationException("JVectorRandomAccessWriter does not support writing chars"); + } + + @Override + public void writeInt(int v) throws IOException { + indexOutputDelegate.writeInt(v); + } + + @Override + public void writeLong(long v) throws IOException { + indexOutputDelegate.writeLong(v); + } + + @Override + public void writeFloat(float v) throws IOException { + indexOutputDelegate.writeInt(Float.floatToIntBits(v)); + } + + @Override + public void writeDouble(double v) throws IOException { + writeLong(Double.doubleToLongBits(v)); + } + + @Override + public void writeBytes(String s) throws IOException { + throw new UnsupportedOperationException("JVectorIndexWriter does not support writing String as bytes"); + } + + @Override + public void writeChars(String s) throws IOException { + throw new UnsupportedOperationException("JVectorIndexWriter does not support writing chars"); + } + + @Override + public void writeUTF(String s) throws IOException { + throw new UnsupportedOperationException("JVectorIndexWriter does not support writing UTF strings"); + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorKnnCollector.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorKnnCollector.java new file mode 100644 index 000000000000..573726f5f19a --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorKnnCollector.java @@ -0,0 +1,67 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ +package org.opensearch.knn.index.codec.jvector; + +import lombok.Value; +import org.apache.lucene.search.KnnCollector; +import org.apache.lucene.search.knn.KnnSearchStrategy; +import org.apache.lucene.search.TopDocs; + +/** + * Wrapper class for KnnCollector that provides passing of additional parameters specific for JVector. 
+ */ +@Value +public class JVectorKnnCollector implements KnnCollector { + KnnCollector delegate; + float threshold; + float rerankFloor; + int overQueryFactor; + boolean usePruning; + + @Override + public boolean earlyTerminated() { + return delegate.earlyTerminated(); + } + + @Override + public void incVisitedCount(int count) { + delegate.incVisitedCount(count); + } + + @Override + public long visitedCount() { + return delegate.visitedCount(); + } + + @Override + public long visitLimit() { + return delegate.visitLimit(); + } + + @Override + public int k() { + return delegate.k(); + } + + @Override + public boolean collect(int docId, float similarity) { + return delegate.collect(docId, similarity); + } + + @Override + public float minCompetitiveSimilarity() { + return delegate.minCompetitiveSimilarity(); + } + + @Override + public TopDocs topDocs() { + return delegate.topDocs(); + } + + @Override + public KnnSearchStrategy getSearchStrategy() { + return delegate.getSearchStrategy(); + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorKnnFloatVectorQuery.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorKnnFloatVectorQuery.java new file mode 100644 index 000000000000..922a7dcd55b1 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorKnnFloatVectorQuery.java @@ -0,0 +1,83 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ +package org.opensearch.knn.index.codec.jvector; + +import org.apache.lucene.index.FloatVectorValues; +import org.apache.lucene.index.LeafReader; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.search.*; +import org.apache.lucene.search.knn.KnnCollectorManager; +import org.apache.lucene.search.knn.KnnSearchStrategy; +import org.apache.lucene.util.Bits; + +import java.io.IOException; + +/** + * {@link KnnFloatVectorQuery} that uses jVector to perform the search. + * We use this wrapper simply because we can't pass jVector specific parameters with the upstream {@link KnnFloatVectorQuery}. 
+ */ +public class JVectorKnnFloatVectorQuery extends KnnFloatVectorQuery { + private static final TopDocs NO_RESULTS = TopDocsCollector.EMPTY_TOPDOCS; + private final int overQueryFactor; + private final float threshold; + private final float rerankFloor; + private final boolean usePruning; + + public JVectorKnnFloatVectorQuery( + String field, + float[] target, + int k, + int overQueryFactor, + float threshold, + float rerankFloor, + boolean usePruning + ) { + super(field, target, k); + this.overQueryFactor = overQueryFactor; + this.threshold = threshold; + this.rerankFloor = rerankFloor; + this.usePruning = usePruning; + } + + public JVectorKnnFloatVectorQuery( + String field, + float[] target, + int k, + Query filter, + int overQueryFactor, + float threshold, + float rerankFloor, + boolean usePruning + ) { + super(field, target, k, filter); + this.overQueryFactor = overQueryFactor; + this.threshold = threshold; + this.rerankFloor = rerankFloor; + this.usePruning = usePruning; + } + + @Override + protected TopDocs approximateSearch( + LeafReaderContext context, + Bits acceptDocs, + int visitedLimit, + KnnCollectorManager knnCollectorManager + ) throws IOException { + final KnnCollector delegateCollector = knnCollectorManager.newCollector(visitedLimit, KnnSearchStrategy.Hnsw.DEFAULT, context); + final KnnCollector knnCollector = new JVectorKnnCollector(delegateCollector, threshold, rerankFloor, overQueryFactor, usePruning); + LeafReader reader = context.reader(); + FloatVectorValues floatVectorValues = reader.getFloatVectorValues(field); + if (floatVectorValues == null) { + FloatVectorValues.checkField(reader, field); + return NO_RESULTS; + } + if (Math.min(knnCollector.k(), floatVectorValues.size()) == 0) { + return NO_RESULTS; + } + reader.searchNearestVectors(field, getTargetCopy(), knnCollector, acceptDocs); + TopDocs results = knnCollector.topDocs(); + return results != null ? 
results : NO_RESULTS; + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorRandomAccessReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorRandomAccessReader.java new file mode 100644 index 000000000000..c3b823010c6d --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorRandomAccessReader.java @@ -0,0 +1,174 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.knn.index.codec.jvector; + +import io.github.jbellis.jvector.disk.RandomAccessReader; +import io.github.jbellis.jvector.disk.ReaderSupplier; +import lombok.extern.log4j.Log4j2; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.IOUtils; + +import java.io.EOFException; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.FloatBuffer; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.atomic.AtomicInteger; + +@Log4j2 +public class JVectorRandomAccessReader implements RandomAccessReader { + private final byte[] internalBuffer = new byte[Long.BYTES]; + private final byte[] internalFloatBuffer = new byte[Float.BYTES]; + private final IndexInput indexInputDelegate; + private volatile boolean closed = false; + + public JVectorRandomAccessReader(IndexInput indexInputDelegate) { + this.indexInputDelegate = indexInputDelegate; + } + + @Override + public void seek(long offset) throws IOException { + indexInputDelegate.seek(offset); + } + + @Override + public long getPosition() throws IOException { + return indexInputDelegate.getFilePointer(); + } + + @Override + public int readInt() throws IOException { + return indexInputDelegate.readInt(); + } + + @Override + public float readFloat() throws IOException { + return Float.intBitsToFloat(indexInputDelegate.readInt()); + } + + // TODO: bring back to override when upgrading jVector again + // @Override + public long readLong() throws IOException { + return indexInputDelegate.readLong(); + } + + @Override + public void readFully(byte[] bytes) throws IOException { + indexInputDelegate.readBytes(bytes, 0, bytes.length); + } + + @Override + public void readFully(ByteBuffer buffer) throws IOException { + // validate that the requested bytes actually exist ---- + long remainingInFile = indexInputDelegate.length() - indexInputDelegate.getFilePointer(); + if (buffer.remaining() > remainingInFile) { + throw new EOFException("Requested " + buffer.remaining() + " bytes but only " + remainingInFile + " available"); + } + + // Heap buffers with a backing array can be filled in one call ---- + if (buffer.hasArray()) { + int off = buffer.arrayOffset() + buffer.position(); + int len = buffer.remaining(); + indexInputDelegate.readBytes(buffer.array(), off, len); + buffer.position(buffer.limit()); // advance fully + return; + } + + // Direct / non-array buffers: copy in reasonable chunks ---- + while (buffer.hasRemaining()) { + final int bytesToRead = Math.min(buffer.remaining(), Long.BYTES); + indexInputDelegate.readBytes(this.internalBuffer, 0, bytesToRead); + buffer.put(this.internalBuffer, 0, bytesToRead); + } + } + + @Override + public void readFully(long[] vector) throws IOException { + for (int i = 0; i < vector.length; i++) { + vector[i] = readLong(); + } + } + + @Override + public void read(int[] ints, int offset, int count) throws IOException { + for (int i = 0; i < count; i++) { + ints[offset + i] = readInt(); + } + } + + @Override + public void read(float[] 
floats, int offset, int count) throws IOException { + final ByteBuffer byteBuffer = ByteBuffer.allocate(Float.BYTES * count); + indexInputDelegate.readBytes(byteBuffer.array(), offset, Float.BYTES * count); + FloatBuffer buffer = byteBuffer.asFloatBuffer(); + buffer.get(floats, offset, count); + } + + @Override + public void close() throws IOException { + log.debug("Closing JVectorRandomAccessReader for file: {}", indexInputDelegate); + this.closed = true; + // no need to really close the index input delegate since it is a clone + log.debug("Closed JVectorRandomAccessReader for file: {}", indexInputDelegate); + } + + @Override + public long length() throws IOException { + return indexInputDelegate.length(); + } + + /** + * Supplies readers which are actually slices of the original IndexInput. + * We will vend out slices in order for us to easily find the footer of the jVector graph index. + * This is useful because our logic that reads the graph that the footer is always at {@link IndexInput#length()} of the slice. + * Which is how {@link io.github.jbellis.jvector.graph.disk.OnDiskGraphIndex#load(ReaderSupplier, long)} is working behind the scenes. + * The header offset, on the other hand, is flexible because we can provide it as a parameter to {@link io.github.jbellis.jvector.graph.disk.OnDiskGraphIndex#load(ReaderSupplier, long)} + */ + public static class Supplier implements ReaderSupplier { + private final AtomicInteger readerCount = new AtomicInteger(0); + private final IndexInput currentInput; + private final long sliceStartOffset; + private final long sliceLength; + private final ConcurrentHashMap readers = new ConcurrentHashMap<>(); + + public Supplier(IndexInput indexInput) throws IOException { + this(indexInput, indexInput.getFilePointer(), indexInput.length() - indexInput.getFilePointer()); + } + + public Supplier(IndexInput indexInput, long sliceStartOffset, long sliceLength) throws IOException { + this.currentInput = indexInput; + this.sliceStartOffset = sliceStartOffset; + this.sliceLength = sliceLength; + } + + @Override + public RandomAccessReader get() throws IOException { + synchronized (this) { + final IndexInput input = currentInput.slice("Input Slice for the jVector graph or PQ", sliceStartOffset, sliceLength) + .clone(); + + var reader = new JVectorRandomAccessReader(input); + int readerId = readerCount.getAndIncrement(); + readers.put(readerId, reader); + return reader; + } + + } + + @Override + public void close() throws IOException { + // Close source of all cloned inputs + IOUtils.closeWhileHandlingException(currentInput); + + // Close all readers + for (RandomAccessReader reader : readers.values()) { + IOUtils.closeWhileHandlingException(reader::close); + } + readers.clear(); + readerCount.set(0); + } + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java new file mode 100644 index 000000000000..3c8aa4622000 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java @@ -0,0 +1,382 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.knn.index.codec.jvector; + +import io.github.jbellis.jvector.disk.RandomAccessReader; +import io.github.jbellis.jvector.disk.ReaderSupplier; +import io.github.jbellis.jvector.graph.GraphSearcher; +import io.github.jbellis.jvector.graph.SearchResult; +import 
io.github.jbellis.jvector.graph.disk.OnDiskGraphIndex; +import io.github.jbellis.jvector.graph.similarity.DefaultSearchScoreProvider; +import io.github.jbellis.jvector.graph.similarity.ScoreFunction; +import io.github.jbellis.jvector.graph.similarity.SearchScoreProvider; +import io.github.jbellis.jvector.quantization.PQVectors; +import io.github.jbellis.jvector.quantization.ProductQuantization; +import io.github.jbellis.jvector.vector.VectorizationProvider; +import io.github.jbellis.jvector.vector.types.VectorFloat; +import io.github.jbellis.jvector.vector.types.VectorTypeSupport; +import lombok.extern.log4j.Log4j2; +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.KnnVectorsReader; +import org.apache.lucene.index.*; +import org.apache.lucene.search.KnnCollector; +import org.apache.lucene.store.*; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.IOUtils; +import io.github.jbellis.jvector.vector.VectorSimilarityFunction; +import org.opensearch.knn.common.KNNConstants; +import org.opensearch.knn.plugin.stats.KNNCounter; + +import java.io.Closeable; +import java.io.IOException; + +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Optional; + +@Log4j2 +public class JVectorReader extends KnnVectorsReader { + private static final VectorTypeSupport VECTOR_TYPE_SUPPORT = VectorizationProvider.getInstance().getVectorTypeSupport(); + + private final FieldInfos fieldInfos; + private final String baseDataFileName; + // Maps field name to field entries + private final Map fieldEntryMap = new HashMap<>(1); + private final Directory directory; + private final SegmentReadState state; + + public JVectorReader(SegmentReadState state) throws IOException { + this.state = state; + this.fieldInfos = state.fieldInfos; + this.baseDataFileName = state.segmentInfo.name + "_" + state.segmentSuffix; + final String metaFileName = IndexFileNames.segmentFileName( + state.segmentInfo.name, + state.segmentSuffix, + JVectorFormat.META_EXTENSION + ); + this.directory = state.directory; + boolean success = false; + try (ChecksumIndexInput meta = state.directory.openChecksumInput(metaFileName)) { + CodecUtil.checkIndexHeader( + meta, + JVectorFormat.META_CODEC_NAME, + JVectorFormat.VERSION_START, + JVectorFormat.VERSION_CURRENT, + state.segmentInfo.getId(), + state.segmentSuffix + ); + readFields(meta); + CodecUtil.checkFooter(meta); + + success = true; + } finally { + if (!success) { + IOUtils.closeWhileHandlingException(this); + } + } + } + + @Override + public void checkIntegrity() throws IOException { + for (FieldEntry fieldEntry : fieldEntryMap.values()) { + // Verify the vector index file + try (var indexInput = state.directory.openInput(fieldEntry.vectorIndexFieldDataFileName, IOContext.READONCE)) { + CodecUtil.checksumEntireFile(indexInput); + } + + // Verify the neighbors score cache file + try (var indexInput = state.directory.openInput(fieldEntry.neighborsScoreCacheIndexFieldFileName, IOContext.READONCE)) { + CodecUtil.checksumEntireFile(indexInput); + } + } + } + + @Override + public FloatVectorValues getFloatVectorValues(String field) throws IOException { + final FieldEntry fieldEntry = fieldEntryMap.get(field); + return new JVectorFloatVectorValues(fieldEntry.index, fieldEntry.similarityFunction, fieldEntry.graphNodeIdToDocMap); + } + + @Override + public ByteVectorValues getByteVectorValues(String field) throws IOException { + /** + * Byte vector values are not supported in jVector library. Instead use PQ. 
+ */ + return null; + } + + public Optional getProductQuantizationForField(String field) throws IOException { + final FieldEntry fieldEntry = fieldEntryMap.get(field); + if (fieldEntry.pqVectors == null) { + return Optional.empty(); + } + + return Optional.of(fieldEntry.pqVectors.getCompressor()); + } + + public RandomAccessReader getNeighborsScoreCacheForField(String field) throws IOException { + final FieldEntry fieldEntry = fieldEntryMap.get(field); + return fieldEntry.neighborsScoreCacheIndexReaderSupplier.get(); + } + + public OnDiskGraphIndex getOnDiskGraphIndex(String field) throws IOException { + return fieldEntryMap.get(field).index; + } + + @Override + public void search(String field, float[] target, KnnCollector knnCollector, Bits acceptDocs) throws IOException { + final OnDiskGraphIndex index = fieldEntryMap.get(field).index; + final JVectorKnnCollector jvectorKnnCollector; + if (knnCollector instanceof JVectorKnnCollector) { + jvectorKnnCollector = (JVectorKnnCollector) knnCollector; + } else { + log.warn("KnnCollector must be of type JVectorKnnCollector, for now we will re-wrap it but this is not ideal"); + jvectorKnnCollector = new JVectorKnnCollector( + knnCollector, + KNNConstants.DEFAULT_QUERY_SIMILARITY_THRESHOLD.floatValue(), + KNNConstants.DEFAULT_QUERY_RERANK_FLOOR.floatValue(), + KNNConstants.DEFAULT_OVER_QUERY_FACTOR, + KNNConstants.DEFAULT_QUERY_USE_PRUNING + ); + + } + + // search for a random vector using a GraphSearcher and SearchScoreProvider + VectorFloat q = VECTOR_TYPE_SUPPORT.createFloatVector(target); + final SearchScoreProvider ssp; + + try (var view = index.getView()) { + final long graphSearchStart = System.currentTimeMillis(); + if (fieldEntryMap.get(field).pqVectors != null) { // Quantized, use the precomputed score function + final PQVectors pqVectors = fieldEntryMap.get(field).pqVectors; + // SearchScoreProvider that does a first pass with the loaded-in-memory PQVectors, + // then reranks with the exact vectors that are stored on disk in the index + ScoreFunction.ApproximateScoreFunction asf = pqVectors.precomputedScoreFunctionFor( + q, + fieldEntryMap.get(field).similarityFunction + ); + ScoreFunction.ExactScoreFunction reranker = view.rerankerFor(q, fieldEntryMap.get(field).similarityFunction); + ssp = new DefaultSearchScoreProvider(asf, reranker); + } else { // Not quantized, used typical searcher + ssp = DefaultSearchScoreProvider.exact(q, fieldEntryMap.get(field).similarityFunction, view); + } + final GraphNodeIdToDocMap jvectorLuceneDocMap = fieldEntryMap.get(field).graphNodeIdToDocMap; + // Convert the acceptDocs bitmap from Lucene to jVector ordinal bitmap filter + // Logic works as follows: if acceptDocs is null, we accept all ordinals. Otherwise, we check if the jVector ordinal has a + // corresponding Lucene doc ID accepted by acceptDocs filter. 
+ io.github.jbellis.jvector.util.Bits compatibleBits = ord -> acceptDocs == null + || acceptDocs.get(jvectorLuceneDocMap.getLuceneDocId(ord)); + + try (var graphSearcher = new GraphSearcher(index)) { + final var searchResults = graphSearcher.search( + ssp, + jvectorKnnCollector.k(), + jvectorKnnCollector.k() * jvectorKnnCollector.getOverQueryFactor(), + jvectorKnnCollector.getThreshold(), + jvectorKnnCollector.getRerankFloor(), + compatibleBits + ); + for (SearchResult.NodeScore ns : searchResults.getNodes()) { + jvectorKnnCollector.collect(jvectorLuceneDocMap.getLuceneDocId(ns.node), ns.score); + } + final long graphSearchEnd = System.currentTimeMillis(); + final long searchTime = graphSearchEnd - graphSearchStart; + log.debug("Search (including acquiring view) took {} ms", searchTime); + + // Collect the below metrics about the search and somehow wire this back to {@link @KNNStats} + final int visitedNodesCount = searchResults.getVisitedCount(); + final int rerankedCount = searchResults.getRerankedCount(); + + final int expandedCount = searchResults.getExpandedCount(); + final int expandedBaseLayerCount = searchResults.getExpandedCountBaseLayer(); + + KNNCounter.KNN_QUERY_VISITED_NODES.add(visitedNodesCount); + KNNCounter.KNN_QUERY_RERANKED_COUNT.add(rerankedCount); + KNNCounter.KNN_QUERY_EXPANDED_NODES.add(expandedCount); + KNNCounter.KNN_QUERY_EXPANDED_BASE_LAYER_NODES.add(expandedBaseLayerCount); + KNNCounter.KNN_QUERY_GRAPH_SEARCH_TIME.add(searchTime); + log.debug( + "rerankedCount: {}, visitedNodesCount: {}, expandedCount: {}, expandedBaseLayerCount: {}", + rerankedCount, + visitedNodesCount, + expandedCount, + expandedBaseLayerCount + ); + + } + } + } + + @Override + public void search(String field, byte[] target, KnnCollector knnCollector, Bits acceptDocs) throws IOException { + // TODO: implement this + throw new UnsupportedOperationException("Byte vector search is not supported yet with jVector"); + } + + @Override + public void close() throws IOException { + for (FieldEntry fieldEntry : fieldEntryMap.values()) { + IOUtils.close(fieldEntry); + } + fieldEntryMap.clear(); + } + + private void readFields(ChecksumIndexInput meta) throws IOException { + for (int fieldNumber = meta.readInt(); fieldNumber != -1; fieldNumber = meta.readInt()) { + final FieldInfo fieldInfo = fieldInfos.fieldInfo(fieldNumber); // read field number + JVectorWriter.VectorIndexFieldMetadata vectorIndexFieldMetadata = new JVectorWriter.VectorIndexFieldMetadata(meta); + assert fieldInfo.number == vectorIndexFieldMetadata.getFieldNumber(); + fieldEntryMap.put(fieldInfo.name, new FieldEntry(fieldInfo, vectorIndexFieldMetadata)); + } + } + + class FieldEntry implements Closeable { + private final FieldInfo fieldInfo; + private final VectorEncoding vectorEncoding; + private final VectorSimilarityFunction similarityFunction; + private final int dimension; + private final long vectorIndexOffset; + private final long vectorIndexLength; + private final long pqCodebooksAndVectorsLength; + private final long pqCodebooksAndVectorsOffset; + private final String vectorIndexFieldDataFileName; + private final String neighborsScoreCacheIndexFieldFileName; + private final GraphNodeIdToDocMap graphNodeIdToDocMap; + private final ReaderSupplier indexReaderSupplier; + private final ReaderSupplier pqCodebooksReaderSupplier; + private final ReaderSupplier neighborsScoreCacheIndexReaderSupplier; + private final OnDiskGraphIndex index; + private final PQVectors pqVectors; // The product quantized vectors with their codebooks + + 
public FieldEntry(FieldInfo fieldInfo, JVectorWriter.VectorIndexFieldMetadata vectorIndexFieldMetadata) throws IOException { + this.fieldInfo = fieldInfo; + this.similarityFunction = VectorSimilarityMapper.ordToDistFunc( + vectorIndexFieldMetadata.getVectorSimilarityFunction().ordinal() + ); + this.vectorEncoding = vectorIndexFieldMetadata.getVectorEncoding(); + this.vectorIndexOffset = vectorIndexFieldMetadata.getVectorIndexOffset(); + this.vectorIndexLength = vectorIndexFieldMetadata.getVectorIndexLength(); + this.pqCodebooksAndVectorsLength = vectorIndexFieldMetadata.getPqCodebooksAndVectorsLength(); + this.pqCodebooksAndVectorsOffset = vectorIndexFieldMetadata.getPqCodebooksAndVectorsOffset(); + this.dimension = vectorIndexFieldMetadata.getVectorDimension(); + this.graphNodeIdToDocMap = vectorIndexFieldMetadata.getGraphNodeIdToDocMap(); + + this.vectorIndexFieldDataFileName = baseDataFileName + "_" + fieldInfo.name + "." + JVectorFormat.VECTOR_INDEX_EXTENSION; + this.neighborsScoreCacheIndexFieldFileName = baseDataFileName + + "_" + + fieldInfo.name + + "." + + JVectorFormat.NEIGHBORS_SCORE_CACHE_EXTENSION; + + // For the slice we would like to include the Lucene header, unfortunately, we have to do this because jVector use global + // offsets instead of local offsets + final long sliceLength = vectorIndexLength + CodecUtil.indexHeaderLength( + JVectorFormat.VECTOR_INDEX_CODEC_NAME, + state.segmentSuffix + ); + // Load the graph index + this.indexReaderSupplier = new JVectorRandomAccessReader.Supplier( + directory.openInput(vectorIndexFieldDataFileName, state.context), + 0, + sliceLength + ); + this.index = OnDiskGraphIndex.load(indexReaderSupplier, vectorIndexOffset); + + // If quantized load the compressed product quantized vectors with their codebooks + if (pqCodebooksAndVectorsLength > 0) { + assert pqCodebooksAndVectorsOffset > 0; + if (pqCodebooksAndVectorsOffset < vectorIndexOffset) { + throw new IllegalArgumentException("pqCodebooksAndVectorsOffset must be greater than vectorIndexOffset"); + } + this.pqCodebooksReaderSupplier = new JVectorRandomAccessReader.Supplier( + directory.openInput(vectorIndexFieldDataFileName, IOContext.READONCE), + pqCodebooksAndVectorsOffset, + pqCodebooksAndVectorsLength + ); + log.debug( + "Loading PQ codebooks and vectors for field {}, with numbers of vectors: {}", + fieldInfo.name, + state.segmentInfo.maxDoc() + ); + try (final var randomAccessReader = pqCodebooksReaderSupplier.get()) { + this.pqVectors = PQVectors.load(randomAccessReader); + } + } else { + this.pqCodebooksReaderSupplier = null; + this.pqVectors = null; + } + + final IndexInput indexInput = directory.openInput(neighborsScoreCacheIndexFieldFileName, state.context); + CodecUtil.readIndexHeader(indexInput); + + this.neighborsScoreCacheIndexReaderSupplier = new JVectorRandomAccessReader.Supplier(indexInput); + } + + @Override + public void close() throws IOException { + if (indexReaderSupplier != null) { + IOUtils.close(indexReaderSupplier::close); + } + if (pqCodebooksReaderSupplier != null) { + IOUtils.close(pqCodebooksReaderSupplier::close); + } + if (neighborsScoreCacheIndexReaderSupplier != null) { + IOUtils.close(neighborsScoreCacheIndexReaderSupplier::close); + } + } + } + + /** + * Utility class to map between Lucene and jVector similarity functions and metadata ordinals. 
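+     * For example, with the ordering defined below, {@code distFuncToOrd(org.apache.lucene.index.VectorSimilarityFunction.COSINE)}
+     * returns 2 and {@code ordToDistFunc(2)} returns the corresponding jVector COSINE function.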
+ */ + public static class VectorSimilarityMapper { + /** + List of vector similarity functions supported by jVector library + The similarity functions orders matter in this list because it is later used to resolve the similarity function by ordinal. + */ + public static final List JVECTOR_SUPPORTED_SIMILARITY_FUNCTIONS = List.of( + VectorSimilarityFunction.EUCLIDEAN, + VectorSimilarityFunction.DOT_PRODUCT, + VectorSimilarityFunction.COSINE + ); + + public static final Map LUCENE_TO_JVECTOR_MAP = Map.of( + org.apache.lucene.index.VectorSimilarityFunction.EUCLIDEAN, + VectorSimilarityFunction.EUCLIDEAN, + org.apache.lucene.index.VectorSimilarityFunction.DOT_PRODUCT, + VectorSimilarityFunction.DOT_PRODUCT, + org.apache.lucene.index.VectorSimilarityFunction.COSINE, + VectorSimilarityFunction.COSINE + ); + + public static int distFuncToOrd(org.apache.lucene.index.VectorSimilarityFunction func) { + if (LUCENE_TO_JVECTOR_MAP.containsKey(func)) { + return JVECTOR_SUPPORTED_SIMILARITY_FUNCTIONS.indexOf(LUCENE_TO_JVECTOR_MAP.get(func)); + } + + throw new IllegalArgumentException("invalid distance function: " + func); + } + + public static VectorSimilarityFunction ordToDistFunc(int ord) { + return JVECTOR_SUPPORTED_SIMILARITY_FUNCTIONS.get(ord); + } + + public static org.apache.lucene.index.VectorSimilarityFunction ordToLuceneDistFunc(int ord) { + if (ord < 0 || ord >= JVECTOR_SUPPORTED_SIMILARITY_FUNCTIONS.size()) { + throw new IllegalArgumentException("Invalid ordinal: " + ord); + } + VectorSimilarityFunction jvectorFunc = JVECTOR_SUPPORTED_SIMILARITY_FUNCTIONS.get(ord); + for (Map.Entry entry : LUCENE_TO_JVECTOR_MAP + .entrySet()) { + if (entry.getValue().equals(jvectorFunc)) { + return entry.getKey(); + } + } + throw new IllegalStateException("No matching Lucene VectorSimilarityFunction found for ordinal: " + ord); + } + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorVectorScorer.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorVectorScorer.java new file mode 100644 index 000000000000..e27b168b6362 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorVectorScorer.java @@ -0,0 +1,38 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.knn.index.codec.jvector; + +import io.github.jbellis.jvector.vector.VectorSimilarityFunction; +import io.github.jbellis.jvector.vector.types.VectorFloat; +import org.apache.lucene.index.KnnVectorValues; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.VectorScorer; + +import java.io.IOException; + +public class JVectorVectorScorer implements VectorScorer { + private final JVectorFloatVectorValues floatVectorValues; + private final KnnVectorValues.DocIndexIterator docIndexIterator; + private final VectorFloat target; + private final VectorSimilarityFunction similarityFunction; + + public JVectorVectorScorer(JVectorFloatVectorValues vectorValues, VectorFloat target, VectorSimilarityFunction similarityFunction) { + this.floatVectorValues = vectorValues; + this.docIndexIterator = floatVectorValues.iterator(); + this.target = target; + this.similarityFunction = similarityFunction; + } + + @Override + public float score() throws IOException { + return similarityFunction.compare(target, floatVectorValues.vectorFloatValue(docIndexIterator.index())); + } + + @Override + public DocIdSetIterator iterator() { + return docIndexIterator; + } +} diff --git 
a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java new file mode 100644 index 000000000000..434e08a6964e --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java @@ -0,0 +1,1097 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.knn.index.codec.jvector; + +import io.github.jbellis.jvector.disk.RandomAccessReader; +import io.github.jbellis.jvector.graph.*; +import io.github.jbellis.jvector.graph.disk.*; +import io.github.jbellis.jvector.graph.disk.feature.Feature; +import io.github.jbellis.jvector.graph.disk.feature.FeatureId; +import io.github.jbellis.jvector.graph.disk.feature.InlineVectors; +import io.github.jbellis.jvector.graph.similarity.BuildScoreProvider; +import io.github.jbellis.jvector.quantization.PQVectors; +import io.github.jbellis.jvector.quantization.ProductQuantization; +import io.github.jbellis.jvector.vector.VectorizationProvider; +import io.github.jbellis.jvector.vector.types.VectorFloat; +import io.github.jbellis.jvector.vector.types.VectorTypeSupport; +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Getter; +import lombok.Value; +import lombok.extern.log4j.Log4j2; +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.KnnFieldVectorsWriter; +import org.apache.lucene.codecs.KnnVectorsReader; +import org.apache.lucene.codecs.KnnVectorsWriter; +import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat; +import org.apache.lucene.index.*; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.store.*; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.RamUsageEstimator; +import org.opensearch.knn.plugin.stats.KNNCounter; + +import java.io.IOException; +import java.io.UnsupportedEncodingException; +import java.time.Clock; +import java.util.*; +import java.util.concurrent.ForkJoinPool; +import java.util.function.Function; +import java.util.stream.IntStream; + +import static io.github.jbellis.jvector.quantization.KMeansPlusPlusClusterer.UNWEIGHTED; +import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsReader.readVectorEncoding; +import static org.opensearch.knn.index.codec.jvector.JVectorFormat.SIMD_POOL_FLUSH; +import static org.opensearch.knn.index.codec.jvector.JVectorFormat.SIMD_POOL_MERGE; + +/** + * JVectorWriter is responsible for writing vector data into index segments using the JVector library. + * + *

+ * <p><b>Persisting the JVector Graph Index</b></p>
+ *
+ * Flushing data into disk segments occurs in two scenarios:
+ * <ol>
+ *   <li>When the segment is being flushed to disk (e.g., when a new segment is created) via {@link #flush(int, Sorter.DocMap)}</li>
+ *   <li>When the segment is a result of a merge (e.g., when multiple segments are merged into one) via {@link #mergeOneField(FieldInfo, MergeState)}</li>
+ * </ol>
+ *
+ * <p><b>jVector Graph Ordinal to Lucene Document ID Mapping</b></p>
+ *
+ * JVector keeps its own ordinals to identify its nodes. Those ordinals can be different from the Lucene document IDs. + * Document IDs in Lucene can change after a merge operation. Therefore, we need to maintain a mapping between + * JVector ordinals and Lucene document IDs that can hold across merges. + *

+ * Document IDs in Lucene are mapped across merges and sorts using the {@link org.apache.lucene.index.MergeState.DocMap} for merges and {@link org.apache.lucene.index.Sorter.DocMap} for flush/sorts. + * For jVector however, we don't want to modify the ordinals in the jVector graph, and therefore we need to maintain a mapping between the jVector ordinals and the new Lucene document IDs. + * This is achieved by keeping checkpoints of the {@link GraphNodeIdToDocMap} class in the index metadata and allowing us to update the mapping as needed across merges by constructing a new mapping from the previous mapping and the {@link MergeState.DocMap} provided in the {@link MergeState}. + * And across sorts with {@link GraphNodeIdToDocMap#update(Sorter.DocMap)} during flushes. + *
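+ * <p>
+ * For illustration only (hypothetical IDs): a vector stored as graph node 3 keeps ordinal 3 across merges; if the merge
+ * doc map moves its document from Lucene doc ID 7 to doc ID 42, only the checkpointed {@link GraphNodeIdToDocMap} changes,
+ * so that {@code getLuceneDocId(3)} returns 42 in the merged segment.
+ * </p>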

+ * + */ +@Log4j2 +public class JVectorWriter extends KnnVectorsWriter { + private static final long SHALLOW_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(JVectorWriter.class); + + private final List> fields = new ArrayList<>(); + + private final IndexOutput meta; + private final IndexOutput vectorIndex; + private final String indexDataFileName; + private final String baseDataFileName; + private final SegmentWriteState segmentWriteState; + private final int maxConn; + private final int beamWidth; + private final float degreeOverflow; + private final float alpha; + private final Function numberOfSubspacesPerVectorSupplier; // Number of subspaces used per vector for PQ quantization + // as a function of the original dimension + private final int minimumBatchSizeForQuantization; // Threshold for the vector count above which we will trigger PQ quantization + private final boolean hierarchyEnabled; + + private boolean finished = false; + + public JVectorWriter( + SegmentWriteState segmentWriteState, + int maxConn, + int beamWidth, + float degreeOverflow, + float alpha, + Function numberOfSubspacesPerVectorSupplier, + int minimumBatchSizeForQuantization, + boolean hierarchyEnabled + ) throws IOException { + this.segmentWriteState = segmentWriteState; + this.maxConn = maxConn; + this.beamWidth = beamWidth; + this.degreeOverflow = degreeOverflow; + this.alpha = alpha; + this.numberOfSubspacesPerVectorSupplier = numberOfSubspacesPerVectorSupplier; + this.minimumBatchSizeForQuantization = minimumBatchSizeForQuantization; + this.hierarchyEnabled = hierarchyEnabled; + String metaFileName = IndexFileNames.segmentFileName( + segmentWriteState.segmentInfo.name, + segmentWriteState.segmentSuffix, + JVectorFormat.META_EXTENSION + ); + + this.indexDataFileName = IndexFileNames.segmentFileName( + segmentWriteState.segmentInfo.name, + segmentWriteState.segmentSuffix, + JVectorFormat.VECTOR_INDEX_EXTENSION + ); + this.baseDataFileName = segmentWriteState.segmentInfo.name + "_" + segmentWriteState.segmentSuffix; + + boolean success = false; + try { + meta = segmentWriteState.directory.createOutput(metaFileName, segmentWriteState.context); + vectorIndex = segmentWriteState.directory.createOutput(indexDataFileName, segmentWriteState.context); + CodecUtil.writeIndexHeader( + meta, + JVectorFormat.META_CODEC_NAME, + JVectorFormat.VERSION_CURRENT, + segmentWriteState.segmentInfo.getId(), + segmentWriteState.segmentSuffix + ); + + CodecUtil.writeIndexHeader( + vectorIndex, + JVectorFormat.VECTOR_INDEX_CODEC_NAME, + JVectorFormat.VERSION_CURRENT, + segmentWriteState.segmentInfo.getId(), + segmentWriteState.segmentSuffix + ); + + success = true; + } finally { + if (!success) { + IOUtils.closeWhileHandlingException(this); + } + } + } + + @Override + public KnnFieldVectorsWriter addField(FieldInfo fieldInfo) throws IOException { + log.info("Adding field {} in segment {}", fieldInfo.name, segmentWriteState.segmentInfo.name); + if (fieldInfo.getVectorEncoding() == VectorEncoding.BYTE) { + final String errorMessage = "byte[] vectors are not supported in JVector. " + + "Instead you should only use float vectors and leverage product quantization during indexing." 
+ + "This can provides much greater savings in storage and memory"; + log.error(errorMessage); + throw new UnsupportedOperationException(errorMessage); + } + FieldWriter newField = new FieldWriter<>(fieldInfo, segmentWriteState.segmentInfo.name); + + fields.add(newField); + return newField; + } + + @Override + public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOException { + log.info("Merging field {} into segment {}", fieldInfo.name, segmentWriteState.segmentInfo.name); + try { + final long mergeStart = Clock.systemDefaultZone().millis(); + switch (fieldInfo.getVectorEncoding()) { + case BYTE: + throw new UnsupportedEncodingException("Byte vectors are not supported in JVector."); + case FLOAT32: + final var mergeRavv = new RandomAccessMergedFloatVectorValues(fieldInfo, mergeState); + mergeRavv.merge(); + break; + } + final long mergeEnd = Clock.systemDefaultZone().millis(); + final long mergeTime = mergeEnd - mergeStart; + KNNCounter.KNN_GRAPH_MERGE_TIME.add(mergeTime); + log.info("Completed Merge field {} into segment {}", fieldInfo.name, segmentWriteState.segmentInfo.name); + } catch (Exception e) { + log.error("Error merging field {} into segment {}", fieldInfo.name, segmentWriteState.segmentInfo.name, e); + throw e; + } + } + + @Override + public void flush(int maxDoc, Sorter.DocMap sortMap) throws IOException { + log.info("Flushing {} fields", fields.size()); + + log.info("Flushing jVector graph index"); + for (FieldWriter field : fields) { + final RandomAccessVectorValues randomAccessVectorValues = field.randomAccessVectorValues; + final int[] newToOldOrds = new int[randomAccessVectorValues.size()]; + for (int ord = 0; ord < randomAccessVectorValues.size(); ord++) { + newToOldOrds[ord] = ord; + } + final BuildScoreProvider buildScoreProvider; + final PQVectors pqVectors; + final FieldInfo fieldInfo = field.fieldInfo; + if (randomAccessVectorValues.size() >= minimumBatchSizeForQuantization) { + log.info("Calculating codebooks and compressed vectors for field {}", fieldInfo.name); + pqVectors = getPQVectors(newToOldOrds, randomAccessVectorValues, fieldInfo); + buildScoreProvider = BuildScoreProvider.pqBuildScoreProvider(getVectorSimilarityFunction(fieldInfo), pqVectors); + } else { + log.info( + "Vector count: {}, less than limit to trigger PQ quantization: {}, for field {}, will use full precision vectors instead.", + randomAccessVectorValues.size(), + minimumBatchSizeForQuantization, + fieldInfo.name + ); + pqVectors = null; + buildScoreProvider = BuildScoreProvider.randomAccessScoreProvider( + randomAccessVectorValues, + getVectorSimilarityFunction(fieldInfo) + ); + } + + // Generate the ord to doc mapping + final int[] ordinalsToDocIds = new int[randomAccessVectorValues.size()]; + for (int ord = 0; ord < randomAccessVectorValues.size(); ord++) { + ordinalsToDocIds[ord] = field.docIds.get(ord); + } + final GraphNodeIdToDocMap graphNodeIdToDocMap = new GraphNodeIdToDocMap(ordinalsToDocIds); + if (sortMap != null) { + graphNodeIdToDocMap.update(sortMap); + } + + OnHeapGraphIndex graph = getGraph( + buildScoreProvider, + randomAccessVectorValues, + newToOldOrds, + fieldInfo, + segmentWriteState.segmentInfo.name, + SIMD_POOL_FLUSH + ); + writeField(field.fieldInfo, field.randomAccessVectorValues, pqVectors, newToOldOrds, graphNodeIdToDocMap, graph); + + } + } + + private void writeField( + FieldInfo fieldInfo, + RandomAccessVectorValues randomAccessVectorValues, + PQVectors pqVectors, + int[] newToOldOrds, + GraphNodeIdToDocMap graphNodeIdToDocMap, + 
OnHeapGraphIndex graph + ) throws IOException { + log.info( + "Writing field {} with vector count: {}, for segment: {}", + fieldInfo.name, + randomAccessVectorValues.size(), + segmentWriteState.segmentInfo.name + ); + final var vectorIndexFieldMetadata = writeGraph( + graph, + randomAccessVectorValues, + fieldInfo, + pqVectors, + newToOldOrds, + graphNodeIdToDocMap + ); + meta.writeInt(fieldInfo.number); + vectorIndexFieldMetadata.toOutput(meta); + + log.info("Writing neighbors score cache for field {}", fieldInfo.name); + // field data file, which contains the graph + final String neighborsScoreCacheIndexFieldFileName = baseDataFileName + + "_" + + fieldInfo.name + + "." + + JVectorFormat.NEIGHBORS_SCORE_CACHE_EXTENSION; + try ( + IndexOutput indexOutput = segmentWriteState.directory.createOutput( + neighborsScoreCacheIndexFieldFileName, + segmentWriteState.context + ); + final var jVectorIndexWriter = new JVectorIndexWriter(indexOutput) + ) { + CodecUtil.writeIndexHeader( + indexOutput, + JVectorFormat.NEIGHBORS_SCORE_CACHE_CODEC_NAME, + JVectorFormat.VERSION_CURRENT, + segmentWriteState.segmentInfo.getId(), + segmentWriteState.segmentSuffix + ); + graph.save(jVectorIndexWriter); + CodecUtil.writeFooter(indexOutput); + } + } + + /** + * Writes the graph and PQ codebooks and compressed vectors to the vector index file + * @param graph graph + * @param randomAccessVectorValues random access vector values + * @param fieldInfo field info + * @return Tuple of start offset and length of the graph + * @throws IOException IOException + */ + private VectorIndexFieldMetadata writeGraph( + OnHeapGraphIndex graph, + RandomAccessVectorValues randomAccessVectorValues, + FieldInfo fieldInfo, + PQVectors pqVectors, + int[] newToOldOrds, + GraphNodeIdToDocMap graphNodeIdToDocMap + ) throws IOException { + // field data file, which contains the graph + final String vectorIndexFieldFileName = baseDataFileName + "_" + fieldInfo.name + "." 
+ JVectorFormat.VECTOR_INDEX_EXTENSION; + + try ( + IndexOutput indexOutput = segmentWriteState.directory.createOutput(vectorIndexFieldFileName, segmentWriteState.context); + final var jVectorIndexWriter = new JVectorIndexWriter(indexOutput) + ) { + // Header for the field data file + CodecUtil.writeIndexHeader( + indexOutput, + JVectorFormat.VECTOR_INDEX_CODEC_NAME, + JVectorFormat.VERSION_CURRENT, + segmentWriteState.segmentInfo.getId(), + segmentWriteState.segmentSuffix + ); + final long startOffset = indexOutput.getFilePointer(); + + log.info("Writing graph to {}", vectorIndexFieldFileName); + var resultBuilder = VectorIndexFieldMetadata.builder() + .fieldNumber(fieldInfo.number) + .vectorEncoding(fieldInfo.getVectorEncoding()) + .vectorSimilarityFunction(fieldInfo.getVectorSimilarityFunction()) + .vectorDimension(randomAccessVectorValues.dimension()) + .graphNodeIdToDocMap(graphNodeIdToDocMap); + + try ( + var writer = new OnDiskSequentialGraphIndexWriter.Builder(graph, jVectorIndexWriter).with( + new InlineVectors(randomAccessVectorValues.dimension()) + ).build() + ) { + var suppliers = Feature.singleStateFactory( + FeatureId.INLINE_VECTORS, + nodeId -> new InlineVectors.State(randomAccessVectorValues.getVector(newToOldOrds[nodeId])) + ); + writer.write(suppliers); + long endGraphOffset = jVectorIndexWriter.position(); + resultBuilder.vectorIndexOffset(startOffset); + resultBuilder.vectorIndexLength(endGraphOffset - startOffset); + + // If PQ is enabled and we have enough vectors, write the PQ codebooks and compressed vectors + if (pqVectors != null) { + log.info( + "Writing PQ codebooks and vectors for field {} since the size is {} >= {}", + fieldInfo.name, + randomAccessVectorValues.size(), + minimumBatchSizeForQuantization + ); + resultBuilder.pqCodebooksAndVectorsOffset(endGraphOffset); + // write the compressed vectors and codebooks to disk + pqVectors.write(jVectorIndexWriter); + resultBuilder.pqCodebooksAndVectorsLength(jVectorIndexWriter.position() - endGraphOffset); + } else { + resultBuilder.pqCodebooksAndVectorsOffset(0); + resultBuilder.pqCodebooksAndVectorsLength(0); + } + CodecUtil.writeFooter(indexOutput); + } + + return resultBuilder.build(); + } + } + + private PQVectors getPQVectors(int[] newToOldOrds, RandomAccessVectorValues randomAccessVectorValues, FieldInfo fieldInfo) + throws IOException { + final String fieldName = fieldInfo.name; + final VectorSimilarityFunction vectorSimilarityFunction = fieldInfo.getVectorSimilarityFunction(); + log.info("Computing PQ codebooks for field {} for {} vectors", fieldName, randomAccessVectorValues.size()); + final long start = Clock.systemDefaultZone().millis(); + final var M = numberOfSubspacesPerVectorSupplier.apply(randomAccessVectorValues.dimension()); + final var numberOfClustersPerSubspace = Math.min(256, randomAccessVectorValues.size()); // number of centroids per + // subspace + ProductQuantization pq = ProductQuantization.compute( + randomAccessVectorValues, + M, // number of subspaces + numberOfClustersPerSubspace, // number of centroids per subspace + vectorSimilarityFunction == VectorSimilarityFunction.EUCLIDEAN, // center the dataset + UNWEIGHTED, + SIMD_POOL_MERGE, + ForkJoinPool.commonPool() + ); + + final long end = Clock.systemDefaultZone().millis(); + final long trainingTime = end - start; + log.info("Computed PQ codebooks for field {}, in {} millis", fieldName, trainingTime); + KNNCounter.KNN_QUANTIZATION_TRAINING_TIME.add(trainingTime); + log.info("Encoding and building PQ vectors for field {} for {} 
vectors", fieldName, randomAccessVectorValues.size()); + // PQVectors pqVectors = pq.encodeAll(randomAccessVectorValues, SIMD_POOL); + PQVectors pqVectors = PQVectors.encodeAndBuild(pq, newToOldOrds.length, newToOldOrds, randomAccessVectorValues, SIMD_POOL_MERGE); + log.info( + "Encoded and built PQ vectors for field {}, original size: {} bytes, compressed size: {} bytes", + fieldName, + pqVectors.getOriginalSize(), + pqVectors.getCompressedSize() + ); + return pqVectors; + } + + @Value + @Builder(toBuilder = true) + @AllArgsConstructor + public static class VectorIndexFieldMetadata { + int fieldNumber; + VectorEncoding vectorEncoding; + VectorSimilarityFunction vectorSimilarityFunction; + int vectorDimension; + long vectorIndexOffset; + long vectorIndexLength; + long pqCodebooksAndVectorsOffset; + long pqCodebooksAndVectorsLength; + float degreeOverflow; // important when leveraging cache + GraphNodeIdToDocMap graphNodeIdToDocMap; + + public void toOutput(IndexOutput out) throws IOException { + out.writeInt(fieldNumber); + out.writeInt(vectorEncoding.ordinal()); + out.writeInt(JVectorReader.VectorSimilarityMapper.distFuncToOrd(vectorSimilarityFunction)); + out.writeVInt(vectorDimension); + out.writeVLong(vectorIndexOffset); + out.writeVLong(vectorIndexLength); + out.writeVLong(pqCodebooksAndVectorsOffset); + out.writeVLong(pqCodebooksAndVectorsLength); + out.writeInt(Float.floatToIntBits(degreeOverflow)); + graphNodeIdToDocMap.toOutput(out); + } + + public VectorIndexFieldMetadata(IndexInput in) throws IOException { + this.fieldNumber = in.readInt(); + this.vectorEncoding = readVectorEncoding(in); + this.vectorSimilarityFunction = JVectorReader.VectorSimilarityMapper.ordToLuceneDistFunc(in.readInt()); + this.vectorDimension = in.readVInt(); + this.vectorIndexOffset = in.readVLong(); + this.vectorIndexLength = in.readVLong(); + this.pqCodebooksAndVectorsOffset = in.readVLong(); + this.pqCodebooksAndVectorsLength = in.readVLong(); + this.degreeOverflow = Float.intBitsToFloat(in.readInt()); + this.graphNodeIdToDocMap = new GraphNodeIdToDocMap(in); + } + + } + + @Override + public void finish() throws IOException { + log.info("Finishing segment {}", segmentWriteState.segmentInfo.name); + if (finished) { + throw new IllegalStateException("already finished"); + } + finished = true; + + if (meta != null) { + // write end of fields marker + meta.writeInt(-1); + CodecUtil.writeFooter(meta); + } + + if (vectorIndex != null) { + CodecUtil.writeFooter(vectorIndex); + } + + } + + @Override + public void close() throws IOException { + IOUtils.close(meta, vectorIndex); + } + + @Override + public long ramBytesUsed() { + long total = SHALLOW_RAM_BYTES_USED; + for (FieldWriter field : fields) { + // the field tracks the delegate field usage + total += field.ramBytesUsed(); + } + return total; + } + + /** + * The FieldWriter class is responsible for writing vector field data into index segments. + * It provides functionality to process vector values as those being added, manage memory usage, and build HNSW graph + * indexing structures for efficient retrieval during search queries. + * + * @param The type of vector value to be handled by the writer. + * This is often specialized to support specific implementations, such as float[] or byte[] vectors. 
+ */ + static class FieldWriter extends KnnFieldVectorsWriter { + private static final VectorTypeSupport VECTOR_TYPE_SUPPORT = VectorizationProvider.getInstance().getVectorTypeSupport(); + private final long SHALLOW_SIZE = RamUsageEstimator.shallowSizeOfInstance(FieldWriter.class); + @Getter + private final FieldInfo fieldInfo; + private int lastDocID = -1; + private final String segmentName; + private final RandomAccessVectorValues randomAccessVectorValues; + // The ordering of docIds matches the ordering of vectors, the index in this list corresponds to the jVector ordinal + private final List> vectors = new ArrayList<>(); + private final List docIds = new ArrayList<>(); + + FieldWriter(FieldInfo fieldInfo, String segmentName) { + /** + * For creating a new field from a flat field vectors writer. + */ + this.randomAccessVectorValues = new ListRandomAccessVectorValues(vectors, fieldInfo.getVectorDimension()); + this.fieldInfo = fieldInfo; + this.segmentName = segmentName; + } + + @Override + public void addValue(int docID, T vectorValue) throws IOException { + log.trace("Adding value {} to field {} in segment {}", vectorValue, fieldInfo.name, segmentName); + if (docID == lastDocID) { + throw new IllegalArgumentException( + "VectorValuesField \"" + + fieldInfo.name + + "\" appears more than once in this document (only one value is allowed per field)" + ); + } + docIds.add(docID); + if (vectorValue instanceof float[]) { + vectors.add(VECTOR_TYPE_SUPPORT.createFloatVector(vectorValue)); + } else if (vectorValue instanceof byte[]) { + final String errorMessage = "byte[] vectors are not supported in JVector. " + + "Instead you should only use float vectors and leverage product quantization during indexing." + + "This can provides much greater savings in storage and memory"; + log.error("{}", errorMessage); + throw new UnsupportedOperationException(errorMessage); + } else { + throw new IllegalArgumentException("Unsupported vector type: " + vectorValue.getClass()); + } + + lastDocID = docID; + } + + @Override + public T copyValue(T vectorValue) { + throw new UnsupportedOperationException("copyValue not supported"); + } + + @Override + public long ramBytesUsed() { + return SHALLOW_SIZE + (long) vectors.size() * fieldInfo.getVectorDimension() * Float.BYTES; + } + + } + + static io.github.jbellis.jvector.vector.VectorSimilarityFunction getVectorSimilarityFunction(FieldInfo fieldInfo) { + log.info("Matching vector similarity function {} for field {}", fieldInfo.getVectorSimilarityFunction(), fieldInfo.name); + return switch (fieldInfo.getVectorSimilarityFunction()) { + case EUCLIDEAN -> io.github.jbellis.jvector.vector.VectorSimilarityFunction.EUCLIDEAN; + case COSINE -> io.github.jbellis.jvector.vector.VectorSimilarityFunction.COSINE; + case DOT_PRODUCT -> io.github.jbellis.jvector.vector.VectorSimilarityFunction.DOT_PRODUCT; + default -> throw new IllegalArgumentException("Unsupported similarity function: " + fieldInfo.getVectorSimilarityFunction()); + }; + } + + /** + * Implementation of RandomAccessVectorValues that directly uses the source + * FloatVectorValues from multiple segments without copying the vectors. + * + * Some details about the implementation logic: + * + * First, we identify the leading reader, which is the one with the most live vectors. + * Second, we build a mapping between the ravv ordinals and the reader index and the ordinal in that reader. + * Third, we build a mapping between the ravv ordinals and the global doc ids. 
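+     * <p>
+     * For illustration only (hypothetical sizes): merging two readers holding 100 and 40 vectors yields a unified ordinal
+     * space of 140 entries, and {@code ravvOrdToReaderMapping[ord]} resolves each unified ordinal back to its
+     * (reader index, reader ordinal) pair so the vector can be read directly from its source reader without copying.
+     * </p>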
+ * + * Very important to note that for the leading graph the node Ids need to correspond to their original ravv ordinals in the reader. + * This is because we are later going to expand that graph with new vectors from the other readers. + * While the new vectors can be assigned arbitrary node Ids, the leading graph needs to preserve its original node Ids and map them to the original ravv vector ordinals. + */ + class RandomAccessMergedFloatVectorValues implements RandomAccessVectorValues { + private static final int READER_ID = 0; + private static final int READER_ORD = 1; + private static final int LEADING_READER_IDX = 0; + + private final VectorTypeSupport VECTOR_TYPE_SUPPORT = VectorizationProvider.getInstance().getVectorTypeSupport(); + + // Array of sub-readers + private final KnnVectorsReader[] readers; + private final JVectorFloatVectorValues[] perReaderFloatVectorValues; + + // Maps the ravv ordinals to the reader index and the ordinal in that reader. This is allowing us to get a unified view of all the + // vectors in all the readers with a single unified ordinal space. + private final int[][] ravvOrdToReaderMapping; + + // Total number of vectors + private final int size; + // Total number of documents including those without values + private final int totalDocsCount; + + // Vector dimension + private final int dimension; + private final FieldInfo fieldInfo; + private final MergeState mergeState; + private final GraphNodeIdToDocMap graphNodeIdToDocMap; + private final int[] graphNodeIdsToRavvOrds; + private boolean deletesFound = false; + + /** + * Creates a random access view over merged float vector values. + * + * @param fieldInfo Field info for the vector field + * @param mergeState Merge state containing readers and doc maps + */ + public RandomAccessMergedFloatVectorValues(FieldInfo fieldInfo, MergeState mergeState) throws IOException { + this.totalDocsCount = Math.toIntExact(Arrays.stream(mergeState.maxDocs).asLongStream().sum()); + this.fieldInfo = fieldInfo; + this.mergeState = mergeState; + + final String fieldName = fieldInfo.name; + + // Count total vectors, collect readers and identify leading reader, collect base ordinals to later be used to build the mapping + // between global ordinals and global lucene doc ids + int totalVectorsCount = 0; + int totalLiveVectorsCount = 0; + int dimension = 0; + int tempLeadingReaderIdx = -1; + int vectorsCountInLeadingReader = -1; + List allReaders = new ArrayList<>(); + final MergeState.DocMap[] docMaps = mergeState.docMaps.clone(); + final Bits[] liveDocs = mergeState.liveDocs.clone(); + final int[] baseOrds = new int[mergeState.knnVectorsReaders.length]; + final int[] deletedOrds = new int[mergeState.knnVectorsReaders.length]; // counts the number of deleted documents in each reader + // that previously had a vector + + // Find the leading reader, count the total number of live vectors, and the base ordinals for each reader + for (int i = 0; i < mergeState.knnVectorsReaders.length; i++) { + FieldInfos fieldInfos = mergeState.fieldInfos[i]; + baseOrds[i] = totalVectorsCount; + if (MergedVectorValues.hasVectorValues(fieldInfos, fieldName)) { + KnnVectorsReader reader = mergeState.knnVectorsReaders[i]; + if (reader != null) { + FloatVectorValues values = reader.getFloatVectorValues(fieldName); + if (values != null) { + allReaders.add(reader); + int vectorCountInReader = values.size(); + int liveVectorCountInReader = 0; + KnnVectorValues.DocIndexIterator it = values.iterator(); + while (it.nextDoc() != 
DocIdSetIterator.NO_MORE_DOCS) { + if (liveDocs[i] == null || liveDocs[i].get(it.docID())) { + liveVectorCountInReader++; + } else { + deletedOrds[i]++; + deletesFound = true; + } + } + if (liveVectorCountInReader >= vectorsCountInLeadingReader) { + vectorsCountInLeadingReader = liveVectorCountInReader; + tempLeadingReaderIdx = i; + } + totalVectorsCount += vectorCountInReader; + totalLiveVectorsCount += liveVectorCountInReader; + dimension = Math.max(dimension, values.dimension()); + } + } + } + } + + assert (totalVectorsCount <= totalDocsCount) : "Total number of vectors exceeds the total number of documents"; + assert (totalLiveVectorsCount <= totalVectorsCount) : "Total number of live vectors exceeds the total number of vectors"; + assert (dimension > 0) : "No vectors found for field " + fieldName; + + this.size = totalVectorsCount; + this.readers = new KnnVectorsReader[allReaders.size()]; + for (int i = 0; i < readers.length; i++) { + readers[i] = allReaders.get(i); + } + + // always swap the leading reader to the first position + // For this part we need to make sure we also swap all the other metadata arrays that are indexed by reader index + // Such as readers, docMaps, liveDocs, baseOrds, deletedOrds + if (tempLeadingReaderIdx != 0) { + final KnnVectorsReader temp = readers[LEADING_READER_IDX]; + readers[LEADING_READER_IDX] = readers[tempLeadingReaderIdx]; + readers[tempLeadingReaderIdx] = temp; + // also swap the leading doc map to the first position to match the readers + final MergeState.DocMap tempDocMap = docMaps[LEADING_READER_IDX]; + docMaps[LEADING_READER_IDX] = docMaps[tempLeadingReaderIdx]; + docMaps[tempLeadingReaderIdx] = tempDocMap; + // swap base ords + final int tempBaseOrd = baseOrds[LEADING_READER_IDX]; + baseOrds[LEADING_READER_IDX] = baseOrds[tempLeadingReaderIdx]; + baseOrds[tempLeadingReaderIdx] = tempBaseOrd; + } + + this.perReaderFloatVectorValues = new JVectorFloatVectorValues[readers.length]; + this.dimension = dimension; + + // Build mapping from global ordinal to [readerIndex, readerOrd] + this.ravvOrdToReaderMapping = new int[totalDocsCount][2]; + + int documentsIterated = 0; + + // Will be used to build the new graphNodeIdToDocMap with the new graph node id to docId mapping. + // This mapping should not be used to access the vectors at any time during construction, but only after the merge is complete + // and the new segment is created and used by searchers. + final int[] graphNodeIdToDocIds = new int[totalLiveVectorsCount]; + this.graphNodeIdsToRavvOrds = new int[totalLiveVectorsCount]; + + int graphNodeId = 0; + if (deletesFound) { + // If there are deletes, we need to build a new graph from scratch and compact the graph node ids + // TODO: remove this logic once we support incremental graph building with deletes see + // https://github.com/opensearch-project/opensearch-jvector/issues/171 + for (int readerIdx = 0; readerIdx < readers.length; readerIdx++) { + final JVectorFloatVectorValues values = (JVectorFloatVectorValues) readers[readerIdx].getFloatVectorValues(fieldName); + perReaderFloatVectorValues[readerIdx] = values; + // For each vector in this reader + KnnVectorValues.DocIndexIterator it = values.iterator(); + + for (int docId = it.nextDoc(); docId != DocIdSetIterator.NO_MORE_DOCS; docId = it.nextDoc()) { + if (docMaps[readerIdx].get(docId) == -1) { + log.warn( + "Document {} in reader {} is not mapped to a global ordinal from the merge docMaps. 
Will skip this document for now", + docId, + readerIdx + ); + } else { + // Mapping from ravv ordinals to [readerIndex, readerOrd] + // Map graph node id to ravv ordinal + // Map graph node id to doc id + final int newGlobalDocId = docMaps[readerIdx].get(docId); + final int ravvLocalOrd = it.index(); + final int ravvGlobalOrd = ravvLocalOrd + baseOrds[readerIdx]; + graphNodeIdToDocIds[graphNodeId] = newGlobalDocId; + graphNodeIdsToRavvOrds[graphNodeId] = ravvGlobalOrd; + graphNodeId++; + ravvOrdToReaderMapping[ravvGlobalOrd][READER_ID] = readerIdx; // Reader index + ravvOrdToReaderMapping[ravvGlobalOrd][READER_ORD] = ravvLocalOrd; // Ordinal in reader + } + + documentsIterated++; + } + } + } else { + // If there are no deletes, we can reuse the existing graph and simply remap the ravv ordinals to the new global doc ids + // for the leading reader we must preserve the original node Ids and map them to the corresponding ravv vectors originally + // used to build the graph + // This is necessary because we are later going to expand that graph with new vectors from the other readers. + // The leading reader is ALWAYS the first one in the readers array + final JVectorFloatVectorValues leadingReaderValues = (JVectorFloatVectorValues) readers[LEADING_READER_IDX] + .getFloatVectorValues(fieldName); + perReaderFloatVectorValues[LEADING_READER_IDX] = leadingReaderValues; + var leadingReaderIt = leadingReaderValues.iterator(); + for (int docId = leadingReaderIt.nextDoc(); docId != DocIdSetIterator.NO_MORE_DOCS; docId = leadingReaderIt.nextDoc()) { + final int newGlobalDocId = docMaps[LEADING_READER_IDX].get(docId); + if (newGlobalDocId == -1) { + log.warn( + "Document {} in reader {} is not mapped to a global ordinal from the merge docMaps. Will skip this document for now", + docId, + LEADING_READER_IDX + ); + } else { + final int ravvLocalOrd = leadingReaderIt.index(); + final int ravvGlobalOrd = ravvLocalOrd + baseOrds[LEADING_READER_IDX]; + graphNodeIdToDocIds[ravvLocalOrd] = newGlobalDocId; + graphNodeIdsToRavvOrds[ravvLocalOrd] = ravvGlobalOrd; + graphNodeId++; + ravvOrdToReaderMapping[ravvGlobalOrd][READER_ID] = LEADING_READER_IDX; // Reader index + ravvOrdToReaderMapping[ravvGlobalOrd][READER_ORD] = ravvLocalOrd; // Ordinal in reader + } + + documentsIterated++; + } + + // For the remaining readers we map the graph node id to the ravv ordinal in the order they appear + for (int readerIdx = 1; readerIdx < readers.length; readerIdx++) { + final JVectorFloatVectorValues values = (JVectorFloatVectorValues) readers[readerIdx].getFloatVectorValues(fieldName); + perReaderFloatVectorValues[readerIdx] = values; + // For each vector in this reader + KnnVectorValues.DocIndexIterator it = values.iterator(); + + for (int docId = it.nextDoc(); docId != DocIdSetIterator.NO_MORE_DOCS; docId = it.nextDoc()) { + if (docMaps[readerIdx].get(docId) == -1) { + log.warn( + "Document {} in reader {} is not mapped to a global ordinal from the merge docMaps. 
Will skip this document for now", + docId, + readerIdx + ); + } else { + // Mapping from ravv ordinals to [readerIndex, readerOrd] + // Map graph node id to ravv ordinal + // Map graph node id to doc id + final int newGlobalDocId = docMaps[readerIdx].get(docId); + final int ravvLocalOrd = it.index(); + final int ravvGlobalOrd = ravvLocalOrd + baseOrds[readerIdx]; + graphNodeIdToDocIds[graphNodeId] = newGlobalDocId; + graphNodeIdsToRavvOrds[graphNodeId] = ravvGlobalOrd; + graphNodeId++; + ravvOrdToReaderMapping[ravvGlobalOrd][READER_ID] = readerIdx; // Reader index + ravvOrdToReaderMapping[ravvGlobalOrd][READER_ORD] = ravvLocalOrd; // Ordinal in reader + } + + documentsIterated++; + } + } + } + + if (documentsIterated < totalVectorsCount) { + throw new IllegalStateException( + "More documents were expected than what was found in the readers." + + "Expected at least number of total vectors: " + + totalVectorsCount + + " but found only: " + + documentsIterated + + " documents." + ); + } + + this.graphNodeIdToDocMap = new GraphNodeIdToDocMap(graphNodeIdToDocIds); + log.debug("Created RandomAccessMergedFloatVectorValues with {} total vectors from {} readers", size, readers.length); + + } + + /** + * Merges the float vector values from multiple readers into a unified structure. + * This process includes handling product quantization (PQ) for vector compression, + * generating ord-to-doc mappings, and writing the merged index into a new segment file. + *

+ * The method determines if pre-existing product quantization codebooks are available + * from the leading reader. If available, it refines them using remaining vectors + * from other readers in the merge. If no pre-existing codebooks are found and + * the total vector count meets the required minimum threshold, new codebooks + * and compressed vectors are computed. Otherwise, no PQ compression is applied. + *

+ * Also, it generates a mapping of ordinals to document IDs by iterating through + * the provided vector data, which is further used to write the field data. + *
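+ * <p>
+ * For illustration only (hypothetical threshold): with no pre-existing codebooks, a merge of 2,000 vectors against a
+ * quantization threshold of 1,024 trains new codebooks, while a merge of only 500 vectors keeps full-precision scoring.
+ * </p>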

+ * In the event of no deletes or quantization, the graph construction is done by incrementally adding vectors from smaller segments into the largest segment. + * For all other cases, we build a new graph from scratch from all the vectors. + * + * TODO: Add support for incremental graph building with quantization see issue + * + * @throws IOException if there is an issue during reading or writing vector data. + */ + public void merge() throws IOException { + // This section creates the PQVectors to be used for this merge + // Get PQ compressor for leading reader + final int totalVectorsCount = size; + final String fieldName = fieldInfo.name; + final PQVectors pqVectors; + final OnHeapGraphIndex graph; + // Get the leading reader + PerFieldKnnVectorsFormat.FieldsReader fieldsReader = (PerFieldKnnVectorsFormat.FieldsReader) readers[LEADING_READER_IDX]; + JVectorReader leadingReader = (JVectorReader) fieldsReader.getFieldReader(fieldName); + final BuildScoreProvider buildScoreProvider; + // Check if the leading reader has pre-existing PQ codebooks and if so, refine them with the remaining vectors + if (leadingReader.getProductQuantizationForField(fieldInfo.name).isEmpty()) { + // No pre-existing codebooks, check if we have enough vectors to trigger quantization + log.info( + "No Pre-existing PQ codebooks found in this merge for field {} in segment {}, will check if a new codebooks is necessary", + fieldName, + mergeState.segmentInfo.name + ); + if (this.size() >= minimumBatchSizeForQuantization) { + log.info( + "Calculating new codebooks and compressed vectors for field: {}, with totalVectorCount: {}, above minimumBatchSizeForQuantization: {}", + fieldName, + totalVectorsCount, + minimumBatchSizeForQuantization + ); + pqVectors = getPQVectors(graphNodeIdsToRavvOrds, this, fieldInfo); + } else { + log.info( + "Not enough vectors found for field: {}, totalVectorCount: {}, is below minimumBatchSizeForQuantization: {}", + fieldName, + totalVectorsCount, + minimumBatchSizeForQuantization + ); + pqVectors = null; + } + } else { + log.info( + "Pre-existing PQ codebooks found in this merge for field {} in segment {}, will refine the codebooks from the leading reader with the remaining vectors", + fieldName, + mergeState.segmentInfo.name + ); + final long start = Clock.systemDefaultZone().millis(); + ProductQuantization leadingCompressor = leadingReader.getProductQuantizationForField(fieldName).get(); + // Refine the leadingCompressor with the remaining vectors in the merge, we skip the leading reader since it's already been + // used to create the leadingCompressor + // We assume the leading reader is ALWAYS the first one in the readers array + for (int i = LEADING_READER_IDX + 1; i < readers.length; i++) { + final FloatVectorValues values = readers[i].getFloatVectorValues(fieldName); + final RandomAccessVectorValues randomAccessVectorValues = new RandomAccessVectorValuesOverVectorValues(values); + leadingCompressor.refine(randomAccessVectorValues); + } + final long end = Clock.systemDefaultZone().millis(); + final long trainingTime = end - start; + log.info("Refined PQ codebooks for field {}, in {} millis", fieldName, trainingTime); + KNNCounter.KNN_QUANTIZATION_TRAINING_TIME.add(trainingTime); + pqVectors = PQVectors.encodeAndBuild( + leadingCompressor, + graphNodeIdsToRavvOrds.length, + graphNodeIdsToRavvOrds, + this, + SIMD_POOL_MERGE + ); + } + + if (pqVectors == null) { + buildScoreProvider = BuildScoreProvider.randomAccessScoreProvider( + this, + graphNodeIdsToRavvOrds, + 
getVectorSimilarityFunction(fieldInfo) + ); + // graph = getGraph(buildScoreProvider, this, newToOldOrds, fieldInfo, segmentWriteState.segmentInfo.name); + if (!deletesFound) { + final String segmentName = segmentWriteState.segmentInfo.name; + log.info( + "No deletes found, and no PQ codebooks found, expanding previous graph with additional vectors for field {} in segment {}", + fieldName, + segmentName + ); + final RandomAccessReader leadingOnHeapGraphReader = leadingReader.getNeighborsScoreCacheForField(fieldName); + final int numBaseVectors = leadingReader.getFloatVectorValues(fieldName).size(); + graph = (OnHeapGraphIndex) GraphIndexBuilder.buildAndMergeNewNodes( + leadingOnHeapGraphReader, + this, + buildScoreProvider, + numBaseVectors, + graphNodeIdsToRavvOrds, + beamWidth, + degreeOverflow, + alpha, + hierarchyEnabled + ); + } else { + log.info("Deletes found, and no PQ codebooks found, building new graph from scratch"); + graph = getGraph( + buildScoreProvider, + this, + graphNodeIdsToRavvOrds, + fieldInfo, + segmentWriteState.segmentInfo.name, + SIMD_POOL_MERGE + ); + } + } else { + log.info("PQ codebooks found, building graph from scratch with PQ vectors"); + buildScoreProvider = BuildScoreProvider.pqBuildScoreProvider(getVectorSimilarityFunction(fieldInfo), pqVectors); + // Pre-init the diversity provider here to avoid doing it lazily (as it could block the SIMD threads) + buildScoreProvider.diversityProviderFor(0); + graph = getGraph( + buildScoreProvider, + this, + graphNodeIdsToRavvOrds, + fieldInfo, + segmentWriteState.segmentInfo.name, + SIMD_POOL_MERGE + ); + } + + writeField(fieldInfo, this, pqVectors, graphNodeIdsToRavvOrds, graphNodeIdToDocMap, graph); + } + + @Override + public int size() { + return size; + } + + @Override + public int dimension() { + return dimension; + } + + @Override + public VectorFloat getVector(int ord) { + if (ord < 0 || ord >= totalDocsCount) { + throw new IllegalArgumentException("Ordinal out of bounds: " + ord); + } + + final int readerIdx = ravvOrdToReaderMapping[ord][READER_ID]; + final int readerOrd = ravvOrdToReaderMapping[ord][READER_ORD]; + + // Access to float values is not thread safe + synchronized (perReaderFloatVectorValues[readerIdx]) { + return perReaderFloatVectorValues[readerIdx].vectorFloatValue(readerOrd); + } + } + + @Override + public boolean isValueShared() { + return false; + } + + @Override + public RandomAccessVectorValues copy() { + throw new UnsupportedOperationException("Copy not supported"); + } + } + + /** + * This method will return the graph index for the field + * @return OnHeapGraphIndex + */ + public OnHeapGraphIndex getGraph( + BuildScoreProvider buildScoreProvider, + RandomAccessVectorValues randomAccessVectorValues, + int[] newToOldOrds, + FieldInfo fieldInfo, + String segmentName, + ForkJoinPool SIMD_POOL + ) { + final GraphIndexBuilder graphIndexBuilder = new GraphIndexBuilder( + buildScoreProvider, + fieldInfo.getVectorDimension(), + maxConn, + beamWidth, + degreeOverflow, + alpha, + hierarchyEnabled + ); + + /* + * We cannot always use randomAccessVectorValues for the graph building + * because it's size will not always correspond to the document count. + * To have the right mapping from docId to vector ordinal we need to use the mergedFloatVector. + * This is the case when we are merging segments and we might have more documents than vectors. 
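+ * For illustration only (hypothetical counts): a merge may carry 1000 documents of which only 800 have vectors;
+ * the graph is then built over 800 node ids and newToOldOrds redirects each graph ordinal to its vector ordinal.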
+ */ + final long start = Clock.systemDefaultZone().millis(); + final OnHeapGraphIndex graphIndex; + var vv = randomAccessVectorValues.threadLocalSupplier(); + + log.info("Building graph from merged float vector"); + // parallel graph construction from the merge documents Ids + SIMD_POOL.submit(() -> IntStream.range(0, newToOldOrds.length).parallel().forEach(ord -> { + graphIndexBuilder.addGraphNode(ord, vv.get().getVector(newToOldOrds[ord])); + })).join(); + graphIndexBuilder.cleanup(); + graphIndex = (OnHeapGraphIndex) graphIndexBuilder.getGraph(); + final long end = Clock.systemDefaultZone().millis(); + + log.info("Built graph for field {} in segment {} in {} millis", fieldInfo.name, segmentName, end - start); + return graphIndex; + } + + static class RandomAccessVectorValuesOverVectorValues implements RandomAccessVectorValues { + private final VectorTypeSupport VECTOR_TYPE_SUPPORT = VectorizationProvider.getInstance().getVectorTypeSupport(); + private final FloatVectorValues values; + + public RandomAccessVectorValuesOverVectorValues(FloatVectorValues values) { + this.values = values; + } + + @Override + public int size() { + return values.size(); + } + + @Override + public int dimension() { + return values.dimension(); + } + + @Override + public VectorFloat getVector(int nodeId) { + try { + // Access to float values is not thread safe + synchronized (this) { + final float[] vector = values.vectorValue(nodeId); + final float[] copy = new float[vector.length]; + System.arraycopy(vector, 0, copy, 0, vector.length); + return VECTOR_TYPE_SUPPORT.createFloatVector(copy); + } + } catch (IOException e) { + log.error("Error retrieving vector at ordinal {}", nodeId, e); + throw new RuntimeException(e); + } + } + + @Override + public boolean isValueShared() { + return false; + } + + @Override + public RandomAccessVectorValues copy() { + throw new UnsupportedOperationException("Copy not supported"); + } + } + +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/package-info.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/package-info.java new file mode 100644 index 000000000000..5f05b040c88a --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/package-info.java @@ -0,0 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * This package contains the implementation of the JVector codec, a Lucene codec for approximate + * nearest neighbor search using vector quantization and HNSW graph indexing. It is based on the + * OpenSearch JVector codec and optimized for Lucene. 
+ */ +package org.apache.lucene.sandbox.codecs.jvector; diff --git a/lucene/sandbox/src/resources/META-INF/services/org.apache.lucene.codecs.KnnVectorsFormat b/lucene/sandbox/src/resources/META-INF/services/org.apache.lucene.codecs.KnnVectorsFormat index 29a44d2ecfa8..84f11e50fd0a 100644 --- a/lucene/sandbox/src/resources/META-INF/services/org.apache.lucene.codecs.KnnVectorsFormat +++ b/lucene/sandbox/src/resources/META-INF/services/org.apache.lucene.codecs.KnnVectorsFormat @@ -14,3 +14,4 @@ # limitations under the License. org.apache.lucene.sandbox.codecs.faiss.FaissKnnVectorsFormat +org.apache.lucene.sandbox.codecs.jvector.JVectorFormat diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/jvector/KNNJVectorTests.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/jvector/KNNJVectorTests.java new file mode 100644 index 000000000000..899663214405 --- /dev/null +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/jvector/KNNJVectorTests.java @@ -0,0 +1,1557 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.knn.index.codec.jvector; + +import com.carrotsearch.randomizedtesting.annotations.ThreadLeakFilters; +import lombok.extern.log4j.Log4j2; +import org.apache.lucene.document.*; +import org.apache.lucene.index.*; +import org.apache.lucene.search.*; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FSDirectory; +import org.apache.lucene.tests.index.RandomIndexWriter; +import org.apache.lucene.tests.util.LuceneTestCase; +import org.junit.Assert; +import org.junit.Test; +import org.opensearch.knn.TestUtils; +import org.opensearch.knn.common.KNNConstants; +import org.opensearch.knn.index.ThreadLeakFiltersForTests; +import org.opensearch.knn.plugin.stats.KNNCounter; + +import java.io.IOException; +import java.nio.file.Path; +import java.util.*; +import java.util.concurrent.*; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicInteger; + +import static org.opensearch.knn.common.KNNConstants.DEFAULT_MINIMUM_BATCH_SIZE_FOR_QUANTIZATION; +import static org.opensearch.knn.index.engine.CommonTestUtils.getCodec; + +/** + * Test used specifically for JVector + */ +// Currently {@link IndexGraphBuilder} is using the default ForkJoinPool.commonPool() which is not being shutdown. +// Ignore thread leaks until we remove the ForkJoinPool.commonPool() usage from IndexGraphBuilder +// TODO: Wire the execution thread pool to {@link IndexGraphBuilder} to avoid the failure of the UT due to leaked thread pool warning. +@ThreadLeakFilters(defaultFilters = true, filters = { ThreadLeakFiltersForTests.class }) +@LuceneTestCase.SuppressSysoutChecks(bugUrl = "") +@Log4j2 +public class KNNJVectorTests extends LuceneTestCase { + private static final String TEST_FIELD = "test_field"; + private static final String TEST_ID_FIELD = "id"; + + /** + * Test to verify that the JVector codec is able to successfully search for the nearest neighbours + * in the index. + * Single field is used to store the vectors. + * All the documents are stored in a single segment. + * Single commit without refreshing the index. + * No merge. 
+ */ + @Test + public void testJVectorKnnIndex_simpleCase() throws IOException { + int k = 3; // The number of nearest neighbors to gather + int totalNumberOfDocs = 10; + IndexWriterConfig indexWriterConfig = LuceneTestCase.newIndexWriterConfig(); + indexWriterConfig.setUseCompoundFile(false); + indexWriterConfig.setCodec(getCodec()); + indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy()); + final Path indexPath = createTempDir(); + log.info("Index path: {}", indexPath); + try (FSDirectory dir = FSDirectory.open(indexPath); IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { + final float[] target = new float[] { 0.0f, 0.0f }; + for (int i = 1; i < totalNumberOfDocs + 1; i++) { + final float[] source = new float[] { 0.0f, 1.0f / i }; + final Document doc = new Document(); + doc.add(new KnnFloatVectorField("test_field", source, VectorSimilarityFunction.EUCLIDEAN)); + w.addDocument(doc); + } + log.info("Flushing docs to make them discoverable on the file system"); + w.commit(); + + try (IndexReader reader = DirectoryReader.open(w)) { + log.info("We should now have a single segment with 10 documents"); + Assert.assertEquals(1, reader.getContext().leaves().size()); + Assert.assertEquals(totalNumberOfDocs, reader.numDocs()); + + final Query filterQuery = new MatchAllDocsQuery(); + final IndexSearcher searcher = newSearcher(reader); + KnnFloatVectorQuery knnFloatVectorQuery = getJVectorKnnFloatVectorQuery("test_field", target, k, filterQuery); + TopDocs topDocs = searcher.search(knnFloatVectorQuery, k); + assertEquals(k, topDocs.totalHits.value()); + assertEquals(9, topDocs.scoreDocs[0].doc); + Assert.assertEquals( + VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] { 0.0f, 1.0f / 10.0f }), + topDocs.scoreDocs[0].score, + 0.001f + ); + assertEquals(8, topDocs.scoreDocs[1].doc); + Assert.assertEquals( + VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] { 0.0f, 1.0f / 9.0f }), + topDocs.scoreDocs[1].score, + 0.001f + ); + assertEquals(7, topDocs.scoreDocs[2].doc); + Assert.assertEquals( + VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] { 0.0f, 1.0f / 8.0f }), + topDocs.scoreDocs[2].score, + 0.001f + ); + log.info("successfully completed search tests"); + } + } + log.info("successfully closed directory"); + } + + /** + * Test the scenario when not all documents are populated with the vector field + */ + public void testMissing_fields() throws IOException { + final int k = 3; // The number of nearest neighbors to gather + final int totalNumberOfDocs = 10; + final VectorSimilarityFunction vectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN; + IndexWriterConfig indexWriterConfig = LuceneTestCase.newIndexWriterConfig(); + indexWriterConfig.setUseCompoundFile(false); + indexWriterConfig.setCodec(getCodec()); + indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy()); + final Path indexPath = createTempDir(); + log.info("Index path: {}", indexPath); + try (FSDirectory dir = FSDirectory.open(indexPath); IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { + final float[] target = new float[] { 0.0f, 0.0f }; + for (int i = 0; i < totalNumberOfDocs; i++) { + final Document doc = new Document(); + if (i % 2 == 0) { + final float[] source = new float[] { 0.0f, i }; + doc.add(new KnnFloatVectorField("test_field", source, vectorSimilarityFunction)); + } + doc.add(new IntField(TEST_ID_FIELD, i, Field.Store.YES)); + w.addDocument(doc); + } + log.info("Flushing docs to make them discoverable on the file system"); + w.commit(); 
+ + try (IndexReader reader = DirectoryReader.open(w)) { + log.info("We should now have a single segment with 10 documents"); + Assert.assertEquals(1, reader.getContext().leaves().size()); + Assert.assertEquals(totalNumberOfDocs, reader.numDocs()); + + final Query filterQuery = new MatchAllDocsQuery(); + final IndexSearcher searcher = newSearcher(reader); + KnnFloatVectorQuery knnFloatVectorQuery = getJVectorKnnFloatVectorQuery("test_field", target, k, filterQuery); + TopDocs topDocs = searcher.search(knnFloatVectorQuery, k); + assertEquals(k, topDocs.totalHits.value()); + assertEquals(0, topDocs.scoreDocs[0].doc); + Assert.assertEquals( + VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] { 0.0f, 0.0f }), + topDocs.scoreDocs[0].score, + 0.001f + ); + assertEquals(2, topDocs.scoreDocs[1].doc); + Assert.assertEquals( + VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] { 0.0f, 2.0f }), + topDocs.scoreDocs[1].score, + 0.001f + ); + assertEquals(4, topDocs.scoreDocs[2].doc); + Assert.assertEquals( + VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] { 0.0f, 4.0f }), + topDocs.scoreDocs[2].score, + 0.001f + ); + log.info("successfully completed search tests"); + } + } + log.info("successfully closed directory"); + } + + /** + * Test the scenario when the index is sorted by a doc value + * We want to make sure the docIDs are correctly mapped to the jVector ordinals + * @throws IOException if an I/O error occurs + */ + public void test_sorted_index() throws IOException { + final int k = 3; // The number of nearest neighbors to gather + final int totalNumberOfDocs = 10; + final VectorSimilarityFunction vectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN; + final String sortFieldName = "sorted_field"; + IndexWriterConfig indexWriterConfig = LuceneTestCase.newIndexWriterConfig(); + indexWriterConfig.setUseCompoundFile(false); + indexWriterConfig.setCodec(getCodec()); + indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy()); + // Add index sorting configuration + indexWriterConfig.setIndexSort(new Sort(new SortField(sortFieldName, SortField.Type.INT, true))); // true = reverse order + + final Path indexPath = createTempDir(); + log.info("Index path: {}", indexPath); + try (FSDirectory dir = FSDirectory.open(indexPath); IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { + final float[] target = new float[] { 0.0f, 0.0f }; + for (int i = 0; i < totalNumberOfDocs; i++) { + final Document doc = new Document(); + final float[] source = new float[] { 0.0f, i }; + doc.add(new KnnFloatVectorField("test_field", source, vectorSimilarityFunction)); + doc.add(new IntField(TEST_ID_FIELD, i, Field.Store.YES)); + // Add the sortable field + doc.add(new NumericDocValuesField(sortFieldName, i)); + w.addDocument(doc); + } + log.info("Flushing docs to make them discoverable on the file system"); + w.commit(); + + try (IndexReader reader = DirectoryReader.open(w)) { + log.info("We should now have a single segment with 10 documents"); + Assert.assertEquals(1, reader.getContext().leaves().size()); + Assert.assertEquals(totalNumberOfDocs, reader.numDocs()); + + final Query filterQuery = new MatchAllDocsQuery(); + final IndexSearcher searcher = newSearcher(reader); + KnnFloatVectorQuery knnFloatVectorQuery = getJVectorKnnFloatVectorQuery("test_field", target, k, filterQuery); + TopDocs topDocs = searcher.search(knnFloatVectorQuery, k); + assertEquals(k, topDocs.totalHits.value()); + assertEquals(9, topDocs.scoreDocs[0].doc); + assertEquals(0, 
reader.storedFields().document(topDocs.scoreDocs[0].doc).getField(TEST_ID_FIELD).numericValue().intValue()); + Assert.assertEquals( + VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] { 0.0f, 0.0f }), + topDocs.scoreDocs[0].score, + 0.001f + ); + assertEquals(8, topDocs.scoreDocs[1].doc); + assertEquals(1, reader.storedFields().document(topDocs.scoreDocs[1].doc).getField(TEST_ID_FIELD).numericValue().intValue()); + Assert.assertEquals( + VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] { 0.0f, 1.0f }), + topDocs.scoreDocs[1].score, + 0.001f + ); + assertEquals(7, topDocs.scoreDocs[2].doc); + assertEquals(2, reader.storedFields().document(topDocs.scoreDocs[2].doc).getField(TEST_ID_FIELD).numericValue().intValue()); + Assert.assertEquals( + VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] { 0.0f, 2.0f }), + topDocs.scoreDocs[2].score, + 0.001f + ); + log.info("successfully completed search tests"); + } + } + log.info("successfully closed directory"); + } + + /** + * Test to verify that the JVector codec is able to successfully search for the nearest neighbours + * in the index. + * Single field is used to store the vectors. + * Documents are stored in a multiple segments. + * Multiple commits without refreshing the index. + * No merge. + */ + @Test + public void testJVectorKnnIndex_multipleSegments() throws IOException { + int k = 3; // The number of nearest neighbours to gather + int totalNumberOfDocs = 10; + IndexWriterConfig indexWriterConfig = LuceneTestCase.newIndexWriterConfig(); + indexWriterConfig.setUseCompoundFile(false); + indexWriterConfig.setCodec(getCodec()); + indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy(false)); + final Path indexPath = createTempDir(); + log.info("Index path: {}", indexPath); + try (FSDirectory dir = FSDirectory.open(indexPath); IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { + final float[] target = new float[] { 0.0f, 0.0f }; + for (int i = 1; i < totalNumberOfDocs + 1; i++) { + final float[] source = new float[] { 0.0f, 1.0f / i }; + final Document doc = new Document(); + doc.add(new KnnFloatVectorField("test_field", source, VectorSimilarityFunction.EUCLIDEAN)); + w.addDocument(doc); + w.commit(); // this creates a new segment + } + log.info("Done writing all files to the file system"); + + try (IndexReader reader = DirectoryReader.open(w)) { + log.info("We should now have 10 segments, each with a single document"); + Assert.assertEquals(10, reader.getContext().leaves().size()); + Assert.assertEquals(totalNumberOfDocs, reader.numDocs()); + final Query filterQuery = new MatchAllDocsQuery(); + final IndexSearcher searcher = newSearcher(reader); + KnnFloatVectorQuery knnFloatVectorQuery = new KnnFloatVectorQuery("test_field", target, k, filterQuery); + TopDocs topDocs = searcher.search(knnFloatVectorQuery, k); + assertEquals(k, topDocs.totalHits.value()); + assertEquals(9, topDocs.scoreDocs[0].doc); + Assert.assertEquals( + VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] { 0.0f, 1.0f / 10.0f }), + topDocs.scoreDocs[0].score, + 0.001f + ); + assertEquals(8, topDocs.scoreDocs[1].doc); + Assert.assertEquals( + VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] { 0.0f, 1.0f / 9.0f }), + topDocs.scoreDocs[1].score, + 0.001f + ); + assertEquals(7, topDocs.scoreDocs[2].doc); + Assert.assertEquals( + VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] { 0.0f, 1.0f / 8.0f }), + topDocs.scoreDocs[2].score, + 0.001f + ); + log.info("successfully completed search 
tests"); + } + } + } + + /** + * Test to verify that the JVector codec is able to successfully search for the nearest neighbours + * in the index. + * Single field is used to store the vectors. + * Documents are stored in a multiple segments. + * Multiple commits without refreshing the index. + * Merge is enabled. + */ + @Test + public void testJVectorKnnIndex_mergeEnabled() throws IOException { + int k = 3; // The number of nearest neighbours to gather + int totalNumberOfDocs = 10; + IndexWriterConfig indexWriterConfig = LuceneTestCase.newIndexWriterConfig(); + indexWriterConfig.setUseCompoundFile(false); + indexWriterConfig.setCodec(getCodec()); + indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy()); + indexWriterConfig.setMergeScheduler(new SerialMergeScheduler()); + final Path indexPath = createTempDir(); + log.info("Index path: {}", indexPath); + try (FSDirectory dir = FSDirectory.open(indexPath); IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { + final float[] target = new float[] { 0.0f, 0.0f }; + for (int i = 1; i < totalNumberOfDocs + 1; i++) { + final float[] source = new float[] { 0.0f, 1.0f * i }; + final Document doc = new Document(); + doc.add(new KnnFloatVectorField("test_field", source, VectorSimilarityFunction.EUCLIDEAN)); + doc.add(new StringField("my_doc_id", Integer.toString(i, 10), Field.Store.YES)); + w.addDocument(doc); + w.commit(); // this creates a new segment without triggering a merge + } + log.info("Done writing all files to the file system"); + + w.forceMerge(1); // this merges all segments into a single segment + log.info("Done merging all segments"); + try (IndexReader reader = DirectoryReader.open(w)) { + log.info("We should now have 1 segment with 10 documents"); + Assert.assertEquals(1, reader.getContext().leaves().size()); + Assert.assertEquals(totalNumberOfDocs, reader.numDocs()); + final Query filterQuery = new MatchAllDocsQuery(); + final IndexSearcher searcher = newSearcher(reader); + KnnFloatVectorQuery knnFloatVectorQuery = getJVectorKnnFloatVectorQuery("test_field", target, k, filterQuery); + TopDocs topDocs = searcher.search(knnFloatVectorQuery, k); + assertEquals(k, topDocs.totalHits.value()); + Document doc = reader.storedFields().document(topDocs.scoreDocs[0].doc); + assertEquals("1", doc.get("my_doc_id")); + Assert.assertEquals( + VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] { 0.0f, 1.0f }), + topDocs.scoreDocs[0].score, + 0.001f + ); + doc = reader.storedFields().document(topDocs.scoreDocs[1].doc); + assertEquals("2", doc.get("my_doc_id")); + Assert.assertEquals( + VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] { 0.0f, 2.0f }), + topDocs.scoreDocs[1].score, + 0.001f + ); + doc = reader.storedFields().document(topDocs.scoreDocs[2].doc); + assertEquals("3", doc.get("my_doc_id")); + Assert.assertEquals( + VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] { 0.0f, 3.0f }), + topDocs.scoreDocs[2].score, + 0.001f + ); + log.info("successfully completed search tests"); + } + } + } + + /** + * Test to verify that the jVector codec is able to successfully search for the nearest neighbors + * in the index. + * Single field is used to store the vectors. + * Documents are stored in potentially multiple segments. + * Multiple commits. + * Multiple merges. 
+ */ + @Test + public void multipleMerges() throws IOException { + int k = 3; // The number of nearest neighbours to gather + int totalNumberOfDocs = 10; + IndexWriterConfig indexWriterConfig = LuceneTestCase.newIndexWriterConfig(); + indexWriterConfig.setUseCompoundFile(false); + indexWriterConfig.setCodec(getCodec()); + indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy()); + indexWriterConfig.setMergeScheduler(new SerialMergeScheduler()); + final Path indexPath = createTempDir(); + final VectorSimilarityFunction vectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN; + log.info("Index path: {}", indexPath); + try (FSDirectory dir = FSDirectory.open(indexPath); IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { + final float[] target = new float[] { 0.0f, 0.0f }; + for (int i = 1; i < totalNumberOfDocs + 1; i++) { + final float[] source = new float[] { 0.0f, 1.0f * i }; + final Document doc = new Document(); + doc.add(new KnnFloatVectorField("test_field", source, vectorSimilarityFunction)); + doc.add(new StringField("my_doc_id", Integer.toString(i, 10), Field.Store.YES)); + w.addDocument(doc); + w.commit(); // this creates a new segment without triggering a merge + w.forceMerge(1); // this merges all segments into a single segment + } + log.info("Done writing all files to the file system"); + + w.forceMerge(1); // this merges all segments into a single segment + log.info("Done merging all segments"); + try (IndexReader reader = DirectoryReader.open(w)) { + log.info("We should now have 1 segment with 10 documents"); + Assert.assertEquals(1, reader.getContext().leaves().size()); + Assert.assertEquals(totalNumberOfDocs, reader.numDocs()); + final Query filterQuery = new MatchAllDocsQuery(); + final IndexSearcher searcher = newSearcher(reader); + KnnFloatVectorQuery knnFloatVectorQuery = getJVectorKnnFloatVectorQuery("test_field", target, k, filterQuery); + TopDocs topDocs = searcher.search(knnFloatVectorQuery, k); + assertEquals(k, topDocs.totalHits.value()); + Document doc = reader.storedFields().document(topDocs.scoreDocs[0].doc); + assertEquals("1", doc.get("my_doc_id")); + Assert.assertEquals( + vectorSimilarityFunction.compare(target, new float[] { 0.0f, 1.0f }), + topDocs.scoreDocs[0].score, + 0.001f + ); + doc = reader.storedFields().document(topDocs.scoreDocs[1].doc); + assertEquals("2", doc.get("my_doc_id")); + Assert.assertEquals( + VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] { 0.0f, 2.0f }), + topDocs.scoreDocs[1].score, + 0.001f + ); + doc = reader.storedFields().document(topDocs.scoreDocs[2].doc); + assertEquals("3", doc.get("my_doc_id")); + Assert.assertEquals( + VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] { 0.0f, 3.0f }), + topDocs.scoreDocs[2].score, + 0.001f + ); + log.info("successfully completed search tests"); + } + } + } + + /** + * Test to verify that the jVector codec is able to successfully search for the nearest neighbours + * in the index. + * A Single field is used to store the vectors. + * Documents are stored in potentially multiple segments. + * Multiple commits. + * Multiple merges. 
+ * Large batches + * Use a compound file + */ + @Test + public void testJVectorKnnIndex_multiple_merges_large_batches_no_quantization() throws IOException { + int segmentSize = DEFAULT_MINIMUM_BATCH_SIZE_FOR_QUANTIZATION; + int totalNumberOfDocs = segmentSize * 4; + int k = 3; // The number of nearest neighbors to gather + + IndexWriterConfig indexWriterConfig = LuceneTestCase.newIndexWriterConfig(); + indexWriterConfig.setUseCompoundFile(true); + indexWriterConfig.setCodec(getCodec(Integer.MAX_VALUE)); // effectively without quantization + indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy(true)); + indexWriterConfig.setMergeScheduler(new SerialMergeScheduler()); + // We set the below parameters to make sure no permature flush will occur, this way we can have a single segment, and we can force + // test the quantization case + indexWriterConfig.setMaxBufferedDocs(10000); // force flush every 10000 docs, this way we make sure that we only have a single + // segment for a totalNumberOfDocs < 1000 + indexWriterConfig.setRAMPerThreadHardLimitMB(1000); // 1000MB per thread, this way we make sure that no premature flush will occur + + final Path indexPath = createTempDir(); + log.info("Index path: {}", indexPath); + try (FSDirectory dir = FSDirectory.open(indexPath); IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { + final float[] target = new float[] { 0.0f, 0.0f }; + for (int i = 1; i < totalNumberOfDocs + 1; i++) { + final float[] source = new float[] { 0.0f, 1.0f / i }; + final Document doc = new Document(); + doc.add(new KnnFloatVectorField("test_field", source, VectorSimilarityFunction.EUCLIDEAN)); + doc.add(new StringField("my_doc_id", Integer.toString(i, 10), Field.Store.YES)); + w.addDocument(doc); + if (i % segmentSize == 0) { + w.commit(); // this creates a new segment without triggering a merge + } + } + log.info("Done writing all files to the file system"); + + w.forceMerge(1); // this merges all segments into a single segment + log.info("Done merging all segments"); + try (IndexReader reader = DirectoryReader.open(w)) { + log.info("We should now have 1 segment with {} documents", totalNumberOfDocs); + Assert.assertEquals(1, reader.getContext().leaves().size()); + Assert.assertEquals(totalNumberOfDocs, reader.numDocs()); + final Query filterQuery = new MatchAllDocsQuery(); + final IndexSearcher searcher = newSearcher(reader); + KnnFloatVectorQuery knnFloatVectorQuery = getJVectorKnnFloatVectorQuery("test_field", target, k, filterQuery); + TopDocs topDocs = searcher.search(knnFloatVectorQuery, k); + assertEquals(k, topDocs.totalHits.value()); + + float expectedMinScoreInTopK = VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] { 0.0f, k }); + final float recall = calculateRecall(topDocs, expectedMinScoreInTopK); + Assert.assertEquals(1.0f, recall, 0.01f); + + log.info("successfully completed search tests"); + } + } + } + + /** + * Similar to testJVectorKnnIndex_multiple_merges_large_batches_no_quantization but with random vectors + * It's important to add more randomness to the vectors to make sure the graph is not linear + * @throws IOException if an I/O error occurs + */ + @Test + public void testJVectorKnnIndex_multiple_merges_large_batches_no_quantization_with_random_vectors() throws IOException { + int segmentSize = 200; + int totalNumberOfDocs = segmentSize * 4; + int k = 3; // The number of nearest neighbors to gather + final int dimension = 2; + final VectorSimilarityFunction vectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN; + 
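+        // Generate a random query vector and dataset up front, and compute the brute-force ground truth so recall can be verified after indexing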
final float[] target = TestUtils.generateRandomVectors(1, dimension)[0];
+        final float[][] source = TestUtils.generateRandomVectors(totalNumberOfDocs, dimension);
+        final Set<Integer> groundTruthVectorsIds = calculateGroundTruthVectorsIds(target, source, k, vectorSimilarityFunction);
+
+        IndexWriterConfig indexWriterConfig = LuceneTestCase.newIndexWriterConfig();
+        indexWriterConfig.setUseCompoundFile(true);
+        indexWriterConfig.setCodec(getCodec(Integer.MAX_VALUE)); // effectively without quantization
+        indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy(true));
+        indexWriterConfig.setMergeScheduler(new SerialMergeScheduler());
+        // We set the below parameters to make sure no premature flush will occur; this way we can have a single segment, and we can
+        // force-test the quantization case
+        indexWriterConfig.setMaxBufferedDocs(10000); // force flush every 10000 docs, this way we make sure that we only have a single
+                                                     // segment for a totalNumberOfDocs < 10000
+        indexWriterConfig.setRAMPerThreadHardLimitMB(1000); // 1000MB per thread, this way we make sure that no premature flush will occur
+
+        final Path indexPath = createTempDir();
+        log.info("Index path: {}", indexPath);
+        try (FSDirectory dir = FSDirectory.open(indexPath); IndexWriter w = new IndexWriter(dir, indexWriterConfig)) {
+            for (int i = 0; i < source.length; i++) {
+                final Document doc = new Document();
+                doc.add(new KnnFloatVectorField(TEST_FIELD, source[i], VectorSimilarityFunction.EUCLIDEAN));
+                doc.add(new IntField(TEST_ID_FIELD, i, Field.Store.YES));
+                w.addDocument(doc);
+                if (i % segmentSize == 0) {
+                    w.commit(); // this creates a new segment without triggering a merge
+                }
+            }
+            log.info("Done writing all files to the file system");
+
+            w.forceMerge(1); // this merges all segments into a single segment
+            log.info("Done merging all segments");
+            try (IndexReader reader = DirectoryReader.open(w)) {
+                log.info("We should now have a single segment with {} documents", totalNumberOfDocs);
+                Assert.assertEquals(1, reader.getContext().leaves().size());
+                Assert.assertEquals(totalNumberOfDocs, reader.numDocs());
+
+                final Query filterQuery = new MatchAllDocsQuery();
+                final IndexSearcher searcher = newSearcher(reader);
+                KnnFloatVectorQuery knnFloatVectorQuery = getJVectorKnnFloatVectorQuery(TEST_FIELD, target, k, filterQuery);
+                TopDocs topDocs = searcher.search(knnFloatVectorQuery, k);
+                assertEquals(k, topDocs.totalHits.value());
+                final float recall = calculateRecall(reader, groundTruthVectorsIds, topDocs, k);
+                Assert.assertEquals(1.0f, recall, 0.05f);
+                log.info("successfully completed search tests");
+            }
+        }
+    }
+
+    /**
+     * Tests the functionality and integrity of a Lucene k-NN index under multiple merge cycles and verifies
+     * the proper ordering of vectors and document identifiers.
+     *
+     * The method performs the following validation steps:
+     * 1. Indexes a predefined number of documents into a Lucene index, creating many small segments. Each document
+     *    includes a k-NN float vector field plus a stored field that records its original insertion order.
+     * 2. Executes several merge operations on the index (partial and full merges) to validate that the merging
+     *    process maintains correctness and consistency.
+     * 3. Validates the following invariants post-merge:
+     *    (a) Verifies that the index is merged into a single segment.
+     *    (b) Confirms the integrity of vector values by iterating through the merged segment and checking the
+     *        relationship between vector components and document identifiers.
+     *    (c) Performs k-NN searches with various cases:
+     *        - Single-threaded searches using random query vectors to ensure correct results.
+     *        - Multi-threaded concurrent searches to confirm robustness and verify the index operates correctly
+     *          under concurrent access without exhausting file handles or encountering other issues.
+     *
+     * Assertions are used throughout to ensure the state of the index matches the expected behavior, validate merge
+     * results, and confirm the accuracy of search operations. The test also logs the number of successful k-NN queries
+     * during the concurrent search phase.
+     *
+     * @throws IOException if an I/O error occurs during index operations.
+     * @throws InterruptedException if the concurrent search phase is interrupted.
+     */
+    @Test
+    public void testLuceneKnnIndex_multipleMerges_with_ordering_check() throws IOException, InterruptedException {
+        final int numDocs = 10000;
+        final String floatVectorField = "vec";
+        final String expectedDocIdField = "expectedDocId";
+        final Path indexPath = createTempDir();
+        final float[][] sourceVectors = TestUtils.generateRandomVectors(numDocs, 2);
+        final VectorSimilarityFunction vectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN;
+
+        try (Directory dir = newFSDirectory(indexPath)) {
+            IndexWriterConfig cfg = newIndexWriterConfig();
+            cfg.setCodec(getCodec());
+            cfg.setUseCompoundFile(false);
+            cfg.setMergePolicy(new ForceMergesOnlyMergePolicy(false));
+            cfg.setMergeScheduler(new SerialMergeScheduler());
+
+            try (IndexWriter w = new IndexWriter(dir, cfg)) {
+                /* ---------- 1. index documents, create many tiny segments ---------- */
+                for (int i = 0; i < numDocs; i++) {
+                    Document doc = new Document();
+                    // random vector for this doc; the stored field below records the expected (pre-merge) document order
+                    doc.add(new KnnFloatVectorField(floatVectorField, sourceVectors[i], vectorSimilarityFunction));
+                    doc.add(new StoredField(expectedDocIdField, i));
+                    w.addDocument(doc);
+                }
+                w.commit();
+
+                /* ---------- 2. run several merge cycles ---------- */
+                w.forceMerge(5); // partial merge
+                w.forceMerge(3); // another partial merge
+                w.forceMerge(1); // final full merge
+            }
+
+            /* ---------- 3.
open reader and assert the invariant ---------- */ + try (DirectoryReader reader = DirectoryReader.open(dir)) { + assertEquals("we merged down to exactly one segment", 1, reader.leaves().size()); + + // (a) iterate through vectors directly + for (LeafReaderContext context : reader.leaves()) { + FloatVectorValues vectorValues = context.reader().getFloatVectorValues("vec"); + final var docIdSetIterator = vectorValues.iterator(); // iterator for all the vectors with values + int docId = -1; + while ((docId = docIdSetIterator.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { + final int luceneDocId = context.docBase + docId; + final int globalDocId = reader.storedFields() + .document(luceneDocId) + .getField(expectedDocIdField) + .storedValue() + .getIntValue(); + float[] vectorValue = vectorValues.vectorValue(docIdSetIterator.index()); + float[] expectedVectorValue = sourceVectors[globalDocId]; + // if the vectors do not match, also look which source vector should be the right result + if (!Arrays.equals(expectedVectorValue, vectorValue)) { + for (int i = 0; i < sourceVectors.length; i++) { + if (Arrays.equals(sourceVectors[i], vectorValue)) { + log.error( + "found vector with global id: {}, in docId: {}, however the actual position of the vector in source is: {}", + globalDocId, + luceneDocId, + i + ); + } + } + } + Assert.assertArrayEquals( + "vector with global id " + + globalDocId + + " in source doesn't match vector value in lucene docID " + + luceneDocId + + " on the index", + expectedVectorValue, + vectorValue, + 0.0f + ); + } + } + + // (b) search with the same vector and confirm we are not exhausting the file handles with each search + IndexSearcher searcher = newSearcher(reader); + LeafReaderContext context = reader.leaves().get(0); // we only have one leaf at this point so we can use it to obtain the + // vector values + final int baseDocId = context.docBase; + final FloatVectorValues vectorValues = context.reader().getFloatVectorValues("vec"); + final int k = 1; + for (int i = 0; i < reader.maxDoc(); i++) { + float[] query = TestUtils.generateRandomVectors(1, 2)[0]; + TopDocs td = searcher.search(getJVectorKnnFloatVectorQuery("vec", query, k, new MatchAllDocsQuery()), k); + assertEquals(k, td.scoreDocs.length); + + compareSearchResults(td, sourceVectors, reader, expectedDocIdField, baseDocId, vectorValues); + } + + // (c) search with the same vector and this time add concurrency to make sure we are still not exhausting the file handles + int numThreads = 10; // Number of concurrent search threads + int queriesPerThread = 100; // Number of searches per thread + ExecutorService executor = Executors.newFixedThreadPool(numThreads); + CountDownLatch latch = new CountDownLatch(numThreads); + AtomicBoolean failureDetected = new AtomicBoolean(false); + AtomicInteger totalQueries = new AtomicInteger(0); + + try { + for (int t = 0; t < numThreads; t++) { + executor.submit(() -> { + int i = 0; + + try { + for (i = 0; i < queriesPerThread && !failureDetected.get(); i++) { + float[] query = TestUtils.generateRandomVectors(1, 2)[0]; + try { + TopDocs td = searcher.search(new KnnFloatVectorQuery("vec", query, k), k); + assertEquals("Search should return correct number of results", k, td.scoreDocs.length); + compareSearchResults(td, sourceVectors, reader, expectedDocIdField, baseDocId, vectorValues); + totalQueries.incrementAndGet(); + } catch (Throwable e) { + failureDetected.compareAndSet(false, true); + log.error("Exception encountered", e); + fail("Exception during concurrent search: " + 
e.getMessage()); + } + } + } finally { + latch.countDown(); + log.warn("Ran {} queries", i); + } + }); + } + + // Wait for all threads to complete or for a failure + boolean completed = latch.await(30, TimeUnit.SECONDS); + assertTrue("Test timed out while waiting for concurrent searches", completed); + assertFalse("Test encountered failures during concurrent searches", failureDetected.get()); + assertEquals("Incorrect number of queries executed", numThreads * queriesPerThread, totalQueries.get()); + + // Log the number of successful queries + log.info("Successfully completed {} concurrent kNN search queries!", totalQueries.get()); + + } finally { + executor.shutdownNow(); + } + } + } + + } + + private void compareSearchResults( + TopDocs topDocs, + float[][] sourceVectors, + DirectoryReader reader, + String expectedDocIdField, + int baseDocId, + FloatVectorValues vectorValues + ) throws IOException { + // Get the ords matching the lucene doc ids so that we can later find their values in the {@link vectorValues} + final Map docToOrdMap = new HashMap<>(); // docToOrd map + final var docIdSetIterator = vectorValues.iterator(); + while (docIdSetIterator.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { + docToOrdMap.put(docIdSetIterator.docID() + baseDocId, docIdSetIterator.index()); + } + + for (int resultIdx = 0; resultIdx < topDocs.scoreDocs.length; resultIdx++) { + final int localDocId = topDocs.scoreDocs[resultIdx].doc; + final int globalDocId = reader.storedFields().document(localDocId).getField(expectedDocIdField).storedValue().getIntValue(); + + // Access to float values is not thread safe + final float[] vectorValue; + synchronized (vectorValues) { + vectorValue = vectorValues.vectorValue(docToOrdMap.get(localDocId)); + } + float[] expectedVectorValue = sourceVectors[globalDocId]; + Assert.assertArrayEquals("vectors in source and index should match", expectedVectorValue, vectorValue, 0.0f); + } + } + + /** + * Test to verify that a document which has been deleted is no longer + * returned in a k-NN search. The index uses the JVector codec and is + * kept in multiple segments to ensure we also cover the case where the + * deleted document still physically resides in the segment as a dead + * (non-live) record. + */ + @Test + public void deletedDocs() throws IOException { + final int totalNumberOfDocs = 100; + final int batchSize = 10; + final int k = batchSize - 1; + final int docToDeleteInEachBatch = 5; + final Path indexPath = createTempDir(); + final IndexWriterConfig iwc = newIndexWriterConfig(); + // JVector codec requires compound files to be disabled at the moment + iwc.setUseCompoundFile(false); + iwc.setCodec(getCodec()); + iwc.setMergePolicy(new ForceMergesOnlyMergePolicy(false)); + + try (FSDirectory dir = FSDirectory.open(indexPath); IndexWriter writer = new IndexWriter(dir, iwc)) { + + /* + * 1. Index 100 docs, in batches of 10. Delete the 5th doc in each batch. + * will leave us with 10 segments, each with 9 live docs. 
+ */ + int batchNumber = 0; + for (int i = 1; i <= totalNumberOfDocs; i++) { + Document doc = new Document(); + final float[] vector = { 0.0f, 1.0f * (i + batchNumber) }; + doc.add(new StringField("docId", Integer.toString(i + 1), Field.Store.YES)); + doc.add(new KnnFloatVectorField("test_field", vector, VectorSimilarityFunction.EUCLIDEAN)); + writer.addDocument(doc); + if (i % batchSize == 0) { + writer.flush(); + writer.deleteDocuments(new TermQuery(new Term("docId", Integer.toString(i - docToDeleteInEachBatch)))); + batchNumber++; + } + } + writer.commit(); + + /* ---------------------------------------- + * 2. Merge all segments into one + * ---------------------------------------- */ + writer.forceMerge(1); + + /* ---------------------------------------- + * 3. Search – the deleted doc must be gone + * ---------------------------------------- */ + try (IndexReader reader = DirectoryReader.open(writer)) { + assertEquals( + "All documents except the deleted ones should be live", + totalNumberOfDocs - (totalNumberOfDocs / batchSize), + reader.numDocs() + ); + // For each batch we will verify that the deleted document doesn't come up in search and only it's neighbours are returned + + for (int i = 0; i < totalNumberOfDocs; i += batchSize) { + final float[] target = { 0.0f, 1.0f * (i + docToDeleteInEachBatch) }; + final IndexSearcher searcher = newSearcher(reader); + final KnnFloatVectorQuery knnFloatVectorQuery = getJVectorKnnFloatVectorQuery( + "test_field", + target, + k, + new MatchAllDocsQuery() + ); + TopDocs topDocs = searcher.search(knnFloatVectorQuery, k); + assertEquals(k, topDocs.totalHits.value()); + for (int j = 0; j < k; j++) { + Document doc = reader.storedFields().document(topDocs.scoreDocs[j].doc); + int docId = Integer.parseInt(doc.get("docId")); + assertNotEquals("Deleted doc should not be returned in search results", i + docToDeleteInEachBatch, docId); + } + } + } + } + } + + /** + * Test to verify that the Lucene codec is able to successfully search for the nearest neighbours + * in the index. + * Single field is used to store the vectors. + * Documents are stored in potentially multiple segments. + * Multiple commits. + * Multiple merges. + * Merge is enabled. + * compound file is enabled. 
+ */ + @Test + public void testLuceneKnnIndex_mergeEnabled_withCompoundFile() throws IOException { + int k = 3; // The number of nearest neighbors to gather + int totalNumberOfDocs = 10; + IndexWriterConfig indexWriterConfig = LuceneTestCase.newIndexWriterConfig(); + indexWriterConfig.setUseCompoundFile(true); + indexWriterConfig.setCodec(getCodec()); + indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy(true)); + indexWriterConfig.setMergeScheduler(new SerialMergeScheduler()); + final Path indexPath = createTempDir(); + log.info("Index path: {}", indexPath); + try (FSDirectory dir = FSDirectory.open(indexPath); IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { + final float[] target = new float[] { 0.0f, 0.0f }; + for (int i = 1; i < totalNumberOfDocs + 1; i++) { + final float[] source = new float[] { 0.0f, 1.0f / i }; + final Document doc = new Document(); + doc.add(new KnnFloatVectorField("test_field", source, VectorSimilarityFunction.EUCLIDEAN)); + w.addDocument(doc); + w.flush(); // this creates a new segment without triggering a merge + } + log.info("Done writing all files to the file system"); + + w.forceMerge(1); // this merges all segments into a single segment + log.info("Done merging all segments"); + try (IndexReader reader = DirectoryReader.open(w)) { + log.info("We should now have 1 segment with 10 documents"); + Assert.assertEquals(1, reader.getContext().leaves().size()); + Assert.assertEquals(totalNumberOfDocs, reader.numDocs()); + final Query filterQuery = new MatchAllDocsQuery(); + final IndexSearcher searcher = newSearcher(reader); + KnnFloatVectorQuery knnFloatVectorQuery = getJVectorKnnFloatVectorQuery("test_field", target, k, filterQuery); + TopDocs topDocs = searcher.search(knnFloatVectorQuery, k); + assertEquals(k, topDocs.totalHits.value()); + assertEquals(9, topDocs.scoreDocs[0].doc); + Assert.assertEquals( + VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] { 0.0f, 1.0f / 10.0f }), + topDocs.scoreDocs[0].score, + 0.01f + ); + assertEquals(8, topDocs.scoreDocs[1].doc); + Assert.assertEquals( + VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] { 0.0f, 1.0f / 9.0f }), + topDocs.scoreDocs[1].score, + 0.01f + ); + assertEquals(7, topDocs.scoreDocs[2].doc); + Assert.assertEquals( + VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] { 0.0f, 1.0f / 8.0f }), + topDocs.scoreDocs[2].score, + 0.01f + ); + log.info("successfully completed search tests"); + } + } + } + + /** + * Test to verify that the Lucene codec is able to successfully search for the nearest neighbours + * in the index. + * Single field is used to store the vectors. + * Documents are stored in potentially multiple segments. + * Multiple commits. + * Multiple merges. + * Merge is enabled. + * compound file is enabled. + * cosine similarity is used. 
+ */ + @Test + public void testLuceneKnnIndex_mergeEnabled_withCompoundFile_cosine() throws IOException { + int k = 3; // The number of nearest neighbours to gather + int totalNumberOfDocs = 10; + IndexWriterConfig indexWriterConfig = LuceneTestCase.newIndexWriterConfig(); + indexWriterConfig.setUseCompoundFile(true); + indexWriterConfig.setCodec(getCodec()); + indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy(true)); + indexWriterConfig.setMergeScheduler(new SerialMergeScheduler()); + final Path indexPath = createTempDir(); + log.info("Index path: {}", indexPath); + try (FSDirectory dir = FSDirectory.open(indexPath); IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { + final float[] target = new float[] { 1.0f, 1.0f }; + for (int i = 1; i < totalNumberOfDocs + 1; i++) { + final float[] source = new float[] { 1.0f + i, 2.0f * i }; + final Document doc = new Document(); + doc.add(new KnnFloatVectorField("test_field", source, VectorSimilarityFunction.COSINE)); + w.addDocument(doc); + w.flush(); // this creates a new segment without triggering a merge + } + log.info("Done writing all files to the file system"); + + w.forceMerge(1); // this merges all segments into a single segment + log.info("Done merging all segments"); + try (IndexReader reader = DirectoryReader.open(w)) { + log.info("We should now have 1 segment with 10 documents"); + Assert.assertEquals(1, reader.getContext().leaves().size()); + Assert.assertEquals(totalNumberOfDocs, reader.numDocs()); + final Query filterQuery = new MatchAllDocsQuery(); + final IndexSearcher searcher = newSearcher(reader); + KnnFloatVectorQuery knnFloatVectorQuery = getJVectorKnnFloatVectorQuery("test_field", target, k, filterQuery); + TopDocs topDocs = searcher.search(knnFloatVectorQuery, k); + assertEquals(k, topDocs.totalHits.value()); + assertEquals(0, topDocs.scoreDocs[0].doc); + Assert.assertEquals( + VectorSimilarityFunction.COSINE.compare(target, new float[] { 2.0f, 2.0f }), + topDocs.scoreDocs[0].score, + 0.001f + ); + assertEquals(1, topDocs.scoreDocs[1].doc); + Assert.assertEquals( + VectorSimilarityFunction.COSINE.compare(target, new float[] { 3.0f, 4.0f }), + topDocs.scoreDocs[1].score, + 0.001f + ); + assertEquals(2, topDocs.scoreDocs[2].doc); + Assert.assertEquals( + VectorSimilarityFunction.COSINE.compare(target, new float[] { 4.0f, 6.0f }), + topDocs.scoreDocs[2].score, + 0.001f + ); + log.info("successfully completed search tests"); + } + } + } + + /** + * Test to verify that the JVector codec is providing proper error if used with byte vector + * TODO: Create Binary Quantization support for JVector codec + */ + @Test + public void testJVectorKnnIndex_simpleCase_withBinaryVector() throws IOException { + int k = 3; // The number of nearest neighbours to gather + int totalNumberOfDocs = 10; + IndexWriterConfig indexWriterConfig = LuceneTestCase.newIndexWriterConfig(); + // TODO: re-enable this after fixing the compound file augmentation for JVector + indexWriterConfig.setUseCompoundFile(false); + indexWriterConfig.setCodec(getCodec()); + indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy()); + final Path indexPath = createTempDir(); + log.info("Index path: {}", indexPath); + try (Directory dir = newFSDirectory(indexPath); RandomIndexWriter w = new RandomIndexWriter(random(), dir, indexWriterConfig)) { + final byte[] source = new byte[] { (byte) 0, (byte) 0 }; + final Document doc = new Document(); + doc.add(new KnnByteVectorField("test_field", source, VectorSimilarityFunction.EUCLIDEAN)); + 
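+            // The jVector codec does not support byte vectors yet (see the TODO above), so adding this document is expected to throw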
Assert.assertThrows(UnsupportedOperationException.class, () -> w.addDocument(doc)); + } + } + + /** + * Test to verify that the JVector codec is able to successfully search for the nearest neighbours + * in the index with a filter applied. + */ + @Test + public void testJVectorKnnIndex_withFilter() throws IOException { + int k = 3; // The number of nearest neighbours to gather + int totalNumberOfDocs = 10; + IndexWriterConfig indexWriterConfig = LuceneTestCase.newIndexWriterConfig(); + indexWriterConfig.setUseCompoundFile(false); + indexWriterConfig.setCodec(getCodec()); + indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy()); + final Path indexPath = createTempDir(); + log.info("Index path: {}", indexPath); + try (Directory dir = newFSDirectory(indexPath); IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { + final float[] target = new float[] { 0.0f, 0.0f }; + for (int i = 1; i < totalNumberOfDocs + 1; i++) { + final float[] source = new float[] { 0.0f, 1.0f / i }; + final Document doc = new Document(); + doc.add(new KnnFloatVectorField("test_field", source, VectorSimilarityFunction.EUCLIDEAN)); + doc.add(new StringField("filter_field", i % 2 == 0 ? "even" : "odd", Field.Store.YES)); + w.addDocument(doc); + } + log.info("Flushing docs to make them discoverable on the file system"); + w.commit(); + + try (IndexReader reader = DirectoryReader.open(w)) { + log.info("Applying filter to the KNN search"); + final Query filterQuery = new TermQuery(new Term("filter_field", "even")); + final IndexSearcher searcher = newSearcher(reader); + KnnFloatVectorQuery knnFloatVectorQuery = getJVectorKnnFloatVectorQuery("test_field", target, k, filterQuery); + TopDocs topDocs = searcher.search(knnFloatVectorQuery, k); + + log.info("Validating filtered KNN results"); + assertEquals(k, topDocs.totalHits.value()); + assertEquals(9, topDocs.scoreDocs[0].doc); + Assert.assertEquals( + VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] { 0.0f, 1.0f / 10.0f }), + topDocs.scoreDocs[0].score, + 0.001f + ); + assertEquals(7, topDocs.scoreDocs[1].doc); + Assert.assertEquals( + VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] { 0.0f, 1.0f / 8.0f }), + topDocs.scoreDocs[1].score, + 0.001f + ); + assertEquals(5, topDocs.scoreDocs[2].doc); + Assert.assertEquals( + VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] { 0.0f, 1.0f / 6.0f }), + topDocs.scoreDocs[2].score, + 0.001f + ); + log.info("successfully completed filtered search tests"); + } + } + } + + /** + * Test the simple case of quantization where we have the perfect batch single batch size with no merges or too small batch sizes + */ + @Test + public void testJVectorKnnIndex_simpleCase_withQuantization() throws IOException { + int k = 50; // The number of nearest neighbours to gather + int dimension = 16; + int totalNumberOfDocs = DEFAULT_MINIMUM_BATCH_SIZE_FOR_QUANTIZATION; + final VectorSimilarityFunction vectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN; + + IndexWriterConfig indexWriterConfig = LuceneTestCase.newIndexWriterConfig(); + indexWriterConfig.setUseCompoundFile(false); + indexWriterConfig.setCodec(getCodec(DEFAULT_MINIMUM_BATCH_SIZE_FOR_QUANTIZATION)); + indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy()); + // We set the below parameters to make sure no permature flush will occur, this way we can have a single segment, and we can force + // test the quantization case + indexWriterConfig.setMaxBufferedDocs(10000); // force flush every 10000 docs, this way we make 
sure that we only have a single + // segment for a totalNumberOfDocs < 1000 + indexWriterConfig.setRAMPerThreadHardLimitMB(1000); // 1000MB per thread, this way we make sure that no premature flush will occur + final Path indexPath = createTempDir(); + log.info("Index path: {}", indexPath); + try (FSDirectory dir = FSDirectory.open(indexPath); IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { + final float[] target = generateZerosVectorWithLastValue(dimension, 0); + final float[][] vectors = TestUtils.generateRandomVectors(totalNumberOfDocs, dimension); + final Set groundTruthVectorsIds = calculateGroundTruthVectorsIds(target, vectors, k, vectorSimilarityFunction); + for (int i = 0; i < vectors.length; i++) { + final Document doc = new Document(); + doc.add(new KnnFloatVectorField(TEST_FIELD, vectors[i], vectorSimilarityFunction)); + doc.add(new IntField(TEST_ID_FIELD, i, Field.Store.YES)); + w.addDocument(doc); + } + log.info("Flushing docs to make them discoverable on the file system"); + w.commit(); + + try (IndexReader reader = DirectoryReader.open(w)) { + log.info("We should now have a single segment with {} documents", totalNumberOfDocs); + Assert.assertEquals(1, reader.getContext().leaves().size()); + Assert.assertEquals(totalNumberOfDocs, reader.numDocs()); + + final Query filterQuery = new MatchAllDocsQuery(); + final IndexSearcher searcher = newSearcher(reader); + KnnFloatVectorQuery knnFloatVectorQuery = getJVectorKnnFloatVectorQuery(TEST_FIELD, target, k, filterQuery); + TopDocs topDocs = searcher.search(knnFloatVectorQuery, k); + assertEquals(k, topDocs.totalHits.value()); + final float recall = calculateRecall(reader, groundTruthVectorsIds, topDocs, k); + Assert.assertEquals(1.0f, recall, 0.05f); + log.info("successfully completed search tests"); + } + } + } + + /** + * Test recall with different types of rerank parameters + */ + @Test + public void testJVectorKnnIndex_simpleCase_withQuantization_rerank() throws IOException { + int k = 1; // The number of nearest neighbours to gather + int dimension = 16; + int totalNumberOfDocs = DEFAULT_MINIMUM_BATCH_SIZE_FOR_QUANTIZATION; + IndexWriterConfig indexWriterConfig = LuceneTestCase.newIndexWriterConfig(); + indexWriterConfig.setUseCompoundFile(false); + indexWriterConfig.setCodec(getCodec(DEFAULT_MINIMUM_BATCH_SIZE_FOR_QUANTIZATION)); + indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy()); + // We set the below parameters to make sure no permature flush will occur, this way we can have a single segment, and we can force + // test the quantization case + indexWriterConfig.setMaxBufferedDocs(10000); // force flush every 10000 docs, this way we make sure that we only have a single + // segment for a totalNumberOfDocs < 1000 + indexWriterConfig.setRAMPerThreadHardLimitMB(1000); // 1000MB per thread, this way we make sure that no premature flush will occur + final Path indexPath = createTempDir(); + log.info("Index path: {}", indexPath); + try (FSDirectory dir = FSDirectory.open(indexPath); IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { + final float[] target = generateZerosVectorWithLastValue(dimension, 0); + for (int i = 1; i < totalNumberOfDocs + 1; i++) { + final float[] source = generateZerosVectorWithLastValue(dimension, i); + final Document doc = new Document(); + doc.add(new KnnFloatVectorField("test_field", source, VectorSimilarityFunction.EUCLIDEAN)); + w.addDocument(doc); + } + log.info("Flushing docs to make them discoverable on the file system"); + w.commit(); + + try (IndexReader 
reader = DirectoryReader.open(w)) { + log.info("We should now have a single segment with {} documents", totalNumberOfDocs); + Assert.assertEquals(1, reader.getContext().leaves().size()); + Assert.assertEquals(totalNumberOfDocs, reader.numDocs()); + + final Query filterQuery = new MatchAllDocsQuery(); + final IndexSearcher searcher = newSearcher(reader); + float expectedMinScoreInTopK = VectorSimilarityFunction.EUCLIDEAN.compare( + target, + new float[] { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, k } + ); + + // Query with essentially no reranking and expect recall to be very low + KnnFloatVectorQuery knnFloatVectorQuery = getJVectorKnnFloatVectorQuery("test_field", target, k, filterQuery, 1); + TopDocs topDocs = searcher.search(knnFloatVectorQuery, k); + assertEquals(k, topDocs.totalHits.value()); + + final float recallWithLowOverqueryFactor = calculateRecall(topDocs, expectedMinScoreInTopK); + + // Query with reranking and expect recall to be high + knnFloatVectorQuery = getJVectorKnnFloatVectorQuery("test_field", target, k, filterQuery, 5); + topDocs = searcher.search(knnFloatVectorQuery, k); + assertEquals(k, topDocs.totalHits.value()); + float recallWithHighOverqueryFactor = calculateRecall(topDocs, expectedMinScoreInTopK); + Assert.assertTrue(recallWithLowOverqueryFactor <= recallWithHighOverqueryFactor); + + log.info("successfully completed search tests"); + } + } + } + + /** + * Test the simple case of quantization where we have the perfect batch single batch size each time with a merge of + * multiple segments + */ + @Test + public void testJVectorKnnIndex_happyCase_withQuantization_multipleSegments() throws IOException { + final int dimension = 16; + final VectorSimilarityFunction vectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN; + final int k = 50; // The number of nearest neighbours to gather, we set a high number here to avoid an inaccurate result and + // jittery tests + final int perfectBatchSize = DEFAULT_MINIMUM_BATCH_SIZE_FOR_QUANTIZATION; // MINIMUM_BATCH_SIZE_FOR_QUANTIZATION is the minimal + // batch size that will trigger a quantization without + // breaking it, generally speaking the batch size can't be + // lower than the number of clusters + final int totalNumberOfDocs = perfectBatchSize * 2; + + IndexWriterConfig indexWriterConfig = LuceneTestCase.newIndexWriterConfig(); + indexWriterConfig.setUseCompoundFile(false); + indexWriterConfig.setCodec(getCodec(DEFAULT_MINIMUM_BATCH_SIZE_FOR_QUANTIZATION)); + indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy()); + // We set the below parameters to make sure no permature flush will occur, this way we can have a single segment, and we can force + // test the quantization case + indexWriterConfig.setMaxBufferedDocs(10000); // force flush every 10000 docs, this way we make sure that we only have a single + // segment for a totalNumberOfDocs < 1000 + indexWriterConfig.setRAMPerThreadHardLimitMB(1000); // 1000MB per thread, this way we make sure that no premature flush will occur + final Path indexPath = createTempDir(); + log.info("Index path: {}", indexPath); + try (FSDirectory dir = FSDirectory.open(indexPath); IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { + final float[] target = generateZerosVectorWithLastValue(dimension, 0); + final float[][] vectors = TestUtils.generateRandomVectors(totalNumberOfDocs, dimension); + final Set groundTruthVectorsIds = calculateGroundTruthVectorsIds(target, vectors, k, vectorSimilarityFunction); + + for 
(int i = 0; i < vectors.length; i++) { + final Document doc = new Document(); + doc.add(new KnnFloatVectorField(TEST_FIELD, vectors[i], vectorSimilarityFunction)); + doc.add(new IntField(TEST_ID_FIELD, i, Field.Store.YES)); + w.addDocument(doc); + if (i % perfectBatchSize == 0) { + w.commit(); + } + } + log.info("Flushing docs to make them discoverable on the file system"); + w.forceMerge(1); + + try (IndexReader reader = DirectoryReader.open(w)) { + log.info("We should now have a single segment with {} documents", totalNumberOfDocs); + Assert.assertEquals(1, reader.getContext().leaves().size()); + Assert.assertEquals(totalNumberOfDocs, reader.numDocs()); + + final Query filterQuery = new MatchAllDocsQuery(); + final IndexSearcher searcher = newSearcher(reader); + KnnFloatVectorQuery knnFloatVectorQuery = getJVectorKnnFloatVectorQuery("test_field", target, k, filterQuery); + TopDocs topDocs = searcher.search(knnFloatVectorQuery, k); + assertEquals(k, topDocs.totalHits.value()); + final float recall = calculateRecall(reader, groundTruthVectorsIds, topDocs, k); + Assert.assertEquals(1.0f, recall, 0.05f); + log.info("successfully completed search tests"); + } + } + } + + /** + * Test the non-ideal case where batch sizes are not perfect and are lower than the number of recommended clusters in the index + * The expected behavior is for the quantization to only kick in when we have a merge or batch size that is bigger than the minimal required batch size + */ + @Test + public void testJVectorKnnIndex_mixedBatchSizes_withQuantization_multipleMerges() throws IOException { + final int k = 50; // The number of nearest neighbours to gather, we set a high number here to avoid an inaccurate result and + // jittery tests + final int dimension = 16; + final int notIdealBatchSize = DEFAULT_MINIMUM_BATCH_SIZE_FOR_QUANTIZATION / 3; // Batch size that is not ideal for quantization and + // shouldn't trigger it + final int totalNumberOfDocs = notIdealBatchSize * 3; // 3 batches of documents each will result in quantization only when the merge + // is triggered, and we have a batch size of {@link + // MINIMUM_BATCH_SIZE_FOR_QUANTIZATION} as a result of merging all the smaller + // batches + final VectorSimilarityFunction vectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN; + IndexWriterConfig indexWriterConfig = LuceneTestCase.newIndexWriterConfig(); + indexWriterConfig.setUseCompoundFile(false); + indexWriterConfig.setCodec(getCodec(DEFAULT_MINIMUM_BATCH_SIZE_FOR_QUANTIZATION)); + indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy()); + // We set the below parameters to make sure no permature flush will occur, this way we can have a single segment, and we can force + // test the quantization case + indexWriterConfig.setMaxBufferedDocs(10000); // force flush every 10000 docs, this way we make sure that we only have a single + // segment for a totalNumberOfDocs < 1000 + indexWriterConfig.setRAMPerThreadHardLimitMB(1000); // 1000MB per thread, this way we make sure that no premature flush will occur + final Path indexPath = createTempDir(); + log.info("Index path: {}", indexPath); + try (FSDirectory dir = FSDirectory.open(indexPath); IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { + final float[] target = generateZerosVectorWithLastValue(dimension, 0); + final float[][] vectors = TestUtils.generateRandomVectors(totalNumberOfDocs, dimension); + final Set groundTruthVectorsIds = calculateGroundTruthVectorsIds(target, vectors, k, vectorSimilarityFunction); + for (int i = 0; i < 
totalNumberOfDocs; i++) {
+                final float[] source = vectors[i];
+                final Document doc = new Document();
+                doc.add(new KnnFloatVectorField("test_field", source, vectorSimilarityFunction));
+                doc.add(new IntField(TEST_ID_FIELD, i, Field.Store.YES));
+                w.addDocument(doc);
+                if (i % notIdealBatchSize == 0) {
+                    w.commit();
+                }
+            }
+            log.info("Flushing docs to make them discoverable on the file system");
+            w.forceMerge(1);
+
+            try (IndexReader reader = DirectoryReader.open(w)) {
+                log.info("We should now have a single segment with {} documents", totalNumberOfDocs);
+                Assert.assertEquals(1, reader.getContext().leaves().size());
+                Assert.assertEquals(totalNumberOfDocs, reader.numDocs());
+
+                final Query filterQuery = new MatchAllDocsQuery();
+                final IndexSearcher searcher = newSearcher(reader);
+                KnnFloatVectorQuery knnFloatVectorQuery = getJVectorKnnFloatVectorQuery("test_field", target, k, filterQuery);
+                TopDocs topDocs = searcher.search(knnFloatVectorQuery, k);
+                assertEquals(k, topDocs.totalHits.value());
+                final float recall = calculateRecall(reader, groundTruthVectorsIds, topDocs, k);
+                Assert.assertEquals(1.0f, recall, 0.05f);
+                log.info("successfully completed search tests");
+            }
+        }
+    }
+
+    /**
+     * Test the non-ideal case where batch sizes are not perfect and are lower than the number of recommended clusters in the index.
+     * The expected behavior is for the quantization to only kick in when we have a merge or a batch size that is bigger than the minimal required batch size.
+     * This variant also adds the compound file to the mix.
+     */
+    @Test
+    public void testJVectorKnnIndex_mixedBatchSizes_withQuantization_multipleMerges_withCompoundFile() throws IOException {
+        final int k = 50; // The number of nearest neighbours to gather, we set a high number here to avoid an inaccurate result and
+                          // jittery tests
+        final int dimension = 16;
+        final int notIdealBatchSize = DEFAULT_MINIMUM_BATCH_SIZE_FOR_QUANTIZATION / 3; // Batch size that is not ideal for quantization and
+                                                                                       // shouldn't trigger it
+        final int totalNumberOfDocs = notIdealBatchSize * 10; // 10 batches of documents; quantization will only kick in when the merge
+                                                              // is triggered and the combined size reaches at least
+                                                              // {@link MINIMUM_BATCH_SIZE_FOR_QUANTIZATION} as a result of merging all the smaller batches
+        final VectorSimilarityFunction vectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN;
+
+        boolean useCompoundFile = true;
+        IndexWriterConfig indexWriterConfig = LuceneTestCase.newIndexWriterConfig();
+        indexWriterConfig.setUseCompoundFile(useCompoundFile);
+        indexWriterConfig.setCodec(getCodec(DEFAULT_MINIMUM_BATCH_SIZE_FOR_QUANTIZATION));
+        indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy(useCompoundFile));
+        // We set the below parameters to make sure no premature flush will occur, this way we can have a single segment, and we can force
+        // test the quantization case
+        indexWriterConfig.setMaxBufferedDocs(10000); // force flush every 10000 docs, this way we make sure that we only have a single
+                                                     // segment for a totalNumberOfDocs < 10000
+        indexWriterConfig.setRAMPerThreadHardLimitMB(1000); // 1000MB per thread, this way we make sure that no premature flush will occur
+        final Path indexPath = createTempDir();
+        log.info("Index path: {}", indexPath);
+        try (FSDirectory dir = FSDirectory.open(indexPath); IndexWriter w = new IndexWriter(dir, indexWriterConfig)) {
+            final float[] target = generateZerosVectorWithLastValue(dimension, 0);
+            // We use random vectors because otherwise PQ would have correlated subspaces, which would result in a broken linear graph
+            final float[][] vectors = TestUtils.generateRandomVectors(totalNumberOfDocs, dimension);
+            final Set<Integer> groundTruthVectorsIds = calculateGroundTruthVectorsIds(target, vectors, k, vectorSimilarityFunction);
+            for (int i = 0; i < totalNumberOfDocs; i++) {
+                final float[] source = vectors[i];
+                final Document doc = new Document();
+                doc.add(new KnnFloatVectorField(TEST_FIELD, source, vectorSimilarityFunction));
+                doc.add(new IntField(TEST_ID_FIELD, i, Field.Store.YES));
+                w.addDocument(doc);
+                if (i % notIdealBatchSize == 0) {
+                    w.commit();
+                }
+            }
+            w.commit();
+            log.info("Flushing docs to make them discoverable on the file system");
+            w.forceMerge(1);
+
+            try (IndexReader reader = DirectoryReader.open(w)) {
+                log.info("We should now have a single segment with {} documents", totalNumberOfDocs);
+                Assert.assertEquals(1, reader.getContext().leaves().size());
+                Assert.assertEquals(totalNumberOfDocs, reader.numDocs());
+
+                final Query filterQuery = new MatchAllDocsQuery();
+                final IndexSearcher searcher = newSearcher(reader);
+                KnnFloatVectorQuery knnFloatVectorQuery = getJVectorKnnFloatVectorQuery("test_field", target, k, filterQuery, 1000);
+                TopDocs topDocs = searcher.search(knnFloatVectorQuery, k);
+                assertEquals(k, topDocs.totalHits.value());
+                final float recall = calculateRecall(reader, groundTruthVectorsIds, topDocs, k);
+                Assert.assertEquals("Expected to have recall of 1.0+/-0.05 but got " + recall, 1.0f, recall, 0.05f);
+                log.info("successfully completed search tests");
+            }
+        }
+
+        Assert.assertTrue("No quantization time recorded", KNNCounter.KNN_QUANTIZATION_TRAINING_TIME.getCount() > 0);
+        Assert.assertTrue("No graph merge time recorded", KNNCounter.KNN_GRAPH_MERGE_TIME.getCount() > 0);
+    }
+
+    /**
+     * We will use multiple batches, each large enough to trigger quantization on its own, and later merge them in appending order
+     * to keep track of refinement.
+     * @throws IOException if an I/O error occurs
+     */
+    @Test
+    public void testJVectorKnnIndex_withQuantization_withCompoundFile_with_refinement() throws IOException {
+        final int k = 50; // The number of nearest neighbours to gather, we set a high number here to avoid an inaccurate result and
+                          // jittery tests
+        final int dimension = 16;
+        final int idealBatchSize = DEFAULT_MINIMUM_BATCH_SIZE_FOR_QUANTIZATION; // Batch size that is large enough to trigger
+                                                                                // quantization on its own
+        final int totalNumberOfDocs = idealBatchSize * 10; // 10 batches, each batch on its own will trigger quantization
+        final VectorSimilarityFunction vectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN;
+
+        boolean useCompoundFile = true;
+        IndexWriterConfig indexWriterConfig = LuceneTestCase.newIndexWriterConfig();
+        indexWriterConfig.setUseCompoundFile(useCompoundFile);
+        indexWriterConfig.setCodec(getCodec(DEFAULT_MINIMUM_BATCH_SIZE_FOR_QUANTIZATION));
+        indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy(useCompoundFile));
+        // We set the below parameters to make sure no premature flush will occur, this way we can have a single segment, and we can force
+        // test the quantization case
+        indexWriterConfig.setMaxBufferedDocs(10000); // force flush every 10000 docs, this way we make sure that we only have a single
+                                                     // segment for a totalNumberOfDocs < 10000
+        indexWriterConfig.setRAMPerThreadHardLimitMB(1000); // 1000MB per thread, this way we make sure that no premature flush will occur
+        final Path indexPath = createTempDir();
+        log.info("Index path: {}", indexPath);
+        try (FSDirectory dir =
FSDirectory.open(indexPath); IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { + final float[] target = generateZerosVectorWithLastValue(dimension, 0); + // We will use random vectors because otherwise PQ will have a correlated subspaces which will result in a broken linear graph + final float[][] vectors = TestUtils.generateRandomVectors(totalNumberOfDocs, dimension); + final Set groundTruthVectorsIds = calculateGroundTruthVectorsIds(target, vectors, k, vectorSimilarityFunction); + for (int i = 0; i < totalNumberOfDocs; i++) { + final float[] source = vectors[i]; + final Document doc = new Document(); + doc.add(new IntField(TEST_ID_FIELD, i, Field.Store.YES)); + doc.add(new KnnFloatVectorField(TEST_FIELD, source, vectorSimilarityFunction)); + w.addDocument(doc); + if (i % idealBatchSize == 0) { + final long beforeTrainingTime = KNNCounter.KNN_QUANTIZATION_TRAINING_TIME.getCount(); + w.commit(); + w.forceMerge(1); // force merge will trigger PQ refinement if other segments are present + final long afterTrainingTime = KNNCounter.KNN_QUANTIZATION_TRAINING_TIME.getCount(); + Assert.assertTrue( + "Expected to have a training time of at least " + beforeTrainingTime + " but got " + afterTrainingTime, + afterTrainingTime >= beforeTrainingTime + ); + } + } + w.commit(); + log.info("Flushing docs to make them discoverable on the file system"); + w.forceMerge(1); + + try (IndexReader reader = DirectoryReader.open(w)) { + log.info("We should now have a single segment with {} documents", totalNumberOfDocs); + Assert.assertEquals(1, reader.getContext().leaves().size()); + Assert.assertEquals(totalNumberOfDocs, reader.numDocs()); + + final Query filterQuery = new MatchAllDocsQuery(); + final IndexSearcher searcher = newSearcher(reader); + KnnFloatVectorQuery knnFloatVectorQuery = getJVectorKnnFloatVectorQuery("test_field", target, k, filterQuery, 1000); + TopDocs topDocs = searcher.search(knnFloatVectorQuery, k); + assertEquals(k, topDocs.totalHits.value()); + final float recall = calculateRecall(reader, groundTruthVectorsIds, topDocs, k); + Assert.assertEquals("Expected to have recall of 1.0+/-0.05 but got " + recall, 1.0f, recall, 0.05f); + log.info("successfully completed search tests"); + } + } + + Assert.assertTrue("No graph merge time recorded", KNNCounter.KNN_GRAPH_MERGE_TIME.getCount() > 0); + } + + /** + * Calculate the recall for the top k documents + * For simplicity we assume that all documents have unique scores and therefore the minimum score in the top k documents is the kth document + * @param topDocs the top documents returned by the search + * @param minScoreInTopK the minimum score in the top k documents + * @return the recall of the top k documents + */ + private float calculateRecall(TopDocs topDocs, float minScoreInTopK) { + int totalRelevantDocs = 0; + for (int i = 0; i < topDocs.scoreDocs.length; i++) { + if (topDocs.scoreDocs[i].score >= minScoreInTopK) { + totalRelevantDocs++; + } + } + float recall = ((float) totalRelevantDocs) / ((float) topDocs.scoreDocs.length); + + if (recall == 0.0f) { + log.info( + "Recall is 0.0, this is probably not correct, here is some debug information\n topDocs: {}, minScoreInTopK: {}, totalRelevantDocs: {}", + topDocsToString(topDocs), + minScoreInTopK, + totalRelevantDocs + ); + } + return recall; + } + + // convert topDocs to a pretty printed string + private String topDocsToString(TopDocs topDocs) { + StringBuilder sb = new StringBuilder(); + sb.append("TopDocs: ["); + for (int i = 0; i < topDocs.scoreDocs.length; i++) { + 
sb.append(topDocs.scoreDocs[i].doc).append(" (").append(topDocs.scoreDocs[i].score).append("), "); + } + sb.append("]"); + return sb.toString(); + } + + private JVectorKnnFloatVectorQuery getJVectorKnnFloatVectorQuery(String fieldName, float[] target, int k, Query filterQuery) { + return getJVectorKnnFloatVectorQuery(fieldName, target, k, filterQuery, KNNConstants.DEFAULT_OVER_QUERY_FACTOR); + } + + private JVectorKnnFloatVectorQuery getJVectorKnnFloatVectorQuery( + String fieldName, + float[] target, + int k, + Query filterQuery, + int overQueryFactor + ) { + return new JVectorKnnFloatVectorQuery( + fieldName, + target, + k, + filterQuery, + overQueryFactor, + KNNConstants.DEFAULT_QUERY_SIMILARITY_THRESHOLD.floatValue(), + KNNConstants.DEFAULT_QUERY_RERANK_FLOOR.floatValue(), + KNNConstants.DEFAULT_QUERY_USE_PRUNING + ); + } + + private static float[][] getMonotonicallyIncreasingVectors(int numVectors, int vectorDimension) { + float[][] vectors = new float[numVectors][vectorDimension]; + for (int i = 0; i < numVectors; i++) { + vectors[i] = generateZerosVectorWithLastValue(vectorDimension, i); + } + + return vectors; + } + + private static float[] generateZerosVectorWithLastValue(int vectorDimension, int lastValue) { + float[] vector = new float[vectorDimension]; + for (int i = 0; i < vectorDimension - 1; i++) { + vector[i] = 0; + } + vector[vectorDimension - 1] = lastValue; + return vector; + } + + private static float calculateRecall(IndexReader reader, Set groundTruthVectorsIds, TopDocs topDocs, int k) + throws IOException { + final ScoreDoc[] scoreDocs = topDocs.scoreDocs; + Assert.assertEquals(groundTruthVectorsIds.size(), scoreDocs.length); + int totalRelevantDocs = 0; + for (ScoreDoc scoreDoc : scoreDocs) { + final int id = reader.storedFields().document(scoreDoc.doc).getField(TEST_ID_FIELD).storedValue().getIntValue(); + if (groundTruthVectorsIds.contains(id)) { + totalRelevantDocs++; + } + } + return ((float) totalRelevantDocs) / ((float) k); + } + + /** + * Find the IDs of the ground truth vectors in the dataset + * @param query query vector + * @param dataset dataset of all the vectors with their ordinal position in the array as their ID + * @param k the number of expected results + * @return the IDs of the ground truth vectors in the dataset + */ + private static Set calculateGroundTruthVectorsIds( + float[] query, + final float[][] dataset, + int k, + VectorSimilarityFunction vectorSimilarityFunction + ) { + final Set groundTruthVectorsIds = new HashSet<>(); + final PriorityQueue priorityQueue = new PriorityQueue<>(k, (o1, o2) -> Float.compare(o1.score, o2.score)); + for (int i = 0; i < dataset.length; i++) { + ScoreDoc scoreDoc = new ScoreDoc(i, vectorSimilarityFunction.compare(query, dataset[i])); + if (priorityQueue.size() >= k) { + final ScoreDoc top = priorityQueue.poll(); + if (top.score < scoreDoc.score) { + priorityQueue.add(scoreDoc); + } else { + priorityQueue.add(top); + } + } else { + priorityQueue.add(scoreDoc); + } + } + while (!priorityQueue.isEmpty()) { + groundTruthVectorsIds.add(priorityQueue.poll().doc); + } + + return groundTruthVectorsIds; + } +} From c65c4679bb42f8b33018ee03025b03d7ee54c151 Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Wed, 29 Oct 2025 15:20:24 +0000 Subject: [PATCH 03/86] Fix license headers --- .../jvector/ForceMergesOnlyMergePolicy.java | 16 ++++++++++++++-- .../codecs/jvector/GraphNodeIdToDocMap.java | 16 ++++++++++++++-- .../codecs/jvector/JVectorFloatVectorValues.java | 16 ++++++++++++++-- 
.../sandbox/codecs/jvector/JVectorFormat.java | 16 ++++++++++++++-- .../codecs/jvector/JVectorIndexWriter.java | 16 ++++++++++++++-- .../codecs/jvector/JVectorKnnCollector.java | 16 ++++++++++++++-- .../jvector/JVectorKnnFloatVectorQuery.java | 16 ++++++++++++++-- .../jvector/JVectorRandomAccessReader.java | 16 ++++++++++++++-- .../sandbox/codecs/jvector/JVectorReader.java | 16 ++++++++++++++-- .../codecs/jvector/JVectorVectorScorer.java | 16 ++++++++++++++-- .../sandbox/codecs/jvector/JVectorWriter.java | 16 ++++++++++++++-- .../sandbox/codecs/jvector/KNNJVectorTests.java | 16 ++++++++++++++-- 12 files changed, 168 insertions(+), 24 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/ForceMergesOnlyMergePolicy.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/ForceMergesOnlyMergePolicy.java index 8357a5fcdb46..71e11ce22d22 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/ForceMergesOnlyMergePolicy.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/ForceMergesOnlyMergePolicy.java @@ -1,6 +1,18 @@ /* - * Copyright OpenSearch Contributors - * SPDX-License-Identifier: Apache-2.0 + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ package org.opensearch.knn.index.codec.jvector; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/GraphNodeIdToDocMap.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/GraphNodeIdToDocMap.java index 7fff91e12062..28c19df90b21 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/GraphNodeIdToDocMap.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/GraphNodeIdToDocMap.java @@ -1,6 +1,18 @@ /* - * Copyright OpenSearch Contributors - * SPDX-License-Identifier: Apache-2.0 + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
*/ package org.opensearch.knn.index.codec.jvector; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFloatVectorValues.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFloatVectorValues.java index ce3008a79c29..7d80fb0f6918 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFloatVectorValues.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFloatVectorValues.java @@ -1,6 +1,18 @@ /* - * Copyright OpenSearch Contributors - * SPDX-License-Identifier: Apache-2.0 + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ package org.opensearch.knn.index.codec.jvector; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFormat.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFormat.java index 5d25622d3df6..29cefe6598e2 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFormat.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFormat.java @@ -1,6 +1,18 @@ /* - * Copyright OpenSearch Contributors - * SPDX-License-Identifier: Apache-2.0 + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ package org.opensearch.knn.index.codec.jvector; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorIndexWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorIndexWriter.java index b01b4c8db1bb..c4cc2f715bec 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorIndexWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorIndexWriter.java @@ -1,6 +1,18 @@ /* - * Copyright OpenSearch Contributors - * SPDX-License-Identifier: Apache-2.0 + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ package org.opensearch.knn.index.codec.jvector; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorKnnCollector.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorKnnCollector.java index 573726f5f19a..32b35af7c012 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorKnnCollector.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorKnnCollector.java @@ -1,6 +1,18 @@ /* - * Copyright OpenSearch Contributors - * SPDX-License-Identifier: Apache-2.0 + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ package org.opensearch.knn.index.codec.jvector; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorKnnFloatVectorQuery.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorKnnFloatVectorQuery.java index 922a7dcd55b1..1ee729db1543 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorKnnFloatVectorQuery.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorKnnFloatVectorQuery.java @@ -1,6 +1,18 @@ /* - * Copyright OpenSearch Contributors - * SPDX-License-Identifier: Apache-2.0 + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
*/ package org.opensearch.knn.index.codec.jvector; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorRandomAccessReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorRandomAccessReader.java index c3b823010c6d..0599ff2121cb 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorRandomAccessReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorRandomAccessReader.java @@ -1,6 +1,18 @@ /* - * Copyright OpenSearch Contributors - * SPDX-License-Identifier: Apache-2.0 + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ package org.opensearch.knn.index.codec.jvector; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java index 3c8aa4622000..8e36c1c3dda3 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java @@ -1,6 +1,18 @@ /* - * Copyright OpenSearch Contributors - * SPDX-License-Identifier: Apache-2.0 + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ package org.opensearch.knn.index.codec.jvector; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorVectorScorer.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorVectorScorer.java index e27b168b6362..3e0b042dbe2a 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorVectorScorer.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorVectorScorer.java @@ -1,6 +1,18 @@ /* - * Copyright OpenSearch Contributors - * SPDX-License-Identifier: Apache-2.0 + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ package org.opensearch.knn.index.codec.jvector; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java index 434e08a6964e..764d4a21a15f 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java @@ -1,6 +1,18 @@ /* - * Copyright OpenSearch Contributors - * SPDX-License-Identifier: Apache-2.0 + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ package org.opensearch.knn.index.codec.jvector; diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/jvector/KNNJVectorTests.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/jvector/KNNJVectorTests.java index 899663214405..b562e52fd4a1 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/jvector/KNNJVectorTests.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/jvector/KNNJVectorTests.java @@ -1,6 +1,18 @@ /* - * Copyright OpenSearch Contributors - * SPDX-License-Identifier: Apache-2.0 + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
*/ package org.opensearch.knn.index.codec.jvector; From cf2aa85226ba6bd3d00cf742dc6a2bc54522561c Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Wed, 29 Oct 2025 15:21:25 +0000 Subject: [PATCH 04/86] Run tidy --- .../jvector/ForceMergesOnlyMergePolicy.java | 150 +- .../codecs/jvector/GraphNodeIdToDocMap.java | 255 +- .../jvector/JVectorFloatVectorValues.java | 203 +- .../sandbox/codecs/jvector/JVectorFormat.java | 352 +- .../codecs/jvector/JVectorIndexWriter.java | 176 +- .../codecs/jvector/JVectorKnnCollector.java | 87 +- .../jvector/JVectorKnnFloatVectorQuery.java | 122 +- .../jvector/JVectorRandomAccessReader.java | 281 +- .../sandbox/codecs/jvector/JVectorReader.java | 652 ++-- .../codecs/jvector/JVectorVectorScorer.java | 43 +- .../sandbox/codecs/jvector/JVectorWriter.java | 1996 ++++++----- .../codecs/jvector/KNNJVectorTests.java | 3067 +++++++++-------- 12 files changed, 3810 insertions(+), 3574 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/ForceMergesOnlyMergePolicy.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/ForceMergesOnlyMergePolicy.java index 71e11ce22d22..d43e7e4ac80f 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/ForceMergesOnlyMergePolicy.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/ForceMergesOnlyMergePolicy.java @@ -17,95 +17,99 @@ package org.opensearch.knn.index.codec.jvector; +import java.io.IOException; +import java.util.List; +import java.util.Map; import org.apache.lucene.index.MergePolicy; import org.apache.lucene.index.MergeTrigger; import org.apache.lucene.index.SegmentCommitInfo; import org.apache.lucene.index.SegmentInfos; -import java.io.IOException; -import java.util.List; -import java.util.Map; - /** - * A merge policy that only merges segments if they are forced. - * This is useful for testing and benchmarking purposes. Since it can be used for benchmarks, it is placed in the common - * codec module. + * A merge policy that only merges segments if they are forced. This is useful for testing and + * benchmarking purposes. Since it can be used for benchmarks, it is placed in the common codec + * module. */ public class ForceMergesOnlyMergePolicy extends MergePolicy { - private final boolean useCompoundFile; + private final boolean useCompoundFile; - public ForceMergesOnlyMergePolicy() { - this(false); - } - - public ForceMergesOnlyMergePolicy(boolean useCompoundFile) { - super(); - this.useCompoundFile = useCompoundFile; - } + public ForceMergesOnlyMergePolicy() { + this(false); + } - @Override - public MergeSpecification findMerges(MergeTrigger mergeTrigger, SegmentInfos segmentInfos, MergeContext mergeContext) - throws IOException { - return null; - } + public ForceMergesOnlyMergePolicy(boolean useCompoundFile) { + super(); + this.useCompoundFile = useCompoundFile; + } - @Override - public MergeSpecification findForcedMerges( - SegmentInfos segmentInfos, - int maxSegmentCount, - Map segmentsToMerge, - MergeContext mergeContext - ) throws IOException { - // If the segments are already merged (e.g. 
there's only 1 segment), or - // there are segments = segmentInfos.asList(); - MergeSpecification spec = new MergeSpecification(); + @Override + public MergeSpecification findMerges( + MergeTrigger mergeTrigger, SegmentInfos segmentInfos, MergeContext mergeContext) + throws IOException { + return null; + } - final OneMerge merge = new OneMerge(segments); - spec.add(merge); - return spec; + @Override + public MergeSpecification findForcedMerges( + SegmentInfos segmentInfos, + int maxSegmentCount, + Map segmentsToMerge, + MergeContext mergeContext) + throws IOException { + // If the segments are already merged (e.g. there's only 1 segment), or + // there are segments = segmentInfos.asList(); + MergeSpecification spec = new MergeSpecification(); - @Override - public boolean useCompoundFile(SegmentInfos segmentInfos, SegmentCommitInfo newSegment, MergeContext mergeContext) throws IOException { - return useCompoundFile; - } + final OneMerge merge = new OneMerge(segments); + spec.add(merge); + return spec; + } - @Override - public MergeSpecification findForcedDeletesMerges(SegmentInfos segmentInfos, MergeContext mergeContext) throws IOException { - return null; - } + @Override + public boolean useCompoundFile( + SegmentInfos segmentInfos, SegmentCommitInfo newSegment, MergeContext mergeContext) + throws IOException { + return useCompoundFile; + } - /** - * Returns true if the number of segments eligible for merging is less than or equal to the - * specified {@code maxNumSegments}. - */ - protected boolean isMerged( - SegmentInfos infos, - int maxNumSegments, - Map segmentsToMerge, - MergeContext mergeContext - ) throws IOException { - final int numSegments = infos.size(); - int numToMerge = 0; - SegmentCommitInfo mergeInfo = null; - boolean segmentIsOriginal = false; - for (int i = 0; i < numSegments && numToMerge <= maxNumSegments; i++) { - final SegmentCommitInfo info = infos.info(i); - final Boolean isOriginal = segmentsToMerge.get(info); - if (isOriginal != null) { - segmentIsOriginal = isOriginal; - numToMerge++; - mergeInfo = info; - } - } + @Override + public MergeSpecification findForcedDeletesMerges( + SegmentInfos segmentInfos, MergeContext mergeContext) throws IOException { + return null; + } - return numToMerge <= maxNumSegments && (numToMerge != 1 || !segmentIsOriginal || isMerged(infos, mergeInfo, mergeContext)); + /** + * Returns true if the number of segments eligible for merging is less than or equal to the + * specified {@code maxNumSegments}. 
+ */ + protected boolean isMerged( + SegmentInfos infos, + int maxNumSegments, + Map segmentsToMerge, + MergeContext mergeContext) + throws IOException { + final int numSegments = infos.size(); + int numToMerge = 0; + SegmentCommitInfo mergeInfo = null; + boolean segmentIsOriginal = false; + for (int i = 0; i < numSegments && numToMerge <= maxNumSegments; i++) { + final SegmentCommitInfo info = infos.info(i); + final Boolean isOriginal = segmentsToMerge.get(info); + if (isOriginal != null) { + segmentIsOriginal = isOriginal; + numToMerge++; + mergeInfo = info; + } } + + return numToMerge <= maxNumSegments + && (numToMerge != 1 || !segmentIsOriginal || isMerged(infos, mergeInfo, mergeContext)); + } } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/GraphNodeIdToDocMap.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/GraphNodeIdToDocMap.java index 28c19df90b21..ce6050088d68 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/GraphNodeIdToDocMap.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/GraphNodeIdToDocMap.java @@ -17,146 +17,157 @@ package org.opensearch.knn.index.codec.jvector; +import java.io.IOException; +import java.util.Arrays; import lombok.extern.log4j.Log4j2; import org.apache.lucene.index.Sorter; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; -import java.io.IOException; -import java.util.Arrays; - /** - * This class represents the mapping from the Lucene document IDs to the jVector ordinals. - * This mapping is necessary because the jVector ordinals can be different from the Lucene document IDs and when lucene documentIDs change after a merge, - * we need to update this mapping to reflect the new document IDs. - * This requires us to know the previous mapping from the previous merge and the new mapping from the current merge. - *
- * Which means that we also need to persist this mapping to disk to be available across merges. + * This class represents the mapping from the Lucene document IDs to the jVector ordinals. This + * mapping is necessary because the jVector ordinals can be different from the Lucene document IDs + * and when lucene documentIDs change after a merge, we need to update this mapping to reflect the + * new document IDs. This requires us to know the previous mapping from the previous merge and the + * new mapping from the current merge. + * + *
Which means that we also need to persist this mapping to disk to be available across merges. */ @Log4j2 public class GraphNodeIdToDocMap { - private static final int VERSION = 1; - private int[] graphNodeIdsToDocIds; - private int[] docIdsToGraphNodeIds; - - /** - * Constructor that reads the mapping from the index input - * - * @param in The index input - * @throws IOException if an I/O error occurs - */ - public GraphNodeIdToDocMap(IndexInput in) throws IOException { - final int version = in.readInt(); // Read the version - if (version != VERSION) { - throw new IOException("Unsupported version: " + version); - } - int size = in.readVInt(); - int maxDocId = in.readVInt(); + private static final int VERSION = 1; + private int[] graphNodeIdsToDocIds; + private int[] docIdsToGraphNodeIds; - graphNodeIdsToDocIds = new int[size]; - docIdsToGraphNodeIds = new int[maxDocId]; - for (int ord = 0; ord < size; ord++) { - final int docId = in.readVInt(); - graphNodeIdsToDocIds[ord] = docId; - docIdsToGraphNodeIds[docId] = ord; - } + /** + * Constructor that reads the mapping from the index input + * + * @param in The index input + * @throws IOException if an I/O error occurs + */ + public GraphNodeIdToDocMap(IndexInput in) throws IOException { + final int version = in.readInt(); // Read the version + if (version != VERSION) { + throw new IOException("Unsupported version: " + version); } + int size = in.readVInt(); + int maxDocId = in.readVInt(); - /** - * Constructor that creates a new mapping between ordinals and docIds - * - * @param graphNodeIdsToDocIds The mapping from ordinals to docIds - */ - public GraphNodeIdToDocMap(int[] graphNodeIdsToDocIds) { - if (graphNodeIdsToDocIds.length == 0) { - this.graphNodeIdsToDocIds = new int[0]; - this.docIdsToGraphNodeIds = new int[0]; - return; - } - this.graphNodeIdsToDocIds = new int[graphNodeIdsToDocIds.length]; - System.arraycopy(graphNodeIdsToDocIds, 0, this.graphNodeIdsToDocIds, 0, graphNodeIdsToDocIds.length); - final int maxDocId = Arrays.stream(graphNodeIdsToDocIds).max().getAsInt(); - final int maxDocs = maxDocId + 1; - // We are going to assume that the number of ordinals is roughly the same as the number of documents in the segment, therefore, - // the mapping will not be sparse. - if (maxDocs < graphNodeIdsToDocIds.length) { - throw new IllegalStateException("Max docs " + maxDocs + " is less than the number of ordinals " + graphNodeIdsToDocIds.length); - } - if (maxDocId > graphNodeIdsToDocIds.length) { - log.warn( - "Max doc id {} is greater than the number of ordinals {}, this implies a lot of deleted documents. Or that some documents are missing vectors. Wasting a lot of memory", - maxDocId, - graphNodeIdsToDocIds.length - ); - } - this.docIdsToGraphNodeIds = new int[maxDocs]; - Arrays.fill(this.docIdsToGraphNodeIds, -1); // -1 means no mapping to ordinal - for (int ord = 0; ord < graphNodeIdsToDocIds.length; ord++) { - this.docIdsToGraphNodeIds[graphNodeIdsToDocIds[ord]] = ord; - } + graphNodeIdsToDocIds = new int[size]; + docIdsToGraphNodeIds = new int[maxDocId]; + for (int ord = 0; ord < size; ord++) { + final int docId = in.readVInt(); + graphNodeIdsToDocIds[ord] = docId; + docIdsToGraphNodeIds[docId] = ord; } + } - /** - * Updates the mapping from the Lucene document IDs to the jVector ordinals based on the sort operation. 
(during flush) - * - * @param sortMap The sort map - */ - public void update(Sorter.DocMap sortMap) { - final int[] newGraphNodeIdsToDocIds = new int[graphNodeIdsToDocIds.length]; - final int maxNewDocId = Arrays.stream(graphNodeIdsToDocIds).map(sortMap::oldToNew).max().getAsInt(); - final int maxDocs = maxNewDocId + 1; - if (maxDocs < graphNodeIdsToDocIds.length) { - throw new IllegalStateException("Max docs " + maxDocs + " is less than the number of ordinals " + graphNodeIdsToDocIds.length); - } - final int[] newDocIdsToOrdinals = new int[maxDocs]; - Arrays.fill(newDocIdsToOrdinals, -1); - for (int oldDocId = 0; oldDocId < docIdsToGraphNodeIds.length; oldDocId++) { - if (docIdsToGraphNodeIds[oldDocId] == -1) { - continue; - } - final int newDocId = sortMap.oldToNew(oldDocId); - final int oldOrd = docIdsToGraphNodeIds[oldDocId]; - newDocIdsToOrdinals[newDocId] = oldOrd; - newGraphNodeIdsToDocIds[oldOrd] = newDocId; - } - this.docIdsToGraphNodeIds = newDocIdsToOrdinals; - this.graphNodeIdsToDocIds = newGraphNodeIdsToDocIds; + /** + * Constructor that creates a new mapping between ordinals and docIds + * + * @param graphNodeIdsToDocIds The mapping from ordinals to docIds + */ + public GraphNodeIdToDocMap(int[] graphNodeIdsToDocIds) { + if (graphNodeIdsToDocIds.length == 0) { + this.graphNodeIdsToDocIds = new int[0]; + this.docIdsToGraphNodeIds = new int[0]; + return; } - - /** - * Returns the jVector node id for the given Lucene document ID - * - * @param luceneDocId The Lucene document ID - * @return The jVector ordinal - */ - public int getJVectorNodeId(int luceneDocId) { - return docIdsToGraphNodeIds[luceneDocId]; + this.graphNodeIdsToDocIds = new int[graphNodeIdsToDocIds.length]; + System.arraycopy( + graphNodeIdsToDocIds, 0, this.graphNodeIdsToDocIds, 0, graphNodeIdsToDocIds.length); + final int maxDocId = Arrays.stream(graphNodeIdsToDocIds).max().getAsInt(); + final int maxDocs = maxDocId + 1; + // We are going to assume that the number of ordinals is roughly the same as the number of + // documents in the segment, therefore, + // the mapping will not be sparse. + if (maxDocs < graphNodeIdsToDocIds.length) { + throw new IllegalStateException( + "Max docs " + + maxDocs + + " is less than the number of ordinals " + + graphNodeIdsToDocIds.length); + } + if (maxDocId > graphNodeIdsToDocIds.length) { + log.warn( + "Max doc id {} is greater than the number of ordinals {}, this implies a lot of deleted documents. Or that some documents are missing vectors. Wasting a lot of memory", + maxDocId, + graphNodeIdsToDocIds.length); + } + this.docIdsToGraphNodeIds = new int[maxDocs]; + Arrays.fill(this.docIdsToGraphNodeIds, -1); // -1 means no mapping to ordinal + for (int ord = 0; ord < graphNodeIdsToDocIds.length; ord++) { + this.docIdsToGraphNodeIds[graphNodeIdsToDocIds[ord]] = ord; } + } - /** - * Returns the Lucene document ID for the given jVector node id - * - * @param graphNodeId The jVector ordinal - * @return The Lucene document ID - *
- * NOTE: This method is useful when, for example, we want to remap acceptedDocs bitmap from Lucene to jVector ordinal bitmap filter - */ - public int getLuceneDocId(int graphNodeId) { - return graphNodeIdsToDocIds[graphNodeId]; + /** + * Updates the mapping from the Lucene document IDs to the jVector ordinals based on the sort + * operation. (during flush) + * + * @param sortMap The sort map + */ + public void update(Sorter.DocMap sortMap) { + final int[] newGraphNodeIdsToDocIds = new int[graphNodeIdsToDocIds.length]; + final int maxNewDocId = + Arrays.stream(graphNodeIdsToDocIds).map(sortMap::oldToNew).max().getAsInt(); + final int maxDocs = maxNewDocId + 1; + if (maxDocs < graphNodeIdsToDocIds.length) { + throw new IllegalStateException( + "Max docs " + + maxDocs + + " is less than the number of ordinals " + + graphNodeIdsToDocIds.length); } + final int[] newDocIdsToOrdinals = new int[maxDocs]; + Arrays.fill(newDocIdsToOrdinals, -1); + for (int oldDocId = 0; oldDocId < docIdsToGraphNodeIds.length; oldDocId++) { + if (docIdsToGraphNodeIds[oldDocId] == -1) { + continue; + } + final int newDocId = sortMap.oldToNew(oldDocId); + final int oldOrd = docIdsToGraphNodeIds[oldDocId]; + newDocIdsToOrdinals[newDocId] = oldOrd; + newGraphNodeIdsToDocIds[oldOrd] = newDocId; + } + this.docIdsToGraphNodeIds = newDocIdsToOrdinals; + this.graphNodeIdsToDocIds = newGraphNodeIdsToDocIds; + } + + /** + * Returns the jVector node id for the given Lucene document ID + * + * @param luceneDocId The Lucene document ID + * @return The jVector ordinal + */ + public int getJVectorNodeId(int luceneDocId) { + return docIdsToGraphNodeIds[luceneDocId]; + } + + /** + * Returns the Lucene document ID for the given jVector node id + * + * @param graphNodeId The jVector ordinal + * @return The Lucene document ID + *
NOTE: This method is useful when, for example, we want to remap acceptedDocs bitmap from + * Lucene to jVector ordinal bitmap filter + */ + public int getLuceneDocId(int graphNodeId) { + return graphNodeIdsToDocIds[graphNodeId]; + } - /** - * Writes the mapping to the index output - * - * @param out The index output - * @throws IOException if an I/O error occurs - */ - public void toOutput(IndexOutput out) throws IOException { - out.writeInt(VERSION); - out.writeVInt(graphNodeIdsToDocIds.length); - out.writeVInt(docIdsToGraphNodeIds.length); - for (int ord = 0; ord < graphNodeIdsToDocIds.length; ord++) { - out.writeVInt(graphNodeIdsToDocIds[ord]); - } + /** + * Writes the mapping to the index output + * + * @param out The index output + * @throws IOException if an I/O error occurs + */ + public void toOutput(IndexOutput out) throws IOException { + out.writeInt(VERSION); + out.writeVInt(graphNodeIdsToDocIds.length); + out.writeVInt(docIdsToGraphNodeIds.length); + for (int ord = 0; ord < graphNodeIdsToDocIds.length; ord++) { + out.writeVInt(graphNodeIdsToDocIds[ord]); } + } } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFloatVectorValues.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFloatVectorValues.java index 7d80fb0f6918..5dba75410ac0 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFloatVectorValues.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFloatVectorValues.java @@ -23,111 +23,112 @@ import io.github.jbellis.jvector.vector.VectorizationProvider; import io.github.jbellis.jvector.vector.types.VectorFloat; import io.github.jbellis.jvector.vector.types.VectorTypeSupport; +import java.io.IOException; import org.apache.lucene.index.FloatVectorValues; import org.apache.lucene.search.VectorScorer; -import java.io.IOException; - public class JVectorFloatVectorValues extends FloatVectorValues { - private static final VectorTypeSupport VECTOR_TYPE_SUPPORT = VectorizationProvider.getInstance().getVectorTypeSupport(); - - private final OnDiskGraphIndex.View view; - private final VectorSimilarityFunction similarityFunction; - private final int dimension; - private final int size; - private final GraphNodeIdToDocMap graphNodeIdToDocMap; - - public JVectorFloatVectorValues( - OnDiskGraphIndex onDiskGraphIndex, - VectorSimilarityFunction similarityFunction, - GraphNodeIdToDocMap graphNodeIdToDocMap - ) throws IOException { - this.view = onDiskGraphIndex.getView(); - this.dimension = view.dimension(); - this.size = view.size(); - this.similarityFunction = similarityFunction; - this.graphNodeIdToDocMap = graphNodeIdToDocMap; - } - - @Override - public int dimension() { - return dimension; - } - - @Override - public int size() { - return size; - } - - // This allows us to access the vector without copying it to float[] - public VectorFloat vectorFloatValue(int ord) { - return view.getVector(ord); - } - - public DocIndexIterator iterator() { - return new DocIndexIterator() { - private int docId = -1; - private final Bits liveNodes = view.liveNodes(); - - @Override - public long cost() { - return size(); - } - - @Override - public int index() { - return graphNodeIdToDocMap.getJVectorNodeId(docId); - } - - @Override - public int docID() { - return docId; - } - - @Override - public int nextDoc() throws IOException { - // Advance to the next node docId starts from -1 which is why we need to increment docId by 1 "size" - // times - while (docId < size - 1) { - 
docId++; - if (liveNodes.get(docId)) { - return docId; - } - } - docId = NO_MORE_DOCS; - - return docId; - } - - @Override - public int advance(int target) throws IOException { - return slowAdvance(target); - } - }; - } - - @Override - public float[] vectorValue(int i) throws IOException { - try { - final VectorFloat vector = vectorFloatValue(i); - return (float[]) vector.get(); - } catch (Throwable e) { - throw new RuntimeException(e); + private static final VectorTypeSupport VECTOR_TYPE_SUPPORT = + VectorizationProvider.getInstance().getVectorTypeSupport(); + + private final OnDiskGraphIndex.View view; + private final VectorSimilarityFunction similarityFunction; + private final int dimension; + private final int size; + private final GraphNodeIdToDocMap graphNodeIdToDocMap; + + public JVectorFloatVectorValues( + OnDiskGraphIndex onDiskGraphIndex, + VectorSimilarityFunction similarityFunction, + GraphNodeIdToDocMap graphNodeIdToDocMap) + throws IOException { + this.view = onDiskGraphIndex.getView(); + this.dimension = view.dimension(); + this.size = view.size(); + this.similarityFunction = similarityFunction; + this.graphNodeIdToDocMap = graphNodeIdToDocMap; + } + + @Override + public int dimension() { + return dimension; + } + + @Override + public int size() { + return size; + } + + // This allows us to access the vector without copying it to float[] + public VectorFloat vectorFloatValue(int ord) { + return view.getVector(ord); + } + + public DocIndexIterator iterator() { + return new DocIndexIterator() { + private int docId = -1; + private final Bits liveNodes = view.liveNodes(); + + @Override + public long cost() { + return size(); + } + + @Override + public int index() { + return graphNodeIdToDocMap.getJVectorNodeId(docId); + } + + @Override + public int docID() { + return docId; + } + + @Override + public int nextDoc() throws IOException { + // Advance to the next node docId starts from -1 which is why we need to increment docId by + // 1 "size" + // times + while (docId < size - 1) { + docId++; + if (liveNodes.get(docId)) { + return docId; + } } + docId = NO_MORE_DOCS; + + return docId; + } + + @Override + public int advance(int target) throws IOException { + return slowAdvance(target); + } + }; + } + + @Override + public float[] vectorValue(int i) throws IOException { + try { + final VectorFloat vector = vectorFloatValue(i); + return (float[]) vector.get(); + } catch (Throwable e) { + throw new RuntimeException(e); } - - public VectorFloat vectorValueObject(int i) throws IOException { - return vectorFloatValue(i); - } - - @Override - public FloatVectorValues copy() throws IOException { - return this; - } - - @Override - public VectorScorer scorer(float[] query) throws IOException { - return new JVectorVectorScorer(this, VECTOR_TYPE_SUPPORT.createFloatVector(query), similarityFunction); - } - + } + + public VectorFloat vectorValueObject(int i) throws IOException { + return vectorFloatValue(i); + } + + @Override + public FloatVectorValues copy() throws IOException { + return this; + } + + @Override + public VectorScorer scorer(float[] query) throws IOException { + return new JVectorVectorScorer( + this, VECTOR_TYPE_SUPPORT.createFloatVector(query), similarityFunction); + } } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFormat.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFormat.java index 29cefe6598e2..a7f3a13ee865 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFormat.java 
+++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFormat.java @@ -17,6 +17,10 @@ package org.opensearch.knn.index.codec.jvector; +import java.io.IOException; +import java.util.concurrent.ForkJoinPool; +import java.util.concurrent.ForkJoinWorkerThread; +import java.util.function.Function; import lombok.extern.log4j.Log4j2; import org.apache.lucene.codecs.KnnVectorsFormat; import org.apache.lucene.codecs.KnnVectorsReader; @@ -25,184 +29,186 @@ import org.apache.lucene.index.SegmentWriteState; import org.opensearch.knn.common.KNNConstants; -import java.io.IOException; -import java.util.concurrent.ForkJoinPool; -import java.util.concurrent.ForkJoinWorkerThread; -import java.util.function.Function; - @Log4j2 public class JVectorFormat extends KnnVectorsFormat { - public static final String NAME = "JVectorFormat"; - public static final String META_CODEC_NAME = "JVectorVectorsFormatMeta"; - public static final String VECTOR_INDEX_CODEC_NAME = "JVectorVectorsFormatIndex"; - public static final String NEIGHBORS_SCORE_CACHE_CODEC_NAME = "JVectorVectorsFormatNeighborsScoreCache"; - public static final String JVECTOR_FILES_SUFFIX = "jvector"; - public static final String META_EXTENSION = "meta-" + JVECTOR_FILES_SUFFIX; - public static final String VECTOR_INDEX_EXTENSION = "data-" + JVECTOR_FILES_SUFFIX; - public static final String NEIGHBORS_SCORE_CACHE_EXTENSION = "neighbors-score-cache-" + JVECTOR_FILES_SUFFIX; - - public static final int VERSION_START = 0; - public static final int VERSION_CURRENT = VERSION_START; - public static final int DEFAULT_MAX_CONN = 32; - public static final int DEFAULT_BEAM_WIDTH = 100; - // Unfortunately, this can't be managed yet by the OpenSearch ThreadPool because it's not supporting {@link ForkJoinPool} types - public static final ForkJoinPool SIMD_POOL_MERGE = getPhysicalCoreExecutor(); - public static final ForkJoinPool SIMD_POOL_FLUSH = getPhysicalCoreExecutor(); - - private final int maxConn; - private final int beamWidth; - private final Function numberOfSubspacesPerVectorSupplier; // as a function of the original dimension - private final int minBatchSizeForQuantization; - private final float alpha; - private final float neighborOverflow; - private final boolean hierarchyEnabled; - - public JVectorFormat() { - this( - NAME, - DEFAULT_MAX_CONN, - DEFAULT_BEAM_WIDTH, - KNNConstants.DEFAULT_NEIGHBOR_OVERFLOW_VALUE.floatValue(), - KNNConstants.DEFAULT_ALPHA_VALUE.floatValue(), - JVectorFormat::getDefaultNumberOfSubspacesPerVector, - KNNConstants.DEFAULT_MINIMUM_BATCH_SIZE_FOR_QUANTIZATION, - KNNConstants.DEFAULT_HIERARCHY_ENABLED - ); - } - - public JVectorFormat(int minBatchSizeForQuantization) { - this( - NAME, - DEFAULT_MAX_CONN, - DEFAULT_BEAM_WIDTH, - KNNConstants.DEFAULT_NEIGHBOR_OVERFLOW_VALUE.floatValue(), - KNNConstants.DEFAULT_ALPHA_VALUE.floatValue(), - JVectorFormat::getDefaultNumberOfSubspacesPerVector, - minBatchSizeForQuantization, - KNNConstants.DEFAULT_HIERARCHY_ENABLED - ); + public static final String NAME = "JVectorFormat"; + public static final String META_CODEC_NAME = "JVectorVectorsFormatMeta"; + public static final String VECTOR_INDEX_CODEC_NAME = "JVectorVectorsFormatIndex"; + public static final String NEIGHBORS_SCORE_CACHE_CODEC_NAME = + "JVectorVectorsFormatNeighborsScoreCache"; + public static final String JVECTOR_FILES_SUFFIX = "jvector"; + public static final String META_EXTENSION = "meta-" + JVECTOR_FILES_SUFFIX; + public static final String VECTOR_INDEX_EXTENSION = "data-" + JVECTOR_FILES_SUFFIX; + public 
static final String NEIGHBORS_SCORE_CACHE_EXTENSION = + "neighbors-score-cache-" + JVECTOR_FILES_SUFFIX; + + public static final int VERSION_START = 0; + public static final int VERSION_CURRENT = VERSION_START; + public static final int DEFAULT_MAX_CONN = 32; + public static final int DEFAULT_BEAM_WIDTH = 100; + // Unfortunately, this can't be managed yet by the OpenSearch ThreadPool because it's not + // supporting {@link ForkJoinPool} types + public static final ForkJoinPool SIMD_POOL_MERGE = getPhysicalCoreExecutor(); + public static final ForkJoinPool SIMD_POOL_FLUSH = getPhysicalCoreExecutor(); + + private final int maxConn; + private final int beamWidth; + private final Function + numberOfSubspacesPerVectorSupplier; // as a function of the original dimension + private final int minBatchSizeForQuantization; + private final float alpha; + private final float neighborOverflow; + private final boolean hierarchyEnabled; + + public JVectorFormat() { + this( + NAME, + DEFAULT_MAX_CONN, + DEFAULT_BEAM_WIDTH, + KNNConstants.DEFAULT_NEIGHBOR_OVERFLOW_VALUE.floatValue(), + KNNConstants.DEFAULT_ALPHA_VALUE.floatValue(), + JVectorFormat::getDefaultNumberOfSubspacesPerVector, + KNNConstants.DEFAULT_MINIMUM_BATCH_SIZE_FOR_QUANTIZATION, + KNNConstants.DEFAULT_HIERARCHY_ENABLED); + } + + public JVectorFormat(int minBatchSizeForQuantization) { + this( + NAME, + DEFAULT_MAX_CONN, + DEFAULT_BEAM_WIDTH, + KNNConstants.DEFAULT_NEIGHBOR_OVERFLOW_VALUE.floatValue(), + KNNConstants.DEFAULT_ALPHA_VALUE.floatValue(), + JVectorFormat::getDefaultNumberOfSubspacesPerVector, + minBatchSizeForQuantization, + KNNConstants.DEFAULT_HIERARCHY_ENABLED); + } + + public JVectorFormat( + int maxConn, + int beamWidth, + float neighborOverflow, + float alpha, + Function numberOfSubspacesPerVectorSupplier, + int minBatchSizeForQuantization, + boolean hierarchyEnabled) { + this( + NAME, + maxConn, + beamWidth, + neighborOverflow, + alpha, + numberOfSubspacesPerVectorSupplier, + minBatchSizeForQuantization, + hierarchyEnabled); + } + + public JVectorFormat( + String name, + int maxConn, + int beamWidth, + float neighborOverflow, + float alpha, + Function numberOfSubspacesPerVectorSupplier, + int minBatchSizeForQuantization, + boolean hierarchyEnabled) { + super(name); + this.maxConn = maxConn; + this.beamWidth = beamWidth; + this.numberOfSubspacesPerVectorSupplier = numberOfSubspacesPerVectorSupplier; + this.minBatchSizeForQuantization = minBatchSizeForQuantization; + this.alpha = alpha; + this.neighborOverflow = neighborOverflow; + this.hierarchyEnabled = hierarchyEnabled; + } + + @Override + public KnnVectorsWriter fieldsWriter(SegmentWriteState state) throws IOException { + return new JVectorWriter( + state, + maxConn, + beamWidth, + neighborOverflow, + alpha, + numberOfSubspacesPerVectorSupplier, + minBatchSizeForQuantization, + hierarchyEnabled); + } + + @Override + public KnnVectorsReader fieldsReader(SegmentReadState state) throws IOException { + return new JVectorReader(state); + } + + @Override + public int getMaxDimensions(String s) { + // Not a hard limit, but a reasonable default + return 8192; + } + + /** + * This method returns the default number of subspaces per vector for a given original dimension. + * Should be used as a default value for the number of subspaces per vector in case no value is + * provided. 
+ * + * @param originalDimension original vector dimension + * @return default number of subspaces per vector + */ + public static int getDefaultNumberOfSubspacesPerVector(int originalDimension) { + // the idea here is that higher dimensions compress well, but not so well that we should use + // fewer bits + // than a lower-dimension vector, which is what you could get with cutoff points to switch + // between (e.g.) + // D*0.5 and D*0.25. Thus, the following ensures that bytes per vector is strictly increasing + // with D. + int compressedBytes; + if (originalDimension <= 32) { + // We are compressing from 4-byte floats to single-byte codebook indexes, + // so this represents compression of 4x + // * GloVe-25 needs 25 BPV to achieve good recall + compressedBytes = originalDimension; + } else if (originalDimension <= 64) { + // * GloVe-50 performs fine at 25 + compressedBytes = 32; + } else if (originalDimension <= 200) { + // * GloVe-100 and -200 perform well at 50 and 100 BPV, respectively + compressedBytes = (int) (originalDimension * 0.5); + } else if (originalDimension <= 400) { + // * NYTimes-256 actually performs fine at 64 BPV but we'll be conservative + // since we don't want BPV to decrease + compressedBytes = 100; + } else if (originalDimension <= 768) { + // allow BPV to increase linearly up to 192 + compressedBytes = (int) (originalDimension * 0.25); + } else if (originalDimension <= 1536) { + // * ada002 vectors have good recall even at 192 BPV = compression of 32x + compressedBytes = 192; + } else { + // We have not tested recall with larger vectors than this, let's let it increase linearly + compressedBytes = (int) (originalDimension * 0.125); } + return compressedBytes; + } - public JVectorFormat( - int maxConn, - int beamWidth, - float neighborOverflow, - float alpha, - Function numberOfSubspacesPerVectorSupplier, - int minBatchSizeForQuantization, - boolean hierarchyEnabled - ) { - this( - NAME, - maxConn, - beamWidth, - neighborOverflow, - alpha, - numberOfSubspacesPerVectorSupplier, - minBatchSizeForQuantization, - hierarchyEnabled - ); - } - - public JVectorFormat( - String name, - int maxConn, - int beamWidth, - float neighborOverflow, - float alpha, - Function numberOfSubspacesPerVectorSupplier, - int minBatchSizeForQuantization, - boolean hierarchyEnabled - ) { - super(name); - this.maxConn = maxConn; - this.beamWidth = beamWidth; - this.numberOfSubspacesPerVectorSupplier = numberOfSubspacesPerVectorSupplier; - this.minBatchSizeForQuantization = minBatchSizeForQuantization; - this.alpha = alpha; - this.neighborOverflow = neighborOverflow; - this.hierarchyEnabled = hierarchyEnabled; - } - - @Override - public KnnVectorsWriter fieldsWriter(SegmentWriteState state) throws IOException { - return new JVectorWriter( - state, - maxConn, - beamWidth, - neighborOverflow, - alpha, - numberOfSubspacesPerVectorSupplier, - minBatchSizeForQuantization, - hierarchyEnabled - ); - } - - @Override - public KnnVectorsReader fieldsReader(SegmentReadState state) throws IOException { - return new JVectorReader(state); - } - - @Override - public int getMaxDimensions(String s) { - // Not a hard limit, but a reasonable default - return 8192; - } - - /** - * This method returns the default number of subspaces per vector for a given original dimension. - * Should be used as a default value for the number of subspaces per vector in case no value is provided. 
- * - * @param originalDimension original vector dimension - * @return default number of subspaces per vector - */ - public static int getDefaultNumberOfSubspacesPerVector(int originalDimension) { - // the idea here is that higher dimensions compress well, but not so well that we should use fewer bits - // than a lower-dimension vector, which is what you could get with cutoff points to switch between (e.g.) - // D*0.5 and D*0.25. Thus, the following ensures that bytes per vector is strictly increasing with D. - int compressedBytes; - if (originalDimension <= 32) { - // We are compressing from 4-byte floats to single-byte codebook indexes, - // so this represents compression of 4x - // * GloVe-25 needs 25 BPV to achieve good recall - compressedBytes = originalDimension; - } else if (originalDimension <= 64) { - // * GloVe-50 performs fine at 25 - compressedBytes = 32; - } else if (originalDimension <= 200) { - // * GloVe-100 and -200 perform well at 50 and 100 BPV, respectively - compressedBytes = (int) (originalDimension * 0.5); - } else if (originalDimension <= 400) { - // * NYTimes-256 actually performs fine at 64 BPV but we'll be conservative - // since we don't want BPV to decrease - compressedBytes = 100; - } else if (originalDimension <= 768) { - // allow BPV to increase linearly up to 192 - compressedBytes = (int) (originalDimension * 0.25); - } else if (originalDimension <= 1536) { - // * ada002 vectors have good recall even at 192 BPV = compression of 32x - compressedBytes = 192; - } else { - // We have not tested recall with larger vectors than this, let's let it increase linearly - compressedBytes = (int) (originalDimension * 0.125); - } - return compressedBytes; - } - - public static ForkJoinPool getPhysicalCoreExecutor() { - final int estimatedPhysicalCoreCount = Integer.getInteger( + public static ForkJoinPool getPhysicalCoreExecutor() { + final int estimatedPhysicalCoreCount = + Integer.getInteger( "jvector.physical_core_count", - Math.max(1, Runtime.getRuntime().availableProcessors() / 2) - ); - assert estimatedPhysicalCoreCount > 0 && estimatedPhysicalCoreCount <= Runtime.getRuntime().availableProcessors() - : "Invalid core count: " + estimatedPhysicalCoreCount; - final ForkJoinPool.ForkJoinWorkerThreadFactory factory = pool -> { - ForkJoinWorkerThread thread = ForkJoinPool.defaultForkJoinWorkerThreadFactory.newThread(pool); - thread.setPriority(Thread.NORM_PRIORITY - 2); - return thread; + Math.max(1, Runtime.getRuntime().availableProcessors() / 2)); + assert estimatedPhysicalCoreCount > 0 + && estimatedPhysicalCoreCount <= Runtime.getRuntime().availableProcessors() + : "Invalid core count: " + estimatedPhysicalCoreCount; + final ForkJoinPool.ForkJoinWorkerThreadFactory factory = + pool -> { + ForkJoinWorkerThread thread = + ForkJoinPool.defaultForkJoinWorkerThreadFactory.newThread(pool); + thread.setPriority(Thread.NORM_PRIORITY - 2); + return thread; }; - log.info("Creating SIMD ForkJoinPool with {} physical cores for JVector SIMD operations", estimatedPhysicalCoreCount); - return new ForkJoinPool(estimatedPhysicalCoreCount, factory, null, true); - } + log.info( + "Creating SIMD ForkJoinPool with {} physical cores for JVector SIMD operations", + estimatedPhysicalCoreCount); + return new ForkJoinPool(estimatedPhysicalCoreCount, factory, null, true); + } } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorIndexWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorIndexWriter.java index 
c4cc2f715bec..3a99635582a7 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorIndexWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorIndexWriter.java @@ -18,100 +18,102 @@ package org.opensearch.knn.index.codec.jvector; import io.github.jbellis.jvector.disk.IndexWriter; +import java.io.IOException; import lombok.extern.log4j.Log4j2; import org.apache.lucene.store.IndexOutput; -import java.io.IOException; - /** * JVectorRandomAccessWriter is a wrapper around IndexOutput that implements RandomAccessWriter. * Note: This is not thread safe! */ @Log4j2 public class JVectorIndexWriter implements IndexWriter { - private final IndexOutput indexOutputDelegate; - - public JVectorIndexWriter(IndexOutput indexOutputDelegate) { - this.indexOutputDelegate = indexOutputDelegate; - } - - @Override - public long position() throws IOException { - return indexOutputDelegate.getFilePointer(); - } - - @Override - public void close() throws IOException { - indexOutputDelegate.close(); - } - - @Override - public void write(int b) throws IOException { - indexOutputDelegate.writeByte((byte) b); - } - - @Override - public void write(byte[] b) throws IOException { - indexOutputDelegate.writeBytes(b, 0, b.length); - } - - @Override - public void write(byte[] b, int off, int len) throws IOException { - indexOutputDelegate.writeBytes(b, off, len); - } - - @Override - public void writeBoolean(boolean v) throws IOException { - indexOutputDelegate.writeByte((byte) (v ? 1 : 0)); - } - - @Override - public void writeByte(int v) throws IOException { - indexOutputDelegate.writeByte((byte) v); - } - - @Override - public void writeShort(int v) throws IOException { - indexOutputDelegate.writeShort((short) v); - } - - @Override - public void writeChar(int v) throws IOException { - throw new UnsupportedOperationException("JVectorRandomAccessWriter does not support writing chars"); - } - - @Override - public void writeInt(int v) throws IOException { - indexOutputDelegate.writeInt(v); - } - - @Override - public void writeLong(long v) throws IOException { - indexOutputDelegate.writeLong(v); - } - - @Override - public void writeFloat(float v) throws IOException { - indexOutputDelegate.writeInt(Float.floatToIntBits(v)); - } - - @Override - public void writeDouble(double v) throws IOException { - writeLong(Double.doubleToLongBits(v)); - } - - @Override - public void writeBytes(String s) throws IOException { - throw new UnsupportedOperationException("JVectorIndexWriter does not support writing String as bytes"); - } - - @Override - public void writeChars(String s) throws IOException { - throw new UnsupportedOperationException("JVectorIndexWriter does not support writing chars"); - } - - @Override - public void writeUTF(String s) throws IOException { - throw new UnsupportedOperationException("JVectorIndexWriter does not support writing UTF strings"); - } + private final IndexOutput indexOutputDelegate; + + public JVectorIndexWriter(IndexOutput indexOutputDelegate) { + this.indexOutputDelegate = indexOutputDelegate; + } + + @Override + public long position() throws IOException { + return indexOutputDelegate.getFilePointer(); + } + + @Override + public void close() throws IOException { + indexOutputDelegate.close(); + } + + @Override + public void write(int b) throws IOException { + indexOutputDelegate.writeByte((byte) b); + } + + @Override + public void write(byte[] b) throws IOException { + indexOutputDelegate.writeBytes(b, 0, b.length); + } + + @Override + public 
void write(byte[] b, int off, int len) throws IOException { + indexOutputDelegate.writeBytes(b, off, len); + } + + @Override + public void writeBoolean(boolean v) throws IOException { + indexOutputDelegate.writeByte((byte) (v ? 1 : 0)); + } + + @Override + public void writeByte(int v) throws IOException { + indexOutputDelegate.writeByte((byte) v); + } + + @Override + public void writeShort(int v) throws IOException { + indexOutputDelegate.writeShort((short) v); + } + + @Override + public void writeChar(int v) throws IOException { + throw new UnsupportedOperationException( + "JVectorRandomAccessWriter does not support writing chars"); + } + + @Override + public void writeInt(int v) throws IOException { + indexOutputDelegate.writeInt(v); + } + + @Override + public void writeLong(long v) throws IOException { + indexOutputDelegate.writeLong(v); + } + + @Override + public void writeFloat(float v) throws IOException { + indexOutputDelegate.writeInt(Float.floatToIntBits(v)); + } + + @Override + public void writeDouble(double v) throws IOException { + writeLong(Double.doubleToLongBits(v)); + } + + @Override + public void writeBytes(String s) throws IOException { + throw new UnsupportedOperationException( + "JVectorIndexWriter does not support writing String as bytes"); + } + + @Override + public void writeChars(String s) throws IOException { + throw new UnsupportedOperationException("JVectorIndexWriter does not support writing chars"); + } + + @Override + public void writeUTF(String s) throws IOException { + throw new UnsupportedOperationException( + "JVectorIndexWriter does not support writing UTF strings"); + } } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorKnnCollector.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorKnnCollector.java index 32b35af7c012..c5490349ef0a 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorKnnCollector.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorKnnCollector.java @@ -18,62 +18,63 @@ import lombok.Value; import org.apache.lucene.search.KnnCollector; -import org.apache.lucene.search.knn.KnnSearchStrategy; import org.apache.lucene.search.TopDocs; +import org.apache.lucene.search.knn.KnnSearchStrategy; /** - * Wrapper class for KnnCollector that provides passing of additional parameters specific for JVector. + * Wrapper class for KnnCollector that provides passing of additional parameters specific for + * JVector. 
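For orientation, a hedged sketch that is not part of the diff: JVectorIndexWriter above is a thin adapter from Lucene's IndexOutput to jVector's IndexWriter, so jVector serialization code can write straight into a segment file. The file name and the Directory argument below are placeholders, not names used by this change.

// Hypothetical usage of the adapter; "example.graph" is an illustrative file name.
static long writeSketch(org.apache.lucene.store.Directory directory) throws java.io.IOException {
  try (org.apache.lucene.store.IndexOutput out =
      directory.createOutput("example.graph", org.apache.lucene.store.IOContext.DEFAULT)) {
    io.github.jbellis.jvector.disk.IndexWriter writer = new JVectorIndexWriter(out);
    writer.writeInt(42);       // delegates to IndexOutput.writeInt
    writer.writeFloat(3.14f);  // stored via Float.floatToIntBits
    return writer.position();  // same value as out.getFilePointer()
  }
}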
*/ @Value public class JVectorKnnCollector implements KnnCollector { - KnnCollector delegate; - float threshold; - float rerankFloor; - int overQueryFactor; - boolean usePruning; + KnnCollector delegate; + float threshold; + float rerankFloor; + int overQueryFactor; + boolean usePruning; - @Override - public boolean earlyTerminated() { - return delegate.earlyTerminated(); - } + @Override + public boolean earlyTerminated() { + return delegate.earlyTerminated(); + } - @Override - public void incVisitedCount(int count) { - delegate.incVisitedCount(count); - } + @Override + public void incVisitedCount(int count) { + delegate.incVisitedCount(count); + } - @Override - public long visitedCount() { - return delegate.visitedCount(); - } + @Override + public long visitedCount() { + return delegate.visitedCount(); + } - @Override - public long visitLimit() { - return delegate.visitLimit(); - } + @Override + public long visitLimit() { + return delegate.visitLimit(); + } - @Override - public int k() { - return delegate.k(); - } + @Override + public int k() { + return delegate.k(); + } - @Override - public boolean collect(int docId, float similarity) { - return delegate.collect(docId, similarity); - } + @Override + public boolean collect(int docId, float similarity) { + return delegate.collect(docId, similarity); + } - @Override - public float minCompetitiveSimilarity() { - return delegate.minCompetitiveSimilarity(); - } + @Override + public float minCompetitiveSimilarity() { + return delegate.minCompetitiveSimilarity(); + } - @Override - public TopDocs topDocs() { - return delegate.topDocs(); - } + @Override + public TopDocs topDocs() { + return delegate.topDocs(); + } - @Override - public KnnSearchStrategy getSearchStrategy() { - return delegate.getSearchStrategy(); - } + @Override + public KnnSearchStrategy getSearchStrategy() { + return delegate.getSearchStrategy(); + } } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorKnnFloatVectorQuery.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorKnnFloatVectorQuery.java index 1ee729db1543..d2ece0b9eebc 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorKnnFloatVectorQuery.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorKnnFloatVectorQuery.java @@ -16,6 +16,7 @@ */ package org.opensearch.knn.index.codec.jvector; +import java.io.IOException; import org.apache.lucene.index.FloatVectorValues; import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.LeafReaderContext; @@ -24,72 +25,71 @@ import org.apache.lucene.search.knn.KnnSearchStrategy; import org.apache.lucene.util.Bits; -import java.io.IOException; - /** - * {@link KnnFloatVectorQuery} that uses jVector to perform the search. - * We use this wrapper simply because we can't pass jVector specific parameters with the upstream {@link KnnFloatVectorQuery}. + * {@link KnnFloatVectorQuery} that uses jVector to perform the search. We use this wrapper simply + * because we can't pass jVector specific parameters with the upstream {@link KnnFloatVectorQuery}. 
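A usage sketch, not part of the diff, for the query class declared just below: it exists so callers can thread the jVector-specific knobs (overQueryFactor, threshold, rerankFloor, usePruning) through an otherwise standard Lucene query. The field name, vector size, and numeric values are placeholders; only the constructor shape comes from this change.

// Illustrative only; searcher is an IndexSearcher opened elsewhere.
static org.apache.lucene.search.TopDocs searchSketch(
    org.apache.lucene.search.IndexSearcher searcher, float[] queryEmbedding)
    throws java.io.IOException {
  org.apache.lucene.search.Query q =
      new JVectorKnnFloatVectorQuery(
          "embedding", queryEmbedding, /* k */ 10,
          /* overQueryFactor */ 5, /* threshold */ 0.0f,
          /* rerankFloor */ 0.0f, /* usePruning */ false);
  return searcher.search(q, 10);
}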
*/ public class JVectorKnnFloatVectorQuery extends KnnFloatVectorQuery { - private static final TopDocs NO_RESULTS = TopDocsCollector.EMPTY_TOPDOCS; - private final int overQueryFactor; - private final float threshold; - private final float rerankFloor; - private final boolean usePruning; + private static final TopDocs NO_RESULTS = TopDocsCollector.EMPTY_TOPDOCS; + private final int overQueryFactor; + private final float threshold; + private final float rerankFloor; + private final boolean usePruning; - public JVectorKnnFloatVectorQuery( - String field, - float[] target, - int k, - int overQueryFactor, - float threshold, - float rerankFloor, - boolean usePruning - ) { - super(field, target, k); - this.overQueryFactor = overQueryFactor; - this.threshold = threshold; - this.rerankFloor = rerankFloor; - this.usePruning = usePruning; - } + public JVectorKnnFloatVectorQuery( + String field, + float[] target, + int k, + int overQueryFactor, + float threshold, + float rerankFloor, + boolean usePruning) { + super(field, target, k); + this.overQueryFactor = overQueryFactor; + this.threshold = threshold; + this.rerankFloor = rerankFloor; + this.usePruning = usePruning; + } - public JVectorKnnFloatVectorQuery( - String field, - float[] target, - int k, - Query filter, - int overQueryFactor, - float threshold, - float rerankFloor, - boolean usePruning - ) { - super(field, target, k, filter); - this.overQueryFactor = overQueryFactor; - this.threshold = threshold; - this.rerankFloor = rerankFloor; - this.usePruning = usePruning; - } + public JVectorKnnFloatVectorQuery( + String field, + float[] target, + int k, + Query filter, + int overQueryFactor, + float threshold, + float rerankFloor, + boolean usePruning) { + super(field, target, k, filter); + this.overQueryFactor = overQueryFactor; + this.threshold = threshold; + this.rerankFloor = rerankFloor; + this.usePruning = usePruning; + } - @Override - protected TopDocs approximateSearch( - LeafReaderContext context, - Bits acceptDocs, - int visitedLimit, - KnnCollectorManager knnCollectorManager - ) throws IOException { - final KnnCollector delegateCollector = knnCollectorManager.newCollector(visitedLimit, KnnSearchStrategy.Hnsw.DEFAULT, context); - final KnnCollector knnCollector = new JVectorKnnCollector(delegateCollector, threshold, rerankFloor, overQueryFactor, usePruning); - LeafReader reader = context.reader(); - FloatVectorValues floatVectorValues = reader.getFloatVectorValues(field); - if (floatVectorValues == null) { - FloatVectorValues.checkField(reader, field); - return NO_RESULTS; - } - if (Math.min(knnCollector.k(), floatVectorValues.size()) == 0) { - return NO_RESULTS; - } - reader.searchNearestVectors(field, getTargetCopy(), knnCollector, acceptDocs); - TopDocs results = knnCollector.topDocs(); - return results != null ? 
results : NO_RESULTS; + @Override + protected TopDocs approximateSearch( + LeafReaderContext context, + Bits acceptDocs, + int visitedLimit, + KnnCollectorManager knnCollectorManager) + throws IOException { + final KnnCollector delegateCollector = + knnCollectorManager.newCollector(visitedLimit, KnnSearchStrategy.Hnsw.DEFAULT, context); + final KnnCollector knnCollector = + new JVectorKnnCollector( + delegateCollector, threshold, rerankFloor, overQueryFactor, usePruning); + LeafReader reader = context.reader(); + FloatVectorValues floatVectorValues = reader.getFloatVectorValues(field); + if (floatVectorValues == null) { + FloatVectorValues.checkField(reader, field); + return NO_RESULTS; + } + if (Math.min(knnCollector.k(), floatVectorValues.size()) == 0) { + return NO_RESULTS; } + reader.searchNearestVectors(field, getTargetCopy(), knnCollector, acceptDocs); + TopDocs results = knnCollector.topDocs(); + return results != null ? results : NO_RESULTS; + } } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorRandomAccessReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorRandomAccessReader.java index 0599ff2121cb..25f49a897c76 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorRandomAccessReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorRandomAccessReader.java @@ -19,168 +19,177 @@ import io.github.jbellis.jvector.disk.RandomAccessReader; import io.github.jbellis.jvector.disk.ReaderSupplier; -import lombok.extern.log4j.Log4j2; -import org.apache.lucene.store.IndexInput; -import org.apache.lucene.util.IOUtils; - import java.io.EOFException; import java.io.IOException; import java.nio.ByteBuffer; import java.nio.FloatBuffer; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.atomic.AtomicInteger; +import lombok.extern.log4j.Log4j2; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.IOUtils; @Log4j2 public class JVectorRandomAccessReader implements RandomAccessReader { - private final byte[] internalBuffer = new byte[Long.BYTES]; - private final byte[] internalFloatBuffer = new byte[Float.BYTES]; - private final IndexInput indexInputDelegate; - private volatile boolean closed = false; - - public JVectorRandomAccessReader(IndexInput indexInputDelegate) { - this.indexInputDelegate = indexInputDelegate; - } - - @Override - public void seek(long offset) throws IOException { - indexInputDelegate.seek(offset); - } - - @Override - public long getPosition() throws IOException { - return indexInputDelegate.getFilePointer(); + private final byte[] internalBuffer = new byte[Long.BYTES]; + private final byte[] internalFloatBuffer = new byte[Float.BYTES]; + private final IndexInput indexInputDelegate; + private volatile boolean closed = false; + + public JVectorRandomAccessReader(IndexInput indexInputDelegate) { + this.indexInputDelegate = indexInputDelegate; + } + + @Override + public void seek(long offset) throws IOException { + indexInputDelegate.seek(offset); + } + + @Override + public long getPosition() throws IOException { + return indexInputDelegate.getFilePointer(); + } + + @Override + public int readInt() throws IOException { + return indexInputDelegate.readInt(); + } + + @Override + public float readFloat() throws IOException { + return Float.intBitsToFloat(indexInputDelegate.readInt()); + } + + // TODO: bring back to override when upgrading jVector again + // @Override + public long readLong() throws 
IOException { + return indexInputDelegate.readLong(); + } + + @Override + public void readFully(byte[] bytes) throws IOException { + indexInputDelegate.readBytes(bytes, 0, bytes.length); + } + + @Override + public void readFully(ByteBuffer buffer) throws IOException { + // validate that the requested bytes actually exist ---- + long remainingInFile = indexInputDelegate.length() - indexInputDelegate.getFilePointer(); + if (buffer.remaining() > remainingInFile) { + throw new EOFException( + "Requested " + buffer.remaining() + " bytes but only " + remainingInFile + " available"); } - @Override - public int readInt() throws IOException { - return indexInputDelegate.readInt(); + // Heap buffers with a backing array can be filled in one call ---- + if (buffer.hasArray()) { + int off = buffer.arrayOffset() + buffer.position(); + int len = buffer.remaining(); + indexInputDelegate.readBytes(buffer.array(), off, len); + buffer.position(buffer.limit()); // advance fully + return; } - @Override - public float readFloat() throws IOException { - return Float.intBitsToFloat(indexInputDelegate.readInt()); + // Direct / non-array buffers: copy in reasonable chunks ---- + while (buffer.hasRemaining()) { + final int bytesToRead = Math.min(buffer.remaining(), Long.BYTES); + indexInputDelegate.readBytes(this.internalBuffer, 0, bytesToRead); + buffer.put(this.internalBuffer, 0, bytesToRead); } + } - // TODO: bring back to override when upgrading jVector again - // @Override - public long readLong() throws IOException { - return indexInputDelegate.readLong(); + @Override + public void readFully(long[] vector) throws IOException { + for (int i = 0; i < vector.length; i++) { + vector[i] = readLong(); } + } - @Override - public void readFully(byte[] bytes) throws IOException { - indexInputDelegate.readBytes(bytes, 0, bytes.length); + @Override + public void read(int[] ints, int offset, int count) throws IOException { + for (int i = 0; i < count; i++) { + ints[offset + i] = readInt(); } - - @Override - public void readFully(ByteBuffer buffer) throws IOException { - // validate that the requested bytes actually exist ---- - long remainingInFile = indexInputDelegate.length() - indexInputDelegate.getFilePointer(); - if (buffer.remaining() > remainingInFile) { - throw new EOFException("Requested " + buffer.remaining() + " bytes but only " + remainingInFile + " available"); - } - - // Heap buffers with a backing array can be filled in one call ---- - if (buffer.hasArray()) { - int off = buffer.arrayOffset() + buffer.position(); - int len = buffer.remaining(); - indexInputDelegate.readBytes(buffer.array(), off, len); - buffer.position(buffer.limit()); // advance fully - return; - } - - // Direct / non-array buffers: copy in reasonable chunks ---- - while (buffer.hasRemaining()) { - final int bytesToRead = Math.min(buffer.remaining(), Long.BYTES); - indexInputDelegate.readBytes(this.internalBuffer, 0, bytesToRead); - buffer.put(this.internalBuffer, 0, bytesToRead); - } - } - - @Override - public void readFully(long[] vector) throws IOException { - for (int i = 0; i < vector.length; i++) { - vector[i] = readLong(); - } + } + + @Override + public void read(float[] floats, int offset, int count) throws IOException { + final ByteBuffer byteBuffer = ByteBuffer.allocate(Float.BYTES * count); + indexInputDelegate.readBytes(byteBuffer.array(), offset, Float.BYTES * count); + FloatBuffer buffer = byteBuffer.asFloatBuffer(); + buffer.get(floats, offset, count); + } + + @Override + public void close() throws IOException { + 
log.debug("Closing JVectorRandomAccessReader for file: {}", indexInputDelegate); + this.closed = true; + // no need to really close the index input delegate since it is a clone + log.debug("Closed JVectorRandomAccessReader for file: {}", indexInputDelegate); + } + + @Override + public long length() throws IOException { + return indexInputDelegate.length(); + } + + /** + * Supplies readers which are actually slices of the original IndexInput. We will vend out slices + * in order for us to easily find the footer of the jVector graph index. This is useful because + * our logic that reads the graph that the footer is always at {@link IndexInput#length()} of the + * slice. Which is how {@link + * io.github.jbellis.jvector.graph.disk.OnDiskGraphIndex#load(ReaderSupplier, long)} is working + * behind the scenes. The header offset, on the other hand, is flexible because we can provide it + * as a parameter to {@link + * io.github.jbellis.jvector.graph.disk.OnDiskGraphIndex#load(ReaderSupplier, long)} + */ + public static class Supplier implements ReaderSupplier { + private final AtomicInteger readerCount = new AtomicInteger(0); + private final IndexInput currentInput; + private final long sliceStartOffset; + private final long sliceLength; + private final ConcurrentHashMap readers = + new ConcurrentHashMap<>(); + + public Supplier(IndexInput indexInput) throws IOException { + this( + indexInput, + indexInput.getFilePointer(), + indexInput.length() - indexInput.getFilePointer()); } - @Override - public void read(int[] ints, int offset, int count) throws IOException { - for (int i = 0; i < count; i++) { - ints[offset + i] = readInt(); - } + public Supplier(IndexInput indexInput, long sliceStartOffset, long sliceLength) + throws IOException { + this.currentInput = indexInput; + this.sliceStartOffset = sliceStartOffset; + this.sliceLength = sliceLength; } @Override - public void read(float[] floats, int offset, int count) throws IOException { - final ByteBuffer byteBuffer = ByteBuffer.allocate(Float.BYTES * count); - indexInputDelegate.readBytes(byteBuffer.array(), offset, Float.BYTES * count); - FloatBuffer buffer = byteBuffer.asFloatBuffer(); - buffer.get(floats, offset, count); + public RandomAccessReader get() throws IOException { + synchronized (this) { + final IndexInput input = + currentInput + .slice("Input Slice for the jVector graph or PQ", sliceStartOffset, sliceLength) + .clone(); + + var reader = new JVectorRandomAccessReader(input); + int readerId = readerCount.getAndIncrement(); + readers.put(readerId, reader); + return reader; + } } @Override public void close() throws IOException { - log.debug("Closing JVectorRandomAccessReader for file: {}", indexInputDelegate); - this.closed = true; - // no need to really close the index input delegate since it is a clone - log.debug("Closed JVectorRandomAccessReader for file: {}", indexInputDelegate); - } - - @Override - public long length() throws IOException { - return indexInputDelegate.length(); - } - - /** - * Supplies readers which are actually slices of the original IndexInput. - * We will vend out slices in order for us to easily find the footer of the jVector graph index. - * This is useful because our logic that reads the graph that the footer is always at {@link IndexInput#length()} of the slice. - * Which is how {@link io.github.jbellis.jvector.graph.disk.OnDiskGraphIndex#load(ReaderSupplier, long)} is working behind the scenes. 
- * The header offset, on the other hand, is flexible because we can provide it as a parameter to {@link io.github.jbellis.jvector.graph.disk.OnDiskGraphIndex#load(ReaderSupplier, long)} - */ - public static class Supplier implements ReaderSupplier { - private final AtomicInteger readerCount = new AtomicInteger(0); - private final IndexInput currentInput; - private final long sliceStartOffset; - private final long sliceLength; - private final ConcurrentHashMap readers = new ConcurrentHashMap<>(); - - public Supplier(IndexInput indexInput) throws IOException { - this(indexInput, indexInput.getFilePointer(), indexInput.length() - indexInput.getFilePointer()); - } - - public Supplier(IndexInput indexInput, long sliceStartOffset, long sliceLength) throws IOException { - this.currentInput = indexInput; - this.sliceStartOffset = sliceStartOffset; - this.sliceLength = sliceLength; - } - - @Override - public RandomAccessReader get() throws IOException { - synchronized (this) { - final IndexInput input = currentInput.slice("Input Slice for the jVector graph or PQ", sliceStartOffset, sliceLength) - .clone(); - - var reader = new JVectorRandomAccessReader(input); - int readerId = readerCount.getAndIncrement(); - readers.put(readerId, reader); - return reader; - } - - } - - @Override - public void close() throws IOException { - // Close source of all cloned inputs - IOUtils.closeWhileHandlingException(currentInput); - - // Close all readers - for (RandomAccessReader reader : readers.values()) { - IOUtils.closeWhileHandlingException(reader::close); - } - readers.clear(); - readerCount.set(0); - } + // Close source of all cloned inputs + IOUtils.closeWhileHandlingException(currentInput); + + // Close all readers + for (RandomAccessReader reader : readers.values()) { + IOUtils.closeWhileHandlingException(reader::close); + } + readers.clear(); + readerCount.set(0); } + } } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java index 8e36c1c3dda3..95a98830ff5d 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java @@ -27,9 +27,16 @@ import io.github.jbellis.jvector.graph.similarity.SearchScoreProvider; import io.github.jbellis.jvector.quantization.PQVectors; import io.github.jbellis.jvector.quantization.ProductQuantization; +import io.github.jbellis.jvector.vector.VectorSimilarityFunction; import io.github.jbellis.jvector.vector.VectorizationProvider; import io.github.jbellis.jvector.vector.types.VectorFloat; import io.github.jbellis.jvector.vector.types.VectorTypeSupport; +import java.io.Closeable; +import java.io.IOException; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Optional; import lombok.extern.log4j.Log4j2; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.KnnVectorsReader; @@ -38,357 +45,360 @@ import org.apache.lucene.store.*; import org.apache.lucene.util.Bits; import org.apache.lucene.util.IOUtils; -import io.github.jbellis.jvector.vector.VectorSimilarityFunction; import org.opensearch.knn.common.KNNConstants; import org.opensearch.knn.plugin.stats.KNNCounter; -import java.io.Closeable; -import java.io.IOException; - -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Optional; - @Log4j2 public class JVectorReader extends 
KnnVectorsReader { - private static final VectorTypeSupport VECTOR_TYPE_SUPPORT = VectorizationProvider.getInstance().getVectorTypeSupport(); - - private final FieldInfos fieldInfos; - private final String baseDataFileName; - // Maps field name to field entries - private final Map fieldEntryMap = new HashMap<>(1); - private final Directory directory; - private final SegmentReadState state; - - public JVectorReader(SegmentReadState state) throws IOException { - this.state = state; - this.fieldInfos = state.fieldInfos; - this.baseDataFileName = state.segmentInfo.name + "_" + state.segmentSuffix; - final String metaFileName = IndexFileNames.segmentFileName( - state.segmentInfo.name, - state.segmentSuffix, - JVectorFormat.META_EXTENSION - ); - this.directory = state.directory; - boolean success = false; - try (ChecksumIndexInput meta = state.directory.openChecksumInput(metaFileName)) { - CodecUtil.checkIndexHeader( - meta, - JVectorFormat.META_CODEC_NAME, - JVectorFormat.VERSION_START, - JVectorFormat.VERSION_CURRENT, - state.segmentInfo.getId(), - state.segmentSuffix - ); - readFields(meta); - CodecUtil.checkFooter(meta); - - success = true; - } finally { - if (!success) { - IOUtils.closeWhileHandlingException(this); - } - } + private static final VectorTypeSupport VECTOR_TYPE_SUPPORT = + VectorizationProvider.getInstance().getVectorTypeSupport(); + + private final FieldInfos fieldInfos; + private final String baseDataFileName; + // Maps field name to field entries + private final Map fieldEntryMap = new HashMap<>(1); + private final Directory directory; + private final SegmentReadState state; + + public JVectorReader(SegmentReadState state) throws IOException { + this.state = state; + this.fieldInfos = state.fieldInfos; + this.baseDataFileName = state.segmentInfo.name + "_" + state.segmentSuffix; + final String metaFileName = + IndexFileNames.segmentFileName( + state.segmentInfo.name, state.segmentSuffix, JVectorFormat.META_EXTENSION); + this.directory = state.directory; + boolean success = false; + try (ChecksumIndexInput meta = state.directory.openChecksumInput(metaFileName)) { + CodecUtil.checkIndexHeader( + meta, + JVectorFormat.META_CODEC_NAME, + JVectorFormat.VERSION_START, + JVectorFormat.VERSION_CURRENT, + state.segmentInfo.getId(), + state.segmentSuffix); + readFields(meta); + CodecUtil.checkFooter(meta); + + success = true; + } finally { + if (!success) { + IOUtils.closeWhileHandlingException(this); + } } - - @Override - public void checkIntegrity() throws IOException { - for (FieldEntry fieldEntry : fieldEntryMap.values()) { - // Verify the vector index file - try (var indexInput = state.directory.openInput(fieldEntry.vectorIndexFieldDataFileName, IOContext.READONCE)) { - CodecUtil.checksumEntireFile(indexInput); - } - - // Verify the neighbors score cache file - try (var indexInput = state.directory.openInput(fieldEntry.neighborsScoreCacheIndexFieldFileName, IOContext.READONCE)) { - CodecUtil.checksumEntireFile(indexInput); - } - } + } + + @Override + public void checkIntegrity() throws IOException { + for (FieldEntry fieldEntry : fieldEntryMap.values()) { + // Verify the vector index file + try (var indexInput = + state.directory.openInput(fieldEntry.vectorIndexFieldDataFileName, IOContext.READONCE)) { + CodecUtil.checksumEntireFile(indexInput); + } + + // Verify the neighbors score cache file + try (var indexInput = + state.directory.openInput( + fieldEntry.neighborsScoreCacheIndexFieldFileName, IOContext.READONCE)) { + CodecUtil.checksumEntireFile(indexInput); + } } - - 
@Override - public FloatVectorValues getFloatVectorValues(String field) throws IOException { - final FieldEntry fieldEntry = fieldEntryMap.get(field); - return new JVectorFloatVectorValues(fieldEntry.index, fieldEntry.similarityFunction, fieldEntry.graphNodeIdToDocMap); + } + + @Override + public FloatVectorValues getFloatVectorValues(String field) throws IOException { + final FieldEntry fieldEntry = fieldEntryMap.get(field); + return new JVectorFloatVectorValues( + fieldEntry.index, fieldEntry.similarityFunction, fieldEntry.graphNodeIdToDocMap); + } + + @Override + public ByteVectorValues getByteVectorValues(String field) throws IOException { + /** Byte vector values are not supported in jVector library. Instead use PQ. */ + return null; + } + + public Optional getProductQuantizationForField(String field) + throws IOException { + final FieldEntry fieldEntry = fieldEntryMap.get(field); + if (fieldEntry.pqVectors == null) { + return Optional.empty(); } - @Override - public ByteVectorValues getByteVectorValues(String field) throws IOException { - /** - * Byte vector values are not supported in jVector library. Instead use PQ. - */ - return null; + return Optional.of(fieldEntry.pqVectors.getCompressor()); + } + + public RandomAccessReader getNeighborsScoreCacheForField(String field) throws IOException { + final FieldEntry fieldEntry = fieldEntryMap.get(field); + return fieldEntry.neighborsScoreCacheIndexReaderSupplier.get(); + } + + public OnDiskGraphIndex getOnDiskGraphIndex(String field) throws IOException { + return fieldEntryMap.get(field).index; + } + + @Override + public void search(String field, float[] target, KnnCollector knnCollector, Bits acceptDocs) + throws IOException { + final OnDiskGraphIndex index = fieldEntryMap.get(field).index; + final JVectorKnnCollector jvectorKnnCollector; + if (knnCollector instanceof JVectorKnnCollector) { + jvectorKnnCollector = (JVectorKnnCollector) knnCollector; + } else { + log.warn( + "KnnCollector must be of type JVectorKnnCollector, for now we will re-wrap it but this is not ideal"); + jvectorKnnCollector = + new JVectorKnnCollector( + knnCollector, + KNNConstants.DEFAULT_QUERY_SIMILARITY_THRESHOLD.floatValue(), + KNNConstants.DEFAULT_QUERY_RERANK_FLOOR.floatValue(), + KNNConstants.DEFAULT_OVER_QUERY_FACTOR, + KNNConstants.DEFAULT_QUERY_USE_PRUNING); } - public Optional getProductQuantizationForField(String field) throws IOException { - final FieldEntry fieldEntry = fieldEntryMap.get(field); - if (fieldEntry.pqVectors == null) { - return Optional.empty(); + // search for a random vector using a GraphSearcher and SearchScoreProvider + VectorFloat q = VECTOR_TYPE_SUPPORT.createFloatVector(target); + final SearchScoreProvider ssp; + + try (var view = index.getView()) { + final long graphSearchStart = System.currentTimeMillis(); + if (fieldEntryMap.get(field).pqVectors + != null) { // Quantized, use the precomputed score function + final PQVectors pqVectors = fieldEntryMap.get(field).pqVectors; + // SearchScoreProvider that does a first pass with the loaded-in-memory PQVectors, + // then reranks with the exact vectors that are stored on disk in the index + ScoreFunction.ApproximateScoreFunction asf = + pqVectors.precomputedScoreFunctionFor(q, fieldEntryMap.get(field).similarityFunction); + ScoreFunction.ExactScoreFunction reranker = + view.rerankerFor(q, fieldEntryMap.get(field).similarityFunction); + ssp = new DefaultSearchScoreProvider(asf, reranker); + } else { // Not quantized, used typical searcher + ssp = + DefaultSearchScoreProvider.exact(q, 
fieldEntryMap.get(field).similarityFunction, view); + } + final GraphNodeIdToDocMap jvectorLuceneDocMap = fieldEntryMap.get(field).graphNodeIdToDocMap; + // Convert the acceptDocs bitmap from Lucene to jVector ordinal bitmap filter + // Logic works as follows: if acceptDocs is null, we accept all ordinals. Otherwise, we check + // if the jVector ordinal has a + // corresponding Lucene doc ID accepted by acceptDocs filter. + io.github.jbellis.jvector.util.Bits compatibleBits = + ord -> acceptDocs == null || acceptDocs.get(jvectorLuceneDocMap.getLuceneDocId(ord)); + + try (var graphSearcher = new GraphSearcher(index)) { + final var searchResults = + graphSearcher.search( + ssp, + jvectorKnnCollector.k(), + jvectorKnnCollector.k() * jvectorKnnCollector.getOverQueryFactor(), + jvectorKnnCollector.getThreshold(), + jvectorKnnCollector.getRerankFloor(), + compatibleBits); + for (SearchResult.NodeScore ns : searchResults.getNodes()) { + jvectorKnnCollector.collect(jvectorLuceneDocMap.getLuceneDocId(ns.node), ns.score); } - - return Optional.of(fieldEntry.pqVectors.getCompressor()); + final long graphSearchEnd = System.currentTimeMillis(); + final long searchTime = graphSearchEnd - graphSearchStart; + log.debug("Search (including acquiring view) took {} ms", searchTime); + + // Collect the below metrics about the search and somehow wire this back to {@link + // @KNNStats} + final int visitedNodesCount = searchResults.getVisitedCount(); + final int rerankedCount = searchResults.getRerankedCount(); + + final int expandedCount = searchResults.getExpandedCount(); + final int expandedBaseLayerCount = searchResults.getExpandedCountBaseLayer(); + + KNNCounter.KNN_QUERY_VISITED_NODES.add(visitedNodesCount); + KNNCounter.KNN_QUERY_RERANKED_COUNT.add(rerankedCount); + KNNCounter.KNN_QUERY_EXPANDED_NODES.add(expandedCount); + KNNCounter.KNN_QUERY_EXPANDED_BASE_LAYER_NODES.add(expandedBaseLayerCount); + KNNCounter.KNN_QUERY_GRAPH_SEARCH_TIME.add(searchTime); + log.debug( + "rerankedCount: {}, visitedNodesCount: {}, expandedCount: {}, expandedBaseLayerCount: {}", + rerankedCount, + visitedNodesCount, + expandedCount, + expandedBaseLayerCount); + } } - - public RandomAccessReader getNeighborsScoreCacheForField(String field) throws IOException { - final FieldEntry fieldEntry = fieldEntryMap.get(field); - return fieldEntry.neighborsScoreCacheIndexReaderSupplier.get(); + } + + @Override + public void search(String field, byte[] target, KnnCollector knnCollector, Bits acceptDocs) + throws IOException { + // TODO: implement this + throw new UnsupportedOperationException("Byte vector search is not supported yet with jVector"); + } + + @Override + public void close() throws IOException { + for (FieldEntry fieldEntry : fieldEntryMap.values()) { + IOUtils.close(fieldEntry); } - - public OnDiskGraphIndex getOnDiskGraphIndex(String field) throws IOException { - return fieldEntryMap.get(field).index; + fieldEntryMap.clear(); + } + + private void readFields(ChecksumIndexInput meta) throws IOException { + for (int fieldNumber = meta.readInt(); fieldNumber != -1; fieldNumber = meta.readInt()) { + final FieldInfo fieldInfo = fieldInfos.fieldInfo(fieldNumber); // read field number + JVectorWriter.VectorIndexFieldMetadata vectorIndexFieldMetadata = + new JVectorWriter.VectorIndexFieldMetadata(meta); + assert fieldInfo.number == vectorIndexFieldMetadata.getFieldNumber(); + fieldEntryMap.put(fieldInfo.name, new FieldEntry(fieldInfo, vectorIndexFieldMetadata)); } - - @Override - public void search(String field, float[] target, 
KnnCollector knnCollector, Bits acceptDocs) throws IOException { - final OnDiskGraphIndex index = fieldEntryMap.get(field).index; - final JVectorKnnCollector jvectorKnnCollector; - if (knnCollector instanceof JVectorKnnCollector) { - jvectorKnnCollector = (JVectorKnnCollector) knnCollector; - } else { - log.warn("KnnCollector must be of type JVectorKnnCollector, for now we will re-wrap it but this is not ideal"); - jvectorKnnCollector = new JVectorKnnCollector( - knnCollector, - KNNConstants.DEFAULT_QUERY_SIMILARITY_THRESHOLD.floatValue(), - KNNConstants.DEFAULT_QUERY_RERANK_FLOOR.floatValue(), - KNNConstants.DEFAULT_OVER_QUERY_FACTOR, - KNNConstants.DEFAULT_QUERY_USE_PRUNING - ); - + } + + class FieldEntry implements Closeable { + private final FieldInfo fieldInfo; + private final VectorEncoding vectorEncoding; + private final VectorSimilarityFunction similarityFunction; + private final int dimension; + private final long vectorIndexOffset; + private final long vectorIndexLength; + private final long pqCodebooksAndVectorsLength; + private final long pqCodebooksAndVectorsOffset; + private final String vectorIndexFieldDataFileName; + private final String neighborsScoreCacheIndexFieldFileName; + private final GraphNodeIdToDocMap graphNodeIdToDocMap; + private final ReaderSupplier indexReaderSupplier; + private final ReaderSupplier pqCodebooksReaderSupplier; + private final ReaderSupplier neighborsScoreCacheIndexReaderSupplier; + private final OnDiskGraphIndex index; + private final PQVectors pqVectors; // The product quantized vectors with their codebooks + + public FieldEntry( + FieldInfo fieldInfo, JVectorWriter.VectorIndexFieldMetadata vectorIndexFieldMetadata) + throws IOException { + this.fieldInfo = fieldInfo; + this.similarityFunction = + VectorSimilarityMapper.ordToDistFunc( + vectorIndexFieldMetadata.getVectorSimilarityFunction().ordinal()); + this.vectorEncoding = vectorIndexFieldMetadata.getVectorEncoding(); + this.vectorIndexOffset = vectorIndexFieldMetadata.getVectorIndexOffset(); + this.vectorIndexLength = vectorIndexFieldMetadata.getVectorIndexLength(); + this.pqCodebooksAndVectorsLength = vectorIndexFieldMetadata.getPqCodebooksAndVectorsLength(); + this.pqCodebooksAndVectorsOffset = vectorIndexFieldMetadata.getPqCodebooksAndVectorsOffset(); + this.dimension = vectorIndexFieldMetadata.getVectorDimension(); + this.graphNodeIdToDocMap = vectorIndexFieldMetadata.getGraphNodeIdToDocMap(); + + this.vectorIndexFieldDataFileName = + baseDataFileName + "_" + fieldInfo.name + "." + JVectorFormat.VECTOR_INDEX_EXTENSION; + this.neighborsScoreCacheIndexFieldFileName = + baseDataFileName + + "_" + + fieldInfo.name + + "." 
+ + JVectorFormat.NEIGHBORS_SCORE_CACHE_EXTENSION; + + // For the slice we would like to include the Lucene header, unfortunately, we have to do this + // because jVector use global + // offsets instead of local offsets + final long sliceLength = + vectorIndexLength + + CodecUtil.indexHeaderLength( + JVectorFormat.VECTOR_INDEX_CODEC_NAME, state.segmentSuffix); + // Load the graph index + this.indexReaderSupplier = + new JVectorRandomAccessReader.Supplier( + directory.openInput(vectorIndexFieldDataFileName, state.context), 0, sliceLength); + this.index = OnDiskGraphIndex.load(indexReaderSupplier, vectorIndexOffset); + + // If quantized load the compressed product quantized vectors with their codebooks + if (pqCodebooksAndVectorsLength > 0) { + assert pqCodebooksAndVectorsOffset > 0; + if (pqCodebooksAndVectorsOffset < vectorIndexOffset) { + throw new IllegalArgumentException( + "pqCodebooksAndVectorsOffset must be greater than vectorIndexOffset"); } - - // search for a random vector using a GraphSearcher and SearchScoreProvider - VectorFloat q = VECTOR_TYPE_SUPPORT.createFloatVector(target); - final SearchScoreProvider ssp; - - try (var view = index.getView()) { - final long graphSearchStart = System.currentTimeMillis(); - if (fieldEntryMap.get(field).pqVectors != null) { // Quantized, use the precomputed score function - final PQVectors pqVectors = fieldEntryMap.get(field).pqVectors; - // SearchScoreProvider that does a first pass with the loaded-in-memory PQVectors, - // then reranks with the exact vectors that are stored on disk in the index - ScoreFunction.ApproximateScoreFunction asf = pqVectors.precomputedScoreFunctionFor( - q, - fieldEntryMap.get(field).similarityFunction - ); - ScoreFunction.ExactScoreFunction reranker = view.rerankerFor(q, fieldEntryMap.get(field).similarityFunction); - ssp = new DefaultSearchScoreProvider(asf, reranker); - } else { // Not quantized, used typical searcher - ssp = DefaultSearchScoreProvider.exact(q, fieldEntryMap.get(field).similarityFunction, view); - } - final GraphNodeIdToDocMap jvectorLuceneDocMap = fieldEntryMap.get(field).graphNodeIdToDocMap; - // Convert the acceptDocs bitmap from Lucene to jVector ordinal bitmap filter - // Logic works as follows: if acceptDocs is null, we accept all ordinals. Otherwise, we check if the jVector ordinal has a - // corresponding Lucene doc ID accepted by acceptDocs filter. 
- io.github.jbellis.jvector.util.Bits compatibleBits = ord -> acceptDocs == null - || acceptDocs.get(jvectorLuceneDocMap.getLuceneDocId(ord)); - - try (var graphSearcher = new GraphSearcher(index)) { - final var searchResults = graphSearcher.search( - ssp, - jvectorKnnCollector.k(), - jvectorKnnCollector.k() * jvectorKnnCollector.getOverQueryFactor(), - jvectorKnnCollector.getThreshold(), - jvectorKnnCollector.getRerankFloor(), - compatibleBits - ); - for (SearchResult.NodeScore ns : searchResults.getNodes()) { - jvectorKnnCollector.collect(jvectorLuceneDocMap.getLuceneDocId(ns.node), ns.score); - } - final long graphSearchEnd = System.currentTimeMillis(); - final long searchTime = graphSearchEnd - graphSearchStart; - log.debug("Search (including acquiring view) took {} ms", searchTime); - - // Collect the below metrics about the search and somehow wire this back to {@link @KNNStats} - final int visitedNodesCount = searchResults.getVisitedCount(); - final int rerankedCount = searchResults.getRerankedCount(); - - final int expandedCount = searchResults.getExpandedCount(); - final int expandedBaseLayerCount = searchResults.getExpandedCountBaseLayer(); - - KNNCounter.KNN_QUERY_VISITED_NODES.add(visitedNodesCount); - KNNCounter.KNN_QUERY_RERANKED_COUNT.add(rerankedCount); - KNNCounter.KNN_QUERY_EXPANDED_NODES.add(expandedCount); - KNNCounter.KNN_QUERY_EXPANDED_BASE_LAYER_NODES.add(expandedBaseLayerCount); - KNNCounter.KNN_QUERY_GRAPH_SEARCH_TIME.add(searchTime); - log.debug( - "rerankedCount: {}, visitedNodesCount: {}, expandedCount: {}, expandedBaseLayerCount: {}", - rerankedCount, - visitedNodesCount, - expandedCount, - expandedBaseLayerCount - ); - - } + this.pqCodebooksReaderSupplier = + new JVectorRandomAccessReader.Supplier( + directory.openInput(vectorIndexFieldDataFileName, IOContext.READONCE), + pqCodebooksAndVectorsOffset, + pqCodebooksAndVectorsLength); + log.debug( + "Loading PQ codebooks and vectors for field {}, with numbers of vectors: {}", + fieldInfo.name, + state.segmentInfo.maxDoc()); + try (final var randomAccessReader = pqCodebooksReaderSupplier.get()) { + this.pqVectors = PQVectors.load(randomAccessReader); } - } + } else { + this.pqCodebooksReaderSupplier = null; + this.pqVectors = null; + } - @Override - public void search(String field, byte[] target, KnnCollector knnCollector, Bits acceptDocs) throws IOException { - // TODO: implement this - throw new UnsupportedOperationException("Byte vector search is not supported yet with jVector"); + final IndexInput indexInput = + directory.openInput(neighborsScoreCacheIndexFieldFileName, state.context); + CodecUtil.readIndexHeader(indexInput); + + this.neighborsScoreCacheIndexReaderSupplier = + new JVectorRandomAccessReader.Supplier(indexInput); } @Override public void close() throws IOException { - for (FieldEntry fieldEntry : fieldEntryMap.values()) { - IOUtils.close(fieldEntry); - } - fieldEntryMap.clear(); - } - - private void readFields(ChecksumIndexInput meta) throws IOException { - for (int fieldNumber = meta.readInt(); fieldNumber != -1; fieldNumber = meta.readInt()) { - final FieldInfo fieldInfo = fieldInfos.fieldInfo(fieldNumber); // read field number - JVectorWriter.VectorIndexFieldMetadata vectorIndexFieldMetadata = new JVectorWriter.VectorIndexFieldMetadata(meta); - assert fieldInfo.number == vectorIndexFieldMetadata.getFieldNumber(); - fieldEntryMap.put(fieldInfo.name, new FieldEntry(fieldInfo, vectorIndexFieldMetadata)); - } - } - - class FieldEntry implements Closeable { - private final FieldInfo fieldInfo; - 
private final VectorEncoding vectorEncoding; - private final VectorSimilarityFunction similarityFunction; - private final int dimension; - private final long vectorIndexOffset; - private final long vectorIndexLength; - private final long pqCodebooksAndVectorsLength; - private final long pqCodebooksAndVectorsOffset; - private final String vectorIndexFieldDataFileName; - private final String neighborsScoreCacheIndexFieldFileName; - private final GraphNodeIdToDocMap graphNodeIdToDocMap; - private final ReaderSupplier indexReaderSupplier; - private final ReaderSupplier pqCodebooksReaderSupplier; - private final ReaderSupplier neighborsScoreCacheIndexReaderSupplier; - private final OnDiskGraphIndex index; - private final PQVectors pqVectors; // The product quantized vectors with their codebooks - - public FieldEntry(FieldInfo fieldInfo, JVectorWriter.VectorIndexFieldMetadata vectorIndexFieldMetadata) throws IOException { - this.fieldInfo = fieldInfo; - this.similarityFunction = VectorSimilarityMapper.ordToDistFunc( - vectorIndexFieldMetadata.getVectorSimilarityFunction().ordinal() - ); - this.vectorEncoding = vectorIndexFieldMetadata.getVectorEncoding(); - this.vectorIndexOffset = vectorIndexFieldMetadata.getVectorIndexOffset(); - this.vectorIndexLength = vectorIndexFieldMetadata.getVectorIndexLength(); - this.pqCodebooksAndVectorsLength = vectorIndexFieldMetadata.getPqCodebooksAndVectorsLength(); - this.pqCodebooksAndVectorsOffset = vectorIndexFieldMetadata.getPqCodebooksAndVectorsOffset(); - this.dimension = vectorIndexFieldMetadata.getVectorDimension(); - this.graphNodeIdToDocMap = vectorIndexFieldMetadata.getGraphNodeIdToDocMap(); - - this.vectorIndexFieldDataFileName = baseDataFileName + "_" + fieldInfo.name + "." + JVectorFormat.VECTOR_INDEX_EXTENSION; - this.neighborsScoreCacheIndexFieldFileName = baseDataFileName - + "_" - + fieldInfo.name - + "." 
- + JVectorFormat.NEIGHBORS_SCORE_CACHE_EXTENSION; - - // For the slice we would like to include the Lucene header, unfortunately, we have to do this because jVector use global - // offsets instead of local offsets - final long sliceLength = vectorIndexLength + CodecUtil.indexHeaderLength( - JVectorFormat.VECTOR_INDEX_CODEC_NAME, - state.segmentSuffix - ); - // Load the graph index - this.indexReaderSupplier = new JVectorRandomAccessReader.Supplier( - directory.openInput(vectorIndexFieldDataFileName, state.context), - 0, - sliceLength - ); - this.index = OnDiskGraphIndex.load(indexReaderSupplier, vectorIndexOffset); - - // If quantized load the compressed product quantized vectors with their codebooks - if (pqCodebooksAndVectorsLength > 0) { - assert pqCodebooksAndVectorsOffset > 0; - if (pqCodebooksAndVectorsOffset < vectorIndexOffset) { - throw new IllegalArgumentException("pqCodebooksAndVectorsOffset must be greater than vectorIndexOffset"); - } - this.pqCodebooksReaderSupplier = new JVectorRandomAccessReader.Supplier( - directory.openInput(vectorIndexFieldDataFileName, IOContext.READONCE), - pqCodebooksAndVectorsOffset, - pqCodebooksAndVectorsLength - ); - log.debug( - "Loading PQ codebooks and vectors for field {}, with numbers of vectors: {}", - fieldInfo.name, - state.segmentInfo.maxDoc() - ); - try (final var randomAccessReader = pqCodebooksReaderSupplier.get()) { - this.pqVectors = PQVectors.load(randomAccessReader); - } - } else { - this.pqCodebooksReaderSupplier = null; - this.pqVectors = null; - } - - final IndexInput indexInput = directory.openInput(neighborsScoreCacheIndexFieldFileName, state.context); - CodecUtil.readIndexHeader(indexInput); - - this.neighborsScoreCacheIndexReaderSupplier = new JVectorRandomAccessReader.Supplier(indexInput); - } - - @Override - public void close() throws IOException { - if (indexReaderSupplier != null) { - IOUtils.close(indexReaderSupplier::close); - } - if (pqCodebooksReaderSupplier != null) { - IOUtils.close(pqCodebooksReaderSupplier::close); - } - if (neighborsScoreCacheIndexReaderSupplier != null) { - IOUtils.close(neighborsScoreCacheIndexReaderSupplier::close); - } - } + if (indexReaderSupplier != null) { + IOUtils.close(indexReaderSupplier::close); + } + if (pqCodebooksReaderSupplier != null) { + IOUtils.close(pqCodebooksReaderSupplier::close); + } + if (neighborsScoreCacheIndexReaderSupplier != null) { + IOUtils.close(neighborsScoreCacheIndexReaderSupplier::close); + } } + } + /** Utility class to map between Lucene and jVector similarity functions and metadata ordinals. */ + public static class VectorSimilarityMapper { /** - * Utility class to map between Lucene and jVector similarity functions and metadata ordinals. + * List of vector similarity functions supported by jVector library The similarity functions orders + * matter in this list because it is later used to resolve the similarity function by ordinal. */ - public static class VectorSimilarityMapper { - /** - List of vector similarity functions supported by jVector library - The similarity functions orders matter in this list because it is later used to resolve the similarity function by ordinal. 
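A small sanity sketch, not part of the diff: the mapper resolves similarity functions by their position in the list defined below (EUCLIDEAN = 0, DOT_PRODUCT = 1, COSINE = 2), so the conversion round-trips as follows.

// Round trip between Lucene and jVector similarity functions via the list ordinal.
int ord = JVectorReader.VectorSimilarityMapper.distFuncToOrd(
    org.apache.lucene.index.VectorSimilarityFunction.COSINE);        // -> 2
io.github.jbellis.jvector.vector.VectorSimilarityFunction jvectorFunc =
    JVectorReader.VectorSimilarityMapper.ordToDistFunc(ord);         // -> COSINE
org.apache.lucene.index.VectorSimilarityFunction luceneFunc =
    JVectorReader.VectorSimilarityMapper.ordToLuceneDistFunc(ord);   // -> COSINE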
- */ - public static final List JVECTOR_SUPPORTED_SIMILARITY_FUNCTIONS = List.of( - VectorSimilarityFunction.EUCLIDEAN, - VectorSimilarityFunction.DOT_PRODUCT, - VectorSimilarityFunction.COSINE - ); - - public static final Map LUCENE_TO_JVECTOR_MAP = Map.of( - org.apache.lucene.index.VectorSimilarityFunction.EUCLIDEAN, + public static final List JVECTOR_SUPPORTED_SIMILARITY_FUNCTIONS = + List.of( VectorSimilarityFunction.EUCLIDEAN, - org.apache.lucene.index.VectorSimilarityFunction.DOT_PRODUCT, VectorSimilarityFunction.DOT_PRODUCT, - org.apache.lucene.index.VectorSimilarityFunction.COSINE, - VectorSimilarityFunction.COSINE - ); - - public static int distFuncToOrd(org.apache.lucene.index.VectorSimilarityFunction func) { - if (LUCENE_TO_JVECTOR_MAP.containsKey(func)) { - return JVECTOR_SUPPORTED_SIMILARITY_FUNCTIONS.indexOf(LUCENE_TO_JVECTOR_MAP.get(func)); - } - - throw new IllegalArgumentException("invalid distance function: " + func); - } + VectorSimilarityFunction.COSINE); + + public static final Map< + org.apache.lucene.index.VectorSimilarityFunction, VectorSimilarityFunction> + LUCENE_TO_JVECTOR_MAP = + Map.of( + org.apache.lucene.index.VectorSimilarityFunction.EUCLIDEAN, + VectorSimilarityFunction.EUCLIDEAN, + org.apache.lucene.index.VectorSimilarityFunction.DOT_PRODUCT, + VectorSimilarityFunction.DOT_PRODUCT, + org.apache.lucene.index.VectorSimilarityFunction.COSINE, + VectorSimilarityFunction.COSINE); + + public static int distFuncToOrd(org.apache.lucene.index.VectorSimilarityFunction func) { + if (LUCENE_TO_JVECTOR_MAP.containsKey(func)) { + return JVECTOR_SUPPORTED_SIMILARITY_FUNCTIONS.indexOf(LUCENE_TO_JVECTOR_MAP.get(func)); + } + + throw new IllegalArgumentException("invalid distance function: " + func); + } - public static VectorSimilarityFunction ordToDistFunc(int ord) { - return JVECTOR_SUPPORTED_SIMILARITY_FUNCTIONS.get(ord); - } + public static VectorSimilarityFunction ordToDistFunc(int ord) { + return JVECTOR_SUPPORTED_SIMILARITY_FUNCTIONS.get(ord); + } - public static org.apache.lucene.index.VectorSimilarityFunction ordToLuceneDistFunc(int ord) { - if (ord < 0 || ord >= JVECTOR_SUPPORTED_SIMILARITY_FUNCTIONS.size()) { - throw new IllegalArgumentException("Invalid ordinal: " + ord); - } - VectorSimilarityFunction jvectorFunc = JVECTOR_SUPPORTED_SIMILARITY_FUNCTIONS.get(ord); - for (Map.Entry entry : LUCENE_TO_JVECTOR_MAP - .entrySet()) { - if (entry.getValue().equals(jvectorFunc)) { - return entry.getKey(); - } - } - throw new IllegalStateException("No matching Lucene VectorSimilarityFunction found for ordinal: " + ord); + public static org.apache.lucene.index.VectorSimilarityFunction ordToLuceneDistFunc(int ord) { + if (ord < 0 || ord >= JVECTOR_SUPPORTED_SIMILARITY_FUNCTIONS.size()) { + throw new IllegalArgumentException("Invalid ordinal: " + ord); + } + VectorSimilarityFunction jvectorFunc = JVECTOR_SUPPORTED_SIMILARITY_FUNCTIONS.get(ord); + for (Map.Entry + entry : LUCENE_TO_JVECTOR_MAP.entrySet()) { + if (entry.getValue().equals(jvectorFunc)) { + return entry.getKey(); } + } + throw new IllegalStateException( + "No matching Lucene VectorSimilarityFunction found for ordinal: " + ord); } + } } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorVectorScorer.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorVectorScorer.java index 3e0b042dbe2a..6b7937f51525 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorVectorScorer.java +++ 
b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorVectorScorer.java @@ -19,32 +19,35 @@ import io.github.jbellis.jvector.vector.VectorSimilarityFunction; import io.github.jbellis.jvector.vector.types.VectorFloat; +import java.io.IOException; import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.VectorScorer; -import java.io.IOException; - public class JVectorVectorScorer implements VectorScorer { - private final JVectorFloatVectorValues floatVectorValues; - private final KnnVectorValues.DocIndexIterator docIndexIterator; - private final VectorFloat target; - private final VectorSimilarityFunction similarityFunction; + private final JVectorFloatVectorValues floatVectorValues; + private final KnnVectorValues.DocIndexIterator docIndexIterator; + private final VectorFloat target; + private final VectorSimilarityFunction similarityFunction; - public JVectorVectorScorer(JVectorFloatVectorValues vectorValues, VectorFloat target, VectorSimilarityFunction similarityFunction) { - this.floatVectorValues = vectorValues; - this.docIndexIterator = floatVectorValues.iterator(); - this.target = target; - this.similarityFunction = similarityFunction; - } + public JVectorVectorScorer( + JVectorFloatVectorValues vectorValues, + VectorFloat target, + VectorSimilarityFunction similarityFunction) { + this.floatVectorValues = vectorValues; + this.docIndexIterator = floatVectorValues.iterator(); + this.target = target; + this.similarityFunction = similarityFunction; + } - @Override - public float score() throws IOException { - return similarityFunction.compare(target, floatVectorValues.vectorFloatValue(docIndexIterator.index())); - } + @Override + public float score() throws IOException { + return similarityFunction.compare( + target, floatVectorValues.vectorFloatValue(docIndexIterator.index())); + } - @Override - public DocIdSetIterator iterator() { - return docIndexIterator; - } + @Override + public DocIdSetIterator iterator() { + return docIndexIterator; + } } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java index 764d4a21a15f..9b17c6165dfd 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java @@ -17,6 +17,11 @@ package org.opensearch.knn.index.codec.jvector; +import static io.github.jbellis.jvector.quantization.KMeansPlusPlusClusterer.UNWEIGHTED; +import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsReader.readVectorEncoding; +import static org.opensearch.knn.index.codec.jvector.JVectorFormat.SIMD_POOL_FLUSH; +import static org.opensearch.knn.index.codec.jvector.JVectorFormat.SIMD_POOL_MERGE; + import io.github.jbellis.jvector.disk.RandomAccessReader; import io.github.jbellis.jvector.graph.*; import io.github.jbellis.jvector.graph.disk.*; @@ -29,6 +34,13 @@ import io.github.jbellis.jvector.vector.VectorizationProvider; import io.github.jbellis.jvector.vector.types.VectorFloat; import io.github.jbellis.jvector.vector.types.VectorTypeSupport; +import java.io.IOException; +import java.io.UnsupportedEncodingException; +import java.time.Clock; +import java.util.*; +import java.util.concurrent.ForkJoinPool; +import java.util.function.Function; +import java.util.stream.IntStream; import lombok.AllArgsConstructor; import lombok.Builder; import 
lombok.Getter; @@ -47,1063 +59,1133 @@ import org.apache.lucene.util.RamUsageEstimator; import org.opensearch.knn.plugin.stats.KNNCounter; -import java.io.IOException; -import java.io.UnsupportedEncodingException; -import java.time.Clock; -import java.util.*; -import java.util.concurrent.ForkJoinPool; -import java.util.function.Function; -import java.util.stream.IntStream; - -import static io.github.jbellis.jvector.quantization.KMeansPlusPlusClusterer.UNWEIGHTED; -import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsReader.readVectorEncoding; -import static org.opensearch.knn.index.codec.jvector.JVectorFormat.SIMD_POOL_FLUSH; -import static org.opensearch.knn.index.codec.jvector.JVectorFormat.SIMD_POOL_MERGE; - /** - * JVectorWriter is responsible for writing vector data into index segments using the JVector library. + * JVectorWriter is responsible for writing vector data into index segments using the JVector + * library. * *
 * Persisting the JVector Graph Index
 *
- * Flushing data into disk segments occurs in two scenarios:
- *   <li>When the segment is being flushed to disk (e.g., when a new segment is created) via {@link #flush(int, Sorter.DocMap)}
- *   <li>When the segment is a result of a merge (e.g., when multiple segments are merged into one) via {@link #mergeOneField(FieldInfo, MergeState)}
+ * <p>Flushing data into disk segments occurs in two scenarios:
+ *
+ *   <li>When the segment is being flushed to disk (e.g., when a new segment is created) via {@link
+ *       #flush(int, Sorter.DocMap)}
+ *   <li>When the segment is a result of a merge (e.g., when multiple segments are merged into one)
+ *       via {@link #mergeOneField(FieldInfo, MergeState)}
 *
 * jVector Graph Ordinal to Lucene Document ID Mapping
 *
- * JVector keeps its own ordinals to identify its nodes. Those ordinals can be different from the Lucene document IDs.
- * Document IDs in Lucene can change after a merge operation. Therefore, we need to maintain a mapping between
- * JVector ordinals and Lucene document IDs that can hold across merges.
- *
- * Document IDs in Lucene are mapped across merges and sorts using the {@link org.apache.lucene.index.MergeState.DocMap} for merges and {@link org.apache.lucene.index.Sorter.DocMap} for flush/sorts.
- * For jVector however, we don't want to modify the ordinals in the jVector graph, and therefore we need to maintain a mapping between the jVector ordinals and the new Lucene document IDs.
- * This is achieved by keeping checkpoints of the {@link GraphNodeIdToDocMap} class in the index metadata and allowing us to update the mapping as needed across merges by constructing a new mapping from the previous mapping and the {@link MergeState.DocMap} provided in the {@link MergeState}.
- * And across sorts with {@link GraphNodeIdToDocMap#update(Sorter.DocMap)} during flushes.
+ *
+ * <p>JVector keeps its own ordinals to identify its nodes. Those ordinals can be different from the
+ * Lucene document IDs. Document IDs in Lucene can change after a merge operation. Therefore, we
+ * need to maintain a mapping between JVector ordinals and Lucene document IDs that can hold across
+ * merges.
+ *
+ * <p>
Document IDs in Lucene are mapped across merges and sorts using the {@link + * org.apache.lucene.index.MergeState.DocMap} for merges and {@link + * org.apache.lucene.index.Sorter.DocMap} for flush/sorts. For jVector however, we don't want to + * modify the ordinals in the jVector graph, and therefore we need to maintain a mapping between the + * jVector ordinals and the new Lucene document IDs. This is achieved by keeping checkpoints of the + * {@link GraphNodeIdToDocMap} class in the index metadata and allowing us to update the mapping as + * needed across merges by constructing a new mapping from the previous mapping and the {@link + * MergeState.DocMap} provided in the {@link MergeState}. And across sorts with {@link + * GraphNodeIdToDocMap#update(Sorter.DocMap)} during flushes. */ @Log4j2 public class JVectorWriter extends KnnVectorsWriter { - private static final long SHALLOW_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(JVectorWriter.class); - - private final List> fields = new ArrayList<>(); - - private final IndexOutput meta; - private final IndexOutput vectorIndex; - private final String indexDataFileName; - private final String baseDataFileName; - private final SegmentWriteState segmentWriteState; - private final int maxConn; - private final int beamWidth; - private final float degreeOverflow; - private final float alpha; - private final Function numberOfSubspacesPerVectorSupplier; // Number of subspaces used per vector for PQ quantization - // as a function of the original dimension - private final int minimumBatchSizeForQuantization; // Threshold for the vector count above which we will trigger PQ quantization - private final boolean hierarchyEnabled; - - private boolean finished = false; - - public JVectorWriter( - SegmentWriteState segmentWriteState, - int maxConn, - int beamWidth, - float degreeOverflow, - float alpha, - Function numberOfSubspacesPerVectorSupplier, - int minimumBatchSizeForQuantization, - boolean hierarchyEnabled - ) throws IOException { - this.segmentWriteState = segmentWriteState; - this.maxConn = maxConn; - this.beamWidth = beamWidth; - this.degreeOverflow = degreeOverflow; - this.alpha = alpha; - this.numberOfSubspacesPerVectorSupplier = numberOfSubspacesPerVectorSupplier; - this.minimumBatchSizeForQuantization = minimumBatchSizeForQuantization; - this.hierarchyEnabled = hierarchyEnabled; - String metaFileName = IndexFileNames.segmentFileName( + private static final long SHALLOW_RAM_BYTES_USED = + RamUsageEstimator.shallowSizeOfInstance(JVectorWriter.class); + + private final List> fields = new ArrayList<>(); + + private final IndexOutput meta; + private final IndexOutput vectorIndex; + private final String indexDataFileName; + private final String baseDataFileName; + private final SegmentWriteState segmentWriteState; + private final int maxConn; + private final int beamWidth; + private final float degreeOverflow; + private final float alpha; + private final Function + numberOfSubspacesPerVectorSupplier; // Number of subspaces used per vector for PQ quantization + // as a function of the original dimension + private final int + minimumBatchSizeForQuantization; // Threshold for the vector count above which we will trigger + // PQ quantization + private final boolean hierarchyEnabled; + + private boolean finished = false; + + public JVectorWriter( + SegmentWriteState segmentWriteState, + int maxConn, + int beamWidth, + float degreeOverflow, + float alpha, + Function numberOfSubspacesPerVectorSupplier, + int minimumBatchSizeForQuantization, + 
boolean hierarchyEnabled) + throws IOException { + this.segmentWriteState = segmentWriteState; + this.maxConn = maxConn; + this.beamWidth = beamWidth; + this.degreeOverflow = degreeOverflow; + this.alpha = alpha; + this.numberOfSubspacesPerVectorSupplier = numberOfSubspacesPerVectorSupplier; + this.minimumBatchSizeForQuantization = minimumBatchSizeForQuantization; + this.hierarchyEnabled = hierarchyEnabled; + String metaFileName = + IndexFileNames.segmentFileName( segmentWriteState.segmentInfo.name, segmentWriteState.segmentSuffix, - JVectorFormat.META_EXTENSION - ); + JVectorFormat.META_EXTENSION); - this.indexDataFileName = IndexFileNames.segmentFileName( + this.indexDataFileName = + IndexFileNames.segmentFileName( segmentWriteState.segmentInfo.name, segmentWriteState.segmentSuffix, - JVectorFormat.VECTOR_INDEX_EXTENSION - ); - this.baseDataFileName = segmentWriteState.segmentInfo.name + "_" + segmentWriteState.segmentSuffix; - - boolean success = false; - try { - meta = segmentWriteState.directory.createOutput(metaFileName, segmentWriteState.context); - vectorIndex = segmentWriteState.directory.createOutput(indexDataFileName, segmentWriteState.context); - CodecUtil.writeIndexHeader( - meta, - JVectorFormat.META_CODEC_NAME, - JVectorFormat.VERSION_CURRENT, - segmentWriteState.segmentInfo.getId(), - segmentWriteState.segmentSuffix - ); - - CodecUtil.writeIndexHeader( - vectorIndex, - JVectorFormat.VECTOR_INDEX_CODEC_NAME, - JVectorFormat.VERSION_CURRENT, - segmentWriteState.segmentInfo.getId(), - segmentWriteState.segmentSuffix - ); - - success = true; - } finally { - if (!success) { - IOUtils.closeWhileHandlingException(this); - } - } + JVectorFormat.VECTOR_INDEX_EXTENSION); + this.baseDataFileName = + segmentWriteState.segmentInfo.name + "_" + segmentWriteState.segmentSuffix; + + boolean success = false; + try { + meta = segmentWriteState.directory.createOutput(metaFileName, segmentWriteState.context); + vectorIndex = + segmentWriteState.directory.createOutput(indexDataFileName, segmentWriteState.context); + CodecUtil.writeIndexHeader( + meta, + JVectorFormat.META_CODEC_NAME, + JVectorFormat.VERSION_CURRENT, + segmentWriteState.segmentInfo.getId(), + segmentWriteState.segmentSuffix); + + CodecUtil.writeIndexHeader( + vectorIndex, + JVectorFormat.VECTOR_INDEX_CODEC_NAME, + JVectorFormat.VERSION_CURRENT, + segmentWriteState.segmentInfo.getId(), + segmentWriteState.segmentSuffix); + + success = true; + } finally { + if (!success) { + IOUtils.closeWhileHandlingException(this); + } } - - @Override - public KnnFieldVectorsWriter addField(FieldInfo fieldInfo) throws IOException { - log.info("Adding field {} in segment {}", fieldInfo.name, segmentWriteState.segmentInfo.name); - if (fieldInfo.getVectorEncoding() == VectorEncoding.BYTE) { - final String errorMessage = "byte[] vectors are not supported in JVector. " - + "Instead you should only use float vectors and leverage product quantization during indexing." 
- + "This can provides much greater savings in storage and memory"; - log.error(errorMessage); - throw new UnsupportedOperationException(errorMessage); - } - FieldWriter newField = new FieldWriter<>(fieldInfo, segmentWriteState.segmentInfo.name); - - fields.add(newField); - return newField; + } + + @Override + public KnnFieldVectorsWriter addField(FieldInfo fieldInfo) throws IOException { + log.info("Adding field {} in segment {}", fieldInfo.name, segmentWriteState.segmentInfo.name); + if (fieldInfo.getVectorEncoding() == VectorEncoding.BYTE) { + final String errorMessage = + "byte[] vectors are not supported in JVector. " + + "Instead you should only use float vectors and leverage product quantization during indexing." + + "This can provides much greater savings in storage and memory"; + log.error(errorMessage); + throw new UnsupportedOperationException(errorMessage); } - - @Override - public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOException { - log.info("Merging field {} into segment {}", fieldInfo.name, segmentWriteState.segmentInfo.name); - try { - final long mergeStart = Clock.systemDefaultZone().millis(); - switch (fieldInfo.getVectorEncoding()) { - case BYTE: - throw new UnsupportedEncodingException("Byte vectors are not supported in JVector."); - case FLOAT32: - final var mergeRavv = new RandomAccessMergedFloatVectorValues(fieldInfo, mergeState); - mergeRavv.merge(); - break; - } - final long mergeEnd = Clock.systemDefaultZone().millis(); - final long mergeTime = mergeEnd - mergeStart; - KNNCounter.KNN_GRAPH_MERGE_TIME.add(mergeTime); - log.info("Completed Merge field {} into segment {}", fieldInfo.name, segmentWriteState.segmentInfo.name); - } catch (Exception e) { - log.error("Error merging field {} into segment {}", fieldInfo.name, segmentWriteState.segmentInfo.name, e); - throw e; - } + FieldWriter newField = new FieldWriter<>(fieldInfo, segmentWriteState.segmentInfo.name); + + fields.add(newField); + return newField; + } + + @Override + public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOException { + log.info( + "Merging field {} into segment {}", fieldInfo.name, segmentWriteState.segmentInfo.name); + try { + final long mergeStart = Clock.systemDefaultZone().millis(); + switch (fieldInfo.getVectorEncoding()) { + case BYTE: + throw new UnsupportedEncodingException("Byte vectors are not supported in JVector."); + case FLOAT32: + final var mergeRavv = new RandomAccessMergedFloatVectorValues(fieldInfo, mergeState); + mergeRavv.merge(); + break; + } + final long mergeEnd = Clock.systemDefaultZone().millis(); + final long mergeTime = mergeEnd - mergeStart; + KNNCounter.KNN_GRAPH_MERGE_TIME.add(mergeTime); + log.info( + "Completed Merge field {} into segment {}", + fieldInfo.name, + segmentWriteState.segmentInfo.name); + } catch (Exception e) { + log.error( + "Error merging field {} into segment {}", + fieldInfo.name, + segmentWriteState.segmentInfo.name, + e); + throw e; } - - @Override - public void flush(int maxDoc, Sorter.DocMap sortMap) throws IOException { - log.info("Flushing {} fields", fields.size()); - - log.info("Flushing jVector graph index"); - for (FieldWriter field : fields) { - final RandomAccessVectorValues randomAccessVectorValues = field.randomAccessVectorValues; - final int[] newToOldOrds = new int[randomAccessVectorValues.size()]; - for (int ord = 0; ord < randomAccessVectorValues.size(); ord++) { - newToOldOrds[ord] = ord; - } - final BuildScoreProvider buildScoreProvider; - final PQVectors pqVectors; - 
final FieldInfo fieldInfo = field.fieldInfo; - if (randomAccessVectorValues.size() >= minimumBatchSizeForQuantization) { - log.info("Calculating codebooks and compressed vectors for field {}", fieldInfo.name); - pqVectors = getPQVectors(newToOldOrds, randomAccessVectorValues, fieldInfo); - buildScoreProvider = BuildScoreProvider.pqBuildScoreProvider(getVectorSimilarityFunction(fieldInfo), pqVectors); - } else { - log.info( - "Vector count: {}, less than limit to trigger PQ quantization: {}, for field {}, will use full precision vectors instead.", - randomAccessVectorValues.size(), - minimumBatchSizeForQuantization, - fieldInfo.name - ); - pqVectors = null; - buildScoreProvider = BuildScoreProvider.randomAccessScoreProvider( - randomAccessVectorValues, - getVectorSimilarityFunction(fieldInfo) - ); - } - - // Generate the ord to doc mapping - final int[] ordinalsToDocIds = new int[randomAccessVectorValues.size()]; - for (int ord = 0; ord < randomAccessVectorValues.size(); ord++) { - ordinalsToDocIds[ord] = field.docIds.get(ord); - } - final GraphNodeIdToDocMap graphNodeIdToDocMap = new GraphNodeIdToDocMap(ordinalsToDocIds); - if (sortMap != null) { - graphNodeIdToDocMap.update(sortMap); - } - - OnHeapGraphIndex graph = getGraph( - buildScoreProvider, - randomAccessVectorValues, - newToOldOrds, - fieldInfo, - segmentWriteState.segmentInfo.name, - SIMD_POOL_FLUSH - ); - writeField(field.fieldInfo, field.randomAccessVectorValues, pqVectors, newToOldOrds, graphNodeIdToDocMap, graph); - - } - } - - private void writeField( - FieldInfo fieldInfo, - RandomAccessVectorValues randomAccessVectorValues, - PQVectors pqVectors, - int[] newToOldOrds, - GraphNodeIdToDocMap graphNodeIdToDocMap, - OnHeapGraphIndex graph - ) throws IOException { + } + + @Override + public void flush(int maxDoc, Sorter.DocMap sortMap) throws IOException { + log.info("Flushing {} fields", fields.size()); + + log.info("Flushing jVector graph index"); + for (FieldWriter field : fields) { + final RandomAccessVectorValues randomAccessVectorValues = field.randomAccessVectorValues; + final int[] newToOldOrds = new int[randomAccessVectorValues.size()]; + for (int ord = 0; ord < randomAccessVectorValues.size(); ord++) { + newToOldOrds[ord] = ord; + } + final BuildScoreProvider buildScoreProvider; + final PQVectors pqVectors; + final FieldInfo fieldInfo = field.fieldInfo; + if (randomAccessVectorValues.size() >= minimumBatchSizeForQuantization) { + log.info("Calculating codebooks and compressed vectors for field {}", fieldInfo.name); + pqVectors = getPQVectors(newToOldOrds, randomAccessVectorValues, fieldInfo); + buildScoreProvider = + BuildScoreProvider.pqBuildScoreProvider( + getVectorSimilarityFunction(fieldInfo), pqVectors); + } else { log.info( - "Writing field {} with vector count: {}, for segment: {}", - fieldInfo.name, + "Vector count: {}, less than limit to trigger PQ quantization: {}, for field {}, will use full precision vectors instead.", randomAccessVectorValues.size(), - segmentWriteState.segmentInfo.name - ); - final var vectorIndexFieldMetadata = writeGraph( + minimumBatchSizeForQuantization, + fieldInfo.name); + pqVectors = null; + buildScoreProvider = + BuildScoreProvider.randomAccessScoreProvider( + randomAccessVectorValues, getVectorSimilarityFunction(fieldInfo)); + } + + // Generate the ord to doc mapping + final int[] ordinalsToDocIds = new int[randomAccessVectorValues.size()]; + for (int ord = 0; ord < randomAccessVectorValues.size(); ord++) { + ordinalsToDocIds[ord] = field.docIds.get(ord); + } + final 
GraphNodeIdToDocMap graphNodeIdToDocMap = new GraphNodeIdToDocMap(ordinalsToDocIds); + if (sortMap != null) { + graphNodeIdToDocMap.update(sortMap); + } + + OnHeapGraphIndex graph = + getGraph( + buildScoreProvider, + randomAccessVectorValues, + newToOldOrds, + fieldInfo, + segmentWriteState.segmentInfo.name, + SIMD_POOL_FLUSH); + writeField( + field.fieldInfo, + field.randomAccessVectorValues, + pqVectors, + newToOldOrds, + graphNodeIdToDocMap, + graph); + } + } + + private void writeField( + FieldInfo fieldInfo, + RandomAccessVectorValues randomAccessVectorValues, + PQVectors pqVectors, + int[] newToOldOrds, + GraphNodeIdToDocMap graphNodeIdToDocMap, + OnHeapGraphIndex graph) + throws IOException { + log.info( + "Writing field {} with vector count: {}, for segment: {}", + fieldInfo.name, + randomAccessVectorValues.size(), + segmentWriteState.segmentInfo.name); + final var vectorIndexFieldMetadata = + writeGraph( graph, randomAccessVectorValues, fieldInfo, pqVectors, newToOldOrds, - graphNodeIdToDocMap - ); - meta.writeInt(fieldInfo.number); - vectorIndexFieldMetadata.toOutput(meta); - - log.info("Writing neighbors score cache for field {}", fieldInfo.name); - // field data file, which contains the graph - final String neighborsScoreCacheIndexFieldFileName = baseDataFileName + graphNodeIdToDocMap); + meta.writeInt(fieldInfo.number); + vectorIndexFieldMetadata.toOutput(meta); + + log.info("Writing neighbors score cache for field {}", fieldInfo.name); + // field data file, which contains the graph + final String neighborsScoreCacheIndexFieldFileName = + baseDataFileName + "_" + fieldInfo.name + "." + JVectorFormat.NEIGHBORS_SCORE_CACHE_EXTENSION; - try ( - IndexOutput indexOutput = segmentWriteState.directory.createOutput( - neighborsScoreCacheIndexFieldFileName, - segmentWriteState.context - ); - final var jVectorIndexWriter = new JVectorIndexWriter(indexOutput) - ) { - CodecUtil.writeIndexHeader( - indexOutput, - JVectorFormat.NEIGHBORS_SCORE_CACHE_CODEC_NAME, - JVectorFormat.VERSION_CURRENT, - segmentWriteState.segmentInfo.getId(), - segmentWriteState.segmentSuffix - ); - graph.save(jVectorIndexWriter); - CodecUtil.writeFooter(indexOutput); - } + try (IndexOutput indexOutput = + segmentWriteState.directory.createOutput( + neighborsScoreCacheIndexFieldFileName, segmentWriteState.context); + final var jVectorIndexWriter = new JVectorIndexWriter(indexOutput)) { + CodecUtil.writeIndexHeader( + indexOutput, + JVectorFormat.NEIGHBORS_SCORE_CACHE_CODEC_NAME, + JVectorFormat.VERSION_CURRENT, + segmentWriteState.segmentInfo.getId(), + segmentWriteState.segmentSuffix); + graph.save(jVectorIndexWriter); + CodecUtil.writeFooter(indexOutput); } - - /** - * Writes the graph and PQ codebooks and compressed vectors to the vector index file - * @param graph graph - * @param randomAccessVectorValues random access vector values - * @param fieldInfo field info - * @return Tuple of start offset and length of the graph - * @throws IOException IOException - */ - private VectorIndexFieldMetadata writeGraph( - OnHeapGraphIndex graph, - RandomAccessVectorValues randomAccessVectorValues, - FieldInfo fieldInfo, - PQVectors pqVectors, - int[] newToOldOrds, - GraphNodeIdToDocMap graphNodeIdToDocMap - ) throws IOException { - // field data file, which contains the graph - final String vectorIndexFieldFileName = baseDataFileName + "_" + fieldInfo.name + "." 
+ JVectorFormat.VECTOR_INDEX_EXTENSION; - - try ( - IndexOutput indexOutput = segmentWriteState.directory.createOutput(vectorIndexFieldFileName, segmentWriteState.context); - final var jVectorIndexWriter = new JVectorIndexWriter(indexOutput) - ) { - // Header for the field data file - CodecUtil.writeIndexHeader( - indexOutput, - JVectorFormat.VECTOR_INDEX_CODEC_NAME, - JVectorFormat.VERSION_CURRENT, - segmentWriteState.segmentInfo.getId(), - segmentWriteState.segmentSuffix - ); - final long startOffset = indexOutput.getFilePointer(); - - log.info("Writing graph to {}", vectorIndexFieldFileName); - var resultBuilder = VectorIndexFieldMetadata.builder() - .fieldNumber(fieldInfo.number) - .vectorEncoding(fieldInfo.getVectorEncoding()) - .vectorSimilarityFunction(fieldInfo.getVectorSimilarityFunction()) - .vectorDimension(randomAccessVectorValues.dimension()) - .graphNodeIdToDocMap(graphNodeIdToDocMap); - - try ( - var writer = new OnDiskSequentialGraphIndexWriter.Builder(graph, jVectorIndexWriter).with( - new InlineVectors(randomAccessVectorValues.dimension()) - ).build() - ) { - var suppliers = Feature.singleStateFactory( - FeatureId.INLINE_VECTORS, - nodeId -> new InlineVectors.State(randomAccessVectorValues.getVector(newToOldOrds[nodeId])) - ); - writer.write(suppliers); - long endGraphOffset = jVectorIndexWriter.position(); - resultBuilder.vectorIndexOffset(startOffset); - resultBuilder.vectorIndexLength(endGraphOffset - startOffset); - - // If PQ is enabled and we have enough vectors, write the PQ codebooks and compressed vectors - if (pqVectors != null) { - log.info( - "Writing PQ codebooks and vectors for field {} since the size is {} >= {}", - fieldInfo.name, - randomAccessVectorValues.size(), - minimumBatchSizeForQuantization - ); - resultBuilder.pqCodebooksAndVectorsOffset(endGraphOffset); - // write the compressed vectors and codebooks to disk - pqVectors.write(jVectorIndexWriter); - resultBuilder.pqCodebooksAndVectorsLength(jVectorIndexWriter.position() - endGraphOffset); - } else { - resultBuilder.pqCodebooksAndVectorsOffset(0); - resultBuilder.pqCodebooksAndVectorsLength(0); - } - CodecUtil.writeFooter(indexOutput); - } - - return resultBuilder.build(); + } + + /** + * Writes the graph and PQ codebooks and compressed vectors to the vector index file + * + * @param graph graph + * @param randomAccessVectorValues random access vector values + * @param fieldInfo field info + * @return Tuple of start offset and length of the graph + * @throws IOException IOException + */ + private VectorIndexFieldMetadata writeGraph( + OnHeapGraphIndex graph, + RandomAccessVectorValues randomAccessVectorValues, + FieldInfo fieldInfo, + PQVectors pqVectors, + int[] newToOldOrds, + GraphNodeIdToDocMap graphNodeIdToDocMap) + throws IOException { + // field data file, which contains the graph + final String vectorIndexFieldFileName = + baseDataFileName + "_" + fieldInfo.name + "." 
+ JVectorFormat.VECTOR_INDEX_EXTENSION; + + try (IndexOutput indexOutput = + segmentWriteState.directory.createOutput( + vectorIndexFieldFileName, segmentWriteState.context); + final var jVectorIndexWriter = new JVectorIndexWriter(indexOutput)) { + // Header for the field data file + CodecUtil.writeIndexHeader( + indexOutput, + JVectorFormat.VECTOR_INDEX_CODEC_NAME, + JVectorFormat.VERSION_CURRENT, + segmentWriteState.segmentInfo.getId(), + segmentWriteState.segmentSuffix); + final long startOffset = indexOutput.getFilePointer(); + + log.info("Writing graph to {}", vectorIndexFieldFileName); + var resultBuilder = + VectorIndexFieldMetadata.builder() + .fieldNumber(fieldInfo.number) + .vectorEncoding(fieldInfo.getVectorEncoding()) + .vectorSimilarityFunction(fieldInfo.getVectorSimilarityFunction()) + .vectorDimension(randomAccessVectorValues.dimension()) + .graphNodeIdToDocMap(graphNodeIdToDocMap); + + try (var writer = + new OnDiskSequentialGraphIndexWriter.Builder(graph, jVectorIndexWriter) + .with(new InlineVectors(randomAccessVectorValues.dimension())) + .build()) { + var suppliers = + Feature.singleStateFactory( + FeatureId.INLINE_VECTORS, + nodeId -> + new InlineVectors.State( + randomAccessVectorValues.getVector(newToOldOrds[nodeId]))); + writer.write(suppliers); + long endGraphOffset = jVectorIndexWriter.position(); + resultBuilder.vectorIndexOffset(startOffset); + resultBuilder.vectorIndexLength(endGraphOffset - startOffset); + + // If PQ is enabled and we have enough vectors, write the PQ codebooks and compressed + // vectors + if (pqVectors != null) { + log.info( + "Writing PQ codebooks and vectors for field {} since the size is {} >= {}", + fieldInfo.name, + randomAccessVectorValues.size(), + minimumBatchSizeForQuantization); + resultBuilder.pqCodebooksAndVectorsOffset(endGraphOffset); + // write the compressed vectors and codebooks to disk + pqVectors.write(jVectorIndexWriter); + resultBuilder.pqCodebooksAndVectorsLength(jVectorIndexWriter.position() - endGraphOffset); + } else { + resultBuilder.pqCodebooksAndVectorsOffset(0); + resultBuilder.pqCodebooksAndVectorsLength(0); } - } + CodecUtil.writeFooter(indexOutput); + } - private PQVectors getPQVectors(int[] newToOldOrds, RandomAccessVectorValues randomAccessVectorValues, FieldInfo fieldInfo) - throws IOException { - final String fieldName = fieldInfo.name; - final VectorSimilarityFunction vectorSimilarityFunction = fieldInfo.getVectorSimilarityFunction(); - log.info("Computing PQ codebooks for field {} for {} vectors", fieldName, randomAccessVectorValues.size()); - final long start = Clock.systemDefaultZone().millis(); - final var M = numberOfSubspacesPerVectorSupplier.apply(randomAccessVectorValues.dimension()); - final var numberOfClustersPerSubspace = Math.min(256, randomAccessVectorValues.size()); // number of centroids per - // subspace - ProductQuantization pq = ProductQuantization.compute( + return resultBuilder.build(); + } + } + + private PQVectors getPQVectors( + int[] newToOldOrds, RandomAccessVectorValues randomAccessVectorValues, FieldInfo fieldInfo) + throws IOException { + final String fieldName = fieldInfo.name; + final VectorSimilarityFunction vectorSimilarityFunction = + fieldInfo.getVectorSimilarityFunction(); + log.info( + "Computing PQ codebooks for field {} for {} vectors", + fieldName, + randomAccessVectorValues.size()); + final long start = Clock.systemDefaultZone().millis(); + final var M = numberOfSubspacesPerVectorSupplier.apply(randomAccessVectorValues.dimension()); + final var 
numberOfClustersPerSubspace = + Math.min(256, randomAccessVectorValues.size()); // number of centroids per + // subspace + ProductQuantization pq = + ProductQuantization.compute( randomAccessVectorValues, M, // number of subspaces numberOfClustersPerSubspace, // number of centroids per subspace vectorSimilarityFunction == VectorSimilarityFunction.EUCLIDEAN, // center the dataset UNWEIGHTED, SIMD_POOL_MERGE, - ForkJoinPool.commonPool() - ); + ForkJoinPool.commonPool()); + + final long end = Clock.systemDefaultZone().millis(); + final long trainingTime = end - start; + log.info("Computed PQ codebooks for field {}, in {} millis", fieldName, trainingTime); + KNNCounter.KNN_QUANTIZATION_TRAINING_TIME.add(trainingTime); + log.info( + "Encoding and building PQ vectors for field {} for {} vectors", + fieldName, + randomAccessVectorValues.size()); + // PQVectors pqVectors = pq.encodeAll(randomAccessVectorValues, SIMD_POOL); + PQVectors pqVectors = + PQVectors.encodeAndBuild( + pq, newToOldOrds.length, newToOldOrds, randomAccessVectorValues, SIMD_POOL_MERGE); + log.info( + "Encoded and built PQ vectors for field {}, original size: {} bytes, compressed size: {} bytes", + fieldName, + pqVectors.getOriginalSize(), + pqVectors.getCompressedSize()); + return pqVectors; + } + + @Value + @Builder(toBuilder = true) + @AllArgsConstructor + public static class VectorIndexFieldMetadata { + int fieldNumber; + VectorEncoding vectorEncoding; + VectorSimilarityFunction vectorSimilarityFunction; + int vectorDimension; + long vectorIndexOffset; + long vectorIndexLength; + long pqCodebooksAndVectorsOffset; + long pqCodebooksAndVectorsLength; + float degreeOverflow; // important when leveraging cache + GraphNodeIdToDocMap graphNodeIdToDocMap; + + public void toOutput(IndexOutput out) throws IOException { + out.writeInt(fieldNumber); + out.writeInt(vectorEncoding.ordinal()); + out.writeInt(JVectorReader.VectorSimilarityMapper.distFuncToOrd(vectorSimilarityFunction)); + out.writeVInt(vectorDimension); + out.writeVLong(vectorIndexOffset); + out.writeVLong(vectorIndexLength); + out.writeVLong(pqCodebooksAndVectorsOffset); + out.writeVLong(pqCodebooksAndVectorsLength); + out.writeInt(Float.floatToIntBits(degreeOverflow)); + graphNodeIdToDocMap.toOutput(out); + } - final long end = Clock.systemDefaultZone().millis(); - final long trainingTime = end - start; - log.info("Computed PQ codebooks for field {}, in {} millis", fieldName, trainingTime); - KNNCounter.KNN_QUANTIZATION_TRAINING_TIME.add(trainingTime); - log.info("Encoding and building PQ vectors for field {} for {} vectors", fieldName, randomAccessVectorValues.size()); - // PQVectors pqVectors = pq.encodeAll(randomAccessVectorValues, SIMD_POOL); - PQVectors pqVectors = PQVectors.encodeAndBuild(pq, newToOldOrds.length, newToOldOrds, randomAccessVectorValues, SIMD_POOL_MERGE); - log.info( - "Encoded and built PQ vectors for field {}, original size: {} bytes, compressed size: {} bytes", - fieldName, - pqVectors.getOriginalSize(), - pqVectors.getCompressedSize() - ); - return pqVectors; + public VectorIndexFieldMetadata(IndexInput in) throws IOException { + this.fieldNumber = in.readInt(); + this.vectorEncoding = readVectorEncoding(in); + this.vectorSimilarityFunction = + JVectorReader.VectorSimilarityMapper.ordToLuceneDistFunc(in.readInt()); + this.vectorDimension = in.readVInt(); + this.vectorIndexOffset = in.readVLong(); + this.vectorIndexLength = in.readVLong(); + this.pqCodebooksAndVectorsOffset = in.readVLong(); + this.pqCodebooksAndVectorsLength = in.readVLong(); + 
this.degreeOverflow = Float.intBitsToFloat(in.readInt()); + this.graphNodeIdToDocMap = new GraphNodeIdToDocMap(in); } + } - @Value - @Builder(toBuilder = true) - @AllArgsConstructor - public static class VectorIndexFieldMetadata { - int fieldNumber; - VectorEncoding vectorEncoding; - VectorSimilarityFunction vectorSimilarityFunction; - int vectorDimension; - long vectorIndexOffset; - long vectorIndexLength; - long pqCodebooksAndVectorsOffset; - long pqCodebooksAndVectorsLength; - float degreeOverflow; // important when leveraging cache - GraphNodeIdToDocMap graphNodeIdToDocMap; - - public void toOutput(IndexOutput out) throws IOException { - out.writeInt(fieldNumber); - out.writeInt(vectorEncoding.ordinal()); - out.writeInt(JVectorReader.VectorSimilarityMapper.distFuncToOrd(vectorSimilarityFunction)); - out.writeVInt(vectorDimension); - out.writeVLong(vectorIndexOffset); - out.writeVLong(vectorIndexLength); - out.writeVLong(pqCodebooksAndVectorsOffset); - out.writeVLong(pqCodebooksAndVectorsLength); - out.writeInt(Float.floatToIntBits(degreeOverflow)); - graphNodeIdToDocMap.toOutput(out); - } + @Override + public void finish() throws IOException { + log.info("Finishing segment {}", segmentWriteState.segmentInfo.name); + if (finished) { + throw new IllegalStateException("already finished"); + } + finished = true; - public VectorIndexFieldMetadata(IndexInput in) throws IOException { - this.fieldNumber = in.readInt(); - this.vectorEncoding = readVectorEncoding(in); - this.vectorSimilarityFunction = JVectorReader.VectorSimilarityMapper.ordToLuceneDistFunc(in.readInt()); - this.vectorDimension = in.readVInt(); - this.vectorIndexOffset = in.readVLong(); - this.vectorIndexLength = in.readVLong(); - this.pqCodebooksAndVectorsOffset = in.readVLong(); - this.pqCodebooksAndVectorsLength = in.readVLong(); - this.degreeOverflow = Float.intBitsToFloat(in.readInt()); - this.graphNodeIdToDocMap = new GraphNodeIdToDocMap(in); - } + if (meta != null) { + // write end of fields marker + meta.writeInt(-1); + CodecUtil.writeFooter(meta); + } + if (vectorIndex != null) { + CodecUtil.writeFooter(vectorIndex); + } + } + + @Override + public void close() throws IOException { + IOUtils.close(meta, vectorIndex); + } + + @Override + public long ramBytesUsed() { + long total = SHALLOW_RAM_BYTES_USED; + for (FieldWriter field : fields) { + // the field tracks the delegate field usage + total += field.ramBytesUsed(); + } + return total; + } + + /** + * The FieldWriter class is responsible for writing vector field data into index segments. It + * provides functionality to process vector values as those being added, manage memory usage, and + * build HNSW graph indexing structures for efficient retrieval during search queries. + * + * @param The type of vector value to be handled by the writer. This is often specialized to + * support specific implementations, such as float[] or byte[] vectors. 
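To make the FieldWriter contract described in the javadoc above concrete, here is a hedged usage sketch (not part of the patch); it assumes package-level access to the nested class, an existing FieldInfo for a 4-dimensional float field, and a hypothetical segment name.

    // Each accepted float[] is buffered as a jVector VectorFloat and paired with its doc ID;
    // byte[] values and a second value for the same document are rejected.
    JVectorWriter.FieldWriter<float[]> fieldWriter =
        new JVectorWriter.FieldWriter<>(fieldInfo, "segment_0");
    fieldWriter.addValue(0, new float[] {0.1f, 0.2f, 0.3f, 0.4f});
    fieldWriter.addValue(5, new float[] {0.5f, 0.6f, 0.7f, 0.8f}); // doc IDs need not be contiguous
    // fieldWriter.addValue(5, ...) again would throw IllegalArgumentException (one value per doc),
    // and any byte[] value would throw UnsupportedOperationException.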
+ */ + static class FieldWriter extends KnnFieldVectorsWriter { + private static final VectorTypeSupport VECTOR_TYPE_SUPPORT = + VectorizationProvider.getInstance().getVectorTypeSupport(); + private final long SHALLOW_SIZE = RamUsageEstimator.shallowSizeOfInstance(FieldWriter.class); + @Getter private final FieldInfo fieldInfo; + private int lastDocID = -1; + private final String segmentName; + private final RandomAccessVectorValues randomAccessVectorValues; + // The ordering of docIds matches the ordering of vectors, the index in this list corresponds to + // the jVector ordinal + private final List> vectors = new ArrayList<>(); + private final List docIds = new ArrayList<>(); + + FieldWriter(FieldInfo fieldInfo, String segmentName) { + /** For creating a new field from a flat field vectors writer. */ + this.randomAccessVectorValues = + new ListRandomAccessVectorValues(vectors, fieldInfo.getVectorDimension()); + this.fieldInfo = fieldInfo; + this.segmentName = segmentName; } @Override - public void finish() throws IOException { - log.info("Finishing segment {}", segmentWriteState.segmentInfo.name); - if (finished) { - throw new IllegalStateException("already finished"); - } - finished = true; - - if (meta != null) { - // write end of fields marker - meta.writeInt(-1); - CodecUtil.writeFooter(meta); - } - - if (vectorIndex != null) { - CodecUtil.writeFooter(vectorIndex); - } + public void addValue(int docID, T vectorValue) throws IOException { + log.trace( + "Adding value {} to field {} in segment {}", vectorValue, fieldInfo.name, segmentName); + if (docID == lastDocID) { + throw new IllegalArgumentException( + "VectorValuesField \"" + + fieldInfo.name + + "\" appears more than once in this document (only one value is allowed per field)"); + } + docIds.add(docID); + if (vectorValue instanceof float[]) { + vectors.add(VECTOR_TYPE_SUPPORT.createFloatVector(vectorValue)); + } else if (vectorValue instanceof byte[]) { + final String errorMessage = + "byte[] vectors are not supported in JVector. " + + "Instead you should only use float vectors and leverage product quantization during indexing." 
+ + "This can provides much greater savings in storage and memory"; + log.error("{}", errorMessage); + throw new UnsupportedOperationException(errorMessage); + } else { + throw new IllegalArgumentException("Unsupported vector type: " + vectorValue.getClass()); + } + lastDocID = docID; } @Override - public void close() throws IOException { - IOUtils.close(meta, vectorIndex); + public T copyValue(T vectorValue) { + throw new UnsupportedOperationException("copyValue not supported"); } @Override public long ramBytesUsed() { - long total = SHALLOW_RAM_BYTES_USED; - for (FieldWriter field : fields) { - // the field tracks the delegate field usage - total += field.ramBytesUsed(); - } - return total; + return SHALLOW_SIZE + (long) vectors.size() * fieldInfo.getVectorDimension() * Float.BYTES; } + } + + static io.github.jbellis.jvector.vector.VectorSimilarityFunction getVectorSimilarityFunction( + FieldInfo fieldInfo) { + log.info( + "Matching vector similarity function {} for field {}", + fieldInfo.getVectorSimilarityFunction(), + fieldInfo.name); + return switch (fieldInfo.getVectorSimilarityFunction()) { + case EUCLIDEAN -> io.github.jbellis.jvector.vector.VectorSimilarityFunction.EUCLIDEAN; + case COSINE -> io.github.jbellis.jvector.vector.VectorSimilarityFunction.COSINE; + case DOT_PRODUCT -> io.github.jbellis.jvector.vector.VectorSimilarityFunction.DOT_PRODUCT; + default -> + throw new IllegalArgumentException( + "Unsupported similarity function: " + fieldInfo.getVectorSimilarityFunction()); + }; + } + + /** + * Implementation of RandomAccessVectorValues that directly uses the source FloatVectorValues from + * multiple segments without copying the vectors. + * + *
+ * <p>Some details about the implementation logic:
+ *
+ * <p>First, we identify the leading reader, which is the one with the most live vectors. Second,
+ * we build a mapping between the ravv ordinals and the reader index and the ordinal in that
+ * reader. Third, we build a mapping between the ravv ordinals and the global doc ids.
+ *
+ * <p>
Very important to note that for the leading graph the node Ids need to correspond to their + * original ravv ordinals in the reader. This is because we are later going to expand that graph + * with new vectors from the other readers. While the new vectors can be assigned arbitrary node + * Ids, the leading graph needs to preserve its original node Ids and map them to the original + * ravv vector ordinals. + */ + class RandomAccessMergedFloatVectorValues implements RandomAccessVectorValues { + private static final int READER_ID = 0; + private static final int READER_ORD = 1; + private static final int LEADING_READER_IDX = 0; + + private final VectorTypeSupport VECTOR_TYPE_SUPPORT = + VectorizationProvider.getInstance().getVectorTypeSupport(); + + // Array of sub-readers + private final KnnVectorsReader[] readers; + private final JVectorFloatVectorValues[] perReaderFloatVectorValues; + + // Maps the ravv ordinals to the reader index and the ordinal in that reader. This is allowing + // us to get a unified view of all the + // vectors in all the readers with a single unified ordinal space. + private final int[][] ravvOrdToReaderMapping; + + // Total number of vectors + private final int size; + // Total number of documents including those without values + private final int totalDocsCount; + + // Vector dimension + private final int dimension; + private final FieldInfo fieldInfo; + private final MergeState mergeState; + private final GraphNodeIdToDocMap graphNodeIdToDocMap; + private final int[] graphNodeIdsToRavvOrds; + private boolean deletesFound = false; /** - * The FieldWriter class is responsible for writing vector field data into index segments. - * It provides functionality to process vector values as those being added, manage memory usage, and build HNSW graph - * indexing structures for efficient retrieval during search queries. + * Creates a random access view over merged float vector values. * - * @param The type of vector value to be handled by the writer. - * This is often specialized to support specific implementations, such as float[] or byte[] vectors. + * @param fieldInfo Field info for the vector field + * @param mergeState Merge state containing readers and doc maps */ - static class FieldWriter extends KnnFieldVectorsWriter { - private static final VectorTypeSupport VECTOR_TYPE_SUPPORT = VectorizationProvider.getInstance().getVectorTypeSupport(); - private final long SHALLOW_SIZE = RamUsageEstimator.shallowSizeOfInstance(FieldWriter.class); - @Getter - private final FieldInfo fieldInfo; - private int lastDocID = -1; - private final String segmentName; - private final RandomAccessVectorValues randomAccessVectorValues; - // The ordering of docIds matches the ordering of vectors, the index in this list corresponds to the jVector ordinal - private final List> vectors = new ArrayList<>(); - private final List docIds = new ArrayList<>(); - - FieldWriter(FieldInfo fieldInfo, String segmentName) { - /** - * For creating a new field from a flat field vectors writer. 
- */ - this.randomAccessVectorValues = new ListRandomAccessVectorValues(vectors, fieldInfo.getVectorDimension()); - this.fieldInfo = fieldInfo; - this.segmentName = segmentName; - } - - @Override - public void addValue(int docID, T vectorValue) throws IOException { - log.trace("Adding value {} to field {} in segment {}", vectorValue, fieldInfo.name, segmentName); - if (docID == lastDocID) { - throw new IllegalArgumentException( - "VectorValuesField \"" - + fieldInfo.name - + "\" appears more than once in this document (only one value is allowed per field)" - ); + public RandomAccessMergedFloatVectorValues(FieldInfo fieldInfo, MergeState mergeState) + throws IOException { + this.totalDocsCount = Math.toIntExact(Arrays.stream(mergeState.maxDocs).asLongStream().sum()); + this.fieldInfo = fieldInfo; + this.mergeState = mergeState; + + final String fieldName = fieldInfo.name; + + // Count total vectors, collect readers and identify leading reader, collect base ordinals to + // later be used to build the mapping + // between global ordinals and global lucene doc ids + int totalVectorsCount = 0; + int totalLiveVectorsCount = 0; + int dimension = 0; + int tempLeadingReaderIdx = -1; + int vectorsCountInLeadingReader = -1; + List allReaders = new ArrayList<>(); + final MergeState.DocMap[] docMaps = mergeState.docMaps.clone(); + final Bits[] liveDocs = mergeState.liveDocs.clone(); + final int[] baseOrds = new int[mergeState.knnVectorsReaders.length]; + final int[] deletedOrds = + new int + [mergeState + .knnVectorsReaders + .length]; // counts the number of deleted documents in each reader + // that previously had a vector + + // Find the leading reader, count the total number of live vectors, and the base ordinals for + // each reader + for (int i = 0; i < mergeState.knnVectorsReaders.length; i++) { + FieldInfos fieldInfos = mergeState.fieldInfos[i]; + baseOrds[i] = totalVectorsCount; + if (MergedVectorValues.hasVectorValues(fieldInfos, fieldName)) { + KnnVectorsReader reader = mergeState.knnVectorsReaders[i]; + if (reader != null) { + FloatVectorValues values = reader.getFloatVectorValues(fieldName); + if (values != null) { + allReaders.add(reader); + int vectorCountInReader = values.size(); + int liveVectorCountInReader = 0; + KnnVectorValues.DocIndexIterator it = values.iterator(); + while (it.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { + if (liveDocs[i] == null || liveDocs[i].get(it.docID())) { + liveVectorCountInReader++; + } else { + deletedOrds[i]++; + deletesFound = true; + } + } + if (liveVectorCountInReader >= vectorsCountInLeadingReader) { + vectorsCountInLeadingReader = liveVectorCountInReader; + tempLeadingReaderIdx = i; + } + totalVectorsCount += vectorCountInReader; + totalLiveVectorsCount += liveVectorCountInReader; + dimension = Math.max(dimension, values.dimension()); } - docIds.add(docID); - if (vectorValue instanceof float[]) { - vectors.add(VECTOR_TYPE_SUPPORT.createFloatVector(vectorValue)); - } else if (vectorValue instanceof byte[]) { - final String errorMessage = "byte[] vectors are not supported in JVector. " - + "Instead you should only use float vectors and leverage product quantization during indexing." 
- + "This can provides much greater savings in storage and memory"; - log.error("{}", errorMessage); - throw new UnsupportedOperationException(errorMessage); + } + } + } + + assert (totalVectorsCount <= totalDocsCount) + : "Total number of vectors exceeds the total number of documents"; + assert (totalLiveVectorsCount <= totalVectorsCount) + : "Total number of live vectors exceeds the total number of vectors"; + assert (dimension > 0) : "No vectors found for field " + fieldName; + + this.size = totalVectorsCount; + this.readers = new KnnVectorsReader[allReaders.size()]; + for (int i = 0; i < readers.length; i++) { + readers[i] = allReaders.get(i); + } + + // always swap the leading reader to the first position + // For this part we need to make sure we also swap all the other metadata arrays that are + // indexed by reader index + // Such as readers, docMaps, liveDocs, baseOrds, deletedOrds + if (tempLeadingReaderIdx != 0) { + final KnnVectorsReader temp = readers[LEADING_READER_IDX]; + readers[LEADING_READER_IDX] = readers[tempLeadingReaderIdx]; + readers[tempLeadingReaderIdx] = temp; + // also swap the leading doc map to the first position to match the readers + final MergeState.DocMap tempDocMap = docMaps[LEADING_READER_IDX]; + docMaps[LEADING_READER_IDX] = docMaps[tempLeadingReaderIdx]; + docMaps[tempLeadingReaderIdx] = tempDocMap; + // swap base ords + final int tempBaseOrd = baseOrds[LEADING_READER_IDX]; + baseOrds[LEADING_READER_IDX] = baseOrds[tempLeadingReaderIdx]; + baseOrds[tempLeadingReaderIdx] = tempBaseOrd; + } + + this.perReaderFloatVectorValues = new JVectorFloatVectorValues[readers.length]; + this.dimension = dimension; + + // Build mapping from global ordinal to [readerIndex, readerOrd] + this.ravvOrdToReaderMapping = new int[totalDocsCount][2]; + + int documentsIterated = 0; + + // Will be used to build the new graphNodeIdToDocMap with the new graph node id to docId + // mapping. + // This mapping should not be used to access the vectors at any time during construction, but + // only after the merge is complete + // and the new segment is created and used by searchers. + final int[] graphNodeIdToDocIds = new int[totalLiveVectorsCount]; + this.graphNodeIdsToRavvOrds = new int[totalLiveVectorsCount]; + + int graphNodeId = 0; + if (deletesFound) { + // If there are deletes, we need to build a new graph from scratch and compact the graph + // node ids + // TODO: remove this logic once we support incremental graph building with deletes see + // https://github.com/opensearch-project/opensearch-jvector/issues/171 + for (int readerIdx = 0; readerIdx < readers.length; readerIdx++) { + final JVectorFloatVectorValues values = + (JVectorFloatVectorValues) readers[readerIdx].getFloatVectorValues(fieldName); + perReaderFloatVectorValues[readerIdx] = values; + // For each vector in this reader + KnnVectorValues.DocIndexIterator it = values.iterator(); + + for (int docId = it.nextDoc(); + docId != DocIdSetIterator.NO_MORE_DOCS; + docId = it.nextDoc()) { + if (docMaps[readerIdx].get(docId) == -1) { + log.warn( + "Document {} in reader {} is not mapped to a global ordinal from the merge docMaps. 
Will skip this document for now", + docId, + readerIdx); } else { - throw new IllegalArgumentException("Unsupported vector type: " + vectorValue.getClass()); + // Mapping from ravv ordinals to [readerIndex, readerOrd] + // Map graph node id to ravv ordinal + // Map graph node id to doc id + final int newGlobalDocId = docMaps[readerIdx].get(docId); + final int ravvLocalOrd = it.index(); + final int ravvGlobalOrd = ravvLocalOrd + baseOrds[readerIdx]; + graphNodeIdToDocIds[graphNodeId] = newGlobalDocId; + graphNodeIdsToRavvOrds[graphNodeId] = ravvGlobalOrd; + graphNodeId++; + ravvOrdToReaderMapping[ravvGlobalOrd][READER_ID] = readerIdx; // Reader index + ravvOrdToReaderMapping[ravvGlobalOrd][READER_ORD] = ravvLocalOrd; // Ordinal in reader } - lastDocID = docID; + documentsIterated++; + } } - - @Override - public T copyValue(T vectorValue) { - throw new UnsupportedOperationException("copyValue not supported"); - } - - @Override - public long ramBytesUsed() { - return SHALLOW_SIZE + (long) vectors.size() * fieldInfo.getVectorDimension() * Float.BYTES; + } else { + // If there are no deletes, we can reuse the existing graph and simply remap the ravv + // ordinals to the new global doc ids + // for the leading reader we must preserve the original node Ids and map them to the + // corresponding ravv vectors originally + // used to build the graph + // This is necessary because we are later going to expand that graph with new vectors from + // the other readers. + // The leading reader is ALWAYS the first one in the readers array + final JVectorFloatVectorValues leadingReaderValues = + (JVectorFloatVectorValues) readers[LEADING_READER_IDX].getFloatVectorValues(fieldName); + perReaderFloatVectorValues[LEADING_READER_IDX] = leadingReaderValues; + var leadingReaderIt = leadingReaderValues.iterator(); + for (int docId = leadingReaderIt.nextDoc(); + docId != DocIdSetIterator.NO_MORE_DOCS; + docId = leadingReaderIt.nextDoc()) { + final int newGlobalDocId = docMaps[LEADING_READER_IDX].get(docId); + if (newGlobalDocId == -1) { + log.warn( + "Document {} in reader {} is not mapped to a global ordinal from the merge docMaps. Will skip this document for now", + docId, + LEADING_READER_IDX); + } else { + final int ravvLocalOrd = leadingReaderIt.index(); + final int ravvGlobalOrd = ravvLocalOrd + baseOrds[LEADING_READER_IDX]; + graphNodeIdToDocIds[ravvLocalOrd] = newGlobalDocId; + graphNodeIdsToRavvOrds[ravvLocalOrd] = ravvGlobalOrd; + graphNodeId++; + ravvOrdToReaderMapping[ravvGlobalOrd][READER_ID] = LEADING_READER_IDX; // Reader index + ravvOrdToReaderMapping[ravvGlobalOrd][READER_ORD] = ravvLocalOrd; // Ordinal in reader + } + + documentsIterated++; } - } + // For the remaining readers we map the graph node id to the ravv ordinal in the order they + // appear + for (int readerIdx = 1; readerIdx < readers.length; readerIdx++) { + final JVectorFloatVectorValues values = + (JVectorFloatVectorValues) readers[readerIdx].getFloatVectorValues(fieldName); + perReaderFloatVectorValues[readerIdx] = values; + // For each vector in this reader + KnnVectorValues.DocIndexIterator it = values.iterator(); + + for (int docId = it.nextDoc(); + docId != DocIdSetIterator.NO_MORE_DOCS; + docId = it.nextDoc()) { + if (docMaps[readerIdx].get(docId) == -1) { + log.warn( + "Document {} in reader {} is not mapped to a global ordinal from the merge docMaps. 
Will skip this document for now", + docId, + readerIdx); + } else { + // Mapping from ravv ordinals to [readerIndex, readerOrd] + // Map graph node id to ravv ordinal + // Map graph node id to doc id + final int newGlobalDocId = docMaps[readerIdx].get(docId); + final int ravvLocalOrd = it.index(); + final int ravvGlobalOrd = ravvLocalOrd + baseOrds[readerIdx]; + graphNodeIdToDocIds[graphNodeId] = newGlobalDocId; + graphNodeIdsToRavvOrds[graphNodeId] = ravvGlobalOrd; + graphNodeId++; + ravvOrdToReaderMapping[ravvGlobalOrd][READER_ID] = readerIdx; // Reader index + ravvOrdToReaderMapping[ravvGlobalOrd][READER_ORD] = ravvLocalOrd; // Ordinal in reader + } - static io.github.jbellis.jvector.vector.VectorSimilarityFunction getVectorSimilarityFunction(FieldInfo fieldInfo) { - log.info("Matching vector similarity function {} for field {}", fieldInfo.getVectorSimilarityFunction(), fieldInfo.name); - return switch (fieldInfo.getVectorSimilarityFunction()) { - case EUCLIDEAN -> io.github.jbellis.jvector.vector.VectorSimilarityFunction.EUCLIDEAN; - case COSINE -> io.github.jbellis.jvector.vector.VectorSimilarityFunction.COSINE; - case DOT_PRODUCT -> io.github.jbellis.jvector.vector.VectorSimilarityFunction.DOT_PRODUCT; - default -> throw new IllegalArgumentException("Unsupported similarity function: " + fieldInfo.getVectorSimilarityFunction()); - }; + documentsIterated++; + } + } + } + + if (documentsIterated < totalVectorsCount) { + throw new IllegalStateException( + "More documents were expected than what was found in the readers." + + "Expected at least number of total vectors: " + + totalVectorsCount + + " but found only: " + + documentsIterated + + " documents."); + } + + this.graphNodeIdToDocMap = new GraphNodeIdToDocMap(graphNodeIdToDocIds); + log.debug( + "Created RandomAccessMergedFloatVectorValues with {} total vectors from {} readers", + size, + readers.length); } /** - * Implementation of RandomAccessVectorValues that directly uses the source - * FloatVectorValues from multiple segments without copying the vectors. + * Merges the float vector values from multiple readers into a unified structure. This process + * includes handling product quantization (PQ) for vector compression, generating ord-to-doc + * mappings, and writing the merged index into a new segment file. + * + *
+ * <p>The method determines if pre-existing product quantization codebooks are available from
+ * the leading reader. If available, it refines them using remaining vectors from other readers
+ * in the merge. If no pre-existing codebooks are found and the total vector count meets the
+ * required minimum threshold, new codebooks and compressed vectors are computed. Otherwise, no
+ * PQ compression is applied.
+ *
+ * <p>Also, it generates a mapping of ordinals to document IDs by iterating through the provided
+ * vector data, which is further used to write the field data.
+ *
+ * <p>In the event of no deletes or quantization, the graph construction is done by
+ * incrementally adding vectors from smaller segments into the largest segment. For all other
+ * cases, we build a new graph from scratch from all the vectors.
+ *
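The quantization step referenced above can be restated as a short sketch; it mirrors the getPQVectors helper earlier in this hunk (same variable names, same jVector calls) rather than introducing any new API.

    // Product quantization sizing: M subspaces per vector (a function of the original dimension)
    // and at most 256 centroids per subspace, capped by the number of vectors.
    final int M = numberOfSubspacesPerVectorSupplier.apply(randomAccessVectorValues.dimension());
    final int numberOfClustersPerSubspace = Math.min(256, randomAccessVectorValues.size());
    final ProductQuantization pq =
        ProductQuantization.compute(
            randomAccessVectorValues,
            M,
            numberOfClustersPerSubspace,
            vectorSimilarityFunction == VectorSimilarityFunction.EUCLIDEAN, // center the dataset
            UNWEIGHTED,
            SIMD_POOL_MERGE,
            ForkJoinPool.commonPool());
    // The codebooks and per-vector codes are then materialized once and written next to the graph.
    final PQVectors pqVectors =
        PQVectors.encodeAndBuild(
            pq, newToOldOrds.length, newToOldOrds, randomAccessVectorValues, SIMD_POOL_MERGE);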

TODO: Add support for incremental graph building with quantization see issue * - * Very important to note that for the leading graph the node Ids need to correspond to their original ravv ordinals in the reader. - * This is because we are later going to expand that graph with new vectors from the other readers. - * While the new vectors can be assigned arbitrary node Ids, the leading graph needs to preserve its original node Ids and map them to the original ravv vector ordinals. + * @throws IOException if there is an issue during reading or writing vector data. */ - class RandomAccessMergedFloatVectorValues implements RandomAccessVectorValues { - private static final int READER_ID = 0; - private static final int READER_ORD = 1; - private static final int LEADING_READER_IDX = 0; - - private final VectorTypeSupport VECTOR_TYPE_SUPPORT = VectorizationProvider.getInstance().getVectorTypeSupport(); - - // Array of sub-readers - private final KnnVectorsReader[] readers; - private final JVectorFloatVectorValues[] perReaderFloatVectorValues; - - // Maps the ravv ordinals to the reader index and the ordinal in that reader. This is allowing us to get a unified view of all the - // vectors in all the readers with a single unified ordinal space. - private final int[][] ravvOrdToReaderMapping; - - // Total number of vectors - private final int size; - // Total number of documents including those without values - private final int totalDocsCount; - - // Vector dimension - private final int dimension; - private final FieldInfo fieldInfo; - private final MergeState mergeState; - private final GraphNodeIdToDocMap graphNodeIdToDocMap; - private final int[] graphNodeIdsToRavvOrds; - private boolean deletesFound = false; - - /** - * Creates a random access view over merged float vector values. 
- * - * @param fieldInfo Field info for the vector field - * @param mergeState Merge state containing readers and doc maps - */ - public RandomAccessMergedFloatVectorValues(FieldInfo fieldInfo, MergeState mergeState) throws IOException { - this.totalDocsCount = Math.toIntExact(Arrays.stream(mergeState.maxDocs).asLongStream().sum()); - this.fieldInfo = fieldInfo; - this.mergeState = mergeState; - - final String fieldName = fieldInfo.name; - - // Count total vectors, collect readers and identify leading reader, collect base ordinals to later be used to build the mapping - // between global ordinals and global lucene doc ids - int totalVectorsCount = 0; - int totalLiveVectorsCount = 0; - int dimension = 0; - int tempLeadingReaderIdx = -1; - int vectorsCountInLeadingReader = -1; - List allReaders = new ArrayList<>(); - final MergeState.DocMap[] docMaps = mergeState.docMaps.clone(); - final Bits[] liveDocs = mergeState.liveDocs.clone(); - final int[] baseOrds = new int[mergeState.knnVectorsReaders.length]; - final int[] deletedOrds = new int[mergeState.knnVectorsReaders.length]; // counts the number of deleted documents in each reader - // that previously had a vector - - // Find the leading reader, count the total number of live vectors, and the base ordinals for each reader - for (int i = 0; i < mergeState.knnVectorsReaders.length; i++) { - FieldInfos fieldInfos = mergeState.fieldInfos[i]; - baseOrds[i] = totalVectorsCount; - if (MergedVectorValues.hasVectorValues(fieldInfos, fieldName)) { - KnnVectorsReader reader = mergeState.knnVectorsReaders[i]; - if (reader != null) { - FloatVectorValues values = reader.getFloatVectorValues(fieldName); - if (values != null) { - allReaders.add(reader); - int vectorCountInReader = values.size(); - int liveVectorCountInReader = 0; - KnnVectorValues.DocIndexIterator it = values.iterator(); - while (it.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { - if (liveDocs[i] == null || liveDocs[i].get(it.docID())) { - liveVectorCountInReader++; - } else { - deletedOrds[i]++; - deletesFound = true; - } - } - if (liveVectorCountInReader >= vectorsCountInLeadingReader) { - vectorsCountInLeadingReader = liveVectorCountInReader; - tempLeadingReaderIdx = i; - } - totalVectorsCount += vectorCountInReader; - totalLiveVectorsCount += liveVectorCountInReader; - dimension = Math.max(dimension, values.dimension()); - } - } - } - } - - assert (totalVectorsCount <= totalDocsCount) : "Total number of vectors exceeds the total number of documents"; - assert (totalLiveVectorsCount <= totalVectorsCount) : "Total number of live vectors exceeds the total number of vectors"; - assert (dimension > 0) : "No vectors found for field " + fieldName; - - this.size = totalVectorsCount; - this.readers = new KnnVectorsReader[allReaders.size()]; - for (int i = 0; i < readers.length; i++) { - readers[i] = allReaders.get(i); - } - - // always swap the leading reader to the first position - // For this part we need to make sure we also swap all the other metadata arrays that are indexed by reader index - // Such as readers, docMaps, liveDocs, baseOrds, deletedOrds - if (tempLeadingReaderIdx != 0) { - final KnnVectorsReader temp = readers[LEADING_READER_IDX]; - readers[LEADING_READER_IDX] = readers[tempLeadingReaderIdx]; - readers[tempLeadingReaderIdx] = temp; - // also swap the leading doc map to the first position to match the readers - final MergeState.DocMap tempDocMap = docMaps[LEADING_READER_IDX]; - docMaps[LEADING_READER_IDX] = docMaps[tempLeadingReaderIdx]; - docMaps[tempLeadingReaderIdx] = 
tempDocMap; - // swap base ords - final int tempBaseOrd = baseOrds[LEADING_READER_IDX]; - baseOrds[LEADING_READER_IDX] = baseOrds[tempLeadingReaderIdx]; - baseOrds[tempLeadingReaderIdx] = tempBaseOrd; - } - - this.perReaderFloatVectorValues = new JVectorFloatVectorValues[readers.length]; - this.dimension = dimension; - - // Build mapping from global ordinal to [readerIndex, readerOrd] - this.ravvOrdToReaderMapping = new int[totalDocsCount][2]; - - int documentsIterated = 0; - - // Will be used to build the new graphNodeIdToDocMap with the new graph node id to docId mapping. - // This mapping should not be used to access the vectors at any time during construction, but only after the merge is complete - // and the new segment is created and used by searchers. - final int[] graphNodeIdToDocIds = new int[totalLiveVectorsCount]; - this.graphNodeIdsToRavvOrds = new int[totalLiveVectorsCount]; - - int graphNodeId = 0; - if (deletesFound) { - // If there are deletes, we need to build a new graph from scratch and compact the graph node ids - // TODO: remove this logic once we support incremental graph building with deletes see - // https://github.com/opensearch-project/opensearch-jvector/issues/171 - for (int readerIdx = 0; readerIdx < readers.length; readerIdx++) { - final JVectorFloatVectorValues values = (JVectorFloatVectorValues) readers[readerIdx].getFloatVectorValues(fieldName); - perReaderFloatVectorValues[readerIdx] = values; - // For each vector in this reader - KnnVectorValues.DocIndexIterator it = values.iterator(); - - for (int docId = it.nextDoc(); docId != DocIdSetIterator.NO_MORE_DOCS; docId = it.nextDoc()) { - if (docMaps[readerIdx].get(docId) == -1) { - log.warn( - "Document {} in reader {} is not mapped to a global ordinal from the merge docMaps. Will skip this document for now", - docId, - readerIdx - ); - } else { - // Mapping from ravv ordinals to [readerIndex, readerOrd] - // Map graph node id to ravv ordinal - // Map graph node id to doc id - final int newGlobalDocId = docMaps[readerIdx].get(docId); - final int ravvLocalOrd = it.index(); - final int ravvGlobalOrd = ravvLocalOrd + baseOrds[readerIdx]; - graphNodeIdToDocIds[graphNodeId] = newGlobalDocId; - graphNodeIdsToRavvOrds[graphNodeId] = ravvGlobalOrd; - graphNodeId++; - ravvOrdToReaderMapping[ravvGlobalOrd][READER_ID] = readerIdx; // Reader index - ravvOrdToReaderMapping[ravvGlobalOrd][READER_ORD] = ravvLocalOrd; // Ordinal in reader - } - - documentsIterated++; - } - } - } else { - // If there are no deletes, we can reuse the existing graph and simply remap the ravv ordinals to the new global doc ids - // for the leading reader we must preserve the original node Ids and map them to the corresponding ravv vectors originally - // used to build the graph - // This is necessary because we are later going to expand that graph with new vectors from the other readers. - // The leading reader is ALWAYS the first one in the readers array - final JVectorFloatVectorValues leadingReaderValues = (JVectorFloatVectorValues) readers[LEADING_READER_IDX] - .getFloatVectorValues(fieldName); - perReaderFloatVectorValues[LEADING_READER_IDX] = leadingReaderValues; - var leadingReaderIt = leadingReaderValues.iterator(); - for (int docId = leadingReaderIt.nextDoc(); docId != DocIdSetIterator.NO_MORE_DOCS; docId = leadingReaderIt.nextDoc()) { - final int newGlobalDocId = docMaps[LEADING_READER_IDX].get(docId); - if (newGlobalDocId == -1) { - log.warn( - "Document {} in reader {} is not mapped to a global ordinal from the merge docMaps. 
Will skip this document for now", - docId, - LEADING_READER_IDX - ); - } else { - final int ravvLocalOrd = leadingReaderIt.index(); - final int ravvGlobalOrd = ravvLocalOrd + baseOrds[LEADING_READER_IDX]; - graphNodeIdToDocIds[ravvLocalOrd] = newGlobalDocId; - graphNodeIdsToRavvOrds[ravvLocalOrd] = ravvGlobalOrd; - graphNodeId++; - ravvOrdToReaderMapping[ravvGlobalOrd][READER_ID] = LEADING_READER_IDX; // Reader index - ravvOrdToReaderMapping[ravvGlobalOrd][READER_ORD] = ravvLocalOrd; // Ordinal in reader - } - - documentsIterated++; - } - - // For the remaining readers we map the graph node id to the ravv ordinal in the order they appear - for (int readerIdx = 1; readerIdx < readers.length; readerIdx++) { - final JVectorFloatVectorValues values = (JVectorFloatVectorValues) readers[readerIdx].getFloatVectorValues(fieldName); - perReaderFloatVectorValues[readerIdx] = values; - // For each vector in this reader - KnnVectorValues.DocIndexIterator it = values.iterator(); - - for (int docId = it.nextDoc(); docId != DocIdSetIterator.NO_MORE_DOCS; docId = it.nextDoc()) { - if (docMaps[readerIdx].get(docId) == -1) { - log.warn( - "Document {} in reader {} is not mapped to a global ordinal from the merge docMaps. Will skip this document for now", - docId, - readerIdx - ); - } else { - // Mapping from ravv ordinals to [readerIndex, readerOrd] - // Map graph node id to ravv ordinal - // Map graph node id to doc id - final int newGlobalDocId = docMaps[readerIdx].get(docId); - final int ravvLocalOrd = it.index(); - final int ravvGlobalOrd = ravvLocalOrd + baseOrds[readerIdx]; - graphNodeIdToDocIds[graphNodeId] = newGlobalDocId; - graphNodeIdsToRavvOrds[graphNodeId] = ravvGlobalOrd; - graphNodeId++; - ravvOrdToReaderMapping[ravvGlobalOrd][READER_ID] = readerIdx; // Reader index - ravvOrdToReaderMapping[ravvGlobalOrd][READER_ORD] = ravvLocalOrd; // Ordinal in reader - } - - documentsIterated++; - } - } - } - - if (documentsIterated < totalVectorsCount) { - throw new IllegalStateException( - "More documents were expected than what was found in the readers." - + "Expected at least number of total vectors: " - + totalVectorsCount - + " but found only: " - + documentsIterated - + " documents." - ); - } - - this.graphNodeIdToDocMap = new GraphNodeIdToDocMap(graphNodeIdToDocIds); - log.debug("Created RandomAccessMergedFloatVectorValues with {} total vectors from {} readers", size, readers.length); - - } - - /** - * Merges the float vector values from multiple readers into a unified structure. - * This process includes handling product quantization (PQ) for vector compression, - * generating ord-to-doc mappings, and writing the merged index into a new segment file. - *

- * The method determines if pre-existing product quantization codebooks are available - * from the leading reader. If available, it refines them using remaining vectors - * from other readers in the merge. If no pre-existing codebooks are found and - * the total vector count meets the required minimum threshold, new codebooks - * and compressed vectors are computed. Otherwise, no PQ compression is applied. - *

- * Also, it generates a mapping of ordinals to document IDs by iterating through - * the provided vector data, which is further used to write the field data. - *

- * In the event of no deletes or quantization, the graph construction is done by incrementally adding vectors from smaller segments into the largest segment. - * For all other cases, we build a new graph from scratch from all the vectors. - * - * TODO: Add support for incremental graph building with quantization see issue - * - * @throws IOException if there is an issue during reading or writing vector data. - */ - public void merge() throws IOException { - // This section creates the PQVectors to be used for this merge - // Get PQ compressor for leading reader - final int totalVectorsCount = size; - final String fieldName = fieldInfo.name; - final PQVectors pqVectors; - final OnHeapGraphIndex graph; - // Get the leading reader - PerFieldKnnVectorsFormat.FieldsReader fieldsReader = (PerFieldKnnVectorsFormat.FieldsReader) readers[LEADING_READER_IDX]; - JVectorReader leadingReader = (JVectorReader) fieldsReader.getFieldReader(fieldName); - final BuildScoreProvider buildScoreProvider; - // Check if the leading reader has pre-existing PQ codebooks and if so, refine them with the remaining vectors - if (leadingReader.getProductQuantizationForField(fieldInfo.name).isEmpty()) { - // No pre-existing codebooks, check if we have enough vectors to trigger quantization - log.info( - "No Pre-existing PQ codebooks found in this merge for field {} in segment {}, will check if a new codebooks is necessary", - fieldName, - mergeState.segmentInfo.name - ); - if (this.size() >= minimumBatchSizeForQuantization) { - log.info( - "Calculating new codebooks and compressed vectors for field: {}, with totalVectorCount: {}, above minimumBatchSizeForQuantization: {}", - fieldName, - totalVectorsCount, - minimumBatchSizeForQuantization - ); - pqVectors = getPQVectors(graphNodeIdsToRavvOrds, this, fieldInfo); - } else { - log.info( - "Not enough vectors found for field: {}, totalVectorCount: {}, is below minimumBatchSizeForQuantization: {}", - fieldName, - totalVectorsCount, - minimumBatchSizeForQuantization - ); - pqVectors = null; - } - } else { - log.info( - "Pre-existing PQ codebooks found in this merge for field {} in segment {}, will refine the codebooks from the leading reader with the remaining vectors", - fieldName, - mergeState.segmentInfo.name - ); - final long start = Clock.systemDefaultZone().millis(); - ProductQuantization leadingCompressor = leadingReader.getProductQuantizationForField(fieldName).get(); - // Refine the leadingCompressor with the remaining vectors in the merge, we skip the leading reader since it's already been - // used to create the leadingCompressor - // We assume the leading reader is ALWAYS the first one in the readers array - for (int i = LEADING_READER_IDX + 1; i < readers.length; i++) { - final FloatVectorValues values = readers[i].getFloatVectorValues(fieldName); - final RandomAccessVectorValues randomAccessVectorValues = new RandomAccessVectorValuesOverVectorValues(values); - leadingCompressor.refine(randomAccessVectorValues); - } - final long end = Clock.systemDefaultZone().millis(); - final long trainingTime = end - start; - log.info("Refined PQ codebooks for field {}, in {} millis", fieldName, trainingTime); - KNNCounter.KNN_QUANTIZATION_TRAINING_TIME.add(trainingTime); - pqVectors = PQVectors.encodeAndBuild( - leadingCompressor, - graphNodeIdsToRavvOrds.length, - graphNodeIdsToRavvOrds, - this, - SIMD_POOL_MERGE - ); - } - - if (pqVectors == null) { - buildScoreProvider = BuildScoreProvider.randomAccessScoreProvider( - this, - graphNodeIdsToRavvOrds, - 
getVectorSimilarityFunction(fieldInfo) - ); - // graph = getGraph(buildScoreProvider, this, newToOldOrds, fieldInfo, segmentWriteState.segmentInfo.name); - if (!deletesFound) { - final String segmentName = segmentWriteState.segmentInfo.name; - log.info( - "No deletes found, and no PQ codebooks found, expanding previous graph with additional vectors for field {} in segment {}", - fieldName, - segmentName - ); - final RandomAccessReader leadingOnHeapGraphReader = leadingReader.getNeighborsScoreCacheForField(fieldName); - final int numBaseVectors = leadingReader.getFloatVectorValues(fieldName).size(); - graph = (OnHeapGraphIndex) GraphIndexBuilder.buildAndMergeNewNodes( - leadingOnHeapGraphReader, - this, - buildScoreProvider, - numBaseVectors, - graphNodeIdsToRavvOrds, - beamWidth, - degreeOverflow, - alpha, - hierarchyEnabled - ); - } else { - log.info("Deletes found, and no PQ codebooks found, building new graph from scratch"); - graph = getGraph( - buildScoreProvider, - this, - graphNodeIdsToRavvOrds, - fieldInfo, - segmentWriteState.segmentInfo.name, - SIMD_POOL_MERGE - ); - } - } else { - log.info("PQ codebooks found, building graph from scratch with PQ vectors"); - buildScoreProvider = BuildScoreProvider.pqBuildScoreProvider(getVectorSimilarityFunction(fieldInfo), pqVectors); - // Pre-init the diversity provider here to avoid doing it lazily (as it could block the SIMD threads) - buildScoreProvider.diversityProviderFor(0); - graph = getGraph( - buildScoreProvider, - this, - graphNodeIdsToRavvOrds, - fieldInfo, - segmentWriteState.segmentInfo.name, - SIMD_POOL_MERGE - ); - } - - writeField(fieldInfo, this, pqVectors, graphNodeIdsToRavvOrds, graphNodeIdToDocMap, graph); + public void merge() throws IOException { + // This section creates the PQVectors to be used for this merge + // Get PQ compressor for leading reader + final int totalVectorsCount = size; + final String fieldName = fieldInfo.name; + final PQVectors pqVectors; + final OnHeapGraphIndex graph; + // Get the leading reader + PerFieldKnnVectorsFormat.FieldsReader fieldsReader = + (PerFieldKnnVectorsFormat.FieldsReader) readers[LEADING_READER_IDX]; + JVectorReader leadingReader = (JVectorReader) fieldsReader.getFieldReader(fieldName); + final BuildScoreProvider buildScoreProvider; + // Check if the leading reader has pre-existing PQ codebooks and if so, refine them with the + // remaining vectors + if (leadingReader.getProductQuantizationForField(fieldInfo.name).isEmpty()) { + // No pre-existing codebooks, check if we have enough vectors to trigger quantization + log.info( + "No Pre-existing PQ codebooks found in this merge for field {} in segment {}, will check if a new codebooks is necessary", + fieldName, + mergeState.segmentInfo.name); + if (this.size() >= minimumBatchSizeForQuantization) { + log.info( + "Calculating new codebooks and compressed vectors for field: {}, with totalVectorCount: {}, above minimumBatchSizeForQuantization: {}", + fieldName, + totalVectorsCount, + minimumBatchSizeForQuantization); + pqVectors = getPQVectors(graphNodeIdsToRavvOrds, this, fieldInfo); + } else { + log.info( + "Not enough vectors found for field: {}, totalVectorCount: {}, is below minimumBatchSizeForQuantization: {}", + fieldName, + totalVectorsCount, + minimumBatchSizeForQuantization); + pqVectors = null; } - - @Override - public int size() { - return size; + } else { + log.info( + "Pre-existing PQ codebooks found in this merge for field {} in segment {}, will refine the codebooks from the leading reader with the remaining vectors", + 
fieldName, + mergeState.segmentInfo.name); + final long start = Clock.systemDefaultZone().millis(); + ProductQuantization leadingCompressor = + leadingReader.getProductQuantizationForField(fieldName).get(); + // Refine the leadingCompressor with the remaining vectors in the merge, we skip the leading + // reader since it's already been + // used to create the leadingCompressor + // We assume the leading reader is ALWAYS the first one in the readers array + for (int i = LEADING_READER_IDX + 1; i < readers.length; i++) { + final FloatVectorValues values = readers[i].getFloatVectorValues(fieldName); + final RandomAccessVectorValues randomAccessVectorValues = + new RandomAccessVectorValuesOverVectorValues(values); + leadingCompressor.refine(randomAccessVectorValues); } - - @Override - public int dimension() { - return dimension; + final long end = Clock.systemDefaultZone().millis(); + final long trainingTime = end - start; + log.info("Refined PQ codebooks for field {}, in {} millis", fieldName, trainingTime); + KNNCounter.KNN_QUANTIZATION_TRAINING_TIME.add(trainingTime); + pqVectors = + PQVectors.encodeAndBuild( + leadingCompressor, + graphNodeIdsToRavvOrds.length, + graphNodeIdsToRavvOrds, + this, + SIMD_POOL_MERGE); + } + + if (pqVectors == null) { + buildScoreProvider = + BuildScoreProvider.randomAccessScoreProvider( + this, graphNodeIdsToRavvOrds, getVectorSimilarityFunction(fieldInfo)); + // graph = getGraph(buildScoreProvider, this, newToOldOrds, fieldInfo, + // segmentWriteState.segmentInfo.name); + if (!deletesFound) { + final String segmentName = segmentWriteState.segmentInfo.name; + log.info( + "No deletes found, and no PQ codebooks found, expanding previous graph with additional vectors for field {} in segment {}", + fieldName, + segmentName); + final RandomAccessReader leadingOnHeapGraphReader = + leadingReader.getNeighborsScoreCacheForField(fieldName); + final int numBaseVectors = leadingReader.getFloatVectorValues(fieldName).size(); + graph = + (OnHeapGraphIndex) + GraphIndexBuilder.buildAndMergeNewNodes( + leadingOnHeapGraphReader, + this, + buildScoreProvider, + numBaseVectors, + graphNodeIdsToRavvOrds, + beamWidth, + degreeOverflow, + alpha, + hierarchyEnabled); + } else { + log.info("Deletes found, and no PQ codebooks found, building new graph from scratch"); + graph = + getGraph( + buildScoreProvider, + this, + graphNodeIdsToRavvOrds, + fieldInfo, + segmentWriteState.segmentInfo.name, + SIMD_POOL_MERGE); } + } else { + log.info("PQ codebooks found, building graph from scratch with PQ vectors"); + buildScoreProvider = + BuildScoreProvider.pqBuildScoreProvider( + getVectorSimilarityFunction(fieldInfo), pqVectors); + // Pre-init the diversity provider here to avoid doing it lazily (as it could block the SIMD + // threads) + buildScoreProvider.diversityProviderFor(0); + graph = + getGraph( + buildScoreProvider, + this, + graphNodeIdsToRavvOrds, + fieldInfo, + segmentWriteState.segmentInfo.name, + SIMD_POOL_MERGE); + } - @Override - public VectorFloat getVector(int ord) { - if (ord < 0 || ord >= totalDocsCount) { - throw new IllegalArgumentException("Ordinal out of bounds: " + ord); - } + writeField(fieldInfo, this, pqVectors, graphNodeIdsToRavvOrds, graphNodeIdToDocMap, graph); + } - final int readerIdx = ravvOrdToReaderMapping[ord][READER_ID]; - final int readerOrd = ravvOrdToReaderMapping[ord][READER_ORD]; + @Override + public int size() { + return size; + } - // Access to float values is not thread safe - synchronized (perReaderFloatVectorValues[readerIdx]) { - return 
perReaderFloatVectorValues[readerIdx].vectorFloatValue(readerOrd); - } - } + @Override + public int dimension() { + return dimension; + } - @Override - public boolean isValueShared() { - return false; - } + @Override + public VectorFloat getVector(int ord) { + if (ord < 0 || ord >= totalDocsCount) { + throw new IllegalArgumentException("Ordinal out of bounds: " + ord); + } + + final int readerIdx = ravvOrdToReaderMapping[ord][READER_ID]; + final int readerOrd = ravvOrdToReaderMapping[ord][READER_ORD]; + + // Access to float values is not thread safe + synchronized (perReaderFloatVectorValues[readerIdx]) { + return perReaderFloatVectorValues[readerIdx].vectorFloatValue(readerOrd); + } + } - @Override - public RandomAccessVectorValues copy() { - throw new UnsupportedOperationException("Copy not supported"); - } + @Override + public boolean isValueShared() { + return false; } - /** - * This method will return the graph index for the field - * @return OnHeapGraphIndex - */ - public OnHeapGraphIndex getGraph( - BuildScoreProvider buildScoreProvider, - RandomAccessVectorValues randomAccessVectorValues, - int[] newToOldOrds, - FieldInfo fieldInfo, - String segmentName, - ForkJoinPool SIMD_POOL - ) { - final GraphIndexBuilder graphIndexBuilder = new GraphIndexBuilder( + @Override + public RandomAccessVectorValues copy() { + throw new UnsupportedOperationException("Copy not supported"); + } + } + + /** + * This method will return the graph index for the field + * + * @return OnHeapGraphIndex + */ + public OnHeapGraphIndex getGraph( + BuildScoreProvider buildScoreProvider, + RandomAccessVectorValues randomAccessVectorValues, + int[] newToOldOrds, + FieldInfo fieldInfo, + String segmentName, + ForkJoinPool SIMD_POOL) { + final GraphIndexBuilder graphIndexBuilder = + new GraphIndexBuilder( buildScoreProvider, fieldInfo.getVectorDimension(), maxConn, beamWidth, degreeOverflow, alpha, - hierarchyEnabled - ); - - /* - * We cannot always use randomAccessVectorValues for the graph building - * because it's size will not always correspond to the document count. - * To have the right mapping from docId to vector ordinal we need to use the mergedFloatVector. - * This is the case when we are merging segments and we might have more documents than vectors. - */ - final long start = Clock.systemDefaultZone().millis(); - final OnHeapGraphIndex graphIndex; - var vv = randomAccessVectorValues.threadLocalSupplier(); - - log.info("Building graph from merged float vector"); - // parallel graph construction from the merge documents Ids - SIMD_POOL.submit(() -> IntStream.range(0, newToOldOrds.length).parallel().forEach(ord -> { - graphIndexBuilder.addGraphNode(ord, vv.get().getVector(newToOldOrds[ord])); - })).join(); - graphIndexBuilder.cleanup(); - graphIndex = (OnHeapGraphIndex) graphIndexBuilder.getGraph(); - final long end = Clock.systemDefaultZone().millis(); + hierarchyEnabled); - log.info("Built graph for field {} in segment {} in {} millis", fieldInfo.name, segmentName, end - start); - return graphIndex; + /* + * We cannot always use randomAccessVectorValues for the graph building + * because it's size will not always correspond to the document count. + * To have the right mapping from docId to vector ordinal we need to use the mergedFloatVector. + * This is the case when we are merging segments and we might have more documents than vectors. 
+ */ + final long start = Clock.systemDefaultZone().millis(); + final OnHeapGraphIndex graphIndex; + var vv = randomAccessVectorValues.threadLocalSupplier(); + + log.info("Building graph from merged float vector"); + // parallel graph construction from the merge documents Ids + SIMD_POOL + .submit( + () -> + IntStream.range(0, newToOldOrds.length) + .parallel() + .forEach( + ord -> { + graphIndexBuilder.addGraphNode( + ord, vv.get().getVector(newToOldOrds[ord])); + })) + .join(); + graphIndexBuilder.cleanup(); + graphIndex = (OnHeapGraphIndex) graphIndexBuilder.getGraph(); + final long end = Clock.systemDefaultZone().millis(); + + log.info( + "Built graph for field {} in segment {} in {} millis", + fieldInfo.name, + segmentName, + end - start); + return graphIndex; + } + + static class RandomAccessVectorValuesOverVectorValues implements RandomAccessVectorValues { + private final VectorTypeSupport VECTOR_TYPE_SUPPORT = + VectorizationProvider.getInstance().getVectorTypeSupport(); + private final FloatVectorValues values; + + public RandomAccessVectorValuesOverVectorValues(FloatVectorValues values) { + this.values = values; } - static class RandomAccessVectorValuesOverVectorValues implements RandomAccessVectorValues { - private final VectorTypeSupport VECTOR_TYPE_SUPPORT = VectorizationProvider.getInstance().getVectorTypeSupport(); - private final FloatVectorValues values; - - public RandomAccessVectorValuesOverVectorValues(FloatVectorValues values) { - this.values = values; - } - - @Override - public int size() { - return values.size(); - } - - @Override - public int dimension() { - return values.dimension(); - } + @Override + public int size() { + return values.size(); + } - @Override - public VectorFloat getVector(int nodeId) { - try { - // Access to float values is not thread safe - synchronized (this) { - final float[] vector = values.vectorValue(nodeId); - final float[] copy = new float[vector.length]; - System.arraycopy(vector, 0, copy, 0, vector.length); - return VECTOR_TYPE_SUPPORT.createFloatVector(copy); - } - } catch (IOException e) { - log.error("Error retrieving vector at ordinal {}", nodeId, e); - throw new RuntimeException(e); - } - } + @Override + public int dimension() { + return values.dimension(); + } - @Override - public boolean isValueShared() { - return false; + @Override + public VectorFloat getVector(int nodeId) { + try { + // Access to float values is not thread safe + synchronized (this) { + final float[] vector = values.vectorValue(nodeId); + final float[] copy = new float[vector.length]; + System.arraycopy(vector, 0, copy, 0, vector.length); + return VECTOR_TYPE_SUPPORT.createFloatVector(copy); } + } catch (IOException e) { + log.error("Error retrieving vector at ordinal {}", nodeId, e); + throw new RuntimeException(e); + } + } - @Override - public RandomAccessVectorValues copy() { - throw new UnsupportedOperationException("Copy not supported"); - } + @Override + public boolean isValueShared() { + return false; } + @Override + public RandomAccessVectorValues copy() { + throw new UnsupportedOperationException("Copy not supported"); + } + } } diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/jvector/KNNJVectorTests.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/jvector/KNNJVectorTests.java index b562e52fd4a1..b2f2ea075d3d 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/jvector/KNNJVectorTests.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/jvector/KNNJVectorTests.java @@ -17,7 +17,16 @@ 
package org.opensearch.knn.index.codec.jvector; +import static org.opensearch.knn.common.KNNConstants.DEFAULT_MINIMUM_BATCH_SIZE_FOR_QUANTIZATION; +import static org.opensearch.knn.index.engine.CommonTestUtils.getCodec; + import com.carrotsearch.randomizedtesting.annotations.ThreadLeakFilters; +import java.io.IOException; +import java.nio.file.Path; +import java.util.*; +import java.util.concurrent.*; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicInteger; import lombok.extern.log4j.Log4j2; import org.apache.lucene.document.*; import org.apache.lucene.index.*; @@ -33,1537 +42,1635 @@ import org.opensearch.knn.index.ThreadLeakFiltersForTests; import org.opensearch.knn.plugin.stats.KNNCounter; -import java.io.IOException; -import java.nio.file.Path; -import java.util.*; -import java.util.concurrent.*; -import java.util.concurrent.atomic.AtomicBoolean; -import java.util.concurrent.atomic.AtomicInteger; - -import static org.opensearch.knn.common.KNNConstants.DEFAULT_MINIMUM_BATCH_SIZE_FOR_QUANTIZATION; -import static org.opensearch.knn.index.engine.CommonTestUtils.getCodec; - -/** - * Test used specifically for JVector - */ -// Currently {@link IndexGraphBuilder} is using the default ForkJoinPool.commonPool() which is not being shutdown. +/** Test used specifically for JVector */ +// Currently {@link IndexGraphBuilder} is using the default ForkJoinPool.commonPool() which is not +// being shutdown. // Ignore thread leaks until we remove the ForkJoinPool.commonPool() usage from IndexGraphBuilder -// TODO: Wire the execution thread pool to {@link IndexGraphBuilder} to avoid the failure of the UT due to leaked thread pool warning. -@ThreadLeakFilters(defaultFilters = true, filters = { ThreadLeakFiltersForTests.class }) +// TODO: Wire the execution thread pool to {@link IndexGraphBuilder} to avoid the failure of the UT +// due to leaked thread pool warning. +@ThreadLeakFilters( + defaultFilters = true, + filters = {ThreadLeakFiltersForTests.class}) @LuceneTestCase.SuppressSysoutChecks(bugUrl = "") @Log4j2 public class KNNJVectorTests extends LuceneTestCase { - private static final String TEST_FIELD = "test_field"; - private static final String TEST_ID_FIELD = "id"; - - /** - * Test to verify that the JVector codec is able to successfully search for the nearest neighbours - * in the index. - * Single field is used to store the vectors. - * All the documents are stored in a single segment. - * Single commit without refreshing the index. - * No merge. 
- */ - @Test - public void testJVectorKnnIndex_simpleCase() throws IOException { - int k = 3; // The number of nearest neighbors to gather - int totalNumberOfDocs = 10; - IndexWriterConfig indexWriterConfig = LuceneTestCase.newIndexWriterConfig(); - indexWriterConfig.setUseCompoundFile(false); - indexWriterConfig.setCodec(getCodec()); - indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy()); - final Path indexPath = createTempDir(); - log.info("Index path: {}", indexPath); - try (FSDirectory dir = FSDirectory.open(indexPath); IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { - final float[] target = new float[] { 0.0f, 0.0f }; - for (int i = 1; i < totalNumberOfDocs + 1; i++) { - final float[] source = new float[] { 0.0f, 1.0f / i }; - final Document doc = new Document(); - doc.add(new KnnFloatVectorField("test_field", source, VectorSimilarityFunction.EUCLIDEAN)); - w.addDocument(doc); - } - log.info("Flushing docs to make them discoverable on the file system"); - w.commit(); - - try (IndexReader reader = DirectoryReader.open(w)) { - log.info("We should now have a single segment with 10 documents"); - Assert.assertEquals(1, reader.getContext().leaves().size()); - Assert.assertEquals(totalNumberOfDocs, reader.numDocs()); - - final Query filterQuery = new MatchAllDocsQuery(); - final IndexSearcher searcher = newSearcher(reader); - KnnFloatVectorQuery knnFloatVectorQuery = getJVectorKnnFloatVectorQuery("test_field", target, k, filterQuery); - TopDocs topDocs = searcher.search(knnFloatVectorQuery, k); - assertEquals(k, topDocs.totalHits.value()); - assertEquals(9, topDocs.scoreDocs[0].doc); - Assert.assertEquals( - VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] { 0.0f, 1.0f / 10.0f }), - topDocs.scoreDocs[0].score, - 0.001f - ); - assertEquals(8, topDocs.scoreDocs[1].doc); - Assert.assertEquals( - VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] { 0.0f, 1.0f / 9.0f }), - topDocs.scoreDocs[1].score, - 0.001f - ); - assertEquals(7, topDocs.scoreDocs[2].doc); - Assert.assertEquals( - VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] { 0.0f, 1.0f / 8.0f }), - topDocs.scoreDocs[2].score, - 0.001f - ); - log.info("successfully completed search tests"); - } - } - log.info("successfully closed directory"); + private static final String TEST_FIELD = "test_field"; + private static final String TEST_ID_FIELD = "id"; + + /** + * Test to verify that the JVector codec is able to successfully search for the nearest neighbours + * in the index. Single field is used to store the vectors. All the documents are stored in a + * single segment. Single commit without refreshing the index. No merge. 
+ */ + @Test + public void testJVectorKnnIndex_simpleCase() throws IOException { + int k = 3; // The number of nearest neighbors to gather + int totalNumberOfDocs = 10; + IndexWriterConfig indexWriterConfig = LuceneTestCase.newIndexWriterConfig(); + indexWriterConfig.setUseCompoundFile(false); + indexWriterConfig.setCodec(getCodec()); + indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy()); + final Path indexPath = createTempDir(); + log.info("Index path: {}", indexPath); + try (FSDirectory dir = FSDirectory.open(indexPath); + IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { + final float[] target = new float[] {0.0f, 0.0f}; + for (int i = 1; i < totalNumberOfDocs + 1; i++) { + final float[] source = new float[] {0.0f, 1.0f / i}; + final Document doc = new Document(); + doc.add(new KnnFloatVectorField("test_field", source, VectorSimilarityFunction.EUCLIDEAN)); + w.addDocument(doc); + } + log.info("Flushing docs to make them discoverable on the file system"); + w.commit(); + + try (IndexReader reader = DirectoryReader.open(w)) { + log.info("We should now have a single segment with 10 documents"); + Assert.assertEquals(1, reader.getContext().leaves().size()); + Assert.assertEquals(totalNumberOfDocs, reader.numDocs()); + + final Query filterQuery = new MatchAllDocsQuery(); + final IndexSearcher searcher = newSearcher(reader); + KnnFloatVectorQuery knnFloatVectorQuery = + getJVectorKnnFloatVectorQuery("test_field", target, k, filterQuery); + TopDocs topDocs = searcher.search(knnFloatVectorQuery, k); + assertEquals(k, topDocs.totalHits.value()); + assertEquals(9, topDocs.scoreDocs[0].doc); + Assert.assertEquals( + VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] {0.0f, 1.0f / 10.0f}), + topDocs.scoreDocs[0].score, + 0.001f); + assertEquals(8, topDocs.scoreDocs[1].doc); + Assert.assertEquals( + VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] {0.0f, 1.0f / 9.0f}), + topDocs.scoreDocs[1].score, + 0.001f); + assertEquals(7, topDocs.scoreDocs[2].doc); + Assert.assertEquals( + VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] {0.0f, 1.0f / 8.0f}), + topDocs.scoreDocs[2].score, + 0.001f); + log.info("successfully completed search tests"); + } } - - /** - * Test the scenario when not all documents are populated with the vector field - */ - public void testMissing_fields() throws IOException { - final int k = 3; // The number of nearest neighbors to gather - final int totalNumberOfDocs = 10; - final VectorSimilarityFunction vectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN; - IndexWriterConfig indexWriterConfig = LuceneTestCase.newIndexWriterConfig(); - indexWriterConfig.setUseCompoundFile(false); - indexWriterConfig.setCodec(getCodec()); - indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy()); - final Path indexPath = createTempDir(); - log.info("Index path: {}", indexPath); - try (FSDirectory dir = FSDirectory.open(indexPath); IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { - final float[] target = new float[] { 0.0f, 0.0f }; - for (int i = 0; i < totalNumberOfDocs; i++) { - final Document doc = new Document(); - if (i % 2 == 0) { - final float[] source = new float[] { 0.0f, i }; - doc.add(new KnnFloatVectorField("test_field", source, vectorSimilarityFunction)); - } - doc.add(new IntField(TEST_ID_FIELD, i, Field.Store.YES)); - w.addDocument(doc); - } - log.info("Flushing docs to make them discoverable on the file system"); - w.commit(); - - try (IndexReader reader = DirectoryReader.open(w)) { - 
log.info("We should now have a single segment with 10 documents"); - Assert.assertEquals(1, reader.getContext().leaves().size()); - Assert.assertEquals(totalNumberOfDocs, reader.numDocs()); - - final Query filterQuery = new MatchAllDocsQuery(); - final IndexSearcher searcher = newSearcher(reader); - KnnFloatVectorQuery knnFloatVectorQuery = getJVectorKnnFloatVectorQuery("test_field", target, k, filterQuery); - TopDocs topDocs = searcher.search(knnFloatVectorQuery, k); - assertEquals(k, topDocs.totalHits.value()); - assertEquals(0, topDocs.scoreDocs[0].doc); - Assert.assertEquals( - VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] { 0.0f, 0.0f }), - topDocs.scoreDocs[0].score, - 0.001f - ); - assertEquals(2, topDocs.scoreDocs[1].doc); - Assert.assertEquals( - VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] { 0.0f, 2.0f }), - topDocs.scoreDocs[1].score, - 0.001f - ); - assertEquals(4, topDocs.scoreDocs[2].doc); - Assert.assertEquals( - VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] { 0.0f, 4.0f }), - topDocs.scoreDocs[2].score, - 0.001f - ); - log.info("successfully completed search tests"); - } + log.info("successfully closed directory"); + } + + /** Test the scenario when not all documents are populated with the vector field */ + public void testMissing_fields() throws IOException { + final int k = 3; // The number of nearest neighbors to gather + final int totalNumberOfDocs = 10; + final VectorSimilarityFunction vectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN; + IndexWriterConfig indexWriterConfig = LuceneTestCase.newIndexWriterConfig(); + indexWriterConfig.setUseCompoundFile(false); + indexWriterConfig.setCodec(getCodec()); + indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy()); + final Path indexPath = createTempDir(); + log.info("Index path: {}", indexPath); + try (FSDirectory dir = FSDirectory.open(indexPath); + IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { + final float[] target = new float[] {0.0f, 0.0f}; + for (int i = 0; i < totalNumberOfDocs; i++) { + final Document doc = new Document(); + if (i % 2 == 0) { + final float[] source = new float[] {0.0f, i}; + doc.add(new KnnFloatVectorField("test_field", source, vectorSimilarityFunction)); } - log.info("successfully closed directory"); + doc.add(new IntField(TEST_ID_FIELD, i, Field.Store.YES)); + w.addDocument(doc); + } + log.info("Flushing docs to make them discoverable on the file system"); + w.commit(); + + try (IndexReader reader = DirectoryReader.open(w)) { + log.info("We should now have a single segment with 10 documents"); + Assert.assertEquals(1, reader.getContext().leaves().size()); + Assert.assertEquals(totalNumberOfDocs, reader.numDocs()); + + final Query filterQuery = new MatchAllDocsQuery(); + final IndexSearcher searcher = newSearcher(reader); + KnnFloatVectorQuery knnFloatVectorQuery = + getJVectorKnnFloatVectorQuery("test_field", target, k, filterQuery); + TopDocs topDocs = searcher.search(knnFloatVectorQuery, k); + assertEquals(k, topDocs.totalHits.value()); + assertEquals(0, topDocs.scoreDocs[0].doc); + Assert.assertEquals( + VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] {0.0f, 0.0f}), + topDocs.scoreDocs[0].score, + 0.001f); + assertEquals(2, topDocs.scoreDocs[1].doc); + Assert.assertEquals( + VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] {0.0f, 2.0f}), + topDocs.scoreDocs[1].score, + 0.001f); + assertEquals(4, topDocs.scoreDocs[2].doc); + Assert.assertEquals( + 
VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] {0.0f, 4.0f}), + topDocs.scoreDocs[2].score, + 0.001f); + log.info("successfully completed search tests"); + } } - - /** - * Test the scenario when the index is sorted by a doc value - * We want to make sure the docIDs are correctly mapped to the jVector ordinals - * @throws IOException if an I/O error occurs - */ - public void test_sorted_index() throws IOException { - final int k = 3; // The number of nearest neighbors to gather - final int totalNumberOfDocs = 10; - final VectorSimilarityFunction vectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN; - final String sortFieldName = "sorted_field"; - IndexWriterConfig indexWriterConfig = LuceneTestCase.newIndexWriterConfig(); - indexWriterConfig.setUseCompoundFile(false); - indexWriterConfig.setCodec(getCodec()); - indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy()); - // Add index sorting configuration - indexWriterConfig.setIndexSort(new Sort(new SortField(sortFieldName, SortField.Type.INT, true))); // true = reverse order - - final Path indexPath = createTempDir(); - log.info("Index path: {}", indexPath); - try (FSDirectory dir = FSDirectory.open(indexPath); IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { - final float[] target = new float[] { 0.0f, 0.0f }; - for (int i = 0; i < totalNumberOfDocs; i++) { - final Document doc = new Document(); - final float[] source = new float[] { 0.0f, i }; - doc.add(new KnnFloatVectorField("test_field", source, vectorSimilarityFunction)); - doc.add(new IntField(TEST_ID_FIELD, i, Field.Store.YES)); - // Add the sortable field - doc.add(new NumericDocValuesField(sortFieldName, i)); - w.addDocument(doc); - } - log.info("Flushing docs to make them discoverable on the file system"); - w.commit(); - - try (IndexReader reader = DirectoryReader.open(w)) { - log.info("We should now have a single segment with 10 documents"); - Assert.assertEquals(1, reader.getContext().leaves().size()); - Assert.assertEquals(totalNumberOfDocs, reader.numDocs()); - - final Query filterQuery = new MatchAllDocsQuery(); - final IndexSearcher searcher = newSearcher(reader); - KnnFloatVectorQuery knnFloatVectorQuery = getJVectorKnnFloatVectorQuery("test_field", target, k, filterQuery); - TopDocs topDocs = searcher.search(knnFloatVectorQuery, k); - assertEquals(k, topDocs.totalHits.value()); - assertEquals(9, topDocs.scoreDocs[0].doc); - assertEquals(0, reader.storedFields().document(topDocs.scoreDocs[0].doc).getField(TEST_ID_FIELD).numericValue().intValue()); - Assert.assertEquals( - VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] { 0.0f, 0.0f }), - topDocs.scoreDocs[0].score, - 0.001f - ); - assertEquals(8, topDocs.scoreDocs[1].doc); - assertEquals(1, reader.storedFields().document(topDocs.scoreDocs[1].doc).getField(TEST_ID_FIELD).numericValue().intValue()); - Assert.assertEquals( - VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] { 0.0f, 1.0f }), - topDocs.scoreDocs[1].score, - 0.001f - ); - assertEquals(7, topDocs.scoreDocs[2].doc); - assertEquals(2, reader.storedFields().document(topDocs.scoreDocs[2].doc).getField(TEST_ID_FIELD).numericValue().intValue()); - Assert.assertEquals( - VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] { 0.0f, 2.0f }), - topDocs.scoreDocs[2].score, - 0.001f - ); - log.info("successfully completed search tests"); - } - } - log.info("successfully closed directory"); + log.info("successfully closed directory"); + } + + /** + * Test the scenario when the index is sorted 
by a doc value We want to make sure the docIDs are + * correctly mapped to the jVector ordinals + * + * @throws IOException if an I/O error occurs + */ + public void test_sorted_index() throws IOException { + final int k = 3; // The number of nearest neighbors to gather + final int totalNumberOfDocs = 10; + final VectorSimilarityFunction vectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN; + final String sortFieldName = "sorted_field"; + IndexWriterConfig indexWriterConfig = LuceneTestCase.newIndexWriterConfig(); + indexWriterConfig.setUseCompoundFile(false); + indexWriterConfig.setCodec(getCodec()); + indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy()); + // Add index sorting configuration + indexWriterConfig.setIndexSort( + new Sort(new SortField(sortFieldName, SortField.Type.INT, true))); // true = reverse order + + final Path indexPath = createTempDir(); + log.info("Index path: {}", indexPath); + try (FSDirectory dir = FSDirectory.open(indexPath); + IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { + final float[] target = new float[] {0.0f, 0.0f}; + for (int i = 0; i < totalNumberOfDocs; i++) { + final Document doc = new Document(); + final float[] source = new float[] {0.0f, i}; + doc.add(new KnnFloatVectorField("test_field", source, vectorSimilarityFunction)); + doc.add(new IntField(TEST_ID_FIELD, i, Field.Store.YES)); + // Add the sortable field + doc.add(new NumericDocValuesField(sortFieldName, i)); + w.addDocument(doc); + } + log.info("Flushing docs to make them discoverable on the file system"); + w.commit(); + + try (IndexReader reader = DirectoryReader.open(w)) { + log.info("We should now have a single segment with 10 documents"); + Assert.assertEquals(1, reader.getContext().leaves().size()); + Assert.assertEquals(totalNumberOfDocs, reader.numDocs()); + + final Query filterQuery = new MatchAllDocsQuery(); + final IndexSearcher searcher = newSearcher(reader); + KnnFloatVectorQuery knnFloatVectorQuery = + getJVectorKnnFloatVectorQuery("test_field", target, k, filterQuery); + TopDocs topDocs = searcher.search(knnFloatVectorQuery, k); + assertEquals(k, topDocs.totalHits.value()); + assertEquals(9, topDocs.scoreDocs[0].doc); + assertEquals( + 0, + reader + .storedFields() + .document(topDocs.scoreDocs[0].doc) + .getField(TEST_ID_FIELD) + .numericValue() + .intValue()); + Assert.assertEquals( + VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] {0.0f, 0.0f}), + topDocs.scoreDocs[0].score, + 0.001f); + assertEquals(8, topDocs.scoreDocs[1].doc); + assertEquals( + 1, + reader + .storedFields() + .document(topDocs.scoreDocs[1].doc) + .getField(TEST_ID_FIELD) + .numericValue() + .intValue()); + Assert.assertEquals( + VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] {0.0f, 1.0f}), + topDocs.scoreDocs[1].score, + 0.001f); + assertEquals(7, topDocs.scoreDocs[2].doc); + assertEquals( + 2, + reader + .storedFields() + .document(topDocs.scoreDocs[2].doc) + .getField(TEST_ID_FIELD) + .numericValue() + .intValue()); + Assert.assertEquals( + VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] {0.0f, 2.0f}), + topDocs.scoreDocs[2].score, + 0.001f); + log.info("successfully completed search tests"); + } } - - /** - * Test to verify that the JVector codec is able to successfully search for the nearest neighbours - * in the index. - * Single field is used to store the vectors. - * Documents are stored in a multiple segments. - * Multiple commits without refreshing the index. - * No merge. 
- */ - @Test - public void testJVectorKnnIndex_multipleSegments() throws IOException { - int k = 3; // The number of nearest neighbours to gather - int totalNumberOfDocs = 10; - IndexWriterConfig indexWriterConfig = LuceneTestCase.newIndexWriterConfig(); - indexWriterConfig.setUseCompoundFile(false); - indexWriterConfig.setCodec(getCodec()); - indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy(false)); - final Path indexPath = createTempDir(); - log.info("Index path: {}", indexPath); - try (FSDirectory dir = FSDirectory.open(indexPath); IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { - final float[] target = new float[] { 0.0f, 0.0f }; - for (int i = 1; i < totalNumberOfDocs + 1; i++) { - final float[] source = new float[] { 0.0f, 1.0f / i }; - final Document doc = new Document(); - doc.add(new KnnFloatVectorField("test_field", source, VectorSimilarityFunction.EUCLIDEAN)); - w.addDocument(doc); - w.commit(); // this creates a new segment - } - log.info("Done writing all files to the file system"); - - try (IndexReader reader = DirectoryReader.open(w)) { - log.info("We should now have 10 segments, each with a single document"); - Assert.assertEquals(10, reader.getContext().leaves().size()); - Assert.assertEquals(totalNumberOfDocs, reader.numDocs()); - final Query filterQuery = new MatchAllDocsQuery(); - final IndexSearcher searcher = newSearcher(reader); - KnnFloatVectorQuery knnFloatVectorQuery = new KnnFloatVectorQuery("test_field", target, k, filterQuery); - TopDocs topDocs = searcher.search(knnFloatVectorQuery, k); - assertEquals(k, topDocs.totalHits.value()); - assertEquals(9, topDocs.scoreDocs[0].doc); - Assert.assertEquals( - VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] { 0.0f, 1.0f / 10.0f }), - topDocs.scoreDocs[0].score, - 0.001f - ); - assertEquals(8, topDocs.scoreDocs[1].doc); - Assert.assertEquals( - VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] { 0.0f, 1.0f / 9.0f }), - topDocs.scoreDocs[1].score, - 0.001f - ); - assertEquals(7, topDocs.scoreDocs[2].doc); - Assert.assertEquals( - VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] { 0.0f, 1.0f / 8.0f }), - topDocs.scoreDocs[2].score, - 0.001f - ); - log.info("successfully completed search tests"); - } - } + log.info("successfully closed directory"); + } + + /** + * Test to verify that the JVector codec is able to successfully search for the nearest neighbours + * in the index. Single field is used to store the vectors. Documents are stored in a multiple + * segments. Multiple commits without refreshing the index. No merge. 
+ */ + @Test + public void testJVectorKnnIndex_multipleSegments() throws IOException { + int k = 3; // The number of nearest neighbours to gather + int totalNumberOfDocs = 10; + IndexWriterConfig indexWriterConfig = LuceneTestCase.newIndexWriterConfig(); + indexWriterConfig.setUseCompoundFile(false); + indexWriterConfig.setCodec(getCodec()); + indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy(false)); + final Path indexPath = createTempDir(); + log.info("Index path: {}", indexPath); + try (FSDirectory dir = FSDirectory.open(indexPath); + IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { + final float[] target = new float[] {0.0f, 0.0f}; + for (int i = 1; i < totalNumberOfDocs + 1; i++) { + final float[] source = new float[] {0.0f, 1.0f / i}; + final Document doc = new Document(); + doc.add(new KnnFloatVectorField("test_field", source, VectorSimilarityFunction.EUCLIDEAN)); + w.addDocument(doc); + w.commit(); // this creates a new segment + } + log.info("Done writing all files to the file system"); + + try (IndexReader reader = DirectoryReader.open(w)) { + log.info("We should now have 10 segments, each with a single document"); + Assert.assertEquals(10, reader.getContext().leaves().size()); + Assert.assertEquals(totalNumberOfDocs, reader.numDocs()); + final Query filterQuery = new MatchAllDocsQuery(); + final IndexSearcher searcher = newSearcher(reader); + KnnFloatVectorQuery knnFloatVectorQuery = + new KnnFloatVectorQuery("test_field", target, k, filterQuery); + TopDocs topDocs = searcher.search(knnFloatVectorQuery, k); + assertEquals(k, topDocs.totalHits.value()); + assertEquals(9, topDocs.scoreDocs[0].doc); + Assert.assertEquals( + VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] {0.0f, 1.0f / 10.0f}), + topDocs.scoreDocs[0].score, + 0.001f); + assertEquals(8, topDocs.scoreDocs[1].doc); + Assert.assertEquals( + VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] {0.0f, 1.0f / 9.0f}), + topDocs.scoreDocs[1].score, + 0.001f); + assertEquals(7, topDocs.scoreDocs[2].doc); + Assert.assertEquals( + VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] {0.0f, 1.0f / 8.0f}), + topDocs.scoreDocs[2].score, + 0.001f); + log.info("successfully completed search tests"); + } } - - /** - * Test to verify that the JVector codec is able to successfully search for the nearest neighbours - * in the index. - * Single field is used to store the vectors. - * Documents are stored in a multiple segments. - * Multiple commits without refreshing the index. - * Merge is enabled. 
- */ - @Test - public void testJVectorKnnIndex_mergeEnabled() throws IOException { - int k = 3; // The number of nearest neighbours to gather - int totalNumberOfDocs = 10; - IndexWriterConfig indexWriterConfig = LuceneTestCase.newIndexWriterConfig(); - indexWriterConfig.setUseCompoundFile(false); - indexWriterConfig.setCodec(getCodec()); - indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy()); - indexWriterConfig.setMergeScheduler(new SerialMergeScheduler()); - final Path indexPath = createTempDir(); - log.info("Index path: {}", indexPath); - try (FSDirectory dir = FSDirectory.open(indexPath); IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { - final float[] target = new float[] { 0.0f, 0.0f }; - for (int i = 1; i < totalNumberOfDocs + 1; i++) { - final float[] source = new float[] { 0.0f, 1.0f * i }; - final Document doc = new Document(); - doc.add(new KnnFloatVectorField("test_field", source, VectorSimilarityFunction.EUCLIDEAN)); - doc.add(new StringField("my_doc_id", Integer.toString(i, 10), Field.Store.YES)); - w.addDocument(doc); - w.commit(); // this creates a new segment without triggering a merge - } - log.info("Done writing all files to the file system"); - - w.forceMerge(1); // this merges all segments into a single segment - log.info("Done merging all segments"); - try (IndexReader reader = DirectoryReader.open(w)) { - log.info("We should now have 1 segment with 10 documents"); - Assert.assertEquals(1, reader.getContext().leaves().size()); - Assert.assertEquals(totalNumberOfDocs, reader.numDocs()); - final Query filterQuery = new MatchAllDocsQuery(); - final IndexSearcher searcher = newSearcher(reader); - KnnFloatVectorQuery knnFloatVectorQuery = getJVectorKnnFloatVectorQuery("test_field", target, k, filterQuery); - TopDocs topDocs = searcher.search(knnFloatVectorQuery, k); - assertEquals(k, topDocs.totalHits.value()); - Document doc = reader.storedFields().document(topDocs.scoreDocs[0].doc); - assertEquals("1", doc.get("my_doc_id")); - Assert.assertEquals( - VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] { 0.0f, 1.0f }), - topDocs.scoreDocs[0].score, - 0.001f - ); - doc = reader.storedFields().document(topDocs.scoreDocs[1].doc); - assertEquals("2", doc.get("my_doc_id")); - Assert.assertEquals( - VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] { 0.0f, 2.0f }), - topDocs.scoreDocs[1].score, - 0.001f - ); - doc = reader.storedFields().document(topDocs.scoreDocs[2].doc); - assertEquals("3", doc.get("my_doc_id")); - Assert.assertEquals( - VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] { 0.0f, 3.0f }), - topDocs.scoreDocs[2].score, - 0.001f - ); - log.info("successfully completed search tests"); - } - } + } + + /** + * Test to verify that the JVector codec is able to successfully search for the nearest neighbours + * in the index. Single field is used to store the vectors. Documents are stored in a multiple + * segments. Multiple commits without refreshing the index. Merge is enabled. 
+ */ + @Test + public void testJVectorKnnIndex_mergeEnabled() throws IOException { + int k = 3; // The number of nearest neighbours to gather + int totalNumberOfDocs = 10; + IndexWriterConfig indexWriterConfig = LuceneTestCase.newIndexWriterConfig(); + indexWriterConfig.setUseCompoundFile(false); + indexWriterConfig.setCodec(getCodec()); + indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy()); + indexWriterConfig.setMergeScheduler(new SerialMergeScheduler()); + final Path indexPath = createTempDir(); + log.info("Index path: {}", indexPath); + try (FSDirectory dir = FSDirectory.open(indexPath); + IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { + final float[] target = new float[] {0.0f, 0.0f}; + for (int i = 1; i < totalNumberOfDocs + 1; i++) { + final float[] source = new float[] {0.0f, 1.0f * i}; + final Document doc = new Document(); + doc.add(new KnnFloatVectorField("test_field", source, VectorSimilarityFunction.EUCLIDEAN)); + doc.add(new StringField("my_doc_id", Integer.toString(i, 10), Field.Store.YES)); + w.addDocument(doc); + w.commit(); // this creates a new segment without triggering a merge + } + log.info("Done writing all files to the file system"); + + w.forceMerge(1); // this merges all segments into a single segment + log.info("Done merging all segments"); + try (IndexReader reader = DirectoryReader.open(w)) { + log.info("We should now have 1 segment with 10 documents"); + Assert.assertEquals(1, reader.getContext().leaves().size()); + Assert.assertEquals(totalNumberOfDocs, reader.numDocs()); + final Query filterQuery = new MatchAllDocsQuery(); + final IndexSearcher searcher = newSearcher(reader); + KnnFloatVectorQuery knnFloatVectorQuery = + getJVectorKnnFloatVectorQuery("test_field", target, k, filterQuery); + TopDocs topDocs = searcher.search(knnFloatVectorQuery, k); + assertEquals(k, topDocs.totalHits.value()); + Document doc = reader.storedFields().document(topDocs.scoreDocs[0].doc); + assertEquals("1", doc.get("my_doc_id")); + Assert.assertEquals( + VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] {0.0f, 1.0f}), + topDocs.scoreDocs[0].score, + 0.001f); + doc = reader.storedFields().document(topDocs.scoreDocs[1].doc); + assertEquals("2", doc.get("my_doc_id")); + Assert.assertEquals( + VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] {0.0f, 2.0f}), + topDocs.scoreDocs[1].score, + 0.001f); + doc = reader.storedFields().document(topDocs.scoreDocs[2].doc); + assertEquals("3", doc.get("my_doc_id")); + Assert.assertEquals( + VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] {0.0f, 3.0f}), + topDocs.scoreDocs[2].score, + 0.001f); + log.info("successfully completed search tests"); + } } - - /** - * Test to verify that the jVector codec is able to successfully search for the nearest neighbors - * in the index. - * Single field is used to store the vectors. - * Documents are stored in potentially multiple segments. - * Multiple commits. - * Multiple merges. 
- */ - @Test - public void multipleMerges() throws IOException { - int k = 3; // The number of nearest neighbours to gather - int totalNumberOfDocs = 10; - IndexWriterConfig indexWriterConfig = LuceneTestCase.newIndexWriterConfig(); - indexWriterConfig.setUseCompoundFile(false); - indexWriterConfig.setCodec(getCodec()); - indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy()); - indexWriterConfig.setMergeScheduler(new SerialMergeScheduler()); - final Path indexPath = createTempDir(); - final VectorSimilarityFunction vectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN; - log.info("Index path: {}", indexPath); - try (FSDirectory dir = FSDirectory.open(indexPath); IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { - final float[] target = new float[] { 0.0f, 0.0f }; - for (int i = 1; i < totalNumberOfDocs + 1; i++) { - final float[] source = new float[] { 0.0f, 1.0f * i }; - final Document doc = new Document(); - doc.add(new KnnFloatVectorField("test_field", source, vectorSimilarityFunction)); - doc.add(new StringField("my_doc_id", Integer.toString(i, 10), Field.Store.YES)); - w.addDocument(doc); - w.commit(); // this creates a new segment without triggering a merge - w.forceMerge(1); // this merges all segments into a single segment - } - log.info("Done writing all files to the file system"); - - w.forceMerge(1); // this merges all segments into a single segment - log.info("Done merging all segments"); - try (IndexReader reader = DirectoryReader.open(w)) { - log.info("We should now have 1 segment with 10 documents"); - Assert.assertEquals(1, reader.getContext().leaves().size()); - Assert.assertEquals(totalNumberOfDocs, reader.numDocs()); - final Query filterQuery = new MatchAllDocsQuery(); - final IndexSearcher searcher = newSearcher(reader); - KnnFloatVectorQuery knnFloatVectorQuery = getJVectorKnnFloatVectorQuery("test_field", target, k, filterQuery); - TopDocs topDocs = searcher.search(knnFloatVectorQuery, k); - assertEquals(k, topDocs.totalHits.value()); - Document doc = reader.storedFields().document(topDocs.scoreDocs[0].doc); - assertEquals("1", doc.get("my_doc_id")); - Assert.assertEquals( - vectorSimilarityFunction.compare(target, new float[] { 0.0f, 1.0f }), - topDocs.scoreDocs[0].score, - 0.001f - ); - doc = reader.storedFields().document(topDocs.scoreDocs[1].doc); - assertEquals("2", doc.get("my_doc_id")); - Assert.assertEquals( - VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] { 0.0f, 2.0f }), - topDocs.scoreDocs[1].score, - 0.001f - ); - doc = reader.storedFields().document(topDocs.scoreDocs[2].doc); - assertEquals("3", doc.get("my_doc_id")); - Assert.assertEquals( - VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] { 0.0f, 3.0f }), - topDocs.scoreDocs[2].score, - 0.001f - ); - log.info("successfully completed search tests"); - } - } + } + + /** + * Test to verify that the jVector codec is able to successfully search for the nearest neighbors + * in the index. Single field is used to store the vectors. Documents are stored in potentially + * multiple segments. Multiple commits. Multiple merges. 
+ */ + @Test + public void multipleMerges() throws IOException { + int k = 3; // The number of nearest neighbours to gather + int totalNumberOfDocs = 10; + IndexWriterConfig indexWriterConfig = LuceneTestCase.newIndexWriterConfig(); + indexWriterConfig.setUseCompoundFile(false); + indexWriterConfig.setCodec(getCodec()); + indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy()); + indexWriterConfig.setMergeScheduler(new SerialMergeScheduler()); + final Path indexPath = createTempDir(); + final VectorSimilarityFunction vectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN; + log.info("Index path: {}", indexPath); + try (FSDirectory dir = FSDirectory.open(indexPath); + IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { + final float[] target = new float[] {0.0f, 0.0f}; + for (int i = 1; i < totalNumberOfDocs + 1; i++) { + final float[] source = new float[] {0.0f, 1.0f * i}; + final Document doc = new Document(); + doc.add(new KnnFloatVectorField("test_field", source, vectorSimilarityFunction)); + doc.add(new StringField("my_doc_id", Integer.toString(i, 10), Field.Store.YES)); + w.addDocument(doc); + w.commit(); // this creates a new segment without triggering a merge + w.forceMerge(1); // this merges all segments into a single segment + } + log.info("Done writing all files to the file system"); + + w.forceMerge(1); // this merges all segments into a single segment + log.info("Done merging all segments"); + try (IndexReader reader = DirectoryReader.open(w)) { + log.info("We should now have 1 segment with 10 documents"); + Assert.assertEquals(1, reader.getContext().leaves().size()); + Assert.assertEquals(totalNumberOfDocs, reader.numDocs()); + final Query filterQuery = new MatchAllDocsQuery(); + final IndexSearcher searcher = newSearcher(reader); + KnnFloatVectorQuery knnFloatVectorQuery = + getJVectorKnnFloatVectorQuery("test_field", target, k, filterQuery); + TopDocs topDocs = searcher.search(knnFloatVectorQuery, k); + assertEquals(k, topDocs.totalHits.value()); + Document doc = reader.storedFields().document(topDocs.scoreDocs[0].doc); + assertEquals("1", doc.get("my_doc_id")); + Assert.assertEquals( + vectorSimilarityFunction.compare(target, new float[] {0.0f, 1.0f}), + topDocs.scoreDocs[0].score, + 0.001f); + doc = reader.storedFields().document(topDocs.scoreDocs[1].doc); + assertEquals("2", doc.get("my_doc_id")); + Assert.assertEquals( + VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] {0.0f, 2.0f}), + topDocs.scoreDocs[1].score, + 0.001f); + doc = reader.storedFields().document(topDocs.scoreDocs[2].doc); + assertEquals("3", doc.get("my_doc_id")); + Assert.assertEquals( + VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] {0.0f, 3.0f}), + topDocs.scoreDocs[2].score, + 0.001f); + log.info("successfully completed search tests"); + } } - - /** - * Test to verify that the jVector codec is able to successfully search for the nearest neighbours - * in the index. - * A Single field is used to store the vectors. - * Documents are stored in potentially multiple segments. - * Multiple commits. - * Multiple merges. 
- * Large batches - * Use a compound file - */ - @Test - public void testJVectorKnnIndex_multiple_merges_large_batches_no_quantization() throws IOException { - int segmentSize = DEFAULT_MINIMUM_BATCH_SIZE_FOR_QUANTIZATION; - int totalNumberOfDocs = segmentSize * 4; - int k = 3; // The number of nearest neighbors to gather - - IndexWriterConfig indexWriterConfig = LuceneTestCase.newIndexWriterConfig(); - indexWriterConfig.setUseCompoundFile(true); - indexWriterConfig.setCodec(getCodec(Integer.MAX_VALUE)); // effectively without quantization - indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy(true)); - indexWriterConfig.setMergeScheduler(new SerialMergeScheduler()); - // We set the below parameters to make sure no permature flush will occur, this way we can have a single segment, and we can force - // test the quantization case - indexWriterConfig.setMaxBufferedDocs(10000); // force flush every 10000 docs, this way we make sure that we only have a single - // segment for a totalNumberOfDocs < 1000 - indexWriterConfig.setRAMPerThreadHardLimitMB(1000); // 1000MB per thread, this way we make sure that no premature flush will occur - - final Path indexPath = createTempDir(); - log.info("Index path: {}", indexPath); - try (FSDirectory dir = FSDirectory.open(indexPath); IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { - final float[] target = new float[] { 0.0f, 0.0f }; - for (int i = 1; i < totalNumberOfDocs + 1; i++) { - final float[] source = new float[] { 0.0f, 1.0f / i }; - final Document doc = new Document(); - doc.add(new KnnFloatVectorField("test_field", source, VectorSimilarityFunction.EUCLIDEAN)); - doc.add(new StringField("my_doc_id", Integer.toString(i, 10), Field.Store.YES)); - w.addDocument(doc); - if (i % segmentSize == 0) { - w.commit(); // this creates a new segment without triggering a merge - } - } - log.info("Done writing all files to the file system"); - - w.forceMerge(1); // this merges all segments into a single segment - log.info("Done merging all segments"); - try (IndexReader reader = DirectoryReader.open(w)) { - log.info("We should now have 1 segment with {} documents", totalNumberOfDocs); - Assert.assertEquals(1, reader.getContext().leaves().size()); - Assert.assertEquals(totalNumberOfDocs, reader.numDocs()); - final Query filterQuery = new MatchAllDocsQuery(); - final IndexSearcher searcher = newSearcher(reader); - KnnFloatVectorQuery knnFloatVectorQuery = getJVectorKnnFloatVectorQuery("test_field", target, k, filterQuery); - TopDocs topDocs = searcher.search(knnFloatVectorQuery, k); - assertEquals(k, topDocs.totalHits.value()); - - float expectedMinScoreInTopK = VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] { 0.0f, k }); - final float recall = calculateRecall(topDocs, expectedMinScoreInTopK); - Assert.assertEquals(1.0f, recall, 0.01f); - - log.info("successfully completed search tests"); - } + } + + /** + * Test to verify that the jVector codec is able to successfully search for the nearest neighbours + * in the index. A Single field is used to store the vectors. Documents are stored in potentially + * multiple segments. Multiple commits. Multiple merges. 
Large batches Use a compound file + */ + @Test + public void testJVectorKnnIndex_multiple_merges_large_batches_no_quantization() + throws IOException { + int segmentSize = DEFAULT_MINIMUM_BATCH_SIZE_FOR_QUANTIZATION; + int totalNumberOfDocs = segmentSize * 4; + int k = 3; // The number of nearest neighbors to gather + + IndexWriterConfig indexWriterConfig = LuceneTestCase.newIndexWriterConfig(); + indexWriterConfig.setUseCompoundFile(true); + indexWriterConfig.setCodec(getCodec(Integer.MAX_VALUE)); // effectively without quantization + indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy(true)); + indexWriterConfig.setMergeScheduler(new SerialMergeScheduler()); + // We set the below parameters to make sure no permature flush will occur, this way we can have + // a single segment, and we can force + // test the quantization case + indexWriterConfig.setMaxBufferedDocs( + 10000); // force flush every 10000 docs, this way we make sure that we only have a single + // segment for a totalNumberOfDocs < 1000 + indexWriterConfig.setRAMPerThreadHardLimitMB( + 1000); // 1000MB per thread, this way we make sure that no premature flush will occur + + final Path indexPath = createTempDir(); + log.info("Index path: {}", indexPath); + try (FSDirectory dir = FSDirectory.open(indexPath); + IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { + final float[] target = new float[] {0.0f, 0.0f}; + for (int i = 1; i < totalNumberOfDocs + 1; i++) { + final float[] source = new float[] {0.0f, 1.0f / i}; + final Document doc = new Document(); + doc.add(new KnnFloatVectorField("test_field", source, VectorSimilarityFunction.EUCLIDEAN)); + doc.add(new StringField("my_doc_id", Integer.toString(i, 10), Field.Store.YES)); + w.addDocument(doc); + if (i % segmentSize == 0) { + w.commit(); // this creates a new segment without triggering a merge } + } + log.info("Done writing all files to the file system"); + + w.forceMerge(1); // this merges all segments into a single segment + log.info("Done merging all segments"); + try (IndexReader reader = DirectoryReader.open(w)) { + log.info("We should now have 1 segment with {} documents", totalNumberOfDocs); + Assert.assertEquals(1, reader.getContext().leaves().size()); + Assert.assertEquals(totalNumberOfDocs, reader.numDocs()); + final Query filterQuery = new MatchAllDocsQuery(); + final IndexSearcher searcher = newSearcher(reader); + KnnFloatVectorQuery knnFloatVectorQuery = + getJVectorKnnFloatVectorQuery("test_field", target, k, filterQuery); + TopDocs topDocs = searcher.search(knnFloatVectorQuery, k); + assertEquals(k, topDocs.totalHits.value()); + + float expectedMinScoreInTopK = + VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] {0.0f, k}); + final float recall = calculateRecall(topDocs, expectedMinScoreInTopK); + Assert.assertEquals(1.0f, recall, 0.01f); + + log.info("successfully completed search tests"); + } } - - /** - * Similar to testJVectorKnnIndex_multiple_merges_large_batches_no_quantization but with random vectors - * It's important to add more randomness to the vectors to make sure the graph is not linear - * @throws IOException if an I/O error occurs - */ - @Test - public void testJVectorKnnIndex_multiple_merges_large_batches_no_quantization_with_random_vectors() throws IOException { - int segmentSize = 200; - int totalNumberOfDocs = segmentSize * 4; - int k = 3; // The number of nearest neighbors to gather - final int dimension = 2; - final VectorSimilarityFunction vectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN; - 
final float[] target = TestUtils.generateRandomVectors(1, dimension)[0]; - final float[][] source = TestUtils.generateRandomVectors(totalNumberOfDocs, dimension); - final Set groundTruthVectorsIds = calculateGroundTruthVectorsIds(target, source, k, vectorSimilarityFunction); - - IndexWriterConfig indexWriterConfig = LuceneTestCase.newIndexWriterConfig(); - indexWriterConfig.setUseCompoundFile(true); - indexWriterConfig.setCodec(getCodec(Integer.MAX_VALUE)); // effectively without quantization - indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy(true)); - indexWriterConfig.setMergeScheduler(new SerialMergeScheduler()); - // We set the below parameters to make sure no permature flush will occur, this way we can have a single segment, and we can force - // test the quantization case - indexWriterConfig.setMaxBufferedDocs(10000); // force flush every 10000 docs, this way we make sure that we only have a single - // segment for a totalNumberOfDocs < 1000 - indexWriterConfig.setRAMPerThreadHardLimitMB(1000); // 1000MB per thread, this way we make sure that no premature flush will occur - - final Path indexPath = createTempDir(); - log.info("Index path: {}", indexPath); - try (FSDirectory dir = FSDirectory.open(indexPath); IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { - for (int i = 0; i < source.length; i++) { - final Document doc = new Document(); - doc.add(new KnnFloatVectorField(TEST_FIELD, source[i], VectorSimilarityFunction.EUCLIDEAN)); - doc.add(new IntField(TEST_ID_FIELD, i, Field.Store.YES)); - w.addDocument(doc); - if (i % segmentSize == 0) { - w.commit(); // this creates a new segment without triggering a merge - } - } - log.info("Done writing all files to the file system"); - - w.forceMerge(1); // this merges all segments into a single segment - log.info("Done merging all segments"); - try (IndexReader reader = DirectoryReader.open(w)) { - log.info("We should now have a single segment with {} documents", totalNumberOfDocs); - Assert.assertEquals(1, reader.getContext().leaves().size()); - Assert.assertEquals(totalNumberOfDocs, reader.numDocs()); - - final Query filterQuery = new MatchAllDocsQuery(); - final IndexSearcher searcher = newSearcher(reader); - KnnFloatVectorQuery knnFloatVectorQuery = getJVectorKnnFloatVectorQuery(TEST_FIELD, target, k, filterQuery); - TopDocs topDocs = searcher.search(knnFloatVectorQuery, k); - assertEquals(k, topDocs.totalHits.value()); - final float recall = calculateRecall(reader, groundTruthVectorsIds, topDocs, k); - Assert.assertEquals(1.0f, recall, 0.05f); - log.info("successfully completed search tests"); - } + } + + /** + * Similar to testJVectorKnnIndex_multiple_merges_large_batches_no_quantization but with random + * vectors It's important to add more randomness to the vectors to make sure the graph is not + * linear + * + * @throws IOException if an I/O error occurs + */ + @Test + public void + testJVectorKnnIndex_multiple_merges_large_batches_no_quantization_with_random_vectors() + throws IOException { + int segmentSize = 200; + int totalNumberOfDocs = segmentSize * 4; + int k = 3; // The number of nearest neighbors to gather + final int dimension = 2; + final VectorSimilarityFunction vectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN; + final float[] target = TestUtils.generateRandomVectors(1, dimension)[0]; + final float[][] source = TestUtils.generateRandomVectors(totalNumberOfDocs, dimension); + final Set groundTruthVectorsIds = + calculateGroundTruthVectorsIds(target, source, k, 
vectorSimilarityFunction); + + IndexWriterConfig indexWriterConfig = LuceneTestCase.newIndexWriterConfig(); + indexWriterConfig.setUseCompoundFile(true); + indexWriterConfig.setCodec(getCodec(Integer.MAX_VALUE)); // effectively without quantization + indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy(true)); + indexWriterConfig.setMergeScheduler(new SerialMergeScheduler()); + // We set the below parameters to make sure no permature flush will occur, this way we can have + // a single segment, and we can force + // test the quantization case + indexWriterConfig.setMaxBufferedDocs( + 10000); // force flush every 10000 docs, this way we make sure that we only have a single + // segment for a totalNumberOfDocs < 1000 + indexWriterConfig.setRAMPerThreadHardLimitMB( + 1000); // 1000MB per thread, this way we make sure that no premature flush will occur + + final Path indexPath = createTempDir(); + log.info("Index path: {}", indexPath); + try (FSDirectory dir = FSDirectory.open(indexPath); + IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { + for (int i = 0; i < source.length; i++) { + final Document doc = new Document(); + doc.add(new KnnFloatVectorField(TEST_FIELD, source[i], VectorSimilarityFunction.EUCLIDEAN)); + doc.add(new IntField(TEST_ID_FIELD, i, Field.Store.YES)); + w.addDocument(doc); + if (i % segmentSize == 0) { + w.commit(); // this creates a new segment without triggering a merge } + } + log.info("Done writing all files to the file system"); + + w.forceMerge(1); // this merges all segments into a single segment + log.info("Done merging all segments"); + try (IndexReader reader = DirectoryReader.open(w)) { + log.info("We should now have a single segment with {} documents", totalNumberOfDocs); + Assert.assertEquals(1, reader.getContext().leaves().size()); + Assert.assertEquals(totalNumberOfDocs, reader.numDocs()); + + final Query filterQuery = new MatchAllDocsQuery(); + final IndexSearcher searcher = newSearcher(reader); + KnnFloatVectorQuery knnFloatVectorQuery = + getJVectorKnnFloatVectorQuery(TEST_FIELD, target, k, filterQuery); + TopDocs topDocs = searcher.search(knnFloatVectorQuery, k); + assertEquals(k, topDocs.totalHits.value()); + final float recall = calculateRecall(reader, groundTruthVectorsIds, topDocs, k); + Assert.assertEquals(1.0f, recall, 0.05f); + log.info("successfully completed search tests"); + } } - - /** - * Tests the functionality and integrity of a Lucene k-NN index under multiple merge cycles and verifies - * the proper ordering of vectors and document identifiers. - * - * The method performs the following validation steps: - * 1. Indexes a predefined number of documents into a Lucene index, creating many small segments. - * Each document - * includes a k-NN float vector field encoding a specific order. - * 2. Executes several merge operations on the index (partial and full merges) to validate that the merging - * process maintains correctness and consistency. - * 3. Validates the following invariants post-merge: - * (a) Verifies that the index is merged into a single segment. - * (b) Confirms the integrity of vector values by iterating through the merged segment and checking the - * relationship between vector components and document identifiers. - * (c) Performs k-NN searches with various cases: - * - Single-threaded searches using vectors to ensure correct results. 
- * - Multi-threaded concurrent searches to confirm robustness and verify the index operates correctly - * under concurrent access without exhausting file handles or encountering other issues. - * - * Assertions are used throughout to ensure the state of the index matches the expected behavior, - * validate merge - * results, and confirm the accuracy of search operations. - * The test also logs the number of successful k-NN queries - * during the concurrent search phase. - * - * @throws IOException if an I/O error occurs during index operations. - * @throws InterruptedException if the concurrent search phase is interrupted. - */ - @Test - public void testLuceneKnnIndex_multipleMerges_with_ordering_check() throws IOException, InterruptedException { - final int numDocs = 10000; - final String floatVectorField = "vec"; - final String expectedDocIdField = "expectedDocId"; - final Path indexPath = createTempDir(); - final float[][] sourceVectors = TestUtils.generateRandomVectors(numDocs, 2); - final VectorSimilarityFunction vectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN; - - try (Directory dir = newFSDirectory(indexPath)) { - IndexWriterConfig cfg = newIndexWriterConfig(); - cfg.setCodec(getCodec()); - cfg.setUseCompoundFile(false); - cfg.setMergePolicy(new ForceMergesOnlyMergePolicy(false)); - cfg.setMergeScheduler(new SerialMergeScheduler()); - - try (IndexWriter w = new IndexWriter(dir, cfg)) { - /* ---------- 1. index documents, create many tiny segments ---------- */ - for (int i = 0; i < numDocs; i++) { - Document doc = new Document(); - // vector whose first component encodes the future (segment-local) docID - doc.add(new KnnFloatVectorField(floatVectorField, sourceVectors[i], vectorSimilarityFunction)); - doc.add(new StoredField(expectedDocIdField, i)); - w.addDocument(doc); - } - w.commit(); - - /* ---------- 2. run several merge cycles ---------- */ - w.forceMerge(5); // partial merge - w.forceMerge(3); // another partial merge - w.forceMerge(1); // final full merge - } - - /* ---------- 3. 
open reader and assert the invariant ---------- */ - try (DirectoryReader reader = DirectoryReader.open(dir)) { - assertEquals("we merged down to exactly one segment", 1, reader.leaves().size()); - - // (a) iterate through vectors directly - for (LeafReaderContext context : reader.leaves()) { - FloatVectorValues vectorValues = context.reader().getFloatVectorValues("vec"); - final var docIdSetIterator = vectorValues.iterator(); // iterator for all the vectors with values - int docId = -1; - while ((docId = docIdSetIterator.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { - final int luceneDocId = context.docBase + docId; - final int globalDocId = reader.storedFields() - .document(luceneDocId) - .getField(expectedDocIdField) - .storedValue() - .getIntValue(); - float[] vectorValue = vectorValues.vectorValue(docIdSetIterator.index()); - float[] expectedVectorValue = sourceVectors[globalDocId]; - // if the vectors do not match, also look which source vector should be the right result - if (!Arrays.equals(expectedVectorValue, vectorValue)) { - for (int i = 0; i < sourceVectors.length; i++) { - if (Arrays.equals(sourceVectors[i], vectorValue)) { - log.error( - "found vector with global id: {}, in docId: {}, however the actual position of the vector in source is: {}", - globalDocId, - luceneDocId, - i - ); - } - } - } - Assert.assertArrayEquals( - "vector with global id " - + globalDocId - + " in source doesn't match vector value in lucene docID " - + luceneDocId - + " on the index", - expectedVectorValue, - vectorValue, - 0.0f - ); - } - } - - // (b) search with the same vector and confirm we are not exhausting the file handles with each search - IndexSearcher searcher = newSearcher(reader); - LeafReaderContext context = reader.leaves().get(0); // we only have one leaf at this point so we can use it to obtain the - // vector values - final int baseDocId = context.docBase; - final FloatVectorValues vectorValues = context.reader().getFloatVectorValues("vec"); - final int k = 1; - for (int i = 0; i < reader.maxDoc(); i++) { - float[] query = TestUtils.generateRandomVectors(1, 2)[0]; - TopDocs td = searcher.search(getJVectorKnnFloatVectorQuery("vec", query, k, new MatchAllDocsQuery()), k); - assertEquals(k, td.scoreDocs.length); - - compareSearchResults(td, sourceVectors, reader, expectedDocIdField, baseDocId, vectorValues); - } - - // (c) search with the same vector and this time add concurrency to make sure we are still not exhausting the file handles - int numThreads = 10; // Number of concurrent search threads - int queriesPerThread = 100; // Number of searches per thread - ExecutorService executor = Executors.newFixedThreadPool(numThreads); - CountDownLatch latch = new CountDownLatch(numThreads); - AtomicBoolean failureDetected = new AtomicBoolean(false); - AtomicInteger totalQueries = new AtomicInteger(0); - - try { - for (int t = 0; t < numThreads; t++) { - executor.submit(() -> { - int i = 0; - - try { - for (i = 0; i < queriesPerThread && !failureDetected.get(); i++) { - float[] query = TestUtils.generateRandomVectors(1, 2)[0]; - try { - TopDocs td = searcher.search(new KnnFloatVectorQuery("vec", query, k), k); - assertEquals("Search should return correct number of results", k, td.scoreDocs.length); - compareSearchResults(td, sourceVectors, reader, expectedDocIdField, baseDocId, vectorValues); - totalQueries.incrementAndGet(); - } catch (Throwable e) { - failureDetected.compareAndSet(false, true); - log.error("Exception encountered", e); - fail("Exception during concurrent search: " + 
e.getMessage()); - } - } - } finally { - latch.countDown(); - log.warn("Ran {} queries", i); - } - }); - } - - // Wait for all threads to complete or for a failure - boolean completed = latch.await(30, TimeUnit.SECONDS); - assertTrue("Test timed out while waiting for concurrent searches", completed); - assertFalse("Test encountered failures during concurrent searches", failureDetected.get()); - assertEquals("Incorrect number of queries executed", numThreads * queriesPerThread, totalQueries.get()); - - // Log the number of successful queries - log.info("Successfully completed {} concurrent kNN search queries!", totalQueries.get()); - - } finally { - executor.shutdownNow(); + } + + /** + * Tests the functionality and integrity of a Lucene k-NN index under multiple merge cycles and + * verifies the proper ordering of vectors and document identifiers. + * + *

The method performs the following validation steps: 1. Indexes a predefined number of + * documents into a Lucene index, creating many small segments. Each document includes a k-NN + * float vector field encoding a specific order. 2. Executes several merge operations on the index + * (partial and full merges) to validate that the merging process maintains correctness and + * consistency. 3. Validates the following invariants post-merge: (a) Verifies that the index is + * merged into a single segment. (b) Confirms the integrity of vector values by iterating through + * the merged segment and checking the relationship between vector components and document + * identifiers. (c) Performs k-NN searches with various cases: - Single-threaded searches using + * vectors to ensure correct results. - Multi-threaded concurrent searches to confirm robustness + * and verify the index operates correctly under concurrent access without exhausting file handles + * or encountering other issues. + * + *

Assertions are used throughout to ensure the state of the index matches the expected + * behavior, validate merge results, and confirm the accuracy of search operations. The test also + * logs the number of successful k-NN queries during the concurrent search phase. + * + * @throws IOException if an I/O error occurs during index operations. + * @throws InterruptedException if the concurrent search phase is interrupted. + */ + @Test + public void testLuceneKnnIndex_multipleMerges_with_ordering_check() + throws IOException, InterruptedException { + final int numDocs = 10000; + final String floatVectorField = "vec"; + final String expectedDocIdField = "expectedDocId"; + final Path indexPath = createTempDir(); + final float[][] sourceVectors = TestUtils.generateRandomVectors(numDocs, 2); + final VectorSimilarityFunction vectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN; + + try (Directory dir = newFSDirectory(indexPath)) { + IndexWriterConfig cfg = newIndexWriterConfig(); + cfg.setCodec(getCodec()); + cfg.setUseCompoundFile(false); + cfg.setMergePolicy(new ForceMergesOnlyMergePolicy(false)); + cfg.setMergeScheduler(new SerialMergeScheduler()); + + try (IndexWriter w = new IndexWriter(dir, cfg)) { + /* ---------- 1. index documents, create many tiny segments ---------- */ + for (int i = 0; i < numDocs; i++) { + Document doc = new Document(); + // vector whose first component encodes the future (segment-local) docID + doc.add( + new KnnFloatVectorField( + floatVectorField, sourceVectors[i], vectorSimilarityFunction)); + doc.add(new StoredField(expectedDocIdField, i)); + w.addDocument(doc); + } + w.commit(); + + /* ---------- 2. run several merge cycles ---------- */ + w.forceMerge(5); // partial merge + w.forceMerge(3); // another partial merge + w.forceMerge(1); // final full merge + } + + /* ---------- 3. 
open reader and assert the invariant ---------- */ + try (DirectoryReader reader = DirectoryReader.open(dir)) { + assertEquals("we merged down to exactly one segment", 1, reader.leaves().size()); + + // (a) iterate through vectors directly + for (LeafReaderContext context : reader.leaves()) { + FloatVectorValues vectorValues = context.reader().getFloatVectorValues("vec"); + final var docIdSetIterator = + vectorValues.iterator(); // iterator for all the vectors with values + int docId = -1; + while ((docId = docIdSetIterator.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { + final int luceneDocId = context.docBase + docId; + final int globalDocId = + reader + .storedFields() + .document(luceneDocId) + .getField(expectedDocIdField) + .storedValue() + .getIntValue(); + float[] vectorValue = vectorValues.vectorValue(docIdSetIterator.index()); + float[] expectedVectorValue = sourceVectors[globalDocId]; + // if the vectors do not match, also look which source vector should be the right result + if (!Arrays.equals(expectedVectorValue, vectorValue)) { + for (int i = 0; i < sourceVectors.length; i++) { + if (Arrays.equals(sourceVectors[i], vectorValue)) { + log.error( + "found vector with global id: {}, in docId: {}, however the actual position of the vector in source is: {}", + globalDocId, + luceneDocId, + i); } + } } + Assert.assertArrayEquals( + "vector with global id " + + globalDocId + + " in source doesn't match vector value in lucene docID " + + luceneDocId + + " on the index", + expectedVectorValue, + vectorValue, + 0.0f); + } } - } - - private void compareSearchResults( - TopDocs topDocs, - float[][] sourceVectors, - DirectoryReader reader, - String expectedDocIdField, - int baseDocId, - FloatVectorValues vectorValues - ) throws IOException { - // Get the ords matching the lucene doc ids so that we can later find their values in the {@link vectorValues} - final Map docToOrdMap = new HashMap<>(); // docToOrd map - final var docIdSetIterator = vectorValues.iterator(); - while (docIdSetIterator.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { - docToOrdMap.put(docIdSetIterator.docID() + baseDocId, docIdSetIterator.index()); + // (b) search with the same vector and confirm we are not exhausting the file handles with + // each search + IndexSearcher searcher = newSearcher(reader); + LeafReaderContext context = + reader + .leaves() + .get(0); // we only have one leaf at this point so we can use it to obtain the + // vector values + final int baseDocId = context.docBase; + final FloatVectorValues vectorValues = context.reader().getFloatVectorValues("vec"); + final int k = 1; + for (int i = 0; i < reader.maxDoc(); i++) { + float[] query = TestUtils.generateRandomVectors(1, 2)[0]; + TopDocs td = + searcher.search( + getJVectorKnnFloatVectorQuery("vec", query, k, new MatchAllDocsQuery()), k); + assertEquals(k, td.scoreDocs.length); + + compareSearchResults( + td, sourceVectors, reader, expectedDocIdField, baseDocId, vectorValues); } - for (int resultIdx = 0; resultIdx < topDocs.scoreDocs.length; resultIdx++) { - final int localDocId = topDocs.scoreDocs[resultIdx].doc; - final int globalDocId = reader.storedFields().document(localDocId).getField(expectedDocIdField).storedValue().getIntValue(); - - // Access to float values is not thread safe - final float[] vectorValue; - synchronized (vectorValues) { - vectorValue = vectorValues.vectorValue(docToOrdMap.get(localDocId)); - } - float[] expectedVectorValue = sourceVectors[globalDocId]; - Assert.assertArrayEquals("vectors in source and index should match", 
expectedVectorValue, vectorValue, 0.0f); - } - } - - /** - * Test to verify that a document which has been deleted is no longer - * returned in a k-NN search. The index uses the JVector codec and is - * kept in multiple segments to ensure we also cover the case where the - * deleted document still physically resides in the segment as a dead - * (non-live) record. - */ - @Test - public void deletedDocs() throws IOException { - final int totalNumberOfDocs = 100; - final int batchSize = 10; - final int k = batchSize - 1; - final int docToDeleteInEachBatch = 5; - final Path indexPath = createTempDir(); - final IndexWriterConfig iwc = newIndexWriterConfig(); - // JVector codec requires compound files to be disabled at the moment - iwc.setUseCompoundFile(false); - iwc.setCodec(getCodec()); - iwc.setMergePolicy(new ForceMergesOnlyMergePolicy(false)); - - try (FSDirectory dir = FSDirectory.open(indexPath); IndexWriter writer = new IndexWriter(dir, iwc)) { - - /* - * 1. Index 100 docs, in batches of 10. Delete the 5th doc in each batch. - * will leave us with 10 segments, each with 9 live docs. - */ - int batchNumber = 0; - for (int i = 1; i <= totalNumberOfDocs; i++) { - Document doc = new Document(); - final float[] vector = { 0.0f, 1.0f * (i + batchNumber) }; - doc.add(new StringField("docId", Integer.toString(i + 1), Field.Store.YES)); - doc.add(new KnnFloatVectorField("test_field", vector, VectorSimilarityFunction.EUCLIDEAN)); - writer.addDocument(doc); - if (i % batchSize == 0) { - writer.flush(); - writer.deleteDocuments(new TermQuery(new Term("docId", Integer.toString(i - docToDeleteInEachBatch)))); - batchNumber++; - } - } - writer.commit(); - - /* ---------------------------------------- - * 2. Merge all segments into one - * ---------------------------------------- */ - writer.forceMerge(1); - - /* ---------------------------------------- - * 3. 
Search – the deleted doc must be gone - * ---------------------------------------- */ - try (IndexReader reader = DirectoryReader.open(writer)) { - assertEquals( - "All documents except the deleted ones should be live", - totalNumberOfDocs - (totalNumberOfDocs / batchSize), - reader.numDocs() - ); - // For each batch we will verify that the deleted document doesn't come up in search and only it's neighbours are returned - - for (int i = 0; i < totalNumberOfDocs; i += batchSize) { - final float[] target = { 0.0f, 1.0f * (i + docToDeleteInEachBatch) }; - final IndexSearcher searcher = newSearcher(reader); - final KnnFloatVectorQuery knnFloatVectorQuery = getJVectorKnnFloatVectorQuery( - "test_field", - target, - k, - new MatchAllDocsQuery() - ); - TopDocs topDocs = searcher.search(knnFloatVectorQuery, k); - assertEquals(k, topDocs.totalHits.value()); - for (int j = 0; j < k; j++) { - Document doc = reader.storedFields().document(topDocs.scoreDocs[j].doc); - int docId = Integer.parseInt(doc.get("docId")); - assertNotEquals("Deleted doc should not be returned in search results", i + docToDeleteInEachBatch, docId); + // (c) search with the same vector and this time add concurrency to make sure we are still + // not exhausting the file handles + int numThreads = 10; // Number of concurrent search threads + int queriesPerThread = 100; // Number of searches per thread + ExecutorService executor = Executors.newFixedThreadPool(numThreads); + CountDownLatch latch = new CountDownLatch(numThreads); + AtomicBoolean failureDetected = new AtomicBoolean(false); + AtomicInteger totalQueries = new AtomicInteger(0); + + try { + for (int t = 0; t < numThreads; t++) { + executor.submit( + () -> { + int i = 0; + + try { + for (i = 0; i < queriesPerThread && !failureDetected.get(); i++) { + float[] query = TestUtils.generateRandomVectors(1, 2)[0]; + try { + TopDocs td = searcher.search(new KnnFloatVectorQuery("vec", query, k), k); + assertEquals( + "Search should return correct number of results", + k, + td.scoreDocs.length); + compareSearchResults( + td, sourceVectors, reader, expectedDocIdField, baseDocId, vectorValues); + totalQueries.incrementAndGet(); + } catch (Throwable e) { + failureDetected.compareAndSet(false, true); + log.error("Exception encountered", e); + fail("Exception during concurrent search: " + e.getMessage()); + } } - } - } + } finally { + latch.countDown(); + log.warn("Ran {} queries", i); + } + }); + } + + // Wait for all threads to complete or for a failure + boolean completed = latch.await(30, TimeUnit.SECONDS); + assertTrue("Test timed out while waiting for concurrent searches", completed); + assertFalse( + "Test encountered failures during concurrent searches", failureDetected.get()); + assertEquals( + "Incorrect number of queries executed", + numThreads * queriesPerThread, + totalQueries.get()); + + // Log the number of successful queries + log.info("Successfully completed {} concurrent kNN search queries!", totalQueries.get()); + + } finally { + executor.shutdownNow(); } + } } - - /** - * Test to verify that the Lucene codec is able to successfully search for the nearest neighbours - * in the index. - * Single field is used to store the vectors. - * Documents are stored in potentially multiple segments. - * Multiple commits. - * Multiple merges. - * Merge is enabled. - * compound file is enabled. 
- */ - @Test - public void testLuceneKnnIndex_mergeEnabled_withCompoundFile() throws IOException { - int k = 3; // The number of nearest neighbors to gather - int totalNumberOfDocs = 10; - IndexWriterConfig indexWriterConfig = LuceneTestCase.newIndexWriterConfig(); - indexWriterConfig.setUseCompoundFile(true); - indexWriterConfig.setCodec(getCodec()); - indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy(true)); - indexWriterConfig.setMergeScheduler(new SerialMergeScheduler()); - final Path indexPath = createTempDir(); - log.info("Index path: {}", indexPath); - try (FSDirectory dir = FSDirectory.open(indexPath); IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { - final float[] target = new float[] { 0.0f, 0.0f }; - for (int i = 1; i < totalNumberOfDocs + 1; i++) { - final float[] source = new float[] { 0.0f, 1.0f / i }; - final Document doc = new Document(); - doc.add(new KnnFloatVectorField("test_field", source, VectorSimilarityFunction.EUCLIDEAN)); - w.addDocument(doc); - w.flush(); // this creates a new segment without triggering a merge - } - log.info("Done writing all files to the file system"); - - w.forceMerge(1); // this merges all segments into a single segment - log.info("Done merging all segments"); - try (IndexReader reader = DirectoryReader.open(w)) { - log.info("We should now have 1 segment with 10 documents"); - Assert.assertEquals(1, reader.getContext().leaves().size()); - Assert.assertEquals(totalNumberOfDocs, reader.numDocs()); - final Query filterQuery = new MatchAllDocsQuery(); - final IndexSearcher searcher = newSearcher(reader); - KnnFloatVectorQuery knnFloatVectorQuery = getJVectorKnnFloatVectorQuery("test_field", target, k, filterQuery); - TopDocs topDocs = searcher.search(knnFloatVectorQuery, k); - assertEquals(k, topDocs.totalHits.value()); - assertEquals(9, topDocs.scoreDocs[0].doc); - Assert.assertEquals( - VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] { 0.0f, 1.0f / 10.0f }), - topDocs.scoreDocs[0].score, - 0.01f - ); - assertEquals(8, topDocs.scoreDocs[1].doc); - Assert.assertEquals( - VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] { 0.0f, 1.0f / 9.0f }), - topDocs.scoreDocs[1].score, - 0.01f - ); - assertEquals(7, topDocs.scoreDocs[2].doc); - Assert.assertEquals( - VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] { 0.0f, 1.0f / 8.0f }), - topDocs.scoreDocs[2].score, - 0.01f - ); - log.info("successfully completed search tests"); - } - } + } + + private void compareSearchResults( + TopDocs topDocs, + float[][] sourceVectors, + DirectoryReader reader, + String expectedDocIdField, + int baseDocId, + FloatVectorValues vectorValues) + throws IOException { + // Get the ords matching the lucene doc ids so that we can later find their values in the {@link + // vectorValues} + final Map docToOrdMap = new HashMap<>(); // docToOrd map + final var docIdSetIterator = vectorValues.iterator(); + while (docIdSetIterator.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { + docToOrdMap.put(docIdSetIterator.docID() + baseDocId, docIdSetIterator.index()); } - /** - * Test to verify that the Lucene codec is able to successfully search for the nearest neighbours - * in the index. - * Single field is used to store the vectors. - * Documents are stored in potentially multiple segments. - * Multiple commits. - * Multiple merges. - * Merge is enabled. - * compound file is enabled. - * cosine similarity is used. 
- */ - @Test - public void testLuceneKnnIndex_mergeEnabled_withCompoundFile_cosine() throws IOException { - int k = 3; // The number of nearest neighbours to gather - int totalNumberOfDocs = 10; - IndexWriterConfig indexWriterConfig = LuceneTestCase.newIndexWriterConfig(); - indexWriterConfig.setUseCompoundFile(true); - indexWriterConfig.setCodec(getCodec()); - indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy(true)); - indexWriterConfig.setMergeScheduler(new SerialMergeScheduler()); - final Path indexPath = createTempDir(); - log.info("Index path: {}", indexPath); - try (FSDirectory dir = FSDirectory.open(indexPath); IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { - final float[] target = new float[] { 1.0f, 1.0f }; - for (int i = 1; i < totalNumberOfDocs + 1; i++) { - final float[] source = new float[] { 1.0f + i, 2.0f * i }; - final Document doc = new Document(); - doc.add(new KnnFloatVectorField("test_field", source, VectorSimilarityFunction.COSINE)); - w.addDocument(doc); - w.flush(); // this creates a new segment without triggering a merge - } - log.info("Done writing all files to the file system"); - - w.forceMerge(1); // this merges all segments into a single segment - log.info("Done merging all segments"); - try (IndexReader reader = DirectoryReader.open(w)) { - log.info("We should now have 1 segment with 10 documents"); - Assert.assertEquals(1, reader.getContext().leaves().size()); - Assert.assertEquals(totalNumberOfDocs, reader.numDocs()); - final Query filterQuery = new MatchAllDocsQuery(); - final IndexSearcher searcher = newSearcher(reader); - KnnFloatVectorQuery knnFloatVectorQuery = getJVectorKnnFloatVectorQuery("test_field", target, k, filterQuery); - TopDocs topDocs = searcher.search(knnFloatVectorQuery, k); - assertEquals(k, topDocs.totalHits.value()); - assertEquals(0, topDocs.scoreDocs[0].doc); - Assert.assertEquals( - VectorSimilarityFunction.COSINE.compare(target, new float[] { 2.0f, 2.0f }), - topDocs.scoreDocs[0].score, - 0.001f - ); - assertEquals(1, topDocs.scoreDocs[1].doc); - Assert.assertEquals( - VectorSimilarityFunction.COSINE.compare(target, new float[] { 3.0f, 4.0f }), - topDocs.scoreDocs[1].score, - 0.001f - ); - assertEquals(2, topDocs.scoreDocs[2].doc); - Assert.assertEquals( - VectorSimilarityFunction.COSINE.compare(target, new float[] { 4.0f, 6.0f }), - topDocs.scoreDocs[2].score, - 0.001f - ); - log.info("successfully completed search tests"); - } - } + for (int resultIdx = 0; resultIdx < topDocs.scoreDocs.length; resultIdx++) { + final int localDocId = topDocs.scoreDocs[resultIdx].doc; + final int globalDocId = + reader + .storedFields() + .document(localDocId) + .getField(expectedDocIdField) + .storedValue() + .getIntValue(); + + // Access to float values is not thread safe + final float[] vectorValue; + synchronized (vectorValues) { + vectorValue = vectorValues.vectorValue(docToOrdMap.get(localDocId)); + } + float[] expectedVectorValue = sourceVectors[globalDocId]; + Assert.assertArrayEquals( + "vectors in source and index should match", expectedVectorValue, vectorValue, 0.0f); } - - /** - * Test to verify that the JVector codec is providing proper error if used with byte vector - * TODO: Create Binary Quantization support for JVector codec - */ - @Test - public void testJVectorKnnIndex_simpleCase_withBinaryVector() throws IOException { - int k = 3; // The number of nearest neighbours to gather - int totalNumberOfDocs = 10; - IndexWriterConfig indexWriterConfig = LuceneTestCase.newIndexWriterConfig(); - // TODO: re-enable 
this after fixing the compound file augmentation for JVector - indexWriterConfig.setUseCompoundFile(false); - indexWriterConfig.setCodec(getCodec()); - indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy()); - final Path indexPath = createTempDir(); - log.info("Index path: {}", indexPath); - try (Directory dir = newFSDirectory(indexPath); RandomIndexWriter w = new RandomIndexWriter(random(), dir, indexWriterConfig)) { - final byte[] source = new byte[] { (byte) 0, (byte) 0 }; - final Document doc = new Document(); - doc.add(new KnnByteVectorField("test_field", source, VectorSimilarityFunction.EUCLIDEAN)); - Assert.assertThrows(UnsupportedOperationException.class, () -> w.addDocument(doc)); + } + + /** + * Test to verify that a document which has been deleted is no longer returned in a k-NN search. + * The index uses the JVector codec and is kept in multiple segments to ensure we also cover the + * case where the deleted document still physically resides in the segment as a dead (non-live) + * record. + */ + @Test + public void deletedDocs() throws IOException { + final int totalNumberOfDocs = 100; + final int batchSize = 10; + final int k = batchSize - 1; + final int docToDeleteInEachBatch = 5; + final Path indexPath = createTempDir(); + final IndexWriterConfig iwc = newIndexWriterConfig(); + // JVector codec requires compound files to be disabled at the moment + iwc.setUseCompoundFile(false); + iwc.setCodec(getCodec()); + iwc.setMergePolicy(new ForceMergesOnlyMergePolicy(false)); + + try (FSDirectory dir = FSDirectory.open(indexPath); + IndexWriter writer = new IndexWriter(dir, iwc)) { + + /* + * 1. Index 100 docs, in batches of 10. Delete the 5th doc in each batch. + * will leave us with 10 segments, each with 9 live docs. + */ + int batchNumber = 0; + for (int i = 1; i <= totalNumberOfDocs; i++) { + Document doc = new Document(); + final float[] vector = {0.0f, 1.0f * (i + batchNumber)}; + doc.add(new StringField("docId", Integer.toString(i + 1), Field.Store.YES)); + doc.add(new KnnFloatVectorField("test_field", vector, VectorSimilarityFunction.EUCLIDEAN)); + writer.addDocument(doc); + if (i % batchSize == 0) { + writer.flush(); + writer.deleteDocuments( + new TermQuery(new Term("docId", Integer.toString(i - docToDeleteInEachBatch)))); + batchNumber++; } - } - - /** - * Test to verify that the JVector codec is able to successfully search for the nearest neighbours - * in the index with a filter applied. - */ - @Test - public void testJVectorKnnIndex_withFilter() throws IOException { - int k = 3; // The number of nearest neighbours to gather - int totalNumberOfDocs = 10; - IndexWriterConfig indexWriterConfig = LuceneTestCase.newIndexWriterConfig(); - indexWriterConfig.setUseCompoundFile(false); - indexWriterConfig.setCodec(getCodec()); - indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy()); - final Path indexPath = createTempDir(); - log.info("Index path: {}", indexPath); - try (Directory dir = newFSDirectory(indexPath); IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { - final float[] target = new float[] { 0.0f, 0.0f }; - for (int i = 1; i < totalNumberOfDocs + 1; i++) { - final float[] source = new float[] { 0.0f, 1.0f / i }; - final Document doc = new Document(); - doc.add(new KnnFloatVectorField("test_field", source, VectorSimilarityFunction.EUCLIDEAN)); - doc.add(new StringField("filter_field", i % 2 == 0 ? 
"even" : "odd", Field.Store.YES)); - w.addDocument(doc); - } - log.info("Flushing docs to make them discoverable on the file system"); - w.commit(); - - try (IndexReader reader = DirectoryReader.open(w)) { - log.info("Applying filter to the KNN search"); - final Query filterQuery = new TermQuery(new Term("filter_field", "even")); - final IndexSearcher searcher = newSearcher(reader); - KnnFloatVectorQuery knnFloatVectorQuery = getJVectorKnnFloatVectorQuery("test_field", target, k, filterQuery); - TopDocs topDocs = searcher.search(knnFloatVectorQuery, k); - - log.info("Validating filtered KNN results"); - assertEquals(k, topDocs.totalHits.value()); - assertEquals(9, topDocs.scoreDocs[0].doc); - Assert.assertEquals( - VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] { 0.0f, 1.0f / 10.0f }), - topDocs.scoreDocs[0].score, - 0.001f - ); - assertEquals(7, topDocs.scoreDocs[1].doc); - Assert.assertEquals( - VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] { 0.0f, 1.0f / 8.0f }), - topDocs.scoreDocs[1].score, - 0.001f - ); - assertEquals(5, topDocs.scoreDocs[2].doc); - Assert.assertEquals( - VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] { 0.0f, 1.0f / 6.0f }), - topDocs.scoreDocs[2].score, - 0.001f - ); - log.info("successfully completed filtered search tests"); - } + } + writer.commit(); + + /* ---------------------------------------- + * 2. Merge all segments into one + * ---------------------------------------- */ + writer.forceMerge(1); + + /* ---------------------------------------- + * 3. Search – the deleted doc must be gone + * ---------------------------------------- */ + try (IndexReader reader = DirectoryReader.open(writer)) { + assertEquals( + "All documents except the deleted ones should be live", + totalNumberOfDocs - (totalNumberOfDocs / batchSize), + reader.numDocs()); + // For each batch we will verify that the deleted document doesn't come up in search and + // only it's neighbours are returned + + for (int i = 0; i < totalNumberOfDocs; i += batchSize) { + final float[] target = {0.0f, 1.0f * (i + docToDeleteInEachBatch)}; + final IndexSearcher searcher = newSearcher(reader); + final KnnFloatVectorQuery knnFloatVectorQuery = + getJVectorKnnFloatVectorQuery("test_field", target, k, new MatchAllDocsQuery()); + TopDocs topDocs = searcher.search(knnFloatVectorQuery, k); + assertEquals(k, topDocs.totalHits.value()); + for (int j = 0; j < k; j++) { + Document doc = reader.storedFields().document(topDocs.scoreDocs[j].doc); + int docId = Integer.parseInt(doc.get("docId")); + assertNotEquals( + "Deleted doc should not be returned in search results", + i + docToDeleteInEachBatch, + docId); + } } + } } - - /** - * Test the simple case of quantization where we have the perfect batch single batch size with no merges or too small batch sizes - */ - @Test - public void testJVectorKnnIndex_simpleCase_withQuantization() throws IOException { - int k = 50; // The number of nearest neighbours to gather - int dimension = 16; - int totalNumberOfDocs = DEFAULT_MINIMUM_BATCH_SIZE_FOR_QUANTIZATION; - final VectorSimilarityFunction vectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN; - - IndexWriterConfig indexWriterConfig = LuceneTestCase.newIndexWriterConfig(); - indexWriterConfig.setUseCompoundFile(false); - indexWriterConfig.setCodec(getCodec(DEFAULT_MINIMUM_BATCH_SIZE_FOR_QUANTIZATION)); - indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy()); - // We set the below parameters to make sure no permature flush will occur, this way we can 
have a single segment, and we can force - // test the quantization case - indexWriterConfig.setMaxBufferedDocs(10000); // force flush every 10000 docs, this way we make sure that we only have a single - // segment for a totalNumberOfDocs < 1000 - indexWriterConfig.setRAMPerThreadHardLimitMB(1000); // 1000MB per thread, this way we make sure that no premature flush will occur - final Path indexPath = createTempDir(); - log.info("Index path: {}", indexPath); - try (FSDirectory dir = FSDirectory.open(indexPath); IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { - final float[] target = generateZerosVectorWithLastValue(dimension, 0); - final float[][] vectors = TestUtils.generateRandomVectors(totalNumberOfDocs, dimension); - final Set groundTruthVectorsIds = calculateGroundTruthVectorsIds(target, vectors, k, vectorSimilarityFunction); - for (int i = 0; i < vectors.length; i++) { - final Document doc = new Document(); - doc.add(new KnnFloatVectorField(TEST_FIELD, vectors[i], vectorSimilarityFunction)); - doc.add(new IntField(TEST_ID_FIELD, i, Field.Store.YES)); - w.addDocument(doc); - } - log.info("Flushing docs to make them discoverable on the file system"); - w.commit(); - - try (IndexReader reader = DirectoryReader.open(w)) { - log.info("We should now have a single segment with {} documents", totalNumberOfDocs); - Assert.assertEquals(1, reader.getContext().leaves().size()); - Assert.assertEquals(totalNumberOfDocs, reader.numDocs()); - - final Query filterQuery = new MatchAllDocsQuery(); - final IndexSearcher searcher = newSearcher(reader); - KnnFloatVectorQuery knnFloatVectorQuery = getJVectorKnnFloatVectorQuery(TEST_FIELD, target, k, filterQuery); - TopDocs topDocs = searcher.search(knnFloatVectorQuery, k); - assertEquals(k, topDocs.totalHits.value()); - final float recall = calculateRecall(reader, groundTruthVectorsIds, topDocs, k); - Assert.assertEquals(1.0f, recall, 0.05f); - log.info("successfully completed search tests"); - } - } + } + + /** + * Test to verify that the Lucene codec is able to successfully search for the nearest neighbours + * in the index. Single field is used to store the vectors. Documents are stored in potentially + * multiple segments. Multiple commits. Multiple merges. Merge is enabled. compound file is + * enabled. 
+ */ + @Test + public void testLuceneKnnIndex_mergeEnabled_withCompoundFile() throws IOException { + int k = 3; // The number of nearest neighbors to gather + int totalNumberOfDocs = 10; + IndexWriterConfig indexWriterConfig = LuceneTestCase.newIndexWriterConfig(); + indexWriterConfig.setUseCompoundFile(true); + indexWriterConfig.setCodec(getCodec()); + indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy(true)); + indexWriterConfig.setMergeScheduler(new SerialMergeScheduler()); + final Path indexPath = createTempDir(); + log.info("Index path: {}", indexPath); + try (FSDirectory dir = FSDirectory.open(indexPath); + IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { + final float[] target = new float[] {0.0f, 0.0f}; + for (int i = 1; i < totalNumberOfDocs + 1; i++) { + final float[] source = new float[] {0.0f, 1.0f / i}; + final Document doc = new Document(); + doc.add(new KnnFloatVectorField("test_field", source, VectorSimilarityFunction.EUCLIDEAN)); + w.addDocument(doc); + w.flush(); // this creates a new segment without triggering a merge + } + log.info("Done writing all files to the file system"); + + w.forceMerge(1); // this merges all segments into a single segment + log.info("Done merging all segments"); + try (IndexReader reader = DirectoryReader.open(w)) { + log.info("We should now have 1 segment with 10 documents"); + Assert.assertEquals(1, reader.getContext().leaves().size()); + Assert.assertEquals(totalNumberOfDocs, reader.numDocs()); + final Query filterQuery = new MatchAllDocsQuery(); + final IndexSearcher searcher = newSearcher(reader); + KnnFloatVectorQuery knnFloatVectorQuery = + getJVectorKnnFloatVectorQuery("test_field", target, k, filterQuery); + TopDocs topDocs = searcher.search(knnFloatVectorQuery, k); + assertEquals(k, topDocs.totalHits.value()); + assertEquals(9, topDocs.scoreDocs[0].doc); + Assert.assertEquals( + VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] {0.0f, 1.0f / 10.0f}), + topDocs.scoreDocs[0].score, + 0.01f); + assertEquals(8, topDocs.scoreDocs[1].doc); + Assert.assertEquals( + VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] {0.0f, 1.0f / 9.0f}), + topDocs.scoreDocs[1].score, + 0.01f); + assertEquals(7, topDocs.scoreDocs[2].doc); + Assert.assertEquals( + VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] {0.0f, 1.0f / 8.0f}), + topDocs.scoreDocs[2].score, + 0.01f); + log.info("successfully completed search tests"); + } } - - /** - * Test recall with different types of rerank parameters - */ - @Test - public void testJVectorKnnIndex_simpleCase_withQuantization_rerank() throws IOException { - int k = 1; // The number of nearest neighbours to gather - int dimension = 16; - int totalNumberOfDocs = DEFAULT_MINIMUM_BATCH_SIZE_FOR_QUANTIZATION; - IndexWriterConfig indexWriterConfig = LuceneTestCase.newIndexWriterConfig(); - indexWriterConfig.setUseCompoundFile(false); - indexWriterConfig.setCodec(getCodec(DEFAULT_MINIMUM_BATCH_SIZE_FOR_QUANTIZATION)); - indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy()); - // We set the below parameters to make sure no permature flush will occur, this way we can have a single segment, and we can force - // test the quantization case - indexWriterConfig.setMaxBufferedDocs(10000); // force flush every 10000 docs, this way we make sure that we only have a single - // segment for a totalNumberOfDocs < 1000 - indexWriterConfig.setRAMPerThreadHardLimitMB(1000); // 1000MB per thread, this way we make sure that no premature flush will occur - final Path 
indexPath = createTempDir(); - log.info("Index path: {}", indexPath); - try (FSDirectory dir = FSDirectory.open(indexPath); IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { - final float[] target = generateZerosVectorWithLastValue(dimension, 0); - for (int i = 1; i < totalNumberOfDocs + 1; i++) { - final float[] source = generateZerosVectorWithLastValue(dimension, i); - final Document doc = new Document(); - doc.add(new KnnFloatVectorField("test_field", source, VectorSimilarityFunction.EUCLIDEAN)); - w.addDocument(doc); - } - log.info("Flushing docs to make them discoverable on the file system"); - w.commit(); - - try (IndexReader reader = DirectoryReader.open(w)) { - log.info("We should now have a single segment with {} documents", totalNumberOfDocs); - Assert.assertEquals(1, reader.getContext().leaves().size()); - Assert.assertEquals(totalNumberOfDocs, reader.numDocs()); - - final Query filterQuery = new MatchAllDocsQuery(); - final IndexSearcher searcher = newSearcher(reader); - float expectedMinScoreInTopK = VectorSimilarityFunction.EUCLIDEAN.compare( - target, - new float[] { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, k } - ); - - // Query with essentially no reranking and expect recall to be very low - KnnFloatVectorQuery knnFloatVectorQuery = getJVectorKnnFloatVectorQuery("test_field", target, k, filterQuery, 1); - TopDocs topDocs = searcher.search(knnFloatVectorQuery, k); - assertEquals(k, topDocs.totalHits.value()); - - final float recallWithLowOverqueryFactor = calculateRecall(topDocs, expectedMinScoreInTopK); - - // Query with reranking and expect recall to be high - knnFloatVectorQuery = getJVectorKnnFloatVectorQuery("test_field", target, k, filterQuery, 5); - topDocs = searcher.search(knnFloatVectorQuery, k); - assertEquals(k, topDocs.totalHits.value()); - float recallWithHighOverqueryFactor = calculateRecall(topDocs, expectedMinScoreInTopK); - Assert.assertTrue(recallWithLowOverqueryFactor <= recallWithHighOverqueryFactor); - - log.info("successfully completed search tests"); - } - } + } + + /** + * Test to verify that the Lucene codec is able to successfully search for the nearest neighbours + * in the index. Single field is used to store the vectors. Documents are stored in potentially + * multiple segments. Multiple commits. Multiple merges. Merge is enabled. compound file is + * enabled. cosine similarity is used. 
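+   *
+   * <p>For illustration only (assumed usage, mirroring the test body below): a vector is indexed
+   * with the cosine metric by constructing the field with {@link VectorSimilarityFunction#COSINE}:
+   *
+   * <pre>{@code
+   * Document doc = new Document();
+   * doc.add(new KnnFloatVectorField(
+   *     "test_field", new float[] {3.0f, 4.0f}, VectorSimilarityFunction.COSINE));
+   * writer.addDocument(doc);
+   * }</pre>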
+ */ + @Test + public void testLuceneKnnIndex_mergeEnabled_withCompoundFile_cosine() throws IOException { + int k = 3; // The number of nearest neighbours to gather + int totalNumberOfDocs = 10; + IndexWriterConfig indexWriterConfig = LuceneTestCase.newIndexWriterConfig(); + indexWriterConfig.setUseCompoundFile(true); + indexWriterConfig.setCodec(getCodec()); + indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy(true)); + indexWriterConfig.setMergeScheduler(new SerialMergeScheduler()); + final Path indexPath = createTempDir(); + log.info("Index path: {}", indexPath); + try (FSDirectory dir = FSDirectory.open(indexPath); + IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { + final float[] target = new float[] {1.0f, 1.0f}; + for (int i = 1; i < totalNumberOfDocs + 1; i++) { + final float[] source = new float[] {1.0f + i, 2.0f * i}; + final Document doc = new Document(); + doc.add(new KnnFloatVectorField("test_field", source, VectorSimilarityFunction.COSINE)); + w.addDocument(doc); + w.flush(); // this creates a new segment without triggering a merge + } + log.info("Done writing all files to the file system"); + + w.forceMerge(1); // this merges all segments into a single segment + log.info("Done merging all segments"); + try (IndexReader reader = DirectoryReader.open(w)) { + log.info("We should now have 1 segment with 10 documents"); + Assert.assertEquals(1, reader.getContext().leaves().size()); + Assert.assertEquals(totalNumberOfDocs, reader.numDocs()); + final Query filterQuery = new MatchAllDocsQuery(); + final IndexSearcher searcher = newSearcher(reader); + KnnFloatVectorQuery knnFloatVectorQuery = + getJVectorKnnFloatVectorQuery("test_field", target, k, filterQuery); + TopDocs topDocs = searcher.search(knnFloatVectorQuery, k); + assertEquals(k, topDocs.totalHits.value()); + assertEquals(0, topDocs.scoreDocs[0].doc); + Assert.assertEquals( + VectorSimilarityFunction.COSINE.compare(target, new float[] {2.0f, 2.0f}), + topDocs.scoreDocs[0].score, + 0.001f); + assertEquals(1, topDocs.scoreDocs[1].doc); + Assert.assertEquals( + VectorSimilarityFunction.COSINE.compare(target, new float[] {3.0f, 4.0f}), + topDocs.scoreDocs[1].score, + 0.001f); + assertEquals(2, topDocs.scoreDocs[2].doc); + Assert.assertEquals( + VectorSimilarityFunction.COSINE.compare(target, new float[] {4.0f, 6.0f}), + topDocs.scoreDocs[2].score, + 0.001f); + log.info("successfully completed search tests"); + } } - - /** - * Test the simple case of quantization where we have the perfect batch single batch size each time with a merge of - * multiple segments - */ - @Test - public void testJVectorKnnIndex_happyCase_withQuantization_multipleSegments() throws IOException { - final int dimension = 16; - final VectorSimilarityFunction vectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN; - final int k = 50; // The number of nearest neighbours to gather, we set a high number here to avoid an inaccurate result and - // jittery tests - final int perfectBatchSize = DEFAULT_MINIMUM_BATCH_SIZE_FOR_QUANTIZATION; // MINIMUM_BATCH_SIZE_FOR_QUANTIZATION is the minimal - // batch size that will trigger a quantization without - // breaking it, generally speaking the batch size can't be - // lower than the number of clusters - final int totalNumberOfDocs = perfectBatchSize * 2; - - IndexWriterConfig indexWriterConfig = LuceneTestCase.newIndexWriterConfig(); - indexWriterConfig.setUseCompoundFile(false); - indexWriterConfig.setCodec(getCodec(DEFAULT_MINIMUM_BATCH_SIZE_FOR_QUANTIZATION)); - 
indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy()); - // We set the below parameters to make sure no permature flush will occur, this way we can have a single segment, and we can force - // test the quantization case - indexWriterConfig.setMaxBufferedDocs(10000); // force flush every 10000 docs, this way we make sure that we only have a single - // segment for a totalNumberOfDocs < 1000 - indexWriterConfig.setRAMPerThreadHardLimitMB(1000); // 1000MB per thread, this way we make sure that no premature flush will occur - final Path indexPath = createTempDir(); - log.info("Index path: {}", indexPath); - try (FSDirectory dir = FSDirectory.open(indexPath); IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { - final float[] target = generateZerosVectorWithLastValue(dimension, 0); - final float[][] vectors = TestUtils.generateRandomVectors(totalNumberOfDocs, dimension); - final Set groundTruthVectorsIds = calculateGroundTruthVectorsIds(target, vectors, k, vectorSimilarityFunction); - - for (int i = 0; i < vectors.length; i++) { - final Document doc = new Document(); - doc.add(new KnnFloatVectorField(TEST_FIELD, vectors[i], vectorSimilarityFunction)); - doc.add(new IntField(TEST_ID_FIELD, i, Field.Store.YES)); - w.addDocument(doc); - if (i % perfectBatchSize == 0) { - w.commit(); - } - } - log.info("Flushing docs to make them discoverable on the file system"); - w.forceMerge(1); - - try (IndexReader reader = DirectoryReader.open(w)) { - log.info("We should now have a single segment with {} documents", totalNumberOfDocs); - Assert.assertEquals(1, reader.getContext().leaves().size()); - Assert.assertEquals(totalNumberOfDocs, reader.numDocs()); - - final Query filterQuery = new MatchAllDocsQuery(); - final IndexSearcher searcher = newSearcher(reader); - KnnFloatVectorQuery knnFloatVectorQuery = getJVectorKnnFloatVectorQuery("test_field", target, k, filterQuery); - TopDocs topDocs = searcher.search(knnFloatVectorQuery, k); - assertEquals(k, topDocs.totalHits.value()); - final float recall = calculateRecall(reader, groundTruthVectorsIds, topDocs, k); - Assert.assertEquals(1.0f, recall, 0.05f); - log.info("successfully completed search tests"); - } - } + } + + /** + * Test to verify that the JVector codec is providing proper error if used with byte vector TODO: + * Create Binary Quantization support for JVector codec + */ + @Test + public void testJVectorKnnIndex_simpleCase_withBinaryVector() throws IOException { + int k = 3; // The number of nearest neighbours to gather + int totalNumberOfDocs = 10; + IndexWriterConfig indexWriterConfig = LuceneTestCase.newIndexWriterConfig(); + // TODO: re-enable this after fixing the compound file augmentation for JVector + indexWriterConfig.setUseCompoundFile(false); + indexWriterConfig.setCodec(getCodec()); + indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy()); + final Path indexPath = createTempDir(); + log.info("Index path: {}", indexPath); + try (Directory dir = newFSDirectory(indexPath); + RandomIndexWriter w = new RandomIndexWriter(random(), dir, indexWriterConfig)) { + final byte[] source = new byte[] {(byte) 0, (byte) 0}; + final Document doc = new Document(); + doc.add(new KnnByteVectorField("test_field", source, VectorSimilarityFunction.EUCLIDEAN)); + Assert.assertThrows(UnsupportedOperationException.class, () -> w.addDocument(doc)); } - - /** - * Test the non-ideal case where batch sizes are not perfect and are lower than the number of recommended clusters in the index - * The expected behavior is for the quantization 
to only kick in when we have a merge or batch size that is bigger than the minimal required batch size - */ - @Test - public void testJVectorKnnIndex_mixedBatchSizes_withQuantization_multipleMerges() throws IOException { - final int k = 50; // The number of nearest neighbours to gather, we set a high number here to avoid an inaccurate result and - // jittery tests - final int dimension = 16; - final int notIdealBatchSize = DEFAULT_MINIMUM_BATCH_SIZE_FOR_QUANTIZATION / 3; // Batch size that is not ideal for quantization and - // shouldn't trigger it - final int totalNumberOfDocs = notIdealBatchSize * 3; // 3 batches of documents each will result in quantization only when the merge - // is triggered, and we have a batch size of {@link - // MINIMUM_BATCH_SIZE_FOR_QUANTIZATION} as a result of merging all the smaller - // batches - final VectorSimilarityFunction vectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN; - IndexWriterConfig indexWriterConfig = LuceneTestCase.newIndexWriterConfig(); - indexWriterConfig.setUseCompoundFile(false); - indexWriterConfig.setCodec(getCodec(DEFAULT_MINIMUM_BATCH_SIZE_FOR_QUANTIZATION)); - indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy()); - // We set the below parameters to make sure no permature flush will occur, this way we can have a single segment, and we can force - // test the quantization case - indexWriterConfig.setMaxBufferedDocs(10000); // force flush every 10000 docs, this way we make sure that we only have a single - // segment for a totalNumberOfDocs < 1000 - indexWriterConfig.setRAMPerThreadHardLimitMB(1000); // 1000MB per thread, this way we make sure that no premature flush will occur - final Path indexPath = createTempDir(); - log.info("Index path: {}", indexPath); - try (FSDirectory dir = FSDirectory.open(indexPath); IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { - final float[] target = generateZerosVectorWithLastValue(dimension, 0); - final float[][] vectors = TestUtils.generateRandomVectors(totalNumberOfDocs, dimension); - final Set groundTruthVectorsIds = calculateGroundTruthVectorsIds(target, vectors, k, vectorSimilarityFunction); - for (int i = 0; i < totalNumberOfDocs; i++) { - final float[] source = vectors[i]; - final Document doc = new Document(); - doc.add(new KnnFloatVectorField("test_field", source, vectorSimilarityFunction)); - doc.add(new IntField(TEST_ID_FIELD, i, Field.Store.YES)); - w.addDocument(doc); - if (i % notIdealBatchSize == 0) { - w.commit(); - } - } - log.info("Flushing docs to make them discoverable on the file system"); - w.forceMerge(1); - - try (IndexReader reader = DirectoryReader.open(w)) { - log.info("We should now have a single segment with {} documents", totalNumberOfDocs); - Assert.assertEquals(1, reader.getContext().leaves().size()); - Assert.assertEquals(totalNumberOfDocs, reader.numDocs()); - - final Query filterQuery = new MatchAllDocsQuery(); - final IndexSearcher searcher = newSearcher(reader); - KnnFloatVectorQuery knnFloatVectorQuery = getJVectorKnnFloatVectorQuery("test_field", target, k, filterQuery); - TopDocs topDocs = searcher.search(knnFloatVectorQuery, k); - assertEquals(k, topDocs.totalHits.value()); - final float recall = calculateRecall(reader, groundTruthVectorsIds, topDocs, k); - Assert.assertEquals(1.0f, recall, 0.05f); - log.info("successfully completed search tests"); - } - } + } + + /** + * Test to verify that the JVector codec is able to successfully search for the nearest neighbours + * in the index with a filter applied. 
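+   *
+   * <p>Illustrative sketch (not part of the assertions): the filter is an ordinary Lucene query
+   * passed alongside the KNN parameters, e.g.
+   *
+   * <pre>{@code
+   * Query filter = new TermQuery(new Term("filter_field", "even"));
+   * KnnFloatVectorQuery query = new KnnFloatVectorQuery("test_field", target, k, filter);
+   * TopDocs hits = searcher.search(query, k);
+   * }</pre>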
+ */ + @Test + public void testJVectorKnnIndex_withFilter() throws IOException { + int k = 3; // The number of nearest neighbours to gather + int totalNumberOfDocs = 10; + IndexWriterConfig indexWriterConfig = LuceneTestCase.newIndexWriterConfig(); + indexWriterConfig.setUseCompoundFile(false); + indexWriterConfig.setCodec(getCodec()); + indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy()); + final Path indexPath = createTempDir(); + log.info("Index path: {}", indexPath); + try (Directory dir = newFSDirectory(indexPath); + IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { + final float[] target = new float[] {0.0f, 0.0f}; + for (int i = 1; i < totalNumberOfDocs + 1; i++) { + final float[] source = new float[] {0.0f, 1.0f / i}; + final Document doc = new Document(); + doc.add(new KnnFloatVectorField("test_field", source, VectorSimilarityFunction.EUCLIDEAN)); + doc.add(new StringField("filter_field", i % 2 == 0 ? "even" : "odd", Field.Store.YES)); + w.addDocument(doc); + } + log.info("Flushing docs to make them discoverable on the file system"); + w.commit(); + + try (IndexReader reader = DirectoryReader.open(w)) { + log.info("Applying filter to the KNN search"); + final Query filterQuery = new TermQuery(new Term("filter_field", "even")); + final IndexSearcher searcher = newSearcher(reader); + KnnFloatVectorQuery knnFloatVectorQuery = + getJVectorKnnFloatVectorQuery("test_field", target, k, filterQuery); + TopDocs topDocs = searcher.search(knnFloatVectorQuery, k); + + log.info("Validating filtered KNN results"); + assertEquals(k, topDocs.totalHits.value()); + assertEquals(9, topDocs.scoreDocs[0].doc); + Assert.assertEquals( + VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] {0.0f, 1.0f / 10.0f}), + topDocs.scoreDocs[0].score, + 0.001f); + assertEquals(7, topDocs.scoreDocs[1].doc); + Assert.assertEquals( + VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] {0.0f, 1.0f / 8.0f}), + topDocs.scoreDocs[1].score, + 0.001f); + assertEquals(5, topDocs.scoreDocs[2].doc); + Assert.assertEquals( + VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] {0.0f, 1.0f / 6.0f}), + topDocs.scoreDocs[2].score, + 0.001f); + log.info("successfully completed filtered search tests"); + } } - - /** - * Test the non-ideal case where batch sizes are not perfect and are lower than the number of recommended clusters in the index - * The expected behavior is for the quantization to only kick in when we have a merge or batch size that is bigger than the minimal required batch size - * Also this is adding the compound file to the mix - */ - @Test - public void testJVectorKnnIndex_mixedBatchSizes_withQuantization_multipleMerges_withCompoundFile() throws IOException { - final int k = 50; // The number of nearest neighbours to gather, we set a high number here to avoid an inaccurate result and - // jittery tests - final int dimension = 16; - final int notIdealBatchSize = DEFAULT_MINIMUM_BATCH_SIZE_FOR_QUANTIZATION / 3; // Batch size that is not ideal for quantization and - // shouldn't trigger it - final int totalNumberOfDocs = notIdealBatchSize * 10; // 3 batches of documents each will result in quantization only when the merge - // is triggered, and we have a batch size of {@link MINIMUM_BATCH_SIZE_FOR_QUANTIZATION} - // as a result of merging all the smaller batches - final VectorSimilarityFunction vectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN; - - boolean useCompoundFile = true; - IndexWriterConfig indexWriterConfig = 
LuceneTestCase.newIndexWriterConfig(); - indexWriterConfig.setUseCompoundFile(useCompoundFile); - indexWriterConfig.setCodec(getCodec(DEFAULT_MINIMUM_BATCH_SIZE_FOR_QUANTIZATION)); - indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy(useCompoundFile)); - // We set the below parameters to make sure no premature flush will occur, this way we can have a single segment, and we can force - // test the quantization case - indexWriterConfig.setMaxBufferedDocs(10000); // force flush every 10000 docs, this way we make sure that we only have a single - // segment for a totalNumberOfDocs < 1000 - indexWriterConfig.setRAMPerThreadHardLimitMB(1000); // 1000MB per thread, this way we make sure that no premature flush will occur - final Path indexPath = createTempDir(); - log.info("Index path: {}", indexPath); - try (FSDirectory dir = FSDirectory.open(indexPath); IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { - final float[] target = generateZerosVectorWithLastValue(dimension, 0); - // We will use random vectors because otherwise PQ will have a correlated subspaces which will result in a broken linear graph - final float[][] vectors = TestUtils.generateRandomVectors(totalNumberOfDocs, dimension); - final Set groundTruthVectorsIds = calculateGroundTruthVectorsIds(target, vectors, k, vectorSimilarityFunction); - for (int i = 0; i < totalNumberOfDocs; i++) { - final float[] source = vectors[i]; - final Document doc = new Document(); - doc.add(new KnnFloatVectorField(TEST_FIELD, source, vectorSimilarityFunction)); - doc.add(new IntField(TEST_ID_FIELD, i, Field.Store.YES)); - w.addDocument(doc); - if (i % notIdealBatchSize == 0) { - w.commit(); - } - } - w.commit(); - log.info("Flushing docs to make them discoverable on the file system"); - w.forceMerge(1); - - try (IndexReader reader = DirectoryReader.open(w)) { - log.info("We should now have a single segment with {} documents", totalNumberOfDocs); - Assert.assertEquals(1, reader.getContext().leaves().size()); - Assert.assertEquals(totalNumberOfDocs, reader.numDocs()); - - final Query filterQuery = new MatchAllDocsQuery(); - final IndexSearcher searcher = newSearcher(reader); - KnnFloatVectorQuery knnFloatVectorQuery = getJVectorKnnFloatVectorQuery("test_field", target, k, filterQuery, 1000); - TopDocs topDocs = searcher.search(knnFloatVectorQuery, k); - assertEquals(k, topDocs.totalHits.value()); - final float recall = calculateRecall(reader, groundTruthVectorsIds, topDocs, k); - Assert.assertEquals("Expected to have recall of 1.0+/-0.05 but got " + recall, 1.0f, recall, 0.05f); - log.info("successfully completed search tests"); - } - } - - Assert.assertTrue("No quantization time recorded", KNNCounter.KNN_QUANTIZATION_TRAINING_TIME.getCount() > 0); - Assert.assertTrue("No graph merge time recorded", KNNCounter.KNN_GRAPH_MERGE_TIME.getCount() > 0); + } + + /** + * Test the simple case of quantization where we have the perfect batch single batch size with no + * merges or too small batch sizes + */ + @Test + public void testJVectorKnnIndex_simpleCase_withQuantization() throws IOException { + int k = 50; // The number of nearest neighbours to gather + int dimension = 16; + int totalNumberOfDocs = DEFAULT_MINIMUM_BATCH_SIZE_FOR_QUANTIZATION; + final VectorSimilarityFunction vectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN; + + IndexWriterConfig indexWriterConfig = LuceneTestCase.newIndexWriterConfig(); + indexWriterConfig.setUseCompoundFile(false); + 
indexWriterConfig.setCodec(getCodec(DEFAULT_MINIMUM_BATCH_SIZE_FOR_QUANTIZATION)); + indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy()); + // We set the below parameters to make sure no permature flush will occur, this way we can have + // a single segment, and we can force + // test the quantization case + indexWriterConfig.setMaxBufferedDocs( + 10000); // force flush every 10000 docs, this way we make sure that we only have a single + // segment for a totalNumberOfDocs < 1000 + indexWriterConfig.setRAMPerThreadHardLimitMB( + 1000); // 1000MB per thread, this way we make sure that no premature flush will occur + final Path indexPath = createTempDir(); + log.info("Index path: {}", indexPath); + try (FSDirectory dir = FSDirectory.open(indexPath); + IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { + final float[] target = generateZerosVectorWithLastValue(dimension, 0); + final float[][] vectors = TestUtils.generateRandomVectors(totalNumberOfDocs, dimension); + final Set groundTruthVectorsIds = + calculateGroundTruthVectorsIds(target, vectors, k, vectorSimilarityFunction); + for (int i = 0; i < vectors.length; i++) { + final Document doc = new Document(); + doc.add(new KnnFloatVectorField(TEST_FIELD, vectors[i], vectorSimilarityFunction)); + doc.add(new IntField(TEST_ID_FIELD, i, Field.Store.YES)); + w.addDocument(doc); + } + log.info("Flushing docs to make them discoverable on the file system"); + w.commit(); + + try (IndexReader reader = DirectoryReader.open(w)) { + log.info("We should now have a single segment with {} documents", totalNumberOfDocs); + Assert.assertEquals(1, reader.getContext().leaves().size()); + Assert.assertEquals(totalNumberOfDocs, reader.numDocs()); + + final Query filterQuery = new MatchAllDocsQuery(); + final IndexSearcher searcher = newSearcher(reader); + KnnFloatVectorQuery knnFloatVectorQuery = + getJVectorKnnFloatVectorQuery(TEST_FIELD, target, k, filterQuery); + TopDocs topDocs = searcher.search(knnFloatVectorQuery, k); + assertEquals(k, topDocs.totalHits.value()); + final float recall = calculateRecall(reader, groundTruthVectorsIds, topDocs, k); + Assert.assertEquals(1.0f, recall, 0.05f); + log.info("successfully completed search tests"); + } } - - /** - * We will use multiple batches, each can trigger a quantization and later merge them in an appending order to keep track - * of refinement - * @throws IOException - */ - @Test - public void testJVectorKnnIndex_withQuantization_withCompoundFile_with_refinement() throws IOException { - final int k = 50; // The number of nearest neighbours to gather, we set a high number here to avoid an inaccurate result and - // jittery tests - final int dimension = 16; - final int idealBatchSize = DEFAULT_MINIMUM_BATCH_SIZE_FOR_QUANTIZATION; // Batch size that is not ideal for quantization and - // shouldn't trigger it - final int totalNumberOfDocs = idealBatchSize * 10; // 10 batches, each batch on it's own will trigger quantization - final VectorSimilarityFunction vectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN; - - boolean useCompoundFile = true; - IndexWriterConfig indexWriterConfig = LuceneTestCase.newIndexWriterConfig(); - indexWriterConfig.setUseCompoundFile(useCompoundFile); - indexWriterConfig.setCodec(getCodec(DEFAULT_MINIMUM_BATCH_SIZE_FOR_QUANTIZATION)); - indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy(useCompoundFile)); - // We set the below parameters to make sure no premature flush will occur, this way we can have a single segment, and we can force - // 
test the quantization case - indexWriterConfig.setMaxBufferedDocs(10000); // force flush every 10000 docs, this way we make sure that we only have a single - // segment for a totalNumberOfDocs < 1000 - indexWriterConfig.setRAMPerThreadHardLimitMB(1000); // 1000MB per thread, this way we make sure that no premature flush will occur - final Path indexPath = createTempDir(); - log.info("Index path: {}", indexPath); - try (FSDirectory dir = FSDirectory.open(indexPath); IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { - final float[] target = generateZerosVectorWithLastValue(dimension, 0); - // We will use random vectors because otherwise PQ will have a correlated subspaces which will result in a broken linear graph - final float[][] vectors = TestUtils.generateRandomVectors(totalNumberOfDocs, dimension); - final Set groundTruthVectorsIds = calculateGroundTruthVectorsIds(target, vectors, k, vectorSimilarityFunction); - for (int i = 0; i < totalNumberOfDocs; i++) { - final float[] source = vectors[i]; - final Document doc = new Document(); - doc.add(new IntField(TEST_ID_FIELD, i, Field.Store.YES)); - doc.add(new KnnFloatVectorField(TEST_FIELD, source, vectorSimilarityFunction)); - w.addDocument(doc); - if (i % idealBatchSize == 0) { - final long beforeTrainingTime = KNNCounter.KNN_QUANTIZATION_TRAINING_TIME.getCount(); - w.commit(); - w.forceMerge(1); // force merge will trigger PQ refinement if other segments are present - final long afterTrainingTime = KNNCounter.KNN_QUANTIZATION_TRAINING_TIME.getCount(); - Assert.assertTrue( - "Expected to have a training time of at least " + beforeTrainingTime + " but got " + afterTrainingTime, - afterTrainingTime >= beforeTrainingTime - ); - } - } - w.commit(); - log.info("Flushing docs to make them discoverable on the file system"); - w.forceMerge(1); - - try (IndexReader reader = DirectoryReader.open(w)) { - log.info("We should now have a single segment with {} documents", totalNumberOfDocs); - Assert.assertEquals(1, reader.getContext().leaves().size()); - Assert.assertEquals(totalNumberOfDocs, reader.numDocs()); - - final Query filterQuery = new MatchAllDocsQuery(); - final IndexSearcher searcher = newSearcher(reader); - KnnFloatVectorQuery knnFloatVectorQuery = getJVectorKnnFloatVectorQuery("test_field", target, k, filterQuery, 1000); - TopDocs topDocs = searcher.search(knnFloatVectorQuery, k); - assertEquals(k, topDocs.totalHits.value()); - final float recall = calculateRecall(reader, groundTruthVectorsIds, topDocs, k); - Assert.assertEquals("Expected to have recall of 1.0+/-0.05 but got " + recall, 1.0f, recall, 0.05f); - log.info("successfully completed search tests"); - } + } + + /** Test recall with different types of rerank parameters */ + @Test + public void testJVectorKnnIndex_simpleCase_withQuantization_rerank() throws IOException { + int k = 1; // The number of nearest neighbours to gather + int dimension = 16; + int totalNumberOfDocs = DEFAULT_MINIMUM_BATCH_SIZE_FOR_QUANTIZATION; + IndexWriterConfig indexWriterConfig = LuceneTestCase.newIndexWriterConfig(); + indexWriterConfig.setUseCompoundFile(false); + indexWriterConfig.setCodec(getCodec(DEFAULT_MINIMUM_BATCH_SIZE_FOR_QUANTIZATION)); + indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy()); + // We set the below parameters to make sure no permature flush will occur, this way we can have + // a single segment, and we can force + // test the quantization case + indexWriterConfig.setMaxBufferedDocs( + 10000); // force flush every 10000 docs, this way we make sure that we 
only have a single + // segment for a totalNumberOfDocs < 1000 + indexWriterConfig.setRAMPerThreadHardLimitMB( + 1000); // 1000MB per thread, this way we make sure that no premature flush will occur + final Path indexPath = createTempDir(); + log.info("Index path: {}", indexPath); + try (FSDirectory dir = FSDirectory.open(indexPath); + IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { + final float[] target = generateZerosVectorWithLastValue(dimension, 0); + for (int i = 1; i < totalNumberOfDocs + 1; i++) { + final float[] source = generateZerosVectorWithLastValue(dimension, i); + final Document doc = new Document(); + doc.add(new KnnFloatVectorField("test_field", source, VectorSimilarityFunction.EUCLIDEAN)); + w.addDocument(doc); + } + log.info("Flushing docs to make them discoverable on the file system"); + w.commit(); + + try (IndexReader reader = DirectoryReader.open(w)) { + log.info("We should now have a single segment with {} documents", totalNumberOfDocs); + Assert.assertEquals(1, reader.getContext().leaves().size()); + Assert.assertEquals(totalNumberOfDocs, reader.numDocs()); + + final Query filterQuery = new MatchAllDocsQuery(); + final IndexSearcher searcher = newSearcher(reader); + float expectedMinScoreInTopK = + VectorSimilarityFunction.EUCLIDEAN.compare( + target, + new float[] { + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, k + }); + + // Query with essentially no reranking and expect recall to be very low + KnnFloatVectorQuery knnFloatVectorQuery = + getJVectorKnnFloatVectorQuery("test_field", target, k, filterQuery, 1); + TopDocs topDocs = searcher.search(knnFloatVectorQuery, k); + assertEquals(k, topDocs.totalHits.value()); + + final float recallWithLowOverqueryFactor = calculateRecall(topDocs, expectedMinScoreInTopK); + + // Query with reranking and expect recall to be high + knnFloatVectorQuery = + getJVectorKnnFloatVectorQuery("test_field", target, k, filterQuery, 5); + topDocs = searcher.search(knnFloatVectorQuery, k); + assertEquals(k, topDocs.totalHits.value()); + float recallWithHighOverqueryFactor = calculateRecall(topDocs, expectedMinScoreInTopK); + Assert.assertTrue(recallWithLowOverqueryFactor <= recallWithHighOverqueryFactor); + + log.info("successfully completed search tests"); + } + } + } + + /** + * Test the simple case of quantization where we have the perfect batch single batch size each + * time with a merge of multiple segments + */ + @Test + public void testJVectorKnnIndex_happyCase_withQuantization_multipleSegments() throws IOException { + final int dimension = 16; + final VectorSimilarityFunction vectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN; + final int k = + 50; // The number of nearest neighbours to gather, we set a high number here to avoid an + // inaccurate result and + // jittery tests + final int perfectBatchSize = + DEFAULT_MINIMUM_BATCH_SIZE_FOR_QUANTIZATION; // MINIMUM_BATCH_SIZE_FOR_QUANTIZATION is the + // minimal + // batch size that will trigger a quantization without + // breaking it, generally speaking the batch size can't be + // lower than the number of clusters + final int totalNumberOfDocs = perfectBatchSize * 2; + + IndexWriterConfig indexWriterConfig = LuceneTestCase.newIndexWriterConfig(); + indexWriterConfig.setUseCompoundFile(false); + indexWriterConfig.setCodec(getCodec(DEFAULT_MINIMUM_BATCH_SIZE_FOR_QUANTIZATION)); + indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy()); + // We set the below parameters to make sure no permature flush 
will occur, this way we can have + // a single segment, and we can force + // test the quantization case + indexWriterConfig.setMaxBufferedDocs( + 10000); // force flush every 10000 docs, this way we make sure that we only have a single + // segment for a totalNumberOfDocs < 1000 + indexWriterConfig.setRAMPerThreadHardLimitMB( + 1000); // 1000MB per thread, this way we make sure that no premature flush will occur + final Path indexPath = createTempDir(); + log.info("Index path: {}", indexPath); + try (FSDirectory dir = FSDirectory.open(indexPath); + IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { + final float[] target = generateZerosVectorWithLastValue(dimension, 0); + final float[][] vectors = TestUtils.generateRandomVectors(totalNumberOfDocs, dimension); + final Set groundTruthVectorsIds = + calculateGroundTruthVectorsIds(target, vectors, k, vectorSimilarityFunction); + + for (int i = 0; i < vectors.length; i++) { + final Document doc = new Document(); + doc.add(new KnnFloatVectorField(TEST_FIELD, vectors[i], vectorSimilarityFunction)); + doc.add(new IntField(TEST_ID_FIELD, i, Field.Store.YES)); + w.addDocument(doc); + if (i % perfectBatchSize == 0) { + w.commit(); } - - Assert.assertTrue("No graph merge time recorded", KNNCounter.KNN_GRAPH_MERGE_TIME.getCount() > 0); + } + log.info("Flushing docs to make them discoverable on the file system"); + w.forceMerge(1); + + try (IndexReader reader = DirectoryReader.open(w)) { + log.info("We should now have a single segment with {} documents", totalNumberOfDocs); + Assert.assertEquals(1, reader.getContext().leaves().size()); + Assert.assertEquals(totalNumberOfDocs, reader.numDocs()); + + final Query filterQuery = new MatchAllDocsQuery(); + final IndexSearcher searcher = newSearcher(reader); + KnnFloatVectorQuery knnFloatVectorQuery = + getJVectorKnnFloatVectorQuery("test_field", target, k, filterQuery); + TopDocs topDocs = searcher.search(knnFloatVectorQuery, k); + assertEquals(k, topDocs.totalHits.value()); + final float recall = calculateRecall(reader, groundTruthVectorsIds, topDocs, k); + Assert.assertEquals(1.0f, recall, 0.05f); + log.info("successfully completed search tests"); + } } - - /** - * Calculate the recall for the top k documents - * For simplicity we assume that all documents have unique scores and therefore the minimum score in the top k documents is the kth document - * @param topDocs the top documents returned by the search - * @param minScoreInTopK the minimum score in the top k documents - * @return the recall of the top k documents - */ - private float calculateRecall(TopDocs topDocs, float minScoreInTopK) { - int totalRelevantDocs = 0; - for (int i = 0; i < topDocs.scoreDocs.length; i++) { - if (topDocs.scoreDocs[i].score >= minScoreInTopK) { - totalRelevantDocs++; - } + } + + /** + * Test the non-ideal case where batch sizes are not perfect and are lower than the number of + * recommended clusters in the index The expected behavior is for the quantization to only kick in + * when we have a merge or batch size that is bigger than the minimal required batch size + */ + @Test + public void testJVectorKnnIndex_mixedBatchSizes_withQuantization_multipleMerges() + throws IOException { + final int k = + 50; // The number of nearest neighbours to gather, we set a high number here to avoid an + // inaccurate result and + // jittery tests + final int dimension = 16; + final int notIdealBatchSize = + DEFAULT_MINIMUM_BATCH_SIZE_FOR_QUANTIZATION + / 3; // Batch size that is not ideal for quantization and + // shouldn't 
trigger it + final int totalNumberOfDocs = + notIdealBatchSize + * 3; // 3 batches of documents each will result in quantization only when the merge + // is triggered, and we have a batch size of {@link + // MINIMUM_BATCH_SIZE_FOR_QUANTIZATION} as a result of merging all the smaller + // batches + final VectorSimilarityFunction vectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN; + IndexWriterConfig indexWriterConfig = LuceneTestCase.newIndexWriterConfig(); + indexWriterConfig.setUseCompoundFile(false); + indexWriterConfig.setCodec(getCodec(DEFAULT_MINIMUM_BATCH_SIZE_FOR_QUANTIZATION)); + indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy()); + // We set the below parameters to make sure no permature flush will occur, this way we can have + // a single segment, and we can force + // test the quantization case + indexWriterConfig.setMaxBufferedDocs( + 10000); // force flush every 10000 docs, this way we make sure that we only have a single + // segment for a totalNumberOfDocs < 1000 + indexWriterConfig.setRAMPerThreadHardLimitMB( + 1000); // 1000MB per thread, this way we make sure that no premature flush will occur + final Path indexPath = createTempDir(); + log.info("Index path: {}", indexPath); + try (FSDirectory dir = FSDirectory.open(indexPath); + IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { + final float[] target = generateZerosVectorWithLastValue(dimension, 0); + final float[][] vectors = TestUtils.generateRandomVectors(totalNumberOfDocs, dimension); + final Set groundTruthVectorsIds = + calculateGroundTruthVectorsIds(target, vectors, k, vectorSimilarityFunction); + for (int i = 0; i < totalNumberOfDocs; i++) { + final float[] source = vectors[i]; + final Document doc = new Document(); + doc.add(new KnnFloatVectorField("test_field", source, vectorSimilarityFunction)); + doc.add(new IntField(TEST_ID_FIELD, i, Field.Store.YES)); + w.addDocument(doc); + if (i % notIdealBatchSize == 0) { + w.commit(); } - float recall = ((float) totalRelevantDocs) / ((float) topDocs.scoreDocs.length); - - if (recall == 0.0f) { - log.info( - "Recall is 0.0, this is probably not correct, here is some debug information\n topDocs: {}, minScoreInTopK: {}, totalRelevantDocs: {}", - topDocsToString(topDocs), - minScoreInTopK, - totalRelevantDocs - ); + } + log.info("Flushing docs to make them discoverable on the file system"); + w.forceMerge(1); + + try (IndexReader reader = DirectoryReader.open(w)) { + log.info("We should now have a single segment with {} documents", totalNumberOfDocs); + Assert.assertEquals(1, reader.getContext().leaves().size()); + Assert.assertEquals(totalNumberOfDocs, reader.numDocs()); + + final Query filterQuery = new MatchAllDocsQuery(); + final IndexSearcher searcher = newSearcher(reader); + KnnFloatVectorQuery knnFloatVectorQuery = + getJVectorKnnFloatVectorQuery("test_field", target, k, filterQuery); + TopDocs topDocs = searcher.search(knnFloatVectorQuery, k); + assertEquals(k, topDocs.totalHits.value()); + final float recall = calculateRecall(reader, groundTruthVectorsIds, topDocs, k); + Assert.assertEquals(1.0f, recall, 0.05f); + log.info("successfully completed search tests"); + } + } + } + + /** + * Test the non-ideal case where batch sizes are not perfect and are lower than the number of + * recommended clusters in the index The expected behavior is for the quantization to only kick in + * when we have a merge or batch size that is bigger than the minimal required batch size Also + * this is adding the compound file to the mix + */ + @Test + 
public void testJVectorKnnIndex_mixedBatchSizes_withQuantization_multipleMerges_withCompoundFile() + throws IOException { + final int k = + 50; // The number of nearest neighbours to gather, we set a high number here to avoid an + // inaccurate result and + // jittery tests + final int dimension = 16; + final int notIdealBatchSize = + DEFAULT_MINIMUM_BATCH_SIZE_FOR_QUANTIZATION + / 3; // Batch size that is not ideal for quantization and + // shouldn't trigger it + final int totalNumberOfDocs = + notIdealBatchSize + * 10; // 3 batches of documents each will result in quantization only when the merge + // is triggered, and we have a batch size of {@link MINIMUM_BATCH_SIZE_FOR_QUANTIZATION} + // as a result of merging all the smaller batches + final VectorSimilarityFunction vectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN; + + boolean useCompoundFile = true; + IndexWriterConfig indexWriterConfig = LuceneTestCase.newIndexWriterConfig(); + indexWriterConfig.setUseCompoundFile(useCompoundFile); + indexWriterConfig.setCodec(getCodec(DEFAULT_MINIMUM_BATCH_SIZE_FOR_QUANTIZATION)); + indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy(useCompoundFile)); + // We set the below parameters to make sure no premature flush will occur, this way we can have + // a single segment, and we can force + // test the quantization case + indexWriterConfig.setMaxBufferedDocs( + 10000); // force flush every 10000 docs, this way we make sure that we only have a single + // segment for a totalNumberOfDocs < 1000 + indexWriterConfig.setRAMPerThreadHardLimitMB( + 1000); // 1000MB per thread, this way we make sure that no premature flush will occur + final Path indexPath = createTempDir(); + log.info("Index path: {}", indexPath); + try (FSDirectory dir = FSDirectory.open(indexPath); + IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { + final float[] target = generateZerosVectorWithLastValue(dimension, 0); + // We will use random vectors because otherwise PQ will have a correlated subspaces which will + // result in a broken linear graph + final float[][] vectors = TestUtils.generateRandomVectors(totalNumberOfDocs, dimension); + final Set groundTruthVectorsIds = + calculateGroundTruthVectorsIds(target, vectors, k, vectorSimilarityFunction); + for (int i = 0; i < totalNumberOfDocs; i++) { + final float[] source = vectors[i]; + final Document doc = new Document(); + doc.add(new KnnFloatVectorField(TEST_FIELD, source, vectorSimilarityFunction)); + doc.add(new IntField(TEST_ID_FIELD, i, Field.Store.YES)); + w.addDocument(doc); + if (i % notIdealBatchSize == 0) { + w.commit(); } - return recall; + } + w.commit(); + log.info("Flushing docs to make them discoverable on the file system"); + w.forceMerge(1); + + try (IndexReader reader = DirectoryReader.open(w)) { + log.info("We should now have a single segment with {} documents", totalNumberOfDocs); + Assert.assertEquals(1, reader.getContext().leaves().size()); + Assert.assertEquals(totalNumberOfDocs, reader.numDocs()); + + final Query filterQuery = new MatchAllDocsQuery(); + final IndexSearcher searcher = newSearcher(reader); + KnnFloatVectorQuery knnFloatVectorQuery = + getJVectorKnnFloatVectorQuery("test_field", target, k, filterQuery, 1000); + TopDocs topDocs = searcher.search(knnFloatVectorQuery, k); + assertEquals(k, topDocs.totalHits.value()); + final float recall = calculateRecall(reader, groundTruthVectorsIds, topDocs, k); + Assert.assertEquals( + "Expected to have recall of 1.0+/-0.05 but got " + recall, 1.0f, recall, 0.05f); + 
log.info("successfully completed search tests"); + } } - // convert topDocs to a pretty printed string - private String topDocsToString(TopDocs topDocs) { - StringBuilder sb = new StringBuilder(); - sb.append("TopDocs: ["); - for (int i = 0; i < topDocs.scoreDocs.length; i++) { - sb.append(topDocs.scoreDocs[i].doc).append(" (").append(topDocs.scoreDocs[i].score).append("), "); + Assert.assertTrue( + "No quantization time recorded", KNNCounter.KNN_QUANTIZATION_TRAINING_TIME.getCount() > 0); + Assert.assertTrue( + "No graph merge time recorded", KNNCounter.KNN_GRAPH_MERGE_TIME.getCount() > 0); + } + + /** + * We will use multiple batches, each can trigger a quantization and later merge them in an + * appending order to keep track of refinement + */ + @Test + public void testJVectorKnnIndex_withQuantization_withCompoundFile_with_refinement() + throws IOException { + final int k = + 50; // The number of nearest neighbours to gather, we set a high number here to avoid an + // inaccurate result and + // jittery tests + final int dimension = 16; + final int idealBatchSize = + DEFAULT_MINIMUM_BATCH_SIZE_FOR_QUANTIZATION; // Batch size that is not ideal for + // quantization and + // shouldn't trigger it + final int totalNumberOfDocs = + idealBatchSize * 10; // 10 batches, each batch on it's own will trigger quantization + final VectorSimilarityFunction vectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN; + + boolean useCompoundFile = true; + IndexWriterConfig indexWriterConfig = LuceneTestCase.newIndexWriterConfig(); + indexWriterConfig.setUseCompoundFile(useCompoundFile); + indexWriterConfig.setCodec(getCodec(DEFAULT_MINIMUM_BATCH_SIZE_FOR_QUANTIZATION)); + indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy(useCompoundFile)); + // We set the below parameters to make sure no premature flush will occur, this way we can have + // a single segment, and we can force + // test the quantization case + indexWriterConfig.setMaxBufferedDocs( + 10000); // force flush every 10000 docs, this way we make sure that we only have a single + // segment for a totalNumberOfDocs < 1000 + indexWriterConfig.setRAMPerThreadHardLimitMB( + 1000); // 1000MB per thread, this way we make sure that no premature flush will occur + final Path indexPath = createTempDir(); + log.info("Index path: {}", indexPath); + try (FSDirectory dir = FSDirectory.open(indexPath); + IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { + final float[] target = generateZerosVectorWithLastValue(dimension, 0); + // We will use random vectors because otherwise PQ will have a correlated subspaces which will + // result in a broken linear graph + final float[][] vectors = TestUtils.generateRandomVectors(totalNumberOfDocs, dimension); + final Set groundTruthVectorsIds = + calculateGroundTruthVectorsIds(target, vectors, k, vectorSimilarityFunction); + for (int i = 0; i < totalNumberOfDocs; i++) { + final float[] source = vectors[i]; + final Document doc = new Document(); + doc.add(new IntField(TEST_ID_FIELD, i, Field.Store.YES)); + doc.add(new KnnFloatVectorField(TEST_FIELD, source, vectorSimilarityFunction)); + w.addDocument(doc); + if (i % idealBatchSize == 0) { + final long beforeTrainingTime = KNNCounter.KNN_QUANTIZATION_TRAINING_TIME.getCount(); + w.commit(); + w.forceMerge(1); // force merge will trigger PQ refinement if other segments are present + final long afterTrainingTime = KNNCounter.KNN_QUANTIZATION_TRAINING_TIME.getCount(); + Assert.assertTrue( + "Expected to have a training time of at least " + + 
beforeTrainingTime + + " but got " + + afterTrainingTime, + afterTrainingTime >= beforeTrainingTime); } - sb.append("]"); - return sb.toString(); + } + w.commit(); + log.info("Flushing docs to make them discoverable on the file system"); + w.forceMerge(1); + + try (IndexReader reader = DirectoryReader.open(w)) { + log.info("We should now have a single segment with {} documents", totalNumberOfDocs); + Assert.assertEquals(1, reader.getContext().leaves().size()); + Assert.assertEquals(totalNumberOfDocs, reader.numDocs()); + + final Query filterQuery = new MatchAllDocsQuery(); + final IndexSearcher searcher = newSearcher(reader); + KnnFloatVectorQuery knnFloatVectorQuery = + getJVectorKnnFloatVectorQuery("test_field", target, k, filterQuery, 1000); + TopDocs topDocs = searcher.search(knnFloatVectorQuery, k); + assertEquals(k, topDocs.totalHits.value()); + final float recall = calculateRecall(reader, groundTruthVectorsIds, topDocs, k); + Assert.assertEquals( + "Expected to have recall of 1.0+/-0.05 but got " + recall, 1.0f, recall, 0.05f); + log.info("successfully completed search tests"); + } } - private JVectorKnnFloatVectorQuery getJVectorKnnFloatVectorQuery(String fieldName, float[] target, int k, Query filterQuery) { - return getJVectorKnnFloatVectorQuery(fieldName, target, k, filterQuery, KNNConstants.DEFAULT_OVER_QUERY_FACTOR); + Assert.assertTrue( + "No graph merge time recorded", KNNCounter.KNN_GRAPH_MERGE_TIME.getCount() > 0); + } + + /** + * Calculate the recall for the top k documents For simplicity we assume that all documents have + * unique scores and therefore the minimum score in the top k documents is the kth document + * + * @param topDocs the top documents returned by the search + * @param minScoreInTopK the minimum score in the top k documents + * @return the recall of the top k documents + */ + private float calculateRecall(TopDocs topDocs, float minScoreInTopK) { + int totalRelevantDocs = 0; + for (int i = 0; i < topDocs.scoreDocs.length; i++) { + if (topDocs.scoreDocs[i].score >= minScoreInTopK) { + totalRelevantDocs++; + } } - - private JVectorKnnFloatVectorQuery getJVectorKnnFloatVectorQuery( - String fieldName, - float[] target, - int k, - Query filterQuery, - int overQueryFactor - ) { - return new JVectorKnnFloatVectorQuery( - fieldName, - target, - k, - filterQuery, - overQueryFactor, - KNNConstants.DEFAULT_QUERY_SIMILARITY_THRESHOLD.floatValue(), - KNNConstants.DEFAULT_QUERY_RERANK_FLOOR.floatValue(), - KNNConstants.DEFAULT_QUERY_USE_PRUNING - ); + float recall = ((float) totalRelevantDocs) / ((float) topDocs.scoreDocs.length); + + if (recall == 0.0f) { + log.info( + "Recall is 0.0, this is probably not correct, here is some debug information\n topDocs: {}, minScoreInTopK: {}, totalRelevantDocs: {}", + topDocsToString(topDocs), + minScoreInTopK, + totalRelevantDocs); + } + return recall; + } + + // convert topDocs to a pretty printed string + private String topDocsToString(TopDocs topDocs) { + StringBuilder sb = new StringBuilder(); + sb.append("TopDocs: ["); + for (int i = 0; i < topDocs.scoreDocs.length; i++) { + sb.append(topDocs.scoreDocs[i].doc) + .append(" (") + .append(topDocs.scoreDocs[i].score) + .append("), "); + } + sb.append("]"); + return sb.toString(); + } + + private JVectorKnnFloatVectorQuery getJVectorKnnFloatVectorQuery( + String fieldName, float[] target, int k, Query filterQuery) { + return getJVectorKnnFloatVectorQuery( + fieldName, target, k, filterQuery, KNNConstants.DEFAULT_OVER_QUERY_FACTOR); + } + + private JVectorKnnFloatVectorQuery 
getJVectorKnnFloatVectorQuery( + String fieldName, float[] target, int k, Query filterQuery, int overQueryFactor) { + return new JVectorKnnFloatVectorQuery( + fieldName, + target, + k, + filterQuery, + overQueryFactor, + KNNConstants.DEFAULT_QUERY_SIMILARITY_THRESHOLD.floatValue(), + KNNConstants.DEFAULT_QUERY_RERANK_FLOOR.floatValue(), + KNNConstants.DEFAULT_QUERY_USE_PRUNING); + } + + private static float[][] getMonotonicallyIncreasingVectors(int numVectors, int vectorDimension) { + float[][] vectors = new float[numVectors][vectorDimension]; + for (int i = 0; i < numVectors; i++) { + vectors[i] = generateZerosVectorWithLastValue(vectorDimension, i); } - private static float[][] getMonotonicallyIncreasingVectors(int numVectors, int vectorDimension) { - float[][] vectors = new float[numVectors][vectorDimension]; - for (int i = 0; i < numVectors; i++) { - vectors[i] = generateZerosVectorWithLastValue(vectorDimension, i); - } + return vectors; + } - return vectors; + private static float[] generateZerosVectorWithLastValue(int vectorDimension, int lastValue) { + float[] vector = new float[vectorDimension]; + for (int i = 0; i < vectorDimension - 1; i++) { + vector[i] = 0; } - - private static float[] generateZerosVectorWithLastValue(int vectorDimension, int lastValue) { - float[] vector = new float[vectorDimension]; - for (int i = 0; i < vectorDimension - 1; i++) { - vector[i] = 0; - } - vector[vectorDimension - 1] = lastValue; - return vector; + vector[vectorDimension - 1] = lastValue; + return vector; + } + + private static float calculateRecall( + IndexReader reader, Set groundTruthVectorsIds, TopDocs topDocs, int k) + throws IOException { + final ScoreDoc[] scoreDocs = topDocs.scoreDocs; + Assert.assertEquals(groundTruthVectorsIds.size(), scoreDocs.length); + int totalRelevantDocs = 0; + for (ScoreDoc scoreDoc : scoreDocs) { + final int id = + reader + .storedFields() + .document(scoreDoc.doc) + .getField(TEST_ID_FIELD) + .storedValue() + .getIntValue(); + if (groundTruthVectorsIds.contains(id)) { + totalRelevantDocs++; + } } - - private static float calculateRecall(IndexReader reader, Set groundTruthVectorsIds, TopDocs topDocs, int k) - throws IOException { - final ScoreDoc[] scoreDocs = topDocs.scoreDocs; - Assert.assertEquals(groundTruthVectorsIds.size(), scoreDocs.length); - int totalRelevantDocs = 0; - for (ScoreDoc scoreDoc : scoreDocs) { - final int id = reader.storedFields().document(scoreDoc.doc).getField(TEST_ID_FIELD).storedValue().getIntValue(); - if (groundTruthVectorsIds.contains(id)) { - totalRelevantDocs++; - } + return ((float) totalRelevantDocs) / ((float) k); + } + + /** + * Find the IDs of the ground truth vectors in the dataset + * + * @param query query vector + * @param dataset dataset of all the vectors with their ordinal position in the array as their ID + * @param k the number of expected results + * @return the IDs of the ground truth vectors in the dataset + */ + private static Set calculateGroundTruthVectorsIds( + float[] query, + final float[][] dataset, + int k, + VectorSimilarityFunction vectorSimilarityFunction) { + final Set groundTruthVectorsIds = new HashSet<>(); + final PriorityQueue priorityQueue = + new PriorityQueue<>(k, (o1, o2) -> Float.compare(o1.score, o2.score)); + for (int i = 0; i < dataset.length; i++) { + ScoreDoc scoreDoc = new ScoreDoc(i, vectorSimilarityFunction.compare(query, dataset[i])); + if (priorityQueue.size() >= k) { + final ScoreDoc top = priorityQueue.poll(); + if (top.score < scoreDoc.score) { + priorityQueue.add(scoreDoc); + 
} else { + priorityQueue.add(top); } - return ((float) totalRelevantDocs) / ((float) k); + } else { + priorityQueue.add(scoreDoc); + } } - - /** - * Find the IDs of the ground truth vectors in the dataset - * @param query query vector - * @param dataset dataset of all the vectors with their ordinal position in the array as their ID - * @param k the number of expected results - * @return the IDs of the ground truth vectors in the dataset - */ - private static Set calculateGroundTruthVectorsIds( - float[] query, - final float[][] dataset, - int k, - VectorSimilarityFunction vectorSimilarityFunction - ) { - final Set groundTruthVectorsIds = new HashSet<>(); - final PriorityQueue priorityQueue = new PriorityQueue<>(k, (o1, o2) -> Float.compare(o1.score, o2.score)); - for (int i = 0; i < dataset.length; i++) { - ScoreDoc scoreDoc = new ScoreDoc(i, vectorSimilarityFunction.compare(query, dataset[i])); - if (priorityQueue.size() >= k) { - final ScoreDoc top = priorityQueue.poll(); - if (top.score < scoreDoc.score) { - priorityQueue.add(scoreDoc); - } else { - priorityQueue.add(top); - } - } else { - priorityQueue.add(scoreDoc); - } - } - while (!priorityQueue.isEmpty()) { - groundTruthVectorsIds.add(priorityQueue.poll().doc); - } - - return groundTruthVectorsIds; + while (!priorityQueue.isEmpty()) { + groundTruthVectorsIds.add(priorityQueue.poll().doc); } + + return groundTruthVectorsIds; + } } From 3b98cd8626100b28ea28eb0709bb90cd02fdced9 Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Wed, 29 Oct 2025 15:24:22 +0000 Subject: [PATCH 05/86] Fix package declarations --- .../sandbox/codecs/jvector/ForceMergesOnlyMergePolicy.java | 2 +- .../lucene/sandbox/codecs/jvector/GraphNodeIdToDocMap.java | 2 +- .../lucene/sandbox/codecs/jvector/JVectorFloatVectorValues.java | 2 +- .../org/apache/lucene/sandbox/codecs/jvector/JVectorFormat.java | 2 +- .../lucene/sandbox/codecs/jvector/JVectorIndexWriter.java | 2 +- .../lucene/sandbox/codecs/jvector/JVectorKnnCollector.java | 2 +- .../sandbox/codecs/jvector/JVectorKnnFloatVectorQuery.java | 2 +- .../sandbox/codecs/jvector/JVectorRandomAccessReader.java | 2 +- .../org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java | 2 +- .../lucene/sandbox/codecs/jvector/JVectorVectorScorer.java | 2 +- .../org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java | 2 +- .../apache/lucene/sandbox/codecs/jvector/KNNJVectorTests.java | 2 +- 12 files changed, 12 insertions(+), 12 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/ForceMergesOnlyMergePolicy.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/ForceMergesOnlyMergePolicy.java index d43e7e4ac80f..2e74da91c8d0 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/ForceMergesOnlyMergePolicy.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/ForceMergesOnlyMergePolicy.java @@ -15,7 +15,7 @@ * limitations under the License. 
*/ -package org.opensearch.knn.index.codec.jvector; +package org.apache.lucene.sandbox.codecs.jvector; import java.io.IOException; import java.util.List; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/GraphNodeIdToDocMap.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/GraphNodeIdToDocMap.java index ce6050088d68..97daea71c3ab 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/GraphNodeIdToDocMap.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/GraphNodeIdToDocMap.java @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.opensearch.knn.index.codec.jvector; +package org.apache.lucene.sandbox.codecs.jvector; import java.io.IOException; import java.util.Arrays; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFloatVectorValues.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFloatVectorValues.java index 5dba75410ac0..c4039c6d12b9 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFloatVectorValues.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFloatVectorValues.java @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.opensearch.knn.index.codec.jvector; +package org.apache.lucene.sandbox.codecs.jvector; import io.github.jbellis.jvector.graph.disk.OnDiskGraphIndex; import io.github.jbellis.jvector.util.Bits; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFormat.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFormat.java index a7f3a13ee865..1f6ee2b93080 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFormat.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFormat.java @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.opensearch.knn.index.codec.jvector; +package org.apache.lucene.sandbox.codecs.jvector; import java.io.IOException; import java.util.concurrent.ForkJoinPool; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorIndexWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorIndexWriter.java index 3a99635582a7..70217c1f1f25 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorIndexWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorIndexWriter.java @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.opensearch.knn.index.codec.jvector; +package org.apache.lucene.sandbox.codecs.jvector; import io.github.jbellis.jvector.disk.IndexWriter; import java.io.IOException; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorKnnCollector.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorKnnCollector.java index c5490349ef0a..8051e967e884 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorKnnCollector.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorKnnCollector.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.opensearch.knn.index.codec.jvector; +package org.apache.lucene.sandbox.codecs.jvector; import lombok.Value; import org.apache.lucene.search.KnnCollector; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorKnnFloatVectorQuery.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorKnnFloatVectorQuery.java index d2ece0b9eebc..f8903d67bde5 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorKnnFloatVectorQuery.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorKnnFloatVectorQuery.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.opensearch.knn.index.codec.jvector; +package org.apache.lucene.sandbox.codecs.jvector; import java.io.IOException; import org.apache.lucene.index.FloatVectorValues; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorRandomAccessReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorRandomAccessReader.java index 25f49a897c76..8394fa1c9ada 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorRandomAccessReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorRandomAccessReader.java @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.opensearch.knn.index.codec.jvector; +package org.apache.lucene.sandbox.codecs.jvector; import io.github.jbellis.jvector.disk.RandomAccessReader; import io.github.jbellis.jvector.disk.ReaderSupplier; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java index 95a98830ff5d..6cbd237c9b15 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.opensearch.knn.index.codec.jvector; +package org.apache.lucene.sandbox.codecs.jvector; import io.github.jbellis.jvector.disk.RandomAccessReader; import io.github.jbellis.jvector.disk.ReaderSupplier; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorVectorScorer.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorVectorScorer.java index 6b7937f51525..cc6f3e6d6bff 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorVectorScorer.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorVectorScorer.java @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.opensearch.knn.index.codec.jvector; +package org.apache.lucene.sandbox.codecs.jvector; import io.github.jbellis.jvector.vector.VectorSimilarityFunction; import io.github.jbellis.jvector.vector.types.VectorFloat; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java index 9b17c6165dfd..0911e2f0d524 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java @@ -15,7 +15,7 @@ * limitations under the License. 
*/ -package org.opensearch.knn.index.codec.jvector; +package org.apache.lucene.sandbox.codecs.jvector; import static io.github.jbellis.jvector.quantization.KMeansPlusPlusClusterer.UNWEIGHTED; import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsReader.readVectorEncoding; diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/jvector/KNNJVectorTests.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/jvector/KNNJVectorTests.java index b2f2ea075d3d..77949d7d039e 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/jvector/KNNJVectorTests.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/jvector/KNNJVectorTests.java @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.opensearch.knn.index.codec.jvector; +package org.apache.lucene.sandbox.codecs.jvector; import static org.opensearch.knn.common.KNNConstants.DEFAULT_MINIMUM_BATCH_SIZE_FOR_QUANTIZATION; import static org.opensearch.knn.index.engine.CommonTestUtils.getCodec; From 8f00f87707c4b92bc47dd0abe6779d67b87ff9af Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Wed, 29 Oct 2025 16:03:38 +0000 Subject: [PATCH 06/86] Remove logging --- .../codecs/jvector/GraphNodeIdToDocMap.java | 10 +- .../sandbox/codecs/jvector/JVectorFormat.java | 5 - .../codecs/jvector/JVectorIndexWriter.java | 2 - .../jvector/JVectorRandomAccessReader.java | 4 - .../sandbox/codecs/jvector/JVectorReader.java | 18 +-- .../sandbox/codecs/jvector/JVectorWriter.java | 132 ++-------------- .../codecs/jvector/KNNJVectorTests.java | 149 +++--------------- 7 files changed, 39 insertions(+), 281 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/GraphNodeIdToDocMap.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/GraphNodeIdToDocMap.java index 97daea71c3ab..0bd8febec442 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/GraphNodeIdToDocMap.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/GraphNodeIdToDocMap.java @@ -19,7 +19,6 @@ import java.io.IOException; import java.util.Arrays; -import lombok.extern.log4j.Log4j2; import org.apache.lucene.index.Sorter; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; @@ -33,7 +32,6 @@ * *

Which means that we also need to persist this mapping to disk to be available across merges. */ -@Log4j2 public class GraphNodeIdToDocMap { private static final int VERSION = 1; private int[] graphNodeIdsToDocIds; @@ -88,12 +86,8 @@ public GraphNodeIdToDocMap(int[] graphNodeIdsToDocIds) { + " is less than the number of ordinals " + graphNodeIdsToDocIds.length); } - if (maxDocId > graphNodeIdsToDocIds.length) { - log.warn( - "Max doc id {} is greater than the number of ordinals {}, this implies a lot of deleted documents. Or that some documents are missing vectors. Wasting a lot of memory", - maxDocId, - graphNodeIdsToDocIds.length); - } + // When maxDocId > graphNodeIdsToDocIds.length, there are lots of deleted documents or missing + // values, which wastes memory this.docIdsToGraphNodeIds = new int[maxDocs]; Arrays.fill(this.docIdsToGraphNodeIds, -1); // -1 means no mapping to ordinal for (int ord = 0; ord < graphNodeIdsToDocIds.length; ord++) { diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFormat.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFormat.java index 1f6ee2b93080..020a82835d60 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFormat.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFormat.java @@ -21,7 +21,6 @@ import java.util.concurrent.ForkJoinPool; import java.util.concurrent.ForkJoinWorkerThread; import java.util.function.Function; -import lombok.extern.log4j.Log4j2; import org.apache.lucene.codecs.KnnVectorsFormat; import org.apache.lucene.codecs.KnnVectorsReader; import org.apache.lucene.codecs.KnnVectorsWriter; @@ -29,7 +28,6 @@ import org.apache.lucene.index.SegmentWriteState; import org.opensearch.knn.common.KNNConstants; -@Log4j2 public class JVectorFormat extends KnnVectorsFormat { public static final String NAME = "JVectorFormat"; public static final String META_CODEC_NAME = "JVectorVectorsFormatMeta"; @@ -206,9 +204,6 @@ public static ForkJoinPool getPhysicalCoreExecutor() { return thread; }; - log.info( - "Creating SIMD ForkJoinPool with {} physical cores for JVector SIMD operations", - estimatedPhysicalCoreCount); return new ForkJoinPool(estimatedPhysicalCoreCount, factory, null, true); } } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorIndexWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorIndexWriter.java index 70217c1f1f25..6483d7c71393 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorIndexWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorIndexWriter.java @@ -19,14 +19,12 @@ import io.github.jbellis.jvector.disk.IndexWriter; import java.io.IOException; -import lombok.extern.log4j.Log4j2; import org.apache.lucene.store.IndexOutput; /** * JVectorRandomAccessWriter is a wrapper around IndexOutput that implements RandomAccessWriter. * Note: This is not thread safe! 
*/ -@Log4j2 public class JVectorIndexWriter implements IndexWriter { private final IndexOutput indexOutputDelegate; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorRandomAccessReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorRandomAccessReader.java index 8394fa1c9ada..97f7cec66dec 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorRandomAccessReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorRandomAccessReader.java @@ -25,11 +25,9 @@ import java.nio.FloatBuffer; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.atomic.AtomicInteger; -import lombok.extern.log4j.Log4j2; import org.apache.lucene.store.IndexInput; import org.apache.lucene.util.IOUtils; -@Log4j2 public class JVectorRandomAccessReader implements RandomAccessReader { private final byte[] internalBuffer = new byte[Long.BYTES]; private final byte[] internalFloatBuffer = new byte[Float.BYTES]; @@ -121,10 +119,8 @@ public void read(float[] floats, int offset, int count) throws IOException { @Override public void close() throws IOException { - log.debug("Closing JVectorRandomAccessReader for file: {}", indexInputDelegate); this.closed = true; // no need to really close the index input delegate since it is a clone - log.debug("Closed JVectorRandomAccessReader for file: {}", indexInputDelegate); } @Override diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java index 6cbd237c9b15..8110937aec99 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java @@ -37,7 +37,6 @@ import java.util.List; import java.util.Map; import java.util.Optional; -import lombok.extern.log4j.Log4j2; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.KnnVectorsReader; import org.apache.lucene.index.*; @@ -48,7 +47,6 @@ import org.opensearch.knn.common.KNNConstants; import org.opensearch.knn.plugin.stats.KNNCounter; -@Log4j2 public class JVectorReader extends KnnVectorsReader { private static final VectorTypeSupport VECTOR_TYPE_SUPPORT = VectorizationProvider.getInstance().getVectorTypeSupport(); @@ -146,8 +144,8 @@ public void search(String field, float[] target, KnnCollector knnCollector, Bits if (knnCollector instanceof JVectorKnnCollector) { jvectorKnnCollector = (JVectorKnnCollector) knnCollector; } else { - log.warn( - "KnnCollector must be of type JVectorKnnCollector, for now we will re-wrap it but this is not ideal"); + // KnnCollector must be of type JVectorKnnCollector, for now we will re-wrap it but this is + // not ideal jvectorKnnCollector = new JVectorKnnCollector( knnCollector, @@ -199,7 +197,6 @@ public void search(String field, float[] target, KnnCollector knnCollector, Bits } final long graphSearchEnd = System.currentTimeMillis(); final long searchTime = graphSearchEnd - graphSearchStart; - log.debug("Search (including acquiring view) took {} ms", searchTime); // Collect the below metrics about the search and somehow wire this back to {@link // @KNNStats} @@ -214,12 +211,6 @@ public void search(String field, float[] target, KnnCollector knnCollector, Bits KNNCounter.KNN_QUERY_EXPANDED_NODES.add(expandedCount); KNNCounter.KNN_QUERY_EXPANDED_BASE_LAYER_NODES.add(expandedBaseLayerCount); 
KNNCounter.KNN_QUERY_GRAPH_SEARCH_TIME.add(searchTime); - log.debug( - "rerankedCount: {}, visitedNodesCount: {}, expandedCount: {}, expandedBaseLayerCount: {}", - rerankedCount, - visitedNodesCount, - expandedCount, - expandedBaseLayerCount); } } } @@ -270,7 +261,6 @@ class FieldEntry implements Closeable { public FieldEntry( FieldInfo fieldInfo, JVectorWriter.VectorIndexFieldMetadata vectorIndexFieldMetadata) throws IOException { - this.fieldInfo = fieldInfo; this.similarityFunction = VectorSimilarityMapper.ordToDistFunc( vectorIndexFieldMetadata.getVectorSimilarityFunction().ordinal()); @@ -316,10 +306,6 @@ public FieldEntry( directory.openInput(vectorIndexFieldDataFileName, IOContext.READONCE), pqCodebooksAndVectorsOffset, pqCodebooksAndVectorsLength); - log.debug( - "Loading PQ codebooks and vectors for field {}, with numbers of vectors: {}", - fieldInfo.name, - state.segmentInfo.maxDoc()); try (final var randomAccessReader = pqCodebooksReaderSupplier.get()) { this.pqVectors = PQVectors.load(randomAccessReader); } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java index 0911e2f0d524..536f5f49517f 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java @@ -45,7 +45,6 @@ import lombok.Builder; import lombok.Getter; import lombok.Value; -import lombok.extern.log4j.Log4j2; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.KnnFieldVectorsWriter; import org.apache.lucene.codecs.KnnVectorsReader; @@ -91,7 +90,6 @@ * MergeState.DocMap} provided in the {@link MergeState}. And across sorts with {@link * GraphNodeIdToDocMap#update(Sorter.DocMap)} during flushes. */ -@Log4j2 public class JVectorWriter extends KnnVectorsWriter { private static final long SHALLOW_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(JVectorWriter.class); @@ -178,16 +176,14 @@ public JVectorWriter( @Override public KnnFieldVectorsWriter addField(FieldInfo fieldInfo) throws IOException { - log.info("Adding field {} in segment {}", fieldInfo.name, segmentWriteState.segmentInfo.name); if (fieldInfo.getVectorEncoding() == VectorEncoding.BYTE) { final String errorMessage = "byte[] vectors are not supported in JVector. " + "Instead you should only use float vectors and leverage product quantization during indexing." 
+ "This can provides much greater savings in storage and memory"; - log.error(errorMessage); throw new UnsupportedOperationException(errorMessage); } - FieldWriter newField = new FieldWriter<>(fieldInfo, segmentWriteState.segmentInfo.name); + FieldWriter newField = new FieldWriter<>(fieldInfo); fields.add(newField); return newField; @@ -195,8 +191,6 @@ public KnnFieldVectorsWriter addField(FieldInfo fieldInfo) throws IOException @Override public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOException { - log.info( - "Merging field {} into segment {}", fieldInfo.name, segmentWriteState.segmentInfo.name); try { final long mergeStart = Clock.systemDefaultZone().millis(); switch (fieldInfo.getVectorEncoding()) { @@ -210,25 +204,13 @@ public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOE final long mergeEnd = Clock.systemDefaultZone().millis(); final long mergeTime = mergeEnd - mergeStart; KNNCounter.KNN_GRAPH_MERGE_TIME.add(mergeTime); - log.info( - "Completed Merge field {} into segment {}", - fieldInfo.name, - segmentWriteState.segmentInfo.name); } catch (Exception e) { - log.error( - "Error merging field {} into segment {}", - fieldInfo.name, - segmentWriteState.segmentInfo.name, - e); throw e; } } @Override public void flush(int maxDoc, Sorter.DocMap sortMap) throws IOException { - log.info("Flushing {} fields", fields.size()); - - log.info("Flushing jVector graph index"); for (FieldWriter field : fields) { final RandomAccessVectorValues randomAccessVectorValues = field.randomAccessVectorValues; final int[] newToOldOrds = new int[randomAccessVectorValues.size()]; @@ -239,17 +221,12 @@ public void flush(int maxDoc, Sorter.DocMap sortMap) throws IOException { final PQVectors pqVectors; final FieldInfo fieldInfo = field.fieldInfo; if (randomAccessVectorValues.size() >= minimumBatchSizeForQuantization) { - log.info("Calculating codebooks and compressed vectors for field {}", fieldInfo.name); pqVectors = getPQVectors(newToOldOrds, randomAccessVectorValues, fieldInfo); buildScoreProvider = BuildScoreProvider.pqBuildScoreProvider( getVectorSimilarityFunction(fieldInfo), pqVectors); } else { - log.info( - "Vector count: {}, less than limit to trigger PQ quantization: {}, for field {}, will use full precision vectors instead.", - randomAccessVectorValues.size(), - minimumBatchSizeForQuantization, - fieldInfo.name); + // Not enough vectors for quantization; use full precision vectors instead pqVectors = null; buildScoreProvider = BuildScoreProvider.randomAccessScoreProvider( @@ -292,11 +269,6 @@ private void writeField( GraphNodeIdToDocMap graphNodeIdToDocMap, OnHeapGraphIndex graph) throws IOException { - log.info( - "Writing field {} with vector count: {}, for segment: {}", - fieldInfo.name, - randomAccessVectorValues.size(), - segmentWriteState.segmentInfo.name); final var vectorIndexFieldMetadata = writeGraph( graph, @@ -308,7 +280,6 @@ private void writeField( meta.writeInt(fieldInfo.number); vectorIndexFieldMetadata.toOutput(meta); - log.info("Writing neighbors score cache for field {}", fieldInfo.name); // field data file, which contains the graph final String neighborsScoreCacheIndexFieldFileName = baseDataFileName @@ -365,7 +336,6 @@ private VectorIndexFieldMetadata writeGraph( segmentWriteState.segmentSuffix); final long startOffset = indexOutput.getFilePointer(); - log.info("Writing graph to {}", vectorIndexFieldFileName); var resultBuilder = VectorIndexFieldMetadata.builder() .fieldNumber(fieldInfo.number) @@ -392,11 +362,6 @@ private 
VectorIndexFieldMetadata writeGraph( // If PQ is enabled and we have enough vectors, write the PQ codebooks and compressed // vectors if (pqVectors != null) { - log.info( - "Writing PQ codebooks and vectors for field {} since the size is {} >= {}", - fieldInfo.name, - randomAccessVectorValues.size(), - minimumBatchSizeForQuantization); resultBuilder.pqCodebooksAndVectorsOffset(endGraphOffset); // write the compressed vectors and codebooks to disk pqVectors.write(jVectorIndexWriter); @@ -415,13 +380,8 @@ private VectorIndexFieldMetadata writeGraph( private PQVectors getPQVectors( int[] newToOldOrds, RandomAccessVectorValues randomAccessVectorValues, FieldInfo fieldInfo) throws IOException { - final String fieldName = fieldInfo.name; final VectorSimilarityFunction vectorSimilarityFunction = fieldInfo.getVectorSimilarityFunction(); - log.info( - "Computing PQ codebooks for field {} for {} vectors", - fieldName, - randomAccessVectorValues.size()); final long start = Clock.systemDefaultZone().millis(); final var M = numberOfSubspacesPerVectorSupplier.apply(randomAccessVectorValues.dimension()); final var numberOfClustersPerSubspace = @@ -439,21 +399,11 @@ private PQVectors getPQVectors( final long end = Clock.systemDefaultZone().millis(); final long trainingTime = end - start; - log.info("Computed PQ codebooks for field {}, in {} millis", fieldName, trainingTime); KNNCounter.KNN_QUANTIZATION_TRAINING_TIME.add(trainingTime); - log.info( - "Encoding and building PQ vectors for field {} for {} vectors", - fieldName, - randomAccessVectorValues.size()); // PQVectors pqVectors = pq.encodeAll(randomAccessVectorValues, SIMD_POOL); PQVectors pqVectors = PQVectors.encodeAndBuild( pq, newToOldOrds.length, newToOldOrds, randomAccessVectorValues, SIMD_POOL_MERGE); - log.info( - "Encoded and built PQ vectors for field {}, original size: {} bytes, compressed size: {} bytes", - fieldName, - pqVectors.getOriginalSize(), - pqVectors.getCompressedSize()); return pqVectors; } @@ -502,7 +452,6 @@ public VectorIndexFieldMetadata(IndexInput in) throws IOException { @Override public void finish() throws IOException { - log.info("Finishing segment {}", segmentWriteState.segmentInfo.name); if (finished) { throw new IllegalStateException("already finished"); } @@ -548,25 +497,21 @@ static class FieldWriter extends KnnFieldVectorsWriter { private final long SHALLOW_SIZE = RamUsageEstimator.shallowSizeOfInstance(FieldWriter.class); @Getter private final FieldInfo fieldInfo; private int lastDocID = -1; - private final String segmentName; private final RandomAccessVectorValues randomAccessVectorValues; // The ordering of docIds matches the ordering of vectors, the index in this list corresponds to // the jVector ordinal private final List> vectors = new ArrayList<>(); private final List docIds = new ArrayList<>(); - FieldWriter(FieldInfo fieldInfo, String segmentName) { + FieldWriter(FieldInfo fieldInfo) { /** For creating a new field from a flat field vectors writer. 
*/ this.randomAccessVectorValues = new ListRandomAccessVectorValues(vectors, fieldInfo.getVectorDimension()); this.fieldInfo = fieldInfo; - this.segmentName = segmentName; } @Override public void addValue(int docID, T vectorValue) throws IOException { - log.trace( - "Adding value {} to field {} in segment {}", vectorValue, fieldInfo.name, segmentName); if (docID == lastDocID) { throw new IllegalArgumentException( "VectorValuesField \"" @@ -581,7 +526,6 @@ public void addValue(int docID, T vectorValue) throws IOException { "byte[] vectors are not supported in JVector. " + "Instead you should only use float vectors and leverage product quantization during indexing." + "This can provides much greater savings in storage and memory"; - log.error("{}", errorMessage); throw new UnsupportedOperationException(errorMessage); } else { throw new IllegalArgumentException("Unsupported vector type: " + vectorValue.getClass()); @@ -603,10 +547,6 @@ public long ramBytesUsed() { static io.github.jbellis.jvector.vector.VectorSimilarityFunction getVectorSimilarityFunction( FieldInfo fieldInfo) { - log.info( - "Matching vector similarity function {} for field {}", - fieldInfo.getVectorSimilarityFunction(), - fieldInfo.name); return switch (fieldInfo.getVectorSimilarityFunction()) { case EUCLIDEAN -> io.github.jbellis.jvector.vector.VectorSimilarityFunction.EUCLIDEAN; case COSINE -> io.github.jbellis.jvector.vector.VectorSimilarityFunction.COSINE; @@ -658,7 +598,6 @@ class RandomAccessMergedFloatVectorValues implements RandomAccessVectorValues { // Vector dimension private final int dimension; private final FieldInfo fieldInfo; - private final MergeState mergeState; private final GraphNodeIdToDocMap graphNodeIdToDocMap; private final int[] graphNodeIdsToRavvOrds; private boolean deletesFound = false; @@ -673,7 +612,6 @@ public RandomAccessMergedFloatVectorValues(FieldInfo fieldInfo, MergeState merge throws IOException { this.totalDocsCount = Math.toIntExact(Arrays.stream(mergeState.maxDocs).asLongStream().sum()); this.fieldInfo = fieldInfo; - this.mergeState = mergeState; final String fieldName = fieldInfo.name; @@ -792,12 +730,7 @@ public RandomAccessMergedFloatVectorValues(FieldInfo fieldInfo, MergeState merge for (int docId = it.nextDoc(); docId != DocIdSetIterator.NO_MORE_DOCS; docId = it.nextDoc()) { - if (docMaps[readerIdx].get(docId) == -1) { - log.warn( - "Document {} in reader {} is not mapped to a global ordinal from the merge docMaps. Will skip this document for now", - docId, - readerIdx); - } else { + if (docMaps[readerIdx].get(docId) != -1) { // Mapping from ravv ordinals to [readerIndex, readerOrd] // Map graph node id to ravv ordinal // Map graph node id to doc id @@ -831,12 +764,7 @@ public RandomAccessMergedFloatVectorValues(FieldInfo fieldInfo, MergeState merge docId != DocIdSetIterator.NO_MORE_DOCS; docId = leadingReaderIt.nextDoc()) { final int newGlobalDocId = docMaps[LEADING_READER_IDX].get(docId); - if (newGlobalDocId == -1) { - log.warn( - "Document {} in reader {} is not mapped to a global ordinal from the merge docMaps. 
Will skip this document for now", - docId, - LEADING_READER_IDX); - } else { + if (newGlobalDocId != -1) { final int ravvLocalOrd = leadingReaderIt.index(); final int ravvGlobalOrd = ravvLocalOrd + baseOrds[LEADING_READER_IDX]; graphNodeIdToDocIds[ravvLocalOrd] = newGlobalDocId; @@ -861,12 +789,7 @@ public RandomAccessMergedFloatVectorValues(FieldInfo fieldInfo, MergeState merge for (int docId = it.nextDoc(); docId != DocIdSetIterator.NO_MORE_DOCS; docId = it.nextDoc()) { - if (docMaps[readerIdx].get(docId) == -1) { - log.warn( - "Document {} in reader {} is not mapped to a global ordinal from the merge docMaps. Will skip this document for now", - docId, - readerIdx); - } else { + if (docMaps[readerIdx].get(docId) != -1) { // Mapping from ravv ordinals to [readerIndex, readerOrd] // Map graph node id to ravv ordinal // Map graph node id to doc id @@ -896,10 +819,6 @@ public RandomAccessMergedFloatVectorValues(FieldInfo fieldInfo, MergeState merge } this.graphNodeIdToDocMap = new GraphNodeIdToDocMap(graphNodeIdToDocIds); - log.debug( - "Created RandomAccessMergedFloatVectorValues with {} total vectors from {} readers", - size, - readers.length); } /** @@ -928,7 +847,6 @@ public RandomAccessMergedFloatVectorValues(FieldInfo fieldInfo, MergeState merge public void merge() throws IOException { // This section creates the PQVectors to be used for this merge // Get PQ compressor for leading reader - final int totalVectorsCount = size; final String fieldName = fieldInfo.name; final PQVectors pqVectors; final OnHeapGraphIndex graph; @@ -941,30 +859,12 @@ public void merge() throws IOException { // remaining vectors if (leadingReader.getProductQuantizationForField(fieldInfo.name).isEmpty()) { // No pre-existing codebooks, check if we have enough vectors to trigger quantization - log.info( - "No Pre-existing PQ codebooks found in this merge for field {} in segment {}, will check if a new codebooks is necessary", - fieldName, - mergeState.segmentInfo.name); if (this.size() >= minimumBatchSizeForQuantization) { - log.info( - "Calculating new codebooks and compressed vectors for field: {}, with totalVectorCount: {}, above minimumBatchSizeForQuantization: {}", - fieldName, - totalVectorsCount, - minimumBatchSizeForQuantization); pqVectors = getPQVectors(graphNodeIdsToRavvOrds, this, fieldInfo); } else { - log.info( - "Not enough vectors found for field: {}, totalVectorCount: {}, is below minimumBatchSizeForQuantization: {}", - fieldName, - totalVectorsCount, - minimumBatchSizeForQuantization); pqVectors = null; } } else { - log.info( - "Pre-existing PQ codebooks found in this merge for field {} in segment {}, will refine the codebooks from the leading reader with the remaining vectors", - fieldName, - mergeState.segmentInfo.name); final long start = Clock.systemDefaultZone().millis(); ProductQuantization leadingCompressor = leadingReader.getProductQuantizationForField(fieldName).get(); @@ -980,7 +880,6 @@ public void merge() throws IOException { } final long end = Clock.systemDefaultZone().millis(); final long trainingTime = end - start; - log.info("Refined PQ codebooks for field {}, in {} millis", fieldName, trainingTime); KNNCounter.KNN_QUANTIZATION_TRAINING_TIME.add(trainingTime); pqVectors = PQVectors.encodeAndBuild( @@ -998,11 +897,7 @@ public void merge() throws IOException { // graph = getGraph(buildScoreProvider, this, newToOldOrds, fieldInfo, // segmentWriteState.segmentInfo.name); if (!deletesFound) { - final String segmentName = segmentWriteState.segmentInfo.name; - log.info( - "No deletes 
found, and no PQ codebooks found, expanding previous graph with additional vectors for field {} in segment {}", - fieldName, - segmentName); + // Expand graph when there are no deletes and no PQ codebooks final RandomAccessReader leadingOnHeapGraphReader = leadingReader.getNeighborsScoreCacheForField(fieldName); final int numBaseVectors = leadingReader.getFloatVectorValues(fieldName).size(); @@ -1019,7 +914,7 @@ public void merge() throws IOException { alpha, hierarchyEnabled); } else { - log.info("Deletes found, and no PQ codebooks found, building new graph from scratch"); + // Build a new graph from scratch when there are deletes and no PQ codebooks graph = getGraph( buildScoreProvider, @@ -1030,7 +925,7 @@ public void merge() throws IOException { SIMD_POOL_MERGE); } } else { - log.info("PQ codebooks found, building graph from scratch with PQ vectors"); + // Re-use PQ codebooks to build a new graph from scratch buildScoreProvider = BuildScoreProvider.pqBuildScoreProvider( getVectorSimilarityFunction(fieldInfo), pqVectors); @@ -1114,11 +1009,9 @@ public OnHeapGraphIndex getGraph( * To have the right mapping from docId to vector ordinal we need to use the mergedFloatVector. * This is the case when we are merging segments and we might have more documents than vectors. */ - final long start = Clock.systemDefaultZone().millis(); final OnHeapGraphIndex graphIndex; var vv = randomAccessVectorValues.threadLocalSupplier(); - log.info("Building graph from merged float vector"); // parallel graph construction from the merge documents Ids SIMD_POOL .submit( @@ -1133,13 +1026,7 @@ public OnHeapGraphIndex getGraph( .join(); graphIndexBuilder.cleanup(); graphIndex = (OnHeapGraphIndex) graphIndexBuilder.getGraph(); - final long end = Clock.systemDefaultZone().millis(); - log.info( - "Built graph for field {} in segment {} in {} millis", - fieldInfo.name, - segmentName, - end - start); return graphIndex; } @@ -1173,7 +1060,6 @@ public VectorFloat getVector(int nodeId) { return VECTOR_TYPE_SUPPORT.createFloatVector(copy); } } catch (IOException e) { - log.error("Error retrieving vector at ordinal {}", nodeId, e); throw new RuntimeException(e); } } diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/jvector/KNNJVectorTests.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/jvector/KNNJVectorTests.java index 77949d7d039e..e07f6519a18a 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/jvector/KNNJVectorTests.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/jvector/KNNJVectorTests.java @@ -27,7 +27,6 @@ import java.util.concurrent.*; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicInteger; -import lombok.extern.log4j.Log4j2; import org.apache.lucene.document.*; import org.apache.lucene.index.*; import org.apache.lucene.search.*; @@ -51,8 +50,6 @@ @ThreadLeakFilters( defaultFilters = true, filters = {ThreadLeakFiltersForTests.class}) -@LuceneTestCase.SuppressSysoutChecks(bugUrl = "") -@Log4j2 public class KNNJVectorTests extends LuceneTestCase { private static final String TEST_FIELD = "test_field"; private static final String TEST_ID_FIELD = "id"; @@ -71,7 +68,6 @@ public void testJVectorKnnIndex_simpleCase() throws IOException { indexWriterConfig.setCodec(getCodec()); indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy()); final Path indexPath = createTempDir(); - log.info("Index path: {}", indexPath); try (FSDirectory dir = FSDirectory.open(indexPath); IndexWriter w = new 
IndexWriter(dir, indexWriterConfig)) { final float[] target = new float[] {0.0f, 0.0f}; @@ -81,11 +77,11 @@ public void testJVectorKnnIndex_simpleCase() throws IOException { doc.add(new KnnFloatVectorField("test_field", source, VectorSimilarityFunction.EUCLIDEAN)); w.addDocument(doc); } - log.info("Flushing docs to make them discoverable on the file system"); + // Flush docs to make them discoverable on the file system w.commit(); try (IndexReader reader = DirectoryReader.open(w)) { - log.info("We should now have a single segment with 10 documents"); + // We should now have a single segment with 10 documents; Assert.assertEquals(1, reader.getContext().leaves().size()); Assert.assertEquals(totalNumberOfDocs, reader.numDocs()); @@ -110,10 +106,8 @@ public void testJVectorKnnIndex_simpleCase() throws IOException { VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] {0.0f, 1.0f / 8.0f}), topDocs.scoreDocs[2].score, 0.001f); - log.info("successfully completed search tests"); } } - log.info("successfully closed directory"); } /** Test the scenario when not all documents are populated with the vector field */ @@ -126,7 +120,6 @@ public void testMissing_fields() throws IOException { indexWriterConfig.setCodec(getCodec()); indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy()); final Path indexPath = createTempDir(); - log.info("Index path: {}", indexPath); try (FSDirectory dir = FSDirectory.open(indexPath); IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { final float[] target = new float[] {0.0f, 0.0f}; @@ -139,11 +132,11 @@ public void testMissing_fields() throws IOException { doc.add(new IntField(TEST_ID_FIELD, i, Field.Store.YES)); w.addDocument(doc); } - log.info("Flushing docs to make them discoverable on the file system"); + // Flush docs to make them discoverable on the file system w.commit(); try (IndexReader reader = DirectoryReader.open(w)) { - log.info("We should now have a single segment with 10 documents"); + // We should now have a single segment with 10 documents Assert.assertEquals(1, reader.getContext().leaves().size()); Assert.assertEquals(totalNumberOfDocs, reader.numDocs()); @@ -168,10 +161,8 @@ public void testMissing_fields() throws IOException { VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] {0.0f, 4.0f}), topDocs.scoreDocs[2].score, 0.001f); - log.info("successfully completed search tests"); } } - log.info("successfully closed directory"); } /** @@ -194,7 +185,6 @@ public void test_sorted_index() throws IOException { new Sort(new SortField(sortFieldName, SortField.Type.INT, true))); // true = reverse order final Path indexPath = createTempDir(); - log.info("Index path: {}", indexPath); try (FSDirectory dir = FSDirectory.open(indexPath); IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { final float[] target = new float[] {0.0f, 0.0f}; @@ -207,11 +197,11 @@ public void test_sorted_index() throws IOException { doc.add(new NumericDocValuesField(sortFieldName, i)); w.addDocument(doc); } - log.info("Flushing docs to make them discoverable on the file system"); + // Flushing docs to make them discoverable on the file system w.commit(); try (IndexReader reader = DirectoryReader.open(w)) { - log.info("We should now have a single segment with 10 documents"); + // We should now have a single segment with 10 documents Assert.assertEquals(1, reader.getContext().leaves().size()); Assert.assertEquals(totalNumberOfDocs, reader.numDocs()); @@ -260,10 +250,8 @@ public void test_sorted_index() throws IOException { 
VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] {0.0f, 2.0f}), topDocs.scoreDocs[2].score, 0.001f); - log.info("successfully completed search tests"); } } - log.info("successfully closed directory"); } /** @@ -280,7 +268,6 @@ public void testJVectorKnnIndex_multipleSegments() throws IOException { indexWriterConfig.setCodec(getCodec()); indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy(false)); final Path indexPath = createTempDir(); - log.info("Index path: {}", indexPath); try (FSDirectory dir = FSDirectory.open(indexPath); IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { final float[] target = new float[] {0.0f, 0.0f}; @@ -291,10 +278,9 @@ public void testJVectorKnnIndex_multipleSegments() throws IOException { w.addDocument(doc); w.commit(); // this creates a new segment } - log.info("Done writing all files to the file system"); try (IndexReader reader = DirectoryReader.open(w)) { - log.info("We should now have 10 segments, each with a single document"); + // We should now have 10 segments, each with a single document Assert.assertEquals(10, reader.getContext().leaves().size()); Assert.assertEquals(totalNumberOfDocs, reader.numDocs()); final Query filterQuery = new MatchAllDocsQuery(); @@ -318,7 +304,6 @@ public void testJVectorKnnIndex_multipleSegments() throws IOException { VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] {0.0f, 1.0f / 8.0f}), topDocs.scoreDocs[2].score, 0.001f); - log.info("successfully completed search tests"); } } } @@ -338,7 +323,6 @@ public void testJVectorKnnIndex_mergeEnabled() throws IOException { indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy()); indexWriterConfig.setMergeScheduler(new SerialMergeScheduler()); final Path indexPath = createTempDir(); - log.info("Index path: {}", indexPath); try (FSDirectory dir = FSDirectory.open(indexPath); IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { final float[] target = new float[] {0.0f, 0.0f}; @@ -350,12 +334,10 @@ public void testJVectorKnnIndex_mergeEnabled() throws IOException { w.addDocument(doc); w.commit(); // this creates a new segment without triggering a merge } - log.info("Done writing all files to the file system"); w.forceMerge(1); // this merges all segments into a single segment - log.info("Done merging all segments"); try (IndexReader reader = DirectoryReader.open(w)) { - log.info("We should now have 1 segment with 10 documents"); + // We should now have 1 segment with 10 documents Assert.assertEquals(1, reader.getContext().leaves().size()); Assert.assertEquals(totalNumberOfDocs, reader.numDocs()); final Query filterQuery = new MatchAllDocsQuery(); @@ -382,7 +364,6 @@ public void testJVectorKnnIndex_mergeEnabled() throws IOException { VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] {0.0f, 3.0f}), topDocs.scoreDocs[2].score, 0.001f); - log.info("successfully completed search tests"); } } } @@ -403,7 +384,6 @@ public void multipleMerges() throws IOException { indexWriterConfig.setMergeScheduler(new SerialMergeScheduler()); final Path indexPath = createTempDir(); final VectorSimilarityFunction vectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN; - log.info("Index path: {}", indexPath); try (FSDirectory dir = FSDirectory.open(indexPath); IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { final float[] target = new float[] {0.0f, 0.0f}; @@ -416,12 +396,10 @@ public void multipleMerges() throws IOException { w.commit(); // this creates a new segment without triggering a merge w.forceMerge(1); // 
this merges all segments into a single segment } - log.info("Done writing all files to the file system"); w.forceMerge(1); // this merges all segments into a single segment - log.info("Done merging all segments"); try (IndexReader reader = DirectoryReader.open(w)) { - log.info("We should now have 1 segment with 10 documents"); + // We should now have 1 segment with 10 documents Assert.assertEquals(1, reader.getContext().leaves().size()); Assert.assertEquals(totalNumberOfDocs, reader.numDocs()); final Query filterQuery = new MatchAllDocsQuery(); @@ -448,7 +426,6 @@ public void multipleMerges() throws IOException { VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] {0.0f, 3.0f}), topDocs.scoreDocs[2].score, 0.001f); - log.info("successfully completed search tests"); } } } @@ -480,7 +457,6 @@ public void testJVectorKnnIndex_multiple_merges_large_batches_no_quantization() 1000); // 1000MB per thread, this way we make sure that no premature flush will occur final Path indexPath = createTempDir(); - log.info("Index path: {}", indexPath); try (FSDirectory dir = FSDirectory.open(indexPath); IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { final float[] target = new float[] {0.0f, 0.0f}; @@ -494,12 +470,10 @@ public void testJVectorKnnIndex_multiple_merges_large_batches_no_quantization() w.commit(); // this creates a new segment without triggering a merge } } - log.info("Done writing all files to the file system"); w.forceMerge(1); // this merges all segments into a single segment - log.info("Done merging all segments"); try (IndexReader reader = DirectoryReader.open(w)) { - log.info("We should now have 1 segment with {} documents", totalNumberOfDocs); + // We should now have 1 segment with totalNumberOfDocs documents Assert.assertEquals(1, reader.getContext().leaves().size()); Assert.assertEquals(totalNumberOfDocs, reader.numDocs()); final Query filterQuery = new MatchAllDocsQuery(); @@ -513,8 +487,6 @@ public void testJVectorKnnIndex_multiple_merges_large_batches_no_quantization() VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] {0.0f, k}); final float recall = calculateRecall(topDocs, expectedMinScoreInTopK); Assert.assertEquals(1.0f, recall, 0.01f); - - log.info("successfully completed search tests"); } } } @@ -555,7 +527,6 @@ public void testJVectorKnnIndex_multiple_merges_large_batches_no_quantization() 1000); // 1000MB per thread, this way we make sure that no premature flush will occur final Path indexPath = createTempDir(); - log.info("Index path: {}", indexPath); try (FSDirectory dir = FSDirectory.open(indexPath); IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { for (int i = 0; i < source.length; i++) { @@ -567,12 +538,10 @@ public void testJVectorKnnIndex_multiple_merges_large_batches_no_quantization() w.commit(); // this creates a new segment without triggering a merge } } - log.info("Done writing all files to the file system"); w.forceMerge(1); // this merges all segments into a single segment - log.info("Done merging all segments"); try (IndexReader reader = DirectoryReader.open(w)) { - log.info("We should now have a single segment with {} documents", totalNumberOfDocs); + // We should now have a single segment with totalNumberOfDocs documents Assert.assertEquals(1, reader.getContext().leaves().size()); Assert.assertEquals(totalNumberOfDocs, reader.numDocs()); @@ -584,7 +553,6 @@ public void testJVectorKnnIndex_multiple_merges_large_batches_no_quantization() assertEquals(k, topDocs.totalHits.value()); final float recall = 
calculateRecall(reader, groundTruthVectorsIds, topDocs, k); Assert.assertEquals(1.0f, recall, 0.05f); - log.info("successfully completed search tests"); } } } @@ -669,18 +637,6 @@ public void testLuceneKnnIndex_multipleMerges_with_ordering_check() .getIntValue(); float[] vectorValue = vectorValues.vectorValue(docIdSetIterator.index()); float[] expectedVectorValue = sourceVectors[globalDocId]; - // if the vectors do not match, also look which source vector should be the right result - if (!Arrays.equals(expectedVectorValue, vectorValue)) { - for (int i = 0; i < sourceVectors.length; i++) { - if (Arrays.equals(sourceVectors[i], vectorValue)) { - log.error( - "found vector with global id: {}, in docId: {}, however the actual position of the vector in source is: {}", - globalDocId, - luceneDocId, - i); - } - } - } Assert.assertArrayEquals( "vector with global id " + globalDocId @@ -744,13 +700,11 @@ public void testLuceneKnnIndex_multipleMerges_with_ordering_check() totalQueries.incrementAndGet(); } catch (Throwable e) { failureDetected.compareAndSet(false, true); - log.error("Exception encountered", e); fail("Exception during concurrent search: " + e.getMessage()); } } } finally { latch.countDown(); - log.warn("Ran {} queries", i); } }); } @@ -765,9 +719,6 @@ public void testLuceneKnnIndex_multipleMerges_with_ordering_check() numThreads * queriesPerThread, totalQueries.get()); - // Log the number of successful queries - log.info("Successfully completed {} concurrent kNN search queries!", totalQueries.get()); - } finally { executor.shutdownNow(); } @@ -906,7 +857,6 @@ public void testLuceneKnnIndex_mergeEnabled_withCompoundFile() throws IOExceptio indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy(true)); indexWriterConfig.setMergeScheduler(new SerialMergeScheduler()); final Path indexPath = createTempDir(); - log.info("Index path: {}", indexPath); try (FSDirectory dir = FSDirectory.open(indexPath); IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { final float[] target = new float[] {0.0f, 0.0f}; @@ -917,12 +867,10 @@ public void testLuceneKnnIndex_mergeEnabled_withCompoundFile() throws IOExceptio w.addDocument(doc); w.flush(); // this creates a new segment without triggering a merge } - log.info("Done writing all files to the file system"); w.forceMerge(1); // this merges all segments into a single segment - log.info("Done merging all segments"); try (IndexReader reader = DirectoryReader.open(w)) { - log.info("We should now have 1 segment with 10 documents"); + // We should now have 1 segment with 10 documents Assert.assertEquals(1, reader.getContext().leaves().size()); Assert.assertEquals(totalNumberOfDocs, reader.numDocs()); final Query filterQuery = new MatchAllDocsQuery(); @@ -946,7 +894,6 @@ public void testLuceneKnnIndex_mergeEnabled_withCompoundFile() throws IOExceptio VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] {0.0f, 1.0f / 8.0f}), topDocs.scoreDocs[2].score, 0.01f); - log.info("successfully completed search tests"); } } } @@ -967,7 +914,6 @@ public void testLuceneKnnIndex_mergeEnabled_withCompoundFile_cosine() throws IOE indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy(true)); indexWriterConfig.setMergeScheduler(new SerialMergeScheduler()); final Path indexPath = createTempDir(); - log.info("Index path: {}", indexPath); try (FSDirectory dir = FSDirectory.open(indexPath); IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { final float[] target = new float[] {1.0f, 1.0f}; @@ -978,12 +924,10 @@ public void 
testLuceneKnnIndex_mergeEnabled_withCompoundFile_cosine() throws IOE w.addDocument(doc); w.flush(); // this creates a new segment without triggering a merge } - log.info("Done writing all files to the file system"); w.forceMerge(1); // this merges all segments into a single segment - log.info("Done merging all segments"); try (IndexReader reader = DirectoryReader.open(w)) { - log.info("We should now have 1 segment with 10 documents"); + // We should now have 1 segment with 10 documents Assert.assertEquals(1, reader.getContext().leaves().size()); Assert.assertEquals(totalNumberOfDocs, reader.numDocs()); final Query filterQuery = new MatchAllDocsQuery(); @@ -1007,7 +951,6 @@ public void testLuceneKnnIndex_mergeEnabled_withCompoundFile_cosine() throws IOE VectorSimilarityFunction.COSINE.compare(target, new float[] {4.0f, 6.0f}), topDocs.scoreDocs[2].score, 0.001f); - log.info("successfully completed search tests"); } } } @@ -1026,7 +969,6 @@ public void testJVectorKnnIndex_simpleCase_withBinaryVector() throws IOException indexWriterConfig.setCodec(getCodec()); indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy()); final Path indexPath = createTempDir(); - log.info("Index path: {}", indexPath); try (Directory dir = newFSDirectory(indexPath); RandomIndexWriter w = new RandomIndexWriter(random(), dir, indexWriterConfig)) { final byte[] source = new byte[] {(byte) 0, (byte) 0}; @@ -1049,7 +991,6 @@ public void testJVectorKnnIndex_withFilter() throws IOException { indexWriterConfig.setCodec(getCodec()); indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy()); final Path indexPath = createTempDir(); - log.info("Index path: {}", indexPath); try (Directory dir = newFSDirectory(indexPath); IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { final float[] target = new float[] {0.0f, 0.0f}; @@ -1060,18 +1001,16 @@ public void testJVectorKnnIndex_withFilter() throws IOException { doc.add(new StringField("filter_field", i % 2 == 0 ? 
"even" : "odd", Field.Store.YES)); w.addDocument(doc); } - log.info("Flushing docs to make them discoverable on the file system"); + // Flushing docs to make them discoverable on the file system w.commit(); try (IndexReader reader = DirectoryReader.open(w)) { - log.info("Applying filter to the KNN search"); final Query filterQuery = new TermQuery(new Term("filter_field", "even")); final IndexSearcher searcher = newSearcher(reader); KnnFloatVectorQuery knnFloatVectorQuery = getJVectorKnnFloatVectorQuery("test_field", target, k, filterQuery); TopDocs topDocs = searcher.search(knnFloatVectorQuery, k); - log.info("Validating filtered KNN results"); assertEquals(k, topDocs.totalHits.value()); assertEquals(9, topDocs.scoreDocs[0].doc); Assert.assertEquals( @@ -1088,7 +1027,6 @@ public void testJVectorKnnIndex_withFilter() throws IOException { VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] {0.0f, 1.0f / 6.0f}), topDocs.scoreDocs[2].score, 0.001f); - log.info("successfully completed filtered search tests"); } } } @@ -1117,7 +1055,6 @@ public void testJVectorKnnIndex_simpleCase_withQuantization() throws IOException indexWriterConfig.setRAMPerThreadHardLimitMB( 1000); // 1000MB per thread, this way we make sure that no premature flush will occur final Path indexPath = createTempDir(); - log.info("Index path: {}", indexPath); try (FSDirectory dir = FSDirectory.open(indexPath); IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { final float[] target = generateZerosVectorWithLastValue(dimension, 0); @@ -1130,11 +1067,11 @@ public void testJVectorKnnIndex_simpleCase_withQuantization() throws IOException doc.add(new IntField(TEST_ID_FIELD, i, Field.Store.YES)); w.addDocument(doc); } - log.info("Flushing docs to make them discoverable on the file system"); + // Flushing docs to make them discoverable on the file system w.commit(); try (IndexReader reader = DirectoryReader.open(w)) { - log.info("We should now have a single segment with {} documents", totalNumberOfDocs); + // We should now have a single segment with totalNumberOfDocs documents Assert.assertEquals(1, reader.getContext().leaves().size()); Assert.assertEquals(totalNumberOfDocs, reader.numDocs()); @@ -1146,7 +1083,6 @@ public void testJVectorKnnIndex_simpleCase_withQuantization() throws IOException assertEquals(k, topDocs.totalHits.value()); final float recall = calculateRecall(reader, groundTruthVectorsIds, topDocs, k); Assert.assertEquals(1.0f, recall, 0.05f); - log.info("successfully completed search tests"); } } } @@ -1170,7 +1106,6 @@ public void testJVectorKnnIndex_simpleCase_withQuantization_rerank() throws IOEx indexWriterConfig.setRAMPerThreadHardLimitMB( 1000); // 1000MB per thread, this way we make sure that no premature flush will occur final Path indexPath = createTempDir(); - log.info("Index path: {}", indexPath); try (FSDirectory dir = FSDirectory.open(indexPath); IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { final float[] target = generateZerosVectorWithLastValue(dimension, 0); @@ -1180,11 +1115,11 @@ public void testJVectorKnnIndex_simpleCase_withQuantization_rerank() throws IOEx doc.add(new KnnFloatVectorField("test_field", source, VectorSimilarityFunction.EUCLIDEAN)); w.addDocument(doc); } - log.info("Flushing docs to make them discoverable on the file system"); + // Flushing docs to make them discoverable on the file system w.commit(); try (IndexReader reader = DirectoryReader.open(w)) { - log.info("We should now have a single segment with {} documents", totalNumberOfDocs); + // We should 
now have a single segment with totalNumberOfDocs documents Assert.assertEquals(1, reader.getContext().leaves().size()); Assert.assertEquals(totalNumberOfDocs, reader.numDocs()); @@ -1213,8 +1148,6 @@ public void testJVectorKnnIndex_simpleCase_withQuantization_rerank() throws IOEx assertEquals(k, topDocs.totalHits.value()); float recallWithHighOverqueryFactor = calculateRecall(topDocs, expectedMinScoreInTopK); Assert.assertTrue(recallWithLowOverqueryFactor <= recallWithHighOverqueryFactor); - - log.info("successfully completed search tests"); } } } @@ -1252,7 +1185,6 @@ public void testJVectorKnnIndex_happyCase_withQuantization_multipleSegments() th indexWriterConfig.setRAMPerThreadHardLimitMB( 1000); // 1000MB per thread, this way we make sure that no premature flush will occur final Path indexPath = createTempDir(); - log.info("Index path: {}", indexPath); try (FSDirectory dir = FSDirectory.open(indexPath); IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { final float[] target = generateZerosVectorWithLastValue(dimension, 0); @@ -1269,11 +1201,11 @@ public void testJVectorKnnIndex_happyCase_withQuantization_multipleSegments() th w.commit(); } } - log.info("Flushing docs to make them discoverable on the file system"); + // Flushing docs to make them discoverable on the file system w.forceMerge(1); try (IndexReader reader = DirectoryReader.open(w)) { - log.info("We should now have a single segment with {} documents", totalNumberOfDocs); + // We should now have a single segment with totalNumberOfDocs documents Assert.assertEquals(1, reader.getContext().leaves().size()); Assert.assertEquals(totalNumberOfDocs, reader.numDocs()); @@ -1285,7 +1217,6 @@ public void testJVectorKnnIndex_happyCase_withQuantization_multipleSegments() th assertEquals(k, topDocs.totalHits.value()); final float recall = calculateRecall(reader, groundTruthVectorsIds, topDocs, k); Assert.assertEquals(1.0f, recall, 0.05f); - log.info("successfully completed search tests"); } } } @@ -1327,7 +1258,6 @@ public void testJVectorKnnIndex_mixedBatchSizes_withQuantization_multipleMerges( indexWriterConfig.setRAMPerThreadHardLimitMB( 1000); // 1000MB per thread, this way we make sure that no premature flush will occur final Path indexPath = createTempDir(); - log.info("Index path: {}", indexPath); try (FSDirectory dir = FSDirectory.open(indexPath); IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { final float[] target = generateZerosVectorWithLastValue(dimension, 0); @@ -1344,11 +1274,11 @@ public void testJVectorKnnIndex_mixedBatchSizes_withQuantization_multipleMerges( w.commit(); } } - log.info("Flushing docs to make them discoverable on the file system"); + // Flushing docs to make them discoverable on the file system w.forceMerge(1); try (IndexReader reader = DirectoryReader.open(w)) { - log.info("We should now have a single segment with {} documents", totalNumberOfDocs); + // We should now have a single segment with totalNumberOfDocs documents Assert.assertEquals(1, reader.getContext().leaves().size()); Assert.assertEquals(totalNumberOfDocs, reader.numDocs()); @@ -1360,7 +1290,6 @@ public void testJVectorKnnIndex_mixedBatchSizes_withQuantization_multipleMerges( assertEquals(k, topDocs.totalHits.value()); final float recall = calculateRecall(reader, groundTruthVectorsIds, topDocs, k); Assert.assertEquals(1.0f, recall, 0.05f); - log.info("successfully completed search tests"); } } } @@ -1404,7 +1333,6 @@ public void testJVectorKnnIndex_mixedBatchSizes_withQuantization_multipleMerges_ 
indexWriterConfig.setRAMPerThreadHardLimitMB( 1000); // 1000MB per thread, this way we make sure that no premature flush will occur final Path indexPath = createTempDir(); - log.info("Index path: {}", indexPath); try (FSDirectory dir = FSDirectory.open(indexPath); IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { final float[] target = generateZerosVectorWithLastValue(dimension, 0); @@ -1424,11 +1352,11 @@ public void testJVectorKnnIndex_mixedBatchSizes_withQuantization_multipleMerges_ } } w.commit(); - log.info("Flushing docs to make them discoverable on the file system"); + // Flushing docs to make them discoverable on the file system w.forceMerge(1); try (IndexReader reader = DirectoryReader.open(w)) { - log.info("We should now have a single segment with {} documents", totalNumberOfDocs); + // We should now have a single segment with totalNumberOfDocs documents Assert.assertEquals(1, reader.getContext().leaves().size()); Assert.assertEquals(totalNumberOfDocs, reader.numDocs()); @@ -1441,7 +1369,6 @@ public void testJVectorKnnIndex_mixedBatchSizes_withQuantization_multipleMerges_ final float recall = calculateRecall(reader, groundTruthVectorsIds, topDocs, k); Assert.assertEquals( "Expected to have recall of 1.0+/-0.05 but got " + recall, 1.0f, recall, 0.05f); - log.info("successfully completed search tests"); } } @@ -1485,7 +1412,6 @@ public void testJVectorKnnIndex_withQuantization_withCompoundFile_with_refinemen indexWriterConfig.setRAMPerThreadHardLimitMB( 1000); // 1000MB per thread, this way we make sure that no premature flush will occur final Path indexPath = createTempDir(); - log.info("Index path: {}", indexPath); try (FSDirectory dir = FSDirectory.open(indexPath); IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { final float[] target = generateZerosVectorWithLastValue(dimension, 0); @@ -1514,11 +1440,11 @@ public void testJVectorKnnIndex_withQuantization_withCompoundFile_with_refinemen } } w.commit(); - log.info("Flushing docs to make them discoverable on the file system"); + // Flushing docs to make them discoverable on the file system w.forceMerge(1); try (IndexReader reader = DirectoryReader.open(w)) { - log.info("We should now have a single segment with {} documents", totalNumberOfDocs); + // We should now have a single segment with totalNumberOfDocs documents Assert.assertEquals(1, reader.getContext().leaves().size()); Assert.assertEquals(totalNumberOfDocs, reader.numDocs()); @@ -1531,7 +1457,6 @@ public void testJVectorKnnIndex_withQuantization_withCompoundFile_with_refinemen final float recall = calculateRecall(reader, groundTruthVectorsIds, topDocs, k); Assert.assertEquals( "Expected to have recall of 1.0+/-0.05 but got " + recall, 1.0f, recall, 0.05f); - log.info("successfully completed search tests"); } } @@ -1555,31 +1480,9 @@ private float calculateRecall(TopDocs topDocs, float minScoreInTopK) { } } float recall = ((float) totalRelevantDocs) / ((float) topDocs.scoreDocs.length); - - if (recall == 0.0f) { - log.info( - "Recall is 0.0, this is probably not correct, here is some debug information\n topDocs: {}, minScoreInTopK: {}, totalRelevantDocs: {}", - topDocsToString(topDocs), - minScoreInTopK, - totalRelevantDocs); - } return recall; } - // convert topDocs to a pretty printed string - private String topDocsToString(TopDocs topDocs) { - StringBuilder sb = new StringBuilder(); - sb.append("TopDocs: ["); - for (int i = 0; i < topDocs.scoreDocs.length; i++) { - sb.append(topDocs.scoreDocs[i].doc) - .append(" (") - .append(topDocs.scoreDocs[i].score) 
- .append("), "); - } - sb.append("]"); - return sb.toString(); - } - private JVectorKnnFloatVectorQuery getJVectorKnnFloatVectorQuery( String fieldName, float[] target, int k, Query filterQuery) { return getJVectorKnnFloatVectorQuery( From 2e2f5640adaa8985c0782866d6c4cd67810e07d5 Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Wed, 29 Oct 2025 16:31:11 +0000 Subject: [PATCH 07/86] Remove KNNCounter stats --- .../sandbox/codecs/jvector/JVectorReader.java | 18 ----------------- .../sandbox/codecs/jvector/JVectorWriter.java | 14 ------------- .../codecs/jvector/KNNJVectorTests.java | 20 +++---------------- 3 files changed, 3 insertions(+), 49 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java index 8110937aec99..6ff34b02c2ae 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java @@ -45,7 +45,6 @@ import org.apache.lucene.util.Bits; import org.apache.lucene.util.IOUtils; import org.opensearch.knn.common.KNNConstants; -import org.opensearch.knn.plugin.stats.KNNCounter; public class JVectorReader extends KnnVectorsReader { private static final VectorTypeSupport VECTOR_TYPE_SUPPORT = @@ -160,7 +159,6 @@ public void search(String field, float[] target, KnnCollector knnCollector, Bits final SearchScoreProvider ssp; try (var view = index.getView()) { - final long graphSearchStart = System.currentTimeMillis(); if (fieldEntryMap.get(field).pqVectors != null) { // Quantized, use the precomputed score function final PQVectors pqVectors = fieldEntryMap.get(field).pqVectors; @@ -195,22 +193,6 @@ public void search(String field, float[] target, KnnCollector knnCollector, Bits for (SearchResult.NodeScore ns : searchResults.getNodes()) { jvectorKnnCollector.collect(jvectorLuceneDocMap.getLuceneDocId(ns.node), ns.score); } - final long graphSearchEnd = System.currentTimeMillis(); - final long searchTime = graphSearchEnd - graphSearchStart; - - // Collect the below metrics about the search and somehow wire this back to {@link - // @KNNStats} - final int visitedNodesCount = searchResults.getVisitedCount(); - final int rerankedCount = searchResults.getRerankedCount(); - - final int expandedCount = searchResults.getExpandedCount(); - final int expandedBaseLayerCount = searchResults.getExpandedCountBaseLayer(); - - KNNCounter.KNN_QUERY_VISITED_NODES.add(visitedNodesCount); - KNNCounter.KNN_QUERY_RERANKED_COUNT.add(rerankedCount); - KNNCounter.KNN_QUERY_EXPANDED_NODES.add(expandedCount); - KNNCounter.KNN_QUERY_EXPANDED_BASE_LAYER_NODES.add(expandedBaseLayerCount); - KNNCounter.KNN_QUERY_GRAPH_SEARCH_TIME.add(searchTime); } } } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java index 536f5f49517f..96eae3227ca0 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java @@ -36,7 +36,6 @@ import io.github.jbellis.jvector.vector.types.VectorTypeSupport; import java.io.IOException; import java.io.UnsupportedEncodingException; -import java.time.Clock; import java.util.*; import java.util.concurrent.ForkJoinPool; import java.util.function.Function; @@ -56,7 +55,6 @@ import 
org.apache.lucene.util.Bits; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.RamUsageEstimator; -import org.opensearch.knn.plugin.stats.KNNCounter; /** * JVectorWriter is responsible for writing vector data into index segments using the JVector @@ -192,7 +190,6 @@ public KnnFieldVectorsWriter addField(FieldInfo fieldInfo) throws IOException @Override public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOException { try { - final long mergeStart = Clock.systemDefaultZone().millis(); switch (fieldInfo.getVectorEncoding()) { case BYTE: throw new UnsupportedEncodingException("Byte vectors are not supported in JVector."); @@ -201,9 +198,6 @@ public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOE mergeRavv.merge(); break; } - final long mergeEnd = Clock.systemDefaultZone().millis(); - final long mergeTime = mergeEnd - mergeStart; - KNNCounter.KNN_GRAPH_MERGE_TIME.add(mergeTime); } catch (Exception e) { throw e; } @@ -382,7 +376,6 @@ private PQVectors getPQVectors( throws IOException { final VectorSimilarityFunction vectorSimilarityFunction = fieldInfo.getVectorSimilarityFunction(); - final long start = Clock.systemDefaultZone().millis(); final var M = numberOfSubspacesPerVectorSupplier.apply(randomAccessVectorValues.dimension()); final var numberOfClustersPerSubspace = Math.min(256, randomAccessVectorValues.size()); // number of centroids per @@ -397,9 +390,6 @@ private PQVectors getPQVectors( SIMD_POOL_MERGE, ForkJoinPool.commonPool()); - final long end = Clock.systemDefaultZone().millis(); - final long trainingTime = end - start; - KNNCounter.KNN_QUANTIZATION_TRAINING_TIME.add(trainingTime); // PQVectors pqVectors = pq.encodeAll(randomAccessVectorValues, SIMD_POOL); PQVectors pqVectors = PQVectors.encodeAndBuild( @@ -865,7 +855,6 @@ public void merge() throws IOException { pqVectors = null; } } else { - final long start = Clock.systemDefaultZone().millis(); ProductQuantization leadingCompressor = leadingReader.getProductQuantizationForField(fieldName).get(); // Refine the leadingCompressor with the remaining vectors in the merge, we skip the leading @@ -878,9 +867,6 @@ public void merge() throws IOException { new RandomAccessVectorValuesOverVectorValues(values); leadingCompressor.refine(randomAccessVectorValues); } - final long end = Clock.systemDefaultZone().millis(); - final long trainingTime = end - start; - KNNCounter.KNN_QUANTIZATION_TRAINING_TIME.add(trainingTime); pqVectors = PQVectors.encodeAndBuild( leadingCompressor, diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/jvector/KNNJVectorTests.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/jvector/KNNJVectorTests.java index e07f6519a18a..636279e43693 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/jvector/KNNJVectorTests.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/jvector/KNNJVectorTests.java @@ -39,7 +39,6 @@ import org.opensearch.knn.TestUtils; import org.opensearch.knn.common.KNNConstants; import org.opensearch.knn.index.ThreadLeakFiltersForTests; -import org.opensearch.knn.plugin.stats.KNNCounter; /** Test used specifically for JVector */ // Currently {@link IndexGraphBuilder} is using the default ForkJoinPool.commonPool() which is not @@ -1371,11 +1370,8 @@ public void testJVectorKnnIndex_mixedBatchSizes_withQuantization_multipleMerges_ "Expected to have recall of 1.0+/-0.05 but got " + recall, 1.0f, recall, 0.05f); } } - - Assert.assertTrue( - "No quantization time recorded", 
KNNCounter.KNN_QUANTIZATION_TRAINING_TIME.getCount() > 0); - Assert.assertTrue( - "No graph merge time recorded", KNNCounter.KNN_GRAPH_MERGE_TIME.getCount() > 0); + // TODO: assert no quantization + // TODO: assert no graph merge } /** @@ -1427,16 +1423,8 @@ public void testJVectorKnnIndex_withQuantization_withCompoundFile_with_refinemen doc.add(new KnnFloatVectorField(TEST_FIELD, source, vectorSimilarityFunction)); w.addDocument(doc); if (i % idealBatchSize == 0) { - final long beforeTrainingTime = KNNCounter.KNN_QUANTIZATION_TRAINING_TIME.getCount(); w.commit(); w.forceMerge(1); // force merge will trigger PQ refinement if other segments are present - final long afterTrainingTime = KNNCounter.KNN_QUANTIZATION_TRAINING_TIME.getCount(); - Assert.assertTrue( - "Expected to have a training time of at least " - + beforeTrainingTime - + " but got " - + afterTrainingTime, - afterTrainingTime >= beforeTrainingTime); } } w.commit(); @@ -1459,9 +1447,7 @@ public void testJVectorKnnIndex_withQuantization_withCompoundFile_with_refinemen "Expected to have recall of 1.0+/-0.05 but got " + recall, 1.0f, recall, 0.05f); } } - - Assert.assertTrue( - "No graph merge time recorded", KNNCounter.KNN_GRAPH_MERGE_TIME.getCount() > 0); + // TODO: Assert no graph merge } /** From a47ca910c84b0d6cd16286167717eecd7a149b76 Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Wed, 29 Oct 2025 16:38:30 +0000 Subject: [PATCH 08/86] Fix references to missing KNNConstants --- .../sandbox/codecs/jvector/JVectorFormat.java | 19 +++++++++++-------- .../sandbox/codecs/jvector/JVectorReader.java | 14 +++++++++----- .../codecs/jvector/KNNJVectorTests.java | 11 +++++------ 3 files changed, 25 insertions(+), 19 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFormat.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFormat.java index 020a82835d60..1d208d31415f 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFormat.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFormat.java @@ -26,7 +26,6 @@ import org.apache.lucene.codecs.KnnVectorsWriter; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; -import org.opensearch.knn.common.KNNConstants; public class JVectorFormat extends KnnVectorsFormat { public static final String NAME = "JVectorFormat"; @@ -44,6 +43,10 @@ public class JVectorFormat extends KnnVectorsFormat { public static final int VERSION_CURRENT = VERSION_START; public static final int DEFAULT_MAX_CONN = 32; public static final int DEFAULT_BEAM_WIDTH = 100; + public static final int DEFAULT_MINIMUM_BATCH_SIZE_FOR_QUANTIZATION = 1024; + public static final float DEFAULT_NEIGHBOR_OVERFLOW = 2f; + public static final float DEFAULT_ALPHA = 2f; + public static final boolean DEFAULT_HIERARCHY_ENABLED = true; // Unfortunately, this can't be managed yet by the OpenSearch ThreadPool because it's not // supporting {@link ForkJoinPool} types public static final ForkJoinPool SIMD_POOL_MERGE = getPhysicalCoreExecutor(); @@ -63,11 +66,11 @@ public JVectorFormat() { NAME, DEFAULT_MAX_CONN, DEFAULT_BEAM_WIDTH, - KNNConstants.DEFAULT_NEIGHBOR_OVERFLOW_VALUE.floatValue(), - KNNConstants.DEFAULT_ALPHA_VALUE.floatValue(), + DEFAULT_NEIGHBOR_OVERFLOW, + DEFAULT_ALPHA, JVectorFormat::getDefaultNumberOfSubspacesPerVector, - KNNConstants.DEFAULT_MINIMUM_BATCH_SIZE_FOR_QUANTIZATION, - KNNConstants.DEFAULT_HIERARCHY_ENABLED); + 
DEFAULT_MINIMUM_BATCH_SIZE_FOR_QUANTIZATION, + DEFAULT_HIERARCHY_ENABLED); } public JVectorFormat(int minBatchSizeForQuantization) { @@ -75,11 +78,11 @@ public JVectorFormat(int minBatchSizeForQuantization) { NAME, DEFAULT_MAX_CONN, DEFAULT_BEAM_WIDTH, - KNNConstants.DEFAULT_NEIGHBOR_OVERFLOW_VALUE.floatValue(), - KNNConstants.DEFAULT_ALPHA_VALUE.floatValue(), + DEFAULT_NEIGHBOR_OVERFLOW, + DEFAULT_ALPHA, JVectorFormat::getDefaultNumberOfSubspacesPerVector, minBatchSizeForQuantization, - KNNConstants.DEFAULT_HIERARCHY_ENABLED); + DEFAULT_HIERARCHY_ENABLED); } public JVectorFormat( diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java index 6ff34b02c2ae..a5d07b8ba63a 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java @@ -44,9 +44,13 @@ import org.apache.lucene.store.*; import org.apache.lucene.util.Bits; import org.apache.lucene.util.IOUtils; -import org.opensearch.knn.common.KNNConstants; public class JVectorReader extends KnnVectorsReader { + public static final float DEFAULT_QUERY_SIMILARITY_THRESHOLD = 0f; + public static final float DEFAULT_QUERY_RERANK_FLOOR = 0f; + public static final int DEFAULT_OVER_QUERY_FACTOR = 3; + public static final boolean DEFAULT_QUERY_USE_PRUNING = false; + private static final VectorTypeSupport VECTOR_TYPE_SUPPORT = VectorizationProvider.getInstance().getVectorTypeSupport(); @@ -148,10 +152,10 @@ public void search(String field, float[] target, KnnCollector knnCollector, Bits jvectorKnnCollector = new JVectorKnnCollector( knnCollector, - KNNConstants.DEFAULT_QUERY_SIMILARITY_THRESHOLD.floatValue(), - KNNConstants.DEFAULT_QUERY_RERANK_FLOOR.floatValue(), - KNNConstants.DEFAULT_OVER_QUERY_FACTOR, - KNNConstants.DEFAULT_QUERY_USE_PRUNING); + DEFAULT_QUERY_SIMILARITY_THRESHOLD, + DEFAULT_QUERY_RERANK_FLOOR, + DEFAULT_OVER_QUERY_FACTOR, + DEFAULT_QUERY_USE_PRUNING); } // search for a random vector using a GraphSearcher and SearchScoreProvider diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/jvector/KNNJVectorTests.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/jvector/KNNJVectorTests.java index 636279e43693..de9554f6bcff 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/jvector/KNNJVectorTests.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/jvector/KNNJVectorTests.java @@ -17,7 +17,7 @@ package org.apache.lucene.sandbox.codecs.jvector; -import static org.opensearch.knn.common.KNNConstants.DEFAULT_MINIMUM_BATCH_SIZE_FOR_QUANTIZATION; +import static org.apache.lucene.sandbox.codecs.jvector.JVectorFormat.DEFAULT_MINIMUM_BATCH_SIZE_FOR_QUANTIZATION; import static org.opensearch.knn.index.engine.CommonTestUtils.getCodec; import com.carrotsearch.randomizedtesting.annotations.ThreadLeakFilters; @@ -37,7 +37,6 @@ import org.junit.Assert; import org.junit.Test; import org.opensearch.knn.TestUtils; -import org.opensearch.knn.common.KNNConstants; import org.opensearch.knn.index.ThreadLeakFiltersForTests; /** Test used specifically for JVector */ @@ -1472,7 +1471,7 @@ private float calculateRecall(TopDocs topDocs, float minScoreInTopK) { private JVectorKnnFloatVectorQuery getJVectorKnnFloatVectorQuery( String fieldName, float[] target, int k, Query filterQuery) { return getJVectorKnnFloatVectorQuery( - fieldName, target, k, 
filterQuery, KNNConstants.DEFAULT_OVER_QUERY_FACTOR); + fieldName, target, k, filterQuery, JVectorReader.DEFAULT_OVER_QUERY_FACTOR); } private JVectorKnnFloatVectorQuery getJVectorKnnFloatVectorQuery( @@ -1483,9 +1482,9 @@ private JVectorKnnFloatVectorQuery getJVectorKnnFloatVectorQuery( k, filterQuery, overQueryFactor, - KNNConstants.DEFAULT_QUERY_SIMILARITY_THRESHOLD.floatValue(), - KNNConstants.DEFAULT_QUERY_RERANK_FLOOR.floatValue(), - KNNConstants.DEFAULT_QUERY_USE_PRUNING); + JVectorReader.DEFAULT_QUERY_SIMILARITY_THRESHOLD, + JVectorReader.DEFAULT_QUERY_RERANK_FLOOR, + JVectorReader.DEFAULT_QUERY_USE_PRUNING); } private static float[][] getMonotonicallyIncreasingVectors(int numVectors, int vectorDimension) { From c30a0702be9721552ae56cf9f2fb8bc6295fd386 Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Wed, 29 Oct 2025 16:41:39 +0000 Subject: [PATCH 09/86] Remove lombok.Value annotation from JVectorKnnCollector --- .../codecs/jvector/JVectorKnnCollector.java | 25 +++++++++++++------ .../sandbox/codecs/jvector/JVectorReader.java | 6 ++--- 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorKnnCollector.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorKnnCollector.java index 8051e967e884..d2fad6532570 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorKnnCollector.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorKnnCollector.java @@ -16,7 +16,6 @@ */ package org.apache.lucene.sandbox.codecs.jvector; -import lombok.Value; import org.apache.lucene.search.KnnCollector; import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.knn.KnnSearchStrategy; @@ -25,13 +24,25 @@ * Wrapper class for KnnCollector that provides passing of additional parameters specific for * JVector. 
*/ -@Value public class JVectorKnnCollector implements KnnCollector { - KnnCollector delegate; - float threshold; - float rerankFloor; - int overQueryFactor; - boolean usePruning; + final KnnCollector delegate; + final float threshold; + final float rerankFloor; + final int overQueryFactor; + final boolean usePruning; + + public JVectorKnnCollector( + KnnCollector delegate, + float threshold, + float rerankFloor, + int overQueryFactor, + boolean usePruning) { + this.delegate = delegate; + this.threshold = threshold; + this.rerankFloor = rerankFloor; + this.overQueryFactor = overQueryFactor; + this.usePruning = usePruning; + } @Override public boolean earlyTerminated() { diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java index a5d07b8ba63a..698258744509 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java @@ -190,9 +190,9 @@ public void search(String field, float[] target, KnnCollector knnCollector, Bits graphSearcher.search( ssp, jvectorKnnCollector.k(), - jvectorKnnCollector.k() * jvectorKnnCollector.getOverQueryFactor(), - jvectorKnnCollector.getThreshold(), - jvectorKnnCollector.getRerankFloor(), + jvectorKnnCollector.k() * jvectorKnnCollector.overQueryFactor, + jvectorKnnCollector.threshold, + jvectorKnnCollector.rerankFloor, compatibleBits); for (SearchResult.NodeScore ns : searchResults.getNodes()) { jvectorKnnCollector.collect(jvectorLuceneDocMap.getLuceneDocId(ns.node), ns.score); From d9e5ba316435f576e510f27507f6e573ce90a663 Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Wed, 29 Oct 2025 16:42:32 +0000 Subject: [PATCH 10/86] Fix AcceptDocs param in JVectorKnnFloatVectorQuery --- .../sandbox/codecs/jvector/JVectorKnnFloatVectorQuery.java | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorKnnFloatVectorQuery.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorKnnFloatVectorQuery.java index f8903d67bde5..50246250ad60 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorKnnFloatVectorQuery.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorKnnFloatVectorQuery.java @@ -23,7 +23,6 @@ import org.apache.lucene.search.*; import org.apache.lucene.search.knn.KnnCollectorManager; import org.apache.lucene.search.knn.KnnSearchStrategy; -import org.apache.lucene.util.Bits; /** * {@link KnnFloatVectorQuery} that uses jVector to perform the search. 
We use this wrapper simply @@ -70,7 +69,7 @@ public JVectorKnnFloatVectorQuery( @Override protected TopDocs approximateSearch( LeafReaderContext context, - Bits acceptDocs, + AcceptDocs acceptDocs, int visitedLimit, KnnCollectorManager knnCollectorManager) throws IOException { From 6b178d8fcc27c1dfe36d535d67348753cad44f31 Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Wed, 29 Oct 2025 16:55:50 +0000 Subject: [PATCH 11/86] Fix AcceptDocs param in JVectorReader --- .../sandbox/codecs/jvector/JVectorReader.java | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java index 698258744509..35482cd91dbe 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java @@ -27,6 +27,7 @@ import io.github.jbellis.jvector.graph.similarity.SearchScoreProvider; import io.github.jbellis.jvector.quantization.PQVectors; import io.github.jbellis.jvector.quantization.ProductQuantization; +import io.github.jbellis.jvector.util.Bits; import io.github.jbellis.jvector.vector.VectorSimilarityFunction; import io.github.jbellis.jvector.vector.VectorizationProvider; import io.github.jbellis.jvector.vector.types.VectorFloat; @@ -40,9 +41,9 @@ import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.KnnVectorsReader; import org.apache.lucene.index.*; +import org.apache.lucene.search.AcceptDocs; import org.apache.lucene.search.KnnCollector; import org.apache.lucene.store.*; -import org.apache.lucene.util.Bits; import org.apache.lucene.util.IOUtils; public class JVectorReader extends KnnVectorsReader { @@ -140,7 +141,7 @@ public OnDiskGraphIndex getOnDiskGraphIndex(String field) throws IOException { } @Override - public void search(String field, float[] target, KnnCollector knnCollector, Bits acceptDocs) + public void search(String field, float[] target, KnnCollector knnCollector, AcceptDocs acceptDocs) throws IOException { final OnDiskGraphIndex index = fieldEntryMap.get(field).index; final JVectorKnnCollector jvectorKnnCollector; @@ -182,8 +183,14 @@ public void search(String field, float[] target, KnnCollector knnCollector, Bits // Logic works as follows: if acceptDocs is null, we accept all ordinals. Otherwise, we check // if the jVector ordinal has a // corresponding Lucene doc ID accepted by acceptDocs filter. 
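The comment above captures the core of the AcceptDocs change: jVector's graph search visits node ordinals, so a Lucene doc-level filter can only be applied by translating each ordinal back to its Lucene doc ID. A minimal sketch of that bridge follows; the names docIdForOrdinal and acceptedDocs are illustrative placeholders, not APIs from this patch.

import java.util.function.IntPredicate;
import java.util.function.IntUnaryOperator;

// Sketch only: adapt a doc-id level filter into a predicate over graph node ordinals.
final class OrdinalFilterSketch {
  // A null doc filter means "no filtering": every ordinal is searchable.
  static IntPredicate forOrdinals(IntUnaryOperator docIdForOrdinal, IntPredicate acceptedDocs) {
    if (acceptedDocs == null) {
      return ord -> true;
    }
    return ord -> acceptedDocs.test(docIdForOrdinal.applyAsInt(ord));
  }
}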
- io.github.jbellis.jvector.util.Bits compatibleBits = - ord -> acceptDocs == null || acceptDocs.get(jvectorLuceneDocMap.getLuceneDocId(ord)); + + Bits compatibleBits = Bits.ALL; + if (acceptDocs != null) { + final var luceneBits = acceptDocs.bits(); + if (luceneBits != null) { + compatibleBits = ord -> luceneBits.get(jvectorLuceneDocMap.getLuceneDocId(ord)); + } + } try (var graphSearcher = new GraphSearcher(index)) { final var searchResults = @@ -202,7 +209,7 @@ public void search(String field, float[] target, KnnCollector knnCollector, Bits } @Override - public void search(String field, byte[] target, KnnCollector knnCollector, Bits acceptDocs) + public void search(String field, byte[] target, KnnCollector knnCollector, AcceptDocs acceptDocs) throws IOException { // TODO: implement this throw new UnsupportedOperationException("Byte vector search is not supported yet with jVector"); From 9604146db48484c4dfa214e00c724c444f66ea25 Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Wed, 29 Oct 2025 16:57:59 +0000 Subject: [PATCH 12/86] Fix static imports of SIMD_POOL* --- .../apache/lucene/sandbox/codecs/jvector/JVectorWriter.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java index 96eae3227ca0..740d6459cc5c 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java @@ -19,8 +19,8 @@ import static io.github.jbellis.jvector.quantization.KMeansPlusPlusClusterer.UNWEIGHTED; import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsReader.readVectorEncoding; -import static org.opensearch.knn.index.codec.jvector.JVectorFormat.SIMD_POOL_FLUSH; -import static org.opensearch.knn.index.codec.jvector.JVectorFormat.SIMD_POOL_MERGE; +import static org.apache.lucene.sandbox.codecs.jvector.JVectorFormat.SIMD_POOL_FLUSH; +import static org.apache.lucene.sandbox.codecs.jvector.JVectorFormat.SIMD_POOL_MERGE; import io.github.jbellis.jvector.disk.RandomAccessReader; import io.github.jbellis.jvector.graph.*; From caba8d9f33a84c0cf8f102ca6351094d12216af9 Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Wed, 29 Oct 2025 17:07:38 +0000 Subject: [PATCH 13/86] Remove Lombok.Getter from JVectorWriter --- .../apache/lucene/sandbox/codecs/jvector/JVectorWriter.java | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java index 740d6459cc5c..9b9e8e7cfdb9 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java @@ -42,7 +42,6 @@ import java.util.stream.IntStream; import lombok.AllArgsConstructor; import lombok.Builder; -import lombok.Getter; import lombok.Value; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.KnnFieldVectorsWriter; @@ -485,7 +484,7 @@ static class FieldWriter extends KnnFieldVectorsWriter { private static final VectorTypeSupport VECTOR_TYPE_SUPPORT = VectorizationProvider.getInstance().getVectorTypeSupport(); private final long SHALLOW_SIZE = RamUsageEstimator.shallowSizeOfInstance(FieldWriter.class); - @Getter private final FieldInfo fieldInfo; 
+ private final FieldInfo fieldInfo; private int lastDocID = -1; private final RandomAccessVectorValues randomAccessVectorValues; // The ordering of docIds matches the ordering of vectors, the index in this list corresponds to From 0fcac22fea2511a1d345d70eafd5f193a7f3f8f0 Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Wed, 29 Oct 2025 17:09:14 +0000 Subject: [PATCH 14/86] Remove lombok annotations from VectorIndexFieldMetadata --- .../sandbox/codecs/jvector/JVectorReader.java | 18 ++-- .../sandbox/codecs/jvector/JVectorWriter.java | 86 +++++++++++-------- 2 files changed, 61 insertions(+), 43 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java index 35482cd91dbe..9ac1234fb258 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java @@ -228,7 +228,7 @@ private void readFields(ChecksumIndexInput meta) throws IOException { final FieldInfo fieldInfo = fieldInfos.fieldInfo(fieldNumber); // read field number JVectorWriter.VectorIndexFieldMetadata vectorIndexFieldMetadata = new JVectorWriter.VectorIndexFieldMetadata(meta); - assert fieldInfo.number == vectorIndexFieldMetadata.getFieldNumber(); + assert fieldInfo.number == vectorIndexFieldMetadata.fieldNumber; fieldEntryMap.put(fieldInfo.name, new FieldEntry(fieldInfo, vectorIndexFieldMetadata)); } } @@ -256,14 +256,14 @@ public FieldEntry( throws IOException { this.similarityFunction = VectorSimilarityMapper.ordToDistFunc( - vectorIndexFieldMetadata.getVectorSimilarityFunction().ordinal()); - this.vectorEncoding = vectorIndexFieldMetadata.getVectorEncoding(); - this.vectorIndexOffset = vectorIndexFieldMetadata.getVectorIndexOffset(); - this.vectorIndexLength = vectorIndexFieldMetadata.getVectorIndexLength(); - this.pqCodebooksAndVectorsLength = vectorIndexFieldMetadata.getPqCodebooksAndVectorsLength(); - this.pqCodebooksAndVectorsOffset = vectorIndexFieldMetadata.getPqCodebooksAndVectorsOffset(); - this.dimension = vectorIndexFieldMetadata.getVectorDimension(); - this.graphNodeIdToDocMap = vectorIndexFieldMetadata.getGraphNodeIdToDocMap(); + vectorIndexFieldMetadata.vectorSimilarityFunction.ordinal()); + this.vectorEncoding = vectorIndexFieldMetadata.vectorEncoding; + this.vectorIndexOffset = vectorIndexFieldMetadata.vectorIndexOffset; + this.vectorIndexLength = vectorIndexFieldMetadata.vectorIndexLength; + this.pqCodebooksAndVectorsLength = vectorIndexFieldMetadata.pqCodebooksAndVectorsLength; + this.pqCodebooksAndVectorsOffset = vectorIndexFieldMetadata.pqCodebooksAndVectorsOffset; + this.dimension = vectorIndexFieldMetadata.vectorDimension; + this.graphNodeIdToDocMap = vectorIndexFieldMetadata.graphNodeIdToDocMap; this.vectorIndexFieldDataFileName = baseDataFileName + "_" + fieldInfo.name + "." 
+ JVectorFormat.VECTOR_INDEX_EXTENSION; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java index 9b9e8e7cfdb9..8c4679495f91 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java @@ -40,9 +40,6 @@ import java.util.concurrent.ForkJoinPool; import java.util.function.Function; import java.util.stream.IntStream; -import lombok.AllArgsConstructor; -import lombok.Builder; -import lombok.Value; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.KnnFieldVectorsWriter; import org.apache.lucene.codecs.KnnVectorsReader; @@ -328,15 +325,6 @@ private VectorIndexFieldMetadata writeGraph( segmentWriteState.segmentInfo.getId(), segmentWriteState.segmentSuffix); final long startOffset = indexOutput.getFilePointer(); - - var resultBuilder = - VectorIndexFieldMetadata.builder() - .fieldNumber(fieldInfo.number) - .vectorEncoding(fieldInfo.getVectorEncoding()) - .vectorSimilarityFunction(fieldInfo.getVectorSimilarityFunction()) - .vectorDimension(randomAccessVectorValues.dimension()) - .graphNodeIdToDocMap(graphNodeIdToDocMap); - try (var writer = new OnDiskSequentialGraphIndexWriter.Builder(graph, jVectorIndexWriter) .with(new InlineVectors(randomAccessVectorValues.dimension())) @@ -348,25 +336,35 @@ private VectorIndexFieldMetadata writeGraph( new InlineVectors.State( randomAccessVectorValues.getVector(newToOldOrds[nodeId]))); writer.write(suppliers); - long endGraphOffset = jVectorIndexWriter.position(); - resultBuilder.vectorIndexOffset(startOffset); - resultBuilder.vectorIndexLength(endGraphOffset - startOffset); + final long endGraphOffset = jVectorIndexWriter.position(); // If PQ is enabled and we have enough vectors, write the PQ codebooks and compressed // vectors + final long pqOffset; + final long pqLength; if (pqVectors != null) { - resultBuilder.pqCodebooksAndVectorsOffset(endGraphOffset); + pqOffset = endGraphOffset; // write the compressed vectors and codebooks to disk pqVectors.write(jVectorIndexWriter); - resultBuilder.pqCodebooksAndVectorsLength(jVectorIndexWriter.position() - endGraphOffset); + pqLength = jVectorIndexWriter.position() - endGraphOffset; } else { - resultBuilder.pqCodebooksAndVectorsOffset(0); - resultBuilder.pqCodebooksAndVectorsLength(0); + pqOffset = 0; + pqLength = 0; } CodecUtil.writeFooter(indexOutput); - } - return resultBuilder.build(); + return new VectorIndexFieldMetadata( + fieldInfo.number, + fieldInfo.getVectorEncoding(), + fieldInfo.getVectorSimilarityFunction(), + randomAccessVectorValues.dimension(), + startOffset, + endGraphOffset - startOffset, + pqOffset, + pqLength, + degreeOverflow, + graphNodeIdToDocMap); + } } } @@ -396,20 +394,40 @@ private PQVectors getPQVectors( return pqVectors; } - @Value - @Builder(toBuilder = true) - @AllArgsConstructor public static class VectorIndexFieldMetadata { - int fieldNumber; - VectorEncoding vectorEncoding; - VectorSimilarityFunction vectorSimilarityFunction; - int vectorDimension; - long vectorIndexOffset; - long vectorIndexLength; - long pqCodebooksAndVectorsOffset; - long pqCodebooksAndVectorsLength; - float degreeOverflow; // important when leveraging cache - GraphNodeIdToDocMap graphNodeIdToDocMap; + final int fieldNumber; + final VectorEncoding vectorEncoding; + final VectorSimilarityFunction vectorSimilarityFunction; + final int 
vectorDimension; + final long vectorIndexOffset; + final long vectorIndexLength; + final long pqCodebooksAndVectorsOffset; + final long pqCodebooksAndVectorsLength; + final float degreeOverflow; // important when leveraging cache + final GraphNodeIdToDocMap graphNodeIdToDocMap; + + public VectorIndexFieldMetadata( + int fieldNumber, + VectorEncoding vectorEncoding, + VectorSimilarityFunction vectorSimilarityFunction, + int vectorDimension, + long vectorIndexOffset, + long vectorIndexLength, + long pqCodebooksAndVectorsOffset, + long pqCodebooksAndVectorsLength, + float degreeOverflow, + GraphNodeIdToDocMap graphNodeIdToDocMap) { + this.fieldNumber = fieldNumber; + this.vectorEncoding = vectorEncoding; + this.vectorSimilarityFunction = vectorSimilarityFunction; + this.vectorDimension = vectorDimension; + this.vectorIndexOffset = vectorIndexOffset; + this.vectorIndexLength = vectorIndexLength; + this.pqCodebooksAndVectorsOffset = pqCodebooksAndVectorsOffset; + this.pqCodebooksAndVectorsLength = pqCodebooksAndVectorsLength; + this.degreeOverflow = degreeOverflow; + this.graphNodeIdToDocMap = graphNodeIdToDocMap; + } public void toOutput(IndexOutput out) throws IOException { out.writeInt(fieldNumber); From 384cde8549ea7e12f02ec4805cfdc07587fc093b Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Wed, 29 Oct 2025 17:15:11 +0000 Subject: [PATCH 15/86] Fix illegal access to PerFieldKnnVectorsFormat.FieldReader --- .../apache/lucene/sandbox/codecs/jvector/JVectorWriter.java | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java index 8c4679495f91..957d5f2acfa5 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java @@ -44,7 +44,6 @@ import org.apache.lucene.codecs.KnnFieldVectorsWriter; import org.apache.lucene.codecs.KnnVectorsReader; import org.apache.lucene.codecs.KnnVectorsWriter; -import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat; import org.apache.lucene.index.*; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.store.*; @@ -858,9 +857,8 @@ public void merge() throws IOException { final PQVectors pqVectors; final OnHeapGraphIndex graph; // Get the leading reader - PerFieldKnnVectorsFormat.FieldsReader fieldsReader = - (PerFieldKnnVectorsFormat.FieldsReader) readers[LEADING_READER_IDX]; - JVectorReader leadingReader = (JVectorReader) fieldsReader.getFieldReader(fieldName); + final JVectorReader leadingReader = + (JVectorReader) readers[LEADING_READER_IDX].unwrapReaderForField(fieldName); final BuildScoreProvider buildScoreProvider; // Check if the leading reader has pre-existing PQ codebooks and if so, refine them with the // remaining vectors From 4a5639280d031e5d25eb772265d1cb523684e3b9 Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Wed, 29 Oct 2025 17:20:47 +0000 Subject: [PATCH 16/86] Fix references to getCodec --- .../sandbox/codecs/jvector/KNNJVectorTests.java | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/jvector/KNNJVectorTests.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/jvector/KNNJVectorTests.java index de9554f6bcff..764d99c49274 100644 --- 
a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/jvector/KNNJVectorTests.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/jvector/KNNJVectorTests.java @@ -18,7 +18,6 @@ package org.apache.lucene.sandbox.codecs.jvector; import static org.apache.lucene.sandbox.codecs.jvector.JVectorFormat.DEFAULT_MINIMUM_BATCH_SIZE_FOR_QUANTIZATION; -import static org.opensearch.knn.index.engine.CommonTestUtils.getCodec; import com.carrotsearch.randomizedtesting.annotations.ThreadLeakFilters; import java.io.IOException; @@ -27,6 +26,7 @@ import java.util.concurrent.*; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicInteger; +import org.apache.lucene.codecs.Codec; import org.apache.lucene.document.*; import org.apache.lucene.index.*; import org.apache.lucene.search.*; @@ -34,6 +34,7 @@ import org.apache.lucene.store.FSDirectory; import org.apache.lucene.tests.index.RandomIndexWriter; import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.tests.util.TestUtil; import org.junit.Assert; import org.junit.Test; import org.opensearch.knn.TestUtils; @@ -1561,4 +1562,12 @@ private static Set calculateGroundTruthVectorsIds( return groundTruthVectorsIds; } + + private Codec getCodec() { + return getCodec(JVectorFormat.DEFAULT_MINIMUM_BATCH_SIZE_FOR_QUANTIZATION); + } + + private Codec getCodec(final int minimumBatchSizeForQuantization) { + return TestUtil.alwaysKnnVectorsFormat(new JVectorFormat(minimumBatchSizeForQuantization)); + } } From 585e7772c110ab5de85836bfd714eb6c61ed76c8 Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Wed, 29 Oct 2025 19:34:01 +0000 Subject: [PATCH 17/86] Fix references to TestUtils.generateRandomVectors --- .../codecs/jvector/KNNJVectorTests.java | 32 ++++++++++++------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/jvector/KNNJVectorTests.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/jvector/KNNJVectorTests.java index 764d99c49274..f9d3e5f756ae 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/jvector/KNNJVectorTests.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/jvector/KNNJVectorTests.java @@ -37,7 +37,6 @@ import org.apache.lucene.tests.util.TestUtil; import org.junit.Assert; import org.junit.Test; -import org.opensearch.knn.TestUtils; import org.opensearch.knn.index.ThreadLeakFiltersForTests; /** Test used specifically for JVector */ @@ -506,8 +505,8 @@ public void testJVectorKnnIndex_multiple_merges_large_batches_no_quantization() int k = 3; // The number of nearest neighbors to gather final int dimension = 2; final VectorSimilarityFunction vectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN; - final float[] target = TestUtils.generateRandomVectors(1, dimension)[0]; - final float[][] source = TestUtils.generateRandomVectors(totalNumberOfDocs, dimension); + final float[] target = generateRandomVectors(1, dimension)[0]; + final float[][] source = generateRandomVectors(totalNumberOfDocs, dimension); final Set groundTruthVectorsIds = calculateGroundTruthVectorsIds(target, source, k, vectorSimilarityFunction); @@ -586,7 +585,7 @@ public void testLuceneKnnIndex_multipleMerges_with_ordering_check() final String floatVectorField = "vec"; final String expectedDocIdField = "expectedDocId"; final Path indexPath = createTempDir(); - final float[][] sourceVectors = TestUtils.generateRandomVectors(numDocs, 2); + final float[][] sourceVectors = 
generateRandomVectors(numDocs, 2); final VectorSimilarityFunction vectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN; try (Directory dir = newFSDirectory(indexPath)) { @@ -660,7 +659,7 @@ public void testLuceneKnnIndex_multipleMerges_with_ordering_check() final FloatVectorValues vectorValues = context.reader().getFloatVectorValues("vec"); final int k = 1; for (int i = 0; i < reader.maxDoc(); i++) { - float[] query = TestUtils.generateRandomVectors(1, 2)[0]; + float[] query = generateRandomVectors(1, 2)[0]; TopDocs td = searcher.search( getJVectorKnnFloatVectorQuery("vec", query, k, new MatchAllDocsQuery()), k); @@ -687,7 +686,7 @@ public void testLuceneKnnIndex_multipleMerges_with_ordering_check() try { for (i = 0; i < queriesPerThread && !failureDetected.get(); i++) { - float[] query = TestUtils.generateRandomVectors(1, 2)[0]; + float[] query = generateRandomVectors(1, 2)[0]; try { TopDocs td = searcher.search(new KnnFloatVectorQuery("vec", query, k), k); assertEquals( @@ -1057,7 +1056,7 @@ public void testJVectorKnnIndex_simpleCase_withQuantization() throws IOException try (FSDirectory dir = FSDirectory.open(indexPath); IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { final float[] target = generateZerosVectorWithLastValue(dimension, 0); - final float[][] vectors = TestUtils.generateRandomVectors(totalNumberOfDocs, dimension); + final float[][] vectors = generateRandomVectors(totalNumberOfDocs, dimension); final Set groundTruthVectorsIds = calculateGroundTruthVectorsIds(target, vectors, k, vectorSimilarityFunction); for (int i = 0; i < vectors.length; i++) { @@ -1187,7 +1186,7 @@ public void testJVectorKnnIndex_happyCase_withQuantization_multipleSegments() th try (FSDirectory dir = FSDirectory.open(indexPath); IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { final float[] target = generateZerosVectorWithLastValue(dimension, 0); - final float[][] vectors = TestUtils.generateRandomVectors(totalNumberOfDocs, dimension); + final float[][] vectors = generateRandomVectors(totalNumberOfDocs, dimension); final Set groundTruthVectorsIds = calculateGroundTruthVectorsIds(target, vectors, k, vectorSimilarityFunction); @@ -1260,7 +1259,7 @@ public void testJVectorKnnIndex_mixedBatchSizes_withQuantization_multipleMerges( try (FSDirectory dir = FSDirectory.open(indexPath); IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { final float[] target = generateZerosVectorWithLastValue(dimension, 0); - final float[][] vectors = TestUtils.generateRandomVectors(totalNumberOfDocs, dimension); + final float[][] vectors = generateRandomVectors(totalNumberOfDocs, dimension); final Set groundTruthVectorsIds = calculateGroundTruthVectorsIds(target, vectors, k, vectorSimilarityFunction); for (int i = 0; i < totalNumberOfDocs; i++) { @@ -1337,7 +1336,7 @@ public void testJVectorKnnIndex_mixedBatchSizes_withQuantization_multipleMerges_ final float[] target = generateZerosVectorWithLastValue(dimension, 0); // We will use random vectors because otherwise PQ will have a correlated subspaces which will // result in a broken linear graph - final float[][] vectors = TestUtils.generateRandomVectors(totalNumberOfDocs, dimension); + final float[][] vectors = generateRandomVectors(totalNumberOfDocs, dimension); final Set groundTruthVectorsIds = calculateGroundTruthVectorsIds(target, vectors, k, vectorSimilarityFunction); for (int i = 0; i < totalNumberOfDocs; i++) { @@ -1413,7 +1412,7 @@ public void testJVectorKnnIndex_withQuantization_withCompoundFile_with_refinemen final float[] target = 
generateZerosVectorWithLastValue(dimension, 0); // We will use random vectors because otherwise PQ will have a correlated subspaces which will // result in a broken linear graph - final float[][] vectors = TestUtils.generateRandomVectors(totalNumberOfDocs, dimension); + final float[][] vectors = generateRandomVectors(totalNumberOfDocs, dimension); final Set groundTruthVectorsIds = calculateGroundTruthVectorsIds(target, vectors, k, vectorSimilarityFunction); for (int i = 0; i < totalNumberOfDocs; i++) { @@ -1563,6 +1562,17 @@ private static Set calculateGroundTruthVectorsIds( return groundTruthVectorsIds; } + static float[][] generateRandomVectors(int count, int dimension) { + final var rng = nonAssertingRandom(random()); + final float[][] vectors = new float[count][dimension]; + for (int i = 0; i < vectors.length; ++i) { + for (int j = 0; j < vectors[i].length; ++j) { + vectors[i][j] = rng.nextFloat(); + } + } + return vectors; + } + private Codec getCodec() { return getCodec(JVectorFormat.DEFAULT_MINIMUM_BATCH_SIZE_FOR_QUANTIZATION); } From 8ddcddb19bf624077a94a422be0e205c3ee019a7 Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Wed, 29 Oct 2025 19:14:03 +0000 Subject: [PATCH 18/86] Fix ThreadLeakFilters in test --- .../sandbox/codecs/jvector/KNNJVectorTests.java | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/jvector/KNNJVectorTests.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/jvector/KNNJVectorTests.java index f9d3e5f756ae..f5093abd9770 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/jvector/KNNJVectorTests.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/jvector/KNNJVectorTests.java @@ -19,6 +19,7 @@ import static org.apache.lucene.sandbox.codecs.jvector.JVectorFormat.DEFAULT_MINIMUM_BATCH_SIZE_FOR_QUANTIZATION; +import com.carrotsearch.randomizedtesting.ThreadFilter; import com.carrotsearch.randomizedtesting.annotations.ThreadLeakFilters; import java.io.IOException; import java.nio.file.Path; @@ -37,7 +38,6 @@ import org.apache.lucene.tests.util.TestUtil; import org.junit.Assert; import org.junit.Test; -import org.opensearch.knn.index.ThreadLeakFiltersForTests; /** Test used specifically for JVector */ // Currently {@link IndexGraphBuilder} is using the default ForkJoinPool.commonPool() which is not @@ -47,7 +47,7 @@ // due to leaked thread pool warning. 
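For readers unfamiliar with TestUtil.alwaysKnnVectorsFormat (used by the getCodec helpers introduced a couple of patches earlier), the sketch below shows roughly how such a codec is wired into a test writer. It is a minimal illustration that assumes the imports already present in KNNJVectorTests; it is not an excerpt from the patch.

// Sketch: index a single float vector with the JVector format forced for all fields.
void indexOneVectorSketch(java.nio.file.Path tempDir) throws java.io.IOException {
  IndexWriterConfig cfg = new IndexWriterConfig();
  cfg.setCodec(TestUtil.alwaysKnnVectorsFormat(new JVectorFormat()));
  try (FSDirectory dir = FSDirectory.open(tempDir);
      IndexWriter writer = new IndexWriter(dir, cfg)) {
    Document doc = new Document();
    doc.add(
        new KnnFloatVectorField(
            "test_field", new float[] {0.0f, 1.0f}, VectorSimilarityFunction.EUCLIDEAN));
    writer.addDocument(doc);
    writer.commit(); // flush so the segment (and its JVector graph) reaches the directory
  }
}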
@ThreadLeakFilters( defaultFilters = true, - filters = {ThreadLeakFiltersForTests.class}) + filters = {KNNJVectorTests.ThreadLeakFilter.class}) public class KNNJVectorTests extends LuceneTestCase { private static final String TEST_FIELD = "test_field"; private static final String TEST_ID_FIELD = "id"; @@ -1580,4 +1580,11 @@ private Codec getCodec() { private Codec getCodec(final int minimumBatchSizeForQuantization) { return TestUtil.alwaysKnnVectorsFormat(new JVectorFormat(minimumBatchSizeForQuantization)); } + + public static class ThreadLeakFilter implements ThreadFilter { + @Override + public boolean reject(Thread thread) { + return thread.getName().contains("ForkJoinPool"); + } + } } From 25cd540a7504df6789831a1da196e3542e3ba6c2 Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Wed, 29 Oct 2025 17:31:59 +0000 Subject: [PATCH 19/86] Fix missing @Override --- .../lucene/sandbox/codecs/jvector/JVectorFloatVectorValues.java | 1 + .../sandbox/codecs/jvector/JVectorRandomAccessReader.java | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFloatVectorValues.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFloatVectorValues.java index c4039c6d12b9..bc34d9141463 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFloatVectorValues.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFloatVectorValues.java @@ -64,6 +64,7 @@ public VectorFloat vectorFloatValue(int ord) { return view.getVector(ord); } + @Override public DocIndexIterator iterator() { return new DocIndexIterator() { private int docId = -1; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorRandomAccessReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorRandomAccessReader.java index 97f7cec66dec..6c1519a3b04e 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorRandomAccessReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorRandomAccessReader.java @@ -59,7 +59,7 @@ public float readFloat() throws IOException { } // TODO: bring back to override when upgrading jVector again - // @Override + @Override public long readLong() throws IOException { return indexInputDelegate.readLong(); } From ee5ed2d71399ed08a3934cf470a4864ba38fdaae Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Wed, 29 Oct 2025 17:42:24 +0000 Subject: [PATCH 20/86] Remove unused members --- .../codecs/jvector/JVectorRandomAccessReader.java | 3 --- .../lucene/sandbox/codecs/jvector/JVectorReader.java | 5 ----- .../lucene/sandbox/codecs/jvector/JVectorWriter.java | 3 --- .../sandbox/codecs/jvector/KNNJVectorTests.java | 11 ----------- 4 files changed, 22 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorRandomAccessReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorRandomAccessReader.java index 6c1519a3b04e..c3017aca8ffa 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorRandomAccessReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorRandomAccessReader.java @@ -30,9 +30,7 @@ public class JVectorRandomAccessReader implements RandomAccessReader { private final byte[] internalBuffer = new byte[Long.BYTES]; - private final byte[] internalFloatBuffer = new byte[Float.BYTES]; private final IndexInput 
indexInputDelegate; - private volatile boolean closed = false; public JVectorRandomAccessReader(IndexInput indexInputDelegate) { this.indexInputDelegate = indexInputDelegate; @@ -119,7 +117,6 @@ public void read(float[] floats, int offset, int count) throws IOException { @Override public void close() throws IOException { - this.closed = true; // no need to really close the index input delegate since it is a clone } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java index 9ac1234fb258..21e6fd918fe1 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java @@ -234,10 +234,7 @@ private void readFields(ChecksumIndexInput meta) throws IOException { } class FieldEntry implements Closeable { - private final FieldInfo fieldInfo; - private final VectorEncoding vectorEncoding; private final VectorSimilarityFunction similarityFunction; - private final int dimension; private final long vectorIndexOffset; private final long vectorIndexLength; private final long pqCodebooksAndVectorsLength; @@ -257,12 +254,10 @@ public FieldEntry( this.similarityFunction = VectorSimilarityMapper.ordToDistFunc( vectorIndexFieldMetadata.vectorSimilarityFunction.ordinal()); - this.vectorEncoding = vectorIndexFieldMetadata.vectorEncoding; this.vectorIndexOffset = vectorIndexFieldMetadata.vectorIndexOffset; this.vectorIndexLength = vectorIndexFieldMetadata.vectorIndexLength; this.pqCodebooksAndVectorsLength = vectorIndexFieldMetadata.pqCodebooksAndVectorsLength; this.pqCodebooksAndVectorsOffset = vectorIndexFieldMetadata.pqCodebooksAndVectorsOffset; - this.dimension = vectorIndexFieldMetadata.vectorDimension; this.graphNodeIdToDocMap = vectorIndexFieldMetadata.graphNodeIdToDocMap; this.vectorIndexFieldDataFileName = diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java index 957d5f2acfa5..55c1fa163d79 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java @@ -584,9 +584,6 @@ class RandomAccessMergedFloatVectorValues implements RandomAccessVectorValues { private static final int READER_ORD = 1; private static final int LEADING_READER_IDX = 0; - private final VectorTypeSupport VECTOR_TYPE_SUPPORT = - VectorizationProvider.getInstance().getVectorTypeSupport(); - // Array of sub-readers private final KnnVectorsReader[] readers; private final JVectorFloatVectorValues[] perReaderFloatVectorValues; diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/jvector/KNNJVectorTests.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/jvector/KNNJVectorTests.java index f5093abd9770..de4ae5283371 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/jvector/KNNJVectorTests.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/jvector/KNNJVectorTests.java @@ -959,8 +959,6 @@ public void testLuceneKnnIndex_mergeEnabled_withCompoundFile_cosine() throws IOE */ @Test public void testJVectorKnnIndex_simpleCase_withBinaryVector() throws IOException { - int k = 3; // The number of nearest neighbours to gather - int totalNumberOfDocs = 10; IndexWriterConfig indexWriterConfig = 
LuceneTestCase.newIndexWriterConfig(); // TODO: re-enable this after fixing the compound file augmentation for JVector indexWriterConfig.setUseCompoundFile(false); @@ -1487,15 +1485,6 @@ private JVectorKnnFloatVectorQuery getJVectorKnnFloatVectorQuery( JVectorReader.DEFAULT_QUERY_USE_PRUNING); } - private static float[][] getMonotonicallyIncreasingVectors(int numVectors, int vectorDimension) { - float[][] vectors = new float[numVectors][vectorDimension]; - for (int i = 0; i < numVectors; i++) { - vectors[i] = generateZerosVectorWithLastValue(vectorDimension, i); - } - - return vectors; - } - private static float[] generateZerosVectorWithLastValue(int vectorDimension, int lastValue) { float[] vector = new float[vectorDimension]; for (int i = 0; i < vectorDimension - 1; i++) { From 3c47063c097b96a5fd5d797a57b3ee87918635f3 Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Wed, 29 Oct 2025 17:48:42 +0000 Subject: [PATCH 21/86] Fix unqualified javadoc --- .../apache/lucene/sandbox/codecs/jvector/JVectorWriter.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java index 55c1fa163d79..a2bedb0cca31 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java @@ -80,8 +80,8 @@ * jVector ordinals and the new Lucene document IDs. This is achieved by keeping checkpoints of the * {@link GraphNodeIdToDocMap} class in the index metadata and allowing us to update the mapping as * needed across merges by constructing a new mapping from the previous mapping and the {@link - * MergeState.DocMap} provided in the {@link MergeState}. And across sorts with {@link - * GraphNodeIdToDocMap#update(Sorter.DocMap)} during flushes. + * org.apache.lucene.index.MergeState.DocMap} provided in the {@link MergeState}. And across sorts + * with {@link GraphNodeIdToDocMap#update(Sorter.DocMap)} during flushes. 
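The javadoc above describes the central bookkeeping concern: graph node ordinals stay attached to their nodes, while the Lucene doc IDs they point to change on merges and index sorts. A rough sketch of the remapping it describes, assuming an array-based ordinal-to-docId table purely for illustration (GraphNodeIdToDocMap's real representation is not shown in this hunk):

// Sketch: carry an ordinal -> docId table across a merge via the merge-time doc-id mapping.
static int[] remapThroughMerge(int[] oldOrdToDoc, java.util.function.IntUnaryOperator mergeDocMap) {
  int[] newOrdToDoc = new int[oldOrdToDoc.length];
  for (int ord = 0; ord < oldOrdToDoc.length; ord++) {
    // the node keeps its ordinal; only the Lucene doc id it maps to moves
    newOrdToDoc[ord] = mergeDocMap.applyAsInt(oldOrdToDoc[ord]);
  }
  return newOrdToDoc;
}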
*/ public class JVectorWriter extends KnnVectorsWriter { private static final long SHALLOW_RAM_BYTES_USED = From af5e0bedf3bb14fa264f626756b33d195cc3a8a1 Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Wed, 29 Oct 2025 17:49:43 +0000 Subject: [PATCH 22/86] Suppress cases-omitted from switch expression --- .../org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java | 1 + 1 file changed, 1 insertion(+) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java index a2bedb0cca31..8b0eacf0656b 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java @@ -557,6 +557,7 @@ static io.github.jbellis.jvector.vector.VectorSimilarityFunction getVectorSimila case EUCLIDEAN -> io.github.jbellis.jvector.vector.VectorSimilarityFunction.EUCLIDEAN; case COSINE -> io.github.jbellis.jvector.vector.VectorSimilarityFunction.COSINE; case DOT_PRODUCT -> io.github.jbellis.jvector.vector.VectorSimilarityFunction.DOT_PRODUCT; + // $CASES-OMITTED$ default -> throw new IllegalArgumentException( "Unsupported similarity function: " + fieldInfo.getVectorSimilarityFunction()); From 4b2beb89d4dcbcdf9afd13357174b70929e1692f Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Wed, 29 Oct 2025 18:56:55 +0000 Subject: [PATCH 23/86] Add basic javadocs for classes without --- .../sandbox/codecs/jvector/JVectorFloatVectorValues.java | 4 ++-- .../apache/lucene/sandbox/codecs/jvector/JVectorFormat.java | 1 + .../sandbox/codecs/jvector/JVectorRandomAccessReader.java | 1 + .../apache/lucene/sandbox/codecs/jvector/JVectorReader.java | 1 + .../lucene/sandbox/codecs/jvector/JVectorVectorScorer.java | 1 + .../apache/lucene/sandbox/codecs/jvector/JVectorWriter.java | 1 + 6 files changed, 7 insertions(+), 2 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFloatVectorValues.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFloatVectorValues.java index bc34d9141463..df9b71a385b8 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFloatVectorValues.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFloatVectorValues.java @@ -27,6 +27,7 @@ import org.apache.lucene.index.FloatVectorValues; import org.apache.lucene.search.VectorScorer; +/// Implements Lucene vector access over a JVector on-disk index public class JVectorFloatVectorValues extends FloatVectorValues { private static final VectorTypeSupport VECTOR_TYPE_SUPPORT = VectorizationProvider.getInstance().getVectorTypeSupport(); @@ -88,8 +89,7 @@ public int docID() { @Override public int nextDoc() throws IOException { // Advance to the next node docId starts from -1 which is why we need to increment docId by - // 1 "size" - // times + // 1 "size" times while (docId < size - 1) { docId++; if (liveNodes.get(docId)) { diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFormat.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFormat.java index 1d208d31415f..75aa58ba0181 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFormat.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFormat.java @@ -27,6 +27,7 @@ import org.apache.lucene.index.SegmentReadState; import 
org.apache.lucene.index.SegmentWriteState; +/// Implements K-NN search using JVector library for indexing public class JVectorFormat extends KnnVectorsFormat { public static final String NAME = "JVectorFormat"; public static final String META_CODEC_NAME = "JVectorVectorsFormatMeta"; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorRandomAccessReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorRandomAccessReader.java index c3017aca8ffa..de87f451f5c8 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorRandomAccessReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorRandomAccessReader.java @@ -28,6 +28,7 @@ import org.apache.lucene.store.IndexInput; import org.apache.lucene.util.IOUtils; +/// Implements JVector reader capabilities over a Lucene IndexInput public class JVectorRandomAccessReader implements RandomAccessReader { private final byte[] internalBuffer = new byte[Long.BYTES]; private final IndexInput indexInputDelegate; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java index 21e6fd918fe1..753d321a6429 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java @@ -46,6 +46,7 @@ import org.apache.lucene.store.*; import org.apache.lucene.util.IOUtils; +/// Implements KnnVectorsReader over an on-disk JVector index serialized using {@link JVectorWriter} public class JVectorReader extends KnnVectorsReader { public static final float DEFAULT_QUERY_SIMILARITY_THRESHOLD = 0f; public static final float DEFAULT_QUERY_RERANK_FLOOR = 0f; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorVectorScorer.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorVectorScorer.java index cc6f3e6d6bff..8c9006dd0901 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorVectorScorer.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorVectorScorer.java @@ -24,6 +24,7 @@ import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.VectorScorer; +/// Implements Lucene scoring over a JVector index public class JVectorVectorScorer implements VectorScorer { private final JVectorFloatVectorValues floatVectorValues; private final KnnVectorValues.DocIndexIterator docIndexIterator; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java index 8b0eacf0656b..0bf0ea618370 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java @@ -393,6 +393,7 @@ private PQVectors getPQVectors( return pqVectors; } + /// Metadata about the index to be persisted on disk public static class VectorIndexFieldMetadata { final int fieldNumber; final VectorEncoding vectorEncoding; From f97bfc51394d7bd1cdd36930d7982829942630cc Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Wed, 29 Oct 2025 19:03:33 +0000 Subject: [PATCH 24/86] Fix forbiddenApis error --- .../apache/lucene/sandbox/codecs/jvector/KNNJVectorTests.java | 4 +++- 1 file changed, 3 insertions(+), 1 
deletion(-) diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/jvector/KNNJVectorTests.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/jvector/KNNJVectorTests.java index de4ae5283371..1931d0a4e2d0 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/jvector/KNNJVectorTests.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/jvector/KNNJVectorTests.java @@ -36,6 +36,7 @@ import org.apache.lucene.tests.index.RandomIndexWriter; import org.apache.lucene.tests.util.LuceneTestCase; import org.apache.lucene.tests.util.TestUtil; +import org.apache.lucene.util.NamedThreadFactory; import org.junit.Assert; import org.junit.Test; @@ -673,7 +674,8 @@ public void testLuceneKnnIndex_multipleMerges_with_ordering_check() // not exhausting the file handles int numThreads = 10; // Number of concurrent search threads int queriesPerThread = 100; // Number of searches per thread - ExecutorService executor = Executors.newFixedThreadPool(numThreads); + ExecutorService executor = + Executors.newFixedThreadPool(numThreads, new NamedThreadFactory("KNNJVectorTests")); CountDownLatch latch = new CountDownLatch(numThreads); AtomicBoolean failureDetected = new AtomicBoolean(false); AtomicInteger totalQueries = new AtomicInteger(0); From 17e211f5d5b6b498c0ea4a7762a9068a5bc349b2 Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Wed, 29 Oct 2025 19:05:51 +0000 Subject: [PATCH 25/86] Rename KNNJVectorTests --- .../jvector/{KNNJVectorTests.java => TestJVectorFormat.java} | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) rename lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/jvector/{KNNJVectorTests.java => TestJVectorFormat.java} (99%) diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/jvector/KNNJVectorTests.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/jvector/TestJVectorFormat.java similarity index 99% rename from lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/jvector/KNNJVectorTests.java rename to lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/jvector/TestJVectorFormat.java index 1931d0a4e2d0..0524a24af7a0 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/jvector/KNNJVectorTests.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/jvector/TestJVectorFormat.java @@ -48,8 +48,8 @@ // due to leaked thread pool warning. 
@ThreadLeakFilters( defaultFilters = true, - filters = {KNNJVectorTests.ThreadLeakFilter.class}) -public class KNNJVectorTests extends LuceneTestCase { + filters = {TestJVectorFormat.ThreadLeakFilter.class}) +public class TestJVectorFormat extends LuceneTestCase { private static final String TEST_FIELD = "test_field"; private static final String TEST_ID_FIELD = "id"; From c2c8082fd158f593c8043288f85c8e77aa128fa9 Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Wed, 29 Oct 2025 19:37:04 +0000 Subject: [PATCH 26/86] Fix missing @Test annotations --- .../apache/lucene/sandbox/codecs/jvector/TestJVectorFormat.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/jvector/TestJVectorFormat.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/jvector/TestJVectorFormat.java index 0524a24af7a0..18ac79e1da3f 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/jvector/TestJVectorFormat.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/jvector/TestJVectorFormat.java @@ -110,6 +110,7 @@ public void testJVectorKnnIndex_simpleCase() throws IOException { } /** Test the scenario when not all documents are populated with the vector field */ + @Test public void testMissing_fields() throws IOException { final int k = 3; // The number of nearest neighbors to gather final int totalNumberOfDocs = 10; @@ -170,6 +171,7 @@ public void testMissing_fields() throws IOException { * * @throws IOException if an I/O error occurs */ + @Test public void test_sorted_index() throws IOException { final int k = 3; // The number of nearest neighbors to gather final int totalNumberOfDocs = 10; From d26e1f67a37720922eaf4e0f804226f84e2019e6 Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Wed, 29 Oct 2025 20:11:26 +0000 Subject: [PATCH 27/86] Use JVectorSearchStrategy to plumb search parameters to JVectorReader --- .../codecs/jvector/JVectorKnnCollector.java | 91 ------------- .../jvector/JVectorKnnFloatVectorQuery.java | 94 -------------- .../sandbox/codecs/jvector/JVectorReader.java | 38 ++---- .../codecs/jvector/JVectorSearchStrategy.java | 121 ++++++++++++++++++ .../codecs/jvector/TestJVectorFormat.java | 97 ++++++-------- 5 files changed, 177 insertions(+), 264 deletions(-) delete mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorKnnCollector.java delete mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorKnnFloatVectorQuery.java create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorSearchStrategy.java diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorKnnCollector.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorKnnCollector.java deleted file mode 100644 index d2fad6532570..000000000000 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorKnnCollector.java +++ /dev/null @@ -1,91 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.lucene.sandbox.codecs.jvector; - -import org.apache.lucene.search.KnnCollector; -import org.apache.lucene.search.TopDocs; -import org.apache.lucene.search.knn.KnnSearchStrategy; - -/** - * Wrapper class for KnnCollector that provides passing of additional parameters specific for - * JVector. - */ -public class JVectorKnnCollector implements KnnCollector { - final KnnCollector delegate; - final float threshold; - final float rerankFloor; - final int overQueryFactor; - final boolean usePruning; - - public JVectorKnnCollector( - KnnCollector delegate, - float threshold, - float rerankFloor, - int overQueryFactor, - boolean usePruning) { - this.delegate = delegate; - this.threshold = threshold; - this.rerankFloor = rerankFloor; - this.overQueryFactor = overQueryFactor; - this.usePruning = usePruning; - } - - @Override - public boolean earlyTerminated() { - return delegate.earlyTerminated(); - } - - @Override - public void incVisitedCount(int count) { - delegate.incVisitedCount(count); - } - - @Override - public long visitedCount() { - return delegate.visitedCount(); - } - - @Override - public long visitLimit() { - return delegate.visitLimit(); - } - - @Override - public int k() { - return delegate.k(); - } - - @Override - public boolean collect(int docId, float similarity) { - return delegate.collect(docId, similarity); - } - - @Override - public float minCompetitiveSimilarity() { - return delegate.minCompetitiveSimilarity(); - } - - @Override - public TopDocs topDocs() { - return delegate.topDocs(); - } - - @Override - public KnnSearchStrategy getSearchStrategy() { - return delegate.getSearchStrategy(); - } -} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorKnnFloatVectorQuery.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorKnnFloatVectorQuery.java deleted file mode 100644 index 50246250ad60..000000000000 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorKnnFloatVectorQuery.java +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.lucene.sandbox.codecs.jvector; - -import java.io.IOException; -import org.apache.lucene.index.FloatVectorValues; -import org.apache.lucene.index.LeafReader; -import org.apache.lucene.index.LeafReaderContext; -import org.apache.lucene.search.*; -import org.apache.lucene.search.knn.KnnCollectorManager; -import org.apache.lucene.search.knn.KnnSearchStrategy; - -/** - * {@link KnnFloatVectorQuery} that uses jVector to perform the search. We use this wrapper simply - * because we can't pass jVector specific parameters with the upstream {@link KnnFloatVectorQuery}. - */ -public class JVectorKnnFloatVectorQuery extends KnnFloatVectorQuery { - private static final TopDocs NO_RESULTS = TopDocsCollector.EMPTY_TOPDOCS; - private final int overQueryFactor; - private final float threshold; - private final float rerankFloor; - private final boolean usePruning; - - public JVectorKnnFloatVectorQuery( - String field, - float[] target, - int k, - int overQueryFactor, - float threshold, - float rerankFloor, - boolean usePruning) { - super(field, target, k); - this.overQueryFactor = overQueryFactor; - this.threshold = threshold; - this.rerankFloor = rerankFloor; - this.usePruning = usePruning; - } - - public JVectorKnnFloatVectorQuery( - String field, - float[] target, - int k, - Query filter, - int overQueryFactor, - float threshold, - float rerankFloor, - boolean usePruning) { - super(field, target, k, filter); - this.overQueryFactor = overQueryFactor; - this.threshold = threshold; - this.rerankFloor = rerankFloor; - this.usePruning = usePruning; - } - - @Override - protected TopDocs approximateSearch( - LeafReaderContext context, - AcceptDocs acceptDocs, - int visitedLimit, - KnnCollectorManager knnCollectorManager) - throws IOException { - final KnnCollector delegateCollector = - knnCollectorManager.newCollector(visitedLimit, KnnSearchStrategy.Hnsw.DEFAULT, context); - final KnnCollector knnCollector = - new JVectorKnnCollector( - delegateCollector, threshold, rerankFloor, overQueryFactor, usePruning); - LeafReader reader = context.reader(); - FloatVectorValues floatVectorValues = reader.getFloatVectorValues(field); - if (floatVectorValues == null) { - FloatVectorValues.checkField(reader, field); - return NO_RESULTS; - } - if (Math.min(knnCollector.k(), floatVectorValues.size()) == 0) { - return NO_RESULTS; - } - reader.searchNearestVectors(field, getTargetCopy(), knnCollector, acceptDocs); - TopDocs results = knnCollector.topDocs(); - return results != null ? 
results : NO_RESULTS; - } -} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java index 753d321a6429..a2abcda40c3d 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java @@ -43,16 +43,12 @@ import org.apache.lucene.index.*; import org.apache.lucene.search.AcceptDocs; import org.apache.lucene.search.KnnCollector; +import org.apache.lucene.search.knn.KnnSearchStrategy; import org.apache.lucene.store.*; import org.apache.lucene.util.IOUtils; /// Implements KnnVectorsReader over an on-disk JVector index serialized using {@link JVectorWriter} public class JVectorReader extends KnnVectorsReader { - public static final float DEFAULT_QUERY_SIMILARITY_THRESHOLD = 0f; - public static final float DEFAULT_QUERY_RERANK_FLOOR = 0f; - public static final int DEFAULT_OVER_QUERY_FACTOR = 3; - public static final boolean DEFAULT_QUERY_USE_PRUNING = false; - private static final VectorTypeSupport VECTOR_TYPE_SUPPORT = VectorizationProvider.getInstance().getVectorTypeSupport(); @@ -145,20 +141,14 @@ public OnDiskGraphIndex getOnDiskGraphIndex(String field) throws IOException { public void search(String field, float[] target, KnnCollector knnCollector, AcceptDocs acceptDocs) throws IOException { final OnDiskGraphIndex index = fieldEntryMap.get(field).index; - final JVectorKnnCollector jvectorKnnCollector; - if (knnCollector instanceof JVectorKnnCollector) { - jvectorKnnCollector = (JVectorKnnCollector) knnCollector; - } else { - // KnnCollector must be of type JVectorKnnCollector, for now we will re-wrap it but this is - // not ideal - jvectorKnnCollector = - new JVectorKnnCollector( - knnCollector, - DEFAULT_QUERY_SIMILARITY_THRESHOLD, - DEFAULT_QUERY_RERANK_FLOOR, - DEFAULT_OVER_QUERY_FACTOR, - DEFAULT_QUERY_USE_PRUNING); - } + + final JVectorSearchStrategy searchStrategy; + if (knnCollector.getSearchStrategy() instanceof JVectorSearchStrategy strategy) { + searchStrategy = strategy; + } else if (knnCollector.getSearchStrategy() instanceof KnnSearchStrategy.Seeded seeded + && seeded.originalStrategy() instanceof JVectorSearchStrategy strategy) { + searchStrategy = strategy; + } else searchStrategy = JVectorSearchStrategy.DEFAULT; // search for a random vector using a GraphSearcher and SearchScoreProvider VectorFloat q = VECTOR_TYPE_SUPPORT.createFloatVector(target); @@ -197,13 +187,13 @@ public void search(String field, float[] target, KnnCollector knnCollector, Acce final var searchResults = graphSearcher.search( ssp, - jvectorKnnCollector.k(), - jvectorKnnCollector.k() * jvectorKnnCollector.overQueryFactor, - jvectorKnnCollector.threshold, - jvectorKnnCollector.rerankFloor, + knnCollector.k(), + knnCollector.k() * searchStrategy.overQueryFactor, + searchStrategy.threshold, + searchStrategy.rerankFloor, compatibleBits); for (SearchResult.NodeScore ns : searchResults.getNodes()) { - jvectorKnnCollector.collect(jvectorLuceneDocMap.getLuceneDocId(ns.node), ns.score); + knnCollector.collect(jvectorLuceneDocMap.getLuceneDocId(ns.node), ns.score); } } } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorSearchStrategy.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorSearchStrategy.java new file mode 100644 index 000000000000..1f713a8b214b --- /dev/null +++ 
b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorSearchStrategy.java @@ -0,0 +1,121 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.jvector; + +import java.util.Locale; +import java.util.Objects; +import org.apache.lucene.search.knn.KnnSearchStrategy; + +/// Defines query-time parameters for searching a JVector index to be passed into +/// [`search()`][JVectorReader#search] via [`KnnCollector`][org.apache.lucene.search.KnnCollector]. +public class JVectorSearchStrategy extends KnnSearchStrategy { + static final float DEFAULT_QUERY_SIMILARITY_THRESHOLD = 0f; + static final float DEFAULT_QUERY_RERANK_FLOOR = 0f; + static final int DEFAULT_OVER_QUERY_FACTOR = 3; + static final boolean DEFAULT_QUERY_USE_PRUNING = false; + + public static final JVectorSearchStrategy DEFAULT = + new JVectorSearchStrategy( + DEFAULT_QUERY_SIMILARITY_THRESHOLD, + DEFAULT_QUERY_RERANK_FLOOR, + DEFAULT_OVER_QUERY_FACTOR, + DEFAULT_QUERY_USE_PRUNING); + + final float threshold; + final float rerankFloor; + final int overQueryFactor; + final boolean usePruning; + + private JVectorSearchStrategy( + float threshold, float rerankFloor, int overQueryFactor, boolean usePruning) { + this.threshold = threshold; + this.rerankFloor = rerankFloor; + this.overQueryFactor = overQueryFactor; + this.usePruning = usePruning; + } + + @Override + public String toString() { + return String.format( + Locale.ROOT, + "%s[threshold=%f, rerankFloor=%f, overQueryFactor=%d, usePruning=%s]", + getClass().getSimpleName(), + threshold, + rerankFloor, + overQueryFactor, + usePruning); + } + + @Override + public boolean equals(Object obj) { + if (obj == this) { + return true; + } else if (obj instanceof JVectorSearchStrategy other) { + return this.threshold == other.threshold + && this.rerankFloor == other.rerankFloor + && this.overQueryFactor == other.overQueryFactor + && this.usePruning == other.usePruning; + } else return false; + } + + @Override + public int hashCode() { + return Objects.hash(getClass(), threshold, rerankFloor, overQueryFactor, usePruning); + } + + @Override + public void nextVectorsBlock() {} + + public static Builder builder() { + return new Builder(); + } + + /// Builder for defining a [JVectorSearchStrategy]. 
+ public static class Builder { + private float threshold = DEFAULT_QUERY_SIMILARITY_THRESHOLD; + private float rerankFloor = DEFAULT_QUERY_RERANK_FLOOR; + private int overQueryFactor = DEFAULT_OVER_QUERY_FACTOR; + private boolean usePruning = DEFAULT_QUERY_USE_PRUNING; + + private Builder() {} + + public Builder withThreshold(float threshold) { + this.threshold = threshold; + return this; + } + + public Builder withRerankFloor(float rerankFloor) { + this.rerankFloor = rerankFloor; + return this; + } + + public Builder withOverQueryFactor(int overQueryFactor) { + this.overQueryFactor = overQueryFactor; + return this; + } + + public Builder withUsePruning(boolean usePruning) { + this.usePruning = usePruning; + return this; + } + + public JVectorSearchStrategy build() { + return new JVectorSearchStrategy(threshold, rerankFloor, overQueryFactor, usePruning); + } + } +} diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/jvector/TestJVectorFormat.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/jvector/TestJVectorFormat.java index 18ac79e1da3f..0c46a50a8b61 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/jvector/TestJVectorFormat.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/jvector/TestJVectorFormat.java @@ -86,8 +86,8 @@ public void testJVectorKnnIndex_simpleCase() throws IOException { final Query filterQuery = new MatchAllDocsQuery(); final IndexSearcher searcher = newSearcher(reader); - KnnFloatVectorQuery knnFloatVectorQuery = - getJVectorKnnFloatVectorQuery("test_field", target, k, filterQuery); + final KnnFloatVectorQuery knnFloatVectorQuery = + new KnnFloatVectorQuery("test_field", target, k, filterQuery); TopDocs topDocs = searcher.search(knnFloatVectorQuery, k); assertEquals(k, topDocs.totalHits.value()); assertEquals(9, topDocs.scoreDocs[0].doc); @@ -142,8 +142,8 @@ public void testMissing_fields() throws IOException { final Query filterQuery = new MatchAllDocsQuery(); final IndexSearcher searcher = newSearcher(reader); - KnnFloatVectorQuery knnFloatVectorQuery = - getJVectorKnnFloatVectorQuery("test_field", target, k, filterQuery); + final KnnFloatVectorQuery knnFloatVectorQuery = + new KnnFloatVectorQuery("test_field", target, k, filterQuery); TopDocs topDocs = searcher.search(knnFloatVectorQuery, k); assertEquals(k, topDocs.totalHits.value()); assertEquals(0, topDocs.scoreDocs[0].doc); @@ -208,8 +208,8 @@ public void test_sorted_index() throws IOException { final Query filterQuery = new MatchAllDocsQuery(); final IndexSearcher searcher = newSearcher(reader); - KnnFloatVectorQuery knnFloatVectorQuery = - getJVectorKnnFloatVectorQuery("test_field", target, k, filterQuery); + final KnnFloatVectorQuery knnFloatVectorQuery = + new KnnFloatVectorQuery("test_field", target, k, filterQuery); TopDocs topDocs = searcher.search(knnFloatVectorQuery, k); assertEquals(k, topDocs.totalHits.value()); assertEquals(9, topDocs.scoreDocs[0].doc); @@ -286,7 +286,7 @@ public void testJVectorKnnIndex_multipleSegments() throws IOException { Assert.assertEquals(totalNumberOfDocs, reader.numDocs()); final Query filterQuery = new MatchAllDocsQuery(); final IndexSearcher searcher = newSearcher(reader); - KnnFloatVectorQuery knnFloatVectorQuery = + final KnnFloatVectorQuery knnFloatVectorQuery = new KnnFloatVectorQuery("test_field", target, k, filterQuery); TopDocs topDocs = searcher.search(knnFloatVectorQuery, k); assertEquals(k, topDocs.totalHits.value()); @@ -343,8 +343,8 @@ public void testJVectorKnnIndex_mergeEnabled() throws 
IOException { Assert.assertEquals(totalNumberOfDocs, reader.numDocs()); final Query filterQuery = new MatchAllDocsQuery(); final IndexSearcher searcher = newSearcher(reader); - KnnFloatVectorQuery knnFloatVectorQuery = - getJVectorKnnFloatVectorQuery("test_field", target, k, filterQuery); + final KnnFloatVectorQuery knnFloatVectorQuery = + new KnnFloatVectorQuery("test_field", target, k, filterQuery); TopDocs topDocs = searcher.search(knnFloatVectorQuery, k); assertEquals(k, topDocs.totalHits.value()); Document doc = reader.storedFields().document(topDocs.scoreDocs[0].doc); @@ -405,8 +405,8 @@ public void multipleMerges() throws IOException { Assert.assertEquals(totalNumberOfDocs, reader.numDocs()); final Query filterQuery = new MatchAllDocsQuery(); final IndexSearcher searcher = newSearcher(reader); - KnnFloatVectorQuery knnFloatVectorQuery = - getJVectorKnnFloatVectorQuery("test_field", target, k, filterQuery); + final KnnFloatVectorQuery knnFloatVectorQuery = + new KnnFloatVectorQuery("test_field", target, k, filterQuery); TopDocs topDocs = searcher.search(knnFloatVectorQuery, k); assertEquals(k, topDocs.totalHits.value()); Document doc = reader.storedFields().document(topDocs.scoreDocs[0].doc); @@ -479,8 +479,8 @@ public void testJVectorKnnIndex_multiple_merges_large_batches_no_quantization() Assert.assertEquals(totalNumberOfDocs, reader.numDocs()); final Query filterQuery = new MatchAllDocsQuery(); final IndexSearcher searcher = newSearcher(reader); - KnnFloatVectorQuery knnFloatVectorQuery = - getJVectorKnnFloatVectorQuery("test_field", target, k, filterQuery); + final KnnFloatVectorQuery knnFloatVectorQuery = + new KnnFloatVectorQuery("test_field", target, k, filterQuery); TopDocs topDocs = searcher.search(knnFloatVectorQuery, k); assertEquals(k, topDocs.totalHits.value()); @@ -548,8 +548,8 @@ public void testJVectorKnnIndex_multiple_merges_large_batches_no_quantization() final Query filterQuery = new MatchAllDocsQuery(); final IndexSearcher searcher = newSearcher(reader); - KnnFloatVectorQuery knnFloatVectorQuery = - getJVectorKnnFloatVectorQuery(TEST_FIELD, target, k, filterQuery); + final KnnFloatVectorQuery knnFloatVectorQuery = + new KnnFloatVectorQuery(TEST_FIELD, target, k, filterQuery); TopDocs topDocs = searcher.search(knnFloatVectorQuery, k); assertEquals(k, topDocs.totalHits.value()); final float recall = calculateRecall(reader, groundTruthVectorsIds, topDocs, k); @@ -664,8 +664,7 @@ public void testLuceneKnnIndex_multipleMerges_with_ordering_check() for (int i = 0; i < reader.maxDoc(); i++) { float[] query = generateRandomVectors(1, 2)[0]; TopDocs td = - searcher.search( - getJVectorKnnFloatVectorQuery("vec", query, k, new MatchAllDocsQuery()), k); + searcher.search(new KnnFloatVectorQuery("vec", query, k, new MatchAllDocsQuery()), k); assertEquals(k, td.scoreDocs.length); compareSearchResults( @@ -827,7 +826,7 @@ public void deletedDocs() throws IOException { final float[] target = {0.0f, 1.0f * (i + docToDeleteInEachBatch)}; final IndexSearcher searcher = newSearcher(reader); final KnnFloatVectorQuery knnFloatVectorQuery = - getJVectorKnnFloatVectorQuery("test_field", target, k, new MatchAllDocsQuery()); + new KnnFloatVectorQuery("test_field", target, k, new MatchAllDocsQuery()); TopDocs topDocs = searcher.search(knnFloatVectorQuery, k); assertEquals(k, topDocs.totalHits.value()); for (int j = 0; j < k; j++) { @@ -877,8 +876,8 @@ public void testLuceneKnnIndex_mergeEnabled_withCompoundFile() throws IOExceptio Assert.assertEquals(totalNumberOfDocs, reader.numDocs()); 
final Query filterQuery = new MatchAllDocsQuery(); final IndexSearcher searcher = newSearcher(reader); - KnnFloatVectorQuery knnFloatVectorQuery = - getJVectorKnnFloatVectorQuery("test_field", target, k, filterQuery); + final KnnFloatVectorQuery knnFloatVectorQuery = + new KnnFloatVectorQuery("test_field", target, k, filterQuery); TopDocs topDocs = searcher.search(knnFloatVectorQuery, k); assertEquals(k, topDocs.totalHits.value()); assertEquals(9, topDocs.scoreDocs[0].doc); @@ -934,8 +933,8 @@ public void testLuceneKnnIndex_mergeEnabled_withCompoundFile_cosine() throws IOE Assert.assertEquals(totalNumberOfDocs, reader.numDocs()); final Query filterQuery = new MatchAllDocsQuery(); final IndexSearcher searcher = newSearcher(reader); - KnnFloatVectorQuery knnFloatVectorQuery = - getJVectorKnnFloatVectorQuery("test_field", target, k, filterQuery); + final KnnFloatVectorQuery knnFloatVectorQuery = + new KnnFloatVectorQuery("test_field", target, k, filterQuery); TopDocs topDocs = searcher.search(knnFloatVectorQuery, k); assertEquals(k, topDocs.totalHits.value()); assertEquals(0, topDocs.scoreDocs[0].doc); @@ -1007,8 +1006,8 @@ public void testJVectorKnnIndex_withFilter() throws IOException { try (IndexReader reader = DirectoryReader.open(w)) { final Query filterQuery = new TermQuery(new Term("filter_field", "even")); final IndexSearcher searcher = newSearcher(reader); - KnnFloatVectorQuery knnFloatVectorQuery = - getJVectorKnnFloatVectorQuery("test_field", target, k, filterQuery); + final KnnFloatVectorQuery knnFloatVectorQuery = + new KnnFloatVectorQuery("test_field", target, k, filterQuery); TopDocs topDocs = searcher.search(knnFloatVectorQuery, k); assertEquals(k, topDocs.totalHits.value()); @@ -1077,8 +1076,8 @@ public void testJVectorKnnIndex_simpleCase_withQuantization() throws IOException final Query filterQuery = new MatchAllDocsQuery(); final IndexSearcher searcher = newSearcher(reader); - KnnFloatVectorQuery knnFloatVectorQuery = - getJVectorKnnFloatVectorQuery(TEST_FIELD, target, k, filterQuery); + final KnnFloatVectorQuery knnFloatVectorQuery = + new KnnFloatVectorQuery(TEST_FIELD, target, k, filterQuery); TopDocs topDocs = searcher.search(knnFloatVectorQuery, k); assertEquals(k, topDocs.totalHits.value()); final float recall = calculateRecall(reader, groundTruthVectorsIds, topDocs, k); @@ -1134,16 +1133,19 @@ public void testJVectorKnnIndex_simpleCase_withQuantization_rerank() throws IOEx }); // Query with essentially no reranking and expect recall to be very low + JVectorSearchStrategy searchStrategy = + JVectorSearchStrategy.builder().withOverQueryFactor(1).build(); KnnFloatVectorQuery knnFloatVectorQuery = - getJVectorKnnFloatVectorQuery("test_field", target, k, filterQuery, 1); + new KnnFloatVectorQuery("test_field", target, k, filterQuery, searchStrategy); TopDocs topDocs = searcher.search(knnFloatVectorQuery, k); assertEquals(k, topDocs.totalHits.value()); final float recallWithLowOverqueryFactor = calculateRecall(topDocs, expectedMinScoreInTopK); // Query with reranking and expect recall to be high + searchStrategy = JVectorSearchStrategy.builder().withOverQueryFactor(5).build(); knnFloatVectorQuery = - getJVectorKnnFloatVectorQuery("test_field", target, k, filterQuery, 5); + new KnnFloatVectorQuery("test_field", target, k, filterQuery, searchStrategy); topDocs = searcher.search(knnFloatVectorQuery, k); assertEquals(k, topDocs.totalHits.value()); float recallWithHighOverqueryFactor = calculateRecall(topDocs, expectedMinScoreInTopK); @@ -1211,8 +1213,8 @@ public void 
testJVectorKnnIndex_happyCase_withQuantization_multipleSegments() th final Query filterQuery = new MatchAllDocsQuery(); final IndexSearcher searcher = newSearcher(reader); - KnnFloatVectorQuery knnFloatVectorQuery = - getJVectorKnnFloatVectorQuery("test_field", target, k, filterQuery); + final KnnFloatVectorQuery knnFloatVectorQuery = + new KnnFloatVectorQuery("test_field", target, k, filterQuery); TopDocs topDocs = searcher.search(knnFloatVectorQuery, k); assertEquals(k, topDocs.totalHits.value()); final float recall = calculateRecall(reader, groundTruthVectorsIds, topDocs, k); @@ -1284,8 +1286,8 @@ public void testJVectorKnnIndex_mixedBatchSizes_withQuantization_multipleMerges( final Query filterQuery = new MatchAllDocsQuery(); final IndexSearcher searcher = newSearcher(reader); - KnnFloatVectorQuery knnFloatVectorQuery = - getJVectorKnnFloatVectorQuery("test_field", target, k, filterQuery); + final KnnFloatVectorQuery knnFloatVectorQuery = + new KnnFloatVectorQuery("test_field", target, k, filterQuery); TopDocs topDocs = searcher.search(knnFloatVectorQuery, k); assertEquals(k, topDocs.totalHits.value()); final float recall = calculateRecall(reader, groundTruthVectorsIds, topDocs, k); @@ -1362,8 +1364,10 @@ public void testJVectorKnnIndex_mixedBatchSizes_withQuantization_multipleMerges_ final Query filterQuery = new MatchAllDocsQuery(); final IndexSearcher searcher = newSearcher(reader); - KnnFloatVectorQuery knnFloatVectorQuery = - getJVectorKnnFloatVectorQuery("test_field", target, k, filterQuery, 1000); + final JVectorSearchStrategy searchStrategy = + JVectorSearchStrategy.builder().withOverQueryFactor(1000).build(); + final KnnFloatVectorQuery knnFloatVectorQuery = + new KnnFloatVectorQuery("test_field", target, k, filterQuery, searchStrategy); TopDocs topDocs = searcher.search(knnFloatVectorQuery, k); assertEquals(k, topDocs.totalHits.value()); final float recall = calculateRecall(reader, groundTruthVectorsIds, topDocs, k); @@ -1439,8 +1443,10 @@ public void testJVectorKnnIndex_withQuantization_withCompoundFile_with_refinemen final Query filterQuery = new MatchAllDocsQuery(); final IndexSearcher searcher = newSearcher(reader); - KnnFloatVectorQuery knnFloatVectorQuery = - getJVectorKnnFloatVectorQuery("test_field", target, k, filterQuery, 1000); + final JVectorSearchStrategy searchStrategy = + JVectorSearchStrategy.builder().withOverQueryFactor(1000).build(); + final KnnFloatVectorQuery knnFloatVectorQuery = + new KnnFloatVectorQuery("test_field", target, k, filterQuery, searchStrategy); TopDocs topDocs = searcher.search(knnFloatVectorQuery, k); assertEquals(k, topDocs.totalHits.value()); final float recall = calculateRecall(reader, groundTruthVectorsIds, topDocs, k); @@ -1470,25 +1476,6 @@ private float calculateRecall(TopDocs topDocs, float minScoreInTopK) { return recall; } - private JVectorKnnFloatVectorQuery getJVectorKnnFloatVectorQuery( - String fieldName, float[] target, int k, Query filterQuery) { - return getJVectorKnnFloatVectorQuery( - fieldName, target, k, filterQuery, JVectorReader.DEFAULT_OVER_QUERY_FACTOR); - } - - private JVectorKnnFloatVectorQuery getJVectorKnnFloatVectorQuery( - String fieldName, float[] target, int k, Query filterQuery, int overQueryFactor) { - return new JVectorKnnFloatVectorQuery( - fieldName, - target, - k, - filterQuery, - overQueryFactor, - JVectorReader.DEFAULT_QUERY_SIMILARITY_THRESHOLD, - JVectorReader.DEFAULT_QUERY_RERANK_FLOOR, - JVectorReader.DEFAULT_QUERY_USE_PRUNING); - } - private static float[] generateZerosVectorWithLastValue(int 
vectorDimension, int lastValue) { float[] vector = new float[vectorDimension]; for (int i = 0; i < vectorDimension - 1; i++) { From 7435bf9e5cda7c038ef9809cbe4016b4f23e5211 Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Thu, 30 Oct 2025 14:58:02 +0000 Subject: [PATCH 28/86] Use IntUnaryOperator for numberOfSubspacesPerVetorSupplier --- .../lucene/sandbox/codecs/jvector/JVectorFormat.java | 10 +++++----- .../lucene/sandbox/codecs/jvector/JVectorWriter.java | 12 ++++++------ 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFormat.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFormat.java index 75aa58ba0181..ef61fda380a6 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFormat.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFormat.java @@ -20,7 +20,7 @@ import java.io.IOException; import java.util.concurrent.ForkJoinPool; import java.util.concurrent.ForkJoinWorkerThread; -import java.util.function.Function; +import java.util.function.IntUnaryOperator; import org.apache.lucene.codecs.KnnVectorsFormat; import org.apache.lucene.codecs.KnnVectorsReader; import org.apache.lucene.codecs.KnnVectorsWriter; @@ -55,8 +55,8 @@ public class JVectorFormat extends KnnVectorsFormat { private final int maxConn; private final int beamWidth; - private final Function - numberOfSubspacesPerVectorSupplier; // as a function of the original dimension + // As a function of the original dimension + private final IntUnaryOperator numberOfSubspacesPerVectorSupplier; private final int minBatchSizeForQuantization; private final float alpha; private final float neighborOverflow; @@ -91,7 +91,7 @@ public JVectorFormat( int beamWidth, float neighborOverflow, float alpha, - Function numberOfSubspacesPerVectorSupplier, + IntUnaryOperator numberOfSubspacesPerVectorSupplier, int minBatchSizeForQuantization, boolean hierarchyEnabled) { this( @@ -111,7 +111,7 @@ public JVectorFormat( int beamWidth, float neighborOverflow, float alpha, - Function numberOfSubspacesPerVectorSupplier, + IntUnaryOperator numberOfSubspacesPerVectorSupplier, int minBatchSizeForQuantization, boolean hierarchyEnabled) { super(name); diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java index 0bf0ea618370..f7351c1832cf 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java @@ -38,7 +38,7 @@ import java.io.UnsupportedEncodingException; import java.util.*; import java.util.concurrent.ForkJoinPool; -import java.util.function.Function; +import java.util.function.IntUnaryOperator; import java.util.stream.IntStream; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.KnnFieldVectorsWriter; @@ -98,9 +98,8 @@ public class JVectorWriter extends KnnVectorsWriter { private final int beamWidth; private final float degreeOverflow; private final float alpha; - private final Function - numberOfSubspacesPerVectorSupplier; // Number of subspaces used per vector for PQ quantization - // as a function of the original dimension + /// Number of subspaces used per vector in PQ quantization as a function of the original dimension + private final IntUnaryOperator numberOfSubspacesPerVectorSupplier; private final 
int minimumBatchSizeForQuantization; // Threshold for the vector count above which we will trigger // PQ quantization @@ -114,7 +113,7 @@ public JVectorWriter( int beamWidth, float degreeOverflow, float alpha, - Function numberOfSubspacesPerVectorSupplier, + IntUnaryOperator numberOfSubspacesPerVectorSupplier, int minimumBatchSizeForQuantization, boolean hierarchyEnabled) throws IOException { @@ -372,7 +371,8 @@ private PQVectors getPQVectors( throws IOException { final VectorSimilarityFunction vectorSimilarityFunction = fieldInfo.getVectorSimilarityFunction(); - final var M = numberOfSubspacesPerVectorSupplier.apply(randomAccessVectorValues.dimension()); + final int M = + numberOfSubspacesPerVectorSupplier.applyAsInt(randomAccessVectorValues.dimension()); final var numberOfClustersPerSubspace = Math.min(256, randomAccessVectorValues.size()); // number of centroids per // subspace From ddbae421f8b382e6ee2b352e356c504851d543ed Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Thu, 30 Oct 2025 17:00:13 +0000 Subject: [PATCH 29/86] Fix missed call to KnnCollector.incVisitedCount --- .../apache/lucene/sandbox/codecs/jvector/JVectorReader.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java index a2abcda40c3d..1a766e6d2924 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java @@ -195,6 +195,10 @@ public void search(String field, float[] target, KnnCollector knnCollector, Acce for (SearchResult.NodeScore ns : searchResults.getNodes()) { knnCollector.collect(jvectorLuceneDocMap.getLuceneDocId(ns.node), ns.score); } + // JVector does not seem to count the entry-point as visited + if (index.size(index.getMaxLevel()) > 0) { + knnCollector.incVisitedCount(1 + searchResults.getVisitedCount()); + } } } } From e5b7619a95492ed13e8ef7e053956e6f0a192e0e Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Thu, 30 Oct 2025 19:01:36 +0000 Subject: [PATCH 30/86] Skip search altogether when graph is empty --- .../lucene/sandbox/codecs/jvector/JVectorReader.java | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java index 1a766e6d2924..9f0354c1ca27 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java @@ -155,6 +155,10 @@ public void search(String field, float[] target, KnnCollector knnCollector, Acce final SearchScoreProvider ssp; try (var view = index.getView()) { + if (view.entryNode() == null) { + // Skip search when the graph is empty + return; + } if (fieldEntryMap.get(field).pqVectors != null) { // Quantized, use the precomputed score function final PQVectors pqVectors = fieldEntryMap.get(field).pqVectors; @@ -196,9 +200,7 @@ public void search(String field, float[] target, KnnCollector knnCollector, Acce knnCollector.collect(jvectorLuceneDocMap.getLuceneDocId(ns.node), ns.score); } // JVector does not seem to count the entry-point as visited - if (index.size(index.getMaxLevel()) > 0) { - knnCollector.incVisitedCount(1 + searchResults.getVisitedCount()); - } + 
knnCollector.incVisitedCount(1 + searchResults.getVisitedCount()); } } } From 41856a6c86f2dccb53d5a87b256cf60a5add033a Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Thu, 30 Oct 2025 20:05:01 +0000 Subject: [PATCH 31/86] Fix multiple fieldEntry lookups --- .../sandbox/codecs/jvector/JVectorReader.java | 20 ++++++++----------- 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java index 9f0354c1ca27..45dad566f626 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java @@ -23,7 +23,6 @@ import io.github.jbellis.jvector.graph.SearchResult; import io.github.jbellis.jvector.graph.disk.OnDiskGraphIndex; import io.github.jbellis.jvector.graph.similarity.DefaultSearchScoreProvider; -import io.github.jbellis.jvector.graph.similarity.ScoreFunction; import io.github.jbellis.jvector.graph.similarity.SearchScoreProvider; import io.github.jbellis.jvector.quantization.PQVectors; import io.github.jbellis.jvector.quantization.ProductQuantization; @@ -140,7 +139,8 @@ public OnDiskGraphIndex getOnDiskGraphIndex(String field) throws IOException { @Override public void search(String field, float[] target, KnnCollector knnCollector, AcceptDocs acceptDocs) throws IOException { - final OnDiskGraphIndex index = fieldEntryMap.get(field).index; + final var fieldEntry = fieldEntryMap.get(field); + final OnDiskGraphIndex index = fieldEntry.index; final JVectorSearchStrategy searchStrategy; if (knnCollector.getSearchStrategy() instanceof JVectorSearchStrategy strategy) { @@ -159,21 +159,17 @@ public void search(String field, float[] target, KnnCollector knnCollector, Acce // Skip search when the graph is empty return; } - if (fieldEntryMap.get(field).pqVectors - != null) { // Quantized, use the precomputed score function - final PQVectors pqVectors = fieldEntryMap.get(field).pqVectors; + if (fieldEntry.pqVectors != null) { // Quantized, use the precomputed score function + final PQVectors pqVectors = fieldEntry.pqVectors; // SearchScoreProvider that does a first pass with the loaded-in-memory PQVectors, // then reranks with the exact vectors that are stored on disk in the index - ScoreFunction.ApproximateScoreFunction asf = - pqVectors.precomputedScoreFunctionFor(q, fieldEntryMap.get(field).similarityFunction); - ScoreFunction.ExactScoreFunction reranker = - view.rerankerFor(q, fieldEntryMap.get(field).similarityFunction); + final var asf = pqVectors.precomputedScoreFunctionFor(q, fieldEntry.similarityFunction); + final var reranker = view.rerankerFor(q, fieldEntry.similarityFunction); ssp = new DefaultSearchScoreProvider(asf, reranker); } else { // Not quantized, used typical searcher - ssp = - DefaultSearchScoreProvider.exact(q, fieldEntryMap.get(field).similarityFunction, view); + ssp = DefaultSearchScoreProvider.exact(q, fieldEntry.similarityFunction, view); } - final GraphNodeIdToDocMap jvectorLuceneDocMap = fieldEntryMap.get(field).graphNodeIdToDocMap; + final GraphNodeIdToDocMap jvectorLuceneDocMap = fieldEntry.graphNodeIdToDocMap; // Convert the acceptDocs bitmap from Lucene to jVector ordinal bitmap filter // Logic works as follows: if acceptDocs is null, we accept all ordinals. 
Otherwise, we check // if the jVector ordinal has a From d5b6d4fff579f28667eaf1feddfada9e48eeaf55 Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Thu, 30 Oct 2025 20:55:09 +0000 Subject: [PATCH 32/86] Do not write empty graph --- .../sandbox/codecs/jvector/JVectorReader.java | 73 ++++++++++++++----- .../sandbox/codecs/jvector/JVectorWriter.java | 23 +++++- 2 files changed, 76 insertions(+), 20 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java index 45dad566f626..24ebdbc2dcf7 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java @@ -42,6 +42,7 @@ import org.apache.lucene.index.*; import org.apache.lucene.search.AcceptDocs; import org.apache.lucene.search.KnnCollector; +import org.apache.lucene.search.VectorScorer; import org.apache.lucene.search.knn.KnnSearchStrategy; import org.apache.lucene.store.*; import org.apache.lucene.util.IOUtils; @@ -107,6 +108,35 @@ public void checkIntegrity() throws IOException { @Override public FloatVectorValues getFloatVectorValues(String field) throws IOException { final FieldEntry fieldEntry = fieldEntryMap.get(field); + if (fieldEntry == null || fieldEntry.index == null) { + return new FloatVectorValues() { + @Override + public float[] vectorValue(int ord) throws IOException { + throw new IndexOutOfBoundsException(); + } + + @Override + public FloatVectorValues copy() throws IOException { + return this; + } + + @Override + public int dimension() { + return fieldEntry.vectorDimension; + } + + @Override + public int size() { + return 0; + } + + @Override + public VectorScorer scorer(float[] target) throws IOException { + return null; + } + }; + } + return new JVectorFloatVectorValues( fieldEntry.index, fieldEntry.similarityFunction, fieldEntry.graphNodeIdToDocMap); } @@ -132,8 +162,9 @@ public RandomAccessReader getNeighborsScoreCacheForField(String field) throws IO return fieldEntry.neighborsScoreCacheIndexReaderSupplier.get(); } - public OnDiskGraphIndex getOnDiskGraphIndex(String field) throws IOException { - return fieldEntryMap.get(field).index; + public boolean hasIndex(String field) { + final var fieldEntry = fieldEntryMap.get(field); + return fieldEntry != null && fieldEntry.index != null; } @Override @@ -141,6 +172,10 @@ public void search(String field, float[] target, KnnCollector knnCollector, Acce throws IOException { final var fieldEntry = fieldEntryMap.get(field); final OnDiskGraphIndex index = fieldEntry.index; + if (index == null) { + // Skip search when the graph is empty + return; + } final JVectorSearchStrategy searchStrategy; if (knnCollector.getSearchStrategy() instanceof JVectorSearchStrategy strategy) { @@ -155,10 +190,6 @@ public void search(String field, float[] target, KnnCollector knnCollector, Acce final SearchScoreProvider ssp; try (var view = index.getView()) { - if (view.entryNode() == null) { - // Skip search when the graph is empty - return; - } if (fieldEntry.pqVectors != null) { // Quantized, use the precomputed score function final PQVectors pqVectors = fieldEntry.pqVectors; // SearchScoreProvider that does a first pass with the loaded-in-memory PQVectors, @@ -228,6 +259,7 @@ private void readFields(ChecksumIndexInput meta) throws IOException { class FieldEntry implements Closeable { private final VectorSimilarityFunction similarityFunction; + 
private final int vectorDimension; private final long vectorIndexOffset; private final long vectorIndexLength; private final long pqCodebooksAndVectorsLength; @@ -247,6 +279,7 @@ public FieldEntry( this.similarityFunction = VectorSimilarityMapper.ordToDistFunc( vectorIndexFieldMetadata.vectorSimilarityFunction.ordinal()); + this.vectorDimension = vectorIndexFieldMetadata.vectorDimension; this.vectorIndexOffset = vectorIndexFieldMetadata.vectorIndexOffset; this.vectorIndexLength = vectorIndexFieldMetadata.vectorIndexLength; this.pqCodebooksAndVectorsLength = vectorIndexFieldMetadata.pqCodebooksAndVectorsLength; @@ -262,18 +295,22 @@ public FieldEntry( + "." + JVectorFormat.NEIGHBORS_SCORE_CACHE_EXTENSION; - // For the slice we would like to include the Lucene header, unfortunately, we have to do this - // because jVector use global - // offsets instead of local offsets - final long sliceLength = - vectorIndexLength - + CodecUtil.indexHeaderLength( - JVectorFormat.VECTOR_INDEX_CODEC_NAME, state.segmentSuffix); - // Load the graph index - this.indexReaderSupplier = - new JVectorRandomAccessReader.Supplier( - directory.openInput(vectorIndexFieldDataFileName, state.context), 0, sliceLength); - this.index = OnDiskGraphIndex.load(indexReaderSupplier, vectorIndexOffset); + if (vectorIndexLength != 0) { + // For the slice we would like to include the Lucene header, unfortunately, we have to do + // this because jVector use global offsets instead of local offsets + final long sliceLength = + vectorIndexLength + + CodecUtil.indexHeaderLength( + JVectorFormat.VECTOR_INDEX_CODEC_NAME, state.segmentSuffix); + // Load the graph index + this.indexReaderSupplier = + new JVectorRandomAccessReader.Supplier( + directory.openInput(vectorIndexFieldDataFileName, state.context), 0, sliceLength); + this.index = OnDiskGraphIndex.load(indexReaderSupplier, vectorIndexOffset); + } else { + this.indexReaderSupplier = null; + this.index = null; + } // If quantized load the compressed product quantized vectors with their codebooks if (pqCodebooksAndVectorsLength > 0) { diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java index f7351c1832cf..a384937248c1 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java @@ -285,7 +285,9 @@ private void writeField( JVectorFormat.VERSION_CURRENT, segmentWriteState.segmentInfo.getId(), segmentWriteState.segmentSuffix); - graph.save(jVectorIndexWriter); + if (graph.entryNode() != null) { + graph.save(jVectorIndexWriter); + } CodecUtil.writeFooter(indexOutput); } } @@ -323,6 +325,21 @@ private VectorIndexFieldMetadata writeGraph( segmentWriteState.segmentInfo.getId(), segmentWriteState.segmentSuffix); final long startOffset = indexOutput.getFilePointer(); + if (graph.size() == 0) { + CodecUtil.writeFooter(indexOutput); + return new VectorIndexFieldMetadata( + fieldInfo.number, + fieldInfo.getVectorEncoding(), + fieldInfo.getVectorSimilarityFunction(), + randomAccessVectorValues.dimension(), + 0, + 0, + 0, + 0, + degreeOverflow, + graphNodeIdToDocMap + ); + } try (var writer = new OnDiskSequentialGraphIndexWriter.Builder(graph, jVectorIndexWriter) .with(new InlineVectors(randomAccessVectorValues.dimension())) @@ -896,7 +913,9 @@ public void merge() throws IOException { this, graphNodeIdsToRavvOrds, 
getVectorSimilarityFunction(fieldInfo)); // graph = getGraph(buildScoreProvider, this, newToOldOrds, fieldInfo, // segmentWriteState.segmentInfo.name); - if (!deletesFound) { + if (!deletesFound + && leadingReader instanceof JVectorReader reader + && reader.hasIndex(fieldName)) { // Expand graph when there are no deletes and no PQ codebooks final RandomAccessReader leadingOnHeapGraphReader = leadingReader.getNeighborsScoreCacheForField(fieldName); From 7726aa0455331fcca7065bad3eb73122837dd886 Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Thu, 30 Oct 2025 21:59:38 +0000 Subject: [PATCH 33/86] Fix merging other formats --- .../sandbox/codecs/jvector/JVectorWriter.java | 81 ++++++++++--------- 1 file changed, 43 insertions(+), 38 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java index a384937248c1..0d714ae39c46 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java @@ -35,6 +35,7 @@ import io.github.jbellis.jvector.vector.types.VectorFloat; import io.github.jbellis.jvector.vector.types.VectorTypeSupport; import java.io.IOException; +import java.io.UncheckedIOException; import java.io.UnsupportedEncodingException; import java.util.*; import java.util.concurrent.ForkJoinPool; @@ -84,6 +85,8 @@ * with {@link GraphNodeIdToDocMap#update(Sorter.DocMap)} during flushes. */ public class JVectorWriter extends KnnVectorsWriter { + private static final VectorTypeSupport VECTOR_TYPE_SUPPORT = + VectorizationProvider.getInstance().getVectorTypeSupport(); private static final long SHALLOW_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(JVectorWriter.class); @@ -328,17 +331,16 @@ private VectorIndexFieldMetadata writeGraph( if (graph.size() == 0) { CodecUtil.writeFooter(indexOutput); return new VectorIndexFieldMetadata( - fieldInfo.number, - fieldInfo.getVectorEncoding(), - fieldInfo.getVectorSimilarityFunction(), - randomAccessVectorValues.dimension(), - 0, - 0, - 0, - 0, - degreeOverflow, - graphNodeIdToDocMap - ); + fieldInfo.number, + fieldInfo.getVectorEncoding(), + fieldInfo.getVectorSimilarityFunction(), + randomAccessVectorValues.dimension(), + 0, + 0, + 0, + 0, + degreeOverflow, + graphNodeIdToDocMap); } try (var writer = new OnDiskSequentialGraphIndexWriter.Builder(graph, jVectorIndexWriter) @@ -516,8 +518,6 @@ public long ramBytesUsed() { * support specific implementations, such as float[] or byte[] vectors. */ static class FieldWriter extends KnnFieldVectorsWriter { - private static final VectorTypeSupport VECTOR_TYPE_SUPPORT = - VectorizationProvider.getInstance().getVectorTypeSupport(); private final long SHALLOW_SIZE = RamUsageEstimator.shallowSizeOfInstance(FieldWriter.class); private final FieldInfo fieldInfo; private int lastDocID = -1; @@ -605,7 +605,7 @@ class RandomAccessMergedFloatVectorValues implements RandomAccessVectorValues { // Array of sub-readers private final KnnVectorsReader[] readers; - private final JVectorFloatVectorValues[] perReaderFloatVectorValues; + private final FloatVectorValues[] perReaderFloatVectorValues; // Maps the ravv ordinals to the reader index and the ordinal in that reader. 
This is allowing // us to get a unified view of all the @@ -662,7 +662,7 @@ public RandomAccessMergedFloatVectorValues(FieldInfo fieldInfo, MergeState merge FieldInfos fieldInfos = mergeState.fieldInfos[i]; baseOrds[i] = totalVectorsCount; if (MergedVectorValues.hasVectorValues(fieldInfos, fieldName)) { - KnnVectorsReader reader = mergeState.knnVectorsReaders[i]; + KnnVectorsReader reader = mergeState.knnVectorsReaders[i].unwrapReaderForField(fieldName); if (reader != null) { FloatVectorValues values = reader.getFloatVectorValues(fieldName); if (values != null) { @@ -678,7 +678,8 @@ public RandomAccessMergedFloatVectorValues(FieldInfo fieldInfo, MergeState merge deletesFound = true; } } - if (liveVectorCountInReader >= vectorsCountInLeadingReader) { + if (reader instanceof JVectorReader + && liveVectorCountInReader >= vectorsCountInLeadingReader) { vectorsCountInLeadingReader = liveVectorCountInReader; tempLeadingReaderIdx = i; } @@ -706,7 +707,7 @@ public RandomAccessMergedFloatVectorValues(FieldInfo fieldInfo, MergeState merge // For this part we need to make sure we also swap all the other metadata arrays that are // indexed by reader index // Such as readers, docMaps, liveDocs, baseOrds, deletedOrds - if (tempLeadingReaderIdx != 0) { + if (tempLeadingReaderIdx > 0) { final KnnVectorsReader temp = readers[LEADING_READER_IDX]; readers[LEADING_READER_IDX] = readers[tempLeadingReaderIdx]; readers[tempLeadingReaderIdx] = temp; @@ -720,7 +721,7 @@ public RandomAccessMergedFloatVectorValues(FieldInfo fieldInfo, MergeState merge baseOrds[tempLeadingReaderIdx] = tempBaseOrd; } - this.perReaderFloatVectorValues = new JVectorFloatVectorValues[readers.length]; + this.perReaderFloatVectorValues = new FloatVectorValues[readers.length]; this.dimension = dimension; // Build mapping from global ordinal to [readerIndex, readerOrd] @@ -743,8 +744,7 @@ public RandomAccessMergedFloatVectorValues(FieldInfo fieldInfo, MergeState merge // TODO: remove this logic once we support incremental graph building with deletes see // https://github.com/opensearch-project/opensearch-jvector/issues/171 for (int readerIdx = 0; readerIdx < readers.length; readerIdx++) { - final JVectorFloatVectorValues values = - (JVectorFloatVectorValues) readers[readerIdx].getFloatVectorValues(fieldName); + final FloatVectorValues values = readers[readerIdx].getFloatVectorValues(fieldName); perReaderFloatVectorValues[readerIdx] = values; // For each vector in this reader KnnVectorValues.DocIndexIterator it = values.iterator(); @@ -778,8 +778,8 @@ public RandomAccessMergedFloatVectorValues(FieldInfo fieldInfo, MergeState merge // This is necessary because we are later going to expand that graph with new vectors from // the other readers. 
// The leading reader is ALWAYS the first one in the readers array - final JVectorFloatVectorValues leadingReaderValues = - (JVectorFloatVectorValues) readers[LEADING_READER_IDX].getFloatVectorValues(fieldName); + final FloatVectorValues leadingReaderValues = + readers[LEADING_READER_IDX].getFloatVectorValues(fieldName); perReaderFloatVectorValues[LEADING_READER_IDX] = leadingReaderValues; var leadingReaderIt = leadingReaderValues.iterator(); for (int docId = leadingReaderIt.nextDoc(); @@ -802,8 +802,7 @@ public RandomAccessMergedFloatVectorValues(FieldInfo fieldInfo, MergeState merge // For the remaining readers we map the graph node id to the ravv ordinal in the order they // appear for (int readerIdx = 1; readerIdx < readers.length; readerIdx++) { - final JVectorFloatVectorValues values = - (JVectorFloatVectorValues) readers[readerIdx].getFloatVectorValues(fieldName); + final FloatVectorValues values = readers[readerIdx].getFloatVectorValues(fieldName); perReaderFloatVectorValues[readerIdx] = values; // For each vector in this reader KnnVectorValues.DocIndexIterator it = values.iterator(); @@ -873,21 +872,14 @@ public void merge() throws IOException { final PQVectors pqVectors; final OnHeapGraphIndex graph; // Get the leading reader - final JVectorReader leadingReader = - (JVectorReader) readers[LEADING_READER_IDX].unwrapReaderForField(fieldName); + final var leadingReader = readers[LEADING_READER_IDX].unwrapReaderForField(fieldName); final BuildScoreProvider buildScoreProvider; // Check if the leading reader has pre-existing PQ codebooks and if so, refine them with the // remaining vectors - if (leadingReader.getProductQuantizationForField(fieldInfo.name).isEmpty()) { - // No pre-existing codebooks, check if we have enough vectors to trigger quantization - if (this.size() >= minimumBatchSizeForQuantization) { - pqVectors = getPQVectors(graphNodeIdsToRavvOrds, this, fieldInfo); - } else { - pqVectors = null; - } - } else { - ProductQuantization leadingCompressor = - leadingReader.getProductQuantizationForField(fieldName).get(); + if (leadingReader instanceof JVectorReader reader + && reader.getProductQuantizationForField(fieldName).isPresent()) { + final ProductQuantization leadingCompressor = + reader.getProductQuantizationForField(fieldName).get(); // Refine the leadingCompressor with the remaining vectors in the merge, we skip the leading // reader since it's already been // used to create the leadingCompressor @@ -905,6 +897,11 @@ public void merge() throws IOException { graphNodeIdsToRavvOrds, this, SIMD_POOL_MERGE); + } else if (this.size() >= minimumBatchSizeForQuantization) { + // No pre-existing codebooks, check if we have enough vectors to trigger quantization + pqVectors = getPQVectors(graphNodeIdsToRavvOrds, this, fieldInfo); + } else { + pqVectors = null; } if (pqVectors == null) { @@ -918,7 +915,7 @@ public void merge() throws IOException { && reader.hasIndex(fieldName)) { // Expand graph when there are no deletes and no PQ codebooks final RandomAccessReader leadingOnHeapGraphReader = - leadingReader.getNeighborsScoreCacheForField(fieldName); + reader.getNeighborsScoreCacheForField(fieldName); final int numBaseVectors = leadingReader.getFloatVectorValues(fieldName).size(); graph = (OnHeapGraphIndex) @@ -985,7 +982,15 @@ public VectorFloat getVector(int ord) { // Access to float values is not thread safe synchronized (perReaderFloatVectorValues[readerIdx]) { - return perReaderFloatVectorValues[readerIdx].vectorFloatValue(readerOrd); + if 
(perReaderFloatVectorValues[readerIdx] instanceof JVectorFloatVectorValues values) { + return values.vectorFloatValue(readerOrd); + } + try { + return VECTOR_TYPE_SUPPORT.createFloatVector( + perReaderFloatVectorValues[readerIdx].vectorValue(readerOrd)); + } catch (IOException e) { + throw new UncheckedIOException(e); + } } } From 6cb01839b8424027d1b9b1aa3781af8390772f08 Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Sat, 1 Nov 2025 17:17:09 +0000 Subject: [PATCH 34/86] Remove incremental graph build --- .../sandbox/codecs/jvector/JVectorFormat.java | 4 - .../sandbox/codecs/jvector/JVectorReader.java | 31 --- .../sandbox/codecs/jvector/JVectorWriter.java | 196 +++--------------- 3 files changed, 33 insertions(+), 198 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFormat.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFormat.java index ef61fda380a6..af727760c375 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFormat.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFormat.java @@ -32,13 +32,9 @@ public class JVectorFormat extends KnnVectorsFormat { public static final String NAME = "JVectorFormat"; public static final String META_CODEC_NAME = "JVectorVectorsFormatMeta"; public static final String VECTOR_INDEX_CODEC_NAME = "JVectorVectorsFormatIndex"; - public static final String NEIGHBORS_SCORE_CACHE_CODEC_NAME = - "JVectorVectorsFormatNeighborsScoreCache"; public static final String JVECTOR_FILES_SUFFIX = "jvector"; public static final String META_EXTENSION = "meta-" + JVECTOR_FILES_SUFFIX; public static final String VECTOR_INDEX_EXTENSION = "data-" + JVECTOR_FILES_SUFFIX; - public static final String NEIGHBORS_SCORE_CACHE_EXTENSION = - "neighbors-score-cache-" + JVECTOR_FILES_SUFFIX; public static final int VERSION_START = 0; public static final int VERSION_CURRENT = VERSION_START; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java index 24ebdbc2dcf7..a7ca23e98132 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java @@ -17,7 +17,6 @@ package org.apache.lucene.sandbox.codecs.jvector; -import io.github.jbellis.jvector.disk.RandomAccessReader; import io.github.jbellis.jvector.disk.ReaderSupplier; import io.github.jbellis.jvector.graph.GraphSearcher; import io.github.jbellis.jvector.graph.SearchResult; @@ -95,13 +94,6 @@ public void checkIntegrity() throws IOException { state.directory.openInput(fieldEntry.vectorIndexFieldDataFileName, IOContext.READONCE)) { CodecUtil.checksumEntireFile(indexInput); } - - // Verify the neighbors score cache file - try (var indexInput = - state.directory.openInput( - fieldEntry.neighborsScoreCacheIndexFieldFileName, IOContext.READONCE)) { - CodecUtil.checksumEntireFile(indexInput); - } } } @@ -157,11 +149,6 @@ public Optional getProductQuantizationForField(String field return Optional.of(fieldEntry.pqVectors.getCompressor()); } - public RandomAccessReader getNeighborsScoreCacheForField(String field) throws IOException { - final FieldEntry fieldEntry = fieldEntryMap.get(field); - return fieldEntry.neighborsScoreCacheIndexReaderSupplier.get(); - } - public boolean hasIndex(String field) { final var fieldEntry = fieldEntryMap.get(field); return 
fieldEntry != null && fieldEntry.index != null; @@ -265,11 +252,9 @@ class FieldEntry implements Closeable { private final long pqCodebooksAndVectorsLength; private final long pqCodebooksAndVectorsOffset; private final String vectorIndexFieldDataFileName; - private final String neighborsScoreCacheIndexFieldFileName; private final GraphNodeIdToDocMap graphNodeIdToDocMap; private final ReaderSupplier indexReaderSupplier; private final ReaderSupplier pqCodebooksReaderSupplier; - private final ReaderSupplier neighborsScoreCacheIndexReaderSupplier; private final OnDiskGraphIndex index; private final PQVectors pqVectors; // The product quantized vectors with their codebooks @@ -288,12 +273,6 @@ public FieldEntry( this.vectorIndexFieldDataFileName = baseDataFileName + "_" + fieldInfo.name + "." + JVectorFormat.VECTOR_INDEX_EXTENSION; - this.neighborsScoreCacheIndexFieldFileName = - baseDataFileName - + "_" - + fieldInfo.name - + "." - + JVectorFormat.NEIGHBORS_SCORE_CACHE_EXTENSION; if (vectorIndexLength != 0) { // For the slice we would like to include the Lucene header, unfortunately, we have to do @@ -331,13 +310,6 @@ public FieldEntry( this.pqCodebooksReaderSupplier = null; this.pqVectors = null; } - - final IndexInput indexInput = - directory.openInput(neighborsScoreCacheIndexFieldFileName, state.context); - CodecUtil.readIndexHeader(indexInput); - - this.neighborsScoreCacheIndexReaderSupplier = - new JVectorRandomAccessReader.Supplier(indexInput); } @Override @@ -348,9 +320,6 @@ public void close() throws IOException { if (pqCodebooksReaderSupplier != null) { IOUtils.close(pqCodebooksReaderSupplier::close); } - if (neighborsScoreCacheIndexReaderSupplier != null) { - IOUtils.close(neighborsScoreCacheIndexReaderSupplier::close); - } } } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java index 0d714ae39c46..bcc3b66d9ae0 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java @@ -22,7 +22,6 @@ import static org.apache.lucene.sandbox.codecs.jvector.JVectorFormat.SIMD_POOL_FLUSH; import static org.apache.lucene.sandbox.codecs.jvector.JVectorFormat.SIMD_POOL_MERGE; -import io.github.jbellis.jvector.disk.RandomAccessReader; import io.github.jbellis.jvector.graph.*; import io.github.jbellis.jvector.graph.disk.*; import io.github.jbellis.jvector.graph.disk.feature.Feature; @@ -270,29 +269,6 @@ private void writeField( graphNodeIdToDocMap); meta.writeInt(fieldInfo.number); vectorIndexFieldMetadata.toOutput(meta); - - // field data file, which contains the graph - final String neighborsScoreCacheIndexFieldFileName = - baseDataFileName - + "_" - + fieldInfo.name - + "." 
- + JVectorFormat.NEIGHBORS_SCORE_CACHE_EXTENSION; - try (IndexOutput indexOutput = - segmentWriteState.directory.createOutput( - neighborsScoreCacheIndexFieldFileName, segmentWriteState.context); - final var jVectorIndexWriter = new JVectorIndexWriter(indexOutput)) { - CodecUtil.writeIndexHeader( - indexOutput, - JVectorFormat.NEIGHBORS_SCORE_CACHE_CODEC_NAME, - JVectorFormat.VERSION_CURRENT, - segmentWriteState.segmentInfo.getId(), - segmentWriteState.segmentSuffix); - if (graph.entryNode() != null) { - graph.save(jVectorIndexWriter); - } - CodecUtil.writeFooter(indexOutput); - } } /** @@ -622,7 +598,6 @@ class RandomAccessMergedFloatVectorValues implements RandomAccessVectorValues { private final FieldInfo fieldInfo; private final GraphNodeIdToDocMap graphNodeIdToDocMap; private final int[] graphNodeIdsToRavvOrds; - private boolean deletesFound = false; /** * Creates a random access view over merged float vector values. @@ -649,12 +624,6 @@ public RandomAccessMergedFloatVectorValues(FieldInfo fieldInfo, MergeState merge final MergeState.DocMap[] docMaps = mergeState.docMaps.clone(); final Bits[] liveDocs = mergeState.liveDocs.clone(); final int[] baseOrds = new int[mergeState.knnVectorsReaders.length]; - final int[] deletedOrds = - new int - [mergeState - .knnVectorsReaders - .length]; // counts the number of deleted documents in each reader - // that previously had a vector // Find the leading reader, count the total number of live vectors, and the base ordinals for // each reader @@ -673,9 +642,6 @@ public RandomAccessMergedFloatVectorValues(FieldInfo fieldInfo, MergeState merge while (it.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { if (liveDocs[i] == null || liveDocs[i].get(it.docID())) { liveVectorCountInReader++; - } else { - deletedOrds[i]++; - deletesFound = true; } } if (reader instanceof JVectorReader @@ -738,95 +704,32 @@ public RandomAccessMergedFloatVectorValues(FieldInfo fieldInfo, MergeState merge this.graphNodeIdsToRavvOrds = new int[totalLiveVectorsCount]; int graphNodeId = 0; - if (deletesFound) { - // If there are deletes, we need to build a new graph from scratch and compact the graph - // node ids - // TODO: remove this logic once we support incremental graph building with deletes see - // https://github.com/opensearch-project/opensearch-jvector/issues/171 - for (int readerIdx = 0; readerIdx < readers.length; readerIdx++) { - final FloatVectorValues values = readers[readerIdx].getFloatVectorValues(fieldName); - perReaderFloatVectorValues[readerIdx] = values; - // For each vector in this reader - KnnVectorValues.DocIndexIterator it = values.iterator(); - - for (int docId = it.nextDoc(); - docId != DocIdSetIterator.NO_MORE_DOCS; - docId = it.nextDoc()) { - if (docMaps[readerIdx].get(docId) != -1) { - // Mapping from ravv ordinals to [readerIndex, readerOrd] - // Map graph node id to ravv ordinal - // Map graph node id to doc id - final int newGlobalDocId = docMaps[readerIdx].get(docId); - final int ravvLocalOrd = it.index(); - final int ravvGlobalOrd = ravvLocalOrd + baseOrds[readerIdx]; - graphNodeIdToDocIds[graphNodeId] = newGlobalDocId; - graphNodeIdsToRavvOrds[graphNodeId] = ravvGlobalOrd; - graphNodeId++; - ravvOrdToReaderMapping[ravvGlobalOrd][READER_ID] = readerIdx; // Reader index - ravvOrdToReaderMapping[ravvGlobalOrd][READER_ORD] = ravvLocalOrd; // Ordinal in reader - } - - documentsIterated++; - } - } - } else { - // If there are no deletes, we can reuse the existing graph and simply remap the ravv - // ordinals to the new global doc ids - // for the 
leading reader we must preserve the original node Ids and map them to the - // corresponding ravv vectors originally - // used to build the graph - // This is necessary because we are later going to expand that graph with new vectors from - // the other readers. - // The leading reader is ALWAYS the first one in the readers array - final FloatVectorValues leadingReaderValues = - readers[LEADING_READER_IDX].getFloatVectorValues(fieldName); - perReaderFloatVectorValues[LEADING_READER_IDX] = leadingReaderValues; - var leadingReaderIt = leadingReaderValues.iterator(); - for (int docId = leadingReaderIt.nextDoc(); + // Build a new graph from scratch and compact the graph node ids + for (int readerIdx = 0; readerIdx < readers.length; readerIdx++) { + final FloatVectorValues values = readers[readerIdx].getFloatVectorValues(fieldName); + perReaderFloatVectorValues[readerIdx] = values; + // For each vector in this reader + KnnVectorValues.DocIndexIterator it = values.iterator(); + + for (int docId = it.nextDoc(); docId != DocIdSetIterator.NO_MORE_DOCS; - docId = leadingReaderIt.nextDoc()) { - final int newGlobalDocId = docMaps[LEADING_READER_IDX].get(docId); - if (newGlobalDocId != -1) { - final int ravvLocalOrd = leadingReaderIt.index(); - final int ravvGlobalOrd = ravvLocalOrd + baseOrds[LEADING_READER_IDX]; - graphNodeIdToDocIds[ravvLocalOrd] = newGlobalDocId; - graphNodeIdsToRavvOrds[ravvLocalOrd] = ravvGlobalOrd; + docId = it.nextDoc()) { + if (docMaps[readerIdx].get(docId) != -1) { + // Mapping from ravv ordinals to [readerIndex, readerOrd] + // Map graph node id to ravv ordinal + // Map graph node id to doc id + final int newGlobalDocId = docMaps[readerIdx].get(docId); + final int ravvLocalOrd = it.index(); + final int ravvGlobalOrd = ravvLocalOrd + baseOrds[readerIdx]; + graphNodeIdToDocIds[graphNodeId] = newGlobalDocId; + graphNodeIdsToRavvOrds[graphNodeId] = ravvGlobalOrd; graphNodeId++; - ravvOrdToReaderMapping[ravvGlobalOrd][READER_ID] = LEADING_READER_IDX; // Reader index + ravvOrdToReaderMapping[ravvGlobalOrd][READER_ID] = readerIdx; // Reader index ravvOrdToReaderMapping[ravvGlobalOrd][READER_ORD] = ravvLocalOrd; // Ordinal in reader } documentsIterated++; } - - // For the remaining readers we map the graph node id to the ravv ordinal in the order they - // appear - for (int readerIdx = 1; readerIdx < readers.length; readerIdx++) { - final FloatVectorValues values = readers[readerIdx].getFloatVectorValues(fieldName); - perReaderFloatVectorValues[readerIdx] = values; - // For each vector in this reader - KnnVectorValues.DocIndexIterator it = values.iterator(); - - for (int docId = it.nextDoc(); - docId != DocIdSetIterator.NO_MORE_DOCS; - docId = it.nextDoc()) { - if (docMaps[readerIdx].get(docId) != -1) { - // Mapping from ravv ordinals to [readerIndex, readerOrd] - // Map graph node id to ravv ordinal - // Map graph node id to doc id - final int newGlobalDocId = docMaps[readerIdx].get(docId); - final int ravvLocalOrd = it.index(); - final int ravvGlobalOrd = ravvLocalOrd + baseOrds[readerIdx]; - graphNodeIdToDocIds[graphNodeId] = newGlobalDocId; - graphNodeIdsToRavvOrds[graphNodeId] = ravvGlobalOrd; - graphNodeId++; - ravvOrdToReaderMapping[ravvGlobalOrd][READER_ID] = readerIdx; // Reader index - ravvOrdToReaderMapping[ravvGlobalOrd][READER_ORD] = ravvLocalOrd; // Ordinal in reader - } - - documentsIterated++; - } - } } if (documentsIterated < totalVectorsCount) { @@ -870,10 +773,8 @@ public void merge() throws IOException { // Get PQ compressor for leading reader final String 
fieldName = fieldInfo.name; final PQVectors pqVectors; - final OnHeapGraphIndex graph; // Get the leading reader final var leadingReader = readers[LEADING_READER_IDX].unwrapReaderForField(fieldName); - final BuildScoreProvider buildScoreProvider; // Check if the leading reader has pre-existing PQ codebooks and if so, refine them with the // remaining vectors if (leadingReader instanceof JVectorReader reader @@ -904,43 +805,8 @@ public void merge() throws IOException { pqVectors = null; } - if (pqVectors == null) { - buildScoreProvider = - BuildScoreProvider.randomAccessScoreProvider( - this, graphNodeIdsToRavvOrds, getVectorSimilarityFunction(fieldInfo)); - // graph = getGraph(buildScoreProvider, this, newToOldOrds, fieldInfo, - // segmentWriteState.segmentInfo.name); - if (!deletesFound - && leadingReader instanceof JVectorReader reader - && reader.hasIndex(fieldName)) { - // Expand graph when there are no deletes and no PQ codebooks - final RandomAccessReader leadingOnHeapGraphReader = - reader.getNeighborsScoreCacheForField(fieldName); - final int numBaseVectors = leadingReader.getFloatVectorValues(fieldName).size(); - graph = - (OnHeapGraphIndex) - GraphIndexBuilder.buildAndMergeNewNodes( - leadingOnHeapGraphReader, - this, - buildScoreProvider, - numBaseVectors, - graphNodeIdsToRavvOrds, - beamWidth, - degreeOverflow, - alpha, - hierarchyEnabled); - } else { - // Build a new graph from scratch when there are deletes and no PQ codebooks - graph = - getGraph( - buildScoreProvider, - this, - graphNodeIdsToRavvOrds, - fieldInfo, - segmentWriteState.segmentInfo.name, - SIMD_POOL_MERGE); - } - } else { + final BuildScoreProvider buildScoreProvider; + if (pqVectors != null) { // Re-use PQ codebooks to build a new graph from scratch buildScoreProvider = BuildScoreProvider.pqBuildScoreProvider( @@ -948,15 +814,19 @@ public void merge() throws IOException { // Pre-init the diversity provider here to avoid doing it lazily (as it could block the SIMD // threads) buildScoreProvider.diversityProviderFor(0); - graph = - getGraph( - buildScoreProvider, - this, - graphNodeIdsToRavvOrds, - fieldInfo, - segmentWriteState.segmentInfo.name, - SIMD_POOL_MERGE); + } else { + buildScoreProvider = + BuildScoreProvider.randomAccessScoreProvider( + this, graphNodeIdsToRavvOrds, getVectorSimilarityFunction(fieldInfo)); } + final OnHeapGraphIndex graph = + getGraph( + buildScoreProvider, + this, + graphNodeIdsToRavvOrds, + fieldInfo, + segmentWriteState.segmentInfo.name, + SIMD_POOL_MERGE); writeField(fieldInfo, this, pqVectors, graphNodeIdsToRavvOrds, graphNodeIdToDocMap, graph); } From 47080f5c7af4f5b644d3cd50879fa3ebc8a9eea0 Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Sat, 1 Nov 2025 17:35:11 +0000 Subject: [PATCH 35/86] Fix leading readers indexing error --- .../lucene/sandbox/codecs/jvector/JVectorWriter.java | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java index bcc3b66d9ae0..93762a726db1 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java @@ -620,7 +620,7 @@ public RandomAccessMergedFloatVectorValues(FieldInfo fieldInfo, MergeState merge int dimension = 0; int tempLeadingReaderIdx = -1; int vectorsCountInLeadingReader = -1; - List allReaders = new ArrayList<>(); + this.readers 
= mergeState.knnVectorsReaders.clone(); final MergeState.DocMap[] docMaps = mergeState.docMaps.clone(); final Bits[] liveDocs = mergeState.liveDocs.clone(); final int[] baseOrds = new int[mergeState.knnVectorsReaders.length]; @@ -635,7 +635,6 @@ public RandomAccessMergedFloatVectorValues(FieldInfo fieldInfo, MergeState merge if (reader != null) { FloatVectorValues values = reader.getFloatVectorValues(fieldName); if (values != null) { - allReaders.add(reader); int vectorCountInReader = values.size(); int liveVectorCountInReader = 0; KnnVectorValues.DocIndexIterator it = values.iterator(); @@ -664,10 +663,6 @@ public RandomAccessMergedFloatVectorValues(FieldInfo fieldInfo, MergeState merge assert (dimension > 0) : "No vectors found for field " + fieldName; this.size = totalVectorsCount; - this.readers = new KnnVectorsReader[allReaders.size()]; - for (int i = 0; i < readers.length; i++) { - readers[i] = allReaders.get(i); - } // always swap the leading reader to the first position // For this part we need to make sure we also swap all the other metadata arrays that are From 34f0685d2ee198eb9f74a959087bfded53650809 Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Sat, 1 Nov 2025 18:12:48 +0000 Subject: [PATCH 36/86] Fix PQ refinement --- .../sandbox/codecs/jvector/JVectorWriter.java | 58 +++++++------------ 1 file changed, 22 insertions(+), 36 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java index 93762a726db1..3f73810c9469 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java @@ -577,7 +577,6 @@ static io.github.jbellis.jvector.vector.VectorSimilarityFunction getVectorSimila class RandomAccessMergedFloatVectorValues implements RandomAccessVectorValues { private static final int READER_ID = 0; private static final int READER_ORD = 1; - private static final int LEADING_READER_IDX = 0; // Array of sub-readers private final KnnVectorsReader[] readers; @@ -598,6 +597,8 @@ class RandomAccessMergedFloatVectorValues implements RandomAccessVectorValues { private final FieldInfo fieldInfo; private final GraphNodeIdToDocMap graphNodeIdToDocMap; private final int[] graphNodeIdsToRavvOrds; + private final int pqReaderIndex; + private final ProductQuantization pq; /** * Creates a random access view over merged float vector values. 
@@ -618,7 +619,8 @@ public RandomAccessMergedFloatVectorValues(FieldInfo fieldInfo, MergeState merge int totalVectorsCount = 0; int totalLiveVectorsCount = 0; int dimension = 0; - int tempLeadingReaderIdx = -1; + int pqReaderIndex = -1; + ProductQuantization pq = null; int vectorsCountInLeadingReader = -1; this.readers = mergeState.knnVectorsReaders.clone(); final MergeState.DocMap[] docMaps = mergeState.docMaps.clone(); @@ -643,10 +645,14 @@ public RandomAccessMergedFloatVectorValues(FieldInfo fieldInfo, MergeState merge liveVectorCountInReader++; } } - if (reader instanceof JVectorReader + if (reader instanceof JVectorReader jVectorReader && liveVectorCountInReader >= vectorsCountInLeadingReader) { vectorsCountInLeadingReader = liveVectorCountInReader; - tempLeadingReaderIdx = i; + final var maybeNewPq = jVectorReader.getProductQuantizationForField(fieldName); + if (maybeNewPq.isPresent()) { + pqReaderIndex = i; + pq = maybeNewPq.get(); + } } totalVectorsCount += vectorCountInReader; totalLiveVectorsCount += liveVectorCountInReader; @@ -662,26 +668,9 @@ public RandomAccessMergedFloatVectorValues(FieldInfo fieldInfo, MergeState merge : "Total number of live vectors exceeds the total number of vectors"; assert (dimension > 0) : "No vectors found for field " + fieldName; + this.pq = pq; + this.pqReaderIndex = pqReaderIndex; this.size = totalVectorsCount; - - // always swap the leading reader to the first position - // For this part we need to make sure we also swap all the other metadata arrays that are - // indexed by reader index - // Such as readers, docMaps, liveDocs, baseOrds, deletedOrds - if (tempLeadingReaderIdx > 0) { - final KnnVectorsReader temp = readers[LEADING_READER_IDX]; - readers[LEADING_READER_IDX] = readers[tempLeadingReaderIdx]; - readers[tempLeadingReaderIdx] = temp; - // also swap the leading doc map to the first position to match the readers - final MergeState.DocMap tempDocMap = docMaps[LEADING_READER_IDX]; - docMaps[LEADING_READER_IDX] = docMaps[tempLeadingReaderIdx]; - docMaps[tempLeadingReaderIdx] = tempDocMap; - // swap base ords - final int tempBaseOrd = baseOrds[LEADING_READER_IDX]; - baseOrds[LEADING_READER_IDX] = baseOrds[tempLeadingReaderIdx]; - baseOrds[tempLeadingReaderIdx] = tempBaseOrd; - } - this.perReaderFloatVectorValues = new FloatVectorValues[readers.length]; this.dimension = dimension; @@ -768,27 +757,24 @@ public void merge() throws IOException { // Get PQ compressor for leading reader final String fieldName = fieldInfo.name; final PQVectors pqVectors; - // Get the leading reader - final var leadingReader = readers[LEADING_READER_IDX].unwrapReaderForField(fieldName); // Check if the leading reader has pre-existing PQ codebooks and if so, refine them with the // remaining vectors - if (leadingReader instanceof JVectorReader reader - && reader.getProductQuantizationForField(fieldName).isPresent()) { - final ProductQuantization leadingCompressor = - reader.getProductQuantizationForField(fieldName).get(); - // Refine the leadingCompressor with the remaining vectors in the merge, we skip the leading - // reader since it's already been - // used to create the leadingCompressor - // We assume the leading reader is ALWAYS the first one in the readers array - for (int i = LEADING_READER_IDX + 1; i < readers.length; i++) { + if (pq != null) { + // Refine the leadingCompressor with the remaining vectors in the merge + ProductQuantization newPq = pq; + for (int i = 0; i < readers.length; i++) { + if (i == pqReaderIndex) { + // Skip the reader associated with 
the re-used PQ codebook + continue; + } final FloatVectorValues values = readers[i].getFloatVectorValues(fieldName); final RandomAccessVectorValues randomAccessVectorValues = new RandomAccessVectorValuesOverVectorValues(values); - leadingCompressor.refine(randomAccessVectorValues); + newPq = newPq.refine(randomAccessVectorValues); } pqVectors = PQVectors.encodeAndBuild( - leadingCompressor, + newPq, graphNodeIdsToRavvOrds.length, graphNodeIdsToRavvOrds, this, From 31b973e4d3c32dc824b24384a5ef2ed616698770 Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Mon, 3 Nov 2025 18:15:35 +0000 Subject: [PATCH 37/86] Dry out JVectorFloatVectorValues --- .../codecs/jvector/JVectorFloatVectorValues.java | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFloatVectorValues.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFloatVectorValues.java index df9b71a385b8..d3b2007c1427 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFloatVectorValues.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFloatVectorValues.java @@ -34,8 +34,6 @@ public class JVectorFloatVectorValues extends FloatVectorValues { private final OnDiskGraphIndex.View view; private final VectorSimilarityFunction similarityFunction; - private final int dimension; - private final int size; private final GraphNodeIdToDocMap graphNodeIdToDocMap; public JVectorFloatVectorValues( @@ -44,20 +42,18 @@ public JVectorFloatVectorValues( GraphNodeIdToDocMap graphNodeIdToDocMap) throws IOException { this.view = onDiskGraphIndex.getView(); - this.dimension = view.dimension(); - this.size = view.size(); this.similarityFunction = similarityFunction; this.graphNodeIdToDocMap = graphNodeIdToDocMap; } @Override public int dimension() { - return dimension; + return view.dimension(); } @Override public int size() { - return size; + return view.size(); } // This allows us to access the vector without copying it to float[] @@ -90,7 +86,7 @@ public int docID() { public int nextDoc() throws IOException { // Advance to the next node docId starts from -1 which is why we need to increment docId by // 1 "size" times - while (docId < size - 1) { + while (docId < size() - 1) { docId++; if (liveNodes.get(docId)) { return docId; @@ -118,10 +114,6 @@ public float[] vectorValue(int i) throws IOException { } } - public VectorFloat vectorValueObject(int i) throws IOException { - return vectorFloatValue(i); - } - @Override public FloatVectorValues copy() throws IOException { return this; From 10a4287330d0ea3958847b82d80eaef48df54a80 Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Mon, 3 Nov 2025 19:15:05 +0000 Subject: [PATCH 38/86] Fix JVectorFloatVectorValues VectorScorers --- .../jvector/JVectorFloatVectorValues.java | 42 ++++++++++++++- .../sandbox/codecs/jvector/JVectorReader.java | 5 +- .../codecs/jvector/JVectorVectorScorer.java | 54 ------------------- 3 files changed, 44 insertions(+), 57 deletions(-) delete mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorVectorScorer.java diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFloatVectorValues.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFloatVectorValues.java index d3b2007c1427..4e8376966613 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFloatVectorValues.java +++ 
b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFloatVectorValues.java @@ -18,6 +18,8 @@ package org.apache.lucene.sandbox.codecs.jvector; import io.github.jbellis.jvector.graph.disk.OnDiskGraphIndex; +import io.github.jbellis.jvector.graph.similarity.ScoreFunction; +import io.github.jbellis.jvector.quantization.PQVectors; import io.github.jbellis.jvector.util.Bits; import io.github.jbellis.jvector.vector.VectorSimilarityFunction; import io.github.jbellis.jvector.vector.VectorizationProvider; @@ -25,6 +27,7 @@ import io.github.jbellis.jvector.vector.types.VectorTypeSupport; import java.io.IOException; import org.apache.lucene.index.FloatVectorValues; +import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.VectorScorer; /// Implements Lucene vector access over a JVector on-disk index @@ -33,15 +36,18 @@ public class JVectorFloatVectorValues extends FloatVectorValues { VectorizationProvider.getInstance().getVectorTypeSupport(); private final OnDiskGraphIndex.View view; + private final PQVectors pq; private final VectorSimilarityFunction similarityFunction; private final GraphNodeIdToDocMap graphNodeIdToDocMap; public JVectorFloatVectorValues( OnDiskGraphIndex onDiskGraphIndex, + PQVectors pq, VectorSimilarityFunction similarityFunction, GraphNodeIdToDocMap graphNodeIdToDocMap) throws IOException { this.view = onDiskGraphIndex.getView(); + this.pq = pq; this.similarityFunction = similarityFunction; this.graphNodeIdToDocMap = graphNodeIdToDocMap; } @@ -121,7 +127,39 @@ public FloatVectorValues copy() throws IOException { @Override public VectorScorer scorer(float[] query) throws IOException { - return new JVectorVectorScorer( - this, VECTOR_TYPE_SUPPORT.createFloatVector(query), similarityFunction); + if (pq != null) { + final var vector = VECTOR_TYPE_SUPPORT.createFloatVector(query); + final var quantizedScoreFunction = pq.precomputedScoreFunctionFor(vector, similarityFunction); + return new JVectorScorer(quantizedScoreFunction, iterator()); + } else { + return rescorer(query); + } + } + + @Override + public VectorScorer rescorer(float[] target) throws IOException { + final var vector = VECTOR_TYPE_SUPPORT.createFloatVector(target); + final var scoreFunction = view.rerankerFor(vector, similarityFunction); + return new JVectorScorer(scoreFunction, iterator()); + } + + private static class JVectorScorer implements VectorScorer { + private final ScoreFunction scoreFunction; + private final DocIndexIterator iterator; + + JVectorScorer(ScoreFunction scoreFunction, DocIndexIterator iterator) { + this.scoreFunction = scoreFunction; + this.iterator = iterator; + } + + @Override + public float score() throws IOException { + return scoreFunction.similarityTo(iterator.index()); + } + + @Override + public DocIdSetIterator iterator() { + return iterator; + } } } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java index a7ca23e98132..ed2564130dad 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java @@ -130,7 +130,10 @@ public VectorScorer scorer(float[] target) throws IOException { } return new JVectorFloatVectorValues( - fieldEntry.index, fieldEntry.similarityFunction, fieldEntry.graphNodeIdToDocMap); + fieldEntry.index, + fieldEntry.pqVectors, + fieldEntry.similarityFunction, + 
fieldEntry.graphNodeIdToDocMap); } @Override diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorVectorScorer.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorVectorScorer.java deleted file mode 100644 index 8c9006dd0901..000000000000 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorVectorScorer.java +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.lucene.sandbox.codecs.jvector; - -import io.github.jbellis.jvector.vector.VectorSimilarityFunction; -import io.github.jbellis.jvector.vector.types.VectorFloat; -import java.io.IOException; -import org.apache.lucene.index.KnnVectorValues; -import org.apache.lucene.search.DocIdSetIterator; -import org.apache.lucene.search.VectorScorer; - -/// Implements Lucene scoring over a JVector index -public class JVectorVectorScorer implements VectorScorer { - private final JVectorFloatVectorValues floatVectorValues; - private final KnnVectorValues.DocIndexIterator docIndexIterator; - private final VectorFloat target; - private final VectorSimilarityFunction similarityFunction; - - public JVectorVectorScorer( - JVectorFloatVectorValues vectorValues, - VectorFloat target, - VectorSimilarityFunction similarityFunction) { - this.floatVectorValues = vectorValues; - this.docIndexIterator = floatVectorValues.iterator(); - this.target = target; - this.similarityFunction = similarityFunction; - } - - @Override - public float score() throws IOException { - return similarityFunction.compare( - target, floatVectorValues.vectorFloatValue(docIndexIterator.index())); - } - - @Override - public DocIdSetIterator iterator() { - return docIndexIterator; - } -} From 65439575cf2a5e164149db4a8250cf033f2a8bd2 Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Mon, 3 Nov 2025 19:17:44 +0000 Subject: [PATCH 39/86] Fix merging empty graphs --- .../apache/lucene/sandbox/codecs/jvector/JVectorWriter.java | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java index 3f73810c9469..6e3499c5d26c 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java @@ -690,7 +690,13 @@ public RandomAccessMergedFloatVectorValues(FieldInfo fieldInfo, MergeState merge int graphNodeId = 0; // Build a new graph from scratch and compact the graph node ids for (int readerIdx = 0; readerIdx < readers.length; readerIdx++) { + if (readers[readerIdx] == null) { + continue; + } final FloatVectorValues values = 
readers[readerIdx].getFloatVectorValues(fieldName); + if (values == null || values.size() == 0) { + continue; + } perReaderFloatVectorValues[readerIdx] = values; // For each vector in this reader KnnVectorValues.DocIndexIterator it = values.iterator(); From 5f069189504d5a320b20aab0b81fe71bfc1fe892 Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Mon, 3 Nov 2025 20:41:24 +0000 Subject: [PATCH 40/86] Pull dimension from merged fieldInfo --- .../apache/lucene/sandbox/codecs/jvector/JVectorWriter.java | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java index 6e3499c5d26c..aee3767c647d 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java @@ -610,6 +610,7 @@ public RandomAccessMergedFloatVectorValues(FieldInfo fieldInfo, MergeState merge throws IOException { this.totalDocsCount = Math.toIntExact(Arrays.stream(mergeState.maxDocs).asLongStream().sum()); this.fieldInfo = fieldInfo; + this.dimension = fieldInfo.getVectorDimension(); final String fieldName = fieldInfo.name; @@ -618,7 +619,6 @@ public RandomAccessMergedFloatVectorValues(FieldInfo fieldInfo, MergeState merge // between global ordinals and global lucene doc ids int totalVectorsCount = 0; int totalLiveVectorsCount = 0; - int dimension = 0; int pqReaderIndex = -1; ProductQuantization pq = null; int vectorsCountInLeadingReader = -1; @@ -656,7 +656,7 @@ public RandomAccessMergedFloatVectorValues(FieldInfo fieldInfo, MergeState merge } totalVectorsCount += vectorCountInReader; totalLiveVectorsCount += liveVectorCountInReader; - dimension = Math.max(dimension, values.dimension()); + assert values.dimension() == dimension; } } } @@ -672,7 +672,6 @@ public RandomAccessMergedFloatVectorValues(FieldInfo fieldInfo, MergeState merge this.pqReaderIndex = pqReaderIndex; this.size = totalVectorsCount; this.perReaderFloatVectorValues = new FloatVectorValues[readers.length]; - this.dimension = dimension; // Build mapping from global ordinal to [readerIndex, readerOrd] this.ravvOrdToReaderMapping = new int[totalDocsCount][2]; From 7cc847993bf51020a07a5b10ac5160feaa265da8 Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Mon, 3 Nov 2025 20:53:33 +0000 Subject: [PATCH 41/86] Fix imports --- .../sandbox/codecs/jvector/JVectorWriter.java | 25 +++++++++++++++---- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java index aee3767c647d..4c4bdca64a59 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java @@ -22,8 +22,11 @@ import static org.apache.lucene.sandbox.codecs.jvector.JVectorFormat.SIMD_POOL_FLUSH; import static org.apache.lucene.sandbox.codecs.jvector.JVectorFormat.SIMD_POOL_MERGE; -import io.github.jbellis.jvector.graph.*; -import io.github.jbellis.jvector.graph.disk.*; +import io.github.jbellis.jvector.graph.GraphIndexBuilder; +import io.github.jbellis.jvector.graph.ListRandomAccessVectorValues; +import io.github.jbellis.jvector.graph.OnHeapGraphIndex; +import io.github.jbellis.jvector.graph.RandomAccessVectorValues; +import 
io.github.jbellis.jvector.graph.disk.OnDiskSequentialGraphIndexWriter; import io.github.jbellis.jvector.graph.disk.feature.Feature; import io.github.jbellis.jvector.graph.disk.feature.FeatureId; import io.github.jbellis.jvector.graph.disk.feature.InlineVectors; @@ -36,7 +39,9 @@ import java.io.IOException; import java.io.UncheckedIOException; import java.io.UnsupportedEncodingException; -import java.util.*; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; import java.util.concurrent.ForkJoinPool; import java.util.function.IntUnaryOperator; import java.util.stream.IntStream; @@ -44,9 +49,19 @@ import org.apache.lucene.codecs.KnnFieldVectorsWriter; import org.apache.lucene.codecs.KnnVectorsReader; import org.apache.lucene.codecs.KnnVectorsWriter; -import org.apache.lucene.index.*; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.FloatVectorValues; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.KnnVectorValues; +import org.apache.lucene.index.MergeState; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.index.Sorter; +import org.apache.lucene.index.VectorEncoding; +import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.search.DocIdSetIterator; -import org.apache.lucene.store.*; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; import org.apache.lucene.util.Bits; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.RamUsageEstimator; From 9f4a55717cb50cc593f55c192d1d888494942198 Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Mon, 3 Nov 2025 22:15:57 +0000 Subject: [PATCH 42/86] Use RemappedRandomAccessVectorValues to abstract graphNodeIdsToRavvOrds --- .../sandbox/codecs/jvector/JVectorWriter.java | 71 +++++-------------- 1 file changed, 18 insertions(+), 53 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java index 4c4bdca64a59..eeb36843adb4 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java @@ -26,6 +26,7 @@ import io.github.jbellis.jvector.graph.ListRandomAccessVectorValues; import io.github.jbellis.jvector.graph.OnHeapGraphIndex; import io.github.jbellis.jvector.graph.RandomAccessVectorValues; +import io.github.jbellis.jvector.graph.RemappedRandomAccessVectorValues; import io.github.jbellis.jvector.graph.disk.OnDiskSequentialGraphIndexWriter; import io.github.jbellis.jvector.graph.disk.feature.Feature; import io.github.jbellis.jvector.graph.disk.feature.FeatureId; @@ -218,15 +219,11 @@ public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOE public void flush(int maxDoc, Sorter.DocMap sortMap) throws IOException { for (FieldWriter field : fields) { final RandomAccessVectorValues randomAccessVectorValues = field.randomAccessVectorValues; - final int[] newToOldOrds = new int[randomAccessVectorValues.size()]; - for (int ord = 0; ord < randomAccessVectorValues.size(); ord++) { - newToOldOrds[ord] = ord; - } final BuildScoreProvider buildScoreProvider; final PQVectors pqVectors; final FieldInfo fieldInfo = field.fieldInfo; if (randomAccessVectorValues.size() >= minimumBatchSizeForQuantization) { - pqVectors = getPQVectors(newToOldOrds, 
randomAccessVectorValues, fieldInfo); + pqVectors = getPQVectors(randomAccessVectorValues, fieldInfo); buildScoreProvider = BuildScoreProvider.pqBuildScoreProvider( getVectorSimilarityFunction(fieldInfo), pqVectors); @@ -252,17 +249,11 @@ public void flush(int maxDoc, Sorter.DocMap sortMap) throws IOException { getGraph( buildScoreProvider, randomAccessVectorValues, - newToOldOrds, fieldInfo, segmentWriteState.segmentInfo.name, SIMD_POOL_FLUSH); writeField( - field.fieldInfo, - field.randomAccessVectorValues, - pqVectors, - newToOldOrds, - graphNodeIdToDocMap, - graph); + field.fieldInfo, field.randomAccessVectorValues, pqVectors, graphNodeIdToDocMap, graph); } } @@ -270,18 +261,11 @@ private void writeField( FieldInfo fieldInfo, RandomAccessVectorValues randomAccessVectorValues, PQVectors pqVectors, - int[] newToOldOrds, GraphNodeIdToDocMap graphNodeIdToDocMap, OnHeapGraphIndex graph) throws IOException { final var vectorIndexFieldMetadata = - writeGraph( - graph, - randomAccessVectorValues, - fieldInfo, - pqVectors, - newToOldOrds, - graphNodeIdToDocMap); + writeGraph(graph, randomAccessVectorValues, fieldInfo, pqVectors, graphNodeIdToDocMap); meta.writeInt(fieldInfo.number); vectorIndexFieldMetadata.toOutput(meta); } @@ -300,7 +284,6 @@ private VectorIndexFieldMetadata writeGraph( RandomAccessVectorValues randomAccessVectorValues, FieldInfo fieldInfo, PQVectors pqVectors, - int[] newToOldOrds, GraphNodeIdToDocMap graphNodeIdToDocMap) throws IOException { // field data file, which contains the graph @@ -340,9 +323,7 @@ private VectorIndexFieldMetadata writeGraph( var suppliers = Feature.singleStateFactory( FeatureId.INLINE_VECTORS, - nodeId -> - new InlineVectors.State( - randomAccessVectorValues.getVector(newToOldOrds[nodeId]))); + nodeId -> new InlineVectors.State(randomAccessVectorValues.getVector(nodeId))); writer.write(suppliers); final long endGraphOffset = jVectorIndexWriter.position(); @@ -377,8 +358,7 @@ private VectorIndexFieldMetadata writeGraph( } private PQVectors getPQVectors( - int[] newToOldOrds, RandomAccessVectorValues randomAccessVectorValues, FieldInfo fieldInfo) - throws IOException { + RandomAccessVectorValues randomAccessVectorValues, FieldInfo fieldInfo) throws IOException { final VectorSimilarityFunction vectorSimilarityFunction = fieldInfo.getVectorSimilarityFunction(); final int M = @@ -396,11 +376,7 @@ private PQVectors getPQVectors( SIMD_POOL_MERGE, ForkJoinPool.commonPool()); - // PQVectors pqVectors = pq.encodeAll(randomAccessVectorValues, SIMD_POOL); - PQVectors pqVectors = - PQVectors.encodeAndBuild( - pq, newToOldOrds.length, newToOldOrds, randomAccessVectorValues, SIMD_POOL_MERGE); - return pqVectors; + return pq.encodeAll(randomAccessVectorValues, SIMD_POOL_MERGE); } /// Metadata about the index to be persisted on disk @@ -773,6 +749,8 @@ public RandomAccessMergedFloatVectorValues(FieldInfo fieldInfo, MergeState merge * @throws IOException if there is an issue during reading or writing vector data. 
*/ public void merge() throws IOException { + final RandomAccessVectorValues mapped = + new RemappedRandomAccessVectorValues(this, graphNodeIdsToRavvOrds); // This section creates the PQVectors to be used for this merge // Get PQ compressor for leading reader final String fieldName = fieldInfo.name; @@ -792,16 +770,10 @@ public void merge() throws IOException { new RandomAccessVectorValuesOverVectorValues(values); newPq = newPq.refine(randomAccessVectorValues); } - pqVectors = - PQVectors.encodeAndBuild( - newPq, - graphNodeIdsToRavvOrds.length, - graphNodeIdsToRavvOrds, - this, - SIMD_POOL_MERGE); - } else if (this.size() >= minimumBatchSizeForQuantization) { + pqVectors = newPq.encodeAll(mapped, SIMD_POOL_MERGE); + } else if (mapped.size() >= minimumBatchSizeForQuantization) { // No pre-existing codebooks, check if we have enough vectors to trigger quantization - pqVectors = getPQVectors(graphNodeIdsToRavvOrds, this, fieldInfo); + pqVectors = getPQVectors(mapped, fieldInfo); } else { pqVectors = null; } @@ -816,20 +788,17 @@ public void merge() throws IOException { // threads) buildScoreProvider.diversityProviderFor(0); } else { - buildScoreProvider = - BuildScoreProvider.randomAccessScoreProvider( - this, graphNodeIdsToRavvOrds, getVectorSimilarityFunction(fieldInfo)); + buildScoreProvider = BuildScoreProvider.randomAccessScoreProvider(mapped, getVectorSimilarityFunction(fieldInfo)); } final OnHeapGraphIndex graph = getGraph( buildScoreProvider, - this, - graphNodeIdsToRavvOrds, + mapped, fieldInfo, segmentWriteState.segmentInfo.name, SIMD_POOL_MERGE); - writeField(fieldInfo, this, pqVectors, graphNodeIdsToRavvOrds, graphNodeIdToDocMap, graph); + writeField(fieldInfo, mapped, pqVectors, graphNodeIdToDocMap, graph); } @Override @@ -884,7 +853,6 @@ public RandomAccessVectorValues copy() { public OnHeapGraphIndex getGraph( BuildScoreProvider buildScoreProvider, RandomAccessVectorValues randomAccessVectorValues, - int[] newToOldOrds, FieldInfo fieldInfo, String segmentName, ForkJoinPool SIMD_POOL) { @@ -908,16 +876,13 @@ public OnHeapGraphIndex getGraph( var vv = randomAccessVectorValues.threadLocalSupplier(); // parallel graph construction from the merge documents Ids + final int size = randomAccessVectorValues.size(); SIMD_POOL .submit( () -> - IntStream.range(0, newToOldOrds.length) + IntStream.range(0, size) .parallel() - .forEach( - ord -> { - graphIndexBuilder.addGraphNode( - ord, vv.get().getVector(newToOldOrds[ord])); - })) + .forEach(ord -> graphIndexBuilder.addGraphNode(ord, vv.get().getVector(ord)))) .join(); graphIndexBuilder.cleanup(); graphIndex = (OnHeapGraphIndex) graphIndexBuilder.getGraph(); From a4d32c435642c093a976c6bcc39e0c867297445a Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Tue, 4 Nov 2025 17:52:14 +0000 Subject: [PATCH 43/86] Fix GraphNodeIdToDocMap deserialization --- .../lucene/sandbox/codecs/jvector/GraphNodeIdToDocMap.java | 1 + 1 file changed, 1 insertion(+) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/GraphNodeIdToDocMap.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/GraphNodeIdToDocMap.java index 0bd8febec442..c526bbafe152 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/GraphNodeIdToDocMap.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/GraphNodeIdToDocMap.java @@ -53,6 +53,7 @@ public GraphNodeIdToDocMap(IndexInput in) throws IOException { graphNodeIdsToDocIds = new int[size]; docIdsToGraphNodeIds = new int[maxDocId]; + 
Arrays.fill(docIdsToGraphNodeIds, -1); for (int ord = 0; ord < size; ord++) { final int docId = in.readVInt(); graphNodeIdsToDocIds[ord] = docId; From 765cfaefb6f47465587b70300254736e6c2d95e5 Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Tue, 4 Nov 2025 18:11:25 +0000 Subject: [PATCH 44/86] Remove type parameter from FieldWriter --- .../sandbox/codecs/jvector/JVectorWriter.java | 31 ++++++------------- 1 file changed, 9 insertions(+), 22 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java index eeb36843adb4..e521de84c81b 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java @@ -105,7 +105,7 @@ public class JVectorWriter extends KnnVectorsWriter { private static final long SHALLOW_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(JVectorWriter.class); - private final List> fields = new ArrayList<>(); + private final List fields = new ArrayList<>(); private final IndexOutput meta; private final IndexOutput vectorIndex; @@ -193,7 +193,7 @@ public KnnFieldVectorsWriter addField(FieldInfo fieldInfo) throws IOException + "This can provides much greater savings in storage and memory"; throw new UnsupportedOperationException(errorMessage); } - FieldWriter newField = new FieldWriter<>(fieldInfo); + FieldWriter newField = new FieldWriter(fieldInfo); fields.add(newField); return newField; @@ -217,7 +217,7 @@ public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOE @Override public void flush(int maxDoc, Sorter.DocMap sortMap) throws IOException { - for (FieldWriter field : fields) { + for (FieldWriter field : fields) { final RandomAccessVectorValues randomAccessVectorValues = field.randomAccessVectorValues; final BuildScoreProvider buildScoreProvider; final PQVectors pqVectors; @@ -469,7 +469,7 @@ public void close() throws IOException { @Override public long ramBytesUsed() { long total = SHALLOW_RAM_BYTES_USED; - for (FieldWriter field : fields) { + for (FieldWriter field : fields) { // the field tracks the delegate field usage total += field.ramBytesUsed(); } @@ -480,11 +480,8 @@ public long ramBytesUsed() { * The FieldWriter class is responsible for writing vector field data into index segments. It * provides functionality to process vector values as those being added, manage memory usage, and * build HNSW graph indexing structures for efficient retrieval during search queries. - * - * @param The type of vector value to be handled by the writer. This is often specialized to - * support specific implementations, such as float[] or byte[] vectors. 
*/ - static class FieldWriter extends KnnFieldVectorsWriter { + static class FieldWriter extends KnnFieldVectorsWriter { private final long SHALLOW_SIZE = RamUsageEstimator.shallowSizeOfInstance(FieldWriter.class); private final FieldInfo fieldInfo; private int lastDocID = -1; @@ -502,7 +499,7 @@ static class FieldWriter extends KnnFieldVectorsWriter { } @Override - public void addValue(int docID, T vectorValue) throws IOException { + public void addValue(int docID, float[] vectorValue) throws IOException { if (docID == lastDocID) { throw new IllegalArgumentException( "VectorValuesField \"" @@ -510,24 +507,14 @@ public void addValue(int docID, T vectorValue) throws IOException { + "\" appears more than once in this document (only one value is allowed per field)"); } docIds.add(docID); - if (vectorValue instanceof float[]) { - vectors.add(VECTOR_TYPE_SUPPORT.createFloatVector(vectorValue)); - } else if (vectorValue instanceof byte[]) { - final String errorMessage = - "byte[] vectors are not supported in JVector. " - + "Instead you should only use float vectors and leverage product quantization during indexing." - + "This can provides much greater savings in storage and memory"; - throw new UnsupportedOperationException(errorMessage); - } else { - throw new IllegalArgumentException("Unsupported vector type: " + vectorValue.getClass()); - } + vectors.add(VECTOR_TYPE_SUPPORT.createFloatVector(vectorValue)); lastDocID = docID; } @Override - public T copyValue(T vectorValue) { - throw new UnsupportedOperationException("copyValue not supported"); + public float[] copyValue(float[] vectorValue) { + return vectorValue.clone(); } @Override From 19c9c3c0125645893c1d77fcb6040de6f3e971b4 Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Tue, 4 Nov 2025 18:14:13 +0000 Subject: [PATCH 45/86] Fix missing copy on FieldWriter.addValue --- .../org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java index e521de84c81b..c5bc0407e73a 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java @@ -507,7 +507,7 @@ public void addValue(int docID, float[] vectorValue) throws IOException { + "\" appears more than once in this document (only one value is allowed per field)"); } docIds.add(docID); - vectors.add(VECTOR_TYPE_SUPPORT.createFloatVector(vectorValue)); + vectors.add(VECTOR_TYPE_SUPPORT.createFloatVector(copyValue(vectorValue))); lastDocID = docID; } From 8a9d5ab58113b1a6ef29bfa15b90ce54fa0e1c3e Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Tue, 4 Nov 2025 18:17:48 +0000 Subject: [PATCH 46/86] Move DocIndexIterator logic to GraphNodeIdToDocMap --- .../codecs/jvector/GraphNodeIdToDocMap.java | 46 +++++++++++++++++++ .../jvector/JVectorFloatVectorValues.java | 41 +---------------- 2 files changed, 47 insertions(+), 40 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/GraphNodeIdToDocMap.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/GraphNodeIdToDocMap.java index c526bbafe152..8c1b1339da48 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/GraphNodeIdToDocMap.java +++ 
b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/GraphNodeIdToDocMap.java @@ -17,9 +17,11 @@ package org.apache.lucene.sandbox.codecs.jvector; +import io.github.jbellis.jvector.util.Bits; import java.io.IOException; import java.util.Arrays; import org.apache.lucene.index.Sorter; +import org.apache.lucene.index.KnnVectorValues.DocIndexIterator; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; @@ -165,4 +167,48 @@ public void toOutput(IndexOutput out) throws IOException { out.writeVInt(graphNodeIdsToDocIds[ord]); } } + + public DocIndexIterator iterator(Bits liveOrds) { + return new DocIndexIterator() { + int docId = -1; + @Override + public int index() { + return docIdsToGraphNodeIds[docId]; + } + + @Override + public int docID() { + return docId; + } + + @Override + public int nextDoc() throws IOException { + while (docId < docIdsToGraphNodeIds.length - 1) { + ++docId; + final int ord = docIdsToGraphNodeIds[docId]; + if (ord >= 0 && liveOrds.get(ord)) { + return docId; + } + } + return docId = NO_MORE_DOCS; + } + + @Override + public int advance(int target) throws IOException { + if (target <= docId) { + throw new IllegalArgumentException(); + } else if (target >= docIdsToGraphNodeIds.length) { + return docId = NO_MORE_DOCS; + } + + docId = target - 1; + return nextDoc(); + } + + @Override + public long cost() { + return graphNodeIdsToDocIds.length; + } + }; + } } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFloatVectorValues.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFloatVectorValues.java index 4e8376966613..14a982dc8ab3 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFloatVectorValues.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFloatVectorValues.java @@ -20,7 +20,6 @@ import io.github.jbellis.jvector.graph.disk.OnDiskGraphIndex; import io.github.jbellis.jvector.graph.similarity.ScoreFunction; import io.github.jbellis.jvector.quantization.PQVectors; -import io.github.jbellis.jvector.util.Bits; import io.github.jbellis.jvector.vector.VectorSimilarityFunction; import io.github.jbellis.jvector.vector.VectorizationProvider; import io.github.jbellis.jvector.vector.types.VectorFloat; @@ -69,45 +68,7 @@ public VectorFloat vectorFloatValue(int ord) { @Override public DocIndexIterator iterator() { - return new DocIndexIterator() { - private int docId = -1; - private final Bits liveNodes = view.liveNodes(); - - @Override - public long cost() { - return size(); - } - - @Override - public int index() { - return graphNodeIdToDocMap.getJVectorNodeId(docId); - } - - @Override - public int docID() { - return docId; - } - - @Override - public int nextDoc() throws IOException { - // Advance to the next node docId starts from -1 which is why we need to increment docId by - // 1 "size" times - while (docId < size() - 1) { - docId++; - if (liveNodes.get(docId)) { - return docId; - } - } - docId = NO_MORE_DOCS; - - return docId; - } - - @Override - public int advance(int target) throws IOException { - return slowAdvance(target); - } - }; + return graphNodeIdToDocMap.iterator(view.liveNodes()); } @Override From 64aefe5f013a78462fbad2ffe56342d3b6b69bae Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Wed, 5 Nov 2025 20:45:31 +0000 Subject: [PATCH 47/86] Replace OnDiskGraphIndex.View.liveBits() check with assertion --- .../lucene/sandbox/codecs/jvector/GraphNodeIdToDocMap.java | 5 ++--- 
.../sandbox/codecs/jvector/JVectorFloatVectorValues.java | 4 +++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/GraphNodeIdToDocMap.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/GraphNodeIdToDocMap.java index 8c1b1339da48..f6ba30784994 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/GraphNodeIdToDocMap.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/GraphNodeIdToDocMap.java @@ -17,7 +17,6 @@ package org.apache.lucene.sandbox.codecs.jvector; -import io.github.jbellis.jvector.util.Bits; import java.io.IOException; import java.util.Arrays; import org.apache.lucene.index.Sorter; @@ -168,7 +167,7 @@ public void toOutput(IndexOutput out) throws IOException { } } - public DocIndexIterator iterator(Bits liveOrds) { + public DocIndexIterator iterator() { return new DocIndexIterator() { int docId = -1; @Override @@ -186,7 +185,7 @@ public int nextDoc() throws IOException { while (docId < docIdsToGraphNodeIds.length - 1) { ++docId; final int ord = docIdsToGraphNodeIds[docId]; - if (ord >= 0 && liveOrds.get(ord)) { + if (ord >= 0) { return docId; } } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFloatVectorValues.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFloatVectorValues.java index 14a982dc8ab3..09047727baf1 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFloatVectorValues.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFloatVectorValues.java @@ -20,6 +20,7 @@ import io.github.jbellis.jvector.graph.disk.OnDiskGraphIndex; import io.github.jbellis.jvector.graph.similarity.ScoreFunction; import io.github.jbellis.jvector.quantization.PQVectors; +import io.github.jbellis.jvector.util.Bits.MatchAllBits; import io.github.jbellis.jvector.vector.VectorSimilarityFunction; import io.github.jbellis.jvector.vector.VectorizationProvider; import io.github.jbellis.jvector.vector.types.VectorFloat; @@ -68,7 +69,8 @@ public VectorFloat vectorFloatValue(int ord) { @Override public DocIndexIterator iterator() { - return graphNodeIdToDocMap.iterator(view.liveNodes()); + assert view.liveNodes() instanceof MatchAllBits : "All OnDiskGraphIndex nodes must be live"; + return graphNodeIdToDocMap.iterator(); } @Override From 6defb82c41fb04c62a050cfcf79a1f0fc337b50b Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Tue, 4 Nov 2025 18:37:18 +0000 Subject: [PATCH 48/86] Extract FieldWriter.randomAccessVectorValues to method --- .../lucene/sandbox/codecs/jvector/JVectorWriter.java | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java index c5bc0407e73a..bf9458b91520 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java @@ -218,7 +218,7 @@ public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOE @Override public void flush(int maxDoc, Sorter.DocMap sortMap) throws IOException { for (FieldWriter field : fields) { - final RandomAccessVectorValues randomAccessVectorValues = field.randomAccessVectorValues; + final RandomAccessVectorValues randomAccessVectorValues = 
field.toRandomAccessVectorValues(); final BuildScoreProvider buildScoreProvider; final PQVectors pqVectors; final FieldInfo fieldInfo = field.fieldInfo; @@ -252,8 +252,7 @@ public void flush(int maxDoc, Sorter.DocMap sortMap) throws IOException { fieldInfo, segmentWriteState.segmentInfo.name, SIMD_POOL_FLUSH); - writeField( - field.fieldInfo, field.randomAccessVectorValues, pqVectors, graphNodeIdToDocMap, graph); + writeField(field.fieldInfo, randomAccessVectorValues, pqVectors, graphNodeIdToDocMap, graph); } } @@ -485,7 +484,6 @@ static class FieldWriter extends KnnFieldVectorsWriter { private final long SHALLOW_SIZE = RamUsageEstimator.shallowSizeOfInstance(FieldWriter.class); private final FieldInfo fieldInfo; private int lastDocID = -1; - private final RandomAccessVectorValues randomAccessVectorValues; // The ordering of docIds matches the ordering of vectors, the index in this list corresponds to // the jVector ordinal private final List> vectors = new ArrayList<>(); @@ -493,8 +491,6 @@ static class FieldWriter extends KnnFieldVectorsWriter { FieldWriter(FieldInfo fieldInfo) { /** For creating a new field from a flat field vectors writer. */ - this.randomAccessVectorValues = - new ListRandomAccessVectorValues(vectors, fieldInfo.getVectorDimension()); this.fieldInfo = fieldInfo; } @@ -517,6 +513,10 @@ public float[] copyValue(float[] vectorValue) { return vectorValue.clone(); } + public RandomAccessVectorValues toRandomAccessVectorValues() { + return new ListRandomAccessVectorValues(vectors, fieldInfo.getVectorDimension()); + } + @Override public long ramBytesUsed() { return SHALLOW_SIZE + (long) vectors.size() * fieldInfo.getVectorDimension() * Float.BYTES; From ae5cb1619d93af7f5b146ee73dea538db41cc1f8 Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Tue, 4 Nov 2025 18:48:40 +0000 Subject: [PATCH 49/86] Use DocsWithFieldSet instead of List in FieldWriter --- .../sandbox/codecs/jvector/JVectorWriter.java | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java index bf9458b91520..58211b49c2ca 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java @@ -21,6 +21,7 @@ import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsReader.readVectorEncoding; import static org.apache.lucene.sandbox.codecs.jvector.JVectorFormat.SIMD_POOL_FLUSH; import static org.apache.lucene.sandbox.codecs.jvector.JVectorFormat.SIMD_POOL_MERGE; +import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; import io.github.jbellis.jvector.graph.GraphIndexBuilder; import io.github.jbellis.jvector.graph.ListRandomAccessVectorValues; @@ -50,6 +51,7 @@ import org.apache.lucene.codecs.KnnFieldVectorsWriter; import org.apache.lucene.codecs.KnnVectorsReader; import org.apache.lucene.codecs.KnnVectorsWriter; +import org.apache.lucene.index.DocsWithFieldSet; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfos; import org.apache.lucene.index.FloatVectorValues; @@ -237,8 +239,10 @@ public void flush(int maxDoc, Sorter.DocMap sortMap) throws IOException { // Generate the ord to doc mapping final int[] ordinalsToDocIds = new int[randomAccessVectorValues.size()]; - for (int ord = 0; ord < randomAccessVectorValues.size(); ord++) { - 
ordinalsToDocIds[ord] = field.docIds.get(ord); + int ord = 0; + final var docIter = field.docIds.iterator(); + for (int docId = docIter.nextDoc(); docId != NO_MORE_DOCS; docId = docIter.nextDoc()) { + ordinalsToDocIds[ord++] = docId; } final GraphNodeIdToDocMap graphNodeIdToDocMap = new GraphNodeIdToDocMap(ordinalsToDocIds); if (sortMap != null) { @@ -483,20 +487,20 @@ public long ramBytesUsed() { static class FieldWriter extends KnnFieldVectorsWriter { private final long SHALLOW_SIZE = RamUsageEstimator.shallowSizeOfInstance(FieldWriter.class); private final FieldInfo fieldInfo; - private int lastDocID = -1; // The ordering of docIds matches the ordering of vectors, the index in this list corresponds to // the jVector ordinal private final List> vectors = new ArrayList<>(); - private final List docIds = new ArrayList<>(); + private DocsWithFieldSet docIds; FieldWriter(FieldInfo fieldInfo) { /** For creating a new field from a flat field vectors writer. */ this.fieldInfo = fieldInfo; + this.docIds = new DocsWithFieldSet(); } @Override public void addValue(int docID, float[] vectorValue) throws IOException { - if (docID == lastDocID) { + if (docID < docIds.cardinality()) { throw new IllegalArgumentException( "VectorValuesField \"" + fieldInfo.name @@ -504,8 +508,6 @@ public void addValue(int docID, float[] vectorValue) throws IOException { } docIds.add(docID); vectors.add(VECTOR_TYPE_SUPPORT.createFloatVector(copyValue(vectorValue))); - - lastDocID = docID; } @Override @@ -519,7 +521,9 @@ public RandomAccessVectorValues toRandomAccessVectorValues() { @Override public long ramBytesUsed() { - return SHALLOW_SIZE + (long) vectors.size() * fieldInfo.getVectorDimension() * Float.BYTES; + return SHALLOW_SIZE + + (long) vectors.size() * fieldInfo.getVectorDimension() * Float.BYTES + + docIds.ramBytesUsed(); } } From 19817295e94e19e4d8aceaf87661c2939e03d075 Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Tue, 4 Nov 2025 21:28:18 +0000 Subject: [PATCH 50/86] Add GraphNodeIdToDocMap constructor from DocsWithFieldSet --- .../codecs/jvector/GraphNodeIdToDocMap.java | 32 +++++++++++++++++++ .../sandbox/codecs/jvector/JVectorWriter.java | 14 +++----- 2 files changed, 37 insertions(+), 9 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/GraphNodeIdToDocMap.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/GraphNodeIdToDocMap.java index f6ba30784994..29592c2198b9 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/GraphNodeIdToDocMap.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/GraphNodeIdToDocMap.java @@ -17,9 +17,13 @@ package org.apache.lucene.sandbox.codecs.jvector; +import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; + import java.io.IOException; +import java.io.UncheckedIOException; import java.util.Arrays; import org.apache.lucene.index.Sorter; +import org.apache.lucene.index.DocsWithFieldSet; import org.apache.lucene.index.KnnVectorValues.DocIndexIterator; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; @@ -97,6 +101,33 @@ public GraphNodeIdToDocMap(int[] graphNodeIdsToDocIds) { } } + public GraphNodeIdToDocMap(DocsWithFieldSet docs) { + this.graphNodeIdsToDocIds = new int[docs.cardinality()]; + + int ord = 0; + int maxDocId = -1; + final var docsIterator = docs.iterator(); + try { + for (int docId = docsIterator.nextDoc(); + docId != NO_MORE_DOCS; + docId = docsIterator.nextDoc()) { + graphNodeIdsToDocIds[ord++] = docId; 
+ if (docId > maxDocId) { + maxDocId = docId; + } + } + } catch (IOException e) { + // This should never happen; docsIterator should be FixedBitSet or DocSetIterator.all() + throw new UncheckedIOException(e); + } + + this.docIdsToGraphNodeIds = new int[maxDocId + 1]; + Arrays.fill(docIdsToGraphNodeIds, -1); + for (ord = 0; ord < graphNodeIdsToDocIds.length; ++ord) { + docIdsToGraphNodeIds[graphNodeIdsToDocIds[ord]] = ord; + } + } + /** * Updates the mapping from the Lucene document IDs to the jVector ordinals based on the sort * operation. (during flush) @@ -170,6 +201,7 @@ public void toOutput(IndexOutput out) throws IOException { public DocIndexIterator iterator() { return new DocIndexIterator() { int docId = -1; + @Override public int index() { return docIdsToGraphNodeIds[docId]; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java index 58211b49c2ca..e4d320034551 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java @@ -21,7 +21,6 @@ import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsReader.readVectorEncoding; import static org.apache.lucene.sandbox.codecs.jvector.JVectorFormat.SIMD_POOL_FLUSH; import static org.apache.lucene.sandbox.codecs.jvector.JVectorFormat.SIMD_POOL_MERGE; -import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; import io.github.jbellis.jvector.graph.GraphIndexBuilder; import io.github.jbellis.jvector.graph.ListRandomAccessVectorValues; @@ -237,14 +236,7 @@ public void flush(int maxDoc, Sorter.DocMap sortMap) throws IOException { randomAccessVectorValues, getVectorSimilarityFunction(fieldInfo)); } - // Generate the ord to doc mapping - final int[] ordinalsToDocIds = new int[randomAccessVectorValues.size()]; - int ord = 0; - final var docIter = field.docIds.iterator(); - for (int docId = docIter.nextDoc(); docId != NO_MORE_DOCS; docId = docIter.nextDoc()) { - ordinalsToDocIds[ord++] = docId; - } - final GraphNodeIdToDocMap graphNodeIdToDocMap = new GraphNodeIdToDocMap(ordinalsToDocIds); + final GraphNodeIdToDocMap graphNodeIdToDocMap = field.createGraphNodeIdToDocMap(); if (sortMap != null) { graphNodeIdToDocMap.update(sortMap); } @@ -519,6 +511,10 @@ public RandomAccessVectorValues toRandomAccessVectorValues() { return new ListRandomAccessVectorValues(vectors, fieldInfo.getVectorDimension()); } + public GraphNodeIdToDocMap createGraphNodeIdToDocMap() { + return new GraphNodeIdToDocMap(docIds); + } + @Override public long ramBytesUsed() { return SHALLOW_SIZE From 3171addcd27b6161db8fa704c9af58069d7931fd Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Tue, 4 Nov 2025 21:30:22 +0000 Subject: [PATCH 51/86] Fix sort sort-on-flush logic --- .../codecs/jvector/GraphNodeIdToDocMap.java | 38 +------------------ .../sandbox/codecs/jvector/JVectorWriter.java | 36 +++++++++++++++--- 2 files changed, 33 insertions(+), 41 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/GraphNodeIdToDocMap.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/GraphNodeIdToDocMap.java index 29592c2198b9..d0324f342268 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/GraphNodeIdToDocMap.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/GraphNodeIdToDocMap.java @@ -22,7 +22,6 @@ import 
java.io.IOException; import java.io.UncheckedIOException; import java.util.Arrays; -import org.apache.lucene.index.Sorter; import org.apache.lucene.index.DocsWithFieldSet; import org.apache.lucene.index.KnnVectorValues.DocIndexIterator; import org.apache.lucene.store.IndexInput; @@ -39,8 +38,8 @@ */ public class GraphNodeIdToDocMap { private static final int VERSION = 1; - private int[] graphNodeIdsToDocIds; - private int[] docIdsToGraphNodeIds; + private final int[] graphNodeIdsToDocIds; + private final int[] docIdsToGraphNodeIds; /** * Constructor that reads the mapping from the index input @@ -128,39 +127,6 @@ public GraphNodeIdToDocMap(DocsWithFieldSet docs) { } } - /** - * Updates the mapping from the Lucene document IDs to the jVector ordinals based on the sort - * operation. (during flush) - * - * @param sortMap The sort map - */ - public void update(Sorter.DocMap sortMap) { - final int[] newGraphNodeIdsToDocIds = new int[graphNodeIdsToDocIds.length]; - final int maxNewDocId = - Arrays.stream(graphNodeIdsToDocIds).map(sortMap::oldToNew).max().getAsInt(); - final int maxDocs = maxNewDocId + 1; - if (maxDocs < graphNodeIdsToDocIds.length) { - throw new IllegalStateException( - "Max docs " - + maxDocs - + " is less than the number of ordinals " - + graphNodeIdsToDocIds.length); - } - final int[] newDocIdsToOrdinals = new int[maxDocs]; - Arrays.fill(newDocIdsToOrdinals, -1); - for (int oldDocId = 0; oldDocId < docIdsToGraphNodeIds.length; oldDocId++) { - if (docIdsToGraphNodeIds[oldDocId] == -1) { - continue; - } - final int newDocId = sortMap.oldToNew(oldDocId); - final int oldOrd = docIdsToGraphNodeIds[oldDocId]; - newDocIdsToOrdinals[newDocId] = oldOrd; - newGraphNodeIdsToDocIds[oldOrd] = newDocId; - } - this.docIdsToGraphNodeIds = newDocIdsToOrdinals; - this.graphNodeIdsToDocIds = newGraphNodeIdsToDocIds; - } - /** * Returns the jVector node id for the given Lucene document ID * diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java index e4d320034551..e348cfb5f831 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java @@ -42,6 +42,7 @@ import java.io.UnsupportedEncodingException; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collections; import java.util.List; import java.util.concurrent.ForkJoinPool; import java.util.function.IntUnaryOperator; @@ -98,7 +99,7 @@ * {@link GraphNodeIdToDocMap} class in the index metadata and allowing us to update the mapping as * needed across merges by constructing a new mapping from the previous mapping and the {@link * org.apache.lucene.index.MergeState.DocMap} provided in the {@link MergeState}. And across sorts - * with {@link GraphNodeIdToDocMap#update(Sorter.DocMap)} during flushes. + * with {@link FieldWriter#applySort(Sorter.DocMap)} during flushes. 
*/ public class JVectorWriter extends KnnVectorsWriter { private static final VectorTypeSupport VECTOR_TYPE_SUPPORT = @@ -219,6 +220,9 @@ public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOE @Override public void flush(int maxDoc, Sorter.DocMap sortMap) throws IOException { for (FieldWriter field : fields) { + if (sortMap != null) { + field.applySort(sortMap); + } final RandomAccessVectorValues randomAccessVectorValues = field.toRandomAccessVectorValues(); final BuildScoreProvider buildScoreProvider; final PQVectors pqVectors; @@ -237,10 +241,6 @@ public void flush(int maxDoc, Sorter.DocMap sortMap) throws IOException { } final GraphNodeIdToDocMap graphNodeIdToDocMap = field.createGraphNodeIdToDocMap(); - if (sortMap != null) { - graphNodeIdToDocMap.update(sortMap); - } - OnHeapGraphIndex graph = getGraph( buildScoreProvider, @@ -507,6 +507,32 @@ public float[] copyValue(float[] vectorValue) { return vectorValue.clone(); } + public void applySort(Sorter.DocMap sortMap) throws IOException { + // Ensure that all existing docs can be sorted + final int[] oldToNewOrd = new int[vectors.size()]; + final DocsWithFieldSet oldDocIds = docIds; + docIds = new DocsWithFieldSet(); + mapOldOrdToNewOrd(oldDocIds, sortMap, oldToNewOrd, null, docIds); + + // Swap vectors into their new ordinals + for (int oldOrd = 0; oldOrd < vectors.size(); ++oldOrd) { + final int newOrd = oldToNewOrd[oldOrd]; + if (oldOrd == newOrd) { + continue; + } + + // Swap the element at oldOrd into its position at newOrd and update the index mapping + Collections.swap(vectors, oldOrd, newOrd); + oldToNewOrd[oldOrd] = oldToNewOrd[newOrd]; + oldToNewOrd[newOrd] = newOrd; + + // The element at oldOrd may be displaced and need to be swapped again + if (oldToNewOrd[oldOrd] != oldOrd) { + oldOrd -= 1; + } + } + } + public RandomAccessVectorValues toRandomAccessVectorValues() { return new ListRandomAccessVectorValues(vectors, fieldInfo.getVectorDimension()); } From 38e0b47afa5c50cb8caded61ae9b85ce96f28256 Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Tue, 4 Nov 2025 22:45:26 +0000 Subject: [PATCH 52/86] Fix JVectorFloatVectorValues.ordToDoc --- .../sandbox/codecs/jvector/JVectorFloatVectorValues.java | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFloatVectorValues.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFloatVectorValues.java index 09047727baf1..2af052ee2789 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFloatVectorValues.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFloatVectorValues.java @@ -62,6 +62,11 @@ public int size() { return view.size(); } + @Override + public int ordToDoc(int ord) { + return graphNodeIdToDocMap.getLuceneDocId(ord); + } + // This allows us to access the vector without copying it to float[] public VectorFloat vectorFloatValue(int ord) { return view.getVector(ord); From b2340dd20b91e4a241cda0c849a988242fcdd55c Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Tue, 4 Nov 2025 22:45:10 +0000 Subject: [PATCH 53/86] Pull out merging functionality into method --- .../codecs/jvector/GraphNodeIdToDocMap.java | 35 -- .../jvector/JVectorFloatVectorValues.java | 4 + .../sandbox/codecs/jvector/JVectorWriter.java | 432 +++++++----------- 3 files changed, 175 insertions(+), 296 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/GraphNodeIdToDocMap.java 
b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/GraphNodeIdToDocMap.java index d0324f342268..0c733c73b34b 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/GraphNodeIdToDocMap.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/GraphNodeIdToDocMap.java @@ -65,41 +65,6 @@ public GraphNodeIdToDocMap(IndexInput in) throws IOException { } } - /** - * Constructor that creates a new mapping between ordinals and docIds - * - * @param graphNodeIdsToDocIds The mapping from ordinals to docIds - */ - public GraphNodeIdToDocMap(int[] graphNodeIdsToDocIds) { - if (graphNodeIdsToDocIds.length == 0) { - this.graphNodeIdsToDocIds = new int[0]; - this.docIdsToGraphNodeIds = new int[0]; - return; - } - this.graphNodeIdsToDocIds = new int[graphNodeIdsToDocIds.length]; - System.arraycopy( - graphNodeIdsToDocIds, 0, this.graphNodeIdsToDocIds, 0, graphNodeIdsToDocIds.length); - final int maxDocId = Arrays.stream(graphNodeIdsToDocIds).max().getAsInt(); - final int maxDocs = maxDocId + 1; - // We are going to assume that the number of ordinals is roughly the same as the number of - // documents in the segment, therefore, - // the mapping will not be sparse. - if (maxDocs < graphNodeIdsToDocIds.length) { - throw new IllegalStateException( - "Max docs " - + maxDocs - + " is less than the number of ordinals " - + graphNodeIdsToDocIds.length); - } - // When maxDocId > graphNodeIdsToDocIds.length, there are lots of deleted documents or missing - // values, which wastes memory - this.docIdsToGraphNodeIds = new int[maxDocs]; - Arrays.fill(this.docIdsToGraphNodeIds, -1); // -1 means no mapping to ordinal - for (int ord = 0; ord < graphNodeIdsToDocIds.length; ord++) { - this.docIdsToGraphNodeIds[graphNodeIdsToDocIds[ord]] = ord; - } - } - public GraphNodeIdToDocMap(DocsWithFieldSet docs) { this.graphNodeIdsToDocIds = new int[docs.cardinality()]; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFloatVectorValues.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFloatVectorValues.java index 2af052ee2789..0adc733f8dc6 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFloatVectorValues.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFloatVectorValues.java @@ -72,6 +72,10 @@ public VectorFloat vectorFloatValue(int ord) { return view.getVector(ord); } + public void getVectorInto(int node, VectorFloat vector, int offset) { + view.getVectorInto(node, vector, offset); + } + @Override public DocIndexIterator iterator() { assert view.liveNodes() instanceof MatchAllBits : "All OnDiskGraphIndex nodes must be live"; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java index e348cfb5f831..38670eb23157 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java @@ -26,7 +26,6 @@ import io.github.jbellis.jvector.graph.ListRandomAccessVectorValues; import io.github.jbellis.jvector.graph.OnHeapGraphIndex; import io.github.jbellis.jvector.graph.RandomAccessVectorValues; -import io.github.jbellis.jvector.graph.RemappedRandomAccessVectorValues; import io.github.jbellis.jvector.graph.disk.OnDiskSequentialGraphIndexWriter; import io.github.jbellis.jvector.graph.disk.feature.Feature; import 
io.github.jbellis.jvector.graph.disk.feature.FeatureId; @@ -49,11 +48,10 @@ import java.util.stream.IntStream; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.KnnFieldVectorsWriter; -import org.apache.lucene.codecs.KnnVectorsReader; import org.apache.lucene.codecs.KnnVectorsWriter; +import org.apache.lucene.index.DocIDMerger; import org.apache.lucene.index.DocsWithFieldSet; import org.apache.lucene.index.FieldInfo; -import org.apache.lucene.index.FieldInfos; import org.apache.lucene.index.FloatVectorValues; import org.apache.lucene.index.IndexFileNames; import org.apache.lucene.index.KnnVectorValues; @@ -62,10 +60,8 @@ import org.apache.lucene.index.Sorter; import org.apache.lucene.index.VectorEncoding; import org.apache.lucene.index.VectorSimilarityFunction; -import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; -import org.apache.lucene.util.Bits; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.RamUsageEstimator; @@ -208,8 +204,7 @@ public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOE case BYTE: throw new UnsupportedEncodingException("Byte vectors are not supported in JVector."); case FLOAT32: - final var mergeRavv = new RandomAccessMergedFloatVectorValues(fieldInfo, mergeState); - mergeRavv.merge(); + mergeAndWriteField(fieldInfo, mergeState); break; } } catch (Exception e) { @@ -562,261 +557,167 @@ static io.github.jbellis.jvector.vector.VectorSimilarityFunction getVectorSimila }; } - /** - * Implementation of RandomAccessVectorValues that directly uses the source FloatVectorValues from - * multiple segments without copying the vectors. - * - *
Some details about the implementation logic: - *
First, we identify the leading reader, which is the one with the most live vectors. Second, - * we build a mapping between the ravv ordinals and the reader index and the ordinal in that - * reader. Third, we build a mapping between the ravv ordinals and the global doc ids. - * - *
Very important to note that for the leading graph the node Ids need to correspond to their - * original ravv ordinals in the reader. This is because we are later going to expand that graph - * with new vectors from the other readers. While the new vectors can be assigned arbitrary node - * Ids, the leading graph needs to preserve its original node Ids and map them to the original - * ravv vector ordinals. - */ - class RandomAccessMergedFloatVectorValues implements RandomAccessVectorValues { - private static final int READER_ID = 0; - private static final int READER_ORD = 1; - - // Array of sub-readers - private final KnnVectorsReader[] readers; - private final FloatVectorValues[] perReaderFloatVectorValues; - - // Maps the ravv ordinals to the reader index and the ordinal in that reader. This is allowing - // us to get a unified view of all the - // vectors in all the readers with a single unified ordinal space. - private final int[][] ravvOrdToReaderMapping; - - // Total number of vectors - private final int size; - // Total number of documents including those without values - private final int totalDocsCount; + private void mergeAndWriteField(FieldInfo fieldInfo, MergeState mergeState) throws IOException { + assert fieldInfo.hasVectorValues(); + final int dimension = fieldInfo.getVectorDimension(); + final int mergeCount = mergeState.knnVectorsReaders.length; + + // Collect the sub-readers into a list to make a DocIdMerger + final List subs = new ArrayList<>(mergeCount); + final FloatVectorValues[] vectors = new FloatVectorValues[mergeCount]; + for (int i = 0; i < mergeCount; ++i) { + if (false == MergedVectorValues.hasVectorValues(mergeState.fieldInfos[i], fieldInfo.name)) { + continue; + } + final var reader = mergeState.knnVectorsReaders[i]; + if (reader == null) { + continue; + } + final var values = reader.getFloatVectorValues(fieldInfo.name); + if (values == null || values.size() == 0) { + continue; + } - // Vector dimension - private final int dimension; - private final FieldInfo fieldInfo; - private final GraphNodeIdToDocMap graphNodeIdToDocMap; - private final int[] graphNodeIdsToRavvOrds; - private final int pqReaderIndex; - private final ProductQuantization pq; - - /** - * Creates a random access view over merged float vector values. 
- * - * @param fieldInfo Field info for the vector field - * @param mergeState Merge state containing readers and doc maps - */ - public RandomAccessMergedFloatVectorValues(FieldInfo fieldInfo, MergeState mergeState) - throws IOException { - this.totalDocsCount = Math.toIntExact(Arrays.stream(mergeState.maxDocs).asLongStream().sum()); - this.fieldInfo = fieldInfo; - this.dimension = fieldInfo.getVectorDimension(); - - final String fieldName = fieldInfo.name; - - // Count total vectors, collect readers and identify leading reader, collect base ordinals to - // later be used to build the mapping - // between global ordinals and global lucene doc ids - int totalVectorsCount = 0; - int totalLiveVectorsCount = 0; - int pqReaderIndex = -1; - ProductQuantization pq = null; - int vectorsCountInLeadingReader = -1; - this.readers = mergeState.knnVectorsReaders.clone(); - final MergeState.DocMap[] docMaps = mergeState.docMaps.clone(); - final Bits[] liveDocs = mergeState.liveDocs.clone(); - final int[] baseOrds = new int[mergeState.knnVectorsReaders.length]; - - // Find the leading reader, count the total number of live vectors, and the base ordinals for - // each reader - for (int i = 0; i < mergeState.knnVectorsReaders.length; i++) { - FieldInfos fieldInfos = mergeState.fieldInfos[i]; - baseOrds[i] = totalVectorsCount; - if (MergedVectorValues.hasVectorValues(fieldInfos, fieldName)) { - KnnVectorsReader reader = mergeState.knnVectorsReaders[i].unwrapReaderForField(fieldName); - if (reader != null) { - FloatVectorValues values = reader.getFloatVectorValues(fieldName); - if (values != null) { - int vectorCountInReader = values.size(); - int liveVectorCountInReader = 0; - KnnVectorValues.DocIndexIterator it = values.iterator(); - while (it.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { - if (liveDocs[i] == null || liveDocs[i].get(it.docID())) { - liveVectorCountInReader++; - } - } - if (reader instanceof JVectorReader jVectorReader - && liveVectorCountInReader >= vectorsCountInLeadingReader) { - vectorsCountInLeadingReader = liveVectorCountInReader; - final var maybeNewPq = jVectorReader.getProductQuantizationForField(fieldName); - if (maybeNewPq.isPresent()) { - pqReaderIndex = i; - pq = maybeNewPq.get(); - } - } - totalVectorsCount += vectorCountInReader; - totalLiveVectorsCount += liveVectorCountInReader; - assert values.dimension() == dimension; - } + assert values.dimension() == dimension; + subs.add(new SubFloatVectors(mergeState.docMaps[i], i, values)); + vectors[i] = values; + } + + // These arrays may be larger than strictly necessary if there are deleted docs/missing fields + final int totalMaxDocs = Arrays.stream(mergeState.maxDocs).reduce(0, Math::addExact); + final int[] liveDocCounts = new int[mergeCount]; + final DocsWithFieldSet docIds = new DocsWithFieldSet(); + final int[] ordToReaderIndex = new int[totalMaxDocs]; + final int[] ordToReaderOrd = new int[totalMaxDocs]; + + // Construct ordinal mappings for the new graph + int ord = 0; + final var docIdMerger = DocIDMerger.of(subs, mergeState.needsIndexSort); + for (var sub = docIdMerger.next(); sub != null; sub = docIdMerger.next()) { + final int readerIndex = sub.readerIndex; + liveDocCounts[readerIndex] += 1; + docIds.add(sub.mappedDocID); + ordToReaderIndex[ord] = sub.readerIndex; + ordToReaderOrd[ord] = sub.index(); + ord += 1; + } + + // Make a RandomAccessVectorValues instance using the new graph ordinals + final int totalLiveDocsCount = ord; + final var ravv = + new RandomAccessMergedFloatVectorValues( + totalLiveDocsCount, + 
dimension, + vectors, + i -> ordToReaderIndex[i], + i -> ordToReaderOrd[i]); + + // Find the largest quantized reader to re-use its PQ codebook, if possible + int largestQuantizedReaderIndex = 0; + ProductQuantization pq = null; + for (int i = 0; i < liveDocCounts.length; ++i) { + if (liveDocCounts[i] > liveDocCounts[largestQuantizedReaderIndex]) { + if (mergeState.knnVectorsReaders[i] instanceof JVectorReader jVectorReader) { + final var maybeNewPq = jVectorReader.getProductQuantizationForField(fieldInfo.name); + if (maybeNewPq.isPresent()) { + largestQuantizedReaderIndex = i; + pq = maybeNewPq.get(); } } } + } - assert (totalVectorsCount <= totalDocsCount) - : "Total number of vectors exceeds the total number of documents"; - assert (totalLiveVectorsCount <= totalVectorsCount) - : "Total number of live vectors exceeds the total number of vectors"; - assert (dimension > 0) : "No vectors found for field " + fieldName; - - this.pq = pq; - this.pqReaderIndex = pqReaderIndex; - this.size = totalVectorsCount; - this.perReaderFloatVectorValues = new FloatVectorValues[readers.length]; - - // Build mapping from global ordinal to [readerIndex, readerOrd] - this.ravvOrdToReaderMapping = new int[totalDocsCount][2]; - - int documentsIterated = 0; - - // Will be used to build the new graphNodeIdToDocMap with the new graph node id to docId - // mapping. - // This mapping should not be used to access the vectors at any time during construction, but - // only after the merge is complete - // and the new segment is created and used by searchers. - final int[] graphNodeIdToDocIds = new int[totalLiveVectorsCount]; - this.graphNodeIdsToRavvOrds = new int[totalLiveVectorsCount]; - - int graphNodeId = 0; - // Build a new graph from scratch and compact the graph node ids - for (int readerIdx = 0; readerIdx < readers.length; readerIdx++) { - if (readers[readerIdx] == null) { + // Perform PQ if applicable + final PQVectors pqVectors; + if (pq != null) { + // Refine the leadingCompressor with the remaining vectors in the merge + ProductQuantization newPq = pq; + for (int i = 0; i < mergeCount; i++) { + if (i == largestQuantizedReaderIndex || vectors[i] == null) { + // Skip the reader associated with the re-used PQ codebook continue; } - final FloatVectorValues values = readers[readerIdx].getFloatVectorValues(fieldName); - if (values == null || values.size() == 0) { - continue; - } - perReaderFloatVectorValues[readerIdx] = values; - // For each vector in this reader - KnnVectorValues.DocIndexIterator it = values.iterator(); - - for (int docId = it.nextDoc(); - docId != DocIdSetIterator.NO_MORE_DOCS; - docId = it.nextDoc()) { - if (docMaps[readerIdx].get(docId) != -1) { - // Mapping from ravv ordinals to [readerIndex, readerOrd] - // Map graph node id to ravv ordinal - // Map graph node id to doc id - final int newGlobalDocId = docMaps[readerIdx].get(docId); - final int ravvLocalOrd = it.index(); - final int ravvGlobalOrd = ravvLocalOrd + baseOrds[readerIdx]; - graphNodeIdToDocIds[graphNodeId] = newGlobalDocId; - graphNodeIdsToRavvOrds[graphNodeId] = ravvGlobalOrd; - graphNodeId++; - ravvOrdToReaderMapping[ravvGlobalOrd][READER_ID] = readerIdx; // Reader index - ravvOrdToReaderMapping[ravvGlobalOrd][READER_ORD] = ravvLocalOrd; // Ordinal in reader - } - - documentsIterated++; - } + final FloatVectorValues values = vectors[i]; + final RandomAccessVectorValues randomAccessVectorValues = + new RandomAccessVectorValuesOverVectorValues(values); + newPq = newPq.refine(randomAccessVectorValues); } + pqVectors = 
newPq.encodeAll(ravv, SIMD_POOL_MERGE); + } else if (ravv.size() >= minimumBatchSizeForQuantization) { + // No pre-existing codebooks, check if we have enough vectors to trigger quantization + pqVectors = getPQVectors(ravv, fieldInfo); + } else { + pqVectors = null; + } + + final BuildScoreProvider buildScoreProvider; + final var similarityFunction = getVectorSimilarityFunction(fieldInfo); + if (pqVectors != null) { + // Re-use PQ codebooks to build a new graph from scratch + buildScoreProvider = BuildScoreProvider.pqBuildScoreProvider(similarityFunction, pqVectors); + // Pre-init the diversity provider here to avoid doing it lazily (as it could block the SIMD + // threads) + buildScoreProvider.diversityProviderFor(0); + } else { + buildScoreProvider = BuildScoreProvider.randomAccessScoreProvider(ravv, similarityFunction); + } + final var graphNodeIdToDocMap = new GraphNodeIdToDocMap(docIds); + final var graph = + getGraph( + buildScoreProvider, + ravv, + fieldInfo, + segmentWriteState.segmentInfo.name, + SIMD_POOL_MERGE); + writeField(fieldInfo, ravv, pqVectors, graphNodeIdToDocMap, graph); + } - if (documentsIterated < totalVectorsCount) { - throw new IllegalStateException( - "More documents were expected than what was found in the readers." - + "Expected at least number of total vectors: " - + totalVectorsCount - + " but found only: " - + documentsIterated - + " documents."); - } + private static final class SubFloatVectors extends DocIDMerger.Sub { + final int readerIndex; + final KnnVectorValues.DocIndexIterator iterator; + int docId = -1; - this.graphNodeIdToDocMap = new GraphNodeIdToDocMap(graphNodeIdToDocIds); - } - - /** - * Merges the float vector values from multiple readers into a unified structure. This process - * includes handling product quantization (PQ) for vector compression, generating ord-to-doc - * mappings, and writing the merged index into a new segment file. - * - *
The method determines if pre-existing product quantization codebooks are available from - * the leading reader. If available, it refines them using remaining vectors from other readers - * in the merge. If no pre-existing codebooks are found and the total vector count meets the - * required minimum threshold, new codebooks and compressed vectors are computed. Otherwise, no - * PQ compression is applied. - * - *
Also, it generates a mapping of ordinals to document IDs by iterating through the provided - * vector data, which is further used to write the field data. - * - *
In the event of no deletes or quantization, the graph construction is done by - * incrementally adding vectors from smaller segments into the largest segment. For all other - * cases, we build a new graph from scratch from all the vectors. - * - *
TODO: Add support for incremental graph building with quantization see issue - * - * @throws IOException if there is an issue during reading or writing vector data. - */ - public void merge() throws IOException { - final RandomAccessVectorValues mapped = - new RemappedRandomAccessVectorValues(this, graphNodeIdsToRavvOrds); - // This section creates the PQVectors to be used for this merge - // Get PQ compressor for leading reader - final String fieldName = fieldInfo.name; - final PQVectors pqVectors; - // Check if the leading reader has pre-existing PQ codebooks and if so, refine them with the - // remaining vectors - if (pq != null) { - // Refine the leadingCompressor with the remaining vectors in the merge - ProductQuantization newPq = pq; - for (int i = 0; i < readers.length; i++) { - if (i == pqReaderIndex) { - // Skip the reader associated with the re-used PQ codebook - continue; - } - final FloatVectorValues values = readers[i].getFloatVectorValues(fieldName); - final RandomAccessVectorValues randomAccessVectorValues = - new RandomAccessVectorValuesOverVectorValues(values); - newPq = newPq.refine(randomAccessVectorValues); - } - pqVectors = newPq.encodeAll(mapped, SIMD_POOL_MERGE); - } else if (mapped.size() >= minimumBatchSizeForQuantization) { - // No pre-existing codebooks, check if we have enough vectors to trigger quantization - pqVectors = getPQVectors(mapped, fieldInfo); - } else { - pqVectors = null; - } + SubFloatVectors(MergeState.DocMap docMap, int readerIndex, FloatVectorValues values) { + super(docMap); + this.readerIndex = readerIndex; + this.iterator = values.iterator(); + } - final BuildScoreProvider buildScoreProvider; - if (pqVectors != null) { - // Re-use PQ codebooks to build a new graph from scratch - buildScoreProvider = - BuildScoreProvider.pqBuildScoreProvider( - getVectorSimilarityFunction(fieldInfo), pqVectors); - // Pre-init the diversity provider here to avoid doing it lazily (as it could block the SIMD - // threads) - buildScoreProvider.diversityProviderFor(0); - } else { - buildScoreProvider = BuildScoreProvider.randomAccessScoreProvider(mapped, getVectorSimilarityFunction(fieldInfo)); - } - final OnHeapGraphIndex graph = - getGraph( - buildScoreProvider, - mapped, - fieldInfo, - segmentWriteState.segmentInfo.name, - SIMD_POOL_MERGE); + @Override + public int nextDoc() throws IOException { + docId = iterator.nextDoc(); + return docId; + } - writeField(fieldInfo, mapped, pqVectors, graphNodeIdToDocMap, graph); + public int index() { + return iterator.index(); + } + } + + private static final class RandomAccessMergedFloatVectorValues + implements RandomAccessVectorValues { + private final int size; + private final int dimension; + private final FloatVectorValues[] vectors; + private final IntUnaryOperator ordToReader; + private final IntUnaryOperator ordToReaderOrd; + + public RandomAccessMergedFloatVectorValues( + int size, + int dimension, + FloatVectorValues[] values, + IntUnaryOperator ordToReader, + IntUnaryOperator ordToReaderOrd) { + this.size = size; + this.dimension = dimension; + this.vectors = values; + this.ordToReader = ordToReader; + this.ordToReaderOrd = ordToReaderOrd; } @Override - public int size() { - return size; + public RandomAccessMergedFloatVectorValues copy() { + throw new UnsupportedOperationException(); } @Override @@ -825,25 +726,34 @@ public int dimension() { } @Override - public VectorFloat getVector(int ord) { - if (ord < 0 || ord >= totalDocsCount) { - throw new IllegalArgumentException("Ordinal out of bounds: " + ord); - } + 
public VectorFloat getVector(int nodeId) { + final var vector = VECTOR_TYPE_SUPPORT.createFloatVector(dimension); + getVectorInto(nodeId, vector, 0); + return vector; + } - final int readerIdx = ravvOrdToReaderMapping[ord][READER_ID]; - final int readerOrd = ravvOrdToReaderMapping[ord][READER_ORD]; + @Override + public void getVectorInto(int node, VectorFloat destinationVector, int offset) { + final FloatVectorValues values = vectors[ordToReader.applyAsInt(node)]; + final int ord = ordToReaderOrd.applyAsInt(node); - // Access to float values is not thread safe - synchronized (perReaderFloatVectorValues[readerIdx]) { - if (perReaderFloatVectorValues[readerIdx] instanceof JVectorFloatVectorValues values) { - return values.vectorFloatValue(readerOrd); + if (values instanceof JVectorFloatVectorValues jVectorValues) { + synchronized (this) { + jVectorValues.getVectorInto(ord, destinationVector, offset); } + } + + synchronized (this) { + final float[] srcVector; try { - return VECTOR_TYPE_SUPPORT.createFloatVector( - perReaderFloatVectorValues[readerIdx].vectorValue(readerOrd)); + srcVector = values.vectorValue(ord); } catch (IOException e) { throw new UncheckedIOException(e); } + + for (int i = 0; i < srcVector.length; ++i) { + destinationVector.set(i + offset, srcVector[i]); + } } } @@ -853,8 +763,8 @@ public boolean isValueShared() { } @Override - public RandomAccessVectorValues copy() { - throw new UnsupportedOperationException("Copy not supported"); + public int size() { + return size; } } From b2e587b7710be330e5209e9aa726589aa4571959 Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Wed, 5 Nov 2025 16:34:52 +0000 Subject: [PATCH 54/86] Remove synchronized from RandomAccessMergedFloatVectorValues --- .../jvector/JVectorFloatVectorValues.java | 8 ++-- .../sandbox/codecs/jvector/JVectorWriter.java | 38 +++++++++++-------- 2 files changed, 28 insertions(+), 18 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFloatVectorValues.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFloatVectorValues.java index 0adc733f8dc6..ccbe286c776c 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFloatVectorValues.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFloatVectorValues.java @@ -35,18 +35,20 @@ public class JVectorFloatVectorValues extends FloatVectorValues { private static final VectorTypeSupport VECTOR_TYPE_SUPPORT = VectorizationProvider.getInstance().getVectorTypeSupport(); + private final OnDiskGraphIndex index; private final OnDiskGraphIndex.View view; private final PQVectors pq; private final VectorSimilarityFunction similarityFunction; private final GraphNodeIdToDocMap graphNodeIdToDocMap; public JVectorFloatVectorValues( - OnDiskGraphIndex onDiskGraphIndex, + OnDiskGraphIndex index, PQVectors pq, VectorSimilarityFunction similarityFunction, GraphNodeIdToDocMap graphNodeIdToDocMap) throws IOException { - this.view = onDiskGraphIndex.getView(); + this.index = index; + this.view = index.getView(); this.pq = pq; this.similarityFunction = similarityFunction; this.graphNodeIdToDocMap = graphNodeIdToDocMap; @@ -94,7 +96,7 @@ public float[] vectorValue(int i) throws IOException { @Override public FloatVectorValues copy() throws IOException { - return this; + return new JVectorFloatVectorValues(index, pq, similarityFunction, graphNodeIdToDocMap); } @Override diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java 
b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java index 38670eb23157..da6917f66953 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java @@ -717,7 +717,18 @@ public RandomAccessMergedFloatVectorValues( @Override public RandomAccessMergedFloatVectorValues copy() { - throw new UnsupportedOperationException(); + final FloatVectorValues[] newVectors = new FloatVectorValues[vectors.length]; + for (int i = 0; i < newVectors.length; ++i) { + if (vectors[i] != null) { + try { + newVectors[i] = vectors[i].copy(); + } catch (IOException e) { + throw new UncheckedIOException(e); + } + } + } + return new RandomAccessMergedFloatVectorValues( + size, dimension, newVectors, ordToReader, ordToReaderOrd); } @Override @@ -738,28 +749,25 @@ public void getVectorInto(int node, VectorFloat destinationVector, int offset final int ord = ordToReaderOrd.applyAsInt(node); if (values instanceof JVectorFloatVectorValues jVectorValues) { - synchronized (this) { - jVectorValues.getVectorInto(ord, destinationVector, offset); - } + jVectorValues.getVectorInto(ord, destinationVector, offset); } - synchronized (this) { - final float[] srcVector; - try { - srcVector = values.vectorValue(ord); - } catch (IOException e) { - throw new UncheckedIOException(e); - } + final float[] srcVector; + try { + srcVector = values.vectorValue(ord); + } catch (IOException e) { + throw new UncheckedIOException(e); + } - for (int i = 0; i < srcVector.length; ++i) { - destinationVector.set(i + offset, srcVector[i]); - } + for (int i = 0; i < srcVector.length; ++i) { + destinationVector.set(i + offset, srcVector[i]); } } @Override public boolean isValueShared() { - return false; + // force thread-local copies + return true; } @Override From 58776c315992d2a9bc9122703f3bdc7e955d1e42 Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Wed, 5 Nov 2025 20:50:25 +0000 Subject: [PATCH 55/86] Add BaseKnnVectorsFormatTestCase to TestJVectorFormat --- .../codecs/jvector/TestJVectorFormat.java | 56 ++++++++++++++++++- 1 file changed, 54 insertions(+), 2 deletions(-) diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/jvector/TestJVectorFormat.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/jvector/TestJVectorFormat.java index 0c46a50a8b61..0f1413bbcce9 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/jvector/TestJVectorFormat.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/jvector/TestJVectorFormat.java @@ -17,6 +17,10 @@ package org.apache.lucene.sandbox.codecs.jvector; +import static org.apache.lucene.index.VectorEncoding.FLOAT32; +import static org.apache.lucene.index.VectorSimilarityFunction.COSINE; +import static org.apache.lucene.index.VectorSimilarityFunction.DOT_PRODUCT; +import static org.apache.lucene.index.VectorSimilarityFunction.EUCLIDEAN; import static org.apache.lucene.sandbox.codecs.jvector.JVectorFormat.DEFAULT_MINIMUM_BATCH_SIZE_FOR_QUANTIZATION; import com.carrotsearch.randomizedtesting.ThreadFilter; @@ -33,11 +37,13 @@ import org.apache.lucene.search.*; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; +import org.apache.lucene.tests.index.BaseKnnVectorsFormatTestCase; import org.apache.lucene.tests.index.RandomIndexWriter; import org.apache.lucene.tests.util.LuceneTestCase; import org.apache.lucene.tests.util.TestUtil; import 
org.apache.lucene.util.NamedThreadFactory; import org.junit.Assert; +import org.junit.Ignore; import org.junit.Test; /** Test used specifically for JVector */ @@ -49,10 +55,42 @@ @ThreadLeakFilters( defaultFilters = true, filters = {TestJVectorFormat.ThreadLeakFilter.class}) -public class TestJVectorFormat extends LuceneTestCase { +public class TestJVectorFormat extends BaseKnnVectorsFormatTestCase { + private static final VectorEncoding[] SUPPORTED_ENCODINGS = {FLOAT32}; + private static final VectorSimilarityFunction[] SUPPORTED_FUNCTIONS = { + DOT_PRODUCT, EUCLIDEAN, COSINE + }; private static final String TEST_FIELD = "test_field"; private static final String TEST_ID_FIELD = "id"; + @Override + @Ignore("Does not honor visitedLimit") + public void testSearchWithVisitedLimit() {} + + @Override + @Ignore("Does not support byte vectors") + public void testByteVectorScorerIteration() {} + + @Override + @Ignore("Does not support byte vectors") + public void testMismatchedFields() {} + + @Override + @Ignore("Does not support byte vectors") + public void testSortedIndexBytes() {} + + @Override + @Ignore("Does not support byte vectors") + public void testRandomBytes() {} + + @Override + @Ignore("Does not support byte vectors") + public void testEmptyByteVectorData() {} + + @Override + @Ignore("Does not support byte vectors") + public void testMergingWithDifferentByteKnnFields() {} + /** * Test to verify that the JVector codec is able to successfully search for the nearest neighbours * in the index. Single field is used to store the vectors. All the documents are stored in a @@ -1553,7 +1591,21 @@ static float[][] generateRandomVectors(int count, int dimension) { return vectors; } - private Codec getCodec() { + @Override + protected VectorEncoding randomVectorEncoding() { + return SUPPORTED_ENCODINGS[random().nextInt(SUPPORTED_ENCODINGS.length)]; + } + + @Override + protected VectorSimilarityFunction randomSimilarity() { + return SUPPORTED_FUNCTIONS[random().nextInt(SUPPORTED_FUNCTIONS.length)]; + } + + @Override + protected void assertOffHeapByteSize(LeafReader r, String fieldName) throws IOException {} + + @Override + protected Codec getCodec() { return getCodec(JVectorFormat.DEFAULT_MINIMUM_BATCH_SIZE_FOR_QUANTIZATION); } From fa2908e2ae3374a53dc777d6eb1cb9f80aa29f70 Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Wed, 19 Nov 2025 17:17:00 +0000 Subject: [PATCH 56/86] Remove explicit SIMD pools --- .../sandbox/codecs/jvector/JVectorFormat.java | 25 ----------- .../sandbox/codecs/jvector/JVectorWriter.java | 44 +++++++++---------- 2 files changed, 20 insertions(+), 49 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFormat.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFormat.java index af727760c375..f917d835f5f0 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFormat.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFormat.java @@ -18,8 +18,6 @@ package org.apache.lucene.sandbox.codecs.jvector; import java.io.IOException; -import java.util.concurrent.ForkJoinPool; -import java.util.concurrent.ForkJoinWorkerThread; import java.util.function.IntUnaryOperator; import org.apache.lucene.codecs.KnnVectorsFormat; import org.apache.lucene.codecs.KnnVectorsReader; @@ -44,10 +42,6 @@ public class JVectorFormat extends KnnVectorsFormat { public static final float DEFAULT_NEIGHBOR_OVERFLOW = 2f; public static final float DEFAULT_ALPHA = 2f; public static 
final boolean DEFAULT_HIERARCHY_ENABLED = true; - // Unfortunately, this can't be managed yet by the OpenSearch ThreadPool because it's not - // supporting {@link ForkJoinPool} types - public static final ForkJoinPool SIMD_POOL_MERGE = getPhysicalCoreExecutor(); - public static final ForkJoinPool SIMD_POOL_FLUSH = getPhysicalCoreExecutor(); private final int maxConn; private final int beamWidth; @@ -187,23 +181,4 @@ public static int getDefaultNumberOfSubspacesPerVector(int originalDimension) { } return compressedBytes; } - - public static ForkJoinPool getPhysicalCoreExecutor() { - final int estimatedPhysicalCoreCount = - Integer.getInteger( - "jvector.physical_core_count", - Math.max(1, Runtime.getRuntime().availableProcessors() / 2)); - assert estimatedPhysicalCoreCount > 0 - && estimatedPhysicalCoreCount <= Runtime.getRuntime().availableProcessors() - : "Invalid core count: " + estimatedPhysicalCoreCount; - final ForkJoinPool.ForkJoinWorkerThreadFactory factory = - pool -> { - ForkJoinWorkerThread thread = - ForkJoinPool.defaultForkJoinWorkerThreadFactory.newThread(pool); - thread.setPriority(Thread.NORM_PRIORITY - 2); - return thread; - }; - - return new ForkJoinPool(estimatedPhysicalCoreCount, factory, null, true); - } } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java index da6917f66953..6966a7d1c45a 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java @@ -17,10 +17,7 @@ package org.apache.lucene.sandbox.codecs.jvector; -import static io.github.jbellis.jvector.quantization.KMeansPlusPlusClusterer.UNWEIGHTED; import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsReader.readVectorEncoding; -import static org.apache.lucene.sandbox.codecs.jvector.JVectorFormat.SIMD_POOL_FLUSH; -import static org.apache.lucene.sandbox.codecs.jvector.JVectorFormat.SIMD_POOL_MERGE; import io.github.jbellis.jvector.graph.GraphIndexBuilder; import io.github.jbellis.jvector.graph.ListRandomAccessVectorValues; @@ -43,7 +40,8 @@ import java.util.Arrays; import java.util.Collections; import java.util.List; -import java.util.concurrent.ForkJoinPool; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.Executor; import java.util.function.IntUnaryOperator; import java.util.stream.IntStream; import org.apache.lucene.codecs.CodecUtil; @@ -242,7 +240,7 @@ public void flush(int maxDoc, Sorter.DocMap sortMap) throws IOException { randomAccessVectorValues, fieldInfo, segmentWriteState.segmentInfo.name, - SIMD_POOL_FLUSH); + Runnable::run); writeField(field.fieldInfo, randomAccessVectorValues, pqVectors, graphNodeIdToDocMap, graph); } } @@ -356,17 +354,15 @@ private PQVectors getPQVectors( final var numberOfClustersPerSubspace = Math.min(256, randomAccessVectorValues.size()); // number of centroids per // subspace + ProductQuantization pq = ProductQuantization.compute( randomAccessVectorValues, - M, // number of subspaces - numberOfClustersPerSubspace, // number of centroids per subspace - vectorSimilarityFunction == VectorSimilarityFunction.EUCLIDEAN, // center the dataset - UNWEIGHTED, - SIMD_POOL_MERGE, - ForkJoinPool.commonPool()); - - return pq.encodeAll(randomAccessVectorValues, SIMD_POOL_MERGE); + M, + numberOfClustersPerSubspace, + vectorSimilarityFunction == VectorSimilarityFunction.EUCLIDEAN); + + return 
(PQVectors) pq.encodeAll(randomAccessVectorValues); } /// Metadata about the index to be persisted on disk @@ -642,7 +638,8 @@ private void mergeAndWriteField(FieldInfo fieldInfo, MergeState mergeState) thro new RandomAccessVectorValuesOverVectorValues(values); newPq = newPq.refine(randomAccessVectorValues); } - pqVectors = newPq.encodeAll(ravv, SIMD_POOL_MERGE); + newPq.encodeAll(ravv); + pqVectors = (PQVectors) newPq.encodeAll(ravv); } else if (ravv.size() >= minimumBatchSizeForQuantization) { // No pre-existing codebooks, check if we have enough vectors to trigger quantization pqVectors = getPQVectors(ravv, fieldInfo); @@ -668,7 +665,7 @@ private void mergeAndWriteField(FieldInfo fieldInfo, MergeState mergeState) thro ravv, fieldInfo, segmentWriteState.segmentInfo.name, - SIMD_POOL_MERGE); + mergeState.intraMergeTaskExecutor); writeField(fieldInfo, ravv, pqVectors, graphNodeIdToDocMap, graph); } @@ -786,7 +783,7 @@ public OnHeapGraphIndex getGraph( RandomAccessVectorValues randomAccessVectorValues, FieldInfo fieldInfo, String segmentName, - ForkJoinPool SIMD_POOL) { + Executor executor) { final GraphIndexBuilder graphIndexBuilder = new GraphIndexBuilder( buildScoreProvider, @@ -804,17 +801,16 @@ public OnHeapGraphIndex getGraph( * This is the case when we are merging segments and we might have more documents than vectors. */ final OnHeapGraphIndex graphIndex; - var vv = randomAccessVectorValues.threadLocalSupplier(); + final var vv = randomAccessVectorValues.threadLocalSupplier(); // parallel graph construction from the merge documents Ids final int size = randomAccessVectorValues.size(); - SIMD_POOL - .submit( - () -> - IntStream.range(0, size) - .parallel() - .forEach(ord -> graphIndexBuilder.addGraphNode(ord, vv.get().getVector(ord)))) - .join(); + IntStream.range(0, size) + .mapToObj( + ord -> + CompletableFuture.runAsync( + () -> graphIndexBuilder.addGraphNode(ord, vv.get().getVector(ord)), executor)) + .forEach(CompletableFuture::join); graphIndexBuilder.cleanup(); graphIndex = (OnHeapGraphIndex) graphIndexBuilder.getGraph(); From 3ad86d8514c8dbf6f11ab9a64b6e8c4e8ce51f2c Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Wed, 19 Nov 2025 18:56:03 +0000 Subject: [PATCH 57/86] Replace VectorSimilarityMapper with simple switch --- .../sandbox/codecs/jvector/JVectorFormat.java | 20 +++++++ .../sandbox/codecs/jvector/JVectorReader.java | 57 +------------------ .../sandbox/codecs/jvector/JVectorWriter.java | 45 ++++++--------- 3 files changed, 38 insertions(+), 84 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFormat.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFormat.java index f917d835f5f0..25b0b3da6d5c 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFormat.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFormat.java @@ -181,4 +181,24 @@ public static int getDefaultNumberOfSubspacesPerVector(int originalDimension) { } return compressedBytes; } + + static io.github.jbellis.jvector.vector.VectorSimilarityFunction toJVectorSimilarity( + final org.apache.lucene.index.VectorSimilarityFunction luceneFunction) { + return switch (luceneFunction) { + case COSINE -> io.github.jbellis.jvector.vector.VectorSimilarityFunction.COSINE; + case DOT_PRODUCT -> io.github.jbellis.jvector.vector.VectorSimilarityFunction.DOT_PRODUCT; + case EUCLIDEAN -> io.github.jbellis.jvector.vector.VectorSimilarityFunction.EUCLIDEAN; + case MAXIMUM_INNER_PRODUCT -> 
+ throw new UnsupportedOperationException("JVector does not support MAXIMUM_INNER_PRODUCT"); + }; + } + + static org.apache.lucene.index.VectorSimilarityFunction toLuceneSimilarity( + final io.github.jbellis.jvector.vector.VectorSimilarityFunction jVectorFunction) { + return switch (jVectorFunction) { + case COSINE -> org.apache.lucene.index.VectorSimilarityFunction.COSINE; + case DOT_PRODUCT -> org.apache.lucene.index.VectorSimilarityFunction.DOT_PRODUCT; + case EUCLIDEAN -> org.apache.lucene.index.VectorSimilarityFunction.EUCLIDEAN; + }; + } } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java index ed2564130dad..3dc056096ac0 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java @@ -33,7 +33,6 @@ import java.io.Closeable; import java.io.IOException; import java.util.HashMap; -import java.util.List; import java.util.Map; import java.util.Optional; import org.apache.lucene.codecs.CodecUtil; @@ -264,9 +263,7 @@ class FieldEntry implements Closeable { public FieldEntry( FieldInfo fieldInfo, JVectorWriter.VectorIndexFieldMetadata vectorIndexFieldMetadata) throws IOException { - this.similarityFunction = - VectorSimilarityMapper.ordToDistFunc( - vectorIndexFieldMetadata.vectorSimilarityFunction.ordinal()); + this.similarityFunction = vectorIndexFieldMetadata.vectorSimilarityFunction; this.vectorDimension = vectorIndexFieldMetadata.vectorDimension; this.vectorIndexOffset = vectorIndexFieldMetadata.vectorIndexOffset; this.vectorIndexLength = vectorIndexFieldMetadata.vectorIndexLength; @@ -325,56 +322,4 @@ public void close() throws IOException { } } } - - /** Utility class to map between Lucene and jVector similarity functions and metadata ordinals. */ - public static class VectorSimilarityMapper { - /** - * List of vector similarity functions supported by jVector library The similarity functions orders - * matter in this list because it is later used to resolve the similarity function by ordinal. 
- */ - public static final List JVECTOR_SUPPORTED_SIMILARITY_FUNCTIONS = - List.of( - VectorSimilarityFunction.EUCLIDEAN, - VectorSimilarityFunction.DOT_PRODUCT, - VectorSimilarityFunction.COSINE); - - public static final Map< - org.apache.lucene.index.VectorSimilarityFunction, VectorSimilarityFunction> - LUCENE_TO_JVECTOR_MAP = - Map.of( - org.apache.lucene.index.VectorSimilarityFunction.EUCLIDEAN, - VectorSimilarityFunction.EUCLIDEAN, - org.apache.lucene.index.VectorSimilarityFunction.DOT_PRODUCT, - VectorSimilarityFunction.DOT_PRODUCT, - org.apache.lucene.index.VectorSimilarityFunction.COSINE, - VectorSimilarityFunction.COSINE); - - public static int distFuncToOrd(org.apache.lucene.index.VectorSimilarityFunction func) { - if (LUCENE_TO_JVECTOR_MAP.containsKey(func)) { - return JVECTOR_SUPPORTED_SIMILARITY_FUNCTIONS.indexOf(LUCENE_TO_JVECTOR_MAP.get(func)); - } - - throw new IllegalArgumentException("invalid distance function: " + func); - } - - public static VectorSimilarityFunction ordToDistFunc(int ord) { - return JVECTOR_SUPPORTED_SIMILARITY_FUNCTIONS.get(ord); - } - - public static org.apache.lucene.index.VectorSimilarityFunction ordToLuceneDistFunc(int ord) { - if (ord < 0 || ord >= JVECTOR_SUPPORTED_SIMILARITY_FUNCTIONS.size()) { - throw new IllegalArgumentException("Invalid ordinal: " + ord); - } - VectorSimilarityFunction jvectorFunc = JVECTOR_SUPPORTED_SIMILARITY_FUNCTIONS.get(ord); - for (Map.Entry - entry : LUCENE_TO_JVECTOR_MAP.entrySet()) { - if (entry.getValue().equals(jvectorFunc)) { - return entry.getKey(); - } - } - throw new IllegalStateException( - "No matching Lucene VectorSimilarityFunction found for ordinal: " + ord); - } - } } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java index 6966a7d1c45a..e4d2fe441b62 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java @@ -30,6 +30,7 @@ import io.github.jbellis.jvector.graph.similarity.BuildScoreProvider; import io.github.jbellis.jvector.quantization.PQVectors; import io.github.jbellis.jvector.quantization.ProductQuantization; +import io.github.jbellis.jvector.vector.VectorSimilarityFunction; import io.github.jbellis.jvector.vector.VectorizationProvider; import io.github.jbellis.jvector.vector.types.VectorFloat; import io.github.jbellis.jvector.vector.types.VectorTypeSupport; @@ -57,7 +58,6 @@ import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.Sorter; import org.apache.lucene.index.VectorEncoding; -import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.util.IOUtils; @@ -224,13 +224,15 @@ public void flush(int maxDoc, Sorter.DocMap sortMap) throws IOException { pqVectors = getPQVectors(randomAccessVectorValues, fieldInfo); buildScoreProvider = BuildScoreProvider.pqBuildScoreProvider( - getVectorSimilarityFunction(fieldInfo), pqVectors); + JVectorFormat.toJVectorSimilarity(fieldInfo.getVectorSimilarityFunction()), + pqVectors); } else { // Not enough vectors for quantization; use full precision vectors instead pqVectors = null; buildScoreProvider = BuildScoreProvider.randomAccessScoreProvider( - randomAccessVectorValues, getVectorSimilarityFunction(fieldInfo)); + randomAccessVectorValues, + 
JVectorFormat.toJVectorSimilarity(fieldInfo.getVectorSimilarityFunction())); } final GraphNodeIdToDocMap graphNodeIdToDocMap = field.createGraphNodeIdToDocMap(); @@ -295,7 +297,7 @@ private VectorIndexFieldMetadata writeGraph( return new VectorIndexFieldMetadata( fieldInfo.number, fieldInfo.getVectorEncoding(), - fieldInfo.getVectorSimilarityFunction(), + JVectorFormat.toJVectorSimilarity(fieldInfo.getVectorSimilarityFunction()), randomAccessVectorValues.dimension(), 0, 0, @@ -333,7 +335,7 @@ private VectorIndexFieldMetadata writeGraph( return new VectorIndexFieldMetadata( fieldInfo.number, fieldInfo.getVectorEncoding(), - fieldInfo.getVectorSimilarityFunction(), + JVectorFormat.toJVectorSimilarity(fieldInfo.getVectorSimilarityFunction()), randomAccessVectorValues.dimension(), startOffset, endGraphOffset - startOffset, @@ -347,8 +349,11 @@ private VectorIndexFieldMetadata writeGraph( private PQVectors getPQVectors( RandomAccessVectorValues randomAccessVectorValues, FieldInfo fieldInfo) throws IOException { - final VectorSimilarityFunction vectorSimilarityFunction = - fieldInfo.getVectorSimilarityFunction(); + final boolean globallyCenter = + switch (fieldInfo.getVectorSimilarityFunction()) { + case EUCLIDEAN -> true; + case COSINE, DOT_PRODUCT, MAXIMUM_INNER_PRODUCT -> false; + }; final int M = numberOfSubspacesPerVectorSupplier.applyAsInt(randomAccessVectorValues.dimension()); final var numberOfClustersPerSubspace = @@ -357,10 +362,7 @@ private PQVectors getPQVectors( ProductQuantization pq = ProductQuantization.compute( - randomAccessVectorValues, - M, - numberOfClustersPerSubspace, - vectorSimilarityFunction == VectorSimilarityFunction.EUCLIDEAN); + randomAccessVectorValues, M, numberOfClustersPerSubspace, globallyCenter); return (PQVectors) pq.encodeAll(randomAccessVectorValues); } @@ -404,7 +406,7 @@ public VectorIndexFieldMetadata( public void toOutput(IndexOutput out) throws IOException { out.writeInt(fieldNumber); out.writeInt(vectorEncoding.ordinal()); - out.writeInt(JVectorReader.VectorSimilarityMapper.distFuncToOrd(vectorSimilarityFunction)); + out.writeInt(vectorSimilarityFunction.ordinal()); out.writeVInt(vectorDimension); out.writeVLong(vectorIndexOffset); out.writeVLong(vectorIndexLength); @@ -417,8 +419,7 @@ public void toOutput(IndexOutput out) throws IOException { public VectorIndexFieldMetadata(IndexInput in) throws IOException { this.fieldNumber = in.readInt(); this.vectorEncoding = readVectorEncoding(in); - this.vectorSimilarityFunction = - JVectorReader.VectorSimilarityMapper.ordToLuceneDistFunc(in.readInt()); + this.vectorSimilarityFunction = VectorSimilarityFunction.values()[in.readInt()]; this.vectorDimension = in.readVInt(); this.vectorIndexOffset = in.readVLong(); this.vectorIndexLength = in.readVLong(); @@ -540,19 +541,6 @@ public long ramBytesUsed() { } } - static io.github.jbellis.jvector.vector.VectorSimilarityFunction getVectorSimilarityFunction( - FieldInfo fieldInfo) { - return switch (fieldInfo.getVectorSimilarityFunction()) { - case EUCLIDEAN -> io.github.jbellis.jvector.vector.VectorSimilarityFunction.EUCLIDEAN; - case COSINE -> io.github.jbellis.jvector.vector.VectorSimilarityFunction.COSINE; - case DOT_PRODUCT -> io.github.jbellis.jvector.vector.VectorSimilarityFunction.DOT_PRODUCT; - // $CASES-OMITTED$ - default -> - throw new IllegalArgumentException( - "Unsupported similarity function: " + fieldInfo.getVectorSimilarityFunction()); - }; - } - private void mergeAndWriteField(FieldInfo fieldInfo, MergeState mergeState) throws IOException { assert 
fieldInfo.hasVectorValues(); final int dimension = fieldInfo.getVectorDimension(); @@ -648,7 +636,8 @@ private void mergeAndWriteField(FieldInfo fieldInfo, MergeState mergeState) thro } final BuildScoreProvider buildScoreProvider; - final var similarityFunction = getVectorSimilarityFunction(fieldInfo); + final var similarityFunction = + JVectorFormat.toJVectorSimilarity(fieldInfo.getVectorSimilarityFunction()); if (pqVectors != null) { // Re-use PQ codebooks to build a new graph from scratch buildScoreProvider = BuildScoreProvider.pqBuildScoreProvider(similarityFunction, pqVectors); From b5255b371e3459ce608ddf7752b0300724dc25da Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Thu, 20 Nov 2025 17:45:13 +0000 Subject: [PATCH 58/86] fixup! Remove explicit SIMD pools --- .../apache/lucene/sandbox/codecs/jvector/JVectorWriter.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java index e4d2fe441b62..3461e9c4aee3 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java @@ -626,7 +626,6 @@ private void mergeAndWriteField(FieldInfo fieldInfo, MergeState mergeState) thro new RandomAccessVectorValuesOverVectorValues(values); newPq = newPq.refine(randomAccessVectorValues); } - newPq.encodeAll(ravv); pqVectors = (PQVectors) newPq.encodeAll(ravv); } else if (ravv.size() >= minimumBatchSizeForQuantization) { // No pre-existing codebooks, check if we have enough vectors to trigger quantization @@ -799,7 +798,8 @@ public OnHeapGraphIndex getGraph( ord -> CompletableFuture.runAsync( () -> graphIndexBuilder.addGraphNode(ord, vv.get().getVector(ord)), executor)) - .forEach(CompletableFuture::join); + .reduce((a, b) -> a.runAfterBoth(b, () -> {})) + .ifPresent(CompletableFuture::join); graphIndexBuilder.cleanup(); graphIndex = (OnHeapGraphIndex) graphIndexBuilder.getGraph(); From 5e2a8a0fbbedc6254f47ef8030ddd1218657165d Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Thu, 20 Nov 2025 20:45:24 +0000 Subject: [PATCH 59/86] Make RandomAccessVectorValuesOverVectorValues thread-safe --- .../sandbox/codecs/jvector/JVectorWriter.java | 22 +++++++++---------- 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java index 3461e9c4aee3..7274c660755a 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java @@ -807,8 +807,6 @@ public OnHeapGraphIndex getGraph( } static class RandomAccessVectorValuesOverVectorValues implements RandomAccessVectorValues { - private final VectorTypeSupport VECTOR_TYPE_SUPPORT = - VectorizationProvider.getInstance().getVectorTypeSupport(); private final FloatVectorValues values; public RandomAccessVectorValuesOverVectorValues(FloatVectorValues values) { @@ -828,26 +826,26 @@ public int dimension() { @Override public VectorFloat getVector(int nodeId) { try { - // Access to float values is not thread safe - synchronized (this) { - final float[] vector = values.vectorValue(nodeId); - final float[] copy = new float[vector.length]; - System.arraycopy(vector, 0, copy, 0, 
vector.length); - return VECTOR_TYPE_SUPPORT.createFloatVector(copy); - } + final float[] vector = values.vectorValue(nodeId); + return VECTOR_TYPE_SUPPORT.createFloatVector(Arrays.copyOf(vector, vector.length)); } catch (IOException e) { - throw new RuntimeException(e); + throw new UncheckedIOException(e); } } @Override public boolean isValueShared() { - return false; + // Access to float values is not thread safe + return true; } @Override public RandomAccessVectorValues copy() { - throw new UnsupportedOperationException("Copy not supported"); + try { + return new RandomAccessVectorValuesOverVectorValues(values.copy()); + } catch (IOException e) { + throw new UncheckedIOException(e); + } } } } From d3aeeb5e1d05a664120750aa8756f36af38d7604 Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Fri, 21 Nov 2025 20:21:36 +0000 Subject: [PATCH 60/86] Use OrdinalMapper for sorting index --- .../sandbox/codecs/jvector/JVectorWriter.java | 97 +++++++++++-------- 1 file changed, 56 insertions(+), 41 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java index 7274c660755a..9fba3e6fc490 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java @@ -24,6 +24,7 @@ import io.github.jbellis.jvector.graph.OnHeapGraphIndex; import io.github.jbellis.jvector.graph.RandomAccessVectorValues; import io.github.jbellis.jvector.graph.disk.OnDiskSequentialGraphIndexWriter; +import io.github.jbellis.jvector.graph.disk.OrdinalMapper; import io.github.jbellis.jvector.graph.disk.feature.Feature; import io.github.jbellis.jvector.graph.disk.feature.FeatureId; import io.github.jbellis.jvector.graph.disk.feature.InlineVectors; @@ -39,7 +40,6 @@ import java.io.UnsupportedEncodingException; import java.util.ArrayList; import java.util.Arrays; -import java.util.Collections; import java.util.List; import java.util.concurrent.CompletableFuture; import java.util.concurrent.Executor; @@ -92,8 +92,7 @@ * jVector ordinals and the new Lucene document IDs. This is achieved by keeping checkpoints of the * {@link GraphNodeIdToDocMap} class in the index metadata and allowing us to update the mapping as * needed across merges by constructing a new mapping from the previous mapping and the {@link - * org.apache.lucene.index.MergeState.DocMap} provided in the {@link MergeState}. And across sorts - * with {@link FieldWriter#applySort(Sorter.DocMap)} during flushes. + * org.apache.lucene.index.MergeState.DocMap} provided in the {@link MergeState}. 
*/ public class JVectorWriter extends KnnVectorsWriter { private static final VectorTypeSupport VECTOR_TYPE_SUPPORT = @@ -213,8 +212,19 @@ public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOE @Override public void flush(int maxDoc, Sorter.DocMap sortMap) throws IOException { for (FieldWriter field : fields) { + final DocsWithFieldSet newDocIds; + final OrdinalMapper ordinalMapper; if (sortMap != null) { - field.applySort(sortMap); + assert field.docIds.cardinality() <= sortMap.size(); + final int size = field.docIds.cardinality(); + final int[] oldToNew = new int[size]; + final int[] newToOld = new int[size]; + newDocIds = new DocsWithFieldSet(); + KnnVectorsWriter.mapOldOrdToNewOrd(field.docIds, sortMap, oldToNew, newToOld, newDocIds); + ordinalMapper = new ArrayOrdinalMapper(size - 1, oldToNew, newToOld); + } else { + newDocIds = field.docIds; + ordinalMapper = null; } final RandomAccessVectorValues randomAccessVectorValues = field.toRandomAccessVectorValues(); final BuildScoreProvider buildScoreProvider; @@ -235,7 +245,7 @@ public void flush(int maxDoc, Sorter.DocMap sortMap) throws IOException { JVectorFormat.toJVectorSimilarity(fieldInfo.getVectorSimilarityFunction())); } - final GraphNodeIdToDocMap graphNodeIdToDocMap = field.createGraphNodeIdToDocMap(); + final GraphNodeIdToDocMap graphNodeIdToDocMap = new GraphNodeIdToDocMap(newDocIds); OnHeapGraphIndex graph = getGraph( buildScoreProvider, @@ -243,7 +253,31 @@ public void flush(int maxDoc, Sorter.DocMap sortMap) throws IOException { fieldInfo, segmentWriteState.segmentInfo.name, Runnable::run); - writeField(field.fieldInfo, randomAccessVectorValues, pqVectors, graphNodeIdToDocMap, graph); + writeField( + field.fieldInfo, + randomAccessVectorValues, + pqVectors, + ordinalMapper, + graphNodeIdToDocMap, + graph); + } + } + + private record ArrayOrdinalMapper(int maxOrdinal, int[] oldToNew, int[] newToOld) + implements OrdinalMapper { + @Override + public int maxOrdinal() { + return maxOrdinal; + } + + @Override + public int oldToNew(int oldOrdinal) { + return oldToNew[oldOrdinal]; + } + + @Override + public int newToOld(int newOrdinal) { + return newToOld[newOrdinal]; } } @@ -251,11 +285,18 @@ private void writeField( FieldInfo fieldInfo, RandomAccessVectorValues randomAccessVectorValues, PQVectors pqVectors, + OrdinalMapper ordinalMapper, GraphNodeIdToDocMap graphNodeIdToDocMap, OnHeapGraphIndex graph) throws IOException { final var vectorIndexFieldMetadata = - writeGraph(graph, randomAccessVectorValues, fieldInfo, pqVectors, graphNodeIdToDocMap); + writeGraph( + graph, + randomAccessVectorValues, + fieldInfo, + pqVectors, + ordinalMapper, + graphNodeIdToDocMap); meta.writeInt(fieldInfo.number); vectorIndexFieldMetadata.toOutput(meta); } @@ -274,6 +315,7 @@ private VectorIndexFieldMetadata writeGraph( RandomAccessVectorValues randomAccessVectorValues, FieldInfo fieldInfo, PQVectors pqVectors, + OrdinalMapper ordinalMapper, GraphNodeIdToDocMap graphNodeIdToDocMap) throws IOException { // field data file, which contains the graph @@ -306,10 +348,13 @@ private VectorIndexFieldMetadata writeGraph( degreeOverflow, graphNodeIdToDocMap); } - try (var writer = + final var writerBuilder = new OnDiskSequentialGraphIndexWriter.Builder(graph, jVectorIndexWriter) - .with(new InlineVectors(randomAccessVectorValues.dimension())) - .build()) { + .with(new InlineVectors(randomAccessVectorValues.dimension())); + if (ordinalMapper != null) { + writerBuilder.withMapper(ordinalMapper); + } + try (var writer = 
writerBuilder.build()) { var suppliers = Feature.singleStateFactory( FeatureId.INLINE_VECTORS, @@ -499,40 +544,10 @@ public float[] copyValue(float[] vectorValue) { return vectorValue.clone(); } - public void applySort(Sorter.DocMap sortMap) throws IOException { - // Ensure that all existing docs can be sorted - final int[] oldToNewOrd = new int[vectors.size()]; - final DocsWithFieldSet oldDocIds = docIds; - docIds = new DocsWithFieldSet(); - mapOldOrdToNewOrd(oldDocIds, sortMap, oldToNewOrd, null, docIds); - - // Swap vectors into their new ordinals - for (int oldOrd = 0; oldOrd < vectors.size(); ++oldOrd) { - final int newOrd = oldToNewOrd[oldOrd]; - if (oldOrd == newOrd) { - continue; - } - - // Swap the element at oldOrd into its position at newOrd and update the index mapping - Collections.swap(vectors, oldOrd, newOrd); - oldToNewOrd[oldOrd] = oldToNewOrd[newOrd]; - oldToNewOrd[newOrd] = newOrd; - - // The element at oldOrd may be displaced and need to be swapped again - if (oldToNewOrd[oldOrd] != oldOrd) { - oldOrd -= 1; - } - } - } - public RandomAccessVectorValues toRandomAccessVectorValues() { return new ListRandomAccessVectorValues(vectors, fieldInfo.getVectorDimension()); } - public GraphNodeIdToDocMap createGraphNodeIdToDocMap() { - return new GraphNodeIdToDocMap(docIds); - } - @Override public long ramBytesUsed() { return SHALLOW_SIZE @@ -654,7 +669,7 @@ private void mergeAndWriteField(FieldInfo fieldInfo, MergeState mergeState) thro fieldInfo, segmentWriteState.segmentInfo.name, mergeState.intraMergeTaskExecutor); - writeField(fieldInfo, ravv, pqVectors, graphNodeIdToDocMap, graph); + writeField(fieldInfo, ravv, pqVectors, null, graphNodeIdToDocMap, graph); } private static final class SubFloatVectors extends DocIDMerger.Sub { From d5f8bb259526ba72edb17fa90096dddf53e7e9c8 Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Sat, 22 Nov 2025 05:46:00 +0000 Subject: [PATCH 61/86] Don't write metadata for empty graphs --- .../sandbox/codecs/jvector/JVectorReader.java | 28 ++++++++----------- .../sandbox/codecs/jvector/JVectorWriter.java | 22 +++++---------- 2 files changed, 19 insertions(+), 31 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java index 3dc056096ac0..b006a9bb7c0c 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java @@ -274,22 +274,18 @@ public FieldEntry( this.vectorIndexFieldDataFileName = baseDataFileName + "_" + fieldInfo.name + "." 
+ JVectorFormat.VECTOR_INDEX_EXTENSION; - if (vectorIndexLength != 0) { - // For the slice we would like to include the Lucene header, unfortunately, we have to do - // this because jVector use global offsets instead of local offsets - final long sliceLength = - vectorIndexLength - + CodecUtil.indexHeaderLength( - JVectorFormat.VECTOR_INDEX_CODEC_NAME, state.segmentSuffix); - // Load the graph index - this.indexReaderSupplier = - new JVectorRandomAccessReader.Supplier( - directory.openInput(vectorIndexFieldDataFileName, state.context), 0, sliceLength); - this.index = OnDiskGraphIndex.load(indexReaderSupplier, vectorIndexOffset); - } else { - this.indexReaderSupplier = null; - this.index = null; - } + assert vectorIndexLength > 0 : "Read empty JVector graph"; + // For the slice we would like to include the Lucene header, unfortunately, we have to do + // this because jVector use global offsets instead of local offsets + final long sliceLength = + vectorIndexLength + + CodecUtil.indexHeaderLength( + JVectorFormat.VECTOR_INDEX_CODEC_NAME, state.segmentSuffix); + // Load the graph index + this.indexReaderSupplier = + new JVectorRandomAccessReader.Supplier( + directory.openInput(vectorIndexFieldDataFileName, state.context), 0, sliceLength); + this.index = OnDiskGraphIndex.load(indexReaderSupplier, vectorIndexOffset); // If quantized load the compressed product quantized vectors with their codebooks if (pqCodebooksAndVectorsLength > 0) { diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java index 9fba3e6fc490..8ec2332437d8 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java @@ -334,20 +334,6 @@ private VectorIndexFieldMetadata writeGraph( segmentWriteState.segmentInfo.getId(), segmentWriteState.segmentSuffix); final long startOffset = indexOutput.getFilePointer(); - if (graph.size() == 0) { - CodecUtil.writeFooter(indexOutput); - return new VectorIndexFieldMetadata( - fieldInfo.number, - fieldInfo.getVectorEncoding(), - JVectorFormat.toJVectorSimilarity(fieldInfo.getVectorSimilarityFunction()), - randomAccessVectorValues.dimension(), - 0, - 0, - 0, - 0, - degreeOverflow, - graphNodeIdToDocMap); - } final var writerBuilder = new OnDiskSequentialGraphIndexWriter.Builder(graph, jVectorIndexWriter) .with(new InlineVectors(randomAccessVectorValues.dimension())); @@ -601,8 +587,13 @@ private void mergeAndWriteField(FieldInfo fieldInfo, MergeState mergeState) thro ord += 1; } - // Make a RandomAccessVectorValues instance using the new graph ordinals final int totalLiveDocsCount = ord; + if (totalLiveDocsCount == 0) { + // Avoid writing an empty graph + return; + } + + // Make a RandomAccessVectorValues instance using the new graph ordinals final var ravv = new RandomAccessMergedFloatVectorValues( totalLiveDocsCount, @@ -787,6 +778,7 @@ public OnHeapGraphIndex getGraph( FieldInfo fieldInfo, String segmentName, Executor executor) { + assert randomAccessVectorValues.size() > 0 : "Cannot build empty graph"; final GraphIndexBuilder graphIndexBuilder = new GraphIndexBuilder( buildScoreProvider, From d7b40bad6a048fb6f4a0b6c7180cbecb20ff2c16 Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Sat, 22 Nov 2025 06:03:33 +0000 Subject: [PATCH 62/86] Fix JVector data slicing --- .../jvector/JVectorRandomAccessReader.java | 45 +++---------------- 
.../sandbox/codecs/jvector/JVectorReader.java | 10 ++--- 2 files changed, 11 insertions(+), 44 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorRandomAccessReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorRandomAccessReader.java index de87f451f5c8..8d9445314975 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorRandomAccessReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorRandomAccessReader.java @@ -137,53 +137,20 @@ public long length() throws IOException { * io.github.jbellis.jvector.graph.disk.OnDiskGraphIndex#load(ReaderSupplier, long)} */ public static class Supplier implements ReaderSupplier { - private final AtomicInteger readerCount = new AtomicInteger(0); - private final IndexInput currentInput; - private final long sliceStartOffset; - private final long sliceLength; - private final ConcurrentHashMap readers = - new ConcurrentHashMap<>(); - - public Supplier(IndexInput indexInput) throws IOException { - this( - indexInput, - indexInput.getFilePointer(), - indexInput.length() - indexInput.getFilePointer()); - } + private final IndexInput input; - public Supplier(IndexInput indexInput, long sliceStartOffset, long sliceLength) - throws IOException { - this.currentInput = indexInput; - this.sliceStartOffset = sliceStartOffset; - this.sliceLength = sliceLength; + public Supplier(IndexInput input) { + this.input = input; } @Override - public RandomAccessReader get() throws IOException { - synchronized (this) { - final IndexInput input = - currentInput - .slice("Input Slice for the jVector graph or PQ", sliceStartOffset, sliceLength) - .clone(); - - var reader = new JVectorRandomAccessReader(input); - int readerId = readerCount.getAndIncrement(); - readers.put(readerId, reader); - return reader; - } + public synchronized RandomAccessReader get() throws IOException { + return new JVectorRandomAccessReader(input.clone()); } @Override public void close() throws IOException { - // Close source of all cloned inputs - IOUtils.closeWhileHandlingException(currentInput); - - // Close all readers - for (RandomAccessReader reader : readers.values()) { - IOUtils.closeWhileHandlingException(reader::close); - } - readers.clear(); - readerCount.set(0); + // Cloned inputs do not need to be closed } } } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java index b006a9bb7c0c..ff6394b6fbe5 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java @@ -255,6 +255,7 @@ class FieldEntry implements Closeable { private final long pqCodebooksAndVectorsOffset; private final String vectorIndexFieldDataFileName; private final GraphNodeIdToDocMap graphNodeIdToDocMap; + private final IndexInput data; private final ReaderSupplier indexReaderSupplier; private final ReaderSupplier pqCodebooksReaderSupplier; private final OnDiskGraphIndex index; @@ -275,6 +276,7 @@ public FieldEntry( baseDataFileName + "_" + fieldInfo.name + "." 
+ JVectorFormat.VECTOR_INDEX_EXTENSION; assert vectorIndexLength > 0 : "Read empty JVector graph"; + this.data = directory.openInput(vectorIndexFieldDataFileName, state.context); // For the slice we would like to include the Lucene header, unfortunately, we have to do // this because jVector use global offsets instead of local offsets final long sliceLength = @@ -283,8 +285,7 @@ public FieldEntry( JVectorFormat.VECTOR_INDEX_CODEC_NAME, state.segmentSuffix); // Load the graph index this.indexReaderSupplier = - new JVectorRandomAccessReader.Supplier( - directory.openInput(vectorIndexFieldDataFileName, state.context), 0, sliceLength); + new JVectorRandomAccessReader.Supplier(data.slice("graph", 0, sliceLength)); this.index = OnDiskGraphIndex.load(indexReaderSupplier, vectorIndexOffset); // If quantized load the compressed product quantized vectors with their codebooks @@ -296,9 +297,7 @@ public FieldEntry( } this.pqCodebooksReaderSupplier = new JVectorRandomAccessReader.Supplier( - directory.openInput(vectorIndexFieldDataFileName, IOContext.READONCE), - pqCodebooksAndVectorsOffset, - pqCodebooksAndVectorsLength); + data.slice("pq", pqCodebooksAndVectorsOffset, pqCodebooksAndVectorsLength)); try (final var randomAccessReader = pqCodebooksReaderSupplier.get()) { this.pqVectors = PQVectors.load(randomAccessReader); } @@ -310,6 +309,7 @@ public FieldEntry( @Override public void close() throws IOException { + IOUtils.close(data); if (indexReaderSupplier != null) { IOUtils.close(indexReaderSupplier::close); } From 7e21ddfd4fbeb745943ab72b46e0547706e6ff0b Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Sat, 22 Nov 2025 06:07:36 +0000 Subject: [PATCH 63/86] Fix JVectorRandomAccessReader imports --- .../sandbox/codecs/jvector/JVectorRandomAccessReader.java | 3 --- 1 file changed, 3 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorRandomAccessReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorRandomAccessReader.java index 8d9445314975..d7622ebc85a8 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorRandomAccessReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorRandomAccessReader.java @@ -23,10 +23,7 @@ import java.io.IOException; import java.nio.ByteBuffer; import java.nio.FloatBuffer; -import java.util.concurrent.ConcurrentHashMap; -import java.util.concurrent.atomic.AtomicInteger; import org.apache.lucene.store.IndexInput; -import org.apache.lucene.util.IOUtils; /// Implements JVector reader capabilities over a Lucene IndexInput public class JVectorRandomAccessReader implements RandomAccessReader { From 3ff7e4c15974d43ffd440fc0312c7db53fec0563 Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Sat, 22 Nov 2025 06:08:57 +0000 Subject: [PATCH 64/86] Remove pqCodebooksReaderSupplier --- .../lucene/sandbox/codecs/jvector/JVectorReader.java | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java index ff6394b6fbe5..80b00fce1b6b 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java @@ -257,7 +257,6 @@ class FieldEntry implements Closeable { private final GraphNodeIdToDocMap graphNodeIdToDocMap; private final IndexInput data; 
private final ReaderSupplier indexReaderSupplier; - private final ReaderSupplier pqCodebooksReaderSupplier; private final OnDiskGraphIndex index; private final PQVectors pqVectors; // The product quantized vectors with their codebooks @@ -295,14 +294,12 @@ public FieldEntry( throw new IllegalArgumentException( "pqCodebooksAndVectorsOffset must be greater than vectorIndexOffset"); } - this.pqCodebooksReaderSupplier = - new JVectorRandomAccessReader.Supplier( - data.slice("pq", pqCodebooksAndVectorsOffset, pqCodebooksAndVectorsLength)); - try (final var randomAccessReader = pqCodebooksReaderSupplier.get()) { + final var pqSlice = + data.slice("pq", pqCodebooksAndVectorsOffset, pqCodebooksAndVectorsLength); + try (final var randomAccessReader = new JVectorRandomAccessReader(pqSlice)) { this.pqVectors = PQVectors.load(randomAccessReader); } } else { - this.pqCodebooksReaderSupplier = null; this.pqVectors = null; } } @@ -313,9 +310,6 @@ public void close() throws IOException { if (indexReaderSupplier != null) { IOUtils.close(indexReaderSupplier::close); } - if (pqCodebooksReaderSupplier != null) { - IOUtils.close(pqCodebooksReaderSupplier::close); - } } } } From 44847ec3f1287728f45f7e9c754f09615b7a1b55 Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Sat, 22 Nov 2025 06:12:14 +0000 Subject: [PATCH 65/86] Remove indexReaderSupplier --- .../lucene/sandbox/codecs/jvector/JVectorReader.java | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java index 80b00fce1b6b..3a3d1230b93f 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java @@ -17,7 +17,6 @@ package org.apache.lucene.sandbox.codecs.jvector; -import io.github.jbellis.jvector.disk.ReaderSupplier; import io.github.jbellis.jvector.graph.GraphSearcher; import io.github.jbellis.jvector.graph.SearchResult; import io.github.jbellis.jvector.graph.disk.OnDiskGraphIndex; @@ -256,7 +255,6 @@ class FieldEntry implements Closeable { private final String vectorIndexFieldDataFileName; private final GraphNodeIdToDocMap graphNodeIdToDocMap; private final IndexInput data; - private final ReaderSupplier indexReaderSupplier; private final OnDiskGraphIndex index; private final PQVectors pqVectors; // The product quantized vectors with their codebooks @@ -282,8 +280,8 @@ public FieldEntry( vectorIndexLength + CodecUtil.indexHeaderLength( JVectorFormat.VECTOR_INDEX_CODEC_NAME, state.segmentSuffix); - // Load the graph index - this.indexReaderSupplier = + // Load the graph index from cloned slices of data (no need to close) + final var indexReaderSupplier = new JVectorRandomAccessReader.Supplier(data.slice("graph", 0, sliceLength)); this.index = OnDiskGraphIndex.load(indexReaderSupplier, vectorIndexOffset); @@ -307,9 +305,6 @@ public FieldEntry( @Override public void close() throws IOException { IOUtils.close(data); - if (indexReaderSupplier != null) { - IOUtils.close(indexReaderSupplier::close); - } } } } From 8aa2cb175d0b5bd7e51db9c02e843276e9579f71 Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Sat, 22 Nov 2025 06:19:12 +0000 Subject: [PATCH 66/86] Remove useless offset/length fields --- .../sandbox/codecs/jvector/JVectorReader.java | 30 ++++++++----------- 1 file changed, 12 insertions(+), 18 deletions(-) diff --git 
a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java index 3a3d1230b93f..c158f724fc9e 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java @@ -248,10 +248,6 @@ private void readFields(ChecksumIndexInput meta) throws IOException { class FieldEntry implements Closeable { private final VectorSimilarityFunction similarityFunction; private final int vectorDimension; - private final long vectorIndexOffset; - private final long vectorIndexLength; - private final long pqCodebooksAndVectorsLength; - private final long pqCodebooksAndVectorsOffset; private final String vectorIndexFieldDataFileName; private final GraphNodeIdToDocMap graphNodeIdToDocMap; private final IndexInput data; @@ -263,37 +259,35 @@ public FieldEntry( throws IOException { this.similarityFunction = vectorIndexFieldMetadata.vectorSimilarityFunction; this.vectorDimension = vectorIndexFieldMetadata.vectorDimension; - this.vectorIndexOffset = vectorIndexFieldMetadata.vectorIndexOffset; - this.vectorIndexLength = vectorIndexFieldMetadata.vectorIndexLength; - this.pqCodebooksAndVectorsLength = vectorIndexFieldMetadata.pqCodebooksAndVectorsLength; - this.pqCodebooksAndVectorsOffset = vectorIndexFieldMetadata.pqCodebooksAndVectorsOffset; this.graphNodeIdToDocMap = vectorIndexFieldMetadata.graphNodeIdToDocMap; - this.vectorIndexFieldDataFileName = baseDataFileName + "_" + fieldInfo.name + "." + JVectorFormat.VECTOR_INDEX_EXTENSION; - assert vectorIndexLength > 0 : "Read empty JVector graph"; + final long graphOffset = vectorIndexFieldMetadata.vectorIndexOffset; + final long graphLength = vectorIndexFieldMetadata.vectorIndexLength; + assert graphLength > 0 : "Read empty JVector graph"; this.data = directory.openInput(vectorIndexFieldDataFileName, state.context); // For the slice we would like to include the Lucene header, unfortunately, we have to do // this because jVector use global offsets instead of local offsets final long sliceLength = - vectorIndexLength + graphLength + CodecUtil.indexHeaderLength( JVectorFormat.VECTOR_INDEX_CODEC_NAME, state.segmentSuffix); // Load the graph index from cloned slices of data (no need to close) final var indexReaderSupplier = new JVectorRandomAccessReader.Supplier(data.slice("graph", 0, sliceLength)); - this.index = OnDiskGraphIndex.load(indexReaderSupplier, vectorIndexOffset); + this.index = OnDiskGraphIndex.load(indexReaderSupplier, graphOffset); // If quantized load the compressed product quantized vectors with their codebooks - if (pqCodebooksAndVectorsLength > 0) { - assert pqCodebooksAndVectorsOffset > 0; - if (pqCodebooksAndVectorsOffset < vectorIndexOffset) { + final long pqOffset = vectorIndexFieldMetadata.pqCodebooksAndVectorsOffset; + final long pqLength = vectorIndexFieldMetadata.pqCodebooksAndVectorsLength; + if (pqLength > 0) { + assert pqOffset > 0; + if (pqOffset < graphOffset) { throw new IllegalArgumentException( - "pqCodebooksAndVectorsOffset must be greater than vectorIndexOffset"); + "pqOffset must be greater than vectorIndexOffset"); } - final var pqSlice = - data.slice("pq", pqCodebooksAndVectorsOffset, pqCodebooksAndVectorsLength); + final var pqSlice = data.slice("pq", pqOffset, pqLength); try (final var randomAccessReader = new JVectorRandomAccessReader(pqSlice)) { this.pqVectors = PQVectors.load(randomAccessReader); } From 
af695713c63543c346b0a543bf5c5c45554fe712 Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Sat, 22 Nov 2025 08:24:17 +0000 Subject: [PATCH 67/86] Use proper index slicing --- .../codecs/jvector/JVectorIndexWriter.java | 8 ++++++-- .../sandbox/codecs/jvector/JVectorReader.java | 17 +++++++++-------- .../sandbox/codecs/jvector/JVectorWriter.java | 8 ++++---- 3 files changed, 19 insertions(+), 14 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorIndexWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorIndexWriter.java index 6483d7c71393..e4a03571f9f3 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorIndexWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorIndexWriter.java @@ -27,19 +27,23 @@ */ public class JVectorIndexWriter implements IndexWriter { private final IndexOutput indexOutputDelegate; + /// Initial offset of the writer, which will be subtracted from [position()][#position()] to trick + /// JVector into using offsets that work for slices used by the readers. + private final long offset; public JVectorIndexWriter(IndexOutput indexOutputDelegate) { this.indexOutputDelegate = indexOutputDelegate; + this.offset = indexOutputDelegate.getFilePointer(); } @Override public long position() throws IOException { - return indexOutputDelegate.getFilePointer(); + return indexOutputDelegate.getFilePointer() - offset; } @Override public void close() throws IOException { - indexOutputDelegate.close(); + // Let the user close the delegate } @Override diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java index c158f724fc9e..67b1117d8f4e 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java @@ -267,16 +267,17 @@ public FieldEntry( final long graphLength = vectorIndexFieldMetadata.vectorIndexLength; assert graphLength > 0 : "Read empty JVector graph"; this.data = directory.openInput(vectorIndexFieldDataFileName, state.context); - // For the slice we would like to include the Lucene header, unfortunately, we have to do - // this because jVector use global offsets instead of local offsets - final long sliceLength = - graphLength - + CodecUtil.indexHeaderLength( - JVectorFormat.VECTOR_INDEX_CODEC_NAME, state.segmentSuffix); + CodecUtil.checkIndexHeader( + this.data, + JVectorFormat.VECTOR_INDEX_CODEC_NAME, + JVectorFormat.VERSION_START, + JVectorFormat.VERSION_CURRENT, + state.segmentInfo.getId(), + state.segmentSuffix); // Load the graph index from cloned slices of data (no need to close) final var indexReaderSupplier = - new JVectorRandomAccessReader.Supplier(data.slice("graph", 0, sliceLength)); - this.index = OnDiskGraphIndex.load(indexReaderSupplier, graphOffset); + new JVectorRandomAccessReader.Supplier(data.slice("graph", graphOffset, graphLength)); + this.index = OnDiskGraphIndex.load(indexReaderSupplier); // If quantized load the compressed product quantized vectors with their codebooks final long pqOffset = vectorIndexFieldMetadata.pqCodebooksAndVectorsOffset; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java index 8ec2332437d8..077caa270651 100644 --- 
a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java @@ -324,8 +324,7 @@ private VectorIndexFieldMetadata writeGraph( try (IndexOutput indexOutput = segmentWriteState.directory.createOutput( - vectorIndexFieldFileName, segmentWriteState.context); - final var jVectorIndexWriter = new JVectorIndexWriter(indexOutput)) { + vectorIndexFieldFileName, segmentWriteState.context)) { // Header for the field data file CodecUtil.writeIndexHeader( indexOutput, @@ -333,6 +332,7 @@ private VectorIndexFieldMetadata writeGraph( JVectorFormat.VERSION_CURRENT, segmentWriteState.segmentInfo.getId(), segmentWriteState.segmentSuffix); + final var jVectorIndexWriter = new JVectorIndexWriter(indexOutput); final long startOffset = indexOutput.getFilePointer(); final var writerBuilder = new OnDiskSequentialGraphIndexWriter.Builder(graph, jVectorIndexWriter) @@ -346,7 +346,7 @@ private VectorIndexFieldMetadata writeGraph( FeatureId.INLINE_VECTORS, nodeId -> new InlineVectors.State(randomAccessVectorValues.getVector(nodeId))); writer.write(suppliers); - final long endGraphOffset = jVectorIndexWriter.position(); + final long endGraphOffset = indexOutput.getFilePointer(); // If PQ is enabled and we have enough vectors, write the PQ codebooks and compressed // vectors @@ -356,7 +356,7 @@ private VectorIndexFieldMetadata writeGraph( pqOffset = endGraphOffset; // write the compressed vectors and codebooks to disk pqVectors.write(jVectorIndexWriter); - pqLength = jVectorIndexWriter.position() - endGraphOffset; + pqLength = indexOutput.getFilePointer() - endGraphOffset; } else { pqOffset = 0; pqLength = 0; From b03d201af298f38ad4326706a0ed308d23e599b9 Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Sat, 22 Nov 2025 08:27:56 +0000 Subject: [PATCH 68/86] Remove segmentName arg from getGraph --- .../sandbox/codecs/jvector/JVectorWriter.java | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java index 077caa270651..38f0a70e1b3e 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java @@ -246,13 +246,8 @@ public void flush(int maxDoc, Sorter.DocMap sortMap) throws IOException { } final GraphNodeIdToDocMap graphNodeIdToDocMap = new GraphNodeIdToDocMap(newDocIds); - OnHeapGraphIndex graph = - getGraph( - buildScoreProvider, - randomAccessVectorValues, - fieldInfo, - segmentWriteState.segmentInfo.name, - Runnable::run); + final var graph = + getGraph(buildScoreProvider, randomAccessVectorValues, fieldInfo, Runnable::run); writeField( field.fieldInfo, randomAccessVectorValues, @@ -654,12 +649,7 @@ private void mergeAndWriteField(FieldInfo fieldInfo, MergeState mergeState) thro } final var graphNodeIdToDocMap = new GraphNodeIdToDocMap(docIds); final var graph = - getGraph( - buildScoreProvider, - ravv, - fieldInfo, - segmentWriteState.segmentInfo.name, - mergeState.intraMergeTaskExecutor); + getGraph(buildScoreProvider, ravv, fieldInfo, mergeState.intraMergeTaskExecutor); writeField(fieldInfo, ravv, pqVectors, null, graphNodeIdToDocMap, graph); } @@ -776,7 +766,6 @@ public OnHeapGraphIndex getGraph( BuildScoreProvider buildScoreProvider, RandomAccessVectorValues 
randomAccessVectorValues, FieldInfo fieldInfo, - String segmentName, Executor executor) { assert randomAccessVectorValues.size() > 0 : "Cannot build empty graph"; final GraphIndexBuilder graphIndexBuilder = From 4267ec0e1fa23e03ac1b1462fdceab4233f25306 Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Sat, 22 Nov 2025 08:36:31 +0000 Subject: [PATCH 69/86] Improve primary index file handling on write --- .../sandbox/codecs/jvector/JVectorWriter.java | 54 ++++++++----------- 1 file changed, 21 insertions(+), 33 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java index 38f0a70e1b3e..561a573d4925 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java @@ -103,8 +103,7 @@ public class JVectorWriter extends KnnVectorsWriter { private final List fields = new ArrayList<>(); private final IndexOutput meta; - private final IndexOutput vectorIndex; - private final String indexDataFileName; + private final IndexOutput data; private final String baseDataFileName; private final SegmentWriteState segmentWriteState; private final int maxConn; @@ -138,25 +137,16 @@ public JVectorWriter( this.numberOfSubspacesPerVectorSupplier = numberOfSubspacesPerVectorSupplier; this.minimumBatchSizeForQuantization = minimumBatchSizeForQuantization; this.hierarchyEnabled = hierarchyEnabled; - String metaFileName = - IndexFileNames.segmentFileName( - segmentWriteState.segmentInfo.name, - segmentWriteState.segmentSuffix, - JVectorFormat.META_EXTENSION); - - this.indexDataFileName = - IndexFileNames.segmentFileName( - segmentWriteState.segmentInfo.name, - segmentWriteState.segmentSuffix, - JVectorFormat.VECTOR_INDEX_EXTENSION); this.baseDataFileName = segmentWriteState.segmentInfo.name + "_" + segmentWriteState.segmentSuffix; - boolean success = false; try { + final String metaFileName = + IndexFileNames.segmentFileName( + segmentWriteState.segmentInfo.name, + segmentWriteState.segmentSuffix, + JVectorFormat.META_EXTENSION); meta = segmentWriteState.directory.createOutput(metaFileName, segmentWriteState.context); - vectorIndex = - segmentWriteState.directory.createOutput(indexDataFileName, segmentWriteState.context); CodecUtil.writeIndexHeader( meta, JVectorFormat.META_CODEC_NAME, @@ -164,18 +154,21 @@ public JVectorWriter( segmentWriteState.segmentInfo.getId(), segmentWriteState.segmentSuffix); + final String dataFileName = + IndexFileNames.segmentFileName( + segmentWriteState.segmentInfo.name, + segmentWriteState.segmentSuffix, + JVectorFormat.VECTOR_INDEX_EXTENSION); + data = segmentWriteState.directory.createOutput(dataFileName, segmentWriteState.context); CodecUtil.writeIndexHeader( - vectorIndex, + data, JVectorFormat.VECTOR_INDEX_CODEC_NAME, JVectorFormat.VERSION_CURRENT, segmentWriteState.segmentInfo.getId(), segmentWriteState.segmentSuffix); - - success = true; - } finally { - if (!success) { - IOUtils.closeWhileHandlingException(this); - } + } catch (Throwable t) { + IOUtils.closeWhileSuppressingExceptions(t, this); + throw t; } } @@ -463,20 +456,15 @@ public void finish() throws IOException { } finished = true; - if (meta != null) { - // write end of fields marker - meta.writeInt(-1); - CodecUtil.writeFooter(meta); - } - - if (vectorIndex != null) { - CodecUtil.writeFooter(vectorIndex); - } + // write end of fields marker + meta.writeInt(-1); 
+ CodecUtil.writeFooter(meta); + CodecUtil.writeFooter(data); } @Override public void close() throws IOException { - IOUtils.close(meta, vectorIndex); + IOUtils.close(meta, data); } @Override From f516762cc305412b8dac068ba70119034534c0f3 Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Sat, 22 Nov 2025 08:43:54 +0000 Subject: [PATCH 70/86] Remove FieldInfos field --- .../apache/lucene/sandbox/codecs/jvector/JVectorReader.java | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java index 67b1117d8f4e..4243429cdbec 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java @@ -49,7 +49,6 @@ public class JVectorReader extends KnnVectorsReader { private static final VectorTypeSupport VECTOR_TYPE_SUPPORT = VectorizationProvider.getInstance().getVectorTypeSupport(); - private final FieldInfos fieldInfos; private final String baseDataFileName; // Maps field name to field entries private final Map fieldEntryMap = new HashMap<>(1); @@ -58,7 +57,6 @@ public class JVectorReader extends KnnVectorsReader { public JVectorReader(SegmentReadState state) throws IOException { this.state = state; - this.fieldInfos = state.fieldInfos; this.baseDataFileName = state.segmentInfo.name + "_" + state.segmentSuffix; final String metaFileName = IndexFileNames.segmentFileName( @@ -73,7 +71,7 @@ public JVectorReader(SegmentReadState state) throws IOException { JVectorFormat.VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); - readFields(meta); + readFields(meta, state.fieldInfos); CodecUtil.checkFooter(meta); success = true; @@ -235,7 +233,7 @@ public void close() throws IOException { fieldEntryMap.clear(); } - private void readFields(ChecksumIndexInput meta) throws IOException { + private void readFields(ChecksumIndexInput meta, FieldInfos fieldInfos) throws IOException { for (int fieldNumber = meta.readInt(); fieldNumber != -1; fieldNumber = meta.readInt()) { final FieldInfo fieldInfo = fieldInfos.fieldInfo(fieldNumber); // read field number JVectorWriter.VectorIndexFieldMetadata vectorIndexFieldMetadata = From df18dc8cd1f084501da2a47d7da6824fb4eaca3e Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Sat, 22 Nov 2025 08:45:21 +0000 Subject: [PATCH 71/86] Fixup improve missing graph --- .../lucene/sandbox/codecs/jvector/JVectorReader.java | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java index 4243429cdbec..34782bc6df33 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java @@ -96,7 +96,7 @@ public void checkIntegrity() throws IOException { @Override public FloatVectorValues getFloatVectorValues(String field) throws IOException { final FieldEntry fieldEntry = fieldEntryMap.get(field); - if (fieldEntry == null || fieldEntry.index == null) { + if (fieldEntry == null) { return new FloatVectorValues() { @Override public float[] vectorValue(int ord) throws IOException { @@ -110,7 +110,7 @@ public FloatVectorValues copy() throws IOException { @Override public int dimension() { 
- return fieldEntry.vectorDimension; + return 0; } @Override @@ -148,11 +148,6 @@ public Optional getProductQuantizationForField(String field return Optional.of(fieldEntry.pqVectors.getCompressor()); } - public boolean hasIndex(String field) { - final var fieldEntry = fieldEntryMap.get(field); - return fieldEntry != null && fieldEntry.index != null; - } - @Override public void search(String field, float[] target, KnnCollector knnCollector, AcceptDocs acceptDocs) throws IOException { From bbdcee5d711f73bad1e5848ae59c3a3e0eaa9232 Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Sat, 22 Nov 2025 09:21:19 +0000 Subject: [PATCH 72/86] Write all fields to the same file --- .../sandbox/codecs/jvector/JVectorReader.java | 113 ++++++++++-------- .../sandbox/codecs/jvector/JVectorWriter.java | 28 +---- 2 files changed, 66 insertions(+), 75 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java index 34782bc6df33..670c45c54447 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java @@ -31,7 +31,9 @@ import io.github.jbellis.jvector.vector.types.VectorTypeSupport; import java.io.Closeable; import java.io.IOException; +import java.util.ArrayList; import java.util.HashMap; +import java.util.List; import java.util.Map; import java.util.Optional; import org.apache.lucene.codecs.CodecUtil; @@ -49,48 +51,65 @@ public class JVectorReader extends KnnVectorsReader { private static final VectorTypeSupport VECTOR_TYPE_SUPPORT = VectorizationProvider.getInstance().getVectorTypeSupport(); - private final String baseDataFileName; + private final IndexInput data; // Maps field name to field entries - private final Map fieldEntryMap = new HashMap<>(1); - private final Directory directory; - private final SegmentReadState state; + private final Map fieldEntryMap; public JVectorReader(SegmentReadState state) throws IOException { - this.state = state; - this.baseDataFileName = state.segmentInfo.name + "_" + state.segmentSuffix; + final List fieldMetaList = new ArrayList<>(); final String metaFileName = IndexFileNames.segmentFileName( state.segmentInfo.name, state.segmentSuffix, JVectorFormat.META_EXTENSION); - this.directory = state.directory; - boolean success = false; try (ChecksumIndexInput meta = state.directory.openChecksumInput(metaFileName)) { - CodecUtil.checkIndexHeader( - meta, - JVectorFormat.META_CODEC_NAME, + Throwable priorE = null; + try { + CodecUtil.checkIndexHeader( + meta, + JVectorFormat.META_CODEC_NAME, + JVectorFormat.VERSION_START, + JVectorFormat.VERSION_CURRENT, + state.segmentInfo.getId(), + state.segmentSuffix); + + JVectorWriter.VectorIndexFieldMetadata fieldMeta; + while ((fieldMeta = parseNextField(meta, state.fieldInfos)) != null) { + fieldMetaList.add(fieldMeta); + } + } catch (Throwable t) { + priorE = t; + } finally { + CodecUtil.checkFooter(meta, priorE); + } + + final String dataFileName = + IndexFileNames.segmentFileName( + state.segmentInfo.name, state.segmentSuffix, JVectorFormat.VECTOR_INDEX_EXTENSION); + this.data = + state.directory.openInput( + dataFileName, state.context.withHints(FileTypeHint.DATA, DataAccessHint.RANDOM)); + + + CodecUtil.checkHeader( + data, + JVectorFormat.VECTOR_INDEX_CODEC_NAME, JVectorFormat.VERSION_START, - JVectorFormat.VERSION_CURRENT, - state.segmentInfo.getId(), - 
state.segmentSuffix); - readFields(meta, state.fieldInfos); - CodecUtil.checkFooter(meta); - - success = true; - } finally { - if (!success) { - IOUtils.closeWhileHandlingException(this); + JVectorFormat.VERSION_CURRENT); + CodecUtil.retrieveChecksum(data); + + this.fieldEntryMap = new HashMap<>(fieldMetaList.size()); + for (var fieldMeta : fieldMetaList) { + final FieldInfo fieldInfo = state.fieldInfos.fieldInfo(fieldMeta.fieldNumber); + if (fieldEntryMap.containsKey(fieldInfo.name)) { + throw new CorruptIndexException("Duplicate field: " + fieldInfo.name, meta); + } + fieldEntryMap.put(fieldInfo.name, new FieldEntry(data, fieldMeta)); } } } @Override public void checkIntegrity() throws IOException { - for (FieldEntry fieldEntry : fieldEntryMap.values()) { - // Verify the vector index file - try (var indexInput = - state.directory.openInput(fieldEntry.vectorIndexFieldDataFileName, IOContext.READONCE)) { - CodecUtil.checksumEntireFile(indexInput); - } - } + CodecUtil.checksumEntireFile(data); } @Override @@ -226,47 +245,39 @@ public void close() throws IOException { IOUtils.close(fieldEntry); } fieldEntryMap.clear(); + IOUtils.close(data); } - private void readFields(ChecksumIndexInput meta, FieldInfos fieldInfos) throws IOException { - for (int fieldNumber = meta.readInt(); fieldNumber != -1; fieldNumber = meta.readInt()) { - final FieldInfo fieldInfo = fieldInfos.fieldInfo(fieldNumber); // read field number - JVectorWriter.VectorIndexFieldMetadata vectorIndexFieldMetadata = - new JVectorWriter.VectorIndexFieldMetadata(meta); - assert fieldInfo.number == vectorIndexFieldMetadata.fieldNumber; - fieldEntryMap.put(fieldInfo.name, new FieldEntry(fieldInfo, vectorIndexFieldMetadata)); + private static JVectorWriter.VectorIndexFieldMetadata parseNextField( + IndexInput meta, FieldInfos fieldInfos) throws IOException { + final int fieldNumber = meta.readInt(); + if (fieldNumber == -1) { + return null; + } + + final FieldInfo fieldInfo = fieldInfos.fieldInfo(fieldNumber); + if (fieldInfo == null) { + throw new CorruptIndexException("Invalid field number: " + fieldNumber, meta); } + + return new JVectorWriter.VectorIndexFieldMetadata(meta); } class FieldEntry implements Closeable { private final VectorSimilarityFunction similarityFunction; - private final int vectorDimension; - private final String vectorIndexFieldDataFileName; private final GraphNodeIdToDocMap graphNodeIdToDocMap; - private final IndexInput data; private final OnDiskGraphIndex index; private final PQVectors pqVectors; // The product quantized vectors with their codebooks public FieldEntry( - FieldInfo fieldInfo, JVectorWriter.VectorIndexFieldMetadata vectorIndexFieldMetadata) + IndexInput data, JVectorWriter.VectorIndexFieldMetadata vectorIndexFieldMetadata) throws IOException { this.similarityFunction = vectorIndexFieldMetadata.vectorSimilarityFunction; - this.vectorDimension = vectorIndexFieldMetadata.vectorDimension; this.graphNodeIdToDocMap = vectorIndexFieldMetadata.graphNodeIdToDocMap; - this.vectorIndexFieldDataFileName = - baseDataFileName + "_" + fieldInfo.name + "." 
+ JVectorFormat.VECTOR_INDEX_EXTENSION; final long graphOffset = vectorIndexFieldMetadata.vectorIndexOffset; final long graphLength = vectorIndexFieldMetadata.vectorIndexLength; assert graphLength > 0 : "Read empty JVector graph"; - this.data = directory.openInput(vectorIndexFieldDataFileName, state.context); - CodecUtil.checkIndexHeader( - this.data, - JVectorFormat.VECTOR_INDEX_CODEC_NAME, - JVectorFormat.VERSION_START, - JVectorFormat.VERSION_CURRENT, - state.segmentInfo.getId(), - state.segmentSuffix); // Load the graph index from cloned slices of data (no need to close) final var indexReaderSupplier = new JVectorRandomAccessReader.Supplier(data.slice("graph", graphOffset, graphLength)); @@ -292,7 +303,7 @@ public FieldEntry( @Override public void close() throws IOException { - IOUtils.close(data); + index.close(); } } } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java index 561a573d4925..9d2fa5c64cdc 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java @@ -104,8 +104,6 @@ public class JVectorWriter extends KnnVectorsWriter { private final IndexOutput meta; private final IndexOutput data; - private final String baseDataFileName; - private final SegmentWriteState segmentWriteState; private final int maxConn; private final int beamWidth; private final float degreeOverflow; @@ -129,7 +127,6 @@ public JVectorWriter( int minimumBatchSizeForQuantization, boolean hierarchyEnabled) throws IOException { - this.segmentWriteState = segmentWriteState; this.maxConn = maxConn; this.beamWidth = beamWidth; this.degreeOverflow = degreeOverflow; @@ -137,8 +134,6 @@ public JVectorWriter( this.numberOfSubspacesPerVectorSupplier = numberOfSubspacesPerVectorSupplier; this.minimumBatchSizeForQuantization = minimumBatchSizeForQuantization; this.hierarchyEnabled = hierarchyEnabled; - this.baseDataFileName = - segmentWriteState.segmentInfo.name + "_" + segmentWriteState.segmentSuffix; try { final String metaFileName = @@ -306,22 +301,8 @@ private VectorIndexFieldMetadata writeGraph( OrdinalMapper ordinalMapper, GraphNodeIdToDocMap graphNodeIdToDocMap) throws IOException { - // field data file, which contains the graph - final String vectorIndexFieldFileName = - baseDataFileName + "_" + fieldInfo.name + "." 
+ JVectorFormat.VECTOR_INDEX_EXTENSION; - - try (IndexOutput indexOutput = - segmentWriteState.directory.createOutput( - vectorIndexFieldFileName, segmentWriteState.context)) { - // Header for the field data file - CodecUtil.writeIndexHeader( - indexOutput, - JVectorFormat.VECTOR_INDEX_CODEC_NAME, - JVectorFormat.VERSION_CURRENT, - segmentWriteState.segmentInfo.getId(), - segmentWriteState.segmentSuffix); - final var jVectorIndexWriter = new JVectorIndexWriter(indexOutput); - final long startOffset = indexOutput.getFilePointer(); + try (final var jVectorIndexWriter = new JVectorIndexWriter(data)) { + final long startOffset = data.getFilePointer(); final var writerBuilder = new OnDiskSequentialGraphIndexWriter.Builder(graph, jVectorIndexWriter) .with(new InlineVectors(randomAccessVectorValues.dimension())); @@ -334,7 +315,7 @@ private VectorIndexFieldMetadata writeGraph( FeatureId.INLINE_VECTORS, nodeId -> new InlineVectors.State(randomAccessVectorValues.getVector(nodeId))); writer.write(suppliers); - final long endGraphOffset = indexOutput.getFilePointer(); + final long endGraphOffset = data.getFilePointer(); // If PQ is enabled and we have enough vectors, write the PQ codebooks and compressed // vectors @@ -344,12 +325,11 @@ private VectorIndexFieldMetadata writeGraph( pqOffset = endGraphOffset; // write the compressed vectors and codebooks to disk pqVectors.write(jVectorIndexWriter); - pqLength = indexOutput.getFilePointer() - endGraphOffset; + pqLength = data.getFilePointer() - endGraphOffset; } else { pqOffset = 0; pqLength = 0; } - CodecUtil.writeFooter(indexOutput); return new VectorIndexFieldMetadata( fieldInfo.number, From f90eacfb40210d030957c026ecf86bfad829e379 Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Mon, 24 Nov 2025 20:34:05 +0000 Subject: [PATCH 73/86] fixup! 
Fix missing @Override
---
 .../lucene/sandbox/codecs/jvector/JVectorRandomAccessReader.java | 1 -
 1 file changed, 1 deletion(-)

diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorRandomAccessReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorRandomAccessReader.java
index d7622ebc85a8..3eda89105ce3 100644
--- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorRandomAccessReader.java
+++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorRandomAccessReader.java
@@ -54,7 +54,6 @@ public float readFloat() throws IOException {
     return Float.intBitsToFloat(indexInputDelegate.readInt());
   }
 
-  // TODO: bring back to override when upgrading jVector again
   @Override
   public long readLong() throws IOException {
     return indexInputDelegate.readLong();

From cecc473d5721a968bed2f36c26e5fc16235dd5db Mon Sep 17 00:00:00 2001
From: Alec Bernardi
Date: Tue, 25 Nov 2025 22:24:28 +0000
Subject: [PATCH 74/86] Fix remove sorting

---
 .../org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java
index 9d2fa5c64cdc..74785900b588 100644
--- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java
+++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java
@@ -468,7 +468,7 @@ static class FieldWriter extends KnnFieldVectorsWriter {
   // The ordering of docIds matches the ordering of vectors, the index in this list corresponds to
   // the jVector ordinal
   private final List> vectors = new ArrayList<>();
-  private DocsWithFieldSet docIds;
+  private final DocsWithFieldSet docIds;
 
   FieldWriter(FieldInfo fieldInfo) {
     /** For creating a new field from a flat field vectors writer.
*/ From 1d2b4e83d8a0a5348901b9ba86a257398f7e8625 Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Tue, 25 Nov 2025 22:42:43 +0000 Subject: [PATCH 75/86] Avoid extra copies in RandomAccessMergedFloatVectorValues.getVector() --- .../sandbox/codecs/jvector/JVectorWriter.java | 25 ++++++++++++------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java index 74785900b588..020dab7c6b61 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java @@ -686,10 +686,19 @@ public int dimension() { } @Override - public VectorFloat getVector(int nodeId) { - final var vector = VECTOR_TYPE_SUPPORT.createFloatVector(dimension); - getVectorInto(nodeId, vector, 0); - return vector; + public VectorFloat getVector(int node) { + final FloatVectorValues values = vectors[ordToReader.applyAsInt(node)]; + final int ord = ordToReaderOrd.applyAsInt(node); + + if (values instanceof JVectorFloatVectorValues jVectorValues) { + return jVectorValues.vectorFloatValue(ord); + } + + try { + return VECTOR_TYPE_SUPPORT.createFloatVector(values.vectorValue(ord)); + } catch (IOException e) { + throw new UncheckedIOException(e); + } } @Override @@ -701,16 +710,14 @@ public void getVectorInto(int node, VectorFloat destinationVector, int offset jVectorValues.getVectorInto(ord, destinationVector, offset); } - final float[] srcVector; + final VectorFloat srcVector; try { - srcVector = values.vectorValue(ord); + srcVector = VECTOR_TYPE_SUPPORT.createFloatVector(values.vectorValue(ord)); } catch (IOException e) { throw new UncheckedIOException(e); } - for (int i = 0; i < srcVector.length; ++i) { - destinationVector.set(i + offset, srcVector[i]); - } + destinationVector.copyFrom(srcVector, 0, offset, srcVector.length()); } @Override From cfbf4c2eeca554f35d72c594d7994e0d0febfc15 Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Tue, 25 Nov 2025 22:55:38 +0000 Subject: [PATCH 76/86] Move PQ encoding to FieldWriter.addValue instead of flush --- .../sandbox/codecs/jvector/JVectorWriter.java | 48 ++++++++++++++++--- 1 file changed, 41 insertions(+), 7 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java index 020dab7c6b61..0866e0af30ef 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java @@ -29,6 +29,7 @@ import io.github.jbellis.jvector.graph.disk.feature.FeatureId; import io.github.jbellis.jvector.graph.disk.feature.InlineVectors; import io.github.jbellis.jvector.graph.similarity.BuildScoreProvider; +import io.github.jbellis.jvector.quantization.MutablePQVectors; import io.github.jbellis.jvector.quantization.PQVectors; import io.github.jbellis.jvector.quantization.ProductQuantization; import io.github.jbellis.jvector.vector.VectorSimilarityFunction; @@ -176,7 +177,8 @@ public KnnFieldVectorsWriter addField(FieldInfo fieldInfo) throws IOException + "This can provides much greater savings in storage and memory"; throw new UnsupportedOperationException(errorMessage); } - FieldWriter newField = new FieldWriter(fieldInfo); + final int M = 
numberOfSubspacesPerVectorSupplier.applyAsInt(fieldInfo.getVectorDimension()); + final FieldWriter newField = new FieldWriter(fieldInfo, minimumBatchSizeForQuantization, M); fields.add(newField); return newField; @@ -216,17 +218,15 @@ public void flush(int maxDoc, Sorter.DocMap sortMap) throws IOException { } final RandomAccessVectorValues randomAccessVectorValues = field.toRandomAccessVectorValues(); final BuildScoreProvider buildScoreProvider; - final PQVectors pqVectors; + final PQVectors pqVectors = field.getCompressedVectors(); final FieldInfo fieldInfo = field.fieldInfo; - if (randomAccessVectorValues.size() >= minimumBatchSizeForQuantization) { - pqVectors = getPQVectors(randomAccessVectorValues, fieldInfo); + if (pqVectors != null) { buildScoreProvider = BuildScoreProvider.pqBuildScoreProvider( JVectorFormat.toJVectorSimilarity(fieldInfo.getVectorSimilarityFunction()), pqVectors); } else { // Not enough vectors for quantization; use full precision vectors instead - pqVectors = null; buildScoreProvider = BuildScoreProvider.randomAccessScoreProvider( randomAccessVectorValues, @@ -470,10 +470,18 @@ static class FieldWriter extends KnnFieldVectorsWriter { private final List> vectors = new ArrayList<>(); private final DocsWithFieldSet docIds; - FieldWriter(FieldInfo fieldInfo) { + // PQ fields + private final int pqThreshold; + private final int pqSubspaceCount; + private MutablePQVectors pqVectors; + + FieldWriter(FieldInfo fieldInfo, int pqThreshold, int pqSubspaceCount) { /** For creating a new field from a flat field vectors writer. */ this.fieldInfo = fieldInfo; this.docIds = new DocsWithFieldSet(); + this.pqThreshold = pqThreshold; + this.pqSubspaceCount = pqSubspaceCount; + this.pqVectors = null; } @Override @@ -485,7 +493,29 @@ public void addValue(int docID, float[] vectorValue) throws IOException { + "\" appears more than once in this document (only one value is allowed per field)"); } docIds.add(docID); - vectors.add(VECTOR_TYPE_SUPPORT.createFloatVector(copyValue(vectorValue))); + final var vector = VECTOR_TYPE_SUPPORT.createFloatVector(copyValue(vectorValue)); + vectors.add(vector); + + if (pqVectors != null) { + pqVectors.encodeAndSet(vectors.size() - 1, vector); + } else if (vectors.size() > pqThreshold) { + final boolean globallyCenter = + switch (fieldInfo.getVectorSimilarityFunction()) { + case EUCLIDEAN -> true; + case COSINE, DOT_PRODUCT, MAXIMUM_INNER_PRODUCT -> false; + }; + final int pqCenterCount = Math.min(256, vectors.size()); + final var pq = + ProductQuantization.compute( + toRandomAccessVectorValues(), + pqSubspaceCount, + pqCenterCount, + globallyCenter); + pqVectors = new MutablePQVectors(pq); + for (int i = 0; i < vectors.size(); ++i) { + pqVectors.encodeAndSet(i, vectors.get(i)); + } + } } @Override @@ -497,6 +527,10 @@ public RandomAccessVectorValues toRandomAccessVectorValues() { return new ListRandomAccessVectorValues(vectors, fieldInfo.getVectorDimension()); } + public PQVectors getCompressedVectors() { + return pqVectors; + } + @Override public long ramBytesUsed() { return SHALLOW_SIZE From dc3dba6090aa95635df0df0892af84ba042a4f77 Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Thu, 4 Dec 2025 14:52:46 +0000 Subject: [PATCH 77/86] Use bulk read methods where possible (requires JVector byte-order pull 577) --- .../sandbox/codecs/jvector/JVectorIndexWriter.java | 10 ++++++++++ .../codecs/jvector/JVectorRandomAccessReader.java | 14 +++----------- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git 
a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorIndexWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorIndexWriter.java index e4a03571f9f3..5cbfece4c0e1 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorIndexWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorIndexWriter.java @@ -19,6 +19,9 @@ import io.github.jbellis.jvector.disk.IndexWriter; import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; + import org.apache.lucene.store.IndexOutput; /** @@ -97,6 +100,13 @@ public void writeFloat(float v) throws IOException { indexOutputDelegate.writeInt(Float.floatToIntBits(v)); } + @Override + public void writeFloats(float[] floats, int offset, int count) throws IOException { + final ByteBuffer buf = ByteBuffer.allocate(count * Float.BYTES); + buf.order(ByteOrder.LITTLE_ENDIAN).asFloatBuffer().put(floats, offset, count); + write(buf.array()); + } + @Override public void writeDouble(double v) throws IOException { writeLong(Double.doubleToLongBits(v)); diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorRandomAccessReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorRandomAccessReader.java index 3eda89105ce3..5374b822795e 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorRandomAccessReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorRandomAccessReader.java @@ -22,7 +22,6 @@ import java.io.EOFException; import java.io.IOException; import java.nio.ByteBuffer; -import java.nio.FloatBuffer; import org.apache.lucene.store.IndexInput; /// Implements JVector reader capabilities over a Lucene IndexInput @@ -92,24 +91,17 @@ public void readFully(ByteBuffer buffer) throws IOException { @Override public void readFully(long[] vector) throws IOException { - for (int i = 0; i < vector.length; i++) { - vector[i] = readLong(); - } + indexInputDelegate.readLongs(vector, 0, vector.length); } @Override public void read(int[] ints, int offset, int count) throws IOException { - for (int i = 0; i < count; i++) { - ints[offset + i] = readInt(); - } + indexInputDelegate.readInts(ints, offset, count); } @Override public void read(float[] floats, int offset, int count) throws IOException { - final ByteBuffer byteBuffer = ByteBuffer.allocate(Float.BYTES * count); - indexInputDelegate.readBytes(byteBuffer.array(), offset, Float.BYTES * count); - FloatBuffer buffer = byteBuffer.asFloatBuffer(); - buffer.get(floats, offset, count); + indexInputDelegate.readFloats(floats, offset, count); } @Override From 9ef6dd913529f76b2b91ae8bfe352d5999214a16 Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Fri, 5 Dec 2025 16:30:04 +0000 Subject: [PATCH 78/86] Move BuildScoreProvider to FieldWriter --- .../sandbox/codecs/jvector/JVectorWriter.java | 72 +++++++++++++++---- 1 file changed, 58 insertions(+), 14 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java index 0866e0af30ef..7f659dc576bf 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java @@ -29,6 +29,7 @@ import io.github.jbellis.jvector.graph.disk.feature.FeatureId; import 
io.github.jbellis.jvector.graph.disk.feature.InlineVectors; import io.github.jbellis.jvector.graph.similarity.BuildScoreProvider; +import io.github.jbellis.jvector.graph.similarity.SearchScoreProvider; import io.github.jbellis.jvector.quantization.MutablePQVectors; import io.github.jbellis.jvector.quantization.PQVectors; import io.github.jbellis.jvector.quantization.ProductQuantization; @@ -42,6 +43,7 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.List; +import java.util.Objects; import java.util.concurrent.CompletableFuture; import java.util.concurrent.Executor; import java.util.function.IntUnaryOperator; @@ -217,21 +219,9 @@ public void flush(int maxDoc, Sorter.DocMap sortMap) throws IOException { ordinalMapper = null; } final RandomAccessVectorValues randomAccessVectorValues = field.toRandomAccessVectorValues(); - final BuildScoreProvider buildScoreProvider; + final BuildScoreProvider buildScoreProvider = field.buildScoreProvider; final PQVectors pqVectors = field.getCompressedVectors(); final FieldInfo fieldInfo = field.fieldInfo; - if (pqVectors != null) { - buildScoreProvider = - BuildScoreProvider.pqBuildScoreProvider( - JVectorFormat.toJVectorSimilarity(fieldInfo.getVectorSimilarityFunction()), - pqVectors); - } else { - // Not enough vectors for quantization; use full precision vectors instead - buildScoreProvider = - BuildScoreProvider.randomAccessScoreProvider( - randomAccessVectorValues, - JVectorFormat.toJVectorSimilarity(fieldInfo.getVectorSimilarityFunction())); - } final GraphNodeIdToDocMap graphNodeIdToDocMap = new GraphNodeIdToDocMap(newDocIds); final var graph = @@ -467,9 +457,11 @@ static class FieldWriter extends KnnFieldVectorsWriter { private final FieldInfo fieldInfo; // The ordering of docIds matches the ordering of vectors, the index in this list corresponds to // the jVector ordinal - private final List> vectors = new ArrayList<>(); + private final List> vectors; private final DocsWithFieldSet docIds; + private final DelegatingBuildScoreProvider buildScoreProvider; + // PQ fields private final int pqThreshold; private final int pqSubspaceCount; @@ -478,7 +470,17 @@ static class FieldWriter extends KnnFieldVectorsWriter { FieldWriter(FieldInfo fieldInfo, int pqThreshold, int pqSubspaceCount) { /** For creating a new field from a flat field vectors writer. 
*/ this.fieldInfo = fieldInfo; + this.vectors = new ArrayList<>(); this.docIds = new DocsWithFieldSet(); + + final var similarityFunction = + JVectorFormat.toJVectorSimilarity(fieldInfo.getVectorSimilarityFunction()); + this.buildScoreProvider = + new DelegatingBuildScoreProvider( + BuildScoreProvider.randomAccessScoreProvider( + toRandomAccessVectorValues(), + similarityFunction)); + this.pqThreshold = pqThreshold; this.pqSubspaceCount = pqSubspaceCount; this.pqVectors = null; @@ -515,6 +517,11 @@ public void addValue(int docID, float[] vectorValue) throws IOException { for (int i = 0; i < vectors.size(); ++i) { pqVectors.encodeAndSet(i, vectors.get(i)); } + + final var similarityFunction = + JVectorFormat.toJVectorSimilarity(fieldInfo.getVectorSimilarityFunction()); + buildScoreProvider.setDelegate( + BuildScoreProvider.pqBuildScoreProvider(similarityFunction, pqVectors)); } } @@ -539,6 +546,43 @@ public long ramBytesUsed() { } } + static final class DelegatingBuildScoreProvider implements BuildScoreProvider { + BuildScoreProvider delegate; + + DelegatingBuildScoreProvider(BuildScoreProvider delegate) { + this.delegate = Objects.requireNonNull(delegate); + } + + public void setDelegate(BuildScoreProvider delegate) { + this.delegate = Objects.requireNonNull(delegate); + } + + @Override + public boolean isExact() { + return delegate.isExact(); + } + + @Override + public VectorFloat approximateCentroid() { + return delegate.approximateCentroid(); + } + + @Override + public SearchScoreProvider searchProviderFor(VectorFloat vector) { + return delegate.searchProviderFor(vector); + } + + @Override + public SearchScoreProvider searchProviderFor(int node1) { + return delegate.searchProviderFor(node1); + } + + @Override + public SearchScoreProvider diversityProviderFor(int node1) { + return delegate.diversityProviderFor(node1); + } + } + private void mergeAndWriteField(FieldInfo fieldInfo, MergeState mergeState) throws IOException { assert fieldInfo.hasVectorValues(); final int dimension = fieldInfo.getVectorDimension(); From 4b25b4bf09e1864481c972c89851a85bbe828b93 Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Fri, 5 Dec 2025 16:56:32 +0000 Subject: [PATCH 79/86] Use ImmutableGraphIndex for writeField --- .../apache/lucene/sandbox/codecs/jvector/JVectorWriter.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java index 7f659dc576bf..ecb479118f3e 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java @@ -20,6 +20,7 @@ import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsReader.readVectorEncoding; import io.github.jbellis.jvector.graph.GraphIndexBuilder; +import io.github.jbellis.jvector.graph.ImmutableGraphIndex; import io.github.jbellis.jvector.graph.ListRandomAccessVectorValues; import io.github.jbellis.jvector.graph.OnHeapGraphIndex; import io.github.jbellis.jvector.graph.RandomAccessVectorValues; @@ -260,7 +261,7 @@ private void writeField( PQVectors pqVectors, OrdinalMapper ordinalMapper, GraphNodeIdToDocMap graphNodeIdToDocMap, - OnHeapGraphIndex graph) + ImmutableGraphIndex graph) throws IOException { final var vectorIndexFieldMetadata = writeGraph( @@ -284,7 +285,7 @@ private void writeField( * @throws IOException IOException */ private 
VectorIndexFieldMetadata writeGraph( - OnHeapGraphIndex graph, + ImmutableGraphIndex graph, RandomAccessVectorValues randomAccessVectorValues, FieldInfo fieldInfo, PQVectors pqVectors, From 54959f50d4c3430bbca5c44bf8c78b5ef38055dd Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Fri, 5 Dec 2025 16:59:00 +0000 Subject: [PATCH 80/86] Build graph while adding docs --- .../sandbox/codecs/jvector/JVectorWriter.java | 48 +++++++++++++++---- 1 file changed, 40 insertions(+), 8 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java index ecb479118f3e..ec6543555dfb 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java @@ -181,7 +181,16 @@ public KnnFieldVectorsWriter addField(FieldInfo fieldInfo) throws IOException throw new UnsupportedOperationException(errorMessage); } final int M = numberOfSubspacesPerVectorSupplier.applyAsInt(fieldInfo.getVectorDimension()); - final FieldWriter newField = new FieldWriter(fieldInfo, minimumBatchSizeForQuantization, M); + final FieldWriter newField = + new FieldWriter( + fieldInfo, + maxConn, + beamWidth, + degreeOverflow, + alpha, + hierarchyEnabled, + minimumBatchSizeForQuantization, + M); fields.add(newField); return newField; @@ -220,13 +229,9 @@ public void flush(int maxDoc, Sorter.DocMap sortMap) throws IOException { ordinalMapper = null; } final RandomAccessVectorValues randomAccessVectorValues = field.toRandomAccessVectorValues(); - final BuildScoreProvider buildScoreProvider = field.buildScoreProvider; final PQVectors pqVectors = field.getCompressedVectors(); - final FieldInfo fieldInfo = field.fieldInfo; - + final ImmutableGraphIndex graph = field.getGraphIndex(); final GraphNodeIdToDocMap graphNodeIdToDocMap = new GraphNodeIdToDocMap(newDocIds); - final var graph = - getGraph(buildScoreProvider, randomAccessVectorValues, fieldInfo, Runnable::run); writeField( field.fieldInfo, randomAccessVectorValues, @@ -461,6 +466,7 @@ static class FieldWriter extends KnnFieldVectorsWriter { private final List> vectors; private final DocsWithFieldSet docIds; + private GraphIndexBuilder indexBuilder; private final DelegatingBuildScoreProvider buildScoreProvider; // PQ fields @@ -468,7 +474,15 @@ static class FieldWriter extends KnnFieldVectorsWriter { private final int pqSubspaceCount; private MutablePQVectors pqVectors; - FieldWriter(FieldInfo fieldInfo, int pqThreshold, int pqSubspaceCount) { + FieldWriter( + FieldInfo fieldInfo, + int maxConn, + int beamWidth, + float degreeOverflow, + float alpha, + boolean hierarchyEnabled, + int pqThreshold, + int pqSubspaceCount) { /** For creating a new field from a flat field vectors writer. 
*/ this.fieldInfo = fieldInfo; this.vectors = new ArrayList<>(); @@ -481,6 +495,15 @@ static class FieldWriter extends KnnFieldVectorsWriter { BuildScoreProvider.randomAccessScoreProvider( toRandomAccessVectorValues(), similarityFunction)); + this.indexBuilder = + new GraphIndexBuilder( + buildScoreProvider, + fieldInfo.getVectorDimension(), + maxConn, + beamWidth, + degreeOverflow, + alpha, + hierarchyEnabled); this.pqThreshold = pqThreshold; this.pqSubspaceCount = pqSubspaceCount; @@ -495,12 +518,13 @@ public void addValue(int docID, float[] vectorValue) throws IOException { + fieldInfo.name + "\" appears more than once in this document (only one value is allowed per field)"); } + final int ord = vectors.size(); docIds.add(docID); final var vector = VECTOR_TYPE_SUPPORT.createFloatVector(copyValue(vectorValue)); vectors.add(vector); if (pqVectors != null) { - pqVectors.encodeAndSet(vectors.size() - 1, vector); + pqVectors.encodeAndSet(ord, vector); } else if (vectors.size() > pqThreshold) { final boolean globallyCenter = switch (fieldInfo.getVectorSimilarityFunction()) { @@ -523,7 +547,10 @@ public void addValue(int docID, float[] vectorValue) throws IOException { JVectorFormat.toJVectorSimilarity(fieldInfo.getVectorSimilarityFunction()); buildScoreProvider.setDelegate( BuildScoreProvider.pqBuildScoreProvider(similarityFunction, pqVectors)); + indexBuilder = GraphIndexBuilder.rescore(indexBuilder, buildScoreProvider); } + + indexBuilder.addGraphNode(ord, buildScoreProvider.searchProviderFor(vector)); } @Override @@ -539,6 +566,11 @@ public PQVectors getCompressedVectors() { return pqVectors; } + public ImmutableGraphIndex getGraphIndex() { + indexBuilder.cleanup(); + return indexBuilder.getGraph(); + } + @Override public long ramBytesUsed() { return SHALLOW_SIZE From ab9e92a5d7ef0aade7af0eedf3aa41f7d7eb247b Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Fri, 5 Dec 2025 19:50:11 +0000 Subject: [PATCH 81/86] Support maxDegrees per-layer --- .../sandbox/codecs/jvector/JVectorFormat.java | 27 ++++++++++++++++--- .../sandbox/codecs/jvector/JVectorWriter.java | 20 +++++++------- 2 files changed, 35 insertions(+), 12 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFormat.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFormat.java index 25b0b3da6d5c..07a4f31f6bad 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFormat.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFormat.java @@ -18,6 +18,7 @@ package org.apache.lucene.sandbox.codecs.jvector; import java.io.IOException; +import java.util.List; import java.util.function.IntUnaryOperator; import org.apache.lucene.codecs.KnnVectorsFormat; import org.apache.lucene.codecs.KnnVectorsReader; @@ -43,7 +44,7 @@ public class JVectorFormat extends KnnVectorsFormat { public static final float DEFAULT_ALPHA = 2f; public static final boolean DEFAULT_HIERARCHY_ENABLED = true; - private final int maxConn; + private final List maxDegrees; private final int beamWidth; // As a function of the original dimension private final IntUnaryOperator numberOfSubspacesPerVectorSupplier; @@ -104,8 +105,28 @@ public JVectorFormat( IntUnaryOperator numberOfSubspacesPerVectorSupplier, int minBatchSizeForQuantization, boolean hierarchyEnabled) { + this( + name, + hierarchyEnabled ? 
List.of(maxConn * 2, maxConn) : List.of(maxConn), + beamWidth, + neighborOverflow, + alpha, + numberOfSubspacesPerVectorSupplier, + minBatchSizeForQuantization, + hierarchyEnabled); + } + + public JVectorFormat( + String name, + List maxDegrees, + int beamWidth, + float neighborOverflow, + float alpha, + IntUnaryOperator numberOfSubspacesPerVectorSupplier, + int minBatchSizeForQuantization, + boolean hierarchyEnabled) { super(name); - this.maxConn = maxConn; + this.maxDegrees = maxDegrees; this.beamWidth = beamWidth; this.numberOfSubspacesPerVectorSupplier = numberOfSubspacesPerVectorSupplier; this.minBatchSizeForQuantization = minBatchSizeForQuantization; @@ -118,7 +139,7 @@ public JVectorFormat( public KnnVectorsWriter fieldsWriter(SegmentWriteState state) throws IOException { return new JVectorWriter( state, - maxConn, + maxDegrees, beamWidth, neighborOverflow, alpha, diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java index ec6543555dfb..a66ca69da824 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java @@ -108,7 +108,7 @@ public class JVectorWriter extends KnnVectorsWriter { private final IndexOutput meta; private final IndexOutput data; - private final int maxConn; + private final List maxDegrees; private final int beamWidth; private final float degreeOverflow; private final float alpha; @@ -123,7 +123,7 @@ public class JVectorWriter extends KnnVectorsWriter { public JVectorWriter( SegmentWriteState segmentWriteState, - int maxConn, + List maxDegrees, int beamWidth, float degreeOverflow, float alpha, @@ -131,7 +131,7 @@ public JVectorWriter( int minimumBatchSizeForQuantization, boolean hierarchyEnabled) throws IOException { - this.maxConn = maxConn; + this.maxDegrees = maxDegrees; this.beamWidth = beamWidth; this.degreeOverflow = degreeOverflow; this.alpha = alpha; @@ -184,7 +184,7 @@ public KnnFieldVectorsWriter addField(FieldInfo fieldInfo) throws IOException final FieldWriter newField = new FieldWriter( fieldInfo, - maxConn, + maxDegrees, beamWidth, degreeOverflow, alpha, @@ -476,7 +476,7 @@ static class FieldWriter extends KnnFieldVectorsWriter { FieldWriter( FieldInfo fieldInfo, - int maxConn, + List maxDegrees, int beamWidth, float degreeOverflow, float alpha, @@ -499,11 +499,12 @@ static class FieldWriter extends KnnFieldVectorsWriter { new GraphIndexBuilder( buildScoreProvider, fieldInfo.getVectorDimension(), - maxConn, + maxDegrees, beamWidth, degreeOverflow, alpha, - hierarchyEnabled); + hierarchyEnabled, + true); this.pqThreshold = pqThreshold; this.pqSubspaceCount = pqSubspaceCount; @@ -858,11 +859,12 @@ public OnHeapGraphIndex getGraph( new GraphIndexBuilder( buildScoreProvider, fieldInfo.getVectorDimension(), - maxConn, + maxDegrees, beamWidth, degreeOverflow, alpha, - hierarchyEnabled); + hierarchyEnabled, + true); /* * We cannot always use randomAccessVectorValues for the graph building From 528b6d9d60d43c2ecda6101b924ab822eda0dcfd Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Mon, 8 Dec 2025 20:38:52 +0000 Subject: [PATCH 82/86] Start largestQuantizedReaderIndex at -1 --- .../lucene/sandbox/codecs/jvector/JVectorWriter.java | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java 
b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java index a66ca69da824..eee7ef0d1c6e 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java @@ -678,11 +678,15 @@ private void mergeAndWriteField(FieldInfo fieldInfo, MergeState mergeState) thro i -> ordToReaderOrd[i]); // Find the largest quantized reader to re-use its PQ codebook, if possible - int largestQuantizedReaderIndex = 0; + int largestQuantizedReaderIndex = -1; ProductQuantization pq = null; for (int i = 0; i < liveDocCounts.length; ++i) { - if (liveDocCounts[i] > liveDocCounts[largestQuantizedReaderIndex]) { - if (mergeState.knnVectorsReaders[i] instanceof JVectorReader jVectorReader) { + if (liveDocCounts[i] == 0) { + continue; + } + final var knnReader = mergeState.knnVectorsReaders[i].unwrapReaderForField(fieldInfo.name); + if (knnReader instanceof JVectorReader jVectorReader) { + if (pq == null || liveDocCounts[i] > liveDocCounts[largestQuantizedReaderIndex]) { final var maybeNewPq = jVectorReader.getProductQuantizationForField(fieldInfo.name); if (maybeNewPq.isPresent()) { largestQuantizedReaderIndex = i; From fe3c7832cd6bb731c1b2b4859f48b4107255be14 Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Mon, 8 Dec 2025 20:40:13 +0000 Subject: [PATCH 83/86] fixup! Move PQ encoding to FieldWriter.addValue instead of flush --- .../sandbox/codecs/jvector/JVectorWriter.java | 59 ++++++++----------- 1 file changed, 26 insertions(+), 33 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java index eee7ef0d1c6e..8c0c264030fc 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java @@ -342,26 +342,6 @@ private VectorIndexFieldMetadata writeGraph( } } - private PQVectors getPQVectors( - RandomAccessVectorValues randomAccessVectorValues, FieldInfo fieldInfo) throws IOException { - final boolean globallyCenter = - switch (fieldInfo.getVectorSimilarityFunction()) { - case EUCLIDEAN -> true; - case COSINE, DOT_PRODUCT, MAXIMUM_INNER_PRODUCT -> false; - }; - final int M = - numberOfSubspacesPerVectorSupplier.applyAsInt(randomAccessVectorValues.dimension()); - final var numberOfClustersPerSubspace = - Math.min(256, randomAccessVectorValues.size()); // number of centroids per - // subspace - - ProductQuantization pq = - ProductQuantization.compute( - randomAccessVectorValues, M, numberOfClustersPerSubspace, globallyCenter); - - return (PQVectors) pq.encodeAll(randomAccessVectorValues); - } - /// Metadata about the index to be persisted on disk public static class VectorIndexFieldMetadata { final int fieldNumber; @@ -527,18 +507,11 @@ public void addValue(int docID, float[] vectorValue) throws IOException { if (pqVectors != null) { pqVectors.encodeAndSet(ord, vector); } else if (vectors.size() > pqThreshold) { - final boolean globallyCenter = - switch (fieldInfo.getVectorSimilarityFunction()) { - case EUCLIDEAN -> true; - case COSINE, DOT_PRODUCT, MAXIMUM_INNER_PRODUCT -> false; - }; - final int pqCenterCount = Math.min(256, vectors.size()); - final var pq = - ProductQuantization.compute( + final ProductQuantization pq = + trainPQ( toRandomAccessVectorValues(), pqSubspaceCount, - pqCenterCount, - globallyCenter); + 
fieldInfo.getVectorSimilarityFunction()); pqVectors = new MutablePQVectors(pq); for (int i = 0; i < vectors.size(); ++i) { pqVectors.encodeAndSet(i, vectors.get(i)); @@ -551,7 +524,7 @@ public void addValue(int docID, float[] vectorValue) throws IOException { indexBuilder = GraphIndexBuilder.rescore(indexBuilder, buildScoreProvider); } - indexBuilder.addGraphNode(ord, buildScoreProvider.searchProviderFor(vector)); + indexBuilder.addGraphNode(ord, vector); } @Override @@ -713,8 +686,9 @@ private void mergeAndWriteField(FieldInfo fieldInfo, MergeState mergeState) thro } pqVectors = (PQVectors) newPq.encodeAll(ravv); } else if (ravv.size() >= minimumBatchSizeForQuantization) { - // No pre-existing codebooks, check if we have enough vectors to trigger quantization - pqVectors = getPQVectors(ravv, fieldInfo); + final int M = numberOfSubspacesPerVectorSupplier.applyAsInt(ravv.dimension()); + final ProductQuantization newPQ = trainPQ(ravv, M, fieldInfo.getVectorSimilarityFunction()); + pqVectors = newPQ.encodeAll(ravv, SIMD_POOL); } else { pqVectors = null; } @@ -894,6 +868,25 @@ public OnHeapGraphIndex getGraph( return graphIndex; } + private static ProductQuantization trainPQ( + RandomAccessVectorValues vectors, + int M, + org.apache.lucene.index.VectorSimilarityFunction similarityFunction) { + final boolean globallyCenter = + switch (similarityFunction) { + case EUCLIDEAN -> true; + case COSINE, DOT_PRODUCT, MAXIMUM_INNER_PRODUCT -> false; + }; + final int numberOfClustersPerSubspace = Math.min(256, vectors.size()); + // This extracts a random minimal subset of the vectors for training the PQ codebooks + return + ProductQuantization.compute( + vectors, + M, + numberOfClustersPerSubspace, + globallyCenter); + } + static class RandomAccessVectorValuesOverVectorValues implements RandomAccessVectorValues { private final FloatVectorValues values; From 029a116fda0bcd11100f6586dec7539d67e567bf Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Mon, 8 Dec 2025 21:45:10 +0000 Subject: [PATCH 84/86] Don't re-use PQ codebooks --- .../sandbox/codecs/jvector/JVectorWriter.java | 38 +------------------ 1 file changed, 1 insertion(+), 37 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java index 8c0c264030fc..5cc8452abeb5 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java @@ -618,7 +618,6 @@ private void mergeAndWriteField(FieldInfo fieldInfo, MergeState mergeState) thro // These arrays may be larger than strictly necessary if there are deleted docs/missing fields final int totalMaxDocs = Arrays.stream(mergeState.maxDocs).reduce(0, Math::addExact); - final int[] liveDocCounts = new int[mergeCount]; final DocsWithFieldSet docIds = new DocsWithFieldSet(); final int[] ordToReaderIndex = new int[totalMaxDocs]; final int[] ordToReaderOrd = new int[totalMaxDocs]; @@ -627,8 +626,6 @@ private void mergeAndWriteField(FieldInfo fieldInfo, MergeState mergeState) thro int ord = 0; final var docIdMerger = DocIDMerger.of(subs, mergeState.needsIndexSort); for (var sub = docIdMerger.next(); sub != null; sub = docIdMerger.next()) { - final int readerIndex = sub.readerIndex; - liveDocCounts[readerIndex] += 1; docIds.add(sub.mappedDocID); ordToReaderIndex[ord] = sub.readerIndex; ordToReaderOrd[ord] = sub.index(); @@ -650,42 +647,9 @@ private void 
mergeAndWriteField(FieldInfo fieldInfo, MergeState mergeState) thro i -> ordToReaderIndex[i], i -> ordToReaderOrd[i]); - // Find the largest quantized reader to re-use its PQ codebook, if possible - int largestQuantizedReaderIndex = -1; - ProductQuantization pq = null; - for (int i = 0; i < liveDocCounts.length; ++i) { - if (liveDocCounts[i] == 0) { - continue; - } - final var knnReader = mergeState.knnVectorsReaders[i].unwrapReaderForField(fieldInfo.name); - if (knnReader instanceof JVectorReader jVectorReader) { - if (pq == null || liveDocCounts[i] > liveDocCounts[largestQuantizedReaderIndex]) { - final var maybeNewPq = jVectorReader.getProductQuantizationForField(fieldInfo.name); - if (maybeNewPq.isPresent()) { - largestQuantizedReaderIndex = i; - pq = maybeNewPq.get(); - } - } - } - } - // Perform PQ if applicable final PQVectors pqVectors; - if (pq != null) { - // Refine the leadingCompressor with the remaining vectors in the merge - ProductQuantization newPq = pq; - for (int i = 0; i < mergeCount; i++) { - if (i == largestQuantizedReaderIndex || vectors[i] == null) { - // Skip the reader associated with the re-used PQ codebook - continue; - } - final FloatVectorValues values = vectors[i]; - final RandomAccessVectorValues randomAccessVectorValues = - new RandomAccessVectorValuesOverVectorValues(values); - newPq = newPq.refine(randomAccessVectorValues); - } - pqVectors = (PQVectors) newPq.encodeAll(ravv); - } else if (ravv.size() >= minimumBatchSizeForQuantization) { + if (ravv.size() >= minimumBatchSizeForQuantization) { final int M = numberOfSubspacesPerVectorSupplier.applyAsInt(ravv.dimension()); final ProductQuantization newPQ = trainPQ(ravv, M, fieldInfo.getVectorSimilarityFunction()); pqVectors = newPQ.encodeAll(ravv, SIMD_POOL); From 5976164a8bea9f9c3e836af17b5cb2495e35dbab Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Mon, 8 Dec 2025 21:49:53 +0000 Subject: [PATCH 85/86] fixup! fixup! 
Move PQ encoding to FieldWriter.addValue instead of flush --- .../org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java index 5cc8452abeb5..9b0e14962ac3 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java @@ -652,7 +652,7 @@ private void mergeAndWriteField(FieldInfo fieldInfo, MergeState mergeState) thro if (ravv.size() >= minimumBatchSizeForQuantization) { final int M = numberOfSubspacesPerVectorSupplier.applyAsInt(ravv.dimension()); final ProductQuantization newPQ = trainPQ(ravv, M, fieldInfo.getVectorSimilarityFunction()); - pqVectors = newPQ.encodeAll(ravv, SIMD_POOL); + pqVectors = (PQVectors) newPQ.encodeAll(ravv); } else { pqVectors = null; } From d2738f6a86a0bf93587bdd62b048a80f59145a3a Mon Sep 17 00:00:00 2001 From: Alec Bernardi Date: Mon, 8 Dec 2025 23:11:57 +0000 Subject: [PATCH 86/86] Small re-organize PQ merge --- .../sandbox/codecs/jvector/JVectorWriter.java | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java index 9b0e14962ac3..fb9b43f347a1 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java @@ -647,28 +647,25 @@ private void mergeAndWriteField(FieldInfo fieldInfo, MergeState mergeState) thro i -> ordToReaderIndex[i], i -> ordToReaderOrd[i]); + final BuildScoreProvider buildScoreProvider; + final var similarityFunction = + JVectorFormat.toJVectorSimilarity(fieldInfo.getVectorSimilarityFunction()); + // Perform PQ if applicable final PQVectors pqVectors; if (ravv.size() >= minimumBatchSizeForQuantization) { final int M = numberOfSubspacesPerVectorSupplier.applyAsInt(ravv.dimension()); final ProductQuantization newPQ = trainPQ(ravv, M, fieldInfo.getVectorSimilarityFunction()); pqVectors = (PQVectors) newPQ.encodeAll(ravv); - } else { - pqVectors = null; - } - - final BuildScoreProvider buildScoreProvider; - final var similarityFunction = - JVectorFormat.toJVectorSimilarity(fieldInfo.getVectorSimilarityFunction()); - if (pqVectors != null) { - // Re-use PQ codebooks to build a new graph from scratch buildScoreProvider = BuildScoreProvider.pqBuildScoreProvider(similarityFunction, pqVectors); // Pre-init the diversity provider here to avoid doing it lazily (as it could block the SIMD // threads) buildScoreProvider.diversityProviderFor(0); } else { + pqVectors = null; buildScoreProvider = BuildScoreProvider.randomAccessScoreProvider(ravv, similarityFunction); } + final var graphNodeIdToDocMap = new GraphNodeIdToDocMap(docIds); final var graph = getGraph(buildScoreProvider, ravv, fieldInfo, mergeState.intraMergeTaskExecutor);