diff --git a/lucene/licenses/agrona-1.20.0.jar.sha1 b/lucene/licenses/agrona-1.20.0.jar.sha1 new file mode 100644 index 000000000000..badef8d6e169 --- /dev/null +++ b/lucene/licenses/agrona-1.20.0.jar.sha1 @@ -0,0 +1 @@ +00580b67864f7739bf7778162f418ada69fa3037 diff --git a/lucene/licenses/agrona-LICENSE-ASL.txt b/lucene/licenses/agrona-LICENSE-ASL.txt new file mode 100644 index 000000000000..91d486281cdf --- /dev/null +++ b/lucene/licenses/agrona-LICENSE-ASL.txt @@ -0,0 +1,201 @@ +Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." 
+ + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "{}" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. 
We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright {yyyy} {name of copyright owner} + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/lucene/licenses/agrona-NOTICE.txt b/lucene/licenses/agrona-NOTICE.txt new file mode 100644 index 000000000000..795926439ada --- /dev/null +++ b/lucene/licenses/agrona-NOTICE.txt @@ -0,0 +1,6 @@ +This product includes software developed by the Agrona project. +https://github.com/real-logic/agrona + +Copyright © 2014-2023 Real Logic Limited + +Licensed under the Apache License, Version 2.0. diff --git a/lucene/licenses/commons-math3-3.6.1.jar.sha1 b/lucene/licenses/commons-math3-3.6.1.jar.sha1 new file mode 100644 index 000000000000..ed9a549757f5 --- /dev/null +++ b/lucene/licenses/commons-math3-3.6.1.jar.sha1 @@ -0,0 +1 @@ +e4ba98f1d4b3c80ec46392f25e094a6a2e58fcbf diff --git a/lucene/licenses/commons-math3-LICENSE-ASL.txt b/lucene/licenses/commons-math3-LICENSE-ASL.txt new file mode 100644 index 000000000000..a08b1c749765 --- /dev/null +++ b/lucene/licenses/commons-math3-LICENSE-ASL.txt @@ -0,0 +1,456 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). 
+ + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + +Apache Commons Math includes the following code provided to the ASF under the +Apache License 2.0: + + - The inverse error function implementation in the Erf class is based on CUDA + code developed by Mike Giles, Oxford-Man Institute of Quantitative Finance, + and published in GPU Computing Gems, volume 2, 2010 (grant received on + March 23th 2013) + - The LinearConstraint, LinearObjectiveFunction, LinearOptimizer, + RelationShip, SimplexSolver and SimplexTableau classes in package + org.apache.commons.math3.optimization.linear include software developed by + Benjamin McCann (http://www.benmccann.com) and distributed with + the following copyright: Copyright 2009 Google Inc. 
(grant received on + March 16th 2009) + - The class "org.apache.commons.math3.exception.util.LocalizedFormatsTest" which + is an adapted version of "OrekitMessagesTest" test class for the Orekit library + - The "org.apache.commons.math3.analysis.interpolation.HermiteInterpolator" + has been imported from the Orekit space flight dynamics library. + +=============================================================================== + + + +APACHE COMMONS MATH DERIVATIVE WORKS: + +The Apache commons-math library includes a number of subcomponents +whose implementation is derived from original sources written +in C or Fortran. License terms of the original sources +are reproduced below. + +=============================================================================== +For the lmder, lmpar and qrsolv Fortran routine from minpack and translated in +the LevenbergMarquardtOptimizer class in package +org.apache.commons.math3.optimization.general +Original source copyright and license statement: + +Minpack Copyright Notice (1999) University of Chicago. All rights reserved + +Redistribution and use in source and binary forms, with or +without modification, are permitted provided that the +following conditions are met: + +1. Redistributions of source code must retain the above +copyright notice, this list of conditions and the following +disclaimer. + +2. Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following +disclaimer in the documentation and/or other materials +provided with the distribution. + +3. The end-user documentation included with the +redistribution, if any, must include the following +acknowledgment: + + "This product includes software developed by the + University of Chicago, as Operator of Argonne National + Laboratory. + +Alternately, this acknowledgment may appear in the software +itself, if and wherever such third-party acknowledgments +normally appear. + +4. WARRANTY DISCLAIMER. THE SOFTWARE IS SUPPLIED "AS IS" +WITHOUT WARRANTY OF ANY KIND. THE COPYRIGHT HOLDER, THE +UNITED STATES, THE UNITED STATES DEPARTMENT OF ENERGY, AND +THEIR EMPLOYEES: (1) DISCLAIM ANY WARRANTIES, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO ANY IMPLIED WARRANTIES +OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE +OR NON-INFRINGEMENT, (2) DO NOT ASSUME ANY LEGAL LIABILITY +OR RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR +USEFULNESS OF THE SOFTWARE, (3) DO NOT REPRESENT THAT USE OF +THE SOFTWARE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS, (4) +DO NOT WARRANT THAT THE SOFTWARE WILL FUNCTION +UNINTERRUPTED, THAT IT IS ERROR-FREE OR THAT ANY ERRORS WILL +BE CORRECTED. + +5. LIMITATION OF LIABILITY. IN NO EVENT WILL THE COPYRIGHT +HOLDER, THE UNITED STATES, THE UNITED STATES DEPARTMENT OF +ENERGY, OR THEIR EMPLOYEES: BE LIABLE FOR ANY INDIRECT, +INCIDENTAL, CONSEQUENTIAL, SPECIAL OR PUNITIVE DAMAGES OF +ANY KIND OR NATURE, INCLUDING BUT NOT LIMITED TO LOSS OF +PROFITS OR LOSS OF DATA, FOR ANY REASON WHATSOEVER, WHETHER +SUCH LIABILITY IS ASSERTED ON THE BASIS OF CONTRACT, TORT +(INCLUDING NEGLIGENCE OR STRICT LIABILITY), OR OTHERWISE, +EVEN IF ANY OF SAID PARTIES HAS BEEN WARNED OF THE +POSSIBILITY OF SUCH LOSS OR DAMAGES. +=============================================================================== + +Copyright and license statement for the odex Fortran routine developed by +E. Hairer and G. 
Wanner and translated in GraggBulirschStoerIntegrator class +in package org.apache.commons.math3.ode.nonstiff: + + +Copyright (c) 2004, Ernst Hairer + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +- Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +=============================================================================== + +Copyright and license statement for the original Mersenne twister C +routines translated in MersenneTwister class in package +org.apache.commons.math3.random: + + Copyright (C) 1997 - 2002, Makoto Matsumoto and Takuji Nishimura, + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + 3. The names of its contributors may not be used to endorse or promote + products derived from this software without specific prior written + permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +=============================================================================== + +The initial code for shuffling an array (originally in class +"org.apache.commons.math3.random.RandomDataGenerator", now replaced by +a method in class "org.apache.commons.math3.util.MathArrays") was +inspired from the algorithm description provided in +"Algorithms", by Ian Craw and John Pulham (University of Aberdeen 1999). +The textbook (containing a proof that the shuffle is uniformly random) is +available here: + http://citeseerx.ist.psu.edu/viewdoc/download;?doi=10.1.1.173.1898&rep=rep1&type=pdf + +=============================================================================== +License statement for the direction numbers in the resource files for Sobol sequences. + +----------------------------------------------------------------------------- +Licence pertaining to sobol.cc and the accompanying sets of direction numbers + +----------------------------------------------------------------------------- +Copyright (c) 2008, Frances Y. Kuo and Stephen Joe +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the names of the copyright holders nor the names of the + University of New South Wales and the University of Waikato + and its contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +=============================================================================== + +The initial commit of package "org.apache.commons.math3.ml.neuralnet" is +an adapted version of code developed in the context of the Data Processing +and Analysis Consortium (DPAC) of the "Gaia" project of the European Space +Agency (ESA). +=============================================================================== + +The initial commit of the class "org.apache.commons.math3.special.BesselJ" is +an adapted version of code translated from the netlib Fortran program, rjbesl +http://www.netlib.org/specfun/rjbesl by R.J. Cody at Argonne National +Laboratory (USA). There is no license or copyright statement included with the +original Fortran sources. 
+=============================================================================== + + +The BracketFinder (package org.apache.commons.math3.optimization.univariate) +and PowellOptimizer (package org.apache.commons.math3.optimization.general) +classes are based on the Python code in module "optimize.py" (version 0.5) +developed by Travis E. Oliphant for the SciPy library (http://www.scipy.org/) +Copyright © 2003-2009 SciPy Developers. + +SciPy license +Copyright © 2001, 2002 Enthought, Inc. +All rights reserved. + +Copyright © 2003-2013 SciPy Developers. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Enthought nor the names of the SciPy Developers may + be used to endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS” AND ANY +EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +=============================================================================== diff --git a/lucene/licenses/commons-math3-NOTICE.txt b/lucene/licenses/commons-math3-NOTICE.txt new file mode 100644 index 000000000000..5e2a2f91d48a --- /dev/null +++ b/lucene/licenses/commons-math3-NOTICE.txt @@ -0,0 +1,4 @@ +This product includes software developed by the Apache Commons Math project. +https://commons.apache.org/proper/commons-math/ + +Licensed under the Apache License, Version 2.0. diff --git a/lucene/licenses/jvector-4.0.0-rc.5.jar.sha1 b/lucene/licenses/jvector-4.0.0-rc.5.jar.sha1 new file mode 100644 index 000000000000..ae9459b0c93d --- /dev/null +++ b/lucene/licenses/jvector-4.0.0-rc.5.jar.sha1 @@ -0,0 +1 @@ +799740d5484d589c579ba0b9a65ec887ec542123 diff --git a/lucene/licenses/jvector-LICENSE-ASL.txt b/lucene/licenses/jvector-LICENSE-ASL.txt new file mode 100644 index 000000000000..d64569567334 --- /dev/null +++ b/lucene/licenses/jvector-LICENSE-ASL.txt @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. 
+ + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/lucene/licenses/jvector-NOTICE.txt b/lucene/licenses/jvector-NOTICE.txt new file mode 100644 index 000000000000..0542e27d7ef7 --- /dev/null +++ b/lucene/licenses/jvector-NOTICE.txt @@ -0,0 +1,6 @@ +This product includes software developed by the JVector project. +https://github.com/jbellis/jvector + +Copyright © 2023 Jonathan Ellis + +Licensed under the Apache License, Version 2.0. diff --git a/lucene/licenses/snakeyaml-2.4.jar.sha1 b/lucene/licenses/snakeyaml-2.4.jar.sha1 new file mode 100644 index 000000000000..8739f8c17629 --- /dev/null +++ b/lucene/licenses/snakeyaml-2.4.jar.sha1 @@ -0,0 +1 @@ +e0666b825b796f85521f02360e77f4c92c5a7a07 diff --git a/lucene/licenses/snakeyaml-LICENSE-ASL.txt b/lucene/licenses/snakeyaml-LICENSE-ASL.txt new file mode 100644 index 000000000000..d9a10c0d8e86 --- /dev/null +++ b/lucene/licenses/snakeyaml-LICENSE-ASL.txt @@ -0,0 +1,176 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. 
The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS diff --git a/lucene/licenses/snakeyaml-NOTICE.txt b/lucene/licenses/snakeyaml-NOTICE.txt new file mode 100644 index 000000000000..c1e6931cc149 --- /dev/null +++ b/lucene/licenses/snakeyaml-NOTICE.txt @@ -0,0 +1,4 @@ +This product includes software developed by the SnakeYAML project. +https://bitbucket.org/snakeyaml/snakeyaml + +Licensed under the Apache License, Version 2.0. diff --git a/lucene/sandbox/build.gradle b/lucene/sandbox/build.gradle index daf952f84a8d..6040c651f887 100644 --- a/lucene/sandbox/build.gradle +++ b/lucene/sandbox/build.gradle @@ -16,12 +16,25 @@ */ +plugins { + id 'java-library' +} description = 'Various third party contributions and new ideas' +java { + modularity.inferModulePath = true +} + dependencies { moduleApi project(':lucene:core') moduleApi project(':lucene:queries') moduleApi project(':lucene:facet') moduleTestImplementation project(':lucene:test-framework') + + moduleImplementation('io.github.jbellis:jvector:4.0.0-rc.5') { + exclude group: 'org.slf4j', module: 'slf4j-api' + } + + moduleImplementation 'org.slf4j:slf4j-api:2.0.17' } diff --git a/lucene/sandbox/src/java/module-info.java b/lucene/sandbox/src/java/module-info.java index ee9be3227de2..ea49d9e2b26a 100644 --- a/lucene/sandbox/src/java/module-info.java +++ b/lucene/sandbox/src/java/module-info.java @@ -16,13 +16,16 @@ */ /** Various third party contributions and new ideas */ +@SuppressWarnings("requires-automatic") module org.apache.lucene.sandbox { requires org.apache.lucene.core; requires org.apache.lucene.queries; requires org.apache.lucene.facet; + requires jvector; exports org.apache.lucene.payloads; exports org.apache.lucene.sandbox.codecs.faiss; + exports org.apache.lucene.sandbox.codecs.jvector; exports org.apache.lucene.sandbox.codecs.idversion; exports org.apache.lucene.sandbox.codecs.quantization; exports org.apache.lucene.sandbox.document; @@ -41,5 +44,6 @@ provides org.apache.lucene.codecs.PostingsFormat with org.apache.lucene.sandbox.codecs.idversion.IDVersionPostingsFormat; provides org.apache.lucene.codecs.KnnVectorsFormat with - org.apache.lucene.sandbox.codecs.faiss.FaissKnnVectorsFormat; + org.apache.lucene.sandbox.codecs.faiss.FaissKnnVectorsFormat, + org.apache.lucene.sandbox.codecs.jvector.JVectorFormat; } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/ForceMergesOnlyMergePolicy.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/ForceMergesOnlyMergePolicy.java new file mode 100644 index 000000000000..2e74da91c8d0 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/ForceMergesOnlyMergePolicy.java @@ -0,0 +1,115 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
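The module wiring above registers JVectorFormat as a named KnnVectorsFormat SPI service. A minimal sketch of opting an index into it, assuming default-codec delegation; the JVectorCodec class and its name are illustrative only, not part of this patch (ForceMergesOnlyMergePolicy is added below):

import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.FilterCodec;
import org.apache.lucene.codecs.KnnVectorsFormat;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.sandbox.codecs.jvector.ForceMergesOnlyMergePolicy;
import org.apache.lucene.sandbox.codecs.jvector.JVectorFormat;
import org.apache.lucene.store.Directory;

// Illustrative codec: delegate everything to the default codec except KNN vectors.
class JVectorCodec extends FilterCodec {
  private final KnnVectorsFormat knnFormat = new JVectorFormat();

  JVectorCodec() {
    super("JVectorCodec", Codec.getDefault());
  }

  @Override
  public KnnVectorsFormat knnVectorsFormat() {
    return knnFormat;
  }
}

// Usage sketch; dir is an already-open Directory. With the force-merges-only
// policy, segments stay unmerged until forceMerge, which then produces a
// single jVector graph per vector field.
IndexWriterConfig iwc =
    new IndexWriterConfig()
        .setCodec(new JVectorCodec())
        .setMergePolicy(new ForceMergesOnlyMergePolicy());
try (IndexWriter writer = new IndexWriter(dir, iwc)) {
  // ... add documents with KnnFloatVectorField values ...
  writer.forceMerge(1);
}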
+ */
+
+package org.apache.lucene.sandbox.codecs.jvector;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+import org.apache.lucene.index.MergePolicy;
+import org.apache.lucene.index.MergeTrigger;
+import org.apache.lucene.index.SegmentCommitInfo;
+import org.apache.lucene.index.SegmentInfos;
+
+/**
+ * A merge policy that only merges segments if they are forced. This is useful for testing and
+ * benchmarking purposes. Since it can be used for benchmarks, it ships with the codec itself.
+ */
+public class ForceMergesOnlyMergePolicy extends MergePolicy {
+  private final boolean useCompoundFile;
+
+  public ForceMergesOnlyMergePolicy() {
+    this(false);
+  }
+
+  public ForceMergesOnlyMergePolicy(boolean useCompoundFile) {
+    super();
+    this.useCompoundFile = useCompoundFile;
+  }
+
+  @Override
+  public MergeSpecification findMerges(
+      MergeTrigger mergeTrigger, SegmentInfos segmentInfos, MergeContext mergeContext)
+      throws IOException {
+    return null;
+  }
+
+  @Override
+  public MergeSpecification findForcedMerges(
+      SegmentInfos segmentInfos,
+      int maxSegmentCount,
+      Map<SegmentCommitInfo, Boolean> segmentsToMerge,
+      MergeContext mergeContext)
+      throws IOException {
+    // If the segments are already merged (e.g. there's only 1 segment), or
+    // there are no segments eligible for merging, there is nothing to do.
+    if (isMerged(segmentInfos, maxSegmentCount, segmentsToMerge, mergeContext)) {
+      return null;
+    }
+    final List<SegmentCommitInfo> segments = segmentInfos.asList();
+    MergeSpecification spec = new MergeSpecification();
+
+    final OneMerge merge = new OneMerge(segments);
+    spec.add(merge);
+    return spec;
+  }
+
+  @Override
+  public boolean useCompoundFile(
+      SegmentInfos segmentInfos, SegmentCommitInfo newSegment, MergeContext mergeContext)
+      throws IOException {
+    return useCompoundFile;
+  }
+
+  @Override
+  public MergeSpecification findForcedDeletesMerges(
+      SegmentInfos segmentInfos, MergeContext mergeContext) throws IOException {
+    return null;
+  }
+
+  /**
+   * Returns true if the number of segments eligible for merging is less than or equal to the
+   * specified {@code maxNumSegments}.
+   */
+  protected boolean isMerged(
+      SegmentInfos infos,
+      int maxNumSegments,
+      Map<SegmentCommitInfo, Boolean> segmentsToMerge,
+      MergeContext mergeContext)
+      throws IOException {
+    final int numSegments = infos.size();
+    int numToMerge = 0;
+    SegmentCommitInfo mergeInfo = null;
+    boolean segmentIsOriginal = false;
+    for (int i = 0; i < numSegments && numToMerge <= maxNumSegments; i++) {
+      final SegmentCommitInfo info = infos.info(i);
+      final Boolean isOriginal = segmentsToMerge.get(info);
+      if (isOriginal != null) {
+        segmentIsOriginal = isOriginal;
+        numToMerge++;
+        mergeInfo = info;
+      }
+    }
+
+    return numToMerge <= maxNumSegments
+        && (numToMerge != 1 || !segmentIsOriginal || isMerged(infos, mergeInfo, mergeContext));
+  }
+}
diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/GraphNodeIdToDocMap.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/GraphNodeIdToDocMap.java
new file mode 100644
index 000000000000..0c733c73b34b
--- /dev/null
+++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/GraphNodeIdToDocMap.java
@@ -0,0 +1,176 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.jvector; + +import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.util.Arrays; +import org.apache.lucene.index.DocsWithFieldSet; +import org.apache.lucene.index.KnnVectorValues.DocIndexIterator; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; + +/** + * This class represents the mapping from the Lucene document IDs to the jVector ordinals. This + * mapping is necessary because the jVector ordinals can be different from the Lucene document IDs + * and when lucene documentIDs change after a merge, we need to update this mapping to reflect the + * new document IDs. This requires us to know the previous mapping from the previous merge and the + * new mapping from the current merge. + * + *
<p>
Which means that we also need to persist this mapping to disk to be available across merges. + */ +public class GraphNodeIdToDocMap { + private static final int VERSION = 1; + private final int[] graphNodeIdsToDocIds; + private final int[] docIdsToGraphNodeIds; + + /** + * Constructor that reads the mapping from the index input + * + * @param in The index input + * @throws IOException if an I/O error occurs + */ + public GraphNodeIdToDocMap(IndexInput in) throws IOException { + final int version = in.readInt(); // Read the version + if (version != VERSION) { + throw new IOException("Unsupported version: " + version); + } + int size = in.readVInt(); + int maxDocId = in.readVInt(); + + graphNodeIdsToDocIds = new int[size]; + docIdsToGraphNodeIds = new int[maxDocId]; + Arrays.fill(docIdsToGraphNodeIds, -1); + for (int ord = 0; ord < size; ord++) { + final int docId = in.readVInt(); + graphNodeIdsToDocIds[ord] = docId; + docIdsToGraphNodeIds[docId] = ord; + } + } + + public GraphNodeIdToDocMap(DocsWithFieldSet docs) { + this.graphNodeIdsToDocIds = new int[docs.cardinality()]; + + int ord = 0; + int maxDocId = -1; + final var docsIterator = docs.iterator(); + try { + for (int docId = docsIterator.nextDoc(); + docId != NO_MORE_DOCS; + docId = docsIterator.nextDoc()) { + graphNodeIdsToDocIds[ord++] = docId; + if (docId > maxDocId) { + maxDocId = docId; + } + } + } catch (IOException e) { + // This should never happen; docsIterator should be FixedBitSet or DocSetIterator.all() + throw new UncheckedIOException(e); + } + + this.docIdsToGraphNodeIds = new int[maxDocId + 1]; + Arrays.fill(docIdsToGraphNodeIds, -1); + for (ord = 0; ord < graphNodeIdsToDocIds.length; ++ord) { + docIdsToGraphNodeIds[graphNodeIdsToDocIds[ord]] = ord; + } + } + + /** + * Returns the jVector node id for the given Lucene document ID + * + * @param luceneDocId The Lucene document ID + * @return The jVector ordinal + */ + public int getJVectorNodeId(int luceneDocId) { + return docIdsToGraphNodeIds[luceneDocId]; + } + + /** + * Returns the Lucene document ID for the given jVector node id + * + * @param graphNodeId The jVector ordinal + * @return The Lucene document ID + *
<p>
NOTE: This method is useful when, for example, we want to remap acceptedDocs bitmap from + * Lucene to jVector ordinal bitmap filter + */ + public int getLuceneDocId(int graphNodeId) { + return graphNodeIdsToDocIds[graphNodeId]; + } + + /** + * Writes the mapping to the index output + * + * @param out The index output + * @throws IOException if an I/O error occurs + */ + public void toOutput(IndexOutput out) throws IOException { + out.writeInt(VERSION); + out.writeVInt(graphNodeIdsToDocIds.length); + out.writeVInt(docIdsToGraphNodeIds.length); + for (int ord = 0; ord < graphNodeIdsToDocIds.length; ord++) { + out.writeVInt(graphNodeIdsToDocIds[ord]); + } + } + + public DocIndexIterator iterator() { + return new DocIndexIterator() { + int docId = -1; + + @Override + public int index() { + return docIdsToGraphNodeIds[docId]; + } + + @Override + public int docID() { + return docId; + } + + @Override + public int nextDoc() throws IOException { + while (docId < docIdsToGraphNodeIds.length - 1) { + ++docId; + final int ord = docIdsToGraphNodeIds[docId]; + if (ord >= 0) { + return docId; + } + } + return docId = NO_MORE_DOCS; + } + + @Override + public int advance(int target) throws IOException { + if (target <= docId) { + throw new IllegalArgumentException(); + } else if (target >= docIdsToGraphNodeIds.length) { + return docId = NO_MORE_DOCS; + } + + docId = target - 1; + return nextDoc(); + } + + @Override + public long cost() { + return graphNodeIdsToDocIds.length; + } + }; + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFloatVectorValues.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFloatVectorValues.java new file mode 100644 index 000000000000..ccbe286c776c --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFloatVectorValues.java @@ -0,0 +1,139 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
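To make the on-disk contract of GraphNodeIdToDocMap concrete, here is a round-trip sketch over an in-memory directory (the file name and doc IDs are arbitrary). Ordinals are assigned densely in increasing doc-ID order, and toOutput persists the version, the ordinal count, the doc-ID array length, and then one VInt doc ID per ordinal:

import org.apache.lucene.index.DocsWithFieldSet;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;

// Docs 0, 3 and 7 carry vectors; docs 1, 2, 4, 5, 6 do not.
DocsWithFieldSet docs = new DocsWithFieldSet();
docs.add(0);
docs.add(3);
docs.add(7);

GraphNodeIdToDocMap map = new GraphNodeIdToDocMap(docs);
// Dense ordinals in doc-ID order: ord 0 -> doc 0, ord 1 -> doc 3, ord 2 -> doc 7.
assert map.getJVectorNodeId(3) == 1;
assert map.getLuceneDocId(2) == 7;

// Round-trip through the serialized form read back by GraphNodeIdToDocMap(IndexInput).
try (Directory dir = new ByteBuffersDirectory()) {
  try (IndexOutput out = dir.createOutput("node-to-doc.bin", IOContext.DEFAULT)) {
    map.toOutput(out);
  }
  try (IndexInput in = dir.openInput("node-to-doc.bin", IOContext.DEFAULT)) {
    GraphNodeIdToDocMap restored = new GraphNodeIdToDocMap(in);
    assert restored.getLuceneDocId(1) == 3;
  }
}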
+ */ + +package org.apache.lucene.sandbox.codecs.jvector; + +import io.github.jbellis.jvector.graph.disk.OnDiskGraphIndex; +import io.github.jbellis.jvector.graph.similarity.ScoreFunction; +import io.github.jbellis.jvector.quantization.PQVectors; +import io.github.jbellis.jvector.util.Bits.MatchAllBits; +import io.github.jbellis.jvector.vector.VectorSimilarityFunction; +import io.github.jbellis.jvector.vector.VectorizationProvider; +import io.github.jbellis.jvector.vector.types.VectorFloat; +import io.github.jbellis.jvector.vector.types.VectorTypeSupport; +import java.io.IOException; +import org.apache.lucene.index.FloatVectorValues; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.VectorScorer; + +/// Implements Lucene vector access over a JVector on-disk index +public class JVectorFloatVectorValues extends FloatVectorValues { + private static final VectorTypeSupport VECTOR_TYPE_SUPPORT = + VectorizationProvider.getInstance().getVectorTypeSupport(); + + private final OnDiskGraphIndex index; + private final OnDiskGraphIndex.View view; + private final PQVectors pq; + private final VectorSimilarityFunction similarityFunction; + private final GraphNodeIdToDocMap graphNodeIdToDocMap; + + public JVectorFloatVectorValues( + OnDiskGraphIndex index, + PQVectors pq, + VectorSimilarityFunction similarityFunction, + GraphNodeIdToDocMap graphNodeIdToDocMap) + throws IOException { + this.index = index; + this.view = index.getView(); + this.pq = pq; + this.similarityFunction = similarityFunction; + this.graphNodeIdToDocMap = graphNodeIdToDocMap; + } + + @Override + public int dimension() { + return view.dimension(); + } + + @Override + public int size() { + return view.size(); + } + + @Override + public int ordToDoc(int ord) { + return graphNodeIdToDocMap.getLuceneDocId(ord); + } + + // This allows us to access the vector without copying it to float[] + public VectorFloat vectorFloatValue(int ord) { + return view.getVector(ord); + } + + public void getVectorInto(int node, VectorFloat vector, int offset) { + view.getVectorInto(node, vector, offset); + } + + @Override + public DocIndexIterator iterator() { + assert view.liveNodes() instanceof MatchAllBits : "All OnDiskGraphIndex nodes must be live"; + return graphNodeIdToDocMap.iterator(); + } + + @Override + public float[] vectorValue(int i) throws IOException { + try { + final VectorFloat vector = vectorFloatValue(i); + return (float[]) vector.get(); + } catch (Throwable e) { + throw new RuntimeException(e); + } + } + + @Override + public FloatVectorValues copy() throws IOException { + return new JVectorFloatVectorValues(index, pq, similarityFunction, graphNodeIdToDocMap); + } + + @Override + public VectorScorer scorer(float[] query) throws IOException { + if (pq != null) { + final var vector = VECTOR_TYPE_SUPPORT.createFloatVector(query); + final var quantizedScoreFunction = pq.precomputedScoreFunctionFor(vector, similarityFunction); + return new JVectorScorer(quantizedScoreFunction, iterator()); + } else { + return rescorer(query); + } + } + + @Override + public VectorScorer rescorer(float[] target) throws IOException { + final var vector = VECTOR_TYPE_SUPPORT.createFloatVector(target); + final var scoreFunction = view.rerankerFor(vector, similarityFunction); + return new JVectorScorer(scoreFunction, iterator()); + } + + private static class JVectorScorer implements VectorScorer { + private final ScoreFunction scoreFunction; + private final DocIndexIterator iterator; + + JVectorScorer(ScoreFunction scoreFunction, 
DocIndexIterator iterator) { + this.scoreFunction = scoreFunction; + this.iterator = iterator; + } + + @Override + public float score() throws IOException { + return scoreFunction.similarityTo(iterator.index()); + } + + @Override + public DocIdSetIterator iterator() { + return iterator; + } + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFormat.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFormat.java new file mode 100644 index 000000000000..07a4f31f6bad --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFormat.java @@ -0,0 +1,225 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.jvector; + +import java.io.IOException; +import java.util.List; +import java.util.function.IntUnaryOperator; +import org.apache.lucene.codecs.KnnVectorsFormat; +import org.apache.lucene.codecs.KnnVectorsReader; +import org.apache.lucene.codecs.KnnVectorsWriter; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.SegmentWriteState; + +/// Implements K-NN search using JVector library for indexing +public class JVectorFormat extends KnnVectorsFormat { + public static final String NAME = "JVectorFormat"; + public static final String META_CODEC_NAME = "JVectorVectorsFormatMeta"; + public static final String VECTOR_INDEX_CODEC_NAME = "JVectorVectorsFormatIndex"; + public static final String JVECTOR_FILES_SUFFIX = "jvector"; + public static final String META_EXTENSION = "meta-" + JVECTOR_FILES_SUFFIX; + public static final String VECTOR_INDEX_EXTENSION = "data-" + JVECTOR_FILES_SUFFIX; + + public static final int VERSION_START = 0; + public static final int VERSION_CURRENT = VERSION_START; + public static final int DEFAULT_MAX_CONN = 32; + public static final int DEFAULT_BEAM_WIDTH = 100; + public static final int DEFAULT_MINIMUM_BATCH_SIZE_FOR_QUANTIZATION = 1024; + public static final float DEFAULT_NEIGHBOR_OVERFLOW = 2f; + public static final float DEFAULT_ALPHA = 2f; + public static final boolean DEFAULT_HIERARCHY_ENABLED = true; + + private final List maxDegrees; + private final int beamWidth; + // As a function of the original dimension + private final IntUnaryOperator numberOfSubspacesPerVectorSupplier; + private final int minBatchSizeForQuantization; + private final float alpha; + private final float neighborOverflow; + private final boolean hierarchyEnabled; + + public JVectorFormat() { + this( + NAME, + DEFAULT_MAX_CONN, + DEFAULT_BEAM_WIDTH, + DEFAULT_NEIGHBOR_OVERFLOW, + DEFAULT_ALPHA, + JVectorFormat::getDefaultNumberOfSubspacesPerVector, + DEFAULT_MINIMUM_BATCH_SIZE_FOR_QUANTIZATION, + DEFAULT_HIERARCHY_ENABLED); + } + + public JVectorFormat(int 
minBatchSizeForQuantization) { + this( + NAME, + DEFAULT_MAX_CONN, + DEFAULT_BEAM_WIDTH, + DEFAULT_NEIGHBOR_OVERFLOW, + DEFAULT_ALPHA, + JVectorFormat::getDefaultNumberOfSubspacesPerVector, + minBatchSizeForQuantization, + DEFAULT_HIERARCHY_ENABLED); + } + + public JVectorFormat( + int maxConn, + int beamWidth, + float neighborOverflow, + float alpha, + IntUnaryOperator numberOfSubspacesPerVectorSupplier, + int minBatchSizeForQuantization, + boolean hierarchyEnabled) { + this( + NAME, + maxConn, + beamWidth, + neighborOverflow, + alpha, + numberOfSubspacesPerVectorSupplier, + minBatchSizeForQuantization, + hierarchyEnabled); + } + + public JVectorFormat( + String name, + int maxConn, + int beamWidth, + float neighborOverflow, + float alpha, + IntUnaryOperator numberOfSubspacesPerVectorSupplier, + int minBatchSizeForQuantization, + boolean hierarchyEnabled) { + this( + name, + hierarchyEnabled ? List.of(maxConn * 2, maxConn) : List.of(maxConn), + beamWidth, + neighborOverflow, + alpha, + numberOfSubspacesPerVectorSupplier, + minBatchSizeForQuantization, + hierarchyEnabled); + } + + public JVectorFormat( + String name, + List maxDegrees, + int beamWidth, + float neighborOverflow, + float alpha, + IntUnaryOperator numberOfSubspacesPerVectorSupplier, + int minBatchSizeForQuantization, + boolean hierarchyEnabled) { + super(name); + this.maxDegrees = maxDegrees; + this.beamWidth = beamWidth; + this.numberOfSubspacesPerVectorSupplier = numberOfSubspacesPerVectorSupplier; + this.minBatchSizeForQuantization = minBatchSizeForQuantization; + this.alpha = alpha; + this.neighborOverflow = neighborOverflow; + this.hierarchyEnabled = hierarchyEnabled; + } + + @Override + public KnnVectorsWriter fieldsWriter(SegmentWriteState state) throws IOException { + return new JVectorWriter( + state, + maxDegrees, + beamWidth, + neighborOverflow, + alpha, + numberOfSubspacesPerVectorSupplier, + minBatchSizeForQuantization, + hierarchyEnabled); + } + + @Override + public KnnVectorsReader fieldsReader(SegmentReadState state) throws IOException { + return new JVectorReader(state); + } + + @Override + public int getMaxDimensions(String s) { + // Not a hard limit, but a reasonable default + return 8192; + } + + /** + * This method returns the default number of subspaces per vector for a given original dimension. + * Should be used as a default value for the number of subspaces per vector in case no value is + * provided. + * + * @param originalDimension original vector dimension + * @return default number of subspaces per vector + */ + public static int getDefaultNumberOfSubspacesPerVector(int originalDimension) { + // the idea here is that higher dimensions compress well, but not so well that we should use + // fewer bits + // than a lower-dimension vector, which is what you could get with cutoff points to switch + // between (e.g.) + // D*0.5 and D*0.25. Thus, the following ensures that bytes per vector is strictly increasing + // with D. 
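// Concretely, the branches below yield the following bytes per vector, with
// compression ratio computed as 4*D / bytes (illustrative values only):
//   D=25   -> 25 bytes  (4x)      D=64   -> 32 bytes  (8x)
//   D=128  -> 64 bytes  (8x)      D=300  -> 100 bytes (12x)
//   D=768  -> 192 bytes (16x)     D=1536 -> 192 bytes (32x)
//   D=3072 -> 384 bytes (32x)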
+ int compressedBytes; + if (originalDimension <= 32) { + // We are compressing from 4-byte floats to single-byte codebook indexes, + // so this represents compression of 4x + // * GloVe-25 needs 25 BPV to achieve good recall + compressedBytes = originalDimension; + } else if (originalDimension <= 64) { + // * GloVe-50 performs fine at 25 + compressedBytes = 32; + } else if (originalDimension <= 200) { + // * GloVe-100 and -200 perform well at 50 and 100 BPV, respectively + compressedBytes = (int) (originalDimension * 0.5); + } else if (originalDimension <= 400) { + // * NYTimes-256 actually performs fine at 64 BPV but we'll be conservative + // since we don't want BPV to decrease + compressedBytes = 100; + } else if (originalDimension <= 768) { + // allow BPV to increase linearly up to 192 + compressedBytes = (int) (originalDimension * 0.25); + } else if (originalDimension <= 1536) { + // * ada002 vectors have good recall even at 192 BPV = compression of 32x + compressedBytes = 192; + } else { + // We have not tested recall with larger vectors than this, let's let it increase linearly + compressedBytes = (int) (originalDimension * 0.125); + } + return compressedBytes; + } + + static io.github.jbellis.jvector.vector.VectorSimilarityFunction toJVectorSimilarity( + final org.apache.lucene.index.VectorSimilarityFunction luceneFunction) { + return switch (luceneFunction) { + case COSINE -> io.github.jbellis.jvector.vector.VectorSimilarityFunction.COSINE; + case DOT_PRODUCT -> io.github.jbellis.jvector.vector.VectorSimilarityFunction.DOT_PRODUCT; + case EUCLIDEAN -> io.github.jbellis.jvector.vector.VectorSimilarityFunction.EUCLIDEAN; + case MAXIMUM_INNER_PRODUCT -> + throw new UnsupportedOperationException("JVector does not support MAXIMUM_INNER_PRODUCT"); + }; + } + + static org.apache.lucene.index.VectorSimilarityFunction toLuceneSimilarity( + final io.github.jbellis.jvector.vector.VectorSimilarityFunction jVectorFunction) { + return switch (jVectorFunction) { + case COSINE -> org.apache.lucene.index.VectorSimilarityFunction.COSINE; + case DOT_PRODUCT -> org.apache.lucene.index.VectorSimilarityFunction.DOT_PRODUCT; + case EUCLIDEAN -> org.apache.lucene.index.VectorSimilarityFunction.EUCLIDEAN; + }; + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorIndexWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorIndexWriter.java new file mode 100644 index 000000000000..5cbfece4c0e1 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorIndexWriter.java @@ -0,0 +1,131 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package org.apache.lucene.sandbox.codecs.jvector;
+
+import io.github.jbellis.jvector.disk.IndexWriter;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import org.apache.lucene.store.IndexOutput;
+
+/**
+ * JVectorIndexWriter is a wrapper around IndexOutput that implements jVector's IndexWriter. Note:
+ * This is not thread safe!
+ */
+public class JVectorIndexWriter implements IndexWriter {
+  private final IndexOutput indexOutputDelegate;
+
+  /// Initial offset of the writer, which will be subtracted from [position()][#position()] to
+  /// trick JVector into using offsets that work for slices used by the readers. For example, if
+  /// the delegate is already at byte 4096 when this writer is created, [position()][#position()]
+  /// reports 0, so jVector records slice-relative offsets.
+  private final long offset;
+
+  public JVectorIndexWriter(IndexOutput indexOutputDelegate) {
+    this.indexOutputDelegate = indexOutputDelegate;
+    this.offset = indexOutputDelegate.getFilePointer();
+  }
+
+  @Override
+  public long position() throws IOException {
+    return indexOutputDelegate.getFilePointer() - offset;
+  }
+
+  @Override
+  public void close() throws IOException {
+    // Let the user close the delegate
+  }
+
+  @Override
+  public void write(int b) throws IOException {
+    indexOutputDelegate.writeByte((byte) b);
+  }
+
+  @Override
+  public void write(byte[] b) throws IOException {
+    indexOutputDelegate.writeBytes(b, 0, b.length);
+  }
+
+  @Override
+  public void write(byte[] b, int off, int len) throws IOException {
+    indexOutputDelegate.writeBytes(b, off, len);
+  }
+
+  @Override
+  public void writeBoolean(boolean v) throws IOException {
+    indexOutputDelegate.writeByte((byte) (v ? 1 : 0));
+  }
+
+  @Override
+  public void writeByte(int v) throws IOException {
+    indexOutputDelegate.writeByte((byte) v);
+  }
+
+  @Override
+  public void writeShort(int v) throws IOException {
+    indexOutputDelegate.writeShort((short) v);
+  }
+
+  @Override
+  public void writeChar(int v) throws IOException {
+    throw new UnsupportedOperationException("JVectorIndexWriter does not support writing chars");
+  }
+
+  @Override
+  public void writeInt(int v) throws IOException {
+    indexOutputDelegate.writeInt(v);
+  }
+
+  @Override
+  public void writeLong(long v) throws IOException {
+    indexOutputDelegate.writeLong(v);
+  }
+
+  @Override
+  public void writeFloat(float v) throws IOException {
+    indexOutputDelegate.writeInt(Float.floatToIntBits(v));
+  }
+
+  @Override
+  public void writeFloats(float[] floats, int offset, int count) throws IOException {
+    // IndexOutput writes little-endian, so the bulk path uses the same byte order as writeFloat
+    final ByteBuffer buf = ByteBuffer.allocate(count * Float.BYTES);
+    buf.order(ByteOrder.LITTLE_ENDIAN).asFloatBuffer().put(floats, offset, count);
+    write(buf.array());
+  }
+
+  @Override
+  public void writeDouble(double v) throws IOException {
+    writeLong(Double.doubleToLongBits(v));
+  }
+
+  @Override
+  public void writeBytes(String s) throws IOException {
+    throw new UnsupportedOperationException(
+        "JVectorIndexWriter does not support writing String as bytes");
+  }
+
+  @Override
+  public void writeChars(String s) throws IOException {
+    throw new UnsupportedOperationException("JVectorIndexWriter does not support writing chars");
+  }
+
+  @Override
+  public void writeUTF(String s) throws IOException {
+    throw new UnsupportedOperationException(
+        "JVectorIndexWriter does not support writing UTF strings");
+  }
+}
diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorRandomAccessReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorRandomAccessReader.java
new file mode 100644
index 000000000000..5374b822795e
--- /dev/null
+++ 
b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorRandomAccessReader.java @@ -0,0 +1,144 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.jvector; + +import io.github.jbellis.jvector.disk.RandomAccessReader; +import io.github.jbellis.jvector.disk.ReaderSupplier; +import java.io.EOFException; +import java.io.IOException; +import java.nio.ByteBuffer; +import org.apache.lucene.store.IndexInput; + +/// Implements JVector reader capabilities over a Lucene IndexInput +public class JVectorRandomAccessReader implements RandomAccessReader { + private final byte[] internalBuffer = new byte[Long.BYTES]; + private final IndexInput indexInputDelegate; + + public JVectorRandomAccessReader(IndexInput indexInputDelegate) { + this.indexInputDelegate = indexInputDelegate; + } + + @Override + public void seek(long offset) throws IOException { + indexInputDelegate.seek(offset); + } + + @Override + public long getPosition() throws IOException { + return indexInputDelegate.getFilePointer(); + } + + @Override + public int readInt() throws IOException { + return indexInputDelegate.readInt(); + } + + @Override + public float readFloat() throws IOException { + return Float.intBitsToFloat(indexInputDelegate.readInt()); + } + + @Override + public long readLong() throws IOException { + return indexInputDelegate.readLong(); + } + + @Override + public void readFully(byte[] bytes) throws IOException { + indexInputDelegate.readBytes(bytes, 0, bytes.length); + } + + @Override + public void readFully(ByteBuffer buffer) throws IOException { + // validate that the requested bytes actually exist ---- + long remainingInFile = indexInputDelegate.length() - indexInputDelegate.getFilePointer(); + if (buffer.remaining() > remainingInFile) { + throw new EOFException( + "Requested " + buffer.remaining() + " bytes but only " + remainingInFile + " available"); + } + + // Heap buffers with a backing array can be filled in one call ---- + if (buffer.hasArray()) { + int off = buffer.arrayOffset() + buffer.position(); + int len = buffer.remaining(); + indexInputDelegate.readBytes(buffer.array(), off, len); + buffer.position(buffer.limit()); // advance fully + return; + } + + // Direct / non-array buffers: copy in reasonable chunks ---- + while (buffer.hasRemaining()) { + final int bytesToRead = Math.min(buffer.remaining(), Long.BYTES); + indexInputDelegate.readBytes(this.internalBuffer, 0, bytesToRead); + buffer.put(this.internalBuffer, 0, bytesToRead); + } + } + + @Override + public void readFully(long[] vector) throws IOException { + indexInputDelegate.readLongs(vector, 0, vector.length); + } + + @Override + public void read(int[] ints, int offset, int count) throws IOException { + indexInputDelegate.readInts(ints, 
offset, count);
+  }
+
+  @Override
+  public void read(float[] floats, int offset, int count) throws IOException {
+    indexInputDelegate.readFloats(floats, offset, count);
+  }
+
+  @Override
+  public void close() throws IOException {
+    // No need to really close the index input delegate since it is a clone
+  }
+
+  @Override
+  public long length() throws IOException {
+    return indexInputDelegate.length();
+  }
+
+  /**
+   * Supplies readers which are actually slices of the original IndexInput. We vend out slices so
+   * that we can easily find the footer of the jVector graph index. This is useful because our
+   * logic that reads the graph assumes that the footer is always at {@link IndexInput#length()} of
+   * the slice, which is how {@link
+   * io.github.jbellis.jvector.graph.disk.OnDiskGraphIndex#load(ReaderSupplier, long)} works behind
+   * the scenes. The header offset, on the other hand, is flexible because we can provide it as a
+   * parameter to {@link
+   * io.github.jbellis.jvector.graph.disk.OnDiskGraphIndex#load(ReaderSupplier, long)}.
+   */
+  public static class Supplier implements ReaderSupplier {
+    private final IndexInput input;
+
+    public Supplier(IndexInput input) {
+      this.input = input;
+    }
+
+    @Override
+    public synchronized RandomAccessReader get() throws IOException {
+      return new JVectorRandomAccessReader(input.clone());
+    }
+
+    @Override
+    public void close() throws IOException {
+      // Cloned inputs do not need to be closed
+    }
+  }
+}
diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java
new file mode 100644
index 000000000000..670c45c54447
--- /dev/null
+++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java
@@ -0,0 +1,309 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
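A condensed view of how these suppliers are consumed; the offsets here are placeholders, since the real values come from the per-field metadata that JVectorReader, below, reads from the meta file:

// data is the segment-level IndexInput; graphOffset/graphLength are hypothetical.
IndexInput graphSlice = data.slice("graph", graphOffset, graphLength);
var supplier = new JVectorRandomAccessReader.Supplier(graphSlice);
OnDiskGraphIndex graph = OnDiskGraphIndex.load(supplier);
// The jVector footer is resolved against the end of the stream, i.e. at
// graphSlice.length(), which is why each graph gets its own dedicated slice.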
+ */ + +package org.apache.lucene.sandbox.codecs.jvector; + +import io.github.jbellis.jvector.graph.GraphSearcher; +import io.github.jbellis.jvector.graph.SearchResult; +import io.github.jbellis.jvector.graph.disk.OnDiskGraphIndex; +import io.github.jbellis.jvector.graph.similarity.DefaultSearchScoreProvider; +import io.github.jbellis.jvector.graph.similarity.SearchScoreProvider; +import io.github.jbellis.jvector.quantization.PQVectors; +import io.github.jbellis.jvector.quantization.ProductQuantization; +import io.github.jbellis.jvector.util.Bits; +import io.github.jbellis.jvector.vector.VectorSimilarityFunction; +import io.github.jbellis.jvector.vector.VectorizationProvider; +import io.github.jbellis.jvector.vector.types.VectorFloat; +import io.github.jbellis.jvector.vector.types.VectorTypeSupport; +import java.io.Closeable; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.KnnVectorsReader; +import org.apache.lucene.index.*; +import org.apache.lucene.search.AcceptDocs; +import org.apache.lucene.search.KnnCollector; +import org.apache.lucene.search.VectorScorer; +import org.apache.lucene.search.knn.KnnSearchStrategy; +import org.apache.lucene.store.*; +import org.apache.lucene.util.IOUtils; + +/// Implements KnnVectorsReader over an on-disk JVector index serialized using {@link JVectorWriter} +public class JVectorReader extends KnnVectorsReader { + private static final VectorTypeSupport VECTOR_TYPE_SUPPORT = + VectorizationProvider.getInstance().getVectorTypeSupport(); + + private final IndexInput data; + // Maps field name to field entries + private final Map fieldEntryMap; + + public JVectorReader(SegmentReadState state) throws IOException { + final List fieldMetaList = new ArrayList<>(); + final String metaFileName = + IndexFileNames.segmentFileName( + state.segmentInfo.name, state.segmentSuffix, JVectorFormat.META_EXTENSION); + try (ChecksumIndexInput meta = state.directory.openChecksumInput(metaFileName)) { + Throwable priorE = null; + try { + CodecUtil.checkIndexHeader( + meta, + JVectorFormat.META_CODEC_NAME, + JVectorFormat.VERSION_START, + JVectorFormat.VERSION_CURRENT, + state.segmentInfo.getId(), + state.segmentSuffix); + + JVectorWriter.VectorIndexFieldMetadata fieldMeta; + while ((fieldMeta = parseNextField(meta, state.fieldInfos)) != null) { + fieldMetaList.add(fieldMeta); + } + } catch (Throwable t) { + priorE = t; + } finally { + CodecUtil.checkFooter(meta, priorE); + } + + final String dataFileName = + IndexFileNames.segmentFileName( + state.segmentInfo.name, state.segmentSuffix, JVectorFormat.VECTOR_INDEX_EXTENSION); + this.data = + state.directory.openInput( + dataFileName, state.context.withHints(FileTypeHint.DATA, DataAccessHint.RANDOM)); + + + CodecUtil.checkHeader( + data, + JVectorFormat.VECTOR_INDEX_CODEC_NAME, + JVectorFormat.VERSION_START, + JVectorFormat.VERSION_CURRENT); + CodecUtil.retrieveChecksum(data); + + this.fieldEntryMap = new HashMap<>(fieldMetaList.size()); + for (var fieldMeta : fieldMetaList) { + final FieldInfo fieldInfo = state.fieldInfos.fieldInfo(fieldMeta.fieldNumber); + if (fieldEntryMap.containsKey(fieldInfo.name)) { + throw new CorruptIndexException("Duplicate field: " + fieldInfo.name, meta); + } + fieldEntryMap.put(fieldInfo.name, new FieldEntry(data, fieldMeta)); + } + } + } + + @Override + public void checkIntegrity() throws IOException { + 
CodecUtil.checksumEntireFile(data); + } + + @Override + public FloatVectorValues getFloatVectorValues(String field) throws IOException { + final FieldEntry fieldEntry = fieldEntryMap.get(field); + if (fieldEntry == null) { + return new FloatVectorValues() { + @Override + public float[] vectorValue(int ord) throws IOException { + throw new IndexOutOfBoundsException(); + } + + @Override + public FloatVectorValues copy() throws IOException { + return this; + } + + @Override + public int dimension() { + return 0; + } + + @Override + public int size() { + return 0; + } + + @Override + public VectorScorer scorer(float[] target) throws IOException { + return null; + } + }; + } + + return new JVectorFloatVectorValues( + fieldEntry.index, + fieldEntry.pqVectors, + fieldEntry.similarityFunction, + fieldEntry.graphNodeIdToDocMap); + } + + @Override + public ByteVectorValues getByteVectorValues(String field) throws IOException { + /** Byte vector values are not supported in jVector library. Instead use PQ. */ + return null; + } + + public Optional getProductQuantizationForField(String field) + throws IOException { + final FieldEntry fieldEntry = fieldEntryMap.get(field); + if (fieldEntry.pqVectors == null) { + return Optional.empty(); + } + + return Optional.of(fieldEntry.pqVectors.getCompressor()); + } + + @Override + public void search(String field, float[] target, KnnCollector knnCollector, AcceptDocs acceptDocs) + throws IOException { + final var fieldEntry = fieldEntryMap.get(field); + final OnDiskGraphIndex index = fieldEntry.index; + if (index == null) { + // Skip search when the graph is empty + return; + } + + final JVectorSearchStrategy searchStrategy; + if (knnCollector.getSearchStrategy() instanceof JVectorSearchStrategy strategy) { + searchStrategy = strategy; + } else if (knnCollector.getSearchStrategy() instanceof KnnSearchStrategy.Seeded seeded + && seeded.originalStrategy() instanceof JVectorSearchStrategy strategy) { + searchStrategy = strategy; + } else searchStrategy = JVectorSearchStrategy.DEFAULT; + + // search for a random vector using a GraphSearcher and SearchScoreProvider + VectorFloat q = VECTOR_TYPE_SUPPORT.createFloatVector(target); + final SearchScoreProvider ssp; + + try (var view = index.getView()) { + if (fieldEntry.pqVectors != null) { // Quantized, use the precomputed score function + final PQVectors pqVectors = fieldEntry.pqVectors; + // SearchScoreProvider that does a first pass with the loaded-in-memory PQVectors, + // then reranks with the exact vectors that are stored on disk in the index + final var asf = pqVectors.precomputedScoreFunctionFor(q, fieldEntry.similarityFunction); + final var reranker = view.rerankerFor(q, fieldEntry.similarityFunction); + ssp = new DefaultSearchScoreProvider(asf, reranker); + } else { // Not quantized, used typical searcher + ssp = DefaultSearchScoreProvider.exact(q, fieldEntry.similarityFunction, view); + } + final GraphNodeIdToDocMap jvectorLuceneDocMap = fieldEntry.graphNodeIdToDocMap; + // Convert the acceptDocs bitmap from Lucene to jVector ordinal bitmap filter + // Logic works as follows: if acceptDocs is null, we accept all ordinals. Otherwise, we check + // if the jVector ordinal has a + // corresponding Lucene doc ID accepted by acceptDocs filter. 
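// For instance, if jVector node 5 stores the vector for Lucene doc 42, the
// wrapped Bits below answers luceneBits.get(42) whenever the searcher tests node 5.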
+ + Bits compatibleBits = Bits.ALL; + if (acceptDocs != null) { + final var luceneBits = acceptDocs.bits(); + if (luceneBits != null) { + compatibleBits = ord -> luceneBits.get(jvectorLuceneDocMap.getLuceneDocId(ord)); + } + } + + try (var graphSearcher = new GraphSearcher(index)) { + final var searchResults = + graphSearcher.search( + ssp, + knnCollector.k(), + knnCollector.k() * searchStrategy.overQueryFactor, + searchStrategy.threshold, + searchStrategy.rerankFloor, + compatibleBits); + for (SearchResult.NodeScore ns : searchResults.getNodes()) { + knnCollector.collect(jvectorLuceneDocMap.getLuceneDocId(ns.node), ns.score); + } + // JVector does not seem to count the entry-point as visited + knnCollector.incVisitedCount(1 + searchResults.getVisitedCount()); + } + } + } + + @Override + public void search(String field, byte[] target, KnnCollector knnCollector, AcceptDocs acceptDocs) + throws IOException { + // TODO: implement this + throw new UnsupportedOperationException("Byte vector search is not supported yet with jVector"); + } + + @Override + public void close() throws IOException { + for (FieldEntry fieldEntry : fieldEntryMap.values()) { + IOUtils.close(fieldEntry); + } + fieldEntryMap.clear(); + IOUtils.close(data); + } + + private static JVectorWriter.VectorIndexFieldMetadata parseNextField( + IndexInput meta, FieldInfos fieldInfos) throws IOException { + final int fieldNumber = meta.readInt(); + if (fieldNumber == -1) { + return null; + } + + final FieldInfo fieldInfo = fieldInfos.fieldInfo(fieldNumber); + if (fieldInfo == null) { + throw new CorruptIndexException("Invalid field number: " + fieldNumber, meta); + } + + return new JVectorWriter.VectorIndexFieldMetadata(meta); + } + + class FieldEntry implements Closeable { + private final VectorSimilarityFunction similarityFunction; + private final GraphNodeIdToDocMap graphNodeIdToDocMap; + private final OnDiskGraphIndex index; + private final PQVectors pqVectors; // The product quantized vectors with their codebooks + + public FieldEntry( + IndexInput data, JVectorWriter.VectorIndexFieldMetadata vectorIndexFieldMetadata) + throws IOException { + this.similarityFunction = vectorIndexFieldMetadata.vectorSimilarityFunction; + this.graphNodeIdToDocMap = vectorIndexFieldMetadata.graphNodeIdToDocMap; + + final long graphOffset = vectorIndexFieldMetadata.vectorIndexOffset; + final long graphLength = vectorIndexFieldMetadata.vectorIndexLength; + assert graphLength > 0 : "Read empty JVector graph"; + // Load the graph index from cloned slices of data (no need to close) + final var indexReaderSupplier = + new JVectorRandomAccessReader.Supplier(data.slice("graph", graphOffset, graphLength)); + this.index = OnDiskGraphIndex.load(indexReaderSupplier); + + // If quantized load the compressed product quantized vectors with their codebooks + final long pqOffset = vectorIndexFieldMetadata.pqCodebooksAndVectorsOffset; + final long pqLength = vectorIndexFieldMetadata.pqCodebooksAndVectorsLength; + if (pqLength > 0) { + assert pqOffset > 0; + if (pqOffset < graphOffset) { + throw new IllegalArgumentException( + "pqOffset must be greater than vectorIndexOffset"); + } + final var pqSlice = data.slice("pq", pqOffset, pqLength); + try (final var randomAccessReader = new JVectorRandomAccessReader(pqSlice)) { + this.pqVectors = PQVectors.load(randomAccessReader); + } + } else { + this.pqVectors = null; + } + } + + @Override + public void close() throws IOException { + index.close(); + } + } +} diff --git 
a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorSearchStrategy.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorSearchStrategy.java new file mode 100644 index 000000000000..1f713a8b214b --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorSearchStrategy.java @@ -0,0 +1,121 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.jvector; + +import java.util.Locale; +import java.util.Objects; +import org.apache.lucene.search.knn.KnnSearchStrategy; + +/// Defines query-time parameters for searching a JVector index to be passed into +/// [`search()`][JVectorReader#search] via [`KnnCollector`][org.apache.lucene.search.KnnCollector]. +public class JVectorSearchStrategy extends KnnSearchStrategy { + static final float DEFAULT_QUERY_SIMILARITY_THRESHOLD = 0f; + static final float DEFAULT_QUERY_RERANK_FLOOR = 0f; + static final int DEFAULT_OVER_QUERY_FACTOR = 3; + static final boolean DEFAULT_QUERY_USE_PRUNING = false; + + public static final JVectorSearchStrategy DEFAULT = + new JVectorSearchStrategy( + DEFAULT_QUERY_SIMILARITY_THRESHOLD, + DEFAULT_QUERY_RERANK_FLOOR, + DEFAULT_OVER_QUERY_FACTOR, + DEFAULT_QUERY_USE_PRUNING); + + final float threshold; + final float rerankFloor; + final int overQueryFactor; + final boolean usePruning; + + private JVectorSearchStrategy( + float threshold, float rerankFloor, int overQueryFactor, boolean usePruning) { + this.threshold = threshold; + this.rerankFloor = rerankFloor; + this.overQueryFactor = overQueryFactor; + this.usePruning = usePruning; + } + + @Override + public String toString() { + return String.format( + Locale.ROOT, + "%s[threshold=%f, rerankFloor=%f, overQueryFactor=%d, usePruning=%s]", + getClass().getSimpleName(), + threshold, + rerankFloor, + overQueryFactor, + usePruning); + } + + @Override + public boolean equals(Object obj) { + if (obj == this) { + return true; + } else if (obj instanceof JVectorSearchStrategy other) { + return this.threshold == other.threshold + && this.rerankFloor == other.rerankFloor + && this.overQueryFactor == other.overQueryFactor + && this.usePruning == other.usePruning; + } else return false; + } + + @Override + public int hashCode() { + return Objects.hash(getClass(), threshold, rerankFloor, overQueryFactor, usePruning); + } + + @Override + public void nextVectorsBlock() {} + + public static Builder builder() { + return new Builder(); + } + + /// Builder for defining a [JVectorSearchStrategy]. 
+ public static class Builder { + private float threshold = DEFAULT_QUERY_SIMILARITY_THRESHOLD; + private float rerankFloor = DEFAULT_QUERY_RERANK_FLOOR; + private int overQueryFactor = DEFAULT_OVER_QUERY_FACTOR; + private boolean usePruning = DEFAULT_QUERY_USE_PRUNING; + + private Builder() {} + + public Builder withThreshold(float threshold) { + this.threshold = threshold; + return this; + } + + public Builder withRerankFloor(float rerankFloor) { + this.rerankFloor = rerankFloor; + return this; + } + + public Builder withOverQueryFactor(int overQueryFactor) { + this.overQueryFactor = overQueryFactor; + return this; + } + + public Builder withUsePruning(boolean usePruning) { + this.usePruning = usePruning; + return this; + } + + public JVectorSearchStrategy build() { + return new JVectorSearchStrategy(threshold, rerankFloor, overQueryFactor, usePruning); + } + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java new file mode 100644 index 000000000000..fb9b43f347a1 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java @@ -0,0 +1,893 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
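A sketch of building a strategy with the builder above. It takes effect only if the KnnCollector used by the query reports it from getSearchStrategy(); otherwise JVectorReader falls back to DEFAULT:

import org.apache.lucene.sandbox.codecs.jvector.JVectorSearchStrategy;

// Over-query 5x the requested k, keep hits scoring at least 0.2f, enable pruning.
JVectorSearchStrategy strategy =
    JVectorSearchStrategy.builder()
        .withOverQueryFactor(5)
        .withThreshold(0.2f)
        .withUsePruning(true)
        .build();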
+ */ + +package org.apache.lucene.sandbox.codecs.jvector; + +import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsReader.readVectorEncoding; + +import io.github.jbellis.jvector.graph.GraphIndexBuilder; +import io.github.jbellis.jvector.graph.ImmutableGraphIndex; +import io.github.jbellis.jvector.graph.ListRandomAccessVectorValues; +import io.github.jbellis.jvector.graph.OnHeapGraphIndex; +import io.github.jbellis.jvector.graph.RandomAccessVectorValues; +import io.github.jbellis.jvector.graph.disk.OnDiskSequentialGraphIndexWriter; +import io.github.jbellis.jvector.graph.disk.OrdinalMapper; +import io.github.jbellis.jvector.graph.disk.feature.Feature; +import io.github.jbellis.jvector.graph.disk.feature.FeatureId; +import io.github.jbellis.jvector.graph.disk.feature.InlineVectors; +import io.github.jbellis.jvector.graph.similarity.BuildScoreProvider; +import io.github.jbellis.jvector.graph.similarity.SearchScoreProvider; +import io.github.jbellis.jvector.quantization.MutablePQVectors; +import io.github.jbellis.jvector.quantization.PQVectors; +import io.github.jbellis.jvector.quantization.ProductQuantization; +import io.github.jbellis.jvector.vector.VectorSimilarityFunction; +import io.github.jbellis.jvector.vector.VectorizationProvider; +import io.github.jbellis.jvector.vector.types.VectorFloat; +import io.github.jbellis.jvector.vector.types.VectorTypeSupport; +import java.io.IOException; +import java.io.UncheckedIOException; +import java.io.UnsupportedEncodingException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Objects; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.Executor; +import java.util.function.IntUnaryOperator; +import java.util.stream.IntStream; +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.KnnFieldVectorsWriter; +import org.apache.lucene.codecs.KnnVectorsWriter; +import org.apache.lucene.index.DocIDMerger; +import org.apache.lucene.index.DocsWithFieldSet; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FloatVectorValues; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.KnnVectorValues; +import org.apache.lucene.index.MergeState; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.index.Sorter; +import org.apache.lucene.index.VectorEncoding; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.RamUsageEstimator; + +/** + * JVectorWriter is responsible for writing vector data into index segments using the JVector + * library. + * + *
<b>Persisting the JVector Graph Index</b>
+ *
+ * <p>Flushing data into disk segments occurs in two scenarios:
+ *
+ * <ol>
+ *   <li>When the segment is being flushed to disk (e.g., when a new segment is created) via {@link
+ *       #flush(int, Sorter.DocMap)}
+ *   <li>When the segment is a result of a merge (e.g., when multiple segments are merged into one)
+ *       via {@link #mergeOneField(FieldInfo, MergeState)}
+ * </ol>
+ *
+ * <p><b>jVector Graph Ordinal to Lucene Document ID Mapping</b>
+ *
+ * <p>JVector keeps its own ordinals to identify its nodes. Those ordinals can be different from
+ * the Lucene document IDs. Document IDs in Lucene can change after a merge operation. Therefore,
+ * we need to maintain a mapping between JVector ordinals and Lucene document IDs that can hold
+ * across merges.
+ *
+ * <p>
Document IDs in Lucene are mapped across merges and sorts using the {@link + * org.apache.lucene.index.MergeState.DocMap} for merges and {@link + * org.apache.lucene.index.Sorter.DocMap} for flush/sorts. For jVector however, we don't want to + * modify the ordinals in the jVector graph, and therefore we need to maintain a mapping between the + * jVector ordinals and the new Lucene document IDs. This is achieved by keeping checkpoints of the + * {@link GraphNodeIdToDocMap} class in the index metadata and allowing us to update the mapping as + * needed across merges by constructing a new mapping from the previous mapping and the {@link + * org.apache.lucene.index.MergeState.DocMap} provided in the {@link MergeState}. + */ +public class JVectorWriter extends KnnVectorsWriter { + private static final VectorTypeSupport VECTOR_TYPE_SUPPORT = + VectorizationProvider.getInstance().getVectorTypeSupport(); + private static final long SHALLOW_RAM_BYTES_USED = + RamUsageEstimator.shallowSizeOfInstance(JVectorWriter.class); + + private final List fields = new ArrayList<>(); + + private final IndexOutput meta; + private final IndexOutput data; + private final List maxDegrees; + private final int beamWidth; + private final float degreeOverflow; + private final float alpha; + /// Number of subspaces used per vector in PQ quantization as a function of the original dimension + private final IntUnaryOperator numberOfSubspacesPerVectorSupplier; + private final int + minimumBatchSizeForQuantization; // Threshold for the vector count above which we will trigger + // PQ quantization + private final boolean hierarchyEnabled; + + private boolean finished = false; + + public JVectorWriter( + SegmentWriteState segmentWriteState, + List maxDegrees, + int beamWidth, + float degreeOverflow, + float alpha, + IntUnaryOperator numberOfSubspacesPerVectorSupplier, + int minimumBatchSizeForQuantization, + boolean hierarchyEnabled) + throws IOException { + this.maxDegrees = maxDegrees; + this.beamWidth = beamWidth; + this.degreeOverflow = degreeOverflow; + this.alpha = alpha; + this.numberOfSubspacesPerVectorSupplier = numberOfSubspacesPerVectorSupplier; + this.minimumBatchSizeForQuantization = minimumBatchSizeForQuantization; + this.hierarchyEnabled = hierarchyEnabled; + + try { + final String metaFileName = + IndexFileNames.segmentFileName( + segmentWriteState.segmentInfo.name, + segmentWriteState.segmentSuffix, + JVectorFormat.META_EXTENSION); + meta = segmentWriteState.directory.createOutput(metaFileName, segmentWriteState.context); + CodecUtil.writeIndexHeader( + meta, + JVectorFormat.META_CODEC_NAME, + JVectorFormat.VERSION_CURRENT, + segmentWriteState.segmentInfo.getId(), + segmentWriteState.segmentSuffix); + + final String dataFileName = + IndexFileNames.segmentFileName( + segmentWriteState.segmentInfo.name, + segmentWriteState.segmentSuffix, + JVectorFormat.VECTOR_INDEX_EXTENSION); + data = segmentWriteState.directory.createOutput(dataFileName, segmentWriteState.context); + CodecUtil.writeIndexHeader( + data, + JVectorFormat.VECTOR_INDEX_CODEC_NAME, + JVectorFormat.VERSION_CURRENT, + segmentWriteState.segmentInfo.getId(), + segmentWriteState.segmentSuffix); + } catch (Throwable t) { + IOUtils.closeWhileSuppressingExceptions(t, this); + throw t; + } + } + + @Override + public KnnFieldVectorsWriter addField(FieldInfo fieldInfo) throws IOException { + if (fieldInfo.getVectorEncoding() == VectorEncoding.BYTE) { + final String errorMessage = + "byte[] vectors are not supported in JVector. 
" + + "Instead you should only use float vectors and leverage product quantization during indexing." + + "This can provides much greater savings in storage and memory"; + throw new UnsupportedOperationException(errorMessage); + } + final int M = numberOfSubspacesPerVectorSupplier.applyAsInt(fieldInfo.getVectorDimension()); + final FieldWriter newField = + new FieldWriter( + fieldInfo, + maxDegrees, + beamWidth, + degreeOverflow, + alpha, + hierarchyEnabled, + minimumBatchSizeForQuantization, + M); + + fields.add(newField); + return newField; + } + + @Override + public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOException { + try { + switch (fieldInfo.getVectorEncoding()) { + case BYTE: + throw new UnsupportedEncodingException("Byte vectors are not supported in JVector."); + case FLOAT32: + mergeAndWriteField(fieldInfo, mergeState); + break; + } + } catch (Exception e) { + throw e; + } + } + + @Override + public void flush(int maxDoc, Sorter.DocMap sortMap) throws IOException { + for (FieldWriter field : fields) { + final DocsWithFieldSet newDocIds; + final OrdinalMapper ordinalMapper; + if (sortMap != null) { + assert field.docIds.cardinality() <= sortMap.size(); + final int size = field.docIds.cardinality(); + final int[] oldToNew = new int[size]; + final int[] newToOld = new int[size]; + newDocIds = new DocsWithFieldSet(); + KnnVectorsWriter.mapOldOrdToNewOrd(field.docIds, sortMap, oldToNew, newToOld, newDocIds); + ordinalMapper = new ArrayOrdinalMapper(size - 1, oldToNew, newToOld); + } else { + newDocIds = field.docIds; + ordinalMapper = null; + } + final RandomAccessVectorValues randomAccessVectorValues = field.toRandomAccessVectorValues(); + final PQVectors pqVectors = field.getCompressedVectors(); + final ImmutableGraphIndex graph = field.getGraphIndex(); + final GraphNodeIdToDocMap graphNodeIdToDocMap = new GraphNodeIdToDocMap(newDocIds); + writeField( + field.fieldInfo, + randomAccessVectorValues, + pqVectors, + ordinalMapper, + graphNodeIdToDocMap, + graph); + } + } + + private record ArrayOrdinalMapper(int maxOrdinal, int[] oldToNew, int[] newToOld) + implements OrdinalMapper { + @Override + public int maxOrdinal() { + return maxOrdinal; + } + + @Override + public int oldToNew(int oldOrdinal) { + return oldToNew[oldOrdinal]; + } + + @Override + public int newToOld(int newOrdinal) { + return newToOld[newOrdinal]; + } + } + + private void writeField( + FieldInfo fieldInfo, + RandomAccessVectorValues randomAccessVectorValues, + PQVectors pqVectors, + OrdinalMapper ordinalMapper, + GraphNodeIdToDocMap graphNodeIdToDocMap, + ImmutableGraphIndex graph) + throws IOException { + final var vectorIndexFieldMetadata = + writeGraph( + graph, + randomAccessVectorValues, + fieldInfo, + pqVectors, + ordinalMapper, + graphNodeIdToDocMap); + meta.writeInt(fieldInfo.number); + vectorIndexFieldMetadata.toOutput(meta); + } + + /** + * Writes the graph and PQ codebooks and compressed vectors to the vector index file + * + * @param graph graph + * @param randomAccessVectorValues random access vector values + * @param fieldInfo field info + * @return Tuple of start offset and length of the graph + * @throws IOException IOException + */ + private VectorIndexFieldMetadata writeGraph( + ImmutableGraphIndex graph, + RandomAccessVectorValues randomAccessVectorValues, + FieldInfo fieldInfo, + PQVectors pqVectors, + OrdinalMapper ordinalMapper, + GraphNodeIdToDocMap graphNodeIdToDocMap) + throws IOException { + try (final var jVectorIndexWriter = new JVectorIndexWriter(data)) { + 
final long startOffset = data.getFilePointer(); + final var writerBuilder = + new OnDiskSequentialGraphIndexWriter.Builder(graph, jVectorIndexWriter) + .with(new InlineVectors(randomAccessVectorValues.dimension())); + if (ordinalMapper != null) { + writerBuilder.withMapper(ordinalMapper); + } + try (var writer = writerBuilder.build()) { + var suppliers = + Feature.singleStateFactory( + FeatureId.INLINE_VECTORS, + nodeId -> new InlineVectors.State(randomAccessVectorValues.getVector(nodeId))); + writer.write(suppliers); + final long endGraphOffset = data.getFilePointer(); + + // If PQ is enabled and we have enough vectors, write the PQ codebooks and compressed + // vectors + final long pqOffset; + final long pqLength; + if (pqVectors != null) { + pqOffset = endGraphOffset; + // write the compressed vectors and codebooks to disk + pqVectors.write(jVectorIndexWriter); + pqLength = data.getFilePointer() - endGraphOffset; + } else { + pqOffset = 0; + pqLength = 0; + } + + return new VectorIndexFieldMetadata( + fieldInfo.number, + fieldInfo.getVectorEncoding(), + JVectorFormat.toJVectorSimilarity(fieldInfo.getVectorSimilarityFunction()), + randomAccessVectorValues.dimension(), + startOffset, + endGraphOffset - startOffset, + pqOffset, + pqLength, + degreeOverflow, + graphNodeIdToDocMap); + } + } + } + + /// Metadata about the index to be persisted on disk + public static class VectorIndexFieldMetadata { + final int fieldNumber; + final VectorEncoding vectorEncoding; + final VectorSimilarityFunction vectorSimilarityFunction; + final int vectorDimension; + final long vectorIndexOffset; + final long vectorIndexLength; + final long pqCodebooksAndVectorsOffset; + final long pqCodebooksAndVectorsLength; + final float degreeOverflow; // important when leveraging cache + final GraphNodeIdToDocMap graphNodeIdToDocMap; + + public VectorIndexFieldMetadata( + int fieldNumber, + VectorEncoding vectorEncoding, + VectorSimilarityFunction vectorSimilarityFunction, + int vectorDimension, + long vectorIndexOffset, + long vectorIndexLength, + long pqCodebooksAndVectorsOffset, + long pqCodebooksAndVectorsLength, + float degreeOverflow, + GraphNodeIdToDocMap graphNodeIdToDocMap) { + this.fieldNumber = fieldNumber; + this.vectorEncoding = vectorEncoding; + this.vectorSimilarityFunction = vectorSimilarityFunction; + this.vectorDimension = vectorDimension; + this.vectorIndexOffset = vectorIndexOffset; + this.vectorIndexLength = vectorIndexLength; + this.pqCodebooksAndVectorsOffset = pqCodebooksAndVectorsOffset; + this.pqCodebooksAndVectorsLength = pqCodebooksAndVectorsLength; + this.degreeOverflow = degreeOverflow; + this.graphNodeIdToDocMap = graphNodeIdToDocMap; + } + + public void toOutput(IndexOutput out) throws IOException { + out.writeInt(fieldNumber); + out.writeInt(vectorEncoding.ordinal()); + out.writeInt(vectorSimilarityFunction.ordinal()); + out.writeVInt(vectorDimension); + out.writeVLong(vectorIndexOffset); + out.writeVLong(vectorIndexLength); + out.writeVLong(pqCodebooksAndVectorsOffset); + out.writeVLong(pqCodebooksAndVectorsLength); + out.writeInt(Float.floatToIntBits(degreeOverflow)); + graphNodeIdToDocMap.toOutput(out); + } + + public VectorIndexFieldMetadata(IndexInput in) throws IOException { + this.fieldNumber = in.readInt(); + this.vectorEncoding = readVectorEncoding(in); + this.vectorSimilarityFunction = VectorSimilarityFunction.values()[in.readInt()]; + this.vectorDimension = in.readVInt(); + this.vectorIndexOffset = in.readVLong(); + this.vectorIndexLength = in.readVLong(); + 
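+      // Note: a zero PQ offset/length pair below means the field was written without
+      // quantization (see writeGraph).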
+      this.pqCodebooksAndVectorsOffset = in.readVLong();
+      this.pqCodebooksAndVectorsLength = in.readVLong();
+      this.degreeOverflow = Float.intBitsToFloat(in.readInt());
+      this.graphNodeIdToDocMap = new GraphNodeIdToDocMap(in);
+    }
+  }
+
+  @Override
+  public void finish() throws IOException {
+    if (finished) {
+      throw new IllegalStateException("already finished");
+    }
+    finished = true;
+
+    // write end of fields marker
+    meta.writeInt(-1);
+    CodecUtil.writeFooter(meta);
+    CodecUtil.writeFooter(data);
+  }
+
+  @Override
+  public void close() throws IOException {
+    IOUtils.close(meta, data);
+  }
+
+  @Override
+  public long ramBytesUsed() {
+    long total = SHALLOW_RAM_BYTES_USED;
+    for (FieldWriter field : fields) {
+      // the field tracks the delegate field usage
+      total += field.ramBytesUsed();
+    }
+    return total;
+  }
+
+  /**
+   * The FieldWriter class is responsible for writing vector field data into index segments. It
+   * processes vector values as they are added, tracks memory usage, and builds the graph index
+   * structure used for efficient retrieval at search time.
+   */
+  static class FieldWriter extends KnnFieldVectorsWriter<float[]> {
+    private final long SHALLOW_SIZE = RamUsageEstimator.shallowSizeOfInstance(FieldWriter.class);
+    private final FieldInfo fieldInfo;
+    // The ordering of docIds matches the ordering of vectors; the index in this list corresponds
+    // to the jVector ordinal
+    private final List<VectorFloat<?>> vectors;
+    private final DocsWithFieldSet docIds;
+
+    private GraphIndexBuilder indexBuilder;
+    private final DelegatingBuildScoreProvider buildScoreProvider;
+
+    // PQ fields
+    private final int pqThreshold;
+    private final int pqSubspaceCount;
+    private MutablePQVectors pqVectors;
+
+    FieldWriter(
+        FieldInfo fieldInfo,
+        List<Integer> maxDegrees,
+        int beamWidth,
+        float degreeOverflow,
+        float alpha,
+        boolean hierarchyEnabled,
+        int pqThreshold,
+        int pqSubspaceCount) {
+      /* Creates the in-memory state for a newly added field.
*/ + this.fieldInfo = fieldInfo; + this.vectors = new ArrayList<>(); + this.docIds = new DocsWithFieldSet(); + + final var similarityFunction = + JVectorFormat.toJVectorSimilarity(fieldInfo.getVectorSimilarityFunction()); + this.buildScoreProvider = + new DelegatingBuildScoreProvider( + BuildScoreProvider.randomAccessScoreProvider( + toRandomAccessVectorValues(), + similarityFunction)); + this.indexBuilder = + new GraphIndexBuilder( + buildScoreProvider, + fieldInfo.getVectorDimension(), + maxDegrees, + beamWidth, + degreeOverflow, + alpha, + hierarchyEnabled, + true); + + this.pqThreshold = pqThreshold; + this.pqSubspaceCount = pqSubspaceCount; + this.pqVectors = null; + } + + @Override + public void addValue(int docID, float[] vectorValue) throws IOException { + if (docID < docIds.cardinality()) { + throw new IllegalArgumentException( + "VectorValuesField \"" + + fieldInfo.name + + "\" appears more than once in this document (only one value is allowed per field)"); + } + final int ord = vectors.size(); + docIds.add(docID); + final var vector = VECTOR_TYPE_SUPPORT.createFloatVector(copyValue(vectorValue)); + vectors.add(vector); + + if (pqVectors != null) { + pqVectors.encodeAndSet(ord, vector); + } else if (vectors.size() > pqThreshold) { + final ProductQuantization pq = + trainPQ( + toRandomAccessVectorValues(), + pqSubspaceCount, + fieldInfo.getVectorSimilarityFunction()); + pqVectors = new MutablePQVectors(pq); + for (int i = 0; i < vectors.size(); ++i) { + pqVectors.encodeAndSet(i, vectors.get(i)); + } + + final var similarityFunction = + JVectorFormat.toJVectorSimilarity(fieldInfo.getVectorSimilarityFunction()); + buildScoreProvider.setDelegate( + BuildScoreProvider.pqBuildScoreProvider(similarityFunction, pqVectors)); + indexBuilder = GraphIndexBuilder.rescore(indexBuilder, buildScoreProvider); + } + + indexBuilder.addGraphNode(ord, vector); + } + + @Override + public float[] copyValue(float[] vectorValue) { + return vectorValue.clone(); + } + + public RandomAccessVectorValues toRandomAccessVectorValues() { + return new ListRandomAccessVectorValues(vectors, fieldInfo.getVectorDimension()); + } + + public PQVectors getCompressedVectors() { + return pqVectors; + } + + public ImmutableGraphIndex getGraphIndex() { + indexBuilder.cleanup(); + return indexBuilder.getGraph(); + } + + @Override + public long ramBytesUsed() { + return SHALLOW_SIZE + + (long) vectors.size() * fieldInfo.getVectorDimension() * Float.BYTES + + docIds.ramBytesUsed(); + } + } + + static final class DelegatingBuildScoreProvider implements BuildScoreProvider { + BuildScoreProvider delegate; + + DelegatingBuildScoreProvider(BuildScoreProvider delegate) { + this.delegate = Objects.requireNonNull(delegate); + } + + public void setDelegate(BuildScoreProvider delegate) { + this.delegate = Objects.requireNonNull(delegate); + } + + @Override + public boolean isExact() { + return delegate.isExact(); + } + + @Override + public VectorFloat approximateCentroid() { + return delegate.approximateCentroid(); + } + + @Override + public SearchScoreProvider searchProviderFor(VectorFloat vector) { + return delegate.searchProviderFor(vector); + } + + @Override + public SearchScoreProvider searchProviderFor(int node1) { + return delegate.searchProviderFor(node1); + } + + @Override + public SearchScoreProvider diversityProviderFor(int node1) { + return delegate.diversityProviderFor(node1); + } + } + + private void mergeAndWriteField(FieldInfo fieldInfo, MergeState mergeState) throws IOException { + assert fieldInfo.hasVectorValues(); + 
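+    // Merge outline: (1) gather the live vectors from every sub-reader, (2) assign fresh jVector
+    // ordinals in merged-doc order, (3) optionally train PQ on the merged vectors, and (4) build
+    // and write a brand-new graph (ordinals are not carried over from the input segments).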
final int dimension = fieldInfo.getVectorDimension(); + final int mergeCount = mergeState.knnVectorsReaders.length; + + // Collect the sub-readers into a list to make a DocIdMerger + final List subs = new ArrayList<>(mergeCount); + final FloatVectorValues[] vectors = new FloatVectorValues[mergeCount]; + for (int i = 0; i < mergeCount; ++i) { + if (false == MergedVectorValues.hasVectorValues(mergeState.fieldInfos[i], fieldInfo.name)) { + continue; + } + final var reader = mergeState.knnVectorsReaders[i]; + if (reader == null) { + continue; + } + final var values = reader.getFloatVectorValues(fieldInfo.name); + if (values == null || values.size() == 0) { + continue; + } + + assert values.dimension() == dimension; + subs.add(new SubFloatVectors(mergeState.docMaps[i], i, values)); + vectors[i] = values; + } + + // These arrays may be larger than strictly necessary if there are deleted docs/missing fields + final int totalMaxDocs = Arrays.stream(mergeState.maxDocs).reduce(0, Math::addExact); + final DocsWithFieldSet docIds = new DocsWithFieldSet(); + final int[] ordToReaderIndex = new int[totalMaxDocs]; + final int[] ordToReaderOrd = new int[totalMaxDocs]; + + // Construct ordinal mappings for the new graph + int ord = 0; + final var docIdMerger = DocIDMerger.of(subs, mergeState.needsIndexSort); + for (var sub = docIdMerger.next(); sub != null; sub = docIdMerger.next()) { + docIds.add(sub.mappedDocID); + ordToReaderIndex[ord] = sub.readerIndex; + ordToReaderOrd[ord] = sub.index(); + ord += 1; + } + + final int totalLiveDocsCount = ord; + if (totalLiveDocsCount == 0) { + // Avoid writing an empty graph + return; + } + + // Make a RandomAccessVectorValues instance using the new graph ordinals + final var ravv = + new RandomAccessMergedFloatVectorValues( + totalLiveDocsCount, + dimension, + vectors, + i -> ordToReaderIndex[i], + i -> ordToReaderOrd[i]); + + final BuildScoreProvider buildScoreProvider; + final var similarityFunction = + JVectorFormat.toJVectorSimilarity(fieldInfo.getVectorSimilarityFunction()); + + // Perform PQ if applicable + final PQVectors pqVectors; + if (ravv.size() >= minimumBatchSizeForQuantization) { + final int M = numberOfSubspacesPerVectorSupplier.applyAsInt(ravv.dimension()); + final ProductQuantization newPQ = trainPQ(ravv, M, fieldInfo.getVectorSimilarityFunction()); + pqVectors = (PQVectors) newPQ.encodeAll(ravv); + buildScoreProvider = BuildScoreProvider.pqBuildScoreProvider(similarityFunction, pqVectors); + // Pre-init the diversity provider here to avoid doing it lazily (as it could block the SIMD + // threads) + buildScoreProvider.diversityProviderFor(0); + } else { + pqVectors = null; + buildScoreProvider = BuildScoreProvider.randomAccessScoreProvider(ravv, similarityFunction); + } + + final var graphNodeIdToDocMap = new GraphNodeIdToDocMap(docIds); + final var graph = + getGraph(buildScoreProvider, ravv, fieldInfo, mergeState.intraMergeTaskExecutor); + writeField(fieldInfo, ravv, pqVectors, null, graphNodeIdToDocMap, graph); + } + + private static final class SubFloatVectors extends DocIDMerger.Sub { + final int readerIndex; + final KnnVectorValues.DocIndexIterator iterator; + int docId = -1; + + SubFloatVectors(MergeState.DocMap docMap, int readerIndex, FloatVectorValues values) { + super(docMap); + this.readerIndex = readerIndex; + this.iterator = values.iterator(); + } + + @Override + public int nextDoc() throws IOException { + docId = iterator.nextDoc(); + return docId; + } + + public int index() { + return iterator.index(); + } + } + + private static 
+  final class RandomAccessMergedFloatVectorValues
+      implements RandomAccessVectorValues {
+    private final int size;
+    private final int dimension;
+    private final FloatVectorValues[] vectors;
+    private final IntUnaryOperator ordToReader;
+    private final IntUnaryOperator ordToReaderOrd;
+
+    public RandomAccessMergedFloatVectorValues(
+        int size,
+        int dimension,
+        FloatVectorValues[] values,
+        IntUnaryOperator ordToReader,
+        IntUnaryOperator ordToReaderOrd) {
+      this.size = size;
+      this.dimension = dimension;
+      this.vectors = values;
+      this.ordToReader = ordToReader;
+      this.ordToReaderOrd = ordToReaderOrd;
+    }
+
+    @Override
+    public RandomAccessMergedFloatVectorValues copy() {
+      final FloatVectorValues[] newVectors = new FloatVectorValues[vectors.length];
+      for (int i = 0; i < newVectors.length; ++i) {
+        if (vectors[i] != null) {
+          try {
+            newVectors[i] = vectors[i].copy();
+          } catch (IOException e) {
+            throw new UncheckedIOException(e);
+          }
+        }
+      }
+      return new RandomAccessMergedFloatVectorValues(
+          size, dimension, newVectors, ordToReader, ordToReaderOrd);
+    }
+
+    @Override
+    public int dimension() {
+      return dimension;
+    }
+
+    @Override
+    public VectorFloat<?> getVector(int node) {
+      final FloatVectorValues values = vectors[ordToReader.applyAsInt(node)];
+      final int ord = ordToReaderOrd.applyAsInt(node);
+
+      if (values instanceof JVectorFloatVectorValues jVectorValues) {
+        return jVectorValues.vectorFloatValue(ord);
+      }
+
+      try {
+        return VECTOR_TYPE_SUPPORT.createFloatVector(values.vectorValue(ord));
+      } catch (IOException e) {
+        throw new UncheckedIOException(e);
+      }
+    }
+
+    @Override
+    public void getVectorInto(int node, VectorFloat<?> destinationVector, int offset) {
+      final FloatVectorValues values = vectors[ordToReader.applyAsInt(node)];
+      final int ord = ordToReaderOrd.applyAsInt(node);
+
+      if (values instanceof JVectorFloatVectorValues jVectorValues) {
+        jVectorValues.getVectorInto(ord, destinationVector, offset);
+        // the delegate has already filled the destination buffer; don't fall through and copy again
+        return;
+      }
+
+      final VectorFloat<?> srcVector;
+      try {
+        srcVector = VECTOR_TYPE_SUPPORT.createFloatVector(values.vectorValue(ord));
+      } catch (IOException e) {
+        throw new UncheckedIOException(e);
+      }
+
+      destinationVector.copyFrom(srcVector, 0, offset, srcVector.length());
+    }
+
+    @Override
+    public boolean isValueShared() {
+      // force thread-local copies
+      return true;
+    }
+
+    @Override
+    public int size() {
+      return size;
+    }
+  }
+
+  /**
+   * Builds and returns the graph index for the given field's vectors.
+   *
+   * @return the in-memory {@link OnHeapGraphIndex}
+   */
+  public OnHeapGraphIndex getGraph(
+      BuildScoreProvider buildScoreProvider,
+      RandomAccessVectorValues randomAccessVectorValues,
+      FieldInfo fieldInfo,
+      Executor executor) {
+    assert randomAccessVectorValues.size() > 0 : "Cannot build empty graph";
+    final GraphIndexBuilder graphIndexBuilder =
+        new GraphIndexBuilder(
+            buildScoreProvider,
+            fieldInfo.getVectorDimension(),
+            maxDegrees,
+            beamWidth,
+            degreeOverflow,
+            alpha,
+            hierarchyEnabled,
+            true);
+
+    /*
+     * We cannot always use randomAccessVectorValues for the graph building because its size will
+     * not always correspond to the document count. To get the right mapping from docId to vector
+     * ordinal we need to use the merged float vectors. This is the case when we are merging
+     * segments and may have more documents than vectors.
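+     * For example, after merging segments with deletions the graph has one node per live vector,
+     * while the merged segment's document space still counts the deleted documents.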
+ */ + final OnHeapGraphIndex graphIndex; + final var vv = randomAccessVectorValues.threadLocalSupplier(); + + // parallel graph construction from the merge documents Ids + final int size = randomAccessVectorValues.size(); + IntStream.range(0, size) + .mapToObj( + ord -> + CompletableFuture.runAsync( + () -> graphIndexBuilder.addGraphNode(ord, vv.get().getVector(ord)), executor)) + .reduce((a, b) -> a.runAfterBoth(b, () -> {})) + .ifPresent(CompletableFuture::join); + graphIndexBuilder.cleanup(); + graphIndex = (OnHeapGraphIndex) graphIndexBuilder.getGraph(); + + return graphIndex; + } + + private static ProductQuantization trainPQ( + RandomAccessVectorValues vectors, + int M, + org.apache.lucene.index.VectorSimilarityFunction similarityFunction) { + final boolean globallyCenter = + switch (similarityFunction) { + case EUCLIDEAN -> true; + case COSINE, DOT_PRODUCT, MAXIMUM_INNER_PRODUCT -> false; + }; + final int numberOfClustersPerSubspace = Math.min(256, vectors.size()); + // This extracts a random minimal subset of the vectors for training the PQ codebooks + return + ProductQuantization.compute( + vectors, + M, + numberOfClustersPerSubspace, + globallyCenter); + } + + static class RandomAccessVectorValuesOverVectorValues implements RandomAccessVectorValues { + private final FloatVectorValues values; + + public RandomAccessVectorValuesOverVectorValues(FloatVectorValues values) { + this.values = values; + } + + @Override + public int size() { + return values.size(); + } + + @Override + public int dimension() { + return values.dimension(); + } + + @Override + public VectorFloat getVector(int nodeId) { + try { + final float[] vector = values.vectorValue(nodeId); + return VECTOR_TYPE_SUPPORT.createFloatVector(Arrays.copyOf(vector, vector.length)); + } catch (IOException e) { + throw new UncheckedIOException(e); + } + } + + @Override + public boolean isValueShared() { + // Access to float values is not thread safe + return true; + } + + @Override + public RandomAccessVectorValues copy() { + try { + return new RandomAccessVectorValuesOverVectorValues(values.copy()); + } catch (IOException e) { + throw new UncheckedIOException(e); + } + } + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/package-info.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/package-info.java new file mode 100644 index 000000000000..5f05b040c88a --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/package-info.java @@ -0,0 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * This package contains the implementation of the JVector codec, a Lucene codec for approximate + * nearest neighbor search using vector quantization and HNSW graph indexing. 
It is based on the + * OpenSearch JVector codec and optimized for Lucene. + */ +package org.apache.lucene.sandbox.codecs.jvector; diff --git a/lucene/sandbox/src/resources/META-INF/services/org.apache.lucene.codecs.KnnVectorsFormat b/lucene/sandbox/src/resources/META-INF/services/org.apache.lucene.codecs.KnnVectorsFormat index 29a44d2ecfa8..84f11e50fd0a 100644 --- a/lucene/sandbox/src/resources/META-INF/services/org.apache.lucene.codecs.KnnVectorsFormat +++ b/lucene/sandbox/src/resources/META-INF/services/org.apache.lucene.codecs.KnnVectorsFormat @@ -14,3 +14,4 @@ # limitations under the License. org.apache.lucene.sandbox.codecs.faiss.FaissKnnVectorsFormat +org.apache.lucene.sandbox.codecs.jvector.JVectorFormat diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/jvector/TestJVectorFormat.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/jvector/TestJVectorFormat.java new file mode 100644 index 000000000000..0f1413bbcce9 --- /dev/null +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/jvector/TestJVectorFormat.java @@ -0,0 +1,1622 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.jvector; + +import static org.apache.lucene.index.VectorEncoding.FLOAT32; +import static org.apache.lucene.index.VectorSimilarityFunction.COSINE; +import static org.apache.lucene.index.VectorSimilarityFunction.DOT_PRODUCT; +import static org.apache.lucene.index.VectorSimilarityFunction.EUCLIDEAN; +import static org.apache.lucene.sandbox.codecs.jvector.JVectorFormat.DEFAULT_MINIMUM_BATCH_SIZE_FOR_QUANTIZATION; + +import com.carrotsearch.randomizedtesting.ThreadFilter; +import com.carrotsearch.randomizedtesting.annotations.ThreadLeakFilters; +import java.io.IOException; +import java.nio.file.Path; +import java.util.*; +import java.util.concurrent.*; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicInteger; +import org.apache.lucene.codecs.Codec; +import org.apache.lucene.document.*; +import org.apache.lucene.index.*; +import org.apache.lucene.search.*; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FSDirectory; +import org.apache.lucene.tests.index.BaseKnnVectorsFormatTestCase; +import org.apache.lucene.tests.index.RandomIndexWriter; +import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.tests.util.TestUtil; +import org.apache.lucene.util.NamedThreadFactory; +import org.junit.Assert; +import org.junit.Ignore; +import org.junit.Test; + +/** Test used specifically for JVector */ +// Currently {@link IndexGraphBuilder} is using the default ForkJoinPool.commonPool() which is not +// being shutdown. 
+// Ignore thread leaks until we remove the ForkJoinPool.commonPool() usage from IndexGraphBuilder +// TODO: Wire the execution thread pool to {@link IndexGraphBuilder} to avoid the failure of the UT +// due to leaked thread pool warning. +@ThreadLeakFilters( + defaultFilters = true, + filters = {TestJVectorFormat.ThreadLeakFilter.class}) +public class TestJVectorFormat extends BaseKnnVectorsFormatTestCase { + private static final VectorEncoding[] SUPPORTED_ENCODINGS = {FLOAT32}; + private static final VectorSimilarityFunction[] SUPPORTED_FUNCTIONS = { + DOT_PRODUCT, EUCLIDEAN, COSINE + }; + private static final String TEST_FIELD = "test_field"; + private static final String TEST_ID_FIELD = "id"; + + @Override + @Ignore("Does not honor visitedLimit") + public void testSearchWithVisitedLimit() {} + + @Override + @Ignore("Does not support byte vectors") + public void testByteVectorScorerIteration() {} + + @Override + @Ignore("Does not support byte vectors") + public void testMismatchedFields() {} + + @Override + @Ignore("Does not support byte vectors") + public void testSortedIndexBytes() {} + + @Override + @Ignore("Does not support byte vectors") + public void testRandomBytes() {} + + @Override + @Ignore("Does not support byte vectors") + public void testEmptyByteVectorData() {} + + @Override + @Ignore("Does not support byte vectors") + public void testMergingWithDifferentByteKnnFields() {} + + /** + * Test to verify that the JVector codec is able to successfully search for the nearest neighbours + * in the index. Single field is used to store the vectors. All the documents are stored in a + * single segment. Single commit without refreshing the index. No merge. + */ + @Test + public void testJVectorKnnIndex_simpleCase() throws IOException { + int k = 3; // The number of nearest neighbors to gather + int totalNumberOfDocs = 10; + IndexWriterConfig indexWriterConfig = LuceneTestCase.newIndexWriterConfig(); + indexWriterConfig.setUseCompoundFile(false); + indexWriterConfig.setCodec(getCodec()); + indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy()); + final Path indexPath = createTempDir(); + try (FSDirectory dir = FSDirectory.open(indexPath); + IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { + final float[] target = new float[] {0.0f, 0.0f}; + for (int i = 1; i < totalNumberOfDocs + 1; i++) { + final float[] source = new float[] {0.0f, 1.0f / i}; + final Document doc = new Document(); + doc.add(new KnnFloatVectorField("test_field", source, VectorSimilarityFunction.EUCLIDEAN)); + w.addDocument(doc); + } + // Flush docs to make them discoverable on the file system + w.commit(); + + try (IndexReader reader = DirectoryReader.open(w)) { + // We should now have a single segment with 10 documents; + Assert.assertEquals(1, reader.getContext().leaves().size()); + Assert.assertEquals(totalNumberOfDocs, reader.numDocs()); + + final Query filterQuery = new MatchAllDocsQuery(); + final IndexSearcher searcher = newSearcher(reader); + final KnnFloatVectorQuery knnFloatVectorQuery = + new KnnFloatVectorQuery("test_field", target, k, filterQuery); + TopDocs topDocs = searcher.search(knnFloatVectorQuery, k); + assertEquals(k, topDocs.totalHits.value()); + assertEquals(9, topDocs.scoreDocs[0].doc); + Assert.assertEquals( + VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] {0.0f, 1.0f / 10.0f}), + topDocs.scoreDocs[0].score, + 0.001f); + assertEquals(8, topDocs.scoreDocs[1].doc); + Assert.assertEquals( + VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] {0.0f, 
1.0f / 9.0f}), + topDocs.scoreDocs[1].score, + 0.001f); + assertEquals(7, topDocs.scoreDocs[2].doc); + Assert.assertEquals( + VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] {0.0f, 1.0f / 8.0f}), + topDocs.scoreDocs[2].score, + 0.001f); + } + } + } + + /** Test the scenario when not all documents are populated with the vector field */ + @Test + public void testMissing_fields() throws IOException { + final int k = 3; // The number of nearest neighbors to gather + final int totalNumberOfDocs = 10; + final VectorSimilarityFunction vectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN; + IndexWriterConfig indexWriterConfig = LuceneTestCase.newIndexWriterConfig(); + indexWriterConfig.setUseCompoundFile(false); + indexWriterConfig.setCodec(getCodec()); + indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy()); + final Path indexPath = createTempDir(); + try (FSDirectory dir = FSDirectory.open(indexPath); + IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { + final float[] target = new float[] {0.0f, 0.0f}; + for (int i = 0; i < totalNumberOfDocs; i++) { + final Document doc = new Document(); + if (i % 2 == 0) { + final float[] source = new float[] {0.0f, i}; + doc.add(new KnnFloatVectorField("test_field", source, vectorSimilarityFunction)); + } + doc.add(new IntField(TEST_ID_FIELD, i, Field.Store.YES)); + w.addDocument(doc); + } + // Flush docs to make them discoverable on the file system + w.commit(); + + try (IndexReader reader = DirectoryReader.open(w)) { + // We should now have a single segment with 10 documents + Assert.assertEquals(1, reader.getContext().leaves().size()); + Assert.assertEquals(totalNumberOfDocs, reader.numDocs()); + + final Query filterQuery = new MatchAllDocsQuery(); + final IndexSearcher searcher = newSearcher(reader); + final KnnFloatVectorQuery knnFloatVectorQuery = + new KnnFloatVectorQuery("test_field", target, k, filterQuery); + TopDocs topDocs = searcher.search(knnFloatVectorQuery, k); + assertEquals(k, topDocs.totalHits.value()); + assertEquals(0, topDocs.scoreDocs[0].doc); + Assert.assertEquals( + VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] {0.0f, 0.0f}), + topDocs.scoreDocs[0].score, + 0.001f); + assertEquals(2, topDocs.scoreDocs[1].doc); + Assert.assertEquals( + VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] {0.0f, 2.0f}), + topDocs.scoreDocs[1].score, + 0.001f); + assertEquals(4, topDocs.scoreDocs[2].doc); + Assert.assertEquals( + VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] {0.0f, 4.0f}), + topDocs.scoreDocs[2].score, + 0.001f); + } + } + } + + /** + * Test the scenario when the index is sorted by a doc value We want to make sure the docIDs are + * correctly mapped to the jVector ordinals + * + * @throws IOException if an I/O error occurs + */ + @Test + public void test_sorted_index() throws IOException { + final int k = 3; // The number of nearest neighbors to gather + final int totalNumberOfDocs = 10; + final VectorSimilarityFunction vectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN; + final String sortFieldName = "sorted_field"; + IndexWriterConfig indexWriterConfig = LuceneTestCase.newIndexWriterConfig(); + indexWriterConfig.setUseCompoundFile(false); + indexWriterConfig.setCodec(getCodec()); + indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy()); + // Add index sorting configuration + indexWriterConfig.setIndexSort( + new Sort(new SortField(sortFieldName, SortField.Type.INT, true))); // true = reverse order + + final Path indexPath = 
createTempDir(); + try (FSDirectory dir = FSDirectory.open(indexPath); + IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { + final float[] target = new float[] {0.0f, 0.0f}; + for (int i = 0; i < totalNumberOfDocs; i++) { + final Document doc = new Document(); + final float[] source = new float[] {0.0f, i}; + doc.add(new KnnFloatVectorField("test_field", source, vectorSimilarityFunction)); + doc.add(new IntField(TEST_ID_FIELD, i, Field.Store.YES)); + // Add the sortable field + doc.add(new NumericDocValuesField(sortFieldName, i)); + w.addDocument(doc); + } + // Flushing docs to make them discoverable on the file system + w.commit(); + + try (IndexReader reader = DirectoryReader.open(w)) { + // We should now have a single segment with 10 documents + Assert.assertEquals(1, reader.getContext().leaves().size()); + Assert.assertEquals(totalNumberOfDocs, reader.numDocs()); + + final Query filterQuery = new MatchAllDocsQuery(); + final IndexSearcher searcher = newSearcher(reader); + final KnnFloatVectorQuery knnFloatVectorQuery = + new KnnFloatVectorQuery("test_field", target, k, filterQuery); + TopDocs topDocs = searcher.search(knnFloatVectorQuery, k); + assertEquals(k, topDocs.totalHits.value()); + assertEquals(9, topDocs.scoreDocs[0].doc); + assertEquals( + 0, + reader + .storedFields() + .document(topDocs.scoreDocs[0].doc) + .getField(TEST_ID_FIELD) + .numericValue() + .intValue()); + Assert.assertEquals( + VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] {0.0f, 0.0f}), + topDocs.scoreDocs[0].score, + 0.001f); + assertEquals(8, topDocs.scoreDocs[1].doc); + assertEquals( + 1, + reader + .storedFields() + .document(topDocs.scoreDocs[1].doc) + .getField(TEST_ID_FIELD) + .numericValue() + .intValue()); + Assert.assertEquals( + VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] {0.0f, 1.0f}), + topDocs.scoreDocs[1].score, + 0.001f); + assertEquals(7, topDocs.scoreDocs[2].doc); + assertEquals( + 2, + reader + .storedFields() + .document(topDocs.scoreDocs[2].doc) + .getField(TEST_ID_FIELD) + .numericValue() + .intValue()); + Assert.assertEquals( + VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] {0.0f, 2.0f}), + topDocs.scoreDocs[2].score, + 0.001f); + } + } + } + + /** + * Test to verify that the JVector codec is able to successfully search for the nearest neighbours + * in the index. Single field is used to store the vectors. Documents are stored in a multiple + * segments. Multiple commits without refreshing the index. No merge. 
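+   * <p>Because every document is committed individually, each lands in its own single-document
+   * segment, so the query below must fan out across ten independent jVector graphs.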
+ */ + @Test + public void testJVectorKnnIndex_multipleSegments() throws IOException { + int k = 3; // The number of nearest neighbours to gather + int totalNumberOfDocs = 10; + IndexWriterConfig indexWriterConfig = LuceneTestCase.newIndexWriterConfig(); + indexWriterConfig.setUseCompoundFile(false); + indexWriterConfig.setCodec(getCodec()); + indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy(false)); + final Path indexPath = createTempDir(); + try (FSDirectory dir = FSDirectory.open(indexPath); + IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { + final float[] target = new float[] {0.0f, 0.0f}; + for (int i = 1; i < totalNumberOfDocs + 1; i++) { + final float[] source = new float[] {0.0f, 1.0f / i}; + final Document doc = new Document(); + doc.add(new KnnFloatVectorField("test_field", source, VectorSimilarityFunction.EUCLIDEAN)); + w.addDocument(doc); + w.commit(); // this creates a new segment + } + + try (IndexReader reader = DirectoryReader.open(w)) { + // We should now have 10 segments, each with a single document + Assert.assertEquals(10, reader.getContext().leaves().size()); + Assert.assertEquals(totalNumberOfDocs, reader.numDocs()); + final Query filterQuery = new MatchAllDocsQuery(); + final IndexSearcher searcher = newSearcher(reader); + final KnnFloatVectorQuery knnFloatVectorQuery = + new KnnFloatVectorQuery("test_field", target, k, filterQuery); + TopDocs topDocs = searcher.search(knnFloatVectorQuery, k); + assertEquals(k, topDocs.totalHits.value()); + assertEquals(9, topDocs.scoreDocs[0].doc); + Assert.assertEquals( + VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] {0.0f, 1.0f / 10.0f}), + topDocs.scoreDocs[0].score, + 0.001f); + assertEquals(8, topDocs.scoreDocs[1].doc); + Assert.assertEquals( + VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] {0.0f, 1.0f / 9.0f}), + topDocs.scoreDocs[1].score, + 0.001f); + assertEquals(7, topDocs.scoreDocs[2].doc); + Assert.assertEquals( + VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] {0.0f, 1.0f / 8.0f}), + topDocs.scoreDocs[2].score, + 0.001f); + } + } + } + + /** + * Test to verify that the JVector codec is able to successfully search for the nearest neighbours + * in the index. Single field is used to store the vectors. Documents are stored in a multiple + * segments. Multiple commits without refreshing the index. Merge is enabled. 
+ */ + @Test + public void testJVectorKnnIndex_mergeEnabled() throws IOException { + int k = 3; // The number of nearest neighbours to gather + int totalNumberOfDocs = 10; + IndexWriterConfig indexWriterConfig = LuceneTestCase.newIndexWriterConfig(); + indexWriterConfig.setUseCompoundFile(false); + indexWriterConfig.setCodec(getCodec()); + indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy()); + indexWriterConfig.setMergeScheduler(new SerialMergeScheduler()); + final Path indexPath = createTempDir(); + try (FSDirectory dir = FSDirectory.open(indexPath); + IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { + final float[] target = new float[] {0.0f, 0.0f}; + for (int i = 1; i < totalNumberOfDocs + 1; i++) { + final float[] source = new float[] {0.0f, 1.0f * i}; + final Document doc = new Document(); + doc.add(new KnnFloatVectorField("test_field", source, VectorSimilarityFunction.EUCLIDEAN)); + doc.add(new StringField("my_doc_id", Integer.toString(i, 10), Field.Store.YES)); + w.addDocument(doc); + w.commit(); // this creates a new segment without triggering a merge + } + + w.forceMerge(1); // this merges all segments into a single segment + try (IndexReader reader = DirectoryReader.open(w)) { + // We should now have 1 segment with 10 documents + Assert.assertEquals(1, reader.getContext().leaves().size()); + Assert.assertEquals(totalNumberOfDocs, reader.numDocs()); + final Query filterQuery = new MatchAllDocsQuery(); + final IndexSearcher searcher = newSearcher(reader); + final KnnFloatVectorQuery knnFloatVectorQuery = + new KnnFloatVectorQuery("test_field", target, k, filterQuery); + TopDocs topDocs = searcher.search(knnFloatVectorQuery, k); + assertEquals(k, topDocs.totalHits.value()); + Document doc = reader.storedFields().document(topDocs.scoreDocs[0].doc); + assertEquals("1", doc.get("my_doc_id")); + Assert.assertEquals( + VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] {0.0f, 1.0f}), + topDocs.scoreDocs[0].score, + 0.001f); + doc = reader.storedFields().document(topDocs.scoreDocs[1].doc); + assertEquals("2", doc.get("my_doc_id")); + Assert.assertEquals( + VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] {0.0f, 2.0f}), + topDocs.scoreDocs[1].score, + 0.001f); + doc = reader.storedFields().document(topDocs.scoreDocs[2].doc); + assertEquals("3", doc.get("my_doc_id")); + Assert.assertEquals( + VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] {0.0f, 3.0f}), + topDocs.scoreDocs[2].score, + 0.001f); + } + } + } + + /** + * Test to verify that the jVector codec is able to successfully search for the nearest neighbors + * in the index. Single field is used to store the vectors. Documents are stored in potentially + * multiple segments. Multiple commits. Multiple merges. 
+ */ + @Test + public void multipleMerges() throws IOException { + int k = 3; // The number of nearest neighbours to gather + int totalNumberOfDocs = 10; + IndexWriterConfig indexWriterConfig = LuceneTestCase.newIndexWriterConfig(); + indexWriterConfig.setUseCompoundFile(false); + indexWriterConfig.setCodec(getCodec()); + indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy()); + indexWriterConfig.setMergeScheduler(new SerialMergeScheduler()); + final Path indexPath = createTempDir(); + final VectorSimilarityFunction vectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN; + try (FSDirectory dir = FSDirectory.open(indexPath); + IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { + final float[] target = new float[] {0.0f, 0.0f}; + for (int i = 1; i < totalNumberOfDocs + 1; i++) { + final float[] source = new float[] {0.0f, 1.0f * i}; + final Document doc = new Document(); + doc.add(new KnnFloatVectorField("test_field", source, vectorSimilarityFunction)); + doc.add(new StringField("my_doc_id", Integer.toString(i, 10), Field.Store.YES)); + w.addDocument(doc); + w.commit(); // this creates a new segment without triggering a merge + w.forceMerge(1); // this merges all segments into a single segment + } + + w.forceMerge(1); // this merges all segments into a single segment + try (IndexReader reader = DirectoryReader.open(w)) { + // We should now have 1 segment with 10 documents + Assert.assertEquals(1, reader.getContext().leaves().size()); + Assert.assertEquals(totalNumberOfDocs, reader.numDocs()); + final Query filterQuery = new MatchAllDocsQuery(); + final IndexSearcher searcher = newSearcher(reader); + final KnnFloatVectorQuery knnFloatVectorQuery = + new KnnFloatVectorQuery("test_field", target, k, filterQuery); + TopDocs topDocs = searcher.search(knnFloatVectorQuery, k); + assertEquals(k, topDocs.totalHits.value()); + Document doc = reader.storedFields().document(topDocs.scoreDocs[0].doc); + assertEquals("1", doc.get("my_doc_id")); + Assert.assertEquals( + vectorSimilarityFunction.compare(target, new float[] {0.0f, 1.0f}), + topDocs.scoreDocs[0].score, + 0.001f); + doc = reader.storedFields().document(topDocs.scoreDocs[1].doc); + assertEquals("2", doc.get("my_doc_id")); + Assert.assertEquals( + VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] {0.0f, 2.0f}), + topDocs.scoreDocs[1].score, + 0.001f); + doc = reader.storedFields().document(topDocs.scoreDocs[2].doc); + assertEquals("3", doc.get("my_doc_id")); + Assert.assertEquals( + VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] {0.0f, 3.0f}), + topDocs.scoreDocs[2].score, + 0.001f); + } + } + } + + /** + * Test to verify that the jVector codec is able to successfully search for the nearest neighbours + * in the index. A Single field is used to store the vectors. Documents are stored in potentially + * multiple segments. Multiple commits. Multiple merges. 
Large batches Use a compound file + */ + @Test + public void testJVectorKnnIndex_multiple_merges_large_batches_no_quantization() + throws IOException { + int segmentSize = DEFAULT_MINIMUM_BATCH_SIZE_FOR_QUANTIZATION; + int totalNumberOfDocs = segmentSize * 4; + int k = 3; // The number of nearest neighbors to gather + + IndexWriterConfig indexWriterConfig = LuceneTestCase.newIndexWriterConfig(); + indexWriterConfig.setUseCompoundFile(true); + indexWriterConfig.setCodec(getCodec(Integer.MAX_VALUE)); // effectively without quantization + indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy(true)); + indexWriterConfig.setMergeScheduler(new SerialMergeScheduler()); + // We set the below parameters to make sure no permature flush will occur, this way we can have + // a single segment, and we can force + // test the quantization case + indexWriterConfig.setMaxBufferedDocs( + 10000); // force flush every 10000 docs, this way we make sure that we only have a single + // segment for a totalNumberOfDocs < 1000 + indexWriterConfig.setRAMPerThreadHardLimitMB( + 1000); // 1000MB per thread, this way we make sure that no premature flush will occur + + final Path indexPath = createTempDir(); + try (FSDirectory dir = FSDirectory.open(indexPath); + IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { + final float[] target = new float[] {0.0f, 0.0f}; + for (int i = 1; i < totalNumberOfDocs + 1; i++) { + final float[] source = new float[] {0.0f, 1.0f / i}; + final Document doc = new Document(); + doc.add(new KnnFloatVectorField("test_field", source, VectorSimilarityFunction.EUCLIDEAN)); + doc.add(new StringField("my_doc_id", Integer.toString(i, 10), Field.Store.YES)); + w.addDocument(doc); + if (i % segmentSize == 0) { + w.commit(); // this creates a new segment without triggering a merge + } + } + + w.forceMerge(1); // this merges all segments into a single segment + try (IndexReader reader = DirectoryReader.open(w)) { + // We should now have 1 segment with totalNumberOfDocs documents + Assert.assertEquals(1, reader.getContext().leaves().size()); + Assert.assertEquals(totalNumberOfDocs, reader.numDocs()); + final Query filterQuery = new MatchAllDocsQuery(); + final IndexSearcher searcher = newSearcher(reader); + final KnnFloatVectorQuery knnFloatVectorQuery = + new KnnFloatVectorQuery("test_field", target, k, filterQuery); + TopDocs topDocs = searcher.search(knnFloatVectorQuery, k); + assertEquals(k, topDocs.totalHits.value()); + + float expectedMinScoreInTopK = + VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] {0.0f, k}); + final float recall = calculateRecall(topDocs, expectedMinScoreInTopK); + Assert.assertEquals(1.0f, recall, 0.01f); + } + } + } + + /** + * Similar to testJVectorKnnIndex_multiple_merges_large_batches_no_quantization but with random + * vectors It's important to add more randomness to the vectors to make sure the graph is not + * linear + * + * @throws IOException if an I/O error occurs + */ + @Test + public void + testJVectorKnnIndex_multiple_merges_large_batches_no_quantization_with_random_vectors() + throws IOException { + int segmentSize = 200; + int totalNumberOfDocs = segmentSize * 4; + int k = 3; // The number of nearest neighbors to gather + final int dimension = 2; + final VectorSimilarityFunction vectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN; + final float[] target = generateRandomVectors(1, dimension)[0]; + final float[][] source = generateRandomVectors(totalNumberOfDocs, dimension); + final Set groundTruthVectorsIds = + 
calculateGroundTruthVectorsIds(target, source, k, vectorSimilarityFunction); + + IndexWriterConfig indexWriterConfig = LuceneTestCase.newIndexWriterConfig(); + indexWriterConfig.setUseCompoundFile(true); + indexWriterConfig.setCodec(getCodec(Integer.MAX_VALUE)); // effectively without quantization + indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy(true)); + indexWriterConfig.setMergeScheduler(new SerialMergeScheduler()); + // We set the below parameters to make sure no permature flush will occur, this way we can have + // a single segment, and we can force + // test the quantization case + indexWriterConfig.setMaxBufferedDocs( + 10000); // force flush every 10000 docs, this way we make sure that we only have a single + // segment for a totalNumberOfDocs < 1000 + indexWriterConfig.setRAMPerThreadHardLimitMB( + 1000); // 1000MB per thread, this way we make sure that no premature flush will occur + + final Path indexPath = createTempDir(); + try (FSDirectory dir = FSDirectory.open(indexPath); + IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { + for (int i = 0; i < source.length; i++) { + final Document doc = new Document(); + doc.add(new KnnFloatVectorField(TEST_FIELD, source[i], VectorSimilarityFunction.EUCLIDEAN)); + doc.add(new IntField(TEST_ID_FIELD, i, Field.Store.YES)); + w.addDocument(doc); + if (i % segmentSize == 0) { + w.commit(); // this creates a new segment without triggering a merge + } + } + + w.forceMerge(1); // this merges all segments into a single segment + try (IndexReader reader = DirectoryReader.open(w)) { + // We should now have a single segment with totalNumberOfDocs documents + Assert.assertEquals(1, reader.getContext().leaves().size()); + Assert.assertEquals(totalNumberOfDocs, reader.numDocs()); + + final Query filterQuery = new MatchAllDocsQuery(); + final IndexSearcher searcher = newSearcher(reader); + final KnnFloatVectorQuery knnFloatVectorQuery = + new KnnFloatVectorQuery(TEST_FIELD, target, k, filterQuery); + TopDocs topDocs = searcher.search(knnFloatVectorQuery, k); + assertEquals(k, topDocs.totalHits.value()); + final float recall = calculateRecall(reader, groundTruthVectorsIds, topDocs, k); + Assert.assertEquals(1.0f, recall, 0.05f); + } + } + } + + /** + * Tests the functionality and integrity of a Lucene k-NN index under multiple merge cycles and + * verifies the proper ordering of vectors and document identifiers. + * + *
+   * <p>The method performs the following validation steps:
+   *
+   * <ol>
+   *   <li>Indexes a predefined number of documents into a Lucene index, creating many small
+   *       segments. Each document includes a k-NN float vector field encoding a specific order.
+   *   <li>Executes several merge operations on the index (partial and full merges) to validate
+   *       that the merging process maintains correctness and consistency.
+   *   <li>Validates the following invariants post-merge: (a) the index is merged into a single
+   *       segment; (b) vector values remain intact, verified by iterating through the merged
+   *       segment and checking the relationship between vector components and document
+   *       identifiers; (c) k-NN searches return correct results, both single-threaded and under
+   *       concurrent access, without exhausting file handles or encountering other issues.
+   * </ol>
+   *
Assertions are used throughout to ensure the state of the index matches the expected + * behavior, validate merge results, and confirm the accuracy of search operations. The test also + * logs the number of successful k-NN queries during the concurrent search phase. + * + * @throws IOException if an I/O error occurs during index operations. + * @throws InterruptedException if the concurrent search phase is interrupted. + */ + @Test + public void testLuceneKnnIndex_multipleMerges_with_ordering_check() + throws IOException, InterruptedException { + final int numDocs = 10000; + final String floatVectorField = "vec"; + final String expectedDocIdField = "expectedDocId"; + final Path indexPath = createTempDir(); + final float[][] sourceVectors = generateRandomVectors(numDocs, 2); + final VectorSimilarityFunction vectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN; + + try (Directory dir = newFSDirectory(indexPath)) { + IndexWriterConfig cfg = newIndexWriterConfig(); + cfg.setCodec(getCodec()); + cfg.setUseCompoundFile(false); + cfg.setMergePolicy(new ForceMergesOnlyMergePolicy(false)); + cfg.setMergeScheduler(new SerialMergeScheduler()); + + try (IndexWriter w = new IndexWriter(dir, cfg)) { + /* ---------- 1. index documents, create many tiny segments ---------- */ + for (int i = 0; i < numDocs; i++) { + Document doc = new Document(); + // vector whose first component encodes the future (segment-local) docID + doc.add( + new KnnFloatVectorField( + floatVectorField, sourceVectors[i], vectorSimilarityFunction)); + doc.add(new StoredField(expectedDocIdField, i)); + w.addDocument(doc); + } + w.commit(); + + /* ---------- 2. run several merge cycles ---------- */ + w.forceMerge(5); // partial merge + w.forceMerge(3); // another partial merge + w.forceMerge(1); // final full merge + } + + /* ---------- 3. 
open reader and assert the invariant ---------- */ + try (DirectoryReader reader = DirectoryReader.open(dir)) { + assertEquals("we merged down to exactly one segment", 1, reader.leaves().size()); + + // (a) iterate through vectors directly + for (LeafReaderContext context : reader.leaves()) { + FloatVectorValues vectorValues = context.reader().getFloatVectorValues("vec"); + final var docIdSetIterator = + vectorValues.iterator(); // iterator for all the vectors with values + int docId = -1; + while ((docId = docIdSetIterator.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { + final int luceneDocId = context.docBase + docId; + final int globalDocId = + reader + .storedFields() + .document(luceneDocId) + .getField(expectedDocIdField) + .storedValue() + .getIntValue(); + float[] vectorValue = vectorValues.vectorValue(docIdSetIterator.index()); + float[] expectedVectorValue = sourceVectors[globalDocId]; + Assert.assertArrayEquals( + "vector with global id " + + globalDocId + + " in source doesn't match vector value in lucene docID " + + luceneDocId + + " on the index", + expectedVectorValue, + vectorValue, + 0.0f); + } + } + + // (b) search with the same vector and confirm we are not exhausting the file handles with + // each search + IndexSearcher searcher = newSearcher(reader); + LeafReaderContext context = + reader + .leaves() + .get(0); // we only have one leaf at this point so we can use it to obtain the + // vector values + final int baseDocId = context.docBase; + final FloatVectorValues vectorValues = context.reader().getFloatVectorValues("vec"); + final int k = 1; + for (int i = 0; i < reader.maxDoc(); i++) { + float[] query = generateRandomVectors(1, 2)[0]; + TopDocs td = + searcher.search(new KnnFloatVectorQuery("vec", query, k, new MatchAllDocsQuery()), k); + assertEquals(k, td.scoreDocs.length); + + compareSearchResults( + td, sourceVectors, reader, expectedDocIdField, baseDocId, vectorValues); + } + + // (c) search with the same vector and this time add concurrency to make sure we are still + // not exhausting the file handles + int numThreads = 10; // Number of concurrent search threads + int queriesPerThread = 100; // Number of searches per thread + ExecutorService executor = + Executors.newFixedThreadPool(numThreads, new NamedThreadFactory("KNNJVectorTests")); + CountDownLatch latch = new CountDownLatch(numThreads); + AtomicBoolean failureDetected = new AtomicBoolean(false); + AtomicInteger totalQueries = new AtomicInteger(0); + + try { + for (int t = 0; t < numThreads; t++) { + executor.submit( + () -> { + int i = 0; + + try { + for (i = 0; i < queriesPerThread && !failureDetected.get(); i++) { + float[] query = generateRandomVectors(1, 2)[0]; + try { + TopDocs td = searcher.search(new KnnFloatVectorQuery("vec", query, k), k); + assertEquals( + "Search should return correct number of results", + k, + td.scoreDocs.length); + compareSearchResults( + td, sourceVectors, reader, expectedDocIdField, baseDocId, vectorValues); + totalQueries.incrementAndGet(); + } catch (Throwable e) { + failureDetected.compareAndSet(false, true); + fail("Exception during concurrent search: " + e.getMessage()); + } + } + } finally { + latch.countDown(); + } + }); + } + + // Wait for all threads to complete or for a failure + boolean completed = latch.await(30, TimeUnit.SECONDS); + assertTrue("Test timed out while waiting for concurrent searches", completed); + assertFalse( + "Test encountered failures during concurrent searches", failureDetected.get()); + assertEquals( + "Incorrect number of queries 
executed", + numThreads * queriesPerThread, + totalQueries.get()); + + } finally { + executor.shutdownNow(); + } + } + } + } + + private void compareSearchResults( + TopDocs topDocs, + float[][] sourceVectors, + DirectoryReader reader, + String expectedDocIdField, + int baseDocId, + FloatVectorValues vectorValues) + throws IOException { + // Get the ords matching the lucene doc ids so that we can later find their values in the {@link + // vectorValues} + final Map docToOrdMap = new HashMap<>(); // docToOrd map + final var docIdSetIterator = vectorValues.iterator(); + while (docIdSetIterator.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { + docToOrdMap.put(docIdSetIterator.docID() + baseDocId, docIdSetIterator.index()); + } + + for (int resultIdx = 0; resultIdx < topDocs.scoreDocs.length; resultIdx++) { + final int localDocId = topDocs.scoreDocs[resultIdx].doc; + final int globalDocId = + reader + .storedFields() + .document(localDocId) + .getField(expectedDocIdField) + .storedValue() + .getIntValue(); + + // Access to float values is not thread safe + final float[] vectorValue; + synchronized (vectorValues) { + vectorValue = vectorValues.vectorValue(docToOrdMap.get(localDocId)); + } + float[] expectedVectorValue = sourceVectors[globalDocId]; + Assert.assertArrayEquals( + "vectors in source and index should match", expectedVectorValue, vectorValue, 0.0f); + } + } + + /** + * Test to verify that a document which has been deleted is no longer returned in a k-NN search. + * The index uses the JVector codec and is kept in multiple segments to ensure we also cover the + * case where the deleted document still physically resides in the segment as a dead (non-live) + * record. + */ + @Test + public void deletedDocs() throws IOException { + final int totalNumberOfDocs = 100; + final int batchSize = 10; + final int k = batchSize - 1; + final int docToDeleteInEachBatch = 5; + final Path indexPath = createTempDir(); + final IndexWriterConfig iwc = newIndexWriterConfig(); + // JVector codec requires compound files to be disabled at the moment + iwc.setUseCompoundFile(false); + iwc.setCodec(getCodec()); + iwc.setMergePolicy(new ForceMergesOnlyMergePolicy(false)); + + try (FSDirectory dir = FSDirectory.open(indexPath); + IndexWriter writer = new IndexWriter(dir, iwc)) { + + /* + * 1. Index 100 docs, in batches of 10. Delete the 5th doc in each batch. + * will leave us with 10 segments, each with 9 live docs. + */ + int batchNumber = 0; + for (int i = 1; i <= totalNumberOfDocs; i++) { + Document doc = new Document(); + final float[] vector = {0.0f, 1.0f * (i + batchNumber)}; + doc.add(new StringField("docId", Integer.toString(i + 1), Field.Store.YES)); + doc.add(new KnnFloatVectorField("test_field", vector, VectorSimilarityFunction.EUCLIDEAN)); + writer.addDocument(doc); + if (i % batchSize == 0) { + writer.flush(); + writer.deleteDocuments( + new TermQuery(new Term("docId", Integer.toString(i - docToDeleteInEachBatch)))); + batchNumber++; + } + } + writer.commit(); + + /* ---------------------------------------- + * 2. Merge all segments into one + * ---------------------------------------- */ + writer.forceMerge(1); + + /* ---------------------------------------- + * 3. 
+
+ /**
+ * Test to verify that a document which has been deleted is no longer returned in a k-NN search.
+ * The index uses the JVector codec and is kept in multiple segments to ensure we also cover the
+ * case where the deleted document still physically resides in the segment as a dead (non-live)
+ * record.
+ */
+ @Test
+ public void deletedDocs() throws IOException {
+ final int totalNumberOfDocs = 100;
+ final int batchSize = 10;
+ final int k = batchSize - 1;
+ final int docToDeleteInEachBatch = 5;
+ final Path indexPath = createTempDir();
+ final IndexWriterConfig iwc = newIndexWriterConfig();
+ // JVector codec requires compound files to be disabled at the moment
+ iwc.setUseCompoundFile(false);
+ iwc.setCodec(getCodec());
+ iwc.setMergePolicy(new ForceMergesOnlyMergePolicy(false));
+
+ try (FSDirectory dir = FSDirectory.open(indexPath);
+ IndexWriter writer = new IndexWriter(dir, iwc)) {
+
+ /*
+ * 1. Index 100 docs, in batches of 10. Delete the 5th doc in each batch. This will leave
+ * us with 10 segments, each with 9 live docs.
+ */
+ int batchNumber = 0;
+ for (int i = 1; i <= totalNumberOfDocs; i++) {
+ Document doc = new Document();
+ final float[] vector = {0.0f, 1.0f * (i + batchNumber)};
+ doc.add(new StringField("docId", Integer.toString(i + 1), Field.Store.YES));
+ doc.add(new KnnFloatVectorField("test_field", vector, VectorSimilarityFunction.EUCLIDEAN));
+ writer.addDocument(doc);
+ if (i % batchSize == 0) {
+ writer.flush();
+ writer.deleteDocuments(
+ new TermQuery(new Term("docId", Integer.toString(i - docToDeleteInEachBatch))));
+ batchNumber++;
+ }
+ }
+ writer.commit();
+
+ /* ----------------------------------------
+ * 2. Merge all segments into one
+ * ---------------------------------------- */
+ writer.forceMerge(1);
+
+ /* ----------------------------------------
+ * 3. Search – the deleted doc must be gone
+ * ---------------------------------------- */
+ try (IndexReader reader = DirectoryReader.open(writer)) {
+ assertEquals(
+ "All documents except the deleted ones should be live",
+ totalNumberOfDocs - (totalNumberOfDocs / batchSize),
+ reader.numDocs());
+ // For each batch we will verify that the deleted document doesn't come up in search and
+ // only its neighbours are returned
+
+ for (int i = 0; i < totalNumberOfDocs; i += batchSize) {
+ final float[] target = {0.0f, 1.0f * (i + docToDeleteInEachBatch)};
+ final IndexSearcher searcher = newSearcher(reader);
+ final KnnFloatVectorQuery knnFloatVectorQuery =
+ new KnnFloatVectorQuery("test_field", target, k, new MatchAllDocsQuery());
+ TopDocs topDocs = searcher.search(knnFloatVectorQuery, k);
+ assertEquals(k, topDocs.totalHits.value());
+ for (int j = 0; j < k; j++) {
+ Document doc = reader.storedFields().document(topDocs.scoreDocs[j].doc);
+ int docId = Integer.parseInt(doc.get("docId"));
+ assertNotEquals(
+ "Deleted doc should not be returned in search results",
+ i + docToDeleteInEachBatch,
+ docId);
+ }
+ }
+ }
+ }
+ }
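+
+ /**
+  * Hedged sketch, for orientation only: the ForceMergesOnlyMergePolicy used throughout these
+  * tests is a test utility whose source is not shown in this patch. Assuming it does what its
+  * name suggests, a minimal equivalent would suppress background merges and honor only explicit
+  * forceMerge() calls, roughly as below (imports for MergeTrigger, SegmentInfos and
+  * SegmentCommitInfo elided; this sketch is not used by the tests):
+  */
+ static class ForceMergesOnlyMergePolicySketch extends FilterMergePolicy {
+ private final boolean useCompoundFile;
+
+ ForceMergesOnlyMergePolicySketch(boolean useCompoundFile) {
+ super(new TieredMergePolicy()); // delegate still decides how forced merges are shaped
+ this.useCompoundFile = useCompoundFile;
+ }
+
+ @Override
+ public MergeSpecification findMerges(
+ MergeTrigger mergeTrigger, SegmentInfos segmentInfos, MergeContext mergeContext) {
+ return null; // never merge in the background; only forced merges produce work
+ }
+
+ @Override
+ public boolean useCompoundFile(
+ SegmentInfos infos, SegmentCommitInfo mergedInfo, MergeContext mergeContext) {
+ return useCompoundFile;
+ }
+ }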
+
+ /**
+ * Test to verify that the Lucene codec is able to successfully search for the nearest neighbours
+ * in the index. A single field is used to store the vectors. Documents are stored in potentially
+ * multiple segments across multiple commits and multiple merges. Merging is enabled and the
+ * compound file format is enabled.
+ */
+ @Test
+ public void testLuceneKnnIndex_mergeEnabled_withCompoundFile() throws IOException {
+ int k = 3; // The number of nearest neighbors to gather
+ int totalNumberOfDocs = 10;
+ IndexWriterConfig indexWriterConfig = LuceneTestCase.newIndexWriterConfig();
+ indexWriterConfig.setUseCompoundFile(true);
+ indexWriterConfig.setCodec(getCodec());
+ indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy(true));
+ indexWriterConfig.setMergeScheduler(new SerialMergeScheduler());
+ final Path indexPath = createTempDir();
+ try (FSDirectory dir = FSDirectory.open(indexPath);
+ IndexWriter w = new IndexWriter(dir, indexWriterConfig)) {
+ final float[] target = new float[] {0.0f, 0.0f};
+ for (int i = 1; i < totalNumberOfDocs + 1; i++) {
+ final float[] source = new float[] {0.0f, 1.0f / i};
+ final Document doc = new Document();
+ doc.add(new KnnFloatVectorField("test_field", source, VectorSimilarityFunction.EUCLIDEAN));
+ w.addDocument(doc);
+ w.flush(); // this creates a new segment without triggering a merge
+ }
+
+ w.forceMerge(1); // this merges all segments into a single segment
+ try (IndexReader reader = DirectoryReader.open(w)) {
+ // We should now have 1 segment with 10 documents
+ Assert.assertEquals(1, reader.getContext().leaves().size());
+ Assert.assertEquals(totalNumberOfDocs, reader.numDocs());
+ final Query filterQuery = new MatchAllDocsQuery();
+ final IndexSearcher searcher = newSearcher(reader);
+ final KnnFloatVectorQuery knnFloatVectorQuery =
+ new KnnFloatVectorQuery("test_field", target, k, filterQuery);
+ TopDocs topDocs = searcher.search(knnFloatVectorQuery, k);
+ assertEquals(k, topDocs.totalHits.value());
+ assertEquals(9, topDocs.scoreDocs[0].doc);
+ Assert.assertEquals(
+ VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] {0.0f, 1.0f / 10.0f}),
+ topDocs.scoreDocs[0].score,
+ 0.01f);
+ assertEquals(8, topDocs.scoreDocs[1].doc);
+ Assert.assertEquals(
+ VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] {0.0f, 1.0f / 9.0f}),
+ topDocs.scoreDocs[1].score,
+ 0.01f);
+ assertEquals(7, topDocs.scoreDocs[2].doc);
+ Assert.assertEquals(
+ VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] {0.0f, 1.0f / 8.0f}),
+ topDocs.scoreDocs[2].score,
+ 0.01f);
+ }
+ }
+ }
+
+ /**
+ * Test to verify that the Lucene codec is able to successfully search for the nearest neighbours
+ * in the index. A single field is used to store the vectors. Documents are stored in potentially
+ * multiple segments across multiple commits and multiple merges. Merging is enabled, the
+ * compound file format is enabled, and cosine similarity is used.
+ */
+ @Test
+ public void testLuceneKnnIndex_mergeEnabled_withCompoundFile_cosine() throws IOException {
+ int k = 3; // The number of nearest neighbours to gather
+ int totalNumberOfDocs = 10;
+ IndexWriterConfig indexWriterConfig = LuceneTestCase.newIndexWriterConfig();
+ indexWriterConfig.setUseCompoundFile(true);
+ indexWriterConfig.setCodec(getCodec());
+ indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy(true));
+ indexWriterConfig.setMergeScheduler(new SerialMergeScheduler());
+ final Path indexPath = createTempDir();
+ try (FSDirectory dir = FSDirectory.open(indexPath);
+ IndexWriter w = new IndexWriter(dir, indexWriterConfig)) {
+ final float[] target = new float[] {1.0f, 1.0f};
+ for (int i = 1; i < totalNumberOfDocs + 1; i++) {
+ final float[] source = new float[] {1.0f + i, 2.0f * i};
+ final Document doc = new Document();
+ doc.add(new KnnFloatVectorField("test_field", source, VectorSimilarityFunction.COSINE));
+ w.addDocument(doc);
+ w.flush(); // this creates a new segment without triggering a merge
+ }
+
+ w.forceMerge(1); // this merges all segments into a single segment
+ try (IndexReader reader = DirectoryReader.open(w)) {
+ // We should now have 1 segment with 10 documents
+ Assert.assertEquals(1, reader.getContext().leaves().size());
+ Assert.assertEquals(totalNumberOfDocs, reader.numDocs());
+ final Query filterQuery = new MatchAllDocsQuery();
+ final IndexSearcher searcher = newSearcher(reader);
+ final KnnFloatVectorQuery knnFloatVectorQuery =
+ new KnnFloatVectorQuery("test_field", target, k, filterQuery);
+ TopDocs topDocs = searcher.search(knnFloatVectorQuery, k);
+ assertEquals(k, topDocs.totalHits.value());
+ assertEquals(0, topDocs.scoreDocs[0].doc);
+ Assert.assertEquals(
+ VectorSimilarityFunction.COSINE.compare(target, new float[] {2.0f, 2.0f}),
+ topDocs.scoreDocs[0].score,
+ 0.001f);
+ assertEquals(1, topDocs.scoreDocs[1].doc);
+ Assert.assertEquals(
+ VectorSimilarityFunction.COSINE.compare(target, new float[] {3.0f, 4.0f}),
+ topDocs.scoreDocs[1].score,
+ 0.001f);
+ assertEquals(2, topDocs.scoreDocs[2].doc);
+ Assert.assertEquals(
+ VectorSimilarityFunction.COSINE.compare(target, new float[] {4.0f, 6.0f}),
+ topDocs.scoreDocs[2].score,
+ 0.001f);
+ }
+ }
+ }
+
+ /**
+ * Test to verify that the JVector codec provides a proper error when used with byte vectors.
+ * TODO: Create Binary Quantization support for JVector codec
+ */
+ @Test
+ public void testJVectorKnnIndex_simpleCase_withBinaryVector() throws IOException {
+ IndexWriterConfig indexWriterConfig = LuceneTestCase.newIndexWriterConfig();
+ // TODO: re-enable this after fixing the compound file augmentation for JVector
+ indexWriterConfig.setUseCompoundFile(false);
+ indexWriterConfig.setCodec(getCodec());
+ indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy());
+ final Path indexPath = createTempDir();
+ try (Directory dir = newFSDirectory(indexPath);
+ RandomIndexWriter w = new RandomIndexWriter(random(), dir, indexWriterConfig)) {
+ final byte[] source = new byte[] {(byte) 0,
(byte) 0};
+ final Document doc = new Document();
+ doc.add(new KnnByteVectorField("test_field", source, VectorSimilarityFunction.EUCLIDEAN));
+ Assert.assertThrows(UnsupportedOperationException.class, () -> w.addDocument(doc));
+ }
+ }
+
+ /**
+ * Test to verify that the JVector codec is able to successfully search for the nearest neighbours
+ * in the index with a filter applied.
+ */
+ @Test
+ public void testJVectorKnnIndex_withFilter() throws IOException {
+ int k = 3; // The number of nearest neighbours to gather
+ int totalNumberOfDocs = 10;
+ IndexWriterConfig indexWriterConfig = LuceneTestCase.newIndexWriterConfig();
+ indexWriterConfig.setUseCompoundFile(false);
+ indexWriterConfig.setCodec(getCodec());
+ indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy());
+ final Path indexPath = createTempDir();
+ try (Directory dir = newFSDirectory(indexPath);
+ IndexWriter w = new IndexWriter(dir, indexWriterConfig)) {
+ final float[] target = new float[] {0.0f, 0.0f};
+ for (int i = 1; i < totalNumberOfDocs + 1; i++) {
+ final float[] source = new float[] {0.0f, 1.0f / i};
+ final Document doc = new Document();
+ doc.add(new KnnFloatVectorField("test_field", source, VectorSimilarityFunction.EUCLIDEAN));
+ doc.add(new StringField("filter_field", i % 2 == 0 ? "even" : "odd", Field.Store.YES));
+ w.addDocument(doc);
+ }
+ // Flushing docs to make them discoverable on the file system
+ w.commit();
+
+ try (IndexReader reader = DirectoryReader.open(w)) {
+ final Query filterQuery = new TermQuery(new Term("filter_field", "even"));
+ final IndexSearcher searcher = newSearcher(reader);
+ final KnnFloatVectorQuery knnFloatVectorQuery =
+ new KnnFloatVectorQuery("test_field", target, k, filterQuery);
+ TopDocs topDocs = searcher.search(knnFloatVectorQuery, k);
+
+ assertEquals(k, topDocs.totalHits.value());
+ assertEquals(9, topDocs.scoreDocs[0].doc);
+ Assert.assertEquals(
+ VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] {0.0f, 1.0f / 10.0f}),
+ topDocs.scoreDocs[0].score,
+ 0.001f);
+ assertEquals(7, topDocs.scoreDocs[1].doc);
+ Assert.assertEquals(
+ VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] {0.0f, 1.0f / 8.0f}),
+ topDocs.scoreDocs[1].score,
+ 0.001f);
+ assertEquals(5, topDocs.scoreDocs[2].doc);
+ Assert.assertEquals(
+ VectorSimilarityFunction.EUCLIDEAN.compare(target, new float[] {0.0f, 1.0f / 6.0f}),
+ topDocs.scoreDocs[2].score,
+ 0.001f);
+ }
+ }
+ }
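+
+ /**
+  * Hedged sketch for the quantization tests that follow: product quantization is only worth
+  * attempting once a segment holds enough vectors to support the clustering, which is what
+  * DEFAULT_MINIMUM_BATCH_SIZE_FOR_QUANTIZATION gates. The method below is illustrative only
+  * (the real decision lives inside the JVector codec) and is not used by any test:
+  */
+ private static boolean wouldQuantizeSketch(int vectorCountInSegment, int minimumBatchSize) {
+ // Fewer vectors than the minimum cannot support the number of PQ clusters, so the codec
+ // is assumed to keep full-precision vectors and skip quantization for that segment.
+ return vectorCountInSegment >= minimumBatchSize;
+ }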
+
+ /**
+ * Test the simple case of quantization where we have a single, perfectly sized batch: no merges
+ * and no batches that are too small.
+ */
+ @Test
+ public void testJVectorKnnIndex_simpleCase_withQuantization() throws IOException {
+ int k = 50; // The number of nearest neighbours to gather
+ int dimension = 16;
+ int totalNumberOfDocs = DEFAULT_MINIMUM_BATCH_SIZE_FOR_QUANTIZATION;
+ final VectorSimilarityFunction vectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN;
+
+ IndexWriterConfig indexWriterConfig = LuceneTestCase.newIndexWriterConfig();
+ indexWriterConfig.setUseCompoundFile(false);
+ indexWriterConfig.setCodec(getCodec(DEFAULT_MINIMUM_BATCH_SIZE_FOR_QUANTIZATION));
+ indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy());
+ // We set the parameters below to make sure no premature flush will occur; this way we can
+ // have a single segment, and we can properly test the quantization case
+ indexWriterConfig.setMaxBufferedDocs(
+ 10000); // flush only after 10000 buffered docs; this way we make sure that we only have a
+ // single segment for a totalNumberOfDocs < 1000
+ indexWriterConfig.setRAMPerThreadHardLimitMB(
+ 1000); // 1000MB per thread; this way we make sure that no premature flush will occur
+ final Path indexPath = createTempDir();
+ try (FSDirectory dir = FSDirectory.open(indexPath);
+ IndexWriter w = new IndexWriter(dir, indexWriterConfig)) {
+ final float[] target = generateZerosVectorWithLastValue(dimension, 0);
+ final float[][] vectors = generateRandomVectors(totalNumberOfDocs, dimension);
+ final Set<Integer> groundTruthVectorsIds =
+ calculateGroundTruthVectorsIds(target, vectors, k, vectorSimilarityFunction);
+ for (int i = 0; i < vectors.length; i++) {
+ final Document doc = new Document();
+ doc.add(new KnnFloatVectorField(TEST_FIELD, vectors[i], vectorSimilarityFunction));
+ doc.add(new IntField(TEST_ID_FIELD, i, Field.Store.YES));
+ w.addDocument(doc);
+ }
+ // Flushing docs to make them discoverable on the file system
+ w.commit();
+
+ try (IndexReader reader = DirectoryReader.open(w)) {
+ // We should now have a single segment with totalNumberOfDocs documents
+ Assert.assertEquals(1, reader.getContext().leaves().size());
+ Assert.assertEquals(totalNumberOfDocs, reader.numDocs());
+
+ final Query filterQuery = new MatchAllDocsQuery();
+ final IndexSearcher searcher = newSearcher(reader);
+ final KnnFloatVectorQuery knnFloatVectorQuery =
+ new KnnFloatVectorQuery(TEST_FIELD, target, k, filterQuery);
+ TopDocs topDocs = searcher.search(knnFloatVectorQuery, k);
+ assertEquals(k, topDocs.totalHits.value());
+ final float recall = calculateRecall(reader, groundTruthVectorsIds, topDocs, k);
+ Assert.assertEquals(1.0f, recall, 0.05f);
+ }
+ }
+ }
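+
+ /**
+  * Hedged sketch for the rerank test below: over-querying gathers k * overQueryFactor
+  * candidates with cheap (lossy) quantized similarities, then re-scores them against the
+  * full-precision vectors and keeps the best k. Illustrative only; the real reranking happens
+  * inside the JVector codec, and the names and shapes here are assumptions:
+  */
+ private static int[] rerankSketch(
+ int[] candidateIds,
+ float[][] fullPrecisionVectors,
+ float[] query,
+ int k,
+ VectorSimilarityFunction similarity) {
+ // Re-score every candidate exactly, keeping only the k best ids in a min-heap
+ final PriorityQueue<ScoreDoc> best =
+ new PriorityQueue<>(k, (a, b) -> Float.compare(a.score, b.score));
+ for (int id : candidateIds) {
+ best.add(new ScoreDoc(id, similarity.compare(query, fullPrecisionVectors[id])));
+ if (best.size() > k) {
+ best.poll(); // drop the current worst candidate
+ }
+ }
+ return best.stream().mapToInt(sd -> sd.doc).toArray();
+ }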
+
+ /** Test recall with different types of rerank parameters */
+ @Test
+ public void testJVectorKnnIndex_simpleCase_withQuantization_rerank() throws IOException {
+ int k = 1; // The number of nearest neighbours to gather
+ int dimension = 16;
+ int totalNumberOfDocs = DEFAULT_MINIMUM_BATCH_SIZE_FOR_QUANTIZATION;
+ IndexWriterConfig indexWriterConfig = LuceneTestCase.newIndexWriterConfig();
+ indexWriterConfig.setUseCompoundFile(false);
+ indexWriterConfig.setCodec(getCodec(DEFAULT_MINIMUM_BATCH_SIZE_FOR_QUANTIZATION));
+ indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy());
+ // We set the parameters below to make sure no premature flush will occur; this way we can
+ // have a single segment, and we can properly test the quantization case
+ indexWriterConfig.setMaxBufferedDocs(
+ 10000); // flush only after 10000 buffered docs; this way we make sure that we only have a
+ // single segment for a totalNumberOfDocs < 1000
+ indexWriterConfig.setRAMPerThreadHardLimitMB(
+ 1000); // 1000MB per thread; this way we make sure that no premature flush will occur
+ final Path indexPath = createTempDir();
+ try (FSDirectory dir = FSDirectory.open(indexPath);
+ IndexWriter w = new IndexWriter(dir, indexWriterConfig)) {
+ final float[] target = generateZerosVectorWithLastValue(dimension, 0);
+ for (int i = 1; i < totalNumberOfDocs + 1; i++) {
+ final float[] source = generateZerosVectorWithLastValue(dimension, i);
+ final Document doc = new Document();
+ doc.add(new KnnFloatVectorField("test_field", source, VectorSimilarityFunction.EUCLIDEAN));
+ w.addDocument(doc);
+ }
+ // Flushing docs to make them discoverable on the file system
+ w.commit();
+
+ try (IndexReader reader = DirectoryReader.open(w)) {
+ // We should now have a single segment with totalNumberOfDocs documents
+ Assert.assertEquals(1, reader.getContext().leaves().size());
+ Assert.assertEquals(totalNumberOfDocs, reader.numDocs());
+
+ final Query filterQuery = new MatchAllDocsQuery();
+ final IndexSearcher searcher = newSearcher(reader);
+ float expectedMinScoreInTopK =
+ VectorSimilarityFunction.EUCLIDEAN.compare(
+ target,
+ new float[] {
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, k
+ });
+
+ // Query with essentially no reranking and expect recall to be very low
+ JVectorSearchStrategy searchStrategy =
+ JVectorSearchStrategy.builder().withOverQueryFactor(1).build();
+ KnnFloatVectorQuery knnFloatVectorQuery =
+ new KnnFloatVectorQuery("test_field", target, k, filterQuery, searchStrategy);
+ TopDocs topDocs = searcher.search(knnFloatVectorQuery, k);
+ assertEquals(k, topDocs.totalHits.value());
+
+ final float recallWithLowOverqueryFactor = calculateRecall(topDocs, expectedMinScoreInTopK);
+
+ // Query with reranking and expect recall to be high
+ searchStrategy = JVectorSearchStrategy.builder().withOverQueryFactor(5).build();
+ knnFloatVectorQuery =
+ new KnnFloatVectorQuery("test_field", target, k, filterQuery, searchStrategy);
+ topDocs = searcher.search(knnFloatVectorQuery, k);
+ assertEquals(k, topDocs.totalHits.value());
+ float recallWithHighOverqueryFactor = calculateRecall(topDocs, expectedMinScoreInTopK);
+ Assert.assertTrue(recallWithLowOverqueryFactor <= recallWithHighOverqueryFactor);
+ }
+ }
+ }
+
+ /**
+ * Test the simple case of quantization where every batch has the perfect size, followed by a
+ * merge of multiple segments.
+ */
+ @Test
+ public void testJVectorKnnIndex_happyCase_withQuantization_multipleSegments() throws IOException {
+ final int dimension = 16;
+ final VectorSimilarityFunction vectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN;
+ final int k =
+ 50; // The number of nearest neighbours to gather; we set a high number here to avoid
+ // inaccurate results and jittery tests
+ final int perfectBatchSize =
+ DEFAULT_MINIMUM_BATCH_SIZE_FOR_QUANTIZATION; // MINIMUM_BATCH_SIZE_FOR_QUANTIZATION is the
+ // minimal batch size that will trigger a quantization without breaking it; generally
+ // speaking, the batch size can't be lower than the number of clusters
+ final int totalNumberOfDocs = perfectBatchSize * 2;
+
+ IndexWriterConfig indexWriterConfig = LuceneTestCase.newIndexWriterConfig();
+ indexWriterConfig.setUseCompoundFile(false);
+ indexWriterConfig.setCodec(getCodec(DEFAULT_MINIMUM_BATCH_SIZE_FOR_QUANTIZATION));
+ indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy());
+ // We set the parameters below to make sure no premature flush will occur; this way we can
+ // have a single segment, and we can properly test the quantization case
+ indexWriterConfig.setMaxBufferedDocs(
+ 10000); // flush only after 10000 buffered docs; this way we make sure that we only have a
+ // single segment for a totalNumberOfDocs < 1000
+ indexWriterConfig.setRAMPerThreadHardLimitMB(
+ 1000); // 1000MB per thread; this way we make sure that no premature flush will occur
+ final Path indexPath = createTempDir();
+ try (FSDirectory dir = FSDirectory.open(indexPath);
+ IndexWriter w = new IndexWriter(dir, indexWriterConfig)) {
+ final float[] target = generateZerosVectorWithLastValue(dimension, 0);
+ final float[][] vectors = generateRandomVectors(totalNumberOfDocs, dimension);
+ final Set<Integer> groundTruthVectorsIds =
+ calculateGroundTruthVectorsIds(target, vectors, k, vectorSimilarityFunction);
+
+ for (int i = 0; i < vectors.length; i++) {
+ final Document doc = new Document();
+ doc.add(new
KnnFloatVectorField(TEST_FIELD, vectors[i], vectorSimilarityFunction));
+ doc.add(new IntField(TEST_ID_FIELD, i, Field.Store.YES));
+ w.addDocument(doc);
+ if (i % perfectBatchSize == 0) {
+ w.commit();
+ }
+ }
+ // Merge all segments into a single segment
+ w.forceMerge(1);
+
+ try (IndexReader reader = DirectoryReader.open(w)) {
+ // We should now have a single segment with totalNumberOfDocs documents
+ Assert.assertEquals(1, reader.getContext().leaves().size());
+ Assert.assertEquals(totalNumberOfDocs, reader.numDocs());
+
+ final Query filterQuery = new MatchAllDocsQuery();
+ final IndexSearcher searcher = newSearcher(reader);
+ final KnnFloatVectorQuery knnFloatVectorQuery =
+ new KnnFloatVectorQuery("test_field", target, k, filterQuery);
+ TopDocs topDocs = searcher.search(knnFloatVectorQuery, k);
+ assertEquals(k, topDocs.totalHits.value());
+ final float recall = calculateRecall(reader, groundTruthVectorsIds, topDocs, k);
+ Assert.assertEquals(1.0f, recall, 0.05f);
+ }
+ }
+ }
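+
+ /**
+  * Hedged sketch for the mixed-batch tests below: individually undersized batches are assumed
+  * to stay unquantized, and it is the vector count of the merged segment that is measured
+  * against the quantization gate. Illustrative only; not used by the tests:
+  */
+ private static boolean mergeWouldTriggerQuantizationSketch(
+ int[] segmentSizes, int minimumBatchSize) {
+ int mergedSize = 0;
+ for (int size : segmentSizes) {
+ mergedSize += size; // the merged segment holds the union of all input vectors
+ }
+ return mergedSize >= minimumBatchSize;
+ }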
+
+ /**
+ * Test the non-ideal case where batch sizes are not perfect and are lower than the recommended
+ * number of clusters in the index. The expected behavior is for the quantization to only kick in
+ * when we have a merge or a batch size that is bigger than the minimal required batch size.
+ */
+ @Test
+ public void testJVectorKnnIndex_mixedBatchSizes_withQuantization_multipleMerges()
+ throws IOException {
+ final int k =
+ 50; // The number of nearest neighbours to gather; we set a high number here to avoid
+ // inaccurate results and jittery tests
+ final int dimension = 16;
+ final int notIdealBatchSize =
+ DEFAULT_MINIMUM_BATCH_SIZE_FOR_QUANTIZATION
+ / 3; // Batch size that is not ideal for quantization and shouldn't trigger it
+ final int totalNumberOfDocs =
+ notIdealBatchSize
+ * 3; // 3 batches of documents, each of which will result in quantization only when the
+ // merge is triggered and we have a batch size of {@link
+ // MINIMUM_BATCH_SIZE_FOR_QUANTIZATION} as a result of merging all the smaller
+ // batches
+ final VectorSimilarityFunction vectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN;
+ IndexWriterConfig indexWriterConfig = LuceneTestCase.newIndexWriterConfig();
+ indexWriterConfig.setUseCompoundFile(false);
+ indexWriterConfig.setCodec(getCodec(DEFAULT_MINIMUM_BATCH_SIZE_FOR_QUANTIZATION));
+ indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy());
+ // We set the parameters below to make sure no premature flush will occur; this way we can
+ // have a single segment, and we can properly test the quantization case
+ indexWriterConfig.setMaxBufferedDocs(
+ 10000); // flush only after 10000 buffered docs; this way we make sure that we only have a
+ // single segment for a totalNumberOfDocs < 1000
+ indexWriterConfig.setRAMPerThreadHardLimitMB(
+ 1000); // 1000MB per thread; this way we make sure that no premature flush will occur
+ final Path indexPath = createTempDir();
+ try (FSDirectory dir = FSDirectory.open(indexPath);
+ IndexWriter w = new IndexWriter(dir, indexWriterConfig)) {
+ final float[] target = generateZerosVectorWithLastValue(dimension, 0);
+ final float[][] vectors = generateRandomVectors(totalNumberOfDocs, dimension);
+ final Set<Integer> groundTruthVectorsIds =
+ calculateGroundTruthVectorsIds(target, vectors, k, vectorSimilarityFunction);
+ for (int i = 0; i < totalNumberOfDocs; i++) {
+ final float[] source = vectors[i];
+ final Document doc = new Document();
+ doc.add(new KnnFloatVectorField("test_field", source, vectorSimilarityFunction));
+ doc.add(new IntField(TEST_ID_FIELD, i, Field.Store.YES));
+ w.addDocument(doc);
+ if (i % notIdealBatchSize == 0) {
+ w.commit();
+ }
+ }
+ // Merge all segments into a single segment
+ w.forceMerge(1);
+
+ try (IndexReader reader = DirectoryReader.open(w)) {
+ // We should now have a single segment with totalNumberOfDocs documents
+ Assert.assertEquals(1, reader.getContext().leaves().size());
+ Assert.assertEquals(totalNumberOfDocs, reader.numDocs());
+
+ final Query filterQuery = new MatchAllDocsQuery();
+ final IndexSearcher searcher = newSearcher(reader);
+ final KnnFloatVectorQuery knnFloatVectorQuery =
+ new KnnFloatVectorQuery("test_field", target, k, filterQuery);
+ TopDocs topDocs = searcher.search(knnFloatVectorQuery, k);
+ assertEquals(k, topDocs.totalHits.value());
+ final float recall = calculateRecall(reader, groundTruthVectorsIds, topDocs, k);
+ Assert.assertEquals(1.0f, recall, 0.05f);
+ }
+ }
+ }
+
+ /**
+ * Test the non-ideal case where batch sizes are not perfect and are lower than the recommended
+ * number of clusters in the index. The expected behavior is for the quantization to only kick in
+ * when we have a merge or a batch size that is bigger than the minimal required batch size. This
+ * variant also adds the compound file to the mix.
+ */
+ @Test
+ public void testJVectorKnnIndex_mixedBatchSizes_withQuantization_multipleMerges_withCompoundFile()
+ throws IOException {
+ final int k =
+ 50; // The number of nearest neighbours to gather; we set a high number here to avoid
+ // inaccurate results and jittery tests
+ final int dimension = 16;
+ final int notIdealBatchSize =
+ DEFAULT_MINIMUM_BATCH_SIZE_FOR_QUANTIZATION
+ / 3; // Batch size that is not ideal for quantization and shouldn't trigger it
+ final int totalNumberOfDocs =
+ notIdealBatchSize
+ * 10; // 10 batches of documents, each of which will result in quantization only when
+ // the merge is triggered and we have a batch size of {@link
+ // MINIMUM_BATCH_SIZE_FOR_QUANTIZATION} as a result of merging all the smaller batches
+ final VectorSimilarityFunction vectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN;
+
+ boolean useCompoundFile = true;
+ IndexWriterConfig indexWriterConfig = LuceneTestCase.newIndexWriterConfig();
+ indexWriterConfig.setUseCompoundFile(useCompoundFile);
+ indexWriterConfig.setCodec(getCodec(DEFAULT_MINIMUM_BATCH_SIZE_FOR_QUANTIZATION));
+ indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy(useCompoundFile));
+ // We set the parameters below to make sure no premature flush will occur; this way we can
+ // have a single segment, and we can properly test the quantization case
+ indexWriterConfig.setMaxBufferedDocs(
+ 10000); // flush only after 10000 buffered docs; this way we make sure that we only have a
+ // single segment for a totalNumberOfDocs < 1000
+ indexWriterConfig.setRAMPerThreadHardLimitMB(
+ 1000); // 1000MB per thread; this way we make sure that no premature flush will occur
+ final Path indexPath = createTempDir();
+ try (FSDirectory dir = FSDirectory.open(indexPath);
+ IndexWriter w = new IndexWriter(dir, indexWriterConfig)) {
+ final float[] target = generateZerosVectorWithLastValue(dimension, 0);
+ // We will use random vectors because otherwise PQ will have correlated subspaces, which
+ // will result in a broken linear graph
+ final float[][] vectors = generateRandomVectors(totalNumberOfDocs, dimension);
+ final Set<Integer> groundTruthVectorsIds =
+ calculateGroundTruthVectorsIds(target, vectors, k,
vectorSimilarityFunction);
+ for (int i = 0; i < totalNumberOfDocs; i++) {
+ final float[] source = vectors[i];
+ final Document doc = new Document();
+ doc.add(new KnnFloatVectorField(TEST_FIELD, source, vectorSimilarityFunction));
+ doc.add(new IntField(TEST_ID_FIELD, i, Field.Store.YES));
+ w.addDocument(doc);
+ if (i % notIdealBatchSize == 0) {
+ w.commit();
+ }
+ }
+ w.commit();
+ // Merge all segments into a single segment
+ w.forceMerge(1);
+
+ try (IndexReader reader = DirectoryReader.open(w)) {
+ // We should now have a single segment with totalNumberOfDocs documents
+ Assert.assertEquals(1, reader.getContext().leaves().size());
+ Assert.assertEquals(totalNumberOfDocs, reader.numDocs());
+
+ final Query filterQuery = new MatchAllDocsQuery();
+ final IndexSearcher searcher = newSearcher(reader);
+ final JVectorSearchStrategy searchStrategy =
+ JVectorSearchStrategy.builder().withOverQueryFactor(1000).build();
+ final KnnFloatVectorQuery knnFloatVectorQuery =
+ new KnnFloatVectorQuery("test_field", target, k, filterQuery, searchStrategy);
+ TopDocs topDocs = searcher.search(knnFloatVectorQuery, k);
+ assertEquals(k, topDocs.totalHits.value());
+ final float recall = calculateRecall(reader, groundTruthVectorsIds, topDocs, k);
+ Assert.assertEquals(
+ "Expected to have recall of 1.0+/-0.05 but got " + recall, 1.0f, recall, 0.05f);
+ }
+ }
+ // TODO: assert no quantization
+ // TODO: assert no graph merge
+ }
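+
+ /**
+  * Hedged sketch of the merge cadence the refinement test below drives: commit a batch, then
+  * immediately force-merge, so that (assuming the codec refines rather than retrains) the PQ
+  * codebook sees one new batch at a time. Illustrative only; not used by the tests.
+  */
+ private static void indexBatchThenMergeSketch(IndexWriter writer, Document[] batch)
+ throws IOException {
+ for (Document doc : batch) {
+ writer.addDocument(doc);
+ }
+ writer.commit(); // seal the batch into its own segment
+ writer.forceMerge(1); // merging with prior segments can refine the existing codebook
+ }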
+
+ /**
+ * We will use multiple batches, each of which can trigger a quantization, and later merge them
+ * in an appending order to keep track of refinement
+ */
+ @Test
+ public void testJVectorKnnIndex_withQuantization_withCompoundFile_with_refinement()
+ throws IOException {
+ final int k =
+ 50; // The number of nearest neighbours to gather; we set a high number here to avoid
+ // inaccurate results and jittery tests
+ final int dimension = 16;
+ final int idealBatchSize =
+ DEFAULT_MINIMUM_BATCH_SIZE_FOR_QUANTIZATION; // Ideal batch size: each batch is big enough
+ // to trigger quantization on its own
+ final int totalNumberOfDocs =
+ idealBatchSize * 10; // 10 batches, each batch on its own will trigger quantization
+ final VectorSimilarityFunction vectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN;
+
+ boolean useCompoundFile = true;
+ IndexWriterConfig indexWriterConfig = LuceneTestCase.newIndexWriterConfig();
+ indexWriterConfig.setUseCompoundFile(useCompoundFile);
+ indexWriterConfig.setCodec(getCodec(DEFAULT_MINIMUM_BATCH_SIZE_FOR_QUANTIZATION));
+ indexWriterConfig.setMergePolicy(new ForceMergesOnlyMergePolicy(useCompoundFile));
+ // We set the parameters below to make sure no premature flush will occur; this way we can
+ // have a single segment, and we can properly test the quantization case
+ indexWriterConfig.setMaxBufferedDocs(
+ 10000); // flush only after 10000 buffered docs; this way we make sure that we only have a
+ // single segment for a totalNumberOfDocs < 1000
+ indexWriterConfig.setRAMPerThreadHardLimitMB(
+ 1000); // 1000MB per thread; this way we make sure that no premature flush will occur
+ final Path indexPath = createTempDir();
+ try (FSDirectory dir = FSDirectory.open(indexPath);
+ IndexWriter w = new IndexWriter(dir, indexWriterConfig)) {
+ final float[] target = generateZerosVectorWithLastValue(dimension, 0);
+ // We will use random vectors because otherwise PQ will have correlated subspaces, which
+ // will result in a broken linear graph
+ final float[][] vectors = generateRandomVectors(totalNumberOfDocs, dimension);
+ final Set<Integer> groundTruthVectorsIds =
+ calculateGroundTruthVectorsIds(target, vectors, k, vectorSimilarityFunction);
+ for (int i = 0; i < totalNumberOfDocs; i++) {
+ final float[] source = vectors[i];
+ final Document doc = new Document();
+ doc.add(new IntField(TEST_ID_FIELD, i, Field.Store.YES));
+ doc.add(new KnnFloatVectorField(TEST_FIELD, source, vectorSimilarityFunction));
+ w.addDocument(doc);
+ if (i % idealBatchSize == 0) {
+ w.commit();
+ w.forceMerge(1); // force merge will trigger PQ refinement if other segments are present
+ }
+ }
+ w.commit();
+ // Merge all segments into a single segment
+ w.forceMerge(1);
+
+ try (IndexReader reader = DirectoryReader.open(w)) {
+ // We should now have a single segment with totalNumberOfDocs documents
+ Assert.assertEquals(1, reader.getContext().leaves().size());
+ Assert.assertEquals(totalNumberOfDocs, reader.numDocs());
+
+ final Query filterQuery = new MatchAllDocsQuery();
+ final IndexSearcher searcher = newSearcher(reader);
+ final JVectorSearchStrategy searchStrategy =
+ JVectorSearchStrategy.builder().withOverQueryFactor(1000).build();
+ final KnnFloatVectorQuery knnFloatVectorQuery =
+ new KnnFloatVectorQuery("test_field", target, k, filterQuery, searchStrategy);
+ TopDocs topDocs = searcher.search(knnFloatVectorQuery, k);
+ assertEquals(k, topDocs.totalHits.value());
+ final float recall = calculateRecall(reader, groundTruthVectorsIds, topDocs, k);
+ Assert.assertEquals(
+ "Expected to have recall of 1.0+/-0.05 but got " + recall, 1.0f, recall, 0.05f);
+ }
+ }
+ // TODO: Assert no graph merge
+ }
+
+ /**
+ * Calculate the recall for the top k documents. For simplicity we assume that all documents
+ * have unique scores, and therefore the minimum score in the top k documents is that of the kth
+ * document.
+ *
+ * @param topDocs the top documents returned by the search
+ * @param minScoreInTopK the minimum score in the top k documents
+ * @return the recall of the top k documents
+ */
+ private float calculateRecall(TopDocs topDocs, float minScoreInTopK) {
+ int totalRelevantDocs = 0;
+ for (int i = 0; i < topDocs.scoreDocs.length; i++) {
+ if (topDocs.scoreDocs[i].score >= minScoreInTopK) {
+ totalRelevantDocs++;
+ }
+ }
+ float recall = ((float) totalRelevantDocs) / ((float) topDocs.scoreDocs.length);
+ return recall;
+ }
+
+ private static float[] generateZerosVectorWithLastValue(int vectorDimension, int lastValue) {
+ float[] vector = new float[vectorDimension];
+ for (int i = 0; i < vectorDimension - 1; i++) {
+ vector[i] = 0;
+ }
+ vector[vectorDimension - 1] = lastValue;
+ return vector;
+ }
+
+ private static float calculateRecall(
+ IndexReader reader, Set<Integer> groundTruthVectorsIds, TopDocs topDocs, int k)
+ throws IOException {
+ final ScoreDoc[] scoreDocs = topDocs.scoreDocs;
+ Assert.assertEquals(groundTruthVectorsIds.size(), scoreDocs.length);
+ int totalRelevantDocs = 0;
+ for (ScoreDoc scoreDoc : scoreDocs) {
+ final int id =
+ reader
+ .storedFields()
+ .document(scoreDoc.doc)
+ .getField(TEST_ID_FIELD)
+ .storedValue()
+ .getIntValue();
+ if (groundTruthVectorsIds.contains(id)) {
+ totalRelevantDocs++;
+ }
+ }
+ return ((float) totalRelevantDocs) / ((float) k);
+ }
+
+ /**
+ * Find the IDs of the ground truth vectors in the dataset
+ *
+ * @param query query vector
+ * @param dataset dataset of all the vectors, with their ordinal position in the array as their
+ *     ID
+ * @param k the number of expected results
+ * @return the IDs of the ground truth vectors in the dataset
+ */
+ private static Set<Integer>
calculateGroundTruthVectorsIds(
+ float[] query,
+ final float[][] dataset,
+ int k,
+ VectorSimilarityFunction vectorSimilarityFunction) {
+ final Set<Integer> groundTruthVectorsIds = new HashSet<>();
+ final PriorityQueue<ScoreDoc> priorityQueue =
+ new PriorityQueue<>(k, (o1, o2) -> Float.compare(o1.score, o2.score));
+ for (int i = 0; i < dataset.length; i++) {
+ ScoreDoc scoreDoc = new ScoreDoc(i, vectorSimilarityFunction.compare(query, dataset[i]));
+ if (priorityQueue.size() >= k) {
+ final ScoreDoc top = priorityQueue.poll();
+ if (top.score < scoreDoc.score) {
+ priorityQueue.add(scoreDoc);
+ } else {
+ priorityQueue.add(top);
+ }
+ } else {
+ priorityQueue.add(scoreDoc);
+ }
+ }
+ while (!priorityQueue.isEmpty()) {
+ groundTruthVectorsIds.add(priorityQueue.poll().doc);
+ }
+
+ return groundTruthVectorsIds;
+ }
+
+ static float[][] generateRandomVectors(int count, int dimension) {
+ final var rng = nonAssertingRandom(random());
+ final float[][] vectors = new float[count][dimension];
+ for (int i = 0; i < vectors.length; ++i) {
+ for (int j = 0; j < vectors[i].length; ++j) {
+ vectors[i][j] = rng.nextFloat();
+ }
+ }
+ return vectors;
+ }
+
+ @Override
+ protected VectorEncoding randomVectorEncoding() {
+ return SUPPORTED_ENCODINGS[random().nextInt(SUPPORTED_ENCODINGS.length)];
+ }
+
+ @Override
+ protected VectorSimilarityFunction randomSimilarity() {
+ return SUPPORTED_FUNCTIONS[random().nextInt(SUPPORTED_FUNCTIONS.length)];
+ }
+
+ @Override
+ protected void assertOffHeapByteSize(LeafReader r, String fieldName) throws IOException {}
+
+ @Override
+ protected Codec getCodec() {
+ return getCodec(JVectorFormat.DEFAULT_MINIMUM_BATCH_SIZE_FOR_QUANTIZATION);
+ }
+
+ private Codec getCodec(final int minimumBatchSizeForQuantization) {
+ return TestUtil.alwaysKnnVectorsFormat(new JVectorFormat(minimumBatchSizeForQuantization));
+ }
+
+ public static class ThreadLeakFilter implements ThreadFilter {
+ @Override
+ public boolean reject(Thread thread) {
+ return thread.getName().contains("ForkJoinPool");
+ }
+ }
+}
diff --git a/versions.lock b/versions.lock
index 1e934ceec3ce..b47e0f049ab1 100644
--- a/versions.lock
+++ b/versions.lock
@@ -6,14 +6,16 @@
 "com.ibm.icu:icu4j:78.1" : "47ea4550,refs=6",
 "commons-codec:commons-codec:1.20.0" : "e6288df0,refs=6",
 "commons-io:commons-io:2.20.0" : "5ce8cdc6,refs=2",
+ "io.github.jbellis:jvector:4.0.0-rc.5" : "9f877bb0,refs=7",
 "io.sgr:s2-geometry-library-java:1.0.0" : "cbc357ab,refs=4",
 "junit:junit:4.13.2" : "fa9ef26b,refs=4",
 "net.sf.jopt-simple:jopt-simple:5.0.4" : "85a1e4c6,refs=2",
 "net.sourceforge.nekohtml:nekohtml:1.9.22" : "5ce8cdc6,refs=2",
+ "org.agrona:agrona:1.20.0" : "9f877bb0,refs=7",
 "org.antlr:antlr4-runtime:4.13.2" : "d9953130,refs=4",
 "org.apache.commons:commons-compress:1.28.0" : "5ce8cdc6,refs=2",
 "org.apache.commons:commons-lang3:3.18.0" : "5ce8cdc6,refs=2",
- "org.apache.commons:commons-math3:3.6.1" : "85a1e4c6,refs=2",
+ "org.apache.commons:commons-math3:3.6.1" : "dd26014b,refs=8",
 "org.apache.opennlp:opennlp-tools:2.5.6.1" : "2f760bab,refs=4",
 "org.carrot2:morfologik-fsa:2.1.9" : "79af844b,refs=4",
 "org.carrot2:morfologik-polish:2.1.9" : "fe494320,refs=3",
@@ -21,7 +23,8 @@
 "org.hamcrest:hamcrest:3.0" : "fa9ef26b,refs=4",
 "org.locationtech.spatial4j:spatial4j:0.8" : "cbc357ab,refs=4",
 "org.openjdk.jmh:jmh-core:1.37" : "85a1e4c6,refs=2",
- "org.slf4j:slf4j-api:2.0.17" : "2f760bab,refs=4",
+ "org.slf4j:slf4j-api:2.0.17" : "07f0efc6,refs=10",
+ "org.yaml:snakeyaml:2.4" : "9f877bb0,refs=7",
 "ua.net.nlp:morfologik-ukrainian-search:4.9.1" :
"fe494320,refs=3", "xerces:xercesImpl:2.12.2" : "5ce8cdc6,refs=2" }, @@ -48,16 +51,18 @@ "commons-io:commons-io:2.20.0" : "6f16ff86,refs=2", "io.github.eisop:dataflow-errorprone:3.41.0-eisop1" : "90685606,refs=39", "io.github.java-diff-utils:java-diff-utils:4.12" : "90685606,refs=39", + "io.github.jbellis:jvector:4.0.0-rc.5" : "43dd284b,refs=10", "io.sgr:s2-geometry-library-java:1.0.0" : "1d5a4b2b,refs=4", "javax.inject:javax.inject:1" : "90685606,refs=39", "junit:junit:4.13.2" : "129da9bf,refs=76", "net.bytebuddy:byte-buddy:1.17.7" : "b7ba1646,refs=2", "net.sf.jopt-simple:jopt-simple:5.0.4" : "152d9f78,refs=3", "net.sourceforge.nekohtml:nekohtml:1.9.22" : "6f16ff86,refs=2", + "org.agrona:agrona:1.20.0" : "43dd284b,refs=10", "org.antlr:antlr4-runtime:4.13.2" : "6fbc4021,refs=5", "org.apache.commons:commons-compress:1.28.0" : "6f16ff86,refs=2", "org.apache.commons:commons-lang3:3.18.0" : "6f16ff86,refs=2", - "org.apache.commons:commons-math3:3.6.1" : "152d9f78,refs=3", + "org.apache.commons:commons-math3:3.6.1" : "f0656784,refs=12", "org.apache.opennlp:opennlp-tools:2.5.6.1" : "b91715f0,refs=6", "org.assertj:assertj-core:3.27.6" : "b7ba1646,refs=2", "org.carrot2:morfologik-fsa:2.1.9" : "e077a675,refs=8", @@ -71,12 +76,55 @@ "org.openjdk.jmh:jmh-core:1.37" : "152d9f78,refs=3", "org.openjdk.jmh:jmh-generator-annprocess:1.37" : "ecaf1d73,refs=1", "org.pcollections:pcollections:4.0.1" : "90685606,refs=39", - "org.slf4j:slf4j-api:2.0.17" : "b91715f0,refs=6", + "org.slf4j:slf4j-api:2.0.17" : "736bb8da,refs=15", + "org.yaml:snakeyaml:2.4" : "43dd284b,refs=10", "ua.net.nlp:morfologik-ukrainian-search:4.9.1" : "cb00cecf,refs=5", "xerces:xercesImpl:2.12.2" : "6f16ff86,refs=2" } }, "because" : { + "07f0efc6" : [ + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:benchmark" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:benchmark-jmh" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:demo" + }, + { + "configuration" : "compileClasspath", + "projectPath" : ":lucene:luke" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:luke" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:queryparser" + }, + { + "configuration" : "compileClasspath", + "projectPath" : ":lucene:sandbox" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:sandbox" + }, + { + "configuration" : "compileClasspath", + "projectPath" : ":lucene:analysis:opennlp" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:analysis:opennlp" + } + ], "129da9bf" : [ { "configuration" : "testCompileClasspath", @@ -443,6 +491,48 @@ "projectPath" : ":lucene:analysis:opennlp" } ], + "43dd284b" : [ + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:benchmark" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:benchmark-jmh" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:demo" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:highlighter" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:luke" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:memory" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:monitor" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:queryparser" + }, + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:sandbox" 
+ }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:sandbox" + } + ], "47ea4550" : [ { "configuration" : "compileClasspath", @@ -511,6 +601,68 @@ "projectPath" : ":lucene:queries" } ], + "736bb8da" : [ + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:analysis.tests" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:analysis.tests" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:benchmark" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:benchmark-jmh" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:demo" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:highlighter" + }, + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:luke" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:luke" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:memory" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:monitor" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:queryparser" + }, + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:sandbox" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:sandbox" + }, + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:analysis:opennlp" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:analysis:opennlp" + } + ], "79af844b" : [ { "configuration" : "compileClasspath", @@ -731,6 +883,36 @@ "projectPath" : ":lucene:analysis:phonetic" } ], + "9f877bb0" : [ + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:benchmark" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:benchmark-jmh" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:demo" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:luke" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:queryparser" + }, + { + "configuration" : "compileClasspath", + "projectPath" : ":lucene:sandbox" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:sandbox" + } + ], "b7ba1646" : [ { "configuration" : "testCompileClasspath", @@ -825,6 +1007,40 @@ "projectPath" : ":lucene:expressions" } ], + "dd26014b" : [ + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:benchmark" + }, + { + "configuration" : "compileClasspath", + "projectPath" : ":lucene:benchmark-jmh" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:benchmark-jmh" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:demo" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:luke" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:queryparser" + }, + { + "configuration" : "compileClasspath", + "projectPath" : ":lucene:sandbox" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:sandbox" + } + ], "e077a675" : [ { "configuration" : "testCompileClasspath", @@ -891,6 +1107,56 @@ "projectPath" : ":lucene:benchmark-jmh" } ], + "f0656784" : [ + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:benchmark" + }, + { + "configuration" : "annotationProcessor", + "projectPath" : ":lucene:benchmark-jmh" + }, + { + "configuration" : 
"testCompileClasspath", + "projectPath" : ":lucene:benchmark-jmh" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:benchmark-jmh" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:demo" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:highlighter" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:luke" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:memory" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:monitor" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:queryparser" + }, + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:sandbox" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:sandbox" + } + ], "fa9ef26b" : [ { "configuration" : "compileClasspath",