Skip to content

Commit

Permalink
test: Add sparse inverted index algo check tests (#39691)
Browse files Browse the repository at this point in the history
related issue: #39332

---------

Signed-off-by: yanliang567 <[email protected]>
  • Loading branch information
yanliang567 authored Feb 12, 2025
1 parent c12c4b4 commit 5fdc757
Show file tree
Hide file tree
Showing 5 changed files with 87 additions and 40 deletions.
4 changes: 3 additions & 1 deletion tests/python_client/common/common_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,10 +220,12 @@
"SPARSE_INVERTED_INDEX", "SPARSE_WAND",
"GPU_IVF_FLAT", "GPU_IVF_PQ"]

# Sparse inverted-index traversal algorithms accepted by the server
# (the invalid-algo test below expects exactly this supported set in the error message).
inverted_index_algo = ['TAAT_NAIVE', 'DAAT_WAND', 'DAAT_MAXSCORE']

# Default index-build params; presumably one entry per entry of all_index_types,
# matched by position — TODO confirm against all_index_types ordering.
default_all_indexes_params = [{}, {"nlist": 128}, {"nlist": 128}, {"nlist": 128, "m": 16, "nbits": 8},
                              {"M": 32, "efConstruction": 360}, {"nlist": 128}, {},
                              {}, {"nlist": 64},
                              {"drop_ratio_build": 0.2}, {"drop_ratio_build": 0.2},
                              {}, {"drop_ratio_build": 0.2},
                              {"nlist": 64}, {"nlist": 64, "m": 16, "nbits": 8}]

default_all_search_params_params = [{}, {"nprobe": 32}, {"nprobe": 32}, {"nprobe": 32},
Expand Down
10 changes: 7 additions & 3 deletions tests/python_client/testcases/test_full_text_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -2315,9 +2315,10 @@ def test_full_text_search_default(
@pytest.mark.parametrize("expr", ["text_match"])
@pytest.mark.parametrize("offset", [10])
@pytest.mark.parametrize("tokenizer", ["jieba"])
@pytest.mark.parametrize("inverted_index_algo", ct.inverted_index_algo)
def test_full_text_search_with_jieba_tokenizer(
self, offset, tokenizer, expr, enable_inverted_index, enable_partition_key, empty_percent, index_type, nq
):
self, offset, tokenizer, expr, enable_inverted_index, enable_partition_key,
empty_percent, index_type, nq, inverted_index_algo):
"""
target: test full text search
method: 1. enable full text search with jieba tokenizer and insert data with varchar
Expand Down Expand Up @@ -2430,6 +2431,7 @@ def test_full_text_search_with_jieba_tokenizer(
"params": {
"bm25_k1": 1.5,
"bm25_b": 0.75,
"inverted_index_algo": inverted_index_algo
}
}
)
Expand Down Expand Up @@ -3302,8 +3304,9 @@ class TestHybridSearchWithFullTextSearch(TestcaseBase):
@pytest.mark.parametrize("enable_inverted_index", [True])
@pytest.mark.parametrize("index_type", ["SPARSE_INVERTED_INDEX"])
@pytest.mark.parametrize("tokenizer", ["standard"])
@pytest.mark.parametrize("inverted_index_algo", ct.inverted_index_algo)
def test_hybrid_search_with_full_text_search(
self, tokenizer, enable_inverted_index, enable_partition_key, empty_percent, index_type
self, tokenizer, enable_inverted_index, enable_partition_key, empty_percent, index_type, inverted_index_algo
):
"""
target: test full text search
Expand Down Expand Up @@ -3403,6 +3406,7 @@ def test_hybrid_search_with_full_text_search(
"params": {
"bm25_k1": 1.5,
"bm25_b": 0.75,
"inverted_index_algo": inverted_index_algo
}
}
)
Expand Down
22 changes: 22 additions & 0 deletions tests/python_client/testcases/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -1485,6 +1485,28 @@ def test_invalid_sparse_ratio(self, ratio, index):
check_task=CheckTasks.err_res,
check_items=error)

@pytest.mark.tags(CaseLabel.L2)
@pytest.mark.parametrize("inverted_index_algo", ["INVALID_ALGO"])
@pytest.mark.parametrize("index", ct.all_index_types[9:11])
def test_invalid_sparse_inverted_index_algo(self, inverted_index_algo, index):
    """
    target: index creation with an unsupported inverted_index_algo value
    method: build a sparse index whose params carry an invalid inverted_index_algo
    expected: raise an exception listing the supported algorithms
    """
    c_name = cf.gen_unique_str(prefix)
    schema = cf.gen_default_sparse_schema()
    collection_w = self.init_collection_wrap(name=c_name, schema=schema)
    data = cf.gen_default_list_sparse_data()
    collection_w.insert(data=data)
    params = {"index_type": index, "metric_type": "IP", "params": {"inverted_index_algo": inverted_index_algo}}
    error = {ct.err_code: 999,
             ct.err_msg: f"sparse inverted index algo {inverted_index_algo} not found or not supported, "
                         f"supported: [TAAT_NAIVE DAAT_WAND DAAT_MAXSCORE]"}
    # Result is unused; do not rebind the `index` parameter as the original did.
    self.index_wrap.init_index(collection_w.collection, ct.default_sparse_vec_field_name, params,
                               check_task=CheckTasks.err_res,
                               check_items=error)


@pytest.mark.tags(CaseLabel.GPU)
class TestNewIndexAsync(TestcaseBase):
Expand Down
85 changes: 52 additions & 33 deletions tests/python_client/testcases/test_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -3104,24 +3104,23 @@ def test_search_with_expression(self, null_data_percent):
assert set(ids).issubset(filter_ids_set)

# 5. search again with expression template and search hints
if expr != "": # TODO: remove this when issue #39013 is fixed
search_param = default_search_params.copy()
search_param.update({"hints": "iterative_filter"})
search_res, _ = collection_w.search(vectors[:default_nq], default_search_field,
search_param, nb,
expr=expr, expr_params=expr_params, _async=_async,
check_task=CheckTasks.check_search_results,
check_items={"nq": default_nq,
"ids": insert_ids,
"limit": min(nb, len(filter_ids)),
"_async": _async})
if _async:
search_res.done()
search_res = search_res.result()
filter_ids_set = set(filter_ids)
for hits in search_res:
ids = hits.ids
assert set(ids).issubset(filter_ids_set)
search_param = default_search_params.copy()
search_param.update({"hints": "iterative_filter"})
search_res, _ = collection_w.search(vectors[:default_nq], default_search_field,
search_param, nb,
expr=expr, expr_params=expr_params, _async=_async,
check_task=CheckTasks.check_search_results,
check_items={"nq": default_nq,
"ids": insert_ids,
"limit": min(nb, len(filter_ids)),
"_async": _async})
if _async:
search_res.done()
search_res = search_res.result()
filter_ids_set = set(filter_ids)
for hits in search_res:
ids = hits.ids
assert set(ids).issubset(filter_ids_set)

@pytest.mark.tags(CaseLabel.L2)
@pytest.mark.parametrize("bool_type", [True, False, "true", "false"])
Expand Down Expand Up @@ -12860,7 +12859,8 @@ class TestSparseSearch(TestcaseBase):

@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("index", ct.all_index_types[9:11])
def test_sparse_index_search(self, index):
@pytest.mark.parametrize("inverted_index_algo", ct.inverted_index_algo)
def test_sparse_index_search(self, index, inverted_index_algo):
"""
target: verify that sparse index for sparse vectors can be searched properly
method: create connection, collection, insert and search
Expand All @@ -12873,12 +12873,16 @@ def test_sparse_index_search(self, index):
data = cf.gen_default_list_sparse_data(nb=3000)
collection_w.insert(data)
params = cf.get_index_params_params(index)
params.update({"inverted_index_algo": inverted_index_algo})
index_params = {"index_type": index, "metric_type": "IP", "params": params}
collection_w.create_index(ct.default_sparse_vec_field_name, index_params, index_name=index)
collection_w.load()

_params = cf.get_search_params_params(index)
_params.update({"dim_max_score_ratio": 1.05})
search_params = {"params": _params}
collection_w.search(data[-1][0:default_nq], ct.default_sparse_vec_field_name,
ct.default_sparse_search_params, default_limit,
search_params, default_limit,
output_fields=[ct.default_sparse_vec_field_name],
check_task=CheckTasks.check_search_results,
check_items={"nq": default_nq,
Expand All @@ -12887,7 +12891,7 @@ def test_sparse_index_search(self, index):
"output_fields": [ct.default_sparse_vec_field_name]})
expr = "int64 < 100 "
collection_w.search(data[-1][0:default_nq], ct.default_sparse_vec_field_name,
ct.default_sparse_search_params, default_limit,
search_params, default_limit,
expr=expr, output_fields=[ct.default_sparse_vec_field_name],
check_task=CheckTasks.check_search_results,
check_items={"nq": default_nq,
Expand Down Expand Up @@ -12923,7 +12927,8 @@ def test_sparse_index_dim(self, index, dim):

@pytest.mark.tags(CaseLabel.L2)
@pytest.mark.parametrize("index", ct.all_index_types[9:11])
def test_sparse_index_enable_mmap_search(self, index):
@pytest.mark.parametrize("inverted_index_algo", ct.inverted_index_algo)
def test_sparse_index_enable_mmap_search(self, index, inverted_index_algo):
"""
target: verify that the sparse indexes of sparse vectors can be searched properly after turning on mmap
method: create connection, collection, enable mmap, insert and search
Expand All @@ -12939,6 +12944,7 @@ def test_sparse_index_enable_mmap_search(self, index):
collection_w.insert(data)

params = cf.get_index_params_params(index)
params.update({"inverted_index_algo": inverted_index_algo})
index_params = {"index_type": index, "metric_type": "IP", "params": params}
collection_w.create_index(ct.default_sparse_vec_field_name, index_params, index_name=index)

Expand Down Expand Up @@ -12968,9 +12974,9 @@ def test_sparse_index_enable_mmap_search(self, index):
assert len(res) == 4

@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("ratio", [0.01, 0.1, 0.5, 0.9])
@pytest.mark.parametrize("drop_ratio_build", [0.01])
@pytest.mark.parametrize("index", ct.all_index_types[9:11])
def test_search_sparse_ratio(self, ratio, index):
def test_search_sparse_ratio(self, drop_ratio_build, index):
"""
target: create a sparse index by adjusting the ratio parameter.
method: create a sparse index by adjusting the ratio parameter.
Expand All @@ -12982,16 +12988,28 @@ def test_search_sparse_ratio(self, ratio, index):
collection_w = self.init_collection_wrap(c_name, schema=schema)
data = cf.gen_default_list_sparse_data(nb=4000)
collection_w.insert(data)
params = {"index_type": index, "metric_type": "IP", "params": {"drop_ratio_build": ratio}}
params = {"index_type": index, "metric_type": "IP", "params": {"drop_ratio_build": drop_ratio_build}}
collection_w.create_index(ct.default_sparse_vec_field_name, params, index_name=index)
collection_w.load()
assert collection_w.has_index(index_name=index)[0] is True
search_params = {"metric_type": "IP", "params": {"drop_ratio_search": ratio}}
collection_w.search(data[-1][0:default_nq], ct.default_sparse_vec_field_name,
search_params, default_limit,
check_task=CheckTasks.check_search_results,
check_items={"nq": default_nq,
"limit": default_limit})
_params = {"drop_ratio_search": 0.2}
for dim_max_score_ratio in [0.5, 0.99, 1, 1.3]:
_params.update({"dim_max_score_ratio": dim_max_score_ratio})
search_params = {"metric_type": "IP", "params": _params}
collection_w.search(data[-1][0:default_nq], ct.default_sparse_vec_field_name,
search_params, default_limit,
check_task=CheckTasks.check_search_results,
check_items={"nq": default_nq,
"limit": default_limit})
error = {ct.err_code: 999,
ct.err_msg: "should be in range [0.500000, 1.300000]"}
for invalid_ratio in [0.49, 1.4]:
_params.update({"dim_max_score_ratio": invalid_ratio})
search_params = {"metric_type": "IP", "params": _params}
collection_w.search(data[-1][0:default_nq], ct.default_sparse_vec_field_name,
search_params, default_limit,
check_task=CheckTasks.err_res,
check_items=error)

@pytest.mark.tags(CaseLabel.L2)
@pytest.mark.parametrize("index", ct.all_index_types[9:11])
Expand Down Expand Up @@ -13024,8 +13042,8 @@ def test_sparse_vector_search_output_field(self, index):

@pytest.mark.tags(CaseLabel.L2)
@pytest.mark.parametrize("index", ct.all_index_types[9:11])
@pytest.mark.xfail(reason="issue #36174")
def test_sparse_vector_search_iterator(self, index):
@pytest.mark.parametrize("inverted_index_algo", ct.inverted_index_algo)
def test_sparse_vector_search_iterator(self, index, inverted_index_algo):
"""
target: create sparse vectors and search iterator
method: create sparse vectors and search iterator
Expand All @@ -13038,6 +13056,7 @@ def test_sparse_vector_search_iterator(self, index):
data = cf.gen_default_list_sparse_data(nb=4000)
collection_w.insert(data)
params = cf.get_index_params_params(index)
params.update({"inverted_index_algo": inverted_index_algo})
index_params = {"index_type": index, "metric_type": "IP", "params": params}
collection_w.create_index(ct.default_sparse_vec_field_name, index_params, index_name=index)

Expand Down
6 changes: 3 additions & 3 deletions tests/python_client/utils/util_pymilvus.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import numpy as np
import requests
from sklearn import preprocessing
from pymilvus import DataType
from pymilvus import MilvusClient, DataType
from utils.util_log import test_log as log
from utils.util_k8s import init_k8s_client_config

Expand Down Expand Up @@ -115,9 +115,9 @@ def get_milvus(host, port, uri=None, handler=None, **kwargs):
handler = "GRPC"
try_connect = kwargs.get("try_connect", True)
if uri is not None:
milvus = Milvus(uri=uri, handler=handler, try_connect=try_connect)
milvus = MilvusClient(uri=uri, handler=handler, try_connect=try_connect)
else:
milvus = Milvus(host=host, port=port, handler=handler, try_connect=try_connect)
milvus = MilvusClient(uri=f"http://{host}:{port}", handler=handler, try_connect=try_connect)
return milvus


Expand Down

0 comments on commit 5fdc757

Please sign in to comment.