mirror of https://github.com/milvus-io/milvus.git
test: Split test_search and refactor on test class to share collections (#40677)
issue: #40698

---------

Signed-off-by: yanliang567 <yanliang.qiao@zilliz.com>

parent b119ac5d30
commit cf223bae7b
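The commit title describes moving per-test collection setup into state shared by the test class. As a rough illustration of that idea (not the project's actual refactor), a class-scoped pytest fixture can build one collection and let every test in the class reuse it; the endpoint, names, and dimension below are assumptions made for the sketch.

import pytest
from pymilvus import MilvusClient


class TestSharedCollectionSketch:
    """Sketch: create one collection per test class and reuse it in every test."""

    @pytest.fixture(scope="class")
    def shared_collection(self):
        client = MilvusClient(uri="http://localhost:19530")  # assumed endpoint
        name = "shared_search_collection"                    # assumed name
        if not client.has_collection(name):
            client.create_collection(name, dimension=128)
        yield client, name
        client.drop_collection(name)  # dropped once, after the whole class has run

    def test_search_returns_one_result_set_per_query(self, shared_collection):
        client, name = shared_collection
        res = client.search(name, data=[[0.0] * 128], limit=5)
        assert len(res) == 1  # one result set per query vector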
@@ -53,8 +53,6 @@ image:
  tag: PR-35426-20240812-46dadb120
indexCoordinator:
  enabled: false
  gc:
    interval: 1
  resources:
    limits:
      cpu: "1"
@@ -107,11 +105,20 @@ log:
extraConfigFiles:
  user.yaml: |+
    indexCoord:
      gc:
        interval: 1
      scheduler:
        interval: 100
    indexNode:
      scheduler:
        buildParallel: 4
    queryNode:
      mmap:
        vectorField: true
        vectorIndex: true
        scalarField: true
        scalarIndex: true
        growingMmapEnabled: true
metrics:
  serviceMonitor:
    enabled: true
@@ -260,12 +267,6 @@ queryNode:
    requests:
      cpu: "0.5"
      memory: 500Mi
  mmap:
    vectorField: true
    vectorIndex: true
    scalarField: true
    scalarIndex: true
    growingMmapEnabled: true
rootCoordinator:
  resources:
    limits:
@@ -90,7 +90,7 @@ class TestMilvusClientV2Base(Base):
                                       collection_name=collection_name, dimension=dimension,
                                       **kwargs).run()

-        self.tear_down_collection_names.append(collection_name)
+        # self.tear_down_collection_names.append(collection_name)
        return res, check_result

    def has_collection(self, client, collection_name, timeout=None, check_task=None,
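This hunk comments out the per-call registration of newly created collections for teardown, which is the hook that lets a test class keep a collection alive across tests. Below is a minimal sketch of the deferred-teardown idea behind tear_down_collection_names; the client construction and hook names are assumptions, not the project's actual base class.

from pymilvus import MilvusClient


class TeardownByNameSketch:
    """Sketch: collect collection names during a test, drop them once afterwards."""

    def setup_method(self):
        self.client = MilvusClient(uri="http://localhost:19530")  # assumed endpoint
        self.tear_down_collection_names = []

    def teardown_method(self):
        # Only registered collections are dropped; a collection meant to be
        # shared across tests is simply never appended to this list.
        for name in self.tear_down_collection_names:
            if self.client.has_collection(name):
                self.client.drop_collection(name)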
@@ -305,6 +305,21 @@ class TestMilvusClientSearchInvalid(TestMilvusClientV2Base):
        self.create_collection(client, collection_name, default_dim, id_type="invalid",
                               check_task=CheckTasks.err_res, check_items=error)

    @pytest.mark.tags(CaseLabel.L2)
    def test_milvus_client_collection_string_auto_id(self):
        """
        target: test high level api: client.create_collection
        method: create collection with auto id on string primary key without max length
        expected: Raise exception
        """
        client = self._client()
        collection_name = cf.gen_unique_str(prefix)
        # 1. create collection
        error = {ct.err_code: 65535, ct.err_msg: f"type param(max_length) should be specified for the "
                                                 f"field({default_primary_key_field_name}) of collection {collection_name}"}
        self.create_collection(client, collection_name, default_dim, id_type="string", auto_id=True,
                               check_task=CheckTasks.err_res, check_items=error)

    @pytest.mark.tags(CaseLabel.L1)
    def test_milvus_client_create_same_collection_different_params(self):
        """
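The new negative case above checks that a string primary key is rejected when max_length is missing. For contrast, here is a minimal sketch of a schema that satisfies the requirement with the plain MilvusClient API; the endpoint and names are illustrative assumptions.

from pymilvus import MilvusClient, DataType

client = MilvusClient(uri="http://localhost:19530")  # assumed endpoint

schema = client.create_schema(auto_id=False, enable_dynamic_field=False)
schema.add_field("id", DataType.VARCHAR, is_primary=True, max_length=64)  # max_length declared
schema.add_field("vector", DataType.FLOAT_VECTOR, dim=128)

client.create_collection("string_pk_demo", schema=schema)  # assumed collection name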
@@ -0,0 +1,454 @@
import pytest
import random

from base.client_v2_base import TestMilvusClientV2Base
from utils.util_log import test_log as log
from common import common_func as cf
from common import common_type as ct
from common.common_type import CaseLabel, CheckTasks
from utils.util_pymilvus import *
from common.constants import *
from pymilvus import DataType

prefix = "alias"
exp_name = "name"
exp_schema = "schema"
default_schema = cf.gen_default_collection_schema()
default_binary_schema = cf.gen_default_binary_collection_schema()
default_nb = ct.default_nb
default_nb_medium = ct.default_nb_medium
default_nq = ct.default_nq
default_dim = ct.default_dim
default_limit = ct.default_limit
default_search_exp = "int64 >= 0"
default_search_field = ct.default_float_vec_field_name
default_search_params = ct.default_search_params
default_primary_key_field_name = "id"
default_vector_field_name = "vector"
default_float_field_name = ct.default_float_field_name
default_string_field_name = ct.default_string_field_name


class TestMilvusClientV2AliasInvalid(TestMilvusClientV2Base):
    """ Negative test cases of alias interface parameters"""

    @pytest.mark.tags(CaseLabel.L2)
    @pytest.mark.parametrize("alias_name", ct.invalid_resource_names)
    def test_milvus_client_v2_create_alias_with_invalid_name(self, alias_name):
        """
        target: test creating an alias with an invalid name
        method: create an alias with an invalid name for a collection
        expected: create alias failed
        """
        client = self._client()
        collection_name = cf.gen_unique_str("collection")

        # 1. create collection
        self.create_collection(client, collection_name, default_dim, consistency_level="Bounded")

        # 2. create alias with invalid name
        error = {ct.err_code: 1100, ct.err_msg: "Invalid collection alias"}
        if alias_name is None or alias_name.strip() == "":
            error = {ct.err_code: 1100, ct.err_msg: "collection alias should not be empty"}
        self.create_alias(client, collection_name, alias_name,
                          check_task=CheckTasks.err_res, check_items=error)

        # cleanup
        self.drop_collection(client, collection_name)

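For orientation, the wrapped helpers used throughout this file (create_alias, alter_alias, drop_alias) correspond to the plain MilvusClient alias API. Below is a minimal standalone sketch of that API; the endpoint, collection names, and alias are illustrative assumptions, not values from these tests.

from pymilvus import MilvusClient

client = MilvusClient(uri="http://localhost:19530")  # assumed endpoint
client.create_collection("products_v1", dimension=8)

# Bind an alias to the collection; readers can use the alias like a collection name.
client.create_alias(collection_name="products_v1", alias="products")

# Repoint the alias to a newer collection without touching readers.
client.create_collection("products_v2", dimension=8)
client.alter_alias(collection_name="products_v2", alias="products")

# Clean up.
client.drop_alias("products")
client.drop_collection("products_v1")
client.drop_collection("products_v2")
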
class TestMilvusClientV2AliasOperation(TestMilvusClientV2Base):
    """ Test cases of alias interface operations"""

    @pytest.mark.tags(CaseLabel.L0)
    def test_milvus_client_v2_alter_alias_operation_default(self):
        """
        target: test collection altering alias
        method:
            1. create collection_1 with index and load, bind alias to collection_1 and insert 2000 entities
            2. verify operations using alias work on collection_1
            3. create collection_2 with index and load with 1500 entities
            4. alter alias to collection_2
            5. verify operations using alias work on collection_2
        expected:
            1. operations using alias work on collection_1 before alter
            2. operations using alias work on collection_2 after alter
        """
        client = self._client()

        # 1. create collection1 with index and load
        collection_name1 = cf.gen_unique_str("collection1")
        index_params = self.prepare_index_params(client)[0]
        index_params.add_index(field_name=default_vector_field_name, metric_type="L2")
        self.create_collection(client, collection_name1, default_dim, consistency_level="Bounded", index_params=index_params)

        # 2. create alias and insert data
        alias_name = cf.gen_unique_str(prefix)
        self.create_alias(client, collection_name1, alias_name)

        # 3. insert data into collection1 using alias
        nb1 = 2000
        vectors = cf.gen_vectors(nb1, default_dim)
        rows = [{default_primary_key_field_name: i, default_vector_field_name: vectors[i],
                 default_float_field_name: i * 1.0, default_string_field_name: str(i)} for i in range(nb1)]
        self.insert(client, alias_name, rows)
        self.flush(client, alias_name)

        # 4. verify collection1 data using alias
        res1 = self.query(client, alias_name, filter="", output_fields=["count(*)"])
        assert res1[0][0].get("count(*)") == nb1

        # 5. verify search using alias works on collection1
        search_vectors = cf.gen_vectors(1, default_dim)
        self.search(client, alias_name, search_vectors, limit=default_limit,
                    check_task=CheckTasks.check_search_results,
                    check_items={"enable_milvus_client_api": True,
                                 "nq": len(search_vectors),
                                 "limit": default_limit})

        # 6. create collection2 with index and load
        collection_name2 = cf.gen_unique_str("collection2")
        self.create_collection(client, collection_name2, default_dim, consistency_level="Bounded", index_params=index_params)

        # 7. insert data into collection2
        nb2 = 1500
        vectors = cf.gen_vectors(nb2, default_dim)
        rows = [{default_primary_key_field_name: i, default_vector_field_name: vectors[i],
                 default_float_field_name: i * 1.0, default_string_field_name: str(i)} for i in range(nb2)]
        self.insert(client, collection_name2, rows)
        self.flush(client, collection_name2)

        # 8. alter alias to collection2
        self.alter_alias(client, collection_name2, alias_name)

        # 9. verify collection2 data using alias
        res2 = self.query(client, alias_name, filter="", output_fields=["count(*)"])
        assert res2[0][0].get("count(*)") == nb2

        # 10. verify search using alias works on collection2
        search_vectors = cf.gen_vectors(1, default_dim)
        self.search(client, alias_name, search_vectors, limit=default_limit,
                    check_task=CheckTasks.check_search_results,
                    check_items={"enable_milvus_client_api": True,
                                 "nq": len(search_vectors),
                                 "limit": default_limit})

        # 11. verify operations on collection1 still work
        res1 = self.query(client, collection_name1, filter="", output_fields=["count(*)"])
        assert res1[0][0].get("count(*)") == nb1

        # cleanup
        self.release_collection(client, collection_name1)
        self.release_collection(client, collection_name2)
        self.drop_collection(client, collection_name1)
        self.drop_alias(client, alias_name)
        self.drop_collection(client, collection_name2)

    @pytest.mark.tags(CaseLabel.L1)
    def test_milvus_client_v2_create_drop_alias_operation_default(self):
        """
        target: test collection creating and dropping alias
        method:
            1. create a collection with 10 partitions
            2. create an alias for the collection
            3. verify alias has same partitions as collection
            4. drop the alias
            5. verify alias is dropped and collection still exists
        expected:
            1. alias has same partitions as collection
            2. alias can be dropped successfully
            3. collection remains unchanged after alias operations
        """
        client = self._client()
        collection_name = cf.gen_unique_str("collection")

        # 1. create collection
        self.create_collection(client, collection_name, default_dim, consistency_level="Bounded")

        # 2. create partitions
        partition_names = []
        for _ in range(10):
            partition_name = cf.gen_unique_str("partition")
            partition_names.append(partition_name)
            self.create_partition(client, collection_name, partition_name)

        # 3. create alias
        alias_name = cf.gen_unique_str(prefix)
        self.create_alias(client, collection_name, alias_name)

        # 4. verify partitions in collection and alias
        partitions = self.list_partitions(client, collection_name)
        alias_partitions = self.list_partitions(client, alias_name)
        assert partitions == alias_partitions

        # 5. verify collection exists
        assert self.has_collection(client, collection_name)[0]
        assert self.has_collection(client, alias_name)[0]

        # 6. drop alias
        self.drop_alias(client, alias_name)

        # 7. verify alias is dropped
        error = {ct.err_code: 0,
                 ct.err_msg: f"can't find collection[database=default][collection={alias_name}]"}
        self.describe_collection(client, alias_name,
                                 check_task=CheckTasks.err_res,
                                 check_items=error)

        # 8. verify collection still exists and unchanged
        assert self.has_collection(client, collection_name)[0]
        collection_partitions = self.list_partitions(client, collection_name)
        assert collection_partitions == partitions

        # cleanup
        self.drop_collection(client, collection_name)

    @pytest.mark.tags(CaseLabel.L2)
    def test_milvus_client_v2_collection_operations_by_alias(self):
        """
        target: test collection operations using alias
        method:
            1. create collection with alias
            2. verify has_collection works with alias
            3. verify drop_collection fails with alias
        expected:
            1. has_collection returns True for alias
            2. drop_collection fails with error message
        """
        client = self._client()
        collection_name = cf.gen_unique_str("collection")

        # 1. create collection
        self.create_collection(client, collection_name, default_dim, consistency_level="Bounded")

        # 2. create alias
        alias_name = cf.gen_unique_str(prefix)
        self.create_alias(client, collection_name, alias_name)

        # 3. verify has_collection works with alias
        assert self.has_collection(client, alias_name)[0]
        assert self.has_collection(client, collection_name)[0]

        # 4. verify drop_collection fails with alias
        error = {ct.err_code: 1,
                 ct.err_msg: f"cannot drop the collection via alias = {alias_name}"}
        self.drop_collection(client, alias_name,
                             check_task=CheckTasks.err_res,
                             check_items=error)

        # cleanup
        self.drop_alias(client, alias_name)
        self.drop_collection(client, collection_name)
        assert not self.has_collection(client, collection_name)[0]

class TestMilvusClientV2AliasOperationInvalid(TestMilvusClientV2Base):
    """ Test cases of alias interface invalid operations"""

    @pytest.mark.tags(CaseLabel.L1)
    def test_milvus_client_v2_create_duplication_alias(self):
        """
        target: test create duplicate alias
        method: create alias twice with same name to different collections
        expected: raise exception
        """
        client = self._client()
        collection_name1 = cf.gen_unique_str("collection1")
        collection_name2 = cf.gen_unique_str("collection2")

        # 1. create collection1
        self.create_collection(client, collection_name1, default_dim, consistency_level="Bounded")

        # 2. create collection2
        self.create_collection(client, collection_name2, default_dim, consistency_level="Bounded")

        # 3. create alias for collection1
        alias_name = cf.gen_unique_str(prefix)
        self.create_alias(client, collection_name1, alias_name)

        # 4. try to create same alias for collection2
        error = {ct.err_code: 1,
                 ct.err_msg: f"{alias_name} is alias to another collection: {collection_name1}"}
        self.create_alias(client, collection_name2, alias_name,
                          check_task=CheckTasks.err_res,
                          check_items=error)

        # cleanup
        self.drop_alias(client, alias_name)
        self.drop_collection(client, collection_name1)
        self.drop_collection(client, collection_name2)

    @pytest.mark.tags(CaseLabel.L1)
    def test_milvus_client_v2_alter_not_exist_alias(self):
        """
        target: test altering an alias that does not exist
        method: alter an alias that does not exist
        expected: raise exception
        """
        client = self._client()
        collection_name = cf.gen_unique_str("collection")
        alias_name = cf.gen_unique_str(prefix)

        # 1. create collection
        self.create_collection(client, collection_name, default_dim, consistency_level="Bounded")

        # 2. create alias and link to the collection
        self.create_alias(client, collection_name, alias_name)

        # 3. alter alias, trying to link the collection to a non-existing alias
        non_exist_alias = cf.gen_unique_str(prefix)
        error = {ct.err_code: 1600,
                 ct.err_msg: f"alias not found[database=default][alias={non_exist_alias}]"}
        self.alter_alias(client, collection_name, non_exist_alias,
                         check_task=CheckTasks.err_res,
                         check_items=error)

        # 4. cleanup
        self.drop_alias(client, alias_name)
        self.drop_collection(client, collection_name)

    @pytest.mark.tags(CaseLabel.L2)
    def test_milvus_client_v2_drop_not_exist_alias(self):
        """
        target: test dropping an alias that does not exist
        method: drop an alias that does not exist
        expected: no exception
        """
        client = self._client()
        alias_name = cf.gen_unique_str(prefix)

        # trying to drop a non-existing alias
        self.drop_alias(client, alias_name)

    @pytest.mark.tags(CaseLabel.L2)
    def test_milvus_client_v2_drop_same_alias_twice(self):
        """
        target: test drop same alias twice
        method: drop alias twice
        expected: no exception
        """
        client = self._client()
        collection_name = cf.gen_unique_str("collection")

        # 1. create collection
        self.create_collection(client, collection_name, default_dim, consistency_level="Bounded")

        # 2. create alias
        alias_name = cf.gen_unique_str(prefix)
        self.create_alias(client, collection_name, alias_name)

        # 3. drop alias first time
        self.drop_alias(client, alias_name)

        # 4. try to drop alias second time
        self.drop_alias(client, alias_name)

        # cleanup
        self.drop_collection(client, collection_name)

    @pytest.mark.tags(CaseLabel.L2)
    def test_milvus_client_v2_create_dup_name_collection(self):
        """
        target: test create collection with duplicate name
        method: create collection with alias name
        expected: raise exception
        """
        client = self._client()
        collection_name = cf.gen_unique_str("collection")

        # 1. create collection
        self.create_collection(client, collection_name, default_dim, consistency_level="Bounded")

        # 2. create alias
        alias_name = cf.gen_unique_str(prefix)
        self.create_alias(client, collection_name, alias_name)

        # 3. try to create collection with alias name
        error = {ct.err_code: 0,
                 ct.err_msg: f"collection name [{alias_name}] conflicts with an existing alias,"
                             " please choose a unique name"}
        self.create_collection(client, alias_name, default_dim, consistency_level="Bounded",
                               check_task=CheckTasks.err_res,
                               check_items=error)

        # cleanup
        self.drop_alias(client, alias_name)
        self.drop_collection(client, collection_name)

    @pytest.mark.tags(CaseLabel.L0)
    def test_milvus_client_v2_reuse_alias_name(self):
        """
        target: test reuse alias name from dropped collection
        method:
            1. create collection1 with alias
            2. drop collection1
            3. create collection2 with same alias name
        expected: create collection2 successfully
        """
        client = self._client()
        collection_name1 = cf.gen_unique_str("collection1")

        # 1. create collection1
        self.create_collection(client, collection_name1, default_dim, consistency_level="Bounded")

        # 2. create alias
        alias_name = cf.gen_unique_str(prefix)
        self.create_alias(client, collection_name1, alias_name)

        # 3. drop the alias and collection1
        self.drop_alias(client, alias_name)
        self.drop_collection(client, collection_name1)

        # 4. create collection2
        collection_name2 = cf.gen_unique_str("collection2")
        self.create_collection(client, collection_name2, default_dim, consistency_level="Bounded")

        # 5. create alias with the previous alias name and assign it to collection2
        self.create_alias(client, collection_name2, alias_name)

        # 6. verify collection2
        assert self.has_collection(client, collection_name2)[0]
        assert self.has_collection(client, alias_name)[0]

        # cleanup
        self.drop_alias(client, alias_name)
        self.drop_collection(client, collection_name2)

    @pytest.mark.tags(CaseLabel.L0)
    def test_milvus_client_v2_rename_collection_to_alias_name(self):
        """
        target: test rename collection to alias name
        method:
            1. create collection1 with alias
            2. rename collection2 to the alias name
        expected: raise exception
        """
        client = self._client()
        collection_name1 = cf.gen_unique_str("collection1")
        collection_name2 = cf.gen_unique_str("collection2")

        # 1. create collection1
        self.create_collection(client, collection_name1, default_dim, consistency_level="Bounded")

        # 2. create alias
        alias_name = cf.gen_unique_str(prefix)
        self.create_alias(client, collection_name1, alias_name)

        # 3. create collection2
        self.create_collection(client, collection_name2, default_dim, consistency_level="Bounded")

        # 4. try to rename collection2 to alias name
        error = {ct.err_code: 999,
                 ct.err_msg: f"cannot rename collection to an existing alias: {alias_name}"}
        self.rename_collection(client, collection_name2, alias_name,
                               check_task=CheckTasks.err_res,
                               check_items=error)

        # cleanup
        self.drop_alias(client, alias_name)
        self.drop_collection(client, collection_name1)
        self.drop_collection(client, collection_name2)
@@ -0,0 +1,596 @@
import pytest
import numpy as np
import time
from common.common_type import CaseLabel, CheckTasks
from common import common_func as cf
from common import common_type as ct
from utils.util_log import test_log as log
from utils.util_pymilvus import *
from base.client_v2_base import TestMilvusClientV2Base
from pymilvus import DataType, FieldSchema, CollectionSchema

# Test parameters
default_dim = ct.default_dim
default_nb = ct.default_nb
default_nq = ct.default_nq
default_limit = ct.default_limit
default_search_exp = "id >= 0"
exp_res = "exp_res"
default_primary_key_field_name = "id"
default_vector_field_name = "vector"
default_float_field_name = ct.default_float_field_name
default_string_field_name = ct.default_string_field_name


class TestMilvusClientE2E(TestMilvusClientV2Base):
    """ Test case of end-to-end interface """

    @pytest.mark.tags(CaseLabel.L0)
    @pytest.mark.skip(reason="issue #40686")
    @pytest.mark.parametrize("flush_enable", [True, False])
    @pytest.mark.parametrize("scalar_index_enable", [True, False])
    def test_milvus_client_e2e_default(self, flush_enable, scalar_index_enable):
        """
        target: test high level api: client.create_collection, insert, search, query
        method: create connection, collection, insert and search with:
                1. flush enabled/disabled
                2. scalar index enabled/disabled
        expected: search/query successfully
        """
        client = self._client()

        # 1. Create collection with custom schema
        collection_name = cf.gen_unique_str("test_e2e")
        schema = self.create_schema(client, enable_dynamic_field=False)[0]
        # Primary key and vector field
        schema.add_field("id", DataType.INT64, is_primary=True, auto_id=False)
        schema.add_field("embeddings", DataType.FLOAT_VECTOR, dim=default_dim)
        # Boolean type
        schema.add_field("bool_field", DataType.BOOL, nullable=True)
        # Integer types
        schema.add_field("int8_field", DataType.INT8, nullable=True)
        schema.add_field("int16_field", DataType.INT16, nullable=True)
        schema.add_field("int32_field", DataType.INT32, nullable=True)
        schema.add_field("int64_field", DataType.INT64, nullable=True)
        # Float types
        schema.add_field("float_field", DataType.FLOAT, nullable=True)
        schema.add_field("double_field", DataType.DOUBLE, nullable=True)
        # String type
        schema.add_field("varchar_field", DataType.VARCHAR, max_length=65535, nullable=True)
        # JSON type
        schema.add_field("json_field", DataType.JSON, nullable=True)
        # Array type
        schema.add_field("array_field", DataType.ARRAY, element_type=DataType.INT64, max_capacity=12, nullable=True)

        # Create collection
        self.create_collection(client, collection_name, schema=schema)

        # 2. Insert data with null values for nullable fields
        num_inserts = 3  # insert data 3 times
        total_rows = []
        for batch in range(num_inserts):
            vectors = cf.gen_vectors(default_nb, default_dim)
            rows = []
            start_id = batch * default_nb  # ensure id is not duplicated

            for i in range(default_nb):
                row = {
                    "id": start_id + i,  # ensure id is not duplicated
                    "embeddings": list(vectors[i])
                }

                # Add nullable fields with null values for every 5th record
                if i % 5 == 0:
                    row.update({"bool_field": None, "int8_field": None, "int16_field": None,
                                "int32_field": None, "int64_field": None, "float_field": None,
                                "double_field": None, "varchar_field": None, "json_field": None,
                                "array_field": None})
                else:
                    row.update({"bool_field": i % 2 == 0, "int8_field": i % 128, "int16_field": i % 32768,
                                "int32_field": i, "int64_field": i, "float_field": float(i),
                                "double_field": float(i) * 1.0, "varchar_field": f"varchar_{start_id + i}",
                                "json_field": {"id": start_id + i, "value": f"json_{start_id + i}"},
                                "array_field": [i, i + 1, i + 2]})
                rows.append(row)
                total_rows.append(row)

            t0 = time.time()
            self.insert(client, collection_name, rows)
            t1 = time.time()
            log.info(f"Insert batch {batch + 1}: {default_nb} entities cost {t1 - t0:.4f} seconds")

        log.info(f"Total inserted {num_inserts * default_nb} entities")

        if flush_enable:
            self.flush(client, collection_name)
            log.info("Flush enabled: executing flush operation")
        else:
            log.info("Flush disabled: skipping flush operation")

        # Create index parameters
        index_params = self.prepare_index_params(client)[0]
        index_params.add_index("embeddings", metric_type="COSINE")

        # Add autoindex for scalar fields if enabled
        if scalar_index_enable:
            index_params.add_index(field_name="int8_field", index_type="AUTOINDEX")
            index_params.add_index(field_name="int16_field", index_type="AUTOINDEX")
            index_params.add_index(field_name="int32_field", index_type="AUTOINDEX")
            index_params.add_index(field_name="int64_field", index_type="AUTOINDEX")
            index_params.add_index(field_name="float_field", index_type="AUTOINDEX")
            index_params.add_index(field_name="double_field", index_type="AUTOINDEX")
            index_params.add_index(field_name="varchar_field", index_type="AUTOINDEX")

        # 3. Create index
        self.create_index(client, collection_name, index_params)

        # Verify scalar indexes are created if enabled
        indexes = self.list_indexes(client, collection_name)[0]
        log.info(f"Created indexes: {indexes}")
        expected_scalar_indexes = ["int8_field", "int16_field", "int32_field", "int64_field",
                                   "float_field", "double_field", "varchar_field"]
        if scalar_index_enable:
            for field in expected_scalar_indexes:
                assert field in indexes, f"Scalar index not created for field: {field}"
        else:
            for field in expected_scalar_indexes:
                assert field not in indexes, f"Scalar index should not be created for field: {field}"

        # 4. Load collection
        t0 = time.time()
        self.load_collection(client, collection_name)
        t1 = time.time()
        log.info(f"Load collection cost {t1 - t0:.4f} seconds")

        # 5. Search
        t0 = time.time()
        vectors_to_search = cf.gen_vectors(1, default_dim)
        search_params = {"metric_type": "COSINE", "params": {"nprobe": 100}}
        search_res, _ = self.search(client, collection_name, vectors_to_search,
                                    anns_field="embeddings", search_params=search_params,
                                    limit=default_limit, output_fields=['*'],
                                    check_task=CheckTasks.check_search_results,
                                    check_items={"enable_milvus_client_api": True,
                                                 "nq": len(vectors_to_search),
                                                 "limit": default_limit})
        t1 = time.time()
        log.info(f"Search cost {t1 - t0:.4f} seconds")

        # 6. Query with filters on each scalar field
        t0 = time.time()

        # Query on boolean field
        bool_filter = "bool_field == true"
        bool_expected = [r for r in total_rows if r["bool_field"] is not None and r["bool_field"]]
        query_res, _ = self.query(client, collection_name, filter=bool_filter, output_fields=['*'],
                                  check_task=CheckTasks.check_query_results,
                                  check_items={"exp_res": bool_expected, "with_vec": True, "primary_field": "id"})

        # Query on int8 field
        int8_filter = "int8_field < 50"
        int8_expected = [r for r in total_rows if r["int8_field"] is not None and r["int8_field"] < 50]
        query_res, _ = self.query(client, collection_name, filter=int8_filter, output_fields=['*'],
                                  check_task=CheckTasks.check_query_results,
                                  check_items={"exp_res": int8_expected, "with_vec": True, "primary_field": "id"})

        # Query on int16 field
        int16_filter = "int16_field < 1000"
        int16_expected = [r for r in total_rows if r["int16_field"] is not None and r["int16_field"] < 1000]
        query_res, _ = self.query(client, collection_name, filter=int16_filter, output_fields=['*'],
                                  check_task=CheckTasks.check_query_results,
                                  check_items={"exp_res": int16_expected, "with_vec": True, "primary_field": "id"})

        # Query on int32 field
        int32_filter = "int32_field in [1,2,3,4,5]"
        int32_expected = [r for r in total_rows if r["int32_field"] is not None and r["int32_field"] in [1,2,3,4,5]]
        query_res, _ = self.query(client, collection_name, filter=int32_filter, output_fields=['*'],
                                  check_task=CheckTasks.check_query_results,
                                  check_items={"exp_res": int32_expected, "with_vec": True, "primary_field": "id"})

        # Query on int64 field
        int64_filter = "int64_field >= 10"
        int64_expected = [r for r in total_rows if r["int64_field"] is not None and r["int64_field"] >= 10]
        query_res, _ = self.query(client, collection_name, filter=int64_filter, output_fields=['*'],
                                  check_task=CheckTasks.check_query_results,
                                  check_items={"exp_res": int64_expected, "with_vec": True, "primary_field": "id"})

        # Query on float field
        float_filter = "float_field > 5.0"
        float_expected = [r for r in total_rows if r["float_field"] is not None and r["float_field"] > 5.0]
        query_res, _ = self.query(client, collection_name, filter=float_filter, output_fields=['*'],
                                  check_task=CheckTasks.check_query_results,
                                  check_items={"exp_res": float_expected, "with_vec": True, "primary_field": "id"})

        # Query on double field
        double_filter = "3.0 <= double_field <= 7.0"
        double_expected = [r for r in total_rows if r["double_field"] is not None and 3.0 <= r["double_field"] <= 7.0]
        query_res, _ = self.query(client, collection_name, filter=double_filter, output_fields=['*'],
                                  check_task=CheckTasks.check_query_results,
                                  check_items={"exp_res": double_expected, "with_vec": True, "primary_field": "id"})

        # Query on varchar field
        varchar_filter = "varchar_field like \"varchar_1%\""
        varchar_expected = [r for r in total_rows if r["varchar_field"] is not None and r["varchar_field"].startswith("varchar_1")]
        query_res, _ = self.query(client, collection_name, filter=varchar_filter, output_fields=['*'],
                                  check_task=CheckTasks.check_query_results,
                                  check_items={"exp_res": varchar_expected, "with_vec": True, "primary_field": "id"})

        # Query on varchar null values
        varchar_null_filter = "varchar_field is null"
        varchar_null_expected = [r for r in total_rows if r["varchar_field"] is None]
        query_res, _ = self.query(client, collection_name, filter=varchar_null_filter, output_fields=['*'],
                                  check_task=CheckTasks.check_query_results,
                                  check_items={"exp_res": varchar_null_expected, "with_vec": True, "primary_field": "id"})

        # Query on json field null values
        json_null_filter = "json_field is null"
        json_null_expected = [r for r in total_rows if r["json_field"] is None]
        query_res, _ = self.query(client, collection_name, filter=json_null_filter, output_fields=['*'],
                                  check_task=CheckTasks.check_query_results,
                                  check_items={"exp_res": json_null_expected, "with_vec": True, "primary_field": "id"})

        # Query on array field null values
        array_null_filter = "array_field is null"
        array_null_expected = [r for r in total_rows if r["array_field"] is None]
        query_res, _ = self.query(client, collection_name, filter=array_null_filter, output_fields=['*'],
                                  check_task=CheckTasks.check_query_results,
                                  check_items={"exp_res": array_null_expected, "with_vec": True, "primary_field": "id"})

        # Query on multiple nullable fields
        multi_null_filter = "varchar_field is null and json_field is null and array_field is null"
        multi_null_expected = [r for r in total_rows if r["varchar_field"] is None and r["json_field"] is None and r["array_field"] is None]
        query_res, _ = self.query(client, collection_name, filter=multi_null_filter, output_fields=['*'],
                                  check_task=CheckTasks.check_query_results,
                                  check_items={"exp_res": multi_null_expected, "with_vec": True, "primary_field": "id"})

        # Query on mix of null and non-null conditions
        mix_filter = "varchar_field is null and json_field is not null"
        mix_expected = [r for r in total_rows if r["varchar_field"] is None and r["json_field"] is not None]
        query_res, _ = self.query(client, collection_name, filter=mix_filter, output_fields=['*'],
                                  check_task=CheckTasks.check_query_results,
                                  check_items={"exp_res": mix_expected, "with_vec": True, "primary_field": "id"})

        # Query on is not null conditions for each scalar field
        # Int8 field is not null
        int8_not_null_filter = "int8_field is not null"
        int8_not_null_expected = [r for r in total_rows if r["int8_field"] is not None]
        query_res, _ = self.query(client, collection_name, filter=int8_not_null_filter, output_fields=['*'],
                                  check_task=CheckTasks.check_query_results,
                                  check_items={"exp_res": int8_not_null_expected, "with_vec": True, "primary_field": "id"})

        # Int16 field is not null
        int16_not_null_filter = "int16_field is not null"
        int16_not_null_expected = [r for r in total_rows if r["int16_field"] is not None]
        query_res, _ = self.query(client, collection_name, filter=int16_not_null_filter, output_fields=['*'],
                                  check_task=CheckTasks.check_query_results,
                                  check_items={"exp_res": int16_not_null_expected, "with_vec": True, "primary_field": "id"})

        # Float field is not null
        float_not_null_filter = "float_field is not null"
        float_not_null_expected = [r for r in total_rows if r["float_field"] is not None]
        query_res, _ = self.query(client, collection_name, filter=float_not_null_filter, output_fields=['*'],
                                  check_task=CheckTasks.check_query_results,
                                  check_items={"exp_res": float_not_null_expected, "with_vec": True, "primary_field": "id"})

        # Double field is not null
        double_not_null_filter = "double_field is not null"
        double_not_null_expected = [r for r in total_rows if r["double_field"] is not None]
        query_res, _ = self.query(client, collection_name, filter=double_not_null_filter, output_fields=['*'],
                                  check_task=CheckTasks.check_query_results,
                                  check_items={"exp_res": double_not_null_expected, "with_vec": True, "primary_field": "id"})

        # Varchar field is not null
        varchar_not_null_filter = "varchar_field is not null"
        varchar_not_null_expected = [r for r in total_rows if r["varchar_field"] is not None]
        query_res, _ = self.query(client, collection_name, filter=varchar_not_null_filter, output_fields=['*'],
                                  check_task=CheckTasks.check_query_results,
                                  check_items={"exp_res": varchar_not_null_expected, "with_vec": True, "primary_field": "id"})

        # JSON field is not null
        json_not_null_filter = "json_field is not null"
        json_not_null_expected = [r for r in total_rows if r["json_field"] is not None]
        query_res, _ = self.query(client, collection_name, filter=json_not_null_filter, output_fields=['*'],
                                  check_task=CheckTasks.check_query_results,
                                  check_items={"exp_res": json_not_null_expected, "with_vec": True, "primary_field": "id"})

        # Array field is not null
        array_not_null_filter = "array_field is not null"
        array_not_null_expected = [r for r in total_rows if r["array_field"] is not None]
        query_res, _ = self.query(client, collection_name, filter=array_not_null_filter, output_fields=['*'],
                                  check_task=CheckTasks.check_query_results,
                                  check_items={"exp_res": array_not_null_expected, "with_vec": True, "primary_field": "id"})

        # Multiple fields is not null
        multi_not_null_filter = "varchar_field is not null and json_field is not null and array_field is not null"
        multi_not_null_expected = [r for r in total_rows if r["varchar_field"] is not None and r["json_field"] is not None and r["array_field"] is not None]
        query_res, _ = self.query(client, collection_name, filter=multi_not_null_filter, output_fields=['*'],
                                  check_task=CheckTasks.check_query_results,
                                  check_items={"exp_res": multi_not_null_expected, "with_vec": True, "primary_field": "id"})

        # Complex mixed conditions with is null, is not null, and comparison operators
        # Test case 1: int field is null AND float field > value AND varchar field is not null
        complex_mix_filter1 = "int32_field is null and float_field > 10.0 and varchar_field is not null"
        complex_mix_expected1 = [r for r in total_rows if r["int32_field"] is None and
                                 r["float_field"] is not None and r["float_field"] > 10.0 and
                                 r["varchar_field"] is not None]
        query_res, _ = self.query(client, collection_name, filter=complex_mix_filter1, output_fields=['*'],
                                  check_task=CheckTasks.check_query_results,
                                  check_items={"exp_res": complex_mix_expected1, "with_vec": True, "primary_field": "id"})

        # Test case 2: varchar field is not null AND int field between values AND float field is null
        complex_mix_filter2 = "varchar_field is not null and 5 <= int64_field <= 15 and float_field is null"
        complex_mix_expected2 = [r for r in total_rows if r["varchar_field"] is not None and
                                 r["int64_field"] is not None and 5 <= r["int64_field"] <= 15 and
                                 r["float_field"] is None]
        query_res, _ = self.query(client, collection_name, filter=complex_mix_filter2, output_fields=['*'],
                                  check_task=CheckTasks.check_query_results,
                                  check_items={"exp_res": complex_mix_expected2, "with_vec": True, "primary_field": "id"})

        # Test case 3: Multiple fields with mixed null/not null conditions and range comparisons
        complex_mix_filter3 = "int8_field is not null and int8_field < 50 and double_field is null and varchar_field is not null and varchar_field like \"varchar_2%\""
        complex_mix_expected3 = [r for r in total_rows if r["int8_field"] is not None and r["int8_field"] < 50 and
                                 r["double_field"] is None and
                                 r["varchar_field"] is not None and r["varchar_field"].startswith("varchar_2")]
        query_res, _ = self.query(client, collection_name, filter=complex_mix_filter3, output_fields=['*'],
                                  check_task=CheckTasks.check_query_results,
                                  check_items={"exp_res": complex_mix_expected3, "with_vec": True, "primary_field": "id"})

        t1 = time.time()
        log.info(f"Query on all scalar fields cost {t1 - t0:.4f} seconds")

        # 7. Delete data
        t0 = time.time()
        self.delete(client, collection_name, filter=default_search_exp)
        t1 = time.time()
        log.info(f"Delete cost {t1 - t0:.4f} seconds")

        # 8. Verify deletion
        query_res, _ = self.query(client, collection_name, filter=default_search_exp,
                                  check_task=CheckTasks.check_query_results,
                                  check_items={"exp_res": []})

        # 9. Cleanup
        self.release_collection(client, collection_name)
        self.drop_collection(client, collection_name)
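The scalar-index assertions above go through the wrapped list_indexes helper. For reference, the same flow against a bare MilvusClient looks roughly like the sketch below; the endpoint, collection name, and field set are assumptions.

from pymilvus import MilvusClient, DataType

client = MilvusClient(uri="http://localhost:19530")  # assumed endpoint

schema = client.create_schema(enable_dynamic_field=False)
schema.add_field("id", DataType.INT64, is_primary=True)
schema.add_field("embeddings", DataType.FLOAT_VECTOR, dim=128)
schema.add_field("varchar_field", DataType.VARCHAR, max_length=256, nullable=True)

index_params = client.prepare_index_params()
index_params.add_index(field_name="embeddings", metric_type="COSINE")
index_params.add_index(field_name="varchar_field", index_type="AUTOINDEX")  # scalar autoindex

client.create_collection("e2e_index_sketch", schema=schema, index_params=index_params)
print(client.list_indexes("e2e_index_sketch"))  # both the vector and the scalar index should be listed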
File diff suppressed because it is too large
File diff suppressed because it is too large
@@ -0,0 +1,133 @@
import numpy as np
from pymilvus.orm.types import CONSISTENCY_STRONG, CONSISTENCY_BOUNDED, CONSISTENCY_SESSION, CONSISTENCY_EVENTUALLY
from pymilvus import AnnSearchRequest, RRFRanker, WeightedRanker
from pymilvus import (
    FieldSchema, CollectionSchema, DataType,
    Collection
)
from common.constants import *
from utils.util_pymilvus import *
from common.common_type import CaseLabel, CheckTasks
from common import common_type as ct
from common import common_func as cf
from utils.util_log import test_log as log
from base.client_base import TestcaseBase
import heapq
from time import sleep
from decimal import Decimal, getcontext
import decimal
import multiprocessing
import numbers
import random
import math
import numpy
import threading
import pytest
import pandas as pd
from faker import Faker

Faker.seed(19530)
fake_en = Faker("en_US")
fake_zh = Faker("zh_CN")

# patch faker to generate text with specific distribution
cf.patch_faker_text(fake_en, cf.en_vocabularies_distribution)
cf.patch_faker_text(fake_zh, cf.zh_vocabularies_distribution)

pd.set_option("expand_frame_repr", False)

prefix = "search_collection"
search_num = 10
max_dim = ct.max_dim
min_dim = ct.min_dim
epsilon = ct.epsilon
hybrid_search_epsilon = 0.01
gracefulTime = ct.gracefulTime
default_nb = ct.default_nb
default_nb_medium = ct.default_nb_medium
default_nq = ct.default_nq
default_dim = ct.default_dim
default_limit = ct.default_limit
max_limit = ct.max_limit
default_search_exp = "int64 >= 0"
default_search_string_exp = "varchar >= \"0\""
default_search_mix_exp = "int64 >= 0 && varchar >= \"0\""
default_invaild_string_exp = "varchar >= 0"
default_json_search_exp = "json_field[\"number\"] >= 0"
perfix_expr = 'varchar like "0%"'
default_search_field = ct.default_float_vec_field_name
default_search_params = ct.default_search_params
default_int64_field_name = ct.default_int64_field_name
default_float_field_name = ct.default_float_field_name
default_bool_field_name = ct.default_bool_field_name
default_string_field_name = ct.default_string_field_name
default_json_field_name = ct.default_json_field_name
default_index_params = ct.default_index
vectors = [[random.random() for _ in range(default_dim)] for _ in range(default_nq)]
range_search_supported_indexes = ct.all_index_types[:7]
uid = "test_search"
nq = 1
epsilon = 0.001
field_name = default_float_vec_field_name
binary_field_name = default_binary_vec_field_name
search_param = {"nprobe": 1}
entity = gen_entities(1, is_normal=True)
entities = gen_entities(default_nb, is_normal=True)
raw_vectors, binary_entities = gen_binary_entities(default_nb)
default_query, _ = gen_search_vectors_params(field_name, entities, default_top_k, nq)
index_name1 = cf.gen_unique_str("float")
index_name2 = cf.gen_unique_str("varhar")
half_nb = ct.default_nb // 2
max_hybrid_search_req_num = ct.max_hybrid_search_req_num

class TestSearchArray(TestcaseBase):

    @pytest.mark.tags(CaseLabel.L1)
    @pytest.mark.parametrize("array_element_data_type", [DataType.INT64])
    def test_search_array_with_inverted_index(self, array_element_data_type):
        # create collection
        additional_params = {"max_length": 1000} if array_element_data_type == DataType.VARCHAR else {}
        fields = [
            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
            FieldSchema(name="contains", dtype=DataType.ARRAY, element_type=array_element_data_type, max_capacity=2000,
                        **additional_params),
            FieldSchema(name="contains_any", dtype=DataType.ARRAY, element_type=array_element_data_type,
                        max_capacity=2000, **additional_params),
            FieldSchema(name="contains_all", dtype=DataType.ARRAY, element_type=array_element_data_type,
                        max_capacity=2000, **additional_params),
            FieldSchema(name="equals", dtype=DataType.ARRAY, element_type=array_element_data_type, max_capacity=2000,
                        **additional_params),
            FieldSchema(name="array_length_field", dtype=DataType.ARRAY, element_type=array_element_data_type,
                        max_capacity=2000, **additional_params),
            FieldSchema(name="array_access", dtype=DataType.ARRAY, element_type=array_element_data_type,
                        max_capacity=2000, **additional_params),
            FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=128)
        ]
        schema = CollectionSchema(fields=fields, description="test collection", enable_dynamic_field=True)
        collection_w = self.init_collection_wrap(name=cf.gen_unique_str(prefix), schema=schema)
        # insert data
        train_data, query_expr = cf.prepare_array_test_data(3000, hit_rate=0.05)
        collection_w.insert(train_data)
        index_params = {"metric_type": "L2", "index_type": "HNSW", "params": {"M": 48, "efConstruction": 500}}
        collection_w.create_index("emb", index_params=index_params)
        for f in ["contains", "contains_any", "contains_all", "equals", "array_length_field", "array_access"]:
            collection_w.create_index(f, {"index_type": "INVERTED"})
        collection_w.load()

        for item in query_expr:
            expr = item["expr"]
            ground_truth_candidate = item["ground_truth"]
            res, _ = collection_w.search(
                data=[np.array([random.random() for j in range(128)], dtype=np.dtype("float32"))],
                anns_field="emb",
                param={"metric_type": "L2", "params": {"M": 32, "efConstruction": 360}},
                limit=10,
                expr=expr,
                output_fields=["*"],
            )
            assert len(res) == 1
            for i in range(len(res)):
                assert len(res[i]) == 10
                for hit in res[i]:
                    assert hit.id in ground_truth_candidate
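The array test above takes its filter expressions from cf.prepare_array_test_data. As a reference for the kind of expressions that data exercises, here are a few hypothetical array filters in Milvus' documented expression syntax, using the field names from the schema above (the literal values are made up):

# Hypothetical array filter expressions over the schema defined above.
array_filter_examples = [
    "array_contains(contains, 1)",                  # array holds the element 1
    "array_contains_any(contains_any, [1, 2, 3])",  # holds at least one listed element
    "array_contains_all(contains_all, [1, 2, 3])",  # holds every listed element
    "array_length(array_length_field) == 10",       # array has exactly 10 elements
    "array_access[0] == 1",                         # element access by position
]
# Strings like these are passed as the `expr` argument of collection_w.search(...).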
@@ -0,0 +1,475 @@
import numpy as np
from pymilvus.orm.types import CONSISTENCY_STRONG, CONSISTENCY_BOUNDED, CONSISTENCY_SESSION, CONSISTENCY_EVENTUALLY
from pymilvus import AnnSearchRequest, RRFRanker, WeightedRanker
from pymilvus import (
    FieldSchema, CollectionSchema, DataType,
    Collection
)
from common.constants import *
from utils.util_pymilvus import *
from common.common_type import CaseLabel, CheckTasks
from common import common_type as ct
from common import common_func as cf
from utils.util_log import test_log as log
from base.client_base import TestcaseBase
import heapq
from time import sleep
from decimal import Decimal, getcontext
import decimal
import multiprocessing
import numbers
import random
import math
import numpy
import threading
import pytest
import pandas as pd
from faker import Faker

Faker.seed(19530)
fake_en = Faker("en_US")
fake_zh = Faker("zh_CN")

# patch faker to generate text with specific distribution
cf.patch_faker_text(fake_en, cf.en_vocabularies_distribution)
cf.patch_faker_text(fake_zh, cf.zh_vocabularies_distribution)

pd.set_option("expand_frame_repr", False)

prefix = "search_collection"
search_num = 10
max_dim = ct.max_dim
min_dim = ct.min_dim
epsilon = ct.epsilon
hybrid_search_epsilon = 0.01
gracefulTime = ct.gracefulTime
default_nb = ct.default_nb
default_nb_medium = ct.default_nb_medium
default_nq = ct.default_nq
default_dim = ct.default_dim
default_limit = ct.default_limit
max_limit = ct.max_limit
default_search_exp = "int64 >= 0"
default_search_string_exp = "varchar >= \"0\""
default_search_mix_exp = "int64 >= 0 && varchar >= \"0\""
default_invaild_string_exp = "varchar >= 0"
default_json_search_exp = "json_field[\"number\"] >= 0"
perfix_expr = 'varchar like "0%"'
default_search_field = ct.default_float_vec_field_name
default_search_params = ct.default_search_params
default_int64_field_name = ct.default_int64_field_name
default_float_field_name = ct.default_float_field_name
default_bool_field_name = ct.default_bool_field_name
default_string_field_name = ct.default_string_field_name
default_json_field_name = ct.default_json_field_name
default_index_params = ct.default_index
vectors = [[random.random() for _ in range(default_dim)] for _ in range(default_nq)]
range_search_supported_indexes = ct.all_index_types[:7]
uid = "test_search"
nq = 1
epsilon = 0.001
field_name = default_float_vec_field_name
binary_field_name = default_binary_vec_field_name
search_param = {"nprobe": 1}
entity = gen_entities(1, is_normal=True)
entities = gen_entities(default_nb, is_normal=True)
raw_vectors, binary_entities = gen_binary_entities(default_nb)
default_query, _ = gen_search_vectors_params(field_name, entities, default_top_k, nq)
index_name1 = cf.gen_unique_str("float")
index_name2 = cf.gen_unique_str("varhar")
half_nb = ct.default_nb // 2
max_hybrid_search_req_num = ct.max_hybrid_search_req_num

class TestSearchDiskann(TestcaseBase):
|
||||
"""
|
||||
******************************************************************
|
||||
The following cases are used to test search about diskann index
|
||||
******************************************************************
|
||||
"""
|
||||
|
||||
@pytest.fixture(scope="function", params=[32, 128])
|
||||
def dim(self, request):
|
||||
yield request.param
|
||||
|
||||
@pytest.fixture(scope="function", params=[False, True])
|
||||
def auto_id(self, request):
|
||||
yield request.param
|
||||
|
||||
@pytest.fixture(scope="function", params=[False, True])
|
||||
def _async(self, request):
|
||||
yield request.param
|
||||
|
||||
@pytest.fixture(scope="function", params=[True, False])
|
||||
def enable_dynamic_field(self, request):
|
||||
yield request.param
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L2)
|
||||
def test_search_with_diskann_index(self, _async):
|
||||
"""
|
||||
target: test delete after creating index
|
||||
method: 1.create collection , insert data, primary_field is int field
|
||||
2.create diskann index , then load
|
||||
3.search
|
||||
expected: search successfully
|
||||
"""
|
||||
# 1. initialize with data
|
||||
dim = 100
|
||||
auto_id = False
|
||||
enable_dynamic_field = True
|
||||
nb = 2000
|
||||
collection_w, _, _, insert_ids = self.init_collection_general(prefix, True, auto_id=auto_id,
|
||||
nb=nb, dim=dim, is_index=False,
|
||||
enable_dynamic_field=enable_dynamic_field)[0:4]
|
||||
|
||||
# 2. create index
|
||||
default_index = {"index_type": "DISKANN",
|
||||
"metric_type": "L2", "params": {}}
|
||||
collection_w.create_index(
|
||||
ct.default_float_vec_field_name, default_index)
|
||||
collection_w.load()
|
||||
|
||||
default_search_params = {
|
||||
"metric_type": "L2", "params": {"search_list": 30}}
|
||||
vectors = [[random.random() for _ in range(dim)]
|
||||
for _ in range(default_nq)]
|
||||
output_fields = [default_int64_field_name,
|
||||
default_float_field_name, default_string_field_name]
|
||||
collection_w.search(vectors[:default_nq], default_search_field,
|
||||
default_search_params, default_limit,
|
||||
default_search_exp,
|
||||
output_fields=output_fields,
|
||||
_async=_async,
|
||||
check_task=CheckTasks.check_search_results,
|
||||
check_items={"nq": default_nq,
|
||||
"ids": insert_ids,
|
||||
"limit": default_limit,
|
||||
"_async": _async}
|
||||
)
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L2)
|
||||
@pytest.mark.parametrize("search_list", [20, 200])
|
||||
def test_search_with_limit_20(self, _async, search_list):
|
||||
"""
|
||||
target: test delete after creating index
|
||||
method: 1.create collection , insert data, primary_field is int field
|
||||
2.create diskann index , then load
|
||||
3.search
|
||||
expected: search successfully
|
||||
"""
|
||||
limit = 20
|
||||
# 1. initialize with data
|
||||
enable_dynamic_field = True
|
||||
collection_w, _, _, insert_ids = self.init_collection_general(prefix, True, is_index=False,
|
||||
enable_dynamic_field=enable_dynamic_field)[0:4]
|
||||
|
||||
# 2. create index
|
||||
default_index = {"index_type": "DISKANN", "metric_type": "L2", "params": {}}
|
||||
collection_w.create_index(ct.default_float_vec_field_name, default_index)
|
||||
collection_w.load()
|
||||
|
||||
search_params = {"metric_type": "L2", "params": {"search_list": search_list}}
|
||||
output_fields = [default_int64_field_name, default_float_field_name, default_string_field_name]
|
||||
collection_w.search(vectors[:default_nq], default_search_field,
|
||||
search_params, limit, default_search_exp,
|
||||
output_fields=output_fields, _async=_async,
|
||||
check_task=CheckTasks.check_search_results,
|
||||
check_items={"nq": default_nq,
|
||||
"ids": insert_ids,
|
||||
"limit": limit,
|
||||
"_async": _async})
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L2)
|
||||
def test_search_invalid_params_with_diskann_B(self):
|
||||
"""
|
||||
target: test delete after creating index
|
||||
method: 1.create collection , insert data, primary_field is int field
|
||||
2.create diskann index
|
||||
3.search with invalid params, [k, 200] when k <= 20
|
||||
expected: search report an error
|
||||
"""
|
||||
# 1. initialize with data
|
||||
dim = 100
|
||||
limit = 20
|
||||
auto_id = True
|
||||
collection_w, _, _, insert_ids = \
|
||||
self.init_collection_general(prefix, True, auto_id=auto_id, dim=dim, is_index=False)[0:4]
|
||||
# 2. create index
|
||||
default_index = {"index_type": "DISKANN", "metric_type": "L2", "params": {}}
|
||||
collection_w.create_index(ct.default_float_vec_field_name, default_index)
|
||||
collection_w.load()
|
||||
default_search_params = {"metric_type": "L2", "params": {"search_list": limit-1}}
|
||||
vectors = [[random.random() for _ in range(dim)] for _ in range(default_nq)]
|
||||
output_fields = [default_int64_field_name, default_float_field_name, default_string_field_name]
|
||||
collection_w.search(vectors[:default_nq], default_search_field,
|
||||
default_search_params, limit,
|
||||
default_search_exp,
|
||||
output_fields=output_fields,
|
||||
check_task=CheckTasks.err_res,
|
||||
check_items={"err_code": 999,
|
||||
"err_msg": f"should be larger than k({limit})"})
|
||||
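# Note: as exercised above, DiskANN requires "search_list" to be no smaller than the
# requested limit (k); values below it are rejected with "should be larger than k(...)".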
|
||||
@pytest.mark.tags(CaseLabel.L2)
|
||||
def test_search_with_diskann_with_string_pk(self):
|
||||
"""
|
||||
target: test search with diskann index on a string primary key
|
||||
method: 1.create collection, insert data, primary_field is string field
|
||||
2.create diskann index
|
||||
3.search
|
||||
expected: search successfully
|
||||
"""
|
||||
# 1. initialize with data
|
||||
dim = 128
|
||||
enable_dynamic_field = True
|
||||
collection_w, _, _, insert_ids = \
|
||||
self.init_collection_general(prefix, True, auto_id=False, dim=dim, is_index=False,
|
||||
primary_field=ct.default_string_field_name,
|
||||
enable_dynamic_field=enable_dynamic_field)[0:4]
|
||||
# 2. create index
|
||||
default_index = {"index_type": "DISKANN",
|
||||
"metric_type": "L2", "params": {}}
|
||||
collection_w.create_index(
|
||||
ct.default_float_vec_field_name, default_index)
|
||||
collection_w.load()
|
||||
search_list = 20
|
||||
default_search_params = {"metric_type": "L2",
|
||||
"params": {"search_list": search_list}}
|
||||
vectors = [[random.random() for _ in range(dim)]
|
||||
for _ in range(default_nq)]
|
||||
output_fields = [default_int64_field_name,
|
||||
default_float_field_name, default_string_field_name]
|
||||
collection_w.search(vectors[:default_nq], default_search_field,
|
||||
default_search_params, default_limit,
|
||||
default_search_exp,
|
||||
output_fields=output_fields,
|
||||
check_task=CheckTasks.check_search_results,
|
||||
check_items={"nq": default_nq,
|
||||
"ids": insert_ids,
|
||||
"limit": default_limit}
|
||||
)
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L2)
|
||||
def test_search_with_delete_data(self, _async):
|
||||
"""
|
||||
target: test delete after creating index
|
||||
method: 1.create collection , insert data,
|
||||
2.create diskann index
|
||||
3.delete data, then search
|
||||
expected: deleted ids not returned in search results
|
||||
"""
|
||||
# 1. initialize with data
|
||||
dim = 100
|
||||
auto_id = True
|
||||
enable_dynamic_field = True
|
||||
collection_w, _, _, ids = \
|
||||
self.init_collection_general(prefix, True, auto_id=auto_id, dim=dim, is_index=False,
|
||||
enable_dynamic_field=enable_dynamic_field)[0:4]
|
||||
# 2. create index
|
||||
default_index = {"index_type": "DISKANN",
|
||||
"metric_type": "L2", "params": {}}
|
||||
collection_w.create_index(
|
||||
ct.default_float_vec_field_name, default_index)
|
||||
collection_w.load()
|
||||
tmp_expr = f'{ct.default_int64_field_name} in {[0]}'
|
||||
|
||||
expr = f'{ct.default_int64_field_name} in {ids[:half_nb]}'
|
||||
|
||||
# delete half of data
|
||||
del_res = collection_w.delete(expr)[0]
|
||||
assert del_res.delete_count == half_nb
|
||||
|
||||
collection_w.delete(tmp_expr)
|
||||
default_search_params = {
|
||||
"metric_type": "L2", "params": {"search_list": 30}}
|
||||
vectors = [[random.random() for _ in range(dim)]
|
||||
for _ in range(default_nq)]
|
||||
output_fields = [default_int64_field_name,
|
||||
default_float_field_name, default_string_field_name]
|
||||
collection_w.search(vectors[:default_nq], default_search_field,
|
||||
default_search_params, default_limit,
|
||||
default_search_exp,
|
||||
output_fields=output_fields,
|
||||
_async=_async,
|
||||
check_task=CheckTasks.check_search_results,
|
||||
check_items={"nq": default_nq,
|
||||
"ids": ids,
|
||||
"limit": default_limit,
|
||||
"_async": _async}
|
||||
)
|
||||
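# --- illustrative sketch (not part of this test suite) ---
# The delete-then-search pattern above in isolation: delete rows by a primary-key
# expression and check the reported count. "demo_collection" is an assumed name.
def _delete_by_pk_sketch(ids_to_delete):
    from pymilvus import Collection
    coll = Collection("demo_collection")
    res = coll.delete(f"int64 in {list(ids_to_delete)}")
    assert res.delete_count == len(ids_to_delete)
    return res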
|
||||
@pytest.mark.tags(CaseLabel.L2)
|
||||
def test_search_with_diskann_and_more_index(self, _async):
|
||||
"""
|
||||
target: test search with diskann index plus extra scalar indexes
|
||||
method: 1.create collection, insert data
|
||||
2.create more indexes, then load
|
||||
3.delete half of the data, then search
|
||||
expected: deleted ids not returned in search results
|
||||
"""
|
||||
# 1. initialize with data
|
||||
dim = 64
|
||||
auto_id = False
|
||||
enable_dynamic_field = True
|
||||
collection_w, _, _, ids = \
|
||||
self.init_collection_general(prefix, True, auto_id=auto_id, dim=dim, is_index=False,
|
||||
enable_dynamic_field=enable_dynamic_field, language="French")[0:4]
|
||||
# 2. create index
|
||||
default_index = {"index_type": "DISKANN",
|
||||
"metric_type": "COSINE", "params": {}}
|
||||
collection_w.create_index(ct.default_float_vec_field_name, default_index, index_name=index_name1)
|
||||
if not enable_dynamic_field:
|
||||
index_params_one = {}
|
||||
collection_w.create_index("float", index_params_one, index_name="a")
|
||||
index_param_two = {}
|
||||
collection_w.create_index("varchar", index_param_two, index_name="b")
|
||||
|
||||
collection_w.load()
|
||||
tmp_expr = f'{ct.default_int64_field_name} in {[0]}'
|
||||
|
||||
expr = f'{ct.default_int64_field_name} in {ids[:half_nb]}'
|
||||
|
||||
# delete half of data
|
||||
del_res = collection_w.delete(expr)[0]
|
||||
assert del_res.delete_count == half_nb
|
||||
|
||||
collection_w.delete(tmp_expr)
|
||||
default_search_params = {"metric_type": "COSINE", "params": {"search_list": 30}}
|
||||
vectors = [[random.random() for _ in range(dim)] for _ in range(default_nq)]
|
||||
output_fields = [default_int64_field_name, default_float_field_name, default_string_field_name]
|
||||
collection_w.search(vectors[:default_nq], default_search_field,
|
||||
default_search_params, default_limit,
|
||||
default_search_exp,
|
||||
output_fields=output_fields,
|
||||
_async=_async,
|
||||
check_task=CheckTasks.check_search_results,
|
||||
check_items={"nq": default_nq,
|
||||
"ids": ids,
|
||||
"limit": default_limit,
|
||||
"_async": _async})
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L1)
|
||||
def test_search_with_scalar_field(self, _async):
|
||||
"""
|
||||
target: test search with scalar field
|
||||
method: 1.create collection , insert data
|
||||
2.create more indexes, then load
|
||||
3.search with expr
|
||||
expected: search successfully with the scalar filter
|
||||
"""
|
||||
# 1. initialize with data
|
||||
dim = 66
|
||||
enable_dynamic_field = True
|
||||
collection_w, _, _, ids = \
|
||||
self.init_collection_general(prefix, True, dim=dim, primary_field=ct.default_string_field_name,
|
||||
is_index=False, enable_dynamic_field=enable_dynamic_field)[0:4]
|
||||
# 2. create index
|
||||
default_index = {"index_type": "IVF_SQ8",
|
||||
"metric_type": "COSINE", "params": {"nlist": 64}}
|
||||
collection_w.create_index(
|
||||
ct.default_float_vec_field_name, default_index)
|
||||
index_params = {}
|
||||
if not enable_dynamic_field:
|
||||
collection_w.create_index(
|
||||
ct.default_float_field_name, index_params=index_params)
|
||||
collection_w.create_index(
|
||||
ct.default_int64_field_name, index_params=index_params)
|
||||
else:
|
||||
collection_w.create_index(
|
||||
ct.default_string_field_name, index_params=index_params)
|
||||
collection_w.load()
|
||||
default_expr = "int64 in [1, 2, 3, 4]"
|
||||
limit = 4
|
||||
default_search_params = {"metric_type": "COSINE", "params": {"nprobe": 64}}
|
||||
vectors = [[random.random() for _ in range(dim)] for _ in range(default_nq)]
|
||||
output_fields = [default_int64_field_name,
|
||||
default_float_field_name, default_string_field_name]
|
||||
search_res = collection_w.search(vectors[:default_nq], default_search_field,
|
||||
default_search_params, limit, default_expr,
|
||||
output_fields=output_fields, _async=_async,
|
||||
check_task=CheckTasks.check_search_results,
|
||||
check_items={"nq": default_nq,
|
||||
"ids": ids,
|
||||
"limit": limit,
|
||||
"_async": _async})
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L2)
|
||||
@pytest.mark.parametrize("limit", [10, 100, 1000])
|
||||
def test_search_diskann_search_list_equal_to_limit(self, limit, _async):
|
||||
"""
|
||||
target: test search diskann index when search_list equal to limit
|
||||
method: 1.create collection , insert data, primary_field is int field
|
||||
2.create diskann index , then load
|
||||
3.search
|
||||
expected: search successfully
|
||||
"""
|
||||
# 1. initialize with data
|
||||
dim = 77
|
||||
auto_id = False
|
||||
enable_dynamic_field = False
|
||||
collection_w, _, _, insert_ids = self.init_collection_general(prefix, True, auto_id=auto_id,
|
||||
dim=dim, is_index=False,
|
||||
enable_dynamic_field=enable_dynamic_field)[0:4]
|
||||
|
||||
# 2. create index
|
||||
default_index = {"index_type": "DISKANN",
|
||||
"metric_type": "L2", "params": {}}
|
||||
collection_w.create_index(
|
||||
ct.default_float_vec_field_name, default_index)
|
||||
collection_w.load()
|
||||
|
||||
search_params = {"metric_type": "L2", "params": {"search_list": limit}}
|
||||
vectors = [[random.random() for _ in range(dim)]
|
||||
for _ in range(default_nq)]
|
||||
output_fields = [default_int64_field_name,
|
||||
default_float_field_name, default_string_field_name]
|
||||
collection_w.search(vectors[:default_nq], default_search_field,
|
||||
search_params, limit,
|
||||
default_search_exp,
|
||||
output_fields=output_fields,
|
||||
_async=_async,
|
||||
check_task=CheckTasks.check_search_results,
|
||||
check_items={"nq": default_nq,
|
||||
"ids": insert_ids,
|
||||
"limit": limit,
|
||||
"_async": _async}
|
||||
)
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L2)
|
||||
@pytest.mark.xfail(reason="issue #23672")
|
||||
def test_search_diskann_search_list_up_to_min(self, _async):
|
||||
"""
|
||||
target: test search diskann index when search_list up to min
|
||||
method: 1.create collection , insert data, primary_field is int field
|
||||
2.create diskann index , then load
|
||||
3.search
|
||||
expected: search successfully
|
||||
"""
|
||||
# 1. initialize with data
|
||||
dim = 100
|
||||
auto_id = True
|
||||
collection_w, _, _, insert_ids = self.init_collection_general(prefix, True, auto_id=auto_id,
|
||||
dim=dim, is_index=False)[0:4]
|
||||
|
||||
# 2. create index
|
||||
default_index = {"index_type": "DISKANN",
|
||||
"metric_type": "L2", "params": {}}
|
||||
collection_w.create_index(
|
||||
ct.default_float_vec_field_name, default_index)
|
||||
collection_w.load()
|
||||
|
||||
search_params = {"metric_type": "L2",
|
||||
"params": {"k": 200, "search_list": 201}}
|
||||
search_vectors = [[random.random() for _ in range(dim)]
|
||||
for _ in range(default_nq)]
|
||||
output_fields = [default_int64_field_name,
|
||||
default_float_field_name, default_string_field_name]
|
||||
collection_w.search(search_vectors[:default_nq], default_search_field,
|
||||
search_params, default_limit,
|
||||
default_search_exp,
|
||||
output_fields=output_fields,
|
||||
_async=_async,
|
||||
check_task=CheckTasks.check_search_results,
|
||||
check_items={"nq": default_nq,
|
||||
"ids": insert_ids,
|
||||
"limit": default_limit,
|
||||
"_async": _async})
|
|
@ -0,0 +1,102 @@
|
|||
import numpy as np
|
||||
from pymilvus.orm.types import CONSISTENCY_STRONG, CONSISTENCY_BOUNDED, CONSISTENCY_SESSION, CONSISTENCY_EVENTUALLY
|
||||
from pymilvus import AnnSearchRequest, RRFRanker, WeightedRanker
|
||||
from pymilvus import (
|
||||
FieldSchema, CollectionSchema, DataType,
|
||||
Collection
|
||||
)
|
||||
from common.constants import *
|
||||
from utils.util_pymilvus import *
|
||||
from common.common_type import CaseLabel, CheckTasks
|
||||
from common import common_type as ct
|
||||
from common import common_func as cf
|
||||
from utils.util_log import test_log as log
|
||||
from base.client_base import TestcaseBase
|
||||
import heapq
|
||||
from time import sleep
|
||||
from decimal import Decimal, getcontext
|
||||
import decimal
|
||||
import multiprocessing
|
||||
import numbers
|
||||
import random
|
||||
import math
|
||||
import numpy
|
||||
import threading
|
||||
import pytest
|
||||
import pandas as pd
|
||||
from faker import Faker
|
||||
|
||||
Faker.seed(19530)
|
||||
fake_en = Faker("en_US")
|
||||
fake_zh = Faker("zh_CN")
|
||||
|
||||
# patch faker to generate text with specific distribution
|
||||
cf.patch_faker_text(fake_en, cf.en_vocabularies_distribution)
|
||||
cf.patch_faker_text(fake_zh, cf.zh_vocabularies_distribution)
|
||||
|
||||
pd.set_option("expand_frame_repr", False)
|
||||
|
||||
prefix = "search_collection"
|
||||
search_num = 10
|
||||
max_dim = ct.max_dim
|
||||
min_dim = ct.min_dim
|
||||
epsilon = ct.epsilon
|
||||
hybrid_search_epsilon = 0.01
|
||||
gracefulTime = ct.gracefulTime
|
||||
default_nb = ct.default_nb
|
||||
default_nb_medium = ct.default_nb_medium
|
||||
default_nq = ct.default_nq
|
||||
default_dim = ct.default_dim
|
||||
default_limit = ct.default_limit
|
||||
max_limit = ct.max_limit
|
||||
default_search_exp = "int64 >= 0"
|
||||
default_search_string_exp = "varchar >= \"0\""
|
||||
default_search_mix_exp = "int64 >= 0 && varchar >= \"0\""
|
||||
default_invaild_string_exp = "varchar >= 0"
|
||||
default_json_search_exp = "json_field[\"number\"] >= 0"
|
||||
perfix_expr = 'varchar like "0%"'
|
||||
default_search_field = ct.default_float_vec_field_name
|
||||
default_search_params = ct.default_search_params
|
||||
default_int64_field_name = ct.default_int64_field_name
|
||||
default_float_field_name = ct.default_float_field_name
|
||||
default_bool_field_name = ct.default_bool_field_name
|
||||
default_string_field_name = ct.default_string_field_name
|
||||
default_json_field_name = ct.default_json_field_name
|
||||
default_index_params = ct.default_index
|
||||
vectors = [[random.random() for _ in range(default_dim)] for _ in range(default_nq)]
|
||||
range_search_supported_indexes = ct.all_index_types[:7]
|
||||
uid = "test_search"
|
||||
nq = 1
|
||||
epsilon = 0.001
|
||||
field_name = default_float_vec_field_name
|
||||
binary_field_name = default_binary_vec_field_name
|
||||
search_param = {"nprobe": 1}
|
||||
entity = gen_entities(1, is_normal=True)
|
||||
entities = gen_entities(default_nb, is_normal=True)
|
||||
raw_vectors, binary_entities = gen_binary_entities(default_nb)
|
||||
default_query, _ = gen_search_vectors_params(field_name, entities, default_top_k, nq)
|
||||
index_name1 = cf.gen_unique_str("float")
|
||||
index_name2 = cf.gen_unique_str("varhar")
|
||||
half_nb = ct.default_nb // 2
|
||||
max_hybrid_search_req_num = ct.max_hybrid_search_req_num
|
||||
|
||||
|
||||
class TestSearchDSL(TestcaseBase):
|
||||
@pytest.mark.tags(CaseLabel.L0)
|
||||
def test_search_vector_only(self):
|
||||
"""
|
||||
target: test search normal scenario
|
||||
method: search vector only
|
||||
expected: search status ok, the length of the result equals top_k
|
||||
"""
|
||||
collection_w, _, _, insert_ids, time_stamp = \
|
||||
self.init_collection_general(prefix, True, ct.default_nb)[0:5]
|
||||
vectors = [[random.random() for _ in range(ct.default_dim)]
|
||||
for _ in range(nq)]
|
||||
collection_w.search(vectors[:nq], default_search_field,
|
||||
default_search_params, ct.default_top_k,
|
||||
default_search_exp,
|
||||
check_task=CheckTasks.check_search_results,
|
||||
check_items={"nq": nq,
|
||||
"ids": insert_ids,
|
||||
"limit": ct.default_top_k})
|
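# Note: the "vector only" case above passes only the permissive default expression,
# so the check simply expects top_k hits for the single query vector.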
|
@ -0,0 +1,357 @@
|
|||
import numpy as np
|
||||
from pymilvus.orm.types import CONSISTENCY_STRONG, CONSISTENCY_BOUNDED, CONSISTENCY_SESSION, CONSISTENCY_EVENTUALLY
|
||||
from pymilvus import AnnSearchRequest, RRFRanker, WeightedRanker
|
||||
from pymilvus import (
|
||||
FieldSchema, CollectionSchema, DataType,
|
||||
Collection
|
||||
)
|
||||
from common.constants import *
|
||||
from utils.util_pymilvus import *
|
||||
from common.common_type import CaseLabel, CheckTasks
|
||||
from common import common_type as ct
|
||||
from common import common_func as cf
|
||||
from utils.util_log import test_log as log
|
||||
from base.client_base import TestcaseBase
|
||||
import heapq
|
||||
from time import sleep
|
||||
from decimal import Decimal, getcontext
|
||||
import decimal
|
||||
import multiprocessing
|
||||
import numbers
|
||||
import random
|
||||
import math
|
||||
import numpy
|
||||
import threading
|
||||
import pytest
|
||||
import pandas as pd
|
||||
from faker import Faker
|
||||
|
||||
Faker.seed(19530)
|
||||
fake_en = Faker("en_US")
|
||||
fake_zh = Faker("zh_CN")
|
||||
|
||||
# patch faker to generate text with specific distribution
|
||||
cf.patch_faker_text(fake_en, cf.en_vocabularies_distribution)
|
||||
cf.patch_faker_text(fake_zh, cf.zh_vocabularies_distribution)
|
||||
|
||||
pd.set_option("expand_frame_repr", False)
|
||||
|
||||
prefix = "search_collection"
|
||||
search_num = 10
|
||||
max_dim = ct.max_dim
|
||||
min_dim = ct.min_dim
|
||||
epsilon = ct.epsilon
|
||||
hybrid_search_epsilon = 0.01
|
||||
gracefulTime = ct.gracefulTime
|
||||
default_nb = ct.default_nb
|
||||
default_nb_medium = ct.default_nb_medium
|
||||
default_nq = ct.default_nq
|
||||
default_dim = ct.default_dim
|
||||
default_limit = ct.default_limit
|
||||
max_limit = ct.max_limit
|
||||
default_search_exp = "int64 >= 0"
|
||||
default_search_string_exp = "varchar >= \"0\""
|
||||
default_search_mix_exp = "int64 >= 0 && varchar >= \"0\""
|
||||
default_invaild_string_exp = "varchar >= 0"
|
||||
default_json_search_exp = "json_field[\"number\"] >= 0"
|
||||
perfix_expr = 'varchar like "0%"'
|
||||
default_search_field = ct.default_float_vec_field_name
|
||||
default_search_params = ct.default_search_params
|
||||
default_int64_field_name = ct.default_int64_field_name
|
||||
default_float_field_name = ct.default_float_field_name
|
||||
default_bool_field_name = ct.default_bool_field_name
|
||||
default_string_field_name = ct.default_string_field_name
|
||||
default_json_field_name = ct.default_json_field_name
|
||||
default_index_params = ct.default_index
|
||||
vectors = [[random.random() for _ in range(default_dim)] for _ in range(default_nq)]
|
||||
range_search_supported_indexes = ct.all_index_types[:7]
|
||||
uid = "test_search"
|
||||
nq = 1
|
||||
epsilon = 0.001
|
||||
field_name = default_float_vec_field_name
|
||||
binary_field_name = default_binary_vec_field_name
|
||||
search_param = {"nprobe": 1}
|
||||
entity = gen_entities(1, is_normal=True)
|
||||
entities = gen_entities(default_nb, is_normal=True)
|
||||
raw_vectors, binary_entities = gen_binary_entities(default_nb)
|
||||
default_query, _ = gen_search_vectors_params(field_name, entities, default_top_k, nq)
|
||||
index_name1 = cf.gen_unique_str("float")
|
||||
index_name2 = cf.gen_unique_str("varhar")
|
||||
half_nb = ct.default_nb // 2
|
||||
max_hybrid_search_req_num = ct.max_hybrid_search_req_num
|
||||
|
||||
|
||||
class TestSearchGroupBy(TestcaseBase):
|
||||
""" Test case of search group by """
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L2)
|
||||
def test_search_max_group_size_and_max_limit(self):
|
||||
"""
|
||||
target: test search group by with max group size and max limit
|
||||
method: 1. create a collection with data
|
||||
2. search with group by int32 with max group size and max limit
|
||||
|
||||
"""
|
||||
pass
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L2)
|
||||
@pytest.mark.parametrize("group_size", [0, -1])
|
||||
@pytest.mark.xfail(reason="issue #36146")
|
||||
def test_search_negative_group_size(self, group_size):
|
||||
"""
|
||||
target: test search group by with negative group size
|
||||
"""
|
||||
collection_w = self.init_collection_general(prefix, auto_id=True, insert_data=True, is_index=True)[0]
|
||||
search_params = ct.default_search_params
|
||||
search_vectors = cf.gen_vectors(1, dim=ct.default_dim)
|
||||
# verify
|
||||
error = {ct.err_code: 999, ct.err_msg: "group_size must be greater than 1"}
|
||||
collection_w.search(data=search_vectors, anns_field=ct.default_float_vec_field_name,
|
||||
param=search_params, limit=10,
|
||||
group_by_field=ct.default_int64_field_name,
|
||||
group_size=group_size,
|
||||
check_task=CheckTasks.err_res, check_items=error)
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L2)
|
||||
@pytest.mark.parametrize("metric", ["JACCARD", "HAMMING"])
|
||||
def test_search_binary_vec_group_by(self, metric):
|
||||
"""
|
||||
target: test that search on binary vectors does not support group by
|
||||
method: 1. create a collection with binary vectors
|
||||
2. create index with different metric types
|
||||
3. search with group by
|
||||
verify: the error code and msg
|
||||
"""
|
||||
collection_w = self.init_collection_general(prefix, auto_id=True, insert_data=False, is_index=False,
|
||||
is_binary=True)[0]
|
||||
_index = {"index_type": "BIN_FLAT", "metric_type": metric, "params": {"M": 16, "efConstruction": 128}}
|
||||
collection_w.create_index(ct.default_binary_vec_field_name, index_params=_index)
|
||||
# insert with the same values for scalar fields
|
||||
for _ in range(10):
|
||||
data = cf.gen_default_binary_dataframe_data(nb=100, auto_id=True)[0]
|
||||
collection_w.insert(data)
|
||||
|
||||
collection_w.flush()
|
||||
collection_w.create_index(ct.default_binary_vec_field_name, index_params=_index)
|
||||
collection_w.load()
|
||||
|
||||
search_params = {"metric_type": metric, "params": {"ef": 128}}
|
||||
nq = 2
|
||||
limit = 10
|
||||
search_vectors = cf.gen_binary_vectors(nq, dim=ct.default_dim)[1]
|
||||
|
||||
# verify that group by on a binary vector field reports an error
|
||||
err_code = 999
|
||||
err_msg = "not support search_group_by operation based on binary"
|
||||
collection_w.search(data=search_vectors, anns_field=ct.default_binary_vec_field_name,
|
||||
param=search_params, limit=limit,
|
||||
group_by_field=ct.default_int64_field_name,
|
||||
check_task=CheckTasks.err_res,
|
||||
check_items={"err_code": err_code, "err_msg": err_msg})
|
||||
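# Note: group_by is rejected for binary vector searches regardless of metric
# (JACCARD/HAMMING above); the server reports error code 999 with
# "not support search_group_by operation based on binary".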
|
||||
@pytest.mark.tags(CaseLabel.L1)
|
||||
@pytest.mark.parametrize("grpby_unsupported_field", [ct.default_float_field_name, ct.default_json_field_name,
|
||||
ct.default_double_field_name, ct.default_float_vec_field_name])
|
||||
def test_search_group_by_unsupported_field(self, grpby_unsupported_field):
|
||||
"""
|
||||
target: test search group by with the unsupported field
|
||||
method: 1. create a collection with data
|
||||
2. create index
|
||||
3. search with group by the unsupported fields
|
||||
verify: the error code and msg
|
||||
"""
|
||||
metric = "IP"
|
||||
collection_w = self.init_collection_general(prefix, insert_data=True, is_index=False,
|
||||
is_all_data_type=True, with_json=True, )[0]
|
||||
_index = {"index_type": "HNSW", "metric_type": metric, "params": {"M": 16, "efConstruction": 128}}
|
||||
collection_w.create_index(ct.default_float_vec_field_name, index_params=_index)
|
||||
collection_w.load()
|
||||
|
||||
search_params = {"metric_type": metric, "params": {"ef": 64}}
|
||||
nq = 1
|
||||
limit = 1
|
||||
search_vectors = cf.gen_vectors(nq, dim=ct.default_dim)
|
||||
|
||||
# search with groupby
|
||||
err_code = 999
|
||||
err_msg = f"unsupported data type"
|
||||
collection_w.search(data=search_vectors, anns_field=ct.default_float_vec_field_name,
|
||||
param=search_params, limit=limit,
|
||||
group_by_field=grpby_unsupported_field,
|
||||
check_task=CheckTasks.err_res,
|
||||
check_items={"err_code": err_code, "err_msg": err_msg})
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L2)
|
||||
@pytest.mark.parametrize("index", ct.all_index_types[:7])
|
||||
def test_search_group_by_unsupported_index(self, index):
|
||||
"""
|
||||
target: test search group by with the unsupported vector index
|
||||
method: 1. create a collection with data
|
||||
2. create a groupby unsupported index
|
||||
3. search with group by
|
||||
verify: the error code and msg
|
||||
"""
|
||||
if index in ["HNSW", "IVF_FLAT", "FLAT", "IVF_SQ8", "DISKANN", "SCANN"]:
|
||||
pass # these index types already support group by, nothing to verify here
|
||||
else:
|
||||
metric = "L2"
|
||||
collection_w = self.init_collection_general(prefix, insert_data=True, is_index=False,
|
||||
is_all_data_type=True, with_json=False)[0]
|
||||
params = cf.get_index_params_params(index)
|
||||
index_params = {"index_type": index, "params": params, "metric_type": metric}
|
||||
collection_w.create_index(ct.default_float_vec_field_name, index_params)
|
||||
collection_w.load()
|
||||
|
||||
search_params = {"params": {}}
|
||||
nq = 1
|
||||
limit = 1
|
||||
search_vectors = cf.gen_vectors(nq, dim=ct.default_dim)
|
||||
|
||||
# search with groupby
|
||||
err_code = 999
|
||||
err_msg = f"current index:{index} doesn't support"
|
||||
collection_w.search(data=search_vectors, anns_field=ct.default_float_vec_field_name,
|
||||
param=search_params, limit=limit,
|
||||
group_by_field=ct.default_int8_field_name,
|
||||
check_task=CheckTasks.err_res,
|
||||
check_items={"err_code": err_code, "err_msg": err_msg})
|
||||
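# Note: the skip list above treats FLAT, IVF_FLAT, IVF_SQ8, HNSW, SCANN and DISKANN
# as group-by capable; any other index type is expected to fail with
# "current index:<type> doesn't support".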
|
||||
@pytest.mark.tags(CaseLabel.L2)
|
||||
def test_search_group_by_multi_fields(self):
|
||||
"""
|
||||
target: test search group by with the multi fields
|
||||
method: 1. create a collection with data
|
||||
2. create index
|
||||
3. search with group by the multi fields
|
||||
verify: the error code and msg
|
||||
"""
|
||||
metric = "IP"
|
||||
collection_w = self.init_collection_general(prefix, insert_data=False, is_index=False,
|
||||
is_all_data_type=True, with_json=True, )[0]
|
||||
_index = {"index_type": "HNSW", "metric_type": metric, "params": {"M": 16, "efConstruction": 128}}
|
||||
collection_w.create_index(ct.default_float_vec_field_name, index_params=_index)
|
||||
collection_w.load()
|
||||
|
||||
search_params = {"metric_type": metric, "params": {"ef": 128}}
|
||||
nq = 1
|
||||
limit = 1
|
||||
search_vectors = cf.gen_vectors(nq, dim=ct.default_dim)
|
||||
|
||||
# search with groupby
|
||||
err_code = 1700
|
||||
err_msg = f"groupBy field not found in schema"
|
||||
collection_w.search(data=search_vectors, anns_field=ct.default_float_vec_field_name,
|
||||
param=search_params, limit=limit,
|
||||
group_by_field=[ct.default_string_field_name, ct.default_int32_field_name],
|
||||
check_task=CheckTasks.err_res,
|
||||
check_items={"err_code": err_code, "err_msg": err_msg})
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L2)
|
||||
@pytest.mark.parametrize("grpby_nonexist_field", ["nonexit_field", 100])
|
||||
def test_search_group_by_nonexit_fields(self, grpby_nonexist_field):
|
||||
"""
|
||||
target: test search group by with the nonexisting field
|
||||
method: 1. create a collection with data
|
||||
2. create index
|
||||
3. search with group by the unsupported fields
|
||||
verify: the error code and msg
|
||||
"""
|
||||
metric = "IP"
|
||||
collection_w = self.init_collection_general(prefix, insert_data=False, is_index=False,
|
||||
is_all_data_type=True, with_json=True, )[0]
|
||||
_index = {"index_type": "HNSW", "metric_type": metric, "params": {"M": 16, "efConstruction": 128}}
|
||||
collection_w.create_index(ct.default_float_vec_field_name, index_params=_index)
|
||||
|
||||
vector_name_list = cf.extract_vector_field_name_list(collection_w)
|
||||
index_param = {"index_type": "FLAT", "metric_type": "COSINE", "params": {"nlist": 100}}
|
||||
for vector_name in vector_name_list:
|
||||
collection_w.create_index(vector_name, index_param)
|
||||
collection_w.load()
|
||||
|
||||
search_params = {"metric_type": metric, "params": {"ef": 128}}
|
||||
nq = 1
|
||||
limit = 1
|
||||
search_vectors = cf.gen_vectors(nq, dim=ct.default_dim)
|
||||
|
||||
# search with groupby
|
||||
err_code = 1700
|
||||
err_msg = f"groupBy field not found in schema: field not found[field={grpby_nonexist_field}]"
|
||||
collection_w.search(data=search_vectors, anns_field=ct.default_float_vec_field_name,
|
||||
param=search_params, limit=limit,
|
||||
group_by_field=grpby_nonexist_field,
|
||||
check_task=CheckTasks.err_res,
|
||||
check_items={"err_code": err_code, "err_msg": err_msg})
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L1)
|
||||
def test_search_iterator_not_support_group_by(self):
|
||||
"""
|
||||
target: test search iterator does not support group by
|
||||
method: 1. create a collection with data
|
||||
2. create index HNSW
|
||||
3. search iterator with group by
|
||||
4. search with filtering every value of group_by_field
|
||||
verify: error code and msg
|
||||
"""
|
||||
metric = "COSINE"
|
||||
collection_w = self.init_collection_general(prefix, auto_id=True, insert_data=False, is_index=False,
|
||||
is_all_data_type=True, with_json=False)[0]
|
||||
# insert with the same values for scalar fields
|
||||
for _ in range(10):
|
||||
data = cf.gen_dataframe_all_data_type(nb=100, auto_id=True, with_json=False)
|
||||
collection_w.insert(data)
|
||||
|
||||
collection_w.flush()
|
||||
_index = {"index_type": "HNSW", "metric_type": metric, "params": {"M": 16, "efConstruction": 128}}
|
||||
collection_w.create_index(ct.default_float_vec_field_name, index_params=_index)
|
||||
collection_w.load()
|
||||
|
||||
grpby_field = ct.default_int32_field_name
|
||||
search_vectors = cf.gen_vectors(1, dim=ct.default_dim)
|
||||
search_params = {"metric_type": metric}
|
||||
batch_size = 10
|
||||
|
||||
err_code = 1100
|
||||
err_msg = "Not allowed to do groupBy when doing iteration"
|
||||
collection_w.search_iterator(search_vectors, ct.default_float_vec_field_name,
|
||||
search_params, batch_size, group_by_field=grpby_field,
|
||||
output_fields=[grpby_field],
|
||||
check_task=CheckTasks.err_res,
|
||||
check_items={"err_code": err_code, "err_msg": err_msg})
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L2)
|
||||
def test_range_search_not_support_group_by(self):
|
||||
"""
|
||||
target: test range search does not support group by
|
||||
method: 1. create a collection with data
|
||||
2. create index hnsw
|
||||
3. range search with group by
|
||||
verify: the error code and msg
|
||||
"""
|
||||
metric = "COSINE"
|
||||
collection_w = self.init_collection_general(prefix, auto_id=True, insert_data=False, is_index=False,
|
||||
is_all_data_type=True, with_json=False)[0]
|
||||
_index = {"index_type": "HNSW", "metric_type": metric, "params": {"M": 16, "efConstruction": 128}}
|
||||
collection_w.create_index(ct.default_float_vec_field_name, index_params=_index)
|
||||
# insert with the same values for scalar fields
|
||||
for _ in range(10):
|
||||
data = cf.gen_dataframe_all_data_type(nb=100, auto_id=True, with_json=False)
|
||||
collection_w.insert(data)
|
||||
|
||||
collection_w.flush()
|
||||
collection_w.create_index(ct.default_float_vec_field_name, index_params=_index)
|
||||
collection_w.load()
|
||||
|
||||
nq = 1
|
||||
limit = 5
|
||||
search_vectors = cf.gen_vectors(nq, dim=ct.default_dim)
|
||||
grpby_field = ct.default_int32_field_name
|
||||
range_search_params = {"metric_type": "COSINE", "params": {"radius": 0.1,
|
||||
"range_filter": 0.5}}
|
||||
err_code = 1100
|
||||
err_msg = f"Not allowed to do range-search"
|
||||
collection_w.search(search_vectors, ct.default_float_vec_field_name,
|
||||
range_search_params, limit,
|
||||
default_search_exp, group_by_field=grpby_field,
|
||||
output_fields=[grpby_field],
|
||||
check_task=CheckTasks.err_res,
|
||||
check_items={"err_code": err_code, "err_msg": err_msg})
|
File diff suppressed because it is too large
|
@ -0,0 +1,213 @@
|
|||
import numpy as np
|
||||
from pymilvus.orm.types import CONSISTENCY_STRONG, CONSISTENCY_BOUNDED, CONSISTENCY_SESSION, CONSISTENCY_EVENTUALLY
|
||||
from pymilvus import AnnSearchRequest, RRFRanker, WeightedRanker
|
||||
from pymilvus import (
|
||||
FieldSchema, CollectionSchema, DataType,
|
||||
Collection
|
||||
)
|
||||
from common.constants import *
|
||||
from utils.util_pymilvus import *
|
||||
from common.common_type import CaseLabel, CheckTasks
|
||||
from common import common_type as ct
|
||||
from common import common_func as cf
|
||||
from utils.util_log import test_log as log
|
||||
from base.client_base import TestcaseBase
|
||||
import heapq
|
||||
from time import sleep
|
||||
from decimal import Decimal, getcontext
|
||||
import decimal
|
||||
import multiprocessing
|
||||
import numbers
|
||||
import random
|
||||
import math
|
||||
import numpy
|
||||
import threading
|
||||
import pytest
|
||||
import pandas as pd
|
||||
from faker import Faker
|
||||
|
||||
Faker.seed(19530)
|
||||
fake_en = Faker("en_US")
|
||||
fake_zh = Faker("zh_CN")
|
||||
|
||||
# patch faker to generate text with specific distribution
|
||||
cf.patch_faker_text(fake_en, cf.en_vocabularies_distribution)
|
||||
cf.patch_faker_text(fake_zh, cf.zh_vocabularies_distribution)
|
||||
|
||||
pd.set_option("expand_frame_repr", False)
|
||||
|
||||
prefix = "search_collection"
|
||||
search_num = 10
|
||||
max_dim = ct.max_dim
|
||||
min_dim = ct.min_dim
|
||||
epsilon = ct.epsilon
|
||||
hybrid_search_epsilon = 0.01
|
||||
gracefulTime = ct.gracefulTime
|
||||
default_nb = ct.default_nb
|
||||
default_nb_medium = ct.default_nb_medium
|
||||
default_nq = ct.default_nq
|
||||
default_dim = ct.default_dim
|
||||
default_limit = ct.default_limit
|
||||
max_limit = ct.max_limit
|
||||
default_search_exp = "int64 >= 0"
|
||||
default_search_string_exp = "varchar >= \"0\""
|
||||
default_search_mix_exp = "int64 >= 0 && varchar >= \"0\""
|
||||
default_invaild_string_exp = "varchar >= 0"
|
||||
default_json_search_exp = "json_field[\"number\"] >= 0"
|
||||
perfix_expr = 'varchar like "0%"'
|
||||
default_search_field = ct.default_float_vec_field_name
|
||||
default_search_params = ct.default_search_params
|
||||
default_int64_field_name = ct.default_int64_field_name
|
||||
default_float_field_name = ct.default_float_field_name
|
||||
default_bool_field_name = ct.default_bool_field_name
|
||||
default_string_field_name = ct.default_string_field_name
|
||||
default_json_field_name = ct.default_json_field_name
|
||||
default_index_params = ct.default_index
|
||||
vectors = [[random.random() for _ in range(default_dim)] for _ in range(default_nq)]
|
||||
range_search_supported_indexes = ct.all_index_types[:7]
|
||||
uid = "test_search"
|
||||
nq = 1
|
||||
epsilon = 0.001
|
||||
field_name = default_float_vec_field_name
|
||||
binary_field_name = default_binary_vec_field_name
|
||||
search_param = {"nprobe": 1}
|
||||
entity = gen_entities(1, is_normal=True)
|
||||
entities = gen_entities(default_nb, is_normal=True)
|
||||
raw_vectors, binary_entities = gen_binary_entities(default_nb)
|
||||
default_query, _ = gen_search_vectors_params(field_name, entities, default_top_k, nq)
|
||||
index_name1 = cf.gen_unique_str("float")
|
||||
index_name2 = cf.gen_unique_str("varhar")
|
||||
half_nb = ct.default_nb // 2
|
||||
max_hybrid_search_req_num = ct.max_hybrid_search_req_num
|
||||
|
||||
|
||||
class TestSearchIterator(TestcaseBase):
|
||||
""" Test case of search iterator """
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L0)
|
||||
@pytest.mark.parametrize("metric_type", ct.float_metrics)
|
||||
@pytest.mark.parametrize("vector_data_type", ["FLOAT_VECTOR", "FLOAT16_VECTOR", "BFLOAT16_VECTOR"])
|
||||
def test_range_search_iterator_default(self, metric_type, vector_data_type):
|
||||
"""
|
||||
target: test iterator range search
|
||||
method: 1. search iterator
|
||||
2. check the result, expect pk not repeat and meet the range requirements
|
||||
expected: search successfully
|
||||
"""
|
||||
# 1. initialize with data
|
||||
batch_size = 100
|
||||
collection_w = self.init_collection_general(prefix, True, dim=default_dim, is_index=False,
|
||||
vector_data_type=vector_data_type)[0]
|
||||
collection_w.create_index(field_name, {"metric_type": metric_type})
|
||||
collection_w.load()
|
||||
search_vector = cf.gen_vectors(1, default_dim, vector_data_type)
|
||||
search_params = {"metric_type": metric_type}
|
||||
collection_w.search_iterator(search_vector, field_name, search_params, batch_size,
|
||||
check_task=CheckTasks.check_search_iterator,
|
||||
check_items={"metric_type": metric_type,
|
||||
"batch_size": batch_size})
|
||||
|
||||
limit = 200
|
||||
res = collection_w.search(search_vector, field_name, param=search_params, limit=200,
|
||||
check_task=CheckTasks.check_search_results,
|
||||
check_items={"nq": 1, "limit": limit})[0]
|
||||
# 2. search iterator
|
||||
if metric_type != "L2":
|
||||
radius = res[0][limit // 2].distance - 0.1  # pick a radius to make sure results exist
|
||||
range_filter = res[0][0].distance + 0.1
|
||||
search_params = {"metric_type": metric_type, "params": {"radius": radius, "range_filter": range_filter}}
|
||||
collection_w.search_iterator(search_vector, field_name, search_params, batch_size,
|
||||
check_task=CheckTasks.check_search_iterator,
|
||||
check_items={"metric_type": metric_type, "batch_size": batch_size,
|
||||
"radius": radius,
|
||||
"range_filter": range_filter})
|
||||
else:
|
||||
radius = res[0][limit // 2].distance + 0.1
|
||||
range_filter = res[0][0].distance - 0.1
|
||||
search_params = {"metric_type": metric_type, "params": {"radius": radius, "range_filter": range_filter}}
|
||||
collection_w.search_iterator(search_vector, field_name, search_params, batch_size,
|
||||
check_task=CheckTasks.check_search_iterator,
|
||||
check_items={"metric_type": metric_type, "batch_size": batch_size,
|
||||
"radius": radius,
|
||||
"range_filter": range_filter})
|
||||
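# --- illustrative sketch (not part of this test suite) ---
# Draining a search iterator batch by batch, as the checks above rely on.
# Collection/field names and the dim are assumptions for illustration.
def _drain_search_iterator_sketch(dim=128, batch_size=100):
    from pymilvus import Collection
    coll = Collection("demo_collection")  # assumed loaded, with an indexed float vector field
    it = coll.search_iterator(data=[[random.random() for _ in range(dim)]],
                              anns_field="float_vector",
                              param={"metric_type": "COSINE"}, batch_size=batch_size)
    pks = []
    while True:
        page = it.next()
        if len(page) == 0:
            it.close()
            break
        pks.extend(hit.id for hit in page)
    return pks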
|
||||
@pytest.mark.tags(CaseLabel.L1)
|
||||
def test_search_iterator_binary(self):
|
||||
"""
|
||||
target: test search iterator binary
|
||||
method: 1. search iterator
|
||||
2. check the result, expect pk not repeat
|
||||
expected: search successfully
|
||||
"""
|
||||
# 1. initialize with data
|
||||
batch_size = 200
|
||||
collection_w = self.init_collection_general(
|
||||
prefix, True, is_binary=True)[0]
|
||||
# 2. search iterator
|
||||
_, binary_vectors = cf.gen_binary_vectors(2, ct.default_dim)
|
||||
collection_w.search_iterator(binary_vectors[:1], binary_field_name,
|
||||
ct.default_search_binary_params, batch_size,
|
||||
check_task=CheckTasks.check_search_iterator,
|
||||
check_items={"batch_size": batch_size})
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L2)
|
||||
@pytest.mark.parametrize("metrics", ct.float_metrics)
|
||||
def test_search_iterator_with_expression(self, metrics):
|
||||
"""
|
||||
target: test search iterator with expression
|
||||
method: 1. search iterator
|
||||
2. check the result, expect pk not repeat and meet the expr requirements
|
||||
expected: search successfully
|
||||
"""
|
||||
# 1. initialize with data
|
||||
batch_size = 100
|
||||
dim = 128
|
||||
collection_w = self.init_collection_general(
|
||||
prefix, True, dim=dim, is_index=False)[0]
|
||||
collection_w.create_index(field_name, {"metric_type": metrics})
|
||||
collection_w.load()
|
||||
# 2. search iterator
|
||||
search_params = {"metric_type": metrics}
|
||||
expression = "1000.0 <= float < 2000.0"
|
||||
collection_w.search_iterator(vectors[:1], field_name, search_params, batch_size,
|
||||
expr=expression, check_task=CheckTasks.check_search_iterator,
|
||||
check_items={})
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L2)
|
||||
@pytest.mark.parametrize("batch_size", [10, 100, 777, 1000])
|
||||
def test_search_iterator_with_different_limit(self, batch_size):
|
||||
"""
|
||||
target: test search iterator with different batch sizes
|
||||
method: 1. search iterator
|
||||
2. check the result, expect pk not repeat and meet the expr requirements
|
||||
expected: search successfully
|
||||
"""
|
||||
# 1. initialize with data
|
||||
collection_w = self.init_collection_general(prefix, True)[0]
|
||||
# 2. search iterator
|
||||
search_params = {"metric_type": "COSINE"}
|
||||
collection_w.search_iterator(vectors[:1], field_name, search_params, batch_size,
|
||||
check_task=CheckTasks.check_search_iterator,
|
||||
check_items={"batch_size": batch_size})
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L2)
|
||||
def test_search_iterator_invalid_nq(self):
|
||||
"""
|
||||
target: test search iterator with invalid nq (multiple query vectors)
|
||||
method: 1. search iterator with two query vectors
|
||||
2. check the error
|
||||
expected: raise exception
|
||||
"""
|
||||
# 1. initialize with data
|
||||
batch_size = 100
|
||||
dim = 128
|
||||
collection_w = self.init_collection_general(
|
||||
prefix, True, dim=dim, is_index=False)[0]
|
||||
collection_w.create_index(field_name, {"metric_type": "L2"})
|
||||
collection_w.load()
|
||||
# 2. search iterator
|
||||
search_params = {"metric_type": "L2"}
|
||||
collection_w.search_iterator(vectors[:2], field_name, search_params, batch_size,
|
||||
check_task=CheckTasks.err_res,
|
||||
check_items={"err_code": 1,
|
||||
"err_msg": "Not support search iteration over multiple vectors at present"})
|
|
@ -0,0 +1,491 @@
|
|||
import numpy as np
|
||||
from pymilvus.orm.types import CONSISTENCY_STRONG, CONSISTENCY_BOUNDED, CONSISTENCY_SESSION, CONSISTENCY_EVENTUALLY
|
||||
from pymilvus import AnnSearchRequest, RRFRanker, WeightedRanker
|
||||
from pymilvus import (
|
||||
FieldSchema, CollectionSchema, DataType,
|
||||
Collection
|
||||
)
|
||||
from common.constants import *
|
||||
from utils.util_pymilvus import *
|
||||
from common.common_type import CaseLabel, CheckTasks
|
||||
from common import common_type as ct
|
||||
from common import common_func as cf
|
||||
from utils.util_log import test_log as log
|
||||
from base.client_base import TestcaseBase
|
||||
import heapq
|
||||
from time import sleep
|
||||
from decimal import Decimal, getcontext
|
||||
import decimal
|
||||
import multiprocessing
|
||||
import numbers
|
||||
import random
|
||||
import math
|
||||
import numpy
|
||||
import threading
|
||||
import pytest
|
||||
import pandas as pd
|
||||
from faker import Faker
|
||||
|
||||
Faker.seed(19530)
|
||||
fake_en = Faker("en_US")
|
||||
fake_zh = Faker("zh_CN")
|
||||
|
||||
# patch faker to generate text with specific distribution
|
||||
cf.patch_faker_text(fake_en, cf.en_vocabularies_distribution)
|
||||
cf.patch_faker_text(fake_zh, cf.zh_vocabularies_distribution)
|
||||
|
||||
pd.set_option("expand_frame_repr", False)
|
||||
|
||||
prefix = "search_collection"
|
||||
search_num = 10
|
||||
max_dim = ct.max_dim
|
||||
min_dim = ct.min_dim
|
||||
epsilon = ct.epsilon
|
||||
hybrid_search_epsilon = 0.01
|
||||
gracefulTime = ct.gracefulTime
|
||||
default_nb = ct.default_nb
|
||||
default_nb_medium = ct.default_nb_medium
|
||||
default_nq = ct.default_nq
|
||||
default_dim = ct.default_dim
|
||||
default_limit = ct.default_limit
|
||||
max_limit = ct.max_limit
|
||||
default_search_exp = "int64 >= 0"
|
||||
default_search_string_exp = "varchar >= \"0\""
|
||||
default_search_mix_exp = "int64 >= 0 && varchar >= \"0\""
|
||||
default_invaild_string_exp = "varchar >= 0"
|
||||
default_json_search_exp = "json_field[\"number\"] >= 0"
|
||||
perfix_expr = 'varchar like "0%"'
|
||||
default_search_field = ct.default_float_vec_field_name
|
||||
default_search_params = ct.default_search_params
|
||||
default_int64_field_name = ct.default_int64_field_name
|
||||
default_float_field_name = ct.default_float_field_name
|
||||
default_bool_field_name = ct.default_bool_field_name
|
||||
default_string_field_name = ct.default_string_field_name
|
||||
default_json_field_name = ct.default_json_field_name
|
||||
default_index_params = ct.default_index
|
||||
vectors = [[random.random() for _ in range(default_dim)] for _ in range(default_nq)]
|
||||
range_search_supported_indexes = ct.all_index_types[:7]
|
||||
uid = "test_search"
|
||||
nq = 1
|
||||
epsilon = 0.001
|
||||
field_name = default_float_vec_field_name
|
||||
binary_field_name = default_binary_vec_field_name
|
||||
search_param = {"nprobe": 1}
|
||||
entity = gen_entities(1, is_normal=True)
|
||||
entities = gen_entities(default_nb, is_normal=True)
|
||||
raw_vectors, binary_entities = gen_binary_entities(default_nb)
|
||||
default_query, _ = gen_search_vectors_params(field_name, entities, default_top_k, nq)
|
||||
index_name1 = cf.gen_unique_str("float")
|
||||
index_name2 = cf.gen_unique_str("varhar")
|
||||
half_nb = ct.default_nb // 2
|
||||
max_hybrid_search_req_num = ct.max_hybrid_search_req_num
|
||||
|
||||
|
||||
class TestCollectionSearchJSON(TestcaseBase):
|
||||
""" Test case of search interface """
|
||||
|
||||
@pytest.fixture(scope="function",
|
||||
params=[default_nb, default_nb_medium])
|
||||
def nb(self, request):
|
||||
yield request.param
|
||||
|
||||
@pytest.fixture(scope="function", params=[2, 500])
|
||||
def nq(self, request):
|
||||
yield request.param
|
||||
|
||||
@pytest.fixture(scope="function", params=[32, 128])
|
||||
def dim(self, request):
|
||||
yield request.param
|
||||
|
||||
@pytest.fixture(scope="function", params=[False, True])
|
||||
def auto_id(self, request):
|
||||
yield request.param
|
||||
|
||||
@pytest.fixture(scope="function", params=[False, True])
|
||||
def _async(self, request):
|
||||
yield request.param
|
||||
|
||||
@pytest.fixture(scope="function", params=["JACCARD", "HAMMING"])
|
||||
def metrics(self, request):
|
||||
yield request.param
|
||||
|
||||
@pytest.fixture(scope="function", params=[False, True])
|
||||
def is_flush(self, request):
|
||||
yield request.param
|
||||
|
||||
@pytest.fixture(scope="function", params=[True, False])
|
||||
def enable_dynamic_field(self, request):
|
||||
yield request.param
|
||||
|
||||
@pytest.fixture(scope="function", params=[0, 0.5, 1])
|
||||
def null_data_percent(self, request):
|
||||
yield request.param
|
||||
|
||||
"""
|
||||
******************************************************************
|
||||
# The followings are invalid base cases
|
||||
******************************************************************
|
||||
"""
|
||||
|
||||
@pytest.mark.skip("Supported json like: 1, \"abc\", [1,2,3,4]")
|
||||
@pytest.mark.tags(CaseLabel.L1)
|
||||
def test_search_json_expression_object(self):
|
||||
"""
|
||||
target: test search with an expression comparing the json field directly
|
||||
method: search with expressions using jsonField name directly
|
||||
expected: Raise error
|
||||
"""
|
||||
# 1. initialize with data
|
||||
nq = 1
|
||||
dim = 128
|
||||
collection_w, _, _, insert_ids, time_stamp = self.init_collection_general(prefix, True, dim=dim)[0:5]
|
||||
# 2. search before insert time_stamp
|
||||
log.info("test_search_json_expression_object: searching collection %s" %
|
||||
collection_w.name)
|
||||
vectors = [[random.random() for _ in range(dim)] for _ in range(nq)]
|
||||
# 3. search after insert time_stamp
|
||||
json_search_exp = "json_field > 0"
|
||||
collection_w.search(vectors[:nq], default_search_field,
|
||||
default_search_params, default_limit,
|
||||
json_search_exp,
|
||||
check_task=CheckTasks.err_res,
|
||||
check_items={ct.err_code: 1,
|
||||
ct.err_msg: "can not comparisons jsonField directly"})
|
||||
|
||||
"""
|
||||
******************************************************************
|
||||
# The followings are valid base cases
|
||||
******************************************************************
|
||||
"""
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L1)
|
||||
def test_search_json_expression_default(self, nq, is_flush, enable_dynamic_field):
|
||||
"""
|
||||
target: test search case with default json expression
|
||||
method: create connection, collection, insert and search
|
||||
expected: 1. search successfully with limit(topK)
|
||||
"""
|
||||
# 1. initialize with data
|
||||
dim = 64
|
||||
collection_w, _, _, insert_ids, time_stamp = \
|
||||
self.init_collection_general(prefix, True, auto_id=True, dim=dim, is_flush=is_flush,
|
||||
enable_dynamic_field=enable_dynamic_field, language="Hindi")[0:5]
|
||||
vectors = [[random.random() for _ in range(dim)] for _ in range(nq)]
|
||||
# 2. search after insert
|
||||
collection_w.search(vectors[:nq], default_search_field,
|
||||
default_search_params, default_limit,
|
||||
default_json_search_exp,
|
||||
check_task=CheckTasks.check_search_results,
|
||||
check_items={"nq": nq,
|
||||
"ids": insert_ids,
|
||||
"limit": default_limit})
|
||||
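# Note: default_json_search_exp filters on a path inside the JSON column
# (json_field["number"] >= 0); the default data is assumed to satisfy it, so the
# check above still expects a full topK page.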
|
||||
@pytest.mark.tags(CaseLabel.L2)
|
||||
def test_search_json_nullable_load_before_insert(self, nq, is_flush, enable_dynamic_field):
|
||||
"""
|
||||
target: test search case with default json expression
|
||||
method: create connection, collection, insert and search
|
||||
expected: 1. search successfully with limit(topK)
|
||||
"""
|
||||
# 1. initialize collection
|
||||
dim = 64
|
||||
enable_dynamic_field = False
|
||||
collection_w, _, _, insert_ids, time_stamp = \
|
||||
self.init_collection_general(prefix, False, auto_id=True, dim=dim, is_flush=is_flush,
|
||||
enable_dynamic_field=enable_dynamic_field,
|
||||
nullable_fields={ct.default_json_field_name: 1})[0:5]
|
||||
# insert data
|
||||
vectors = [[random.random() for _ in range(dim)] for _ in range(default_nb)]
|
||||
data = [[np.float32(i) for i in range(default_nb)], [str(i) for i in range(default_nb)], [], vectors]
|
||||
collection_w.insert(data)
|
||||
collection_w.num_entities
|
||||
# 2. search after insert
|
||||
collection_w.search(vectors[:nq], default_search_field,
|
||||
default_search_params, default_limit,
|
||||
check_task=CheckTasks.check_search_results,
|
||||
check_items={"nq": nq,
|
||||
"limit": default_limit})
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L1)
|
||||
@pytest.mark.skip(reason="issue 37113")
|
||||
def test_search_json_nullable_insert_before_load(self, nq, is_flush, enable_dynamic_field):
|
||||
"""
|
||||
target: test search case with default json expression
|
||||
method: create connection, collection, insert and search
|
||||
expected: 1. search successfully with limit(topK)
|
||||
"""
|
||||
# 1. initialize collection
|
||||
dim = 64
|
||||
enable_dynamic_field = False
|
||||
collection_w, _, _, insert_ids, time_stamp = \
|
||||
self.init_collection_general(prefix, False, auto_id=True, dim=dim, is_flush=is_flush,
|
||||
enable_dynamic_field=enable_dynamic_field,
|
||||
nullable_fields={ct.default_json_field_name: 1})[0:5]
|
||||
collection_w.release()
|
||||
# insert data
|
||||
vectors = [[random.random() for _ in range(dim)] for _ in range(default_nb)]
|
||||
data = [[np.float32(i) for i in range(default_nb)], [str(i) for i in range(default_nb)], [], vectors]
|
||||
collection_w.insert(data)
|
||||
collection_w.num_entities
|
||||
collection_w.load()
|
||||
# 2. search after insert
|
||||
collection_w.search(vectors[:nq], default_search_field,
|
||||
default_search_params, default_limit,
|
||||
check_task=CheckTasks.check_search_results,
|
||||
check_items={"nq": nq,
|
||||
"limit": default_limit})
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L1)
|
||||
def test_search_expression_json_contains(self, enable_dynamic_field):
|
||||
"""
|
||||
target: test search with expression using json_contains
|
||||
method: search with expression (json_contains)
|
||||
expected: search successfully
|
||||
"""
|
||||
# 1. initialize with data
|
||||
collection_w = self.init_collection_general(
|
||||
prefix, enable_dynamic_field=enable_dynamic_field)[0]
|
||||
|
||||
# 2. insert data
|
||||
array = []
|
||||
for i in range(default_nb):
|
||||
data = {
|
||||
default_int64_field_name: i,
|
||||
default_float_field_name: i * 1.0,
|
||||
default_string_field_name: str(i),
|
||||
default_json_field_name: {"number": i, "list": [i, i + 1, i + 2]},
|
||||
default_float_vec_field_name: gen_vectors(1, default_dim)[0]
|
||||
}
|
||||
array.append(data)
|
||||
collection_w.insert(array)
|
||||
|
||||
# 2. search
|
||||
collection_w.load()
|
||||
log.info("test_search_with_output_field_json_contains: Searching collection %s" %
|
||||
collection_w.name)
|
||||
expressions = [
|
||||
"json_contains(json_field['list'], 100)", "JSON_CONTAINS(json_field['list'], 100)"]
|
||||
for expression in expressions:
|
||||
collection_w.search(vectors[:default_nq], default_search_field,
|
||||
default_search_params, default_limit, expression,
|
||||
check_task=CheckTasks.check_search_results,
|
||||
check_items={"nq": default_nq,
|
||||
"limit": 3})
|
||||
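# Note: each row's json_field["list"] is [i, i+1, i+2], so the value 100 only appears
# for i in {98, 99, 100}; that is why the check above expects exactly 3 hits even
# though the search limit is default_limit.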
|
||||
@pytest.mark.tags(CaseLabel.L2)
|
||||
def test_search_expression_json_contains_list(self, auto_id):
|
||||
"""
|
||||
target: test search with expression using json_contains
|
||||
method: search with expression (json_contains)
|
||||
expected: search successfully
|
||||
"""
|
||||
# 1. initialize with data
|
||||
collection_w = self.init_collection_general(
|
||||
prefix, auto_id=auto_id, enable_dynamic_field=True)[0]
|
||||
|
||||
# 2. insert data
|
||||
limit = 100
|
||||
array = []
|
||||
for i in range(default_nb):
|
||||
data = {
|
||||
default_int64_field_name: i,
|
||||
default_json_field_name: [j for j in range(i, i + limit)],
|
||||
default_float_vec_field_name: gen_vectors(1, default_dim)[0]
|
||||
}
|
||||
if auto_id:
|
||||
data.pop(default_int64_field_name, None)
|
||||
array.append(data)
|
||||
collection_w.insert(array)
|
||||
|
||||
# 2. search
|
||||
collection_w.load()
|
||||
log.info("test_search_with_output_field_json_contains: Searching collection %s" %
|
||||
collection_w.name)
|
||||
expressions = [
|
||||
"json_contains(json_field, 100)", "JSON_CONTAINS(json_field, 100)"]
|
||||
for expression in expressions:
|
||||
collection_w.search(vectors[:default_nq], default_search_field,
|
||||
default_search_params, limit, expression,
|
||||
check_task=CheckTasks.check_search_results,
|
||||
check_items={"nq": default_nq,
|
||||
"limit": limit})
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L2)
|
||||
def test_search_expression_json_contains_combined_with_normal(self, enable_dynamic_field):
|
||||
"""
|
||||
target: test search with expression using json_contains
|
||||
method: search with expression (json_contains)
|
||||
expected: search successfully
|
||||
"""
|
||||
# 1. initialize with data
|
||||
collection_w = self.init_collection_general(
|
||||
prefix, enable_dynamic_field=enable_dynamic_field)[0]
|
||||
|
||||
# 2. insert data
|
||||
limit = 100
|
||||
array = []
|
||||
for i in range(default_nb):
|
||||
data = {
|
||||
default_int64_field_name: i,
|
||||
default_float_field_name: i * 1.0,
|
||||
default_string_field_name: str(i),
|
||||
default_json_field_name: {"number": i, "list": [str(j) for j in range(i, i + limit)]},
|
||||
default_float_vec_field_name: gen_vectors(1, default_dim)[0]
|
||||
}
|
||||
array.append(data)
|
||||
collection_w.insert(array)
|
||||
|
||||
# 2. search
|
||||
collection_w.load()
|
||||
log.info("test_search_with_output_field_json_contains: Searching collection %s" %
|
||||
collection_w.name)
|
||||
tar = 1000
|
||||
expressions = [f"json_contains(json_field['list'], '{tar}') && int64 > {tar - limit // 2}",
|
||||
f"JSON_CONTAINS(json_field['list'], '{tar}') && int64 > {tar - limit // 2}"]
|
||||
for expression in expressions:
|
||||
collection_w.search(vectors[:default_nq], default_search_field,
|
||||
default_search_params, limit, expression,
|
||||
check_task=CheckTasks.check_search_results,
|
||||
check_items={"nq": default_nq,
|
||||
"limit": limit // 2})
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L1)
|
||||
@pytest.mark.parametrize("expr_prefix", ["array_contains", "ARRAY_CONTAINS"])
|
||||
def test_search_expr_array_contains(self, expr_prefix):
|
||||
"""
|
||||
target: test search with expression using array_contains
|
||||
method: search with expression using array_contains
|
||||
expected: succeed
|
||||
"""
|
||||
# 1. create a collection
|
||||
schema = cf.gen_array_collection_schema()
|
||||
collection_w = self.init_collection_wrap(schema=schema)
|
||||
|
||||
# 2. insert data
|
||||
string_field_value = [[str(j) for j in range(i, i + 3)] for i in range(ct.default_nb)]
|
||||
data = cf.gen_array_dataframe_data()
|
||||
data[ct.default_string_array_field_name] = string_field_value
|
||||
collection_w.insert(data)
|
||||
collection_w.create_index(ct.default_float_vec_field_name, {})
|
||||
|
||||
# 3. search
|
||||
collection_w.load()
|
||||
expression = f"{expr_prefix}({ct.default_string_array_field_name}, '1000')"
|
||||
res = collection_w.search(vectors[:default_nq], default_search_field, {},
|
||||
limit=ct.default_nb, expr=expression)[0]
|
||||
exp_ids = cf.assert_json_contains(expression, string_field_value)
|
||||
assert set(res[0].ids) == set(exp_ids)
|
||||
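# Note: both the lowercase and uppercase spellings of array_contains are exercised
# above; the expected ids are recomputed on the client from string_field_value via
# cf.assert_json_contains and compared as a set against the search result.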
|
||||
@pytest.mark.tags(CaseLabel.L1)
|
||||
@pytest.mark.parametrize("expr_prefix", ["array_contains", "ARRAY_CONTAINS"])
|
||||
def test_search_expr_not_array_contains(self, expr_prefix):
|
||||
"""
|
||||
target: test search with expression using not array_contains
|
||||
method: search with expression using not array_contains
|
||||
expected: succeed
|
||||
"""
|
||||
# 1. create a collection
|
||||
schema = cf.gen_array_collection_schema()
|
||||
collection_w = self.init_collection_wrap(schema=schema)
|
||||
|
||||
# 2. insert data
|
||||
string_field_value = [[str(j) for j in range(i, i + 3)] for i in range(ct.default_nb)]
|
||||
data = cf.gen_array_dataframe_data()
|
||||
data[ct.default_string_array_field_name] = string_field_value
|
||||
collection_w.insert(data)
|
||||
collection_w.create_index(ct.default_float_vec_field_name, {})
|
||||
|
||||
# 3. search
|
||||
collection_w.load()
|
||||
expression = f"not {expr_prefix}({ct.default_string_array_field_name}, '1000')"
|
||||
res = collection_w.search(vectors[:default_nq], default_search_field, {},
|
||||
limit=ct.default_nb, expr=expression)[0]
|
||||
exp_ids = cf.assert_json_contains(expression, string_field_value)
|
||||
assert set(res[0].ids) == set(exp_ids)
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L1)
|
||||
@pytest.mark.parametrize("expr_prefix", ["array_contains_all", "ARRAY_CONTAINS_ALL"])
|
||||
def test_search_expr_array_contains_all(self, expr_prefix):
|
||||
"""
|
||||
target: test search with expression using array_contains_all
method: search with expression using array_contains_all
|
||||
expected: succeed
|
||||
"""
|
||||
# 1. create a collection
|
||||
schema = cf.gen_array_collection_schema()
|
||||
collection_w = self.init_collection_wrap(schema=schema)
|
||||
|
||||
# 2. insert data
|
||||
string_field_value = [[str(j) for j in range(i, i + 3)] for i in range(ct.default_nb)]
|
||||
data = cf.gen_array_dataframe_data()
|
||||
data[ct.default_string_array_field_name] = string_field_value
|
||||
collection_w.insert(data)
|
||||
collection_w.create_index(ct.default_float_vec_field_name, {})
|
||||
|
||||
# 3. search
|
||||
collection_w.load()
|
||||
expression = f"{expr_prefix}({ct.default_string_array_field_name}, ['1000'])"
|
||||
res = collection_w.search(vectors[:default_nq], default_search_field, {},
|
||||
limit=ct.default_nb, expr=expression)[0]
|
||||
exp_ids = cf.assert_json_contains(expression, string_field_value)
|
||||
assert set(res[0].ids) == set(exp_ids)
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L1)
|
||||
@pytest.mark.parametrize("expr_prefix", ["array_contains_any", "ARRAY_CONTAINS_ANY",
|
||||
"not array_contains_any", "not ARRAY_CONTAINS_ANY"])
|
||||
def test_search_expr_array_contains_any(self, expr_prefix):
|
||||
"""
|
||||
target: test search with expression using array_contains_any
method: search with expression using array_contains_any (and its negation)
|
||||
expected: succeed
|
||||
"""
|
||||
# 1. create a collection
|
||||
schema = cf.gen_array_collection_schema()
|
||||
collection_w = self.init_collection_wrap(schema=schema)
|
||||
|
||||
# 2. insert data
|
||||
string_field_value = [[str(j) for j in range(i, i + 3)] for i in range(ct.default_nb)]
|
||||
data = cf.gen_array_dataframe_data()
|
||||
data[ct.default_string_array_field_name] = string_field_value
|
||||
collection_w.insert(data)
|
||||
collection_w.create_index(ct.default_float_vec_field_name, {})
|
||||
|
||||
# 3. search
|
||||
collection_w.load()
|
||||
expression = f"{expr_prefix}({ct.default_string_array_field_name}, ['1000'])"
|
||||
res = collection_w.search(vectors[:default_nq], default_search_field, {},
|
||||
limit=ct.default_nb, expr=expression)[0]
|
||||
exp_ids = cf.assert_json_contains(expression, string_field_value)
|
||||
assert set(res[0].ids) == set(exp_ids)
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L2)
|
||||
@pytest.mark.parametrize("expr_prefix", ["array_contains_all", "ARRAY_CONTAINS_ALL",
|
||||
"array_contains_any", "ARRAY_CONTAINS_ANY"])
|
||||
def test_search_expr_array_contains_invalid(self, expr_prefix):
|
||||
"""
|
||||
target: test search with an invalid expression using array_contains_all/array_contains_any
method: search with expression array_contains_xxx(a, b) where b is not a list
|
||||
expected: report error
|
||||
"""
|
||||
# 1. create a collection
|
||||
schema = cf.gen_array_collection_schema()
|
||||
collection_w = self.init_collection_wrap(schema=schema)
|
||||
|
||||
# 2. insert data
|
||||
data = cf.gen_array_dataframe_data()
|
||||
collection_w.insert(data)
|
||||
collection_w.create_index(ct.default_float_vec_field_name, {})
|
||||
|
||||
# 3. search
|
||||
collection_w.load()
|
||||
expression = f"{expr_prefix}({ct.default_string_array_field_name}, '1000')"
|
||||
error = {ct.err_code: 1100,
|
||||
ct.err_msg: f"cannot parse expression: {expression}, "
|
||||
f"error: ContainsAll operation element must be an array"}
|
||||
if expr_prefix in ["array_contains_any", "ARRAY_CONTAINS_ANY"]:
|
||||
error = {ct.err_code: 1100,
|
||||
ct.err_msg: f"cannot parse expression: {expression}, "
|
||||
f"error: ContainsAny operation element must be an array"}
|
||||
collection_w.search(vectors[:default_nq], default_search_field, {},
|
||||
limit=ct.default_nb, expr=expression,
|
||||
check_task=CheckTasks.err_res, check_items=error)
@ -0,0 +1,585 @@
import numpy as np
|
||||
from pymilvus.orm.types import CONSISTENCY_STRONG, CONSISTENCY_BOUNDED, CONSISTENCY_SESSION, CONSISTENCY_EVENTUALLY
|
||||
from pymilvus import AnnSearchRequest, RRFRanker, WeightedRanker
|
||||
from pymilvus import (
|
||||
FieldSchema, CollectionSchema, DataType,
|
||||
Collection
|
||||
)
|
||||
from common.constants import *
|
||||
from utils.util_pymilvus import *
|
||||
from common.common_type import CaseLabel, CheckTasks
|
||||
from common import common_type as ct
|
||||
from common import common_func as cf
|
||||
from utils.util_log import test_log as log
|
||||
from base.client_base import TestcaseBase
|
||||
import heapq
|
||||
from time import sleep
|
||||
from decimal import Decimal, getcontext
|
||||
import decimal
|
||||
import multiprocessing
|
||||
import numbers
|
||||
import random
|
||||
import math
|
||||
import numpy
|
||||
import threading
|
||||
import pytest
|
||||
import pandas as pd
|
||||
from faker import Faker
|
||||
|
||||
Faker.seed(19530)
|
||||
fake_en = Faker("en_US")
|
||||
fake_zh = Faker("zh_CN")
|
||||
|
||||
# patch faker to generate text with specific distribution
|
||||
cf.patch_faker_text(fake_en, cf.en_vocabularies_distribution)
|
||||
cf.patch_faker_text(fake_zh, cf.zh_vocabularies_distribution)
|
||||
|
||||
pd.set_option("expand_frame_repr", False)
|
||||
|
||||
prefix = "search_collection"
|
||||
search_num = 10
|
||||
max_dim = ct.max_dim
|
||||
min_dim = ct.min_dim
|
||||
epsilon = ct.epsilon
|
||||
hybrid_search_epsilon = 0.01
|
||||
gracefulTime = ct.gracefulTime
|
||||
default_nb = ct.default_nb
|
||||
default_nb_medium = ct.default_nb_medium
|
||||
default_nq = ct.default_nq
|
||||
default_dim = ct.default_dim
|
||||
default_limit = ct.default_limit
|
||||
max_limit = ct.max_limit
|
||||
default_search_exp = "int64 >= 0"
|
||||
default_search_string_exp = "varchar >= \"0\""
|
||||
default_search_mix_exp = "int64 >= 0 && varchar >= \"0\""
|
||||
default_invaild_string_exp = "varchar >= 0"
|
||||
default_json_search_exp = "json_field[\"number\"] >= 0"
|
||||
perfix_expr = 'varchar like "0%"'
|
||||
default_search_field = ct.default_float_vec_field_name
|
||||
default_search_params = ct.default_search_params
|
||||
default_int64_field_name = ct.default_int64_field_name
|
||||
default_float_field_name = ct.default_float_field_name
|
||||
default_bool_field_name = ct.default_bool_field_name
|
||||
default_string_field_name = ct.default_string_field_name
|
||||
default_json_field_name = ct.default_json_field_name
|
||||
default_index_params = ct.default_index
|
||||
vectors = [[random.random() for _ in range(default_dim)] for _ in range(default_nq)]
|
||||
range_search_supported_indexes = ct.all_index_types[:7]
|
||||
uid = "test_search"
|
||||
nq = 1
|
||||
epsilon = 0.001
|
||||
field_name = default_float_vec_field_name
|
||||
binary_field_name = default_binary_vec_field_name
|
||||
search_param = {"nprobe": 1}
|
||||
entity = gen_entities(1, is_normal=True)
|
||||
entities = gen_entities(default_nb, is_normal=True)
|
||||
raw_vectors, binary_entities = gen_binary_entities(default_nb)
|
||||
default_query, _ = gen_search_vectors_params(field_name, entities, default_top_k, nq)
|
||||
index_name1 = cf.gen_unique_str("float")
|
||||
index_name2 = cf.gen_unique_str("varhar")
|
||||
half_nb = ct.default_nb // 2
|
||||
max_hybrid_search_req_num = ct.max_hybrid_search_req_num
|
||||
|
||||
|
||||
class TestCollectionSearchNoneAndDefaultData(TestcaseBase):
|
||||
""" Test case of search interface """
|
||||
|
||||
@pytest.fixture(scope="function", params=[default_nb_medium])
|
||||
def nb(self, request):
|
||||
yield request.param
|
||||
|
||||
@pytest.fixture(scope="function", params=[200])
|
||||
def nq(self, request):
|
||||
yield request.param
|
||||
|
||||
@pytest.fixture(scope="function", params=[32, 128])
|
||||
def dim(self, request):
|
||||
yield request.param
|
||||
|
||||
@pytest.fixture(scope="function", params=[False, True])
|
||||
def auto_id(self, request):
|
||||
yield request.param
|
||||
|
||||
@pytest.fixture(scope="function", params=[False, True])
|
||||
def _async(self, request):
|
||||
yield request.param
|
||||
|
||||
@pytest.fixture(scope="function", params=["JACCARD", "HAMMING"])
|
||||
def metrics(self, request):
|
||||
yield request.param
|
||||
|
||||
@pytest.fixture(scope="function", params=[False, True])
|
||||
def is_flush(self, request):
|
||||
yield request.param
|
||||
|
||||
@pytest.fixture(scope="function", params=[True, False])
|
||||
def enable_dynamic_field(self, request):
|
||||
yield request.param
|
||||
|
||||
@pytest.fixture(scope="function", params=["IP", "COSINE", "L2"])
|
||||
def metric_type(self, request):
|
||||
yield request.param
|
||||
|
||||
@pytest.fixture(scope="function", params=[True, False])
|
||||
def random_primary_key(self, request):
|
||||
yield request.param
|
||||
|
||||
@pytest.fixture(scope="function", params=["FLOAT_VECTOR", "FLOAT16_VECTOR", "BFLOAT16_VECTOR"])
|
||||
def vector_data_type(self, request):
|
||||
yield request.param
|
||||
|
||||
@pytest.fixture(scope="function", params=["STL_SORT", "INVERTED"])
|
||||
def numeric_scalar_index(self, request):
|
||||
yield request.param
|
||||
|
||||
@pytest.fixture(scope="function", params=["TRIE", "INVERTED", "BITMAP"])
|
||||
def varchar_scalar_index(self, request):
|
||||
yield request.param
|
||||
|
||||
@pytest.fixture(scope="function", params=[200, 600])
|
||||
def batch_size(self, request):
|
||||
yield request.param
|
||||
|
||||
@pytest.fixture(scope="function", params=[0, 0.5, 1])
|
||||
def null_data_percent(self, request):
|
||||
yield request.param
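# null_data_percent is passed to init_collection_general(nullable_fields=...); it is assumed to be
# the fraction of inserted rows whose nullable field is set to None (0 = no None values, 1 = all None).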
|
||||
|
||||
"""
|
||||
******************************************************************
|
||||
# The following are valid base cases
|
||||
******************************************************************
|
||||
"""
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L0)
|
||||
def test_search_normal_none_data(self, nq, dim, auto_id, is_flush, enable_dynamic_field, vector_data_type,
|
||||
null_data_percent):
|
||||
"""
|
||||
target: test search normal case with none data inserted
|
||||
method: create connection, collection with nullable fields, insert data including none, and search
|
||||
expected: 1. search successfully with limit(topK)
|
||||
"""
|
||||
# 1. initialize with data
|
||||
collection_w, _, _, insert_ids, time_stamp = \
|
||||
self.init_collection_general(prefix, True, auto_id=auto_id, dim=dim, is_flush=is_flush,
|
||||
enable_dynamic_field=enable_dynamic_field,
|
||||
vector_data_type=vector_data_type,
|
||||
nullable_fields={ct.default_float_field_name: null_data_percent})[0:5]
|
||||
# 2. generate search data
|
||||
vectors = cf.gen_vectors_based_on_vector_type(nq, dim, vector_data_type)
|
||||
# 3. search after insert
|
||||
collection_w.search(vectors[:nq], default_search_field,
|
||||
default_search_params, default_limit,
|
||||
default_search_exp,
|
||||
output_fields=[default_int64_field_name,
|
||||
default_float_field_name],
|
||||
guarantee_timestamp=0,
|
||||
check_task=CheckTasks.check_search_results,
|
||||
check_items={"nq": nq,
|
||||
"ids": insert_ids,
|
||||
"limit": default_limit,
|
||||
"output_fields": [default_int64_field_name,
|
||||
default_float_field_name]})
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L2)
|
||||
def test_search_after_none_data_all_field_datatype(self, varchar_scalar_index, numeric_scalar_index,
|
||||
null_data_percent, _async):
|
||||
"""
|
||||
target: test search on a collection with None data across all field data types after building scalar indexes
method: create vector and scalar indexes on fields containing None data, then search with corresponding search params
|
||||
expected: search successfully with limit(topK)
|
||||
"""
|
||||
# 1. initialize with data
|
||||
nullable_fields = {ct.default_int32_field_name: null_data_percent,
|
||||
ct.default_int16_field_name: null_data_percent,
|
||||
ct.default_int8_field_name: null_data_percent,
|
||||
ct.default_bool_field_name: null_data_percent,
|
||||
ct.default_float_field_name: null_data_percent,
|
||||
ct.default_double_field_name: null_data_percent,
|
||||
ct.default_string_field_name: null_data_percent}
|
||||
collection_w, _, _, insert_ids = \
|
||||
self.init_collection_general(prefix, True, 5000, partition_num=1,
|
||||
is_all_data_type=True, dim=default_dim,
|
||||
is_index=False, nullable_fields=nullable_fields)[0:4]
|
||||
# 2. create index on vector field and load
|
||||
index = "HNSW"
|
||||
params = cf.get_index_params_params(index)
|
||||
default_index = {"index_type": index, "params": params, "metric_type": "COSINE"}
|
||||
vector_name_list = cf.extract_vector_field_name_list(collection_w)
|
||||
vector_name_list.append(ct.default_float_vec_field_name)
|
||||
for vector_name in vector_name_list:
|
||||
collection_w.create_index(vector_name, default_index)
|
||||
# 3. create index on scalar field with None data
|
||||
scalar_index_params = {"index_type": varchar_scalar_index, "params": {}}
|
||||
collection_w.create_index(ct.default_string_field_name, scalar_index_params)
|
||||
# 4. create index on scalar field with default data
|
||||
scalar_index_params = {"index_type": numeric_scalar_index, "params": {}}
|
||||
collection_w.create_index(ct.default_int64_field_name, scalar_index_params)
|
||||
collection_w.create_index(ct.default_int32_field_name, scalar_index_params)
|
||||
collection_w.create_index(ct.default_int16_field_name, scalar_index_params)
|
||||
collection_w.create_index(ct.default_int8_field_name, scalar_index_params)
|
||||
collection_w.create_index(ct.default_float_field_name, scalar_index_params)
|
||||
scalar_index_params = {"index_type": "INVERTED", "params": {}}
|
||||
collection_w.create_index(ct.default_bool_field_name, scalar_index_params)
|
||||
collection_w.load()
|
||||
# 5. search
|
||||
search_params = cf.gen_search_param(index, "COSINE")
|
||||
limit = search_params[0]["params"]["ef"]
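# For HNSW the number of returned results cannot exceed the search-time ef, so the ef value from the
# generated search params is reused as the search limit here.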
|
||||
log.info("Searching with search params: {}".format(search_params[0]))
|
||||
vectors = [[random.random() for _ in range(default_dim)] for _ in range(default_nq)]
|
||||
collection_w.search(vectors[:default_nq], default_search_field,
|
||||
search_param, limit, default_search_exp, _async=_async,
|
||||
output_fields=[ct.default_string_field_name, ct.default_float_field_name],
|
||||
check_task=CheckTasks.check_search_results,
|
||||
check_items={"nq": default_nq,
|
||||
"ids": insert_ids,
|
||||
"limit": limit,
|
||||
"_async": _async,
|
||||
"output_fields": [ct.default_string_field_name,
|
||||
ct.default_float_field_name]})
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L0)
|
||||
def test_search_default_value_with_insert(self, nq, dim, auto_id, is_flush, enable_dynamic_field, vector_data_type):
|
||||
"""
|
||||
target: test search normal case with default value set
|
||||
method: create connection, collection with default value set, insert and search
|
||||
expected: 1. search successfully with limit(topK)
|
||||
"""
|
||||
# 1. initialize with data
|
||||
collection_w, _, _, insert_ids, time_stamp = \
|
||||
self.init_collection_general(prefix, True, auto_id=auto_id, dim=dim, is_flush=is_flush,
|
||||
enable_dynamic_field=enable_dynamic_field,
|
||||
vector_data_type=vector_data_type,
|
||||
default_value_fields={ct.default_float_field_name: np.float32(10.0)})[0:5]
|
||||
# 2. generate search data
|
||||
vectors = cf.gen_vectors_based_on_vector_type(nq, dim, vector_data_type)
|
||||
# 3. search after insert
|
||||
collection_w.search(vectors[:nq], default_search_field,
|
||||
default_search_params, default_limit,
|
||||
default_search_exp,
|
||||
output_fields=[default_int64_field_name,
|
||||
default_float_field_name],
|
||||
guarantee_timestamp=0,
|
||||
check_task=CheckTasks.check_search_results,
|
||||
check_items={"nq": nq,
|
||||
"ids": insert_ids,
|
||||
"limit": default_limit,
|
||||
"output_fields": [default_int64_field_name,
|
||||
default_float_field_name]})
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L1)
|
||||
def test_search_default_value_without_insert(self, enable_dynamic_field):
|
||||
"""
|
||||
target: test search normal case with default value set
|
||||
method: create connection, collection with default value set, no insert and search
|
||||
expected: 1. search successfully with limit(topK)
|
||||
"""
|
||||
# 1. initialize with data
|
||||
collection_w = self.init_collection_general(prefix, False, dim=default_dim,
|
||||
enable_dynamic_field=enable_dynamic_field,
|
||||
nullable_fields={ct.default_float_field_name: 0},
|
||||
default_value_fields={
|
||||
ct.default_float_field_name: np.float32(10.0)})[0]
|
||||
# 2. generate search data
|
||||
vectors = cf.gen_vectors_based_on_vector_type(default_nq, default_dim, "FLOAT_VECTOR")
|
||||
# 3. search after insert
|
||||
collection_w.search(vectors[:default_nq], default_search_field,
|
||||
default_search_params, default_limit,
|
||||
default_search_exp,
|
||||
guarantee_timestamp=0,
|
||||
check_task=CheckTasks.check_search_results,
|
||||
check_items={"nq": default_nq,
|
||||
"limit": 0})
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L2)
|
||||
def test_search_after_default_data_all_field_datatype(self, varchar_scalar_index, numeric_scalar_index, _async):
|
||||
"""
|
||||
target: test search on a collection with default values across all field data types after building scalar indexes
method: create vector and scalar indexes on fields filled with default values, then search with corresponding search params
|
||||
expected: search successfully with limit(topK)
|
||||
"""
|
||||
# 1. initialize with data
|
||||
default_value_fields = {ct.default_int32_field_name: np.int32(1),
|
||||
ct.default_int16_field_name: np.int32(2),
|
||||
ct.default_int8_field_name: np.int32(3),
|
||||
ct.default_bool_field_name: True,
|
||||
ct.default_float_field_name: np.float32(10.0),
|
||||
ct.default_double_field_name: 10.0,
|
||||
ct.default_string_field_name: "1"}
|
||||
collection_w, _, _, insert_ids = self.init_collection_general(prefix, True, 5000, partition_num=1,
|
||||
is_all_data_type=True, dim=default_dim,
|
||||
is_index=False,
|
||||
default_value_fields=default_value_fields)[0:4]
|
||||
# 2. create index on vector field and load
|
||||
index = "HNSW"
|
||||
params = cf.get_index_params_params(index)
|
||||
default_index = {"index_type": index, "params": params, "metric_type": "L2"}
|
||||
vector_name_list = cf.extract_vector_field_name_list(collection_w)
|
||||
vector_name_list.append(ct.default_float_vec_field_name)
|
||||
for vector_name in vector_name_list:
|
||||
collection_w.create_index(vector_name, default_index)
|
||||
# 3. create index on scalar field with None data
|
||||
scalar_index_params = {"index_type": varchar_scalar_index, "params": {}}
|
||||
collection_w.create_index(ct.default_string_field_name, scalar_index_params)
|
||||
# 4. create index on scalar field with default data
|
||||
scalar_index_params = {"index_type": numeric_scalar_index, "params": {}}
|
||||
collection_w.create_index(ct.default_int64_field_name, scalar_index_params)
|
||||
collection_w.create_index(ct.default_int32_field_name, scalar_index_params)
|
||||
collection_w.create_index(ct.default_int16_field_name, scalar_index_params)
|
||||
collection_w.create_index(ct.default_int8_field_name, scalar_index_params)
|
||||
if numeric_scalar_index != "STL_SORT":
|
||||
collection_w.create_index(ct.default_bool_field_name, scalar_index_params)
|
||||
collection_w.create_index(ct.default_float_field_name, scalar_index_params)
|
||||
collection_w.load()
|
||||
# 5. search
|
||||
search_params = cf.gen_search_param(index, "L2")
|
||||
limit = search_params[0]["params"]["ef"]
|
||||
log.info("Searching with search params: {}".format(search_params[0]))
|
||||
vectors = [[random.random() for _ in range(default_dim)] for _ in range(default_nq)]
|
||||
output_fields = [ct.default_int64_field_name, ct.default_int32_field_name,
|
||||
ct.default_int16_field_name, ct.default_int8_field_name,
|
||||
ct.default_bool_field_name, ct.default_float_field_name,
|
||||
ct.default_double_field_name, ct.default_string_field_name]
|
||||
collection_w.search(vectors[:default_nq], default_search_field,
|
||||
search_param, limit, default_search_exp, _async=_async,
|
||||
output_fields=output_fields,
|
||||
check_task=CheckTasks.check_search_results,
|
||||
check_items={"nq": default_nq,
|
||||
"ids": insert_ids,
|
||||
"limit": limit,
|
||||
"_async": _async,
|
||||
"output_fields": output_fields})
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L1)
|
||||
def test_search_both_default_value_non_data(self, nq, dim, auto_id, is_flush, enable_dynamic_field,
|
||||
vector_data_type):
|
||||
"""
|
||||
target: test search normal case with a field that is both nullable and has a default value
method: create connection, collection with a nullable field that has a default value, insert and search
|
||||
expected: 1. search successfully with limit(topK)
|
||||
"""
|
||||
# 1. initialize with data
|
||||
collection_w, _, _, insert_ids, time_stamp = \
|
||||
self.init_collection_general(prefix, True, auto_id=auto_id, dim=dim, is_flush=is_flush,
|
||||
enable_dynamic_field=enable_dynamic_field,
|
||||
vector_data_type=vector_data_type,
|
||||
nullable_fields={ct.default_float_field_name: 1},
|
||||
default_value_fields={ct.default_float_field_name: np.float32(10.0)})[0:5]
|
||||
# 2. generate search data
|
||||
vectors = cf.gen_vectors_based_on_vector_type(nq, dim, vector_data_type)
|
||||
# 3. search after insert
|
||||
collection_w.search(vectors[:nq], default_search_field,
|
||||
default_search_params, default_limit,
|
||||
default_search_exp,
|
||||
output_fields=[default_int64_field_name,
|
||||
default_float_field_name],
|
||||
guarantee_timestamp=0,
|
||||
check_task=CheckTasks.check_search_results,
|
||||
check_items={"nq": nq,
|
||||
"ids": insert_ids,
|
||||
"limit": default_limit,
|
||||
"output_fields": [default_int64_field_name,
|
||||
default_float_field_name]})
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L1)
|
||||
def test_search_collection_with_non_default_data_after_release_load(self, nq, _async, null_data_percent):
|
||||
"""
|
||||
target: search the pre-released collection after load
|
||||
method: 1. create collection
|
||||
2. release collection
|
||||
3. load collection
|
||||
4. search the pre-released collection
|
||||
expected: search successfully
|
||||
"""
|
||||
# 1. initialize with data
|
||||
nb = 2000
|
||||
dim = 64
|
||||
auto_id = True
|
||||
collection_w, _, _, insert_ids, time_stamp = \
|
||||
self.init_collection_general(prefix, True, nb, 1, auto_id=auto_id, dim=dim,
|
||||
nullable_fields={ct.default_string_field_name: null_data_percent},
|
||||
default_value_fields={ct.default_float_field_name: np.float32(10.0)})[0:5]
|
||||
# 2. release collection
|
||||
collection_w.release()
|
||||
# 3. Search the pre-released collection after load
|
||||
collection_w.load()
|
||||
log.info("test_search_collection_awith_non_default_data_after_release_load: searching after load")
|
||||
vectors = [[random.random() for _ in range(dim)] for _ in range(nq)]
|
||||
collection_w.search(vectors[:nq], default_search_field, default_search_params,
|
||||
default_limit, default_search_exp, _async=_async,
|
||||
output_fields=[ct.default_float_field_name, ct.default_string_field_name],
|
||||
check_task=CheckTasks.check_search_results,
|
||||
check_items={"nq": nq,
|
||||
"ids": insert_ids,
|
||||
"limit": default_limit,
|
||||
"_async": _async,
|
||||
"output_fields": [ct.default_float_field_name,
|
||||
ct.default_string_field_name]})
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L1)
|
||||
@pytest.mark.tags(CaseLabel.GPU)
|
||||
def test_search_after_different_index_with_params_none_default_data(self, varchar_scalar_index,
|
||||
numeric_scalar_index,
|
||||
null_data_percent, _async):
|
||||
"""
|
||||
target: test search with nullable and default-value fields after building different scalar indexes
method: create vector and scalar indexes, then search with corresponding search params
|
||||
expected: search successfully with limit(topK)
|
||||
"""
|
||||
# 1. initialize with data
|
||||
collection_w, _, _, insert_ids = \
|
||||
self.init_collection_general(prefix, True, 5000, partition_num=1, is_all_data_type=True,
|
||||
dim=default_dim, is_index=False,
|
||||
nullable_fields={ct.default_string_field_name: null_data_percent},
|
||||
default_value_fields={ct.default_float_field_name: np.float32(10.0)})[0:4]
|
||||
# 2. create index on vector field and load
|
||||
index = "HNSW"
|
||||
params = cf.get_index_params_params(index)
|
||||
default_index = {"index_type": index, "params": params, "metric_type": "COSINE"}
|
||||
vector_name_list = cf.extract_vector_field_name_list(collection_w)
|
||||
vector_name_list.append(ct.default_float_vec_field_name)
|
||||
for vector_name in vector_name_list:
|
||||
collection_w.create_index(vector_name, default_index)
|
||||
# 3. create index on scalar field with None data
|
||||
scalar_index_params = {"index_type": varchar_scalar_index, "params": {}}
|
||||
collection_w.create_index(ct.default_string_field_name, scalar_index_params)
|
||||
# 4. create index on scalar field with default data
|
||||
scalar_index_params = {"index_type": numeric_scalar_index, "params": {}}
|
||||
collection_w.create_index(ct.default_float_field_name, scalar_index_params)
|
||||
collection_w.load()
|
||||
# 5. search
|
||||
search_params = cf.gen_search_param(index, "COSINE")
|
||||
limit = search_params[0]["params"]["ef"]
|
||||
log.info("Searching with search params: {}".format(search_params[0]))
|
||||
vectors = [[random.random() for _ in range(default_dim)] for _ in range(default_nq)]
|
||||
collection_w.search(vectors[:default_nq], default_search_field,
|
||||
search_param, limit, default_search_exp, _async=_async,
|
||||
output_fields=[ct.default_string_field_name, ct.default_float_field_name],
|
||||
check_task=CheckTasks.check_search_results,
|
||||
check_items={"nq": default_nq,
|
||||
"ids": insert_ids,
|
||||
"limit": limit,
|
||||
"_async": _async,
|
||||
"output_fields": [ct.default_string_field_name,
|
||||
ct.default_float_field_name]})
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L1)
|
||||
def test_search_iterator_with_none_data(self, batch_size, null_data_percent):
|
||||
"""
|
||||
target: test search iterator normal
|
||||
method: 1. search iterator
|
||||
2. check the result, expect pk
|
||||
expected: search successfully
|
||||
"""
|
||||
# 1. initialize with data
|
||||
dim = 64
|
||||
collection_w = \
|
||||
self.init_collection_general(prefix, True, dim=dim, is_index=False,
|
||||
nullable_fields={ct.default_string_field_name: null_data_percent})[0]
|
||||
collection_w.create_index(field_name, {"metric_type": "L2"})
|
||||
collection_w.load()
|
||||
# 2. search iterator
|
||||
search_params = {"metric_type": "L2"}
|
||||
vectors = cf.gen_vectors_based_on_vector_type(1, dim, "FLOAT_VECTOR")
|
||||
collection_w.search_iterator(vectors[:1], field_name, search_params, batch_size,
|
||||
check_task=CheckTasks.check_search_iterator,
|
||||
check_items={"batch_size": batch_size})
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L2)
|
||||
def test_search_none_data_partial_load(self, is_flush, enable_dynamic_field, null_data_percent):
|
||||
"""
|
||||
target: test search normal case with none data inserted
|
||||
method: create connection, collection with nullable fields, insert data including none, and search
|
||||
expected: 1. search successfully with limit(topK)
|
||||
"""
|
||||
# 1. initialize with data
|
||||
collection_w, _, _, insert_ids, time_stamp = \
|
||||
self.init_collection_general(prefix, True, is_flush=is_flush,
|
||||
enable_dynamic_field=enable_dynamic_field,
|
||||
nullable_fields={ct.default_float_field_name: null_data_percent})[0:5]
|
||||
# 2. release and partial load again
|
||||
collection_w.release()
|
||||
loaded_fields = [default_int64_field_name, ct.default_float_vec_field_name]
|
||||
if not enable_dynamic_field:
|
||||
loaded_fields.append(default_float_field_name)
|
||||
collection_w.load(load_fields=loaded_fields)
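# With partial load, only the loaded fields can be filtered on or returned; the float field is added to
# load_fields only when the dynamic field is disabled, since with a dynamic schema it is stored in $meta
# rather than as a loadable field.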
|
||||
# 3. generate search data
|
||||
vectors = cf.gen_vectors_based_on_vector_type(default_nq, default_dim)
|
||||
# 4. search after partial load field with None data
|
||||
output_fields = [default_int64_field_name, default_float_field_name]
|
||||
collection_w.search(vectors[:default_nq], default_search_field,
|
||||
default_search_params, default_limit,
|
||||
default_search_exp,
|
||||
output_fields=output_fields,
|
||||
check_task=CheckTasks.check_search_results,
|
||||
check_items={"nq": default_nq,
|
||||
"ids": insert_ids,
|
||||
"limit": default_limit,
|
||||
"output_fields": output_fields})
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L1)
|
||||
@pytest.mark.skip(reason="issue #37547")
|
||||
def test_search_none_data_expr_cache(self, is_flush):
|
||||
"""
|
||||
target: test search case with none data to test expr cache
|
||||
method: 1. create collection with double datatype as nullable field
|
||||
2. search with expr "nullableFid == 0"
|
||||
3. drop this collection
|
||||
4. create collection with same collection name and same field name but modify the type of nullable field
|
||||
as varchar datatype
|
||||
5. search with expr "nullableFid == 0" again
|
||||
expected: 1. search successfully with limit(topK) for the first collection
|
||||
2. report error for the second collection with the same name
|
||||
"""
|
||||
# 1. initialize with data
|
||||
collection_w, _, _, insert_ids, time_stamp = \
|
||||
self.init_collection_general(prefix, True, is_flush=is_flush,
|
||||
nullable_fields={ct.default_float_field_name: 0.5})[0:5]
|
||||
collection_name = collection_w.name
|
||||
# 2. generate search data
|
||||
vectors = cf.gen_vectors_based_on_vector_type(default_nq, default_dim)
|
||||
# 3. search with expr "nullableFid == 0"
|
||||
search_exp = f"{ct.default_float_field_name} == 0"
|
||||
output_fields = [default_int64_field_name, default_float_field_name]
|
||||
collection_w.search(vectors[:default_nq], default_search_field,
|
||||
default_search_params, default_limit,
|
||||
search_exp,
|
||||
output_fields=output_fields,
|
||||
check_task=CheckTasks.check_search_results,
|
||||
check_items={"nq": default_nq,
|
||||
"ids": insert_ids,
|
||||
"limit": 1,
|
||||
"output_fields": output_fields})
|
||||
# 4. drop collection
|
||||
collection_w.drop()
|
||||
# 5. create the same collection name with same field name but varchar field type
|
||||
int64_field = cf.gen_int64_field(is_primary=True)
|
||||
string_field = cf.gen_string_field(ct.default_float_field_name, nullable=True)
|
||||
json_field = cf.gen_json_field()
|
||||
float_vector_field = cf.gen_float_vec_field()
|
||||
fields = [int64_field, string_field, json_field, float_vector_field]
|
||||
schema = cf.gen_collection_schema(fields)
|
||||
collection_w = self.init_collection_wrap(name=collection_name, schema=schema)
|
||||
int64_values = pd.Series(data=[i for i in range(default_nb)])
|
||||
string_values = pd.Series(data=[str(i) for i in range(default_nb)], dtype="string")
|
||||
json_values = [{"number": i, "string": str(i), "bool": bool(i),
|
||||
"list": [j for j in range(i, i + ct.default_json_list_length)]} for i in range(default_nb)]
|
||||
float_vec_values = cf.gen_vectors(default_nb, default_dim)
|
||||
df = pd.DataFrame({
|
||||
ct.default_int64_field_name: int64_values,
|
||||
ct.default_float_field_name: None,
|
||||
ct.default_json_field_name: json_values,
|
||||
ct.default_float_vec_field_name: float_vec_values
|
||||
})
|
||||
collection_w.insert(df)
|
||||
collection_w.create_index(ct.default_float_vec_field_name, ct.default_flat_index)
|
||||
collection_w.load()
|
||||
collection_w.flush()
|
||||
collection_w.search(vectors[:default_nq], default_search_field,
|
||||
default_search_params, default_limit,
|
||||
search_exp,
|
||||
output_fields=output_fields,
|
||||
check_task=CheckTasks.err_res,
|
||||
check_items={"err_code": 1100,
|
||||
"err_msg": "failed to create query plan: cannot parse expression: float == 0, "
|
||||
"error: comparisons between VarChar and Int64 are not supported: "
|
||||
"invalid parameter"})
|
|
@ -0,0 +1,935 @@
|
|||
import numpy as np
|
||||
from pymilvus.orm.types import CONSISTENCY_STRONG, CONSISTENCY_BOUNDED, CONSISTENCY_SESSION, CONSISTENCY_EVENTUALLY
|
||||
from pymilvus import AnnSearchRequest, RRFRanker, WeightedRanker
|
||||
from pymilvus import (
|
||||
FieldSchema, CollectionSchema, DataType,
|
||||
Collection
|
||||
)
|
||||
from common.constants import *
|
||||
from utils.util_pymilvus import *
|
||||
from common.common_type import CaseLabel, CheckTasks
|
||||
from common import common_type as ct
|
||||
from common import common_func as cf
|
||||
from utils.util_log import test_log as log
|
||||
from base.client_v2_base import TestMilvusClientV2Base
|
||||
from base.client_base import TestcaseBase
|
||||
import random
|
||||
import pytest
|
||||
import pandas as pd
|
||||
from faker import Faker
|
||||
|
||||
Faker.seed(19530)
|
||||
fake_en = Faker("en_US")
|
||||
fake_zh = Faker("zh_CN")
|
||||
|
||||
# patch faker to generate text with specific distribution
|
||||
cf.patch_faker_text(fake_en, cf.en_vocabularies_distribution)
|
||||
cf.patch_faker_text(fake_zh, cf.zh_vocabularies_distribution)
|
||||
|
||||
pd.set_option("expand_frame_repr", False)
|
||||
|
||||
prefix = "search_collection"
|
||||
default_nb = ct.default_nb
|
||||
default_nq = ct.default_nq
|
||||
default_dim = ct.default_dim
|
||||
default_limit = ct.default_limit
|
||||
default_search_exp = "int64 >= 0"
|
||||
default_search_string_exp = "varchar >= \"0\""
|
||||
default_search_mix_exp = "int64 >= 0 && varchar >= \"0\""
|
||||
default_json_search_exp = "json_field[\"number\"] >= 0"
|
||||
perfix_expr = 'varchar like "0%"'
|
||||
default_search_field = ct.default_float_vec_field_name
|
||||
default_search_params = ct.default_search_params
|
||||
default_int64_field_name = ct.default_int64_field_name
|
||||
default_float_field_name = ct.default_float_field_name
|
||||
default_string_field_name = ct.default_string_field_name
|
||||
default_json_field_name = ct.default_json_field_name
|
||||
vectors = [[random.random() for _ in range(default_dim)] for _ in range(default_nq)]
|
||||
nq = 1
|
||||
field_name = default_float_vec_field_name
|
||||
search_param = {"nprobe": 1}
|
||||
entity = gen_entities(1, is_normal=True)
|
||||
entities = gen_entities(default_nb, is_normal=True)
|
||||
raw_vectors, binary_entities = gen_binary_entities(default_nb)
|
||||
default_query, _ = gen_search_vectors_params(field_name, entities, default_top_k, nq)
|
||||
half_nb = ct.default_nb // 2
|
||||
|
||||
default_primary_key_field_name = "id"
|
||||
default_vector_field_name = "vector"
|
||||
default_float_field_name = ct.default_float_field_name
|
||||
default_string_field_name = ct.default_string_field_name
|
||||
|
||||
|
||||
@pytest.mark.xdist_group("TestMilvusClientSearchPagination")
|
||||
class TestMilvusClientSearchPagination(TestMilvusClientV2Base):
|
||||
"""Test search with pagination functionality"""
|
||||
|
||||
def setup_class(self):
|
||||
super().setup_class(self)
|
||||
self.collection_name = cf.gen_unique_str("test_search_pagination")
|
||||
|
||||
@pytest.fixture(scope="class", autouse=True)
|
||||
def prepare_collection(self, request):
|
||||
"""
|
||||
Initialize collection before test class runs
|
||||
"""
|
||||
# Get client connection
|
||||
client = self._client()
|
||||
|
||||
# Create collection
|
||||
self.collection_schema = self.create_schema(client, enable_dynamic_field=False)[0]
|
||||
self.collection_schema.add_field(default_primary_key_field_name, DataType.INT64, is_primary=True, auto_id=False)
|
||||
self.collection_schema.add_field(default_vector_field_name, DataType.FLOAT_VECTOR, dim=default_dim)
|
||||
self.collection_schema.add_field(default_float_field_name, DataType.FLOAT)
|
||||
self.collection_schema.add_field(default_string_field_name, DataType.VARCHAR, max_length=65535)
|
||||
self.create_collection(client, self.collection_name, schema=self.collection_schema)
|
||||
|
||||
# Insert data 5 times with non-duplicated primary keys
|
||||
for j in range(5):
|
||||
rows = [{default_primary_key_field_name: i + j * default_nb,
|
||||
default_vector_field_name: list(cf.gen_vectors(1, default_dim)[0]),
|
||||
default_float_field_name: (i + j * default_nb) * 1.0,
|
||||
default_string_field_name: str(i + j * default_nb)}
|
||||
for i in range(default_nb)]
|
||||
self.insert(client, self.collection_name, rows)
|
||||
self.flush(client, self.collection_name)
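# Five batches of default_nb rows are inserted so the shared collection holds enough entities for the
# deep-pagination tests below (10 pages x 100 results per query).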
|
||||
|
||||
# Create index
|
||||
self.index_params = self.prepare_index_params(client)[0]
|
||||
self.index_params.add_index(field_name=default_vector_field_name,
|
||||
metric_type="COSINE",
|
||||
index_type="IVF_FLAT",
|
||||
params={"nlist": 128})
|
||||
self.create_index(client, self.collection_name, index_params=self.index_params)
|
||||
|
||||
# Load collection
|
||||
self.load_collection(client, self.collection_name)
|
||||
|
||||
def teardown():
|
||||
self.drop_collection(self._client(), self.collection_name)
|
||||
request.addfinalizer(teardown)
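# The class shares one pre-built collection across all of its tests: the class-scoped autouse fixture
# above creates, fills, indexes and loads the collection once, and the finalizer drops it after the
# last test. The @pytest.mark.xdist_group marker keeps every test of the group on the same
# pytest-xdist worker so the shared collection is always visible to them. A minimal sketch of the
# same pattern (illustrative names only):
#
#   @pytest.mark.xdist_group("my_shared_collection_group")
#   class TestWithSharedCollection(TestMilvusClientV2Base):
#       @pytest.fixture(scope="class", autouse=True)
#       def prepare(self, request):
#           client = self._client()
#           name = cf.gen_unique_str("shared")
#           self.create_collection(client, name, default_dim)
#           request.addfinalizer(lambda: self.drop_collection(self._client(), name))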
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L0)
|
||||
def test_milvus_client_search_with_pagination_default(self):
|
||||
"""
|
||||
target: test search with pagination
|
||||
method: 1. connect and create a collection
|
||||
2. search pagination with offset
|
||||
3. search with offset+limit
|
||||
4. compare with the search results whose corresponding ids should be the same
|
||||
expected: search successfully and ids is correct
|
||||
"""
|
||||
client = self._client()
|
||||
# 1. Use the shared collection prepared for this test class
|
||||
collection_name = self.collection_name
|
||||
|
||||
# 2. Search with pagination for 10 pages
|
||||
limit = 100
|
||||
pages = 10
|
||||
vectors_to_search = cf.gen_vectors(default_nq, default_dim)
|
||||
all_pages_results = []
|
||||
for page in range(pages):
|
||||
offset = page * limit
|
||||
search_params = {"metric_type": "COSINE", "params": {"nprobe": 100}, "offset": offset}
|
||||
search_res_with_offset, _ = self.search(
|
||||
client,
|
||||
collection_name,
|
||||
vectors_to_search[:default_nq],
|
||||
anns_field=default_vector_field_name,
|
||||
search_params=search_params,
|
||||
limit=limit,
|
||||
check_task=CheckTasks.check_search_results,
|
||||
check_items={"enable_milvus_client_api": True,
|
||||
"nq": default_nq,
|
||||
"limit": limit
|
||||
}
|
||||
)
|
||||
all_pages_results.append(search_res_with_offset)
|
||||
|
||||
# 3. Search without pagination
|
||||
search_params_full = {"metric_type": "COSINE", "params": {"nprobe": 100}}
|
||||
search_res_full, _ = self.search(
|
||||
client,
|
||||
collection_name,
|
||||
vectors_to_search[:default_nq],
|
||||
anns_field=default_vector_field_name,
|
||||
search_params=search_params_full,
|
||||
limit=limit * pages
|
||||
)
|
||||
|
||||
# 4. Compare results - verify pagination results equal the results in full search with offsets
|
||||
for p in range(pages):
|
||||
page_res = all_pages_results[p]
|
||||
for i in range(default_nq):
|
||||
page_ids = [page_res[i][j].get('id') for j in range(limit)]
|
||||
ids_in_full = [search_res_full[i][p * limit:p * limit + limit][j].get('id') for j in range(limit)]
|
||||
assert page_ids == ids_in_full
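# Pagination check: a search with offset=p*limit and limit=limit must return exactly the ids found at
# positions [p*limit, (p+1)*limit) of a single search whose limit is pages*limit, which is what the
# comparison above verifies page by page.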
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L0)
|
||||
def test_milvus_client_search_with_pagination_default1(self):
|
||||
"""
|
||||
target: test search with pagination
|
||||
method: 1. connect and create a collection
|
||||
2. search pagination with offset
|
||||
3. search with offset+limit
|
||||
4. compare with the search results whose corresponding ids should be the same
|
||||
expected: search successfully and ids is correct
|
||||
"""
|
||||
client = self._client()
|
||||
# 1. Use the shared collection prepared for this test class
|
||||
collection_name = self.collection_name
|
||||
|
||||
# 2. Search with pagination for 10 pages
|
||||
limit = 100
|
||||
pages = 10
|
||||
vectors_to_search = cf.gen_vectors(default_nq, default_dim)
|
||||
all_pages_results = []
|
||||
for page in range(pages):
|
||||
offset = page * limit
|
||||
search_params = {"metric_type": "COSINE", "params": {"nprobe": 100}, "offset": offset}
|
||||
search_res_with_offset, _ = self.search(
|
||||
client,
|
||||
collection_name,
|
||||
vectors_to_search[:default_nq],
|
||||
anns_field=default_vector_field_name,
|
||||
search_params=search_params,
|
||||
limit=limit,
|
||||
check_task=CheckTasks.check_search_results,
|
||||
check_items={"enable_milvus_client_api": True,
|
||||
"nq": default_nq,
|
||||
"limit": limit
|
||||
}
|
||||
)
|
||||
all_pages_results.append(search_res_with_offset)
|
||||
|
||||
# 3. Search without pagination
|
||||
search_params_full = {"metric_type": "COSINE", "params": {"nprobe": 100}}
|
||||
search_res_full, _ = self.search(
|
||||
client,
|
||||
collection_name,
|
||||
vectors_to_search[:default_nq],
|
||||
anns_field=default_vector_field_name,
|
||||
search_params=search_params_full,
|
||||
limit=limit * pages
|
||||
)
|
||||
|
||||
# 4. Compare results - verify pagination results equal the results in full search with offsets
|
||||
for p in range(pages):
|
||||
page_res = all_pages_results[p]
|
||||
for i in range(default_nq):
|
||||
page_ids = [page_res[i][j].get('id') for j in range(limit)]
|
||||
ids_in_full = [search_res_full[i][p * limit:p * limit + limit][j].get('id') for j in range(limit)]
|
||||
assert page_ids == ids_in_full
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L0)
|
||||
def test_milvus_client_search_with_pagination_default2(self):
|
||||
"""
|
||||
target: test search with pagination
|
||||
method: 1. connect and create a collection
|
||||
2. search pagination with offset
|
||||
3. search with offset+limit
|
||||
4. compare with the search results whose corresponding ids should be the same
|
||||
expected: search successfully and ids is correct
|
||||
"""
|
||||
client = self._client()
|
||||
# 1. Use the shared collection prepared for this test class
|
||||
collection_name = self.collection_name
|
||||
|
||||
# 2. Search with pagination for 10 pages
|
||||
limit = 100
|
||||
pages = 10
|
||||
vectors_to_search = cf.gen_vectors(default_nq, default_dim)
|
||||
all_pages_results = []
|
||||
for page in range(pages):
|
||||
offset = page * limit
|
||||
search_params = {"metric_type": "COSINE", "params": {"nprobe": 100}, "offset": offset}
|
||||
search_res_with_offset, _ = self.search(
|
||||
client,
|
||||
collection_name,
|
||||
vectors_to_search[:default_nq],
|
||||
anns_field=default_vector_field_name,
|
||||
search_params=search_params,
|
||||
limit=limit,
|
||||
check_task=CheckTasks.check_search_results,
|
||||
check_items={"enable_milvus_client_api": True,
|
||||
"nq": default_nq,
|
||||
"limit": limit
|
||||
}
|
||||
)
|
||||
all_pages_results.append(search_res_with_offset)
|
||||
|
||||
# 3. Search without pagination
|
||||
search_params_full = {"metric_type": "COSINE", "params": {"nprobe": 100}}
|
||||
search_res_full, _ = self.search(
|
||||
client,
|
||||
collection_name,
|
||||
vectors_to_search[:default_nq],
|
||||
anns_field=default_vector_field_name,
|
||||
search_params=search_params_full,
|
||||
limit=limit * pages
|
||||
)
|
||||
|
||||
# 4. Compare results - verify pagination results equal the results in full search with offsets
|
||||
for p in range(pages):
|
||||
page_res = all_pages_results[p]
|
||||
for i in range(default_nq):
|
||||
page_ids = [page_res[i][j].get('id') for j in range(limit)]
|
||||
ids_in_full = [search_res_full[i][p * limit:p * limit + limit][j].get('id') for j in range(limit)]
|
||||
assert page_ids == ids_in_full
|
||||
|
||||
# @pytest.mark.tags(CaseLabel.L0)
|
||||
# def test_milvus_client_search_with_pagination_default(self):
|
||||
# """
|
||||
# target: test search with pagination
|
||||
# method: 1. connect and create a collection
|
||||
# 2. search pagination with offset
|
||||
# 3. search with offset+limit
|
||||
# 4. compare with the search results whose corresponding ids should be the same
|
||||
# expected: search successfully and ids is correct
|
||||
# """
|
||||
# client = self._client()
|
||||
# # 1. Create collection with schema
|
||||
# collection_name = cf.gen_unique_str("test_search_pagination")
|
||||
# self.create_collection(client, collection_name, default_dim)
|
||||
#
|
||||
# # Insert data 5 times with non-duplicated primary keys
|
||||
# for j in range(5):
|
||||
# rows = [{default_primary_key_field_name: i + j * default_nb,
|
||||
# default_vector_field_name: list(cf.gen_vectors(1, default_dim)[0]),
|
||||
# default_float_field_name: (i + j * default_nb) * 1.0,
|
||||
# default_string_field_name: str(i + j * default_nb)}
|
||||
# for i in range(default_nb)]
|
||||
# self.insert(client, collection_name, rows)
|
||||
# self.flush(client, collection_name)
|
||||
#
|
||||
# # 2. Search with pagination for 10 pages
|
||||
# limit = 100
|
||||
# pages = 10
|
||||
# vectors_to_search = cf.gen_vectors(default_nq, default_dim)
|
||||
# all_pages_results = []
|
||||
# for page in range(pages):
|
||||
# offset = page * limit
|
||||
# search_params = {"metric_type": "COSINE", "params": {"nprobe": 100}, "offset": offset}
|
||||
# search_res_with_offset, _ = self.search(
|
||||
# client,
|
||||
# collection_name,
|
||||
# vectors_to_search[:default_nq],
|
||||
# anns_field=default_vector_field_name,
|
||||
# search_params=search_params,
|
||||
# limit=limit,
|
||||
# check_task=CheckTasks.check_search_results,
|
||||
# check_items={"enable_milvus_client_api": True,
|
||||
# "nq": default_nq,
|
||||
# "limit": limit
|
||||
# }
|
||||
# )
|
||||
# all_pages_results.append(search_res_with_offset)
|
||||
#
|
||||
# # 3. Search without pagination
|
||||
# search_params_full = {"metric_type": "COSINE", "params": {"nprobe": 100}}
|
||||
# search_res_full, _ = self.search(
|
||||
# client,
|
||||
# collection_name,
|
||||
# vectors_to_search[:default_nq],
|
||||
# anns_field=default_vector_field_name,
|
||||
# search_params=search_params_full,
|
||||
# limit=limit * pages
|
||||
# )
|
||||
#
|
||||
# # 4. Compare results - verify pagination results equal the results in full search with offsets
|
||||
# for p in range(pages):
|
||||
# page_res = all_pages_results[p]
|
||||
# for i in range(default_nq):
|
||||
# page_ids = [page_res[i][j].get('id') for j in range(limit)]
|
||||
# ids_in_full = [search_res_full[i][p*limit:p*limit+limit][j].get('id') for j in range(limit)]
|
||||
# assert page_ids == ids_in_full
|
||||
|
||||
|
||||
class TestSearchPagination(TestcaseBase):
|
||||
""" Test case of search pagination """
|
||||
|
||||
@pytest.fixture(scope="function", params=[0, 10, 100])
|
||||
def offset(self, request):
|
||||
yield request.param
|
||||
|
||||
@pytest.fixture(scope="function", params=[False, True])
|
||||
def auto_id(self, request):
|
||||
yield request.param
|
||||
|
||||
@pytest.fixture(scope="function", params=[False, True])
|
||||
def _async(self, request):
|
||||
yield request.param
|
||||
|
||||
@pytest.fixture(scope="function", params=[True, False])
|
||||
def enable_dynamic_field(self, request):
|
||||
yield request.param
|
||||
|
||||
@pytest.fixture(scope="function", params=["FLOAT_VECTOR", "FLOAT16_VECTOR", "BFLOAT16_VECTOR"])
|
||||
def vector_data_type(self, request):
|
||||
yield request.param
|
||||
|
||||
"""
|
||||
******************************************************************
|
||||
# The following are valid base cases
|
||||
******************************************************************
|
||||
"""
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L1)
|
||||
def test_search_string_with_pagination(self, offset, _async):
|
||||
"""
|
||||
target: test search string with pagination
|
||||
method: 1. connect and create a collection
|
||||
2. search pagination with offset
|
||||
3. search with offset+limit
|
||||
4. compare with the search results whose corresponding ids should be the same
|
||||
expected: search successfully and ids is correct
|
||||
"""
|
||||
# 1. create a collection
|
||||
auto_id = True
|
||||
enable_dynamic_field = True
|
||||
collection_w, _, _, insert_ids = \
|
||||
self.init_collection_general(prefix, True, auto_id=auto_id, dim=default_dim,
|
||||
enable_dynamic_field=enable_dynamic_field)[0:4]
|
||||
# 2. search
|
||||
search_param = {"metric_type": "COSINE", "params": {"nprobe": 10}, "offset": offset}
|
||||
vectors = [[random.random() for _ in range(default_dim)] for _ in range(default_nq)]
|
||||
output_fields = [default_string_field_name, default_float_field_name]
|
||||
search_res = collection_w.search(vectors[:default_nq], default_search_field,
|
||||
search_param, default_limit,
|
||||
default_search_string_exp,
|
||||
output_fields=output_fields,
|
||||
_async=_async,
|
||||
check_task=CheckTasks.check_search_results,
|
||||
check_items={"nq": default_nq,
|
||||
"ids": insert_ids,
|
||||
"limit": default_limit,
|
||||
"_async": _async})[0]
|
||||
# 3. search with offset+limit
|
||||
res = collection_w.search(vectors[:default_nq], default_search_field, default_search_params,
|
||||
default_limit + offset, default_search_string_exp, _async=_async)[0]
|
||||
if _async:
|
||||
search_res.done()
|
||||
search_res = search_res.result()
|
||||
res.done()
|
||||
res = res.result()
|
||||
res_distance = res[0].distances[offset:]
|
||||
# assert sorted(search_res[0].distances, key=numpy.float32) == sorted(res_distance, key=numpy.float32)
|
||||
assert set(search_res[0].ids) == set(res[0].ids[offset:])
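# Only ids are compared here; the distance comparison above is left commented out, presumably because
# ties in COSINE distance can reorder equally-distant results between the two searches.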
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L1)
|
||||
def test_search_binary_with_pagination(self, offset):
|
||||
"""
|
||||
target: test search binary with pagination
|
||||
method: 1. connect and create a collection
|
||||
2. search pagination with offset
|
||||
3. search with offset+limit
|
||||
4. compare with the search results whose corresponding ids should be the same
|
||||
expected: search successfully and ids is correct
|
||||
"""
|
||||
# 1. create a collection
|
||||
auto_id = False
|
||||
collection_w, _, _, insert_ids = \
|
||||
self.init_collection_general(
|
||||
prefix, True, is_binary=True, auto_id=auto_id, dim=default_dim)[0:4]
|
||||
# 2. search
|
||||
search_param = {"metric_type": "JACCARD",
|
||||
"params": {"nprobe": 10}, "offset": offset}
|
||||
binary_vectors = cf.gen_binary_vectors(default_nq, default_dim)[1]
|
||||
search_res = collection_w.search(binary_vectors[:default_nq], "binary_vector",
|
||||
search_param, default_limit,
|
||||
check_task=CheckTasks.check_search_results,
|
||||
check_items={"nq": default_nq,
|
||||
"ids": insert_ids,
|
||||
"limit": default_limit})[0]
|
||||
# 3. search with offset+limit
|
||||
search_binary_param = {
|
||||
"metric_type": "JACCARD", "params": {"nprobe": 10}}
|
||||
res = collection_w.search(binary_vectors[:default_nq], "binary_vector", search_binary_param,
|
||||
default_limit + offset)[0]
|
||||
|
||||
assert len(search_res[0].ids) == len(res[0].ids[offset:])
|
||||
assert sorted(search_res[0].distances, key=np.float32) == sorted(
|
||||
res[0].distances[offset:], key=np.float32)
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L1)
|
||||
def test_search_all_vector_type_with_pagination(self, vector_data_type):
|
||||
"""
|
||||
target: test search with pagination using different vector datatype
|
||||
method: 1. connect and create a collection
|
||||
2. search pagination with offset
|
||||
3. search with offset+limit
|
||||
4. compare with the search results whose corresponding ids should be the same
|
||||
expected: search successfully and ids is correct
|
||||
"""
|
||||
# 1. create a collection
|
||||
auto_id = False
|
||||
enable_dynamic_field = True
|
||||
offset = 100
|
||||
limit = 20
|
||||
collection_w = self.init_collection_general(prefix, True, auto_id=auto_id, dim=default_dim,
|
||||
enable_dynamic_field=enable_dynamic_field,
|
||||
vector_data_type=vector_data_type)[0]
|
||||
# 2. search pagination with offset
|
||||
search_param = {"metric_type": "COSINE", "params": {"nprobe": 10}, "offset": offset}
|
||||
vectors = cf.gen_vectors_based_on_vector_type(default_nq, default_dim, vector_data_type)
|
||||
search_res = collection_w.search(vectors[:default_nq], default_search_field,
|
||||
search_param, limit,
|
||||
default_search_exp,
|
||||
check_task=CheckTasks.check_search_results,
|
||||
check_items={"nq": default_nq,
|
||||
"limit": limit})[0]
|
||||
# 3. search with offset+limit
|
||||
res = collection_w.search(vectors[:default_nq], default_search_field, default_search_params,
|
||||
limit + offset, default_search_exp)[0]
|
||||
res_distance = res[0].distances[offset:]
|
||||
# assert sorted(search_res[0].distances, key=numpy.float32) == sorted(res_distance, key=numpy.float32)
|
||||
assert set(search_res[0].ids) == set(res[0].ids[offset:])
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L2)
|
||||
@pytest.mark.parametrize("limit", [100, 3000, 10000])
|
||||
def test_search_with_pagination_topK(self, limit, _async):
|
||||
"""
|
||||
target: test search with pagination limit + offset = topK
|
||||
method: 1. connect and create a collection
|
||||
2. search pagination with offset
|
||||
3. search with topK
|
||||
4. compare with the search results whose corresponding ids should be the same
|
||||
expected: search successfully and ids is correct
|
||||
"""
|
||||
# 1. create a collection
|
||||
topK = 16384
|
||||
auto_id = True
|
||||
offset = topK - limit
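# offset + limit may not exceed the maximum topK (16384 here), so the offset is chosen as
# topK - limit to exercise the boundary case.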
|
||||
collection_w = self.init_collection_general(
|
||||
prefix, True, nb=20000, auto_id=auto_id, dim=default_dim)[0]
|
||||
# 2. search
|
||||
search_param = {"metric_type": "COSINE",
|
||||
"params": {"nprobe": 10}, "offset": offset}
|
||||
vectors = [[random.random() for _ in range(default_dim)]
|
||||
for _ in range(default_nq)]
|
||||
search_res = collection_w.search(vectors[:default_nq], default_search_field,
|
||||
search_param, limit,
|
||||
default_search_exp, _async=_async,
|
||||
check_task=CheckTasks.check_search_results,
|
||||
check_items={"nq": default_nq,
|
||||
"limit": limit,
|
||||
"_async": _async})[0]
|
||||
# 3. search with topK
|
||||
res = collection_w.search(vectors[:default_nq], default_search_field, default_search_params,
|
||||
topK, default_search_exp, _async=_async)[0]
|
||||
if _async:
|
||||
search_res.done()
|
||||
search_res = search_res.result()
|
||||
res.done()
|
||||
res = res.result()
|
||||
res_distance = res[0].distances[offset:]
|
||||
# assert sorted(search_res[0].distances, key=numpy.float32) == sorted(res_distance, key=numpy.float32)
|
||||
assert set(search_res[0].ids) == set(res[0].ids[offset:])
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L2)
|
||||
def test_search_pagination_with_expression(self, offset):
|
||||
"""
|
||||
target: test search pagination with expression
|
||||
method: create connection, collection, insert and search with expression
|
||||
expected: search successfully
|
||||
"""
|
||||
# 1. create a collection
|
||||
nb = 2500
|
||||
dim = 38
|
||||
enable_dynamic_field = False
|
||||
collection_w, _vectors, _, insert_ids = \
|
||||
self.init_collection_general(prefix, True, nb=nb, dim=dim,
|
||||
enable_dynamic_field=enable_dynamic_field)[0:4]
|
||||
collection_w.load()
|
||||
# filter result with expression in collection
|
||||
_vectors = _vectors[0]
|
||||
for _async in [False, True]:
|
||||
for expressions in cf.gen_normal_expressions_and_templates():
|
||||
log.debug(f"search with expression: {expressions} with _async: {_async}")
|
||||
expr = expressions[0].replace("&&", "and").replace("||", "or")
|
||||
filter_ids = []
|
||||
for i, _id in enumerate(insert_ids):
|
||||
if enable_dynamic_field:
|
||||
int64 = _vectors[i][ct.default_int64_field_name]
|
||||
float = _vectors[i][ct.default_float_field_name]
|
||||
else:
|
||||
int64 = _vectors.int64[i]
|
||||
float = _vectors.float[i]
|
||||
if not expr or eval(expr):
|
||||
filter_ids.append(_id)
|
||||
# 2. search
|
||||
limit = min(default_limit, len(filter_ids))
|
||||
if offset >= len(filter_ids):
|
||||
limit = 0
|
||||
elif len(filter_ids) - offset < default_limit:
|
||||
limit = len(filter_ids) - offset
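# The expected number of hits is bounded by how many entities satisfy the expression once the offset is
# applied: 0 if the offset skips past all matches, the remaining matches if fewer than default_limit
# are left, and default_limit otherwise.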
|
||||
search_param = {"metric_type": "COSINE",
|
||||
"params": {"nprobe": 10}, "offset": offset}
|
||||
vectors = [[random.random() for _ in range(dim)]
|
||||
for _ in range(default_nq)]
|
||||
search_res, _ = collection_w.search(vectors[:default_nq], default_search_field,
|
||||
search_param, default_limit,
|
||||
expr=expr,
|
||||
_async=_async,
|
||||
check_task=CheckTasks.check_search_results,
|
||||
check_items={"nq": default_nq,
|
||||
"ids": insert_ids,
|
||||
"limit": limit,
|
||||
"_async": _async})
|
||||
# 3. search with offset+limit
|
||||
res = collection_w.search(vectors[:default_nq], default_search_field, default_search_params,
|
||||
default_limit + offset,
|
||||
expr=expr, _async=_async)[0]
|
||||
if _async:
|
||||
res.done()
|
||||
res = res.result()
|
||||
search_res.done()
|
||||
search_res = search_res.result()
|
||||
filter_ids_set = set(filter_ids)
|
||||
for hits in search_res:
|
||||
ids = hits.ids
|
||||
assert set(ids).issubset(filter_ids_set)
|
||||
assert set(search_res[0].ids) == set(res[0].ids[offset:])
|
||||
|
||||
# 4. search again with expression template
|
||||
expr = cf.get_expr_from_template(expressions[1]).replace("&&", "and").replace("||", "or")
|
||||
expr_params = cf.get_expr_params_from_template(expressions[1])
|
||||
search_res, _ = collection_w.search(vectors[:default_nq], default_search_field,
|
||||
search_param, default_limit,
|
||||
expr=expr, expr_params=expr_params,
|
||||
_async=_async,
|
||||
check_task=CheckTasks.check_search_results,
|
||||
check_items={"nq": default_nq,
|
||||
"ids": insert_ids,
|
||||
"limit": limit,
|
||||
"_async": _async})
|
||||
# 5. search with offset+limit using the expression template
|
||||
res = collection_w.search(vectors[:default_nq], default_search_field, default_search_params,
|
||||
default_limit + offset,
|
||||
expr=expr, expr_params=expr_params, _async=_async)[0]
|
||||
if _async:
|
||||
res.done()
|
||||
res = res.result()
|
||||
search_res.done()
|
||||
search_res = search_res.result()
|
||||
filter_ids_set = set(filter_ids)
|
||||
for hits in search_res:
|
||||
ids = hits.ids
|
||||
assert set(ids).issubset(filter_ids_set)
|
||||
assert set(search_res[0].ids) == set(res[0].ids[offset:])
|
||||
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L2)
|
||||
def test_search_pagination_with_index_partition(self, offset, _async):
|
||||
"""
|
||||
target: test search pagination with index and partition
|
||||
method: create connection, collection, insert data, create index and search
|
||||
expected: searched successfully
|
||||
"""
|
||||
# 1. initialize with data
|
||||
auto_id = False
|
||||
collection_w, _, _, insert_ids = self.init_collection_general(prefix, True,
|
||||
partition_num=1,
|
||||
auto_id=auto_id,
|
||||
is_index=False)[0:4]
|
||||
vectors = [[random.random() for _ in range(default_dim)]
|
||||
for _ in range(default_nq)]
|
||||
# 2. create index
|
||||
default_index = {"index_type": "IVF_FLAT",
|
||||
"params": {"nlist": 128}, "metric_type": "L2"}
|
||||
collection_w.create_index("float_vector", default_index)
|
||||
collection_w.load()
|
||||
# 3. search through partitions
|
||||
par = collection_w.partitions
|
||||
limit = 100
|
||||
search_params = {"metric_type": "L2",
|
||||
"params": {"nprobe": 10}, "offset": offset}
|
||||
search_res = collection_w.search(vectors[:default_nq], default_search_field,
|
||||
search_params, limit, default_search_exp,
|
||||
[par[0].name, par[1].name], _async=_async,
|
||||
check_task=CheckTasks.check_search_results,
|
||||
check_items={"nq": default_nq,
|
||||
"ids": insert_ids,
|
||||
"limit": limit,
|
||||
"_async": _async})[0]
|
||||
# 4. search through partitions with offset+limit
|
||||
search_params = {"metric_type": "L2"}
|
||||
res = collection_w.search(vectors[:default_nq], default_search_field, search_params,
|
||||
limit + offset, default_search_exp,
|
||||
[par[0].name, par[1].name], _async=_async)[0]
|
||||
if _async:
|
||||
search_res.done()
|
||||
search_res = search_res.result()
|
||||
res.done()
|
||||
res = res.result()
|
||||
res_distance = res[0].distances[offset:]
|
||||
# assert cf.sort_search_distance(search_res[0].distances) == cf.sort_search_distance(res_distance)
|
||||
assert set(search_res[0].ids) == set(res[0].ids[offset:])
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L2)
|
||||
def test_search_pagination_with_inserted_data(self, offset, _async):
|
||||
"""
|
||||
target: test search pagination with inserted data
|
||||
method: create connection, collection, insert data and search
|
||||
check the results by searching with limit+offset
|
||||
expected: searched successfully
|
||||
"""
|
||||
# 1. create collection
|
||||
collection_w = self.init_collection_general(
|
||||
prefix, False, dim=default_dim)[0]
|
||||
# 2. insert data
|
||||
data = cf.gen_default_dataframe_data(dim=default_dim)
|
||||
collection_w.insert(data)
|
||||
collection_w.load()
|
||||
# 3. search
|
||||
search_params = {"offset": offset}
|
||||
search_res = collection_w.search(vectors[:default_nq], default_search_field,
|
||||
search_params, default_limit,
|
||||
default_search_exp, _async=_async,
|
||||
check_task=CheckTasks.check_search_results,
|
||||
check_items={"nq": default_nq,
|
||||
"limit": default_limit,
|
||||
"_async": _async})[0]
|
||||
# 4. search with offset+limit
|
||||
search_params = {}
|
||||
res = collection_w.search(vectors[:default_nq], default_search_field, search_params,
|
||||
default_limit + offset, default_search_exp, _async=_async)[0]
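# this second search has no offset, so its first (default_limit + offset) hits contain the paginated page above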
|
||||
if _async:
|
||||
search_res.done()
|
||||
search_res = search_res.result()
|
||||
res.done()
|
||||
res = res.result()
|
||||
res_distance = res[0].distances[offset:]
|
||||
assert sorted(search_res[0].distances) == sorted(res_distance)
|
||||
assert set(search_res[0].ids) == set(res[0].ids[offset:])
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L2)
|
||||
def test_search_pagination_empty(self, offset, _async):
|
||||
"""
|
||||
target: test search pagination empty
|
||||
method: connect, create collection, insert data and search with an empty list of query vectors
|
||||
expected: search successfully
|
||||
"""
|
||||
# 1. initialize with data
|
||||
auto_id = False
|
||||
collection_w = self.init_collection_general(
|
||||
prefix, True, auto_id=auto_id, dim=default_dim)[0]
|
||||
# 2. search with an empty list of query vectors
|
||||
search_param = {"metric_type": "COSINE",
|
||||
"params": {"nprobe": 10}, "offset": offset}
|
||||
search_res = collection_w.search([], default_search_field, search_param,
|
||||
default_limit, default_search_exp, _async=_async,
|
||||
check_task=CheckTasks.check_search_results,
|
||||
check_items={"nq": 0,
|
||||
"_async": _async})[0]
|
||||
if _async:
|
||||
search_res.done()
|
||||
search_res = search_res.result()
|
||||
assert len(search_res) == 0
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L2)
|
||||
@pytest.mark.parametrize("offset", [3000, 5000])
|
||||
def test_search_pagination_with_offset_over_num_entities(self, offset):
|
||||
"""
|
||||
target: test search pagination with offset over num_entities
|
||||
method: create connection, collection, insert 3000 entities and search with offset >= 3000
|
||||
expected: return an empty list
|
||||
"""
|
||||
# 1. initialize
|
||||
collection_w = self.init_collection_general(
|
||||
prefix, True, dim=default_dim)[0]
|
||||
# 2. search
|
||||
search_param = {"metric_type": "COSINE",
|
||||
"params": {"nprobe": 10}, "offset": offset}
|
||||
vectors = [[random.random() for _ in range(default_dim)]
|
||||
for _ in range(default_nq)]
|
||||
res = collection_w.search(vectors[:default_nq], default_search_field,
|
||||
search_param, default_limit,
|
||||
default_search_exp,
|
||||
check_task=CheckTasks.check_search_results,
|
||||
check_items={"nq": default_nq,
|
||||
"limit": 0})[0]
|
||||
assert res[0].ids == []
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L2)
|
||||
@pytest.mark.parametrize("index", ct.all_index_types[:7])
|
||||
def test_search_pagination_after_different_index(self, index, offset, _async):
|
||||
"""
|
||||
target: test search pagination after different index
|
||||
method: test search pagination after different index and corresponding search params
|
||||
expected: search successfully
|
||||
"""
|
||||
# 1. initialize with data
|
||||
dim = 128
|
||||
auto_id = True
|
||||
collection_w, _, _, insert_ids, time_stamp = self.init_collection_general(prefix, True, 1000,
|
||||
partition_num=1,
|
||||
auto_id=auto_id,
|
||||
dim=dim, is_index=False)[0:5]
|
||||
# 2. create index and load
|
||||
params = cf.get_index_params_params(index)
|
||||
default_index = {"index_type": index, "params": params, "metric_type": "L2"}
|
||||
collection_w.create_index("float_vector", default_index)
|
||||
collection_w.load()
|
||||
# 3. search
|
||||
search_params = cf.gen_search_param(index)
|
||||
vectors = [[random.random() for _ in range(dim)] for _ in range(default_nq)]
|
||||
for search_param in search_params:
|
||||
res = collection_w.search(vectors[:default_nq], default_search_field, search_param,
|
||||
default_limit + offset, default_search_exp, _async=_async)[0]
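# baseline: top (default_limit + offset) hits without pagination, compared against the paginated result below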
|
||||
search_param["offset"] = offset
|
||||
log.info("Searching with search params: {}".format(search_param))
|
||||
search_res = collection_w.search(vectors[:default_nq], default_search_field,
|
||||
search_param, default_limit,
|
||||
default_search_exp, _async=_async,
|
||||
check_task=CheckTasks.check_search_results,
|
||||
check_items={"nq": default_nq,
|
||||
"ids": insert_ids,
|
||||
"limit": default_limit,
|
||||
"_async": _async})[0]
|
||||
if _async:
|
||||
search_res.done()
|
||||
search_res = search_res.result()
|
||||
res.done()
|
||||
res = res.result()
|
||||
res_distance = res[0].distances[offset:]
|
||||
# assert sorted(search_res[0].distances, key=numpy.float32) == sorted(res_distance, key=numpy.float32)
|
||||
assert set(search_res[0].ids) == set(res[0].ids[offset:])
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L2)
|
||||
@pytest.mark.parametrize("offset", [100, default_nb // 2])
|
||||
def test_search_offset_different_position(self, offset):
|
||||
"""
|
||||
target: test search pagination with offset in different position
|
||||
method: create connection, collection, insert entities and search with offset
|
||||
expected: search successfully
|
||||
"""
|
||||
# 1. initialize
|
||||
collection_w = self.init_collection_general(prefix, True)[0]
|
||||
# 2. search with offset in params
|
||||
search_params = {"metric_type": "COSINE",
|
||||
"params": {"nprobe": 10}, "offset": offset}
|
||||
res1 = collection_w.search(vectors[:default_nq], default_search_field,
|
||||
search_params, default_limit)[0]
|
||||
|
||||
# 3. search with offset outside params
|
||||
res2 = collection_w.search(vectors[:default_nq], default_search_field, default_search_params,
|
||||
default_limit, offset=offset)[0]
|
||||
assert res1[0].ids == res2[0].ids
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L2)
|
||||
@pytest.mark.parametrize("offset", [1, 5, 20])
|
||||
def test_search_sparse_with_pagination(self, offset):
|
||||
"""
|
||||
target: test search sparse with pagination
|
||||
method: 1. connect and create a collection
|
||||
2. search pagination with offset
|
||||
3. search with offset+limit
|
||||
4. compare with the search results whose corresponding ids should be the same
|
||||
expected: search successfully and ids are correct
|
||||
"""
|
||||
# 1. create a collection
|
||||
auto_id = False
|
||||
collection_w, _, _, insert_ids = \
|
||||
self.init_collection_general(
|
||||
prefix, True, auto_id=auto_id, vector_data_type=ct.sparse_vector)[0:4]
|
||||
# 2. search pagination with offset
|
||||
search_param = {"metric_type": "IP", "params": {"drop_ratio_search": "0.2"}, "offset": offset}
|
||||
search_vectors = cf.gen_default_list_sparse_data()[-1][-2:]
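# use the last two sparse vectors of the generated data as query vectors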
|
||||
search_res = collection_w.search(search_vectors, ct.default_sparse_vec_field_name,
|
||||
search_param, default_limit)[0]
|
||||
# 3. search with offset+limit
|
||||
_search_param = {"metric_type": "IP", "params": {"drop_ratio_search": "0.2"}}
|
||||
res = collection_w.search(search_vectors[:default_nq], ct.default_sparse_vec_field_name, _search_param,
|
||||
default_limit + offset)[0]
|
||||
assert len(search_res[0].ids) == len(res[0].ids[offset:])
|
||||
assert sorted(search_res[0].distances, key=numpy.float32) == sorted(
|
||||
res[0].distances[offset:], key=numpy.float32)
|
||||
|
||||
|
||||
class TestSearchPaginationInvalid(TestMilvusClientV2Base):
|
||||
""" Test case of search pagination """
|
||||
"""
|
||||
******************************************************************
|
||||
# The following are invalid cases
|
||||
******************************************************************
|
||||
"""
|
||||
@pytest.mark.tags(CaseLabel.L1)
|
||||
def test_search_pagination_with_invalid_offset_type(self):
|
||||
"""
|
||||
target: test search pagination with invalid offset type
|
||||
method: create connection, collection, insert and search with invalid offset type
|
||||
expected: raise exception
|
||||
"""
|
||||
client = self._client()
|
||||
|
||||
# 1. Create collection with schema
|
||||
collection_name = cf.gen_unique_str("test_search_pagination")
|
||||
self.create_collection(client, collection_name, default_dim)
|
||||
|
||||
# Insert data
|
||||
rows = [{default_primary_key_field_name: i, default_vector_field_name: list(cf.gen_vectors(1, default_dim)[0]),
|
||||
default_float_field_name: i * 1.0, default_string_field_name: str(i)} for i in range(default_nb)]
|
||||
self.insert(client, collection_name, rows)
|
||||
self.flush(client, collection_name)
|
||||
|
||||
# Search with invalid offset types
|
||||
vectors_to_search = cf.gen_vectors(default_nq, default_dim)
|
||||
invalid_offsets = [" ", [1, 2], {1}, "12 s"]
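# none of these values is an int, so every search below is expected to fail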
|
||||
|
||||
for offset in invalid_offsets:
|
||||
log.debug(f"assert search error if offset={offset}")
|
||||
search_params = {"metric_type": "COSINE", "params": {"nprobe": 10}, "offset": offset}
|
||||
self.search(
|
||||
client,
|
||||
collection_name,
|
||||
vectors_to_search[:default_nq],
|
||||
anns_field=default_vector_field_name,
|
||||
search_params=search_params,
|
||||
limit=default_limit,
|
||||
check_task=CheckTasks.err_res,
|
||||
check_items={
|
||||
"err_code": 1,
|
||||
"err_msg": "wrong type for offset, expect int"
|
||||
}
|
||||
)
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L1)
|
||||
def test_search_pagination_with_invalid_offset_value(self):
|
||||
"""
|
||||
target: test search pagination with invalid offset value
|
||||
method: create connection, collection, insert and search with invalid offset value
|
||||
expected: raise exception
|
||||
"""
|
||||
client = self._client()
|
||||
|
||||
# 1. Create collection with schema
|
||||
collection_name = cf.gen_unique_str("test_search_pagination")
|
||||
self.create_collection(client, collection_name, default_dim)
|
||||
|
||||
# Insert data
|
||||
rows = [{default_primary_key_field_name: i, default_vector_field_name: list(cf.gen_vectors(1, default_dim)[0]),
|
||||
default_float_field_name: i * 1.0, default_string_field_name: str(i)} for i in range(default_nb)]
|
||||
self.insert(client, collection_name, rows)
|
||||
self.flush(client, collection_name)
|
||||
|
||||
# Search with invalid offset values
|
||||
vectors_to_search = cf.gen_vectors(default_nq, default_dim)
|
||||
invalid_offsets = [-1, 16385]
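# both values fall outside the accepted offset range [1, 16384] (see err_msg below)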
|
||||
|
||||
for offset in invalid_offsets:
|
||||
log.debug(f"assert search error if offset={offset}")
|
||||
search_params = {"metric_type": "COSINE", "params": {"nprobe": 10}, "offset": offset}
|
||||
self.search(
|
||||
client,
|
||||
collection_name,
|
||||
vectors_to_search[:default_nq],
|
||||
anns_field=default_vector_field_name,
|
||||
search_params=search_params,
|
||||
limit=default_limit,
|
||||
check_task=CheckTasks.err_res,
|
||||
check_items={
|
||||
"err_code": 1,
|
||||
"err_msg": f"offset [{offset}] is invalid, it should be in range [1, 16384]"
|
||||
}
|
||||
)
|
|
@ -0,0 +1,729 @@
|
|||
import numpy as np
|
||||
from pymilvus.orm.types import CONSISTENCY_STRONG, CONSISTENCY_BOUNDED, CONSISTENCY_SESSION, CONSISTENCY_EVENTUALLY
|
||||
from pymilvus import AnnSearchRequest, RRFRanker, WeightedRanker
|
||||
from pymilvus import (
|
||||
FieldSchema, CollectionSchema, DataType,
|
||||
Collection
|
||||
)
|
||||
from common.constants import *
|
||||
from utils.util_pymilvus import *
|
||||
from common.common_type import CaseLabel, CheckTasks
|
||||
from common import common_type as ct
|
||||
from common import common_func as cf
|
||||
from utils.util_log import test_log as log
|
||||
from base.client_base import TestcaseBase
|
||||
import heapq
|
||||
from time import sleep
|
||||
from decimal import Decimal, getcontext
|
||||
import decimal
|
||||
import multiprocessing
|
||||
import numbers
|
||||
import random
|
||||
import math
|
||||
import numpy
|
||||
import threading
|
||||
import pytest
|
||||
import pandas as pd
|
||||
from faker import Faker
|
||||
|
||||
Faker.seed(19530)
|
||||
fake_en = Faker("en_US")
|
||||
fake_zh = Faker("zh_CN")
|
||||
|
||||
# patch faker to generate text with specific distribution
|
||||
cf.patch_faker_text(fake_en, cf.en_vocabularies_distribution)
|
||||
cf.patch_faker_text(fake_zh, cf.zh_vocabularies_distribution)
|
||||
|
||||
pd.set_option("expand_frame_repr", False)
|
||||
|
||||
prefix = "search_collection"
|
||||
search_num = 10
|
||||
max_dim = ct.max_dim
|
||||
min_dim = ct.min_dim
|
||||
epsilon = ct.epsilon
|
||||
hybrid_search_epsilon = 0.01
|
||||
gracefulTime = ct.gracefulTime
|
||||
default_nb = ct.default_nb
|
||||
default_nb_medium = ct.default_nb_medium
|
||||
default_nq = ct.default_nq
|
||||
default_dim = ct.default_dim
|
||||
default_limit = ct.default_limit
|
||||
max_limit = ct.max_limit
|
||||
default_search_exp = "int64 >= 0"
|
||||
default_search_string_exp = "varchar >= \"0\""
|
||||
default_search_mix_exp = "int64 >= 0 && varchar >= \"0\""
|
||||
default_invaild_string_exp = "varchar >= 0"
|
||||
default_json_search_exp = "json_field[\"number\"] >= 0"
|
||||
perfix_expr = 'varchar like "0%"'
|
||||
default_search_field = ct.default_float_vec_field_name
|
||||
default_search_params = ct.default_search_params
|
||||
default_int64_field_name = ct.default_int64_field_name
|
||||
default_float_field_name = ct.default_float_field_name
|
||||
default_bool_field_name = ct.default_bool_field_name
|
||||
default_string_field_name = ct.default_string_field_name
|
||||
default_json_field_name = ct.default_json_field_name
|
||||
default_index_params = ct.default_index
|
||||
vectors = [[random.random() for _ in range(default_dim)] for _ in range(default_nq)]
|
||||
range_search_supported_indexes = ct.all_index_types[:7]
|
||||
uid = "test_search"
|
||||
nq = 1
|
||||
epsilon = 0.001
|
||||
field_name = default_float_vec_field_name
|
||||
binary_field_name = default_binary_vec_field_name
|
||||
search_param = {"nprobe": 1}
|
||||
entity = gen_entities(1, is_normal=True)
|
||||
entities = gen_entities(default_nb, is_normal=True)
|
||||
raw_vectors, binary_entities = gen_binary_entities(default_nb)
|
||||
default_query, _ = gen_search_vectors_params(field_name, entities, default_top_k, nq)
|
||||
index_name1 = cf.gen_unique_str("float")
|
||||
index_name2 = cf.gen_unique_str("varhar")
|
||||
half_nb = ct.default_nb // 2
|
||||
max_hybrid_search_req_num = ct.max_hybrid_search_req_num
|
||||
|
||||
|
||||
class TestSearchString(TestcaseBase):
|
||||
"""
|
||||
******************************************************************
|
||||
The following cases are used to test search about string
|
||||
******************************************************************
|
||||
"""
|
||||
|
||||
@pytest.fixture(scope="function",
|
||||
params=[default_nb, default_nb_medium])
|
||||
def nb(self, request):
|
||||
yield request.param
|
||||
|
||||
@pytest.fixture(scope="function", params=[2, 500])
|
||||
def nq(self, request):
|
||||
yield request.param
|
||||
|
||||
@pytest.fixture(scope="function", params=[32, 128])
|
||||
def dim(self, request):
|
||||
yield request.param
|
||||
|
||||
@pytest.fixture(scope="function", params=[False, True])
|
||||
def auto_id(self, request):
|
||||
yield request.param
|
||||
|
||||
@pytest.fixture(scope="function", params=[False, True])
|
||||
def _async(self, request):
|
||||
yield request.param
|
||||
|
||||
@pytest.fixture(scope="function", params=[True, False])
|
||||
def enable_dynamic_field(self, request):
|
||||
yield request.param
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L2)
|
||||
def test_search_string_field_not_primary(self, _async):
|
||||
"""
|
||||
target: test search with string expr and string field is not primary
|
||||
method: create collection and insert data
|
||||
create index and collection load
|
||||
collection search uses string expr in string field, string field is not primary
|
||||
expected: Search successfully
|
||||
"""
|
||||
# 1. initialize with data
|
||||
auto_id = True
|
||||
enable_dynamic_field = False
|
||||
collection_w, insert_data, _, insert_ids = \
|
||||
self.init_collection_general(prefix, True, auto_id=auto_id, dim=default_dim, nb=1000,
|
||||
enable_dynamic_field=enable_dynamic_field, language="Chinese")[0:4]
|
||||
search_str = insert_data[0][default_string_field_name][1]
|
||||
search_exp = f"{default_string_field_name} == '{search_str}'"
|
||||
# 2. search
|
||||
log.info("test_search_string_field_not_primary: searching collection %s" % collection_w.name)
|
||||
log.info("search expr: %s" % search_exp)
|
||||
output_fields = [default_string_field_name, default_float_field_name]
|
||||
res, _ = collection_w.search(vectors[:default_nq], default_search_field,
|
||||
default_search_params, default_limit, search_exp,
|
||||
output_fields=output_fields,
|
||||
_async=_async,
|
||||
check_task=CheckTasks.check_search_results,
|
||||
check_items={"nq": default_nq,
|
||||
"ids": insert_ids,
|
||||
"limit": 1,
|
||||
"_async": _async})
|
||||
if _async:
|
||||
res.done()
|
||||
res = res.result()
|
||||
assert res[0][0].entity.varchar == search_str
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L2)
|
||||
def test_search_string_field_is_primary_true(self, _async):
|
||||
"""
|
||||
target: test search with string expr and string field is primary
|
||||
method: create collection and insert data
|
||||
create index and collection load
|
||||
collection search uses string expr in string field, string field is primary
|
||||
expected: Search successfully
|
||||
"""
|
||||
# 1. initialize with data
|
||||
dim = 64
|
||||
enable_dynamic_field = True
|
||||
collection_w, insert_data, _, insert_ids = \
|
||||
self.init_collection_general(prefix, True, dim=dim, primary_field=ct.default_string_field_name,
|
||||
enable_dynamic_field=enable_dynamic_field, language="English", nb=1000)[0:4]
|
||||
search_str = insert_data[0][1][default_string_field_name]
|
||||
search_exp = f"{default_string_field_name} == '{search_str}'"
|
||||
# 2. search
|
||||
log.info("test_search_string_field_is_primary_true: searching collection %s" % collection_w.name)
|
||||
log.info("search expr: %s" % search_exp)
|
||||
vectors = [[random.random() for _ in range(dim)] for _ in range(default_nq)]
|
||||
output_fields = [default_string_field_name, default_float_field_name]
|
||||
res, _ = collection_w.search(vectors[:default_nq], default_search_field,
|
||||
default_search_params, default_limit, search_exp,
|
||||
output_fields=output_fields,
|
||||
_async=_async,
|
||||
check_task=CheckTasks.check_search_results,
|
||||
check_items={"nq": default_nq,
|
||||
"ids": insert_ids,
|
||||
"limit": 1,
|
||||
"_async": _async})
|
||||
if _async:
|
||||
res.done()
|
||||
res = res.result()
|
||||
assert res[0][0].entity.varchar == search_str
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L2)
|
||||
def test_search_string_field_is_primary_true_multi_vector_fields(self, _async):
|
||||
"""
|
||||
target: test search with string expr and string field is primary
|
||||
method: create collection and insert data
|
||||
create index and collection load
|
||||
collection search uses string expr in string field, string field is primary
|
||||
expected: Search successfully
|
||||
"""
|
||||
# 1. initialize with data
|
||||
dim = 64
|
||||
enable_dynamic_field = False
|
||||
multiple_dim_array = [dim, dim]
|
||||
collection_w, _, _, insert_ids = \
|
||||
self.init_collection_general(prefix, True, dim=dim, primary_field=ct.default_string_field_name,
|
||||
enable_dynamic_field=enable_dynamic_field,
|
||||
multiple_dim_array=multiple_dim_array, language="German")[0:4]
|
||||
# 2. search
|
||||
log.info("test_search_string_field_is_primary_true: searching collection %s" %
|
||||
collection_w.name)
|
||||
vectors = [[random.random() for _ in range(dim)] for _ in range(default_nq)]
|
||||
output_fields = [default_string_field_name, default_float_field_name]
|
||||
vector_list = cf.extract_vector_field_name_list(collection_w)
|
||||
for search_field in vector_list:
|
||||
collection_w.search(vectors[:default_nq], search_field,
|
||||
default_search_params, default_limit,
|
||||
default_search_string_exp,
|
||||
output_fields=output_fields,
|
||||
_async=_async,
|
||||
check_task=CheckTasks.check_search_results,
|
||||
check_items={"nq": default_nq,
|
||||
"ids": insert_ids,
|
||||
"limit": default_limit,
|
||||
"_async": _async})
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L2)
|
||||
def test_range_search_string_field_is_primary_true(self, _async):
|
||||
"""
|
||||
target: test range search with string expr and string field is primary
|
||||
method: create collection and insert data
|
||||
create index and collection load
|
||||
collection search uses string expr in string field, string field is primary
|
||||
expected: Search successfully
|
||||
"""
|
||||
# 1. initialize with data
|
||||
dim = 64
|
||||
enable_dynamic_field = True
|
||||
multiple_dim_array = [dim, dim]
|
||||
collection_w, _, _, insert_ids = \
|
||||
self.init_collection_general(prefix, True, dim=dim, primary_field=ct.default_string_field_name,
|
||||
enable_dynamic_field=enable_dynamic_field, is_index=False,
|
||||
multiple_dim_array=multiple_dim_array)[0:4]
|
||||
vector_list = cf.extract_vector_field_name_list(collection_w)
|
||||
collection_w.create_index(field_name, {"metric_type": "L2"})
|
||||
for vector_field_name in vector_list:
|
||||
collection_w.create_index(vector_field_name, {"metric_type": "L2"})
|
||||
collection_w.load()
|
||||
# 2. search
|
||||
log.info("test_search_string_field_is_primary_true: searching collection %s" %
|
||||
collection_w.name)
|
||||
range_search_params = {"metric_type": "L2",
|
||||
"params": {"radius": 1000, "range_filter": 0}}
|
||||
vectors = [[random.random() for _ in range(dim)]
|
||||
for _ in range(default_nq)]
|
||||
output_fields = [default_string_field_name, default_float_field_name]
|
||||
for search_field in vector_list:
|
||||
collection_w.search(vectors[:default_nq], search_field,
|
||||
range_search_params, default_limit,
|
||||
default_search_string_exp,
|
||||
output_fields=output_fields,
|
||||
_async=_async,
|
||||
check_task=CheckTasks.check_search_results,
|
||||
check_items={"nq": default_nq,
|
||||
"ids": insert_ids,
|
||||
"limit": default_limit,
|
||||
"_async": _async})
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L2)
|
||||
def test_search_string_mix_expr(self, _async):
|
||||
"""
|
||||
target: test search with mix string and int expr
|
||||
method: create collection and insert data
|
||||
create index and collection load
|
||||
collection search uses mix expr
|
||||
expected: Search successfully
|
||||
"""
|
||||
# 1. initialize with data
|
||||
dim = 64
|
||||
auto_id = False
|
||||
enable_dynamic_field = False
|
||||
collection_w, _, _, insert_ids = \
|
||||
self.init_collection_general(prefix, True, auto_id=auto_id, dim=dim,
|
||||
enable_dynamic_field=enable_dynamic_field)[0:4]
|
||||
# 2. search
|
||||
log.info("test_search_string_mix_expr: searching collection %s" %
|
||||
collection_w.name)
|
||||
vectors = [[random.random() for _ in range(dim)]
|
||||
for _ in range(default_nq)]
|
||||
output_fields = [default_string_field_name, default_float_field_name]
|
||||
collection_w.search(vectors[:default_nq], default_search_field,
|
||||
default_search_params, default_limit,
|
||||
default_search_mix_exp,
|
||||
output_fields=output_fields,
|
||||
_async=_async,
|
||||
check_task=CheckTasks.check_search_results,
|
||||
check_items={"nq": default_nq,
|
||||
"ids": insert_ids,
|
||||
"limit": default_limit,
|
||||
"_async": _async})
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L2)
|
||||
def test_search_string_with_invalid_expr(self):
|
||||
"""
|
||||
target: test search data
|
||||
method: create collection and insert data
|
||||
create index and collection load
|
||||
collection search uses invalid string expr
|
||||
expected: Raise exception
|
||||
"""
|
||||
# 1. initialize with data
|
||||
auto_id = True
|
||||
collection_w, _, _, insert_ids = \
|
||||
self.init_collection_general(prefix, True, auto_id=auto_id, dim=default_dim)[0:4]
|
||||
# 2. search
|
||||
log.info("test_search_string_with_invalid_expr: searching collection %s" %
|
||||
collection_w.name)
|
||||
vectors = [[random.random() for _ in range(default_dim)] for _ in range(default_nq)]
|
||||
collection_w.search(vectors[:default_nq], default_search_field,
|
||||
default_search_params, default_limit,
|
||||
default_invaild_string_exp,
|
||||
check_task=CheckTasks.err_res,
|
||||
check_items={"err_code": 1100,
|
||||
"err_msg": "failed to create query plan: cannot "
|
||||
"parse expression: varchar >= 0"})
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L2)
|
||||
@pytest.mark.parametrize("expression", cf.gen_normal_string_expressions([ct.default_string_field_name]))
|
||||
def test_search_with_different_string_expr(self, expression, _async):
|
||||
"""
|
||||
target: test search with different string expressions
|
||||
method: test search with different string expressions
|
||||
expected: searched successfully with correct limit(topK)
|
||||
"""
|
||||
# 1. initialize with data
|
||||
dim = 64
|
||||
nb = 1000
|
||||
enable_dynamic_field = True
|
||||
collection_w, _vectors, _, insert_ids = \
|
||||
self.init_collection_general(prefix, True, nb, dim=dim,
|
||||
is_index=False, enable_dynamic_field=enable_dynamic_field)[0:4]
|
||||
|
||||
# filter result with expression in collection
|
||||
_vectors = _vectors[0]
|
||||
filter_ids = []
|
||||
expression = expression.replace("&&", "and").replace("||", "or")
|
||||
for i, _id in enumerate(insert_ids):
|
||||
if enable_dynamic_field:
|
||||
int64 = _vectors[i][ct.default_int64_field_name]
|
||||
varchar = _vectors[i][ct.default_string_field_name]
|
||||
else:
|
||||
int64 = _vectors.int64[i]
|
||||
varchar = _vectors.varchar[i]
|
||||
if not expression or eval(expression):
|
||||
filter_ids.append(_id)
|
||||
|
||||
# 2. create index
|
||||
index_param = {"index_type": "FLAT", "metric_type": "COSINE", "params": {"nlist": 100}}
|
||||
collection_w.create_index("float_vector", index_param)
|
||||
collection_w.load()
|
||||
|
||||
# 3. search with expression
|
||||
log.info("test_search_with_expression: searching with expression: %s" % expression)
|
||||
vectors = [[random.random() for _ in range(dim)] for _ in range(default_nq)]
|
||||
search_res, _ = collection_w.search(vectors[:default_nq], default_search_field,
|
||||
default_search_params, nb, expression,
|
||||
_async=_async,
|
||||
check_task=CheckTasks.check_search_results,
|
||||
check_items={"nq": default_nq,
|
||||
"ids": insert_ids,
|
||||
"limit": min(nb, len(filter_ids)),
|
||||
"_async": _async})
|
||||
if _async:
|
||||
search_res.done()
|
||||
search_res = search_res.result()
|
||||
|
||||
filter_ids_set = set(filter_ids)
|
||||
for hits in search_res:
|
||||
ids = hits.ids
|
||||
assert set(ids).issubset(filter_ids_set)
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L2)
|
||||
def test_search_string_field_is_primary_binary(self, _async):
|
||||
"""
|
||||
target: test search with string expr and string field is primary
|
||||
method: create collection and insert data
|
||||
create index and collection load
|
||||
collection search uses string expr in string field, string field is primary
|
||||
expected: Search successfully
|
||||
"""
|
||||
dim = 64
|
||||
# 1. initialize with binary data
|
||||
collection_w, _, binary_raw_vector, insert_ids = \
|
||||
self.init_collection_general(prefix, True, 2, is_binary=True, dim=dim,
|
||||
is_index=False, primary_field=ct.default_string_field_name)[0:4]
|
||||
# 2. create index
|
||||
default_index = {"index_type": "BIN_IVF_FLAT",
|
||||
"params": {"nlist": 128}, "metric_type": "JACCARD"}
|
||||
collection_w.create_index("binary_vector", default_index)
|
||||
collection_w.load()
|
||||
# 3. search
|
||||
binary_vectors = cf.gen_binary_vectors(3000, dim)[1]
|
||||
search_params = {"metric_type": "JACCARD", "params": {"nprobe": 10}}
|
||||
output_fields = [default_string_field_name]
|
||||
collection_w.search(binary_vectors[:default_nq], "binary_vector", search_params,
|
||||
default_limit, default_search_string_exp, output_fields=output_fields,
|
||||
_async=_async,
|
||||
check_task=CheckTasks.check_search_results,
|
||||
check_items={"nq": default_nq,
|
||||
"ids": insert_ids,
|
||||
"limit": 2,
|
||||
"_async": _async})
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L2)
|
||||
def test_search_string_field_binary(self, _async):
|
||||
"""
|
||||
target: test search with string expr and string field is not primary
|
||||
method: create a binary collection and insert data
|
||||
create index and collection load
|
||||
collection search uses string expr in string field, string field is not primary
|
||||
expected: Search successfully
|
||||
"""
|
||||
# 1. initialize with binary data
|
||||
dim = 128
|
||||
auto_id = True
|
||||
collection_w, _, binary_raw_vector, insert_ids = \
|
||||
self.init_collection_general(prefix, True, 2, is_binary=True, auto_id=auto_id,
|
||||
dim=dim, is_index=False)[0:4]
|
||||
# 2. create index
|
||||
default_index = {"index_type": "BIN_IVF_FLAT",
|
||||
"params": {"nlist": 128}, "metric_type": "JACCARD"}
|
||||
collection_w.create_index("binary_vector", default_index)
|
||||
collection_w.load()
|
||||
# 3. search
|
||||
binary_vectors = cf.gen_binary_vectors(3000, dim)[1]
|
||||
search_params = {"metric_type": "JACCARD", "params": {"nprobe": 10}}
|
||||
collection_w.search(binary_vectors[:default_nq], "binary_vector", search_params,
|
||||
default_limit, default_search_string_exp,
|
||||
_async=_async,
|
||||
check_task=CheckTasks.check_search_results,
|
||||
check_items={"nq": default_nq,
|
||||
"ids": insert_ids,
|
||||
"limit": 2,
|
||||
"_async": _async})
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L2)
|
||||
def test_search_mix_expr_with_binary(self, _async):
|
||||
"""
|
||||
target: test search with mix string and int expr
|
||||
method: create a binary collection and insert data
|
||||
create index and collection load
|
||||
collection search uses mix expr
|
||||
expected: Search successfully
|
||||
"""
|
||||
# 1. initialize with data
|
||||
dim = 128
|
||||
auto_id = True
|
||||
collection_w, _, _, insert_ids = \
|
||||
self.init_collection_general(
|
||||
prefix, True, auto_id=auto_id, dim=dim, is_binary=True, is_index=False)[0:4]
|
||||
# 2. create index
|
||||
default_index = {"index_type": "BIN_IVF_FLAT",
|
||||
"params": {"nlist": 128}, "metric_type": "JACCARD"}
|
||||
collection_w.create_index("binary_vector", default_index)
|
||||
collection_w.load()
|
||||
# 3. search
|
||||
log.info("test_search_mix_expr_with_binary: searching collection %s" %
|
||||
collection_w.name)
|
||||
binary_vectors = cf.gen_binary_vectors(3000, dim)[1]
|
||||
search_params = {"metric_type": "JACCARD", "params": {"nprobe": 10}}
|
||||
output_fields = [default_string_field_name, default_float_field_name]
|
||||
collection_w.search(binary_vectors[:default_nq], "binary_vector",
|
||||
search_params, default_limit,
|
||||
default_search_mix_exp,
|
||||
output_fields=output_fields,
|
||||
_async=_async,
|
||||
check_task=CheckTasks.check_search_results,
|
||||
check_items={"nq": default_nq,
|
||||
"ids": insert_ids,
|
||||
"limit": default_limit,
|
||||
"_async": _async})
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L2)
|
||||
def test_search_string_field_not_primary_prefix(self, _async):
|
||||
"""
|
||||
target: test search with string expr and string field is not primary
|
||||
method: create collection and insert data
|
||||
create index and collection load
|
||||
collection search uses string expr in string field, string field is not primary
|
||||
expected: Search successfully
|
||||
"""
|
||||
# 1. initialize with data
|
||||
auto_id = False
|
||||
collection_w, _, _, insert_ids = \
|
||||
self.init_collection_general(
|
||||
prefix, True, auto_id=auto_id, dim=default_dim, is_index=False)[0:4]
|
||||
index_param = {"index_type": "IVF_FLAT",
|
||||
"metric_type": "L2", "params": {"nlist": 100}}
|
||||
collection_w.create_index("float_vector", index_param, index_name="a")
|
||||
index_param_two = {}
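# empty index params, so the server-side default index type is built for the varchar field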
|
||||
collection_w.create_index("varchar", index_param_two, index_name="b")
|
||||
collection_w.load()
|
||||
# 2. search
|
||||
log.info("test_search_string_field_not_primary: searching collection %s" %
|
||||
collection_w.name)
|
||||
vectors = [[random.random() for _ in range(default_dim)]
|
||||
for _ in range(default_nq)]
|
||||
output_fields = [default_float_field_name, default_string_field_name]
|
||||
collection_w.search(vectors[:default_nq], default_search_field,
|
||||
# search all buckets
|
||||
{"metric_type": "L2", "params": {
|
||||
"nprobe": 100}}, default_limit,
|
||||
perfix_expr,
|
||||
output_fields=output_fields,
|
||||
_async=_async,
|
||||
check_task=CheckTasks.check_search_results,
|
||||
check_items={"nq": default_nq,
|
||||
"ids": insert_ids,
|
||||
"limit": 1,
|
||||
"_async": _async}
|
||||
)
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L2)
|
||||
def test_search_string_field_index(self, _async):
|
||||
"""
|
||||
target: test search with string expr and string field is not primary
|
||||
method: create collection and insert data
|
||||
create index and collection load
|
||||
collection search uses string expr in string field, string field is not primary
|
||||
expected: Search successfully
|
||||
"""
|
||||
# 1. initialize with data
|
||||
auto_id = True
|
||||
collection_w, _, _, insert_ids = \
|
||||
self.init_collection_general(
|
||||
prefix, True, auto_id=auto_id, dim=default_dim, is_index=False)[0:4]
|
||||
index_param = {"index_type": "IVF_FLAT",
|
||||
"metric_type": "L2", "params": {"nlist": 100}}
|
||||
collection_w.create_index("float_vector", index_param, index_name="a")
|
||||
index_param = {"index_type": "Trie", "params": {}}
|
||||
collection_w.create_index("varchar", index_param, index_name="b")
|
||||
collection_w.load()
|
||||
# 2. search
|
||||
log.info("test_search_string_field_not_primary: searching collection %s" %
|
||||
collection_w.name)
|
||||
vectors = [[random.random() for _ in range(default_dim)]
|
||||
for _ in range(default_nq)]
|
||||
output_fields = [default_float_field_name, default_string_field_name]
|
||||
collection_w.search(vectors[:default_nq], default_search_field,
|
||||
# search all buckets
|
||||
{"metric_type": "L2", "params": {
|
||||
"nprobe": 100}}, default_limit,
|
||||
perfix_expr,
|
||||
output_fields=output_fields,
|
||||
_async=_async,
|
||||
check_task=CheckTasks.check_search_results,
|
||||
check_items={"nq": default_nq,
|
||||
"ids": insert_ids,
|
||||
"limit": 1,
|
||||
"_async": _async}
|
||||
)
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L1)
|
||||
def test_search_all_index_with_compare_expr(self, _async):
|
||||
"""
|
||||
target: test search with a comparison expression after creating indexes
|
||||
method: 1. create collection, insert data, primary_field is string field
|
||||
2. create indexes on the float vector and string fields
|
||||
3. search with a comparison expression between scalar fields
|
||||
expected: string index is created and search succeeds with the default limit
|
||||
"""
|
||||
# create collection and insert data, primary field is string, no index yet
|
||||
collection_w, vectors, _, insert_ids = self.init_collection_general(prefix, insert_data=True,
|
||||
primary_field=ct.default_string_field_name,
|
||||
is_index=False)[0:4]
|
||||
|
||||
# create index
|
||||
index_params_one = {"index_type": "IVF_SQ8",
|
||||
"metric_type": "COSINE", "params": {"nlist": 64}}
|
||||
collection_w.create_index(
|
||||
ct.default_float_vec_field_name, index_params_one, index_name=index_name1)
|
||||
index_params_two = {}
|
||||
collection_w.create_index(
|
||||
ct.default_string_field_name, index_params=index_params_two, index_name=index_name2)
|
||||
assert collection_w.has_index(index_name=index_name2)
|
||||
|
||||
collection_w.release()
|
||||
collection_w.load()
|
||||
# build a comparison expression between scalar fields
|
||||
expr = 'float >= int64'
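# the filter compares two scalar fields of each entity rather than a field against a constant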
|
||||
# search with random query vectors
|
||||
vectors = [[random.random() for _ in range(default_dim)]
|
||||
for _ in range(default_nq)]
|
||||
output_fields = [default_int64_field_name,
|
||||
default_float_field_name, default_string_field_name]
|
||||
collection_w.search(vectors[:default_nq], default_search_field,
|
||||
default_search_params, default_limit,
|
||||
expr,
|
||||
output_fields=output_fields,
|
||||
_async=_async,
|
||||
check_task=CheckTasks.check_search_results,
|
||||
check_items={"nq": default_nq,
|
||||
"ids": insert_ids,
|
||||
"limit": default_limit,
|
||||
"_async": _async})
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L2)
|
||||
def test_search_string_field_is_primary_insert_empty(self, _async):
|
||||
"""
|
||||
target: test search with string expr and string field is primary
|
||||
method: create collection, string field is primary
|
||||
collection load and insert data
|
||||
collection search uses string expr in string field
|
||||
expected: Search successfully
|
||||
"""
|
||||
# 1. initialize with data
|
||||
collection_w, _, _, _ = \
|
||||
self.init_collection_general(
|
||||
prefix, False, primary_field=ct.default_string_field_name)[0:4]
|
||||
|
||||
nb = 3000
|
||||
data = cf.gen_default_list_data(nb)
|
||||
data[2] = ["" for _ in range(nb)]
|
||||
collection_w.insert(data=data)
|
||||
|
||||
collection_w.load()
|
||||
|
||||
search_string_exp = "varchar >= \"\""
|
||||
limit = 1
|
||||
|
||||
# 2. search
|
||||
log.info("test_search_string_field_is_primary_true: searching collection %s" %
|
||||
collection_w.name)
|
||||
vectors = [[random.random() for _ in range(default_dim)]
|
||||
for _ in range(default_nq)]
|
||||
output_fields = [default_string_field_name, default_float_field_name]
|
||||
collection_w.search(vectors[:default_nq], default_search_field,
|
||||
default_search_params, limit,
|
||||
search_string_exp,
|
||||
output_fields=output_fields,
|
||||
_async=_async,
|
||||
check_task=CheckTasks.check_search_results,
|
||||
check_items={"nq": default_nq,
|
||||
"limit": limit,
|
||||
"_async": _async})
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L2)
|
||||
def test_search_string_field_not_primary_is_empty(self, _async):
|
||||
"""
|
||||
target: test search with string expr and string field is not primary
|
||||
method: create collection and insert data
|
||||
create index and collection load
|
||||
collection search uses string expr in string field, string field is not primary
|
||||
expected: Search successfully
|
||||
"""
|
||||
# 1. initialize with data
|
||||
collection_w, _, _, _ = \
|
||||
self.init_collection_general(
|
||||
prefix, False, primary_field=ct.default_int64_field_name, is_index=False)[0:4]
|
||||
|
||||
nb = 3000
|
||||
data = cf.gen_default_list_data(nb)
|
||||
insert_ids = data[0]
|
||||
data[2] = ["" for _ in range(nb)]
|
||||
|
||||
collection_w.insert(data)
|
||||
assert collection_w.num_entities == nb
|
||||
|
||||
# 2. create index
|
||||
index_param = {"index_type": "IVF_FLAT",
|
||||
"metric_type": "COSINE", "params": {"nlist": 100}}
|
||||
collection_w.create_index("float_vector", index_param)
|
||||
collection_w.load()
|
||||
|
||||
search_string_exp = "varchar >= \"\""
|
||||
|
||||
# 3. search
|
||||
log.info("test_search_string_field_not_primary: searching collection %s" %
|
||||
collection_w.name)
|
||||
vectors = [[random.random() for _ in range(default_dim)]
|
||||
for _ in range(default_nq)]
|
||||
output_fields = [default_string_field_name, default_float_field_name]
|
||||
collection_w.search(vectors[:default_nq], default_search_field,
|
||||
default_search_params, default_limit,
|
||||
search_string_exp,
|
||||
output_fields=output_fields,
|
||||
_async=_async,
|
||||
check_task=CheckTasks.check_search_results,
|
||||
check_items={"nq": default_nq,
|
||||
"ids": insert_ids,
|
||||
"limit": default_limit,
|
||||
"_async": _async})
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L2)
|
||||
def test_search_string_different_language(self):
|
||||
"""
|
||||
target: test search with string expr using different language
|
||||
method: create collection and insert data
|
||||
create index and collection load
|
||||
collection search uses string expr in string field
|
||||
expected: Search successfully
|
||||
"""
|
||||
# 1. initialize with data
|
||||
_async = random.choice([True, False])
|
||||
auto_id = random.choice([True, False])
|
||||
enable_dynamic_field = random.choice([True, False])
|
||||
all_language = ["English", "French", "Spanish", "German", "Italian", "Portuguese", "Russian", "Chinese",
|
||||
"Japanese", "Arabic", "Hindi"]
|
||||
language = random.choice(all_language)
|
||||
log.info(f"_async: {_async}, auto_id: {auto_id}, enable_dynamic_field: {enable_dynamic_field},"
|
||||
f"language: {language}")
|
||||
collection_w, insert_data, _, insert_ids = \
|
||||
self.init_collection_general(prefix, True, auto_id=auto_id, nb=100,
|
||||
enable_dynamic_field=enable_dynamic_field, language=language)[0:4]
|
||||
search_str = insert_data[0][default_string_field_name][1] if not enable_dynamic_field \
|
||||
else insert_data[0][1][default_string_field_name]
|
||||
search_exp = f"{default_string_field_name} == '{search_str}'"
|
||||
# 2. search
|
||||
log.info("test_search_string_field_not_primary: searching collection %s" % collection_w.name)
|
||||
log.info("search expr: %s" % search_exp)
|
||||
output_fields = [default_string_field_name, default_float_field_name]
|
||||
res, _ = collection_w.search(vectors[:default_nq], default_search_field,
|
||||
default_search_params, default_limit, search_exp,
|
||||
output_fields=output_fields,
|
||||
_async=_async,
|
||||
check_task=CheckTasks.check_search_results,
|
||||
check_items={"nq": default_nq,
|
||||
"ids": insert_ids,
|
||||
"limit": 1,
|
||||
"_async": _async})
|
||||
if _async:
|
||||
res.done()
|
||||
res = res.result()
|
||||
assert res[0][0].entity.varchar == search_str
|
|
@ -0,0 +1,409 @@
|
|||
import numpy as np
|
||||
from pymilvus.orm.types import CONSISTENCY_STRONG, CONSISTENCY_BOUNDED, CONSISTENCY_SESSION, CONSISTENCY_EVENTUALLY
|
||||
from pymilvus import AnnSearchRequest, RRFRanker, WeightedRanker
|
||||
from pymilvus import (
|
||||
FieldSchema, CollectionSchema, DataType,
|
||||
Collection
|
||||
)
|
||||
from common.constants import *
|
||||
from utils.util_pymilvus import *
|
||||
from common.common_type import CaseLabel, CheckTasks
|
||||
from common import common_type as ct
|
||||
from common import common_func as cf
|
||||
from utils.util_log import test_log as log
|
||||
from base.client_base import TestcaseBase
|
||||
import heapq
|
||||
from time import sleep
|
||||
from decimal import Decimal, getcontext
|
||||
import decimal
|
||||
import multiprocessing
|
||||
import numbers
|
||||
import random
|
||||
import math
|
||||
import numpy
|
||||
import threading
|
||||
import pytest
|
||||
import pandas as pd
|
||||
from faker import Faker
|
||||
|
||||
Faker.seed(19530)
|
||||
fake_en = Faker("en_US")
|
||||
fake_zh = Faker("zh_CN")
|
||||
|
||||
# patch faker to generate text with specific distribution
|
||||
cf.patch_faker_text(fake_en, cf.en_vocabularies_distribution)
|
||||
cf.patch_faker_text(fake_zh, cf.zh_vocabularies_distribution)
|
||||
|
||||
pd.set_option("expand_frame_repr", False)
|
||||
|
||||
prefix = "search_collection"
|
||||
search_num = 10
|
||||
max_dim = ct.max_dim
|
||||
min_dim = ct.min_dim
|
||||
epsilon = ct.epsilon
|
||||
hybrid_search_epsilon = 0.01
|
||||
gracefulTime = ct.gracefulTime
|
||||
default_nb = ct.default_nb
|
||||
default_nb_medium = ct.default_nb_medium
|
||||
default_nq = ct.default_nq
|
||||
default_dim = ct.default_dim
|
||||
default_limit = ct.default_limit
|
||||
max_limit = ct.max_limit
|
||||
default_search_exp = "int64 >= 0"
|
||||
default_search_string_exp = "varchar >= \"0\""
|
||||
default_search_mix_exp = "int64 >= 0 && varchar >= \"0\""
|
||||
default_invaild_string_exp = "varchar >= 0"
|
||||
default_json_search_exp = "json_field[\"number\"] >= 0"
|
||||
perfix_expr = 'varchar like "0%"'
|
||||
default_search_field = ct.default_float_vec_field_name
|
||||
default_search_params = ct.default_search_params
|
||||
default_int64_field_name = ct.default_int64_field_name
|
||||
default_float_field_name = ct.default_float_field_name
|
||||
default_bool_field_name = ct.default_bool_field_name
|
||||
default_string_field_name = ct.default_string_field_name
|
||||
default_json_field_name = ct.default_json_field_name
|
||||
default_index_params = ct.default_index
|
||||
vectors = [[random.random() for _ in range(default_dim)] for _ in range(default_nq)]
|
||||
range_search_supported_indexes = ct.all_index_types[:7]
|
||||
uid = "test_search"
|
||||
nq = 1
|
||||
epsilon = 0.001
|
||||
field_name = default_float_vec_field_name
|
||||
binary_field_name = default_binary_vec_field_name
|
||||
search_param = {"nprobe": 1}
|
||||
entity = gen_entities(1, is_normal=True)
|
||||
entities = gen_entities(default_nb, is_normal=True)
|
||||
raw_vectors, binary_entities = gen_binary_entities(default_nb)
|
||||
default_query, _ = gen_search_vectors_params(field_name, entities, default_top_k, nq)
|
||||
index_name1 = cf.gen_unique_str("float")
|
||||
index_name2 = cf.gen_unique_str("varhar")
|
||||
half_nb = ct.default_nb // 2
|
||||
max_hybrid_search_req_num = ct.max_hybrid_search_req_num
|
||||
|
||||
|
||||
class TestSearchWithTextMatchFilter(TestcaseBase):
|
||||
"""
|
||||
******************************************************************
|
||||
The following cases are used to test query text match
|
||||
******************************************************************
|
||||
"""
|
||||
@pytest.mark.tags(CaseLabel.L0)
|
||||
@pytest.mark.parametrize("enable_partition_key", [True, False])
|
||||
@pytest.mark.parametrize("enable_inverted_index", [True, False])
|
||||
@pytest.mark.parametrize("tokenizer", ["standard"])
|
||||
def test_search_with_text_match_filter_normal_en(
|
||||
self, tokenizer, enable_inverted_index, enable_partition_key
|
||||
):
|
||||
"""
|
||||
target: test text match normal
|
||||
method: 1. enable text match and insert data with varchar
|
||||
2. get the most common words and query with text match
|
||||
3. verify the result
|
||||
expected: text match successfully and result is correct
|
||||
"""
|
||||
analyzer_params = {
|
||||
"tokenizer": tokenizer,
|
||||
}
|
||||
dim = 128
|
||||
fields = [
|
||||
FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
|
||||
FieldSchema(
|
||||
name="word",
|
||||
dtype=DataType.VARCHAR,
|
||||
max_length=65535,
|
||||
enable_analyzer=True,
|
||||
enable_match=True,
|
||||
is_partition_key=enable_partition_key,
|
||||
analyzer_params=analyzer_params,
|
||||
),
|
||||
FieldSchema(
|
||||
name="sentence",
|
||||
dtype=DataType.VARCHAR,
|
||||
max_length=65535,
|
||||
enable_analyzer=True,
|
||||
enable_match=True,
|
||||
analyzer_params=analyzer_params,
|
||||
),
|
||||
FieldSchema(
|
||||
name="paragraph",
|
||||
dtype=DataType.VARCHAR,
|
||||
max_length=65535,
|
||||
enable_analyzer=True,
|
||||
enable_match=True,
|
||||
analyzer_params=analyzer_params,
|
||||
),
|
||||
FieldSchema(
|
||||
name="text",
|
||||
dtype=DataType.VARCHAR,
|
||||
max_length=65535,
|
||||
enable_analyzer=True,
|
||||
enable_match=True,
|
||||
analyzer_params=analyzer_params,
|
||||
),
|
||||
FieldSchema(name="float32_emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
|
||||
FieldSchema(name="sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
|
||||
]
|
||||
schema = CollectionSchema(fields=fields, description="test collection")
|
||||
data_size = 5000
|
||||
collection_w = self.init_collection_wrap(
|
||||
name=cf.gen_unique_str(prefix), schema=schema
|
||||
)
|
||||
log.info(f"collection {collection_w.describe()}")
|
||||
fake = fake_en
|
||||
if tokenizer == "jieba":
|
||||
language = "zh"
|
||||
fake = fake_zh
|
||||
else:
|
||||
language = "en"
|
||||
|
||||
data = [
|
||||
{
|
||||
"id": i,
|
||||
"word": fake.word().lower(),
|
||||
"sentence": fake.sentence().lower(),
|
||||
"paragraph": fake.paragraph().lower(),
|
||||
"text": fake.text().lower(),
|
||||
"float32_emb": [random.random() for _ in range(dim)],
|
||||
"sparse_emb": cf.gen_sparse_vectors(1, dim=10000)[0],
|
||||
}
|
||||
for i in range(data_size)
|
||||
]
|
||||
df = pd.DataFrame(data)
|
||||
log.info(f"dataframe\n{df}")
|
||||
batch_size = 5000
|
||||
for i in range(0, len(df), batch_size):
|
||||
collection_w.insert(
|
||||
data[i: i + batch_size]
|
||||
if i + batch_size < len(df)
|
||||
else data[i: len(df)]
|
||||
)
|
||||
collection_w.flush()
|
||||
collection_w.create_index(
|
||||
"float32_emb",
|
||||
{"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
|
||||
)
|
||||
collection_w.create_index(
|
||||
"sparse_emb",
|
||||
{"index_type": "SPARSE_INVERTED_INDEX", "metric_type": "IP"},
|
||||
)
|
||||
if enable_inverted_index:
|
||||
collection_w.create_index("word", {"index_type": "INVERTED"})
|
||||
collection_w.load()
|
||||
# analyze the corpus
|
||||
text_fields = ["word", "sentence", "paragraph", "text"]
|
||||
wf_map = {}
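# word-frequency stats per text field; the most common tokens drive the text_match filters below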
|
||||
for field in text_fields:
|
||||
wf_map[field] = cf.analyze_documents(df[field].tolist(), language=language)
|
||||
# search with filter single field for one token
|
||||
df_split = cf.split_dataframes(df, text_fields, language=language)
|
||||
log.info(f"df_split\n{df_split}")
|
||||
for ann_field in ["float32_emb", "sparse_emb"]:
|
||||
log.info(f"ann_field {ann_field}")
|
||||
if ann_field == "float32_emb":
|
||||
search_data = [[random.random() for _ in range(dim)]]
|
||||
elif ann_field == "sparse_emb":
|
||||
search_data = cf.gen_sparse_vectors(1, dim=10000)
|
||||
else:
|
||||
search_data = [[random.random() for _ in range(dim)]]
|
||||
for field in text_fields:
|
||||
token = wf_map[field].most_common()[0][0]
|
||||
expr = f"text_match({field}, '{token}')"
|
||||
manual_result = df_split[
|
||||
df_split.apply(lambda row: token in row[field], axis=1)
|
||||
]
|
||||
log.info(f"expr: {expr}, manual_check_result: {len(manual_result)}")
|
||||
res_list, _ = collection_w.search(
|
||||
data=search_data,
|
||||
anns_field=ann_field,
|
||||
param={},
|
||||
limit=100,
|
||||
expr=expr, output_fields=["id", field])
|
||||
for res in res_list:
|
||||
log.info(f"res len {len(res)} res {res}")
|
||||
assert len(res) > 0
|
||||
for r in res:
|
||||
r = r.to_dict()
|
||||
assert token in r["entity"][field]
|
||||
|
||||
# search with filter single field for multi-token
|
||||
for field in text_fields:
|
||||
# match top 10 most common words
|
||||
top_10_tokens = []
|
||||
for word, count in wf_map[field].most_common(10):
|
||||
top_10_tokens.append(word)
|
||||
string_of_top_10_words = " ".join(top_10_tokens)
|
||||
expr = f"text_match({field}, '{string_of_top_10_words}')"
|
||||
log.info(f"expr {expr}")
|
||||
res_list, _ = collection_w.search(
|
||||
data=search_data,
|
||||
anns_field=ann_field,
|
||||
param={},
|
||||
limit=100,
|
||||
expr=expr, output_fields=["id", field])
|
||||
for res in res_list:
|
||||
log.info(f"res len {len(res)} res {res}")
|
||||
assert len(res) > 0
|
||||
for r in res:
|
||||
r = r.to_dict()
|
||||
assert any([token in r["entity"][field] for token in top_10_tokens])
|
||||
|
||||
    @pytest.mark.tags(CaseLabel.L0)
    @pytest.mark.parametrize("enable_partition_key", [True, False])
    @pytest.mark.parametrize("enable_inverted_index", [True, False])
    @pytest.mark.parametrize("tokenizer", ["jieba"])
    @pytest.mark.xfail(reason="unstable case")
    def test_search_with_text_match_filter_normal_zh(
        self, tokenizer, enable_inverted_index, enable_partition_key
    ):
        """
        target: test text match normal
        method: 1. enable text match and insert data with varchar
                2. get the most common words and query with text match
                3. verify the result
        expected: text match successfully and result is correct
        """
        analyzer_params = {
            "tokenizer": tokenizer,
        }
        dim = 128
        fields = [
            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
            FieldSchema(
                name="word",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                enable_match=True,
                is_partition_key=enable_partition_key,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(
                name="sentence",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                enable_match=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(
                name="paragraph",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                enable_match=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(
                name="text",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                enable_match=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(name="float32_emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
            FieldSchema(name="sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
        ]
        schema = CollectionSchema(fields=fields, description="test collection")
        data_size = 5000
        collection_w = self.init_collection_wrap(
            name=cf.gen_unique_str(prefix), schema=schema
        )
        log.info(f"collection {collection_w.describe()}")
        fake = fake_en
        if tokenizer == "jieba":
            language = "zh"
            fake = fake_zh
        else:
            language = "en"

        data = [
            {
                "id": i,
                "word": fake.word().lower(),
                "sentence": fake.sentence().lower(),
                "paragraph": fake.paragraph().lower(),
                "text": fake.text().lower(),
                "float32_emb": [random.random() for _ in range(dim)],
                "sparse_emb": cf.gen_sparse_vectors(1, dim=10000)[0],
            }
            for i in range(data_size)
        ]
        df = pd.DataFrame(data)
        log.info(f"dataframe\n{df}")
        batch_size = 5000
        for i in range(0, len(df), batch_size):
            collection_w.insert(
                data[i : i + batch_size]
                if i + batch_size < len(df)
                else data[i : len(df)]
            )
            collection_w.flush()
        collection_w.create_index(
            "float32_emb",
            {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
        )
        collection_w.create_index(
            "sparse_emb",
            {"index_type": "SPARSE_INVERTED_INDEX", "metric_type": "IP"},
        )
        if enable_inverted_index:
            collection_w.create_index("word", {"index_type": "INVERTED"})
        collection_w.load()
        # analyze the corpus
        text_fields = ["word", "sentence", "paragraph", "text"]
        wf_map = {}
        for field in text_fields:
            wf_map[field] = cf.analyze_documents(df[field].tolist(), language=language)
        # search with filter single field for one token
        df_split = cf.split_dataframes(df, text_fields, language=language)
        log.info(f"df_split\n{df_split}")
        for ann_field in ["float32_emb", "sparse_emb"]:
            log.info(f"ann_field {ann_field}")
            if ann_field == "float32_emb":
                search_data = [[random.random() for _ in range(dim)]]
            elif ann_field == "sparse_emb":
                search_data = cf.gen_sparse_vectors(1, dim=10000)
            else:
                search_data = [[random.random() for _ in range(dim)]]
            for field in text_fields:
                token = wf_map[field].most_common()[0][0]
                expr = f"text_match({field}, '{token}')"
                manual_result = df_split[
                    df_split.apply(lambda row: token in row[field], axis=1)
                ]
                log.info(f"expr: {expr}, manual_check_result: {len(manual_result)}")
                res_list, _ = collection_w.search(
                    data=search_data,
                    anns_field=ann_field,
                    param={},
                    limit=100,
                    expr=expr, output_fields=["id", field])
                for res in res_list:
                    log.info(f"res len {len(res)} res {res}")
                    assert len(res) > 0
                    for r in res:
                        r = r.to_dict()
                        assert token in r["entity"][field]

            # search with filter single field for multi-token
            for field in text_fields:
                # match top 10 most common words
                top_10_tokens = []
                for word, count in wf_map[field].most_common(10):
                    top_10_tokens.append(word)
                string_of_top_10_words = " ".join(top_10_tokens)
                expr = f"text_match({field}, '{string_of_top_10_words}')"
                log.info(f"expr {expr}")
                res_list, _ = collection_w.search(
                    data=search_data,
                    anns_field=ann_field,
                    param={},
                    limit=100,
                    expr=expr, output_fields=["id", field])
                for res in res_list:
                    log.info(f"res len {len(res)} res {res}")
                    assert len(res) > 0
                    for r in res:
                        r = r.to_dict()
                        assert any([token in r["entity"][field] for token in top_10_tokens])
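For reference, the token selection both of these cases rely on is just a word-frequency lookup; a minimal sketch, assuming cf.analyze_documents returns a Counter-like word-frequency map as the checks above imply:

from collections import Counter

# Minimal sketch: pick the most common token and build the text_match filter,
# mirroring how the tests derive `token` and `expr` from wf_map.
wf = Counter("the quick brown fox jumps over the lazy dog the end".split())
token = wf.most_common()[0][0]        # -> "the"
expr = f"text_match(sentence, '{token}')"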
@ -0,0 +1,296 @@
import numpy as np
from pymilvus.orm.types import CONSISTENCY_STRONG, CONSISTENCY_BOUNDED, CONSISTENCY_SESSION, CONSISTENCY_EVENTUALLY
from pymilvus import AnnSearchRequest, RRFRanker, WeightedRanker
from pymilvus import (
    FieldSchema, CollectionSchema, DataType,
    Collection
)
from common.constants import *
from utils.util_pymilvus import *
from common.common_type import CaseLabel, CheckTasks
from common import common_type as ct
from common import common_func as cf
from utils.util_log import test_log as log
from base.client_base import TestcaseBase
import heapq
from time import sleep
from decimal import Decimal, getcontext
import decimal
import multiprocessing
import numbers
import random
import math
import numpy
import threading
import pytest
import pandas as pd
from faker import Faker

Faker.seed(19530)
fake_en = Faker("en_US")
fake_zh = Faker("zh_CN")

# patch faker to generate text with specific distribution
cf.patch_faker_text(fake_en, cf.en_vocabularies_distribution)
cf.patch_faker_text(fake_zh, cf.zh_vocabularies_distribution)

pd.set_option("expand_frame_repr", False)

prefix = "search_collection"
search_num = 10
max_dim = ct.max_dim
min_dim = ct.min_dim
epsilon = ct.epsilon
hybrid_search_epsilon = 0.01
gracefulTime = ct.gracefulTime
default_nb = ct.default_nb
default_nb_medium = ct.default_nb_medium
default_nq = ct.default_nq
default_dim = ct.default_dim
default_limit = ct.default_limit
max_limit = ct.max_limit
default_search_exp = "int64 >= 0"
default_search_string_exp = "varchar >= \"0\""
default_search_mix_exp = "int64 >= 0 && varchar >= \"0\""
default_invaild_string_exp = "varchar >= 0"
default_json_search_exp = "json_field[\"number\"] >= 0"
perfix_expr = 'varchar like "0%"'
default_search_field = ct.default_float_vec_field_name
default_search_params = ct.default_search_params
default_int64_field_name = ct.default_int64_field_name
default_float_field_name = ct.default_float_field_name
default_bool_field_name = ct.default_bool_field_name
default_string_field_name = ct.default_string_field_name
default_json_field_name = ct.default_json_field_name
default_index_params = ct.default_index
vectors = [[random.random() for _ in range(default_dim)] for _ in range(default_nq)]
range_search_supported_indexes = ct.all_index_types[:7]
uid = "test_search"
nq = 1
epsilon = 0.001
field_name = default_float_vec_field_name
binary_field_name = default_binary_vec_field_name
search_param = {"nprobe": 1}
entity = gen_entities(1, is_normal=True)
entities = gen_entities(default_nb, is_normal=True)
raw_vectors, binary_entities = gen_binary_entities(default_nb)
default_query, _ = gen_search_vectors_params(field_name, entities, default_top_k, nq)
index_name1 = cf.gen_unique_str("float")
index_name2 = cf.gen_unique_str("varhar")
half_nb = ct.default_nb // 2
max_hybrid_search_req_num = ct.max_hybrid_search_req_num

class TestSparseSearch(TestcaseBase):
    """ Add some test cases for the sparse vector """

    @pytest.mark.tags(CaseLabel.L1)
    @pytest.mark.parametrize("index", ct.all_index_types[9:11])
    @pytest.mark.parametrize("inverted_index_algo", ct.inverted_index_algo)
    def test_sparse_index_search(self, index, inverted_index_algo):
        """
        target: verify that sparse index for sparse vectors can be searched properly
        method: create connection, collection, insert and search
        expected: search successfully
        """
        self._connect()
        c_name = cf.gen_unique_str(prefix)
        schema = cf.gen_default_sparse_schema(auto_id=False)
        collection_w = self.init_collection_wrap(c_name, schema=schema)
        data = cf.gen_default_list_sparse_data(nb=3000)
        collection_w.insert(data)
        params = cf.get_index_params_params(index)
        params.update({"inverted_index_algo": inverted_index_algo})
        index_params = {"index_type": index, "metric_type": "IP", "params": params}
        collection_w.create_index(ct.default_sparse_vec_field_name, index_params, index_name=index)
        collection_w.load()

        _params = cf.get_search_params_params(index)
        _params.update({"dim_max_score_ratio": 1.05})
        search_params = {"params": _params}
        collection_w.search(data[-1][0:default_nq], ct.default_sparse_vec_field_name,
                            search_params, default_limit,
                            output_fields=[ct.default_sparse_vec_field_name],
                            check_task=CheckTasks.check_search_results,
                            check_items={"nq": default_nq,
                                         "limit": default_limit,
                                         "original_entities": [data],
                                         "output_fields": [ct.default_sparse_vec_field_name]})
        expr = "int64 < 100 "
        collection_w.search(data[-1][0:default_nq], ct.default_sparse_vec_field_name,
                            search_params, default_limit,
                            expr=expr, output_fields=[ct.default_sparse_vec_field_name],
                            check_task=CheckTasks.check_search_results,
                            check_items={"nq": default_nq,
                                         "limit": default_limit,
                                         "original_entities": [data],
                                         "output_fields": [ct.default_sparse_vec_field_name]})

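For orientation, the rows produced by cf.gen_default_list_sparse_data for the SPARSE_FLOAT_VECTOR field are assumed to be sparse vectors in the form pymilvus accepts, i.e. a mapping from dimension index to value; a minimal sketch of one such row:

# A sparse vector stores only its non-zero dimensions as {dimension_index: value}.
sparse_row = {17: 0.31, 4096: 0.87, 9999: 0.05}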
    @pytest.mark.tags(CaseLabel.L2)
    @pytest.mark.parametrize("index", ct.all_index_types[9:11])
    @pytest.mark.parametrize("dim", [32768, ct.max_sparse_vector_dim])
    def test_sparse_index_dim(self, index, dim):
        """
        target: validating the sparse index in different dimensions
        method: create connection, collection, insert and search
        expected: search successfully
        """
        self._connect()
        c_name = cf.gen_unique_str(prefix)
        schema = cf.gen_default_sparse_schema(auto_id=False)
        collection_w = self.init_collection_wrap(c_name, schema=schema)
        data = cf.gen_default_list_sparse_data(dim=dim)
        collection_w.insert(data)
        params = cf.get_index_params_params(index)
        index_params = {"index_type": index, "metric_type": "IP", "params": params}
        collection_w.create_index(ct.default_sparse_vec_field_name, index_params, index_name=index)

        collection_w.load()
        collection_w.search(data[-1][0:default_nq], ct.default_sparse_vec_field_name,
                            ct.default_sparse_search_params, limit=1,
                            check_task=CheckTasks.check_search_results,
                            check_items={"nq": default_nq,
                                         "limit": 1})

    @pytest.mark.tags(CaseLabel.L2)
    @pytest.mark.parametrize("index", ct.all_index_types[9:11])
    @pytest.mark.parametrize("inverted_index_algo", ct.inverted_index_algo)
    def test_sparse_index_enable_mmap_search(self, index, inverted_index_algo):
        """
        target: verify that the sparse indexes of sparse vectors can be searched properly after turning on mmap
        method: create connection, collection, enable mmap, insert and search
        expected: search successfully, query result is correct
        """
        self._connect()
        c_name = cf.gen_unique_str(prefix)
        schema = cf.gen_default_sparse_schema(auto_id=False)
        collection_w = self.init_collection_wrap(c_name, schema=schema)

        first_nb = 3000
        data = cf.gen_default_list_sparse_data(nb=first_nb, start=0)
        collection_w.insert(data)

        params = cf.get_index_params_params(index)
        params.update({"inverted_index_algo": inverted_index_algo})
        index_params = {"index_type": index, "metric_type": "IP", "params": params}
        collection_w.create_index(ct.default_sparse_vec_field_name, index_params, index_name=index)

        collection_w.set_properties({'mmap.enabled': True})
        pro = collection_w.describe()[0].get("properties")
        assert pro["mmap.enabled"] == 'True'
        collection_w.alter_index(index, {'mmap.enabled': True})
        assert collection_w.index()[0].params["mmap.enabled"] == 'True'
        data2 = cf.gen_default_list_sparse_data(nb=2000, start=first_nb)  # ids shall be continuous
        all_data = []  # combine the two inserted batches for the check below
        for i in range(len(data2)):
            all_data.append(data[i] + data2[i])
        collection_w.insert(data2)
        collection_w.flush()
        collection_w.load()
        collection_w.search(data[-1][0:default_nq], ct.default_sparse_vec_field_name,
                            ct.default_sparse_search_params, default_limit,
                            output_fields=[ct.default_sparse_vec_field_name],
                            check_task=CheckTasks.check_search_results,
                            check_items={"nq": default_nq,
                                         "limit": default_limit,
                                         "original_entities": [all_data],
                                         "output_fields": [ct.default_sparse_vec_field_name]})
        expr_id_list = [0, 1, 10, 100]
        term_expr = f'{ct.default_int64_field_name} in {expr_id_list}'
        res = collection_w.query(term_expr)[0]
        assert len(res) == 4

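Outside the test wrappers, the two mmap switches this case exercises look roughly like the following with the plain pymilvus ORM; this is a sketch only, and the collection name "demo_sparse" and index name "sparse_idx" are made up:

from pymilvus import Collection, connections

connections.connect(host="127.0.0.1", port="19530")
coll = Collection("demo_sparse")                        # hypothetical collection name
coll.release()                                          # mmap settings are changed while the collection is released
coll.set_properties({"mmap.enabled": True})             # collection-level mmap
coll.alter_index("sparse_idx", {"mmap.enabled": True})  # index-level mmap, hypothetical index name
coll.load()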
    @pytest.mark.tags(CaseLabel.L1)
    @pytest.mark.parametrize("drop_ratio_build", [0.01])
    @pytest.mark.parametrize("index", ct.all_index_types[9:11])
    def test_search_sparse_ratio(self, drop_ratio_build, index):
        """
        target: verify a sparse index built with drop_ratio_build can be searched with different ratio parameters
        method: create a sparse index with drop_ratio_build, then search with valid and invalid dim_max_score_ratio
        expected: valid ratios search successfully, invalid ratios raise an error
        """
        self._connect()
        c_name = cf.gen_unique_str(prefix)
        schema = cf.gen_default_sparse_schema(auto_id=False)
        collection_w = self.init_collection_wrap(c_name, schema=schema)
        data = cf.gen_default_list_sparse_data(nb=4000)
        collection_w.insert(data)
        collection_w.flush()
        params = {"index_type": index, "metric_type": "IP", "params": {"drop_ratio_build": drop_ratio_build}}
        collection_w.create_index(ct.default_sparse_vec_field_name, params, index_name=index)
        collection_w.load()
        assert collection_w.has_index(index_name=index)[0] is True
        _params = {"drop_ratio_search": 0.2}
        for dim_max_score_ratio in [0.5, 0.99, 1, 1.3]:
            _params.update({"dim_max_score_ratio": dim_max_score_ratio})
            search_params = {"metric_type": "IP", "params": _params}
            collection_w.search(data[-1][0:default_nq], ct.default_sparse_vec_field_name,
                                search_params, default_limit,
                                check_task=CheckTasks.check_search_results,
                                check_items={"nq": default_nq,
                                             "limit": default_limit})
        error = {ct.err_code: 999,
                 ct.err_msg: "should be in range [0.500000, 1.300000]"}
        for invalid_ratio in [0.49, 1.4]:
            _params.update({"dim_max_score_ratio": invalid_ratio})
            search_params = {"metric_type": "IP", "params": _params}
            collection_w.search(data[-1][0:default_nq], ct.default_sparse_vec_field_name,
                                search_params, default_limit,
                                check_task=CheckTasks.err_res,
                                check_items=error)

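For quick reference, the knobs this case sweeps split into build-time and search-time parameters; a sketch of the two dictionaries as plain data, with values taken from the case above:

# drop_ratio_build is applied when the sparse index is built; drop_ratio_search
# and dim_max_score_ratio are applied per search. Per the error checked above,
# dim_max_score_ratio must stay within [0.5, 1.3].
index_params = {
    "index_type": "SPARSE_INVERTED_INDEX",
    "metric_type": "IP",
    "params": {"drop_ratio_build": 0.01},
}
search_params = {
    "metric_type": "IP",
    "params": {"drop_ratio_search": 0.2, "dim_max_score_ratio": 1.05},
}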
    @pytest.mark.tags(CaseLabel.L2)
    @pytest.mark.parametrize("index", ct.all_index_types[9:11])
    def test_sparse_vector_search_output_field(self, index):
        """
        target: verify a sparse vector field can be returned via output_fields
        method: create collection with sparse vectors, build index, search with output_fields
        expected: search succeeds and the requested fields are returned
        """
        self._connect()
        c_name = cf.gen_unique_str(prefix)
        schema = cf.gen_default_sparse_schema()
        collection_w = self.init_collection_wrap(c_name, schema=schema)
        data = cf.gen_default_list_sparse_data(nb=4000)
        collection_w.insert(data)
        params = cf.get_index_params_params(index)
        index_params = {"index_type": index, "metric_type": "IP", "params": params}
        collection_w.create_index(ct.default_sparse_vec_field_name, index_params, index_name=index)

        collection_w.load()
        d = cf.gen_default_list_sparse_data(nb=10)
        collection_w.search(d[-1][0:default_nq], ct.default_sparse_vec_field_name,
                            ct.default_sparse_search_params, default_limit,
                            output_fields=["float", "sparse_vector"],
                            check_task=CheckTasks.check_search_results,
                            check_items={"nq": default_nq,
                                         "limit": default_limit,
                                         "output_fields": ["float", "sparse_vector"]
                                         })

    @pytest.mark.tags(CaseLabel.L2)
    @pytest.mark.parametrize("index", ct.all_index_types[9:11])
    @pytest.mark.parametrize("inverted_index_algo", ct.inverted_index_algo)
    def test_sparse_vector_search_iterator(self, index, inverted_index_algo):
        """
        target: verify search iterator works on sparse vectors
        method: create collection with sparse vectors, build index, then search with an iterator
        expected: iterator returns results batch by batch
        """
        self._connect()
        c_name = cf.gen_unique_str(prefix)
        schema = cf.gen_default_sparse_schema()
        collection_w = self.init_collection_wrap(c_name, schema=schema)
        data = cf.gen_default_list_sparse_data(nb=4000)
        collection_w.insert(data)
        params = cf.get_index_params_params(index)
        params.update({"inverted_index_algo": inverted_index_algo})
        index_params = {"index_type": index, "metric_type": "IP", "params": params}
        collection_w.create_index(ct.default_sparse_vec_field_name, index_params, index_name=index)

        collection_w.load()
        batch_size = 100
        collection_w.search_iterator(data[-1][0:1], ct.default_sparse_vec_field_name,
                                     ct.default_sparse_search_params, limit=500, batch_size=batch_size,
                                     check_task=CheckTasks.check_search_iterator,
                                     check_items={"batch_size": batch_size})
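Outside the check-task plumbing, the iterator returned above is typically drained page by page; a sketch assuming the wrapper's first return value is a pymilvus SearchIterator:

iterator, _ = collection_w.search_iterator(data[-1][0:1], ct.default_sparse_vec_field_name,
                                           ct.default_sparse_search_params, limit=500,
                                           batch_size=100)
while True:
    page = iterator.next()
    if not page:              # an empty page signals the iterator is exhausted
        iterator.close()
        break
    for hit in page:
        log.info(f"id={hit.id}, distance={hit.distance}")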
@ -1,5 +1,5 @@
[pytest]
addopts = --strict --endpoint http://127.0.0.1:19530 --token root:Milvus --minio_host 127.0.0.1
addopts = --strict --endpoint http://10.104.19.195:19530 --token root:Milvus --minio_host 10.104.32.27

log_format = [%(asctime)s - %(levelname)s - %(name)s]: %(message)s (%(filename)s:%(lineno)s)
log_date_format = %Y-%m-%d %H:%M:%S