test: Split test_search and refactor on test class to share collections (#40677)

issue: #40698

---------

Signed-off-by: yanliang567 <yanliang.qiao@zilliz.com>
pull/40766/head
yanliang567 2025-03-19 14:20:15 +08:00 committed by GitHub
parent b119ac5d30
commit cf223bae7b
23 changed files with 16299 additions and 13802 deletions
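The two diffs suppressed further down presumably hold the bulk of the split test_search files, so the shared-collection mechanism itself is not visible in this view. As a rough, non-authoritative sketch of what "refactor on test class to share collections" can look like with pytest and the pymilvus MilvusClient API: one collection is built per test class and reused by every case in it, then dropped once at the end. Every name below (fixture, collection name, URI, data shape) is an illustrative assumption, not taken from this commit.

import random

import pytest
from pymilvus import MilvusClient

SHARED_COLLECTION = "shared_search_collection"  # hypothetical name
DIM = 128


@pytest.fixture(scope="class")
def shared_client():
    # Build the collection once per test class instead of once per test.
    client = MilvusClient(uri="http://localhost:19530")  # assumes a local Milvus
    if client.has_collection(SHARED_COLLECTION):
        client.drop_collection(SHARED_COLLECTION)
    client.create_collection(SHARED_COLLECTION, dimension=DIM)
    rows = [{"id": i, "vector": [random.random() for _ in range(DIM)]} for i in range(100)]
    client.insert(SHARED_COLLECTION, rows)
    yield client
    # Drop once, after every test in the class has run.
    client.drop_collection(SHARED_COLLECTION)


class TestSearchOnSharedCollection:
    def test_search_top_k(self, shared_client):
        res = shared_client.search(SHARED_COLLECTION, data=[[random.random() for _ in range(DIM)]], limit=5)
        assert len(res) == 1 and len(res[0]) <= 5

    def test_search_with_filter(self, shared_client):
        res = shared_client.search(SHARED_COLLECTION, data=[[random.random() for _ in range(DIM)]],
                                   limit=5, filter="id >= 0")
        assert len(res) == 1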


@@ -53,8 +53,6 @@ image:
tag: PR-35426-20240812-46dadb120
indexCoordinator:
enabled: false
gc:
interval: 1
resources:
limits:
cpu: "1"
@@ -107,11 +105,20 @@ log:
extraConfigFiles:
user.yaml: |+
indexCoord:
gc:
interval: 1
scheduler:
interval: 100
indexNode:
scheduler:
buildParallel: 4
queryNode:
mmap:
vectorField: true
vectorIndex: true
scalarField: true
scalarIndex: true
growingMmapEnabled: true
metrics:
serviceMonitor:
enabled: true
@@ -260,12 +267,6 @@ queryNode:
requests:
cpu: "0.5"
memory: 500Mi
mmap:
vectorField: true
vectorIndex: true
scalarField: true
scalarIndex: true
growingMmapEnabled: true
rootCoordinator:
resources:
limits:


@@ -90,7 +90,7 @@ class TestMilvusClientV2Base(Base):
collection_name=collection_name, dimension=dimension,
**kwargs).run()
self.tear_down_collection_names.append(collection_name)
# self.tear_down_collection_names.append(collection_name)
return res, check_result
def has_collection(self, client, collection_name, timeout=None, check_task=None,
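With the automatic teardown registration commented out above, the base class no longer drops every collection a test creates, which is consistent with the goal of reusing collections across cases; the new suites added later in this commit instead drop the collections and aliases they own explicitly at the end of each case. A minimal sketch of that explicit-cleanup pattern, using the same wrapper methods the new tests call (the test name and prefix here are illustrative only, not part of this commit):

    def test_explicit_cleanup_example(self):
        # Illustrative sketch: create, use, and explicitly drop a collection,
        # since the base class no longer registers it for auto-teardown.
        client = self._client()
        collection_name = cf.gen_unique_str("example")
        self.create_collection(client, collection_name, default_dim)
        assert self.has_collection(client, collection_name)[0]
        # ... exercise the collection (insert/search/query) ...
        self.drop_collection(client, collection_name)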


@@ -305,6 +305,21 @@ class TestMilvusClientSearchInvalid(TestMilvusClientV2Base):
self.create_collection(client, collection_name, default_dim, id_type="invalid",
check_task=CheckTasks.err_res, check_items=error)
@pytest.mark.tags(CaseLabel.L2)
def test_milvus_client_collection_string_auto_id(self):
"""
target: test high level api: client.create_collection
method: create collection with auto id on string primary key without max_length
expected: Raise exception
"""
client = self._client()
collection_name = cf.gen_unique_str(prefix)
# 1. create collection
error = {ct.err_code: 65535, ct.err_msg: f"type param(max_length) should be specified for the "
f"field({default_primary_key_field_name}) of collection {collection_name}"}
self.create_collection(client, collection_name, default_dim, id_type="string", auto_id=True,
check_task=CheckTasks.err_res, check_items=error)
@pytest.mark.tags(CaseLabel.L1)
def test_milvus_client_create_same_collection_different_params(self):
"""


@@ -0,0 +1,454 @@
import pytest
import random
from base.client_v2_base import TestMilvusClientV2Base
from utils.util_log import test_log as log
from common import common_func as cf
from common import common_type as ct
from common.common_type import CaseLabel, CheckTasks
from utils.util_pymilvus import *
from common.constants import *
from pymilvus import DataType
prefix = "alias"
exp_name = "name"
exp_schema = "schema"
default_schema = cf.gen_default_collection_schema()
default_binary_schema = cf.gen_default_binary_collection_schema()
default_nb = ct.default_nb
default_nb_medium = ct.default_nb_medium
default_nq = ct.default_nq
default_dim = ct.default_dim
default_limit = ct.default_limit
default_search_exp = "int64 >= 0"
default_search_field = ct.default_float_vec_field_name
default_search_params = ct.default_search_params
default_primary_key_field_name = "id"
default_vector_field_name = "vector"
default_float_field_name = ct.default_float_field_name
default_string_field_name = ct.default_string_field_name
class TestMilvusClientV2AliasInvalid(TestMilvusClientV2Base):
""" Negative test cases of alias interface parameters"""
@pytest.mark.tags(CaseLabel.L2)
@pytest.mark.parametrize("alias_name", ct.invalid_resource_names)
def test_milvus_client_v2_create_alias_with_invalid_name(self, alias_name):
"""
target: test creating an alias with an invalid name
method: create a collection, then create an alias with an invalid name
expected: create alias failed
"""
client = self._client()
collection_name = cf.gen_unique_str("collection")
# 1. create collection
self.create_collection(client, collection_name, default_dim, consistency_level="Bounded")
# 2. create alias with invalid name
error = {ct.err_code: 1100, ct.err_msg: "Invalid collection alias"}
if alias_name is None or alias_name.strip() == "":
error = {ct.err_code: 1100, ct.err_msg: "collection alias should not be empty"}
self.create_alias(client, collection_name, alias_name,
check_task=CheckTasks.err_res, check_items=error)
# cleanup
self.drop_collection(client, collection_name)
class TestMilvusClientV2AliasOperation(TestMilvusClientV2Base):
""" Test cases of alias interface operations"""
@pytest.mark.tags(CaseLabel.L0)
def test_milvus_client_v2_alter_alias_operation_default(self):
"""
target: test collection altering alias
method:
1. create collection_1 with index and load, bind alias to collection_1 and insert 2000 entities
2. verify operations using alias work on collection_1
3. create collection_2 with index and load with 1500 entities
4. alter alias to collection_2
5. verify operations using alias work on collection_2
expected:
1. operations using alias work on collection_1 before alter
2. operations using alias work on collection_2 after alter
"""
client = self._client()
# 1. create collection1 with index and load
collection_name1 = cf.gen_unique_str("collection1")
index_params = self.prepare_index_params(client)[0]
index_params.add_index(field_name=default_vector_field_name, metric_type="L2")
self.create_collection(client, collection_name1, default_dim, consistency_level="Bounded", index_params=index_params)
# 2. create alias and insert data
alias_name = cf.gen_unique_str(prefix)
self.create_alias(client, collection_name1, alias_name)
# 3. insert data into collection1 using alias
nb1 = 2000
vectors = cf.gen_vectors(nb1, default_dim)
rows = [{default_primary_key_field_name: i,
default_vector_field_name: vectors[i],
default_float_field_name: i * 1.0,
default_string_field_name: str(i)} for i in range(nb1)]
self.insert(client, alias_name, rows)
self.flush(client, alias_name)
# 4. verify collection1 data using alias
res1 = self.query(client, alias_name, filter="", output_fields=["count(*)"])
assert res1[0][0].get("count(*)") == nb1
# 5. verify search using alias works on collection1
search_vectors = cf.gen_vectors(1, default_dim)
self.search(client, alias_name, search_vectors, limit=default_limit,
check_task=CheckTasks.check_search_results,
check_items={"enable_milvus_client_api": True,
"nq": len(search_vectors),
"limit": default_limit})
# 6. create collection2 with index and load
collection_name2 = cf.gen_unique_str("collection2")
self.create_collection(client, collection_name2, default_dim, consistency_level="Bounded", index_params=index_params)
# 7. insert data into collection2
nb2 = 1500
vectors = cf.gen_vectors(nb2, default_dim)
rows = [{default_primary_key_field_name: i,
default_vector_field_name: vectors[i],
default_float_field_name: i * 1.0,
default_string_field_name: str(i)} for i in range(nb2)]
self.insert(client, collection_name2, rows)
self.flush(client, collection_name2)
# 8. alter alias to collection2
self.alter_alias(client, collection_name2, alias_name)
# 9. verify collection2 data using alias
res2 = self.query(client, alias_name, filter="", output_fields=["count(*)"])
assert res2[0][0].get("count(*)") == nb2
# 10. verify search using alias works on collection2
search_vectors = cf.gen_vectors(1, default_dim)
self.search(client, alias_name, search_vectors, limit=default_limit,
check_task=CheckTasks.check_search_results,
check_items={"enable_milvus_client_api": True,
"nq": len(search_vectors),
"limit": default_limit})
# 11. verify operations on collection1 still work
res1 = self.query(client, collection_name1, filter="", output_fields=["count(*)"])
assert res1[0][0].get("count(*)") == nb1
# cleanup
self.release_collection(client, collection_name1)
self.release_collection(client, collection_name2)
self.drop_collection(client, collection_name1)
self.drop_alias(client, alias_name)
self.drop_collection(client, collection_name2)
@pytest.mark.tags(CaseLabel.L1)
def test_milvus_client_v2_create_drop_alias_operation_default(self):
"""
target: test collection creating and dropping alias
method:
1. create a collection with 10 partitions
2. create an alias for the collection
3. verify alias has same partitions as collection
4. drop the alias
5. verify alias is dropped and collection still exists
expected:
1. alias has same partitions as collection
2. alias can be dropped successfully
3. collection remains unchanged after alias operations
"""
client = self._client()
collection_name = cf.gen_unique_str("collection")
# 1. create collection
self.create_collection(client, collection_name, default_dim, consistency_level="Bounded")
# 2. create partitions
partition_names = []
for _ in range(10):
partition_name = cf.gen_unique_str("partition")
partition_names.append(partition_name)
self.create_partition(client, collection_name, partition_name)
# 3. create alias
alias_name = cf.gen_unique_str(prefix)
self.create_alias(client, collection_name, alias_name)
# 4. verify partitions in collection and alias
partitions = self.list_partitions(client, collection_name)
alias_partitions = self.list_partitions(client, alias_name)
assert partitions == alias_partitions
# 5. verify collection exists
assert self.has_collection(client, collection_name)[0]
assert self.has_collection(client, alias_name)[0]
# 6. drop alias
self.drop_alias(client, alias_name)
# 7. verify alias is dropped
error = {ct.err_code: 0,
ct.err_msg: f"can't find collection[database=default][collection={alias_name}]"}
self.describe_collection(client, alias_name,
check_task=CheckTasks.err_res,
check_items=error)
# 8. verify collection still exists and unchanged
assert self.has_collection(client, collection_name)[0]
collection_partitions = self.list_partitions(client, collection_name)
assert collection_partitions == partitions
# cleanup
self.drop_collection(client, collection_name)
@pytest.mark.tags(CaseLabel.L2)
def test_milvus_client_v2_collection_operations_by_alias(self):
"""
target: test collection operations using alias
method:
1. create collection with alias
2. verify has_collection works with alias
3. verify drop_collection fails with alias
expected:
1. has_collection returns True for alias
2. drop_collection fails with error message
"""
client = self._client()
collection_name = cf.gen_unique_str("collection")
# 1. create collection
self.create_collection(client, collection_name, default_dim, consistency_level="Bounded")
# 2. create alias
alias_name = cf.gen_unique_str(prefix)
self.create_alias(client, collection_name, alias_name)
# 3. verify has_collection works with alias
assert self.has_collection(client, alias_name)[0]
assert self.has_collection(client, collection_name)[0]
# 4. verify drop_collection fails with alias
error = {ct.err_code: 1,
ct.err_msg: f"cannot drop the collection via alias = {alias_name}"}
self.drop_collection(client, alias_name,
check_task=CheckTasks.err_res,
check_items=error)
# cleanup
self.drop_alias(client, alias_name)
self.drop_collection(client, collection_name)
assert not self.has_collection(client, collection_name)[0]
class TestMilvusClientV2AliasOperationInvalid(TestMilvusClientV2Base):
""" Test cases of alias interface invalid operations"""
@pytest.mark.tags(CaseLabel.L1)
def test_milvus_client_v2_create_duplication_alias(self):
"""
target: test create duplicate alias
method: create alias twice with same name to different collections
expected: raise exception
"""
client = self._client()
collection_name1 = cf.gen_unique_str("collection1")
collection_name2 = cf.gen_unique_str("collection2")
# 1. create collection1
self.create_collection(client, collection_name1, default_dim, consistency_level="Bounded")
# 2. create collection2
self.create_collection(client, collection_name2, default_dim, consistency_level="Bounded")
# 3. create alias for collection1
alias_name = cf.gen_unique_str(prefix)
self.create_alias(client, collection_name1, alias_name)
# 4. try to create same alias for collection2
error = {ct.err_code: 1,
ct.err_msg: f"{alias_name} is alias to another collection: {collection_name1}"}
self.create_alias(client, collection_name2, alias_name,
check_task=CheckTasks.err_res,
check_items=error)
# cleanup
self.drop_alias(client, alias_name)
self.drop_collection(client, collection_name1)
self.drop_collection(client, collection_name2)
@pytest.mark.tags(CaseLabel.L1)
def test_milvus_client_v2_alter_not_exist_alias(self):
"""
target: test alter not exist alias
method: alter alias that not exists
expected: raise exception
"""
client = self._client()
collection_name = cf.gen_unique_str("collection")
alias_name = cf.gen_unique_str(prefix)
# 1. create collection
self.create_collection(client, collection_name, default_dim, consistency_level="Bounded")
# 2. create alias and link to the collection
self.create_alias(client, collection_name, alias_name)
# 3. alter alias, trying to link the collection to a non-existent alias
non_exist_alias = cf.gen_unique_str(prefix)
error = {ct.err_code: 1600,
ct.err_msg: f"alias not found[database=default][alias={non_exist_alias}]"}
self.alter_alias(client, collection_name, non_exist_alias,
check_task=CheckTasks.err_res,
check_items=error)
# 4. cleanup
self.drop_alias(client, alias_name)
self.drop_collection(client, collection_name)
@pytest.mark.tags(CaseLabel.L2)
def test_milvus_client_v2_drop_not_exist_alias(self):
"""
target: test drop not exist alias
method: drop alias that not exists
expected: no exception
"""
client = self._client()
alias_name = cf.gen_unique_str(prefix)
# trying to drop a non-existent alias
self.drop_alias(client, alias_name)
@pytest.mark.tags(CaseLabel.L2)
def test_milvus_client_v2_drop_same_alias_twice(self):
"""
target: test drop same alias twice
method: drop alias twice
expected: no exception
"""
client = self._client()
collection_name = cf.gen_unique_str("collection")
# 1. create collection
self.create_collection(client, collection_name, default_dim, consistency_level="Bounded")
# 2. create alias
alias_name = cf.gen_unique_str(prefix)
self.create_alias(client, collection_name, alias_name)
# 3. drop alias first time
self.drop_alias(client, alias_name)
# 4. try to drop alias second time
self.drop_alias(client, alias_name)
# cleanup
self.drop_collection(client, collection_name)
@pytest.mark.tags(CaseLabel.L2)
def test_milvus_client_v2_create_dup_name_collection(self):
"""
target: test create collection with duplicate name
method: create collection with alias name
expected: raise exception
"""
client = self._client()
collection_name = cf.gen_unique_str("collection")
# 1. create collection
self.create_collection(client, collection_name, default_dim, consistency_level="Bounded")
# 2. create alias
alias_name = cf.gen_unique_str(prefix)
self.create_alias(client, collection_name, alias_name)
# 3. try to create collection with alias name
error = {ct.err_code: 0,
ct.err_msg: f"collection name [{alias_name}] conflicts with an existing alias,"
" please choose a unique name"}
self.create_collection(client, alias_name, default_dim, consistency_level="Bounded",
check_task=CheckTasks.err_res,
check_items=error)
# cleanup
self.drop_alias(client, alias_name)
self.drop_collection(client, collection_name)
@pytest.mark.tags(CaseLabel.L0)
def test_milvus_client_v2_reuse_alias_name(self):
"""
target: test reuse alias name from dropped collection
method:
1.create collection1 with alias
2.drop collection1
3.create collection2 with same alias name
expected: create collection2 successfully
"""
client = self._client()
collection_name1 = cf.gen_unique_str("collection1")
# 1. create collection1
self.create_collection(client, collection_name1, default_dim, consistency_level="Bounded")
# 2. create alias
alias_name = cf.gen_unique_str(prefix)
self.create_alias(client, collection_name1, alias_name)
# 3. drop the alias and collection1
self.drop_alias(client, alias_name)
self.drop_collection(client, collection_name1)
# 4. create collection2
collection_name2 = cf.gen_unique_str("collection2")
self.create_collection(client, collection_name2, default_dim, consistency_level="Bounded")
# 5. create alias with the previous alias name and assign it to collection2
self.create_alias(client, collection_name2, alias_name)
# 6. verify collection2
assert self.has_collection(client, collection_name2)[0]
assert self.has_collection(client, alias_name)[0]
# cleanup
self.drop_alias(client, alias_name)
self.drop_collection(client, collection_name2)
@pytest.mark.tags(CaseLabel.L0)
def test_milvus_client_v2_rename_collection_to_alias_name(self):
"""
target: test rename collection to alias name
method:
1.create collection1 with alias
2.rename collection2 to alias name
expected: raise exception
"""
client = self._client()
collection_name1 = cf.gen_unique_str("collection1")
collection_name2 = cf.gen_unique_str("collection2")
# 1. create collection1
self.create_collection(client, collection_name1, default_dim, consistency_level="Bounded")
# 2. create alias
alias_name = cf.gen_unique_str(prefix)
self.create_alias(client, collection_name1, alias_name)
# 3. create collection2
self.create_collection(client, collection_name2, default_dim, consistency_level="Bounded")
# 4. try to rename collection2 to alias name
error = {ct.err_code: 999,
ct.err_msg: f"cannot rename collection to an existing alias: {alias_name}"}
self.rename_collection(client, collection_name2, alias_name,
check_task=CheckTasks.err_res,
check_items=error)
# cleanup
self.drop_alias(client, alias_name)
self.drop_collection(client, collection_name1)
self.drop_collection(client, collection_name2)


@@ -0,0 +1,596 @@
import pytest
import numpy as np
import time
from common.common_type import CaseLabel, CheckTasks
from common import common_func as cf
from common import common_type as ct
from utils.util_log import test_log as log
from utils.util_pymilvus import *
from base.client_v2_base import TestMilvusClientV2Base
from pymilvus import DataType, FieldSchema, CollectionSchema
# Test parameters
default_dim = ct.default_dim
default_nb = ct.default_nb
default_nq = ct.default_nq
default_limit = ct.default_limit
default_search_exp = "id >= 0"
exp_res = "exp_res"
default_primary_key_field_name = "id"
default_vector_field_name = "vector"
default_float_field_name = ct.default_float_field_name
default_string_field_name = ct.default_string_field_name
class TestMilvusClientE2E(TestMilvusClientV2Base):
""" Test case of end-to-end interface """
@pytest.mark.tags(CaseLabel.L0)
@pytest.mark.skip(reason="issue #40686")
@pytest.mark.parametrize("flush_enable", [True, False])
@pytest.mark.parametrize("scalar_index_enable", [True, False])
def test_milvus_client_e2e_default(self, flush_enable, scalar_index_enable):
"""
target: test high level api: client.create_collection, insert, search, query
method: create connection, collection, insert and search with:
1. flush enabled/disabled
2. scalar index enabled/disabled
expected: search/query successfully
"""
client = self._client()
# 1. Create collection with custom schema
collection_name = cf.gen_unique_str("test_e2e")
schema = self.create_schema(client, enable_dynamic_field=False)[0]
# Primary key and vector field
schema.add_field("id", DataType.INT64, is_primary=True, auto_id=False)
schema.add_field("embeddings", DataType.FLOAT_VECTOR, dim=default_dim)
# Boolean type
schema.add_field("bool_field", DataType.BOOL, nullable=True)
# Integer types
schema.add_field("int8_field", DataType.INT8, nullable=True)
schema.add_field("int16_field", DataType.INT16, nullable=True)
schema.add_field("int32_field", DataType.INT32, nullable=True)
schema.add_field("int64_field", DataType.INT64, nullable=True)
# Float types
schema.add_field("float_field", DataType.FLOAT, nullable=True)
schema.add_field("double_field", DataType.DOUBLE, nullable=True)
# String type
schema.add_field("varchar_field", DataType.VARCHAR, max_length=65535, nullable=True)
# JSON type
schema.add_field("json_field", DataType.JSON, nullable=True)
# Array type
schema.add_field("array_field", DataType.ARRAY, element_type=DataType.INT64, max_capacity=12, nullable=True)
# Create collection
self.create_collection(client, collection_name, schema=schema)
# 2. Insert data with null values for nullable fields
num_inserts = 3 # insert data 3 times
total_rows = []
for batch in range(num_inserts):
vectors = cf.gen_vectors(default_nb, default_dim)
rows = []
start_id = batch * default_nb # ensure id is not duplicated
for i in range(default_nb):
row = {
"id": start_id + i, # ensure id is not duplicated
"embeddings": list(vectors[i])
}
# Add nullable fields with null values for every 5th record
if i % 5 == 0:
row.update({
"bool_field": None,
"int8_field": None,
"int16_field": None,
"int32_field": None,
"int64_field": None,
"float_field": None,
"double_field": None,
"varchar_field": None,
"json_field": None,
"array_field": None
})
else:
row.update({
"bool_field": i % 2 == 0,
"int8_field": i % 128,
"int16_field": i % 32768,
"int32_field": i,
"int64_field": i,
"float_field": float(i),
"double_field": float(i) * 1.0,
"varchar_field": f"varchar_{start_id + i}",
"json_field": {"id": start_id + i, "value": f"json_{start_id + i}"},
"array_field": [i, i + 1, i + 2]
})
rows.append(row)
total_rows.append(row)
t0 = time.time()
self.insert(client, collection_name, rows)
t1 = time.time()
log.info(f"Insert batch {batch + 1}: {default_nb} entities cost {t1 - t0:.4f} seconds")
log.info(f"Total inserted {num_inserts * default_nb} entities")
if flush_enable:
self.flush(client, collection_name)
log.info("Flush enabled: executing flush operation")
else:
log.info("Flush disabled: skipping flush operation")
# Create index parameters
index_params = self.prepare_index_params(client)[0]
index_params.add_index("embeddings", metric_type="COSINE")
# Add autoindex for scalar fields if enabled
if scalar_index_enable:
index_params.add_index(field_name="int8_field", index_type="AUTOINDEX")
index_params.add_index(field_name="int16_field", index_type="AUTOINDEX")
index_params.add_index(field_name="int32_field", index_type="AUTOINDEX")
index_params.add_index(field_name="int64_field", index_type="AUTOINDEX")
index_params.add_index(field_name="float_field", index_type="AUTOINDEX")
index_params.add_index(field_name="double_field", index_type="AUTOINDEX")
index_params.add_index(field_name="varchar_field", index_type="AUTOINDEX")
# 3. create index
self.create_index(client, collection_name, index_params)
# Verify scalar indexes are created if enabled
indexes = self.list_indexes(client, collection_name)[0]
log.info(f"Created indexes: {indexes}")
expected_scalar_indexes = ["int8_field", "int16_field", "int32_field", "int64_field",
"float_field", "double_field", "varchar_field"]
if scalar_index_enable:
for field in expected_scalar_indexes:
assert field in indexes, f"Scalar index not created for field: {field}"
else:
for field in expected_scalar_indexes:
assert field not in indexes, f"Scalar index should not be created for field: {field}"
# 4. Load collection
t0 = time.time()
self.load_collection(client, collection_name)
t1 = time.time()
log.info(f"Load collection cost {t1 - t0:.4f} seconds")
# 5. Search
t0 = time.time()
vectors_to_search = cf.gen_vectors(1, default_dim)
search_params = {"metric_type": "COSINE", "params": {"nprobe": 100}}
search_res, _ = self.search(
client,
collection_name,
vectors_to_search,
anns_field="embeddings",
search_params=search_params,
limit=default_limit,
output_fields=['*'],
check_task=CheckTasks.check_search_results,
check_items={"enable_milvus_client_api": True,
"nq": len(vectors_to_search),
"limit": default_limit
}
)
t1 = time.time()
log.info(f"Search cost {t1 - t0:.4f} seconds")
# 6. Query with filters on each scalar field
t0 = time.time()
# Query on boolean field
bool_filter = "bool_field == true"
bool_expected = [r for r in total_rows if r["bool_field"] is not None and r["bool_field"]]
query_res, _ = self.query(
client,
collection_name,
filter=bool_filter,
output_fields=['*'],
check_task=CheckTasks.check_query_results,
check_items={
"exp_res": bool_expected,
"with_vec": True,
"primary_field": "id"
}
)
# Query on int8 field
int8_filter = "int8_field < 50"
int8_expected = [r for r in total_rows if r["int8_field"] is not None and r["int8_field"] < 50]
query_res, _ = self.query(
client,
collection_name,
filter=int8_filter,
output_fields=['*'],
check_task=CheckTasks.check_query_results,
check_items={
"exp_res": int8_expected,
"with_vec": True,
"primary_field": "id"
}
)
# Query on int16 field
int16_filter = "int16_field < 1000"
int16_expected = [r for r in total_rows if r["int16_field"] is not None and r["int16_field"] < 1000]
query_res, _ = self.query(
client,
collection_name,
filter=int16_filter,
output_fields=['*'],
check_task=CheckTasks.check_query_results,
check_items={
"exp_res": int16_expected,
"with_vec": True,
"primary_field": "id"
}
)
# Query on int32 field
int32_filter = "int32_field in [1,2,3,4,5]"
int32_expected = [r for r in total_rows if r["int32_field"] is not None and r["int32_field"] in [1,2,3,4,5]]
query_res, _ = self.query(
client,
collection_name,
filter=int32_filter,
output_fields=['*'],
check_task=CheckTasks.check_query_results,
check_items={
"exp_res": int32_expected,
"with_vec": True,
"primary_field": "id"
}
)
# Query on int64 field
int64_filter = "int64_field >= 10"
int64_expected = [r for r in total_rows if r["int64_field"] is not None and r["int64_field"] >= 10]
query_res, _ = self.query(
client,
collection_name,
filter=int64_filter,
output_fields=['*'],
check_task=CheckTasks.check_query_results,
check_items={
"exp_res": int64_expected,
"with_vec": True,
"primary_field": "id"
}
)
# Query on float field
float_filter = "float_field > 5.0"
float_expected = [r for r in total_rows if r["float_field"] is not None and r["float_field"] > 5.0]
query_res, _ = self.query(
client,
collection_name,
filter=float_filter,
output_fields=['*'],
check_task=CheckTasks.check_query_results,
check_items={
"exp_res": float_expected,
"with_vec": True,
"primary_field": "id"
}
)
# Query on double field
double_filter = "3.0 <=double_field <= 7.0"
double_expected = [r for r in total_rows if r["double_field"] is not None and 3.0 <= r["double_field"] <= 7.0]
query_res, _ = self.query(
client,
collection_name,
filter=double_filter,
output_fields=['*'],
check_task=CheckTasks.check_query_results,
check_items={
"exp_res": double_expected,
"with_vec": True,
"primary_field": "id"
}
)
# Query on varchar field
varchar_filter = "varchar_field like \"varchar_1%\""
varchar_expected = [r for r in total_rows if r["varchar_field"] is not None and r["varchar_field"].startswith("varchar_1")]
query_res, _ = self.query(
client,
collection_name,
filter=varchar_filter,
output_fields=['*'],
check_task=CheckTasks.check_query_results,
check_items={
"exp_res": varchar_expected,
"with_vec": True,
"primary_field": "id"
}
)
# Query on varchar null values
varchar_null_filter = "varchar_field is null"
varchar_null_expected = [r for r in total_rows if r["varchar_field"] is None]
query_res, _ = self.query(
client,
collection_name,
filter=varchar_null_filter,
output_fields=['*'],
check_task=CheckTasks.check_query_results,
check_items={
"exp_res": varchar_null_expected,
"with_vec": True,
"primary_field": "id"
}
)
# Query on json field null values
json_null_filter = "json_field is null"
json_null_expected = [r for r in total_rows if r["json_field"] is None]
query_res, _ = self.query(
client,
collection_name,
filter=json_null_filter,
output_fields=['*'],
check_task=CheckTasks.check_query_results,
check_items={
"exp_res": json_null_expected,
"with_vec": True,
"primary_field": "id"
}
)
# Query on array field null values
array_null_filter = "array_field is null"
array_null_expected = [r for r in total_rows if r["array_field"] is None]
query_res, _ = self.query(
client,
collection_name,
filter=array_null_filter,
output_fields=['*'],
check_task=CheckTasks.check_query_results,
check_items={
"exp_res": array_null_expected,
"with_vec": True,
"primary_field": "id"
}
)
# Query on multiple nullable fields
multi_null_filter = "varchar_field is null and json_field is null and array_field is null"
multi_null_expected = [r for r in total_rows if r["varchar_field"] is None and r["json_field"] is None and r["array_field"] is None]
query_res, _ = self.query(
client,
collection_name,
filter=multi_null_filter,
output_fields=['*'],
check_task=CheckTasks.check_query_results,
check_items={
"exp_res": multi_null_expected,
"with_vec": True,
"primary_field": "id"
}
)
# Query on mix of null and non-null conditions
mix_filter = "varchar_field is null and json_field is not null"
mix_expected = [r for r in total_rows if r["varchar_field"] is None and r["json_field"] is not None]
query_res, _ = self.query(
client,
collection_name,
filter=mix_filter,
output_fields=['*'],
check_task=CheckTasks.check_query_results,
check_items={
"exp_res": mix_expected,
"with_vec": True,
"primary_field": "id"
}
)
# Query on is not null conditions for each scalar field
# Int8 field is not null
int8_not_null_filter = "int8_field is not null"
int8_not_null_expected = [r for r in total_rows if r["int8_field"] is not None]
query_res, _ = self.query(
client,
collection_name,
filter=int8_not_null_filter,
output_fields=['*'],
check_task=CheckTasks.check_query_results,
check_items={
"exp_res": int8_not_null_expected,
"with_vec": True,
"primary_field": "id"
}
)
# Int16 field is not null
int16_not_null_filter = "int16_field is not null"
int16_not_null_expected = [r for r in total_rows if r["int16_field"] is not None]
query_res, _ = self.query(
client,
collection_name,
filter=int16_not_null_filter,
output_fields=['*'],
check_task=CheckTasks.check_query_results,
check_items={
"exp_res": int16_not_null_expected,
"with_vec": True,
"primary_field": "id"
}
)
# Float field is not null
float_not_null_filter = "float_field is not null"
float_not_null_expected = [r for r in total_rows if r["float_field"] is not None]
query_res, _ = self.query(
client,
collection_name,
filter=float_not_null_filter,
output_fields=['*'],
check_task=CheckTasks.check_query_results,
check_items={
"exp_res": float_not_null_expected,
"with_vec": True,
"primary_field": "id"
}
)
# Double field is not null
double_not_null_filter = "double_field is not null"
double_not_null_expected = [r for r in total_rows if r["double_field"] is not None]
query_res, _ = self.query(
client,
collection_name,
filter=double_not_null_filter,
output_fields=['*'],
check_task=CheckTasks.check_query_results,
check_items={
"exp_res": double_not_null_expected,
"with_vec": True,
"primary_field": "id"
}
)
# Varchar field is not null
varchar_not_null_filter = "varchar_field is not null"
varchar_not_null_expected = [r for r in total_rows if r["varchar_field"] is not None]
query_res, _ = self.query(
client,
collection_name,
filter=varchar_not_null_filter,
output_fields=['*'],
check_task=CheckTasks.check_query_results,
check_items={
"exp_res": varchar_not_null_expected,
"with_vec": True,
"primary_field": "id"
}
)
# JSON field is not null
json_not_null_filter = "json_field is not null"
json_not_null_expected = [r for r in total_rows if r["json_field"] is not None]
query_res, _ = self.query(
client,
collection_name,
filter=json_not_null_filter,
output_fields=['*'],
check_task=CheckTasks.check_query_results,
check_items={
"exp_res": json_not_null_expected,
"with_vec": True,
"primary_field": "id"
}
)
# Array field is not null
array_not_null_filter = "array_field is not null"
array_not_null_expected = [r for r in total_rows if r["array_field"] is not None]
query_res, _ = self.query(
client,
collection_name,
filter=array_not_null_filter,
output_fields=['*'],
check_task=CheckTasks.check_query_results,
check_items={
"exp_res": array_not_null_expected,
"with_vec": True,
"primary_field": "id"
}
)
# Multiple fields is not null
multi_not_null_filter = "varchar_field is not null and json_field is not null and array_field is not null"
multi_not_null_expected = [r for r in total_rows if r["varchar_field"] is not None and r["json_field"] is not None and r["array_field"] is not None]
query_res, _ = self.query(
client,
collection_name,
filter=multi_not_null_filter,
output_fields=['*'],
check_task=CheckTasks.check_query_results,
check_items={
"exp_res": multi_not_null_expected,
"with_vec": True,
"primary_field": "id"
}
)
# Complex mixed conditions with is null, is not null, and comparison operators
# Test case 1: int field is null AND float field > value AND varchar field is not null
complex_mix_filter1 = "int32_field is null and float_field > 10.0 and varchar_field is not null"
complex_mix_expected1 = [r for r in total_rows if r["int32_field"] is None and
r["float_field"] is not None and r["float_field"] > 10.0 and
r["varchar_field"] is not None]
query_res, _ = self.query(
client,
collection_name,
filter=complex_mix_filter1,
output_fields=['*'],
check_task=CheckTasks.check_query_results,
check_items={
"exp_res": complex_mix_expected1,
"with_vec": True,
"primary_field": "id"
}
)
# Test case 2: varchar field is not null AND int field between values AND float field is null
complex_mix_filter2 = "varchar_field is not null and 5 <= int64_field <= 15 and float_field is null"
complex_mix_expected2 = [r for r in total_rows if r["varchar_field"] is not None and
r["int64_field"] is not None and 5 <= r["int64_field"] <= 15 and
r["float_field"] is None]
query_res, _ = self.query(
client,
collection_name,
filter=complex_mix_filter2,
output_fields=['*'],
check_task=CheckTasks.check_query_results,
check_items={
"exp_res": complex_mix_expected2,
"with_vec": True,
"primary_field": "id"
}
)
# Test case 3: Multiple fields with mixed null/not null conditions and range comparisons
complex_mix_filter3 = "int8_field is not null and int8_field < 50 and double_field is null and varchar_field is not null and varchar_field like \"varchar_2%\""
complex_mix_expected3 = [r for r in total_rows if r["int8_field"] is not None and r["int8_field"] < 50 and
r["double_field"] is None and
r["varchar_field"] is not None and r["varchar_field"].startswith("varchar_2")]
query_res, _ = self.query(
client,
collection_name,
filter=complex_mix_filter3,
output_fields=['*'],
check_task=CheckTasks.check_query_results,
check_items={
"exp_res": complex_mix_expected3,
"with_vec": True,
"primary_field": "id"
}
)
t1 = time.time()
log.info(f"Query on all scalar fields cost {t1 - t0:.4f} seconds")
# 7. Delete data
t0 = time.time()
self.delete(client, collection_name, filter=default_search_exp)
t1 = time.time()
log.info(f"Delete cost {t1 - t0:.4f} seconds")
# 8. Verify deletion
query_res, _ = self.query(
client,
collection_name,
filter=default_search_exp,
check_task=CheckTasks.check_query_results,
check_items={"exp_res": []}
)
# 9. Cleanup
self.release_collection(client, collection_name)
self.drop_collection(client, collection_name)

File diff suppressed because it is too large.

File diff suppressed because it is too large.


@@ -0,0 +1,133 @@
import numpy as np
from pymilvus.orm.types import CONSISTENCY_STRONG, CONSISTENCY_BOUNDED, CONSISTENCY_SESSION, CONSISTENCY_EVENTUALLY
from pymilvus import AnnSearchRequest, RRFRanker, WeightedRanker
from pymilvus import (
FieldSchema, CollectionSchema, DataType,
Collection
)
from common.constants import *
from utils.util_pymilvus import *
from common.common_type import CaseLabel, CheckTasks
from common import common_type as ct
from common import common_func as cf
from utils.util_log import test_log as log
from base.client_base import TestcaseBase
import heapq
from time import sleep
from decimal import Decimal, getcontext
import decimal
import multiprocessing
import numbers
import random
import math
import numpy
import threading
import pytest
import pandas as pd
from faker import Faker
Faker.seed(19530)
fake_en = Faker("en_US")
fake_zh = Faker("zh_CN")
# patch faker to generate text with specific distribution
cf.patch_faker_text(fake_en, cf.en_vocabularies_distribution)
cf.patch_faker_text(fake_zh, cf.zh_vocabularies_distribution)
pd.set_option("expand_frame_repr", False)
prefix = "search_collection"
search_num = 10
max_dim = ct.max_dim
min_dim = ct.min_dim
epsilon = ct.epsilon
hybrid_search_epsilon = 0.01
gracefulTime = ct.gracefulTime
default_nb = ct.default_nb
default_nb_medium = ct.default_nb_medium
default_nq = ct.default_nq
default_dim = ct.default_dim
default_limit = ct.default_limit
max_limit = ct.max_limit
default_search_exp = "int64 >= 0"
default_search_string_exp = "varchar >= \"0\""
default_search_mix_exp = "int64 >= 0 && varchar >= \"0\""
default_invaild_string_exp = "varchar >= 0"
default_json_search_exp = "json_field[\"number\"] >= 0"
perfix_expr = 'varchar like "0%"'
default_search_field = ct.default_float_vec_field_name
default_search_params = ct.default_search_params
default_int64_field_name = ct.default_int64_field_name
default_float_field_name = ct.default_float_field_name
default_bool_field_name = ct.default_bool_field_name
default_string_field_name = ct.default_string_field_name
default_json_field_name = ct.default_json_field_name
default_index_params = ct.default_index
vectors = [[random.random() for _ in range(default_dim)] for _ in range(default_nq)]
range_search_supported_indexes = ct.all_index_types[:7]
uid = "test_search"
nq = 1
epsilon = 0.001
field_name = default_float_vec_field_name
binary_field_name = default_binary_vec_field_name
search_param = {"nprobe": 1}
entity = gen_entities(1, is_normal=True)
entities = gen_entities(default_nb, is_normal=True)
raw_vectors, binary_entities = gen_binary_entities(default_nb)
default_query, _ = gen_search_vectors_params(field_name, entities, default_top_k, nq)
index_name1 = cf.gen_unique_str("float")
index_name2 = cf.gen_unique_str("varhar")
half_nb = ct.default_nb // 2
max_hybrid_search_req_num = ct.max_hybrid_search_req_num
class TestSearchArray(TestcaseBase):
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("array_element_data_type", [DataType.INT64])
def test_search_array_with_inverted_index(self, array_element_data_type):
# create collection
additional_params = {"max_length": 1000} if array_element_data_type == DataType.VARCHAR else {}
fields = [
FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
FieldSchema(name="contains", dtype=DataType.ARRAY, element_type=array_element_data_type, max_capacity=2000,
**additional_params),
FieldSchema(name="contains_any", dtype=DataType.ARRAY, element_type=array_element_data_type,
max_capacity=2000, **additional_params),
FieldSchema(name="contains_all", dtype=DataType.ARRAY, element_type=array_element_data_type,
max_capacity=2000, **additional_params),
FieldSchema(name="equals", dtype=DataType.ARRAY, element_type=array_element_data_type, max_capacity=2000,
**additional_params),
FieldSchema(name="array_length_field", dtype=DataType.ARRAY, element_type=array_element_data_type,
max_capacity=2000, **additional_params),
FieldSchema(name="array_access", dtype=DataType.ARRAY, element_type=array_element_data_type,
max_capacity=2000, **additional_params),
FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=128)
]
schema = CollectionSchema(fields=fields, description="test collection", enable_dynamic_field=True)
collection_w = self.init_collection_wrap(name=cf.gen_unique_str(prefix), schema=schema)
# insert data
train_data, query_expr = cf.prepare_array_test_data(3000, hit_rate=0.05)
collection_w.insert(train_data)
index_params = {"metric_type": "L2", "index_type": "HNSW", "params": {"M": 48, "efConstruction": 500}}
collection_w.create_index("emb", index_params=index_params)
for f in ["contains", "contains_any", "contains_all", "equals", "array_length_field", "array_access"]:
collection_w.create_index(f, {"index_type": "INVERTED"})
collection_w.load()
for item in query_expr:
expr = item["expr"]
ground_truth_candidate = item["ground_truth"]
res, _ = collection_w.search(
data=[np.array([random.random() for j in range(128)], dtype=np.dtype("float32"))],
anns_field="emb",
param={"metric_type": "L2", "params": {"M": 32, "efConstruction": 360}},
limit=10,
expr=expr,
output_fields=["*"],
)
assert len(res) == 1
for i in range(len(res)):
assert len(res[i]) == 10
for hit in res[i]:
assert hit.id in ground_truth_candidate


@@ -0,0 +1,475 @@
import numpy as np
from pymilvus.orm.types import CONSISTENCY_STRONG, CONSISTENCY_BOUNDED, CONSISTENCY_SESSION, CONSISTENCY_EVENTUALLY
from pymilvus import AnnSearchRequest, RRFRanker, WeightedRanker
from pymilvus import (
FieldSchema, CollectionSchema, DataType,
Collection
)
from common.constants import *
from utils.util_pymilvus import *
from common.common_type import CaseLabel, CheckTasks
from common import common_type as ct
from common import common_func as cf
from utils.util_log import test_log as log
from base.client_base import TestcaseBase
import heapq
from time import sleep
from decimal import Decimal, getcontext
import decimal
import multiprocessing
import numbers
import random
import math
import numpy
import threading
import pytest
import pandas as pd
from faker import Faker
Faker.seed(19530)
fake_en = Faker("en_US")
fake_zh = Faker("zh_CN")
# patch faker to generate text with specific distribution
cf.patch_faker_text(fake_en, cf.en_vocabularies_distribution)
cf.patch_faker_text(fake_zh, cf.zh_vocabularies_distribution)
pd.set_option("expand_frame_repr", False)
prefix = "search_collection"
search_num = 10
max_dim = ct.max_dim
min_dim = ct.min_dim
epsilon = ct.epsilon
hybrid_search_epsilon = 0.01
gracefulTime = ct.gracefulTime
default_nb = ct.default_nb
default_nb_medium = ct.default_nb_medium
default_nq = ct.default_nq
default_dim = ct.default_dim
default_limit = ct.default_limit
max_limit = ct.max_limit
default_search_exp = "int64 >= 0"
default_search_string_exp = "varchar >= \"0\""
default_search_mix_exp = "int64 >= 0 && varchar >= \"0\""
default_invaild_string_exp = "varchar >= 0"
default_json_search_exp = "json_field[\"number\"] >= 0"
perfix_expr = 'varchar like "0%"'
default_search_field = ct.default_float_vec_field_name
default_search_params = ct.default_search_params
default_int64_field_name = ct.default_int64_field_name
default_float_field_name = ct.default_float_field_name
default_bool_field_name = ct.default_bool_field_name
default_string_field_name = ct.default_string_field_name
default_json_field_name = ct.default_json_field_name
default_index_params = ct.default_index
vectors = [[random.random() for _ in range(default_dim)] for _ in range(default_nq)]
range_search_supported_indexes = ct.all_index_types[:7]
uid = "test_search"
nq = 1
epsilon = 0.001
field_name = default_float_vec_field_name
binary_field_name = default_binary_vec_field_name
search_param = {"nprobe": 1}
entity = gen_entities(1, is_normal=True)
entities = gen_entities(default_nb, is_normal=True)
raw_vectors, binary_entities = gen_binary_entities(default_nb)
default_query, _ = gen_search_vectors_params(field_name, entities, default_top_k, nq)
index_name1 = cf.gen_unique_str("float")
index_name2 = cf.gen_unique_str("varhar")
half_nb = ct.default_nb // 2
max_hybrid_search_req_num = ct.max_hybrid_search_req_num
class TestSearchDiskann(TestcaseBase):
"""
******************************************************************
The following cases are used to test search about diskann index
******************************************************************
"""
@pytest.fixture(scope="function", params=[32, 128])
def dim(self, request):
yield request.param
@pytest.fixture(scope="function", params=[False, True])
def auto_id(self, request):
yield request.param
@pytest.fixture(scope="function", params=[False, True])
def _async(self, request):
yield request.param
@pytest.fixture(scope="function", params=[True, False])
def enable_dynamic_field(self, request):
yield request.param
@pytest.mark.tags(CaseLabel.L2)
def test_search_with_diskann_index(self, _async):
"""
target: test search with diskann index
method: 1.create collection, insert data, primary_field is int field
2.create diskann index, then load
3.search
expected: search successfully
"""
# 1. initialize with data
dim = 100
auto_id = False
enable_dynamic_field = True
nb = 2000
collection_w, _, _, insert_ids = self.init_collection_general(prefix, True, auto_id=auto_id,
nb=nb, dim=dim, is_index=False,
enable_dynamic_field=enable_dynamic_field)[0:4]
# 2. create index
default_index = {"index_type": "DISKANN",
"metric_type": "L2", "params": {}}
collection_w.create_index(
ct.default_float_vec_field_name, default_index)
collection_w.load()
default_search_params = {
"metric_type": "L2", "params": {"search_list": 30}}
vectors = [[random.random() for _ in range(dim)]
for _ in range(default_nq)]
output_fields = [default_int64_field_name,
default_float_field_name, default_string_field_name]
collection_w.search(vectors[:default_nq], default_search_field,
default_search_params, default_limit,
default_search_exp,
output_fields=output_fields,
_async=_async,
check_task=CheckTasks.check_search_results,
check_items={"nq": default_nq,
"ids": insert_ids,
"limit": default_limit,
"_async": _async}
)
@pytest.mark.tags(CaseLabel.L2)
@pytest.mark.parametrize("search_list", [20, 200])
def test_search_with_limit_20(self, _async, search_list):
"""
target: test search with diskann index when limit is 20
method: 1.create collection, insert data, primary_field is int field
2.create diskann index, then load
3.search with limit 20 and different search_list values
expected: search successfully
"""
limit = 20
# 1. initialize with data
enable_dynamic_field = True
collection_w, _, _, insert_ids = self.init_collection_general(prefix, True, is_index=False,
enable_dynamic_field=enable_dynamic_field)[0:4]
# 2. create index
default_index = {"index_type": "DISKANN", "metric_type": "L2", "params": {}}
collection_w.create_index(ct.default_float_vec_field_name, default_index)
collection_w.load()
search_params = {"metric_type": "L2", "params": {"search_list": search_list}}
output_fields = [default_int64_field_name, default_float_field_name, default_string_field_name]
collection_w.search(vectors[:default_nq], default_search_field,
search_params, limit, default_search_exp,
output_fields=output_fields, _async=_async,
check_task=CheckTasks.check_search_results,
check_items={"nq": default_nq,
"ids": insert_ids,
"limit": limit,
"_async": _async})
@pytest.mark.tags(CaseLabel.L2)
def test_search_invalid_params_with_diskann_B(self):
"""
target: test search with invalid search_list params on diskann index
method: 1.create collection, insert data, primary_field is int field
2.create diskann index
3.search with invalid params: search_list smaller than the limit (k)
expected: search reports an error
"""
# 1. initialize with data
dim = 100
limit = 20
auto_id = True
collection_w, _, _, insert_ids = \
self.init_collection_general(prefix, True, auto_id=auto_id, dim=dim, is_index=False)[0:4]
# 2. create index
default_index = {"index_type": "DISKANN", "metric_type": "L2", "params": {}}
collection_w.create_index(ct.default_float_vec_field_name, default_index)
collection_w.load()
default_search_params = {"metric_type": "L2", "params": {"search_list": limit-1}}
vectors = [[random.random() for _ in range(dim)] for _ in range(default_nq)]
output_fields = [default_int64_field_name, default_float_field_name, default_string_field_name]
collection_w.search(vectors[:default_nq], default_search_field,
default_search_params, limit,
default_search_exp,
output_fields=output_fields,
check_task=CheckTasks.err_res,
check_items={"err_code": 999,
"err_msg": f"should be larger than k({limit})"})
@pytest.mark.tags(CaseLabel.L2)
def test_search_with_diskann_with_string_pk(self):
"""
target: test search with diskann index on string primary key
method: 1.create collection, insert data, primary_field is string field
2.create diskann index
3.search
expected: search successfully
"""
# 1. initialize with data
dim = 128
enable_dynamic_field = True
collection_w, _, _, insert_ids = \
self.init_collection_general(prefix, True, auto_id=False, dim=dim, is_index=False,
primary_field=ct.default_string_field_name,
enable_dynamic_field=enable_dynamic_field)[0:4]
# 2. create index
default_index = {"index_type": "DISKANN",
"metric_type": "L2", "params": {}}
collection_w.create_index(
ct.default_float_vec_field_name, default_index)
collection_w.load()
search_list = 20
default_search_params = {"metric_type": "L2",
"params": {"search_list": search_list}}
vectors = [[random.random() for _ in range(dim)]
for _ in range(default_nq)]
output_fields = [default_int64_field_name,
default_float_field_name, default_string_field_name]
collection_w.search(vectors[:default_nq], default_search_field,
default_search_params, default_limit,
default_search_exp,
output_fields=output_fields,
check_task=CheckTasks.check_search_results,
check_items={"nq": default_nq,
"ids": insert_ids,
"limit": default_limit}
)
@pytest.mark.tags(CaseLabel.L2)
def test_search_with_delete_data(self, _async):
"""
target: test search after deleting data
method: 1.create collection, insert data
2.create diskann index
3.delete data, then search
expected: deleted ids are not in the search result
"""
# 1. initialize with data
dim = 100
auto_id = True
enable_dynamic_field = True
collection_w, _, _, ids = \
self.init_collection_general(prefix, True, auto_id=auto_id, dim=dim, is_index=False,
enable_dynamic_field=enable_dynamic_field)[0:4]
# 2. create index
default_index = {"index_type": "DISKANN",
"metric_type": "L2", "params": {}}
collection_w.create_index(
ct.default_float_vec_field_name, default_index)
collection_w.load()
tmp_expr = f'{ct.default_int64_field_name} in {[0]}'
expr = f'{ct.default_int64_field_name} in {ids[:half_nb]}'
# delete half of data
del_res = collection_w.delete(expr)[0]
assert del_res.delete_count == half_nb
collection_w.delete(tmp_expr)
default_search_params = {
"metric_type": "L2", "params": {"search_list": 30}}
vectors = [[random.random() for _ in range(dim)]
for _ in range(default_nq)]
output_fields = [default_int64_field_name,
default_float_field_name, default_string_field_name]
collection_w.search(vectors[:default_nq], default_search_field,
default_search_params, default_limit,
default_search_exp,
output_fields=output_fields,
_async=_async,
check_task=CheckTasks.check_search_results,
check_items={"nq": default_nq,
"ids": ids,
"limit": default_limit,
"_async": _async}
)
@pytest.mark.tags(CaseLabel.L2)
def test_search_with_diskann_and_more_index(self, _async):
"""
target: test search with diskann and additional scalar indexes after deleting data
method: 1.create collection, insert data
2.create more indexes, then load
3.delete half of the data, search
expected: deleted ids are not in the search result
"""
# 1. initialize with data
dim = 64
auto_id = False
enable_dynamic_field = True
collection_w, _, _, ids = \
self.init_collection_general(prefix, True, auto_id=auto_id, dim=dim, is_index=False,
enable_dynamic_field=enable_dynamic_field, language="French")[0:4]
# 2. create index
default_index = {"index_type": "DISKANN",
"metric_type": "COSINE", "params": {}}
collection_w.create_index(ct.default_float_vec_field_name, default_index, index_name=index_name1)
if not enable_dynamic_field:
index_params_one = {}
collection_w.create_index("float", index_params_one, index_name="a")
index_param_two = {}
collection_w.create_index("varchar", index_param_two, index_name="b")
collection_w.load()
tmp_expr = f'{ct.default_int64_field_name} in {[0]}'
expr = f'{ct.default_int64_field_name} in {ids[:half_nb]}'
# delete half of data
del_res = collection_w.delete(expr)[0]
assert del_res.delete_count == half_nb
collection_w.delete(tmp_expr)
default_search_params = {"metric_type": "COSINE", "params": {"search_list": 30}}
vectors = [[random.random() for _ in range(dim)] for _ in range(default_nq)]
output_fields = [default_int64_field_name, default_float_field_name, default_string_field_name]
collection_w.search(vectors[:default_nq], default_search_field,
default_search_params, default_limit,
default_search_exp,
output_fields=output_fields,
_async=_async,
check_task=CheckTasks.check_search_results,
check_items={"nq": default_nq,
"ids": ids,
"limit": default_limit,
"_async": _async})
@pytest.mark.tags(CaseLabel.L1)
def test_search_with_scalar_field(self, _async):
"""
target: test search with scalar field
method: 1.create collection, insert data
2.create more indexes, then load
3.search with expr
expected: indexes are created and search succeeds
"""
# 1. initialize with data
dim = 66
enable_dynamic_field = True
collection_w, _, _, ids = \
self.init_collection_general(prefix, True, dim=dim, primary_field=ct.default_string_field_name,
is_index=False, enable_dynamic_field=enable_dynamic_field)[0:4]
# 2. create index
default_index = {"index_type": "IVF_SQ8",
"metric_type": "COSINE", "params": {"nlist": 64}}
collection_w.create_index(
ct.default_float_vec_field_name, default_index)
index_params = {}
if not enable_dynamic_field:
collection_w.create_index(
ct.default_float_field_name, index_params=index_params)
collection_w.create_index(
ct.default_int64_field_name, index_params=index_params)
else:
collection_w.create_index(
ct.default_string_field_name, index_params=index_params)
collection_w.load()
default_expr = "int64 in [1, 2, 3, 4]"
limit = 4
default_search_params = {"metric_type": "COSINE", "params": {"nprobe": 64}}
vectors = [[random.random() for _ in range(dim)] for _ in range(default_nq)]
output_fields = [default_int64_field_name,
default_float_field_name, default_string_field_name]
search_res = collection_w.search(vectors[:default_nq], default_search_field,
default_search_params, limit, default_expr,
output_fields=output_fields, _async=_async,
check_task=CheckTasks.check_search_results,
check_items={"nq": default_nq,
"ids": ids,
"limit": limit,
"_async": _async})
@pytest.mark.tags(CaseLabel.L2)
@pytest.mark.parametrize("limit", [10, 100, 1000])
def test_search_diskann_search_list_equal_to_limit(self, limit, _async):
"""
target: test search diskann index when search_list equal to limit
method: 1.create collection, insert data, primary_field is int field
2.create diskann index, then load
3.search
expected: search successfully
"""
# 1. initialize with data
dim = 77
auto_id = False
enable_dynamic_field = False
collection_w, _, _, insert_ids = self.init_collection_general(prefix, True, auto_id=auto_id,
dim=dim, is_index=False,
enable_dynamic_field=enable_dynamic_field)[0:4]
# 2. create index
default_index = {"index_type": "DISKANN",
"metric_type": "L2", "params": {}}
collection_w.create_index(
ct.default_float_vec_field_name, default_index)
collection_w.load()
search_params = {"metric_type": "L2", "params": {"search_list": limit}}
vectors = [[random.random() for _ in range(dim)]
for _ in range(default_nq)]
output_fields = [default_int64_field_name,
default_float_field_name, default_string_field_name]
collection_w.search(vectors[:default_nq], default_search_field,
search_params, limit,
default_search_exp,
output_fields=output_fields,
_async=_async,
check_task=CheckTasks.check_search_results,
check_items={"nq": default_nq,
"ids": insert_ids,
"limit": limit,
"_async": _async}
)
@pytest.mark.tags(CaseLabel.L2)
@pytest.mark.xfail(reason="issue #23672")
def test_search_diskann_search_list_up_to_min(self, _async):
"""
target: test search diskann index when search_list up to min
method: 1.create collection, insert data, primary_field is int field
2.create diskann index, then load
3.search
expected: search successfully
"""
# 1. initialize with data
dim = 100
auto_id = True
collection_w, _, _, insert_ids = self.init_collection_general(prefix, True, auto_id=auto_id,
dim=dim, is_index=False)[0:4]
# 2. create index
default_index = {"index_type": "DISKANN",
"metric_type": "L2", "params": {}}
collection_w.create_index(
ct.default_float_vec_field_name, default_index)
collection_w.load()
search_params = {"metric_type": "L2",
"params": {"k": 200, "search_list": 201}}
search_vectors = [[random.random() for _ in range(dim)]
for _ in range(default_nq)]
output_fields = [default_int64_field_name,
default_float_field_name, default_string_field_name]
collection_w.search(search_vectors[:default_nq], default_search_field,
search_params, default_limit,
default_search_exp,
output_fields=output_fields,
_async=_async,
check_task=CheckTasks.check_search_results,
check_items={"nq": default_nq,
"ids": insert_ids,
"limit": default_limit,
"_async": _async})


@@ -0,0 +1,102 @@
import numpy as np
from pymilvus.orm.types import CONSISTENCY_STRONG, CONSISTENCY_BOUNDED, CONSISTENCY_SESSION, CONSISTENCY_EVENTUALLY
from pymilvus import AnnSearchRequest, RRFRanker, WeightedRanker
from pymilvus import (
FieldSchema, CollectionSchema, DataType,
Collection
)
from common.constants import *
from utils.util_pymilvus import *
from common.common_type import CaseLabel, CheckTasks
from common import common_type as ct
from common import common_func as cf
from utils.util_log import test_log as log
from base.client_base import TestcaseBase
import heapq
from time import sleep
from decimal import Decimal, getcontext
import decimal
import multiprocessing
import numbers
import random
import math
import numpy
import threading
import pytest
import pandas as pd
from faker import Faker
Faker.seed(19530)
fake_en = Faker("en_US")
fake_zh = Faker("zh_CN")
# patch faker to generate text with specific distribution
cf.patch_faker_text(fake_en, cf.en_vocabularies_distribution)
cf.patch_faker_text(fake_zh, cf.zh_vocabularies_distribution)
pd.set_option("expand_frame_repr", False)
prefix = "search_collection"
search_num = 10
max_dim = ct.max_dim
min_dim = ct.min_dim
epsilon = ct.epsilon
hybrid_search_epsilon = 0.01
gracefulTime = ct.gracefulTime
default_nb = ct.default_nb
default_nb_medium = ct.default_nb_medium
default_nq = ct.default_nq
default_dim = ct.default_dim
default_limit = ct.default_limit
max_limit = ct.max_limit
default_search_exp = "int64 >= 0"
default_search_string_exp = "varchar >= \"0\""
default_search_mix_exp = "int64 >= 0 && varchar >= \"0\""
default_invaild_string_exp = "varchar >= 0"
default_json_search_exp = "json_field[\"number\"] >= 0"
perfix_expr = 'varchar like "0%"'
default_search_field = ct.default_float_vec_field_name
default_search_params = ct.default_search_params
default_int64_field_name = ct.default_int64_field_name
default_float_field_name = ct.default_float_field_name
default_bool_field_name = ct.default_bool_field_name
default_string_field_name = ct.default_string_field_name
default_json_field_name = ct.default_json_field_name
default_index_params = ct.default_index
vectors = [[random.random() for _ in range(default_dim)] for _ in range(default_nq)]
range_search_supported_indexes = ct.all_index_types[:7]
uid = "test_search"
nq = 1
epsilon = 0.001
field_name = default_float_vec_field_name
binary_field_name = default_binary_vec_field_name
search_param = {"nprobe": 1}
entity = gen_entities(1, is_normal=True)
entities = gen_entities(default_nb, is_normal=True)
raw_vectors, binary_entities = gen_binary_entities(default_nb)
default_query, _ = gen_search_vectors_params(field_name, entities, default_top_k, nq)
index_name1 = cf.gen_unique_str("float")
index_name2 = cf.gen_unique_str("varhar")
half_nb = ct.default_nb // 2
max_hybrid_search_req_num = ct.max_hybrid_search_req_num
class TestSearchDSL(TestcaseBase):
@pytest.mark.tags(CaseLabel.L0)
def test_search_vector_only(self):
"""
target: test search normal scenario
method: search vector only
expected: search status ok, the length of result
"""
collection_w, _, _, insert_ids, time_stamp = \
self.init_collection_general(prefix, True, ct.default_nb)[0:5]
vectors = [[random.random() for _ in range(ct.default_dim)]
for _ in range(nq)]
collection_w.search(vectors[:nq], default_search_field,
default_search_params, ct.default_top_k,
default_search_exp,
check_task=CheckTasks.check_search_results,
check_items={"nq": nq,
"ids": insert_ids,
"limit": ct.default_top_k})

View File

@@ -0,0 +1,357 @@
import numpy as np
from pymilvus.orm.types import CONSISTENCY_STRONG, CONSISTENCY_BOUNDED, CONSISTENCY_SESSION, CONSISTENCY_EVENTUALLY
from pymilvus import AnnSearchRequest, RRFRanker, WeightedRanker
from pymilvus import (
FieldSchema, CollectionSchema, DataType,
Collection
)
from common.constants import *
from utils.util_pymilvus import *
from common.common_type import CaseLabel, CheckTasks
from common import common_type as ct
from common import common_func as cf
from utils.util_log import test_log as log
from base.client_base import TestcaseBase
import heapq
from time import sleep
from decimal import Decimal, getcontext
import decimal
import multiprocessing
import numbers
import random
import math
import numpy
import threading
import pytest
import pandas as pd
from faker import Faker
Faker.seed(19530)
fake_en = Faker("en_US")
fake_zh = Faker("zh_CN")
# patch faker to generate text with specific distribution
cf.patch_faker_text(fake_en, cf.en_vocabularies_distribution)
cf.patch_faker_text(fake_zh, cf.zh_vocabularies_distribution)
pd.set_option("expand_frame_repr", False)
prefix = "search_collection"
search_num = 10
max_dim = ct.max_dim
min_dim = ct.min_dim
epsilon = ct.epsilon
hybrid_search_epsilon = 0.01
gracefulTime = ct.gracefulTime
default_nb = ct.default_nb
default_nb_medium = ct.default_nb_medium
default_nq = ct.default_nq
default_dim = ct.default_dim
default_limit = ct.default_limit
max_limit = ct.max_limit
default_search_exp = "int64 >= 0"
default_search_string_exp = "varchar >= \"0\""
default_search_mix_exp = "int64 >= 0 && varchar >= \"0\""
default_invaild_string_exp = "varchar >= 0"
default_json_search_exp = "json_field[\"number\"] >= 0"
perfix_expr = 'varchar like "0%"'
default_search_field = ct.default_float_vec_field_name
default_search_params = ct.default_search_params
default_int64_field_name = ct.default_int64_field_name
default_float_field_name = ct.default_float_field_name
default_bool_field_name = ct.default_bool_field_name
default_string_field_name = ct.default_string_field_name
default_json_field_name = ct.default_json_field_name
default_index_params = ct.default_index
vectors = [[random.random() for _ in range(default_dim)] for _ in range(default_nq)]
range_search_supported_indexes = ct.all_index_types[:7]
uid = "test_search"
nq = 1
epsilon = 0.001
field_name = default_float_vec_field_name
binary_field_name = default_binary_vec_field_name
search_param = {"nprobe": 1}
entity = gen_entities(1, is_normal=True)
entities = gen_entities(default_nb, is_normal=True)
raw_vectors, binary_entities = gen_binary_entities(default_nb)
default_query, _ = gen_search_vectors_params(field_name, entities, default_top_k, nq)
index_name1 = cf.gen_unique_str("float")
index_name2 = cf.gen_unique_str("varhar")
half_nb = ct.default_nb // 2
max_hybrid_search_req_num = ct.max_hybrid_search_req_num
class TestSearchGroupBy(TestcaseBase):
""" Test case of search group by """
@pytest.mark.tags(CaseLabel.L2)
def test_search_max_group_size_and_max_limit(self):
"""
target: test search group by with max group size and max limit
method: 1. create a collection with data
                2. search with group by int32 with max group size and max limit
        expected: search successfully
        """
pass
@pytest.mark.tags(CaseLabel.L2)
@pytest.mark.parametrize("group_size", [0, -1])
@pytest.mark.xfail(reason="issue #36146")
def test_search_negative_group_size(self, group_size):
"""
        target: test search group by with negative group size
        method: search with group_by_field and group_size set to 0 or -1
        expected: raise exception
        """
collection_w = self.init_collection_general(prefix, auto_id=True, insert_data=True, is_index=True)[0]
search_params = ct.default_search_params
search_vectors = cf.gen_vectors(1, dim=ct.default_dim)
# verify
error = {ct.err_code: 999, ct.err_msg: "group_size must be greater than 1"}
collection_w.search(data=search_vectors, anns_field=ct.default_float_vec_field_name,
param=search_params, limit=10,
group_by_field=ct.default_int64_field_name,
group_size=group_size,
check_task=CheckTasks.err_res, check_items=error)
@pytest.mark.tags(CaseLabel.L2)
@pytest.mark.parametrize("metric", ["JACCARD", "HAMMING"])
def test_search_binary_vec_group_by(self, metric):
"""
        target: test that search on binary vectors does not support group by
        method: 1. create a collection with binary vectors
                2. create index with different metric types
                3. search with group by
        verify: the error code and msg
"""
collection_w = self.init_collection_general(prefix, auto_id=True, insert_data=False, is_index=False,
is_binary=True)[0]
_index = {"index_type": "BIN_FLAT", "metric_type": metric, "params": {"M": 16, "efConstruction": 128}}
collection_w.create_index(ct.default_binary_vec_field_name, index_params=_index)
# insert with the same values for scalar fields
for _ in range(10):
data = cf.gen_default_binary_dataframe_data(nb=100, auto_id=True)[0]
collection_w.insert(data)
collection_w.flush()
collection_w.create_index(ct.default_binary_vec_field_name, index_params=_index)
collection_w.load()
search_params = {"metric_type": metric, "params": {"ef": 128}}
nq = 2
limit = 10
search_vectors = cf.gen_binary_vectors(nq, dim=ct.default_dim)[1]
        # verify the error code and msg for group by on binary vectors
err_code = 999
err_msg = "not support search_group_by operation based on binary"
collection_w.search(data=search_vectors, anns_field=ct.default_binary_vec_field_name,
param=search_params, limit=limit,
group_by_field=ct.default_int64_field_name,
check_task=CheckTasks.err_res,
check_items={"err_code": err_code, "err_msg": err_msg})
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("grpby_unsupported_field", [ct.default_float_field_name, ct.default_json_field_name,
ct.default_double_field_name, ct.default_float_vec_field_name])
def test_search_group_by_unsupported_field(self, grpby_unsupported_field):
"""
target: test search group by with the unsupported field
method: 1. create a collection with data
2. create index
3. search with group by the unsupported fields
verify: the error code and msg
"""
metric = "IP"
collection_w = self.init_collection_general(prefix, insert_data=True, is_index=False,
is_all_data_type=True, with_json=True, )[0]
_index = {"index_type": "HNSW", "metric_type": metric, "params": {"M": 16, "efConstruction": 128}}
collection_w.create_index(ct.default_float_vec_field_name, index_params=_index)
collection_w.load()
search_params = {"metric_type": metric, "params": {"ef": 64}}
nq = 1
limit = 1
search_vectors = cf.gen_vectors(nq, dim=ct.default_dim)
# search with groupby
err_code = 999
err_msg = f"unsupported data type"
collection_w.search(data=search_vectors, anns_field=ct.default_float_vec_field_name,
param=search_params, limit=limit,
group_by_field=grpby_unsupported_field,
check_task=CheckTasks.err_res,
check_items={"err_code": err_code, "err_msg": err_msg})
@pytest.mark.tags(CaseLabel.L2)
@pytest.mark.parametrize("index", ct.all_index_types[:7])
def test_search_group_by_unsupported_index(self, index):
"""
target: test search group by with the unsupported vector index
method: 1. create a collection with data
2. create a groupby unsupported index
3. search with group by
verify: the error code and msg
"""
if index in ["HNSW", "IVF_FLAT", "FLAT", "IVF_SQ8", "DISKANN", "SCANN"]:
            pass  # skip the index types that already support group by
else:
metric = "L2"
collection_w = self.init_collection_general(prefix, insert_data=True, is_index=False,
is_all_data_type=True, with_json=False)[0]
params = cf.get_index_params_params(index)
index_params = {"index_type": index, "params": params, "metric_type": metric}
collection_w.create_index(ct.default_float_vec_field_name, index_params)
collection_w.load()
search_params = {"params": {}}
nq = 1
limit = 1
search_vectors = cf.gen_vectors(nq, dim=ct.default_dim)
# search with groupby
err_code = 999
err_msg = f"current index:{index} doesn't support"
collection_w.search(data=search_vectors, anns_field=ct.default_float_vec_field_name,
param=search_params, limit=limit,
group_by_field=ct.default_int8_field_name,
check_task=CheckTasks.err_res,
check_items={"err_code": err_code, "err_msg": err_msg})
@pytest.mark.tags(CaseLabel.L2)
def test_search_group_by_multi_fields(self):
"""
target: test search group by with the multi fields
method: 1. create a collection with data
2. create index
3. search with group by the multi fields
verify: the error code and msg
"""
metric = "IP"
collection_w = self.init_collection_general(prefix, insert_data=False, is_index=False,
is_all_data_type=True, with_json=True, )[0]
_index = {"index_type": "HNSW", "metric_type": metric, "params": {"M": 16, "efConstruction": 128}}
collection_w.create_index(ct.default_float_vec_field_name, index_params=_index)
collection_w.load()
search_params = {"metric_type": metric, "params": {"ef": 128}}
nq = 1
limit = 1
search_vectors = cf.gen_vectors(nq, dim=ct.default_dim)
# search with groupby
err_code = 1700
err_msg = f"groupBy field not found in schema"
collection_w.search(data=search_vectors, anns_field=ct.default_float_vec_field_name,
param=search_params, limit=limit,
group_by_field=[ct.default_string_field_name, ct.default_int32_field_name],
check_task=CheckTasks.err_res,
check_items={"err_code": err_code, "err_msg": err_msg})
@pytest.mark.tags(CaseLabel.L2)
@pytest.mark.parametrize("grpby_nonexist_field", ["nonexit_field", 100])
def test_search_group_by_nonexit_fields(self, grpby_nonexist_field):
"""
        target: test search group by with a nonexistent field
        method: 1. create a collection with data
                2. create index
                3. search with group by a nonexistent field
        verify: the error code and msg
"""
metric = "IP"
collection_w = self.init_collection_general(prefix, insert_data=False, is_index=False,
is_all_data_type=True, with_json=True, )[0]
_index = {"index_type": "HNSW", "metric_type": metric, "params": {"M": 16, "efConstruction": 128}}
collection_w.create_index(ct.default_float_vec_field_name, index_params=_index)
vector_name_list = cf.extract_vector_field_name_list(collection_w)
index_param = {"index_type": "FLAT", "metric_type": "COSINE", "params": {"nlist": 100}}
for vector_name in vector_name_list:
collection_w.create_index(vector_name, index_param)
collection_w.load()
search_params = {"metric_type": metric, "params": {"ef": 128}}
nq = 1
limit = 1
search_vectors = cf.gen_vectors(nq, dim=ct.default_dim)
# search with groupby
err_code = 1700
err_msg = f"groupBy field not found in schema: field not found[field={grpby_nonexist_field}]"
collection_w.search(data=search_vectors, anns_field=ct.default_float_vec_field_name,
param=search_params, limit=limit,
group_by_field=grpby_nonexist_field,
check_task=CheckTasks.err_res,
check_items={"err_code": err_code, "err_msg": err_msg})
@pytest.mark.tags(CaseLabel.L1)
def test_search_iterator_not_support_group_by(self):
"""
target: test search iterator does not support group by
method: 1. create a collection with data
2. create index HNSW
3. search iterator with group by
        verify: error code and msg
"""
metric = "COSINE"
collection_w = self.init_collection_general(prefix, auto_id=True, insert_data=False, is_index=False,
is_all_data_type=True, with_json=False)[0]
# insert with the same values for scalar fields
for _ in range(10):
data = cf.gen_dataframe_all_data_type(nb=100, auto_id=True, with_json=False)
collection_w.insert(data)
collection_w.flush()
_index = {"index_type": "HNSW", "metric_type": metric, "params": {"M": 16, "efConstruction": 128}}
collection_w.create_index(ct.default_float_vec_field_name, index_params=_index)
collection_w.load()
grpby_field = ct.default_int32_field_name
search_vectors = cf.gen_vectors(1, dim=ct.default_dim)
search_params = {"metric_type": metric}
batch_size = 10
err_code = 1100
err_msg = "Not allowed to do groupBy when doing iteration"
collection_w.search_iterator(search_vectors, ct.default_float_vec_field_name,
search_params, batch_size, group_by_field=grpby_field,
output_fields=[grpby_field],
check_task=CheckTasks.err_res,
check_items={"err_code": err_code, "err_msg": err_msg})
@pytest.mark.tags(CaseLabel.L2)
def test_range_search_not_support_group_by(self):
"""
target: test range search does not support group by
method: 1. create a collection with data
2. create index hnsw
3. range search with group by
verify: the error code and msg
"""
metric = "COSINE"
collection_w = self.init_collection_general(prefix, auto_id=True, insert_data=False, is_index=False,
is_all_data_type=True, with_json=False)[0]
_index = {"index_type": "HNSW", "metric_type": metric, "params": {"M": 16, "efConstruction": 128}}
collection_w.create_index(ct.default_float_vec_field_name, index_params=_index)
# insert with the same values for scalar fields
for _ in range(10):
data = cf.gen_dataframe_all_data_type(nb=100, auto_id=True, with_json=False)
collection_w.insert(data)
collection_w.flush()
collection_w.create_index(ct.default_float_vec_field_name, index_params=_index)
collection_w.load()
nq = 1
limit = 5
search_vectors = cf.gen_vectors(nq, dim=ct.default_dim)
grpby_field = ct.default_int32_field_name
range_search_params = {"metric_type": "COSINE", "params": {"radius": 0.1,
"range_filter": 0.5}}
err_code = 1100
err_msg = f"Not allowed to do range-search"
collection_w.search(search_vectors, ct.default_float_vec_field_name,
range_search_params, limit,
default_search_exp, group_by_field=grpby_field,
output_fields=[grpby_field],
check_task=CheckTasks.err_res,
check_items={"err_code": err_code, "err_msg": err_msg})

File diff suppressed because it is too large

View File

@@ -0,0 +1,213 @@
import numpy as np
from pymilvus.orm.types import CONSISTENCY_STRONG, CONSISTENCY_BOUNDED, CONSISTENCY_SESSION, CONSISTENCY_EVENTUALLY
from pymilvus import AnnSearchRequest, RRFRanker, WeightedRanker
from pymilvus import (
FieldSchema, CollectionSchema, DataType,
Collection
)
from common.constants import *
from utils.util_pymilvus import *
from common.common_type import CaseLabel, CheckTasks
from common import common_type as ct
from common import common_func as cf
from utils.util_log import test_log as log
from base.client_base import TestcaseBase
import heapq
from time import sleep
from decimal import Decimal, getcontext
import decimal
import multiprocessing
import numbers
import random
import math
import numpy
import threading
import pytest
import pandas as pd
from faker import Faker
Faker.seed(19530)
fake_en = Faker("en_US")
fake_zh = Faker("zh_CN")
# patch faker to generate text with specific distribution
cf.patch_faker_text(fake_en, cf.en_vocabularies_distribution)
cf.patch_faker_text(fake_zh, cf.zh_vocabularies_distribution)
pd.set_option("expand_frame_repr", False)
prefix = "search_collection"
search_num = 10
max_dim = ct.max_dim
min_dim = ct.min_dim
epsilon = ct.epsilon
hybrid_search_epsilon = 0.01
gracefulTime = ct.gracefulTime
default_nb = ct.default_nb
default_nb_medium = ct.default_nb_medium
default_nq = ct.default_nq
default_dim = ct.default_dim
default_limit = ct.default_limit
max_limit = ct.max_limit
default_search_exp = "int64 >= 0"
default_search_string_exp = "varchar >= \"0\""
default_search_mix_exp = "int64 >= 0 && varchar >= \"0\""
default_invaild_string_exp = "varchar >= 0"
default_json_search_exp = "json_field[\"number\"] >= 0"
perfix_expr = 'varchar like "0%"'
default_search_field = ct.default_float_vec_field_name
default_search_params = ct.default_search_params
default_int64_field_name = ct.default_int64_field_name
default_float_field_name = ct.default_float_field_name
default_bool_field_name = ct.default_bool_field_name
default_string_field_name = ct.default_string_field_name
default_json_field_name = ct.default_json_field_name
default_index_params = ct.default_index
vectors = [[random.random() for _ in range(default_dim)] for _ in range(default_nq)]
range_search_supported_indexes = ct.all_index_types[:7]
uid = "test_search"
nq = 1
epsilon = 0.001
field_name = default_float_vec_field_name
binary_field_name = default_binary_vec_field_name
search_param = {"nprobe": 1}
entity = gen_entities(1, is_normal=True)
entities = gen_entities(default_nb, is_normal=True)
raw_vectors, binary_entities = gen_binary_entities(default_nb)
default_query, _ = gen_search_vectors_params(field_name, entities, default_top_k, nq)
index_name1 = cf.gen_unique_str("float")
index_name2 = cf.gen_unique_str("varhar")
half_nb = ct.default_nb // 2
max_hybrid_search_req_num = ct.max_hybrid_search_req_num
class TestSearchIterator(TestcaseBase):
""" Test case of search iterator """
@pytest.mark.tags(CaseLabel.L0)
@pytest.mark.parametrize("metric_type", ct.float_metrics)
@pytest.mark.parametrize("vector_data_type", ["FLOAT_VECTOR", "FLOAT16_VECTOR", "BFLOAT16_VECTOR"])
def test_range_search_iterator_default(self, metric_type, vector_data_type):
"""
target: test iterator range search
method: 1. search iterator
2. check the result, expect pk not repeat and meet the range requirements
expected: search successfully
"""
# 1. initialize with data
batch_size = 100
collection_w = self.init_collection_general(prefix, True, dim=default_dim, is_index=False,
vector_data_type=vector_data_type)[0]
collection_w.create_index(field_name, {"metric_type": metric_type})
collection_w.load()
search_vector = cf.gen_vectors(1, default_dim, vector_data_type)
search_params = {"metric_type": metric_type}
collection_w.search_iterator(search_vector, field_name, search_params, batch_size,
check_task=CheckTasks.check_search_iterator,
check_items={"metric_type": metric_type,
"batch_size": batch_size})
limit = 200
res = collection_w.search(search_vector, field_name, param=search_params, limit=200,
check_task=CheckTasks.check_search_results,
check_items={"nq": 1, "limit": limit})[0]
# 2. search iterator
if metric_type != "L2":
radius = res[0][limit // 2].distance - 0.1 # pick a radius to make sure there exists results
range_filter = res[0][0].distance + 0.1
search_params = {"metric_type": metric_type, "params": {"radius": radius, "range_filter": range_filter}}
collection_w.search_iterator(search_vector, field_name, search_params, batch_size,
check_task=CheckTasks.check_search_iterator,
check_items={"metric_type": metric_type, "batch_size": batch_size,
"radius": radius,
"range_filter": range_filter})
else:
radius = res[0][limit // 2].distance + 0.1
range_filter = res[0][0].distance - 0.1
search_params = {"metric_type": metric_type, "params": {"radius": radius, "range_filter": range_filter}}
collection_w.search_iterator(search_vector, field_name, search_params, batch_size,
check_task=CheckTasks.check_search_iterator,
check_items={"metric_type": metric_type, "batch_size": batch_size,
"radius": radius,
"range_filter": range_filter})
@pytest.mark.tags(CaseLabel.L1)
def test_search_iterator_binary(self):
"""
target: test search iterator binary
method: 1. search iterator
                2. check the result, expect pk not repeat
expected: search successfully
"""
# 1. initialize with data
batch_size = 200
collection_w = self.init_collection_general(
prefix, True, is_binary=True)[0]
# 2. search iterator
_, binary_vectors = cf.gen_binary_vectors(2, ct.default_dim)
collection_w.search_iterator(binary_vectors[:1], binary_field_name,
ct.default_search_binary_params, batch_size,
check_task=CheckTasks.check_search_iterator,
check_items={"batch_size": batch_size})
@pytest.mark.tags(CaseLabel.L2)
@pytest.mark.parametrize("metrics", ct.float_metrics)
def test_search_iterator_with_expression(self, metrics):
"""
target: test search iterator normal
method: 1. search iterator
2. check the result, expect pk not repeat and meet the expr requirements
expected: search successfully
"""
# 1. initialize with data
batch_size = 100
dim = 128
collection_w = self.init_collection_general(
prefix, True, dim=dim, is_index=False)[0]
collection_w.create_index(field_name, {"metric_type": metrics})
collection_w.load()
# 2. search iterator
search_params = {"metric_type": metrics}
expression = "1000.0 <= float < 2000.0"
collection_w.search_iterator(vectors[:1], field_name, search_params, batch_size,
expr=expression, check_task=CheckTasks.check_search_iterator,
check_items={})
@pytest.mark.tags(CaseLabel.L2)
@pytest.mark.parametrize("batch_size", [10, 100, 777, 1000])
def test_search_iterator_with_different_limit(self, batch_size):
"""
        target: test search iterator with different batch sizes
        method: 1. search iterator with different batch sizes
                2. check the result, expect pk not repeat
expected: search successfully
"""
# 1. initialize with data
collection_w = self.init_collection_general(prefix, True)[0]
# 2. search iterator
search_params = {"metric_type": "COSINE"}
collection_w.search_iterator(vectors[:1], field_name, search_params, batch_size,
check_task=CheckTasks.check_search_iterator,
check_items={"batch_size": batch_size})
@pytest.mark.tags(CaseLabel.L2)
def test_search_iterator_invalid_nq(self):
"""
        target: test search iterator with invalid nq
        method: search iterator with more than one query vector
        expected: raise exception
"""
# 1. initialize with data
batch_size = 100
dim = 128
collection_w = self.init_collection_general(
prefix, True, dim=dim, is_index=False)[0]
collection_w.create_index(field_name, {"metric_type": "L2"})
collection_w.load()
# 2. search iterator
search_params = {"metric_type": "L2"}
collection_w.search_iterator(vectors[:2], field_name, search_params, batch_size,
check_task=CheckTasks.err_res,
check_items={"err_code": 1,
"err_msg": "Not support search iteration over multiple vectors at present"})

View File

@@ -0,0 +1,491 @@
import numpy as np
from pymilvus.orm.types import CONSISTENCY_STRONG, CONSISTENCY_BOUNDED, CONSISTENCY_SESSION, CONSISTENCY_EVENTUALLY
from pymilvus import AnnSearchRequest, RRFRanker, WeightedRanker
from pymilvus import (
FieldSchema, CollectionSchema, DataType,
Collection
)
from common.constants import *
from utils.util_pymilvus import *
from common.common_type import CaseLabel, CheckTasks
from common import common_type as ct
from common import common_func as cf
from utils.util_log import test_log as log
from base.client_base import TestcaseBase
import heapq
from time import sleep
from decimal import Decimal, getcontext
import decimal
import multiprocessing
import numbers
import random
import math
import numpy
import threading
import pytest
import pandas as pd
from faker import Faker
Faker.seed(19530)
fake_en = Faker("en_US")
fake_zh = Faker("zh_CN")
# patch faker to generate text with specific distribution
cf.patch_faker_text(fake_en, cf.en_vocabularies_distribution)
cf.patch_faker_text(fake_zh, cf.zh_vocabularies_distribution)
pd.set_option("expand_frame_repr", False)
prefix = "search_collection"
search_num = 10
max_dim = ct.max_dim
min_dim = ct.min_dim
epsilon = ct.epsilon
hybrid_search_epsilon = 0.01
gracefulTime = ct.gracefulTime
default_nb = ct.default_nb
default_nb_medium = ct.default_nb_medium
default_nq = ct.default_nq
default_dim = ct.default_dim
default_limit = ct.default_limit
max_limit = ct.max_limit
default_search_exp = "int64 >= 0"
default_search_string_exp = "varchar >= \"0\""
default_search_mix_exp = "int64 >= 0 && varchar >= \"0\""
default_invaild_string_exp = "varchar >= 0"
default_json_search_exp = "json_field[\"number\"] >= 0"
perfix_expr = 'varchar like "0%"'
default_search_field = ct.default_float_vec_field_name
default_search_params = ct.default_search_params
default_int64_field_name = ct.default_int64_field_name
default_float_field_name = ct.default_float_field_name
default_bool_field_name = ct.default_bool_field_name
default_string_field_name = ct.default_string_field_name
default_json_field_name = ct.default_json_field_name
default_index_params = ct.default_index
vectors = [[random.random() for _ in range(default_dim)] for _ in range(default_nq)]
range_search_supported_indexes = ct.all_index_types[:7]
uid = "test_search"
nq = 1
epsilon = 0.001
field_name = default_float_vec_field_name
binary_field_name = default_binary_vec_field_name
search_param = {"nprobe": 1}
entity = gen_entities(1, is_normal=True)
entities = gen_entities(default_nb, is_normal=True)
raw_vectors, binary_entities = gen_binary_entities(default_nb)
default_query, _ = gen_search_vectors_params(field_name, entities, default_top_k, nq)
index_name1 = cf.gen_unique_str("float")
index_name2 = cf.gen_unique_str("varhar")
half_nb = ct.default_nb // 2
max_hybrid_search_req_num = ct.max_hybrid_search_req_num
class TestCollectionSearchJSON(TestcaseBase):
""" Test case of search interface """
@pytest.fixture(scope="function",
params=[default_nb, default_nb_medium])
def nb(self, request):
yield request.param
@pytest.fixture(scope="function", params=[2, 500])
def nq(self, request):
yield request.param
@pytest.fixture(scope="function", params=[32, 128])
def dim(self, request):
yield request.param
@pytest.fixture(scope="function", params=[False, True])
def auto_id(self, request):
yield request.param
@pytest.fixture(scope="function", params=[False, True])
def _async(self, request):
yield request.param
@pytest.fixture(scope="function", params=["JACCARD", "HAMMING"])
def metrics(self, request):
yield request.param
@pytest.fixture(scope="function", params=[False, True])
def is_flush(self, request):
yield request.param
@pytest.fixture(scope="function", params=[True, False])
def enable_dynamic_field(self, request):
yield request.param
@pytest.fixture(scope="function", params=[0, 0.5, 1])
def null_data_percent(self, request):
yield request.param
"""
******************************************************************
    #  The following are invalid base cases
******************************************************************
"""
@pytest.mark.skip("Supported json like: 1, \"abc\", [1,2,3,4]")
@pytest.mark.tags(CaseLabel.L1)
def test_search_json_expression_object(self):
"""
        target: test search comparing the json field directly
method: search with expressions using jsonField name directly
expected: Raise error
"""
# 1. initialize with data
nq = 1
dim = 128
collection_w, _, _, insert_ids, time_stamp = self.init_collection_general(prefix, True, dim=dim)[0:5]
        # 2. prepare search vectors
log.info("test_search_json_expression_object: searching collection %s" %
collection_w.name)
vectors = [[random.random() for _ in range(dim)] for _ in range(nq)]
        # 3. search with an expression that compares the json field directly
json_search_exp = "json_field > 0"
collection_w.search(vectors[:nq], default_search_field,
default_search_params, default_limit,
json_search_exp,
check_task=CheckTasks.err_res,
check_items={ct.err_code: 1,
ct.err_msg: "can not comparisons jsonField directly"})
"""
******************************************************************
    #  The following are valid base cases
******************************************************************
"""
@pytest.mark.tags(CaseLabel.L1)
def test_search_json_expression_default(self, nq, is_flush, enable_dynamic_field):
"""
target: test search case with default json expression
method: create connection, collection, insert and search
expected: 1. search successfully with limit(topK)
"""
# 1. initialize with data
dim = 64
collection_w, _, _, insert_ids, time_stamp = \
self.init_collection_general(prefix, True, auto_id=True, dim=dim, is_flush=is_flush,
enable_dynamic_field=enable_dynamic_field, language="Hindi")[0:5]
vectors = [[random.random() for _ in range(dim)] for _ in range(nq)]
# 2. search after insert
collection_w.search(vectors[:nq], default_search_field,
default_search_params, default_limit,
default_json_search_exp,
check_task=CheckTasks.check_search_results,
check_items={"nq": nq,
"ids": insert_ids,
"limit": default_limit})
@pytest.mark.tags(CaseLabel.L2)
def test_search_json_nullable_load_before_insert(self, nq, is_flush, enable_dynamic_field):
"""
        target: test search on a collection with a nullable json field that is loaded before insert
        method: create collection with nullable json field, load, insert data with empty json values, then search
expected: 1. search successfully with limit(topK)
"""
# 1. initialize collection
dim = 64
enable_dynamic_field = False
collection_w, _, _, insert_ids, time_stamp = \
self.init_collection_general(prefix, False, auto_id=True, dim=dim, is_flush=is_flush,
enable_dynamic_field=enable_dynamic_field,
nullable_fields={ct.default_json_field_name: 1})[0:5]
# insert data
vectors = [[random.random() for _ in range(dim)] for _ in range(default_nb)]
data = [[np.float32(i) for i in range(default_nb)], [str(i) for i in range(default_nb)], [], vectors]
collection_w.insert(data)
collection_w.num_entities
# 2. search after insert
collection_w.search(vectors[:nq], default_search_field,
default_search_params, default_limit,
check_task=CheckTasks.check_search_results,
check_items={"nq": nq,
"limit": default_limit})
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.skip(reason="issue 37113")
def test_search_json_nullable_insert_before_load(self, nq, is_flush, enable_dynamic_field):
"""
        target: test search on a collection with a nullable json field that is inserted before load
        method: create collection with nullable json field, release, insert data with empty json values, load, then search
expected: 1. search successfully with limit(topK)
"""
# 1. initialize collection
dim = 64
enable_dynamic_field = False
collection_w, _, _, insert_ids, time_stamp = \
self.init_collection_general(prefix, False, auto_id=True, dim=dim, is_flush=is_flush,
enable_dynamic_field=enable_dynamic_field,
nullable_fields={ct.default_json_field_name: 1})[0:5]
collection_w.release()
# insert data
vectors = [[random.random() for _ in range(dim)] for _ in range(default_nb)]
data = [[np.float32(i) for i in range(default_nb)], [str(i) for i in range(default_nb)], [], vectors]
collection_w.insert(data)
collection_w.num_entities
collection_w.load()
# 2. search after insert
collection_w.search(vectors[:nq], default_search_field,
default_search_params, default_limit,
check_task=CheckTasks.check_search_results,
check_items={"nq": nq,
"limit": default_limit})
@pytest.mark.tags(CaseLabel.L1)
def test_search_expression_json_contains(self, enable_dynamic_field):
"""
target: test search with expression using json_contains
method: search with expression (json_contains)
expected: search successfully
"""
# 1. initialize with data
collection_w = self.init_collection_general(
prefix, enable_dynamic_field=enable_dynamic_field)[0]
# 2. insert data
array = []
for i in range(default_nb):
data = {
default_int64_field_name: i,
default_float_field_name: i * 1.0,
default_string_field_name: str(i),
default_json_field_name: {"number": i, "list": [i, i + 1, i + 2]},
default_float_vec_field_name: gen_vectors(1, default_dim)[0]
}
array.append(data)
collection_w.insert(array)
# 2. search
collection_w.load()
log.info("test_search_with_output_field_json_contains: Searching collection %s" %
collection_w.name)
expressions = [
"json_contains(json_field['list'], 100)", "JSON_CONTAINS(json_field['list'], 100)"]
for expression in expressions:
collection_w.search(vectors[:default_nq], default_search_field,
default_search_params, default_limit, expression,
check_task=CheckTasks.check_search_results,
check_items={"nq": default_nq,
"limit": 3})
@pytest.mark.tags(CaseLabel.L2)
def test_search_expression_json_contains_list(self, auto_id):
"""
target: test search with expression using json_contains
method: search with expression (json_contains)
expected: search successfully
"""
# 1. initialize with data
collection_w = self.init_collection_general(
prefix, auto_id=auto_id, enable_dynamic_field=True)[0]
# 2. insert data
limit = 100
array = []
for i in range(default_nb):
data = {
default_int64_field_name: i,
default_json_field_name: [j for j in range(i, i + limit)],
default_float_vec_field_name: gen_vectors(1, default_dim)[0]
}
if auto_id:
data.pop(default_int64_field_name, None)
array.append(data)
collection_w.insert(array)
# 2. search
collection_w.load()
log.info("test_search_with_output_field_json_contains: Searching collection %s" %
collection_w.name)
expressions = [
"json_contains(json_field, 100)", "JSON_CONTAINS(json_field, 100)"]
for expression in expressions:
collection_w.search(vectors[:default_nq], default_search_field,
default_search_params, limit, expression,
check_task=CheckTasks.check_search_results,
check_items={"nq": default_nq,
"limit": limit})
@pytest.mark.tags(CaseLabel.L2)
def test_search_expression_json_contains_combined_with_normal(self, enable_dynamic_field):
"""
target: test search with expression using json_contains
method: search with expression (json_contains)
expected: search successfully
"""
# 1. initialize with data
collection_w = self.init_collection_general(
prefix, enable_dynamic_field=enable_dynamic_field)[0]
# 2. insert data
limit = 100
array = []
for i in range(default_nb):
data = {
default_int64_field_name: i,
default_float_field_name: i * 1.0,
default_string_field_name: str(i),
default_json_field_name: {"number": i, "list": [str(j) for j in range(i, i + limit)]},
default_float_vec_field_name: gen_vectors(1, default_dim)[0]
}
array.append(data)
collection_w.insert(array)
# 2. search
collection_w.load()
log.info("test_search_with_output_field_json_contains: Searching collection %s" %
collection_w.name)
tar = 1000
expressions = [f"json_contains(json_field['list'], '{tar}') && int64 > {tar - limit // 2}",
f"JSON_CONTAINS(json_field['list'], '{tar}') && int64 > {tar - limit // 2}"]
for expression in expressions:
collection_w.search(vectors[:default_nq], default_search_field,
default_search_params, limit, expression,
check_task=CheckTasks.check_search_results,
check_items={"nq": default_nq,
"limit": limit // 2})
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("expr_prefix", ["array_contains", "ARRAY_CONTAINS"])
def test_search_expr_array_contains(self, expr_prefix):
"""
        target: test search with expression using array_contains
        method: search with expression using array_contains
expected: succeed
"""
# 1. create a collection
schema = cf.gen_array_collection_schema()
collection_w = self.init_collection_wrap(schema=schema)
# 2. insert data
string_field_value = [[str(j) for j in range(i, i + 3)] for i in range(ct.default_nb)]
data = cf.gen_array_dataframe_data()
data[ct.default_string_array_field_name] = string_field_value
collection_w.insert(data)
collection_w.create_index(ct.default_float_vec_field_name, {})
# 3. search
collection_w.load()
expression = f"{expr_prefix}({ct.default_string_array_field_name}, '1000')"
res = collection_w.search(vectors[:default_nq], default_search_field, {},
limit=ct.default_nb, expr=expression)[0]
exp_ids = cf.assert_json_contains(expression, string_field_value)
assert set(res[0].ids) == set(exp_ids)
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("expr_prefix", ["array_contains", "ARRAY_CONTAINS"])
def test_search_expr_not_array_contains(self, expr_prefix):
"""
        target: test search with expression using not array_contains
        method: search with expression using not array_contains
expected: succeed
"""
# 1. create a collection
schema = cf.gen_array_collection_schema()
collection_w = self.init_collection_wrap(schema=schema)
# 2. insert data
string_field_value = [[str(j) for j in range(i, i + 3)] for i in range(ct.default_nb)]
data = cf.gen_array_dataframe_data()
data[ct.default_string_array_field_name] = string_field_value
collection_w.insert(data)
collection_w.create_index(ct.default_float_vec_field_name, {})
# 3. search
collection_w.load()
expression = f"not {expr_prefix}({ct.default_string_array_field_name}, '1000')"
res = collection_w.search(vectors[:default_nq], default_search_field, {},
limit=ct.default_nb, expr=expression)[0]
exp_ids = cf.assert_json_contains(expression, string_field_value)
assert set(res[0].ids) == set(exp_ids)
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("expr_prefix", ["array_contains_all", "ARRAY_CONTAINS_ALL"])
def test_search_expr_array_contains_all(self, expr_prefix):
"""
        target: test search with expression using array_contains_all
        method: search with expression using array_contains_all
expected: succeed
"""
# 1. create a collection
schema = cf.gen_array_collection_schema()
collection_w = self.init_collection_wrap(schema=schema)
# 2. insert data
string_field_value = [[str(j) for j in range(i, i + 3)] for i in range(ct.default_nb)]
data = cf.gen_array_dataframe_data()
data[ct.default_string_array_field_name] = string_field_value
collection_w.insert(data)
collection_w.create_index(ct.default_float_vec_field_name, {})
# 3. search
collection_w.load()
expression = f"{expr_prefix}({ct.default_string_array_field_name}, ['1000'])"
res = collection_w.search(vectors[:default_nq], default_search_field, {},
limit=ct.default_nb, expr=expression)[0]
exp_ids = cf.assert_json_contains(expression, string_field_value)
assert set(res[0].ids) == set(exp_ids)
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("expr_prefix", ["array_contains_any", "ARRAY_CONTAINS_ANY",
"not array_contains_any", "not ARRAY_CONTAINS_ANY"])
def test_search_expr_array_contains_any(self, expr_prefix):
"""
        target: test search with expression using array_contains_any
        method: search with expression using array_contains_any
expected: succeed
"""
# 1. create a collection
schema = cf.gen_array_collection_schema()
collection_w = self.init_collection_wrap(schema=schema)
# 2. insert data
string_field_value = [[str(j) for j in range(i, i + 3)] for i in range(ct.default_nb)]
data = cf.gen_array_dataframe_data()
data[ct.default_string_array_field_name] = string_field_value
collection_w.insert(data)
collection_w.create_index(ct.default_float_vec_field_name, {})
# 3. search
collection_w.load()
expression = f"{expr_prefix}({ct.default_string_array_field_name}, ['1000'])"
res = collection_w.search(vectors[:default_nq], default_search_field, {},
limit=ct.default_nb, expr=expression)[0]
exp_ids = cf.assert_json_contains(expression, string_field_value)
assert set(res[0].ids) == set(exp_ids)
@pytest.mark.tags(CaseLabel.L2)
@pytest.mark.parametrize("expr_prefix", ["array_contains_all", "ARRAY_CONTAINS_ALL",
"array_contains_any", "ARRAY_CONTAINS_ANY"])
def test_search_expr_array_contains_invalid(self, expr_prefix):
"""
        target: test search with invalid array_contains_all/any expression
        method: search with expression array_contains_xxx(a, b) where b is not a list
expected: report error
"""
# 1. create a collection
schema = cf.gen_array_collection_schema()
collection_w = self.init_collection_wrap(schema=schema)
# 2. insert data
data = cf.gen_array_dataframe_data()
collection_w.insert(data)
collection_w.create_index(ct.default_float_vec_field_name, {})
# 3. search
collection_w.load()
expression = f"{expr_prefix}({ct.default_string_array_field_name}, '1000')"
error = {ct.err_code: 1100,
ct.err_msg: f"cannot parse expression: {expression}, "
f"error: ContainsAll operation element must be an array"}
if expr_prefix in ["array_contains_any", "ARRAY_CONTAINS_ANY"]:
error = {ct.err_code: 1100,
ct.err_msg: f"cannot parse expression: {expression}, "
f"error: ContainsAny operation element must be an array"}
collection_w.search(vectors[:default_nq], default_search_field, {},
limit=ct.default_nb, expr=expression,
check_task=CheckTasks.err_res, check_items=error)
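# Illustrative sketch (not part of the test suite): the JSON_CONTAINS / ARRAY_CONTAINS*
# filters exercised above can also be used on their own in query(). `collection` is a
# placeholder for a loaded collection whose schema includes both the "json_field" and
# the string-array field used above (the tests use separate collections for them).
def _sketch_contains_filters(collection):
    """Count the entities matched by a few representative contains expressions."""
    exprs = [
        'json_contains(json_field["list"], 100)',
        f"array_contains_all({ct.default_string_array_field_name}, ['1000'])",
        f"array_contains_any({ct.default_string_array_field_name}, ['1000', '2000'])",
    ]
    return {expr: len(collection.query(expr, output_fields=[ct.default_int64_field_name]))
            for expr in exprs}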

File diff suppressed because it is too large

View File

@@ -0,0 +1,585 @@
import numpy as np
from pymilvus.orm.types import CONSISTENCY_STRONG, CONSISTENCY_BOUNDED, CONSISTENCY_SESSION, CONSISTENCY_EVENTUALLY
from pymilvus import AnnSearchRequest, RRFRanker, WeightedRanker
from pymilvus import (
FieldSchema, CollectionSchema, DataType,
Collection
)
from common.constants import *
from utils.util_pymilvus import *
from common.common_type import CaseLabel, CheckTasks
from common import common_type as ct
from common import common_func as cf
from utils.util_log import test_log as log
from base.client_base import TestcaseBase
import heapq
from time import sleep
from decimal import Decimal, getcontext
import decimal
import multiprocessing
import numbers
import random
import math
import numpy
import threading
import pytest
import pandas as pd
from faker import Faker
Faker.seed(19530)
fake_en = Faker("en_US")
fake_zh = Faker("zh_CN")
# patch faker to generate text with specific distribution
cf.patch_faker_text(fake_en, cf.en_vocabularies_distribution)
cf.patch_faker_text(fake_zh, cf.zh_vocabularies_distribution)
pd.set_option("expand_frame_repr", False)
prefix = "search_collection"
search_num = 10
max_dim = ct.max_dim
min_dim = ct.min_dim
epsilon = ct.epsilon
hybrid_search_epsilon = 0.01
gracefulTime = ct.gracefulTime
default_nb = ct.default_nb
default_nb_medium = ct.default_nb_medium
default_nq = ct.default_nq
default_dim = ct.default_dim
default_limit = ct.default_limit
max_limit = ct.max_limit
default_search_exp = "int64 >= 0"
default_search_string_exp = "varchar >= \"0\""
default_search_mix_exp = "int64 >= 0 && varchar >= \"0\""
default_invaild_string_exp = "varchar >= 0"
default_json_search_exp = "json_field[\"number\"] >= 0"
perfix_expr = 'varchar like "0%"'
default_search_field = ct.default_float_vec_field_name
default_search_params = ct.default_search_params
default_int64_field_name = ct.default_int64_field_name
default_float_field_name = ct.default_float_field_name
default_bool_field_name = ct.default_bool_field_name
default_string_field_name = ct.default_string_field_name
default_json_field_name = ct.default_json_field_name
default_index_params = ct.default_index
vectors = [[random.random() for _ in range(default_dim)] for _ in range(default_nq)]
range_search_supported_indexes = ct.all_index_types[:7]
uid = "test_search"
nq = 1
epsilon = 0.001
field_name = default_float_vec_field_name
binary_field_name = default_binary_vec_field_name
search_param = {"nprobe": 1}
entity = gen_entities(1, is_normal=True)
entities = gen_entities(default_nb, is_normal=True)
raw_vectors, binary_entities = gen_binary_entities(default_nb)
default_query, _ = gen_search_vectors_params(field_name, entities, default_top_k, nq)
index_name1 = cf.gen_unique_str("float")
index_name2 = cf.gen_unique_str("varhar")
half_nb = ct.default_nb // 2
max_hybrid_search_req_num = ct.max_hybrid_search_req_num
class TestCollectionSearchNoneAndDefaultData(TestcaseBase):
""" Test case of search interface """
@pytest.fixture(scope="function", params=[default_nb_medium])
def nb(self, request):
yield request.param
@pytest.fixture(scope="function", params=[200])
def nq(self, request):
yield request.param
@pytest.fixture(scope="function", params=[32, 128])
def dim(self, request):
yield request.param
@pytest.fixture(scope="function", params=[False, True])
def auto_id(self, request):
yield request.param
@pytest.fixture(scope="function", params=[False, True])
def _async(self, request):
yield request.param
@pytest.fixture(scope="function", params=["JACCARD", "HAMMING"])
def metrics(self, request):
yield request.param
@pytest.fixture(scope="function", params=[False, True])
def is_flush(self, request):
yield request.param
@pytest.fixture(scope="function", params=[True, False])
def enable_dynamic_field(self, request):
yield request.param
@pytest.fixture(scope="function", params=["IP", "COSINE", "L2"])
def metric_type(self, request):
yield request.param
@pytest.fixture(scope="function", params=[True, False])
def random_primary_key(self, request):
yield request.param
@pytest.fixture(scope="function", params=["FLOAT_VECTOR", "FLOAT16_VECTOR", "BFLOAT16_VECTOR"])
def vector_data_type(self, request):
yield request.param
@pytest.fixture(scope="function", params=["STL_SORT", "INVERTED"])
def numeric_scalar_index(self, request):
yield request.param
@pytest.fixture(scope="function", params=["TRIE", "INVERTED", "BITMAP"])
def varchar_scalar_index(self, request):
yield request.param
@pytest.fixture(scope="function", params=[200, 600])
def batch_size(self, request):
yield request.param
@pytest.fixture(scope="function", params=[0, 0.5, 1])
def null_data_percent(self, request):
yield request.param
"""
******************************************************************
# The following are valid base cases
******************************************************************
"""
@pytest.mark.tags(CaseLabel.L0)
def test_search_normal_none_data(self, nq, dim, auto_id, is_flush, enable_dynamic_field, vector_data_type,
null_data_percent):
"""
target: test search normal case with none data inserted
method: create connection, collection with nullable fields, insert data including none, and search
expected: 1. search successfully with limit(topK)
"""
# 1. initialize with data
collection_w, _, _, insert_ids, time_stamp = \
self.init_collection_general(prefix, True, auto_id=auto_id, dim=dim, is_flush=is_flush,
enable_dynamic_field=enable_dynamic_field,
vector_data_type=vector_data_type,
nullable_fields={ct.default_float_field_name: null_data_percent})[0:5]
# 2. generate search data
vectors = cf.gen_vectors_based_on_vector_type(nq, dim, vector_data_type)
# 3. search after insert
collection_w.search(vectors[:nq], default_search_field,
default_search_params, default_limit,
default_search_exp,
output_fields=[default_int64_field_name,
default_float_field_name],
guarantee_timestamp=0,
check_task=CheckTasks.check_search_results,
check_items={"nq": nq,
"ids": insert_ids,
"limit": default_limit,
"output_fields": [default_int64_field_name,
default_float_field_name]})
@pytest.mark.tags(CaseLabel.L2)
def test_search_after_none_data_all_field_datatype(self, varchar_scalar_index, numeric_scalar_index,
null_data_percent, _async):
"""
        target: test search after building indexes on all field data types with None data
        method: create vector and scalar indexes on fields containing None data, then search
expected: search successfully with limit(topK)
"""
# 1. initialize with data
nullable_fields = {ct.default_int32_field_name: null_data_percent,
ct.default_int16_field_name: null_data_percent,
ct.default_int8_field_name: null_data_percent,
ct.default_bool_field_name: null_data_percent,
ct.default_float_field_name: null_data_percent,
ct.default_double_field_name: null_data_percent,
ct.default_string_field_name: null_data_percent}
collection_w, _, _, insert_ids = \
self.init_collection_general(prefix, True, 5000, partition_num=1,
is_all_data_type=True, dim=default_dim,
is_index=False, nullable_fields=nullable_fields)[0:4]
# 2. create index on vector field and load
index = "HNSW"
params = cf.get_index_params_params(index)
default_index = {"index_type": index, "params": params, "metric_type": "COSINE"}
vector_name_list = cf.extract_vector_field_name_list(collection_w)
vector_name_list.append(ct.default_float_vec_field_name)
for vector_name in vector_name_list:
collection_w.create_index(vector_name, default_index)
# 3. create index on scalar field with None data
scalar_index_params = {"index_type": varchar_scalar_index, "params": {}}
collection_w.create_index(ct.default_string_field_name, scalar_index_params)
# 4. create index on scalar field with default data
scalar_index_params = {"index_type": numeric_scalar_index, "params": {}}
collection_w.create_index(ct.default_int64_field_name, scalar_index_params)
collection_w.create_index(ct.default_int32_field_name, scalar_index_params)
collection_w.create_index(ct.default_int16_field_name, scalar_index_params)
collection_w.create_index(ct.default_int8_field_name, scalar_index_params)
collection_w.create_index(ct.default_float_field_name, scalar_index_params)
scalar_index_params = {"index_type": "INVERTED", "params": {}}
collection_w.create_index(ct.default_bool_field_name, scalar_index_params)
collection_w.load()
# 5. search
search_params = cf.gen_search_param(index, "COSINE")
limit = search_params[0]["params"]["ef"]
log.info("Searching with search params: {}".format(search_params[0]))
vectors = [[random.random() for _ in range(default_dim)] for _ in range(default_nq)]
collection_w.search(vectors[:default_nq], default_search_field,
                            search_params[0], limit, default_search_exp, _async=_async,
output_fields=[ct.default_string_field_name, ct.default_float_field_name],
check_task=CheckTasks.check_search_results,
check_items={"nq": default_nq,
"ids": insert_ids,
"limit": limit,
"_async": _async,
"output_fields": [ct.default_string_field_name,
ct.default_float_field_name]})
@pytest.mark.tags(CaseLabel.L0)
def test_search_default_value_with_insert(self, nq, dim, auto_id, is_flush, enable_dynamic_field, vector_data_type):
"""
target: test search normal case with default value set
method: create connection, collection with default value set, insert and search
expected: 1. search successfully with limit(topK)
"""
# 1. initialize with data
collection_w, _, _, insert_ids, time_stamp = \
self.init_collection_general(prefix, True, auto_id=auto_id, dim=dim, is_flush=is_flush,
enable_dynamic_field=enable_dynamic_field,
vector_data_type=vector_data_type,
default_value_fields={ct.default_float_field_name: np.float32(10.0)})[0:5]
# 2. generate search data
vectors = cf.gen_vectors_based_on_vector_type(nq, dim, vector_data_type)
# 3. search after insert
collection_w.search(vectors[:nq], default_search_field,
default_search_params, default_limit,
default_search_exp,
output_fields=[default_int64_field_name,
default_float_field_name],
guarantee_timestamp=0,
check_task=CheckTasks.check_search_results,
check_items={"nq": nq,
"ids": insert_ids,
"limit": default_limit,
"output_fields": [default_int64_field_name,
default_float_field_name]})
@pytest.mark.tags(CaseLabel.L1)
def test_search_default_value_without_insert(self, enable_dynamic_field):
"""
target: test search normal case with default value set
method: create connection, collection with default value set, no insert and search
expected: 1. search successfully with limit(topK)
"""
# 1. initialize with data
collection_w = self.init_collection_general(prefix, False, dim=default_dim,
enable_dynamic_field=enable_dynamic_field,
nullable_fields={ct.default_float_field_name: 0},
default_value_fields={
ct.default_float_field_name: np.float32(10.0)})[0]
# 2. generate search data
vectors = cf.gen_vectors_based_on_vector_type(default_nq, default_dim, "FLOAT_VECTOR")
# 3. search after insert
collection_w.search(vectors[:default_nq], default_search_field,
default_search_params, default_limit,
default_search_exp,
guarantee_timestamp=0,
check_task=CheckTasks.check_search_results,
check_items={"nq": default_nq,
"limit": 0})
@pytest.mark.tags(CaseLabel.L2)
def test_search_after_default_data_all_field_datatype(self, varchar_scalar_index, numeric_scalar_index, _async):
"""
        target: test search after building indexes on all field data types with default values set
        method: create vector and scalar indexes on fields with default values, then search
expected: search successfully with limit(topK)
"""
# 1. initialize with data
default_value_fields = {ct.default_int32_field_name: np.int32(1),
ct.default_int16_field_name: np.int32(2),
ct.default_int8_field_name: np.int32(3),
ct.default_bool_field_name: True,
ct.default_float_field_name: np.float32(10.0),
ct.default_double_field_name: 10.0,
ct.default_string_field_name: "1"}
collection_w, _, _, insert_ids = self.init_collection_general(prefix, True, 5000, partition_num=1,
is_all_data_type=True, dim=default_dim,
is_index=False,
default_value_fields=default_value_fields)[0:4]
# 2. create index on vector field and load
index = "HNSW"
params = cf.get_index_params_params(index)
default_index = {"index_type": index, "params": params, "metric_type": "L2"}
vector_name_list = cf.extract_vector_field_name_list(collection_w)
vector_name_list.append(ct.default_float_vec_field_name)
for vector_name in vector_name_list:
collection_w.create_index(vector_name, default_index)
# 3. create index on scalar field with None data
scalar_index_params = {"index_type": varchar_scalar_index, "params": {}}
collection_w.create_index(ct.default_string_field_name, scalar_index_params)
# 4. create index on scalar field with default data
scalar_index_params = {"index_type": numeric_scalar_index, "params": {}}
collection_w.create_index(ct.default_int64_field_name, scalar_index_params)
collection_w.create_index(ct.default_int32_field_name, scalar_index_params)
collection_w.create_index(ct.default_int16_field_name, scalar_index_params)
collection_w.create_index(ct.default_int8_field_name, scalar_index_params)
if numeric_scalar_index != "STL_SORT":
collection_w.create_index(ct.default_bool_field_name, scalar_index_params)
collection_w.create_index(ct.default_float_field_name, scalar_index_params)
collection_w.load()
# 5. search
search_params = cf.gen_search_param(index, "L2")
limit = search_params[0]["params"]["ef"]
log.info("Searching with search params: {}".format(search_params[0]))
vectors = [[random.random() for _ in range(default_dim)] for _ in range(default_nq)]
output_fields = [ct.default_int64_field_name, ct.default_int32_field_name,
ct.default_int16_field_name, ct.default_int8_field_name,
ct.default_bool_field_name, ct.default_float_field_name,
ct.default_double_field_name, ct.default_string_field_name]
collection_w.search(vectors[:default_nq], default_search_field,
                            search_params[0], limit, default_search_exp, _async=_async,
output_fields=output_fields,
check_task=CheckTasks.check_search_results,
check_items={"nq": default_nq,
"ids": insert_ids,
"limit": limit,
"_async": _async,
"output_fields": output_fields})
@pytest.mark.tags(CaseLabel.L1)
def test_search_both_default_value_non_data(self, nq, dim, auto_id, is_flush, enable_dynamic_field,
vector_data_type):
"""
        target: test search normal case with both nullable and default value set on the same field
        method: create connection, collection with a nullable field that has a default value, insert and search
expected: 1. search successfully with limit(topK)
"""
# 1. initialize with data
collection_w, _, _, insert_ids, time_stamp = \
self.init_collection_general(prefix, True, auto_id=auto_id, dim=dim, is_flush=is_flush,
enable_dynamic_field=enable_dynamic_field,
vector_data_type=vector_data_type,
nullable_fields={ct.default_float_field_name: 1},
default_value_fields={ct.default_float_field_name: np.float32(10.0)})[0:5]
# 2. generate search data
vectors = cf.gen_vectors_based_on_vector_type(nq, dim, vector_data_type)
# 3. search after insert
collection_w.search(vectors[:nq], default_search_field,
default_search_params, default_limit,
default_search_exp,
output_fields=[default_int64_field_name,
default_float_field_name],
guarantee_timestamp=0,
check_task=CheckTasks.check_search_results,
check_items={"nq": nq,
"ids": insert_ids,
"limit": default_limit,
"output_fields": [default_int64_field_name,
default_float_field_name]})
@pytest.mark.tags(CaseLabel.L1)
def test_search_collection_with_non_default_data_after_release_load(self, nq, _async, null_data_percent):
"""
target: search the pre-released collection after load
method: 1. create collection
2. release collection
3. load collection
4. search the pre-released collection
expected: search successfully
"""
        # 1. initialize with data
nb = 2000
dim = 64
auto_id = True
collection_w, _, _, insert_ids, time_stamp = \
self.init_collection_general(prefix, True, nb, 1, auto_id=auto_id, dim=dim,
nullable_fields={ct.default_string_field_name: null_data_percent},
default_value_fields={ct.default_float_field_name: np.float32(10.0)})[0:5]
# 2. release collection
collection_w.release()
# 3. Search the pre-released collection after load
collection_w.load()
log.info("test_search_collection_awith_non_default_data_after_release_load: searching after load")
vectors = [[random.random() for _ in range(dim)] for _ in range(nq)]
collection_w.search(vectors[:nq], default_search_field, default_search_params,
default_limit, default_search_exp, _async=_async,
output_fields=[ct.default_float_field_name, ct.default_string_field_name],
check_task=CheckTasks.check_search_results,
check_items={"nq": nq,
"ids": insert_ids,
"limit": default_limit,
"_async": _async,
"output_fields": [ct.default_float_field_name,
ct.default_string_field_name]})
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.tags(CaseLabel.GPU)
def test_search_after_different_index_with_params_none_default_data(self, varchar_scalar_index,
numeric_scalar_index,
null_data_percent, _async):
"""
target: test search after different indexes on fields with None data and default values
method: create the corresponding indexes and search with matching search params
expected: search successfully with limit(topK)
"""
# 1. initialize with data
collection_w, _, _, insert_ids = \
self.init_collection_general(prefix, True, 5000, partition_num=1, is_all_data_type=True,
dim=default_dim, is_index=False,
nullable_fields={ct.default_string_field_name: null_data_percent},
default_value_fields={ct.default_float_field_name: np.float32(10.0)})[0:4]
# 2. create index on vector field and load
index = "HNSW"
params = cf.get_index_params_params(index)
default_index = {"index_type": index, "params": params, "metric_type": "COSINE"}
vector_name_list = cf.extract_vector_field_name_list(collection_w)
vector_name_list.append(ct.default_float_vec_field_name)
for vector_name in vector_name_list:
collection_w.create_index(vector_name, default_index)
# 3. create index on scalar field with None data
scalar_index_params = {"index_type": varchar_scalar_index, "params": {}}
collection_w.create_index(ct.default_string_field_name, scalar_index_params)
# 4. create index on scalar field with default data
scalar_index_params = {"index_type": numeric_scalar_index, "params": {}}
collection_w.create_index(ct.default_float_field_name, scalar_index_params)
collection_w.load()
# 5. search
search_params = cf.gen_search_param(index, "COSINE")
limit = search_params[0]["params"]["ef"]
log.info("Searching with search params: {}".format(search_params[0]))
vectors = [[random.random() for _ in range(default_dim)] for _ in range(default_nq)]
collection_w.search(vectors[:default_nq], default_search_field,
search_params[0], limit, default_search_exp, _async=_async,
output_fields=[ct.default_string_field_name, ct.default_float_field_name],
check_task=CheckTasks.check_search_results,
check_items={"nq": default_nq,
"ids": insert_ids,
"limit": limit,
"_async": _async,
"output_fields": [ct.default_string_field_name,
ct.default_float_field_name]})
@pytest.mark.tags(CaseLabel.L1)
def test_search_iterator_with_none_data(self, batch_size, null_data_percent):
"""
target: test search iterator normal
method: 1. search iterator
2. check the result, expect pk
expected: search successfully
"""
# 1. initialize with data
dim = 64
collection_w = \
self.init_collection_general(prefix, True, dim=dim, is_index=False,
nullable_fields={ct.default_string_field_name: null_data_percent})[0]
collection_w.create_index(field_name, {"metric_type": "L2"})
collection_w.load()
# 2. search iterator
search_params = {"metric_type": "L2"}
vectors = cf.gen_vectors_based_on_vector_type(1, dim, "FLOAT_VECTOR")
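# note: search_iterator is expected to return hits in batches of `batch_size` until the result set is exhausted; the check task here validates the batch size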
collection_w.search_iterator(vectors[:1], field_name, search_params, batch_size,
check_task=CheckTasks.check_search_iterator,
check_items={"batch_size": batch_size})
@pytest.mark.tags(CaseLabel.L2)
def test_search_none_data_partial_load(self, is_flush, enable_dynamic_field, null_data_percent):
"""
target: test search with partial load on a collection containing None data
method: create connection and a collection with nullable fields, insert data including None, release, partially load, and search
expected: 1. search successfully with limit(topK)
"""
# 1. initialize with data
collection_w, _, _, insert_ids, time_stamp = \
self.init_collection_general(prefix, True, is_flush=is_flush,
enable_dynamic_field=enable_dynamic_field,
nullable_fields={ct.default_float_field_name: null_data_percent})[0:5]
# 2. release and partial load again
collection_w.release()
loaded_fields = [default_int64_field_name, ct.default_float_vec_field_name]
if not enable_dynamic_field:
loaded_fields.append(default_float_field_name)
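# partial load: only the listed fields are loaded; the primary key and the searched vector field must be included in load_fields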
collection_w.load(load_fields=loaded_fields)
# 3. generate search data
vectors = cf.gen_vectors_based_on_vector_type(default_nq, default_dim)
# 4. search after partial load field with None data
output_fields = [default_int64_field_name, default_float_field_name]
collection_w.search(vectors[:default_nq], default_search_field,
default_search_params, default_limit,
default_search_exp,
output_fields=output_fields,
check_task=CheckTasks.check_search_results,
check_items={"nq": default_nq,
"ids": insert_ids,
"limit": default_limit,
"output_fields": output_fields})
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.skip(reason="issue #37547")
def test_search_none_data_expr_cache(self, is_flush):
"""
target: test search with None data to verify expr cache behavior
method: 1. create a collection with a nullable float field
2. search with expr "nullableFid == 0"
3. drop this collection
4. create a collection with the same collection name and the same field name, but change the
nullable field's type to varchar
5. search with expr "nullableFid == 0" again
expected: 1. search successfully with limit(topK) for the first collection
2. report an error for the second collection with the same name
"""
# 1. initialize with data
collection_w, _, _, insert_ids, time_stamp = \
self.init_collection_general(prefix, True, is_flush=is_flush,
nullable_fields={ct.default_float_field_name: 0.5})[0:5]
collection_name = collection_w.name
# 2. generate search data
vectors = cf.gen_vectors_based_on_vector_type(default_nq, default_dim)
# 3. search with expr "nullableFid == 0"
search_exp = f"{ct.default_float_field_name} == 0"
output_fields = [default_int64_field_name, default_float_field_name]
collection_w.search(vectors[:default_nq], default_search_field,
default_search_params, default_limit,
search_exp,
output_fields=output_fields,
check_task=CheckTasks.check_search_results,
check_items={"nq": default_nq,
"ids": insert_ids,
"limit": 1,
"output_fields": output_fields})
# 4. drop collection
collection_w.drop()
# 5. create the same collection name with same field name but varchar field type
int64_field = cf.gen_int64_field(is_primary=True)
string_field = cf.gen_string_field(ct.default_float_field_name, nullable=True)
json_field = cf.gen_json_field()
float_vector_field = cf.gen_float_vec_field()
fields = [int64_field, string_field, json_field, float_vector_field]
schema = cf.gen_collection_schema(fields)
collection_w = self.init_collection_wrap(name=collection_name, schema=schema)
int64_values = pd.Series(data=[i for i in range(default_nb)])
string_values = pd.Series(data=[str(i) for i in range(default_nb)], dtype="string")
json_values = [{"number": i, "string": str(i), "bool": bool(i),
"list": [j for j in range(i, i + ct.default_json_list_length)]} for i in range(default_nb)]
float_vec_values = cf.gen_vectors(default_nb, default_dim)
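# assigning None to the whole varchar column below inserts null values for every row of the nullable field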
df = pd.DataFrame({
ct.default_int64_field_name: int64_values,
ct.default_float_field_name: None,
ct.default_json_field_name: json_values,
ct.default_float_vec_field_name: float_vec_values
})
collection_w.insert(df)
collection_w.create_index(ct.default_float_vec_field_name, ct.default_flat_index)
collection_w.load()
collection_w.flush()
collection_w.search(vectors[:default_nq], default_search_field,
default_search_params, default_limit,
search_exp,
output_fields=output_fields,
check_task=CheckTasks.err_res,
check_items={"err_code": 1100,
"err_msg": "failed to create query plan: cannot parse expression: float == 0, "
"error: comparisons between VarChar and Int64 are not supported: "
"invalid parameter"})

View File

@ -0,0 +1,935 @@
import numpy as np
from pymilvus.orm.types import CONSISTENCY_STRONG, CONSISTENCY_BOUNDED, CONSISTENCY_SESSION, CONSISTENCY_EVENTUALLY
from pymilvus import AnnSearchRequest, RRFRanker, WeightedRanker
from pymilvus import (
FieldSchema, CollectionSchema, DataType,
Collection
)
from common.constants import *
from utils.util_pymilvus import *
from common.common_type import CaseLabel, CheckTasks
from common import common_type as ct
from common import common_func as cf
from utils.util_log import test_log as log
from base.client_v2_base import TestMilvusClientV2Base
from base.client_base import TestcaseBase
import random
import pytest
import pandas as pd
from faker import Faker
Faker.seed(19530)
fake_en = Faker("en_US")
fake_zh = Faker("zh_CN")
# patch faker to generate text with specific distribution
cf.patch_faker_text(fake_en, cf.en_vocabularies_distribution)
cf.patch_faker_text(fake_zh, cf.zh_vocabularies_distribution)
pd.set_option("expand_frame_repr", False)
prefix = "search_collection"
default_nb = ct.default_nb
default_nq = ct.default_nq
default_dim = ct.default_dim
default_limit = ct.default_limit
default_search_exp = "int64 >= 0"
default_search_string_exp = "varchar >= \"0\""
default_search_mix_exp = "int64 >= 0 && varchar >= \"0\""
default_json_search_exp = "json_field[\"number\"] >= 0"
perfix_expr = 'varchar like "0%"'
default_search_field = ct.default_float_vec_field_name
default_search_params = ct.default_search_params
default_int64_field_name = ct.default_int64_field_name
default_float_field_name = ct.default_float_field_name
default_string_field_name = ct.default_string_field_name
default_json_field_name = ct.default_json_field_name
vectors = [[random.random() for _ in range(default_dim)] for _ in range(default_nq)]
nq = 1
field_name = default_float_vec_field_name
search_param = {"nprobe": 1}
entity = gen_entities(1, is_normal=True)
entities = gen_entities(default_nb, is_normal=True)
raw_vectors, binary_entities = gen_binary_entities(default_nb)
default_query, _ = gen_search_vectors_params(field_name, entities, default_top_k, nq)
half_nb = ct.default_nb // 2
default_primary_key_field_name = "id"
default_vector_field_name = "vector"
default_float_field_name = ct.default_float_field_name
default_string_field_name = ct.default_string_field_name
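# xdist_group keeps every test of this class on the same pytest-xdist worker (with --dist loadgroup), so they can safely share one collection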
@pytest.mark.xdist_group("TestMilvusClientSearchPagination")
class TestMilvusClientSearchPagination(TestMilvusClientV2Base):
"""Test search with pagination functionality"""
def setup_class(self):
super().setup_class(self)
self.collection_name = cf.gen_unique_str("test_search_pagination")
@pytest.fixture(scope="class", autouse=True)
def prepare_collection(self, request):
"""
Initialize collection before test class runs
"""
# Get client connection
client = self._client()
# Create collection
self.collection_schema = self.create_schema(client, enable_dynamic_field=False)[0]
self.collection_schema.add_field(default_primary_key_field_name, DataType.INT64, is_primary=True, auto_id=False)
self.collection_schema.add_field(default_vector_field_name, DataType.FLOAT_VECTOR, dim=default_dim)
self.collection_schema.add_field(default_float_field_name, DataType.FLOAT)
self.collection_schema.add_field(default_string_field_name, DataType.VARCHAR, max_length=65535)
self.create_collection(client, self.collection_name, schema=self.collection_schema)
# Insert data 5 times with non-duplicated primary keys
for j in range(5):
rows = [{default_primary_key_field_name: i + j * default_nb,
default_vector_field_name: list(cf.gen_vectors(1, default_dim)[0]),
default_float_field_name: (i + j * default_nb) * 1.0,
default_string_field_name: str(i + j * default_nb)}
for i in range(default_nb)]
self.insert(client, self.collection_name, rows)
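# flush seals the growing segments so the inserted rows are persisted before index creation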
self.flush(client, self.collection_name)
# Create index
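# IVF_FLAT clusters the vectors into nlist buckets; at search time nprobe controls how many buckets are scanned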
self.index_params = self.prepare_index_params(client)[0]
self.index_params.add_index(field_name=default_vector_field_name,
metric_type="COSINE",
index_type="IVF_FLAT",
params={"nlist": 128})
self.create_index(client, self.collection_name, index_params=self.index_params)
# Load collection
self.load_collection(client, self.collection_name)
def teardown():
self.drop_collection(self._client(), self.collection_name)
request.addfinalizer(teardown)
@pytest.mark.tags(CaseLabel.L0)
def test_milvus_client_search_with_pagination_default(self):
"""
target: test search with pagination
method: 1. connect and create a collection
2. search pagination with offset
3. search with offset+limit
4. compare with the search results whose corresponding ids should be the same
expected: search successfully and ids is correct
"""
client = self._client()
# 1. Use the shared collection created by the class-level fixture
collection_name = self.collection_name
# 2. Search with pagination for 10 pages
limit = 100
pages = 10
vectors_to_search = cf.gen_vectors(default_nq, default_dim)
all_pages_results = []
for page in range(pages):
offset = page * limit
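# offset skips the first `offset` hits, so this page should match ranks [offset, offset + limit) of a full search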
search_params = {"metric_type": "COSINE", "params": {"nprobe": 100}, "offset": offset}
search_res_with_offset, _ = self.search(
client,
collection_name,
vectors_to_search[:default_nq],
anns_field=default_vector_field_name,
search_params=search_params,
limit=limit,
check_task=CheckTasks.check_search_results,
check_items={"enable_milvus_client_api": True,
"nq": default_nq,
"limit": limit
}
)
all_pages_results.append(search_res_with_offset)
# 3. Search without pagination
search_params_full = {"metric_type": "COSINE", "params": {"nprobe": 100}}
search_res_full, _ = self.search(
client,
collection_name,
vectors_to_search[:default_nq],
anns_field=default_vector_field_name,
search_params=search_params_full,
limit=limit * pages
)
# 4. Compare results - verify pagination results equal the results in full search with offsets
for p in range(pages):
page_res = all_pages_results[p]
for i in range(default_nq):
page_ids = [page_res[i][j].get('id') for j in range(limit)]
ids_in_full = [search_res_full[i][p * limit:p * limit + limit][j].get('id') for j in range(limit)]
assert page_ids == ids_in_full
@pytest.mark.tags(CaseLabel.L0)
def test_milvus_client_search_with_pagination_default1(self):
"""
target: test search with pagination
method: 1. connect and create a collection
2. search pagination with offset
3. search with offset+limit
4. compare with the search results whose corresponding ids should be the same
expected: search successfully and ids is correct
"""
client = self._client()
# 1. Use the shared collection created by the class-level fixture
collection_name = self.collection_name
# 2. Search with pagination for 10 pages
limit = 100
pages = 10
vectors_to_search = cf.gen_vectors(default_nq, default_dim)
all_pages_results = []
for page in range(pages):
offset = page * limit
search_params = {"metric_type": "COSINE", "params": {"nprobe": 100}, "offset": offset}
search_res_with_offset, _ = self.search(
client,
collection_name,
vectors_to_search[:default_nq],
anns_field=default_vector_field_name,
search_params=search_params,
limit=limit,
check_task=CheckTasks.check_search_results,
check_items={"enable_milvus_client_api": True,
"nq": default_nq,
"limit": limit
}
)
all_pages_results.append(search_res_with_offset)
# 3. Search without pagination
search_params_full = {"metric_type": "COSINE", "params": {"nprobe": 100}}
search_res_full, _ = self.search(
client,
collection_name,
vectors_to_search[:default_nq],
anns_field=default_vector_field_name,
search_params=search_params_full,
limit=limit * pages
)
# 4. Compare results - verify pagination results equal the results in full search with offsets
for p in range(pages):
page_res = all_pages_results[p]
for i in range(default_nq):
page_ids = [page_res[i][j].get('id') for j in range(limit)]
ids_in_full = [search_res_full[i][p * limit:p * limit + limit][j].get('id') for j in range(limit)]
assert page_ids == ids_in_full
@pytest.mark.tags(CaseLabel.L0)
def test_milvus_client_search_with_pagination_default2(self):
"""
target: test search with pagination
method: 1. connect and create a collection
2. search pagination with offset
3. search with offset+limit
4. compare with the search results whose corresponding ids should be the same
expected: search successfully and ids is correct
"""
client = self._client()
# 1. Use the shared collection created by the class-level fixture
collection_name = self.collection_name
# 2. Search with pagination for 10 pages
limit = 100
pages = 10
vectors_to_search = cf.gen_vectors(default_nq, default_dim)
all_pages_results = []
for page in range(pages):
offset = page * limit
search_params = {"metric_type": "COSINE", "params": {"nprobe": 100}, "offset": offset}
search_res_with_offset, _ = self.search(
client,
collection_name,
vectors_to_search[:default_nq],
anns_field=default_vector_field_name,
search_params=search_params,
limit=limit,
check_task=CheckTasks.check_search_results,
check_items={"enable_milvus_client_api": True,
"nq": default_nq,
"limit": limit
}
)
all_pages_results.append(search_res_with_offset)
# 3. Search without pagination
search_params_full = {"metric_type": "COSINE", "params": {"nprobe": 100}}
search_res_full, _ = self.search(
client,
collection_name,
vectors_to_search[:default_nq],
anns_field=default_vector_field_name,
search_params=search_params_full,
limit=limit * pages
)
# 4. Compare results - verify pagination results equal the results in full search with offsets
for p in range(pages):
page_res = all_pages_results[p]
for i in range(default_nq):
page_ids = [page_res[i][j].get('id') for j in range(limit)]
ids_in_full = [search_res_full[i][p * limit:p * limit + limit][j].get('id') for j in range(limit)]
assert page_ids == ids_in_full
# @pytest.mark.tags(CaseLabel.L0)
# def test_milvus_client_search_with_pagination_default(self):
# """
# target: test search with pagination
# method: 1. connect and create a collection
# 2. search pagination with offset
# 3. search with offset+limit
# 4. compare with the search results whose corresponding ids should be the same
# expected: search successfully and ids is correct
# """
# client = self._client()
# # 1. Create collection with schema
# collection_name = cf.gen_unique_str("test_search_pagination")
# self.create_collection(client, collection_name, default_dim)
#
# # Insert data 5 times with non-duplicated primary keys
# for j in range(5):
# rows = [{default_primary_key_field_name: i + j * default_nb,
# default_vector_field_name: list(cf.gen_vectors(1, default_dim)[0]),
# default_float_field_name: (i + j * default_nb) * 1.0,
# default_string_field_name: str(i + j * default_nb)}
# for i in range(default_nb)]
# self.insert(client, collection_name, rows)
# self.flush(client, collection_name)
#
# # 2. Search with pagination for 10 pages
# limit = 100
# pages = 10
# vectors_to_search = cf.gen_vectors(default_nq, default_dim)
# all_pages_results = []
# for page in range(pages):
# offset = page * limit
# search_params = {"metric_type": "COSINE", "params": {"nprobe": 100}, "offset": offset}
# search_res_with_offset, _ = self.search(
# client,
# collection_name,
# vectors_to_search[:default_nq],
# anns_field=default_vector_field_name,
# search_params=search_params,
# limit=limit,
# check_task=CheckTasks.check_search_results,
# check_items={"enable_milvus_client_api": True,
# "nq": default_nq,
# "limit": limit
# }
# )
# all_pages_results.append(search_res_with_offset)
#
# # 3. Search without pagination
# search_params_full = {"metric_type": "COSINE", "params": {"nprobe": 100}}
# search_res_full, _ = self.search(
# client,
# collection_name,
# vectors_to_search[:default_nq],
# anns_field=default_vector_field_name,
# search_params=search_params_full,
# limit=limit * pages
# )
#
# # 4. Compare results - verify pagination results equal the results in full search with offsets
# for p in range(pages):
# page_res = all_pages_results[p]
# for i in range(default_nq):
# page_ids = [page_res[i][j].get('id') for j in range(limit)]
# ids_in_full = [search_res_full[i][p*limit:p*limit+limit][j].get('id') for j in range(limit)]
# assert page_ids == ids_in_full
class TestSearchPagination(TestcaseBase):
""" Test case of search pagination """
@pytest.fixture(scope="function", params=[0, 10, 100])
def offset(self, request):
yield request.param
@pytest.fixture(scope="function", params=[False, True])
def auto_id(self, request):
yield request.param
@pytest.fixture(scope="function", params=[False, True])
def _async(self, request):
yield request.param
@pytest.fixture(scope="function", params=[True, False])
def enable_dynamic_field(self, request):
yield request.param
@pytest.fixture(scope="function", params=["FLOAT_VECTOR", "FLOAT16_VECTOR", "BFLOAT16_VECTOR"])
def vector_data_type(self, request):
yield request.param
"""
******************************************************************
# The following are valid base cases
******************************************************************
"""
@pytest.mark.tags(CaseLabel.L1)
def test_search_string_with_pagination(self, offset, _async):
"""
target: test search string with pagination
method: 1. connect and create a collection
2. search pagination with offset
3. search with offset+limit
4. compare with the search results whose corresponding ids should be the same
expected: search successfully and ids is correct
"""
# 1. create a collection
auto_id = True
enable_dynamic_field = True
collection_w, _, _, insert_ids = \
self.init_collection_general(prefix, True, auto_id=auto_id, dim=default_dim,
enable_dynamic_field=enable_dynamic_field)[0:4]
# 2. search
search_param = {"metric_type": "COSINE", "params": {"nprobe": 10}, "offset": offset}
vectors = [[random.random() for _ in range(default_dim)] for _ in range(default_nq)]
output_fields = [default_string_field_name, default_float_field_name]
search_res = collection_w.search(vectors[:default_nq], default_search_field,
search_param, default_limit,
default_search_string_exp,
output_fields=output_fields,
_async=_async,
check_task=CheckTasks.check_search_results,
check_items={"nq": default_nq,
"ids": insert_ids,
"limit": default_limit,
"_async": _async})[0]
# 3. search with offset+limit
res = collection_w.search(vectors[:default_nq], default_search_field, default_search_params,
default_limit + offset, default_search_string_exp, _async=_async)[0]
if _async:
search_res.done()
search_res = search_res.result()
res.done()
res = res.result()
res_distance = res[0].distances[offset:]
# assert sorted(search_res[0].distances, key=numpy.float32) == sorted(res_distance, key=numpy.float32)
assert set(search_res[0].ids) == set(res[0].ids[offset:])
@pytest.mark.tags(CaseLabel.L1)
def test_search_binary_with_pagination(self, offset):
"""
target: test search binary with pagination
method: 1. connect and create a collection
2. search pagination with offset
3. search with offset+limit
4. compare with the search results whose corresponding ids should be the same
expected: search successfully and ids is correct
"""
# 1. create a collection
auto_id = False
collection_w, _, _, insert_ids = \
self.init_collection_general(
prefix, True, is_binary=True, auto_id=auto_id, dim=default_dim)[0:4]
# 2. search
search_param = {"metric_type": "JACCARD",
"params": {"nprobe": 10}, "offset": offset}
binary_vectors = cf.gen_binary_vectors(default_nq, default_dim)[1]
search_res = collection_w.search(binary_vectors[:default_nq], "binary_vector",
search_param, default_limit,
check_task=CheckTasks.check_search_results,
check_items={"nq": default_nq,
"ids": insert_ids,
"limit": default_limit})[0]
# 3. search with offset+limit
search_binary_param = {
"metric_type": "JACCARD", "params": {"nprobe": 10}}
res = collection_w.search(binary_vectors[:default_nq], "binary_vector", search_binary_param,
default_limit + offset)[0]
assert len(search_res[0].ids) == len(res[0].ids[offset:])
assert sorted(search_res[0].distances, key=np.float32) == sorted(
res[0].distances[offset:], key=np.float32)
@pytest.mark.tags(CaseLabel.L1)
def test_search_all_vector_type_with_pagination(self, vector_data_type):
"""
target: test search with pagination using different vector datatype
method: 1. connect and create a collection
2. search pagination with offset
3. search with offset+limit
4. compare with the search results whose corresponding ids should be the same
expected: search successfully and ids is correct
"""
# 1. create a collection
auto_id = False
enable_dynamic_field = True
offset = 100
limit = 20
collection_w = self.init_collection_general(prefix, True, auto_id=auto_id, dim=default_dim,
enable_dynamic_field=enable_dynamic_field,
vector_data_type=vector_data_type)[0]
# 2. search pagination with offset
search_param = {"metric_type": "COSINE", "params": {"nprobe": 10}, "offset": offset}
vectors = cf.gen_vectors_based_on_vector_type(default_nq, default_dim, vector_data_type)
search_res = collection_w.search(vectors[:default_nq], default_search_field,
search_param, limit,
default_search_exp,
check_task=CheckTasks.check_search_results,
check_items={"nq": default_nq,
"limit": limit})[0]
# 3. search with offset+limit
res = collection_w.search(vectors[:default_nq], default_search_field, default_search_params,
limit + offset, default_search_exp)[0]
res_distance = res[0].distances[offset:]
# assert sorted(search_res[0].distances, key=numpy.float32) == sorted(res_distance, key=numpy.float32)
assert set(search_res[0].ids) == set(res[0].ids[offset:])
@pytest.mark.tags(CaseLabel.L2)
@pytest.mark.parametrize("limit", [100, 3000, 10000])
def test_search_with_pagination_topK(self, limit, _async):
"""
target: test search with pagination limit + offset = topK
method: 1. connect and create a collection
2. search pagination with offset
3. search with topK
4. compare with the search results whose corresponding ids should be the same
expected: search successfully and ids is correct
"""
# 1. create a collection
topK = 16384
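# 16384 is the maximum limit + offset window the server accepts for pagination, so offset is chosen as topK - limit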
auto_id = True
offset = topK - limit
collection_w = self.init_collection_general(
prefix, True, nb=20000, auto_id=auto_id, dim=default_dim)[0]
# 2. search
search_param = {"metric_type": "COSINE",
"params": {"nprobe": 10}, "offset": offset}
vectors = [[random.random() for _ in range(default_dim)]
for _ in range(default_nq)]
search_res = collection_w.search(vectors[:default_nq], default_search_field,
search_param, limit,
default_search_exp, _async=_async,
check_task=CheckTasks.check_search_results,
check_items={"nq": default_nq,
"limit": limit,
"_async": _async})[0]
# 3. search with topK
res = collection_w.search(vectors[:default_nq], default_search_field, default_search_params,
topK, default_search_exp, _async=_async)[0]
if _async:
search_res.done()
search_res = search_res.result()
res.done()
res = res.result()
res_distance = res[0].distances[offset:]
# assert sorted(search_res[0].distances, key=numpy.float32) == sorted(res_distance, key=numpy.float32)
assert set(search_res[0].ids) == set(res[0].ids[offset:])
@pytest.mark.tags(CaseLabel.L2)
def test_search_pagination_with_expression(self, offset):
"""
target: test search pagination with expression
method: create connection, collection, insert and search with expression
expected: search successfully
"""
# 1. create a collection
nb = 2500
dim = 38
enable_dynamic_field = False
collection_w, _vectors, _, insert_ids = \
self.init_collection_general(prefix, True, nb=nb, dim=dim,
enable_dynamic_field=enable_dynamic_field)[0:4]
collection_w.load()
# filter result with expression in collection
_vectors = _vectors[0]
for _async in [False, True]:
for expressions in cf.gen_normal_expressions_and_templates():
log.debug(f"search with expression: {expressions} with _async: {_async}")
expr = expressions[0].replace("&&", "and").replace("||", "or")
filter_ids = []
for i, _id in enumerate(insert_ids):
if enable_dynamic_field:
int64 = _vectors[i][ct.default_int64_field_name]
float = _vectors[i][ct.default_float_field_name]
else:
int64 = _vectors.int64[i]
float = _vectors.float[i]
if not expr or eval(expr):
filter_ids.append(_id)
# 2. search
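# compute the expected hit count: the page is empty when offset exceeds the matched set, otherwise it is capped by what remains after the offset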
limit = min(default_limit, len(filter_ids))
if offset >= len(filter_ids):
limit = 0
elif len(filter_ids) - offset < default_limit:
limit = len(filter_ids) - offset
search_param = {"metric_type": "COSINE",
"params": {"nprobe": 10}, "offset": offset}
vectors = [[random.random() for _ in range(dim)]
for _ in range(default_nq)]
search_res, _ = collection_w.search(vectors[:default_nq], default_search_field,
search_param, default_limit,
expr=expr,
_async=_async,
check_task=CheckTasks.check_search_results,
check_items={"nq": default_nq,
"ids": insert_ids,
"limit": limit,
"_async": _async})
# 3. search with offset+limit
res = collection_w.search(vectors[:default_nq], default_search_field, default_search_params,
default_limit + offset,
expr=expr, _async=_async)[0]
if _async:
res.done()
res = res.result()
search_res.done()
search_res = search_res.result()
filter_ids_set = set(filter_ids)
for hits in search_res:
ids = hits.ids
assert set(ids).issubset(filter_ids_set)
assert set(search_res[0].ids) == set(res[0].ids[offset:])
# 4. search again with expression template
expr = cf.get_expr_from_template(expressions[1]).replace("&&", "and").replace("||", "or")
expr_params = cf.get_expr_params_from_template(expressions[1])
search_res, _ = collection_w.search(vectors[:default_nq], default_search_field,
search_param, default_limit,
expr=expr, expr_params=expr_params,
_async=_async,
check_task=CheckTasks.check_search_results,
check_items={"nq": default_nq,
"ids": insert_ids,
"limit": limit,
"_async": _async})
# 3. search with offset+limit
res = collection_w.search(vectors[:default_nq], default_search_field, default_search_params,
default_limit + offset,
expr=expr, expr_params=expr_params, _async=_async)[0]
if _async:
res.done()
res = res.result()
search_res.done()
search_res = search_res.result()
filter_ids_set = set(filter_ids)
for hits in search_res:
ids = hits.ids
assert set(ids).issubset(filter_ids_set)
assert set(search_res[0].ids) == set(res[0].ids[offset:])
@pytest.mark.tags(CaseLabel.L2)
def test_search_pagination_with_index_partition(self, offset, _async):
"""
target: test search pagination with index and partition
method: create connection, collection, insert data, create index and search
expected: searched successfully
"""
# 1. initialize with data
auto_id = False
collection_w, _, _, insert_ids = self.init_collection_general(prefix, True,
partition_num=1,
auto_id=auto_id,
is_index=False)[0:4]
vectors = [[random.random() for _ in range(default_dim)]
for _ in range(default_nq)]
# 2. create index
default_index = {"index_type": "IVF_FLAT",
"params": {"nlist": 128}, "metric_type": "L2"}
collection_w.create_index("float_vector", default_index)
collection_w.load()
# 3. search through partitions
par = collection_w.partitions
limit = 100
search_params = {"metric_type": "L2",
"params": {"nprobe": 10}, "offset": offset}
search_res = collection_w.search(vectors[:default_nq], default_search_field,
search_params, limit, default_search_exp,
[par[0].name, par[1].name], _async=_async,
check_task=CheckTasks.check_search_results,
check_items={"nq": default_nq,
"ids": insert_ids,
"limit": limit,
"_async": _async})[0]
# 3. search through partitions with offset+limit
search_params = {"metric_type": "L2"}
res = collection_w.search(vectors[:default_nq], default_search_field, search_params,
limit + offset, default_search_exp,
[par[0].name, par[1].name], _async=_async)[0]
if _async:
search_res.done()
search_res = search_res.result()
res.done()
res = res.result()
res_distance = res[0].distances[offset:]
# assert cf.sort_search_distance(search_res[0].distances) == cf.sort_search_distance(res_distance)
assert set(search_res[0].ids) == set(res[0].ids[offset:])
@pytest.mark.tags(CaseLabel.L2)
def test_search_pagination_with_inserted_data(self, offset, _async):
"""
target: test search pagination with inserted data
method: create connection, collection, insert data and search
check the results by searching with limit+offset
expected: searched successfully
"""
# 1. create collection
collection_w = self.init_collection_general(
prefix, False, dim=default_dim)[0]
# 2. insert data
data = cf.gen_default_dataframe_data(dim=default_dim)
collection_w.insert(data)
collection_w.load()
# 3. search
search_params = {"offset": offset}
search_res = collection_w.search(vectors[:default_nq], default_search_field,
search_params, default_limit,
default_search_exp, _async=_async,
check_task=CheckTasks.check_search_results,
check_items={"nq": default_nq,
"limit": default_limit,
"_async": _async})[0]
# 4. search through partitions with offset+limit
search_params = {}
res = collection_w.search(vectors[:default_nq], default_search_field, search_params,
default_limit + offset, default_search_exp, _async=_async)[0]
if _async:
search_res.done()
search_res = search_res.result()
res.done()
res = res.result()
res_distance = res[0].distances[offset:]
assert sorted(search_res[0].distances) == sorted(res_distance)
assert set(search_res[0].ids) == set(res[0].ids[offset:])
@pytest.mark.tags(CaseLabel.L2)
def test_search_pagination_empty(self, offset, _async):
"""
target: test search pagination empty
method: connect, create collection, insert data and search
expected: search successfully
"""
# 1. initialize with data
auto_id = False
collection_w = self.init_collection_general(
prefix, True, auto_id=auto_id, dim=default_dim)[0]
# 2. search collection without data
search_param = {"metric_type": "COSINE",
"params": {"nprobe": 10}, "offset": offset}
search_res = collection_w.search([], default_search_field, search_param,
default_limit, default_search_exp, _async=_async,
check_task=CheckTasks.check_search_results,
check_items={"nq": 0,
"_async": _async})[0]
if _async:
search_res.done()
search_res = search_res.result()
assert len(search_res) == 0
@pytest.mark.tags(CaseLabel.L2)
@pytest.mark.parametrize("offset", [3000, 5000])
def test_search_pagination_with_offset_over_num_entities(self, offset):
"""
target: test search pagination with offset over num_entities
method: create connection, collection, insert 3000 entities and search with offset over 3000
expected: return an empty list
"""
# 1. initialize
collection_w = self.init_collection_general(
prefix, True, dim=default_dim)[0]
# 2. search
search_param = {"metric_type": "COSINE",
"params": {"nprobe": 10}, "offset": offset}
vectors = [[random.random() for _ in range(default_dim)]
for _ in range(default_nq)]
res = collection_w.search(vectors[:default_nq], default_search_field,
search_param, default_limit,
default_search_exp,
check_task=CheckTasks.check_search_results,
check_items={"nq": default_nq,
"limit": 0})[0]
assert res[0].ids == []
@pytest.mark.tags(CaseLabel.L2)
@pytest.mark.parametrize("index", ct.all_index_types[:7])
def test_search_pagination_after_different_index(self, index, offset, _async):
"""
target: test search pagination after different index
method: test search pagination after different index and corresponding search params
expected: search successfully
"""
# 1. initialize with data
dim = 128
auto_id = True
collection_w, _, _, insert_ids, time_stamp = self.init_collection_general(prefix, True, 1000,
partition_num=1,
auto_id=auto_id,
dim=dim, is_index=False)[0:5]
# 2. create index and load
params = cf.get_index_params_params(index)
default_index = {"index_type": index, "params": params, "metric_type": "L2"}
collection_w.create_index("float_vector", default_index)
collection_w.load()
# 3. search
search_params = cf.gen_search_param(index)
vectors = [[random.random() for _ in range(dim)] for _ in range(default_nq)]
for search_param in search_params:
res = collection_w.search(vectors[:default_nq], default_search_field, search_param,
default_limit + offset, default_search_exp, _async=_async)[0]
search_param["offset"] = offset
log.info("Searching with search params: {}".format(search_param))
search_res = collection_w.search(vectors[:default_nq], default_search_field,
search_param, default_limit,
default_search_exp, _async=_async,
check_task=CheckTasks.check_search_results,
check_items={"nq": default_nq,
"ids": insert_ids,
"limit": default_limit,
"_async": _async})[0]
if _async:
search_res.done()
search_res = search_res.result()
res.done()
res = res.result()
res_distance = res[0].distances[offset:]
# assert sorted(search_res[0].distances, key=numpy.float32) == sorted(res_distance, key=numpy.float32)
assert set(search_res[0].ids) == set(res[0].ids[offset:])
@pytest.mark.tags(CaseLabel.L2)
@pytest.mark.parametrize("offset", [100, default_nb // 2])
def test_search_offset_different_position(self, offset):
"""
target: test search pagination with offset in different position
method: create connection, collection, insert entities and search with offset
expected: search successfully
"""
# 1. initialize
collection_w = self.init_collection_general(prefix, True)[0]
# 2. search with offset in params
search_params = {"metric_type": "COSINE",
"params": {"nprobe": 10}, "offset": offset}
res1 = collection_w.search(vectors[:default_nq], default_search_field,
search_params, default_limit)[0]
# 3. search with offset outside params
res2 = collection_w.search(vectors[:default_nq], default_search_field, default_search_params,
default_limit, offset=offset)[0]
assert res1[0].ids == res2[0].ids
@pytest.mark.tags(CaseLabel.L2)
@pytest.mark.parametrize("offset", [1, 5, 20])
def test_search_sparse_with_pagination(self, offset):
"""
target: test search sparse with pagination
method: 1. connect and create a collection
2. search pagination with offset
3. search with offset+limit
4. compare with the search results whose corresponding ids should be the same
expected: search successfully and ids is correct
"""
# 1. create a collection
auto_id = False
collection_w, _, _, insert_ids = \
self.init_collection_general(
prefix, True, auto_id=auto_id, vector_data_type=ct.sparse_vector)[0:4]
# 2. search pagination with offset
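# drop_ratio_search drops a proportion of the smallest values from the query sparse vector to speed up the search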
search_param = {"metric_type": "IP", "params": {"drop_ratio_search": "0.2"}, "offset": offset}
search_vectors = cf.gen_default_list_sparse_data()[-1][-2:]
search_res = collection_w.search(search_vectors, ct.default_sparse_vec_field_name,
search_param, default_limit)[0]
# 3. search with offset+limit
_search_param = {"metric_type": "IP", "params": {"drop_ratio_search": "0.2"}}
res = collection_w.search(search_vectors[:default_nq], ct.default_sparse_vec_field_name, _search_param,
default_limit + offset)[0]
assert len(search_res[0].ids) == len(res[0].ids[offset:])
assert sorted(search_res[0].distances, key=np.float32) == sorted(
res[0].distances[offset:], key=np.float32)
class TestSearchPaginationInvalid(TestMilvusClientV2Base):
""" Test case of search pagination """
"""
******************************************************************
# The following are invalid cases
******************************************************************
"""
@pytest.mark.tags(CaseLabel.L1)
def test_search_pagination_with_invalid_offset_type(self):
"""
target: test search pagination with invalid offset type
method: create connection, collection, insert and search with invalid offset type
expected: raise exception
"""
client = self._client()
# 1. Create collection with schema
collection_name = cf.gen_unique_str("test_search_pagination")
self.create_collection(client, collection_name, default_dim)
# Insert data
rows = [{default_primary_key_field_name: i, default_vector_field_name: list(cf.gen_vectors(1, default_dim)[0]),
default_float_field_name: i * 1.0, default_string_field_name: str(i)} for i in range(default_nb)]
self.insert(client, collection_name, rows)
self.flush(client, collection_name)
# Search with invalid offset types
vectors_to_search = cf.gen_vectors(default_nq, default_dim)
invalid_offsets = [" ", [1, 2], {1}, "12 s"]
for offset in invalid_offsets:
log.debug(f"assert search error if offset={offset}")
search_params = {"metric_type": "COSINE", "params": {"nprobe": 10}, "offset": offset}
self.search(
client,
collection_name,
vectors_to_search[:default_nq],
anns_field=default_vector_field_name,
search_params=search_params,
limit=default_limit,
check_task=CheckTasks.err_res,
check_items={
"err_code": 1,
"err_msg": "wrong type for offset, expect int"
}
)
@pytest.mark.tags(CaseLabel.L1)
def test_search_pagination_with_invalid_offset_value(self):
"""
target: test search pagination with invalid offset value
method: create connection, collection, insert and search with invalid offset value
expected: raise exception
"""
client = self._client()
# 1. Create collection with schema
collection_name = cf.gen_unique_str("test_search_pagination")
self.create_collection(client, collection_name, default_dim)
# Insert data
rows = [{default_primary_key_field_name: i, default_vector_field_name: list(cf.gen_vectors(1, default_dim)[0]),
default_float_field_name: i * 1.0, default_string_field_name: str(i)} for i in range(default_nb)]
self.insert(client, collection_name, rows)
self.flush(client, collection_name)
# Search with invalid offset values
vectors_to_search = cf.gen_vectors(default_nq, default_dim)
invalid_offsets = [-1, 16385]
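# both values fall outside the offset range accepted by the server; see the expected error message below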
for offset in invalid_offsets:
log.debug(f"assert search error if offset={offset}")
search_params = {"metric_type": "COSINE", "params": {"nprobe": 10}, "offset": offset}
self.search(
client,
collection_name,
vectors_to_search[:default_nq],
anns_field=default_vector_field_name,
search_params=search_params,
limit=default_limit,
check_task=CheckTasks.err_res,
check_items={
"err_code": 1,
"err_msg": f"offset [{offset}] is invalid, it should be in range [1, 16384]"
}
)

View File

@ -0,0 +1,729 @@
import numpy as np
from pymilvus.orm.types import CONSISTENCY_STRONG, CONSISTENCY_BOUNDED, CONSISTENCY_SESSION, CONSISTENCY_EVENTUALLY
from pymilvus import AnnSearchRequest, RRFRanker, WeightedRanker
from pymilvus import (
FieldSchema, CollectionSchema, DataType,
Collection
)
from common.constants import *
from utils.util_pymilvus import *
from common.common_type import CaseLabel, CheckTasks
from common import common_type as ct
from common import common_func as cf
from utils.util_log import test_log as log
from base.client_base import TestcaseBase
import heapq
from time import sleep
from decimal import Decimal, getcontext
import decimal
import multiprocessing
import numbers
import random
import math
import numpy
import threading
import pytest
import pandas as pd
from faker import Faker
Faker.seed(19530)
fake_en = Faker("en_US")
fake_zh = Faker("zh_CN")
# patch faker to generate text with specific distribution
cf.patch_faker_text(fake_en, cf.en_vocabularies_distribution)
cf.patch_faker_text(fake_zh, cf.zh_vocabularies_distribution)
pd.set_option("expand_frame_repr", False)
prefix = "search_collection"
search_num = 10
max_dim = ct.max_dim
min_dim = ct.min_dim
epsilon = ct.epsilon
hybrid_search_epsilon = 0.01
gracefulTime = ct.gracefulTime
default_nb = ct.default_nb
default_nb_medium = ct.default_nb_medium
default_nq = ct.default_nq
default_dim = ct.default_dim
default_limit = ct.default_limit
max_limit = ct.max_limit
default_search_exp = "int64 >= 0"
default_search_string_exp = "varchar >= \"0\""
default_search_mix_exp = "int64 >= 0 && varchar >= \"0\""
default_invaild_string_exp = "varchar >= 0"
default_json_search_exp = "json_field[\"number\"] >= 0"
perfix_expr = 'varchar like "0%"'
default_search_field = ct.default_float_vec_field_name
default_search_params = ct.default_search_params
default_int64_field_name = ct.default_int64_field_name
default_float_field_name = ct.default_float_field_name
default_bool_field_name = ct.default_bool_field_name
default_string_field_name = ct.default_string_field_name
default_json_field_name = ct.default_json_field_name
default_index_params = ct.default_index
vectors = [[random.random() for _ in range(default_dim)] for _ in range(default_nq)]
range_search_supported_indexes = ct.all_index_types[:7]
uid = "test_search"
nq = 1
epsilon = 0.001
field_name = default_float_vec_field_name
binary_field_name = default_binary_vec_field_name
search_param = {"nprobe": 1}
entity = gen_entities(1, is_normal=True)
entities = gen_entities(default_nb, is_normal=True)
raw_vectors, binary_entities = gen_binary_entities(default_nb)
default_query, _ = gen_search_vectors_params(field_name, entities, default_top_k, nq)
index_name1 = cf.gen_unique_str("float")
index_name2 = cf.gen_unique_str("varhar")
half_nb = ct.default_nb // 2
max_hybrid_search_req_num = ct.max_hybrid_search_req_num
class TestSearchString(TestcaseBase):
"""
******************************************************************
The following cases are used to test search about string
******************************************************************
"""
@pytest.fixture(scope="function",
params=[default_nb, default_nb_medium])
def nb(self, request):
yield request.param
@pytest.fixture(scope="function", params=[2, 500])
def nq(self, request):
yield request.param
@pytest.fixture(scope="function", params=[32, 128])
def dim(self, request):
yield request.param
@pytest.fixture(scope="function", params=[False, True])
def auto_id(self, request):
yield request.param
@pytest.fixture(scope="function", params=[False, True])
def _async(self, request):
yield request.param
@pytest.fixture(scope="function", params=[True, False])
def enable_dynamic_field(self, request):
yield request.param
@pytest.mark.tags(CaseLabel.L2)
def test_search_string_field_not_primary(self, _async):
"""
target: test search with string expr and string field is not primary
method: create collection and insert data
create index and collection load
collection search uses string expr in string field, string field is not primary
expected: Search successfully
"""
# 1. initialize with data
auto_id = True
enable_dynamic_field = False
collection_w, insert_data, _, insert_ids = \
self.init_collection_general(prefix, True, auto_id=auto_id, dim=default_dim, nb=1000,
enable_dynamic_field=enable_dynamic_field, language="Chinese")[0:4]
search_str = insert_data[0][default_string_field_name][1]
search_exp = f"{default_string_field_name} == '{search_str}'"
# 2. search
log.info("test_search_string_field_not_primary: searching collection %s" % collection_w.name)
log.info("search expr: %s" % search_exp)
output_fields = [default_string_field_name, default_float_field_name]
res, _ = collection_w.search(vectors[:default_nq], default_search_field,
default_search_params, default_limit, search_exp,
output_fields=output_fields,
_async=_async,
check_task=CheckTasks.check_search_results,
check_items={"nq": default_nq,
"ids": insert_ids,
"limit": 1,
"_async": _async})
if _async:
res.done()
res = res.result()
assert res[0][0].entity.varchar == search_str
@pytest.mark.tags(CaseLabel.L2)
def test_search_string_field_is_primary_true(self, _async):
"""
target: test search with string expr and string field is primary
method: create collection and insert data
create index and collection load
collection search uses string expr in string field ,string field is primary
expected: Search successfully
"""
# 1. initialize with data
dim = 64
enable_dynamic_field = True
collection_w, insert_data, _, insert_ids = \
self.init_collection_general(prefix, True, dim=dim, primary_field=ct.default_string_field_name,
enable_dynamic_field=enable_dynamic_field, language="English", nb=1000)[0:4]
search_str = insert_data[0][1][default_string_field_name]
search_exp = f"{default_string_field_name} == '{search_str}'"
# 2. search
log.info("test_search_string_field_is_primary_true: searching collection %s" % collection_w.name)
log.info("search expr: %s" % search_exp)
vectors = [[random.random() for _ in range(dim)] for _ in range(default_nq)]
output_fields = [default_string_field_name, default_float_field_name]
res, _ = collection_w.search(vectors[:default_nq], default_search_field,
default_search_params, default_limit, search_exp,
output_fields=output_fields,
_async=_async,
check_task=CheckTasks.check_search_results,
check_items={"nq": default_nq,
"ids": insert_ids,
"limit": 1,
"_async": _async})
if _async:
res.done()
res = res.result()
assert res[0][0].entity.varchar == search_str
@pytest.mark.tags(CaseLabel.L2)
def test_search_string_field_is_primary_true_multi_vector_fields(self, _async):
"""
target: test search with string expr and string field is primary
method: create collection and insert data
create index and collection load
collection search uses string expr in string field ,string field is primary
expected: Search successfully
"""
# 1. initialize with data
dim = 64
enable_dynamic_field = False
multiple_dim_array = [dim, dim]
collection_w, _, _, insert_ids = \
self.init_collection_general(prefix, True, dim=dim, primary_field=ct.default_string_field_name,
enable_dynamic_field=enable_dynamic_field,
multiple_dim_array=multiple_dim_array, language="German")[0:4]
# 2. search
log.info("test_search_string_field_is_primary_true: searching collection %s" %
collection_w.name)
vectors = [[random.random() for _ in range(dim)] for _ in range(default_nq)]
output_fields = [default_string_field_name, default_float_field_name]
vector_list = cf.extract_vector_field_name_list(collection_w)
for search_field in vector_list:
collection_w.search(vectors[:default_nq], search_field,
default_search_params, default_limit,
default_search_string_exp,
output_fields=output_fields,
_async=_async,
check_task=CheckTasks.check_search_results,
check_items={"nq": default_nq,
"ids": insert_ids,
"limit": default_limit,
"_async": _async})
@pytest.mark.tags(CaseLabel.L2)
def test_range_search_string_field_is_primary_true(self, _async):
"""
target: test range search with string expr and string field is primary
method: create collection and insert data
create index and collection load
collection search uses string expr in string field ,string field is primary
expected: Search successfully
"""
# 1. initialize with data
dim = 64
enable_dynamic_field = True
multiple_dim_array = [dim, dim]
collection_w, _, _, insert_ids = \
self.init_collection_general(prefix, True, dim=dim, primary_field=ct.default_string_field_name,
enable_dynamic_field=enable_dynamic_field, is_index=False,
multiple_dim_array=multiple_dim_array)[0:4]
vector_list = cf.extract_vector_field_name_list(collection_w)
collection_w.create_index(field_name, {"metric_type": "L2"})
for vector_field_name in vector_list:
collection_w.create_index(vector_field_name, {"metric_type": "L2"})
collection_w.load()
# 2. search
log.info("test_search_string_field_is_primary_true: searching collection %s" %
collection_w.name)
range_search_params = {"metric_type": "L2",
"params": {"radius": 1000, "range_filter": 0}}
vectors = [[random.random() for _ in range(dim)]
for _ in range(default_nq)]
output_fields = [default_string_field_name, default_float_field_name]
for search_field in vector_list:
collection_w.search(vectors[:default_nq], search_field,
range_search_params, default_limit,
default_search_string_exp,
output_fields=output_fields,
_async=_async,
check_task=CheckTasks.check_search_results,
check_items={"nq": default_nq,
"ids": insert_ids,
"limit": default_limit,
"_async": _async})
@pytest.mark.tags(CaseLabel.L2)
def test_search_string_mix_expr(self, _async):
"""
target: test search with mix string and int expr
method: create collection and insert data
create index and collection load
collection search uses mix expr
expected: Search successfully
"""
# 1. initialize with data
dim = 64
auto_id = False
enable_dynamic_field = False
collection_w, _, _, insert_ids = \
self.init_collection_general(prefix, True, auto_id=auto_id, dim=dim,
enable_dynamic_field=enable_dynamic_field)[0:4]
# 2. search
log.info("test_search_string_mix_expr: searching collection %s" %
collection_w.name)
vectors = [[random.random() for _ in range(dim)]
for _ in range(default_nq)]
output_fields = [default_string_field_name, default_float_field_name]
collection_w.search(vectors[:default_nq], default_search_field,
default_search_params, default_limit,
default_search_mix_exp,
output_fields=output_fields,
_async=_async,
check_task=CheckTasks.check_search_results,
check_items={"nq": default_nq,
"ids": insert_ids,
"limit": default_limit,
"_async": _async})
@pytest.mark.tags(CaseLabel.L2)
def test_search_string_with_invalid_expr(self):
"""
target: test search data
method: create collection and insert data
create index and collection load
collection search uses invalid string expr
expected: Raise exception
"""
# 1. initialize with data
auto_id = True
collection_w, _, _, insert_ids = \
self.init_collection_general(prefix, True, auto_id=auto_id, dim=default_dim)[0:4]
# 2. search
log.info("test_search_string_with_invalid_expr: searching collection %s" %
collection_w.name)
vectors = [[random.random() for _ in range(default_dim)] for _ in range(default_nq)]
collection_w.search(vectors[:default_nq], default_search_field,
default_search_params, default_limit,
default_invaild_string_exp,
check_task=CheckTasks.err_res,
check_items={"err_code": 1100,
"err_msg": "failed to create query plan: cannot "
"parse expression: varchar >= 0"})
@pytest.mark.tags(CaseLabel.L2)
@pytest.mark.parametrize("expression", cf.gen_normal_string_expressions([ct.default_string_field_name]))
def test_search_with_different_string_expr(self, expression, _async):
"""
target: test search with different string expressions
method: test search with different string expressions
expected: searched successfully with correct limit(topK)
"""
# 1. initialize with data
dim = 64
nb = 1000
enable_dynamic_field = True
collection_w, _vectors, _, insert_ids = \
self.init_collection_general(prefix, True, nb, dim=dim,
is_index=False, enable_dynamic_field=enable_dynamic_field)[0:4]
# filter result with expression in collection
_vectors = _vectors[0]
filter_ids = []
expression = expression.replace("&&", "and").replace("||", "or")
for i, _id in enumerate(insert_ids):
if enable_dynamic_field:
int64 = _vectors[i][ct.default_int64_field_name]
varchar = _vectors[i][ct.default_string_field_name]
else:
int64 = _vectors.int64[i]
varchar = _vectors.varchar[i]
if not expression or eval(expression):
filter_ids.append(_id)
# 2. create index
index_param = {"index_type": "FLAT", "metric_type": "COSINE", "params": {"nlist": 100}}
collection_w.create_index("float_vector", index_param)
collection_w.load()
# 3. search with expression
log.info("test_search_with_expression: searching with expression: %s" % expression)
vectors = [[random.random() for _ in range(dim)] for _ in range(default_nq)]
search_res, _ = collection_w.search(vectors[:default_nq], default_search_field,
default_search_params, nb, expression,
_async=_async,
check_task=CheckTasks.check_search_results,
check_items={"nq": default_nq,
"ids": insert_ids,
"limit": min(nb, len(filter_ids)),
"_async": _async})
if _async:
search_res.done()
search_res = search_res.result()
filter_ids_set = set(filter_ids)
for hits in search_res:
ids = hits.ids
assert set(ids).issubset(filter_ids_set)
@pytest.mark.tags(CaseLabel.L2)
def test_search_string_field_is_primary_binary(self, _async):
"""
target: test search with string expr and string field is primary
method: create collection and insert data
create index and collection load
collection search uses string expr in string field ,string field is primary
expected: Search successfully
"""
dim = 64
# 1. initialize with binary data
collection_w, _, binary_raw_vector, insert_ids = \
self.init_collection_general(prefix, True, 2, is_binary=True, dim=dim,
is_index=False, primary_field=ct.default_string_field_name)[0:4]
# 2. create index
default_index = {"index_type": "BIN_IVF_FLAT",
"params": {"nlist": 128}, "metric_type": "JACCARD"}
collection_w.create_index("binary_vector", default_index)
collection_w.load()
# 3. search with exception
binary_vectors = cf.gen_binary_vectors(3000, dim)[1]
search_params = {"metric_type": "JACCARD", "params": {"nprobe": 10}}
output_fields = [default_string_field_name]
collection_w.search(binary_vectors[:default_nq], "binary_vector", search_params,
default_limit, default_search_string_exp, output_fields=output_fields,
_async=_async,
check_task=CheckTasks.check_search_results,
check_items={"nq": default_nq,
"ids": insert_ids,
"limit": 2,
"_async": _async})
@pytest.mark.tags(CaseLabel.L2)
def test_search_string_field_binary(self, _async):
"""
target: test search with string expr and string field is not primary
method: create an binary collection and insert data
create index and collection load
collection search uses string expr in string field, string field is not primary
expected: Search successfully
"""
# 1. initialize with binary data
dim = 128
auto_id = True
collection_w, _, binary_raw_vector, insert_ids = \
self.init_collection_general(prefix, True, 2, is_binary=True, auto_id=auto_id,
dim=dim, is_index=False)[0:4]
# 2. create index
default_index = {"index_type": "BIN_IVF_FLAT",
"params": {"nlist": 128}, "metric_type": "JACCARD"}
collection_w.create_index("binary_vector", default_index)
collection_w.load()
        # 3. search with a string expression
binary_vectors = cf.gen_binary_vectors(3000, dim)[1]
search_params = {"metric_type": "JACCARD", "params": {"nprobe": 10}}
collection_w.search(binary_vectors[:default_nq], "binary_vector", search_params,
default_limit, default_search_string_exp,
_async=_async,
check_task=CheckTasks.check_search_results,
check_items={"nq": default_nq,
"ids": insert_ids,
"limit": 2,
"_async": _async})
@pytest.mark.tags(CaseLabel.L2)
def test_search_mix_expr_with_binary(self, _async):
"""
target: test search with mix string and int expr
method: create an binary collection and insert data
create index and collection load
collection search uses mix expr
expected: Search successfully
"""
# 1. initialize with data
dim = 128
auto_id = True
collection_w, _, _, insert_ids = \
self.init_collection_general(
prefix, True, auto_id=auto_id, dim=dim, is_binary=True, is_index=False)[0:4]
# 2. create index
default_index = {"index_type": "BIN_IVF_FLAT",
"params": {"nlist": 128}, "metric_type": "JACCARD"}
collection_w.create_index("binary_vector", default_index)
collection_w.load()
        # 3. search
log.info("test_search_mix_expr_with_binary: searching collection %s" %
collection_w.name)
binary_vectors = cf.gen_binary_vectors(3000, dim)[1]
search_params = {"metric_type": "JACCARD", "params": {"nprobe": 10}}
output_fields = [default_string_field_name, default_float_field_name]
collection_w.search(binary_vectors[:default_nq], "binary_vector",
search_params, default_limit,
default_search_mix_exp,
output_fields=output_fields,
_async=_async,
check_task=CheckTasks.check_search_results,
check_items={"nq": default_nq,
"ids": insert_ids,
"limit": default_limit,
"_async": _async})
@pytest.mark.tags(CaseLabel.L2)
def test_search_string_field_not_primary_prefix(self, _async):
"""
target: test search with string expr and string field is not primary
method: create collection and insert data
create index and collection load
collection search uses string expr in string field, string field is not primary
expected: Search successfully
"""
# 1. initialize with data
auto_id = False
collection_w, _, _, insert_ids = \
self.init_collection_general(
prefix, True, auto_id=auto_id, dim=default_dim, is_index=False)[0:4]
index_param = {"index_type": "IVF_FLAT",
"metric_type": "L2", "params": {"nlist": 100}}
collection_w.create_index("float_vector", index_param, index_name="a")
index_param_two = {}
collection_w.create_index("varchar", index_param_two, index_name="b")
collection_w.load()
# 2. search
log.info("test_search_string_field_not_primary: searching collection %s" %
collection_w.name)
vectors = [[random.random() for _ in range(default_dim)]
for _ in range(default_nq)]
output_fields = [default_float_field_name, default_string_field_name]
collection_w.search(vectors[:default_nq], default_search_field,
# search all buckets
{"metric_type": "L2", "params": {
"nprobe": 100}}, default_limit,
perfix_expr,
output_fields=output_fields,
_async=_async,
check_task=CheckTasks.check_search_results,
check_items={"nq": default_nq,
"ids": insert_ids,
"limit": 1,
"_async": _async}
)
@pytest.mark.tags(CaseLabel.L2)
def test_search_string_field_index(self, _async):
"""
target: test search with string expr and string field is not primary
method: create collection and insert data
create index and collection load
collection search uses string expr in string field, string field is not primary
expected: Search successfully
"""
# 1. initialize with data
auto_id = True
collection_w, _, _, insert_ids = \
self.init_collection_general(
prefix, True, auto_id=auto_id, dim=default_dim, is_index=False)[0:4]
index_param = {"index_type": "IVF_FLAT",
"metric_type": "L2", "params": {"nlist": 100}}
collection_w.create_index("float_vector", index_param, index_name="a")
index_param = {"index_type": "Trie", "params": {}}
collection_w.create_index("varchar", index_param, index_name="b")
collection_w.load()
# 2. search
log.info("test_search_string_field_not_primary: searching collection %s" %
collection_w.name)
vectors = [[random.random() for _ in range(default_dim)]
for _ in range(default_nq)]
output_fields = [default_float_field_name, default_string_field_name]
collection_w.search(vectors[:default_nq], default_search_field,
# search all buckets
{"metric_type": "L2", "params": {
"nprobe": 100}}, default_limit,
perfix_expr,
output_fields=output_fields,
_async=_async,
check_task=CheckTasks.check_search_results,
check_items={"nq": default_nq,
"ids": insert_ids,
"limit": 1,
"_async": _async}
)
@pytest.mark.tags(CaseLabel.L1)
def test_search_all_index_with_compare_expr(self, _async):
"""
        target: test search with a compare expression after creating indexes on all fields
        method: 1. create a collection with a string primary field and insert data
                2. create a vector index and a default string index, then load
                3. search with the compare expression "float >= int64"
        expected: search succeeds and returns results filtered by the compare expression
        """
        # create a collection with a string primary field, insert data and load
collection_w, vectors, _, insert_ids = self.init_collection_general(prefix, insert_data=True,
primary_field=ct.default_string_field_name,
is_index=False)[0:4]
# create index
index_params_one = {"index_type": "IVF_SQ8",
"metric_type": "COSINE", "params": {"nlist": 64}}
collection_w.create_index(
ct.default_float_vec_field_name, index_params_one, index_name=index_name1)
index_params_two = {}
collection_w.create_index(
ct.default_string_field_name, index_params=index_params_two, index_name=index_name2)
assert collection_w.has_index(index_name=index_name2)
collection_w.release()
collection_w.load()
        # compare expression between the float and int64 scalar fields
        expr = 'float >= int64'
        # generate random query vectors
vectors = [[random.random() for _ in range(default_dim)]
for _ in range(default_nq)]
output_fields = [default_int64_field_name,
default_float_field_name, default_string_field_name]
collection_w.search(vectors[:default_nq], default_search_field,
default_search_params, default_limit,
expr,
output_fields=output_fields,
_async=_async,
check_task=CheckTasks.check_search_results,
check_items={"nq": default_nq,
"ids": insert_ids,
"limit": default_limit,
"_async": _async})
@pytest.mark.tags(CaseLabel.L2)
def test_search_string_field_is_primary_insert_empty(self, _async):
"""
target: test search with string expr and string field is primary
method: create collection ,string field is primary
collection load and insert data
collection search uses string expr in string field
expected: Search successfully
"""
# 1. initialize with data
collection_w, _, _, _ = \
self.init_collection_general(
prefix, False, primary_field=ct.default_string_field_name)[0:4]
nb = 3000
data = cf.gen_default_list_data(nb)
data[2] = ["" for _ in range(nb)]
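        # column 2 of gen_default_list_data is assumed to be the varchar field; overwriting it
        # with empty strings makes the later expression varchar >= "" match every entity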
collection_w.insert(data=data)
collection_w.load()
search_string_exp = "varchar >= \"\""
limit = 1
# 2. search
log.info("test_search_string_field_is_primary_true: searching collection %s" %
collection_w.name)
vectors = [[random.random() for _ in range(default_dim)]
for _ in range(default_nq)]
output_fields = [default_string_field_name, default_float_field_name]
collection_w.search(vectors[:default_nq], default_search_field,
default_search_params, limit,
search_string_exp,
output_fields=output_fields,
_async=_async,
check_task=CheckTasks.check_search_results,
check_items={"nq": default_nq,
"limit": limit,
"_async": _async})
@pytest.mark.tags(CaseLabel.L2)
def test_search_string_field_not_primary_is_empty(self, _async):
"""
target: test search with string expr and string field is not primary
method: create collection and insert data
create index and collection load
collection search uses string expr in string field, string field is not primary
expected: Search successfully
"""
# 1. initialize with data
collection_w, _, _, _ = \
self.init_collection_general(
prefix, False, primary_field=ct.default_int64_field_name, is_index=False)[0:4]
nb = 3000
data = cf.gen_default_list_data(nb)
insert_ids = data[0]
data[2] = ["" for _ in range(nb)]
collection_w.insert(data)
assert collection_w.num_entities == nb
# 2. create index
index_param = {"index_type": "IVF_FLAT",
"metric_type": "COSINE", "params": {"nlist": 100}}
collection_w.create_index("float_vector", index_param)
collection_w.load()
search_string_exp = "varchar >= \"\""
# 3. search
log.info("test_search_string_field_not_primary: searching collection %s" %
collection_w.name)
vectors = [[random.random() for _ in range(default_dim)]
for _ in range(default_nq)]
output_fields = [default_string_field_name, default_float_field_name]
collection_w.search(vectors[:default_nq], default_search_field,
default_search_params, default_limit,
search_string_exp,
output_fields=output_fields,
_async=_async,
check_task=CheckTasks.check_search_results,
check_items={"nq": default_nq,
"ids": insert_ids,
"limit": default_limit,
"_async": _async})
@pytest.mark.tags(CaseLabel.L2)
def test_search_string_different_language(self):
"""
target: test search with string expr using different language
method: create collection and insert data
create index and collection load
collection search uses string expr in string field
expected: Search successfully
"""
# 1. initialize with data
_async = random.choice([True, False])
auto_id = random.choice([True, False])
enable_dynamic_field = random.choice([True, False])
all_language = ["English", "French", "Spanish", "German", "Italian", "Portuguese", "Russian", "Chinese",
"Japanese", "Arabic", "Hindi"]
language = random.choice(all_language)
log.info(f"_async: {_async}, auto_id: {auto_id}, enable_dynamic_field: {enable_dynamic_field},"
f"language: {language}")
collection_w, insert_data, _, insert_ids = \
self.init_collection_general(prefix, True, auto_id=auto_id, nb=100,
enable_dynamic_field=enable_dynamic_field, language=language)[0:4]
search_str = insert_data[0][default_string_field_name][1] if not enable_dynamic_field \
else insert_data[0][1][default_string_field_name]
search_exp = f"{default_string_field_name} == '{search_str}'"
# 2. search
log.info("test_search_string_field_not_primary: searching collection %s" % collection_w.name)
log.info("search expr: %s" % search_exp)
output_fields = [default_string_field_name, default_float_field_name]
res, _ = collection_w.search(vectors[:default_nq], default_search_field,
default_search_params, default_limit, search_exp,
output_fields=output_fields,
_async=_async,
check_task=CheckTasks.check_search_results,
check_items={"nq": default_nq,
"ids": insert_ids,
"limit": 1,
"_async": _async})
if _async:
res.done()
res = res.result()
assert res[0][0].entity.varchar == search_str

View File

@ -0,0 +1,409 @@
import numpy as np
from pymilvus.orm.types import CONSISTENCY_STRONG, CONSISTENCY_BOUNDED, CONSISTENCY_SESSION, CONSISTENCY_EVENTUALLY
from pymilvus import AnnSearchRequest, RRFRanker, WeightedRanker
from pymilvus import (
FieldSchema, CollectionSchema, DataType,
Collection
)
from common.constants import *
from utils.util_pymilvus import *
from common.common_type import CaseLabel, CheckTasks
from common import common_type as ct
from common import common_func as cf
from utils.util_log import test_log as log
from base.client_base import TestcaseBase
import heapq
from time import sleep
from decimal import Decimal, getcontext
import decimal
import multiprocessing
import numbers
import random
import math
import numpy
import threading
import pytest
import pandas as pd
from faker import Faker
Faker.seed(19530)
fake_en = Faker("en_US")
fake_zh = Faker("zh_CN")
# patch faker to generate text with specific distribution
cf.patch_faker_text(fake_en, cf.en_vocabularies_distribution)
cf.patch_faker_text(fake_zh, cf.zh_vocabularies_distribution)
pd.set_option("expand_frame_repr", False)
prefix = "search_collection"
search_num = 10
max_dim = ct.max_dim
min_dim = ct.min_dim
epsilon = ct.epsilon
hybrid_search_epsilon = 0.01
gracefulTime = ct.gracefulTime
default_nb = ct.default_nb
default_nb_medium = ct.default_nb_medium
default_nq = ct.default_nq
default_dim = ct.default_dim
default_limit = ct.default_limit
max_limit = ct.max_limit
default_search_exp = "int64 >= 0"
default_search_string_exp = "varchar >= \"0\""
default_search_mix_exp = "int64 >= 0 && varchar >= \"0\""
default_invaild_string_exp = "varchar >= 0"
default_json_search_exp = "json_field[\"number\"] >= 0"
perfix_expr = 'varchar like "0%"'
default_search_field = ct.default_float_vec_field_name
default_search_params = ct.default_search_params
default_int64_field_name = ct.default_int64_field_name
default_float_field_name = ct.default_float_field_name
default_bool_field_name = ct.default_bool_field_name
default_string_field_name = ct.default_string_field_name
default_json_field_name = ct.default_json_field_name
default_index_params = ct.default_index
vectors = [[random.random() for _ in range(default_dim)] for _ in range(default_nq)]
range_search_supported_indexes = ct.all_index_types[:7]
uid = "test_search"
nq = 1
epsilon = 0.001
field_name = default_float_vec_field_name
binary_field_name = default_binary_vec_field_name
search_param = {"nprobe": 1}
entity = gen_entities(1, is_normal=True)
entities = gen_entities(default_nb, is_normal=True)
raw_vectors, binary_entities = gen_binary_entities(default_nb)
default_query, _ = gen_search_vectors_params(field_name, entities, default_top_k, nq)
index_name1 = cf.gen_unique_str("float")
index_name2 = cf.gen_unique_str("varchar")
half_nb = ct.default_nb // 2
max_hybrid_search_req_num = ct.max_hybrid_search_req_num
class TestSearchWithTextMatchFilter(TestcaseBase):
"""
******************************************************************
The following cases are used to test query text match
******************************************************************
"""
@pytest.mark.tags(CaseLabel.L0)
@pytest.mark.parametrize("enable_partition_key", [True, False])
@pytest.mark.parametrize("enable_inverted_index", [True, False])
@pytest.mark.parametrize("tokenizer", ["standard"])
def test_search_with_text_match_filter_normal_en(
self, tokenizer, enable_inverted_index, enable_partition_key
):
"""
target: test text match normal
method: 1. enable text match and insert data with varchar
2. get the most common words and query with text match
3. verify the result
expected: text match successfully and result is correct
"""
analyzer_params = {
"tokenizer": tokenizer,
}
dim = 128
fields = [
FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
FieldSchema(
name="word",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
enable_match=True,
is_partition_key=enable_partition_key,
analyzer_params=analyzer_params,
),
FieldSchema(
name="sentence",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
enable_match=True,
analyzer_params=analyzer_params,
),
FieldSchema(
name="paragraph",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
enable_match=True,
analyzer_params=analyzer_params,
),
FieldSchema(
name="text",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
enable_match=True,
analyzer_params=analyzer_params,
),
FieldSchema(name="float32_emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
FieldSchema(name="sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
]
schema = CollectionSchema(fields=fields, description="test collection")
data_size = 5000
collection_w = self.init_collection_wrap(
name=cf.gen_unique_str(prefix), schema=schema
)
log.info(f"collection {collection_w.describe()}")
fake = fake_en
if tokenizer == "jieba":
language = "zh"
fake = fake_zh
else:
language = "en"
data = [
{
"id": i,
"word": fake.word().lower(),
"sentence": fake.sentence().lower(),
"paragraph": fake.paragraph().lower(),
"text": fake.text().lower(),
"float32_emb": [random.random() for _ in range(dim)],
"sparse_emb": cf.gen_sparse_vectors(1, dim=10000)[0],
}
for i in range(data_size)
]
df = pd.DataFrame(data)
log.info(f"dataframe\n{df}")
batch_size = 5000
for i in range(0, len(df), batch_size):
collection_w.insert(
data[i: i + batch_size]
if i + batch_size < len(df)
else data[i: len(df)]
)
collection_w.flush()
collection_w.create_index(
"float32_emb",
{"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
)
collection_w.create_index(
"sparse_emb",
{"index_type": "SPARSE_INVERTED_INDEX", "metric_type": "IP"},
)
if enable_inverted_index:
collection_w.create_index("word", {"index_type": "INVERTED"})
collection_w.load()
        # analyze the corpus
text_fields = ["word", "sentence", "paragraph", "text"]
wf_map = {}
for field in text_fields:
wf_map[field] = cf.analyze_documents(df[field].tolist(), language=language)
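        # analyze_documents is assumed to return a per-field word-frequency counter;
        # the most common tokens drive the text_match filters below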
# search with filter single field for one token
df_split = cf.split_dataframes(df, text_fields, language=language)
log.info(f"df_split\n{df_split}")
for ann_field in ["float32_emb", "sparse_emb"]:
log.info(f"ann_field {ann_field}")
if ann_field == "float32_emb":
search_data = [[random.random() for _ in range(dim)]]
elif ann_field == "sparse_emb":
search_data = cf.gen_sparse_vectors(1, dim=10000)
else:
search_data = [[random.random() for _ in range(dim)]]
for field in text_fields:
token = wf_map[field].most_common()[0][0]
expr = f"text_match({field}, '{token}')"
manual_result = df_split[
df_split.apply(lambda row: token in row[field], axis=1)
]
log.info(f"expr: {expr}, manual_check_result: {len(manual_result)}")
res_list, _ = collection_w.search(
data=search_data,
anns_field=ann_field,
param={},
limit=100,
expr=expr, output_fields=["id", field])
for res in res_list:
log.info(f"res len {len(res)} res {res}")
assert len(res) > 0
for r in res:
r = r.to_dict()
assert token in r["entity"][field]
# search with filter single field for multi-token
for field in text_fields:
# match top 10 most common words
top_10_tokens = []
for word, count in wf_map[field].most_common(10):
top_10_tokens.append(word)
string_of_top_10_words = " ".join(top_10_tokens)
expr = f"text_match({field}, '{string_of_top_10_words}')"
log.info(f"expr {expr}")
res_list, _ = collection_w.search(
data=search_data,
anns_field=ann_field,
param={},
limit=100,
expr=expr, output_fields=["id", field])
for res in res_list:
log.info(f"res len {len(res)} res {res}")
assert len(res) > 0
for r in res:
r = r.to_dict()
assert any([token in r["entity"][field] for token in top_10_tokens])
@pytest.mark.tags(CaseLabel.L0)
@pytest.mark.parametrize("enable_partition_key", [True, False])
@pytest.mark.parametrize("enable_inverted_index", [True, False])
@pytest.mark.parametrize("tokenizer", ["jieba"])
@pytest.mark.xfail(reason="unstable case")
def test_search_with_text_match_filter_normal_zh(
self, tokenizer, enable_inverted_index, enable_partition_key
):
"""
target: test text match normal
method: 1. enable text match and insert data with varchar
2. get the most common words and query with text match
3. verify the result
expected: text match successfully and result is correct
"""
analyzer_params = {
"tokenizer": tokenizer,
}
dim = 128
fields = [
FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
FieldSchema(
name="word",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
enable_match=True,
is_partition_key=enable_partition_key,
analyzer_params=analyzer_params,
),
FieldSchema(
name="sentence",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
enable_match=True,
analyzer_params=analyzer_params,
),
FieldSchema(
name="paragraph",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
enable_match=True,
analyzer_params=analyzer_params,
),
FieldSchema(
name="text",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
enable_match=True,
analyzer_params=analyzer_params,
),
FieldSchema(name="float32_emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
FieldSchema(name="sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
]
schema = CollectionSchema(fields=fields, description="test collection")
data_size = 5000
collection_w = self.init_collection_wrap(
name=cf.gen_unique_str(prefix), schema=schema
)
log.info(f"collection {collection_w.describe()}")
fake = fake_en
if tokenizer == "jieba":
language = "zh"
fake = fake_zh
else:
language = "en"
data = [
{
"id": i,
"word": fake.word().lower(),
"sentence": fake.sentence().lower(),
"paragraph": fake.paragraph().lower(),
"text": fake.text().lower(),
"float32_emb": [random.random() for _ in range(dim)],
"sparse_emb": cf.gen_sparse_vectors(1, dim=10000)[0],
}
for i in range(data_size)
]
df = pd.DataFrame(data)
log.info(f"dataframe\n{df}")
batch_size = 5000
for i in range(0, len(df), batch_size):
collection_w.insert(
data[i : i + batch_size]
if i + batch_size < len(df)
else data[i : len(df)]
)
collection_w.flush()
collection_w.create_index(
"float32_emb",
{"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
)
collection_w.create_index(
"sparse_emb",
{"index_type": "SPARSE_INVERTED_INDEX", "metric_type": "IP"},
)
if enable_inverted_index:
collection_w.create_index("word", {"index_type": "INVERTED"})
collection_w.load()
        # analyze the corpus
text_fields = ["word", "sentence", "paragraph", "text"]
wf_map = {}
for field in text_fields:
wf_map[field] = cf.analyze_documents(df[field].tolist(), language=language)
# search with filter single field for one token
df_split = cf.split_dataframes(df, text_fields, language=language)
log.info(f"df_split\n{df_split}")
for ann_field in ["float32_emb", "sparse_emb"]:
log.info(f"ann_field {ann_field}")
if ann_field == "float32_emb":
search_data = [[random.random() for _ in range(dim)]]
elif ann_field == "sparse_emb":
                search_data = cf.gen_sparse_vectors(1, dim=10000)
else:
search_data = [[random.random() for _ in range(dim)]]
for field in text_fields:
token = wf_map[field].most_common()[0][0]
expr = f"text_match({field}, '{token}')"
manual_result = df_split[
df_split.apply(lambda row: token in row[field], axis=1)
]
log.info(f"expr: {expr}, manual_check_result: {len(manual_result)}")
res_list, _ = collection_w.search(
data=search_data,
anns_field=ann_field,
param={},
limit=100,
expr=expr, output_fields=["id", field])
for res in res_list:
log.info(f"res len {len(res)} res {res}")
assert len(res) > 0
for r in res:
r = r.to_dict()
assert token in r["entity"][field]
# search with filter single field for multi-token
for field in text_fields:
# match top 10 most common words
top_10_tokens = []
for word, count in wf_map[field].most_common(10):
top_10_tokens.append(word)
string_of_top_10_words = " ".join(top_10_tokens)
expr = f"text_match({field}, '{string_of_top_10_words}')"
log.info(f"expr {expr}")
res_list, _ = collection_w.search(
data=search_data,
anns_field=ann_field,
param={},
limit=100,
expr=expr, output_fields=["id", field])
for res in res_list:
log.info(f"res len {len(res)} res {res}")
assert len(res) > 0
for r in res:
r = r.to_dict()
assert any([token in r["entity"][field] for token in top_10_tokens])

File diff suppressed because it is too large

View File

@ -0,0 +1,296 @@
import numpy as np
from pymilvus.orm.types import CONSISTENCY_STRONG, CONSISTENCY_BOUNDED, CONSISTENCY_SESSION, CONSISTENCY_EVENTUALLY
from pymilvus import AnnSearchRequest, RRFRanker, WeightedRanker
from pymilvus import (
FieldSchema, CollectionSchema, DataType,
Collection
)
from common.constants import *
from utils.util_pymilvus import *
from common.common_type import CaseLabel, CheckTasks
from common import common_type as ct
from common import common_func as cf
from utils.util_log import test_log as log
from base.client_base import TestcaseBase
import heapq
from time import sleep
from decimal import Decimal, getcontext
import decimal
import multiprocessing
import numbers
import random
import math
import numpy
import threading
import pytest
import pandas as pd
from faker import Faker
Faker.seed(19530)
fake_en = Faker("en_US")
fake_zh = Faker("zh_CN")
# patch faker to generate text with specific distribution
cf.patch_faker_text(fake_en, cf.en_vocabularies_distribution)
cf.patch_faker_text(fake_zh, cf.zh_vocabularies_distribution)
pd.set_option("expand_frame_repr", False)
prefix = "search_collection"
search_num = 10
max_dim = ct.max_dim
min_dim = ct.min_dim
epsilon = ct.epsilon
hybrid_search_epsilon = 0.01
gracefulTime = ct.gracefulTime
default_nb = ct.default_nb
default_nb_medium = ct.default_nb_medium
default_nq = ct.default_nq
default_dim = ct.default_dim
default_limit = ct.default_limit
max_limit = ct.max_limit
default_search_exp = "int64 >= 0"
default_search_string_exp = "varchar >= \"0\""
default_search_mix_exp = "int64 >= 0 && varchar >= \"0\""
default_invaild_string_exp = "varchar >= 0"
default_json_search_exp = "json_field[\"number\"] >= 0"
perfix_expr = 'varchar like "0%"'
default_search_field = ct.default_float_vec_field_name
default_search_params = ct.default_search_params
default_int64_field_name = ct.default_int64_field_name
default_float_field_name = ct.default_float_field_name
default_bool_field_name = ct.default_bool_field_name
default_string_field_name = ct.default_string_field_name
default_json_field_name = ct.default_json_field_name
default_index_params = ct.default_index
vectors = [[random.random() for _ in range(default_dim)] for _ in range(default_nq)]
range_search_supported_indexes = ct.all_index_types[:7]
uid = "test_search"
nq = 1
epsilon = 0.001
field_name = default_float_vec_field_name
binary_field_name = default_binary_vec_field_name
search_param = {"nprobe": 1}
entity = gen_entities(1, is_normal=True)
entities = gen_entities(default_nb, is_normal=True)
raw_vectors, binary_entities = gen_binary_entities(default_nb)
default_query, _ = gen_search_vectors_params(field_name, entities, default_top_k, nq)
index_name1 = cf.gen_unique_str("float")
index_name2 = cf.gen_unique_str("varchar")
half_nb = ct.default_nb // 2
max_hybrid_search_req_num = ct.max_hybrid_search_req_num
class TestSparseSearch(TestcaseBase):
""" Add some test cases for the sparse vector """
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("index", ct.all_index_types[9:11])
@pytest.mark.parametrize("inverted_index_algo", ct.inverted_index_algo)
def test_sparse_index_search(self, index, inverted_index_algo):
"""
target: verify that sparse index for sparse vectors can be searched properly
method: create connection, collection, insert and search
expected: search successfully
"""
self._connect()
c_name = cf.gen_unique_str(prefix)
schema = cf.gen_default_sparse_schema(auto_id=False)
collection_w = self.init_collection_wrap(c_name, schema=schema)
data = cf.gen_default_list_sparse_data(nb=3000)
collection_w.insert(data)
params = cf.get_index_params_params(index)
params.update({"inverted_index_algo": inverted_index_algo})
index_params = {"index_type": index, "metric_type": "IP", "params": params}
collection_w.create_index(ct.default_sparse_vec_field_name, index_params, index_name=index)
collection_w.load()
_params = cf.get_search_params_params(index)
_params.update({"dim_max_score_ratio": 1.05})
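        # dim_max_score_ratio is a search-time tuning knob for sparse indexes; 1.05 is assumed
        # to sit inside the allowed [0.5, 1.3] range exercised in test_search_sparse_ratio below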
search_params = {"params": _params}
collection_w.search(data[-1][0:default_nq], ct.default_sparse_vec_field_name,
search_params, default_limit,
output_fields=[ct.default_sparse_vec_field_name],
check_task=CheckTasks.check_search_results,
check_items={"nq": default_nq,
"limit": default_limit,
"original_entities": [data],
"output_fields": [ct.default_sparse_vec_field_name]})
expr = "int64 < 100 "
collection_w.search(data[-1][0:default_nq], ct.default_sparse_vec_field_name,
search_params, default_limit,
expr=expr, output_fields=[ct.default_sparse_vec_field_name],
check_task=CheckTasks.check_search_results,
check_items={"nq": default_nq,
"limit": default_limit,
"original_entities": [data],
"output_fields": [ct.default_sparse_vec_field_name]})
@pytest.mark.tags(CaseLabel.L2)
@pytest.mark.parametrize("index", ct.all_index_types[9:11])
@pytest.mark.parametrize("dim", [32768, ct.max_sparse_vector_dim])
def test_sparse_index_dim(self, index, dim):
"""
target: validating the sparse index in different dimensions
method: create connection, collection, insert and hybrid search
expected: search successfully
"""
self._connect()
c_name = cf.gen_unique_str(prefix)
schema = cf.gen_default_sparse_schema(auto_id=False)
collection_w = self.init_collection_wrap(c_name, schema=schema)
data = cf.gen_default_list_sparse_data(dim=dim)
collection_w.insert(data)
params = cf.get_index_params_params(index)
index_params = {"index_type": index, "metric_type": "IP", "params": params}
collection_w.create_index(ct.default_sparse_vec_field_name, index_params, index_name=index)
collection_w.load()
collection_w.search(data[-1][0:default_nq], ct.default_sparse_vec_field_name,
ct.default_sparse_search_params, limit=1,
check_task=CheckTasks.check_search_results,
check_items={"nq": default_nq,
"limit": 1})
@pytest.mark.tags(CaseLabel.L2)
@pytest.mark.parametrize("index", ct.all_index_types[9:11])
@pytest.mark.parametrize("inverted_index_algo", ct.inverted_index_algo)
def test_sparse_index_enable_mmap_search(self, index, inverted_index_algo):
"""
target: verify that the sparse indexes of sparse vectors can be searched properly after turning on mmap
method: create connection, collection, enable mmap, insert and search
expected: search successfully , query result is correct
"""
self._connect()
c_name = cf.gen_unique_str(prefix)
schema = cf.gen_default_sparse_schema(auto_id=False)
collection_w = self.init_collection_wrap(c_name, schema=schema)
first_nb = 3000
data = cf.gen_default_list_sparse_data(nb=first_nb, start=0)
collection_w.insert(data)
params = cf.get_index_params_params(index)
params.update({"inverted_index_algo": inverted_index_algo})
index_params = {"index_type": index, "metric_type": "IP", "params": params}
collection_w.create_index(ct.default_sparse_vec_field_name, index_params, index_name=index)
collection_w.set_properties({'mmap.enabled': True})
pro = collection_w.describe()[0].get("properties")
assert pro["mmap.enabled"] == 'True'
collection_w.alter_index(index, {'mmap.enabled': True})
assert collection_w.index()[0].params["mmap.enabled"] == 'True'
        data2 = cf.gen_default_list_sparse_data(nb=2000, start=first_nb)  # ids continue from the first batch
        all_data = []  # combine the two inserted batches for the later result check
for i in range(len(data2)):
all_data.append(data[i] + data2[i])
collection_w.insert(data2)
collection_w.flush()
collection_w.load()
collection_w.search(data[-1][0:default_nq], ct.default_sparse_vec_field_name,
ct.default_sparse_search_params, default_limit,
output_fields=[ct.default_sparse_vec_field_name],
check_task=CheckTasks.check_search_results,
check_items={"nq": default_nq,
"limit": default_limit,
"original_entities": [all_data],
"output_fields": [ct.default_sparse_vec_field_name]})
expr_id_list = [0, 1, 10, 100]
term_expr = f'{ct.default_int64_field_name} in {expr_id_list}'
res = collection_w.query(term_expr)[0]
assert len(res) == 4
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("drop_ratio_build", [0.01])
@pytest.mark.parametrize("index", ct.all_index_types[9:11])
def test_search_sparse_ratio(self, drop_ratio_build, index):
"""
target: create a sparse index by adjusting the ratio parameter.
method: create a sparse index by adjusting the ratio parameter.
expected: search successfully
"""
self._connect()
c_name = cf.gen_unique_str(prefix)
schema = cf.gen_default_sparse_schema(auto_id=False)
collection_w = self.init_collection_wrap(c_name, schema=schema)
data = cf.gen_default_list_sparse_data(nb=4000)
collection_w.insert(data)
collection_w.flush()
params = {"index_type": index, "metric_type": "IP", "params": {"drop_ratio_build": drop_ratio_build}}
collection_w.create_index(ct.default_sparse_vec_field_name, params, index_name=index)
collection_w.load()
assert collection_w.has_index(index_name=index)[0] is True
_params = {"drop_ratio_search": 0.2}
for dim_max_score_ratio in [0.5, 0.99, 1, 1.3]:
_params.update({"dim_max_score_ratio": dim_max_score_ratio})
search_params = {"metric_type": "IP", "params": _params}
collection_w.search(data[-1][0:default_nq], ct.default_sparse_vec_field_name,
search_params, default_limit,
check_task=CheckTasks.check_search_results,
check_items={"nq": default_nq,
"limit": default_limit})
error = {ct.err_code: 999,
ct.err_msg: "should be in range [0.500000, 1.300000]"}
for invalid_ratio in [0.49, 1.4]:
_params.update({"dim_max_score_ratio": invalid_ratio})
search_params = {"metric_type": "IP", "params": _params}
collection_w.search(data[-1][0:default_nq], ct.default_sparse_vec_field_name,
search_params, default_limit,
check_task=CheckTasks.err_res,
check_items=error)
@pytest.mark.tags(CaseLabel.L2)
@pytest.mark.parametrize("index", ct.all_index_types[9:11])
def test_sparse_vector_search_output_field(self, index):
"""
target: create sparse vectors and search
method: create sparse vectors and search
expected: normal search
"""
self._connect()
c_name = cf.gen_unique_str(prefix)
schema = cf.gen_default_sparse_schema()
collection_w = self.init_collection_wrap(c_name, schema=schema)
data = cf.gen_default_list_sparse_data(nb=4000)
collection_w.insert(data)
params = cf.get_index_params_params(index)
index_params = {"index_type": index, "metric_type": "IP", "params": params}
collection_w.create_index(ct.default_sparse_vec_field_name, index_params, index_name=index)
collection_w.load()
d = cf.gen_default_list_sparse_data(nb=10)
collection_w.search(d[-1][0:default_nq], ct.default_sparse_vec_field_name,
ct.default_sparse_search_params, default_limit,
output_fields=["float", "sparse_vector"],
check_task=CheckTasks.check_search_results,
check_items={"nq": default_nq,
"limit": default_limit,
"output_fields": ["float", "sparse_vector"]
})
@pytest.mark.tags(CaseLabel.L2)
@pytest.mark.parametrize("index", ct.all_index_types[9:11])
@pytest.mark.parametrize("inverted_index_algo", ct.inverted_index_algo)
def test_sparse_vector_search_iterator(self, index, inverted_index_algo):
"""
target: create sparse vectors and search iterator
method: create sparse vectors and search iterator
expected: normal search
"""
self._connect()
c_name = cf.gen_unique_str(prefix)
schema = cf.gen_default_sparse_schema()
collection_w = self.init_collection_wrap(c_name, schema=schema)
data = cf.gen_default_list_sparse_data(nb=4000)
collection_w.insert(data)
params = cf.get_index_params_params(index)
params.update({"inverted_index_algo": inverted_index_algo})
index_params = {"index_type": index, "metric_type": "IP", "params": params}
collection_w.create_index(ct.default_sparse_vec_field_name, index_params, index_name=index)
collection_w.load()
batch_size = 100
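        # search_iterator pages through up to `limit` results, returning at most
        # batch_size hits per page; the checker verifies the paging behavior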
collection_w.search_iterator(data[-1][0:1], ct.default_sparse_vec_field_name,
ct.default_sparse_search_params, limit=500, batch_size=batch_size,
check_task=CheckTasks.check_search_iterator,
check_items={"batch_size": batch_size})

File diff suppressed because it is too large

View File

@ -1,5 +1,5 @@
[pytest]
addopts = --strict --endpoint http://127.0.0.1:19530 --token root:Milvus --minio_host 127.0.0.1
addopts = --strict --endpoint http://10.104.19.195:19530 --token root:Milvus --minio_host 10.104.32.27
log_format = [%(asctime)s - %(levelname)s - %(name)s]: %(message)s (%(filename)s:%(lineno)s)
log_date_format = %Y-%m-%d %H:%M:%S