test: Add group search test on sparse vectors (#36562)

related issue: #36295
add one test for alter alias

Signed-off-by: yanliang567 <yanliang.qiao@zilliz.com>
pull/36469/head
yanliang567 2024-09-27 14:15:14 +08:00 committed by GitHub
parent 7c2cb8c5d4
commit e6c7fd6605
3 changed files with 92 additions and 270 deletions
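Not part of the diff: a minimal pymilvus sketch of the alter-alias flow that the new test_alias_alter_operation_default below exercises, assuming a Milvus instance at the default local endpoint; the collection names, field names and dimension are illustrative only.

import random
from pymilvus import (CollectionSchema, FieldSchema, DataType,
                      Collection, connections, utility)

connections.connect("default", host="localhost", port="19530")  # assumed local Milvus

def build_collection(name, nb, dim=8):
    # hypothetical helper: create a small collection, insert nb entities, index and load it
    schema = CollectionSchema([
        FieldSchema("pk", DataType.INT64, is_primary=True, auto_id=True),
        FieldSchema("vec", DataType.FLOAT_VECTOR, dim=dim),
    ])
    coll = Collection(name, schema)
    coll.insert([[[random.random() for _ in range(dim)] for _ in range(nb)]])
    coll.create_index("vec", {"index_type": "IVF_FLAT", "metric_type": "L2", "params": {"nlist": 128}})
    coll.load()
    return coll

coll1 = build_collection("alias_demo_collection_1", nb=2000)
coll2 = build_collection("alias_demo_collection_2", nb=1500)

alias_name = "alias_demo"
utility.create_alias(coll1.name, alias_name)
# the alias currently resolves to collection_1
assert Collection(alias_name).query(expr="", output_fields=["count(*)"])[0]["count(*)"] == 2000

# re-point the alias to collection_2; the same alias name now resolves there
utility.alter_alias(coll2.name, alias_name)
assert Collection(alias_name).query(expr="", output_fields=["count(*)"])[0]["count(*)"] == 1500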


@ -22,15 +22,10 @@ default_search_field = ct.default_float_vec_field_name
default_search_params = ct.default_search_params
class TestAliasParams(TestcaseBase):
""" Test cases of alias interface parameters"""
pass
class TestAliasParamsInvalid(TestcaseBase):
""" Negative test cases of alias interface parameters"""
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.tags(CaseLabel.L2)
@pytest.mark.parametrize("alias_name", ["12-s", "12 s", "(mn)", "中文", "%$#", "a".join("a" for i in range(256))])
def test_alias_create_alias_with_invalid_name(self, alias_name):
"""
@ -52,6 +47,59 @@ class TestAliasParamsInvalid(TestcaseBase):
class TestAliasOperation(TestcaseBase):
""" Test cases of alias interface operations"""
@pytest.mark.tags(CaseLabel.L0)
def test_alias_alter_operation_default(self):
"""
target: test collection altering alias
method:
1. create collection_1, bind an alias to collection_1 and insert 2000 entities
2. create collection_2 with 1500 entities
3. query count(*) on the alias
verify num_entities=2000
4. alter the alias to collection_2 and query count(*) on the alias
verify num_entities=1500
"""
c_name1 = cf.gen_unique_str("collection1")
collection_w1 = self.init_collection_wrap(name=c_name1, schema=default_schema,
check_task=CheckTasks.check_collection_property,
check_items={exp_name: c_name1, exp_schema: default_schema})
alias_name = cf.gen_unique_str(prefix)
# create a collection alias and bind to collection1
self.utility_wrap.create_alias(collection_w1.name, alias_name)
collection_alias = self.init_collection_wrap(name=alias_name)
nb1 = 2000
data1 = cf.gen_default_dataframe_data(nb=nb1)
import pandas as pd
string_values = pd.Series(data=[str(i) for i in range(nb1)], dtype="string")
data1[ct.default_string_field_name] = string_values
collection_alias.insert(data1)
collection_alias.create_index(ct.default_float_vec_field_name, ct.default_index)
collection_alias.load()
assert collection_alias.num_entities == nb1 == collection_w1.num_entities
res1 = collection_alias.query(expr="", output_fields=["count(*)"])[0]
assert res1[0].get("count(*)") == nb1
# create collection2
c_name2 = cf.gen_unique_str("collection2")
collection_w2 = self.init_collection_wrap(name=c_name2, schema=default_schema,
check_task=CheckTasks.check_collection_property,
check_items={exp_name: c_name2, exp_schema: default_schema})
nb2 = 1500
data2 = cf.gen_default_dataframe_data(nb=nb2)
string_values = pd.Series(data=[str(i) for i in range(nb2)], dtype="string")
data2[ct.default_string_field_name] = string_values
collection_w2.insert(data2)
collection_w2.create_index(ct.default_float_vec_field_name, ct.default_index)
collection_w2.load()
# alter the collection alias to collection2
self.utility_wrap.alter_alias(collection_w2.name, alias_name)
assert collection_alias.num_entities == nb2 == collection_w2.num_entities
res1 = collection_alias.query(expr="", output_fields=["count(*)"])[0]
assert res1[0].get("count(*)") == nb2
@pytest.mark.tags(CaseLabel.L1)
def test_alias_create_operation_default(self):
"""
@ -81,73 +129,6 @@ class TestAliasOperation(TestcaseBase):
assert [p.name for p in collection_w.partitions] == [
p.name for p in collection_alias.partitions]
@pytest.mark.tags(CaseLabel.L1)
def test_alias_alter_operation_default(self):
"""
target: test collection altering alias
method:
1. create collection_1 with 10 partitions and its alias alias_a
2. create collection_2 with 5 partitions and its alias alias_b
3. collection_1 alter alias to alias_b
expected:
in step 1, collection_1 is equal to alias_a
in step 2, collection_2 is equal to alias_b
in step 3, collection_1 is equal to alias_a and alias_b, and collection_2 is not equal to alias_b
"""
self._connect()
# create collection_1 with 10 partitions and its alias alias_a
c_1_name = cf.gen_unique_str("collection")
collection_1 = self.init_collection_wrap(name=c_1_name, schema=default_schema,
check_task=CheckTasks.check_collection_property,
check_items={exp_name: c_1_name, exp_schema: default_schema})
for _ in range(10):
partition_name = cf.gen_unique_str("partition")
# create a partition with a unique name and check that it exists
self.init_partition_wrap(collection_1, partition_name)
assert collection_1.has_partition(partition_name)[0]
alias_a_name = cf.gen_unique_str(prefix)
self.utility_wrap.create_alias(collection_1.name, alias_a_name)
collection_alias_a = self.init_collection_wrap(name=alias_a_name,
check_task=CheckTasks.check_collection_property,
check_items={exp_name: alias_a_name, exp_schema: default_schema})
# assert collection is equal to alias according to partitions
assert [p.name for p in collection_1.partitions] == [
p.name for p in collection_alias_a.partitions]
# create collection_2 with 5 partitions and its alias alias_b
c_2_name = cf.gen_unique_str("collection")
collection_2 = self.init_collection_wrap(name=c_2_name, schema=default_schema,
check_task=CheckTasks.check_collection_property,
check_items={exp_name: c_2_name, exp_schema: default_schema})
for _ in range(5):
partition_name = cf.gen_unique_str("partition")
# create a partition with a unique name and check that it exists
self.init_partition_wrap(collection_2, partition_name)
assert collection_2.has_partition(partition_name)[0]
alias_b_name = cf.gen_unique_str(prefix)
self.utility_wrap.create_alias(collection_2.name, alias_b_name)
collection_alias_b = self.init_collection_wrap(name=alias_b_name,
check_task=CheckTasks.check_collection_property,
check_items={exp_name: alias_b_name, exp_schema: default_schema})
# assert collection is equal to alias according to partitions
assert [p.name for p in collection_2.partitions] == [
p.name for p in collection_alias_b.partitions]
# collection_1 alter alias to alias_b
self.utility_wrap.alter_alias(collection_1.name, alias_b_name)
# collection_1 now has two aliases, alias_a and alias_b, while collection_2 no longer has any alias
assert [p.name for p in collection_1.partitions] == [
p.name for p in collection_alias_b.partitions]
assert [p.name for p in collection_1.partitions] == [
p.name for p in collection_alias_a.partitions]
assert [p.name for p in collection_2.partitions] != [
p.name for p in collection_alias_b.partitions]
@pytest.mark.tags(CaseLabel.L1)
def test_alias_drop_operation_default(self):
"""
@ -187,128 +168,7 @@ class TestAliasOperation(TestcaseBase):
check_task=CheckTasks.err_res,
check_items=error)
@pytest.mark.tags(CaseLabel.L1)
def test_alias_exec_operations_as_collection(self):
"""
target: test that collection operations can be executed via an alias:
1. creating a partition,
2. inserting data,
3. creating an index,
4. loading the collection,
5. searching,
6. releasing the collection
method: follow the steps in target, performing each operation through the alias
expected: all operations performed through the alias work
"""
create_partition_flag = True
insert_data_flag = True
create_index_flag = True
load_collection_flag = True
search_flag = True
release_collection_flag = True
self._connect()
c_name = cf.gen_unique_str("collection")
collection_w = self.init_collection_wrap(name=c_name, schema=default_schema,
check_task=CheckTasks.check_collection_property,
check_items={exp_name: c_name, exp_schema: default_schema})
alias_name = cf.gen_unique_str(prefix)
self.utility_wrap.create_alias(collection_w.name, alias_name)
# collection_w.create_alias(alias_name)
collection_alias, _ = self.collection_wrap.init_collection(name=alias_name,
check_task=CheckTasks.check_collection_property,
check_items={exp_name: alias_name,
exp_schema: default_schema})
# create partition by alias
partition_name = cf.gen_unique_str("partition")
try:
collection_alias.create_partition(partition_name)
except Exception as e:
log.info(f"alias create partition failed with exception {e}")
create_partition_flag = False
collection_w.create_partition(partition_name)
# assert partition
pytest.assume(create_partition_flag is True and
[p.name for p in collection_alias.partitions] == [p.name for p in collection_w.partitions])
# insert data by alias
df = cf.gen_default_dataframe_data(ct.default_nb)
try:
collection_alias.insert(data=df)
except Exception as e:
log.info(f"alias insert data failed with exception {e}")
insert_data_flag = False
collection_w.insert(data=df)
# assert insert data
pytest.assume(insert_data_flag is True and
collection_w.num_entities == ct.default_nb and
collection_alias.num_entities == ct.default_nb)
# create index by alias
default_index = {"index_type": "IVF_FLAT",
"params": {"nlist": 128}, "metric_type": "L2"}
try:
collection_alias.create_index(
field_name="float_vector", index_params=default_index)
except Exception as e:
log.info(f"alias create index failed with exception {e}")
create_index_flag = False
collection_w.create_index(
field_name="float_vector", index_params=default_index)
# assert create index
pytest.assume(create_index_flag is True and
collection_alias.has_index() is True and
collection_w.has_index()[0] is True)
# load by alias
try:
collection_alias.load()
except Exception as e:
log.info(f"alias load collection failed with exception {e}")
load_collection_flag = False
collection_w.load()
# assert load
pytest.assume(load_collection_flag is True)
# search by alias
topK = 5
search_params = {"metric_type": "L2", "params": {"nprobe": 10}}
query = [[random.random() for _ in range(ct.default_dim)]
for _ in range(1)]
alias_res = None
try:
alias_res = collection_alias.search(
query, "float_vector", search_params, topK,
"int64 >= 0", output_fields=["int64"]
)
except Exception as e:
log.info(f"alias search failed with exception {e}")
search_flag = False
collection_res, _ = collection_w.search(
query, "float_vector", search_params, topK,
"int64 >= 0", output_fields=["int64"]
)
# assert search
pytest.assume(
search_flag is True and alias_res[0].ids == collection_res[0].ids)
# release by alias
try:
collection_alias.release()
except Exception as e:
log.info(f"alias release failed with exception {e}")
release_collection_flag = False
collection_w.release()
# assert release
pytest.assume(release_collection_flag is True)
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.tags(CaseLabel.L2)
def test_alias_called_by_utility_has_collection(self):
"""
target: test utility has collection by alias
@ -334,7 +194,7 @@ class TestAliasOperation(TestcaseBase):
assert res is True
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.tags(CaseLabel.L2)
def test_alias_called_by_utility_drop_collection(self):
"""
target: test utility drop collection by alias
@ -365,7 +225,7 @@ class TestAliasOperation(TestcaseBase):
self.utility_wrap.drop_collection(c_name)
assert not self.utility_wrap.has_collection(c_name)[0]
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.tags(CaseLabel.L2)
def test_alias_called_by_utility_has_partition(self):
"""
target: test utility has partition by alias
@ -482,7 +342,7 @@ class TestAliasOperationInvalid(TestcaseBase):
# check_task=CheckTasks.err_res,
# check_items=error)
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.tags(CaseLabel.L2)
def test_alias_drop_not_exist_alias(self):
"""
target: test collection dropping alias which is not exist
@ -547,7 +407,7 @@ class TestAliasOperationInvalid(TestcaseBase):
# check_task=CheckTasks.err_res,
# check_items=error)
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.tags(CaseLabel.L2)
def test_alias_create_dup_name_collection(self):
"""
target: test creating a collection with a same name as alias, but a different schema
@ -571,7 +431,7 @@ class TestAliasOperationInvalid(TestcaseBase):
check_task=CheckTasks.err_res,
check_items=error)
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.tags(CaseLabel.L2)
def test_alias_drop_collection_by_alias(self):
"""
target: test dropping a collection by alias


@ -1,13 +1,16 @@
import random
import re
import math # do not remove `math`
import pytest
import random
import numpy as np
import pandas as pd
from pymilvus import DataType, AnnSearchRequest, RRFRanker, WeightedRanker
from common.common_type import CaseLabel, CheckTasks
from common import common_type as ct
from common import common_func as cf
from common import common_params as cp
from common.code_mapping import QueryErrorMessage as qem
from common.common_params import (
FieldParams, MetricType, DefaultVectorIndexParams, DefaultScalarIndexParams, Expr, AlterIndexParams
@ -1889,7 +1892,10 @@ class TestMixScenes(TestcaseBase):
class TestGroupSearch(TestCaseClassBase):
"""
Testing group search scenarios
1. collection schema: int64_pk(auto_id), float16_vector, float_vector, bfloat16_vector, varchar
1. collection schema:
int64_pk(auto_id), varchar,
float16_vector, float_vector, bfloat16_vector, sparse_vector,
inverted_varchar
2. varchar field is inserted with dup values for group by
3. index for each vector field with different index types, dims and metric types
Author: Yanliang567
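Not part of the diff: a minimal sketch of the sparse-vector group search these cases run, assuming a collection that is already indexed (SPARSE_WAND, IP metric) and loaded, with a SPARSE_FLOAT_VECTOR field named sparse_vector and a varchar field holding duplicated values; the collection and field names are illustrative only.

from pymilvus import Collection, connections

connections.connect("default", host="localhost", port="19530")  # assumed local Milvus
coll = Collection("group_search_demo")             # hypothetical, already built and loaded

query = [{3: 0.4, 17: 0.9, 256: 0.1}]              # one sparse query vector as {dim_index: value}
res = coll.search(
    data=query,
    anns_field="sparse_vector",
    param={"metric_type": "IP", "params": {"drop_ratio_search": 0.2}},
    limit=10,
    group_by_field="varchar",                      # group results by the duplicated varchar value
    group_size=3,                                  # up to 3 hits returned per group
    output_fields=["varchar"],
)
values = [hit.entity.get("varchar") for hit in res[0]]
# at most `limit` distinct group values come back; each group contributes up to group_size hits
print(len(set(values)), "groups,", len(values), "hits")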
@ -1902,34 +1908,34 @@ class TestGroupSearch(TestCaseClassBase):
# init params
self.primary_field = "int64_pk"
self.indexed_string_field = "varchar_with_index"
self.inverted_string_field = "varchar_inverted"
# create a collection with fields
self.collection_wrap.init_collection(
name=cf.gen_unique_str("test_group_search"),
name=cf.gen_unique_str("TestGroupSearch"),
schema=cf.set_collection_schema(
fields=[DataType.VARCHAR.name, self.primary_field, DataType.FLOAT16_VECTOR.name,
DataType.FLOAT_VECTOR.name, DataType.BFLOAT16_VECTOR.name,
fields=[self.primary_field, DataType.VARCHAR.name, DataType.FLOAT16_VECTOR.name,
DataType.FLOAT_VECTOR.name, DataType.BFLOAT16_VECTOR.name, DataType.SPARSE_FLOAT_VECTOR.name,
DataType.INT8.name, DataType.INT64.name, DataType.BOOL.name,
self.indexed_string_field],
self.inverted_string_field],
field_params={
self.primary_field: FieldParams(is_primary=True).to_dict,
DataType.FLOAT16_VECTOR.name: FieldParams(dim=31).to_dict,
DataType.FLOAT_VECTOR.name: FieldParams(dim=64).to_dict,
DataType.BFLOAT16_VECTOR.name: FieldParams(dim=24).to_dict,
DataType.BFLOAT16_VECTOR.name: FieldParams(dim=24).to_dict
},
auto_id=True
)
)
self.vector_fields = [DataType.FLOAT16_VECTOR.name, DataType.FLOAT_VECTOR.name, DataType.BFLOAT16_VECTOR.name]
self.dims = [31, 64, 24]
self.index_types = ["IVF_SQ8", "HNSW", "IVF_FLAT"]
self.vector_fields = [DataType.FLOAT16_VECTOR.name, DataType.FLOAT_VECTOR.name,
DataType.BFLOAT16_VECTOR.name, DataType.SPARSE_FLOAT_VECTOR.name]
self.dims = [31, 64, 24, 99]
self.index_types = [cp.IndexName.IVF_SQ8, cp.IndexName.HNSW, cp.IndexName.IVF_FLAT, cp.IndexName.SPARSE_WAND]
@pytest.fixture(scope="class", autouse=True)
def prepare_data(self):
# prepare data (insert more than 1024 entities to trigger index building)
import pandas as pd
nb = 100
for _ in range(100):
string_values = pd.Series(data=[str(i) for i in range(nb)], dtype="string")
@ -1950,8 +1956,9 @@ class TestGroupSearch(TestCaseClassBase):
**DefaultVectorIndexParams.IVF_SQ8(DataType.FLOAT16_VECTOR.name, metric_type=MetricType.L2),
**DefaultVectorIndexParams.HNSW(DataType.FLOAT_VECTOR.name, metric_type=MetricType.IP),
**DefaultVectorIndexParams.IVF_FLAT(DataType.BFLOAT16_VECTOR.name, metric_type=MetricType.COSINE),
**DefaultVectorIndexParams.SPARSE_WAND(DataType.SPARSE_FLOAT_VECTOR.name, metric_type=MetricType.IP),
# index params for varchar field
**DefaultScalarIndexParams.INVERTED(self.indexed_string_field)
**DefaultScalarIndexParams.INVERTED(self.inverted_string_field)
}
self.build_multi_index(index_params=index_params)
@ -1961,11 +1968,11 @@ class TestGroupSearch(TestCaseClassBase):
self.collection_wrap.load()
@pytest.mark.tags(CaseLabel.L0)
@pytest.mark.parametrize("group_by_field", [DataType.VARCHAR.name, "varchar_with_index"])
@pytest.mark.parametrize("group_by_field", [DataType.VARCHAR.name, "varchar_inverted"])
def test_search_group_size(self, group_by_field):
"""
target:
1. search on 3 different float vector fields with group by varchar field with group size
1. search on 4 different float vector fields with group by varchar field with group size
verify results entity = limit * group_size and group size is full if group_strict_size is True
verify results group counts = limit if group_strict_size is False
"""
@ -2005,7 +2012,7 @@ class TestGroupSearch(TestCaseClassBase):
@pytest.mark.xfail()
def test_hybrid_search_group_size(self):
"""
hybrid search group by on 3 different float vector fields with group by varchar field with group size
hybrid search group by on 4 different float vector fields with group by varchar field with group size
verify results returns with de-dup group values and group distances are in order as rank_group_scorer
"""
nq = 2
@ -2024,7 +2031,8 @@ class TestGroupSearch(TestCaseClassBase):
# 4. hybrid search group by
rank_scorers = ["max", "avg", "sum"]
for scorer in rank_scorers:
res = self.collection_wrap.hybrid_search(req_list, WeightedRanker(0.3, 0.3, 0.3), limit=limit,
res = self.collection_wrap.hybrid_search(req_list, WeightedRanker(0.1, 0.3, 0.9, 0.6),
limit=limit,
group_by_field=DataType.VARCHAR.name,
group_size=group_size, rank_group_scorer=scorer,
output_fields=[DataType.VARCHAR.name])[0]
@ -2044,16 +2052,16 @@ class TestGroupSearch(TestCaseClassBase):
group_distances.append(res[i][l + 1].distance)
else:
if scorer == 'sum':
assert np.sum(group_distances) < np.sum(tmp_distances)
assert np.sum(group_distances) <= np.sum(tmp_distances)
elif scorer == 'avg':
assert np.mean(group_distances) < np.mean(tmp_distances)
assert np.mean(group_distances) <= np.mean(tmp_distances)
else: # default max
assert np.max(group_distances) < np.max(tmp_distances)
assert np.max(group_distances) <= np.max(tmp_distances)
tmp_distances = group_distances
group_distances = [res[i][l + 1].distance]
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.tags(CaseLabel.L2)
def test_hybrid_search_group_by(self):
"""
verify hybrid search group by works with different Rankers
@ -2070,7 +2078,7 @@ class TestGroupSearch(TestCaseClassBase):
req = AnnSearchRequest(**search_param)
req_list.append(req)
# 4. hybrid search group by
res = self.collection_wrap.hybrid_search(req_list, WeightedRanker(0.1, 0.9, 0.2), ct.default_limit,
res = self.collection_wrap.hybrid_search(req_list, WeightedRanker(0.1, 0.9, 0.2, 0.3), ct.default_limit,
group_by_field=DataType.VARCHAR.name,
output_fields=[DataType.VARCHAR.name],
check_task=CheckTasks.check_search_results,
@ -2094,7 +2102,7 @@ class TestGroupSearch(TestCaseClassBase):
req = AnnSearchRequest(**search_param)
req_list.append(req)
self.collection_wrap.hybrid_search(req_list, RRFRanker(), ct.default_limit,
group_by_field=self.indexed_string_field,
group_by_field=self.inverted_string_field,
check_task=CheckTasks.check_search_results,
check_items={"nq": ct.default_nq, "limit": ct.default_limit})
@ -2148,7 +2156,7 @@ class TestGroupSearch(TestCaseClassBase):
page_rounds = 3
search_param = {}
default_search_exp = f"{self.primary_field} >= 0"
grpby_field = self.indexed_string_field
grpby_field = self.inverted_string_field
default_search_field = self.vector_fields[1]
search_vectors = cf.gen_vectors(1, dim=self.dims[1], vector_data_type=self.vector_fields[1])
all_pages_ids = []
@ -2191,7 +2199,7 @@ class TestGroupSearch(TestCaseClassBase):
page_rounds = 3
search_param = {}
default_search_exp = f"{self.primary_field} >= 0"
grpby_field = self.indexed_string_field
grpby_field = self.inverted_string_field
default_search_field = self.vector_fields[1]
search_vectors = cf.gen_vectors(1, dim=self.dims[1], vector_data_type=self.vector_fields[1])
all_pages_ids = []


@ -10558,52 +10558,6 @@ class TestSearchGroupBy(TestcaseBase):
check_task=CheckTasks.err_res,
check_items={"err_code": err_code, "err_msg": err_msg})
@pytest.mark.tags(CaseLabel.L2)
@pytest.mark.parametrize("index", ct.all_index_types[9:11])
def test_sparse_vectors_group_by(self, index):
"""
target: test search group by works on a collection with sparse vector
method: 1. create a collection
2. create index
3. grouping search
verify: search successfully
"""
self._connect()
c_name = cf.gen_unique_str(prefix)
schema = cf.gen_default_sparse_schema()
collection_w = self.init_collection_wrap(c_name, schema=schema)
nb = 5000
data = cf.gen_default_list_sparse_data(nb=nb)
# overwrite the varchar field with duplicated string values for group by
_data = [random.randint(1, 100) for _ in range(nb)]
str_data = [str(i) for i in _data]
data[2] = str_data
collection_w.insert(data)
params = cf.get_index_params_params(index)
index_params = {"index_type": index, "metric_type": "IP", "params": params}
collection_w.create_index(ct.default_sparse_vec_field_name, index_params, index_name=index)
collection_w.load()
nq = 2
limit = 20
search_params = ct.default_sparse_search_params
search_vectors = cf.gen_default_list_sparse_data(nb=nq)[-1][0:nq]
# verify the group by result
res = collection_w.search(data=search_vectors, anns_field=ct.default_sparse_vec_field_name,
param=search_params, limit=limit,
group_by_field=ct.default_string_field_name,
output_fields=[ct.default_string_field_name],
check_task=CheckTasks.check_search_results,
check_items={"nq": nq, "limit": limit})[0]
hit = res[0]
set_varchar = set()
for item in hit:
a = list(item.fields.values())
set_varchar.add(a[0])
# if group by is in effect, there are no duplicate varchar values
assert len(hit) == len(set_varchar)
class TestCollectionHybridSearchValid(TestcaseBase):
""" Test case of search interface """