mirror of https://github.com/milvus-io/milvus.git
test: add case for sparse vector and verify group by (#32217)
Add a test case for sparse vectors and verify group-by search results.
Signed-off-by: zhuwenxing <wenxing.zhu@zilliz.com>
Ref: pull/32279/head
parent
70e3d5b495
commit
fd971a434f
|
@ -586,7 +586,7 @@ class TestSearchVector(TestBase):
|
|||
for i in range(nb):
|
||||
if auto_id:
|
||||
tmp = {
|
||||
"user_id": i%100,
|
||||
"user_id": i%10,
|
||||
"word_count": i,
|
||||
"book_describe": f"book_{i}",
|
||||
"float_vector": gen_vector(datatype="FloatVector", dim=dim),
|
||||
|
@ -597,7 +597,7 @@ class TestSearchVector(TestBase):
|
|||
else:
|
||||
tmp = {
|
||||
"book_id": i,
|
||||
"user_id": i%100,
|
||||
"user_id": i%10,
|
||||
"word_count": i,
|
||||
"book_describe": f"book_{i}",
|
||||
"float_vector": gen_vector(datatype="FloatVector", dim=dim),
|
||||
|
@ -634,6 +634,9 @@ class TestSearchVector(TestBase):
|
|||
}
|
||||
rsp = self.vector_client.vector_search(payload)
|
||||
assert rsp['code'] == 200
|
||||
# assert no dup user_id
|
||||
user_ids = [r["user_id"]for r in rsp['data']]
|
||||
assert len(user_ids) == len(set(user_ids))
|
||||
|
||||
@pytest.mark.parametrize("insert_round", [1])
|
||||
@pytest.mark.parametrize("auto_id", [True])
|
||||
|
@ -721,6 +724,93 @@ class TestSearchVector(TestBase):
|
|||
assert len(rsp['data']) == 100
|
||||
|
||||
|
||||
@pytest.mark.parametrize("insert_round", [1])
@pytest.mark.parametrize("auto_id", [True])
@pytest.mark.parametrize("is_partition_key", [True])
@pytest.mark.parametrize("enable_dynamic_schema", [True])
@pytest.mark.parametrize("nb", [3000])
@pytest.mark.parametrize("dim", [128])
@pytest.mark.xfail(reason="issue https://github.com/milvus-io/milvus/issues/32214")
def test_search_vector_with_sparse_float_vector_datatype(self, nb, dim, insert_round, auto_id,
                                                         is_partition_key, enable_dynamic_schema):
    """
    Search a SparseFloatVector collection with a grouping field.

    Steps:
      1. Create a collection whose only vector field is a SparseFloatVector
         indexed with SPARSE_INVERTED_INDEX (metric IP).
      2. Insert ``nb`` rows per round for ``insert_round`` rounds.
      3. Run a filtered search grouped by ``user_id`` and check the result
         count and that no ``user_id`` is returned twice.

    Marked xfail; see the issue linked in the decorator.
    """
    # create a collection
    name = gen_collection_name()
    payload = {
        "collectionName": name,
        "schema": {
            "autoId": auto_id,
            "enableDynamicField": enable_dynamic_schema,
            "fields": [
                {"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
                {"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": is_partition_key,
                 "elementTypeParams": {}},
                {"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
                {"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
                {"fieldName": "sparse_float_vector", "dataType": "SparseFloatVector"},
            ]
        },
        "indexParams": [
            {"fieldName": "sparse_float_vector", "indexName": "sparse_float_vector", "metricType": "IP",
             "indexConfig": {"index_type": "SPARSE_INVERTED_INDEX", "drop_ratio_build": "0.2"}}
        ]
    }
    rsp = self.collection_client.collection_create(payload)
    assert rsp['code'] == 200
    rsp = self.collection_client.collection_describe(name)
    logger.info(f"rsp: {rsp}")
    assert rsp['code'] == 200

    # insert data
    # The round counter is unused; naming it "_" avoids the inner row index
    # shadowing it, as the original nested "for i" loops did.
    for _ in range(insert_round):
        data = []
        for i in range(nb):
            # The primary key is only supplied when the server does not generate it.
            row = {} if auto_id else {"book_id": i}
            row.update({
                "user_id": i % 100,
                "word_count": i,
                "book_describe": f"book_{i}",
                "sparse_float_vector": gen_vector(datatype="SparseFloatVector", dim=dim),
            })
            if enable_dynamic_schema:
                row[f"dynamic_field_{i}"] = i
            data.append(row)
        payload = {
            "collectionName": name,
            "data": data,
        }
        rsp = self.vector_client.vector_insert(payload)
        assert rsp['code'] == 200
        assert rsp['data']['insertCount'] == nb

    # search data
    payload = {
        "collectionName": name,
        "data": [gen_vector(datatype="SparseFloatVector", dim=dim)],
        "filter": "word_count > 100",
        "groupingField": "user_id",
        "outputFields": ["*"],
        "searchParams": {
            "metricType": "IP",
            "params": {
                "drop_ratio_search": "0.2",
            }
        },
        "limit": 100,
    }
    rsp = self.vector_client.vector_search(payload)
    assert rsp['code'] == 200
    # user_id takes 100 distinct values (i % 100), so a grouped search with
    # limit 100 is expected to return exactly one hit per user_id.
    assert len(rsp['data']) == 100
    # assert no dup user_id
    user_ids = [r["user_id"] for r in rsp['data']]
    assert len(user_ids) == len(set(user_ids))
|
||||
|
||||
|
||||
|
||||
@pytest.mark.parametrize("insert_round", [2])
|
||||
@pytest.mark.parametrize("auto_id", [True])
|
||||
|
|
|
@ -12,7 +12,7 @@ from loguru import logger
|
|||
import datetime
|
||||
|
||||
fake = Faker()
|
||||
|
||||
rng = np.random.default_rng()
|
||||
|
||||
def random_string(length=8):
|
||||
letters = string.ascii_letters
|
||||
|
@ -201,6 +201,8 @@ def gen_vector(datatype="float_vector", dim=128, binary_data=False):
|
|||
value = None
|
||||
if datatype == "FloatVector":
|
||||
return preprocessing.normalize([np.array([random.random() for i in range(dim)])])[0].tolist()
|
||||
if datatype == "SparseFloatVector":
|
||||
return {d: rng.random() for d in random.sample(range(dim), random.randint(20, 30))}
|
||||
if datatype == "BinaryVector":
|
||||
value = gen_binary_vectors(1, dim)[1][0]
|
||||
if datatype == "Float16Vector":
|
||||
|
|
Loading…
Reference in New Issue