# milvus/tests/restful_client_v2/testcases/test_vector_operations.py
# RESTful client v2 test cases for vector operations (insert/upsert/query).
import random
from sklearn import preprocessing
import numpy as np
import pandas as pd
import sys
import json
import time
from utils import constant
from utils.utils import gen_collection_name, get_sorted_distance, patch_faker_text, en_vocabularies_distribution, \
zh_vocabularies_distribution
from utils.util_log import test_log as logger
import pytest
from base.testbase import TestBase
from utils.utils import (gen_unique_str, get_data_by_payload, get_common_fields_by_data, gen_vector, analyze_documents)
from pymilvus import (
FieldSchema, CollectionSchema, DataType,
Collection, utility
)
from faker import Faker
import re
Faker.seed(19530)
fake_en = Faker("en_US")
fake_zh = Faker("zh_CN")
patch_faker_text(fake_en, en_vocabularies_distribution)
patch_faker_text(fake_zh, zh_vocabularies_distribution)
@pytest.mark.L0
class TestInsertVector(TestBase):
@pytest.mark.parametrize("insert_round", [3])
@pytest.mark.parametrize("nb", [3000])
@pytest.mark.parametrize("dim", [128])
def test_insert_entities_with_simple_payload(self, nb, dim, insert_round):
"""
Insert a vector with a simple payload
"""
# create a collection
name = gen_collection_name()
collection_payload = {
"collectionName": name,
"dimension": dim,
"metricType": "L2"
}
rsp = self.collection_client.collection_create(collection_payload)
assert rsp['code'] == 0
rsp = self.collection_client.collection_describe(name)
logger.info(f"rsp: {rsp}")
assert rsp['code'] == 0
# insert data
for i in range(insert_round):
data = get_data_by_payload(collection_payload, nb)
payload = {
"collectionName": name,
"data": data,
}
body_size = sys.getsizeof(json.dumps(payload))
logger.info(f"body size: {body_size / 1024 / 1024} MB")
rsp = self.vector_client.vector_insert(payload)
assert rsp['code'] == 0
assert rsp['data']['insertCount'] == nb
@pytest.mark.parametrize("insert_round", [1])
@pytest.mark.parametrize("auto_id", [True, False])
@pytest.mark.parametrize("is_partition_key", [True, False])
@pytest.mark.parametrize("enable_dynamic_schema", [True, False])
@pytest.mark.parametrize("nb", [3000])
@pytest.mark.parametrize("dim", [128])
def test_insert_entities_with_all_scalar_datatype(self, nb, dim, insert_round, auto_id,
is_partition_key, enable_dynamic_schema):
"""
Insert a vector with a simple payload
"""
# create a collection
name = gen_collection_name()
payload = {
"collectionName": name,
"schema": {
"autoId": auto_id,
"enableDynamicField": enable_dynamic_schema,
"fields": [
{"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
{"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": is_partition_key,
"elementTypeParams": {}},
{"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
{"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
{"fieldName": "bool", "dataType": "Bool", "elementTypeParams": {}},
{"fieldName": "json", "dataType": "JSON", "elementTypeParams": {}},
{"fieldName": "int_array", "dataType": "Array", "elementDataType": "Int64",
"elementTypeParams": {"max_capacity": "1024"}},
{"fieldName": "varchar_array", "dataType": "Array", "elementDataType": "VarChar",
"elementTypeParams": {"max_capacity": "1024", "max_length": "256"}},
{"fieldName": "bool_array", "dataType": "Array", "elementDataType": "Bool",
"elementTypeParams": {"max_capacity": "1024"}},
{"fieldName": "text_emb", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}},
{"fieldName": "image_emb", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}},
]
},
"indexParams": [
{"fieldName": "text_emb", "indexName": "text_emb", "metricType": "L2"},
{"fieldName": "image_emb", "indexName": "image_emb", "metricType": "L2"}
]
}
rsp = self.collection_client.collection_create(payload)
assert rsp['code'] == 0
rsp = self.collection_client.collection_describe(name)
logger.info(f"rsp: {rsp}")
assert rsp['code'] == 0
# insert data
for i in range(insert_round):
data = []
for i in range(nb):
if auto_id:
tmp = {
"user_id": i,
"word_count": i,
"book_describe": f"book_{i}",
"bool": random.choice([True, False]),
"json": {"key": i},
"int_array": [i],
"varchar_array": [f"varchar_{i}"],
"bool_array": [random.choice([True, False])],
"text_emb": preprocessing.normalize([np.array([random.random() for _ in range(dim)])])[
0].tolist(),
"image_emb": preprocessing.normalize([np.array([random.random() for _ in range(dim)])])[
0].tolist(),
}
else:
tmp = {
"book_id": i,
"user_id": i,
"word_count": i,
"book_describe": f"book_{i}",
"bool": random.choice([True, False]),
"json": {"key": i},
"int_array": [i],
"varchar_array": [f"varchar_{i}"],
"bool_array": [random.choice([True, False])],
"text_emb": preprocessing.normalize([np.array([random.random() for _ in range(dim)])])[
0].tolist(),
"image_emb": preprocessing.normalize([np.array([random.random() for _ in range(dim)])])[
0].tolist(),
}
if enable_dynamic_schema:
tmp.update({f"dynamic_field_{i}": i})
data.append(tmp)
payload = {
"collectionName": name,
"data": data,
}
rsp = self.vector_client.vector_insert(payload)
assert rsp['code'] == 0
assert rsp['data']['insertCount'] == nb
# query data to make sure the data is inserted
rsp = self.vector_client.vector_query({"collectionName": name, "filter": "user_id > 0", "limit": 50})
assert rsp['code'] == 0
assert len(rsp['data']) == 50
@pytest.mark.parametrize("insert_round", [1])
@pytest.mark.parametrize("auto_id", [True])
@pytest.mark.parametrize("is_partition_key", [True])
@pytest.mark.parametrize("enable_dynamic_schema", [True])
@pytest.mark.parametrize("nb", [3000])
@pytest.mark.parametrize("dim", [128])
@pytest.mark.parametrize("pass_fp32_to_fp16_or_bf16", [True, False])
def test_insert_entities_with_all_vector_datatype(self, nb, dim, insert_round, auto_id,
is_partition_key, enable_dynamic_schema,
pass_fp32_to_fp16_or_bf16):
"""
Insert a vector with a simple payload
"""
# create a collection
name = gen_collection_name()
payload = {
"collectionName": name,
"schema": {
"autoId": auto_id,
"enableDynamicField": enable_dynamic_schema,
"fields": [
{"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
{"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": is_partition_key,
"elementTypeParams": {}},
{"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
{"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
{"fieldName": "float_vector", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}},
{"fieldName": "float16_vector", "dataType": "Float16Vector",
"elementTypeParams": {"dim": f"{dim}"}},
{"fieldName": "bfloat16_vector", "dataType": "BFloat16Vector",
"elementTypeParams": {"dim": f"{dim}"}},
{"fieldName": "binary_vector", "dataType": "BinaryVector", "elementTypeParams": {"dim": f"{dim}"}},
]
},
"indexParams": [
{"fieldName": "float_vector", "indexName": "float_vector", "metricType": "L2"},
{"fieldName": "float16_vector", "indexName": "float16_vector", "metricType": "L2"},
{"fieldName": "bfloat16_vector", "indexName": "bfloat16_vector", "metricType": "L2"},
{"fieldName": "binary_vector", "indexName": "binary_vector", "metricType": "HAMMING",
"params": {"index_type": "BIN_IVF_FLAT", "nlist": "512"}}
]
}
rsp = self.collection_client.collection_create(payload)
assert rsp['code'] == 0
rsp = self.collection_client.collection_describe(name)
logger.info(f"rsp: {rsp}")
assert rsp['code'] == 0
# insert data
for i in range(insert_round):
data = []
for i in range(nb):
if auto_id:
tmp = {
"user_id": i,
"word_count": i,
"book_describe": f"book_{i}",
"float_vector": gen_vector(datatype="FloatVector", dim=dim),
"float16_vector": (
gen_vector(datatype="FloatVector", dim=dim)
if pass_fp32_to_fp16_or_bf16
else gen_vector(datatype="Float16Vector", dim=dim)
),
"bfloat16_vector": (
gen_vector(datatype="FloatVector", dim=dim)
if pass_fp32_to_fp16_or_bf16
else gen_vector(datatype="BFloat16Vector", dim=dim)
),
"binary_vector": gen_vector(datatype="BinaryVector", dim=dim),
}
else:
tmp = {
"book_id": i,
"user_id": i,
"word_count": i,
"book_describe": f"book_{i}",
"float_vector": gen_vector(datatype="FloatVector", dim=dim),
"float16_vector": (
gen_vector(datatype="FloatVector", dim=dim)
if pass_fp32_to_fp16_or_bf16
else gen_vector(datatype="Float16Vector", dim=dim)
),
"bfloat16_vector": (
gen_vector(datatype="FloatVector", dim=dim)
if pass_fp32_to_fp16_or_bf16
else gen_vector(datatype="BFloat16Vector", dim=dim)
),
"binary_vector": gen_vector(datatype="BinaryVector", dim=dim)
}
if enable_dynamic_schema:
tmp.update({f"dynamic_field_{i}": i})
data.append(tmp)
payload = {
"collectionName": name,
"data": data,
}
rsp = self.vector_client.vector_insert(payload)
assert rsp['code'] == 0
assert rsp['data']['insertCount'] == nb
c = Collection(name)
res = c.query(
expr="user_id > 0",
limit=1,
output_fields=["*"],
)
logger.info(f"res: {res}")
# query data to make sure the data is inserted
rsp = self.vector_client.vector_query({"collectionName": name, "filter": "user_id > 0", "limit": 50})
assert rsp['code'] == 0
assert len(rsp['data']) == 50
@pytest.mark.parametrize("insert_round", [1])
@pytest.mark.parametrize("auto_id", [True])
@pytest.mark.parametrize("is_partition_key", [True])
@pytest.mark.parametrize("enable_dynamic_schema", [True])
@pytest.mark.parametrize("nb", [3000])
@pytest.mark.parametrize("dim", [128])
@pytest.mark.parametrize("pass_fp32_to_fp16_or_bf16", [True, False])
def test_insert_entities_with_all_vector_datatype_0(self, nb, dim, insert_round, auto_id,
is_partition_key, enable_dynamic_schema,
pass_fp32_to_fp16_or_bf16):
"""
Insert a vector with a simple payload
"""
# create a collection
name = gen_collection_name()
payload = {
"collectionName": name,
"schema": {
"autoId": auto_id,
"enableDynamicField": enable_dynamic_schema,
"fields": [
{"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
{"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": is_partition_key,
"elementTypeParams": {}},
{"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
{"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
{"fieldName": "book_vector", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}},
{"fieldName": "float_vector", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}},
{"fieldName": "float16_vector", "dataType": "Float16Vector",
"elementTypeParams": {"dim": f"{dim}"}},
{"fieldName": "bfloat16_vector", "dataType": "BFloat16Vector",
"elementTypeParams": {"dim": f"{dim}"}},
]
},
"indexParams": [
{"fieldName": "book_vector", "indexName": "book_vector", "metricType": "L2",
"params": {"index_type": "FLAT"}},
{"fieldName": "float_vector", "indexName": "float_vector", "metricType": "L2",
"params": {"index_type": "IVF_FLAT", "nlist": 128}},
{"fieldName": "float16_vector", "indexName": "float16_vector", "metricType": "L2",
"params": {"index_type": "IVF_SQ8", "nlist": "128"}},
{"fieldName": "bfloat16_vector", "indexName": "bfloat16_vector", "metricType": "L2",
"params": {"index_type": "IVF_PQ", "nlist": 128, "m": 16, "nbits": 8}},
]
}
rsp = self.collection_client.collection_create(payload)
assert rsp['code'] == 0
rsp = self.collection_client.collection_describe(name)
logger.info(f"rsp: {rsp}")
assert rsp['code'] == 0
# insert data
for i in range(insert_round):
data = []
for i in range(nb):
if auto_id:
tmp = {
"user_id": i,
"word_count": i,
"book_describe": f"book_{i}",
"book_vector": gen_vector(datatype="FloatVector", dim=dim),
"float_vector": gen_vector(datatype="FloatVector", dim=dim),
"float16_vector": (
gen_vector(datatype="FloatVector", dim=dim)
if pass_fp32_to_fp16_or_bf16
else gen_vector(datatype="Float16Vector", dim=dim)
),
"bfloat16_vector": (
gen_vector(datatype="FloatVector", dim=dim)
if pass_fp32_to_fp16_or_bf16
else gen_vector(datatype="BFloat16Vector", dim=dim)
),
}
else:
tmp = {
"book_id": i,
"user_id": i,
"word_count": i,
"book_describe": f"book_{i}",
"book_vector": gen_vector(datatype="FloatVector", dim=dim),
"float_vector": gen_vector(datatype="FloatVector", dim=dim),
"float16_vector": (
gen_vector(datatype="FloatVector", dim=dim)
if pass_fp32_to_fp16_or_bf16
else gen_vector(datatype="Float16Vector", dim=dim)
),
"bfloat16_vector": (
gen_vector(datatype="FloatVector", dim=dim)
if pass_fp32_to_fp16_or_bf16
else gen_vector(datatype="BFloat16Vector", dim=dim)
),
}
if enable_dynamic_schema:
tmp.update({f"dynamic_field_{i}": i})
data.append(tmp)
payload = {
"collectionName": name,
"data": data,
}
rsp = self.vector_client.vector_insert(payload)
assert rsp['code'] == 0
assert rsp['data']['insertCount'] == nb
c = Collection(name)
res = c.query(
expr="user_id > 0",
limit=1,
output_fields=["*"],
)
logger.info(f"res: {res}")
# query data to make sure the data is inserted
rsp = self.vector_client.vector_query({"collectionName": name, "filter": "user_id > 0", "limit": 50})
assert rsp['code'] == 0
assert len(rsp['data']) == 50
@pytest.mark.parametrize("insert_round", [1])
@pytest.mark.parametrize("auto_id", [True])
@pytest.mark.parametrize("is_partition_key", [True])
@pytest.mark.parametrize("enable_dynamic_schema", [True])
@pytest.mark.parametrize("nb", [3000])
@pytest.mark.parametrize("dim", [128])
@pytest.mark.parametrize("pass_fp32_to_fp16_or_bf16", [True, False])
def test_insert_entities_with_all_vector_datatype_1(self, nb, dim, insert_round, auto_id,
is_partition_key, enable_dynamic_schema,
pass_fp32_to_fp16_or_bf16):
"""
Insert a vector with a simple payload
"""
# create a collection
name = gen_collection_name()
payload = {
"collectionName": name,
"schema": {
"autoId": auto_id,
"enableDynamicField": enable_dynamic_schema,
"fields": [
{"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
{"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": is_partition_key,
"elementTypeParams": {}},
{"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
{"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
{"fieldName": "float_vector", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}},
{"fieldName": "float16_vector", "dataType": "Float16Vector",
"elementTypeParams": {"dim": f"{dim}"}},
{"fieldName": "bfloat16_vector", "dataType": "BFloat16Vector",
"elementTypeParams": {"dim": f"{dim}"}},
]
},
"indexParams": [
{"fieldName": "float_vector", "indexName": "float_vector", "metricType": "L2",
"params": {"index_type": "HNSW", "M": 32, "efConstruction": 360}},
{"fieldName": "float16_vector", "indexName": "float16_vector", "metricType": "L2",
"params": {"index_type": "SCANN", "nlist": "128"}},
{"fieldName": "bfloat16_vector", "indexName": "bfloat16_vector", "metricType": "L2",
"params": {"index_type": "DISKANN"}},
]
}
rsp = self.collection_client.collection_create(payload)
assert rsp['code'] == 0
rsp = self.collection_client.collection_describe(name)
logger.info(f"rsp: {rsp}")
assert rsp['code'] == 0
# insert data
for i in range(insert_round):
data = []
for i in range(nb):
if auto_id:
tmp = {
"user_id": i,
"word_count": i,
"book_describe": f"book_{i}",
"float_vector": gen_vector(datatype="FloatVector", dim=dim),
"float16_vector": (
gen_vector(datatype="FloatVector", dim=dim)
if pass_fp32_to_fp16_or_bf16
else gen_vector(datatype="Float16Vector", dim=dim)
),
"bfloat16_vector": (
gen_vector(datatype="FloatVector", dim=dim)
if pass_fp32_to_fp16_or_bf16
else gen_vector(datatype="BFloat16Vector", dim=dim)
),
}
else:
tmp = {
"book_id": i,
"user_id": i,
"word_count": i,
"book_describe": f"book_{i}",
"float_vector": gen_vector(datatype="FloatVector", dim=dim),
"float16_vector": (
gen_vector(datatype="FloatVector", dim=dim)
if pass_fp32_to_fp16_or_bf16
else gen_vector(datatype="Float16Vector", dim=dim)
),
"bfloat16_vector": (
gen_vector(datatype="FloatVector", dim=dim)
if pass_fp32_to_fp16_or_bf16
else gen_vector(datatype="BFloat16Vector", dim=dim)
),
}
if enable_dynamic_schema:
tmp.update({f"dynamic_field_{i}": i})
data.append(tmp)
payload = {
"collectionName": name,
"data": data,
}
rsp = self.vector_client.vector_insert(payload)
assert rsp['code'] == 0
assert rsp['data']['insertCount'] == nb
c = Collection(name)
res = c.query(
expr="user_id > 0",
limit=1,
output_fields=["*"],
)
logger.info(f"res: {res}")
# query data to make sure the data is inserted
rsp = self.vector_client.vector_query({"collectionName": name, "filter": "user_id > 0", "limit": 50})
assert rsp['code'] == 0
assert len(rsp['data']) == 50
@pytest.mark.parametrize("insert_round", [1])
@pytest.mark.parametrize("auto_id", [True])
@pytest.mark.parametrize("is_partition_key", [True])
@pytest.mark.parametrize("enable_dynamic_schema", [True])
@pytest.mark.parametrize("nb", [3000])
@pytest.mark.parametrize("dim", [128])
def test_insert_entities_with_all_vector_datatype_2(self, nb, dim, insert_round, auto_id,
is_partition_key, enable_dynamic_schema):
"""
Insert a vector with a simple payload
"""
# create a collection
name = gen_collection_name()
payload = {
"collectionName": name,
"schema": {
"autoId": auto_id,
"enableDynamicField": enable_dynamic_schema,
"fields": [
{"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
{"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": is_partition_key,
"elementTypeParams": {}},
{"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
{"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
{"fieldName": "binary_vector_0", "dataType": "BinaryVector",
"elementTypeParams": {"dim": f"{dim}"}},
{"fieldName": "binary_vector_1", "dataType": "BinaryVector",
"elementTypeParams": {"dim": f"{dim}"}},
{"fieldName": "sparse_float_vector_0", "dataType": "SparseFloatVector"},
{"fieldName": "sparse_float_vector_1", "dataType": "SparseFloatVector"},
]
},
"indexParams": [
{"fieldName": "binary_vector_0", "indexName": "binary_vector_0_index", "metricType": "HAMMING",
"params": {"index_type": "BIN_FLAT"}},
{"fieldName": "binary_vector_1", "indexName": "binary_vector_1_index", "metricType": "HAMMING",
"params": {"index_type": "BIN_IVF_FLAT", "nlist": "512"}},
{"fieldName": "sparse_float_vector_0", "indexName": "sparse_float_vector_0_index", "metricType": "IP",
"params": {"index_type": "SPARSE_INVERTED_INDEX", "drop_ratio_build": "0.2"}},
{"fieldName": "sparse_float_vector_1", "indexName": "sparse_float_vector_1_index", "metricType": "IP",
"params": {"index_type": "SPARSE_WAND", "drop_ratio_build": "0.2"}}
]
}
rsp = self.collection_client.collection_create(payload)
assert rsp['code'] == 0
rsp = self.collection_client.collection_describe(name)
logger.info(f"rsp: {rsp}")
assert rsp['code'] == 0
# insert data
for i in range(insert_round):
data = []
for i in range(nb):
if auto_id:
tmp = {
"user_id": i,
"word_count": i,
"book_describe": f"book_{i}",
"binary_vector_0": gen_vector(datatype="BinaryVector", dim=dim),
"binary_vector_1": gen_vector(datatype="BinaryVector", dim=dim),
"sparse_float_vector_0": gen_vector(datatype="SparseFloatVector", dim=dim, sparse_format="dok"),
"sparse_float_vector_1": gen_vector(datatype="SparseFloatVector", dim=dim, sparse_format="dok"),
}
else:
tmp = {
"book_id": i,
"user_id": i,
"word_count": i,
"book_describe": f"book_{i}",
"binary_vector_0": gen_vector(datatype="BinaryVector", dim=dim),
"binary_vector_1": gen_vector(datatype="BinaryVector", dim=dim),
"sparse_float_vector_0": gen_vector(datatype="SparseFloatVector", dim=dim, sparse_format="dok"),
"sparse_float_vector_1": gen_vector(datatype="SparseFloatVector", dim=dim, sparse_format="dok"),
}
if enable_dynamic_schema:
tmp.update({f"dynamic_field_{i}": i})
data.append(tmp)
payload = {
"collectionName": name,
"data": data,
}
rsp = self.vector_client.vector_insert(payload)
assert rsp['code'] == 0
assert rsp['data']['insertCount'] == nb
c = Collection(name)
res = c.query(
expr="user_id > 0",
limit=1,
output_fields=["*"],
)
logger.info(f"res: {res}")
# query data to make sure the data is inserted
rsp = self.vector_client.vector_query({"collectionName": name, "filter": "user_id > 0", "limit": 50})
assert rsp['code'] == 0
assert len(rsp['data']) == 50
@pytest.mark.parametrize("insert_round", [1])
@pytest.mark.parametrize("auto_id", [True, False])
@pytest.mark.parametrize("is_partition_key", [True, False])
@pytest.mark.parametrize("enable_dynamic_schema", [True, False])
@pytest.mark.parametrize("nb", [3000])
@pytest.mark.parametrize("dim", [128])
def test_insert_entities_with_all_json_datatype(self, nb, dim, insert_round, auto_id,
is_partition_key, enable_dynamic_schema):
"""
Insert a vector with a simple payload
"""
# create a collection
name = gen_collection_name()
payload = {
"collectionName": name,
"schema": {
"autoId": auto_id,
"enableDynamicField": enable_dynamic_schema,
"fields": [
{"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
{"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": is_partition_key,
"elementTypeParams": {}},
{"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
{"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
{"fieldName": "bool", "dataType": "Bool", "elementTypeParams": {}},
{"fieldName": "json", "dataType": "JSON", "elementTypeParams": {}},
{"fieldName": "int_array", "dataType": "Array", "elementDataType": "Int64",
"elementTypeParams": {"max_capacity": "1024"}},
{"fieldName": "varchar_array", "dataType": "Array", "elementDataType": "VarChar",
"elementTypeParams": {"max_capacity": "1024", "max_length": "256"}},
{"fieldName": "bool_array", "dataType": "Array", "elementDataType": "Bool",
"elementTypeParams": {"max_capacity": "1024"}},
{"fieldName": "text_emb", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}},
{"fieldName": "image_emb", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}},
]
},
"indexParams": [
{"fieldName": "text_emb", "indexName": "text_emb", "metricType": "L2"},
{"fieldName": "image_emb", "indexName": "image_emb", "metricType": "L2"}
]
}
rsp = self.collection_client.collection_create(payload)
assert rsp['code'] == 0
rsp = self.collection_client.collection_describe(name)
logger.info(f"rsp: {rsp}")
assert rsp['code'] == 0
json_value = [
1,
1.0,
"1",
[1, 2, 3],
["1", "2", "3"],
[1, 2, "3"],
{"key": "value"},
]
# insert data
for i in range(insert_round):
data = []
for i in range(nb):
if auto_id:
tmp = {
"user_id": i,
"word_count": i,
"book_describe": f"book_{i}",
"bool": random.choice([True, False]),
"json": json_value[i % len(json_value)],
"int_array": [i],
"varchar_array": [f"varchar_{i}"],
"bool_array": [random.choice([True, False])],
"text_emb": preprocessing.normalize([np.array([random.random() for _ in range(dim)])])[
0].tolist(),
"image_emb": preprocessing.normalize([np.array([random.random() for _ in range(dim)])])[
0].tolist(),
}
else:
tmp = {
"book_id": i,
"user_id": i,
"word_count": i,
"book_describe": f"book_{i}",
"bool": random.choice([True, False]),
"json": json_value[i % len(json_value)],
"int_array": [i],
"varchar_array": [f"varchar_{i}"],
"bool_array": [random.choice([True, False])],
"text_emb": preprocessing.normalize([np.array([random.random() for _ in range(dim)])])[
0].tolist(),
"image_emb": preprocessing.normalize([np.array([random.random() for _ in range(dim)])])[
0].tolist(),
}
if enable_dynamic_schema:
tmp.update({f"dynamic_field_{i}": i})
data.append(tmp)
payload = {
"collectionName": name,
"data": data,
}
rsp = self.vector_client.vector_insert(payload)
assert rsp['code'] == 0
assert rsp['data']['insertCount'] == nb
# query data to make sure the data is inserted
rsp = self.vector_client.vector_query({"collectionName": name, "filter": "user_id > 0", "limit": 50})
assert rsp['code'] == 0
assert len(rsp['data']) == 50
@pytest.mark.parametrize("insert_round", [1])
@pytest.mark.parametrize("auto_id", [True, False])
@pytest.mark.parametrize("is_partition_key", [True, False])
@pytest.mark.parametrize("enable_dynamic_schema", [True, False])
@pytest.mark.parametrize("nb", [3000])
@pytest.mark.parametrize("dim", [128])
def test_insert_entities_with_default_none(self, nb, dim, insert_round, auto_id, is_partition_key,
enable_dynamic_schema):
"""
Insert a vector with defaultValue and none
"""
# create a collection
name = gen_collection_name()
payload = {
"collectionName": name,
"schema": {
"autoId": auto_id,
"enableDynamicField": enable_dynamic_schema,
"fields": [
{"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
{"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": is_partition_key,
"elementTypeParams": {}, "defaultValue": 10},
{"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}, "nullable": True},
{"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"},
"defaultValue": "default", "nullable": True},
{"fieldName": "json", "dataType": "JSON", "elementTypeParams": {}, "nullable": True},
{"fieldName": "varchar_array", "dataType": "Array", "elementDataType": "VarChar",
"elementTypeParams": {"max_capacity": "1024", "max_length": "256"}, "nullable": True},
{"fieldName": "text_emb", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}},
]
},
"indexParams": [
{"fieldName": "text_emb", "indexName": "text_emb", "metricType": "L2"},
]
}
rsp = self.collection_client.collection_create(payload)
assert rsp['code'] == 0
rsp = self.collection_client.collection_describe(name)
logger.info(f"rsp: {rsp}")
assert rsp['code'] == 0
# insert data
for k in range(insert_round):
data = []
for i in range(nb):
if auto_id:
tmp = {
"user_id": None,
"word_count": None,
"book_describe": None,
"json": None,
"varchar_array": None,
"text_emb": preprocessing.normalize([np.array([random.random() for _ in range(dim)])])[
0].tolist(),
}
else:
tmp = {
"book_id": i,
"user_id": None,
"word_count": None,
"book_describe": None,
"json": None,
"varchar_array": None,
"text_emb": preprocessing.normalize([np.array([random.random() for _ in range(dim)])])[
0].tolist(),
}
if enable_dynamic_schema:
tmp.update({f"dynamic_field_{i}": i})
data.append(tmp)
payload = {
"collectionName": name,
"data": data,
}
rsp = self.vector_client.vector_insert(payload)
assert rsp['code'] == 0
assert rsp['data']['insertCount'] == nb
# query data to make sure the data is inserted
rsp = self.vector_client.vector_query({"collectionName": name, "filter": "user_id > 0", "limit": 5})
assert rsp['code'] == 0
assert len(rsp['data']) == 5
assert rsp['data'][0]['book_describe'] == 'default'
assert rsp['data'][0]['word_count'] is None
assert rsp['data'][0]['json'] is None
@pytest.mark.L0
class TestInsertVectorNegative(TestBase):
def test_insert_vector_with_invalid_collection_name(self):
"""
Insert a vector with an invalid collection name
"""
# create a collection
name = gen_collection_name()
dim = 128
payload = {
"collectionName": name,
"dimension": dim,
}
rsp = self.collection_client.collection_create(payload)
assert rsp['code'] == 0
rsp = self.collection_client.collection_describe(name)
assert rsp['code'] == 0
# insert data
nb = 100
data = get_data_by_payload(payload, nb)
payload = {
"collectionName": "invalid_collection_name",
"data": data,
}
body_size = sys.getsizeof(json.dumps(payload))
logger.info(f"body size: {body_size / 1024 / 1024} MB")
rsp = self.vector_client.vector_insert(payload)
assert rsp['code'] == 100
assert "can't find collection" in rsp['message']
def test_insert_vector_with_invalid_database_name(self):
"""
Insert a vector with an invalid database name
"""
# create a collection
name = gen_collection_name()
dim = 128
payload = {
"collectionName": name,
"dimension": dim,
}
rsp = self.collection_client.collection_create(payload)
assert rsp['code'] == 0
rsp = self.collection_client.collection_describe(name)
assert rsp['code'] == 0
# insert data
nb = 10
data = get_data_by_payload(payload, nb)
payload = {
"collectionName": name,
"data": data,
}
body_size = sys.getsizeof(json.dumps(payload))
logger.info(f"body size: {body_size / 1024 / 1024} MB")
success = False
rsp = self.vector_client.vector_insert(payload, db_name="invalid_database")
assert rsp['code'] == 800
def test_insert_vector_with_mismatch_dim(self):
"""
Insert a vector with mismatch dim
"""
# create a collection
name = gen_collection_name()
dim = 32
payload = {
"collectionName": name,
"dimension": dim,
}
rsp = self.collection_client.collection_create(payload)
assert rsp['code'] == 0
rsp = self.collection_client.collection_describe(name)
assert rsp['code'] == 0
# insert data
nb = 1
data = [
{"id": i,
"vector": [np.float64(random.random()) for _ in range(dim + 1)],
} for i in range(nb)
]
payload = {
"collectionName": name,
"data": data,
}
body_size = sys.getsizeof(json.dumps(payload))
logger.info(f"body size: {body_size / 1024 / 1024} MB")
rsp = self.vector_client.vector_insert(payload)
assert rsp['code'] == 1804
assert "fail to deal the insert data" in rsp['message']
def test_insert_entities_with_none_no_nullable_field(self):
"""
Insert a vector with none no nullable field
"""
# create a collection
name = gen_collection_name()
payload = {
"collectionName": name,
"schema": {
"autoId": True,
"fields": [
{"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
{"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
{"fieldName": "text_emb", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{128}"}},
]
}
}
rsp = self.collection_client.collection_create(payload)
assert rsp['code'] == 0
rsp = self.collection_client.collection_describe(name)
logger.info(f"rsp: {rsp}")
assert rsp['code'] == 0
# insert data
data = []
for i in range(10):
tmp = {
"word_count": i if i % 2 else None,
"text_emb": preprocessing.normalize([np.array([random.random() for _ in range(128)])])[0].tolist(),
}
data.append(tmp)
payload = {
"collectionName": name,
"data": data,
}
rsp = self.vector_client.vector_insert(payload)
assert rsp['code'] == 1804
assert "fail to deal the insert data" in rsp['message']
@pytest.mark.L0
class TestUpsertVector(TestBase):
@pytest.mark.parametrize("insert_round", [2])
@pytest.mark.parametrize("nb", [3000])
@pytest.mark.parametrize("dim", [128])
@pytest.mark.parametrize("id_type", ["Int64", "VarChar"])
def test_upsert_vector_default(self, nb, dim, insert_round, id_type):
# create a collection
name = gen_collection_name()
payload = {
"collectionName": name,
"schema": {
"fields": [
{"fieldName": "book_id", "dataType": f"{id_type}", "isPrimary": True,
"elementTypeParams": {"max_length": "256"}},
{"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": True, "elementTypeParams": {}},
{"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
{"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
{"fieldName": "text_emb", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}}
]
},
"indexParams": [{"fieldName": "text_emb", "indexName": "text_emb_index", "metricType": "L2"}]
}
rsp = self.collection_client.collection_create(payload)
assert rsp['code'] == 0
rsp = self.collection_client.collection_describe(name)
logger.info(f"rsp: {rsp}")
assert rsp['code'] == 0
# insert data
for i in range(insert_round):
data = []
for j in range(nb):
tmp = {
"book_id": i * nb + j if id_type == "Int64" else f"{i * nb + j}",
"user_id": i * nb + j,
"word_count": i * nb + j,
"book_describe": f"book_{i * nb + j}",
"text_emb": preprocessing.normalize([np.array([random.random() for i in range(dim)])])[0].tolist()
}
data.append(tmp)
payload = {
"collectionName": name,
"data": data,
}
body_size = sys.getsizeof(json.dumps(payload))
logger.info(f"body size: {body_size / 1024 / 1024} MB")
rsp = self.vector_client.vector_insert(payload)
assert rsp['code'] == 0
assert rsp['data']['insertCount'] == nb
c = Collection(name)
c.flush()
# upsert data
for i in range(insert_round):
data = []
for j in range(nb):
tmp = {
"book_id": i * nb + j if id_type == "Int64" else f"{i * nb + j}",
"user_id": i * nb + j + 1,
"word_count": i * nb + j + 2,
"book_describe": f"book_{i * nb + j + 3}",
"text_emb": preprocessing.normalize([np.array([random.random() for i in range(dim)])])[0].tolist()
}
data.append(tmp)
payload = {
"collectionName": name,
"data": data,
}
body_size = sys.getsizeof(json.dumps(payload))
logger.info(f"body size: {body_size / 1024 / 1024} MB")
rsp = self.vector_client.vector_upsert(payload)
# query data to make sure the data is updated
if id_type == "Int64":
rsp = self.vector_client.vector_query({"collectionName": name, "filter": "book_id > 0"})
if id_type == "VarChar":
rsp = self.vector_client.vector_query({"collectionName": name, "filter": "book_id > '0'"})
for data in rsp['data']:
assert data['user_id'] == int(data['book_id']) + 1
assert data['word_count'] == int(data['book_id']) + 2
assert data['book_describe'] == f"book_{int(data['book_id']) + 3}"
res = utility.get_query_segment_info(name)
logger.info(f"res: {res}")
@pytest.mark.parametrize("insert_round", [2])
@pytest.mark.parametrize("nb", [3000])
@pytest.mark.parametrize("dim", [128])
@pytest.mark.parametrize("id_type", ["Int64", "VarChar"])
@pytest.mark.xfail(reason="currently not support auto_id for upsert")
def test_upsert_vector_pk_auto_id(self, nb, dim, insert_round, id_type):
# create a collection
name = gen_collection_name()
payload = {
"collectionName": name,
"schema": {
"autoId": True,
"fields": [
{"fieldName": "book_id", "dataType": f"{id_type}", "isPrimary": True,
"elementTypeParams": {"max_length": "256"}},
{"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": True, "elementTypeParams": {}},
{"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
{"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
{"fieldName": "text_emb", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}}
]
},
"indexParams": [{"fieldName": "text_emb", "indexName": "text_emb_index", "metricType": "L2"}]
}
rsp = self.collection_client.collection_create(payload)
assert rsp['code'] == 0
rsp = self.collection_client.collection_describe(name)
logger.info(f"rsp: {rsp}")
assert rsp['code'] == 0
ids = []
# insert data
for i in range(insert_round):
data = []
for j in range(nb):
tmp = {
"book_id": i * nb + j if id_type == "Int64" else f"{i * nb + j}",
"user_id": i * nb + j,
"word_count": i * nb + j,
"book_describe": f"book_{i * nb + j}",
"text_emb": preprocessing.normalize([np.array([random.random() for i in range(dim)])])[0].tolist()
}
data.append(tmp)
payload = {
"collectionName": name,
"data": data,
}
body_size = sys.getsizeof(json.dumps(payload))
logger.info(f"body size: {body_size / 1024 / 1024} MB")
rsp = self.vector_client.vector_insert(payload)
assert rsp['code'] == 0
assert rsp['data']['insertCount'] == nb
ids.extend(rsp['data']['insertIds'])
c = Collection(name)
c.flush()
# upsert data
for i in range(insert_round):
data = []
for j in range(nb):
tmp = {
"book_id": ids[i * nb + j],
"user_id": i * nb + j + 1,
"word_count": i * nb + j + 2,
"book_describe": f"book_{i * nb + j + 3}",
"text_emb": preprocessing.normalize([np.array([random.random() for i in range(dim)])])[0].tolist()
}
data.append(tmp)
payload = {
"collectionName": name,
"data": data,
}
body_size = sys.getsizeof(json.dumps(payload))
logger.info(f"body size: {body_size / 1024 / 1024} MB")
rsp = self.vector_client.vector_upsert(payload)
# query data to make sure the data is updated
if id_type == "Int64":
rsp = self.vector_client.vector_query({"collectionName": name, "filter": "book_id > 0"})
if id_type == "VarChar":
rsp = self.vector_client.vector_query({"collectionName": name, "filter": "book_id > '0'"})
for data in rsp['data']:
assert data['user_id'] == int(data['book_id']) + 1
assert data['word_count'] == int(data['book_id']) + 2
assert data['book_describe'] == f"book_{int(data['book_id']) + 3}"
res = utility.get_query_segment_info(name)
logger.info(f"res: {res}")
@pytest.mark.parametrize("insert_round", [2])
@pytest.mark.parametrize("nb", [3000])
@pytest.mark.parametrize("dim", [128])
@pytest.mark.parametrize("id_type", ["Int64", "VarChar"])
def test_upsert_vector_with_default_none(self, nb, dim, insert_round, id_type):
# create a collection
name = gen_collection_name()
payload = {
"collectionName": name,
"schema": {
"fields": [
{"fieldName": "book_id", "dataType": f"{id_type}", "isPrimary": True,
"elementTypeParams": {"max_length": "256"}},
{"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": True, "elementTypeParams": {}},
{"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}, "defaultValue": 123},
{"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"},
"nullable": True},
{"fieldName": "text_emb", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}}
]
},
"indexParams": [{"fieldName": "text_emb", "indexName": "text_emb_index", "metricType": "L2"}]
}
rsp = self.collection_client.collection_create(payload)
assert rsp['code'] == 0
rsp = self.collection_client.collection_describe(name)
logger.info(f"rsp: {rsp}")
assert rsp['code'] == 0
# insert data
for i in range(insert_round):
data = []
for j in range(nb):
tmp = {
"book_id": i * nb + j if id_type == "Int64" else f"{i * nb + j}",
"user_id": i * nb + j,
"word_count": i * nb + j,
"book_describe": f"book_{i * nb + j}",
"text_emb": preprocessing.normalize([np.array([random.random() for i in range(dim)])])[0].tolist()
}
data.append(tmp)
payload = {
"collectionName": name,
"data": data,
}
body_size = sys.getsizeof(json.dumps(payload))
logger.info(f"body size: {body_size / 1024 / 1024} MB")
rsp = self.vector_client.vector_insert(payload)
assert rsp['code'] == 0
assert rsp['data']['insertCount'] == nb
c = Collection(name)
c.flush()
# upsert data
for i in range(insert_round):
data = []
for j in range(nb):
tmp = {
"book_id": i * nb + j if id_type == "Int64" else f"{i * nb + j}",
"user_id": i * nb + j + 1,
"word_count": None,
"book_describe": None,
"text_emb": preprocessing.normalize([np.array([random.random() for i in range(dim)])])[0].tolist()
}
data.append(tmp)
payload = {
"collectionName": name,
"data": data,
}
body_size = sys.getsizeof(json.dumps(payload))
logger.info(f"body size: {body_size / 1024 / 1024} MB")
rsp = self.vector_client.vector_upsert(payload)
# query data to make sure the data is updated
if id_type == "Int64":
rsp = self.vector_client.vector_query({"collectionName": name, "filter": "book_id > 0"})
if id_type == "VarChar":
rsp = self.vector_client.vector_query({"collectionName": name, "filter": "book_id > '0'"})
for data in rsp['data']:
assert data['user_id'] == int(data['book_id']) + 1
assert data['word_count'] == 123
assert data['book_describe'] is None
@pytest.mark.L0
class TestUpsertVectorNegative(TestBase):
    """Negative cases for the vector upsert API."""

    def test_upsert_vector_with_invalid_collection_name(self):
        """
        Upserting into a non-existent collection must be rejected with the
        "can't find collection" error.
        """
        # create a real collection first so the upsert rows themselves are valid
        collection_name = gen_collection_name()
        dim = 128
        create_payload = {
            "collectionName": collection_name,
            "dimension": dim,
        }
        rsp = self.collection_client.collection_create(create_payload)
        assert rsp['code'] == 0
        rsp = self.collection_client.collection_describe(collection_name)
        assert rsp['code'] == 0
        # build a valid batch of rows, but target a collection that does not exist
        nb = 100
        rows = get_data_by_payload(create_payload, nb)
        upsert_payload = {
            "collectionName": "invalid_collection_name",
            "data": rows,
        }
        body_size = sys.getsizeof(json.dumps(upsert_payload))
        logger.info(f"body size: {body_size / 1024 / 1024} MB")
        rsp = self.vector_client.vector_upsert(upsert_payload)
        assert rsp['code'] == 100
        assert "can't find collection" in rsp['message']

    def test_upsert_entities_with_none_no_nullable_field(self):
        """
        Upserting None into a field that is not nullable must be rejected.
        """
        collection_name = gen_collection_name()
        create_payload = {
            "collectionName": collection_name,
            "schema": {
                "autoId": True,
                "fields": [
                    {"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
                    {"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
                    {"fieldName": "text_emb", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{128}"}},
                ]
            }
        }
        rsp = self.collection_client.collection_create(create_payload)
        assert rsp['code'] == 0
        rsp = self.collection_client.collection_describe(collection_name)
        logger.info(f"rsp: {rsp}")
        assert rsp['code'] == 0
        # every other row carries None for the non-nullable word_count field
        rows = [
            {
                "word_count": i if i % 2 else None,
                "text_emb": preprocessing.normalize([np.array([random.random() for _ in range(128)])])[0].tolist(),
            }
            for i in range(10)
        ]
        upsert_payload = {
            "collectionName": collection_name,
            "data": rows,
        }
        rsp = self.vector_client.vector_upsert(upsert_payload)
        assert rsp['code'] == 1804
        assert "fail to deal the insert data" in rsp['message']
@pytest.mark.L0
class TestSearchVector(TestBase):
@pytest.mark.parametrize("insert_round", [1])
@pytest.mark.parametrize("auto_id", [True])
@pytest.mark.parametrize("is_partition_key", [True])
@pytest.mark.parametrize("enable_dynamic_schema", [True])
@pytest.mark.parametrize("nb", [3000])
@pytest.mark.parametrize("dim", [16])
@pytest.mark.parametrize("pass_fp32_to_fp16_or_bf16", [True, False])
def test_search_vector_with_all_vector_datatype(self, nb, dim, insert_round, auto_id,
is_partition_key, enable_dynamic_schema,
pass_fp32_to_fp16_or_bf16):
"""
Insert a vector with a simple payload
"""
# create a collection
name = gen_collection_name()
payload = {
"collectionName": name,
"schema": {
"autoId": auto_id,
"enableDynamicField": enable_dynamic_schema,
"fields": [
{"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
{"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": is_partition_key,
"elementTypeParams": {}},
{"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
{"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
{"fieldName": "float_vector", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}},
{"fieldName": "float16_vector", "dataType": "Float16Vector",
"elementTypeParams": {"dim": f"{dim}"}},
{"fieldName": "bfloat16_vector", "dataType": "BFloat16Vector",
"elementTypeParams": {"dim": f"{dim}"}},
{"fieldName": "binary_vector", "dataType": "BinaryVector", "elementTypeParams": {"dim": f"{dim}"}},
]
},
"indexParams": [
{"fieldName": "float_vector", "indexName": "float_vector", "metricType": "COSINE"},
{"fieldName": "float16_vector", "indexName": "float16_vector", "metricType": "COSINE"},
{"fieldName": "bfloat16_vector", "indexName": "bfloat16_vector", "metricType": "COSINE"},
{"fieldName": "binary_vector", "indexName": "binary_vector", "metricType": "HAMMING",
"params": {"index_type": "BIN_IVF_FLAT", "nlist": "512"}}
]
}
rsp = self.collection_client.collection_create(payload)
assert rsp['code'] == 0
rsp = self.collection_client.collection_describe(name)
logger.info(f"rsp: {rsp}")
assert rsp['code'] == 0
# insert data
for i in range(insert_round):
data = []
for i in range(nb):
if auto_id:
tmp = {
"user_id": i % 10,
"word_count": i,
"book_describe": f"book_{i}",
"float_vector": gen_vector(datatype="FloatVector", dim=dim),
"float16_vector": (
gen_vector(datatype="FloatVector", dim=dim)
if pass_fp32_to_fp16_or_bf16
else gen_vector(datatype="Float16Vector", dim=dim)
),
"bfloat16_vector": (
gen_vector(datatype="FloatVector", dim=dim)
if pass_fp32_to_fp16_or_bf16
else gen_vector(datatype="BFloat16Vector", dim=dim)
),
"binary_vector": gen_vector(datatype="BinaryVector", dim=dim)
}
else:
tmp = {
"book_id": i,
"user_id": i % 10,
"word_count": i,
"book_describe": f"book_{i}",
"float_vector": gen_vector(datatype="FloatVector", dim=dim),
"float16_vector": (
gen_vector(datatype="FloatVector", dim=dim)
if pass_fp32_to_fp16_or_bf16
else gen_vector(datatype="Float16Vector", dim=dim)
),
"bfloat16_vector": (
gen_vector(datatype="FloatVector", dim=dim)
if pass_fp32_to_fp16_or_bf16
else gen_vector(datatype="BFloat16Vector", dim=dim)
),
"binary_vector": gen_vector(datatype="BinaryVector", dim=dim)
}
if enable_dynamic_schema:
tmp.update({f"dynamic_field_{i}": i})
data.append(tmp)
payload = {
"collectionName": name,
"data": data,
}
rsp = self.vector_client.vector_insert(payload)
assert rsp['code'] == 0
assert rsp['data']['insertCount'] == nb
# search data
payload = {
"collectionName": name,
"data": [gen_vector(datatype="FloatVector", dim=dim)],
"annsField": "float_vector",
"filter": "word_count > 100",
"groupingField": "user_id",
"outputFields": ["*"],
"limit": 100
}
rsp = self.vector_client.vector_search(payload)
assert rsp['code'] == 0
# assert no dup user_id
user_ids = [r["user_id"] for r in rsp['data']]
assert len(user_ids) == len(set(user_ids))
@pytest.mark.parametrize("insert_round", [1])
@pytest.mark.parametrize("auto_id", [True])
@pytest.mark.parametrize("is_partition_key", [True])
@pytest.mark.parametrize("enable_dynamic_schema", [True])
@pytest.mark.parametrize("nb", [3000])
@pytest.mark.parametrize("dim", [128])
@pytest.mark.parametrize("nq", [1, 2])
@pytest.mark.parametrize("metric_type", ['COSINE', "L2", "IP"])
def test_search_vector_with_float_vector_datatype(self, nb, dim, insert_round, auto_id,
is_partition_key, enable_dynamic_schema, nq, metric_type):
"""
Insert a vector with a simple payload
"""
# create a collection
name = gen_collection_name()
payload = {
"collectionName": name,
"schema": {
"autoId": auto_id,
"enableDynamicField": enable_dynamic_schema,
"fields": [
{"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
{"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": is_partition_key,
"elementTypeParams": {}},
{"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
{"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
{"fieldName": "float_vector", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}},
]
},
"indexParams": [
{"fieldName": "float_vector", "indexName": "float_vector", "metricType": metric_type},
]
}
rsp = self.collection_client.collection_create(payload)
assert rsp['code'] == 0
rsp = self.collection_client.collection_describe(name)
logger.info(f"rsp: {rsp}")
assert rsp['code'] == 0
# insert data
for i in range(insert_round):
data = []
for i in range(nb):
if auto_id:
tmp = {
"user_id": i % 100,
"word_count": i,
"book_describe": f"book_{i}",
"float_vector": gen_vector(datatype="FloatVector", dim=dim),
}
else:
tmp = {
"book_id": i,
"user_id": i % 100,
"word_count": i,
"book_describe": f"book_{i}",
"float_vector": gen_vector(datatype="FloatVector", dim=dim),
}
if enable_dynamic_schema:
tmp.update({f"dynamic_field_{i}": i})
data.append(tmp)
payload = {
"collectionName": name,
"data": data,
}
rsp = self.vector_client.vector_insert(payload)
assert rsp['code'] == 0
assert rsp['data']['insertCount'] == nb
# search data
payload = {
"collectionName": name,
"data": [gen_vector(datatype="FloatVector", dim=dim) for _ in range(nq)],
"filter": "word_count > 100",
"groupingField": "user_id",
"outputFields": ["*"],
"limit": 100,
}
rsp = self.vector_client.vector_search(payload)
assert rsp['code'] == 0
assert len(rsp['data']) == 100 * nq
@pytest.mark.parametrize("insert_round", [1, 10])
@pytest.mark.parametrize("auto_id", [True, False])
@pytest.mark.parametrize("is_partition_key", [True, False])
@pytest.mark.parametrize("enable_dynamic_schema", [True])
@pytest.mark.parametrize("nb", [3000])
@pytest.mark.parametrize("dim", [128])
@pytest.mark.parametrize("groupingField", ['user_id', None])
@pytest.mark.parametrize("sparse_format", ['dok', 'coo'])
def test_search_vector_with_sparse_float_vector_datatype(self, nb, dim, insert_round, auto_id,
is_partition_key, enable_dynamic_schema, groupingField,
sparse_format):
"""
Insert a vector with a simple payload
"""
# create a collection
name = gen_collection_name()
payload = {
"collectionName": name,
"schema": {
"autoId": auto_id,
"enableDynamicField": enable_dynamic_schema,
"fields": [
{"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
{"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": is_partition_key,
"elementTypeParams": {}},
{"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
{"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
{"fieldName": "sparse_float_vector", "dataType": "SparseFloatVector"},
]
},
"indexParams": [
{"fieldName": "sparse_float_vector", "indexName": "sparse_float_vector", "metricType": "IP",
"params": {"index_type": "SPARSE_INVERTED_INDEX", "drop_ratio_build": "0.2"}}
]
}
rsp = self.collection_client.collection_create(payload)
assert rsp['code'] == 0
rsp = self.collection_client.collection_describe(name)
logger.info(f"rsp: {rsp}")
assert rsp['code'] == 0
# insert data
for i in range(insert_round):
data = []
for j in range(nb):
idx = i * nb + j
if auto_id:
tmp = {
"user_id": idx % 100,
"word_count": j,
"book_describe": f"book_{idx}",
"sparse_float_vector": gen_vector(datatype="SparseFloatVector", dim=dim,
sparse_format=sparse_format),
}
else:
tmp = {
"book_id": idx,
"user_id": idx % 100,
"word_count": j,
"book_describe": f"book_{idx}",
"sparse_float_vector": gen_vector(datatype="SparseFloatVector", dim=dim,
sparse_format=sparse_format),
}
if enable_dynamic_schema:
tmp.update({f"dynamic_field_{i}": i})
data.append(tmp)
payload = {
"collectionName": name,
"data": data,
}
rsp = self.vector_client.vector_insert(payload)
assert rsp['code'] == 0
assert rsp['data']['insertCount'] == nb
# search data
payload = {
"collectionName": name,
"data": [gen_vector(datatype="SparseFloatVector", dim=dim, sparse_format="dok")],
"filter": "word_count > 100",
"outputFields": ["*"],
"searchParams": {
"metricType": "IP",
"params": {
"drop_ratio_search": "0.2",
}
},
"limit": 500,
}
if groupingField:
payload["groupingField"] = groupingField
rsp = self.vector_client.vector_search(payload)
assert rsp['code'] == 0
@pytest.mark.parametrize("insert_round", [1])
@pytest.mark.parametrize("auto_id", [True, False])
@pytest.mark.parametrize("is_partition_key", [True, False])
@pytest.mark.parametrize("enable_dynamic_schema", [True])
@pytest.mark.parametrize("nb", [3000])
@pytest.mark.parametrize("dim", [128])
@pytest.mark.parametrize("groupingField", ['user_id', None])
@pytest.mark.parametrize("tokenizer", ['standard'])
def test_search_vector_for_en_full_text_search(self, nb, dim, insert_round, auto_id,
is_partition_key, enable_dynamic_schema, groupingField, tokenizer):
"""
Insert a vector with a simple payload
"""
# create a collection
name = gen_collection_name()
payload = {
"collectionName": name,
"schema": {
"autoId": auto_id,
"enableDynamicField": enable_dynamic_schema,
"fields": [
{"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
{"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": is_partition_key,
"elementTypeParams": {}},
{"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
{"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
{"fieldName": "document_content", "dataType": "VarChar",
"elementTypeParams": {"max_length": "1000", "enable_analyzer": True,
"analyzer_params": {
"tokenizer": tokenizer,
},
"enable_match": True}},
{"fieldName": "sparse_vector", "dataType": "SparseFloatVector"},
],
"functions": [
{
"name": "bm25_fn",
"type": "BM25",
"inputFieldNames": ["document_content"],
"outputFieldNames": ["sparse_vector"],
"params": {}
}
]
},
"indexParams": [
{"fieldName": "sparse_vector", "indexName": "sparse_vector", "metricType": "BM25",
"params": {"index_type": "SPARSE_INVERTED_INDEX"}}
]
}
rsp = self.collection_client.collection_create(payload)
assert rsp['code'] == 0
rsp = self.collection_client.collection_describe(name)
logger.info(f"rsp: {rsp}")
assert rsp['code'] == 0
if tokenizer == 'standard':
fake = fake_en
elif tokenizer == 'jieba':
fake = fake_zh
else:
raise Exception("Invalid tokenizer")
# insert data
for i in range(insert_round):
data = []
for j in range(nb):
idx = i * nb + j
if auto_id:
tmp = {
"user_id": idx % 100,
"word_count": j,
"book_describe": f"book_{idx}",
"document_content": fake.text().lower(),
}
else:
tmp = {
"book_id": idx,
"user_id": idx % 100,
"word_count": j,
"book_describe": f"book_{idx}",
"document_content": fake.text().lower(),
}
if enable_dynamic_schema:
tmp.update({f"dynamic_field_{i}": i})
data.append(tmp)
payload = {
"collectionName": name,
"data": data,
}
rsp = self.vector_client.vector_insert(payload)
assert rsp['code'] == 0
assert rsp['data']['insertCount'] == nb
assert rsp['code'] == 0
# search data
payload = {
"collectionName": name,
"data": [fake.text().lower() for _ in range(1)],
"filter": "word_count > 100",
"outputFields": ["*"],
"searchParams": {
"params": {
"drop_ratio_search": "0.2",
}
},
"limit": 500,
}
if groupingField:
payload["groupingField"] = groupingField
rsp = self.vector_client.vector_search(payload)
assert rsp['code'] == 0
assert len(rsp['data']) > 0
@pytest.mark.parametrize("insert_round", [1])
@pytest.mark.parametrize("auto_id", [True, False])
@pytest.mark.parametrize("is_partition_key", [True, False])
@pytest.mark.parametrize("enable_dynamic_schema", [True])
@pytest.mark.parametrize("nb", [3000])
@pytest.mark.parametrize("dim", [128])
@pytest.mark.parametrize("groupingField", ['user_id', None])
@pytest.mark.parametrize("tokenizer", ['jieba'])
@pytest.mark.xfail(reason="issue: https://github.com/milvus-io/milvus/issues/36751")
def test_search_vector_for_zh_full_text_search(self, nb, dim, insert_round, auto_id,
is_partition_key, enable_dynamic_schema, groupingField, tokenizer):
"""
Insert a vector with a simple payload
"""
# create a collection
name = gen_collection_name()
payload = {
"collectionName": name,
"schema": {
"autoId": auto_id,
"enableDynamicField": enable_dynamic_schema,
"fields": [
{"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
{"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": is_partition_key,
"elementTypeParams": {}},
{"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
{"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
{"fieldName": "document_content", "dataType": "VarChar",
"elementTypeParams": {"max_length": "1000", "enable_analyzer": True,
"analyzer_params": {
"tokenizer": tokenizer,
},
"enable_match": True}},
{"fieldName": "sparse_vector", "dataType": "SparseFloatVector"},
],
"functions": [
{
"name": "bm25_fn",
"type": "BM25",
"inputFieldNames": ["document_content"],
"outputFieldNames": ["sparse_vector"],
"params": {}
}
]
},
"indexParams": [
{"fieldName": "sparse_vector", "indexName": "sparse_vector", "metricType": "BM25",
"params": {"index_type": "SPARSE_INVERTED_INDEX"}}
]
}
rsp = self.collection_client.collection_create(payload)
assert rsp['code'] == 0
rsp = self.collection_client.collection_describe(name)
logger.info(f"rsp: {rsp}")
assert rsp['code'] == 0
if tokenizer == 'standard':
fake = fake_en
elif tokenizer == 'jieba':
fake = fake_zh
else:
raise Exception("Invalid tokenizer")
# insert data
for i in range(insert_round):
data = []
for j in range(nb):
idx = i * nb + j
if auto_id:
tmp = {
"user_id": idx % 100,
"word_count": j,
"book_describe": f"book_{idx}",
"document_content": fake.text().lower(),
}
else:
tmp = {
"book_id": idx,
"user_id": idx % 100,
"word_count": j,
"book_describe": f"book_{idx}",
"document_content": fake.text().lower(),
}
if enable_dynamic_schema:
tmp.update({f"dynamic_field_{i}": i})
data.append(tmp)
payload = {
"collectionName": name,
"data": data,
}
rsp = self.vector_client.vector_insert(payload)
assert rsp['code'] == 0
assert rsp['data']['insertCount'] == nb
assert rsp['code'] == 0
# search data
payload = {
"collectionName": name,
"data": [fake.text().lower() for _ in range(2)],
"filter": "word_count > 100",
"outputFields": ["*"],
"searchParams": {
"params": {
"drop_ratio_search": "0.2",
}
},
"limit": 500,
}
if groupingField:
payload["groupingField"] = groupingField
rsp = self.vector_client.vector_search(payload)
assert rsp['code'] == 0
assert len(rsp['data']) > 0
@pytest.mark.parametrize("insert_round", [2])
@pytest.mark.parametrize("auto_id", [True])
@pytest.mark.parametrize("is_partition_key", [True])
@pytest.mark.parametrize("enable_dynamic_schema", [True])
@pytest.mark.parametrize("nb", [3000])
@pytest.mark.parametrize("dim", [128])
@pytest.mark.parametrize("metric_type", ['HAMMING'])
def test_search_vector_with_binary_vector_datatype(self, metric_type, nb, dim, insert_round, auto_id,
is_partition_key, enable_dynamic_schema):
"""
Insert a vector with a simple payload
"""
# create a collection
name = gen_collection_name()
payload = {
"collectionName": name,
"schema": {
"autoId": auto_id,
"enableDynamicField": enable_dynamic_schema,
"fields": [
{"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
{"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": is_partition_key,
"elementTypeParams": {}},
{"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
{"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
{"fieldName": "binary_vector", "dataType": "BinaryVector", "elementTypeParams": {"dim": f"{dim}"}}
]
},
"indexParams": [
{"fieldName": "binary_vector", "indexName": "binary_vector", "metricType": metric_type,
"params": {"index_type": "BIN_IVF_FLAT", "nlist": "512"}}
]
}
rsp = self.collection_client.collection_create(payload)
assert rsp['code'] == 0
rsp = self.collection_client.collection_describe(name)
logger.info(f"rsp: {rsp}")
assert rsp['code'] == 0
# insert data
for i in range(insert_round):
data = []
for i in range(nb):
if auto_id:
tmp = {
"user_id": i % 100,
"word_count": i,
"book_describe": f"book_{i}",
"binary_vector": gen_vector(datatype="BinaryVector", dim=dim),
}
else:
tmp = {
"book_id": i,
"user_id": i % 100,
"word_count": i,
"book_describe": f"book_{i}",
"binary_vector": gen_vector(datatype="BinaryVector", dim=dim),
}
if enable_dynamic_schema:
tmp.update({f"dynamic_field_{i}": i})
data.append(tmp)
payload = {
"collectionName": name,
"data": data,
}
rsp = self.vector_client.vector_insert(payload)
assert rsp['code'] == 0
assert rsp['data']['insertCount'] == nb
# flush data
c = Collection(name)
c.flush()
time.sleep(5)
# wait for index
rsp = self.index_client.index_describe(collection_name=name, index_name="binary_vector")
# search data
payload = {
"collectionName": name,
"data": [gen_vector(datatype="BinaryVector", dim=dim)],
"filter": "word_count > 100",
"outputFields": ["*"],
"limit": 100,
}
rsp = self.vector_client.vector_search(payload)
assert rsp['code'] == 0
assert len(rsp['data']) == 100
@pytest.mark.parametrize("metric_type", ["IP", "L2", "COSINE"])
def test_search_vector_with_simple_payload(self, metric_type):
"""
Search a vector with a simple payload
"""
name = gen_collection_name()
self.name = name
self.init_collection(name, metric_type=metric_type)
# search data
dim = 128
vector_to_search = preprocessing.normalize([np.array([random.random() for i in range(dim)])])[0].tolist()
payload = {
"collectionName": name,
"data": [vector_to_search],
}
rsp = self.vector_client.vector_search(payload)
assert rsp['code'] == 0
res = rsp['data']
logger.info(f"res: {len(res)}")
limit = int(payload.get("limit", 100))
assert len(res) == limit
ids = [item['id'] for item in res]
assert len(ids) == len(set(ids))
distance = [item['distance'] for item in res]
if metric_type == "L2":
assert distance == sorted(distance)
if metric_type == "IP" or metric_type == "COSINE":
assert distance == sorted(distance, reverse=True)
    @pytest.mark.parametrize("sum_limit_offset", [16384, 16385])
    @pytest.mark.xfail(reason="")
    def test_search_vector_with_exceed_sum_limit_offset(self, sum_limit_offset):
        """
        Search with limit + offset at / just above the server-side cap
        (constant.MAX_SUM_OFFSET_AND_LIMIT) and expect error code 65535 once
        the cap is exceeded.

        NOTE(review): marked xfail with an empty reason; the request also uses
        a "vector" key while the other search tests use "data" — presumably a
        leftover from an older API shape, confirm against the v2 REST spec.
        """
        max_search_sum_limit_offset = constant.MAX_SUM_OFFSET_AND_LIMIT
        name = gen_collection_name()
        self.name = name
        # make sure the collection holds more rows than limit + offset
        nb = sum_limit_offset + 2000
        metric_type = "IP"
        limit = 100
        self.init_collection(name, metric_type=metric_type, nb=nb, batch_size=2000)
        # search data
        dim = 128
        vector_to_search = preprocessing.normalize([np.array([random.random() for i in range(dim)])])[0].tolist()
        payload = {
            "collectionName": name,
            "vector": vector_to_search,
            "limit": limit,
            "offset": sum_limit_offset - limit,
        }
        rsp = self.vector_client.vector_search(payload)
        if sum_limit_offset > max_search_sum_limit_offset:
            # beyond the cap the server rejects the request outright
            assert rsp['code'] == 65535
            return
        assert rsp['code'] == 0
        res = rsp['data']
        logger.info(f"res: {len(res)}")
        limit = int(payload.get("limit", 100))
        assert len(res) == limit
        # ids must be unique within one result page
        ids = [item['id'] for item in res]
        assert len(ids) == len(set(ids))
        # distances come back sorted in the metric's "better first" direction
        distance = [item['distance'] for item in res]
        if metric_type == "L2":
            assert distance == sorted(distance)
        if metric_type == "IP":
            assert distance == sorted(distance, reverse=True)
@pytest.mark.parametrize("offset", [0, 100])
@pytest.mark.parametrize("limit", [100])
@pytest.mark.parametrize("metric_type", ["L2", "IP", "COSINE"])
def test_search_vector_with_complex_payload(self, limit, offset, metric_type):
"""
Search a vector with a simple payload
"""
name = gen_collection_name()
self.name = name
nb = limit + offset + 3000
dim = 128
schema_payload, data = self.init_collection(name, dim=dim, nb=nb, metric_type=metric_type)
vector_field = schema_payload.get("vectorField")
# search data
vector_to_search = preprocessing.normalize([np.array([random.random() for i in range(dim)])])[0].tolist()
output_fields = get_common_fields_by_data(data, exclude_fields=[vector_field])
payload = {
"collectionName": name,
"data": [vector_to_search],
"outputFields": output_fields,
"filter": "uid >= 0",
"limit": limit,
"offset": offset,
}
rsp = self.vector_client.vector_search(payload)
if offset + limit > constant.MAX_SUM_OFFSET_AND_LIMIT:
assert rsp['code'] == 90126
return
assert rsp['code'] == 0
res = rsp['data']
logger.info(f"res: {len(res)}")
assert len(res) == limit
for item in res:
assert item.get("uid") >= 0
for field in output_fields:
assert field in item
@pytest.mark.parametrize("filter_expr", ["uid >= 0", "uid >= 0 and uid < 100", "uid in [1,2,3]"])
def test_search_vector_with_complex_int_filter(self, filter_expr):
"""
Search a vector with a simple payload
"""
name = gen_collection_name()
self.name = name
nb = 200
dim = 128
limit = 100
schema_payload, data = self.init_collection(name, dim=dim, nb=nb)
vector_field = schema_payload.get("vectorField")
# search data
vector_to_search = preprocessing.normalize([np.array([random.random() for i in range(dim)])])[0].tolist()
output_fields = get_common_fields_by_data(data, exclude_fields=[vector_field])
payload = {
"collectionName": name,
"data": [vector_to_search],
"outputFields": output_fields,
"filter": filter_expr,
"limit": limit,
"offset": 0,
}
rsp = self.vector_client.vector_search(payload)
assert rsp['code'] == 0
res = rsp['data']
logger.info(f"res: {len(res)}")
assert len(res) <= limit
for item in res:
uid = item.get("uid")
eval(filter_expr)
@pytest.mark.parametrize("filter_expr", ["name > \"placeholder\"", "name like \"placeholder%\""])
def test_search_vector_with_complex_varchar_filter(self, filter_expr):
"""
Search a vector with a simple payload
"""
name = gen_collection_name()
self.name = name
nb = 200
dim = 128
limit = 100
schema_payload, data = self.init_collection(name, dim=dim, nb=nb)
names = []
for item in data:
names.append(item.get("name"))
names.sort()
logger.info(f"names: {names}")
mid = len(names) // 2
prefix = names[mid][0:2]
vector_field = schema_payload.get("vectorField")
# search data
vector_to_search = preprocessing.normalize([np.array([random.random() for i in range(dim)])])[0].tolist()
output_fields = get_common_fields_by_data(data, exclude_fields=[vector_field])
filter_expr = filter_expr.replace("placeholder", prefix)
logger.info(f"filter_expr: {filter_expr}")
payload = {
"collectionName": name,
"data": [vector_to_search],
"outputFields": output_fields,
"filter": filter_expr,
"limit": limit,
"offset": 0,
}
rsp = self.vector_client.vector_search(payload)
assert rsp['code'] == 0
res = rsp['data']
logger.info(f"res: {len(res)}")
assert len(res) <= limit
for item in res:
name = item.get("name")
logger.info(f"name: {name}")
if ">" in filter_expr:
assert name > prefix
if "like" in filter_expr:
assert name.startswith(prefix)
@pytest.mark.parametrize("filter_expr", ["uid < 100 and name > \"placeholder\"",
"uid < 100 and name like \"placeholder%\""
])
def test_search_vector_with_complex_int64_varchar_and_filter(self, filter_expr):
"""
Search a vector with a simple payload
"""
name = gen_collection_name()
self.name = name
nb = 200
dim = 128
limit = 100
schema_payload, data = self.init_collection(name, dim=dim, nb=nb)
names = []
for item in data:
names.append(item.get("name"))
names.sort()
logger.info(f"names: {names}")
mid = len(names) // 2
prefix = names[mid][0:2]
vector_field = schema_payload.get("vectorField")
# search data
vector_to_search = preprocessing.normalize([np.array([random.random() for i in range(dim)])])[0].tolist()
output_fields = get_common_fields_by_data(data, exclude_fields=[vector_field])
filter_expr = filter_expr.replace("placeholder", prefix)
logger.info(f"filter_expr: {filter_expr}")
payload = {
"collectionName": name,
"data": [vector_to_search],
"outputFields": output_fields,
"filter": filter_expr,
"limit": limit,
"offset": 0,
}
rsp = self.vector_client.vector_search(payload)
assert rsp['code'] == 0
res = rsp['data']
logger.info(f"res: {len(res)}")
assert len(res) <= limit
for item in res:
uid = item.get("uid")
name = item.get("name")
logger.info(f"name: {name}")
uid_expr = filter_expr.split("and")[0]
assert eval(uid_expr) is True
varchar_expr = filter_expr.split("and")[1]
if ">" in varchar_expr:
assert name > prefix
if "like" in varchar_expr:
assert name.startswith(prefix)
@pytest.mark.parametrize("consistency_level", ["Strong", "Bounded", "Eventually", "Session"])
def test_search_vector_with_consistency_level(self, consistency_level):
"""
Search a vector with different consistency level
"""
name = gen_collection_name()
self.name = name
nb = 200
dim = 128
limit = 100
schema_payload, data = self.init_collection(name, dim=dim, nb=nb)
names = []
for item in data:
names.append(item.get("name"))
names.sort()
logger.info(f"names: {names}")
mid = len(names) // 2
prefix = names[mid][0:2]
vector_field = schema_payload.get("vectorField")
# search data
vector_to_search = preprocessing.normalize([np.array([random.random() for i in range(dim)])])[0].tolist()
output_fields = get_common_fields_by_data(data, exclude_fields=[vector_field])
payload = {
"collectionName": name,
"data": [vector_to_search],
"outputFields": output_fields,
"limit": limit,
"offset": 0,
"consistencyLevel": consistency_level
}
rsp = self.vector_client.vector_search(payload)
assert rsp['code'] == 0
res = rsp['data']
logger.info(f"res: {len(res)}")
assert len(res) == limit
    @pytest.mark.parametrize("metric_type", ["L2", "COSINE", "IP"])
    def test_search_vector_with_range_search(self, metric_type):
        """
        Range search for each metric type: derive a [radius, range_filter]
        window from the brute-force distance distribution of the inserted
        vectors, search with it, and verify every returned distance lies
        inside the window.
        """
        name = gen_collection_name()
        self.name = name
        nb = 3000
        dim = 128
        limit = 100
        schema_payload, data = self.init_collection(name, dim=dim, nb=nb, metric_type=metric_type)
        vector_field = schema_payload.get("vectorField")
        # search data
        vector_to_search = preprocessing.normalize([np.array([random.random() for i in range(dim)])])[0].tolist()
        # brute-force distances from the query to every inserted vector,
        # ordered by get_sorted_distance for the given metric
        training_data = [item[vector_field] for item in data]
        distance_sorted = get_sorted_distance(training_data, [vector_to_search], metric_type)
        r1, r2 = distance_sorted[0][nb // 2], distance_sorted[0][nb // 2 + limit + int(
            (0.5 * limit))]  # recall is not 100% so add 50% to make sure the range is more than limit
        # for L2 the radius acts as the upper bound (see the assertions
        # below), so the two bounds are given in the opposite order
        if metric_type == "L2":
            r1, r2 = r2, r1
        output_fields = get_common_fields_by_data(data, exclude_fields=[vector_field])
        logger.info(f"r1: {r1}, r2: {r2}")
        payload = {
            "collectionName": name,
            "data": [vector_to_search],
            "outputFields": output_fields,
            "limit": limit,
            "offset": 0,
            "searchParams": {
                "params": {
                    "radius": r1,
                    "range_filter": r2,
                }
            }
        }
        rsp = self.vector_client.vector_search(payload)
        assert rsp['code'] == 0
        res = rsp['data']
        logger.info(f"res: {len(res)}")
        # ANN recall is not exact, so only require 80% of the limit
        assert len(res) >= limit * 0.8
        # add buffer to the distance of comparison (widen the window by 1e-6
        # to tolerate floating-point noise)
        if metric_type == "L2":
            r1 = r1 + 10 ** -6
            r2 = r2 - 10 ** -6
        else:
            r1 = r1 - 10 ** -6
            r2 = r2 + 10 ** -6
        for item in res:
            distance = item.get("distance")
            # L2: radius r1 is the upper bound; COSINE/IP: r1 is the lower bound
            if metric_type == "L2":
                assert r1 > distance > r2
            else:
                assert r1 < distance < r2
@pytest.mark.parametrize("ignore_growing", [True, False])
def test_search_vector_with_ignore_growing(self, ignore_growing):
"""
Search a vector with range search with different metric type
"""
name = gen_collection_name()
self.name = name
metric_type = "COSINE"
nb = 1000
dim = 128
limit = 100
schema_payload, data = self.init_collection(name, dim=dim, nb=nb, metric_type=metric_type)
vector_field = schema_payload.get("vectorField")
# search data
vector_to_search = preprocessing.normalize([np.array([random.random() for i in range(dim)])])[0].tolist()
training_data = [item[vector_field] for item in data]
distance_sorted = get_sorted_distance(training_data, [vector_to_search], metric_type)
r1, r2 = distance_sorted[0][nb // 2], distance_sorted[0][
nb // 2 + limit + int((0.2 * limit))] # recall is not 100% so add 20% to make sure the range is correct
if metric_type == "L2":
r1, r2 = r2, r1
output_fields = get_common_fields_by_data(data, exclude_fields=[vector_field])
payload = {
"collectionName": name,
"data": [vector_to_search],
"outputFields": output_fields,
"limit": limit,
"offset": 0,
"searchParams": {
"ignoreGrowing": ignore_growing
}
}
rsp = self.vector_client.vector_search(payload)
assert rsp['code'] == 0
res = rsp['data']
logger.info(f"res: {len(res)}")
if ignore_growing is True:
assert len(res) == 0
else:
assert len(res) == limit
    @pytest.mark.parametrize("tokenizer", ["jieba", "standard"])
    def test_search_vector_with_text_match_filter(self, tokenizer):
        """
        Search with a text_match filter on analyzer-enabled varchar fields.

        For each text field the most frequent token of the inserted corpus
        is used in a text_match expression, and every returned hit is
        checked to actually contain that token.
        """
        # jieba tokenizer -> Chinese corpus; standard -> English corpus
        fake = fake_en
        language = "en"
        if tokenizer == "jieba":
            fake = fake_zh
            language = "zh"
        # create a collection
        dim = 128
        analyzer_params = {
            "tokenizer": tokenizer,
        }
        name = gen_collection_name()
        fields = [
            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
            FieldSchema(
                name="word",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                enable_match=True,
                is_partition_key=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(
                name="sentence",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                enable_match=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(
                name="paragraph",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                enable_match=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(
                name="text",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                enable_match=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
        ]
        schema = CollectionSchema(fields=fields, description="test collection")
        # collection is created via the pymilvus ORM; inserts and searches
        # below go through the REST client against the same collection
        collection = Collection(name=name, schema=schema
                                )
        rsp = self.collection_client.collection_describe(name)
        logger.info(f"rsp: {rsp}")
        assert rsp['code'] == 0
        data_size = 3000
        batch_size = 1000
        # insert data
        data = [
            {
                "id": i,
                "word": fake.word().lower(),
                "sentence": fake.sentence().lower(),
                "paragraph": fake.sentence().lower(),
                "text": fake.text().lower(),
                "emb": [random.random() for _ in range(dim)]
            }
            for i in range(data_size)
        ]
        df = pd.DataFrame(data)
        text_fields = ["word", "sentence", "paragraph", "text"]
        # wf_map: per-field token frequency counter (supports most_common)
        wf_map = {}
        for field in text_fields:
            wf_map[field] = analyze_documents(df[field].tolist(), language=language)
        # insert in batches through the REST client
        for i in range(0, data_size, batch_size):
            tmp = data[i:i + batch_size]
            payload = {
                "collectionName": name,
                "data": tmp,
            }
            rsp = self.vector_client.vector_insert(payload)
            assert rsp['code'] == 0
            assert rsp['data']['insertCount'] == len(tmp)
        collection.create_index(
            "emb",
            {"index_type": "IVF_SQ8", "metric_type": "L2", "params": {"nlist": 64}},
        )
        collection.load()
        # give the load a moment to settle before searching
        time.sleep(5)
        vector_to_search = [[random.random() for _ in range(dim)]]
        for field in text_fields:
            # filter on the most frequent token of this field
            token = wf_map[field].most_common()[0][0]
            expr = f"text_match({field}, '{token}')"
            logger.info(f"expr: {expr}")
            rsp = self.vector_client.vector_search(
                {"collectionName": name, "data": vector_to_search, "filter": f"{expr}", "outputFields": ["*"]})
            assert rsp['code'] == 0, rsp
            for d in rsp['data']:
                assert token in d[field]
    @pytest.mark.parametrize("tokenizer", ["standard"])
    def test_search_vector_with_phrase_match_filter(self, tokenizer):
        """
        Search a vector with phrase match filter.

        Inserts a faker-generated corpus plus `target_data_len` extra rows
        that contain a known two-token phrase per field, then searches with
        phrase_match on that phrase and verifies every hit contains it and
        that at least the injected rows are returned.
        """
        fake = fake_en
        language = "en"
        # only "standard" is parametrized today; the jieba branch keeps the
        # test ready for a Chinese corpus
        if tokenizer == "jieba":
            fake = fake_zh
            language = "zh"
        # create a collection
        dim = 128
        analyzer_params = {
            "tokenizer": tokenizer,
        }
        name = gen_collection_name()
        fields = [
            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
            FieldSchema(
                name="word",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                enable_match=True,
                is_partition_key=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(
                name="sentence",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                enable_match=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(
                name="paragraph",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                enable_match=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(
                name="text",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                enable_match=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
        ]
        schema = CollectionSchema(fields=fields, description="test collection")
        # collection is created via the pymilvus ORM; inserts and searches
        # below go through the REST client against the same collection
        collection = Collection(name=name, schema=schema
                                )
        rsp = self.collection_client.collection_describe(name)
        logger.info(f"rsp: {rsp}")
        assert rsp['code'] == 0
        data_size = 3000
        batch_size = 1000
        # insert data
        data = [
            {
                "id": i,
                "word": fake.word().lower(),
                "sentence": fake.sentence().lower(),
                "paragraph": fake.sentence().lower(),
                "text": fake.text().lower(),
                "emb": [random.random() for _ in range(dim)]
            }
            for i in range(data_size)
        ]
        df = pd.DataFrame(data)
        text_fields = ["word", "sentence", "paragraph", "text"]
        # wf_map: per-field token frequency counter (supports most_common)
        wf_map = {}
        for field in text_fields:
            wf_map[field] = analyze_documents(df[field].tolist(), language=language)
        for i in range(0, data_size, batch_size):
            tmp = data[i:i + batch_size]
            payload = {
                "collectionName": name,
                "data": tmp,
            }
            rsp = self.vector_client.vector_insert(payload)
            assert rsp['code'] == 0
            assert rsp['data']['insertCount'] == len(tmp)
        # add additional data to make sure query result is not empty
        target_data_len = 10
        # per-field phrase: the two most frequent tokens joined by a space
        phrase_map = {}
        for field in text_fields:
            phrase = wf_map[field].most_common()[0][0] + " " + wf_map[field].most_common()[1][0]
            phrase_map[field] = phrase
        tmp = [
            {
                "id": i,
                "word": phrase_map["word"],
                "sentence": phrase_map["sentence"],
                "paragraph": phrase_map["paragraph"],
                "text": phrase_map["text"],
                "emb": [random.random() for _ in range(dim)]
            }
            for i in range(data_size, data_size + target_data_len)
        ]
        payload = {
            "collectionName": name,
            "data": tmp,
        }
        rsp = self.vector_client.vector_insert(payload)
        assert rsp['code'] == 0
        assert rsp['data']['insertCount'] == len(tmp)
        collection.create_index(
            "emb",
            {"index_type": "IVF_SQ8", "metric_type": "L2", "params": {"nlist": 64}},
        )
        collection.load()
        # give the load a moment to settle before searching
        time.sleep(5)
        vector_to_search = [[random.random() for _ in range(dim)]]
        for field in text_fields:
            phrase = wf_map[field].most_common()[0][0] + " " + wf_map[field].most_common()[1][0]
            # exact match
            expr = f"phrase_match({field}, '{phrase}')"
            logger.info(f"expr: {expr}")
            rsp = self.vector_client.vector_search(
                {"collectionName": name, "data": vector_to_search, "filter": f"{expr}", "outputFields": ["*"]})
            assert rsp['code'] == 0, rsp
            for d in rsp['data']:
                # strip punctuation so the raw substring check lines up with
                # the analyzer's token stream
                d[field] = re.sub(r'[^\w\s]','', d[field])
                assert phrase in d[field]
            # at least the injected phrase rows must come back
            assert len(rsp['data']) >= target_data_len * len(vector_to_search)
@pytest.mark.parametrize("insert_round", [1])
@pytest.mark.parametrize("auto_id", [True])
@pytest.mark.parametrize("is_partition_key", [True])
@pytest.mark.parametrize("enable_dynamic_schema", [True])
@pytest.mark.parametrize("nb", [3000])
@pytest.mark.parametrize("dim", [128])
@pytest.mark.parametrize("nq", [1, 2])
@pytest.mark.parametrize("metric_type", ['COSINE', "L2", "IP"])
def test_search_vector_with_default_none(self, nb, dim, insert_round, auto_id, is_partition_key,
enable_dynamic_schema, nq, metric_type):
"""
Insert a vector with default and none
"""
# create a collection
name = gen_collection_name()
payload = {
"collectionName": name,
"schema": {
"autoId": auto_id,
"enableDynamicField": enable_dynamic_schema,
"fields": [
{"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
{"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": is_partition_key,
"elementTypeParams": {}, "defaultValue": 8888},
{"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}, "nullable": True},
{"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"},
"nullable": True, "defaultValue": "8888"},
{"fieldName": "float_vector", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}},
]
},
"indexParams": [
{"fieldName": "float_vector", "indexName": "float_vector", "metricType": metric_type},
]
}
rsp = self.collection_client.collection_create(payload)
assert rsp['code'] == 0
rsp = self.collection_client.collection_describe(name)
logger.info(f"rsp: {rsp}")
assert rsp['code'] == 0
# insert data
for i in range(insert_round):
data = []
for i in range(nb):
if auto_id:
tmp = {
"user_id": i % 100 if i % 2 else None,
"word_count": None,
"book_describe": None,
"float_vector": gen_vector(datatype="FloatVector", dim=dim),
}
else:
tmp = {
"book_id": i,
"user_id": i % 100 if i % 2 else None,
"word_count": None,
"book_describe": None,
"float_vector": gen_vector(datatype="FloatVector", dim=dim),
}
if enable_dynamic_schema:
tmp.update({f"dynamic_field_{i}": i})
data.append(tmp)
payload = {
"collectionName": name,
"data": data,
}
rsp = self.vector_client.vector_insert(payload)
assert rsp['code'] == 0
assert rsp['data']['insertCount'] == nb
# search data
payload = {
"collectionName": name,
"data": [gen_vector(datatype="FloatVector", dim=dim) for _ in range(nq)],
"filter": "book_id >= 0",
# "groupingField": "user_id",
"outputFields": ["*"],
"limit": 100,
}
rsp = self.vector_client.vector_search(payload)
assert rsp['code'] == 0
assert rsp['data'][0]['book_describe'] == "8888"
assert rsp['data'][0]['word_count'] is None
assert len(rsp['data']) == 100 * nq
@pytest.mark.L0
class TestSearchVectorNegative(TestBase):
    """Negative cases for the vector search REST API: missing or invalid
    parameters must be rejected with the expected error codes."""

    @pytest.mark.parametrize("metric_type", ["L2"])
    def test_search_vector_without_required_data_param(self, metric_type):
        """
        Search without the required `data` param must fail with code 1802.

        Fix: removed the unused `dim` local.
        """
        name = gen_collection_name()
        self.name = name
        self.init_collection(name, metric_type=metric_type)
        # search payload deliberately omits "data"
        payload = {
            "collectionName": name,
        }
        rsp = self.vector_client.vector_search(payload)
        assert rsp['code'] == 1802

    @pytest.mark.parametrize("invalid_metric_type", ["L2", "IP", "UNSUPPORTED"])
    @pytest.mark.xfail(reason="issue: https://github.com/milvus-io/milvus/issues/37138")
    def test_search_vector_with_invalid_metric_type(self, invalid_metric_type):
        """
        Searching a COSINE collection with a mismatched or unsupported
        metricType must fail with a non-zero code.
        """
        name = gen_collection_name()
        self.name = name
        self.init_collection(name, metric_type="COSINE")
        # search data
        dim = 128
        payload = {
            "collectionName": name,
            "data": [preprocessing.normalize([np.array([random.random() for i in range(dim)])])[0].tolist()],
            "searchParams": {
                "metricType": invalid_metric_type
            }
        }
        rsp = self.vector_client.vector_search(payload)
        assert rsp['code'] != 0

    @pytest.mark.parametrize("limit", [0, 16385])
    def test_search_vector_with_invalid_limit(self, limit):
        """
        A limit of 0 or above the server maximum must fail with code 65535.
        """
        name = gen_collection_name()
        self.name = name
        dim = 128
        schema_payload, data = self.init_collection(name, dim=dim)
        vector_field = schema_payload.get("vectorField")
        # search data
        vector_to_search = preprocessing.normalize([np.array([random.random() for i in range(dim)])])[0].tolist()
        output_fields = get_common_fields_by_data(data, exclude_fields=[vector_field])
        payload = {
            "collectionName": name,
            "data": [vector_to_search],
            "outputFields": output_fields,
            "filter": "uid >= 0",
            "limit": limit,
            "offset": 0,
        }
        rsp = self.vector_client.vector_search(payload)
        assert rsp['code'] == 65535

    @pytest.mark.parametrize("offset", [-1, 100_001])
    def test_search_vector_with_invalid_offset(self, offset):
        """
        A negative or out-of-range offset must fail with code 65535.

        Fix: removed the redundant second `dim = 128` assignment.
        """
        name = gen_collection_name()
        self.name = name
        dim = 128
        schema_payload, data = self.init_collection(name, dim=dim)
        vector_field = schema_payload.get("vectorField")
        # search data
        vector_to_search = preprocessing.normalize([np.array([random.random() for i in range(dim)])])[0].tolist()
        output_fields = get_common_fields_by_data(data, exclude_fields=[vector_field])
        payload = {
            "collectionName": name,
            "data": [vector_to_search],
            "outputFields": output_fields,
            "filter": "uid >= 0",
            "limit": 100,
            "offset": offset,
        }
        rsp = self.vector_client.vector_search(payload)
        assert rsp['code'] == 65535

    def test_search_vector_with_invalid_collection_name(self):
        """
        Searching a non-existent collection must fail with code 100 and a
        "can't find collection" message.
        """
        name = gen_collection_name()
        self.name = name
        dim = 128
        schema_payload, data = self.init_collection(name, dim=dim)
        vector_field = schema_payload.get("vectorField")
        # search data
        vector_to_search = preprocessing.normalize([np.array([random.random() for i in range(dim)])])[0].tolist()
        output_fields = get_common_fields_by_data(data, exclude_fields=[vector_field])
        payload = {
            "collectionName": "invalid_collection_name",
            "data": [vector_to_search],
            "outputFields": output_fields,
            "filter": "uid >= 0",
            "limit": 100,
            "offset": 0,
        }
        rsp = self.vector_client.vector_search(payload)
        assert rsp['code'] == 100
        assert "can't find collection" in rsp['message']
@pytest.mark.L0
class TestAdvancedSearchVector(TestBase):
    """Tests for the advanced (multi-vector-field) search REST endpoint."""

    @pytest.mark.parametrize("insert_round", [1])
    @pytest.mark.parametrize("auto_id", [True])
    @pytest.mark.parametrize("is_partition_key", [True])
    @pytest.mark.parametrize("enable_dynamic_schema", [True])
    @pytest.mark.parametrize("nb", [3000])
    @pytest.mark.parametrize("dim", [2])
    def test_advanced_search_vector_with_multi_float32_vector_datatype(self, nb, dim, insert_round, auto_id,
                                                                       is_partition_key, enable_dynamic_schema):
        """
        Create a collection with two float vector fields, insert data, then
        run an advanced (two sub-request) search with RRF reranking and
        verify the result and topk counts.

        Fix: the outer insert-round counter `i` was immediately shadowed by
        the inner row index `i`; the outer loop now uses `_`.
        """
        # create a collection
        name = gen_collection_name()
        payload = {
            "collectionName": name,
            "schema": {
                "autoId": auto_id,
                "enableDynamicField": enable_dynamic_schema,
                "fields": [
                    {"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
                    {"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": is_partition_key,
                     "elementTypeParams": {}},
                    {"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
                    {"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
                    {"fieldName": "float_vector_1", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}},
                    {"fieldName": "float_vector_2", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}},
                ]
            },
            "indexParams": [
                {"fieldName": "float_vector_1", "indexName": "float_vector_1", "metricType": "COSINE"},
                {"fieldName": "float_vector_2", "indexName": "float_vector_2", "metricType": "COSINE"},
            ]
        }
        rsp = self.collection_client.collection_create(payload)
        assert rsp['code'] == 0
        rsp = self.collection_client.collection_describe(name)
        logger.info(f"rsp: {rsp}")
        assert rsp['code'] == 0
        # insert data
        for _ in range(insert_round):
            data = []
            for i in range(nb):
                if auto_id:
                    tmp = {
                        "user_id": i % 100,
                        "word_count": i,
                        "book_describe": f"book_{i}",
                        "float_vector_1": gen_vector(datatype="FloatVector", dim=dim),
                        "float_vector_2": gen_vector(datatype="FloatVector", dim=dim),
                    }
                else:
                    tmp = {
                        "book_id": i,
                        "user_id": i % 100,
                        "word_count": i,
                        "book_describe": f"book_{i}",
                        "float_vector_1": gen_vector(datatype="FloatVector", dim=dim),
                        "float_vector_2": gen_vector(datatype="FloatVector", dim=dim),
                    }
                if enable_dynamic_schema:
                    tmp.update({f"dynamic_field_{i}": i})
                data.append(tmp)
            payload = {
                "collectionName": name,
                "data": data,
            }
            rsp = self.vector_client.vector_insert(payload)
            assert rsp['code'] == 0
            assert rsp['data']['insertCount'] == nb
        # advanced search: one sub-request per vector field, fused with RRF
        payload = {
            "collectionName": name,
            "search": [{
                "data": [gen_vector(datatype="FloatVector", dim=dim)],
                "annsField": "float_vector_1",
                "limit": 10,
                "outputFields": ["*"]
            },
                {
                    "data": [gen_vector(datatype="FloatVector", dim=dim)],
                    "annsField": "float_vector_2",
                    "limit": 10,
                    "outputFields": ["*"]
                }
            ],
            "rerank": {
                "strategy": "rrf",
                "params": {
                    "k": 10,
                }
            },
            "limit": 10,
            "outputFields": ["user_id", "word_count", "book_describe"]
        }
        rsp = self.vector_client.vector_advanced_search(payload)
        assert rsp['code'] == 0
        assert len(rsp['data']) == 10
        # a single query vector per sub-request -> one topk entry of 10
        assert len(rsp['topks']) == 1
        assert rsp['topks'][0] == 10
@pytest.mark.L0
class TestHybridSearchVector(TestBase):
@pytest.mark.parametrize("insert_round", [1])
@pytest.mark.parametrize("auto_id", [True])
@pytest.mark.parametrize("is_partition_key", [True])
@pytest.mark.parametrize("enable_dynamic_schema", [True])
@pytest.mark.parametrize("nb", [3000])
@pytest.mark.parametrize("dim", [2])
def test_hybrid_search_vector_with_multi_float32_vector_datatype(self, nb, dim, insert_round, auto_id,
is_partition_key, enable_dynamic_schema):
"""
Insert a vector with a simple payload
"""
# create a collection
name = gen_collection_name()
payload = {
"collectionName": name,
"schema": {
"autoId": auto_id,
"enableDynamicField": enable_dynamic_schema,
"fields": [
{"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
{"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": is_partition_key,
"elementTypeParams": {}},
{"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
{"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
{"fieldName": "float_vector_1", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}},
{"fieldName": "float_vector_2", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}},
]
},
"indexParams": [
{"fieldName": "float_vector_1", "indexName": "float_vector_1", "metricType": "COSINE"},
{"fieldName": "float_vector_2", "indexName": "float_vector_2", "metricType": "COSINE"},
]
}
rsp = self.collection_client.collection_create(payload)
assert rsp['code'] == 0
rsp = self.collection_client.collection_describe(name)
logger.info(f"rsp: {rsp}")
assert rsp['code'] == 0
# insert data
for i in range(insert_round):
data = []
for i in range(nb):
if auto_id:
tmp = {
"user_id": i % 100,
"word_count": i,
"book_describe": f"book_{i}",
"float_vector_1": gen_vector(datatype="FloatVector", dim=dim),
"float_vector_2": gen_vector(datatype="FloatVector", dim=dim),
}
else:
tmp = {
"book_id": i,
"user_id": i % 100,
"word_count": i,
"book_describe": f"book_{i}",
"float_vector_1": gen_vector(datatype="FloatVector", dim=dim),
"float_vector_2": gen_vector(datatype="FloatVector", dim=dim),
}
if enable_dynamic_schema:
tmp.update({f"dynamic_field_{i}": i})
data.append(tmp)
payload = {
"collectionName": name,
"data": data,
}
rsp = self.vector_client.vector_insert(payload)
assert rsp['code'] == 0
assert rsp['data']['insertCount'] == nb
# advanced search data
payload = {
"collectionName": name,
"search": [{
"data": [gen_vector(datatype="FloatVector", dim=dim)],
"annsField": "float_vector_1",
"limit": 10,
"outputFields": ["*"]
},
{
"data": [gen_vector(datatype="FloatVector", dim=dim)],
"annsField": "float_vector_2",
"limit": 10,
"outputFields": ["*"]
}
],
"rerank": {
"strategy": "rrf",
"params": {
"k": 10,
}
},
"limit": 10,
"outputFields": ["user_id", "word_count", "book_describe"]
}
rsp = self.vector_client.vector_hybrid_search(payload)
assert rsp['code'] == 0
assert len(rsp['data']) == 10
    @pytest.mark.parametrize("insert_round", [1])
    @pytest.mark.parametrize("auto_id", [True])
    @pytest.mark.parametrize("is_partition_key", [True])
    @pytest.mark.parametrize("enable_dynamic_schema", [True])
    @pytest.mark.parametrize("nb", [3000])
    @pytest.mark.parametrize("dim", [128])
    @pytest.mark.parametrize("limit", [100])
    @pytest.mark.parametrize("offset", [100,200])
    def test_hybrid_search_vector_with_offset(self, nb, dim, insert_round, auto_id,
                                              is_partition_key, enable_dynamic_schema, limit, offset):
        """
        Test hybrid search with offset parameter.

        Pagination is verified by comparing the (limit, offset) search with
        the [offset:offset+limit] slice of an offset-free search that uses
        limit offset+limit; the primary-key overlap must be >= 80%.
        """
        # create a collection
        name = gen_collection_name()
        payload = {
            "collectionName": name,
            "schema": {
                "autoId": auto_id,
                "enableDynamicField": enable_dynamic_schema,
                "fields": [
                    {"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
                    {"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": is_partition_key,
                     "elementTypeParams": {}},
                    {"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
                    {"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
                    {"fieldName": "float_vector_1", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}},
                    {"fieldName": "float_vector_2", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}},
                ]
            },
            "indexParams": [
                {"fieldName": "float_vector_1", "indexName": "float_vector_1", "metricType": "COSINE"},
                {"fieldName": "float_vector_2", "indexName": "float_vector_2", "metricType": "COSINE"},
            ]
        }
        rsp = self.collection_client.collection_create(payload)
        assert rsp['code'] == 0
        # insert data
        for i in range(insert_round):
            data = []
            for i in range(nb):
                if auto_id:
                    tmp = {
                        "user_id": i % 100,
                        "word_count": i,
                        "book_describe": f"book_{i}",
                        "float_vector_1": gen_vector(datatype="FloatVector", dim=dim),
                        "float_vector_2": gen_vector(datatype="FloatVector", dim=dim),
                    }
                else:
                    tmp = {
                        "book_id": i,
                        "user_id": i % 100,
                        "word_count": i,
                        "book_describe": f"book_{i}",
                        "float_vector_1": gen_vector(datatype="FloatVector", dim=dim),
                        "float_vector_2": gen_vector(datatype="FloatVector", dim=dim),
                    }
                if enable_dynamic_schema:
                    tmp.update({f"dynamic_field_{i}": i})
                data.append(tmp)
            payload = {
                "collectionName": name,
                "data": data,
            }
            rsp = self.vector_client.vector_insert(payload)
            assert rsp['code'] == 0
            assert rsp['data']['insertCount'] == nb
        # reuse the same query vectors for both searches so the result
        # rankings are comparable
        float_vector_1 = gen_vector(datatype="FloatVector", dim=dim)
        float_vector_2 = gen_vector(datatype="FloatVector", dim=dim)
        # hybrid search with offset
        payload = {
            "collectionName": name,
            "search": [{
                "data": [float_vector_1],
                "annsField": "float_vector_1",
                "limit": offset + limit,
                "outputFields": ["*"]
            },
                {
                    "data": [float_vector_2],
                    "annsField": "float_vector_2",
                    "limit": offset + limit,
                    "outputFields": ["*"]
                }
            ],
            "rerank": {
                "strategy": "rrf",
                "params": {
                    "k": 10,
                }
            },
            "limit": limit,
            "offset": offset,
            "outputFields": ["book_id", "user_id", "word_count", "book_describe"]
        }
        rsp = self.vector_client.vector_hybrid_search(payload)
        # Check if offset + limit exceeds max allowed
        if offset + limit > constant.MAX_SUM_OFFSET_AND_LIMIT:
            assert rsp['code'] == 1
            assert "exceeds" in rsp['message'] or "invalid" in rsp['message'].lower()
        else:
            assert rsp['code'] == 0
            assert len(rsp['data']) <= limit
            # Verify offset works by comparing with search without offset
            payload_no_offset = {
                "collectionName": name,
                "search": [{
                    "data": [float_vector_1],
                    "annsField": "float_vector_1",
                    "limit": offset + limit,
                    "outputFields": ["*"]
                },
                    {
                        "data": [float_vector_2],
                        "annsField": "float_vector_2",
                        "limit": offset + limit,
                        "outputFields": ["*"]
                    }
                ],
                "rerank": {
                    "strategy": "rrf",
                    "params": {
                        "k": 10,
                    }
                },
                "limit": offset + limit,
                "outputFields": ["book_id", "user_id", "word_count", "book_describe"]
            }
            rsp_no_offset = self.vector_client.vector_hybrid_search(payload_no_offset)
            if rsp_no_offset['code'] == 0 and len(rsp_no_offset['data']) > offset:
                # Extract PKs from results with offset
                pks_with_offset = set()
                for item in rsp['data']:
                    if 'book_id' in item:
                        pks_with_offset.add(item['book_id'])
                # Extract PKs from the corresponding portion of results without offset
                pks_no_offset_expected = set()
                expected_results = rsp_no_offset['data'][offset:offset + limit]
                for item in expected_results:
                    if 'book_id' in item:
                        pks_no_offset_expected.add(item['book_id'])
                # Calculate intersection rate
                if len(pks_no_offset_expected) > 0:
                    intersection = pks_with_offset.intersection(pks_no_offset_expected)
                    intersection_rate = len(intersection) / len(pks_no_offset_expected)
                    logger.info(f"PK intersection rate: {intersection_rate:.2%}")
                    # The intersection rate should be at least 80%
                    assert intersection_rate >= 0.8, f"PK intersection rate {intersection_rate:.2%} is less than 80%"
                else:
                    # If no expected results, the offset results should also be empty
                    assert len(pks_with_offset) == 0
@pytest.mark.parametrize("auto_id", [True])
@pytest.mark.parametrize("is_partition_key", [True])
@pytest.mark.parametrize("enable_dynamic_schema", [True])
@pytest.mark.parametrize("nb", [1000])
@pytest.mark.parametrize("dim", [2])
@pytest.mark.parametrize("invalid_offset", [-1, -10, -100, "abc", [], {}])
def test_hybrid_search_vector_with_invalid_offset(self, nb, dim, auto_id,
is_partition_key, enable_dynamic_schema, invalid_offset):
"""
Test hybrid search with invalid offset values
"""
# create a collection
name = gen_collection_name()
payload = {
"collectionName": name,
"schema": {
"autoId": auto_id,
"enableDynamicField": enable_dynamic_schema,
"fields": [
{"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
{"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": is_partition_key,
"elementTypeParams": {}},
{"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
{"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
{"fieldName": "float_vector_1", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}},
{"fieldName": "float_vector_2", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}},
]
},
"indexParams": [
{"fieldName": "float_vector_1", "indexName": "float_vector_1", "metricType": "L2"},
{"fieldName": "float_vector_2", "indexName": "float_vector_2", "metricType": "L2"},
]
}
rsp = self.collection_client.collection_create(payload)
assert rsp['code'] == 0
# insert data
data = []
for i in range(nb):
if auto_id:
tmp = {
"user_id": i % 100,
"word_count": i,
"book_describe": f"book_{i}",
"float_vector_1": gen_vector(datatype="FloatVector", dim=dim),
"float_vector_2": gen_vector(datatype="FloatVector", dim=dim),
}
else:
tmp = {
"book_id": i,
"user_id": i % 100,
"word_count": i,
"book_describe": f"book_{i}",
"float_vector_1": gen_vector(datatype="FloatVector", dim=dim),
"float_vector_2": gen_vector(datatype="FloatVector", dim=dim),
}
if enable_dynamic_schema:
tmp.update({f"dynamic_field_{i}": i})
data.append(tmp)
payload = {
"collectionName": name,
"data": data,
}
rsp = self.vector_client.vector_insert(payload)
assert rsp['code'] == 0
# hybrid search with invalid offset
payload = {
"collectionName": name,
"search": [{
"data": [gen_vector(datatype="FloatVector", dim=dim)],
"annsField": "float_vector_1",
"limit": 10,
"outputFields": ["*"]
},
{
"data": [gen_vector(datatype="FloatVector", dim=dim)],
"annsField": "float_vector_2",
"limit": 10,
"outputFields": ["*"]
}
],
"rerank": {
"strategy": "rrf",
"params": {
"k": 10,
}
},
"limit": 10,
"offset": invalid_offset,
"outputFields": ["user_id", "word_count", "book_describe"]
}
rsp = self.vector_client.vector_hybrid_search(payload)
assert rsp['code'] != 0
assert "offset" in rsp['message'].lower() or "invalid" in rsp['message'].lower()
@pytest.mark.parametrize("nb", [5000])
@pytest.mark.parametrize("dim", [128])
@pytest.mark.parametrize("limit", [100])
@pytest.mark.parametrize("large_offset", [10000, 16385])
def test_hybrid_search_vector_with_large_offset(self, nb, dim, limit, large_offset):
"""
Test hybrid search with large offset values
"""
# create a collection
name = gen_collection_name()
payload = {
"collectionName": name,
"schema": {
"autoId": True,
"enableDynamicField": True,
"fields": [
{"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
{"fieldName": "user_id", "dataType": "Int64", "elementTypeParams": {}},
{"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
{"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
{"fieldName": "float_vector_1", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}},
{"fieldName": "float_vector_2", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}},
]
},
"indexParams": [
{"fieldName": "float_vector_1", "indexName": "float_vector_1", "metricType": "IP"},
{"fieldName": "float_vector_2", "indexName": "float_vector_2", "metricType": "IP"},
]
}
rsp = self.collection_client.collection_create(payload)
assert rsp['code'] == 0
# insert data
data = []
for i in range(nb):
tmp = {
"user_id": i % 100,
"word_count": i,
"book_describe": f"book_{i}",
"float_vector_1": gen_vector(datatype="FloatVector", dim=dim),
"float_vector_2": gen_vector(datatype="FloatVector", dim=dim),
}
data.append(tmp)
payload = {
"collectionName": name,
"data": data,
}
rsp = self.vector_client.vector_insert(payload)
assert rsp['code'] == 0
# hybrid search with large offset
payload = {
"collectionName": name,
"search": [{
"data": [gen_vector(datatype="FloatVector", dim=dim)],
"annsField": "float_vector_1",
"limit": limit + large_offset,
"outputFields": ["*"]
},
{
"data": [gen_vector(datatype="FloatVector", dim=dim)],
"annsField": "float_vector_2",
"limit": limit + large_offset,
"outputFields": ["*"]
}
],
"rerank": {
"strategy": "rrf",
"params": {
"k": 10,
}
},
"limit": limit,
"offset": large_offset,
"outputFields": ["user_id", "word_count", "book_describe"]
}
rsp = self.vector_client.vector_hybrid_search(payload)
# When offset + limit exceeds max allowed
if large_offset + limit > constant.MAX_SUM_OFFSET_AND_LIMIT:
assert rsp['code'] == 65535
assert "exceeds" in rsp['message'] or "invalid" in rsp['message'].lower()
# When offset is larger than the available results
elif large_offset >= nb:
# Should return empty results or handle gracefully
assert rsp['code'] == 0
assert len(rsp['data']) == 0
else:
assert rsp['code'] == 0
# Should return remaining results after offset
expected_count = min(limit, nb - large_offset)
assert len(rsp['data']) == expected_count
@pytest.mark.L0
class TestQueryVector(TestBase):
    """Query-path tests: scalar/vector datatypes, filters, count(*), null/default values."""

    @pytest.mark.parametrize("insert_round", [1])
    @pytest.mark.parametrize("auto_id", [True])
    @pytest.mark.parametrize("is_partition_key", [True])
    @pytest.mark.parametrize("enable_dynamic_schema", [True])
    @pytest.mark.parametrize("nb", [3000])
    @pytest.mark.parametrize("dim", [128])
    def test_query_entities_with_all_scalar_datatype(self, nb, dim, insert_round, auto_id,
                                                     is_partition_key, enable_dynamic_schema):
        """
        Insert entities covering all scalar datatypes (Int64, VarChar, Bool,
        JSON, arrays) and verify each datatype can be used in a query filter.
        """
        # create a collection
        name = gen_collection_name()
        payload = {
            "collectionName": name,
            "schema": {
                "autoId": auto_id,
                "enableDynamicField": enable_dynamic_schema,
                "fields": [
                    {"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
                    {"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": is_partition_key,
                     "elementTypeParams": {}},
                    {"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
                    {"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "25536"}},
                    {"fieldName": "bool", "dataType": "Bool", "elementTypeParams": {}},
                    {"fieldName": "json", "dataType": "JSON", "elementTypeParams": {}},
                    {"fieldName": "int_array", "dataType": "Array", "elementDataType": "Int64",
                     "elementTypeParams": {"max_capacity": "1024"}},
                    {"fieldName": "varchar_array", "dataType": "Array", "elementDataType": "VarChar",
                     "elementTypeParams": {"max_capacity": "1024", "max_length": "256"}},
                    {"fieldName": "bool_array", "dataType": "Array", "elementDataType": "Bool",
                     "elementTypeParams": {"max_capacity": "1024"}},
                    {"fieldName": "text_emb", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}},
                    {"fieldName": "image_emb", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}},
                ]
            },
            "indexParams": [
                {"fieldName": "text_emb", "indexName": "text_emb", "metricType": "L2"},
                {"fieldName": "image_emb", "indexName": "image_emb", "metricType": "L2"}
            ]
        }
        rsp = self.collection_client.collection_create(payload)
        assert rsp['code'] == 0
        rsp = self.collection_client.collection_describe(name)
        logger.info(f"rsp: {rsp}")
        assert rsp['code'] == 0
        # insert data; distinct loop variables so the round counter is not
        # clobbered by the per-row index (original code reused `i` for both)
        for _ in range(insert_round):
            data = []
            for j in range(nb):
                if auto_id:
                    tmp = {
                        "user_id": j,
                        "word_count": j,
                        "book_describe": f"book_{gen_unique_str(length=1000)}",
                        "bool": random.choice([True, False]),
                        "json": {"key": [j]},
                        "int_array": [j],
                        "varchar_array": [f"varchar_{j}"],
                        "bool_array": [random.choice([True, False])],
                        "text_emb": preprocessing.normalize([np.array([random.random() for _ in range(dim)])])[
                            0].tolist(),
                        "image_emb": preprocessing.normalize([np.array([random.random() for _ in range(dim)])])[
                            0].tolist(),
                    }
                else:
                    tmp = {
                        "book_id": j,
                        "user_id": j,
                        "word_count": j,
                        "book_describe": gen_unique_str(length=1000),
                        "bool": random.choice([True, False]),
                        "json": {"key": j},
                        "int_array": [j],
                        "varchar_array": [f"varchar_{j}"],
                        "bool_array": [random.choice([True, False])],
                        "text_emb": preprocessing.normalize([np.array([random.random() for _ in range(dim)])])[
                            0].tolist(),
                        "image_emb": preprocessing.normalize([np.array([random.random() for _ in range(dim)])])[
                            0].tolist(),
                    }
                if enable_dynamic_schema:
                    tmp.update({f"dynamic_field_{j}": j})
                data.append(tmp)
            payload = {
                "collectionName": name,
                "data": data,
            }
            rsp = self.vector_client.vector_insert(payload)
            assert rsp['code'] == 0
            assert rsp['data']['insertCount'] == nb
        # query data to make sure the data is inserted
        # 1. query for int64
        payload = {
            "collectionName": name,
            "filter": "user_id > 0",
            "limit": 50,
            "outputFields": ["*"]
        }
        rsp = self.vector_client.vector_query(payload)
        assert rsp['code'] == 0
        assert len(rsp['data']) == 50
        # 2. query for varchar
        payload = {
            "collectionName": name,
            "filter": "book_describe like \"book%\"",
            "limit": 50,
            "outputFields": ["*"]
        }
        rsp = self.vector_client.vector_query(payload)
        assert rsp['code'] == 0
        assert len(rsp['data']) == 50
        # 3. query for json; only the row with user_id == 1 has [1] as its value
        payload = {
            "collectionName": name,
            "filter": "json_contains(json['key'] , 1)",
            "limit": 50,
            "outputFields": ["*"]
        }
        rsp = self.vector_client.vector_query(payload)
        assert rsp['code'] == 0
        assert len(rsp['data']) == 1
        # 4. query for array; only one row contains the element 1
        payload = {
            "collectionName": name,
            "filter": "array_contains(int_array, 1)",
            "limit": 50,
            "outputFields": ["*"]
        }
        rsp = self.vector_client.vector_query(payload)
        assert rsp['code'] == 0
        assert len(rsp['data']) == 1

    @pytest.mark.parametrize("insert_round", [1])
    @pytest.mark.parametrize("auto_id", [True])
    @pytest.mark.parametrize("is_partition_key", [True])
    @pytest.mark.parametrize("enable_dynamic_schema", [True])
    @pytest.mark.parametrize("nb", [3000])
    @pytest.mark.parametrize("dim", [128])
    @pytest.mark.parametrize("pass_fp32_to_fp16_or_bf16", [True, False])
    def test_query_entities_with_all_vector_datatype(self, nb, dim, insert_round, auto_id,
                                                     is_partition_key, enable_dynamic_schema,
                                                     pass_fp32_to_fp16_or_bf16):
        """
        Insert entities carrying every vector datatype (float32, fp16, bf16,
        binary) and verify they are queryable; fp16/bf16 fields optionally
        receive fp32 payloads to check server-side conversion.
        """
        # create a collection
        name = gen_collection_name()
        payload = {
            "collectionName": name,
            "schema": {
                "autoId": auto_id,
                "enableDynamicField": enable_dynamic_schema,
                "fields": [
                    {"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
                    {"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": is_partition_key,
                     "elementTypeParams": {}},
                    {"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
                    {"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
                    {"fieldName": "float_vector", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}},
                    {"fieldName": "float16_vector", "dataType": "Float16Vector",
                     "elementTypeParams": {"dim": f"{dim}"}},
                    {"fieldName": "bfloat16_vector", "dataType": "BFloat16Vector",
                     "elementTypeParams": {"dim": f"{dim}"}},
                    {"fieldName": "binary_vector", "dataType": "BinaryVector", "elementTypeParams": {"dim": f"{dim}"}},
                ]
            },
            "indexParams": [
                {"fieldName": "float_vector", "indexName": "float_vector", "metricType": "L2"},
                {"fieldName": "float16_vector", "indexName": "float16_vector", "metricType": "L2"},
                {"fieldName": "bfloat16_vector", "indexName": "bfloat16_vector", "metricType": "L2"},
                {"fieldName": "binary_vector", "indexName": "binary_vector", "metricType": "HAMMING",
                 "params": {"index_type": "BIN_IVF_FLAT", "nlist": "512"}}
            ]
        }
        rsp = self.collection_client.collection_create(payload)
        assert rsp['code'] == 0
        rsp = self.collection_client.collection_describe(name)
        logger.info(f"rsp: {rsp}")
        assert rsp['code'] == 0
        # insert data; distinct loop variables avoid shadowing the round counter
        for _ in range(insert_round):
            data = []
            for j in range(nb):
                if auto_id:
                    tmp = {
                        "user_id": j,
                        "word_count": j,
                        "book_describe": f"book_{j}",
                        "float_vector": gen_vector(datatype="FloatVector", dim=dim),
                        "float16_vector": (
                            gen_vector(datatype="FloatVector", dim=dim)
                            if pass_fp32_to_fp16_or_bf16
                            else gen_vector(datatype="Float16Vector", dim=dim)
                        ),
                        "bfloat16_vector": (
                            gen_vector(datatype="FloatVector", dim=dim)
                            if pass_fp32_to_fp16_or_bf16
                            else gen_vector(datatype="BFloat16Vector", dim=dim)
                        ),
                        "binary_vector": gen_vector(datatype="BinaryVector", dim=dim)
                    }
                else:
                    tmp = {
                        "book_id": j,
                        "user_id": j,
                        "word_count": j,
                        "book_describe": f"book_{j}",
                        "float_vector": gen_vector(datatype="FloatVector", dim=dim),
                        "float16_vector": (
                            gen_vector(datatype="FloatVector", dim=dim)
                            if pass_fp32_to_fp16_or_bf16
                            else gen_vector(datatype="Float16Vector", dim=dim)
                        ),
                        "bfloat16_vector": (
                            gen_vector(datatype="FloatVector", dim=dim)
                            if pass_fp32_to_fp16_or_bf16
                            else gen_vector(datatype="BFloat16Vector", dim=dim)
                        ),
                        "binary_vector": gen_vector(datatype="BinaryVector", dim=dim)
                    }
                if enable_dynamic_schema:
                    tmp.update({f"dynamic_field_{j}": j})
                data.append(tmp)
            payload = {
                "collectionName": name,
                "data": data,
            }
            rsp = self.vector_client.vector_insert(payload)
            assert rsp['code'] == 0
            assert rsp['data']['insertCount'] == nb
        # cross-check through the pymilvus SDK for debugging purposes
        c = Collection(name)
        res = c.query(
            expr="user_id > 0",
            limit=50,
            output_fields=["*"],
        )
        logger.info(f"res: {res}")
        # query data to make sure the data is inserted
        rsp = self.vector_client.vector_query({"collectionName": name, "filter": "user_id > 0", "limit": 50})
        assert rsp['code'] == 0
        assert len(rsp['data']) == 50

    @pytest.mark.parametrize("expr", ["10+20 <= uid < 20+30", "uid in [1,2,3,4]",
                                      "uid > 0", "uid >= 0", "uid > 0",
                                      "uid > -100 and uid < 100"])
    @pytest.mark.parametrize("include_output_fields", [True, False])
    @pytest.mark.parametrize("partial_fields", [True, False])
    def test_query_vector_with_int64_filter(self, expr, include_output_fields, partial_fields):
        """
        Query with int64 range/in/arithmetic filters and verify every returned
        row satisfies the filter expression.
        """
        name = gen_collection_name()
        self.name = name
        schema_payload, data = self.init_collection(name)
        output_fields = get_common_fields_by_data(data)
        if partial_fields:
            # keep only half of the fields, but `uid` is always needed for verification
            output_fields = output_fields[:len(output_fields) // 2]
            if "uid" not in output_fields:
                output_fields.append("uid")
        # query data
        payload = {
            "collectionName": name,
            "filter": expr,
            "limit": 100,
            "offset": 0,
            "outputFields": output_fields
        }
        if not include_output_fields:
            payload.pop("outputFields")
            if 'vector' in output_fields:
                output_fields.remove("vector")
        time.sleep(5)
        rsp = self.vector_client.vector_query(payload)
        assert rsp['code'] == 0
        res = rsp['data']
        logger.info(f"res: {len(res)}")
        for r in res:
            # `uid` is referenced by the eval'd filter expression below
            uid = r['uid']
            assert eval(expr) is True
            for field in output_fields:
                assert field in r

    def test_query_vector_with_count(self):
        """
        Query with count(*) and verify it reflects the number of inserted rows.
        """
        name = gen_collection_name()
        self.name = name
        self.init_collection(name, nb=3000)
        # query for "count(*)"
        payload = {
            "collectionName": name,
            "filter": " ",
            "limit": 0,
            "outputFields": ["count(*)"]
        }
        rsp = self.vector_client.vector_query(payload)
        assert rsp['code'] == 0
        assert rsp['data'][0]['count(*)'] == 3000

    @pytest.mark.xfail(reason="query by id is not supported")
    def test_query_vector_by_id(self):
        """
        Query entities by passing ids directly (unsupported, hence xfail).
        """
        name = gen_collection_name()
        self.name = name
        _, _, insert_ids = self.init_collection(name, nb=3000, return_insert_id=True)
        payload = {
            "collectionName": name,
            "id": insert_ids,
        }
        rsp = self.vector_client.vector_query(payload)
        assert rsp['code'] == 0

    @pytest.mark.parametrize("filter_expr", ["name > \"placeholder\"", "name like \"placeholder%\""])
    @pytest.mark.parametrize("include_output_fields", [True, False])
    def test_query_vector_with_varchar_filter(self, filter_expr, include_output_fields):
        """
        Query with varchar comparison and like-prefix filters and verify the
        returned names satisfy the filter.
        """
        name = gen_collection_name()
        self.name = name
        nb = 200
        dim = 128
        limit = 100
        schema_payload, data = self.init_collection(name, dim=dim, nb=nb)
        names = []
        for item in data:
            names.append(item.get("name"))
        names.sort()
        logger.info(f"names: {names}")
        # use the first two characters of the median name as the filter anchor
        mid = len(names) // 2
        prefix = names[mid][0:2]
        # search data
        output_fields = get_common_fields_by_data(data)
        filter_expr = filter_expr.replace("placeholder", prefix)
        logger.info(f"filter_expr: {filter_expr}")
        payload = {
            "collectionName": name,
            "outputFields": output_fields,
            "filter": filter_expr,
            "limit": limit,
            "offset": 0,
        }
        if not include_output_fields:
            payload.pop("outputFields")
        rsp = self.vector_client.vector_query(payload)
        assert rsp['code'] == 0
        res = rsp['data']
        logger.info(f"res: {len(res)}")
        assert len(res) <= limit
        for item in res:
            name = item.get("name")
            logger.info(f"name: {name}")
            if ">" in filter_expr:
                assert name > prefix
            if "like" in filter_expr:
                assert name.startswith(prefix)

    @pytest.mark.parametrize("sum_of_limit_offset", [16384])
    def test_query_vector_with_large_sum_of_limit_offset(self, sum_of_limit_offset):
        """
        Query a vector with sum of limit and offset larger than max value
        """
        # NOTE(review): the only parametrized value equals the maximum, so the
        # rejection branch below is currently never exercised — confirm whether
        # a value > 16384 should be added to the parametrize list.
        max_sum_of_limit_offset = 16384
        name = gen_collection_name()
        filter_expr = "name > \"placeholder\""
        self.name = name
        nb = 200
        dim = 128
        limit = 100
        offset = sum_of_limit_offset - limit
        schema_payload, data = self.init_collection(name, dim=dim, nb=nb)
        names = []
        for item in data:
            names.append(item.get("name"))
        names.sort()
        logger.info(f"names: {names}")
        mid = len(names) // 2
        prefix = names[mid][0:2]
        # search data
        output_fields = get_common_fields_by_data(data)
        filter_expr = filter_expr.replace("placeholder", prefix)
        logger.info(f"filter_expr: {filter_expr}")
        payload = {
            "collectionName": name,
            "outputFields": output_fields,
            "filter": filter_expr,
            "limit": limit,
            "offset": offset,
        }
        rsp = self.vector_client.vector_query(payload)
        if sum_of_limit_offset > max_sum_of_limit_offset:
            assert rsp['code'] == 1
            return
        assert rsp['code'] == 0
        res = rsp['data']
        logger.info(f"res: {len(res)}")
        assert len(res) <= limit
        for item in res:
            name = item.get("name")
            logger.info(f"name: {name}")
            if ">" in filter_expr:
                assert name > prefix
            if "like" in filter_expr:
                assert name.startswith(prefix)

    @pytest.mark.parametrize("tokenizer", ["jieba", "standard"])
    def test_query_vector_with_text_match_filter(self, tokenizer):
        """
        Query with a text_match filter on analyzed varchar fields and verify
        the matched token appears in every returned row.
        """
        fake = fake_en
        language = "en"
        if tokenizer == "jieba":
            fake = fake_zh
            language = "zh"
        # create a collection
        dim = 128
        analyzer_params = {
            "tokenizer": tokenizer,
        }
        name = gen_collection_name()
        fields = [
            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
            FieldSchema(
                name="word",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                enable_match=True,
                is_partition_key=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(
                name="sentence",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                enable_match=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(
                name="paragraph",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                enable_match=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(
                name="text",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                enable_match=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
        ]
        schema = CollectionSchema(fields=fields, description="test collection")
        collection = Collection(name=name, schema=schema)
        rsp = self.collection_client.collection_describe(name)
        logger.info(f"rsp: {rsp}")
        assert rsp['code'] == 0
        data_size = 3000
        batch_size = 1000
        # insert data
        data = [
            {
                "id": i,
                "word": fake.word().lower(),
                "sentence": fake.sentence().lower(),
                "paragraph": fake.sentence().lower(),
                "text": fake.text().lower(),
                "emb": [random.random() for _ in range(dim)]
            }
            for i in range(data_size)
        ]
        df = pd.DataFrame(data)
        text_fields = ["word", "sentence", "paragraph", "text"]
        # per-field token frequency map, used to pick a token guaranteed to match
        wf_map = {}
        for field in text_fields:
            wf_map[field] = analyze_documents(df[field].tolist(), language=language)
        for i in range(0, data_size, batch_size):
            tmp = data[i:i + batch_size]
            payload = {
                "collectionName": name,
                "data": tmp,
            }
            rsp = self.vector_client.vector_insert(payload)
            assert rsp['code'] == 0
            assert rsp['data']['insertCount'] == len(tmp)
        collection.create_index(
            "emb",
            {"index_type": "IVF_SQ8", "metric_type": "L2", "params": {"nlist": 64}},
        )
        collection.load()
        time.sleep(5)
        for field in text_fields:
            # query with the most frequent token of each field
            token = wf_map[field].most_common()[0][0]
            expr = f"text_match({field}, '{token}')"
            logger.info(f"expr: {expr}")
            rsp = self.vector_client.vector_query({"collectionName": name, "filter": f"{expr}", "outputFields": ["*"]})
            assert rsp['code'] == 0, rsp
            for d in rsp['data']:
                assert token in d[field]

    @pytest.mark.parametrize("tokenizer", ["standard"])
    def test_query_vector_with_phrase_match_filter(self, tokenizer):
        """
        Query a vector with phrase match filter
        """
        fake = fake_en
        language = "en"
        if tokenizer == "jieba":
            fake = fake_zh
            language = "zh"
        # create a collection
        dim = 128
        analyzer_params = {
            "tokenizer": tokenizer,
        }
        name = gen_collection_name()
        fields = [
            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
            FieldSchema(
                name="word",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                enable_match=True,
                is_partition_key=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(
                name="sentence",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                enable_match=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(
                name="paragraph",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                enable_match=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(
                name="text",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                enable_match=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
        ]
        schema = CollectionSchema(fields=fields, description="test collection")
        collection = Collection(name=name, schema=schema)
        rsp = self.collection_client.collection_describe(name)
        logger.info(f"rsp: {rsp}")
        assert rsp['code'] == 0
        data_size = 3000
        batch_size = 1000
        # insert data
        data = [
            {
                "id": i,
                "word": fake.word().lower(),
                "sentence": fake.sentence().lower(),
                "paragraph": fake.sentence().lower(),
                "text": fake.text().lower(),
                "emb": [random.random() for _ in range(dim)]
            }
            for i in range(data_size)
        ]
        df = pd.DataFrame(data)
        text_fields = ["word", "sentence", "paragraph", "text"]
        wf_map = {}
        for field in text_fields:
            wf_map[field] = analyze_documents(df[field].tolist(), language=language)
        for i in range(0, data_size, batch_size):
            tmp = data[i:i + batch_size]
            payload = {
                "collectionName": name,
                "data": tmp,
            }
            rsp = self.vector_client.vector_insert(payload)
            assert rsp['code'] == 0
            assert rsp['data']['insertCount'] == len(tmp)
        # add additional data to make sure query result is not empty:
        # build a phrase from the two most frequent tokens of each field and
        # insert rows containing exactly that phrase
        target_data_len = 10
        phrase_map = {}
        for field in text_fields:
            phrase = wf_map[field].most_common()[0][0] + " " + wf_map[field].most_common()[1][0]
            phrase_map[field] = phrase
        tmp = [
            {
                "id": i,
                "word": phrase_map["word"],
                "sentence": phrase_map["sentence"],
                "paragraph": phrase_map["paragraph"],
                "text": phrase_map["text"],
                "emb": [random.random() for _ in range(dim)]
            }
            for i in range(data_size, data_size + target_data_len)
        ]
        payload = {
            "collectionName": name,
            "data": tmp,
        }
        rsp = self.vector_client.vector_insert(payload)
        assert rsp['code'] == 0
        assert rsp['data']['insertCount'] == len(tmp)
        collection.create_index(
            "emb",
            {"index_type": "IVF_SQ8", "metric_type": "L2", "params": {"nlist": 64}},
        )
        collection.load()
        time.sleep(5)
        for field in text_fields:
            phrase = wf_map[field].most_common()[0][0] + " " + wf_map[field].most_common()[1][0]
            expr = f"phrase_match({field}, '{phrase}')"
            logger.info(f"expr: {expr}")
            rsp = self.vector_client.vector_query({"collectionName": name, "filter": f"{expr}", "outputFields": ["*"]})
            assert rsp['code'] == 0, rsp
            for d in rsp['data']:
                # strip punctuation before checking for the phrase as a substring
                d[field] = re.sub(r'[^\w\s]','', d[field])
                assert phrase in d[field]
            assert len(rsp['data']) >= target_data_len

    @pytest.mark.parametrize("insert_round", [1])
    @pytest.mark.parametrize("auto_id", [True])
    @pytest.mark.parametrize("is_partition_key", [True])
    @pytest.mark.parametrize("enable_dynamic_schema", [True])
    @pytest.mark.parametrize("nb", [3000])
    @pytest.mark.parametrize("dim", [128])
    def test_query_entities_with_default_none(self, nb, dim, insert_round, auto_id, is_partition_key,
                                              enable_dynamic_schema):
        """
        Insert entities with None values into nullable fields and a field with
        a default value, then verify query results reflect nulls/defaults.
        """
        # create a collection
        name = gen_collection_name()
        payload = {
            "collectionName": name,
            "schema": {
                "autoId": auto_id,
                "enableDynamicField": enable_dynamic_schema,
                "fields": [
                    {"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
                    {"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": is_partition_key,
                     "elementTypeParams": {}},
                    {"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}, "defaultValue": 8888},
                    {"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "25536"},
                     "nullable": True},
                    {"fieldName": "bool", "dataType": "Bool", "elementTypeParams": {}, "nullable": True},
                    {"fieldName": "json", "dataType": "JSON", "elementTypeParams": {}, "nullable": True},
                    {"fieldName": "int_array", "dataType": "Array", "elementDataType": "Int64",
                     "elementTypeParams": {"max_capacity": "1024"}, "nullable": True},
                    {"fieldName": "varchar_array", "dataType": "Array", "elementDataType": "VarChar",
                     "elementTypeParams": {"max_capacity": "1024", "max_length": "256"}, "nullable": True},
                    {"fieldName": "bool_array", "dataType": "Array", "elementDataType": "Bool",
                     "elementTypeParams": {"max_capacity": "1024"}, "nullable": True},
                    {"fieldName": "text_emb", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}},
                ]
            },
            "indexParams": [
                {"fieldName": "text_emb", "indexName": "text_emb", "metricType": "L2"},
            ]
        }
        rsp = self.collection_client.collection_create(payload)
        assert rsp['code'] == 0
        rsp = self.collection_client.collection_describe(name)
        logger.info(f"rsp: {rsp}")
        assert rsp['code'] == 0
        # insert data; distinct loop variables avoid shadowing the round counter
        for _ in range(insert_round):
            data = []
            for j in range(nb):
                if auto_id:
                    tmp = {
                        "user_id": j,
                        "word_count": None,
                        "book_describe": None,
                        "bool": random.choice([True, False]),
                        "json": None,
                        "int_array": None,
                        "varchar_array": None,
                        "bool_array": None,
                        "text_emb": preprocessing.normalize([np.array([random.random() for _ in range(dim)])])[
                            0].tolist(),
                    }
                else:
                    tmp = {
                        "book_id": j,
                        "user_id": j,
                        "word_count": None,
                        "book_describe": None,
                        "bool": random.choice([True, False]),
                        "json": None,
                        "int_array": None,
                        "varchar_array": None,
                        "bool_array": None,
                        "text_emb": preprocessing.normalize([np.array([random.random() for _ in range(dim)])])[
                            0].tolist(),
                    }
                if enable_dynamic_schema:
                    tmp.update({f"dynamic_field_{j}": j})
                data.append(tmp)
            payload = {
                "collectionName": name,
                "data": data,
            }
            rsp = self.vector_client.vector_insert(payload)
            assert rsp['code'] == 0
            assert rsp['data']['insertCount'] == nb
        # query data to make sure the data is inserted
        payload = {
            "collectionName": name,
            "filter": "user_id > 0",
            "limit": 50,
            "outputFields": ["*"]
        }
        rsp = self.vector_client.vector_query(payload)
        assert rsp['code'] == 0
        # nullable fields were inserted as None; word_count falls back to its default
        assert rsp['data'][0]['book_describe'] is None
        assert rsp['data'][0]['word_count'] == 8888
        assert rsp['data'][0]['json'] is None
        assert rsp['data'][0]['varchar_array'] is None
        assert len(rsp['data']) == 50
@pytest.mark.L0
class TestQueryVectorNegative(TestBase):
    """Negative query cases: malformed filter expressions."""

    def test_query_with_wrong_filter_expr(self):
        """
        A filter that is not a boolean expression (here, a bare list literal)
        must be rejected with error code 1100 at query-plan creation.
        """
        name = gen_collection_name()
        self.name = name
        nb = 200
        dim = 128
        schema_payload, data, insert_ids = self.init_collection(name, dim=dim, nb=nb, return_insert_id=True)
        output_fields = get_common_fields_by_data(data)
        payload = {
            "collectionName": name,
            "outputFields": output_fields,
            "filter": f"{insert_ids}",
        }
        rsp = self.vector_client.vector_query(payload)
        assert rsp['code'] == 1100
        assert "failed to create query plan" in rsp['message']
@pytest.mark.L0
class TestGetVector(TestBase):
    # Tests for fetching entities by primary key via the vector_get endpoint.

    def test_get_vector_with_simple_payload(self):
        """
        Search a vector with a simple payload, then fetch one of the returned
        entities by its id and verify the round trip.
        """
        name = gen_collection_name()
        self.name = name
        self.init_collection(name)
        # search data
        dim = 128
        vector_to_search = preprocessing.normalize([np.array([random.random() for i in range(dim)])])[0].tolist()
        payload = {
            "collectionName": name,
            "data": [vector_to_search],
        }
        rsp = self.vector_client.vector_search(payload)
        assert rsp['code'] == 0
        res = rsp['data']
        logger.info(f"res: {len(res)}")
        # no "limit" key in the payload, so fall back to 100 (the assumed default)
        limit = int(payload.get("limit", 100))
        assert len(res) == limit
        # search hits must be unique by id
        ids = [item['id'] for item in res]
        assert len(ids) == len(set(ids))
        # fetch the first hit back by id
        payload = {
            "collectionName": name,
            "outputFields": ["*"],
            "id": ids[0],
        }
        rsp = self.vector_client.vector_get(payload)
        assert rsp['code'] == 0
        res = rsp['data']
        logger.info(f"res: {res}")
        logger.info(f"res: {len(res)}")
        for item in res:
            assert item['id'] == ids[0]

    @pytest.mark.L0
    @pytest.mark.parametrize("id_field_type", ["list", "one"])
    @pytest.mark.parametrize("include_invalid_id", [True, False])
    @pytest.mark.parametrize("include_output_fields", [True, False])
    def test_get_vector_complex(self, id_field_type, include_output_fields, include_invalid_id):
        """
        Get entities by a single id or an id list, optionally replacing one id
        with an invalid value (0), and verify the returned count and fields.
        """
        name = gen_collection_name()
        self.name = name
        nb = 200
        dim = 128
        schema_payload, data = self.init_collection(name, dim=dim, nb=nb)
        output_fields = get_common_fields_by_data(data)
        uids = []
        for item in data:
            uids.append(item.get("uid"))
        # query first to learn the (auto-generated) primary keys
        payload = {
            "collectionName": name,
            "outputFields": output_fields,
            "filter": f"uid in {uids}",
        }
        rsp = self.vector_client.vector_query(payload)
        assert rsp['code'] == 0
        res = rsp['data']
        logger.info(f"res: {len(res)}")
        ids = []
        for r in res:
            ids.append(r['id'])
        logger.info(f"ids: {len(ids)}")
        id_to_get = None
        if id_field_type == "list":
            id_to_get = ids
        if id_field_type == "one":
            id_to_get = ids[0]
        if include_invalid_id:
            # 0 is assumed not to be a valid primary key in this collection
            if isinstance(id_to_get, list):
                id_to_get[-1] = 0
            else:
                id_to_get = 0
        # get by id list
        payload = {
            "collectionName": name,
            "outputFields": output_fields,
            "id": id_to_get
        }
        rsp = self.vector_client.vector_get(payload)
        assert rsp['code'] == 0
        res = rsp['data']
        # invalid ids are silently dropped from the result set
        if isinstance(id_to_get, list):
            if include_invalid_id:
                assert len(res) == len(id_to_get) - 1
            else:
                assert len(res) == len(id_to_get)
        else:
            if include_invalid_id:
                assert len(res) == 0
            else:
                assert len(res) == 1
        for r in rsp['data']:
            if isinstance(id_to_get, list):
                assert r['id'] in id_to_get
            else:
                assert r['id'] == id_to_get
            if include_output_fields:
                for field in output_fields:
                    assert field in r
@pytest.mark.L0
class TestDeleteVector(TestBase):
    @pytest.mark.xfail(reason="delete by id is not supported")
    def test_delete_vector_by_id(self):
        """
        Attempt to operate on entities by passing ids directly in the payload.

        NOTE(review): despite the test name, the body calls vector_query, not
        vector_delete — presumably a copy/paste from test_query_vector_by_id;
        confirm the intended endpoint. Marked xfail because the "id" payload
        key is not supported.
        """
        name = gen_collection_name()
        self.name = name
        _, _, insert_ids = self.init_collection(name, nb=3000, return_insert_id=True)
        payload = {
            "collectionName": name,
            "id": insert_ids,
        }
        rsp = self.vector_client.vector_query(payload)
        assert rsp['code'] == 0
@pytest.mark.parametrize("id_field_type", ["list", "one"])
def test_delete_vector_by_pk_field_ids(self, id_field_type):
name = gen_collection_name()
self.name = name
nb = 200
dim = 128
schema_payload, data, insert_ids = self.init_collection(name, dim=dim, nb=nb, return_insert_id=True)
time.sleep(1)
id_to_delete = None
if id_field_type == "list":
id_to_delete = insert_ids
if id_field_type == "one":
id_to_delete = insert_ids[0]
if isinstance(id_to_delete, list):
payload = {
"collectionName": name,
"filter": f"id in {id_to_delete}"
}
else:
payload = {
"collectionName": name,
"filter": f"id == {id_to_delete}"
}
rsp = self.vector_client.vector_delete(payload)
assert rsp['code'] == 0
# verify data deleted by get
payload = {
"collectionName": name,
"id": id_to_delete
}
rsp = self.vector_client.vector_get(payload)
assert len(rsp['data']) == 0
    @pytest.mark.parametrize("id_field_type", ["list", "one"])
    def test_delete_vector_by_filter_pk_field(self, id_field_type):
        """
        Look up auto-generated primary keys via a uid filter, delete entities
        through an id filter (list or single), and verify they are gone.
        """
        name = gen_collection_name()
        self.name = name
        nb = 200
        dim = 128
        schema_payload, data = self.init_collection(name, dim=dim, nb=nb)
        time.sleep(1)
        output_fields = get_common_fields_by_data(data)
        uids = []
        for item in data:
            uids.append(item.get("uid"))
        # query first to learn the primary keys for the inserted rows
        payload = {
            "collectionName": name,
            "outputFields": output_fields,
            "filter": f"uid in {uids}",
        }
        rsp = self.vector_client.vector_query(payload)
        assert rsp['code'] == 0
        res = rsp['data']
        logger.info(f"res: {len(res)}")
        ids = []
        for r in res:
            ids.append(r['id'])
        logger.info(f"ids: {len(ids)}")
        id_to_get = None
        if id_field_type == "list":
            id_to_get = ids
        if id_field_type == "one":
            id_to_get = ids[0]
        if isinstance(id_to_get, list):
            # cap the filter size at 100 ids to keep the expression manageable
            if len(id_to_get) >= 100:
                id_to_get = id_to_get[-100:]
        # delete by id list
        if isinstance(id_to_get, list):
            payload = {
                "collectionName": name,
                "filter": f"id in {id_to_get}",
            }
        else:
            payload = {
                "collectionName": name,
                "filter": f"id == {id_to_get}",
            }
        rsp = self.vector_client.vector_delete(payload)
        assert rsp['code'] == 0
        logger.info(f"delete res: {rsp}")
        # verify data deleted
        if not isinstance(id_to_get, list):
            id_to_get = [id_to_get]
        payload = {
            "collectionName": name,
            "filter": f"id in {id_to_get}",
        }
        # allow the delete to propagate before querying back
        time.sleep(5)
        rsp = self.vector_client.vector_query(payload)
        assert rsp['code'] == 0
        assert len(rsp['data']) == 0
def test_delete_vector_by_custom_pk_field(self):
dim = 128
nb = 3000
insert_round = 1
name = gen_collection_name()
payload = {
"collectionName": name,
"schema": {
"fields": [
{"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
{"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
{"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
{"fieldName": "text_emb", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}}
]
},
"indexParams": [{"fieldName": "text_emb", "indexName": "text_emb_index", "metricType": "L2"}]
}
rsp = self.collection_client.collection_create(payload)
assert rsp['code'] == 0
rsp = self.collection_client.collection_describe(name)
logger.info(f"rsp: {rsp}")
assert rsp['code'] == 0
pk_values = []
# insert data
for i in range(insert_round):
data = []
for j in range(nb):
tmp = {
"book_id": i * nb + j,
"word_count": i * nb + j,
"book_describe": f"book_{i * nb + j}",
"text_emb": preprocessing.normalize([np.array([random.random() for i in range(dim)])])[0].tolist()
}
data.append(tmp)
payload = {
"collectionName": name,
"data": data,
}
tmp = [d["book_id"] for d in data]
pk_values.extend(tmp)
body_size = sys.getsizeof(json.dumps(payload))
logger.info(f"body size: {body_size / 1024 / 1024} MB")
rsp = self.vector_client.vector_insert(payload)
assert rsp['code'] == 0
assert rsp['data']['insertCount'] == nb
# query data before delete
c = Collection(name)
res = c.query(expr="", output_fields=["count(*)"])
logger.info(f"res: {res}")
# delete data
payload = {
"collectionName": name,
"filter": f"book_id in {pk_values}",
}
rsp = self.vector_client.vector_delete(payload)
# query data after delete
res = c.query(expr="", output_fields=["count(*)"], consistency_level="Strong")
logger.info(f"res: {res}")
assert res[0]["count(*)"] == 0
def test_delete_vector_by_filter_custom_field(self):
dim = 128
nb = 3000
insert_round = 1
name = gen_collection_name()
payload = {
"collectionName": name,
"schema": {
"fields": [
{"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
{"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
{"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
{"fieldName": "text_emb", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}}
]
},
"indexParams": [{"fieldName": "text_emb", "indexName": "text_emb_index", "metricType": "L2"}]
}
rsp = self.collection_client.collection_create(payload)
assert rsp['code'] == 0
rsp = self.collection_client.collection_describe(name)
logger.info(f"rsp: {rsp}")
assert rsp['code'] == 0
# insert data
for i in range(insert_round):
data = []
for j in range(nb):
tmp = {
"book_id": i * nb + j,
"word_count": i * nb + j,
"book_describe": f"book_{i * nb + j}",
"text_emb": preprocessing.normalize([np.array([random.random() for i in range(dim)])])[0].tolist()
}
data.append(tmp)
payload = {
"collectionName": name,
"data": data,
}
body_size = sys.getsizeof(json.dumps(payload))
logger.info(f"body size: {body_size / 1024 / 1024} MB")
rsp = self.vector_client.vector_insert(payload)
assert rsp['code'] == 0
assert rsp['data']['insertCount'] == nb
# query data before delete
c = Collection(name)
res = c.query(expr="", output_fields=["count(*)"])
logger.info(f"res: {res}")
# delete data
payload = {
"collectionName": name,
"filter": "word_count >= 0",
}
rsp = self.vector_client.vector_delete(payload)
# query data after delete
res = c.query(expr="", output_fields=["count(*)"], consistency_level="Strong")
logger.info(f"res: {res}")
assert res[0]["count(*)"] == 0
def test_delete_vector_with_non_primary_key(self):
"""
Delete a vector with a non-primary key, expect no data were deleted
"""
name = gen_collection_name()
self.name = name
self.init_collection(name, dim=128, nb=300)
expr = "uid > 0"
payload = {
"collectionName": name,
"filter": expr,
"limit": 3000,
"offset": 0,
"outputFields": ["id", "uid"]
}
rsp = self.vector_client.vector_query(payload)
assert rsp['code'] == 0
res = rsp['data']
logger.info(f"res: {len(res)}")
id_list = [r['uid'] for r in res]
delete_expr = f"uid in {[i for i in id_list[:10]]}"
# query data before delete
payload = {
"collectionName": name,
"filter": delete_expr,
"limit": 3000,
"offset": 0,
"outputFields": ["id", "uid"]
}
rsp = self.vector_client.vector_query(payload)
assert rsp['code'] == 0
res = rsp['data']
num_before_delete = len(res)
logger.info(f"res: {len(res)}")
# delete data
payload = {
"collectionName": name,
"filter": delete_expr,
}
rsp = self.vector_client.vector_delete(payload)
# query data after delete
payload = {
"collectionName": name,
"filter": delete_expr,
"limit": 3000,
"offset": 0,
"outputFields": ["id", "uid"]
}
time.sleep(1)
rsp = self.vector_client.vector_query(payload)
assert len(rsp["data"]) == 0
@pytest.mark.L0
class TestDeleteVectorNegative(TestBase):
    def test_delete_vector_with_invalid_collection_name(self):
        """
        Deleting entities through a non-existent collection name must be
        rejected with error code 100 ("can't find collection").
        """
        name = gen_collection_name()
        self.name = name
        self.init_collection(name, dim=128, nb=3000)
        # fetch some live primary keys to build a realistic delete filter
        query_payload = {
            "collectionName": name,
            "filter": "id > 0",
            "limit": 3000,
            "offset": 0,
            "outputFields": ["id", "uid"]
        }
        rsp = self.vector_client.vector_query(query_payload)
        assert rsp['code'] == 0
        hits = rsp['data']
        logger.info(f"res: {len(hits)}")
        delete_expr = f"id in {[r['id'] for r in hits][:10]}"
        # sanity query with the same expression before attempting the delete
        query_payload = {
            "collectionName": name,
            "filter": delete_expr,
            "limit": 3000,
            "offset": 0,
            "outputFields": ["id", "uid"]
        }
        rsp = self.vector_client.vector_query(query_payload)
        assert rsp['code'] == 0
        logger.info(f"res: {len(rsp['data'])}")
        # now delete through a collection name that does not exist
        rsp = self.vector_client.vector_delete({
            "collectionName": name + "_invalid",
            "filter": delete_expr,
        })
        assert rsp['code'] == 100
        assert "can't find collection" in rsp['message']
@pytest.mark.L1
class TestVectorWithAuth(TestBase):
    """
    Authentication negative tests: every vector operation issued with an
    invalid api key must be rejected with error code 1800.
    """

    def test_upsert_vector_with_invalid_api_key(self):
        """
        Upsert vectors with an invalid api key and expect code 1800.
        """
        # create a collection
        name = gen_collection_name()
        dim = 128
        payload = {
            "collectionName": name,
            "dimension": dim,
        }
        rsp = self.collection_client.collection_create(payload)
        assert rsp['code'] == 0
        rsp = self.collection_client.collection_describe(name)
        assert rsp['code'] == 0
        # build upsert data
        nb = 10
        data = [
            {
                "vector": [np.float64(random.random()) for _ in range(dim)],
            } for _ in range(nb)
        ]
        payload = {
            "collectionName": name,
            "data": data,
        }
        body_size = sys.getsizeof(json.dumps(payload))
        logger.info(f"body size: {body_size / 1024 / 1024} MB")
        client = self.vector_client
        client.api_key = "invalid_api_key"
        # BUGFIX: this test previously called vector_insert, duplicating
        # test_insert_vector_with_invalid_api_key and leaving the upsert
        # endpoint untested despite the test name.
        rsp = client.vector_upsert(payload)
        assert rsp['code'] == 1800

    def test_insert_vector_with_invalid_api_key(self):
        """
        Insert vectors with an invalid api key and expect code 1800.
        """
        # create a collection
        name = gen_collection_name()
        dim = 128
        payload = {
            "collectionName": name,
            "dimension": dim,
        }
        rsp = self.collection_client.collection_create(payload)
        assert rsp['code'] == 0
        rsp = self.collection_client.collection_describe(name)
        assert rsp['code'] == 0
        # insert data
        nb = 10
        data = [
            {
                "vector": [np.float64(random.random()) for _ in range(dim)],
            } for _ in range(nb)
        ]
        payload = {
            "collectionName": name,
            "data": data,
        }
        body_size = sys.getsizeof(json.dumps(payload))
        logger.info(f"body size: {body_size / 1024 / 1024} MB")
        client = self.vector_client
        client.api_key = "invalid_api_key"
        rsp = client.vector_insert(payload)
        assert rsp['code'] == 1800

    def test_delete_vector_with_invalid_api_key(self):
        """
        Delete vectors with an invalid api key and expect code 1800.
        """
        name = gen_collection_name()
        self.name = name
        nb = 200
        dim = 128
        schema_payload, data = self.init_collection(name, dim=dim, nb=nb)
        output_fields = get_common_fields_by_data(data)
        uids = [item.get("uid") for item in data]
        # query with a valid key first to confirm the data is reachable
        payload = {
            "collectionName": name,
            "outputFields": output_fields,
            "filter": f"uid in {uids}",
        }
        rsp = self.vector_client.vector_query(payload)
        assert rsp['code'] == 0
        res = rsp['data']
        logger.info(f"res: {len(res)}")
        ids = [r['id'] for r in res]
        logger.info(f"ids: {len(ids)}")
        # delete by filter with an invalid key must be rejected
        payload = {
            "collectionName": name,
            "filter": f"uid in {uids}"
        }
        client = self.vector_client
        client.api_key = "invalid_api_key"
        rsp = client.vector_delete(payload)
        assert rsp['code'] == 1800
@pytest.mark.L0
class TestSearchByPK(TestBase):
    """
    Test cases for search by primary key functionality (PR #47260)
    Note: search_by_pk uses the vectors of specified IDs as query vectors to perform ANN search.
    Each ID's vector returns `limit` results, so total results = len(ids) * limit (capped by collection size).
    The first result for each query should be itself with distance ~0 (for L2/COSINE metrics).
    """

    @pytest.mark.xfail(reason="https://github.com/milvus-io/milvus/issues/47495 - JSON float64 precision loss for large int64 IDs")
    def test_search_by_pk_with_int64_ids_as_numbers(self):
        """
        Search by primary key with int64 ids provided as numbers.
        Verifies that vectors are retrieved and used as query vectors for ANN search.

        Note: When using autoId=True, Milvus generates int64 IDs that may exceed
        JavaScript's MAX_SAFE_INTEGER (2^53-1). Due to JSON float64 precision loss,
        large int64 IDs lose precision when decoded as float64, causing "duplicate IDs" error.
        This test is expected to fail until the issue is fixed.
        Related issue: https://github.com/milvus-io/milvus/issues/47495
        """
        name = gen_collection_name()
        self.name = name
        dim = 128
        nb = 100
        limit = 10
        schema_payload, data, insert_ids = self.init_collection(name, dim=dim, nb=nb, return_insert_id=True)
        # Use numeric IDs directly - this will trigger precision loss for large int64 IDs
        # autoId generates IDs > 2^53 which lose precision when parsed as JSON float64
        test_ids = [int(i) for i in insert_ids[:3]]
        payload = {
            "collectionName": name,
            "ids": test_ids,
            "outputFields": ["*"],
            "limit": limit
        }
        rsp = self.vector_client.vector_search(payload)
        assert rsp['code'] == 0
        # Each query ID returns up to `limit` results
        assert len(rsp['data']) == len(test_ids) * limit
        # The queried IDs should appear in results (as they match themselves with distance ~0)
        returned_ids = [item['id'] for item in rsp['data']]
        for rid in test_ids:
            assert rid in returned_ids

    def test_search_by_pk_with_int64_ids_as_strings(self):
        """
        Search by primary key with int64 ids provided as strings.
        Verifies string IDs are correctly parsed and used for search.
        """
        name = gen_collection_name()
        self.name = name
        dim = 128
        nb = 100
        limit = 10
        schema_payload, data, insert_ids = self.init_collection(name, dim=dim, nb=nb, return_insert_id=True)
        # Convert insert_ids to strings; string form avoids JSON float64
        # precision loss for large autoId values (see xfail test above)
        test_ids = [str(i) for i in insert_ids[:3]]
        payload = {
            "collectionName": name,
            "ids": test_ids,
            "outputFields": ["*"],
            "limit": limit
        }
        rsp = self.vector_client.vector_search(payload)
        assert rsp['code'] == 0
        assert len(rsp['data']) == len(test_ids) * limit
        # server returns numeric ids, so compare against int(rid)
        returned_ids = [item['id'] for item in rsp['data']]
        for rid in test_ids:
            assert int(rid) in returned_ids

    def test_search_by_pk_with_varchar_ids(self):
        """
        Search by primary key with varchar ids.
        """
        name = gen_collection_name()
        self.name = name
        dim = 128
        limit = 10
        # Create varchar primary key collection
        payload = {
            "collectionName": name,
            "schema": {
                "autoId": False,
                "enableDynamicField": True,
                "fields": [
                    {"fieldName": "pk", "dataType": "VarChar", "isPrimary": True,
                     "elementTypeParams": {"max_length": "128"}},
                    {"fieldName": "uid", "dataType": "Int64", "elementTypeParams": {}},
                    {"fieldName": "vector", "dataType": "FloatVector",
                     "elementTypeParams": {"dim": str(dim)}},
                ]
            },
            "indexParams": [
                {"fieldName": "vector", "indexName": "vector", "metricType": "L2"}
            ]
        }
        rsp = self.collection_client.collection_create(payload)
        assert rsp['code'] == 0
        self.wait_collection_load_completed(name)
        # Insert data
        nb = 100
        data = []
        insert_pks = []
        for i in range(nb):
            pk = f"pk_{i:06d}"
            insert_pks.append(pk)
            data.append({
                "pk": pk,
                "uid": i,
                "vector": gen_vector(datatype="FloatVector", dim=dim),
            })
        insert_payload = {"collectionName": name, "data": data}
        rsp = self.vector_client.vector_insert(insert_payload)
        assert rsp['code'] == 0
        # Search by pk
        test_pks = insert_pks[:3]
        search_payload = {
            "collectionName": name,
            "ids": test_pks,
            "outputFields": ["*"],
            "limit": limit
        }
        rsp = self.vector_client.vector_search(search_payload)
        assert rsp['code'] == 0
        assert len(rsp['data']) == len(test_pks) * limit
        returned_pks = [item['pk'] for item in rsp['data']]
        for pk in test_pks:
            assert pk in returned_pks

    def test_search_by_pk_with_filter(self):
        """
        Search by primary key with additional filter.
        Filter applies to the search results.
        """
        name = gen_collection_name()
        self.name = name
        dim = 128
        nb = 100
        schema_payload, data, insert_ids = self.init_collection(name, dim=dim, nb=nb, return_insert_id=True)
        # Use string IDs to avoid float64 precision loss
        test_ids = [str(i) for i in insert_ids[:5]]
        # Apply filter that matches only some results
        payload = {
            "collectionName": name,
            "ids": test_ids,
            "filter": "uid >= 5",
            "outputFields": ["*"],
            "limit": 100
        }
        rsp = self.vector_client.vector_search(payload)
        assert rsp['code'] == 0
        # All returned items should have uid >= 5
        for item in rsp['data']:
            assert item['uid'] >= 5

    def test_search_by_pk_with_output_fields(self):
        """
        Search by primary key with specific output fields.
        """
        name = gen_collection_name()
        self.name = name
        dim = 128
        nb = 100
        limit = 10
        schema_payload, data, insert_ids = self.init_collection(name, dim=dim, nb=nb, return_insert_id=True)
        # Use string IDs to avoid float64 precision loss
        test_ids = [str(i) for i in insert_ids[:3]]
        payload = {
            "collectionName": name,
            "ids": test_ids,
            "outputFields": ["uid", "id"],
            "limit": limit
        }
        rsp = self.vector_client.vector_search(payload)
        assert rsp['code'] == 0
        assert len(rsp['data']) == len(test_ids) * limit
        # only the requested fields need to be present in each hit
        for item in rsp['data']:
            assert 'uid' in item
            assert 'id' in item

    def test_search_by_pk_with_limit(self):
        """
        Search by primary key with limit parameter.
        Total results = len(ids) * limit.
        """
        name = gen_collection_name()
        self.name = name
        dim = 128
        nb = 100
        schema_payload, data, insert_ids = self.init_collection(name, dim=dim, nb=nb, return_insert_id=True)
        # Use string IDs to avoid float64 precision loss
        test_ids = [str(i) for i in insert_ids[:5]]
        limit = 10
        payload = {
            "collectionName": name,
            "ids": test_ids,
            "outputFields": ["*"],
            "limit": limit
        }
        rsp = self.vector_client.vector_search(payload)
        assert rsp['code'] == 0
        # Result count should be len(test_ids) * limit
        assert len(rsp['data']) == len(test_ids) * limit

    def test_search_by_pk_with_nonexistent_ids(self):
        """
        Search by primary key with non-existent IDs returns error.
        When IDs don't exist, the API returns error code 1100.
        """
        name = gen_collection_name()
        self.name = name
        dim = 128
        nb = 100
        limit = 10
        schema_payload, data, insert_ids = self.init_collection(name, dim=dim, nb=nb, return_insert_id=True)
        # Use only non-existent IDs
        nonexistent_ids = [999999999, 888888888]
        payload = {
            "collectionName": name,
            "ids": nonexistent_ids,
            "outputFields": ["*"],
            "limit": limit
        }
        rsp = self.vector_client.vector_search(payload)
        # Non-existent IDs should return error
        assert rsp['code'] == 1100

    def test_search_by_pk_result_verification(self):
        """
        Verify that search by pk returns results where queried entities appear first (distance ~0).
        """
        name = gen_collection_name()
        self.name = name
        dim = 128
        nb = 100
        limit = 5
        schema_payload, data, insert_ids = self.init_collection(name, dim=dim, nb=nb, return_insert_id=True)
        # Use string IDs to avoid float64 precision loss
        test_ids = [str(i) for i in insert_ids[:3]]
        payload = {
            "collectionName": name,
            "ids": test_ids,
            "outputFields": ["*"],
            "limit": limit
        }
        rsp = self.vector_client.vector_search(payload)
        assert rsp['code'] == 0
        assert len(rsp['data']) == len(test_ids) * limit
        # Group results by query (each limit results belong to one query)
        # The first result in each group should have distance ~0 (matching itself)
        for i in range(len(test_ids)):
            first_result = rsp['data'][i * limit]
            # First result should be the query entity itself with very small distance
            assert first_result['distance'] < 0.001 or first_result['id'] == int(test_ids[i])

    def test_search_by_pk_with_bm25_function_schema(self):
        """
        Search by primary key with BM25 function schema (full text search enabled).
        Uses sparse_vector field generated by BM25 function for search.
        Note: Uses string IDs to avoid float64 precision loss for large autoId values.
        """
        name = gen_collection_name()
        self.name = name
        limit = 10
        # Create collection with BM25 function
        payload = {
            "collectionName": name,
            "schema": {
                "autoId": True,
                "enableDynamicField": True,
                "fields": [
                    {"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
                    {"fieldName": "user_id", "dataType": "Int64", "elementTypeParams": {}},
                    {"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
                    {"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
                    {"fieldName": "document_content", "dataType": "VarChar",
                     "elementTypeParams": {"max_length": "1000", "enable_analyzer": True,
                                           "analyzer_params": {"tokenizer": "standard"},
                                           "enable_match": True}},
                    {"fieldName": "sparse_vector", "dataType": "SparseFloatVector"},
                ],
                "functions": [
                    {
                        "name": "bm25_fn",
                        "type": "BM25",
                        "inputFieldNames": ["document_content"],
                        "outputFieldNames": ["sparse_vector"],
                        "params": {}
                    }
                ]
            },
            "indexParams": [
                {"fieldName": "sparse_vector", "indexName": "sparse_vector", "metricType": "BM25",
                 "params": {"index_type": "SPARSE_INVERTED_INDEX"}}
            ]
        }
        rsp = self.collection_client.collection_create(payload)
        assert rsp['code'] == 0
        self.wait_collection_load_completed(name)
        # Insert data; sparse_vector is produced server-side by the BM25 function
        nb = 100
        data = []
        for i in range(nb):
            data.append({
                "user_id": i % 10,
                "word_count": i,
                "book_describe": f"book_{i}",
                "document_content": fake_en.text().lower(),
            })
        insert_payload = {"collectionName": name, "data": data}
        rsp = self.vector_client.vector_insert(insert_payload)
        assert rsp['code'] == 0
        # Use string IDs to avoid float64 precision loss for large autoId values
        test_ids = [str(i) for i in rsp['data']['insertIds'][:3]]
        # Search by pk - must specify annsField for BM25 sparse vector
        search_payload = {
            "collectionName": name,
            "ids": test_ids,
            "annsField": "sparse_vector",
            "outputFields": ["book_id", "user_id"],
            "limit": limit
        }
        rsp = self.vector_client.vector_search(search_payload)
        assert rsp['code'] == 0
        assert len(rsp['data']) == len(test_ids) * limit
        returned_ids = [item['book_id'] for item in rsp['data']]
        for rid in test_ids:
            assert int(rid) in returned_ids

    def test_search_by_pk_with_sparse_vector_schema(self):
        """
        Search by primary key with sparse vector schema.
        Must specify annsField when multiple vector fields exist.
        """
        name = gen_collection_name()
        self.name = name
        dim = 128
        limit = 10
        # Create collection with sparse vector
        payload = {
            "collectionName": name,
            "schema": {
                "autoId": False,
                "enableDynamicField": True,
                "fields": [
                    {"fieldName": "pk", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
                    {"fieldName": "uid", "dataType": "Int64", "elementTypeParams": {}},
                    {"fieldName": "dense_vector", "dataType": "FloatVector",
                     "elementTypeParams": {"dim": str(dim)}},
                    {"fieldName": "sparse_vector", "dataType": "SparseFloatVector"},
                ]
            },
            "indexParams": [
                {"fieldName": "dense_vector", "indexName": "dense_vector", "metricType": "L2"},
                {"fieldName": "sparse_vector", "indexName": "sparse_vector", "metricType": "IP",
                 "params": {"index_type": "SPARSE_INVERTED_INDEX"}}
            ]
        }
        rsp = self.collection_client.collection_create(payload)
        assert rsp['code'] == 0
        self.wait_collection_load_completed(name)
        # Insert data
        nb = 100
        data = []
        insert_pks = []
        for i in range(nb):
            insert_pks.append(i)
            data.append({
                "pk": i,
                "uid": i % 10,
                "dense_vector": gen_vector(datatype="FloatVector", dim=dim),
                "sparse_vector": gen_vector(datatype="SparseFloatVector", dim=1000),
            })
        insert_payload = {"collectionName": name, "data": data}
        rsp = self.vector_client.vector_insert(insert_payload)
        assert rsp['code'] == 0
        # Search by pk - specify annsField for dense_vector
        test_pks = insert_pks[:3]
        search_payload = {
            "collectionName": name,
            "ids": test_pks,
            "annsField": "dense_vector",
            "outputFields": ["pk", "uid"],
            "limit": limit
        }
        rsp = self.vector_client.vector_search(search_payload)
        assert rsp['code'] == 0
        assert len(rsp['data']) == len(test_pks) * limit
        returned_pks = [item['pk'] for item in rsp['data']]
        for pk in test_pks:
            assert pk in returned_pks

    def test_search_by_pk_with_multiple_vector_fields(self):
        """
        Search by primary key with multiple vector fields schema.
        Must specify annsField when multiple vector fields exist.
        """
        name = gen_collection_name()
        self.name = name
        dim = 128
        limit = 10
        # Create collection with multiple vector fields
        payload = {
            "collectionName": name,
            "schema": {
                "autoId": False,
                "enableDynamicField": True,
                "fields": [
                    {"fieldName": "pk", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
                    {"fieldName": "uid", "dataType": "Int64", "elementTypeParams": {}},
                    {"fieldName": "float_vector", "dataType": "FloatVector",
                     "elementTypeParams": {"dim": str(dim)}},
                    {"fieldName": "float16_vector", "dataType": "Float16Vector",
                     "elementTypeParams": {"dim": str(dim)}},
                    {"fieldName": "binary_vector", "dataType": "BinaryVector",
                     "elementTypeParams": {"dim": str(dim)}},
                ]
            },
            "indexParams": [
                {"fieldName": "float_vector", "indexName": "float_vector", "metricType": "L2"},
                {"fieldName": "float16_vector", "indexName": "float16_vector", "metricType": "L2"},
                {"fieldName": "binary_vector", "indexName": "binary_vector", "metricType": "HAMMING",
                 "params": {"index_type": "BIN_IVF_FLAT", "nlist": "512"}}
            ]
        }
        rsp = self.collection_client.collection_create(payload)
        assert rsp['code'] == 0
        self.wait_collection_load_completed(name)
        # Insert data
        nb = 100
        data = []
        insert_pks = []
        for i in range(nb):
            insert_pks.append(i)
            data.append({
                "pk": i,
                "uid": i % 10,
                "float_vector": gen_vector(datatype="FloatVector", dim=dim),
                # NOTE(review): float16_vector is populated with FloatVector-style
                # data; presumably the REST layer converts a plain float list to
                # fp16 server-side - confirm this is intentional
                "float16_vector": gen_vector(datatype="FloatVector", dim=dim),
                "binary_vector": gen_vector(datatype="BinaryVector", dim=dim),
            })
        insert_payload = {"collectionName": name, "data": data}
        rsp = self.vector_client.vector_insert(insert_payload)
        assert rsp['code'] == 0
        # Search by pk - specify annsField for float_vector
        test_pks = insert_pks[:3]
        search_payload = {
            "collectionName": name,
            "ids": test_pks,
            "annsField": "float_vector",
            "outputFields": ["pk", "uid"],
            "limit": limit
        }
        rsp = self.vector_client.vector_search(search_payload)
        assert rsp['code'] == 0
        assert len(rsp['data']) == len(test_pks) * limit
        returned_pks = [item['pk'] for item in rsp['data']]
        for pk in test_pks:
            assert pk in returned_pks
@pytest.mark.L0
class TestSearchByPKNegative(TestBase):
"""Negative test cases for search by primary key functionality"""
def test_search_by_pk_with_both_ids_and_data(self):
"""
Search with both ids and data should fail - they are mutually exclusive
"""
name = gen_collection_name()
self.name = name
dim = 128
nb = 100
schema_payload, data, insert_ids = self.init_collection(name, dim=dim, nb=nb, return_insert_id=True)
# Use string IDs to avoid float64 precision loss
test_ids = [str(i) for i in insert_ids[:3]]
vector = preprocessing.normalize([np.array([random.random() for _ in range(dim)])])[0].tolist()
payload = {
"collectionName": name,
"ids": test_ids,
"data": [vector],
"outputFields": ["*"],
"limit": 10
}
rsp = self.vector_client.vector_search(payload)
assert rsp['code'] == 1100
assert "mutually exclusive" in rsp['message'].lower() or "exclusive" in rsp['message'].lower()
def test_search_by_pk_with_empty_ids_array(self):
"""
Search with empty ids array should fail
"""
name = gen_collection_name()
self.name = name
dim = 128
nb = 100
self.init_collection(name, dim=dim, nb=nb)
payload = {
"collectionName": name,
"ids": [],
"outputFields": ["*"],
"limit": 10
}
rsp = self.vector_client.vector_search(payload)
assert rsp['code'] == 1802
def test_search_by_pk_with_fractional_float_ids(self):
"""
Search with fractional float ids should fail for int64 pk
"""
name = gen_collection_name()
self.name = name
dim = 128
nb = 100
self.init_collection(name, dim=dim, nb=nb)
payload = {
"collectionName": name,
"ids": [1.5, 2.7, 3.9],
"outputFields": ["*"],
"limit": 10
}
rsp = self.vector_client.vector_search(payload)
assert rsp['code'] == 1100
def test_search_by_pk_with_invalid_id_type(self):
"""
Search with invalid id type (boolean) should fail
"""
name = gen_collection_name()
self.name = name
dim = 128
nb = 100
self.init_collection(name, dim=dim, nb=nb)
payload = {
"collectionName": name,
"ids": [True, False],
"outputFields": ["*"],
"limit": 10
}
rsp = self.vector_client.vector_search(payload)
assert rsp['code'] == 1100
def test_search_by_pk_with_unparseable_string_ids(self):
"""
Search with string ids that cannot be parsed to int64 should fail
"""
name = gen_collection_name()
self.name = name
dim = 128
nb = 100
self.init_collection(name, dim=dim, nb=nb)
payload = {
"collectionName": name,
"ids": ["abc", "def", "ghi"],
"outputFields": ["*"],
"limit": 10
}
rsp = self.vector_client.vector_search(payload)
assert rsp['code'] == 1100
    def test_search_by_pk_varchar_with_empty_string(self):
        """
        Search varchar pk with empty string should fail with code 1100.
        """
        name = gen_collection_name()
        self.name = name
        dim = 128
        # Create varchar primary key collection
        payload = {
            "collectionName": name,
            "schema": {
                "autoId": False,
                "enableDynamicField": True,
                "fields": [
                    {"fieldName": "pk", "dataType": "VarChar", "isPrimary": True,
                     "elementTypeParams": {"max_length": "128"}},
                    {"fieldName": "uid", "dataType": "Int64", "elementTypeParams": {}},
                    {"fieldName": "vector", "dataType": "FloatVector",
                     "elementTypeParams": {"dim": str(dim)}},
                ]
            },
            "indexParams": [
                {"fieldName": "vector", "indexName": "vector", "metricType": "L2"}
            ]
        }
        rsp = self.collection_client.collection_create(payload)
        assert rsp['code'] == 0
        self.wait_collection_load_completed(name)
        # Insert some data so the rejection is about the id value, not an empty collection
        data = [{
            "pk": "valid_pk",
            "uid": 1,
            "vector": gen_vector(datatype="FloatVector", dim=dim),
        }]
        insert_payload = {"collectionName": name, "data": data}
        rsp = self.vector_client.vector_insert(insert_payload)
        assert rsp['code'] == 0
        # Search with empty string as the varchar primary key: must be rejected
        search_payload = {
            "collectionName": name,
            "ids": [""],
            "outputFields": ["*"],
            "limit": 10
        }
        rsp = self.vector_client.vector_search(search_payload)
        assert rsp['code'] == 1100