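"""
HTTP/REST API tests for Milvus vector operations: insert, upsert, and search.
"""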
import random
from sklearn import preprocessing
import numpy as np
import pandas as pd
import sys
import json
import time
from utils import constant
from utils.utils import gen_collection_name, get_sorted_distance, patch_faker_text, en_vocabularies_distribution, zh_vocabularies_distribution
from utils.util_log import test_log as logger
import pytest
from base.testbase import TestBase
from utils.utils import (gen_unique_str, get_data_by_payload, get_common_fields_by_data, gen_vector, analyze_documents)
from pymilvus import (
    FieldSchema, CollectionSchema, DataType,
    Collection, utility
)
from faker import Faker
Faker.seed(19530)
fake_en = Faker("en_US")
fake_zh = Faker("zh_CN")

patch_faker_text(fake_en, en_vocabularies_distribution)
patch_faker_text(fake_zh, zh_vocabularies_distribution)
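
# The insert/upsert tests below repeatedly build L2-normalized random float
# vectors inline via sklearn. A minimal helper sketch of that pattern, for
# reference only (the tests keep the inline form):
def gen_normalized_float_vec(dim: int) -> list:
    """Return one L2-normalized random float vector of length `dim` (sketch)."""
    vec = np.array([random.random() for _ in range(dim)])
    return preprocessing.normalize([vec])[0].tolist()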

@pytest.mark.L0
class TestInsertVector(TestBase):

    @pytest.mark.parametrize("insert_round", [3])
    @pytest.mark.parametrize("nb", [3000])
    @pytest.mark.parametrize("dim", [128])
    def test_insert_entities_with_simple_payload(self, nb, dim, insert_round):
        """
        Insert a vector with a simple payload
        """
        # create a collection
        name = gen_collection_name()
        collection_payload = {
            "collectionName": name,
            "dimension": dim,
            "metricType": "L2"
        }
        rsp = self.collection_client.collection_create(collection_payload)
        assert rsp['code'] == 0
        rsp = self.collection_client.collection_describe(name)
        logger.info(f"rsp: {rsp}")
        assert rsp['code'] == 0
        # insert data
        for _ in range(insert_round):
            data = get_data_by_payload(collection_payload, nb)
            payload = {
                "collectionName": name,
                "data": data,
            }
            body_size = sys.getsizeof(json.dumps(payload))
            logger.info(f"body size: {body_size / 1024 / 1024} MB")
            rsp = self.vector_client.vector_insert(payload)
            assert rsp['code'] == 0
            assert rsp['data']['insertCount'] == nb

    @pytest.mark.parametrize("insert_round", [1])
    @pytest.mark.parametrize("auto_id", [True, False])
    @pytest.mark.parametrize("is_partition_key", [True, False])
    @pytest.mark.parametrize("enable_dynamic_schema", [True, False])
    @pytest.mark.parametrize("nb", [3000])
    @pytest.mark.parametrize("dim", [128])
    def test_insert_entities_with_all_scalar_datatype(self, nb, dim, insert_round, auto_id,
                                                      is_partition_key, enable_dynamic_schema):
        """
        Insert entities with all scalar datatypes (Int64, VarChar, Bool, JSON, and arrays)
        """
        # create a collection
        name = gen_collection_name()
        payload = {
            "collectionName": name,
            "schema": {
                "autoId": auto_id,
                "enableDynamicField": enable_dynamic_schema,
                "fields": [
                    {"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
                    {"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": is_partition_key,
                     "elementTypeParams": {}},
                    {"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
                    {"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
                    {"fieldName": "bool", "dataType": "Bool", "elementTypeParams": {}},
                    {"fieldName": "json", "dataType": "JSON", "elementTypeParams": {}},
                    {"fieldName": "int_array", "dataType": "Array", "elementDataType": "Int64",
                     "elementTypeParams": {"max_capacity": "1024"}},
                    {"fieldName": "varchar_array", "dataType": "Array", "elementDataType": "VarChar",
                     "elementTypeParams": {"max_capacity": "1024", "max_length": "256"}},
                    {"fieldName": "bool_array", "dataType": "Array", "elementDataType": "Bool",
                     "elementTypeParams": {"max_capacity": "1024"}},
                    {"fieldName": "text_emb", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}},
                    {"fieldName": "image_emb", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}},
                ]
            },
            "indexParams": [
                {"fieldName": "text_emb", "indexName": "text_emb", "metricType": "L2"},
                {"fieldName": "image_emb", "indexName": "image_emb", "metricType": "L2"}
            ]
        }
        rsp = self.collection_client.collection_create(payload)
        assert rsp['code'] == 0
        rsp = self.collection_client.collection_describe(name)
        logger.info(f"rsp: {rsp}")
        assert rsp['code'] == 0
        # insert data
        for _ in range(insert_round):
            data = []
            for i in range(nb):
                if auto_id:
                    tmp = {
                        "user_id": i,
                        "word_count": i,
                        "book_describe": f"book_{i}",
                        "bool": random.choice([True, False]),
                        "json": {"key": i},
                        "int_array": [i],
                        "varchar_array": [f"varchar_{i}"],
                        "bool_array": [random.choice([True, False])],
                        "text_emb": preprocessing.normalize([np.array([random.random() for _ in range(dim)])])[0].tolist(),
                        "image_emb": preprocessing.normalize([np.array([random.random() for _ in range(dim)])])[0].tolist(),
                    }
                else:
                    tmp = {
                        "book_id": i,
                        "user_id": i,
                        "word_count": i,
                        "book_describe": f"book_{i}",
                        "bool": random.choice([True, False]),
                        "json": {"key": i},
                        "int_array": [i],
                        "varchar_array": [f"varchar_{i}"],
                        "bool_array": [random.choice([True, False])],
                        "text_emb": preprocessing.normalize([np.array([random.random() for _ in range(dim)])])[0].tolist(),
                        "image_emb": preprocessing.normalize([np.array([random.random() for _ in range(dim)])])[0].tolist(),
                    }
                if enable_dynamic_schema:
                    tmp.update({f"dynamic_field_{i}": i})
                data.append(tmp)
            payload = {
                "collectionName": name,
                "data": data,
            }
            rsp = self.vector_client.vector_insert(payload)
            assert rsp['code'] == 0
            assert rsp['data']['insertCount'] == nb
        # query data to make sure the data is inserted
        rsp = self.vector_client.vector_query({"collectionName": name, "filter": "user_id > 0", "limit": 50})
        assert rsp['code'] == 0
        assert len(rsp['data']) == 50

    @pytest.mark.parametrize("insert_round", [1])
    @pytest.mark.parametrize("auto_id", [True])
    @pytest.mark.parametrize("is_partition_key", [True])
    @pytest.mark.parametrize("enable_dynamic_schema", [True])
    @pytest.mark.parametrize("nb", [3000])
    @pytest.mark.parametrize("dim", [128])
    @pytest.mark.parametrize("pass_fp32_to_fp16_or_bf16", [True, False])
    def test_insert_entities_with_all_vector_datatype(self, nb, dim, insert_round, auto_id,
                                                      is_partition_key, enable_dynamic_schema,
                                                      pass_fp32_to_fp16_or_bf16):
        """
        Insert entities with all vector datatypes (float32, float16, bfloat16, binary)
        """
        # create a collection
        name = gen_collection_name()
        payload = {
            "collectionName": name,
            "schema": {
                "autoId": auto_id,
                "enableDynamicField": enable_dynamic_schema,
                "fields": [
                    {"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
                    {"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": is_partition_key,
                     "elementTypeParams": {}},
                    {"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
                    {"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
                    {"fieldName": "float_vector", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}},
                    {"fieldName": "float16_vector", "dataType": "Float16Vector",
                     "elementTypeParams": {"dim": f"{dim}"}},
                    {"fieldName": "bfloat16_vector", "dataType": "BFloat16Vector",
                     "elementTypeParams": {"dim": f"{dim}"}},
                    {"fieldName": "binary_vector", "dataType": "BinaryVector", "elementTypeParams": {"dim": f"{dim}"}},
                ]
            },
            "indexParams": [
                {"fieldName": "float_vector", "indexName": "float_vector", "metricType": "L2"},
                {"fieldName": "float16_vector", "indexName": "float16_vector", "metricType": "L2"},
                {"fieldName": "bfloat16_vector", "indexName": "bfloat16_vector", "metricType": "L2"},
                {"fieldName": "binary_vector", "indexName": "binary_vector", "metricType": "HAMMING",
                 "params": {"index_type": "BIN_IVF_FLAT", "nlist": "512"}}
            ]
        }
        rsp = self.collection_client.collection_create(payload)
        assert rsp['code'] == 0
        rsp = self.collection_client.collection_describe(name)
        logger.info(f"rsp: {rsp}")
        assert rsp['code'] == 0
        # insert data; optionally pass fp32 lists into the fp16/bf16 fields
        for _ in range(insert_round):
            data = []
            for i in range(nb):
                if auto_id:
                    tmp = {
                        "user_id": i,
                        "word_count": i,
                        "book_describe": f"book_{i}",
                        "float_vector": gen_vector(datatype="FloatVector", dim=dim),
                        "float16_vector": (
                            gen_vector(datatype="FloatVector", dim=dim)
                            if pass_fp32_to_fp16_or_bf16
                            else gen_vector(datatype="Float16Vector", dim=dim)
                        ),
                        "bfloat16_vector": (
                            gen_vector(datatype="FloatVector", dim=dim)
                            if pass_fp32_to_fp16_or_bf16
                            else gen_vector(datatype="BFloat16Vector", dim=dim)
                        ),
                        "binary_vector": gen_vector(datatype="BinaryVector", dim=dim),
                    }
                else:
                    tmp = {
                        "book_id": i,
                        "user_id": i,
                        "word_count": i,
                        "book_describe": f"book_{i}",
                        "float_vector": gen_vector(datatype="FloatVector", dim=dim),
                        "float16_vector": (
                            gen_vector(datatype="FloatVector", dim=dim)
                            if pass_fp32_to_fp16_or_bf16
                            else gen_vector(datatype="Float16Vector", dim=dim)
                        ),
                        "bfloat16_vector": (
                            gen_vector(datatype="FloatVector", dim=dim)
                            if pass_fp32_to_fp16_or_bf16
                            else gen_vector(datatype="BFloat16Vector", dim=dim)
                        ),
                        "binary_vector": gen_vector(datatype="BinaryVector", dim=dim),
                    }
                if enable_dynamic_schema:
                    tmp.update({f"dynamic_field_{i}": i})
                data.append(tmp)
            payload = {
                "collectionName": name,
                "data": data,
            }
            rsp = self.vector_client.vector_insert(payload)
            assert rsp['code'] == 0
            assert rsp['data']['insertCount'] == nb
        c = Collection(name)
        res = c.query(
            expr="user_id > 0",
            limit=1,
            output_fields=["*"],
        )
        logger.info(f"res: {res}")
        # query data to make sure the data is inserted
        rsp = self.vector_client.vector_query({"collectionName": name, "filter": "user_id > 0", "limit": 50})
        assert rsp['code'] == 0
        assert len(rsp['data']) == 50

    @pytest.mark.parametrize("insert_round", [1])
    @pytest.mark.parametrize("auto_id", [True])
    @pytest.mark.parametrize("is_partition_key", [True])
    @pytest.mark.parametrize("enable_dynamic_schema", [True])
    @pytest.mark.parametrize("nb", [3000])
    @pytest.mark.parametrize("dim", [128])
    @pytest.mark.parametrize("pass_fp32_to_fp16_or_bf16", [True, False])
    def test_insert_entities_with_all_vector_datatype_0(self, nb, dim, insert_round, auto_id,
                                                        is_partition_key, enable_dynamic_schema,
                                                        pass_fp32_to_fp16_or_bf16):
        """
        Insert entities with float vector datatypes indexed with FLAT, IVF_FLAT, IVF_SQ8, and IVF_PQ
        """
        # create a collection
        name = gen_collection_name()
        payload = {
            "collectionName": name,
            "schema": {
                "autoId": auto_id,
                "enableDynamicField": enable_dynamic_schema,
                "fields": [
                    {"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
                    {"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": is_partition_key,
                     "elementTypeParams": {}},
                    {"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
                    {"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
                    {"fieldName": "book_vector", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}},
                    {"fieldName": "float_vector", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}},
                    {"fieldName": "float16_vector", "dataType": "Float16Vector",
                     "elementTypeParams": {"dim": f"{dim}"}},
                    {"fieldName": "bfloat16_vector", "dataType": "BFloat16Vector",
                     "elementTypeParams": {"dim": f"{dim}"}},
                ]
            },
            "indexParams": [
                {"fieldName": "book_vector", "indexName": "book_vector", "metricType": "L2",
                 "params": {"index_type": "FLAT"}},
                {"fieldName": "float_vector", "indexName": "float_vector", "metricType": "L2",
                 "params": {"index_type": "IVF_FLAT", "nlist": 128}},
                {"fieldName": "float16_vector", "indexName": "float16_vector", "metricType": "L2",
                 "params": {"index_type": "IVF_SQ8", "nlist": "128"}},
                {"fieldName": "bfloat16_vector", "indexName": "bfloat16_vector", "metricType": "L2",
                 "params": {"index_type": "IVF_PQ", "nlist": 128, "m": 16, "nbits": 8}},
            ]
        }

        rsp = self.collection_client.collection_create(payload)
        assert rsp['code'] == 0
        rsp = self.collection_client.collection_describe(name)
        logger.info(f"rsp: {rsp}")
        assert rsp['code'] == 0
        # insert data
        for _ in range(insert_round):
            data = []
            for i in range(nb):
                if auto_id:
                    tmp = {
                        "user_id": i,
                        "word_count": i,
                        "book_describe": f"book_{i}",
                        "book_vector": gen_vector(datatype="FloatVector", dim=dim),
                        "float_vector": gen_vector(datatype="FloatVector", dim=dim),
                        "float16_vector": (
                            gen_vector(datatype="FloatVector", dim=dim)
                            if pass_fp32_to_fp16_or_bf16
                            else gen_vector(datatype="Float16Vector", dim=dim)
                        ),
                        "bfloat16_vector": (
                            gen_vector(datatype="FloatVector", dim=dim)
                            if pass_fp32_to_fp16_or_bf16
                            else gen_vector(datatype="BFloat16Vector", dim=dim)
                        ),
                    }
                else:
                    tmp = {
                        "book_id": i,
                        "user_id": i,
                        "word_count": i,
                        "book_describe": f"book_{i}",
                        "book_vector": gen_vector(datatype="FloatVector", dim=dim),
                        "float_vector": gen_vector(datatype="FloatVector", dim=dim),
                        "float16_vector": (
                            gen_vector(datatype="FloatVector", dim=dim)
                            if pass_fp32_to_fp16_or_bf16
                            else gen_vector(datatype="Float16Vector", dim=dim)
                        ),
                        "bfloat16_vector": (
                            gen_vector(datatype="FloatVector", dim=dim)
                            if pass_fp32_to_fp16_or_bf16
                            else gen_vector(datatype="BFloat16Vector", dim=dim)
                        ),
                    }
                if enable_dynamic_schema:
                    tmp.update({f"dynamic_field_{i}": i})
                data.append(tmp)
            payload = {
                "collectionName": name,
                "data": data,
            }
            rsp = self.vector_client.vector_insert(payload)
            assert rsp['code'] == 0
            assert rsp['data']['insertCount'] == nb
        c = Collection(name)
        res = c.query(
            expr="user_id > 0",
            limit=1,
            output_fields=["*"],
        )
        logger.info(f"res: {res}")
        # query data to make sure the data is inserted
        rsp = self.vector_client.vector_query({"collectionName": name, "filter": "user_id > 0", "limit": 50})
        assert rsp['code'] == 0
        assert len(rsp['data']) == 50

    @pytest.mark.parametrize("insert_round", [1])
    @pytest.mark.parametrize("auto_id", [True])
    @pytest.mark.parametrize("is_partition_key", [True])
    @pytest.mark.parametrize("enable_dynamic_schema", [True])
    @pytest.mark.parametrize("nb", [3000])
    @pytest.mark.parametrize("dim", [128])
    @pytest.mark.parametrize("pass_fp32_to_fp16_or_bf16", [True, False])
    def test_insert_entities_with_all_vector_datatype_1(self, nb, dim, insert_round, auto_id,
                                                        is_partition_key, enable_dynamic_schema,
                                                        pass_fp32_to_fp16_or_bf16):
        """
        Insert entities with float vector datatypes indexed with HNSW, SCANN, and DISKANN
        """
        # create a collection
        name = gen_collection_name()
        payload = {
            "collectionName": name,
            "schema": {
                "autoId": auto_id,
                "enableDynamicField": enable_dynamic_schema,
                "fields": [
                    {"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
                    {"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": is_partition_key,
                     "elementTypeParams": {}},
                    {"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
                    {"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
                    {"fieldName": "float_vector", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}},
                    {"fieldName": "float16_vector", "dataType": "Float16Vector",
                     "elementTypeParams": {"dim": f"{dim}"}},
                    {"fieldName": "bfloat16_vector", "dataType": "BFloat16Vector",
                     "elementTypeParams": {"dim": f"{dim}"}},
                ]
            },
            "indexParams": [
                {"fieldName": "float_vector", "indexName": "float_vector", "metricType": "L2",
                 "params": {"index_type": "HNSW", "M": 32, "efConstruction": 360}},
                {"fieldName": "float16_vector", "indexName": "float16_vector", "metricType": "L2",
                 "params": {"index_type": "SCANN", "nlist": "128"}},
                {"fieldName": "bfloat16_vector", "indexName": "bfloat16_vector", "metricType": "L2",
                 "params": {"index_type": "DISKANN"}},
            ]
        }

        rsp = self.collection_client.collection_create(payload)
        assert rsp['code'] == 0
        rsp = self.collection_client.collection_describe(name)
        logger.info(f"rsp: {rsp}")
        assert rsp['code'] == 0
        # insert data
        for _ in range(insert_round):
            data = []
            for i in range(nb):
                if auto_id:
                    tmp = {
                        "user_id": i,
                        "word_count": i,
                        "book_describe": f"book_{i}",
                        "float_vector": gen_vector(datatype="FloatVector", dim=dim),
                        "float16_vector": (
                            gen_vector(datatype="FloatVector", dim=dim)
                            if pass_fp32_to_fp16_or_bf16
                            else gen_vector(datatype="Float16Vector", dim=dim)
                        ),
                        "bfloat16_vector": (
                            gen_vector(datatype="FloatVector", dim=dim)
                            if pass_fp32_to_fp16_or_bf16
                            else gen_vector(datatype="BFloat16Vector", dim=dim)
                        ),
                    }
                else:
                    tmp = {
                        "book_id": i,
                        "user_id": i,
                        "word_count": i,
                        "book_describe": f"book_{i}",
                        "float_vector": gen_vector(datatype="FloatVector", dim=dim),
                        "float16_vector": (
                            gen_vector(datatype="FloatVector", dim=dim)
                            if pass_fp32_to_fp16_or_bf16
                            else gen_vector(datatype="Float16Vector", dim=dim)
                        ),
                        "bfloat16_vector": (
                            gen_vector(datatype="FloatVector", dim=dim)
                            if pass_fp32_to_fp16_or_bf16
                            else gen_vector(datatype="BFloat16Vector", dim=dim)
                        ),
                    }
                if enable_dynamic_schema:
                    tmp.update({f"dynamic_field_{i}": i})
                data.append(tmp)
            payload = {
                "collectionName": name,
                "data": data,
            }
            rsp = self.vector_client.vector_insert(payload)
            assert rsp['code'] == 0
            assert rsp['data']['insertCount'] == nb
        c = Collection(name)
        res = c.query(
            expr="user_id > 0",
            limit=1,
            output_fields=["*"],
        )
        logger.info(f"res: {res}")
        # query data to make sure the data is inserted
        rsp = self.vector_client.vector_query({"collectionName": name, "filter": "user_id > 0", "limit": 50})
        assert rsp['code'] == 0
        assert len(rsp['data']) == 50

    @pytest.mark.parametrize("insert_round", [1])
    @pytest.mark.parametrize("auto_id", [True])
    @pytest.mark.parametrize("is_partition_key", [True])
    @pytest.mark.parametrize("enable_dynamic_schema", [True])
    @pytest.mark.parametrize("nb", [3000])
    @pytest.mark.parametrize("dim", [128])
    def test_insert_entities_with_all_vector_datatype_2(self, nb, dim, insert_round, auto_id,
                                                        is_partition_key, enable_dynamic_schema):
        """
        Insert entities with binary and sparse float vector datatypes
        """
        # create a collection
        name = gen_collection_name()
        payload = {
            "collectionName": name,
            "schema": {
                "autoId": auto_id,
                "enableDynamicField": enable_dynamic_schema,
                "fields": [
                    {"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
                    {"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": is_partition_key,
                     "elementTypeParams": {}},
                    {"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
                    {"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
                    {"fieldName": "binary_vector_0", "dataType": "BinaryVector", "elementTypeParams": {"dim": f"{dim}"}},
                    {"fieldName": "binary_vector_1", "dataType": "BinaryVector", "elementTypeParams": {"dim": f"{dim}"}},
                    {"fieldName": "sparse_float_vector_0", "dataType": "SparseFloatVector"},
                    {"fieldName": "sparse_float_vector_1", "dataType": "SparseFloatVector"},
                ]
            },
            "indexParams": [
                {"fieldName": "binary_vector_0", "indexName": "binary_vector_0_index", "metricType": "HAMMING",
                 "params": {"index_type": "BIN_FLAT"}},
                {"fieldName": "binary_vector_1", "indexName": "binary_vector_1_index", "metricType": "HAMMING",
                 "params": {"index_type": "BIN_IVF_FLAT", "nlist": "512"}},
                {"fieldName": "sparse_float_vector_0", "indexName": "sparse_float_vector_0_index", "metricType": "IP",
                 "params": {"index_type": "SPARSE_INVERTED_INDEX", "drop_ratio_build": "0.2"}},
                {"fieldName": "sparse_float_vector_1", "indexName": "sparse_float_vector_1_index", "metricType": "IP",
                 "params": {"index_type": "SPARSE_WAND", "drop_ratio_build": "0.2"}}
            ]
        }

        rsp = self.collection_client.collection_create(payload)
        assert rsp['code'] == 0
        rsp = self.collection_client.collection_describe(name)
        logger.info(f"rsp: {rsp}")
        assert rsp['code'] == 0
        # insert data
        for _ in range(insert_round):
            data = []
            for i in range(nb):
                if auto_id:
                    tmp = {
                        "user_id": i,
                        "word_count": i,
                        "book_describe": f"book_{i}",
                        "binary_vector_0": gen_vector(datatype="BinaryVector", dim=dim),
                        "binary_vector_1": gen_vector(datatype="BinaryVector", dim=dim),
                        "sparse_float_vector_0": gen_vector(datatype="SparseFloatVector", dim=dim, sparse_format="dok"),
                        "sparse_float_vector_1": gen_vector(datatype="SparseFloatVector", dim=dim, sparse_format="dok"),
                    }
                else:
                    tmp = {
                        "book_id": i,
                        "user_id": i,
                        "word_count": i,
                        "book_describe": f"book_{i}",
                        "binary_vector_0": gen_vector(datatype="BinaryVector", dim=dim),
                        "binary_vector_1": gen_vector(datatype="BinaryVector", dim=dim),
                        "sparse_float_vector_0": gen_vector(datatype="SparseFloatVector", dim=dim, sparse_format="dok"),
                        "sparse_float_vector_1": gen_vector(datatype="SparseFloatVector", dim=dim, sparse_format="dok"),
                    }
                if enable_dynamic_schema:
                    tmp.update({f"dynamic_field_{i}": i})
                data.append(tmp)
            payload = {
                "collectionName": name,
                "data": data,
            }
            rsp = self.vector_client.vector_insert(payload)
            assert rsp['code'] == 0
            assert rsp['data']['insertCount'] == nb
        c = Collection(name)
        res = c.query(
            expr="user_id > 0",
            limit=1,
            output_fields=["*"],
        )
        logger.info(f"res: {res}")
        # query data to make sure the data is inserted
        rsp = self.vector_client.vector_query({"collectionName": name, "filter": "user_id > 0", "limit": 50})
        assert rsp['code'] == 0
        assert len(rsp['data']) == 50

    @pytest.mark.parametrize("insert_round", [1])
    @pytest.mark.parametrize("auto_id", [True, False])
    @pytest.mark.parametrize("is_partition_key", [True, False])
    @pytest.mark.parametrize("enable_dynamic_schema", [True, False])
    @pytest.mark.parametrize("nb", [3000])
    @pytest.mark.parametrize("dim", [128])
    def test_insert_entities_with_all_json_datatype(self, nb, dim, insert_round, auto_id,
                                                    is_partition_key, enable_dynamic_schema):
        """
        Insert entities covering all supported JSON value types (int, float, string, arrays, object)
        """
        # create a collection
        name = gen_collection_name()
        payload = {
            "collectionName": name,
            "schema": {
                "autoId": auto_id,
                "enableDynamicField": enable_dynamic_schema,
                "fields": [
                    {"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
                    {"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": is_partition_key,
                     "elementTypeParams": {}},
                    {"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
                    {"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
                    {"fieldName": "bool", "dataType": "Bool", "elementTypeParams": {}},
                    {"fieldName": "json", "dataType": "JSON", "elementTypeParams": {}},
                    {"fieldName": "int_array", "dataType": "Array", "elementDataType": "Int64",
                     "elementTypeParams": {"max_capacity": "1024"}},
                    {"fieldName": "varchar_array", "dataType": "Array", "elementDataType": "VarChar",
                     "elementTypeParams": {"max_capacity": "1024", "max_length": "256"}},
                    {"fieldName": "bool_array", "dataType": "Array", "elementDataType": "Bool",
                     "elementTypeParams": {"max_capacity": "1024"}},
                    {"fieldName": "text_emb", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}},
                    {"fieldName": "image_emb", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}},
                ]
            },
            "indexParams": [
                {"fieldName": "text_emb", "indexName": "text_emb", "metricType": "L2"},
                {"fieldName": "image_emb", "indexName": "image_emb", "metricType": "L2"}
            ]
        }
        rsp = self.collection_client.collection_create(payload)
        assert rsp['code'] == 0
        rsp = self.collection_client.collection_describe(name)
        logger.info(f"rsp: {rsp}")
        assert rsp['code'] == 0
        json_value = [
            1,
            1.0,
            "1",
            [1, 2, 3],
            ["1", "2", "3"],
            [1, 2, "3"],
            {"key": "value"},
        ]
        # insert data, cycling through the JSON value types
        for _ in range(insert_round):
            data = []
            for i in range(nb):
                if auto_id:
                    tmp = {
                        "user_id": i,
                        "word_count": i,
                        "book_describe": f"book_{i}",
                        "bool": random.choice([True, False]),
                        "json": json_value[i % len(json_value)],
                        "int_array": [i],
                        "varchar_array": [f"varchar_{i}"],
                        "bool_array": [random.choice([True, False])],
                        "text_emb": preprocessing.normalize([np.array([random.random() for _ in range(dim)])])[0].tolist(),
                        "image_emb": preprocessing.normalize([np.array([random.random() for _ in range(dim)])])[0].tolist(),
                    }
                else:
                    tmp = {
                        "book_id": i,
                        "user_id": i,
                        "word_count": i,
                        "book_describe": f"book_{i}",
                        "bool": random.choice([True, False]),
                        "json": json_value[i % len(json_value)],
                        "int_array": [i],
                        "varchar_array": [f"varchar_{i}"],
                        "bool_array": [random.choice([True, False])],
                        "text_emb": preprocessing.normalize([np.array([random.random() for _ in range(dim)])])[0].tolist(),
                        "image_emb": preprocessing.normalize([np.array([random.random() for _ in range(dim)])])[0].tolist(),
                    }
                if enable_dynamic_schema:
                    tmp.update({f"dynamic_field_{i}": i})
                data.append(tmp)
            payload = {
                "collectionName": name,
                "data": data,
            }
            rsp = self.vector_client.vector_insert(payload)
            assert rsp['code'] == 0
            assert rsp['data']['insertCount'] == nb
        # query data to make sure the data is inserted
        rsp = self.vector_client.vector_query({"collectionName": name, "filter": "user_id > 0", "limit": 50})
        assert rsp['code'] == 0
        assert len(rsp['data']) == 50

    @pytest.mark.parametrize("insert_round", [1])
    @pytest.mark.parametrize("auto_id", [True, False])
    @pytest.mark.parametrize("is_partition_key", [True, False])
    @pytest.mark.parametrize("enable_dynamic_schema", [True, False])
    @pytest.mark.parametrize("nb", [3000])
    @pytest.mark.parametrize("dim", [128])
    def test_insert_entities_with_default_none(self, nb, dim, insert_round, auto_id, is_partition_key,
                                               enable_dynamic_schema):
        """
        Insert entities passing None into nullable fields and into fields with a defaultValue
        """
        # create a collection
        name = gen_collection_name()
        payload = {
            "collectionName": name,
            "schema": {
                "autoId": auto_id,
                "enableDynamicField": enable_dynamic_schema,
                "fields": [
                    {"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
                    {"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": is_partition_key,
                     "elementTypeParams": {}, "defaultValue": 10},
                    {"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}, "nullable": True},
                    {"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"},
                     "defaultValue": "default", "nullable": True},
                    {"fieldName": "json", "dataType": "JSON", "elementTypeParams": {}, "nullable": True},
                    {"fieldName": "varchar_array", "dataType": "Array", "elementDataType": "VarChar",
                     "elementTypeParams": {"max_capacity": "1024", "max_length": "256"}, "nullable": True},
                    {"fieldName": "text_emb", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}},
                ]
            },
            "indexParams": [
                {"fieldName": "text_emb", "indexName": "text_emb", "metricType": "L2"},
            ]
        }
        rsp = self.collection_client.collection_create(payload)
        assert rsp['code'] == 0
        rsp = self.collection_client.collection_describe(name)
        logger.info(f"rsp: {rsp}")
        assert rsp['code'] == 0
        # insert data
        for _ in range(insert_round):
            data = []
            for i in range(nb):
                if auto_id:
                    tmp = {
                        "user_id": None,
                        "word_count": None,
                        "book_describe": None,
                        "json": None,
                        "varchar_array": None,
                        "text_emb": preprocessing.normalize([np.array([random.random() for _ in range(dim)])])[0].tolist(),
                    }
                else:
                    tmp = {
                        "book_id": i,
                        "user_id": None,
                        "word_count": None,
                        "book_describe": None,
                        "json": None,
                        "varchar_array": None,
                        "text_emb": preprocessing.normalize([np.array([random.random() for _ in range(dim)])])[0].tolist(),
                    }
                if enable_dynamic_schema:
                    tmp.update({f"dynamic_field_{i}": i})
                data.append(tmp)
            payload = {
                "collectionName": name,
                "data": data,
            }
            rsp = self.vector_client.vector_insert(payload)
            assert rsp['code'] == 0
            assert rsp['data']['insertCount'] == nb
        # query data to make sure the data is inserted; user_id fell back to its defaultValue (10)
        rsp = self.vector_client.vector_query({"collectionName": name, "filter": "user_id > 0", "limit": 5})
        assert rsp['code'] == 0
        assert len(rsp['data']) == 5
        assert rsp['data'][0]['book_describe'] == 'default'
        assert rsp['data'][0]['word_count'] is None
        assert rsp['data'][0]['json'] is None


@pytest.mark.L0
class TestInsertVectorNegative(TestBase):

    def test_insert_vector_with_invalid_collection_name(self):
        """
        Insert a vector with an invalid collection name
        """

        # create a collection
        name = gen_collection_name()
        dim = 128
        payload = {
            "collectionName": name,
            "dimension": dim,
        }
        rsp = self.collection_client.collection_create(payload)
        assert rsp['code'] == 0
        rsp = self.collection_client.collection_describe(name)
        assert rsp['code'] == 0
        # insert data
        nb = 100
        data = get_data_by_payload(payload, nb)
        payload = {
            "collectionName": "invalid_collection_name",
            "data": data,
        }
        body_size = sys.getsizeof(json.dumps(payload))
        logger.info(f"body size: {body_size / 1024 / 1024} MB")
        rsp = self.vector_client.vector_insert(payload)
        assert rsp['code'] == 100
        assert "can't find collection" in rsp['message']

    def test_insert_vector_with_invalid_database_name(self):
        """
        Insert a vector with an invalid database name
        """
        # create a collection
        name = gen_collection_name()
        dim = 128
        payload = {
            "collectionName": name,
            "dimension": dim,
        }
        rsp = self.collection_client.collection_create(payload)
        assert rsp['code'] == 0
        rsp = self.collection_client.collection_describe(name)
        assert rsp['code'] == 0
        # insert data
        nb = 10
        data = get_data_by_payload(payload, nb)
        payload = {
            "collectionName": name,
            "data": data,
        }
        body_size = sys.getsizeof(json.dumps(payload))
        logger.info(f"body size: {body_size / 1024 / 1024} MB")
        rsp = self.vector_client.vector_insert(payload, db_name="invalid_database")
        assert rsp['code'] == 800

    def test_insert_vector_with_mismatch_dim(self):
        """
        Insert a vector whose dim does not match the collection schema
        """
        # create a collection
        name = gen_collection_name()
        dim = 32
        payload = {
            "collectionName": name,
            "dimension": dim,
        }
        rsp = self.collection_client.collection_create(payload)
        assert rsp['code'] == 0
        rsp = self.collection_client.collection_describe(name)
        assert rsp['code'] == 0
        # insert data with vectors of dim + 1
        nb = 1
        data = [
            {"id": i,
             "vector": [np.float64(random.random()) for _ in range(dim + 1)],
             } for i in range(nb)
        ]
        payload = {
            "collectionName": name,
            "data": data,
        }
        body_size = sys.getsizeof(json.dumps(payload))
        logger.info(f"body size: {body_size / 1024 / 1024} MB")
        rsp = self.vector_client.vector_insert(payload)
        assert rsp['code'] == 1804
        assert "fail to deal the insert data" in rsp['message']

    def test_insert_entities_with_none_no_nullable_field(self):
        """
        Insert entities with None for a field that is not nullable
        """
        # create a collection
        name = gen_collection_name()
        payload = {
            "collectionName": name,
            "schema": {
                "autoId": True,
                "fields": [
                    {"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
                    {"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
                    {"fieldName": "text_emb", "dataType": "FloatVector", "elementTypeParams": {"dim": "128"}},
                ]
            }
        }
        rsp = self.collection_client.collection_create(payload)
        assert rsp['code'] == 0
        rsp = self.collection_client.collection_describe(name)
        logger.info(f"rsp: {rsp}")
        assert rsp['code'] == 0
        # insert data with None in the non-nullable word_count field
        data = []
        for i in range(10):
            tmp = {
                "word_count": i if i % 2 else None,
                "text_emb": preprocessing.normalize([np.array([random.random() for _ in range(128)])])[0].tolist(),
            }
            data.append(tmp)
        payload = {
            "collectionName": name,
            "data": data,
        }
        rsp = self.vector_client.vector_insert(payload)
        assert rsp['code'] == 1804
        assert "fail to deal the insert data" in rsp['message']


@pytest.mark.L0
class TestUpsertVector(TestBase):

    @pytest.mark.parametrize("insert_round", [2])
    @pytest.mark.parametrize("nb", [3000])
    @pytest.mark.parametrize("dim", [128])
    @pytest.mark.parametrize("id_type", ["Int64", "VarChar"])
    def test_upsert_vector_default(self, nb, dim, insert_round, id_type):
        # create a collection
        name = gen_collection_name()
        payload = {
            "collectionName": name,
            "schema": {
                "fields": [
                    {"fieldName": "book_id", "dataType": f"{id_type}", "isPrimary": True, "elementTypeParams": {"max_length": "256"}},
                    {"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": True, "elementTypeParams": {}},
                    {"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
                    {"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
                    {"fieldName": "text_emb", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}}
                ]
            },
            "indexParams": [{"fieldName": "text_emb", "indexName": "text_emb_index", "metricType": "L2"}]
        }
        rsp = self.collection_client.collection_create(payload)
        assert rsp['code'] == 0
        rsp = self.collection_client.collection_describe(name)
        logger.info(f"rsp: {rsp}")
        assert rsp['code'] == 0
        # insert data
        for i in range(insert_round):
            data = []
            for j in range(nb):
                tmp = {
                    "book_id": i * nb + j if id_type == "Int64" else f"{i * nb + j}",
                    "user_id": i * nb + j,
                    "word_count": i * nb + j,
                    "book_describe": f"book_{i * nb + j}",
                    "text_emb": preprocessing.normalize([np.array([random.random() for _ in range(dim)])])[0].tolist()
                }
                data.append(tmp)
            payload = {
                "collectionName": name,
                "data": data,
            }
            body_size = sys.getsizeof(json.dumps(payload))
            logger.info(f"body size: {body_size / 1024 / 1024} MB")
            rsp = self.vector_client.vector_insert(payload)
            assert rsp['code'] == 0
            assert rsp['data']['insertCount'] == nb
        c = Collection(name)
        c.flush()

        # upsert data
        for i in range(insert_round):
            data = []
            for j in range(nb):
                tmp = {
                    "book_id": i * nb + j if id_type == "Int64" else f"{i * nb + j}",
                    "user_id": i * nb + j + 1,
                    "word_count": i * nb + j + 2,
                    "book_describe": f"book_{i * nb + j + 3}",
                    "text_emb": preprocessing.normalize([np.array([random.random() for _ in range(dim)])])[0].tolist()
                }
                data.append(tmp)
            payload = {
                "collectionName": name,
                "data": data,
            }
            body_size = sys.getsizeof(json.dumps(payload))
            logger.info(f"body size: {body_size / 1024 / 1024} MB")
            rsp = self.vector_client.vector_upsert(payload)
        # query data to make sure the data is updated
        if id_type == "Int64":
            rsp = self.vector_client.vector_query({"collectionName": name, "filter": "book_id > 0"})
        if id_type == "VarChar":
            rsp = self.vector_client.vector_query({"collectionName": name, "filter": "book_id > '0'"})
        for data in rsp['data']:
            assert data['user_id'] == int(data['book_id']) + 1
            assert data['word_count'] == int(data['book_id']) + 2
            assert data['book_describe'] == f"book_{int(data['book_id']) + 3}"
        res = utility.get_query_segment_info(name)
        logger.info(f"res: {res}")

    @pytest.mark.parametrize("insert_round", [2])
    @pytest.mark.parametrize("nb", [3000])
    @pytest.mark.parametrize("dim", [128])
    @pytest.mark.parametrize("id_type", ["Int64", "VarChar"])
    @pytest.mark.xfail(reason="auto_id is currently not supported for upsert")
    def test_upsert_vector_pk_auto_id(self, nb, dim, insert_round, id_type):
        # create a collection
        name = gen_collection_name()
        payload = {
            "collectionName": name,
            "schema": {
                "autoId": True,
                "fields": [
                    {"fieldName": "book_id", "dataType": f"{id_type}", "isPrimary": True, "elementTypeParams": {"max_length": "256"}},
                    {"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": True, "elementTypeParams": {}},
                    {"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
                    {"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
                    {"fieldName": "text_emb", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}}
                ]
            },
            "indexParams": [{"fieldName": "text_emb", "indexName": "text_emb_index", "metricType": "L2"}]
        }
        rsp = self.collection_client.collection_create(payload)
        assert rsp['code'] == 0
        rsp = self.collection_client.collection_describe(name)
        logger.info(f"rsp: {rsp}")
        assert rsp['code'] == 0
        ids = []
        # insert data
        for i in range(insert_round):
            data = []
            for j in range(nb):
                tmp = {
                    "book_id": i * nb + j if id_type == "Int64" else f"{i * nb + j}",
                    "user_id": i * nb + j,
                    "word_count": i * nb + j,
                    "book_describe": f"book_{i * nb + j}",
                    "text_emb": preprocessing.normalize([np.array([random.random() for _ in range(dim)])])[0].tolist()
                }
                data.append(tmp)
            payload = {
                "collectionName": name,
                "data": data,
            }
            body_size = sys.getsizeof(json.dumps(payload))
            logger.info(f"body size: {body_size / 1024 / 1024} MB")
            rsp = self.vector_client.vector_insert(payload)
            assert rsp['code'] == 0
            assert rsp['data']['insertCount'] == nb
            ids.extend(rsp['data']['insertIds'])
        c = Collection(name)
        c.flush()

        # upsert data, addressing rows by the auto-generated primary keys
        for i in range(insert_round):
            data = []
            for j in range(nb):
                tmp = {
                    "book_id": ids[i * nb + j],
                    "user_id": i * nb + j + 1,
                    "word_count": i * nb + j + 2,
                    "book_describe": f"book_{i * nb + j + 3}",
                    "text_emb": preprocessing.normalize([np.array([random.random() for _ in range(dim)])])[0].tolist()
                }
                data.append(tmp)
            payload = {
                "collectionName": name,
                "data": data,
            }
            body_size = sys.getsizeof(json.dumps(payload))
            logger.info(f"body size: {body_size / 1024 / 1024} MB")
            rsp = self.vector_client.vector_upsert(payload)
        # query data to make sure the data is updated
        if id_type == "Int64":
            rsp = self.vector_client.vector_query({"collectionName": name, "filter": "book_id > 0"})
        if id_type == "VarChar":
            rsp = self.vector_client.vector_query({"collectionName": name, "filter": "book_id > '0'"})
        for data in rsp['data']:
            assert data['user_id'] == int(data['book_id']) + 1
            assert data['word_count'] == int(data['book_id']) + 2
            assert data['book_describe'] == f"book_{int(data['book_id']) + 3}"
        res = utility.get_query_segment_info(name)
        logger.info(f"res: {res}")

    @pytest.mark.parametrize("insert_round", [2])
    @pytest.mark.parametrize("nb", [3000])
    @pytest.mark.parametrize("dim", [128])
    @pytest.mark.parametrize("id_type", ["Int64", "VarChar"])
    def test_upsert_vector_with_default_none(self, nb, dim, insert_round, id_type):
        # create a collection
        name = gen_collection_name()
        payload = {
            "collectionName": name,
            "schema": {
                "fields": [
                    {"fieldName": "book_id", "dataType": f"{id_type}", "isPrimary": True, "elementTypeParams": {"max_length": "256"}},
                    {"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": True, "elementTypeParams": {}},
                    {"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}, "defaultValue": 123},
                    {"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"},
                     "nullable": True},
                    {"fieldName": "text_emb", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}}
                ]
            },
            "indexParams": [{"fieldName": "text_emb", "indexName": "text_emb_index", "metricType": "L2"}]
        }
        rsp = self.collection_client.collection_create(payload)
        assert rsp['code'] == 0
        rsp = self.collection_client.collection_describe(name)
        logger.info(f"rsp: {rsp}")
        assert rsp['code'] == 0
        # insert data
        for i in range(insert_round):
            data = []
            for j in range(nb):
                tmp = {
                    "book_id": i * nb + j if id_type == "Int64" else f"{i * nb + j}",
                    "user_id": i * nb + j,
                    "word_count": i * nb + j,
                    "book_describe": f"book_{i * nb + j}",
                    "text_emb": preprocessing.normalize([np.array([random.random() for _ in range(dim)])])[0].tolist()
                }
                data.append(tmp)
            payload = {
                "collectionName": name,
                "data": data,
            }
            body_size = sys.getsizeof(json.dumps(payload))
            logger.info(f"body size: {body_size / 1024 / 1024} MB")
            rsp = self.vector_client.vector_insert(payload)
            assert rsp['code'] == 0
            assert rsp['data']['insertCount'] == nb
        c = Collection(name)
        c.flush()

        # upsert with None: word_count falls back to its defaultValue, book_describe stays null
        for i in range(insert_round):
            data = []
            for j in range(nb):
                tmp = {
                    "book_id": i * nb + j if id_type == "Int64" else f"{i * nb + j}",
                    "user_id": i * nb + j + 1,
                    "word_count": None,
                    "book_describe": None,
                    "text_emb": preprocessing.normalize([np.array([random.random() for _ in range(dim)])])[0].tolist()
                }
                data.append(tmp)
            payload = {
                "collectionName": name,
                "data": data,
            }
            body_size = sys.getsizeof(json.dumps(payload))
            logger.info(f"body size: {body_size / 1024 / 1024} MB")
            rsp = self.vector_client.vector_upsert(payload)
        # query data to make sure the data is updated
        if id_type == "Int64":
            rsp = self.vector_client.vector_query({"collectionName": name, "filter": "book_id > 0"})
        if id_type == "VarChar":
            rsp = self.vector_client.vector_query({"collectionName": name, "filter": "book_id > '0'"})
        for data in rsp['data']:
            assert data['user_id'] == int(data['book_id']) + 1
            assert data['word_count'] == 123
            assert data['book_describe'] is None


@pytest.mark.L0
class TestUpsertVectorNegative(TestBase):

    def test_upsert_vector_with_invalid_collection_name(self):
        """
        Upsert a vector with an invalid collection name
        """

        # create a collection
        name = gen_collection_name()
        dim = 128
        payload = {
            "collectionName": name,
            "dimension": dim,
        }
        rsp = self.collection_client.collection_create(payload)
        assert rsp['code'] == 0
        rsp = self.collection_client.collection_describe(name)
        assert rsp['code'] == 0
        # upsert data into a collection that does not exist
        nb = 100
        data = get_data_by_payload(payload, nb)
        payload = {
            "collectionName": "invalid_collection_name",
            "data": data,
        }
        body_size = sys.getsizeof(json.dumps(payload))
        logger.info(f"body size: {body_size / 1024 / 1024} MB")
        rsp = self.vector_client.vector_upsert(payload)
        assert rsp['code'] == 100
        assert "can't find collection" in rsp['message']

    def test_upsert_entities_with_none_no_nullable_field(self):
        """
        Upsert entities with None for a field that is not nullable
        """
        # create a collection
        name = gen_collection_name()
        payload = {
            "collectionName": name,
            "schema": {
                "autoId": True,
                "fields": [
                    {"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
                    {"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
                    {"fieldName": "text_emb", "dataType": "FloatVector", "elementTypeParams": {"dim": "128"}},
                ]
            }
        }
        rsp = self.collection_client.collection_create(payload)
        assert rsp['code'] == 0
        rsp = self.collection_client.collection_describe(name)
        logger.info(f"rsp: {rsp}")
        assert rsp['code'] == 0
        # upsert data with None in the non-nullable word_count field
        data = []
        for i in range(10):
            tmp = {
                "word_count": i if i % 2 else None,
                "text_emb": preprocessing.normalize([np.array([random.random() for _ in range(128)])])[0].tolist(),
            }
            data.append(tmp)
        payload = {
            "collectionName": name,
            "data": data,
        }
        rsp = self.vector_client.vector_upsert(payload)
        assert rsp['code'] == 1804
        assert "fail to deal the insert data" in rsp['message']


@pytest.mark.L0
class TestSearchVector(TestBase):

    @pytest.mark.parametrize("insert_round", [1])
    @pytest.mark.parametrize("auto_id", [True])
    @pytest.mark.parametrize("is_partition_key", [True])
    @pytest.mark.parametrize("enable_dynamic_schema", [True])
    @pytest.mark.parametrize("nb", [3000])
    @pytest.mark.parametrize("dim", [16])
    @pytest.mark.parametrize("pass_fp32_to_fp16_or_bf16", [True, False])
    def test_search_vector_with_all_vector_datatype(self, nb, dim, insert_round, auto_id,
                                                    is_partition_key, enable_dynamic_schema,
                                                    pass_fp32_to_fp16_or_bf16):
        """
        Search a collection holding all vector datatypes, grouping results by user_id
        """
        # create a collection
        name = gen_collection_name()
        payload = {
            "collectionName": name,
            "schema": {
                "autoId": auto_id,
                "enableDynamicField": enable_dynamic_schema,
                "fields": [
                    {"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
                    {"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": is_partition_key,
                     "elementTypeParams": {}},
                    {"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
                    {"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
                    {"fieldName": "float_vector", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}},
                    {"fieldName": "float16_vector", "dataType": "Float16Vector",
                     "elementTypeParams": {"dim": f"{dim}"}},
                    {"fieldName": "bfloat16_vector", "dataType": "BFloat16Vector",
                     "elementTypeParams": {"dim": f"{dim}"}},
                    {"fieldName": "binary_vector", "dataType": "BinaryVector", "elementTypeParams": {"dim": f"{dim}"}},
                ]
            },
            "indexParams": [
                {"fieldName": "float_vector", "indexName": "float_vector", "metricType": "COSINE"},
                {"fieldName": "float16_vector", "indexName": "float16_vector", "metricType": "COSINE"},
                {"fieldName": "bfloat16_vector", "indexName": "bfloat16_vector", "metricType": "COSINE"},
                {"fieldName": "binary_vector", "indexName": "binary_vector", "metricType": "HAMMING",
                 "params": {"index_type": "BIN_IVF_FLAT", "nlist": "512"}}
            ]
        }
        rsp = self.collection_client.collection_create(payload)
        assert rsp['code'] == 0
        rsp = self.collection_client.collection_describe(name)
        logger.info(f"rsp: {rsp}")
        assert rsp['code'] == 0
        # insert data; user_id takes only 10 distinct values so grouping is meaningful
        for _ in range(insert_round):
            data = []
            for i in range(nb):
                if auto_id:
                    tmp = {
                        "user_id": i % 10,
                        "word_count": i,
                        "book_describe": f"book_{i}",
                        "float_vector": gen_vector(datatype="FloatVector", dim=dim),
                        "float16_vector": (
                            gen_vector(datatype="FloatVector", dim=dim)
                            if pass_fp32_to_fp16_or_bf16
                            else gen_vector(datatype="Float16Vector", dim=dim)
                        ),
                        "bfloat16_vector": (
                            gen_vector(datatype="FloatVector", dim=dim)
                            if pass_fp32_to_fp16_or_bf16
                            else gen_vector(datatype="BFloat16Vector", dim=dim)
                        ),
                        "binary_vector": gen_vector(datatype="BinaryVector", dim=dim)
                    }
                else:
                    tmp = {
                        "book_id": i,
                        "user_id": i % 10,
                        "word_count": i,
                        "book_describe": f"book_{i}",
                        "float_vector": gen_vector(datatype="FloatVector", dim=dim),
                        "float16_vector": (
                            gen_vector(datatype="FloatVector", dim=dim)
                            if pass_fp32_to_fp16_or_bf16
                            else gen_vector(datatype="Float16Vector", dim=dim)
                        ),
                        "bfloat16_vector": (
                            gen_vector(datatype="FloatVector", dim=dim)
                            if pass_fp32_to_fp16_or_bf16
                            else gen_vector(datatype="BFloat16Vector", dim=dim)
                        ),
                        "binary_vector": gen_vector(datatype="BinaryVector", dim=dim)
                    }
                if enable_dynamic_schema:
                    tmp.update({f"dynamic_field_{i}": i})
                data.append(tmp)
            payload = {
                "collectionName": name,
                "data": data,
            }
            rsp = self.vector_client.vector_insert(payload)
            assert rsp['code'] == 0
            assert rsp['data']['insertCount'] == nb
        # search data
        payload = {
            "collectionName": name,
            "data": [gen_vector(datatype="FloatVector", dim=dim)],
            "annsField": "float_vector",
            "filter": "word_count > 100",
            "groupingField": "user_id",
            "outputFields": ["*"],
            "limit": 100
        }
        rsp = self.vector_client.vector_search(payload)
        assert rsp['code'] == 0
        # assert no dup user_id
        user_ids = [r["user_id"] for r in rsp['data']]
        assert len(user_ids) == len(set(user_ids))

    @pytest.mark.parametrize("insert_round", [1])
    @pytest.mark.parametrize("auto_id", [True])
    @pytest.mark.parametrize("is_partition_key", [True])
    @pytest.mark.parametrize("enable_dynamic_schema", [True])
    @pytest.mark.parametrize("nb", [3000])
    @pytest.mark.parametrize("dim", [128])
    @pytest.mark.parametrize("nq", [1, 2])
    @pytest.mark.parametrize("metric_type", ["COSINE", "L2", "IP"])
    def test_search_vector_with_float_vector_datatype(self, nb, dim, insert_round, auto_id,
                                                      is_partition_key, enable_dynamic_schema, nq, metric_type):
        """
        Search a FloatVector field with the COSINE, L2, and IP metric types and nq in [1, 2]
        """
        # create a collection
        name = gen_collection_name()
        payload = {
            "collectionName": name,
            "schema": {
                "autoId": auto_id,
                "enableDynamicField": enable_dynamic_schema,
                "fields": [
                    {"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
                    {"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": is_partition_key,
                     "elementTypeParams": {}},
                    {"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
                    {"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
                    {"fieldName": "float_vector", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}},
                ]
            },
            "indexParams": [
                {"fieldName": "float_vector", "indexName": "float_vector", "metricType": metric_type},
            ]
        }
        rsp = self.collection_client.collection_create(payload)
        assert rsp['code'] == 0
        rsp = self.collection_client.collection_describe(name)
        logger.info(f"rsp: {rsp}")
        assert rsp['code'] == 0
        # insert data
        for _ in range(insert_round):
            data = []
            for i in range(nb):
                if auto_id:
                    tmp = {
                        "user_id": i % 100,
                        "word_count": i,
                        "book_describe": f"book_{i}",
                        "float_vector": gen_vector(datatype="FloatVector", dim=dim),
                    }
                else:
                    tmp = {
                        "book_id": i,
                        "user_id": i % 100,
                        "word_count": i,
                        "book_describe": f"book_{i}",
                        "float_vector": gen_vector(datatype="FloatVector", dim=dim),
                    }
                if enable_dynamic_schema:
                    tmp.update({f"dynamic_field_{i}": i})
                data.append(tmp)
            payload = {
                "collectionName": name,
                "data": data,
            }
            rsp = self.vector_client.vector_insert(payload)
            assert rsp['code'] == 0
            assert rsp['data']['insertCount'] == nb
        # search data; with grouping by user_id, each of the nq queries returns `limit` groups
        payload = {
            "collectionName": name,
            "data": [gen_vector(datatype="FloatVector", dim=dim) for _ in range(nq)],
            "filter": "word_count > 100",
            "groupingField": "user_id",
            "outputFields": ["*"],
            "limit": 100,
        }
        rsp = self.vector_client.vector_search(payload)
        assert rsp['code'] == 0
        assert len(rsp['data']) == 100 * nq
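
    # A note on sparse vector payloads (a rough sketch; the exact shapes
    # produced by gen_vector are an assumption about utils.utils, not
    # confirmed here):
    #   sparse_format="dok": a dict keyed by dimension index,
    #       e.g. {"100": 0.71, "2048": 0.33}
    #   sparse_format="coo": parallel index/value lists,
    #       e.g. {"indices": [100, 2048], "values": [0.71, 0.33]}
    # The next test inserts with both formats and searches with a "dok" query.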

    @pytest.mark.parametrize("insert_round", [1, 10])
    @pytest.mark.parametrize("auto_id", [True, False])
    @pytest.mark.parametrize("is_partition_key", [True, False])
    @pytest.mark.parametrize("enable_dynamic_schema", [True])
    @pytest.mark.parametrize("nb", [3000])
    @pytest.mark.parametrize("dim", [128])
    @pytest.mark.parametrize("groupingField", ['user_id', None])
    @pytest.mark.parametrize("sparse_format", ['dok', 'coo'])
    def test_search_vector_with_sparse_float_vector_datatype(self, nb, dim, insert_round, auto_id,
                                                             is_partition_key, enable_dynamic_schema, groupingField, sparse_format):
        """
        Search a SparseFloatVector field, inserting data in both dok and coo sparse formats
        """
        # create a collection
        name = gen_collection_name()
        payload = {
            "collectionName": name,
            "schema": {
                "autoId": auto_id,
                "enableDynamicField": enable_dynamic_schema,
                "fields": [
                    {"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
                    {"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": is_partition_key,
                     "elementTypeParams": {}},
                    {"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
                    {"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
                    {"fieldName": "sparse_float_vector", "dataType": "SparseFloatVector"},
                ]
            },
            "indexParams": [
                {"fieldName": "sparse_float_vector", "indexName": "sparse_float_vector", "metricType": "IP",
                 "params": {"index_type": "SPARSE_INVERTED_INDEX", "drop_ratio_build": "0.2"}}
            ]
        }
        rsp = self.collection_client.collection_create(payload)
        assert rsp['code'] == 0
        rsp = self.collection_client.collection_describe(name)
        logger.info(f"rsp: {rsp}")
        assert rsp['code'] == 0
        # insert data
        for i in range(insert_round):
            data = []
            for j in range(nb):
                idx = i * nb + j
                if auto_id:
                    tmp = {
                        "user_id": idx % 100,
                        "word_count": j,
                        "book_describe": f"book_{idx}",
                        "sparse_float_vector": gen_vector(datatype="SparseFloatVector", dim=dim, sparse_format=sparse_format),
                    }
                else:
                    tmp = {
                        "book_id": idx,
                        "user_id": idx % 100,
                        "word_count": j,
                        "book_describe": f"book_{idx}",
                        "sparse_float_vector": gen_vector(datatype="SparseFloatVector", dim=dim, sparse_format=sparse_format),
                    }
                if enable_dynamic_schema:
                    tmp.update({f"dynamic_field_{i}": i})
                data.append(tmp)
            payload = {
                "collectionName": name,
                "data": data,
            }
            rsp = self.vector_client.vector_insert(payload)
            assert rsp['code'] == 0
            assert rsp['data']['insertCount'] == nb
        # search data
        payload = {
            "collectionName": name,
            "data": [gen_vector(datatype="SparseFloatVector", dim=dim, sparse_format="dok")],
            "filter": "word_count > 100",
            "outputFields": ["*"],
            "searchParams": {
                "metricType": "IP",
                "params": {
                    "drop_ratio_search": "0.2",
                }
            },
            "limit": 500,
        }
        if groupingField:
            payload["groupingField"] = groupingField
        rsp = self.vector_client.vector_search(payload)
        assert rsp['code'] == 0
@pytest.mark.parametrize("insert_round", [1])
|
|
@pytest.mark.parametrize("auto_id", [True, False])
|
|
@pytest.mark.parametrize("is_partition_key", [True, False])
|
|
@pytest.mark.parametrize("enable_dynamic_schema", [True])
|
|
@pytest.mark.parametrize("nb", [3000])
|
|
@pytest.mark.parametrize("dim", [128])
|
|
@pytest.mark.parametrize("groupingField", ['user_id', None])
|
|
@pytest.mark.parametrize("tokenizer", ['standard'])
|
|
def test_search_vector_for_en_full_text_search(self, nb, dim, insert_round, auto_id,
|
|
is_partition_key, enable_dynamic_schema, groupingField, tokenizer):
|
|
"""
|
|
Insert a vector with a simple payload
|
|
"""
|
|
# create a collection
|
|
name = gen_collection_name()
|
|
payload = {
|
|
"collectionName": name,
|
|
"schema": {
|
|
"autoId": auto_id,
|
|
"enableDynamicField": enable_dynamic_schema,
|
|
"fields": [
|
|
{"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
|
|
{"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": is_partition_key,
|
|
"elementTypeParams": {}},
|
|
{"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
|
|
{"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
|
|
{"fieldName": "document_content", "dataType": "VarChar",
|
|
"elementTypeParams": {"max_length": "1000", "enable_analyzer": True,
|
|
"analyzer_params": {
|
|
"tokenizer": tokenizer,
|
|
},
|
|
"enable_match": True}},
|
|
{"fieldName": "sparse_vector", "dataType": "SparseFloatVector"},
|
|
],
|
|
"functions": [
|
|
{
|
|
"name": "bm25_fn",
|
|
"type": "BM25",
|
|
"inputFieldNames": ["document_content"],
|
|
"outputFieldNames": ["sparse_vector"],
|
|
"params": {}
|
|
}
|
|
]
|
|
},
|
|
|
|
"indexParams": [
|
|
{"fieldName": "sparse_vector", "indexName": "sparse_vector", "metricType": "BM25",
|
|
"params": {"index_type": "SPARSE_INVERTED_INDEX"}}
|
|
]
|
|
}
|
|
rsp = self.collection_client.collection_create(payload)
|
|
assert rsp['code'] == 0
|
|
rsp = self.collection_client.collection_describe(name)
|
|
logger.info(f"rsp: {rsp}")
|
|
assert rsp['code'] == 0
|
|
if tokenizer == 'standard':
|
|
fake = fake_en
|
|
elif tokenizer == 'jieba':
|
|
fake = fake_zh
|
|
else:
|
|
raise Exception("Invalid tokenizer")
|
|
|
|
# insert data
|
|
for i in range(insert_round):
|
|
data = []
|
|
for j in range(nb):
|
|
idx = i * nb + j
|
|
if auto_id:
|
|
tmp = {
|
|
"user_id": idx%100,
|
|
"word_count": j,
|
|
"book_describe": f"book_{idx}",
|
|
"document_content": fake.text().lower(),
|
|
}
|
|
else:
|
|
tmp = {
|
|
"book_id": idx,
|
|
"user_id": idx%100,
|
|
"word_count": j,
|
|
"book_describe": f"book_{idx}",
|
|
"document_content": fake.text().lower(),
|
|
}
|
|
if enable_dynamic_schema:
|
|
tmp.update({f"dynamic_field_{i}": i})
|
|
data.append(tmp)
|
|
payload = {
|
|
"collectionName": name,
|
|
"data": data,
|
|
}
|
|
rsp = self.vector_client.vector_insert(payload)
|
|
assert rsp['code'] == 0
|
|
assert rsp['data']['insertCount'] == nb
|
|
assert rsp['code'] == 0
|
|
|
|
# search data
|
|
payload = {
|
|
"collectionName": name,
|
|
"data": [fake.text().lower() for _ in range(1)],
|
|
"filter": "word_count > 100",
|
|
"outputFields": ["*"],
|
|
"searchParams": {
|
|
"params": {
|
|
"drop_ratio_search": "0.2",
|
|
}
|
|
},
|
|
"limit": 500,
|
|
}
|
|
if groupingField:
|
|
payload["groupingField"] = groupingField
|
|
rsp = self.vector_client.vector_search(payload)
|
|
assert rsp['code'] == 0
|
|
assert len(rsp['data']) > 0
|
|
|
|
|
|
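    # Note: with a BM25 function in the schema, the server derives sparse_vector
    # from document_content, so inserts and searches both pass raw text; the query
    # strings above are analyzed and scored server-side against the BM25 index.
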
@pytest.mark.parametrize("insert_round", [1])
|
|
@pytest.mark.parametrize("auto_id", [True, False])
|
|
@pytest.mark.parametrize("is_partition_key", [True, False])
|
|
@pytest.mark.parametrize("enable_dynamic_schema", [True])
|
|
@pytest.mark.parametrize("nb", [3000])
|
|
@pytest.mark.parametrize("dim", [128])
|
|
@pytest.mark.parametrize("groupingField", ['user_id', None])
|
|
@pytest.mark.parametrize("tokenizer", ['jieba'])
|
|
@pytest.mark.xfail(reason="issue: https://github.com/milvus-io/milvus/issues/36751")
|
|
def test_search_vector_for_zh_full_text_search(self, nb, dim, insert_round, auto_id,
|
|
is_partition_key, enable_dynamic_schema, groupingField, tokenizer):
|
|
"""
|
|
Insert a vector with a simple payload
|
|
"""
|
|
# create a collection
|
|
name = gen_collection_name()
|
|
payload = {
|
|
"collectionName": name,
|
|
"schema": {
|
|
"autoId": auto_id,
|
|
"enableDynamicField": enable_dynamic_schema,
|
|
"fields": [
|
|
{"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
|
|
{"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": is_partition_key,
|
|
"elementTypeParams": {}},
|
|
{"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
|
|
{"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
|
|
{"fieldName": "document_content", "dataType": "VarChar",
|
|
"elementTypeParams": {"max_length": "1000", "enable_analyzer": True,
|
|
"analyzer_params": {
|
|
"tokenizer": tokenizer,
|
|
},
|
|
"enable_match": True}},
|
|
{"fieldName": "sparse_vector", "dataType": "SparseFloatVector"},
|
|
],
|
|
"functions": [
|
|
{
|
|
"name": "bm25_fn",
|
|
"type": "BM25",
|
|
"inputFieldNames": ["document_content"],
|
|
"outputFieldNames": ["sparse_vector"],
|
|
"params": {}
|
|
}
|
|
]
|
|
},
|
|
|
|
"indexParams": [
|
|
{"fieldName": "sparse_vector", "indexName": "sparse_vector", "metricType": "BM25",
|
|
"params": {"index_type": "SPARSE_INVERTED_INDEX"}}
|
|
]
|
|
}
|
|
rsp = self.collection_client.collection_create(payload)
|
|
assert rsp['code'] == 0
|
|
rsp = self.collection_client.collection_describe(name)
|
|
logger.info(f"rsp: {rsp}")
|
|
assert rsp['code'] == 0
|
|
if tokenizer == 'standard':
|
|
fake = fake_en
|
|
elif tokenizer == 'jieba':
|
|
fake = fake_zh
|
|
else:
|
|
raise Exception("Invalid tokenizer")
|
|
|
|
# insert data
|
|
for i in range(insert_round):
|
|
data = []
|
|
for j in range(nb):
|
|
idx = i * nb + j
|
|
if auto_id:
|
|
tmp = {
|
|
"user_id": idx%100,
|
|
"word_count": j,
|
|
"book_describe": f"book_{idx}",
|
|
"document_content": fake.text().lower(),
|
|
}
|
|
else:
|
|
tmp = {
|
|
"book_id": idx,
|
|
"user_id": idx%100,
|
|
"word_count": j,
|
|
"book_describe": f"book_{idx}",
|
|
"document_content": fake.text().lower(),
|
|
}
|
|
if enable_dynamic_schema:
|
|
tmp.update({f"dynamic_field_{i}": i})
|
|
data.append(tmp)
|
|
payload = {
|
|
"collectionName": name,
|
|
"data": data,
|
|
}
|
|
rsp = self.vector_client.vector_insert(payload)
|
|
assert rsp['code'] == 0
|
|
assert rsp['data']['insertCount'] == nb
|
|
assert rsp['code'] == 0
|
|
|
|
# search data
|
|
payload = {
|
|
"collectionName": name,
|
|
"data": [fake.text().lower() for _ in range(2)],
|
|
"filter": "word_count > 100",
|
|
"outputFields": ["*"],
|
|
"searchParams": {
|
|
"params": {
|
|
"drop_ratio_search": "0.2",
|
|
}
|
|
},
|
|
"limit": 500,
|
|
}
|
|
if groupingField:
|
|
payload["groupingField"] = groupingField
|
|
rsp = self.vector_client.vector_search(payload)
|
|
assert rsp['code'] == 0
|
|
assert len(rsp['data']) > 0
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize("insert_round", [2])
|
|
@pytest.mark.parametrize("auto_id", [True])
|
|
@pytest.mark.parametrize("is_partition_key", [True])
|
|
@pytest.mark.parametrize("enable_dynamic_schema", [True])
|
|
@pytest.mark.parametrize("nb", [3000])
|
|
@pytest.mark.parametrize("dim", [128])
|
|
@pytest.mark.parametrize("metric_type", ['HAMMING'])
|
|
def test_search_vector_with_binary_vector_datatype(self, metric_type, nb, dim, insert_round, auto_id,
|
|
is_partition_key, enable_dynamic_schema):
|
|
"""
|
|
Insert a vector with a simple payload
|
|
"""
|
|
# create a collection
|
|
name = gen_collection_name()
|
|
payload = {
|
|
"collectionName": name,
|
|
"schema": {
|
|
"autoId": auto_id,
|
|
"enableDynamicField": enable_dynamic_schema,
|
|
"fields": [
|
|
{"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
|
|
{"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": is_partition_key,
|
|
"elementTypeParams": {}},
|
|
{"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
|
|
{"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
|
|
{"fieldName": "binary_vector", "dataType": "BinaryVector", "elementTypeParams": {"dim": f"{dim}"}}
|
|
]
|
|
},
|
|
"indexParams": [
|
|
{"fieldName": "binary_vector", "indexName": "binary_vector", "metricType": metric_type,
|
|
"params": {"index_type": "BIN_IVF_FLAT", "nlist": "512"}}
|
|
]
|
|
}
|
|
rsp = self.collection_client.collection_create(payload)
|
|
assert rsp['code'] == 0
|
|
rsp = self.collection_client.collection_describe(name)
|
|
logger.info(f"rsp: {rsp}")
|
|
assert rsp['code'] == 0
|
|
# insert data
|
|
for i in range(insert_round):
|
|
data = []
|
|
for i in range(nb):
|
|
if auto_id:
|
|
tmp = {
|
|
"user_id": i%100,
|
|
"word_count": i,
|
|
"book_describe": f"book_{i}",
|
|
"binary_vector": gen_vector(datatype="BinaryVector", dim=dim),
|
|
}
|
|
else:
|
|
tmp = {
|
|
"book_id": i,
|
|
"user_id": i%100,
|
|
"word_count": i,
|
|
"book_describe": f"book_{i}",
|
|
"binary_vector": gen_vector(datatype="BinaryVector", dim=dim),
|
|
}
|
|
if enable_dynamic_schema:
|
|
tmp.update({f"dynamic_field_{i}": i})
|
|
data.append(tmp)
|
|
payload = {
|
|
"collectionName": name,
|
|
"data": data,
|
|
}
|
|
rsp = self.vector_client.vector_insert(payload)
|
|
assert rsp['code'] == 0
|
|
assert rsp['data']['insertCount'] == nb
|
|
# flush data
|
|
c = Collection(name)
|
|
c.flush()
|
|
time.sleep(5)
|
|
# wait for index
|
|
rsp = self.index_client.index_describe(collection_name=name, index_name="binary_vector")
|
|
|
|
# search data
|
|
payload = {
|
|
"collectionName": name,
|
|
"data": [gen_vector(datatype="BinaryVector", dim=dim)],
|
|
"filter": "word_count > 100",
|
|
"outputFields": ["*"],
|
|
"limit": 100,
|
|
}
|
|
rsp = self.vector_client.vector_search(payload)
|
|
assert rsp['code'] == 0
|
|
assert len(rsp['data']) == 100
|
|
|
|
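    # Note: HAMMING distance counts the differing bits between two binary vectors,
    # so smaller distances mean closer matches; a dim of 128 corresponds to 128
    # bits (16 bytes) per vector, assuming gen_vector returns the packed form.
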
@pytest.mark.parametrize("metric_type", ["IP", "L2", "COSINE"])
|
|
def test_search_vector_with_simple_payload(self, metric_type):
|
|
"""
|
|
Search a vector with a simple payload
|
|
"""
|
|
name = gen_collection_name()
|
|
self.name = name
|
|
self.init_collection(name, metric_type=metric_type)
|
|
|
|
# search data
|
|
dim = 128
|
|
vector_to_search = preprocessing.normalize([np.array([random.random() for i in range(dim)])])[0].tolist()
|
|
payload = {
|
|
"collectionName": name,
|
|
"data": [vector_to_search],
|
|
}
|
|
rsp = self.vector_client.vector_search(payload)
|
|
assert rsp['code'] == 0
|
|
res = rsp['data']
|
|
logger.info(f"res: {len(res)}")
|
|
limit = int(payload.get("limit", 100))
|
|
assert len(res) == limit
|
|
ids = [item['id'] for item in res]
|
|
assert len(ids) == len(set(ids))
|
|
distance = [item['distance'] for item in res]
|
|
if metric_type == "L2":
|
|
assert distance == sorted(distance)
|
|
if metric_type == "IP" or metric_type == "COSINE":
|
|
assert distance == sorted(distance, reverse=True)
|
|
|
|
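    # Note: the ordering assertions above encode the metric semantics: for L2 a
    # smaller distance is better, so results ascend, while for IP and COSINE a
    # larger score is better, so results descend.
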
@pytest.mark.parametrize("sum_limit_offset", [16384, 16385])
|
|
@pytest.mark.xfail(reason="")
|
|
def test_search_vector_with_exceed_sum_limit_offset(self, sum_limit_offset):
|
|
"""
|
|
Search a vector with a simple payload
|
|
"""
|
|
max_search_sum_limit_offset = constant.MAX_SUM_OFFSET_AND_LIMIT
|
|
name = gen_collection_name()
|
|
self.name = name
|
|
nb = sum_limit_offset + 2000
|
|
metric_type = "IP"
|
|
limit = 100
|
|
self.init_collection(name, metric_type=metric_type, nb=nb, batch_size=2000)
|
|
|
|
# search data
|
|
dim = 128
|
|
vector_to_search = preprocessing.normalize([np.array([random.random() for i in range(dim)])])[0].tolist()
|
|
payload = {
|
|
"collectionName": name,
|
|
"vector": vector_to_search,
|
|
"limit": limit,
|
|
"offset": sum_limit_offset - limit,
|
|
}
|
|
rsp = self.vector_client.vector_search(payload)
|
|
if sum_limit_offset > max_search_sum_limit_offset:
|
|
assert rsp['code'] == 65535
|
|
return
|
|
assert rsp['code'] == 0
|
|
res = rsp['data']
|
|
logger.info(f"res: {len(res)}")
|
|
limit = int(payload.get("limit", 100))
|
|
assert len(res) == limit
|
|
ids = [item['id'] for item in res]
|
|
assert len(ids) == len(set(ids))
|
|
distance = [item['distance'] for item in res]
|
|
if metric_type == "L2":
|
|
assert distance == sorted(distance)
|
|
if metric_type == "IP":
|
|
assert distance == sorted(distance, reverse=True)
|
|
|
|
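    # Note: the server caps offset + limit at constant.MAX_SUM_OFFSET_AND_LIMIT,
    # so the 16384 case is expected to pass while 16385 should be rejected with
    # code 65535.
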
@pytest.mark.parametrize("offset", [0, 100])
|
|
@pytest.mark.parametrize("limit", [100])
|
|
@pytest.mark.parametrize("metric_type", ["L2", "IP", "COSINE"])
|
|
def test_search_vector_with_complex_payload(self, limit, offset, metric_type):
|
|
"""
|
|
Search a vector with a simple payload
|
|
"""
|
|
name = gen_collection_name()
|
|
self.name = name
|
|
nb = limit + offset + 3000
|
|
dim = 128
|
|
schema_payload, data = self.init_collection(name, dim=dim, nb=nb, metric_type=metric_type)
|
|
vector_field = schema_payload.get("vectorField")
|
|
# search data
|
|
vector_to_search = preprocessing.normalize([np.array([random.random() for i in range(dim)])])[0].tolist()
|
|
output_fields = get_common_fields_by_data(data, exclude_fields=[vector_field])
|
|
payload = {
|
|
"collectionName": name,
|
|
"data": [vector_to_search],
|
|
"outputFields": output_fields,
|
|
"filter": "uid >= 0",
|
|
"limit": limit,
|
|
"offset": offset,
|
|
}
|
|
rsp = self.vector_client.vector_search(payload)
|
|
if offset + limit > constant.MAX_SUM_OFFSET_AND_LIMIT:
|
|
assert rsp['code'] == 90126
|
|
return
|
|
assert rsp['code'] == 0
|
|
res = rsp['data']
|
|
logger.info(f"res: {len(res)}")
|
|
assert len(res) == limit
|
|
for item in res:
|
|
assert item.get("uid") >= 0
|
|
for field in output_fields:
|
|
assert field in item
|
|
|
|
@pytest.mark.parametrize("filter_expr", ["uid >= 0", "uid >= 0 and uid < 100", "uid in [1,2,3]"])
|
|
def test_search_vector_with_complex_int_filter(self, filter_expr):
|
|
"""
|
|
Search a vector with a simple payload
|
|
"""
|
|
name = gen_collection_name()
|
|
self.name = name
|
|
nb = 200
|
|
dim = 128
|
|
limit = 100
|
|
schema_payload, data = self.init_collection(name, dim=dim, nb=nb)
|
|
vector_field = schema_payload.get("vectorField")
|
|
# search data
|
|
vector_to_search = preprocessing.normalize([np.array([random.random() for i in range(dim)])])[0].tolist()
|
|
output_fields = get_common_fields_by_data(data, exclude_fields=[vector_field])
|
|
payload = {
|
|
"collectionName": name,
|
|
"data": [vector_to_search],
|
|
"outputFields": output_fields,
|
|
"filter": filter_expr,
|
|
"limit": limit,
|
|
"offset": 0,
|
|
}
|
|
rsp = self.vector_client.vector_search(payload)
|
|
assert rsp['code'] == 0
|
|
res = rsp['data']
|
|
logger.info(f"res: {len(res)}")
|
|
assert len(res) <= limit
|
|
for item in res:
|
|
uid = item.get("uid")
|
|
eval(filter_expr)
|
|
|
|
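    # Note: the filter expressions above happen to be valid Python, so binding the
    # local variable uid per hit lets eval(filter_expr) re-check the server-side
    # filtering on the client.
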
@pytest.mark.parametrize("filter_expr", ["name > \"placeholder\"", "name like \"placeholder%\""])
|
|
def test_search_vector_with_complex_varchar_filter(self, filter_expr):
|
|
"""
|
|
Search a vector with a simple payload
|
|
"""
|
|
name = gen_collection_name()
|
|
self.name = name
|
|
nb = 200
|
|
dim = 128
|
|
limit = 100
|
|
schema_payload, data = self.init_collection(name, dim=dim, nb=nb)
|
|
names = []
|
|
for item in data:
|
|
names.append(item.get("name"))
|
|
names.sort()
|
|
logger.info(f"names: {names}")
|
|
mid = len(names) // 2
|
|
prefix = names[mid][0:2]
|
|
vector_field = schema_payload.get("vectorField")
|
|
# search data
|
|
vector_to_search = preprocessing.normalize([np.array([random.random() for i in range(dim)])])[0].tolist()
|
|
output_fields = get_common_fields_by_data(data, exclude_fields=[vector_field])
|
|
filter_expr = filter_expr.replace("placeholder", prefix)
|
|
logger.info(f"filter_expr: {filter_expr}")
|
|
payload = {
|
|
"collectionName": name,
|
|
"data": [vector_to_search],
|
|
"outputFields": output_fields,
|
|
"filter": filter_expr,
|
|
"limit": limit,
|
|
"offset": 0,
|
|
}
|
|
rsp = self.vector_client.vector_search(payload)
|
|
assert rsp['code'] == 0
|
|
res = rsp['data']
|
|
logger.info(f"res: {len(res)}")
|
|
assert len(res) <= limit
|
|
for item in res:
|
|
name = item.get("name")
|
|
logger.info(f"name: {name}")
|
|
if ">" in filter_expr:
|
|
assert name > prefix
|
|
if "like" in filter_expr:
|
|
assert name.startswith(prefix)
|
|
|
|
@pytest.mark.parametrize("filter_expr", ["uid < 100 and name > \"placeholder\"",
|
|
"uid < 100 and name like \"placeholder%\""
|
|
])
|
|
def test_search_vector_with_complex_int64_varchar_and_filter(self, filter_expr):
|
|
"""
|
|
Search a vector with a simple payload
|
|
"""
|
|
name = gen_collection_name()
|
|
self.name = name
|
|
nb = 200
|
|
dim = 128
|
|
limit = 100
|
|
schema_payload, data = self.init_collection(name, dim=dim, nb=nb)
|
|
names = []
|
|
for item in data:
|
|
names.append(item.get("name"))
|
|
names.sort()
|
|
logger.info(f"names: {names}")
|
|
mid = len(names) // 2
|
|
prefix = names[mid][0:2]
|
|
vector_field = schema_payload.get("vectorField")
|
|
# search data
|
|
vector_to_search = preprocessing.normalize([np.array([random.random() for i in range(dim)])])[0].tolist()
|
|
output_fields = get_common_fields_by_data(data, exclude_fields=[vector_field])
|
|
filter_expr = filter_expr.replace("placeholder", prefix)
|
|
logger.info(f"filter_expr: {filter_expr}")
|
|
payload = {
|
|
"collectionName": name,
|
|
"data": [vector_to_search],
|
|
"outputFields": output_fields,
|
|
"filter": filter_expr,
|
|
"limit": limit,
|
|
"offset": 0,
|
|
}
|
|
rsp = self.vector_client.vector_search(payload)
|
|
assert rsp['code'] == 0
|
|
res = rsp['data']
|
|
logger.info(f"res: {len(res)}")
|
|
assert len(res) <= limit
|
|
for item in res:
|
|
uid = item.get("uid")
|
|
name = item.get("name")
|
|
logger.info(f"name: {name}")
|
|
uid_expr = filter_expr.split("and")[0]
|
|
assert eval(uid_expr) is True
|
|
varchar_expr = filter_expr.split("and")[1]
|
|
if ">" in varchar_expr:
|
|
assert name > prefix
|
|
if "like" in varchar_expr:
|
|
assert name.startswith(prefix)
|
|
|
|
@pytest.mark.parametrize("consistency_level", ["Strong", "Bounded", "Eventually", "Session"])
|
|
def test_search_vector_with_consistency_level(self, consistency_level):
|
|
"""
|
|
Search a vector with different consistency level
|
|
"""
|
|
name = gen_collection_name()
|
|
self.name = name
|
|
nb = 200
|
|
dim = 128
|
|
limit = 100
|
|
schema_payload, data = self.init_collection(name, dim=dim, nb=nb)
|
|
names = []
|
|
for item in data:
|
|
names.append(item.get("name"))
|
|
names.sort()
|
|
logger.info(f"names: {names}")
|
|
mid = len(names) // 2
|
|
prefix = names[mid][0:2]
|
|
vector_field = schema_payload.get("vectorField")
|
|
# search data
|
|
vector_to_search = preprocessing.normalize([np.array([random.random() for i in range(dim)])])[0].tolist()
|
|
output_fields = get_common_fields_by_data(data, exclude_fields=[vector_field])
|
|
payload = {
|
|
"collectionName": name,
|
|
"data": [vector_to_search],
|
|
"outputFields": output_fields,
|
|
"limit": limit,
|
|
"offset": 0,
|
|
"consistencyLevel": consistency_level
|
|
}
|
|
rsp = self.vector_client.vector_search(payload)
|
|
assert rsp['code'] == 0
|
|
res = rsp['data']
|
|
logger.info(f"res: {len(res)}")
|
|
assert len(res) == limit
|
|
|
|
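    # Note on the levels above: Strong waits until all prior writes are visible,
    # Bounded tolerates a bounded staleness window, Session guarantees
    # read-your-own-writes, and Eventually gives the weakest visibility guarantee.
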
@pytest.mark.parametrize("metric_type", ["L2", "COSINE", "IP"])
|
|
def test_search_vector_with_range_search(self, metric_type):
|
|
"""
|
|
Search a vector with range search with different metric type
|
|
"""
|
|
name = gen_collection_name()
|
|
self.name = name
|
|
nb = 3000
|
|
dim = 128
|
|
limit = 100
|
|
schema_payload, data = self.init_collection(name, dim=dim, nb=nb, metric_type=metric_type)
|
|
vector_field = schema_payload.get("vectorField")
|
|
# search data
|
|
vector_to_search = preprocessing.normalize([np.array([random.random() for i in range(dim)])])[0].tolist()
|
|
training_data = [item[vector_field] for item in data]
|
|
distance_sorted = get_sorted_distance(training_data, [vector_to_search], metric_type)
|
|
r1, r2 = distance_sorted[0][nb//2], distance_sorted[0][nb//2+limit+int((0.5*limit))] # recall is not 100% so add 50% to make sure the range is more than limit
|
|
if metric_type == "L2":
|
|
r1, r2 = r2, r1
|
|
output_fields = get_common_fields_by_data(data, exclude_fields=[vector_field])
|
|
logger.info(f"r1: {r1}, r2: {r2}")
|
|
payload = {
|
|
"collectionName": name,
|
|
"data": [vector_to_search],
|
|
"outputFields": output_fields,
|
|
"limit": limit,
|
|
"offset": 0,
|
|
"searchParams": {
|
|
"params": {
|
|
"radius": r1,
|
|
"range_filter": r2,
|
|
}
|
|
}
|
|
}
|
|
rsp = self.vector_client.vector_search(payload)
|
|
assert rsp['code'] == 0
|
|
res = rsp['data']
|
|
logger.info(f"res: {len(res)}")
|
|
assert len(res) >= limit*0.8
|
|
# add buffer to the distance of comparison
|
|
if metric_type == "L2":
|
|
r1 = r1 + 10**-6
|
|
r2 = r2 - 10**-6
|
|
else:
|
|
r1 = r1 - 10**-6
|
|
r2 = r2 + 10**-6
|
|
for item in res:
|
|
distance = item.get("distance")
|
|
if metric_type == "L2":
|
|
assert r1 > distance > r2
|
|
else:
|
|
assert r1 < distance < r2
|
|
|
|
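    # Note: in range search, radius is the worse-than bound and range_filter the
    # better-than bound: for L2 the hits satisfy range_filter <= distance < radius,
    # while for IP/COSINE they satisfy radius < score <= range_filter, which is
    # why r1 and r2 are swapped for L2 above.
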
@pytest.mark.parametrize("ignore_growing", [True, False])
|
|
def test_search_vector_with_ignore_growing(self, ignore_growing):
|
|
"""
|
|
Search a vector with range search with different metric type
|
|
"""
|
|
name = gen_collection_name()
|
|
self.name = name
|
|
metric_type = "COSINE"
|
|
nb = 1000
|
|
dim = 128
|
|
limit = 100
|
|
schema_payload, data = self.init_collection(name, dim=dim, nb=nb, metric_type=metric_type)
|
|
vector_field = schema_payload.get("vectorField")
|
|
# search data
|
|
vector_to_search = preprocessing.normalize([np.array([random.random() for i in range(dim)])])[0].tolist()
|
|
training_data = [item[vector_field] for item in data]
|
|
distance_sorted = get_sorted_distance(training_data, [vector_to_search], metric_type)
|
|
r1, r2 = distance_sorted[0][nb//2], distance_sorted[0][nb//2+limit+int((0.2*limit))] # recall is not 100% so add 20% to make sure the range is correct
|
|
if metric_type == "L2":
|
|
r1, r2 = r2, r1
|
|
output_fields = get_common_fields_by_data(data, exclude_fields=[vector_field])
|
|
|
|
payload = {
|
|
"collectionName": name,
|
|
"data": [vector_to_search],
|
|
"outputFields": output_fields,
|
|
"limit": limit,
|
|
"offset": 0,
|
|
"searchParams": {
|
|
"ignoreGrowing": ignore_growing
|
|
|
|
}
|
|
}
|
|
rsp = self.vector_client.vector_search(payload)
|
|
assert rsp['code'] == 0
|
|
res = rsp['data']
|
|
logger.info(f"res: {len(res)}")
|
|
if ignore_growing is True:
|
|
assert len(res) == 0
|
|
else:
|
|
assert len(res) == limit
|
|
|
|
|
|
@pytest.mark.parametrize("tokenizer", ["jieba", "standard"])
|
|
def test_search_vector_with_text_match_filter(self, tokenizer):
|
|
"""
|
|
Query a vector with a simple payload
|
|
"""
|
|
fake = fake_en
|
|
language = "en"
|
|
if tokenizer == "jieba":
|
|
fake = fake_zh
|
|
language = "zh"
|
|
# create a collection
|
|
dim = 128
|
|
analyzer_params = {
|
|
"tokenizer": tokenizer,
|
|
}
|
|
name = gen_collection_name()
|
|
fields = [
|
|
FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
|
|
FieldSchema(
|
|
name="word",
|
|
dtype=DataType.VARCHAR,
|
|
max_length=65535,
|
|
enable_analyzer=True,
|
|
enable_match=True,
|
|
is_partition_key=True,
|
|
analyzer_params=analyzer_params,
|
|
),
|
|
FieldSchema(
|
|
name="sentence",
|
|
dtype=DataType.VARCHAR,
|
|
max_length=65535,
|
|
enable_analyzer=True,
|
|
enable_match=True,
|
|
analyzer_params=analyzer_params,
|
|
),
|
|
FieldSchema(
|
|
name="paragraph",
|
|
dtype=DataType.VARCHAR,
|
|
max_length=65535,
|
|
enable_analyzer=True,
|
|
enable_match=True,
|
|
analyzer_params=analyzer_params,
|
|
),
|
|
FieldSchema(
|
|
name="text",
|
|
dtype=DataType.VARCHAR,
|
|
max_length=65535,
|
|
enable_analyzer=True,
|
|
enable_match=True,
|
|
analyzer_params=analyzer_params,
|
|
),
|
|
FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
|
|
]
|
|
schema = CollectionSchema(fields=fields, description="test collection")
|
|
collection = Collection(name=name, schema=schema
|
|
)
|
|
rsp = self.collection_client.collection_describe(name)
|
|
logger.info(f"rsp: {rsp}")
|
|
assert rsp['code'] == 0
|
|
data_size = 3000
|
|
batch_size = 1000
|
|
# insert data
|
|
data = [
|
|
{
|
|
"id": i,
|
|
"word": fake.word().lower(),
|
|
"sentence": fake.sentence().lower(),
|
|
"paragraph": fake.sentence().lower(),
|
|
"text": fake.text().lower(),
|
|
"emb": [random.random() for _ in range(dim)]
|
|
}
|
|
for i in range(data_size)
|
|
]
|
|
df = pd.DataFrame(data)
|
|
text_fields = ["word", "sentence", "paragraph", "text"]
|
|
wf_map = {}
|
|
for field in text_fields:
|
|
wf_map[field] = analyze_documents(df[field].tolist(), language=language)
|
|
for i in range(0, data_size, batch_size):
|
|
tmp = data[i:i + batch_size]
|
|
payload = {
|
|
"collectionName": name,
|
|
"data": tmp,
|
|
}
|
|
rsp = self.vector_client.vector_insert(payload)
|
|
assert rsp['code'] == 0
|
|
assert rsp['data']['insertCount'] == len(tmp)
|
|
collection.create_index(
|
|
"emb",
|
|
{"index_type": "IVF_SQ8", "metric_type": "L2", "params": {"nlist": 64}},
|
|
)
|
|
collection.load()
|
|
time.sleep(5)
|
|
vector_to_search = [[random.random() for _ in range(dim)]]
|
|
for field in text_fields:
|
|
token = wf_map[field].most_common()[0][0]
|
|
expr = f"text_match({field}, '{token}')"
|
|
logger.info(f"expr: {expr}")
|
|
rsp = self.vector_client.vector_search({"collectionName": name, "data":vector_to_search, "filter": f"{expr}", "outputFields": ["*"]})
|
|
assert rsp['code'] == 0, rsp
|
|
for d in rsp['data']:
|
|
assert token in d[field]
|
|
|
|
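    # Note: text_match(field, token) filters for entities whose analyzed field
    # contains the token, so querying with the most frequent token from wf_map
    # makes a non-empty result set very likely.
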
@pytest.mark.parametrize("insert_round", [1])
|
|
@pytest.mark.parametrize("auto_id", [True])
|
|
@pytest.mark.parametrize("is_partition_key", [True])
|
|
@pytest.mark.parametrize("enable_dynamic_schema", [True])
|
|
@pytest.mark.parametrize("nb", [3000])
|
|
@pytest.mark.parametrize("dim", [128])
|
|
@pytest.mark.parametrize("nq", [1, 2])
|
|
@pytest.mark.parametrize("metric_type", ['COSINE', "L2", "IP"])
|
|
def test_search_vector_with_default_none(self, nb, dim, insert_round, auto_id, is_partition_key,
|
|
enable_dynamic_schema, nq, metric_type):
|
|
"""
|
|
Insert a vector with default and none
|
|
"""
|
|
# create a collection
|
|
name = gen_collection_name()
|
|
payload = {
|
|
"collectionName": name,
|
|
"schema": {
|
|
"autoId": auto_id,
|
|
"enableDynamicField": enable_dynamic_schema,
|
|
"fields": [
|
|
{"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
|
|
{"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": is_partition_key,
|
|
"elementTypeParams": {}, "defaultValue": 8888},
|
|
{"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}, "nullable": True},
|
|
{"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"},
|
|
"nullable": True, "defaultValue": "8888"},
|
|
{"fieldName": "float_vector", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}},
|
|
]
|
|
},
|
|
"indexParams": [
|
|
{"fieldName": "float_vector", "indexName": "float_vector", "metricType": metric_type},
|
|
]
|
|
}
|
|
rsp = self.collection_client.collection_create(payload)
|
|
assert rsp['code'] == 0
|
|
rsp = self.collection_client.collection_describe(name)
|
|
logger.info(f"rsp: {rsp}")
|
|
assert rsp['code'] == 0
|
|
# insert data
|
|
for i in range(insert_round):
|
|
data = []
|
|
for i in range(nb):
|
|
if auto_id:
|
|
tmp = {
|
|
"user_id": i % 100 if i % 2 else None,
|
|
"word_count": None,
|
|
"book_describe": None,
|
|
"float_vector": gen_vector(datatype="FloatVector", dim=dim),
|
|
}
|
|
else:
|
|
tmp = {
|
|
"book_id": i,
|
|
"user_id": i % 100 if i % 2 else None,
|
|
"word_count": None,
|
|
"book_describe": None,
|
|
"float_vector": gen_vector(datatype="FloatVector", dim=dim),
|
|
}
|
|
if enable_dynamic_schema:
|
|
tmp.update({f"dynamic_field_{i}": i})
|
|
data.append(tmp)
|
|
payload = {
|
|
"collectionName": name,
|
|
"data": data,
|
|
}
|
|
rsp = self.vector_client.vector_insert(payload)
|
|
assert rsp['code'] == 0
|
|
assert rsp['data']['insertCount'] == nb
|
|
# search data
|
|
payload = {
|
|
"collectionName": name,
|
|
"data": [gen_vector(datatype="FloatVector", dim=dim) for _ in range(nq)],
|
|
"filter": "book_id >= 0",
|
|
# "groupingField": "user_id",
|
|
"outputFields": ["*"],
|
|
"limit": 100,
|
|
}
|
|
rsp = self.vector_client.vector_search(payload)
|
|
assert rsp['code'] == 0
|
|
assert rsp['data'][0]['book_describe'] == "8888"
|
|
assert rsp['data'][0]['word_count'] is None
|
|
assert len(rsp['data']) == 100 * nq
|
|
|
|
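    # Note: nullable fields inserted as None come back as None, while fields with
    # a defaultValue have None replaced by that default, which is why the search
    # above expects book_describe == "8888" while word_count stays None.
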
@pytest.mark.L0
class TestSearchVectorNegative(TestBase):

    @pytest.mark.parametrize("metric_type", ["L2"])
    def test_search_vector_without_required_data_param(self, metric_type):
        """
        Search without the required data parameter
        """
        name = gen_collection_name()
        self.name = name
        self.init_collection(name, metric_type=metric_type)

        # search without the data field
        payload = {
            "collectionName": name,
        }
        rsp = self.vector_client.vector_search(payload)
        assert rsp['code'] == 1802

@pytest.mark.parametrize("invalid_metric_type", ["L2", "IP", "UNSUPPORTED"])
|
|
@pytest.mark.xfail(reason="issue: https://github.com/milvus-io/milvus/issues/37138")
|
|
def test_search_vector_with_invalid_metric_type(self, invalid_metric_type):
|
|
"""
|
|
Search a vector with a simple payload
|
|
"""
|
|
name = gen_collection_name()
|
|
self.name = name
|
|
self.init_collection(name, metric_type="COSINE")
|
|
|
|
# search data
|
|
dim = 128
|
|
payload = {
|
|
"collectionName": name,
|
|
"data": [preprocessing.normalize([np.array([random.random() for i in range(dim)])])[0].tolist()],
|
|
"searchParams": {
|
|
"metricType": invalid_metric_type
|
|
}
|
|
}
|
|
rsp = self.vector_client.vector_search(payload)
|
|
assert rsp['code'] != 0
|
|
|
|
|
|
@pytest.mark.parametrize("limit", [0, 16385])
|
|
def test_search_vector_with_invalid_limit(self, limit):
|
|
"""
|
|
Search a vector with a simple payload
|
|
"""
|
|
name = gen_collection_name()
|
|
self.name = name
|
|
dim = 128
|
|
schema_payload, data = self.init_collection(name, dim=dim)
|
|
vector_field = schema_payload.get("vectorField")
|
|
# search data
|
|
vector_to_search = preprocessing.normalize([np.array([random.random() for i in range(dim)])])[0].tolist()
|
|
output_fields = get_common_fields_by_data(data, exclude_fields=[vector_field])
|
|
payload = {
|
|
"collectionName": name,
|
|
"data": [vector_to_search],
|
|
"outputFields": output_fields,
|
|
"filter": "uid >= 0",
|
|
"limit": limit,
|
|
"offset": 0,
|
|
}
|
|
rsp = self.vector_client.vector_search(payload)
|
|
assert rsp['code'] == 65535
|
|
|
|
@pytest.mark.parametrize("offset", [-1, 100_001])
|
|
def test_search_vector_with_invalid_offset(self, offset):
|
|
"""
|
|
Search a vector with a simple payload
|
|
"""
|
|
name = gen_collection_name()
|
|
self.name = name
|
|
dim = 128
|
|
schema_payload, data = self.init_collection(name, dim=dim)
|
|
vector_field = schema_payload.get("vectorField")
|
|
# search data
|
|
dim = 128
|
|
vector_to_search = preprocessing.normalize([np.array([random.random() for i in range(dim)])])[0].tolist()
|
|
output_fields = get_common_fields_by_data(data, exclude_fields=[vector_field])
|
|
payload = {
|
|
"collectionName": name,
|
|
"data": [vector_to_search],
|
|
"outputFields": output_fields,
|
|
"filter": "uid >= 0",
|
|
"limit": 100,
|
|
"offset": offset,
|
|
}
|
|
rsp = self.vector_client.vector_search(payload)
|
|
assert rsp['code'] == 65535
|
|
|
|
    def test_search_vector_with_invalid_collection_name(self):
        """
        Search a vector with invalid collection name
        """
        name = gen_collection_name()
        self.name = name
        dim = 128
        schema_payload, data = self.init_collection(name, dim=dim)
        vector_field = schema_payload.get("vectorField")
        # search data
        vector_to_search = preprocessing.normalize([np.array([random.random() for i in range(dim)])])[0].tolist()
        output_fields = get_common_fields_by_data(data, exclude_fields=[vector_field])
        payload = {
            "collectionName": "invalid_collection_name",
            "data": [vector_to_search],
            "outputFields": output_fields,
            "filter": "uid >= 0",
            "limit": 100,
            "offset": 0,
        }
        rsp = self.vector_client.vector_search(payload)
        assert rsp['code'] == 100
        assert "can't find collection" in rsp['message']

@pytest.mark.L0
class TestAdvancedSearchVector(TestBase):

    @pytest.mark.parametrize("insert_round", [1])
    @pytest.mark.parametrize("auto_id", [True])
    @pytest.mark.parametrize("is_partition_key", [True])
    @pytest.mark.parametrize("enable_dynamic_schema", [True])
    @pytest.mark.parametrize("nb", [3000])
    @pytest.mark.parametrize("dim", [2])
    def test_advanced_search_vector_with_multi_float32_vector_datatype(self, nb, dim, insert_round, auto_id,
                                                                       is_partition_key, enable_dynamic_schema):
        """
        Insert entities with two float vector fields and run an advanced search with RRF reranking
        """
        # create a collection
        name = gen_collection_name()
        payload = {
            "collectionName": name,
            "schema": {
                "autoId": auto_id,
                "enableDynamicField": enable_dynamic_schema,
                "fields": [
                    {"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
                    {"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": is_partition_key,
                     "elementTypeParams": {}},
                    {"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
                    {"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
                    {"fieldName": "float_vector_1", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}},
                    {"fieldName": "float_vector_2", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}},
                ]
            },
            "indexParams": [
                {"fieldName": "float_vector_1", "indexName": "float_vector_1", "metricType": "COSINE"},
                {"fieldName": "float_vector_2", "indexName": "float_vector_2", "metricType": "COSINE"},
            ]
        }
        rsp = self.collection_client.collection_create(payload)
        assert rsp['code'] == 0
        rsp = self.collection_client.collection_describe(name)
        logger.info(f"rsp: {rsp}")
        assert rsp['code'] == 0
        # insert data
        for i in range(insert_round):
            data = []
            for j in range(nb):
                if auto_id:
                    tmp = {
                        "user_id": j % 100,
                        "word_count": j,
                        "book_describe": f"book_{j}",
                        "float_vector_1": gen_vector(datatype="FloatVector", dim=dim),
                        "float_vector_2": gen_vector(datatype="FloatVector", dim=dim),
                    }
                else:
                    tmp = {
                        "book_id": j,
                        "user_id": j % 100,
                        "word_count": j,
                        "book_describe": f"book_{j}",
                        "float_vector_1": gen_vector(datatype="FloatVector", dim=dim),
                        "float_vector_2": gen_vector(datatype="FloatVector", dim=dim),
                    }
                if enable_dynamic_schema:
                    tmp.update({f"dynamic_field_{j}": j})
                data.append(tmp)
            payload = {
                "collectionName": name,
                "data": data,
            }
            rsp = self.vector_client.vector_insert(payload)
            assert rsp['code'] == 0
            assert rsp['data']['insertCount'] == nb
        # advanced search data
        payload = {
            "collectionName": name,
            "search": [{
                "data": [gen_vector(datatype="FloatVector", dim=dim)],
                "annsField": "float_vector_1",
                "limit": 10,
                "outputFields": ["*"]
            },
                {
                    "data": [gen_vector(datatype="FloatVector", dim=dim)],
                    "annsField": "float_vector_2",
                    "limit": 10,
                    "outputFields": ["*"]
                }
            ],
            "rerank": {
                "strategy": "rrf",
                "params": {
                    "k": 10,
                }
            },
            "limit": 10,
            "outputFields": ["user_id", "word_count", "book_describe"]
        }

        rsp = self.vector_client.vector_advanced_search(payload)
        assert rsp['code'] == 0
        assert len(rsp['data']) == 10

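    # Note: rrf reranking fuses the per-field result lists by summing reciprocal
    # ranks, score(d) = sum_i 1 / (k + rank_i(d)), with k = 10 in the payload above.
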
@pytest.mark.L0
class TestHybridSearchVector(TestBase):

    @pytest.mark.parametrize("insert_round", [1])
    @pytest.mark.parametrize("auto_id", [True])
    @pytest.mark.parametrize("is_partition_key", [True])
    @pytest.mark.parametrize("enable_dynamic_schema", [True])
    @pytest.mark.parametrize("nb", [3000])
    @pytest.mark.parametrize("dim", [2])
    def test_hybrid_search_vector_with_multi_float32_vector_datatype(self, nb, dim, insert_round, auto_id,
                                                                     is_partition_key, enable_dynamic_schema):
        """
        Insert entities with two float vector fields and run a hybrid search with RRF reranking
        """
        # create a collection
        name = gen_collection_name()
        payload = {
            "collectionName": name,
            "schema": {
                "autoId": auto_id,
                "enableDynamicField": enable_dynamic_schema,
                "fields": [
                    {"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
                    {"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": is_partition_key,
                     "elementTypeParams": {}},
                    {"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
                    {"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
                    {"fieldName": "float_vector_1", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}},
                    {"fieldName": "float_vector_2", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}},
                ]
            },
            "indexParams": [
                {"fieldName": "float_vector_1", "indexName": "float_vector_1", "metricType": "COSINE"},
                {"fieldName": "float_vector_2", "indexName": "float_vector_2", "metricType": "COSINE"},
            ]
        }
        rsp = self.collection_client.collection_create(payload)
        assert rsp['code'] == 0
        rsp = self.collection_client.collection_describe(name)
        logger.info(f"rsp: {rsp}")
        assert rsp['code'] == 0
        # insert data
        for i in range(insert_round):
            data = []
            for j in range(nb):
                if auto_id:
                    tmp = {
                        "user_id": j % 100,
                        "word_count": j,
                        "book_describe": f"book_{j}",
                        "float_vector_1": gen_vector(datatype="FloatVector", dim=dim),
                        "float_vector_2": gen_vector(datatype="FloatVector", dim=dim),
                    }
                else:
                    tmp = {
                        "book_id": j,
                        "user_id": j % 100,
                        "word_count": j,
                        "book_describe": f"book_{j}",
                        "float_vector_1": gen_vector(datatype="FloatVector", dim=dim),
                        "float_vector_2": gen_vector(datatype="FloatVector", dim=dim),
                    }
                if enable_dynamic_schema:
                    tmp.update({f"dynamic_field_{j}": j})
                data.append(tmp)
            payload = {
                "collectionName": name,
                "data": data,
            }
            rsp = self.vector_client.vector_insert(payload)
            assert rsp['code'] == 0
            assert rsp['data']['insertCount'] == nb
        # hybrid search data
        payload = {
            "collectionName": name,
            "search": [{
                "data": [gen_vector(datatype="FloatVector", dim=dim)],
                "annsField": "float_vector_1",
                "limit": 10,
                "outputFields": ["*"]
            },
                {
                    "data": [gen_vector(datatype="FloatVector", dim=dim)],
                    "annsField": "float_vector_2",
                    "limit": 10,
                    "outputFields": ["*"]
                }
            ],
            "rerank": {
                "strategy": "rrf",
                "params": {
                    "k": 10,
                }
            },
            "limit": 10,
            "outputFields": ["user_id", "word_count", "book_describe"]
        }

        rsp = self.vector_client.vector_hybrid_search(payload)
        assert rsp['code'] == 0
        assert len(rsp['data']) == 10

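    # Note: the hybrid-search payload is shaped like the advanced-search one
    # (per-field sub-requests plus a rerank block); only the endpoint differs.
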
@pytest.mark.L0
class TestQueryVector(TestBase):

    @pytest.mark.parametrize("insert_round", [1])
    @pytest.mark.parametrize("auto_id", [True])
    @pytest.mark.parametrize("is_partition_key", [True])
    @pytest.mark.parametrize("enable_dynamic_schema", [True])
    @pytest.mark.parametrize("nb", [3000])
    @pytest.mark.parametrize("dim", [128])
    def test_query_entities_with_all_scalar_datatype(self, nb, dim, insert_round, auto_id,
                                                     is_partition_key, enable_dynamic_schema):
        """
        Insert entities covering all scalar datatypes and query them with typed filters
        """
        # create a collection
        name = gen_collection_name()
        payload = {
            "collectionName": name,
            "schema": {
                "autoId": auto_id,
                "enableDynamicField": enable_dynamic_schema,
                "fields": [
                    {"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
                    {"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": is_partition_key,
                     "elementTypeParams": {}},
                    {"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
                    {"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "25536"}},
                    {"fieldName": "bool", "dataType": "Bool", "elementTypeParams": {}},
                    {"fieldName": "json", "dataType": "JSON", "elementTypeParams": {}},
                    {"fieldName": "int_array", "dataType": "Array", "elementDataType": "Int64",
                     "elementTypeParams": {"max_capacity": "1024"}},
                    {"fieldName": "varchar_array", "dataType": "Array", "elementDataType": "VarChar",
                     "elementTypeParams": {"max_capacity": "1024", "max_length": "256"}},
                    {"fieldName": "bool_array", "dataType": "Array", "elementDataType": "Bool",
                     "elementTypeParams": {"max_capacity": "1024"}},
                    {"fieldName": "text_emb", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}},
                    {"fieldName": "image_emb", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}},
                ]
            },
            "indexParams": [
                {"fieldName": "text_emb", "indexName": "text_emb", "metricType": "L2"},
                {"fieldName": "image_emb", "indexName": "image_emb", "metricType": "L2"}
            ]
        }
        rsp = self.collection_client.collection_create(payload)
        assert rsp['code'] == 0
        rsp = self.collection_client.collection_describe(name)
        logger.info(f"rsp: {rsp}")
        assert rsp['code'] == 0
        # insert data
        for i in range(insert_round):
            data = []
            for j in range(nb):
                if auto_id:
                    tmp = {
                        "user_id": j,
                        "word_count": j,
                        "book_describe": f"book_{gen_unique_str(length=1000)}",
                        "bool": random.choice([True, False]),
                        "json": {"key": [j]},
                        "int_array": [j],
                        "varchar_array": [f"varchar_{j}"],
                        "bool_array": [random.choice([True, False])],
                        "text_emb": preprocessing.normalize([np.array([random.random() for _ in range(dim)])])[
                            0].tolist(),
                        "image_emb": preprocessing.normalize([np.array([random.random() for _ in range(dim)])])[
                            0].tolist(),
                    }
                else:
                    tmp = {
                        "book_id": j,
                        "user_id": j,
                        "word_count": j,
                        "book_describe": gen_unique_str(length=1000),
                        "bool": random.choice([True, False]),
                        "json": {"key": j},
                        "int_array": [j],
                        "varchar_array": [f"varchar_{j}"],
                        "bool_array": [random.choice([True, False])],
                        "text_emb": preprocessing.normalize([np.array([random.random() for _ in range(dim)])])[
                            0].tolist(),
                        "image_emb": preprocessing.normalize([np.array([random.random() for _ in range(dim)])])[
                            0].tolist(),
                    }
                if enable_dynamic_schema:
                    tmp.update({f"dynamic_field_{j}": j})
                data.append(tmp)
            payload = {
                "collectionName": name,
                "data": data,
            }
            rsp = self.vector_client.vector_insert(payload)
            assert rsp['code'] == 0
            assert rsp['data']['insertCount'] == nb
        # query data to make sure the data is inserted
        # 1. query for int64
        payload = {
            "collectionName": name,
            "filter": "user_id > 0",
            "limit": 50,
            "outputFields": ["*"]
        }
        rsp = self.vector_client.vector_query(payload)
        assert rsp['code'] == 0
        assert len(rsp['data']) == 50

        # 2. query for varchar
        payload = {
            "collectionName": name,
            "filter": "book_describe like \"book%\"",
            "limit": 50,
            "outputFields": ["*"]
        }
        rsp = self.vector_client.vector_query(payload)
        assert rsp['code'] == 0
        assert len(rsp['data']) == 50

        # 3. query for json
        payload = {
            "collectionName": name,
            "filter": "json_contains(json['key'], 1)",
            "limit": 50,
            "outputFields": ["*"]
        }
        rsp = self.vector_client.vector_query(payload)
        assert len(rsp['data']) == 1

        # 4. query for array
        payload = {
            "collectionName": name,
            "filter": "array_contains(int_array, 1)",
            "limit": 50,
            "outputFields": ["*"]
        }
        rsp = self.vector_client.vector_query(payload)
        assert len(rsp['data']) == 1

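    # Note: json_contains(json['key'], 1) and array_contains(int_array, 1) match
    # entities whose array contains the element 1; with a single insert round only
    # the row built with j == 1 qualifies, hence the expected single hit.
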
@pytest.mark.parametrize("insert_round", [1])
|
|
@pytest.mark.parametrize("auto_id", [True])
|
|
@pytest.mark.parametrize("is_partition_key", [True])
|
|
@pytest.mark.parametrize("enable_dynamic_schema", [True])
|
|
@pytest.mark.parametrize("nb", [3000])
|
|
@pytest.mark.parametrize("dim", [128])
|
|
@pytest.mark.parametrize("pass_fp32_to_fp16_or_bf16", [True, False])
|
|
def test_query_entities_with_all_vector_datatype(self, nb, dim, insert_round, auto_id,
|
|
is_partition_key, enable_dynamic_schema,
|
|
pass_fp32_to_fp16_or_bf16):
|
|
"""
|
|
Insert a vector with a simple payload
|
|
"""
|
|
# create a collection
|
|
name = gen_collection_name()
|
|
payload = {
|
|
"collectionName": name,
|
|
"schema": {
|
|
"autoId": auto_id,
|
|
"enableDynamicField": enable_dynamic_schema,
|
|
"fields": [
|
|
{"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
|
|
{"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": is_partition_key,
|
|
"elementTypeParams": {}},
|
|
{"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
|
|
{"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
|
|
{"fieldName": "float_vector", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}},
|
|
{"fieldName": "float16_vector", "dataType": "Float16Vector",
|
|
"elementTypeParams": {"dim": f"{dim}"}},
|
|
{"fieldName": "bfloat16_vector", "dataType": "BFloat16Vector",
|
|
"elementTypeParams": {"dim": f"{dim}"}},
|
|
{"fieldName": "binary_vector", "dataType": "BinaryVector", "elementTypeParams": {"dim": f"{dim}"}},
|
|
]
|
|
},
|
|
"indexParams": [
|
|
{"fieldName": "float_vector", "indexName": "float_vector", "metricType": "L2"},
|
|
{"fieldName": "float16_vector", "indexName": "float16_vector", "metricType": "L2"},
|
|
{"fieldName": "bfloat16_vector", "indexName": "bfloat16_vector", "metricType": "L2"},
|
|
{"fieldName": "binary_vector", "indexName": "binary_vector", "metricType": "HAMMING",
|
|
"params": {"index_type": "BIN_IVF_FLAT", "nlist": "512"}}
|
|
]
|
|
}
|
|
rsp = self.collection_client.collection_create(payload)
|
|
assert rsp['code'] == 0
|
|
rsp = self.collection_client.collection_describe(name)
|
|
logger.info(f"rsp: {rsp}")
|
|
assert rsp['code'] == 0
|
|
# insert data
|
|
for i in range(insert_round):
|
|
data = []
|
|
for i in range(nb):
|
|
if auto_id:
|
|
tmp = {
|
|
"user_id": i,
|
|
"word_count": i,
|
|
"book_describe": f"book_{i}",
|
|
"float_vector": gen_vector(datatype="FloatVector", dim=dim),
|
|
"float16_vector": (
|
|
gen_vector(datatype="FloatVector", dim=dim)
|
|
if pass_fp32_to_fp16_or_bf16
|
|
else gen_vector(datatype="Float16Vector", dim=dim)
|
|
),
|
|
"bfloat16_vector": (
|
|
gen_vector(datatype="FloatVector", dim=dim)
|
|
if pass_fp32_to_fp16_or_bf16
|
|
else gen_vector(datatype="BFloat16Vector", dim=dim)
|
|
),
|
|
"binary_vector": gen_vector(datatype="BinaryVector", dim=dim)
|
|
}
|
|
else:
|
|
tmp = {
|
|
"book_id": i,
|
|
"user_id": i,
|
|
"word_count": i,
|
|
"book_describe": f"book_{i}",
|
|
"float_vector": gen_vector(datatype="FloatVector", dim=dim),
|
|
"float16_vector": (
|
|
gen_vector(datatype="FloatVector", dim=dim)
|
|
if pass_fp32_to_fp16_or_bf16
|
|
else gen_vector(datatype="Float16Vector", dim=dim)
|
|
),
|
|
"bfloat16_vector": (
|
|
gen_vector(datatype="FloatVector", dim=dim)
|
|
if pass_fp32_to_fp16_or_bf16
|
|
else gen_vector(datatype="BFloat16Vector", dim=dim)
|
|
),
|
|
"binary_vector": gen_vector(datatype="BinaryVector", dim=dim)
|
|
}
|
|
if enable_dynamic_schema:
|
|
tmp.update({f"dynamic_field_{i}": i})
|
|
data.append(tmp)
|
|
payload = {
|
|
"collectionName": name,
|
|
"data": data,
|
|
}
|
|
rsp = self.vector_client.vector_insert(payload)
|
|
assert rsp['code'] == 0
|
|
assert rsp['data']['insertCount'] == nb
|
|
c = Collection(name)
|
|
res = c.query(
|
|
expr="user_id > 0",
|
|
limit=50,
|
|
output_fields=["*"],
|
|
)
|
|
logger.info(f"res: {res}")
|
|
# query data to make sure the data is inserted
|
|
rsp = self.vector_client.vector_query({"collectionName": name, "filter": "user_id > 0", "limit": 50})
|
|
assert rsp['code'] == 0
|
|
assert len(rsp['data']) == 50
|
|
|
|
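    # Note: pass_fp32_to_fp16_or_bf16 exercises sending plain float32 lists into
    # Float16Vector/BFloat16Vector fields, relying on the server-side conversion
    # rather than pre-packed fp16/bf16 bytes.
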
@pytest.mark.parametrize("expr", ["10+20 <= uid < 20+30", "uid in [1,2,3,4]",
|
|
"uid > 0", "uid >= 0", "uid > 0",
|
|
"uid > -100 and uid < 100"])
|
|
@pytest.mark.parametrize("include_output_fields", [True, False])
|
|
@pytest.mark.parametrize("partial_fields", [True, False])
|
|
def test_query_vector_with_int64_filter(self, expr, include_output_fields, partial_fields):
|
|
"""
|
|
Query a vector with a simple payload
|
|
"""
|
|
name = gen_collection_name()
|
|
self.name = name
|
|
schema_payload, data = self.init_collection(name)
|
|
output_fields = get_common_fields_by_data(data)
|
|
if partial_fields:
|
|
output_fields = output_fields[:len(output_fields) // 2]
|
|
if "uid" not in output_fields:
|
|
output_fields.append("uid")
|
|
else:
|
|
output_fields = output_fields
|
|
|
|
# query data
|
|
payload = {
|
|
"collectionName": name,
|
|
"filter": expr,
|
|
"limit": 100,
|
|
"offset": 0,
|
|
"outputFields": output_fields
|
|
}
|
|
if not include_output_fields:
|
|
payload.pop("outputFields")
|
|
if 'vector' in output_fields:
|
|
output_fields.remove("vector")
|
|
time.sleep(5)
|
|
rsp = self.vector_client.vector_query(payload)
|
|
assert rsp['code'] == 0
|
|
res = rsp['data']
|
|
logger.info(f"res: {len(res)}")
|
|
for r in res:
|
|
uid = r['uid']
|
|
assert eval(expr) is True
|
|
for field in output_fields:
|
|
assert field in r
|
|
|
|
    def test_query_vector_with_count(self):
        """
        Query the collection row count via count(*)
        """
        name = gen_collection_name()
        self.name = name
        self.init_collection(name, nb=3000)
        # query for "count(*)"
        payload = {
            "collectionName": name,
            "filter": " ",
            "limit": 0,
            "outputFields": ["count(*)"]
        }
        rsp = self.vector_client.vector_query(payload)
        assert rsp['code'] == 0
        assert rsp['data'][0]['count(*)'] == 3000

@pytest.mark.xfail(reason="query by id is not supported")
|
|
def test_query_vector_by_id(self):
|
|
"""
|
|
Query a vector with a simple payload
|
|
"""
|
|
name = gen_collection_name()
|
|
self.name = name
|
|
_, _, insert_ids = self.init_collection(name, nb=3000, return_insert_id=True)
|
|
payload = {
|
|
"collectionName": name,
|
|
"id": insert_ids,
|
|
}
|
|
rsp = self.vector_client.vector_query(payload)
|
|
assert rsp['code'] == 0
|
|
|
|
@pytest.mark.parametrize("filter_expr", ["name > \"placeholder\"", "name like \"placeholder%\""])
|
|
@pytest.mark.parametrize("include_output_fields", [True, False])
|
|
def test_query_vector_with_varchar_filter(self, filter_expr, include_output_fields):
|
|
"""
|
|
Query a vector with a complex payload
|
|
"""
|
|
name = gen_collection_name()
|
|
self.name = name
|
|
nb = 200
|
|
dim = 128
|
|
limit = 100
|
|
schema_payload, data = self.init_collection(name, dim=dim, nb=nb)
|
|
names = []
|
|
for item in data:
|
|
names.append(item.get("name"))
|
|
names.sort()
|
|
logger.info(f"names: {names}")
|
|
mid = len(names) // 2
|
|
prefix = names[mid][0:2]
|
|
# search data
|
|
output_fields = get_common_fields_by_data(data)
|
|
filter_expr = filter_expr.replace("placeholder", prefix)
|
|
logger.info(f"filter_expr: {filter_expr}")
|
|
payload = {
|
|
"collectionName": name,
|
|
"outputFields": output_fields,
|
|
"filter": filter_expr,
|
|
"limit": limit,
|
|
"offset": 0,
|
|
}
|
|
if not include_output_fields:
|
|
payload.pop("outputFields")
|
|
rsp = self.vector_client.vector_query(payload)
|
|
assert rsp['code'] == 0
|
|
res = rsp['data']
|
|
logger.info(f"res: {len(res)}")
|
|
assert len(res) <= limit
|
|
for item in res:
|
|
name = item.get("name")
|
|
logger.info(f"name: {name}")
|
|
if ">" in filter_expr:
|
|
assert name > prefix
|
|
if "like" in filter_expr:
|
|
assert name.startswith(prefix)
|
|
|
|
@pytest.mark.parametrize("sum_of_limit_offset", [16384])
|
|
def test_query_vector_with_large_sum_of_limit_offset(self, sum_of_limit_offset):
|
|
"""
|
|
Query a vector with sum of limit and offset larger than max value
|
|
"""
|
|
max_sum_of_limit_offset = 16384
|
|
name = gen_collection_name()
|
|
filter_expr = "name > \"placeholder\""
|
|
self.name = name
|
|
nb = 200
|
|
dim = 128
|
|
limit = 100
|
|
offset = sum_of_limit_offset - limit
|
|
schema_payload, data = self.init_collection(name, dim=dim, nb=nb)
|
|
names = []
|
|
for item in data:
|
|
names.append(item.get("name"))
|
|
names.sort()
|
|
logger.info(f"names: {names}")
|
|
mid = len(names) // 2
|
|
prefix = names[mid][0:2]
|
|
# search data
|
|
output_fields = get_common_fields_by_data(data)
|
|
filter_expr = filter_expr.replace("placeholder", prefix)
|
|
logger.info(f"filter_expr: {filter_expr}")
|
|
payload = {
|
|
"collectionName": name,
|
|
"outputFields": output_fields,
|
|
"filter": filter_expr,
|
|
"limit": limit,
|
|
"offset": offset,
|
|
}
|
|
rsp = self.vector_client.vector_query(payload)
|
|
if sum_of_limit_offset > max_sum_of_limit_offset:
|
|
assert rsp['code'] == 1
|
|
return
|
|
assert rsp['code'] == 0
|
|
res = rsp['data']
|
|
logger.info(f"res: {len(res)}")
|
|
assert len(res) <= limit
|
|
for item in res:
|
|
name = item.get("name")
|
|
logger.info(f"name: {name}")
|
|
if ">" in filter_expr:
|
|
assert name > prefix
|
|
if "like" in filter_expr:
|
|
assert name.startswith(prefix)
|
|
|
|
@pytest.mark.parametrize("tokenizer", ["jieba", "standard"])
|
|
def test_query_vector_with_text_match_filter(self, tokenizer):
|
|
"""
|
|
Query a vector with a simple payload
|
|
"""
|
|
fake = fake_en
|
|
language = "en"
|
|
if tokenizer == "jieba":
|
|
fake = fake_zh
|
|
language = "zh"
|
|
# create a collection
|
|
dim = 128
|
|
analyzer_params = {
|
|
"tokenizer": tokenizer,
|
|
}
|
|
name = gen_collection_name()
|
|
fields = [
|
|
FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
|
|
FieldSchema(
|
|
name="word",
|
|
dtype=DataType.VARCHAR,
|
|
max_length=65535,
|
|
enable_analyzer=True,
|
|
enable_match=True,
|
|
is_partition_key=True,
|
|
analyzer_params=analyzer_params,
|
|
),
|
|
FieldSchema(
|
|
name="sentence",
|
|
dtype=DataType.VARCHAR,
|
|
max_length=65535,
|
|
enable_analyzer=True,
|
|
enable_match=True,
|
|
analyzer_params=analyzer_params,
|
|
),
|
|
FieldSchema(
|
|
name="paragraph",
|
|
dtype=DataType.VARCHAR,
|
|
max_length=65535,
|
|
enable_analyzer=True,
|
|
enable_match=True,
|
|
analyzer_params=analyzer_params,
|
|
),
|
|
FieldSchema(
|
|
name="text",
|
|
dtype=DataType.VARCHAR,
|
|
max_length=65535,
|
|
enable_analyzer=True,
|
|
enable_match=True,
|
|
analyzer_params=analyzer_params,
|
|
),
|
|
FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
|
|
]
|
|
schema = CollectionSchema(fields=fields, description="test collection")
|
|
collection = Collection(name=name, schema=schema
|
|
)
|
|
rsp = self.collection_client.collection_describe(name)
|
|
logger.info(f"rsp: {rsp}")
|
|
assert rsp['code'] == 0
|
|
data_size = 3000
|
|
batch_size = 1000
|
|
# insert data
|
|
data = [
|
|
{
|
|
"id": i,
|
|
"word": fake.word().lower(),
|
|
"sentence": fake.sentence().lower(),
|
|
"paragraph": fake.sentence().lower(),
|
|
"text": fake.text().lower(),
|
|
"emb": [random.random() for _ in range(dim)]
|
|
}
|
|
for i in range(data_size)
|
|
]
|
|
df = pd.DataFrame(data)
|
|
text_fields = ["word", "sentence", "paragraph", "text"]
|
|
wf_map = {}
|
|
for field in text_fields:
|
|
wf_map[field] = analyze_documents(df[field].tolist(), language=language)
|
|
for i in range(0, data_size, batch_size):
|
|
tmp = data[i:i + batch_size]
|
|
payload = {
|
|
"collectionName": name,
|
|
"data": tmp,
|
|
}
|
|
rsp = self.vector_client.vector_insert(payload)
|
|
assert rsp['code'] == 0
|
|
assert rsp['data']['insertCount'] == len(tmp)
|
|
collection.create_index(
|
|
"emb",
|
|
{"index_type": "IVF_SQ8", "metric_type": "L2", "params": {"nlist": 64}},
|
|
)
|
|
collection.load()
|
|
time.sleep(5)
|
|
for field in text_fields:
|
|
token = wf_map[field].most_common()[0][0]
|
|
expr = f"text_match({field}, '{token}')"
|
|
logger.info(f"expr: {expr}")
|
|
rsp = self.vector_client.vector_query({"collectionName": name, "filter": f"{expr}", "outputFields": ["*"]})
|
|
assert rsp['code'] == 0, rsp
|
|
for d in rsp['data']:
|
|
assert token in d[field]
|
|
|
|
    @pytest.mark.parametrize("insert_round", [1])
    @pytest.mark.parametrize("auto_id", [True])
    @pytest.mark.parametrize("is_partition_key", [True])
    @pytest.mark.parametrize("enable_dynamic_schema", [True])
    @pytest.mark.parametrize("nb", [3000])
    @pytest.mark.parametrize("dim", [128])
    def test_query_entities_with_default_none(self, nb, dim, insert_round, auto_id, is_partition_key,
                                              enable_dynamic_schema):
        """
        Insert entities with default values and None, then query them back
        """
        # create a collection
        name = gen_collection_name()
        payload = {
            "collectionName": name,
            "schema": {
                "autoId": auto_id,
                "enableDynamicField": enable_dynamic_schema,
                "fields": [
                    {"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
                    {"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": is_partition_key,
                     "elementTypeParams": {}},
                    {"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}, "defaultValue": 8888},
                    {"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "25536"},
                     "nullable": True},
                    {"fieldName": "bool", "dataType": "Bool", "elementTypeParams": {}, "nullable": True},
                    {"fieldName": "json", "dataType": "JSON", "elementTypeParams": {}, "nullable": True},
                    {"fieldName": "int_array", "dataType": "Array", "elementDataType": "Int64",
                     "elementTypeParams": {"max_capacity": "1024"}, "nullable": True},
                    {"fieldName": "varchar_array", "dataType": "Array", "elementDataType": "VarChar",
                     "elementTypeParams": {"max_capacity": "1024", "max_length": "256"}, "nullable": True},
                    {"fieldName": "bool_array", "dataType": "Array", "elementDataType": "Bool",
                     "elementTypeParams": {"max_capacity": "1024"}, "nullable": True},
                    {"fieldName": "text_emb", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}},
                ]
            },
            "indexParams": [
                {"fieldName": "text_emb", "indexName": "text_emb", "metricType": "L2"},
            ]
        }
        rsp = self.collection_client.collection_create(payload)
        assert rsp['code'] == 0
        rsp = self.collection_client.collection_describe(name)
        logger.info(f"rsp: {rsp}")
        assert rsp['code'] == 0
        # insert data (outer loop variable renamed to avoid shadowing the row index)
        for _ in range(insert_round):
            data = []
            for i in range(nb):
                tmp = {
                    "user_id": i,
                    "word_count": None,      # falls back to defaultValue 8888
                    "book_describe": None,   # nullable, stored as null
                    "bool": random.choice([True, False]),
                    "json": None,
                    "int_array": None,
                    "varchar_array": None,
                    "bool_array": None,
                    "text_emb": preprocessing.normalize([np.array([random.random() for _ in range(dim)])])[0].tolist(),
                }
                if not auto_id:
                    tmp["book_id"] = i
                if enable_dynamic_schema:
                    tmp.update({f"dynamic_field_{i}": i})
                data.append(tmp)
            payload = {
                "collectionName": name,
                "data": data,
            }
            rsp = self.vector_client.vector_insert(payload)
            assert rsp['code'] == 0
            assert rsp['data']['insertCount'] == nb
        # query data to make sure the data is inserted
        payload = {
            "collectionName": name,
            "filter": "user_id > 0",
            "limit": 50,
            "outputFields": ["*"]
        }
        rsp = self.vector_client.vector_query(payload)
        assert rsp['code'] == 0
        assert rsp['data'][0]['book_describe'] is None
        assert rsp['data'][0]['word_count'] == 8888
        assert rsp['data'][0]['json'] is None
        assert rsp['data'][0]['varchar_array'] is None
        assert len(rsp['data']) == 50

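# Illustration only: the defaultValue/nullable round-trip asserted above,
# written out for reference. This mapping is hypothetical documentation and
# is not consumed by the tests.
EXPECTED_NONE_READBACK = {
    "word_count": 8888,     # None + defaultValue 8888 -> default is applied
    "book_describe": None,  # None + nullable -> stored and read back as null
    "json": None,
    "varchar_array": None,
}

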
@pytest.mark.L0
class TestQueryVectorNegative(TestBase):

    def test_query_with_wrong_filter_expr(self):
        name = gen_collection_name()
        self.name = name
        nb = 200
        dim = 128
        schema_payload, data, insert_ids = self.init_collection(name, dim=dim, nb=nb, return_insert_id=True)
        output_fields = get_common_fields_by_data(data)
        # a bare id list is not a boolean expression, so the filter is invalid
        payload = {
            "collectionName": name,
            "outputFields": output_fields,
            "filter": f"{insert_ids}",
        }
        rsp = self.vector_client.vector_query(payload)
        assert rsp['code'] == 1100
        assert "failed to create query plan" in rsp['message']


@pytest.mark.L0
class TestGetVector(TestBase):

    def test_get_vector_with_simple_payload(self):
        """
        Search with a simple payload, then get one entity by id
        """
        name = gen_collection_name()
        self.name = name
        self.init_collection(name)

        # search data
        dim = 128
        vector_to_search = preprocessing.normalize([np.array([random.random() for _ in range(dim)])])[0].tolist()
        payload = {
            "collectionName": name,
            "data": [vector_to_search],
        }
        rsp = self.vector_client.vector_search(payload)
        assert rsp['code'] == 0
        res = rsp['data']
        logger.info(f"res: {len(res)}")
        limit = int(payload.get("limit", 100))
        assert len(res) == limit
        ids = [item['id'] for item in res]
        assert len(ids) == len(set(ids))
        # get one entity by the first returned id
        payload = {
            "collectionName": name,
            "outputFields": ["*"],
            "id": ids[0],
        }
        rsp = self.vector_client.vector_get(payload)
        assert rsp['code'] == 0
        res = rsp['data']
        logger.info(f"res: {res}")
        for item in res:
            assert item['id'] == ids[0]

    @pytest.mark.L0
    @pytest.mark.parametrize("id_field_type", ["list", "one"])
    @pytest.mark.parametrize("include_invalid_id", [True, False])
    @pytest.mark.parametrize("include_output_fields", [True, False])
    def test_get_vector_complex(self, id_field_type, include_output_fields, include_invalid_id):
        name = gen_collection_name()
        self.name = name
        nb = 200
        dim = 128
        schema_payload, data = self.init_collection(name, dim=dim, nb=nb)
        output_fields = get_common_fields_by_data(data)
        uids = []
        for item in data:
            uids.append(item.get("uid"))
        payload = {
            "collectionName": name,
            "outputFields": output_fields,
            "filter": f"uid in {uids}",
        }
        rsp = self.vector_client.vector_query(payload)
        assert rsp['code'] == 0
        res = rsp['data']
        logger.info(f"res: {len(res)}")
        ids = []
        for r in res:
            ids.append(r['id'])
        logger.info(f"ids: {len(ids)}")
        id_to_get = None
        if id_field_type == "list":
            id_to_get = ids
        if id_field_type == "one":
            id_to_get = ids[0]
        if include_invalid_id:
            if isinstance(id_to_get, list):
                id_to_get[-1] = 0
            else:
                id_to_get = 0
        # get by id or id list
        payload = {
            "collectionName": name,
            "outputFields": output_fields,
            "id": id_to_get
        }
        rsp = self.vector_client.vector_get(payload)
        assert rsp['code'] == 0
        res = rsp['data']
        # ids that do not exist are silently dropped from the result
        if isinstance(id_to_get, list):
            if include_invalid_id:
                assert len(res) == len(id_to_get) - 1
            else:
                assert len(res) == len(id_to_get)
        else:
            if include_invalid_id:
                assert len(res) == 0
            else:
                assert len(res) == 1
        for r in rsp['data']:
            if isinstance(id_to_get, list):
                assert r['id'] in id_to_get
            else:
                assert r['id'] == id_to_get
            if include_output_fields:
                for field in output_fields:
                    assert field in r


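# Illustration only: the result-size contract asserted in test_get_vector_complex,
# written as a hypothetical helper (not used by the tests). The get endpoint drops
# ids that do not exist rather than returning an error.
def _expected_get_count(ids, include_invalid_id):
    if isinstance(ids, list):
        return len(ids) - 1 if include_invalid_id else len(ids)
    return 0 if include_invalid_id else 1

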
@pytest.mark.L0
class TestDeleteVector(TestBase):

    @pytest.mark.xfail(reason="delete by id is not supported")
    def test_delete_vector_by_id(self):
        """
        Delete a vector by id (expected to fail: delete by id is not supported)
        """
        name = gen_collection_name()
        self.name = name
        _, _, insert_ids = self.init_collection(name, nb=3000, return_insert_id=True)
        payload = {
            "collectionName": name,
            "id": insert_ids,
        }
        rsp = self.vector_client.vector_delete(payload)
        assert rsp['code'] == 0

    @pytest.mark.parametrize("id_field_type", ["list", "one"])
    def test_delete_vector_by_pk_field_ids(self, id_field_type):
        name = gen_collection_name()
        self.name = name
        nb = 200
        dim = 128
        schema_payload, data, insert_ids = self.init_collection(name, dim=dim, nb=nb, return_insert_id=True)
        time.sleep(1)
        id_to_delete = None
        if id_field_type == "list":
            id_to_delete = insert_ids
        if id_field_type == "one":
            id_to_delete = insert_ids[0]
        if isinstance(id_to_delete, list):
            payload = {
                "collectionName": name,
                "filter": f"id in {id_to_delete}"
            }
        else:
            payload = {
                "collectionName": name,
                "filter": f"id == {id_to_delete}"
            }
        rsp = self.vector_client.vector_delete(payload)
        assert rsp['code'] == 0
        # verify data deleted by get
        payload = {
            "collectionName": name,
            "id": id_to_delete
        }
        rsp = self.vector_client.vector_get(payload)
        assert len(rsp['data']) == 0

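    # Illustration only: a hypothetical helper (unused by the tests) condensing
    # the branching above — the delete endpoint takes a boolean filter, so a
    # pk list becomes an `in` expression and a single pk an equality check.
    @staticmethod
    def _pk_delete_filter(pk_field, ids):
        if isinstance(ids, list):
            return f"{pk_field} in {ids}"
        return f"{pk_field} == {ids}"
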
    @pytest.mark.parametrize("id_field_type", ["list", "one"])
    def test_delete_vector_by_filter_pk_field(self, id_field_type):
        name = gen_collection_name()
        self.name = name
        nb = 200
        dim = 128
        schema_payload, data = self.init_collection(name, dim=dim, nb=nb)
        time.sleep(1)
        output_fields = get_common_fields_by_data(data)
        uids = []
        for item in data:
            uids.append(item.get("uid"))
        payload = {
            "collectionName": name,
            "outputFields": output_fields,
            "filter": f"uid in {uids}",
        }
        rsp = self.vector_client.vector_query(payload)
        assert rsp['code'] == 0
        res = rsp['data']
        logger.info(f"res: {len(res)}")
        ids = []
        for r in res:
            ids.append(r['id'])
        logger.info(f"ids: {len(ids)}")
        id_to_get = None
        if id_field_type == "list":
            id_to_get = ids
        if id_field_type == "one":
            id_to_get = ids[0]
        if isinstance(id_to_get, list) and len(id_to_get) >= 100:
            id_to_get = id_to_get[-100:]
        # delete by pk filter
        if isinstance(id_to_get, list):
            payload = {
                "collectionName": name,
                "filter": f"id in {id_to_get}",
            }
        else:
            payload = {
                "collectionName": name,
                "filter": f"id == {id_to_get}",
            }
        rsp = self.vector_client.vector_delete(payload)
        assert rsp['code'] == 0
        logger.info(f"delete res: {rsp}")

        # verify data deleted
        if not isinstance(id_to_get, list):
            id_to_get = [id_to_get]
        payload = {
            "collectionName": name,
            "filter": f"id in {id_to_get}",
        }
        time.sleep(5)
        rsp = self.vector_client.vector_query(payload)
        assert rsp['code'] == 0
        assert len(rsp['data']) == 0

    def test_delete_vector_by_custom_pk_field(self):
        dim = 128
        nb = 3000
        insert_round = 1

        name = gen_collection_name()
        payload = {
            "collectionName": name,
            "schema": {
                "fields": [
                    {"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
                    {"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
                    {"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
                    {"fieldName": "text_emb", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}}
                ]
            },
            "indexParams": [{"fieldName": "text_emb", "indexName": "text_emb_index", "metricType": "L2"}]
        }
        rsp = self.collection_client.collection_create(payload)
        assert rsp['code'] == 0
        rsp = self.collection_client.collection_describe(name)
        logger.info(f"rsp: {rsp}")
        assert rsp['code'] == 0
        pk_values = []
        # insert data
        for i in range(insert_round):
            data = []
            for j in range(nb):
                tmp = {
                    "book_id": i * nb + j,
                    "word_count": i * nb + j,
                    "book_describe": f"book_{i * nb + j}",
                    "text_emb": preprocessing.normalize([np.array([random.random() for _ in range(dim)])])[0].tolist()
                }
                data.append(tmp)
            payload = {
                "collectionName": name,
                "data": data,
            }
            pk_values.extend(d["book_id"] for d in data)
            body_size = sys.getsizeof(json.dumps(payload))
            logger.info(f"body size: {body_size / 1024 / 1024} MB")
            rsp = self.vector_client.vector_insert(payload)
            assert rsp['code'] == 0
            assert rsp['data']['insertCount'] == nb
        # query data before delete
        c = Collection(name)
        res = c.query(expr="", output_fields=["count(*)"])
        logger.info(f"res: {res}")

        # delete data by the custom primary key values
        payload = {
            "collectionName": name,
            "filter": f"book_id in {pk_values}",
        }
        rsp = self.vector_client.vector_delete(payload)
        assert rsp['code'] == 0

        # query data after delete
        res = c.query(expr="", output_fields=["count(*)"], consistency_level="Strong")
        logger.info(f"res: {res}")
        assert res[0]["count(*)"] == 0

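    # Illustration only: a count(*) query at Strong consistency forces the read
    # to observe the preceding delete, which is why the verification above needs
    # no explicit flush. Hypothetical helper, not used by the tests.
    @staticmethod
    def _count_entities(collection_name):
        c = Collection(collection_name)
        return c.query(expr="", output_fields=["count(*)"], consistency_level="Strong")[0]["count(*)"]
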
    def test_delete_vector_by_filter_custom_field(self):
        dim = 128
        nb = 3000
        insert_round = 1

        name = gen_collection_name()
        payload = {
            "collectionName": name,
            "schema": {
                "fields": [
                    {"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
                    {"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
                    {"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
                    {"fieldName": "text_emb", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}}
                ]
            },
            "indexParams": [{"fieldName": "text_emb", "indexName": "text_emb_index", "metricType": "L2"}]
        }
        rsp = self.collection_client.collection_create(payload)
        assert rsp['code'] == 0
        rsp = self.collection_client.collection_describe(name)
        logger.info(f"rsp: {rsp}")
        assert rsp['code'] == 0
        # insert data
        for i in range(insert_round):
            data = []
            for j in range(nb):
                tmp = {
                    "book_id": i * nb + j,
                    "word_count": i * nb + j,
                    "book_describe": f"book_{i * nb + j}",
                    "text_emb": preprocessing.normalize([np.array([random.random() for _ in range(dim)])])[0].tolist()
                }
                data.append(tmp)
            payload = {
                "collectionName": name,
                "data": data,
            }
            body_size = sys.getsizeof(json.dumps(payload))
            logger.info(f"body size: {body_size / 1024 / 1024} MB")
            rsp = self.vector_client.vector_insert(payload)
            assert rsp['code'] == 0
            assert rsp['data']['insertCount'] == nb
        # query data before delete
        c = Collection(name)
        res = c.query(expr="", output_fields=["count(*)"])
        logger.info(f"res: {res}")

        # delete data by a filter on a non-pk scalar field
        payload = {
            "collectionName": name,
            "filter": "word_count >= 0",
        }
        rsp = self.vector_client.vector_delete(payload)
        assert rsp['code'] == 0

        # query data after delete
        res = c.query(expr="", output_fields=["count(*)"], consistency_level="Strong")
        logger.info(f"res: {res}")
        assert res[0]["count(*)"] == 0

    def test_delete_vector_with_non_primary_key(self):
        """
        Delete entities with a filter on a non-primary-key field;
        the matched entities are expected to be deleted
        """
        name = gen_collection_name()
        self.name = name
        self.init_collection(name, dim=128, nb=300)
        expr = "uid > 0"
        payload = {
            "collectionName": name,
            "filter": expr,
            "limit": 3000,
            "offset": 0,
            "outputFields": ["id", "uid"]
        }
        rsp = self.vector_client.vector_query(payload)
        assert rsp['code'] == 0
        res = rsp['data']
        logger.info(f"res: {len(res)}")
        id_list = [r['uid'] for r in res]
        delete_expr = f"uid in {id_list[:10]}"
        # query data before delete
        payload = {
            "collectionName": name,
            "filter": delete_expr,
            "limit": 3000,
            "offset": 0,
            "outputFields": ["id", "uid"]
        }
        rsp = self.vector_client.vector_query(payload)
        assert rsp['code'] == 0
        res = rsp['data']
        num_before_delete = len(res)
        logger.info(f"num before delete: {num_before_delete}")
        # delete data
        payload = {
            "collectionName": name,
            "filter": delete_expr,
        }
        rsp = self.vector_client.vector_delete(payload)
        assert rsp['code'] == 0
        # query data after delete
        payload = {
            "collectionName": name,
            "filter": delete_expr,
            "limit": 3000,
            "offset": 0,
            "outputFields": ["id", "uid"]
        }
        time.sleep(1)
        rsp = self.vector_client.vector_query(payload)
        assert len(rsp["data"]) == 0


@pytest.mark.L0
class TestDeleteVectorNegative(TestBase):

    def test_delete_vector_with_invalid_collection_name(self):
        """
        Delete a vector with an invalid collection name
        """
        name = gen_collection_name()
        self.name = name
        self.init_collection(name, dim=128, nb=3000)

        # query data
        expr = "id > 0"
        payload = {
            "collectionName": name,
            "filter": expr,
            "limit": 3000,
            "offset": 0,
            "outputFields": ["id", "uid"]
        }
        rsp = self.vector_client.vector_query(payload)
        assert rsp['code'] == 0
        res = rsp['data']
        logger.info(f"res: {len(res)}")
        id_list = [r['id'] for r in res]
        delete_expr = f"id in {id_list[:10]}"
        # query data before delete
        payload = {
            "collectionName": name,
            "filter": delete_expr,
            "limit": 3000,
            "offset": 0,
            "outputFields": ["id", "uid"]
        }
        rsp = self.vector_client.vector_query(payload)
        assert rsp['code'] == 0
        res = rsp['data']
        logger.info(f"res: {len(res)}")
        # delete data with a non-existent collection name
        payload = {
            "collectionName": name + "_invalid",
            "filter": delete_expr,
        }
        rsp = self.vector_client.vector_delete(payload)
        assert rsp['code'] == 100
        assert "can't find collection" in rsp['message']


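# Illustration only: REST error codes observed by the assertions in this file.
# This mapping is hypothetical documentation and is not consumed by the tests.
OBSERVED_ERROR_CODES = {
    100: "collection not found",
    1100: "invalid parameter (e.g. a filter that fails to build a query plan)",
    1800: "authentication failure (invalid api key)",
}

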
@pytest.mark.L1
class TestVectorWithAuth(TestBase):

    def test_upsert_vector_with_invalid_api_key(self):
        """
        Upsert a vector with an invalid api key
        """
        # create a collection
        name = gen_collection_name()
        dim = 128
        payload = {
            "collectionName": name,
            "dimension": dim,
        }
        rsp = self.collection_client.collection_create(payload)
        assert rsp['code'] == 0
        rsp = self.collection_client.collection_describe(name)
        assert rsp['code'] == 0
        # upsert data with an invalid api key
        nb = 10
        data = [
            {
                "vector": [np.float64(random.random()) for _ in range(dim)],
            } for _ in range(nb)
        ]
        payload = {
            "collectionName": name,
            "data": data,
        }
        body_size = sys.getsizeof(json.dumps(payload))
        logger.info(f"body size: {body_size / 1024 / 1024} MB")
        client = self.vector_client
        client.api_key = "invalid_api_key"
        rsp = client.vector_upsert(payload)
        assert rsp['code'] == 1800

    def test_insert_vector_with_invalid_api_key(self):
        """
        Insert a vector with an invalid api key
        """
        # create a collection
        name = gen_collection_name()
        dim = 128
        payload = {
            "collectionName": name,
            "dimension": dim,
        }
        rsp = self.collection_client.collection_create(payload)
        assert rsp['code'] == 0
        rsp = self.collection_client.collection_describe(name)
        assert rsp['code'] == 0
        # insert data with an invalid api key
        nb = 10
        data = [
            {
                "vector": [np.float64(random.random()) for _ in range(dim)],
            } for _ in range(nb)
        ]
        payload = {
            "collectionName": name,
            "data": data,
        }
        body_size = sys.getsizeof(json.dumps(payload))
        logger.info(f"body size: {body_size / 1024 / 1024} MB")
        client = self.vector_client
        client.api_key = "invalid_api_key"
        rsp = client.vector_insert(payload)
        assert rsp['code'] == 1800

    def test_delete_vector_with_invalid_api_key(self):
        """
        Delete a vector with an invalid api key
        """
        name = gen_collection_name()
        self.name = name
        nb = 200
        dim = 128
        schema_payload, data = self.init_collection(name, dim=dim, nb=nb)
        output_fields = get_common_fields_by_data(data)
        uids = []
        for item in data:
            uids.append(item.get("uid"))
        payload = {
            "collectionName": name,
            "outputFields": output_fields,
            "filter": f"uid in {uids}",
        }
        rsp = self.vector_client.vector_query(payload)
        assert rsp['code'] == 0
        res = rsp['data']
        logger.info(f"res: {len(res)}")
        # delete by uid filter with an invalid api key
        payload = {
            "collectionName": name,
            "filter": f"uid in {uids}"
        }
        client = self.vector_client
        client.api_key = "invalid_api_key"
        rsp = client.vector_delete(payload)
        assert rsp['code'] == 1800