import random
import sys
import json
import time

import numpy as np
import pandas as pd
import pytest
from sklearn import preprocessing
from faker import Faker
from pymilvus import (
    FieldSchema, CollectionSchema, DataType,
    Collection, utility
)

from base.testbase import TestBase
from utils import constant
from utils.util_log import test_log as logger
from utils.utils import (
    gen_collection_name, gen_unique_str, get_sorted_distance,
    get_data_by_payload, get_common_fields_by_data, gen_vector,
    analyze_documents, patch_faker_text,
    en_vocabularies_distribution, zh_vocabularies_distribution,
)

Faker.seed(19530)
fake_en = Faker("en_US")
fake_zh = Faker("zh_CN")
patch_faker_text(fake_en, en_vocabularies_distribution)
patch_faker_text(fake_zh, zh_vocabularies_distribution)


@pytest.mark.L0
class TestInsertVector(TestBase):

    @pytest.mark.parametrize("insert_round", [3])
    @pytest.mark.parametrize("nb", [3000])
    @pytest.mark.parametrize("dim", [128])
    def test_insert_entities_with_simple_payload(self, nb, dim, insert_round):
        """
        Insert a vector with a simple payload
        """
        # create a collection
        name = gen_collection_name()
        collection_payload = {
            "collectionName": name,
            "dimension": dim,
            "metricType": "L2"
        }
        rsp = self.collection_client.collection_create(collection_payload)
        assert rsp['code'] == 0
        rsp = self.collection_client.collection_describe(name)
        logger.info(f"rsp: {rsp}")
        assert rsp['code'] == 0
        # insert data
        for i in range(insert_round):
            data = get_data_by_payload(collection_payload, nb)
            payload = {
                "collectionName": name,
                "data": data,
            }
            body_size = sys.getsizeof(json.dumps(payload))
            logger.info(f"body size: {body_size / 1024 / 1024} MB")
            rsp = self.vector_client.vector_insert(payload)
            assert rsp['code'] == 0
            assert rsp['data']['insertCount'] == nb

    @pytest.mark.parametrize("insert_round", [1])
    @pytest.mark.parametrize("auto_id", [True, False])
    @pytest.mark.parametrize("is_partition_key", [True, False])
    @pytest.mark.parametrize("enable_dynamic_schema", [True, False])
    @pytest.mark.parametrize("nb", [3000])
    @pytest.mark.parametrize("dim", [128])
    def test_insert_entities_with_all_scalar_datatype(self, nb, dim, insert_round, auto_id,
                                                      is_partition_key, enable_dynamic_schema):
        """
        Insert entities covering all scalar data types
        """
        # create a collection
        name = gen_collection_name()
        payload = {
            "collectionName": name,
            "schema": {
                "autoId": auto_id,
                "enableDynamicField": enable_dynamic_schema,
                "fields": [
                    {"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
                    {"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": is_partition_key,
                     "elementTypeParams": {}},
                    {"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
                    {"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
                    {"fieldName": "bool", "dataType": "Bool", "elementTypeParams": {}},
                    {"fieldName": "json", "dataType": "JSON", "elementTypeParams": {}},
                    {"fieldName": "int_array", "dataType": "Array", "elementDataType": "Int64",
                     "elementTypeParams": {"max_capacity": "1024"}},
                    {"fieldName": "varchar_array", "dataType": "Array", "elementDataType": "VarChar",
                     "elementTypeParams": {"max_capacity": "1024", "max_length": "256"}},
                    {"fieldName": "bool_array", "dataType": "Array", "elementDataType": "Bool",
                     "elementTypeParams": {"max_capacity": "1024"}},
                    {"fieldName": "text_emb", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}},
                    {"fieldName": "image_emb", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}},
                ]
            },
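            # NOTE: Array fields above declare both an elementDataType and
            # elementTypeParams (max_capacity, plus max_length for VarChar
            # elements); numeric schema params travel as strings in this payload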
"indexParams": [ {"fieldName": "text_emb", "indexName": "text_emb", "metricType": "L2"}, {"fieldName": "image_emb", "indexName": "image_emb", "metricType": "L2"} ] } rsp = self.collection_client.collection_create(payload) assert rsp['code'] == 0 rsp = self.collection_client.collection_describe(name) logger.info(f"rsp: {rsp}") assert rsp['code'] == 0 # insert data for i in range(insert_round): data = [] for i in range(nb): if auto_id: tmp = { "user_id": i, "word_count": i, "book_describe": f"book_{i}", "bool": random.choice([True, False]), "json": {"key": i}, "int_array": [i], "varchar_array": [f"varchar_{i}"], "bool_array": [random.choice([True, False])], "text_emb": preprocessing.normalize([np.array([random.random() for _ in range(dim)])])[ 0].tolist(), "image_emb": preprocessing.normalize([np.array([random.random() for _ in range(dim)])])[ 0].tolist(), } else: tmp = { "book_id": i, "user_id": i, "word_count": i, "book_describe": f"book_{i}", "bool": random.choice([True, False]), "json": {"key": i}, "int_array": [i], "varchar_array": [f"varchar_{i}"], "bool_array": [random.choice([True, False])], "text_emb": preprocessing.normalize([np.array([random.random() for _ in range(dim)])])[ 0].tolist(), "image_emb": preprocessing.normalize([np.array([random.random() for _ in range(dim)])])[ 0].tolist(), } if enable_dynamic_schema: tmp.update({f"dynamic_field_{i}": i}) data.append(tmp) payload = { "collectionName": name, "data": data, } rsp = self.vector_client.vector_insert(payload) assert rsp['code'] == 0 assert rsp['data']['insertCount'] == nb # query data to make sure the data is inserted rsp = self.vector_client.vector_query({"collectionName": name, "filter": "user_id > 0", "limit": 50}) assert rsp['code'] == 0 assert len(rsp['data']) == 50 @pytest.mark.parametrize("insert_round", [1]) @pytest.mark.parametrize("auto_id", [True]) @pytest.mark.parametrize("is_partition_key", [True]) @pytest.mark.parametrize("enable_dynamic_schema", [True]) @pytest.mark.parametrize("nb", [3000]) @pytest.mark.parametrize("dim", [128]) @pytest.mark.parametrize("pass_fp32_to_fp16_or_bf16", [True, False]) def test_insert_entities_with_all_vector_datatype(self, nb, dim, insert_round, auto_id, is_partition_key, enable_dynamic_schema, pass_fp32_to_fp16_or_bf16): """ Insert a vector with a simple payload """ # create a collection name = gen_collection_name() payload = { "collectionName": name, "schema": { "autoId": auto_id, "enableDynamicField": enable_dynamic_schema, "fields": [ {"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}}, {"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": is_partition_key, "elementTypeParams": {}}, {"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}}, {"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}}, {"fieldName": "float_vector", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}}, {"fieldName": "float16_vector", "dataType": "Float16Vector", "elementTypeParams": {"dim": f"{dim}"}}, {"fieldName": "bfloat16_vector", "dataType": "BFloat16Vector", "elementTypeParams": {"dim": f"{dim}"}}, {"fieldName": "binary_vector", "dataType": "BinaryVector", "elementTypeParams": {"dim": f"{dim}"}}, ] }, "indexParams": [ {"fieldName": "float_vector", "indexName": "float_vector", "metricType": "L2"}, {"fieldName": "float16_vector", "indexName": "float16_vector", "metricType": "L2"}, {"fieldName": "bfloat16_vector", "indexName": "bfloat16_vector", "metricType": "L2"}, 
{"fieldName": "binary_vector", "indexName": "binary_vector", "metricType": "HAMMING", "params": {"index_type": "BIN_IVF_FLAT", "nlist": "512"}} ] } rsp = self.collection_client.collection_create(payload) assert rsp['code'] == 0 rsp = self.collection_client.collection_describe(name) logger.info(f"rsp: {rsp}") assert rsp['code'] == 0 # insert data for i in range(insert_round): data = [] for i in range(nb): if auto_id: tmp = { "user_id": i, "word_count": i, "book_describe": f"book_{i}", "float_vector": gen_vector(datatype="FloatVector", dim=dim), "float16_vector": ( gen_vector(datatype="FloatVector", dim=dim) if pass_fp32_to_fp16_or_bf16 else gen_vector(datatype="Float16Vector", dim=dim) ), "bfloat16_vector": ( gen_vector(datatype="FloatVector", dim=dim) if pass_fp32_to_fp16_or_bf16 else gen_vector(datatype="BFloat16Vector", dim=dim) ), "binary_vector": gen_vector(datatype="BinaryVector", dim=dim), } else: tmp = { "book_id": i, "user_id": i, "word_count": i, "book_describe": f"book_{i}", "float_vector": gen_vector(datatype="FloatVector", dim=dim), "float16_vector": ( gen_vector(datatype="FloatVector", dim=dim) if pass_fp32_to_fp16_or_bf16 else gen_vector(datatype="Float16Vector", dim=dim) ), "bfloat16_vector": ( gen_vector(datatype="FloatVector", dim=dim) if pass_fp32_to_fp16_or_bf16 else gen_vector(datatype="BFloat16Vector", dim=dim) ), "binary_vector": gen_vector(datatype="BinaryVector", dim=dim) } if enable_dynamic_schema: tmp.update({f"dynamic_field_{i}": i}) data.append(tmp) payload = { "collectionName": name, "data": data, } rsp = self.vector_client.vector_insert(payload) assert rsp['code'] == 0 assert rsp['data']['insertCount'] == nb c = Collection(name) res = c.query( expr="user_id > 0", limit=1, output_fields=["*"], ) logger.info(f"res: {res}") # query data to make sure the data is inserted rsp = self.vector_client.vector_query({"collectionName": name, "filter": "user_id > 0", "limit": 50}) assert rsp['code'] == 0 assert len(rsp['data']) == 50 @pytest.mark.parametrize("insert_round", [1]) @pytest.mark.parametrize("auto_id", [True]) @pytest.mark.parametrize("is_partition_key", [True]) @pytest.mark.parametrize("enable_dynamic_schema", [True]) @pytest.mark.parametrize("nb", [3000]) @pytest.mark.parametrize("dim", [128]) @pytest.mark.parametrize("pass_fp32_to_fp16_or_bf16", [True, False]) def test_insert_entities_with_all_vector_datatype_0(self, nb, dim, insert_round, auto_id, is_partition_key, enable_dynamic_schema, pass_fp32_to_fp16_or_bf16): """ Insert a vector with a simple payload """ # create a collection name = gen_collection_name() payload = { "collectionName": name, "schema": { "autoId": auto_id, "enableDynamicField": enable_dynamic_schema, "fields": [ {"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}}, {"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": is_partition_key, "elementTypeParams": {}}, {"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}}, {"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}}, {"fieldName": "book_vector", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}}, {"fieldName": "float_vector", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}}, {"fieldName": "float16_vector", "dataType": "Float16Vector", "elementTypeParams": {"dim": f"{dim}"}}, {"fieldName": "bfloat16_vector", "dataType": "BFloat16Vector", "elementTypeParams": {"dim": f"{dim}"}}, ] }, "indexParams": [ {"fieldName": "book_vector", "indexName": 
"book_vector", "metricType": "L2", "params": {"index_type": "FLAT"}}, {"fieldName": "float_vector", "indexName": "float_vector", "metricType": "L2", "params": {"index_type": "IVF_FLAT", "nlist": 128}}, {"fieldName": "float16_vector", "indexName": "float16_vector", "metricType": "L2", "params": {"index_type": "IVF_SQ8", "nlist": "128"}}, {"fieldName": "bfloat16_vector", "indexName": "bfloat16_vector", "metricType": "L2", "params": {"index_type": "IVF_PQ", "nlist": 128, "m": 16, "nbits": 8}}, ] } rsp = self.collection_client.collection_create(payload) assert rsp['code'] == 0 rsp = self.collection_client.collection_describe(name) logger.info(f"rsp: {rsp}") assert rsp['code'] == 0 # insert data for i in range(insert_round): data = [] for i in range(nb): if auto_id: tmp = { "user_id": i, "word_count": i, "book_describe": f"book_{i}", "book_vector": gen_vector(datatype="FloatVector", dim=dim), "float_vector": gen_vector(datatype="FloatVector", dim=dim), "float16_vector": ( gen_vector(datatype="FloatVector", dim=dim) if pass_fp32_to_fp16_or_bf16 else gen_vector(datatype="Float16Vector", dim=dim) ), "bfloat16_vector": ( gen_vector(datatype="FloatVector", dim=dim) if pass_fp32_to_fp16_or_bf16 else gen_vector(datatype="BFloat16Vector", dim=dim) ), } else: tmp = { "book_id": i, "user_id": i, "word_count": i, "book_describe": f"book_{i}", "book_vector": gen_vector(datatype="FloatVector", dim=dim), "float_vector": gen_vector(datatype="FloatVector", dim=dim), "float16_vector": ( gen_vector(datatype="FloatVector", dim=dim) if pass_fp32_to_fp16_or_bf16 else gen_vector(datatype="Float16Vector", dim=dim) ), "bfloat16_vector": ( gen_vector(datatype="FloatVector", dim=dim) if pass_fp32_to_fp16_or_bf16 else gen_vector(datatype="BFloat16Vector", dim=dim) ), } if enable_dynamic_schema: tmp.update({f"dynamic_field_{i}": i}) data.append(tmp) payload = { "collectionName": name, "data": data, } rsp = self.vector_client.vector_insert(payload) assert rsp['code'] == 0 assert rsp['data']['insertCount'] == nb c = Collection(name) res = c.query( expr="user_id > 0", limit=1, output_fields=["*"], ) logger.info(f"res: {res}") # query data to make sure the data is inserted rsp = self.vector_client.vector_query({"collectionName": name, "filter": "user_id > 0", "limit": 50}) assert rsp['code'] == 0 assert len(rsp['data']) == 50 @pytest.mark.parametrize("insert_round", [1]) @pytest.mark.parametrize("auto_id", [True]) @pytest.mark.parametrize("is_partition_key", [True]) @pytest.mark.parametrize("enable_dynamic_schema", [True]) @pytest.mark.parametrize("nb", [3000]) @pytest.mark.parametrize("dim", [128]) @pytest.mark.parametrize("pass_fp32_to_fp16_or_bf16", [True, False]) def test_insert_entities_with_all_vector_datatype_1(self, nb, dim, insert_round, auto_id, is_partition_key, enable_dynamic_schema, pass_fp32_to_fp16_or_bf16): """ Insert a vector with a simple payload """ # create a collection name = gen_collection_name() payload = { "collectionName": name, "schema": { "autoId": auto_id, "enableDynamicField": enable_dynamic_schema, "fields": [ {"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}}, {"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": is_partition_key, "elementTypeParams": {}}, {"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}}, {"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}}, {"fieldName": "float_vector", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}}, {"fieldName": 
"float16_vector", "dataType": "Float16Vector", "elementTypeParams": {"dim": f"{dim}"}}, {"fieldName": "bfloat16_vector", "dataType": "BFloat16Vector", "elementTypeParams": {"dim": f"{dim}"}}, ] }, "indexParams": [ {"fieldName": "float_vector", "indexName": "float_vector", "metricType": "L2", "params": {"index_type": "HNSW", "M": 32, "efConstruction": 360}}, {"fieldName": "float16_vector", "indexName": "float16_vector", "metricType": "L2", "params": {"index_type": "SCANN", "nlist": "128"}}, {"fieldName": "bfloat16_vector", "indexName": "bfloat16_vector", "metricType": "L2", "params": {"index_type": "DISKANN"}}, ] } rsp = self.collection_client.collection_create(payload) assert rsp['code'] == 0 rsp = self.collection_client.collection_describe(name) logger.info(f"rsp: {rsp}") assert rsp['code'] == 0 # insert data for i in range(insert_round): data = [] for i in range(nb): if auto_id: tmp = { "user_id": i, "word_count": i, "book_describe": f"book_{i}", "float_vector": gen_vector(datatype="FloatVector", dim=dim), "float16_vector": ( gen_vector(datatype="FloatVector", dim=dim) if pass_fp32_to_fp16_or_bf16 else gen_vector(datatype="Float16Vector", dim=dim) ), "bfloat16_vector": ( gen_vector(datatype="FloatVector", dim=dim) if pass_fp32_to_fp16_or_bf16 else gen_vector(datatype="BFloat16Vector", dim=dim) ), } else: tmp = { "book_id": i, "user_id": i, "word_count": i, "book_describe": f"book_{i}", "float_vector": gen_vector(datatype="FloatVector", dim=dim), "float16_vector": ( gen_vector(datatype="FloatVector", dim=dim) if pass_fp32_to_fp16_or_bf16 else gen_vector(datatype="Float16Vector", dim=dim) ), "bfloat16_vector": ( gen_vector(datatype="FloatVector", dim=dim) if pass_fp32_to_fp16_or_bf16 else gen_vector(datatype="BFloat16Vector", dim=dim) ), } if enable_dynamic_schema: tmp.update({f"dynamic_field_{i}": i}) data.append(tmp) payload = { "collectionName": name, "data": data, } rsp = self.vector_client.vector_insert(payload) assert rsp['code'] == 0 assert rsp['data']['insertCount'] == nb c = Collection(name) res = c.query( expr="user_id > 0", limit=1, output_fields=["*"], ) logger.info(f"res: {res}") # query data to make sure the data is inserted rsp = self.vector_client.vector_query({"collectionName": name, "filter": "user_id > 0", "limit": 50}) assert rsp['code'] == 0 assert len(rsp['data']) == 50 @pytest.mark.parametrize("insert_round", [1]) @pytest.mark.parametrize("auto_id", [True]) @pytest.mark.parametrize("is_partition_key", [True]) @pytest.mark.parametrize("enable_dynamic_schema", [True]) @pytest.mark.parametrize("nb", [3000]) @pytest.mark.parametrize("dim", [128]) def test_insert_entities_with_all_vector_datatype_2(self, nb, dim, insert_round, auto_id, is_partition_key, enable_dynamic_schema): """ Insert a vector with a simple payload """ # create a collection name = gen_collection_name() payload = { "collectionName": name, "schema": { "autoId": auto_id, "enableDynamicField": enable_dynamic_schema, "fields": [ {"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}}, {"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": is_partition_key, "elementTypeParams": {}}, {"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}}, {"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}}, {"fieldName": "binary_vector_0", "dataType": "BinaryVector", "elementTypeParams": {"dim": f"{dim}"}}, {"fieldName": "binary_vector_1", "dataType": "BinaryVector", "elementTypeParams": {"dim": f"{dim}"}}, 
{"fieldName": "sparse_float_vector_0", "dataType": "SparseFloatVector"}, {"fieldName": "sparse_float_vector_1", "dataType": "SparseFloatVector"}, ] }, "indexParams": [ {"fieldName": "binary_vector_0", "indexName": "binary_vector_0_index", "metricType": "HAMMING", "params": {"index_type": "BIN_FLAT"}}, {"fieldName": "binary_vector_1", "indexName": "binary_vector_1_index", "metricType": "HAMMING", "params": {"index_type": "BIN_IVF_FLAT", "nlist": "512"}}, {"fieldName": "sparse_float_vector_0", "indexName": "sparse_float_vector_0_index", "metricType": "IP", "params": {"index_type": "SPARSE_INVERTED_INDEX", "drop_ratio_build": "0.2"}}, {"fieldName": "sparse_float_vector_1", "indexName": "sparse_float_vector_1_index", "metricType": "IP", "params": {"index_type": "SPARSE_WAND", "drop_ratio_build": "0.2"}} ] } rsp = self.collection_client.collection_create(payload) assert rsp['code'] == 0 rsp = self.collection_client.collection_describe(name) logger.info(f"rsp: {rsp}") assert rsp['code'] == 0 # insert data for i in range(insert_round): data = [] for i in range(nb): if auto_id: tmp = { "user_id": i, "word_count": i, "book_describe": f"book_{i}", "binary_vector_0": gen_vector(datatype="BinaryVector", dim=dim), "binary_vector_1": gen_vector(datatype="BinaryVector", dim=dim), "sparse_float_vector_0": gen_vector(datatype="SparseFloatVector", dim=dim, sparse_format="dok"), "sparse_float_vector_1": gen_vector(datatype="SparseFloatVector", dim=dim, sparse_format="dok"), } else: tmp = { "book_id": i, "user_id": i, "word_count": i, "book_describe": f"book_{i}", "binary_vector_0": gen_vector(datatype="BinaryVector", dim=dim), "binary_vector_1": gen_vector(datatype="BinaryVector", dim=dim), "sparse_float_vector_0": gen_vector(datatype="SparseFloatVector", dim=dim, sparse_format="dok"), "sparse_float_vector_1": gen_vector(datatype="SparseFloatVector", dim=dim, sparse_format="dok"), } if enable_dynamic_schema: tmp.update({f"dynamic_field_{i}": i}) data.append(tmp) payload = { "collectionName": name, "data": data, } rsp = self.vector_client.vector_insert(payload) assert rsp['code'] == 0 assert rsp['data']['insertCount'] == nb c = Collection(name) res = c.query( expr="user_id > 0", limit=1, output_fields=["*"], ) logger.info(f"res: {res}") # query data to make sure the data is inserted rsp = self.vector_client.vector_query({"collectionName": name, "filter": "user_id > 0", "limit": 50}) assert rsp['code'] == 0 assert len(rsp['data']) == 50 @pytest.mark.parametrize("insert_round", [1]) @pytest.mark.parametrize("auto_id", [True, False]) @pytest.mark.parametrize("is_partition_key", [True, False]) @pytest.mark.parametrize("enable_dynamic_schema", [True, False]) @pytest.mark.parametrize("nb", [3000]) @pytest.mark.parametrize("dim", [128]) def test_insert_entities_with_all_json_datatype(self, nb, dim, insert_round, auto_id, is_partition_key, enable_dynamic_schema): """ Insert a vector with a simple payload """ # create a collection name = gen_collection_name() payload = { "collectionName": name, "schema": { "autoId": auto_id, "enableDynamicField": enable_dynamic_schema, "fields": [ {"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}}, {"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": is_partition_key, "elementTypeParams": {}}, {"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}}, {"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}}, {"fieldName": "bool", "dataType": "Bool", "elementTypeParams": {}}, 
{"fieldName": "json", "dataType": "JSON", "elementTypeParams": {}}, {"fieldName": "int_array", "dataType": "Array", "elementDataType": "Int64", "elementTypeParams": {"max_capacity": "1024"}}, {"fieldName": "varchar_array", "dataType": "Array", "elementDataType": "VarChar", "elementTypeParams": {"max_capacity": "1024", "max_length": "256"}}, {"fieldName": "bool_array", "dataType": "Array", "elementDataType": "Bool", "elementTypeParams": {"max_capacity": "1024"}}, {"fieldName": "text_emb", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}}, {"fieldName": "image_emb", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}}, ] }, "indexParams": [ {"fieldName": "text_emb", "indexName": "text_emb", "metricType": "L2"}, {"fieldName": "image_emb", "indexName": "image_emb", "metricType": "L2"} ] } rsp = self.collection_client.collection_create(payload) assert rsp['code'] == 0 rsp = self.collection_client.collection_describe(name) logger.info(f"rsp: {rsp}") assert rsp['code'] == 0 json_value = [ 1, 1.0, "1", [1, 2, 3], ["1", "2", "3"], [1, 2, "3"], {"key": "value"}, ] # insert data for i in range(insert_round): data = [] for i in range(nb): if auto_id: tmp = { "user_id": i, "word_count": i, "book_describe": f"book_{i}", "bool": random.choice([True, False]), "json": json_value[i%len(json_value)], "int_array": [i], "varchar_array": [f"varchar_{i}"], "bool_array": [random.choice([True, False])], "text_emb": preprocessing.normalize([np.array([random.random() for _ in range(dim)])])[ 0].tolist(), "image_emb": preprocessing.normalize([np.array([random.random() for _ in range(dim)])])[ 0].tolist(), } else: tmp = { "book_id": i, "user_id": i, "word_count": i, "book_describe": f"book_{i}", "bool": random.choice([True, False]), "json": json_value[i%len(json_value)], "int_array": [i], "varchar_array": [f"varchar_{i}"], "bool_array": [random.choice([True, False])], "text_emb": preprocessing.normalize([np.array([random.random() for _ in range(dim)])])[ 0].tolist(), "image_emb": preprocessing.normalize([np.array([random.random() for _ in range(dim)])])[ 0].tolist(), } if enable_dynamic_schema: tmp.update({f"dynamic_field_{i}": i}) data.append(tmp) payload = { "collectionName": name, "data": data, } rsp = self.vector_client.vector_insert(payload) assert rsp['code'] == 0 assert rsp['data']['insertCount'] == nb # query data to make sure the data is inserted rsp = self.vector_client.vector_query({"collectionName": name, "filter": "user_id > 0", "limit": 50}) assert rsp['code'] == 0 assert len(rsp['data']) == 50 @pytest.mark.parametrize("insert_round", [1]) @pytest.mark.parametrize("auto_id", [True, False]) @pytest.mark.parametrize("is_partition_key", [True, False]) @pytest.mark.parametrize("enable_dynamic_schema", [True, False]) @pytest.mark.parametrize("nb", [3000]) @pytest.mark.parametrize("dim", [128]) def test_insert_entities_with_default_none(self, nb, dim, insert_round, auto_id, is_partition_key, enable_dynamic_schema): """ Insert a vector with defaultValue and none """ # create a collection name = gen_collection_name() payload = { "collectionName": name, "schema": { "autoId": auto_id, "enableDynamicField": enable_dynamic_schema, "fields": [ {"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}}, {"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": is_partition_key, "elementTypeParams": {}, "defaultValue": 10}, {"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}, "nullable": True}, {"fieldName": "book_describe", 
"dataType": "VarChar", "elementTypeParams": {"max_length": "256"}, "defaultValue": "default", "nullable": True}, {"fieldName": "json", "dataType": "JSON", "elementTypeParams": {}, "nullable": True}, {"fieldName": "varchar_array", "dataType": "Array", "elementDataType": "VarChar", "elementTypeParams": {"max_capacity": "1024", "max_length": "256"}, "nullable": True}, {"fieldName": "text_emb", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}}, ] }, "indexParams": [ {"fieldName": "text_emb", "indexName": "text_emb", "metricType": "L2"}, ] } rsp = self.collection_client.collection_create(payload) assert rsp['code'] == 0 rsp = self.collection_client.collection_describe(name) logger.info(f"rsp: {rsp}") assert rsp['code'] == 0 # insert data for k in range(insert_round): data = [] for i in range(nb): if auto_id: tmp = { "user_id": None, "word_count": None, "book_describe": None, "json": None, "varchar_array": None, "text_emb": preprocessing.normalize([np.array([random.random() for _ in range(dim)])])[0].tolist(), } else: tmp = { "book_id": i, "user_id": None, "word_count": None, "book_describe": None, "json": None, "varchar_array": None, "text_emb": preprocessing.normalize([np.array([random.random() for _ in range(dim)])])[0].tolist(), } if enable_dynamic_schema: tmp.update({f"dynamic_field_{i}": i}) data.append(tmp) payload = { "collectionName": name, "data": data, } rsp = self.vector_client.vector_insert(payload) assert rsp['code'] == 0 assert rsp['data']['insertCount'] == nb # query data to make sure the data is inserted rsp = self.vector_client.vector_query({"collectionName": name, "filter": "user_id > 0", "limit": 5}) assert rsp['code'] == 0 assert len(rsp['data']) == 5 assert rsp['data'][0]['book_describe'] == 'default' assert rsp['data'][0]['word_count'] is None assert rsp['data'][0]['json'] is None @pytest.mark.L0 class TestInsertVectorNegative(TestBase): def test_insert_vector_with_invalid_collection_name(self): """ Insert a vector with an invalid collection name """ # create a collection name = gen_collection_name() dim = 128 payload = { "collectionName": name, "dimension": dim, } rsp = self.collection_client.collection_create(payload) assert rsp['code'] == 0 rsp = self.collection_client.collection_describe(name) assert rsp['code'] == 0 # insert data nb = 100 data = get_data_by_payload(payload, nb) payload = { "collectionName": "invalid_collection_name", "data": data, } body_size = sys.getsizeof(json.dumps(payload)) logger.info(f"body size: {body_size / 1024 / 1024} MB") rsp = self.vector_client.vector_insert(payload) assert rsp['code'] == 100 assert "can't find collection" in rsp['message'] def test_insert_vector_with_invalid_database_name(self): """ Insert a vector with an invalid database name """ # create a collection name = gen_collection_name() dim = 128 payload = { "collectionName": name, "dimension": dim, } rsp = self.collection_client.collection_create(payload) assert rsp['code'] == 0 rsp = self.collection_client.collection_describe(name) assert rsp['code'] == 0 # insert data nb = 10 data = get_data_by_payload(payload, nb) payload = { "collectionName": name, "data": data, } body_size = sys.getsizeof(json.dumps(payload)) logger.info(f"body size: {body_size / 1024 / 1024} MB") success = False rsp = self.vector_client.vector_insert(payload, db_name="invalid_database") assert rsp['code'] == 800 def test_insert_vector_with_mismatch_dim(self): """ Insert a vector with mismatch dim """ # create a collection name = gen_collection_name() dim = 32 payload = { 
"collectionName": name, "dimension": dim, } rsp = self.collection_client.collection_create(payload) assert rsp['code'] == 0 rsp = self.collection_client.collection_describe(name) assert rsp['code'] == 0 # insert data nb = 1 data = [ {"id": i, "vector": [np.float64(random.random()) for _ in range(dim + 1)], } for i in range(nb) ] payload = { "collectionName": name, "data": data, } body_size = sys.getsizeof(json.dumps(payload)) logger.info(f"body size: {body_size / 1024 / 1024} MB") rsp = self.vector_client.vector_insert(payload) assert rsp['code'] == 1804 assert "fail to deal the insert data" in rsp['message'] def test_insert_entities_with_none_no_nullable_field(self): """ Insert a vector with none no nullable field """ # create a collection name = gen_collection_name() payload = { "collectionName": name, "schema": { "autoId": True, "fields": [ {"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}}, {"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}}, {"fieldName": "text_emb", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{128}"}}, ] } } rsp = self.collection_client.collection_create(payload) assert rsp['code'] == 0 rsp = self.collection_client.collection_describe(name) logger.info(f"rsp: {rsp}") assert rsp['code'] == 0 # insert data data = [] for i in range(10): tmp = { "word_count": i if i % 2 else None, "text_emb": preprocessing.normalize([np.array([random.random() for _ in range(128)])])[0].tolist(), } data.append(tmp) payload = { "collectionName": name, "data": data, } rsp = self.vector_client.vector_insert(payload) assert rsp['code'] == 1804 assert "fail to deal the insert data" in rsp['message'] @pytest.mark.L0 class TestUpsertVector(TestBase): @pytest.mark.parametrize("insert_round", [2]) @pytest.mark.parametrize("nb", [3000]) @pytest.mark.parametrize("dim", [128]) @pytest.mark.parametrize("id_type", ["Int64", "VarChar"]) def test_upsert_vector_default(self, nb, dim, insert_round, id_type): # create a collection name = gen_collection_name() payload = { "collectionName": name, "schema": { "fields": [ {"fieldName": "book_id", "dataType": f"{id_type}", "isPrimary": True, "elementTypeParams": {"max_length": "256"}}, {"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": True, "elementTypeParams": {}}, {"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}}, {"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}}, {"fieldName": "text_emb", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}} ] }, "indexParams": [{"fieldName": "text_emb", "indexName": "text_emb_index", "metricType": "L2"}] } rsp = self.collection_client.collection_create(payload) assert rsp['code'] == 0 rsp = self.collection_client.collection_describe(name) logger.info(f"rsp: {rsp}") assert rsp['code'] == 0 # insert data for i in range(insert_round): data = [] for j in range(nb): tmp = { "book_id": i * nb + j if id_type == "Int64" else f"{i * nb + j}", "user_id": i * nb + j, "word_count": i * nb + j, "book_describe": f"book_{i * nb + j}", "text_emb": preprocessing.normalize([np.array([random.random() for i in range(dim)])])[0].tolist() } data.append(tmp) payload = { "collectionName": name, "data": data, } body_size = sys.getsizeof(json.dumps(payload)) logger.info(f"body size: {body_size / 1024 / 1024} MB") rsp = self.vector_client.vector_insert(payload) assert rsp['code'] == 0 assert rsp['data']['insertCount'] == nb c = Collection(name) c.flush() # upsert data for i in 
        # upsert data
        for i in range(insert_round):
            data = []
            for j in range(nb):
                tmp = {
                    "book_id": i * nb + j if id_type == "Int64" else f"{i * nb + j}",
                    "user_id": i * nb + j + 1,
                    "word_count": i * nb + j + 2,
                    "book_describe": f"book_{i * nb + j + 3}",
                    "text_emb": preprocessing.normalize([np.array([random.random() for i in range(dim)])])[0].tolist()
                }
                data.append(tmp)
            payload = {
                "collectionName": name,
                "data": data,
            }
            body_size = sys.getsizeof(json.dumps(payload))
            logger.info(f"body size: {body_size / 1024 / 1024} MB")
            rsp = self.vector_client.vector_upsert(payload)
        # query data to make sure the data is updated
        if id_type == "Int64":
            rsp = self.vector_client.vector_query({"collectionName": name, "filter": "book_id > 0"})
        if id_type == "VarChar":
            rsp = self.vector_client.vector_query({"collectionName": name, "filter": "book_id > '0'"})
        for data in rsp['data']:
            assert data['user_id'] == int(data['book_id']) + 1
            assert data['word_count'] == int(data['book_id']) + 2
            assert data['book_describe'] == f"book_{int(data['book_id']) + 3}"
        res = utility.get_query_segment_info(name)
        logger.info(f"res: {res}")

    @pytest.mark.parametrize("insert_round", [2])
    @pytest.mark.parametrize("nb", [3000])
    @pytest.mark.parametrize("dim", [128])
    @pytest.mark.parametrize("id_type", ["Int64", "VarChar"])
    @pytest.mark.xfail(reason="currently not support auto_id for upsert")
    def test_upsert_vector_pk_auto_id(self, nb, dim, insert_round, id_type):
        # create a collection
        name = gen_collection_name()
        payload = {
            "collectionName": name,
            "schema": {
                "autoId": True,
                "fields": [
                    {"fieldName": "book_id", "dataType": f"{id_type}", "isPrimary": True,
                     "elementTypeParams": {"max_length": "256"}},
                    {"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": True, "elementTypeParams": {}},
                    {"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
                    {"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
                    {"fieldName": "text_emb", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}}
                ]
            },
            "indexParams": [{"fieldName": "text_emb", "indexName": "text_emb_index", "metricType": "L2"}]
        }
        rsp = self.collection_client.collection_create(payload)
        assert rsp['code'] == 0
        rsp = self.collection_client.collection_describe(name)
        logger.info(f"rsp: {rsp}")
        assert rsp['code'] == 0
        ids = []
        # insert data
        for i in range(insert_round):
            data = []
            for j in range(nb):
                tmp = {
                    "book_id": i * nb + j if id_type == "Int64" else f"{i * nb + j}",
                    "user_id": i * nb + j,
                    "word_count": i * nb + j,
                    "book_describe": f"book_{i * nb + j}",
                    "text_emb": preprocessing.normalize([np.array([random.random() for i in range(dim)])])[0].tolist()
                }
                data.append(tmp)
            payload = {
                "collectionName": name,
                "data": data,
            }
            body_size = sys.getsizeof(json.dumps(payload))
            logger.info(f"body size: {body_size / 1024 / 1024} MB")
            rsp = self.vector_client.vector_insert(payload)
            assert rsp['code'] == 0
            assert rsp['data']['insertCount'] == nb
            ids.extend(rsp['data']['insertIds'])
        c = Collection(name)
        c.flush()
        # upsert data
        for i in range(insert_round):
            data = []
            for j in range(nb):
                tmp = {
                    "book_id": ids[i * nb + j],
                    "user_id": i * nb + j + 1,
                    "word_count": i * nb + j + 2,
                    "book_describe": f"book_{i * nb + j + 3}",
                    "text_emb": preprocessing.normalize([np.array([random.random() for i in range(dim)])])[0].tolist()
                }
                data.append(tmp)
            payload = {
                "collectionName": name,
                "data": data,
            }
            body_size = sys.getsizeof(json.dumps(payload))
            logger.info(f"body size: {body_size / 1024 / 1024} MB")
            rsp = self.vector_client.vector_upsert(payload)
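        # NOTE: with autoId enabled, the upsert above targets the server-generated
        # primary keys collected from insertIds rather than client-side values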
        # query data to make sure the data is updated
        if id_type == "Int64":
            rsp = self.vector_client.vector_query({"collectionName": name, "filter": "book_id > 0"})
        if id_type == "VarChar":
            rsp = self.vector_client.vector_query({"collectionName": name, "filter": "book_id > '0'"})
        for data in rsp['data']:
            assert data['user_id'] == int(data['book_id']) + 1
            assert data['word_count'] == int(data['book_id']) + 2
            assert data['book_describe'] == f"book_{int(data['book_id']) + 3}"
        res = utility.get_query_segment_info(name)
        logger.info(f"res: {res}")

    @pytest.mark.parametrize("insert_round", [2])
    @pytest.mark.parametrize("nb", [3000])
    @pytest.mark.parametrize("dim", [128])
    @pytest.mark.parametrize("id_type", ["Int64", "VarChar"])
    def test_upsert_vector_with_default_none(self, nb, dim, insert_round, id_type):
        # create a collection
        name = gen_collection_name()
        payload = {
            "collectionName": name,
            "schema": {
                "fields": [
                    {"fieldName": "book_id", "dataType": f"{id_type}", "isPrimary": True,
                     "elementTypeParams": {"max_length": "256"}},
                    {"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": True, "elementTypeParams": {}},
                    {"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}, "defaultValue": 123},
                    {"fieldName": "book_describe", "dataType": "VarChar",
                     "elementTypeParams": {"max_length": "256"}, "nullable": True},
                    {"fieldName": "text_emb", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}}
                ]
            },
            "indexParams": [{"fieldName": "text_emb", "indexName": "text_emb_index", "metricType": "L2"}]
        }
        rsp = self.collection_client.collection_create(payload)
        assert rsp['code'] == 0
        rsp = self.collection_client.collection_describe(name)
        logger.info(f"rsp: {rsp}")
        assert rsp['code'] == 0
        # insert data
        for i in range(insert_round):
            data = []
            for j in range(nb):
                tmp = {
                    "book_id": i * nb + j if id_type == "Int64" else f"{i * nb + j}",
                    "user_id": i * nb + j,
                    "word_count": i * nb + j,
                    "book_describe": f"book_{i * nb + j}",
                    "text_emb": preprocessing.normalize([np.array([random.random() for i in range(dim)])])[0].tolist()
                }
                data.append(tmp)
            payload = {
                "collectionName": name,
                "data": data,
            }
            body_size = sys.getsizeof(json.dumps(payload))
            logger.info(f"body size: {body_size / 1024 / 1024} MB")
            rsp = self.vector_client.vector_insert(payload)
            assert rsp['code'] == 0
            assert rsp['data']['insertCount'] == nb
        c = Collection(name)
        c.flush()
        # upsert data
        for i in range(insert_round):
            data = []
            for j in range(nb):
                tmp = {
                    "book_id": i * nb + j if id_type == "Int64" else f"{i * nb + j}",
                    "user_id": i * nb + j + 1,
                    "word_count": None,
                    "book_describe": None,
                    "text_emb": preprocessing.normalize([np.array([random.random() for i in range(dim)])])[0].tolist()
                }
                data.append(tmp)
            payload = {
                "collectionName": name,
                "data": data,
            }
            body_size = sys.getsizeof(json.dumps(payload))
            logger.info(f"body size: {body_size / 1024 / 1024} MB")
            rsp = self.vector_client.vector_upsert(payload)
        # query data to make sure the data is updated
        if id_type == "Int64":
            rsp = self.vector_client.vector_query({"collectionName": name, "filter": "book_id > 0"})
        if id_type == "VarChar":
            rsp = self.vector_client.vector_query({"collectionName": name, "filter": "book_id > '0'"})
        for data in rsp['data']:
            assert data['user_id'] == int(data['book_id']) + 1
            assert data['word_count'] == 123
            assert data['book_describe'] is None


@pytest.mark.L0
class TestUpsertVectorNegative(TestBase):

    def test_upsert_vector_with_invalid_collection_name(self):
        """
        upsert a vector with an invalid collection name
        """
        # create a collection
        name = gen_collection_name()
        dim = 128
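        # a missing collection is reported with code 100 and a
        # "can't find collection" message, asserted below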
"collectionName": name, "dimension": dim, } rsp = self.collection_client.collection_create(payload) assert rsp['code'] == 0 rsp = self.collection_client.collection_describe(name) assert rsp['code'] == 0 # insert data nb = 100 data = get_data_by_payload(payload, nb) payload = { "collectionName": "invalid_collection_name", "data": data, } body_size = sys.getsizeof(json.dumps(payload)) logger.info(f"body size: {body_size / 1024 / 1024} MB") rsp = self.vector_client.vector_upsert(payload) assert rsp['code'] == 100 assert "can't find collection" in rsp['message'] def test_upsert_entities_with_none_no_nullable_field(self): """ Insert a vector with none no nullable field """ # create a collection name = gen_collection_name() payload = { "collectionName": name, "schema": { "autoId": True, "fields": [ {"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}}, {"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}}, {"fieldName": "text_emb", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{128}"}}, ] } } rsp = self.collection_client.collection_create(payload) assert rsp['code'] == 0 rsp = self.collection_client.collection_describe(name) logger.info(f"rsp: {rsp}") assert rsp['code'] == 0 # insert data data = [] for i in range(10): tmp = { "word_count": i if i % 2 else None, "text_emb": preprocessing.normalize([np.array([random.random() for _ in range(128)])])[0].tolist(), } data.append(tmp) payload = { "collectionName": name, "data": data, } rsp = self.vector_client.vector_upsert(payload) assert rsp['code'] == 1804 assert "fail to deal the insert data" in rsp['message'] @pytest.mark.L0 class TestSearchVector(TestBase): @pytest.mark.parametrize("insert_round", [1]) @pytest.mark.parametrize("auto_id", [True]) @pytest.mark.parametrize("is_partition_key", [True]) @pytest.mark.parametrize("enable_dynamic_schema", [True]) @pytest.mark.parametrize("nb", [3000]) @pytest.mark.parametrize("dim", [16]) @pytest.mark.parametrize("pass_fp32_to_fp16_or_bf16", [True, False]) def test_search_vector_with_all_vector_datatype(self, nb, dim, insert_round, auto_id, is_partition_key, enable_dynamic_schema, pass_fp32_to_fp16_or_bf16): """ Insert a vector with a simple payload """ # create a collection name = gen_collection_name() payload = { "collectionName": name, "schema": { "autoId": auto_id, "enableDynamicField": enable_dynamic_schema, "fields": [ {"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}}, {"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": is_partition_key, "elementTypeParams": {}}, {"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}}, {"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}}, {"fieldName": "float_vector", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}}, {"fieldName": "float16_vector", "dataType": "Float16Vector", "elementTypeParams": {"dim": f"{dim}"}}, {"fieldName": "bfloat16_vector", "dataType": "BFloat16Vector", "elementTypeParams": {"dim": f"{dim}"}}, {"fieldName": "binary_vector", "dataType": "BinaryVector", "elementTypeParams": {"dim": f"{dim}"}}, ] }, "indexParams": [ {"fieldName": "float_vector", "indexName": "float_vector", "metricType": "COSINE"}, {"fieldName": "float16_vector", "indexName": "float16_vector", "metricType": "COSINE"}, {"fieldName": "bfloat16_vector", "indexName": "bfloat16_vector", "metricType": "COSINE"}, {"fieldName": "binary_vector", "indexName": "binary_vector", "metricType": 
"HAMMING", "params": {"index_type": "BIN_IVF_FLAT", "nlist": "512"}} ] } rsp = self.collection_client.collection_create(payload) assert rsp['code'] == 0 rsp = self.collection_client.collection_describe(name) logger.info(f"rsp: {rsp}") assert rsp['code'] == 0 # insert data for i in range(insert_round): data = [] for i in range(nb): if auto_id: tmp = { "user_id": i%10, "word_count": i, "book_describe": f"book_{i}", "float_vector": gen_vector(datatype="FloatVector", dim=dim), "float16_vector": ( gen_vector(datatype="FloatVector", dim=dim) if pass_fp32_to_fp16_or_bf16 else gen_vector(datatype="Float16Vector", dim=dim) ), "bfloat16_vector": ( gen_vector(datatype="FloatVector", dim=dim) if pass_fp32_to_fp16_or_bf16 else gen_vector(datatype="BFloat16Vector", dim=dim) ), "binary_vector": gen_vector(datatype="BinaryVector", dim=dim) } else: tmp = { "book_id": i, "user_id": i%10, "word_count": i, "book_describe": f"book_{i}", "float_vector": gen_vector(datatype="FloatVector", dim=dim), "float16_vector": ( gen_vector(datatype="FloatVector", dim=dim) if pass_fp32_to_fp16_or_bf16 else gen_vector(datatype="Float16Vector", dim=dim) ), "bfloat16_vector": ( gen_vector(datatype="FloatVector", dim=dim) if pass_fp32_to_fp16_or_bf16 else gen_vector(datatype="BFloat16Vector", dim=dim) ), "binary_vector": gen_vector(datatype="BinaryVector", dim=dim) } if enable_dynamic_schema: tmp.update({f"dynamic_field_{i}": i}) data.append(tmp) payload = { "collectionName": name, "data": data, } rsp = self.vector_client.vector_insert(payload) assert rsp['code'] == 0 assert rsp['data']['insertCount'] == nb # search data payload = { "collectionName": name, "data": [gen_vector(datatype="FloatVector", dim=dim)], "annsField": "float_vector", "filter": "word_count > 100", "groupingField": "user_id", "outputFields": ["*"], "limit": 100 } rsp = self.vector_client.vector_search(payload) assert rsp['code'] == 0 # assert no dup user_id user_ids = [r["user_id"]for r in rsp['data']] assert len(user_ids) == len(set(user_ids)) @pytest.mark.parametrize("insert_round", [1]) @pytest.mark.parametrize("auto_id", [True]) @pytest.mark.parametrize("is_partition_key", [True]) @pytest.mark.parametrize("enable_dynamic_schema", [True]) @pytest.mark.parametrize("nb", [3000]) @pytest.mark.parametrize("dim", [128]) @pytest.mark.parametrize("nq", [1, 2]) @pytest.mark.parametrize("metric_type", ['COSINE', "L2", "IP"]) def test_search_vector_with_float_vector_datatype(self, nb, dim, insert_round, auto_id, is_partition_key, enable_dynamic_schema, nq, metric_type): """ Insert a vector with a simple payload """ # create a collection name = gen_collection_name() payload = { "collectionName": name, "schema": { "autoId": auto_id, "enableDynamicField": enable_dynamic_schema, "fields": [ {"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}}, {"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": is_partition_key, "elementTypeParams": {}}, {"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}}, {"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}}, {"fieldName": "float_vector", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}}, ] }, "indexParams": [ {"fieldName": "float_vector", "indexName": "float_vector", "metricType": metric_type}, ] } rsp = self.collection_client.collection_create(payload) assert rsp['code'] == 0 rsp = self.collection_client.collection_describe(name) logger.info(f"rsp: {rsp}") assert rsp['code'] == 0 # insert data for i in 
        # insert data
        for i in range(insert_round):
            data = []
            for i in range(nb):
                if auto_id:
                    tmp = {
                        "user_id": i % 100,
                        "word_count": i,
                        "book_describe": f"book_{i}",
                        "float_vector": gen_vector(datatype="FloatVector", dim=dim),
                    }
                else:
                    tmp = {
                        "book_id": i,
                        "user_id": i % 100,
                        "word_count": i,
                        "book_describe": f"book_{i}",
                        "float_vector": gen_vector(datatype="FloatVector", dim=dim),
                    }
                if enable_dynamic_schema:
                    tmp.update({f"dynamic_field_{i}": i})
                data.append(tmp)
            payload = {
                "collectionName": name,
                "data": data,
            }
            rsp = self.vector_client.vector_insert(payload)
            assert rsp['code'] == 0
            assert rsp['data']['insertCount'] == nb
        # search data
        payload = {
            "collectionName": name,
            "data": [gen_vector(datatype="FloatVector", dim=dim) for _ in range(nq)],
            "filter": "word_count > 100",
            "groupingField": "user_id",
            "outputFields": ["*"],
            "limit": 100,
        }
        rsp = self.vector_client.vector_search(payload)
        assert rsp['code'] == 0
        assert len(rsp['data']) == 100 * nq

    @pytest.mark.parametrize("insert_round", [1, 10])
    @pytest.mark.parametrize("auto_id", [True, False])
    @pytest.mark.parametrize("is_partition_key", [True, False])
    @pytest.mark.parametrize("enable_dynamic_schema", [True])
    @pytest.mark.parametrize("nb", [3000])
    @pytest.mark.parametrize("dim", [128])
    @pytest.mark.parametrize("groupingField", ['user_id', None])
    @pytest.mark.parametrize("sparse_format", ['dok', 'coo'])
    def test_search_vector_with_sparse_float_vector_datatype(self, nb, dim, insert_round, auto_id,
                                                             is_partition_key, enable_dynamic_schema,
                                                             groupingField, sparse_format):
        """
        Search over a SparseFloatVector field fed with dok/coo input formats
        """
        # create a collection
        name = gen_collection_name()
        payload = {
            "collectionName": name,
            "schema": {
                "autoId": auto_id,
                "enableDynamicField": enable_dynamic_schema,
                "fields": [
                    {"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
                    {"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": is_partition_key,
                     "elementTypeParams": {}},
                    {"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
                    {"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
                    {"fieldName": "sparse_float_vector", "dataType": "SparseFloatVector"},
                ]
            },
            "indexParams": [
                {"fieldName": "sparse_float_vector", "indexName": "sparse_float_vector", "metricType": "IP",
                 "params": {"index_type": "SPARSE_INVERTED_INDEX", "drop_ratio_build": "0.2"}}
            ]
        }
        rsp = self.collection_client.collection_create(payload)
        assert rsp['code'] == 0
        rsp = self.collection_client.collection_describe(name)
        logger.info(f"rsp: {rsp}")
        assert rsp['code'] == 0
        # insert data
        for i in range(insert_round):
            data = []
            for j in range(nb):
                idx = i * nb + j
                if auto_id:
                    tmp = {
                        "user_id": idx % 100,
                        "word_count": j,
                        "book_describe": f"book_{idx}",
                        "sparse_float_vector": gen_vector(datatype="SparseFloatVector", dim=dim,
                                                          sparse_format=sparse_format),
                    }
                else:
                    tmp = {
                        "book_id": idx,
                        "user_id": idx % 100,
                        "word_count": j,
                        "book_describe": f"book_{idx}",
                        "sparse_float_vector": gen_vector(datatype="SparseFloatVector", dim=dim,
                                                          sparse_format=sparse_format),
                    }
                if enable_dynamic_schema:
                    tmp.update({f"dynamic_field_{i}": i})
                data.append(tmp)
            payload = {
                "collectionName": name,
                "data": data,
            }
            rsp = self.vector_client.vector_insert(payload)
            assert rsp['code'] == 0
            assert rsp['data']['insertCount'] == nb
        # search data
        payload = {
            "collectionName": name,
            "data": [gen_vector(datatype="SparseFloatVector", dim=dim, sparse_format="dok")],
            "filter": "word_count > 100",
            "outputFields": ["*"],
            "searchParams": {
                "metricType": "IP",
                "params": {
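                    # drop_ratio_search prunes the smallest-weight entries of the
                    # query vector at search time; 0.2 ignores roughly the bottom 20%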
"0.2", } }, "limit": 500, } if groupingField: payload["groupingField"] = groupingField rsp = self.vector_client.vector_search(payload) assert rsp['code'] == 0 @pytest.mark.parametrize("insert_round", [1]) @pytest.mark.parametrize("auto_id", [True, False]) @pytest.mark.parametrize("is_partition_key", [True, False]) @pytest.mark.parametrize("enable_dynamic_schema", [True]) @pytest.mark.parametrize("nb", [3000]) @pytest.mark.parametrize("dim", [128]) @pytest.mark.parametrize("groupingField", ['user_id', None]) @pytest.mark.parametrize("tokenizer", ['standard']) def test_search_vector_for_en_full_text_search(self, nb, dim, insert_round, auto_id, is_partition_key, enable_dynamic_schema, groupingField, tokenizer): """ Insert a vector with a simple payload """ # create a collection name = gen_collection_name() payload = { "collectionName": name, "schema": { "autoId": auto_id, "enableDynamicField": enable_dynamic_schema, "fields": [ {"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}}, {"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": is_partition_key, "elementTypeParams": {}}, {"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}}, {"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}}, {"fieldName": "document_content", "dataType": "VarChar", "elementTypeParams": {"max_length": "1000", "enable_analyzer": True, "analyzer_params": { "tokenizer": tokenizer, }, "enable_match": True}}, {"fieldName": "sparse_vector", "dataType": "SparseFloatVector"}, ], "functions": [ { "name": "bm25_fn", "type": "BM25", "inputFieldNames": ["document_content"], "outputFieldNames": ["sparse_vector"], "params": {} } ] }, "indexParams": [ {"fieldName": "sparse_vector", "indexName": "sparse_vector", "metricType": "BM25", "params": {"index_type": "SPARSE_INVERTED_INDEX"}} ] } rsp = self.collection_client.collection_create(payload) assert rsp['code'] == 0 rsp = self.collection_client.collection_describe(name) logger.info(f"rsp: {rsp}") assert rsp['code'] == 0 if tokenizer == 'standard': fake = fake_en elif tokenizer == 'jieba': fake = fake_zh else: raise Exception("Invalid tokenizer") # insert data for i in range(insert_round): data = [] for j in range(nb): idx = i * nb + j if auto_id: tmp = { "user_id": idx%100, "word_count": j, "book_describe": f"book_{idx}", "document_content": fake.text().lower(), } else: tmp = { "book_id": idx, "user_id": idx%100, "word_count": j, "book_describe": f"book_{idx}", "document_content": fake.text().lower(), } if enable_dynamic_schema: tmp.update({f"dynamic_field_{i}": i}) data.append(tmp) payload = { "collectionName": name, "data": data, } rsp = self.vector_client.vector_insert(payload) assert rsp['code'] == 0 assert rsp['data']['insertCount'] == nb assert rsp['code'] == 0 # search data payload = { "collectionName": name, "data": [fake.text().lower() for _ in range(1)], "filter": "word_count > 100", "outputFields": ["*"], "searchParams": { "params": { "drop_ratio_search": "0.2", } }, "limit": 500, } if groupingField: payload["groupingField"] = groupingField rsp = self.vector_client.vector_search(payload) assert rsp['code'] == 0 assert len(rsp['data']) > 0 @pytest.mark.parametrize("insert_round", [1]) @pytest.mark.parametrize("auto_id", [True, False]) @pytest.mark.parametrize("is_partition_key", [True, False]) @pytest.mark.parametrize("enable_dynamic_schema", [True]) @pytest.mark.parametrize("nb", [3000]) @pytest.mark.parametrize("dim", [128]) 
@pytest.mark.parametrize("groupingField", ['user_id', None]) @pytest.mark.parametrize("tokenizer", ['jieba']) @pytest.mark.xfail(reason="issue: https://github.com/milvus-io/milvus/issues/36751") def test_search_vector_for_zh_full_text_search(self, nb, dim, insert_round, auto_id, is_partition_key, enable_dynamic_schema, groupingField, tokenizer): """ Insert a vector with a simple payload """ # create a collection name = gen_collection_name() payload = { "collectionName": name, "schema": { "autoId": auto_id, "enableDynamicField": enable_dynamic_schema, "fields": [ {"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}}, {"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": is_partition_key, "elementTypeParams": {}}, {"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}}, {"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}}, {"fieldName": "document_content", "dataType": "VarChar", "elementTypeParams": {"max_length": "1000", "enable_analyzer": True, "analyzer_params": { "tokenizer": tokenizer, }, "enable_match": True}}, {"fieldName": "sparse_vector", "dataType": "SparseFloatVector"}, ], "functions": [ { "name": "bm25_fn", "type": "BM25", "inputFieldNames": ["document_content"], "outputFieldNames": ["sparse_vector"], "params": {} } ] }, "indexParams": [ {"fieldName": "sparse_vector", "indexName": "sparse_vector", "metricType": "BM25", "params": {"index_type": "SPARSE_INVERTED_INDEX"}} ] } rsp = self.collection_client.collection_create(payload) assert rsp['code'] == 0 rsp = self.collection_client.collection_describe(name) logger.info(f"rsp: {rsp}") assert rsp['code'] == 0 if tokenizer == 'standard': fake = fake_en elif tokenizer == 'jieba': fake = fake_zh else: raise Exception("Invalid tokenizer") # insert data for i in range(insert_round): data = [] for j in range(nb): idx = i * nb + j if auto_id: tmp = { "user_id": idx%100, "word_count": j, "book_describe": f"book_{idx}", "document_content": fake.text().lower(), } else: tmp = { "book_id": idx, "user_id": idx%100, "word_count": j, "book_describe": f"book_{idx}", "document_content": fake.text().lower(), } if enable_dynamic_schema: tmp.update({f"dynamic_field_{i}": i}) data.append(tmp) payload = { "collectionName": name, "data": data, } rsp = self.vector_client.vector_insert(payload) assert rsp['code'] == 0 assert rsp['data']['insertCount'] == nb assert rsp['code'] == 0 # search data payload = { "collectionName": name, "data": [fake.text().lower() for _ in range(2)], "filter": "word_count > 100", "outputFields": ["*"], "searchParams": { "params": { "drop_ratio_search": "0.2", } }, "limit": 500, } if groupingField: payload["groupingField"] = groupingField rsp = self.vector_client.vector_search(payload) assert rsp['code'] == 0 assert len(rsp['data']) > 0 @pytest.mark.parametrize("insert_round", [2]) @pytest.mark.parametrize("auto_id", [True]) @pytest.mark.parametrize("is_partition_key", [True]) @pytest.mark.parametrize("enable_dynamic_schema", [True]) @pytest.mark.parametrize("nb", [3000]) @pytest.mark.parametrize("dim", [128]) @pytest.mark.parametrize("metric_type", ['HAMMING']) def test_search_vector_with_binary_vector_datatype(self, metric_type, nb, dim, insert_round, auto_id, is_partition_key, enable_dynamic_schema): """ Insert a vector with a simple payload """ # create a collection name = gen_collection_name() payload = { "collectionName": name, "schema": { "autoId": auto_id, "enableDynamicField": enable_dynamic_schema, "fields": [ 
{"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}}, {"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": is_partition_key, "elementTypeParams": {}}, {"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}}, {"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}}, {"fieldName": "binary_vector", "dataType": "BinaryVector", "elementTypeParams": {"dim": f"{dim}"}} ] }, "indexParams": [ {"fieldName": "binary_vector", "indexName": "binary_vector", "metricType": metric_type, "params": {"index_type": "BIN_IVF_FLAT", "nlist": "512"}} ] } rsp = self.collection_client.collection_create(payload) assert rsp['code'] == 0 rsp = self.collection_client.collection_describe(name) logger.info(f"rsp: {rsp}") assert rsp['code'] == 0 # insert data for i in range(insert_round): data = [] for i in range(nb): if auto_id: tmp = { "user_id": i%100, "word_count": i, "book_describe": f"book_{i}", "binary_vector": gen_vector(datatype="BinaryVector", dim=dim), } else: tmp = { "book_id": i, "user_id": i%100, "word_count": i, "book_describe": f"book_{i}", "binary_vector": gen_vector(datatype="BinaryVector", dim=dim), } if enable_dynamic_schema: tmp.update({f"dynamic_field_{i}": i}) data.append(tmp) payload = { "collectionName": name, "data": data, } rsp = self.vector_client.vector_insert(payload) assert rsp['code'] == 0 assert rsp['data']['insertCount'] == nb # flush data c = Collection(name) c.flush() time.sleep(5) # wait for index rsp = self.index_client.index_describe(collection_name=name, index_name="binary_vector") # search data payload = { "collectionName": name, "data": [gen_vector(datatype="BinaryVector", dim=dim)], "filter": "word_count > 100", "outputFields": ["*"], "limit": 100, } rsp = self.vector_client.vector_search(payload) assert rsp['code'] == 0 assert len(rsp['data']) == 100 @pytest.mark.parametrize("metric_type", ["IP", "L2", "COSINE"]) def test_search_vector_with_simple_payload(self, metric_type): """ Search a vector with a simple payload """ name = gen_collection_name() self.name = name self.init_collection(name, metric_type=metric_type) # search data dim = 128 vector_to_search = preprocessing.normalize([np.array([random.random() for i in range(dim)])])[0].tolist() payload = { "collectionName": name, "data": [vector_to_search], } rsp = self.vector_client.vector_search(payload) assert rsp['code'] == 0 res = rsp['data'] logger.info(f"res: {len(res)}") limit = int(payload.get("limit", 100)) assert len(res) == limit ids = [item['id'] for item in res] assert len(ids) == len(set(ids)) distance = [item['distance'] for item in res] if metric_type == "L2": assert distance == sorted(distance) if metric_type == "IP" or metric_type == "COSINE": assert distance == sorted(distance, reverse=True) @pytest.mark.parametrize("sum_limit_offset", [16384, 16385]) @pytest.mark.xfail(reason="") def test_search_vector_with_exceed_sum_limit_offset(self, sum_limit_offset): """ Search a vector with a simple payload """ max_search_sum_limit_offset = constant.MAX_SUM_OFFSET_AND_LIMIT name = gen_collection_name() self.name = name nb = sum_limit_offset + 2000 metric_type = "IP" limit = 100 self.init_collection(name, metric_type=metric_type, nb=nb, batch_size=2000) # search data dim = 128 vector_to_search = preprocessing.normalize([np.array([random.random() for i in range(dim)])])[0].tolist() payload = { "collectionName": name, "vector": vector_to_search, "limit": limit, "offset": sum_limit_offset - limit, } rsp = 
self.vector_client.vector_search(payload)
        if sum_limit_offset > max_search_sum_limit_offset:
            assert rsp['code'] == 65535
            return
        assert rsp['code'] == 0
        res = rsp['data']
        logger.info(f"res: {len(res)}")
        limit = int(payload.get("limit", 100))
        assert len(res) == limit
        ids = [item['id'] for item in res]
        assert len(ids) == len(set(ids))
        distance = [item['distance'] for item in res]
        if metric_type == "L2":
            assert distance == sorted(distance)
        if metric_type == "IP":
            assert distance == sorted(distance, reverse=True)

    @pytest.mark.parametrize("offset", [0, 100])
    @pytest.mark.parametrize("limit", [100])
    @pytest.mark.parametrize("metric_type", ["L2", "IP", "COSINE"])
    def test_search_vector_with_complex_payload(self, limit, offset, metric_type):
        """
        Search a vector with output fields, an int64 filter, limit and offset
        """
        name = gen_collection_name()
        self.name = name
        nb = limit + offset + 3000
        dim = 128
        schema_payload, data = self.init_collection(name, dim=dim, nb=nb, metric_type=metric_type)
        vector_field = schema_payload.get("vectorField")
        # search data
        vector_to_search = preprocessing.normalize([np.array([random.random() for i in range(dim)])])[0].tolist()
        output_fields = get_common_fields_by_data(data, exclude_fields=[vector_field])
        payload = {
            "collectionName": name,
            "data": [vector_to_search],
            "outputFields": output_fields,
            "filter": "uid >= 0",
            "limit": limit,
            "offset": offset,
        }
        rsp = self.vector_client.vector_search(payload)
        if offset + limit > constant.MAX_SUM_OFFSET_AND_LIMIT:
            assert rsp['code'] == 90126
            return
        assert rsp['code'] == 0
        res = rsp['data']
        logger.info(f"res: {len(res)}")
        assert len(res) == limit
        for item in res:
            assert item.get("uid") >= 0
            for field in output_fields:
                assert field in item

    @pytest.mark.parametrize("filter_expr", ["uid >= 0", "uid >= 0 and uid < 100", "uid in [1,2,3]"])
    def test_search_vector_with_complex_int_filter(self, filter_expr):
        """
        Search a vector with an int64 filter expression
        """
        name = gen_collection_name()
        self.name = name
        nb = 200
        dim = 128
        limit = 100
        schema_payload, data = self.init_collection(name, dim=dim, nb=nb)
        vector_field = schema_payload.get("vectorField")
        # search data
        vector_to_search = preprocessing.normalize([np.array([random.random() for i in range(dim)])])[0].tolist()
        output_fields = get_common_fields_by_data(data, exclude_fields=[vector_field])
        payload = {
            "collectionName": name,
            "data": [vector_to_search],
            "outputFields": output_fields,
            "filter": filter_expr,
            "limit": limit,
            "offset": 0,
        }
        rsp = self.vector_client.vector_search(payload)
        assert rsp['code'] == 0
        res = rsp['data']
        logger.info(f"res: {len(res)}")
        assert len(res) <= limit
        for item in res:
            uid = item.get("uid")  # bound for the eval'ed filter expression below
            assert eval(filter_expr)

    @pytest.mark.parametrize("filter_expr", ["name > \"placeholder\"", "name like \"placeholder%\""])
    def test_search_vector_with_complex_varchar_filter(self, filter_expr):
        """
        Search a vector with a varchar filter expression
        """
        name = gen_collection_name()
        self.name = name
        nb = 200
        dim = 128
        limit = 100
        schema_payload, data = self.init_collection(name, dim=dim, nb=nb)
        names = []
        for item in data:
            names.append(item.get("name"))
        names.sort()
        logger.info(f"names: {names}")
        mid = len(names) // 2
        prefix = names[mid][0:2]
        vector_field = schema_payload.get("vectorField")
        # search data
        vector_to_search = preprocessing.normalize([np.array([random.random() for i in range(dim)])])[0].tolist()
        output_fields = get_common_fields_by_data(data, exclude_fields=[vector_field])
        filter_expr = filter_expr.replace("placeholder", prefix)
        logger.info(f"filter_expr: {filter_expr}")
        payload = {
            "collectionName": name,
            "data":
[vector_to_search], "outputFields": output_fields, "filter": filter_expr, "limit": limit, "offset": 0, } rsp = self.vector_client.vector_search(payload) assert rsp['code'] == 0 res = rsp['data'] logger.info(f"res: {len(res)}") assert len(res) <= limit for item in res: name = item.get("name") logger.info(f"name: {name}") if ">" in filter_expr: assert name > prefix if "like" in filter_expr: assert name.startswith(prefix) @pytest.mark.parametrize("filter_expr", ["uid < 100 and name > \"placeholder\"", "uid < 100 and name like \"placeholder%\"" ]) def test_search_vector_with_complex_int64_varchar_and_filter(self, filter_expr): """ Search a vector with a simple payload """ name = gen_collection_name() self.name = name nb = 200 dim = 128 limit = 100 schema_payload, data = self.init_collection(name, dim=dim, nb=nb) names = [] for item in data: names.append(item.get("name")) names.sort() logger.info(f"names: {names}") mid = len(names) // 2 prefix = names[mid][0:2] vector_field = schema_payload.get("vectorField") # search data vector_to_search = preprocessing.normalize([np.array([random.random() for i in range(dim)])])[0].tolist() output_fields = get_common_fields_by_data(data, exclude_fields=[vector_field]) filter_expr = filter_expr.replace("placeholder", prefix) logger.info(f"filter_expr: {filter_expr}") payload = { "collectionName": name, "data": [vector_to_search], "outputFields": output_fields, "filter": filter_expr, "limit": limit, "offset": 0, } rsp = self.vector_client.vector_search(payload) assert rsp['code'] == 0 res = rsp['data'] logger.info(f"res: {len(res)}") assert len(res) <= limit for item in res: uid = item.get("uid") name = item.get("name") logger.info(f"name: {name}") uid_expr = filter_expr.split("and")[0] assert eval(uid_expr) is True varchar_expr = filter_expr.split("and")[1] if ">" in varchar_expr: assert name > prefix if "like" in varchar_expr: assert name.startswith(prefix) @pytest.mark.parametrize("consistency_level", ["Strong", "Bounded", "Eventually", "Session"]) def test_search_vector_with_consistency_level(self, consistency_level): """ Search a vector with different consistency level """ name = gen_collection_name() self.name = name nb = 200 dim = 128 limit = 100 schema_payload, data = self.init_collection(name, dim=dim, nb=nb) names = [] for item in data: names.append(item.get("name")) names.sort() logger.info(f"names: {names}") mid = len(names) // 2 prefix = names[mid][0:2] vector_field = schema_payload.get("vectorField") # search data vector_to_search = preprocessing.normalize([np.array([random.random() for i in range(dim)])])[0].tolist() output_fields = get_common_fields_by_data(data, exclude_fields=[vector_field]) payload = { "collectionName": name, "data": [vector_to_search], "outputFields": output_fields, "limit": limit, "offset": 0, "consistencyLevel": consistency_level } rsp = self.vector_client.vector_search(payload) assert rsp['code'] == 0 res = rsp['data'] logger.info(f"res: {len(res)}") assert len(res) == limit @pytest.mark.parametrize("metric_type", ["L2", "COSINE", "IP"]) def test_search_vector_with_range_search(self, metric_type): """ Search a vector with range search with different metric type """ name = gen_collection_name() self.name = name nb = 3000 dim = 128 limit = 100 schema_payload, data = self.init_collection(name, dim=dim, nb=nb, metric_type=metric_type) vector_field = schema_payload.get("vectorField") # search data vector_to_search = preprocessing.normalize([np.array([random.random() for i in range(dim)])])[0].tolist() training_data = 
[item[vector_field] for item in data] distance_sorted = get_sorted_distance(training_data, [vector_to_search], metric_type) r1, r2 = distance_sorted[0][nb//2], distance_sorted[0][nb//2+limit+int((0.5*limit))] # recall is not 100% so add 50% to make sure the range is more than limit if metric_type == "L2": r1, r2 = r2, r1 output_fields = get_common_fields_by_data(data, exclude_fields=[vector_field]) logger.info(f"r1: {r1}, r2: {r2}") payload = { "collectionName": name, "data": [vector_to_search], "outputFields": output_fields, "limit": limit, "offset": 0, "searchParams": { "params": { "radius": r1, "range_filter": r2, } } } rsp = self.vector_client.vector_search(payload) assert rsp['code'] == 0 res = rsp['data'] logger.info(f"res: {len(res)}") assert len(res) >= limit*0.8 # add buffer to the distance of comparison if metric_type == "L2": r1 = r1 + 10**-6 r2 = r2 - 10**-6 else: r1 = r1 - 10**-6 r2 = r2 + 10**-6 for item in res: distance = item.get("distance") if metric_type == "L2": assert r1 > distance > r2 else: assert r1 < distance < r2 @pytest.mark.parametrize("ignore_growing", [True, False]) def test_search_vector_with_ignore_growing(self, ignore_growing): """ Search a vector with range search with different metric type """ name = gen_collection_name() self.name = name metric_type = "COSINE" nb = 1000 dim = 128 limit = 100 schema_payload, data = self.init_collection(name, dim=dim, nb=nb, metric_type=metric_type) vector_field = schema_payload.get("vectorField") # search data vector_to_search = preprocessing.normalize([np.array([random.random() for i in range(dim)])])[0].tolist() training_data = [item[vector_field] for item in data] distance_sorted = get_sorted_distance(training_data, [vector_to_search], metric_type) r1, r2 = distance_sorted[0][nb//2], distance_sorted[0][nb//2+limit+int((0.2*limit))] # recall is not 100% so add 20% to make sure the range is correct if metric_type == "L2": r1, r2 = r2, r1 output_fields = get_common_fields_by_data(data, exclude_fields=[vector_field]) payload = { "collectionName": name, "data": [vector_to_search], "outputFields": output_fields, "limit": limit, "offset": 0, "searchParams": { "ignoreGrowing": ignore_growing } } rsp = self.vector_client.vector_search(payload) assert rsp['code'] == 0 res = rsp['data'] logger.info(f"res: {len(res)}") if ignore_growing is True: assert len(res) == 0 else: assert len(res) == limit @pytest.mark.parametrize("tokenizer", ["jieba", "standard"]) def test_search_vector_with_text_match_filter(self, tokenizer): """ Query a vector with a simple payload """ fake = fake_en language = "en" if tokenizer == "jieba": fake = fake_zh language = "zh" # create a collection dim = 128 analyzer_params = { "tokenizer": tokenizer, } name = gen_collection_name() fields = [ FieldSchema(name="id", dtype=DataType.INT64, is_primary=True), FieldSchema( name="word", dtype=DataType.VARCHAR, max_length=65535, enable_analyzer=True, enable_match=True, is_partition_key=True, analyzer_params=analyzer_params, ), FieldSchema( name="sentence", dtype=DataType.VARCHAR, max_length=65535, enable_analyzer=True, enable_match=True, analyzer_params=analyzer_params, ), FieldSchema( name="paragraph", dtype=DataType.VARCHAR, max_length=65535, enable_analyzer=True, enable_match=True, analyzer_params=analyzer_params, ), FieldSchema( name="text", dtype=DataType.VARCHAR, max_length=65535, enable_analyzer=True, enable_match=True, analyzer_params=analyzer_params, ), FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim), ] schema = CollectionSchema(fields=fields, 
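            # All VarChar fields above set enable_analyzer/enable_match, so the server tokenizes
            # them at insert time and builds the match index that backs the text_match() filters
            # exercised at the end of this test.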
description="test collection") collection = Collection(name=name, schema=schema ) rsp = self.collection_client.collection_describe(name) logger.info(f"rsp: {rsp}") assert rsp['code'] == 0 data_size = 3000 batch_size = 1000 # insert data data = [ { "id": i, "word": fake.word().lower(), "sentence": fake.sentence().lower(), "paragraph": fake.sentence().lower(), "text": fake.text().lower(), "emb": [random.random() for _ in range(dim)] } for i in range(data_size) ] df = pd.DataFrame(data) text_fields = ["word", "sentence", "paragraph", "text"] wf_map = {} for field in text_fields: wf_map[field] = analyze_documents(df[field].tolist(), language=language) for i in range(0, data_size, batch_size): tmp = data[i:i + batch_size] payload = { "collectionName": name, "data": tmp, } rsp = self.vector_client.vector_insert(payload) assert rsp['code'] == 0 assert rsp['data']['insertCount'] == len(tmp) collection.create_index( "emb", {"index_type": "IVF_SQ8", "metric_type": "L2", "params": {"nlist": 64}}, ) collection.load() time.sleep(5) vector_to_search = [[random.random() for _ in range(dim)]] for field in text_fields: token = wf_map[field].most_common()[0][0] expr = f"text_match({field}, '{token}')" logger.info(f"expr: {expr}") rsp = self.vector_client.vector_search({"collectionName": name, "data":vector_to_search, "filter": f"{expr}", "outputFields": ["*"]}) assert rsp['code'] == 0, rsp for d in rsp['data']: assert token in d[field] @pytest.mark.parametrize("insert_round", [1]) @pytest.mark.parametrize("auto_id", [True]) @pytest.mark.parametrize("is_partition_key", [True]) @pytest.mark.parametrize("enable_dynamic_schema", [True]) @pytest.mark.parametrize("nb", [3000]) @pytest.mark.parametrize("dim", [128]) @pytest.mark.parametrize("nq", [1, 2]) @pytest.mark.parametrize("metric_type", ['COSINE', "L2", "IP"]) def test_search_vector_with_default_none(self, nb, dim, insert_round, auto_id, is_partition_key, enable_dynamic_schema, nq, metric_type): """ Insert a vector with default and none """ # create a collection name = gen_collection_name() payload = { "collectionName": name, "schema": { "autoId": auto_id, "enableDynamicField": enable_dynamic_schema, "fields": [ {"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}}, {"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": is_partition_key, "elementTypeParams": {}, "defaultValue": 8888}, {"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}, "nullable": True}, {"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}, "nullable": True, "defaultValue": "8888"}, {"fieldName": "float_vector", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}}, ] }, "indexParams": [ {"fieldName": "float_vector", "indexName": "float_vector", "metricType": metric_type}, ] } rsp = self.collection_client.collection_create(payload) assert rsp['code'] == 0 rsp = self.collection_client.collection_describe(name) logger.info(f"rsp: {rsp}") assert rsp['code'] == 0 # insert data for i in range(insert_round): data = [] for i in range(nb): if auto_id: tmp = { "user_id": i % 100 if i % 2 else None, "word_count": None, "book_describe": None, "float_vector": gen_vector(datatype="FloatVector", dim=dim), } else: tmp = { "book_id": i, "user_id": i % 100 if i % 2 else None, "word_count": None, "book_describe": None, "float_vector": gen_vector(datatype="FloatVector", dim=dim), } if enable_dynamic_schema: tmp.update({f"dynamic_field_{i}": i}) data.append(tmp) payload = { 
"collectionName": name, "data": data, } rsp = self.vector_client.vector_insert(payload) assert rsp['code'] == 0 assert rsp['data']['insertCount'] == nb # search data payload = { "collectionName": name, "data": [gen_vector(datatype="FloatVector", dim=dim) for _ in range(nq)], "filter": "book_id >= 0", # "groupingField": "user_id", "outputFields": ["*"], "limit": 100, } rsp = self.vector_client.vector_search(payload) assert rsp['code'] == 0 assert rsp['data'][0]['book_describe'] == "8888" assert rsp['data'][0]['word_count'] is None assert len(rsp['data']) == 100 * nq @pytest.mark.L0 class TestSearchVectorNegative(TestBase): @pytest.mark.parametrize("metric_type", ["L2"]) def test_search_vector_without_required_data_param(self, metric_type): """ Search a vector with a simple payload """ name = gen_collection_name() self.name = name self.init_collection(name, metric_type=metric_type) # search data dim = 128 payload = { "collectionName": name, } rsp = self.vector_client.vector_search(payload) assert rsp['code'] == 1802 @pytest.mark.parametrize("invalid_metric_type", ["L2", "IP", "UNSUPPORTED"]) @pytest.mark.xfail(reason="issue: https://github.com/milvus-io/milvus/issues/37138") def test_search_vector_with_invalid_metric_type(self, invalid_metric_type): """ Search a vector with a simple payload """ name = gen_collection_name() self.name = name self.init_collection(name, metric_type="COSINE") # search data dim = 128 payload = { "collectionName": name, "data": [preprocessing.normalize([np.array([random.random() for i in range(dim)])])[0].tolist()], "searchParams": { "metricType": invalid_metric_type } } rsp = self.vector_client.vector_search(payload) assert rsp['code'] != 0 @pytest.mark.parametrize("limit", [0, 16385]) def test_search_vector_with_invalid_limit(self, limit): """ Search a vector with a simple payload """ name = gen_collection_name() self.name = name dim = 128 schema_payload, data = self.init_collection(name, dim=dim) vector_field = schema_payload.get("vectorField") # search data vector_to_search = preprocessing.normalize([np.array([random.random() for i in range(dim)])])[0].tolist() output_fields = get_common_fields_by_data(data, exclude_fields=[vector_field]) payload = { "collectionName": name, "data": [vector_to_search], "outputFields": output_fields, "filter": "uid >= 0", "limit": limit, "offset": 0, } rsp = self.vector_client.vector_search(payload) assert rsp['code'] == 65535 @pytest.mark.parametrize("offset", [-1, 100_001]) def test_search_vector_with_invalid_offset(self, offset): """ Search a vector with a simple payload """ name = gen_collection_name() self.name = name dim = 128 schema_payload, data = self.init_collection(name, dim=dim) vector_field = schema_payload.get("vectorField") # search data dim = 128 vector_to_search = preprocessing.normalize([np.array([random.random() for i in range(dim)])])[0].tolist() output_fields = get_common_fields_by_data(data, exclude_fields=[vector_field]) payload = { "collectionName": name, "data": [vector_to_search], "outputFields": output_fields, "filter": "uid >= 0", "limit": 100, "offset": offset, } rsp = self.vector_client.vector_search(payload) assert rsp['code'] == 65535 def test_search_vector_with_invalid_collection_name(self): """ Search a vector with invalid collection name """ name = gen_collection_name() self.name = name dim = 128 schema_payload, data = self.init_collection(name, dim=dim) vector_field = schema_payload.get("vectorField") # search data vector_to_search = preprocessing.normalize([np.array([random.random() for i in 
range(dim)])])[0].tolist() output_fields = get_common_fields_by_data(data, exclude_fields=[vector_field]) payload = { "collectionName": "invalid_collection_name", "data": [vector_to_search], "outputFields": output_fields, "filter": "uid >= 0", "limit": 100, "offset": 0, } rsp = self.vector_client.vector_search(payload) assert rsp['code'] == 100 assert "can't find collection" in rsp['message'] @pytest.mark.L0 class TestAdvancedSearchVector(TestBase): @pytest.mark.parametrize("insert_round", [1]) @pytest.mark.parametrize("auto_id", [True]) @pytest.mark.parametrize("is_partition_key", [True]) @pytest.mark.parametrize("enable_dynamic_schema", [True]) @pytest.mark.parametrize("nb", [3000]) @pytest.mark.parametrize("dim", [2]) def test_advanced_search_vector_with_multi_float32_vector_datatype(self, nb, dim, insert_round, auto_id, is_partition_key, enable_dynamic_schema): """ Insert a vector with a simple payload """ # create a collection name = gen_collection_name() payload = { "collectionName": name, "schema": { "autoId": auto_id, "enableDynamicField": enable_dynamic_schema, "fields": [ {"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}}, {"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": is_partition_key, "elementTypeParams": {}}, {"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}}, {"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}}, {"fieldName": "float_vector_1", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}}, {"fieldName": "float_vector_2", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}}, ] }, "indexParams": [ {"fieldName": "float_vector_1", "indexName": "float_vector_1", "metricType": "COSINE"}, {"fieldName": "float_vector_2", "indexName": "float_vector_2", "metricType": "COSINE"}, ] } rsp = self.collection_client.collection_create(payload) assert rsp['code'] == 0 rsp = self.collection_client.collection_describe(name) logger.info(f"rsp: {rsp}") assert rsp['code'] == 0 # insert data for i in range(insert_round): data = [] for i in range(nb): if auto_id: tmp = { "user_id": i%100, "word_count": i, "book_describe": f"book_{i}", "float_vector_1": gen_vector(datatype="FloatVector", dim=dim), "float_vector_2": gen_vector(datatype="FloatVector", dim=dim), } else: tmp = { "book_id": i, "user_id": i%100, "word_count": i, "book_describe": f"book_{i}", "float_vector_1": gen_vector(datatype="FloatVector", dim=dim), "float_vector_2": gen_vector(datatype="FloatVector", dim=dim), } if enable_dynamic_schema: tmp.update({f"dynamic_field_{i}": i}) data.append(tmp) payload = { "collectionName": name, "data": data, } rsp = self.vector_client.vector_insert(payload) assert rsp['code'] == 0 assert rsp['data']['insertCount'] == nb # advanced search data payload = { "collectionName": name, "search": [{ "data": [gen_vector(datatype="FloatVector", dim=dim)], "annsField": "float_vector_1", "limit": 10, "outputFields": ["*"] }, { "data": [gen_vector(datatype="FloatVector", dim=dim)], "annsField": "float_vector_2", "limit": 10, "outputFields": ["*"] } ], "rerank": { "strategy": "rrf", "params": { "k": 10, } }, "limit": 10, "outputFields": ["user_id", "word_count", "book_describe"] } rsp = self.vector_client.vector_advanced_search(payload) assert rsp['code'] == 0 assert len(rsp['data']) == 10 @pytest.mark.L0 class TestHybridSearchVector(TestBase): @pytest.mark.parametrize("insert_round", [1]) @pytest.mark.parametrize("auto_id", [True]) 
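    # The hybrid search test below exercises the same multi-vector + RRF rerank payload as the
    # advanced search test above; the only difference is the client call it goes through
    # (vector_hybrid_search vs vector_advanced_search).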
@pytest.mark.parametrize("is_partition_key", [True]) @pytest.mark.parametrize("enable_dynamic_schema", [True]) @pytest.mark.parametrize("nb", [3000]) @pytest.mark.parametrize("dim", [2]) def test_hybrid_search_vector_with_multi_float32_vector_datatype(self, nb, dim, insert_round, auto_id, is_partition_key, enable_dynamic_schema): """ Insert a vector with a simple payload """ # create a collection name = gen_collection_name() payload = { "collectionName": name, "schema": { "autoId": auto_id, "enableDynamicField": enable_dynamic_schema, "fields": [ {"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}}, {"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": is_partition_key, "elementTypeParams": {}}, {"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}}, {"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}}, {"fieldName": "float_vector_1", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}}, {"fieldName": "float_vector_2", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}}, ] }, "indexParams": [ {"fieldName": "float_vector_1", "indexName": "float_vector_1", "metricType": "COSINE"}, {"fieldName": "float_vector_2", "indexName": "float_vector_2", "metricType": "COSINE"}, ] } rsp = self.collection_client.collection_create(payload) assert rsp['code'] == 0 rsp = self.collection_client.collection_describe(name) logger.info(f"rsp: {rsp}") assert rsp['code'] == 0 # insert data for i in range(insert_round): data = [] for i in range(nb): if auto_id: tmp = { "user_id": i%100, "word_count": i, "book_describe": f"book_{i}", "float_vector_1": gen_vector(datatype="FloatVector", dim=dim), "float_vector_2": gen_vector(datatype="FloatVector", dim=dim), } else: tmp = { "book_id": i, "user_id": i%100, "word_count": i, "book_describe": f"book_{i}", "float_vector_1": gen_vector(datatype="FloatVector", dim=dim), "float_vector_2": gen_vector(datatype="FloatVector", dim=dim), } if enable_dynamic_schema: tmp.update({f"dynamic_field_{i}": i}) data.append(tmp) payload = { "collectionName": name, "data": data, } rsp = self.vector_client.vector_insert(payload) assert rsp['code'] == 0 assert rsp['data']['insertCount'] == nb # advanced search data payload = { "collectionName": name, "search": [{ "data": [gen_vector(datatype="FloatVector", dim=dim)], "annsField": "float_vector_1", "limit": 10, "outputFields": ["*"] }, { "data": [gen_vector(datatype="FloatVector", dim=dim)], "annsField": "float_vector_2", "limit": 10, "outputFields": ["*"] } ], "rerank": { "strategy": "rrf", "params": { "k": 10, } }, "limit": 10, "outputFields": ["user_id", "word_count", "book_describe"] } rsp = self.vector_client.vector_hybrid_search(payload) assert rsp['code'] == 0 assert len(rsp['data']) == 10 @pytest.mark.L0 class TestQueryVector(TestBase): @pytest.mark.parametrize("insert_round", [1]) @pytest.mark.parametrize("auto_id", [True]) @pytest.mark.parametrize("is_partition_key", [True]) @pytest.mark.parametrize("enable_dynamic_schema", [True]) @pytest.mark.parametrize("nb", [3000]) @pytest.mark.parametrize("dim", [128]) def test_query_entities_with_all_scalar_datatype(self, nb, dim, insert_round, auto_id, is_partition_key, enable_dynamic_schema): """ Insert a vector with a simple payload """ # create a collection name = gen_collection_name() payload = { "collectionName": name, "schema": { "autoId": auto_id, "enableDynamicField": enable_dynamic_schema, "fields": [ {"fieldName": "book_id", "dataType": "Int64", 
"isPrimary": True, "elementTypeParams": {}}, {"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": is_partition_key, "elementTypeParams": {}}, {"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}}, {"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "25536"}}, {"fieldName": "bool", "dataType": "Bool", "elementTypeParams": {}}, {"fieldName": "json", "dataType": "JSON", "elementTypeParams": {}}, {"fieldName": "int_array", "dataType": "Array", "elementDataType": "Int64", "elementTypeParams": {"max_capacity": "1024"}}, {"fieldName": "varchar_array", "dataType": "Array", "elementDataType": "VarChar", "elementTypeParams": {"max_capacity": "1024", "max_length": "256"}}, {"fieldName": "bool_array", "dataType": "Array", "elementDataType": "Bool", "elementTypeParams": {"max_capacity": "1024"}}, {"fieldName": "text_emb", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}}, {"fieldName": "image_emb", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}}, ] }, "indexParams": [ {"fieldName": "text_emb", "indexName": "text_emb", "metricType": "L2"}, {"fieldName": "image_emb", "indexName": "image_emb", "metricType": "L2"} ] } rsp = self.collection_client.collection_create(payload) assert rsp['code'] == 0 rsp = self.collection_client.collection_describe(name) logger.info(f"rsp: {rsp}") assert rsp['code'] == 0 # insert data for i in range(insert_round): data = [] for i in range(nb): if auto_id: tmp = { "user_id": i, "word_count": i, "book_describe": f"book_{gen_unique_str(length=1000)}", "bool": random.choice([True, False]), "json": {"key": [i]}, "int_array": [i], "varchar_array": [f"varchar_{i}"], "bool_array": [random.choice([True, False])], "text_emb": preprocessing.normalize([np.array([random.random() for _ in range(dim)])])[ 0].tolist(), "image_emb": preprocessing.normalize([np.array([random.random() for _ in range(dim)])])[ 0].tolist(), } else: tmp = { "book_id": i, "user_id": i, "word_count": i, "book_describe": gen_unique_str(length=1000), "bool": random.choice([True, False]), "json": {"key": i}, "int_array": [i], "varchar_array": [f"varchar_{i}"], "bool_array": [random.choice([True, False])], "text_emb": preprocessing.normalize([np.array([random.random() for _ in range(dim)])])[ 0].tolist(), "image_emb": preprocessing.normalize([np.array([random.random() for _ in range(dim)])])[ 0].tolist(), } if enable_dynamic_schema: tmp.update({f"dynamic_field_{i}": i}) data.append(tmp) payload = { "collectionName": name, "data": data, } rsp = self.vector_client.vector_insert(payload) assert rsp['code'] == 0 assert rsp['data']['insertCount'] == nb # query data to make sure the data is inserted # 1. query for int64 payload = { "collectionName": name, "filter": "user_id > 0", "limit": 50, "outputFields": ["*"] } rsp = self.vector_client.vector_query(payload) assert rsp['code'] == 0 assert len(rsp['data']) == 50 # 2. query for varchar payload = { "collectionName": name, "filter": "book_describe like \"book%\"", "limit": 50, "outputFields": ["*"] } rsp = self.vector_client.vector_query(payload) assert rsp['code'] == 0 assert len(rsp['data']) == 50 # 3. query for json payload = { "collectionName": name, "filter": "json_contains(json['key'] , 1)", "limit": 50, "outputFields": ["*"] } rsp = self.vector_client.vector_query(payload) assert len(rsp['data']) == 1 # 4. 
query for array payload = { "collectionName": name, "filter": "array_contains(int_array, 1)", "limit": 50, "outputFields": ["*"] } rsp = self.vector_client.vector_query(payload) assert len(rsp['data']) == 1 @pytest.mark.parametrize("insert_round", [1]) @pytest.mark.parametrize("auto_id", [True]) @pytest.mark.parametrize("is_partition_key", [True]) @pytest.mark.parametrize("enable_dynamic_schema", [True]) @pytest.mark.parametrize("nb", [3000]) @pytest.mark.parametrize("dim", [128]) @pytest.mark.parametrize("pass_fp32_to_fp16_or_bf16", [True, False]) def test_query_entities_with_all_vector_datatype(self, nb, dim, insert_round, auto_id, is_partition_key, enable_dynamic_schema, pass_fp32_to_fp16_or_bf16): """ Insert a vector with a simple payload """ # create a collection name = gen_collection_name() payload = { "collectionName": name, "schema": { "autoId": auto_id, "enableDynamicField": enable_dynamic_schema, "fields": [ {"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}}, {"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": is_partition_key, "elementTypeParams": {}}, {"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}}, {"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}}, {"fieldName": "float_vector", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}}, {"fieldName": "float16_vector", "dataType": "Float16Vector", "elementTypeParams": {"dim": f"{dim}"}}, {"fieldName": "bfloat16_vector", "dataType": "BFloat16Vector", "elementTypeParams": {"dim": f"{dim}"}}, {"fieldName": "binary_vector", "dataType": "BinaryVector", "elementTypeParams": {"dim": f"{dim}"}}, ] }, "indexParams": [ {"fieldName": "float_vector", "indexName": "float_vector", "metricType": "L2"}, {"fieldName": "float16_vector", "indexName": "float16_vector", "metricType": "L2"}, {"fieldName": "bfloat16_vector", "indexName": "bfloat16_vector", "metricType": "L2"}, {"fieldName": "binary_vector", "indexName": "binary_vector", "metricType": "HAMMING", "params": {"index_type": "BIN_IVF_FLAT", "nlist": "512"}} ] } rsp = self.collection_client.collection_create(payload) assert rsp['code'] == 0 rsp = self.collection_client.collection_describe(name) logger.info(f"rsp: {rsp}") assert rsp['code'] == 0 # insert data for i in range(insert_round): data = [] for i in range(nb): if auto_id: tmp = { "user_id": i, "word_count": i, "book_describe": f"book_{i}", "float_vector": gen_vector(datatype="FloatVector", dim=dim), "float16_vector": ( gen_vector(datatype="FloatVector", dim=dim) if pass_fp32_to_fp16_or_bf16 else gen_vector(datatype="Float16Vector", dim=dim) ), "bfloat16_vector": ( gen_vector(datatype="FloatVector", dim=dim) if pass_fp32_to_fp16_or_bf16 else gen_vector(datatype="BFloat16Vector", dim=dim) ), "binary_vector": gen_vector(datatype="BinaryVector", dim=dim) } else: tmp = { "book_id": i, "user_id": i, "word_count": i, "book_describe": f"book_{i}", "float_vector": gen_vector(datatype="FloatVector", dim=dim), "float16_vector": ( gen_vector(datatype="FloatVector", dim=dim) if pass_fp32_to_fp16_or_bf16 else gen_vector(datatype="Float16Vector", dim=dim) ), "bfloat16_vector": ( gen_vector(datatype="FloatVector", dim=dim) if pass_fp32_to_fp16_or_bf16 else gen_vector(datatype="BFloat16Vector", dim=dim) ), "binary_vector": gen_vector(datatype="BinaryVector", dim=dim) } if enable_dynamic_schema: tmp.update({f"dynamic_field_{i}": i}) data.append(tmp) payload = { "collectionName": name, "data": data, } rsp = 
self.vector_client.vector_insert(payload) assert rsp['code'] == 0 assert rsp['data']['insertCount'] == nb c = Collection(name) res = c.query( expr="user_id > 0", limit=50, output_fields=["*"], ) logger.info(f"res: {res}") # query data to make sure the data is inserted rsp = self.vector_client.vector_query({"collectionName": name, "filter": "user_id > 0", "limit": 50}) assert rsp['code'] == 0 assert len(rsp['data']) == 50 @pytest.mark.parametrize("expr", ["10+20 <= uid < 20+30", "uid in [1,2,3,4]", "uid > 0", "uid >= 0", "uid > 0", "uid > -100 and uid < 100"]) @pytest.mark.parametrize("include_output_fields", [True, False]) @pytest.mark.parametrize("partial_fields", [True, False]) def test_query_vector_with_int64_filter(self, expr, include_output_fields, partial_fields): """ Query a vector with a simple payload """ name = gen_collection_name() self.name = name schema_payload, data = self.init_collection(name) output_fields = get_common_fields_by_data(data) if partial_fields: output_fields = output_fields[:len(output_fields) // 2] if "uid" not in output_fields: output_fields.append("uid") else: output_fields = output_fields # query data payload = { "collectionName": name, "filter": expr, "limit": 100, "offset": 0, "outputFields": output_fields } if not include_output_fields: payload.pop("outputFields") if 'vector' in output_fields: output_fields.remove("vector") time.sleep(5) rsp = self.vector_client.vector_query(payload) assert rsp['code'] == 0 res = rsp['data'] logger.info(f"res: {len(res)}") for r in res: uid = r['uid'] assert eval(expr) is True for field in output_fields: assert field in r def test_query_vector_with_count(self): """ Query a vector with a simple payload """ name = gen_collection_name() self.name = name self.init_collection(name, nb=3000) # query for "count(*)" payload = { "collectionName": name, "filter": " ", "limit": 0, "outputFields": ["count(*)"] } rsp = self.vector_client.vector_query(payload) assert rsp['code'] == 0 assert rsp['data'][0]['count(*)'] == 3000 @pytest.mark.xfail(reason="query by id is not supported") def test_query_vector_by_id(self): """ Query a vector with a simple payload """ name = gen_collection_name() self.name = name _, _, insert_ids = self.init_collection(name, nb=3000, return_insert_id=True) payload = { "collectionName": name, "id": insert_ids, } rsp = self.vector_client.vector_query(payload) assert rsp['code'] == 0 @pytest.mark.parametrize("filter_expr", ["name > \"placeholder\"", "name like \"placeholder%\""]) @pytest.mark.parametrize("include_output_fields", [True, False]) def test_query_vector_with_varchar_filter(self, filter_expr, include_output_fields): """ Query a vector with a complex payload """ name = gen_collection_name() self.name = name nb = 200 dim = 128 limit = 100 schema_payload, data = self.init_collection(name, dim=dim, nb=nb) names = [] for item in data: names.append(item.get("name")) names.sort() logger.info(f"names: {names}") mid = len(names) // 2 prefix = names[mid][0:2] # search data output_fields = get_common_fields_by_data(data) filter_expr = filter_expr.replace("placeholder", prefix) logger.info(f"filter_expr: {filter_expr}") payload = { "collectionName": name, "outputFields": output_fields, "filter": filter_expr, "limit": limit, "offset": 0, } if not include_output_fields: payload.pop("outputFields") rsp = self.vector_client.vector_query(payload) assert rsp['code'] == 0 res = rsp['data'] logger.info(f"res: {len(res)}") assert len(res) <= limit for item in res: name = item.get("name") logger.info(f"name: {name}") if 
">" in filter_expr: assert name > prefix if "like" in filter_expr: assert name.startswith(prefix) @pytest.mark.parametrize("sum_of_limit_offset", [16384]) def test_query_vector_with_large_sum_of_limit_offset(self, sum_of_limit_offset): """ Query a vector with sum of limit and offset larger than max value """ max_sum_of_limit_offset = 16384 name = gen_collection_name() filter_expr = "name > \"placeholder\"" self.name = name nb = 200 dim = 128 limit = 100 offset = sum_of_limit_offset - limit schema_payload, data = self.init_collection(name, dim=dim, nb=nb) names = [] for item in data: names.append(item.get("name")) names.sort() logger.info(f"names: {names}") mid = len(names) // 2 prefix = names[mid][0:2] # search data output_fields = get_common_fields_by_data(data) filter_expr = filter_expr.replace("placeholder", prefix) logger.info(f"filter_expr: {filter_expr}") payload = { "collectionName": name, "outputFields": output_fields, "filter": filter_expr, "limit": limit, "offset": offset, } rsp = self.vector_client.vector_query(payload) if sum_of_limit_offset > max_sum_of_limit_offset: assert rsp['code'] == 1 return assert rsp['code'] == 0 res = rsp['data'] logger.info(f"res: {len(res)}") assert len(res) <= limit for item in res: name = item.get("name") logger.info(f"name: {name}") if ">" in filter_expr: assert name > prefix if "like" in filter_expr: assert name.startswith(prefix) @pytest.mark.parametrize("tokenizer", ["jieba", "standard"]) def test_query_vector_with_text_match_filter(self, tokenizer): """ Query a vector with a simple payload """ fake = fake_en language = "en" if tokenizer == "jieba": fake = fake_zh language = "zh" # create a collection dim = 128 analyzer_params = { "tokenizer": tokenizer, } name = gen_collection_name() fields = [ FieldSchema(name="id", dtype=DataType.INT64, is_primary=True), FieldSchema( name="word", dtype=DataType.VARCHAR, max_length=65535, enable_analyzer=True, enable_match=True, is_partition_key=True, analyzer_params=analyzer_params, ), FieldSchema( name="sentence", dtype=DataType.VARCHAR, max_length=65535, enable_analyzer=True, enable_match=True, analyzer_params=analyzer_params, ), FieldSchema( name="paragraph", dtype=DataType.VARCHAR, max_length=65535, enable_analyzer=True, enable_match=True, analyzer_params=analyzer_params, ), FieldSchema( name="text", dtype=DataType.VARCHAR, max_length=65535, enable_analyzer=True, enable_match=True, analyzer_params=analyzer_params, ), FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim), ] schema = CollectionSchema(fields=fields, description="test collection") collection = Collection(name=name, schema=schema ) rsp = self.collection_client.collection_describe(name) logger.info(f"rsp: {rsp}") assert rsp['code'] == 0 data_size = 3000 batch_size = 1000 # insert data data = [ { "id": i, "word": fake.word().lower(), "sentence": fake.sentence().lower(), "paragraph": fake.sentence().lower(), "text": fake.text().lower(), "emb": [random.random() for _ in range(dim)] } for i in range(data_size) ] df = pd.DataFrame(data) text_fields = ["word", "sentence", "paragraph", "text"] wf_map = {} for field in text_fields: wf_map[field] = analyze_documents(df[field].tolist(), language=language) for i in range(0, data_size, batch_size): tmp = data[i:i + batch_size] payload = { "collectionName": name, "data": tmp, } rsp = self.vector_client.vector_insert(payload) assert rsp['code'] == 0 assert rsp['data']['insertCount'] == len(tmp) collection.create_index( "emb", {"index_type": "IVF_SQ8", "metric_type": "L2", "params": {"nlist": 64}}, 
) collection.load() time.sleep(5) for field in text_fields: token = wf_map[field].most_common()[0][0] expr = f"text_match({field}, '{token}')" logger.info(f"expr: {expr}") rsp = self.vector_client.vector_query({"collectionName": name, "filter": f"{expr}", "outputFields": ["*"]}) assert rsp['code'] == 0, rsp for d in rsp['data']: assert token in d[field] @pytest.mark.parametrize("insert_round", [1]) @pytest.mark.parametrize("auto_id", [True]) @pytest.mark.parametrize("is_partition_key", [True]) @pytest.mark.parametrize("enable_dynamic_schema", [True]) @pytest.mark.parametrize("nb", [3000]) @pytest.mark.parametrize("dim", [128]) def test_query_entities_with_default_none(self, nb, dim, insert_round, auto_id, is_partition_key, enable_dynamic_schema): """ Insert a vector with default and none """ # create a collection name = gen_collection_name() payload = { "collectionName": name, "schema": { "autoId": auto_id, "enableDynamicField": enable_dynamic_schema, "fields": [ {"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}}, {"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": is_partition_key, "elementTypeParams": {}}, {"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}, "defaultValue": 8888}, {"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "25536"}, "nullable": True}, {"fieldName": "bool", "dataType": "Bool", "elementTypeParams": {}, "nullable": True}, {"fieldName": "json", "dataType": "JSON", "elementTypeParams": {}, "nullable": True}, {"fieldName": "int_array", "dataType": "Array", "elementDataType": "Int64", "elementTypeParams": {"max_capacity": "1024"}, "nullable": True}, {"fieldName": "varchar_array", "dataType": "Array", "elementDataType": "VarChar", "elementTypeParams": {"max_capacity": "1024", "max_length": "256"}, "nullable": True}, {"fieldName": "bool_array", "dataType": "Array", "elementDataType": "Bool", "elementTypeParams": {"max_capacity": "1024"}, "nullable": True}, {"fieldName": "text_emb", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}}, ] }, "indexParams": [ {"fieldName": "text_emb", "indexName": "text_emb", "metricType": "L2"}, ] } rsp = self.collection_client.collection_create(payload) assert rsp['code'] == 0 rsp = self.collection_client.collection_describe(name) logger.info(f"rsp: {rsp}") assert rsp['code'] == 0 # insert data for i in range(insert_round): data = [] for i in range(nb): if auto_id: tmp = { "user_id": i, "word_count": None, "book_describe": None, "bool": random.choice([True, False]), "json": None, "int_array": None, "varchar_array": None, "bool_array": None, "text_emb": preprocessing.normalize([np.array([random.random() for _ in range(dim)])])[0].tolist(), } else: tmp = { "book_id": i, "user_id": i, "word_count": None, "book_describe": None, "bool": random.choice([True, False]), "json": None, "int_array": None, "varchar_array": None, "bool_array": None, "text_emb": preprocessing.normalize([np.array([random.random() for _ in range(dim)])])[0].tolist(), } if enable_dynamic_schema: tmp.update({f"dynamic_field_{i}": i}) data.append(tmp) payload = { "collectionName": name, "data": data, } rsp = self.vector_client.vector_insert(payload) assert rsp['code'] == 0 assert rsp['data']['insertCount'] == nb # query data to make sure the data is inserted payload = { "collectionName": name, "filter": "user_id > 0", "limit": 50, "outputFields": ["*"] } rsp = self.vector_client.vector_query(payload) assert rsp['code'] == 0 assert 
rsp['data'][0]['book_describe'] is None assert rsp['data'][0]['word_count'] == 8888 assert rsp['data'][0]['json'] is None assert rsp['data'][0]['varchar_array'] is None assert len(rsp['data']) == 50 @pytest.mark.L0 class TestQueryVectorNegative(TestBase): def test_query_with_wrong_filter_expr(self): name = gen_collection_name() self.name = name nb = 200 dim = 128 schema_payload, data, insert_ids = self.init_collection(name, dim=dim, nb=nb, return_insert_id=True) output_fields = get_common_fields_by_data(data) uids = [] for item in data: uids.append(item.get("uid")) payload = { "collectionName": name, "outputFields": output_fields, "filter": f"{insert_ids}", } rsp = self.vector_client.vector_query(payload) assert rsp['code'] == 1100 assert "failed to create query plan" in rsp['message'] @pytest.mark.L0 class TestGetVector(TestBase): def test_get_vector_with_simple_payload(self): """ Search a vector with a simple payload """ name = gen_collection_name() self.name = name self.init_collection(name) # search data dim = 128 vector_to_search = preprocessing.normalize([np.array([random.random() for i in range(dim)])])[0].tolist() payload = { "collectionName": name, "data": [vector_to_search], } rsp = self.vector_client.vector_search(payload) assert rsp['code'] == 0 res = rsp['data'] logger.info(f"res: {len(res)}") limit = int(payload.get("limit", 100)) assert len(res) == limit ids = [item['id'] for item in res] assert len(ids) == len(set(ids)) payload = { "collectionName": name, "outputFields": ["*"], "id": ids[0], } rsp = self.vector_client.vector_get(payload) assert rsp['code'] == 0 res = rsp['data'] logger.info(f"res: {res}") logger.info(f"res: {len(res)}") for item in res: assert item['id'] == ids[0] @pytest.mark.L0 @pytest.mark.parametrize("id_field_type", ["list", "one"]) @pytest.mark.parametrize("include_invalid_id", [True, False]) @pytest.mark.parametrize("include_output_fields", [True, False]) def test_get_vector_complex(self, id_field_type, include_output_fields, include_invalid_id): name = gen_collection_name() self.name = name nb = 200 dim = 128 schema_payload, data = self.init_collection(name, dim=dim, nb=nb) output_fields = get_common_fields_by_data(data) uids = [] for item in data: uids.append(item.get("uid")) payload = { "collectionName": name, "outputFields": output_fields, "filter": f"uid in {uids}", } rsp = self.vector_client.vector_query(payload) assert rsp['code'] == 0 res = rsp['data'] logger.info(f"res: {len(res)}") ids = [] for r in res: ids.append(r['id']) logger.info(f"ids: {len(ids)}") id_to_get = None if id_field_type == "list": id_to_get = ids if id_field_type == "one": id_to_get = ids[0] if include_invalid_id: if isinstance(id_to_get, list): id_to_get[-1] = 0 else: id_to_get = 0 # get by id list payload = { "collectionName": name, "outputFields": output_fields, "id": id_to_get } rsp = self.vector_client.vector_get(payload) assert rsp['code'] == 0 res = rsp['data'] if isinstance(id_to_get, list): if include_invalid_id: assert len(res) == len(id_to_get) - 1 else: assert len(res) == len(id_to_get) else: if include_invalid_id: assert len(res) == 0 else: assert len(res) == 1 for r in rsp['data']: if isinstance(id_to_get, list): assert r['id'] in id_to_get else: assert r['id'] == id_to_get if include_output_fields: for field in output_fields: assert field in r @pytest.mark.L0 class TestDeleteVector(TestBase): @pytest.mark.xfail(reason="delete by id is not supported") def test_delete_vector_by_id(self): """ Query a vector with a simple payload """ name = 
gen_collection_name() self.name = name _, _, insert_ids = self.init_collection(name, nb=3000, return_insert_id=True) payload = { "collectionName": name, "id": insert_ids, } rsp = self.vector_client.vector_query(payload) assert rsp['code'] == 0 @pytest.mark.parametrize("id_field_type", ["list", "one"]) def test_delete_vector_by_pk_field_ids(self, id_field_type): name = gen_collection_name() self.name = name nb = 200 dim = 128 schema_payload, data, insert_ids = self.init_collection(name, dim=dim, nb=nb, return_insert_id=True) time.sleep(1) id_to_delete = None if id_field_type == "list": id_to_delete = insert_ids if id_field_type == "one": id_to_delete = insert_ids[0] if isinstance(id_to_delete, list): payload = { "collectionName": name, "filter": f"id in {id_to_delete}" } else: payload = { "collectionName": name, "filter": f"id == {id_to_delete}" } rsp = self.vector_client.vector_delete(payload) assert rsp['code'] == 0 # verify data deleted by get payload = { "collectionName": name, "id": id_to_delete } rsp = self.vector_client.vector_get(payload) assert len(rsp['data']) == 0 @pytest.mark.parametrize("id_field_type", ["list", "one"]) def test_delete_vector_by_filter_pk_field(self, id_field_type): name = gen_collection_name() self.name = name nb = 200 dim = 128 schema_payload, data = self.init_collection(name, dim=dim, nb=nb) time.sleep(1) output_fields = get_common_fields_by_data(data) uids = [] for item in data: uids.append(item.get("uid")) payload = { "collectionName": name, "outputFields": output_fields, "filter": f"uid in {uids}", } rsp = self.vector_client.vector_query(payload) assert rsp['code'] == 0 res = rsp['data'] logger.info(f"res: {len(res)}") ids = [] for r in res: ids.append(r['id']) logger.info(f"ids: {len(ids)}") id_to_get = None if id_field_type == "list": id_to_get = ids if id_field_type == "one": id_to_get = ids[0] if isinstance(id_to_get, list): if len(id_to_get) >= 100: id_to_get = id_to_get[-100:] # delete by id list if isinstance(id_to_get, list): payload = { "collectionName": name, "filter": f"id in {id_to_get}", } else: payload = { "collectionName": name, "filter": f"id == {id_to_get}", } rsp = self.vector_client.vector_delete(payload) assert rsp['code'] == 0 logger.info(f"delete res: {rsp}") # verify data deleted if not isinstance(id_to_get, list): id_to_get = [id_to_get] payload = { "collectionName": name, "filter": f"id in {id_to_get}", } time.sleep(5) rsp = self.vector_client.vector_query(payload) assert rsp['code'] == 0 assert len(rsp['data']) == 0 def test_delete_vector_by_custom_pk_field(self): dim = 128 nb = 3000 insert_round = 1 name = gen_collection_name() payload = { "collectionName": name, "schema": { "fields": [ {"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}}, {"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}}, {"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}}, {"fieldName": "text_emb", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}} ] }, "indexParams": [{"fieldName": "text_emb", "indexName": "text_emb_index", "metricType": "L2"}] } rsp = self.collection_client.collection_create(payload) assert rsp['code'] == 0 rsp = self.collection_client.collection_describe(name) logger.info(f"rsp: {rsp}") assert rsp['code'] == 0 pk_values = [] # insert data for i in range(insert_round): data = [] for j in range(nb): tmp = { "book_id": i * nb + j, "word_count": i * nb + j, "book_describe": f"book_{i * nb + j}", "text_emb": 
preprocessing.normalize([np.array([random.random() for i in range(dim)])])[0].tolist() } data.append(tmp) payload = { "collectionName": name, "data": data, } tmp = [d["book_id"] for d in data] pk_values.extend(tmp) body_size = sys.getsizeof(json.dumps(payload)) logger.info(f"body size: {body_size / 1024 / 1024} MB") rsp = self.vector_client.vector_insert(payload) assert rsp['code'] == 0 assert rsp['data']['insertCount'] == nb # query data before delete c = Collection(name) res = c.query(expr="", output_fields=["count(*)"]) logger.info(f"res: {res}") # delete data payload = { "collectionName": name, "filter": f"book_id in {pk_values}", } rsp = self.vector_client.vector_delete(payload) # query data after delete res = c.query(expr="", output_fields=["count(*)"], consistency_level="Strong") logger.info(f"res: {res}") assert res[0]["count(*)"] == 0 def test_delete_vector_by_filter_custom_field(self): dim = 128 nb = 3000 insert_round = 1 name = gen_collection_name() payload = { "collectionName": name, "schema": { "fields": [ {"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}}, {"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}}, {"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}}, {"fieldName": "text_emb", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}} ] }, "indexParams": [{"fieldName": "text_emb", "indexName": "text_emb_index", "metricType": "L2"}] } rsp = self.collection_client.collection_create(payload) assert rsp['code'] == 0 rsp = self.collection_client.collection_describe(name) logger.info(f"rsp: {rsp}") assert rsp['code'] == 0 # insert data for i in range(insert_round): data = [] for j in range(nb): tmp = { "book_id": i * nb + j, "word_count": i * nb + j, "book_describe": f"book_{i * nb + j}", "text_emb": preprocessing.normalize([np.array([random.random() for i in range(dim)])])[0].tolist() } data.append(tmp) payload = { "collectionName": name, "data": data, } body_size = sys.getsizeof(json.dumps(payload)) logger.info(f"body size: {body_size / 1024 / 1024} MB") rsp = self.vector_client.vector_insert(payload) assert rsp['code'] == 0 assert rsp['data']['insertCount'] == nb # query data before delete c = Collection(name) res = c.query(expr="", output_fields=["count(*)"]) logger.info(f"res: {res}") # delete data payload = { "collectionName": name, "filter": "word_count >= 0", } rsp = self.vector_client.vector_delete(payload) # query data after delete res = c.query(expr="", output_fields=["count(*)"], consistency_level="Strong") logger.info(f"res: {res}") assert res[0]["count(*)"] == 0 def test_delete_vector_with_non_primary_key(self): """ Delete a vector with a non-primary key, expect no data were deleted """ name = gen_collection_name() self.name = name self.init_collection(name, dim=128, nb=300) expr = "uid > 0" payload = { "collectionName": name, "filter": expr, "limit": 3000, "offset": 0, "outputFields": ["id", "uid"] } rsp = self.vector_client.vector_query(payload) assert rsp['code'] == 0 res = rsp['data'] logger.info(f"res: {len(res)}") id_list = [r['uid'] for r in res] delete_expr = f"uid in {[i for i in id_list[:10]]}" # query data before delete payload = { "collectionName": name, "filter": delete_expr, "limit": 3000, "offset": 0, "outputFields": ["id", "uid"] } rsp = self.vector_client.vector_query(payload) assert rsp['code'] == 0 res = rsp['data'] num_before_delete = len(res) logger.info(f"res: {len(res)}") # delete data payload = { "collectionName": name, 
"filter": delete_expr, } rsp = self.vector_client.vector_delete(payload) # query data after delete payload = { "collectionName": name, "filter": delete_expr, "limit": 3000, "offset": 0, "outputFields": ["id", "uid"] } time.sleep(1) rsp = self.vector_client.vector_query(payload) assert len(rsp["data"]) == 0 @pytest.mark.L0 class TestDeleteVectorNegative(TestBase): def test_delete_vector_with_invalid_collection_name(self): """ Delete a vector with an invalid collection name """ name = gen_collection_name() self.name = name self.init_collection(name, dim=128, nb=3000) # query data # expr = f"id in {[i for i in range(10)]}".replace("[", "(").replace("]", ")") expr = "id > 0" payload = { "collectionName": name, "filter": expr, "limit": 3000, "offset": 0, "outputFields": ["id", "uid"] } rsp = self.vector_client.vector_query(payload) assert rsp['code'] == 0 res = rsp['data'] logger.info(f"res: {len(res)}") id_list = [r['id'] for r in res] delete_expr = f"id in {[i for i in id_list[:10]]}" # query data before delete payload = { "collectionName": name, "filter": delete_expr, "limit": 3000, "offset": 0, "outputFields": ["id", "uid"] } rsp = self.vector_client.vector_query(payload) assert rsp['code'] == 0 res = rsp['data'] logger.info(f"res: {len(res)}") # delete data payload = { "collectionName": name + "_invalid", "filter": delete_expr, } rsp = self.vector_client.vector_delete(payload) assert rsp['code'] == 100 assert "can't find collection" in rsp['message'] @pytest.mark.L1 class TestVectorWithAuth(TestBase): def test_upsert_vector_with_invalid_api_key(self): """ Insert a vector with invalid api key """ # create a collection name = gen_collection_name() dim = 128 payload = { "collectionName": name, "dimension": dim, } rsp = self.collection_client.collection_create(payload) assert rsp['code'] == 0 rsp = self.collection_client.collection_describe(name) assert rsp['code'] == 0 # insert data nb = 10 data = [ { "vector": [np.float64(random.random()) for _ in range(dim)], } for _ in range(nb) ] payload = { "collectionName": name, "data": data, } body_size = sys.getsizeof(json.dumps(payload)) logger.info(f"body size: {body_size / 1024 / 1024} MB") client = self.vector_client client.api_key = "invalid_api_key" rsp = client.vector_insert(payload) assert rsp['code'] == 1800 def test_insert_vector_with_invalid_api_key(self): """ Insert a vector with invalid api key """ # create a collection name = gen_collection_name() dim = 128 payload = { "collectionName": name, "dimension": dim, } rsp = self.collection_client.collection_create(payload) assert rsp['code'] == 0 rsp = self.collection_client.collection_describe(name) assert rsp['code'] == 0 # insert data nb = 10 data = [ { "vector": [np.float64(random.random()) for _ in range(dim)], } for _ in range(nb) ] payload = { "collectionName": name, "data": data, } body_size = sys.getsizeof(json.dumps(payload)) logger.info(f"body size: {body_size / 1024 / 1024} MB") client = self.vector_client client.api_key = "invalid_api_key" rsp = client.vector_insert(payload) assert rsp['code'] == 1800 def test_delete_vector_with_invalid_api_key(self): """ Delete a vector with an invalid api key """ name = gen_collection_name() self.name = name nb = 200 dim = 128 schema_payload, data = self.init_collection(name, dim=dim, nb=nb) output_fields = get_common_fields_by_data(data) uids = [] for item in data: uids.append(item.get("uid")) payload = { "collectionName": name, "outputFields": output_fields, "filter": f"uid in {uids}", } rsp = self.vector_client.vector_query(payload) assert 
rsp['code'] == 0
        res = rsp['data']
        logger.info(f"res: {len(res)}")
        ids = []
        for r in res:
            ids.append(r['id'])
        logger.info(f"ids: {len(ids)}")
        # delete with an invalid api key; the filter is valid but authentication must fail first
        payload = {
            "collectionName": name,
            "filter": f"uid in {uids}"
        }
        client = self.vector_client
        client.api_key = "invalid_api_key"
        rsp = client.vector_delete(payload)
        assert rsp['code'] == 1800
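
# Reference sketch (not part of the test suite): the advanced/hybrid search tests above
# delegate reranking to the server via {"strategy": "rrf", "params": {"k": 10}}. The helper
# below is an illustrative, client-side approximation of reciprocal rank fusion under the
# usual definition score(id) = sum over rankings of 1 / (k + rank); the function name and
# structure are assumptions for illustration, not the server implementation.
def rrf_fuse_sketch(rankings, k=10):
    """Fuse several ranked id lists into one ordering by RRF score."""
    scores = {}
    for ranked_ids in rankings:
        for rank, _id in enumerate(ranked_ids, start=1):
            # each list contributes 1/(k+rank); ids ranked high in any list float up
            scores[_id] = scores.get(_id, 0.0) + 1.0 / (k + rank)
    return sorted(scores, key=scores.get, reverse=True)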