# milvus/tests/restful_client_v2/testcases/test_index_operation.py
#
# 423 lines
# 18 KiB
# Python

import random
from sklearn import preprocessing
import numpy as np
import time
from utils.utils import gen_collection_name, patch_faker_text, en_vocabularies_distribution, \
zh_vocabularies_distribution
from utils.util_log import test_log as logger
import pytest
from base.testbase import TestBase
from utils.utils import gen_vector
from pymilvus import (
Collection
)
from faker import Faker
# Seed Faker at class level before the generators below are created, so the
# generated text is intended to be repeatable between runs.
Faker.seed(19530)
# One text generator per locale; the full-text-search test picks one of them
# based on the tokenizer under test ("standard" -> English, "jieba" -> Chinese).
fake_en = Faker("en_US")
fake_zh = Faker("zh_CN")
# Apply the project's vocabulary distributions to each generator.
# NOTE(review): the exact effect lives in utils.utils.patch_faker_text - confirm there.
patch_faker_text(fake_en, en_vocabularies_distribution)
patch_faker_text(fake_zh, zh_vocabularies_distribution)
# Index build params keyed by index type. The entry for the requested index type
# is sent as "params" when creating an index and is compared against the params
# reported by pymilvus; "BM25_SPARSE_INVERTED_INDEX" is only used on the
# comparison side, for SPARSE_INVERTED_INDEX indexes.
index_param_map = {
"FLAT": {},
"IVF_SQ8": {"nlist": 128},
"HNSW": {"M": 16, "efConstruction": 200},
"BM25_SPARSE_INVERTED_INDEX": {"bm25_k1": 0.5, "bm25_b": 0.5},
"AUTOINDEX": {}
}
@pytest.mark.L0
class TestCreateIndex(TestBase):
    """Positive-path tests for creating indexes through the RESTful v2 index client."""

    @staticmethod
    def _assert_described_indexes(rsp, index_params, compared_keys):
        """
        Check that a describe-index response mirrors the requested index params.

        :param rsp: response returned by ``index_client.index_describe``
        :param index_params: the ``indexParams`` list that was sent to ``index_create``
        :param compared_keys: keys that must be equal in the request and the response
        :return: the described indexes sorted by ``fieldName``
        """
        assert rsp['code'] == 0
        assert len(rsp['data']) == len(index_params)
        # sort both sides by field name so that entries are compared pairwise
        expected_index = sorted(index_params, key=lambda x: x['fieldName'])
        actual_index = sorted(rsp['data'], key=lambda x: x['fieldName'])
        for expected, actual in zip(expected_index, actual_index):
            for key in compared_keys:
                assert expected[key] == actual[key]
        return actual_index

    @pytest.mark.parametrize("metric_type", ["L2", "COSINE", "IP"])
    @pytest.mark.parametrize("index_type", ["AUTOINDEX", "IVF_SQ8", "HNSW"])
    @pytest.mark.parametrize("dim", [128])
    @pytest.mark.xfail(reason="issue: https://github.com/milvus-io/milvus/issues/36365")
    def test_index_default(self, dim, metric_type, index_type):
        """
        target: test create, describe and drop an index on a float vector field
        method: create a collection, create an index on "book_intro", describe it
                through the RESTful client and through pymilvus, then drop it
        expected: the described index matches the request and the index list is
                  empty after the drop
        """
        name = gen_collection_name()
        payload = {
            "collectionName": name,
            "schema": {
                "fields": [
                    {"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
                    {"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
                    {"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
                    {"fieldName": "book_intro", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}}
                ]
            }
        }
        logger.info(f"create collection {name} with payload: {payload}")
        rsp = self.collection_client.collection_create(payload)
        assert rsp['code'] == 0
        c = Collection(name)
        c.flush()
        # list index before creation (expected to be empty; the response is not asserted)
        rsp = self.index_client.index_list(name)
        # create index
        payload = {
            "collectionName": name,
            "indexParams": [
                {"fieldName": "book_intro", "indexName": "book_intro_vector",
                 "metricType": f"{metric_type}",
                 "indexType": f"{index_type}",
                 "params": index_param_map[index_type]
                 }
            ]
        }
        rsp = self.index_client.index_create(payload)
        assert rsp['code'] == 0
        # fixed pause before inspecting the index
        # NOTE(review): presumably gives the index time to become visible - confirm
        time.sleep(10)
        # list index after creation (expected to be non-empty; the response is not asserted)
        rsp = self.index_client.index_list(collection_name=name)
        # describe index
        rsp = self.index_client.index_describe(collection_name=name, index_name="book_intro_vector")
        actual_index = self._assert_described_indexes(
            rsp, payload['indexParams'], ("fieldName", "indexName", "metricType", "indexType"))
        # check index by pymilvus
        index_info = [index.to_dict() for index in c.indexes]
        logger.info(f"index_info: {index_info}")
        for index in index_info:
            index_param = index["index_param"]
            if index_param["index_type"] == "SPARSE_INVERTED_INDEX":
                assert index_param["metric_type"] == "BM25"
                assert index_param.get("params", {}) == index_param_map["BM25_SPARSE_INVERTED_INDEX"]
            else:
                assert index_param["metric_type"] == metric_type
                assert index_param["index_type"] == index_type
                assert index_param.get("params", {}) == index_param_map[index_type]
        # drop every index that was described
        for described in actual_index:
            rsp = self.index_client.index_drop({
                "collectionName": name,
                "indexName": described['indexName']
            })
            assert rsp['code'] == 0
        # list index, expect empty
        rsp = self.index_client.index_list(collection_name=name)
        assert rsp['data'] == []

    @pytest.mark.parametrize("index_type", ["INVERTED"])
    @pytest.mark.parametrize("dim", [128])
    def test_index_for_scalar_field(self, dim, index_type):
        """
        target: test create index on a scalar field
        method: create a collection, insert 3000 rows, create an index of the
                parametrized type on the Int64 field "word_count" and describe it
        expected: create index success and the described index matches the request
        """
        name = gen_collection_name()
        payload = {
            "collectionName": name,
            "schema": {
                "fields": [
                    {"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
                    {"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
                    {"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
                    {"fieldName": "book_intro", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}}
                ]
            }
        }
        logger.info(f"create collection {name} with payload: {payload}")
        rsp = self.collection_client.collection_create(payload)
        assert rsp['code'] == 0
        # insert a single batch of rows with normalized random vectors
        data = [
            {
                "book_id": j,
                "word_count": j,
                "book_describe": f"book_{j}",
                "book_intro": preprocessing.normalize(
                    [np.array([random.random() for _ in range(dim)])])[0].tolist(),
            }
            for j in range(3000)
        ]
        rsp = self.vector_client.vector_insert({"collectionName": name, "data": data})
        assert rsp['code'] == 0
        c = Collection(name)
        c.flush()
        # list index before creation (expected to be empty; the response is not asserted)
        rsp = self.index_client.index_list(name)
        # create index; the index type comes from the parametrized argument
        payload = {
            "collectionName": name,
            "indexParams": [{"fieldName": "word_count", "indexName": "word_count_vector",
                             "indexType": index_type,
                             "params": {"index_type": index_type}}]
        }
        rsp = self.index_client.index_create(payload)
        assert rsp['code'] == 0
        # fixed pause before inspecting the index
        # NOTE(review): presumably gives the index time to become visible - confirm
        time.sleep(10)
        # list index after creation (expected to be non-empty; the response is not asserted)
        rsp = self.index_client.index_list(collection_name=name)
        # describe index
        rsp = self.index_client.index_describe(collection_name=name, index_name="word_count_vector")
        self._assert_described_indexes(rsp, payload['indexParams'], ("fieldName", "indexName", "indexType"))

    @pytest.mark.parametrize("index_type", ["BIN_FLAT", "BIN_IVF_FLAT"])
    @pytest.mark.parametrize("metric_type", ["JACCARD", "HAMMING"])
    @pytest.mark.parametrize("dim", [128])
    def test_index_for_binary_vector_field(self, dim, metric_type, index_type):
        """
        target: test create index on a binary vector field
        method: create a collection with a BinaryVector field, insert 3000 rows,
                create an index with the parametrized index/metric type and describe it
        expected: create index success and the described index matches the request
        """
        name = gen_collection_name()
        payload = {
            "collectionName": name,
            "schema": {
                "fields": [
                    {"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
                    {"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
                    {"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
                    {"fieldName": "binary_vector", "dataType": "BinaryVector", "elementTypeParams": {"dim": f"{dim}"}}
                ]
            }
        }
        logger.info(f"create collection {name} with payload: {payload}")
        rsp = self.collection_client.collection_create(payload)
        assert rsp['code'] == 0
        # insert a single batch of rows
        data = [
            {
                "book_id": j,
                "word_count": j,
                "book_describe": f"book_{j}",
                "binary_vector": gen_vector(datatype="BinaryVector", dim=dim)
            }
            for j in range(3000)
        ]
        rsp = self.vector_client.vector_insert({"collectionName": name, "data": data})
        assert rsp['code'] == 0
        c = Collection(name)
        c.flush()
        # list index before creation (expected to be empty; the response is not asserted)
        rsp = self.index_client.index_list(name)
        # create index
        index_name = "binary_vector_index"
        payload = {
            "collectionName": name,
            "indexParams": [{"fieldName": "binary_vector", "indexName": index_name,
                             "metricType": metric_type, "indexType": index_type,
                             "params": {"index_type": index_type}}]
        }
        if index_type == "BIN_IVF_FLAT":
            # only the BIN_IVF_FLAT case sends an nlist value
            payload["indexParams"][0]["params"]["nlist"] = "16384"
        rsp = self.index_client.index_create(payload)
        assert rsp['code'] == 0
        # fixed pause before inspecting the index
        # NOTE(review): presumably gives the index time to become visible - confirm
        time.sleep(10)
        # list index after creation (expected to be non-empty; the response is not asserted)
        rsp = self.index_client.index_list(collection_name=name)
        # describe index
        rsp = self.index_client.index_describe(collection_name=name, index_name=index_name)
        self._assert_described_indexes(rsp, payload['indexParams'], ("fieldName", "indexName", "indexType"))

    @pytest.mark.parametrize("insert_round", [1])
    @pytest.mark.parametrize("auto_id", [True])
    @pytest.mark.parametrize("is_partition_key", [True])
    @pytest.mark.parametrize("enable_dynamic_schema", [True])
    @pytest.mark.parametrize("nb", [3000])
    @pytest.mark.parametrize("dim", [128])
    @pytest.mark.parametrize("tokenizer", ['standard', 'jieba'])
    @pytest.mark.parametrize("index_type", ['SPARSE_INVERTED_INDEX', 'SPARSE_WAND'])
    @pytest.mark.parametrize("bm25_k1", [1.2, 1.5])
    @pytest.mark.parametrize("bm25_b", [0.7, 0.5])
    @pytest.mark.xfail(reason="issue: https://github.com/milvus-io/milvus/issues/36365")
    def test_create_index_for_full_text_search(self, nb, dim, insert_round, auto_id, is_partition_key,
                                               enable_dynamic_schema, tokenizer, index_type, bm25_k1, bm25_b):
        """
        target: test create a BM25 index on the sparse vector output of a BM25 function
        method: create a collection with an analyzer-enabled VarChar field and a BM25
                function writing to "sparse_vector", insert generated text, create the
                index with the parametrized bm25_k1 / bm25_b, then read it back via pymilvus
        expected: the index reports metric type BM25 and the requested index type and params
        """
        # create a collection
        name = gen_collection_name()
        payload = {
            "collectionName": name,
            "schema": {
                "autoId": auto_id,
                "enableDynamicField": enable_dynamic_schema,
                "fields": [
                    {"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
                    {"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": is_partition_key,
                     "elementTypeParams": {}},
                    {"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
                    {"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
                    {"fieldName": "document_content", "dataType": "VarChar",
                     "elementTypeParams": {"max_length": "1000", "enable_analyzer": True,
                                           "analyzer_params": {
                                               "tokenizer": tokenizer,
                                           },
                                           "enable_match": True}},
                    {"fieldName": "sparse_vector", "dataType": "SparseFloatVector"},
                ],
                "functions": [
                    {
                        "name": "bm25_fn",
                        "type": "BM25",
                        "inputFieldNames": ["document_content"],
                        "outputFieldNames": ["sparse_vector"],
                        "params": {}
                    }
                ]
            },
        }
        rsp = self.collection_client.collection_create(payload)
        assert rsp['code'] == 0
        rsp = self.collection_client.collection_describe(name)
        logger.info(f"rsp: {rsp}")
        assert rsp['code'] == 0
        # pick the text generator whose locale matches the tokenizer under test
        fake = {"standard": fake_en, "jieba": fake_zh}.get(tokenizer)
        if fake is None:
            raise ValueError("Invalid tokenizer")
        # insert data
        for i in range(insert_round):
            data = []
            for j in range(nb):
                idx = i * nb + j
                # book_id is only supplied by the client when auto_id is off
                row = {} if auto_id else {"book_id": idx}
                row.update({
                    "user_id": idx % 100,
                    "word_count": j,
                    "book_describe": f"book_{idx}",
                    "document_content": fake.text().lower(),
                })
                if enable_dynamic_schema:
                    row[f"dynamic_field_{i}"] = i
                data.append(row)
            rsp = self.vector_client.vector_insert({
                "collectionName": name,
                "data": data,
            })
            assert rsp['code'] == 0
            assert rsp['data']['insertCount'] == nb
        # create index
        payload = {
            "collectionName": name,
            "indexParams": [
                {"fieldName": "sparse_vector", "indexName": "sparse_vector",
                 "metricType": "BM25",
                 "indexType": index_type,
                 "params": {"bm25_k1": bm25_k1, "bm25_b": bm25_b}
                 }
            ]
        }
        rsp = self.index_client.index_create(payload)
        # without this check a failed creation would leave nothing to verify below
        assert rsp['code'] == 0
        c = Collection(name)
        index_info = [index.to_dict() for index in c.indexes]
        logger.info(f"index_info: {index_info}")
        for info in index_info:
            index_param = info['index_param']
            assert index_param['metric_type'] == 'BM25'
            assert index_param["params"]['bm25_k1'] == bm25_k1
            assert index_param["params"]['bm25_b'] == bm25_b
            assert index_param['index_type'] == index_type
@pytest.mark.L1
class TestCreateIndexNegative(TestBase):
    """Negative cases for creating indexes through the RESTful v2 index client."""

    @pytest.mark.parametrize("index_type", ["BIN_FLAT", "BIN_IVF_FLAT"])
    @pytest.mark.parametrize("metric_type", ["L2", "IP", "COSINE"])
    @pytest.mark.parametrize("dim", [128])
    def test_index_for_binary_vector_field_with_mismatch_metric_type(self, dim, metric_type, index_type):
        """
        target: test create index on a binary vector field with a mismatched metric type
        method: create a collection with a BinaryVector field, insert 3000 rows, then
                request an index on that field with metric type L2 / IP / COSINE
        expected: the request is answered with code 1100 and a message containing
                  "not supported"
        """
        name = gen_collection_name()
        schema_fields = [
            {"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
            {"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
            {"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
            {"fieldName": "binary_vector", "dataType": "BinaryVector", "elementTypeParams": {"dim": f"{dim}"}},
        ]
        payload = {"collectionName": name, "schema": {"fields": schema_fields}}
        logger.info(f"create collection {name} with payload: {payload}")
        self.collection_client.collection_create(payload)

        # insert one batch of rows, each carrying a generated binary vector
        rows = [
            {
                "book_id": row_id,
                "word_count": row_id,
                "book_describe": f"book_{row_id}",
                "binary_vector": gen_vector(datatype="BinaryVector", dim=dim),
            }
            for row_id in range(3000)
        ]
        self.vector_client.vector_insert({"collectionName": name, "data": rows})

        collection = Collection(name)
        collection.flush()

        # list index before creation (expected to be empty; the response is not asserted)
        self.index_client.index_list(name)

        # build the index request; the index type is carried only inside "params"
        build_params = {"index_type": index_type}
        if index_type == "BIN_IVF_FLAT":
            build_params["nlist"] = "16384"
        index_request = {
            "collectionName": name,
            "indexParams": [{
                "fieldName": "binary_vector",
                "indexName": "binary_vector_index",
                "metricType": metric_type,
                "params": build_params,
            }],
        }
        rsp = self.index_client.index_create(index_request)
        assert rsp['code'] == 1100
        assert "not supported" in rsp['message']