mirror of https://github.com/milvus-io/milvus.git
test: add partition key isolation test case (#39403)

/kind improvement

---------

Signed-off-by: zhuwenxing <wenxing.zhu@zilliz.com>

parent eab102e976
commit 40e6fcd868

@@ -0,0 +1,357 @@
from base.client_base import TestcaseBase
from common import common_func as cf
from common.common_type import CaseLabel
from utils.util_log import test_log as log
import time
import pytest
import random
from pymilvus import (
    list_collections,
    FieldSchema, CollectionSchema, DataType,
    Collection, utility
)
import pandas as pd
import faker

fake = faker.Faker()


prefix = "par_key_isolation_"


class TestPartitionKeyIsolation(TestcaseBase):
    """ Test case of partition key isolation"""

    @pytest.mark.tags(CaseLabel.L3)
    def test_par_key_isolation_with_valid_expr(self):
        # create
        self._connect()
        collection_name = cf.gen_unique_str(prefix)
        partition_key = "scalar_6"
        enable_isolation = "true"
        if collection_name in list_collections():
            log.info(f"collection {collection_name} exists, drop it")
            Collection(name=collection_name).drop()
        fields = [
            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
            FieldSchema(name="scalar_3", dtype=DataType.VARCHAR, max_length=1000,
                        is_partition_key=bool(partition_key == "scalar_3")),
            FieldSchema(name="scalar_6", dtype=DataType.VARCHAR, max_length=1000,
                        is_partition_key=bool(partition_key == "scalar_6")),
            FieldSchema(name="scalar_9", dtype=DataType.VARCHAR, max_length=1000,
                        is_partition_key=bool(partition_key == "scalar_9")),
            FieldSchema(name="scalar_12", dtype=DataType.VARCHAR, max_length=1000,
                        is_partition_key=bool(partition_key == "scalar_12")),
            FieldSchema(name="scalar_5_linear", dtype=DataType.VARCHAR, max_length=1000,
                        is_partition_key=bool(partition_key == "scalar_5_linear")),
            FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=768)
        ]
        schema = CollectionSchema(fields=fields, description="test collection", enable_dynamic_field=True,
                                  num_partitions=1)
        collection = Collection(name=collection_name, schema=schema, num_partitions=1)

        collection.set_properties({"partitionkey.isolation": enable_isolation})
        log.info(f"collection {collection_name} created: {collection.describe()}")
        index_params = {"metric_type": "L2", "index_type": "HNSW", "params": {"M": 30, "efConstruction": 360}}
        log.info(f"collection {collection_name} created")
        # insert data in batches and keep each batch for ground-truth comparison
        batch_size = 1000
        data_size = 10000
        epoch = data_size // batch_size
        remainder = data_size % batch_size
        all_data = []
        for i in range(epoch + 1):
            if i == epoch:
                if remainder == 0:
                    break
                batch_size = remainder
            start_idx = i * batch_size
            end_idx = (i + 1) * batch_size
            t0 = time.time()
            data = {
                "id": [i for i in range(start_idx, end_idx)],
                "scalar_3": [str(i % 3) for i in range(start_idx, end_idx)],
                "scalar_6": [str(i % 6) for i in range(start_idx, end_idx)],
                "scalar_9": [str(i % 9) for i in range(start_idx, end_idx)],
                "scalar_12": [str(i % 12) for i in range(start_idx, end_idx)],
                "scalar_5_linear": [str(i % 5) for i in range(start_idx, end_idx)],
                "emb": [[random.random() for _ in range(768)] for _ in range(batch_size)]
            }
            df = pd.DataFrame(data)
            all_data.append(df)
            log.info(f"generate test data {batch_size} cost time {time.time() - t0}")
            collection.insert(df)
        all_df = pd.concat(all_data)
        collection.compact()
        collection.wait_for_compaction_completed()
        t0 = time.time()
        collection.create_index("emb", index_params=index_params)
        # wait until every index reports no pending rows
        index_list = utility.list_indexes(collection_name=collection_name)
        for index_name in index_list:
            progress = utility.index_building_progress(collection_name=collection_name, index_name=index_name)
            while progress["pending_index_rows"] > 0:
                time.sleep(30)
                progress = utility.index_building_progress(collection_name=collection_name, index_name=index_name)
                log.info(f"collection {collection_name} index {index_name} progress: {progress}")
            log.info(f"collection {collection_name} index {index_name} progress: {progress}")
        tt = time.time() - t0
        log.info(f"create index cost time {tt}")
        collection.compact()
        collection.wait_for_compaction_completed()
        t0 = time.time()
        collection.load()
        log.info(f"load collection cost time {time.time() - t0}")
        num = collection.num_entities
        log.info(f"collection {collection_name} loaded, num_entities: {num}")

        # expressions supported under isolation: each one pins the partition key
        # (scalar_6) with a top-level equality, optionally ANDed with other filters
        valid_expressions = [
            "scalar_6 == '1' and scalar_12 == '1'",
            "scalar_6 == '1' and scalar_12 > '1'",
            "scalar_6 == '3' and (scalar_12 == '1' or scalar_3 != '1')",
            "scalar_6 == '2' and ('4' < scalar_12 < '6' or scalar_3 == '1')",
            "scalar_6 == '5' and scalar_12 in ['1', '3', '5']",
            "scalar_6 == '1'"
        ]
        for expr in valid_expressions:
            res = collection.search(
                data=[[random.random() for _ in range(768)]],
                anns_field="emb",
                expr=expr,
                param={"metric_type": "L2", "params": {"nprobe": 16}},
                limit=10000,
                output_fields=["scalar_3", "scalar_6", "scalar_12"]
            )
            log.info(f"search res {res}")
            # use pandas as ground truth for the expected hit count
            true_res = all_df.query(expr)
            log.info(f"true res {true_res}")
            assert len(res[0]) == len(true_res)

    @pytest.mark.tags(CaseLabel.L3)
    def test_par_key_isolation_with_unsupported_expr(self):
        # create
        self._connect()
        collection_name = cf.gen_unique_str(prefix)
        partition_key = "scalar_6"
        enable_isolation = "true"
        if collection_name in list_collections():
            log.info(f"collection {collection_name} exists, drop it")
            Collection(name=collection_name).drop()
        fields = [
            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
            FieldSchema(name="scalar_3", dtype=DataType.VARCHAR, max_length=1000,
                        is_partition_key=bool(partition_key == "scalar_3")),
            FieldSchema(name="scalar_6", dtype=DataType.VARCHAR, max_length=1000,
                        is_partition_key=bool(partition_key == "scalar_6")),
            FieldSchema(name="scalar_9", dtype=DataType.VARCHAR, max_length=1000,
                        is_partition_key=bool(partition_key == "scalar_9")),
            FieldSchema(name="scalar_12", dtype=DataType.VARCHAR, max_length=1000,
                        is_partition_key=bool(partition_key == "scalar_12")),
            FieldSchema(name="scalar_5_linear", dtype=DataType.VARCHAR, max_length=1000,
                        is_partition_key=bool(partition_key == "scalar_5_linear")),
            FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=768)
        ]
        schema = CollectionSchema(fields=fields, description="test collection", enable_dynamic_field=True,
                                  num_partitions=1)
        collection = Collection(name=collection_name, schema=schema, num_partitions=1)

        collection.set_properties({"partitionkey.isolation": enable_isolation})
        log.info(f"collection {collection_name} created: {collection.describe()}")
        index_params = {"metric_type": "L2", "index_type": "HNSW", "params": {"M": 30, "efConstruction": 360}}
        log.info(f"collection {collection_name} created")
        batch_size = 1000
        data_size = 10000
        epoch = data_size // batch_size
        remainder = data_size % batch_size
        for i in range(epoch + 1):
            if i == epoch:
                if remainder == 0:
                    break
                batch_size = remainder
            start_idx = i * batch_size
            end_idx = (i + 1) * batch_size
            t0 = time.time()
            data = {
                "id": [i for i in range(start_idx, end_idx)],
                "scalar_3": [str(i % 3) for i in range(start_idx, end_idx)],
                "scalar_6": [str(i % 6) for i in range(start_idx, end_idx)],
                "scalar_9": [str(i % 9) for i in range(start_idx, end_idx)],
                "scalar_12": [str(i % 12) for i in range(start_idx, end_idx)],
                "scalar_5_linear": [str(i % 5) for i in range(start_idx, end_idx)],
                "emb": [[random.random() for _ in range(768)] for _ in range(batch_size)]
            }
            df = pd.DataFrame(data)
            log.info(f"generate test data {batch_size} cost time {time.time() - t0}")
            collection.insert(df)
        collection.compact()
        collection.wait_for_compaction_completed()
        t0 = time.time()
        collection.create_index("emb", index_params=index_params)
        index_list = utility.list_indexes(collection_name=collection_name)
        for index_name in index_list:
            progress = utility.index_building_progress(collection_name=collection_name, index_name=index_name)
            while progress["pending_index_rows"] > 0:
                time.sleep(30)
                progress = utility.index_building_progress(collection_name=collection_name, index_name=index_name)
                log.info(f"collection {collection_name} index {index_name} progress: {progress}")
            log.info(f"collection {collection_name} index {index_name} progress: {progress}")
        tt = time.time() - t0
        log.info(f"create index cost time {tt}")
        collection.compact()
        collection.wait_for_compaction_completed()
        t0 = time.time()
        collection.load()
        log.info(f"load collection cost time {time.time() - t0}")
        num = collection.num_entities
        log.info(f"collection {collection_name} loaded, num_entities: {num}")

        # expressions expected to be rejected under isolation: they either skip the
        # partition key, use it without a plain equality, or join the equality with OR
        invalid_expressions = [
            "scalar_6 in ['1', '2']",
            "scalar_6 not in ['1', '2']",
            "scalar_6 == '1' or scalar_3 == '1'",
            "scalar_6 != '1'",
            "scalar_6 > '1'",
            "'1' < scalar_6 < '3'",
            "scalar_3 == '1'"  # scalar_3 is not partition key
        ]
        false_result = []
        for expr in invalid_expressions:
            try:
                res = collection.search(
                    data=[[random.random() for _ in range(768)]],
                    anns_field="emb",
                    expr=expr,
                    param={"metric_type": "L2", "params": {"nprobe": 16}},
                    limit=10,
                    output_fields=["scalar_6"]
                )
                log.info(f"search with {expr} get res {res}")
                false_result.append(expr)
            except Exception as e:
                log.info(f"search with unsupported expr {expr} get {e}")
        if len(false_result) > 0:
            log.info(f"search with unsupported expr {false_result}, but not raise error\n")
            assert False

    @pytest.mark.tags(CaseLabel.L3)
    def test_par_key_isolation_without_partition_key(self):
        # create
        self._connect()
        collection_name = cf.gen_unique_str(prefix)
        partition_key = "None"
        enable_isolation = "true"
        if collection_name in list_collections():
            log.info(f"collection {collection_name} exists, drop it")
            Collection(name=collection_name).drop()
        fields = [
            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
            FieldSchema(name="scalar_3", dtype=DataType.VARCHAR, max_length=1000,
                        is_partition_key=bool(partition_key == "scalar_3")),
            FieldSchema(name="scalar_6", dtype=DataType.VARCHAR, max_length=1000,
                        is_partition_key=bool(partition_key == "scalar_6")),
            FieldSchema(name="scalar_9", dtype=DataType.VARCHAR, max_length=1000,
                        is_partition_key=bool(partition_key == "scalar_9")),
            FieldSchema(name="scalar_12", dtype=DataType.VARCHAR, max_length=1000,
                        is_partition_key=bool(partition_key == "scalar_12")),
            FieldSchema(name="scalar_5_linear", dtype=DataType.VARCHAR, max_length=1000,
                        is_partition_key=bool(partition_key == "scalar_5_linear")),
            FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=768)
        ]
        schema = CollectionSchema(fields=fields, description="test collection", enable_dynamic_field=True,
                                  num_partitions=1)
        collection = Collection(name=collection_name, schema=schema)
        # enabling isolation must fail because no field is marked as partition key
        try:
            collection.set_properties({"partitionkey.isolation": enable_isolation})
            assert False
        except Exception as e:
            log.info(f"set_properties failed without partition key {e}")
            assert "partition key isolation mode is enabled but no partition key field is set" in str(e)

    @pytest.mark.tags(CaseLabel.L3)
    def test_set_par_key_isolation_after_vector_indexed(self):
        # create
        self._connect()
        collection_name = cf.gen_unique_str(prefix)
        partition_key = "scalar_6"
        enable_isolation = "false"
        if collection_name in list_collections():
            log.info(f"collection {collection_name} exists, drop it")
            Collection(name=collection_name).drop()
        fields = [
            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
            FieldSchema(name="scalar_3", dtype=DataType.VARCHAR, max_length=1000,
                        is_partition_key=bool(partition_key == "scalar_3")),
            FieldSchema(name="scalar_6", dtype=DataType.VARCHAR, max_length=1000,
                        is_partition_key=bool(partition_key == "scalar_6")),
            FieldSchema(name="scalar_9", dtype=DataType.VARCHAR, max_length=1000,
                        is_partition_key=bool(partition_key == "scalar_9")),
            FieldSchema(name="scalar_12", dtype=DataType.VARCHAR, max_length=1000,
                        is_partition_key=bool(partition_key == "scalar_12")),
            FieldSchema(name="scalar_5_linear", dtype=DataType.VARCHAR, max_length=1000,
                        is_partition_key=bool(partition_key == "scalar_5_linear")),
            FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=768)
        ]
        schema = CollectionSchema(fields=fields, description="test collection", enable_dynamic_field=True,
                                  num_partitions=1)
        collection = Collection(name=collection_name, schema=schema, num_partitions=1)

        collection.set_properties({"partitionkey.isolation": enable_isolation})
        log.info(f"collection {collection_name} created: {collection.describe()}")
        index_params = {"metric_type": "L2", "index_type": "HNSW", "params": {"M": 30, "efConstruction": 360}}
        log.info(f"collection {collection_name} created")
        batch_size = 1000
        data_size = 10000
        epoch = data_size // batch_size
        remainder = data_size % batch_size
        for i in range(epoch + 1):
            if i == epoch:
                if remainder == 0:
                    break
                batch_size = remainder
            start_idx = i * batch_size
            end_idx = (i + 1) * batch_size
            t0 = time.time()
            data = {
                "id": [i for i in range(start_idx, end_idx)],
                "scalar_3": [str(i % 3) for i in range(start_idx, end_idx)],
                "scalar_6": [str(i % 6) for i in range(start_idx, end_idx)],
                "scalar_9": [str(i % 9) for i in range(start_idx, end_idx)],
                "scalar_12": [str(i % 12) for i in range(start_idx, end_idx)],
                "scalar_5_linear": [str(i % 5) for i in range(start_idx, end_idx)],
                "emb": [[random.random() for _ in range(768)] for _ in range(batch_size)]
            }
            df = pd.DataFrame(data)
            log.info(f"generate test data {batch_size} cost time {time.time() - t0}")
            collection.insert(df)
        collection.compact()
        collection.wait_for_compaction_completed()
        t0 = time.time()
        collection.create_index("emb", index_params=index_params)
        index_list = utility.list_indexes(collection_name=collection_name)
        for index_name in index_list:
            progress = utility.index_building_progress(collection_name=collection_name, index_name=index_name)
            while progress["pending_index_rows"] > 0:
                time.sleep(30)
                progress = utility.index_building_progress(collection_name=collection_name, index_name=index_name)
                log.info(f"collection {collection_name} index {index_name} progress: {progress}")
            log.info(f"collection {collection_name} index {index_name} progress: {progress}")
        tt = time.time() - t0
        log.info(f"create index cost time {tt}")
        # turning isolation on after the vector index is built should be rejected
        result = True
        try:
            collection.set_properties({"partitionkey.isolation": "true"})
        except Exception as e:
            result = False
            log.info(f"set_properties after vector indexed {e}")
        assert result is False
        # after dropping the index, isolation can be enabled and the index rebuilt
        collection.drop_index()
        collection.set_properties({"partitionkey.isolation": "true"})
        collection.create_index("emb", index_params=index_params)
        collection.load()
        res = collection.search(
            data=[[random.random() for _ in range(768)]],
            anns_field="emb",
            expr="scalar_6 == '1' and scalar_3 == '1'",
            param={"metric_type": "L2", "params": {"nprobe": 16}},
            limit=10,
            output_fields=["scalar_6", "scalar_3"]
        )
        log.info(f"search res {res}")
        assert len(res[0]) > 0
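
The four cases above all follow the same enable-and-query pattern. Below is a minimal, self-contained sketch of that pattern for reference; the host/port, collection name, field names and sizes are illustrative assumptions, while the "partitionkey.isolation" property, the schema options and the call sequence mirror the test code.

import random

import pandas as pd
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection

# assumed local Milvus instance
connections.connect(host="127.0.0.1", port="19530")

fields = [
    FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
    # once isolation is enabled, every filter must pin this field with "=="
    FieldSchema(name="key", dtype=DataType.VARCHAR, max_length=100, is_partition_key=True),
    FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=8),
]
collection = Collection(name="par_key_isolation_demo",
                        schema=CollectionSchema(fields=fields, description="demo"))
collection.set_properties({"partitionkey.isolation": "true"})

df = pd.DataFrame({
    "id": list(range(100)),
    "key": [str(i % 5) for i in range(100)],
    "emb": [[random.random() for _ in range(8)] for _ in range(100)],
})
collection.insert(df)
collection.create_index("emb", index_params={"metric_type": "L2", "index_type": "HNSW",
                                             "params": {"M": 8, "efConstruction": 64}})
collection.load()

# an equality filter on the partition key is accepted; forms such as
# "key != '1'" or "key in ['1', '2']" are expected to be rejected
res = collection.search(data=[[random.random() for _ in range(8)]], anns_field="emb",
                        expr="key == '1'",
                        param={"metric_type": "L2", "params": {"nprobe": 16}},
                        limit=10, output_fields=["key"])
print(res[0])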