mirror of https://github.com/milvus-io/milvus.git
test: add partition key isolation test case (#39403)
/kind improvement

Signed-off-by: zhuwenxing <wenxing.zhu@zilliz.com>
pull/39503/head
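The tests below exercise Milvus partition key isolation: with the collection property "partitionkey.isolation" enabled, filtered vector search is expected to work only when the expression pins the partition key field with '==' (optionally AND-ed with other filters). A minimal sketch of the idea, assuming "collection" is an already-loaded pymilvus Collection using the schema from the tests below (the field names "scalar_6", "scalar_12" and "emb" are taken from those tests):

    import random

    # enable isolation on the collection (property name as used in the tests below)
    collection.set_properties({"partitionkey.isolation": "true"})
    query_vec = [random.random() for _ in range(768)]
    # supported: equality on the partition key, AND-ed with other filters
    collection.search(data=[query_vec], anns_field="emb",
                      expr="scalar_6 == '1' and scalar_12 == '1'",
                      param={"metric_type": "L2", "params": {"nprobe": 16}}, limit=10)
    # expected to be rejected: IN / OR / '!=' / range filters on the partition key,
    # or expressions that do not reference the partition key at all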
parent eab102e976
commit 40e6fcd868

@@ -0,0 +1,357 @@
from base.client_base import TestcaseBase
from common import common_func as cf
from common.common_type import CaseLabel
from utils.util_log import test_log as log
import time
import pytest
import random
from pymilvus import (
    list_collections,
    FieldSchema, CollectionSchema, DataType,
    Collection, utility
)
import pandas as pd
import faker
fake = faker.Faker()


prefix = "par_key_isolation_"

class TestPartitionKeyIsolation(TestcaseBase):
    """Test case of partition key isolation"""

    @pytest.mark.tags(CaseLabel.L3)
    def test_par_key_isolation_with_valid_expr(self):
        # create
        self._connect()
        collection_name = cf.gen_unique_str(prefix)
        partition_key = "scalar_6"
        enable_isolation = "true"
        if collection_name in list_collections():
            log.info(f"collection {collection_name} exists, drop it")
            Collection(name=collection_name).drop()
        fields = [
            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
            FieldSchema(name="scalar_3", dtype=DataType.VARCHAR, max_length=1000,
                        is_partition_key=bool(partition_key == "scalar_3")),
            FieldSchema(name="scalar_6", dtype=DataType.VARCHAR, max_length=1000,
                        is_partition_key=bool(partition_key == "scalar_6")),
            FieldSchema(name="scalar_9", dtype=DataType.VARCHAR, max_length=1000,
                        is_partition_key=bool(partition_key == "scalar_9")),
            FieldSchema(name="scalar_12", dtype=DataType.VARCHAR, max_length=1000,
                        is_partition_key=bool(partition_key == "scalar_12")),
            FieldSchema(name="scalar_5_linear", dtype=DataType.VARCHAR, max_length=1000,
                        is_partition_key=bool(partition_key == "scalar_5_linear")),
            FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=768)
        ]
        schema = CollectionSchema(fields=fields, description="test collection", enable_dynamic_field=True,
                                  num_partitions=1)
        collection = Collection(name=collection_name, schema=schema, num_partitions=1)

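        # enable partition key isolation on the collection; searches must then
        # filter on the partition key field with '==' (see the expression lists below)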
        collection.set_properties({"partitionkey.isolation": enable_isolation})
        log.info(f"collection {collection_name} created: {collection.describe()}")
        index_params = {"metric_type": "L2", "index_type": "HNSW", "params": {"M": 30, "efConstruction": 360}}
        log.info(f"collection {collection_name} created")
        batch_size = 1000
        data_size = 10000
        epoch = data_size // batch_size
        remainder = data_size % batch_size
        all_data = []
        for i in range(epoch + 1):
            if i == epoch:
                if remainder == 0:
                    break
                batch_size = remainder
            start_idx = i * batch_size
            end_idx = (i + 1) * batch_size
            t0 = time.time()
            data = {
                "id": [i for i in range(start_idx, end_idx)],
                "scalar_3": [str(i % 3) for i in range(start_idx, end_idx)],
                "scalar_6": [str(i % 6) for i in range(start_idx, end_idx)],
                "scalar_9": [str(i % 9) for i in range(start_idx, end_idx)],
                "scalar_12": [str(i % 12) for i in range(start_idx, end_idx)],
                "scalar_5_linear": [str(i % 5) for i in range(start_idx, end_idx)],
                "emb": [[random.random() for _ in range(768)] for _ in range(batch_size)]
            }
            df = pd.DataFrame(data)
            all_data.append(df)
            log.info(f"generate test data {batch_size} cost time {time.time() - t0}")
            collection.insert(df)
        all_df = pd.concat(all_data)
        collection.compact()
        collection.wait_for_compaction_completed()
        t0 = time.time()
        collection.create_index("emb", index_params=index_params)
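        # poll index building until no rows are pending before loading the collection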
        index_list = utility.list_indexes(collection_name=collection_name)
        for index_name in index_list:
            progress = utility.index_building_progress(collection_name=collection_name, index_name=index_name)
            while progress["pending_index_rows"] > 0:
                time.sleep(30)
                progress = utility.index_building_progress(collection_name=collection_name, index_name=index_name)
                log.info(f"collection {collection_name} index {index_name} progress: {progress}")
            log.info(f"collection {collection_name} index {index_name} progress: {progress}")
        tt = time.time() - t0
        log.info(f"create index cost time {tt}")
        collection.compact()
        collection.wait_for_compaction_completed()
        t0 = time.time()
        collection.load()
        log.info(f"load collection cost time {time.time() - t0}")
        num = collection.num_entities
        log.info(f"collection {collection_name} loaded, num_entities: {num}")

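        # Expressions expected to work under isolation: the partition key (scalar_6)
        # is pinned with '==' and any other filters are AND-ed with it.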
        valid_expressions = [
            "scalar_6 == '1' and scalar_12 == '1'",
            "scalar_6 == '1' and scalar_12 > '1'",
            "scalar_6 == '3' and (scalar_12 == '1' or scalar_3 != '1')",
            "scalar_6 == '2' and ('4' < scalar_12 < '6' or scalar_3 == '1')",
            "scalar_6 == '5' and scalar_12 in ['1', '3', '5']",
            "scalar_6 == '1'"
        ]
        for expr in valid_expressions:
            res = collection.search(
                data=[[random.random() for _ in range(768)]],
                anns_field="emb",
                expr=expr,
                param={"metric_type": "L2", "params": {"nprobe": 16}},
                limit=10000,
                output_fields=["scalar_3", "scalar_6", "scalar_12"]
            )
            log.info(f"search res {res}")
            true_res = all_df.query(expr)
            log.info(f"true res {true_res}")
            assert len(res[0]) == len(true_res)

    @pytest.mark.tags(CaseLabel.L3)
    def test_par_key_isolation_with_unsupported_expr(self):
        # create
        self._connect()
        collection_name = cf.gen_unique_str(prefix)
        partition_key = "scalar_6"
        enable_isolation = "true"
        if collection_name in list_collections():
            log.info(f"collection {collection_name} exists, drop it")
            Collection(name=collection_name).drop()
        fields = [
            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
            FieldSchema(name="scalar_3", dtype=DataType.VARCHAR, max_length=1000,
                        is_partition_key=bool(partition_key == "scalar_3")),
            FieldSchema(name="scalar_6", dtype=DataType.VARCHAR, max_length=1000,
                        is_partition_key=bool(partition_key == "scalar_6")),
            FieldSchema(name="scalar_9", dtype=DataType.VARCHAR, max_length=1000,
                        is_partition_key=bool(partition_key == "scalar_9")),
            FieldSchema(name="scalar_12", dtype=DataType.VARCHAR, max_length=1000,
                        is_partition_key=bool(partition_key == "scalar_12")),
            FieldSchema(name="scalar_5_linear", dtype=DataType.VARCHAR, max_length=1000,
                        is_partition_key=bool(partition_key == "scalar_5_linear")),
            FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=768)
        ]
        schema = CollectionSchema(fields=fields, description="test collection", enable_dynamic_field=True,
                                  num_partitions=1)
        collection = Collection(name=collection_name, schema=schema, num_partitions=1)

        collection.set_properties({"partitionkey.isolation": enable_isolation})
        log.info(f"collection {collection_name} created: {collection.describe()}")
        index_params = {"metric_type": "L2", "index_type": "HNSW", "params": {"M": 30, "efConstruction": 360}}
        log.info(f"collection {collection_name} created")
        batch_size = 1000
        data_size = 10000
        epoch = data_size // batch_size
        remainder = data_size % batch_size
        for i in range(epoch + 1):
            if i == epoch:
                if remainder == 0:
                    break
                batch_size = remainder
            start_idx = i * batch_size
            end_idx = (i + 1) * batch_size
            t0 = time.time()
            data = {
                "id": [i for i in range(start_idx, end_idx)],
                "scalar_3": [str(i % 3) for i in range(start_idx, end_idx)],
                "scalar_6": [str(i % 6) for i in range(start_idx, end_idx)],
                "scalar_9": [str(i % 9) for i in range(start_idx, end_idx)],
                "scalar_12": [str(i % 12) for i in range(start_idx, end_idx)],
                "scalar_5_linear": [str(i % 5) for i in range(start_idx, end_idx)],
                "emb": [[random.random() for _ in range(768)] for _ in range(batch_size)]
            }
            df = pd.DataFrame(data)
            log.info(f"generate test data {batch_size} cost time {time.time() - t0}")
            collection.insert(df)
        collection.compact()
        collection.wait_for_compaction_completed()
        t0 = time.time()
        collection.create_index("emb", index_params=index_params)
        index_list = utility.list_indexes(collection_name=collection_name)
        for index_name in index_list:
            progress = utility.index_building_progress(collection_name=collection_name, index_name=index_name)
            while progress["pending_index_rows"] > 0:
                time.sleep(30)
                progress = utility.index_building_progress(collection_name=collection_name, index_name=index_name)
                log.info(f"collection {collection_name} index {index_name} progress: {progress}")
            log.info(f"collection {collection_name} index {index_name} progress: {progress}")
        tt = time.time() - t0
        log.info(f"create index cost time {tt}")
        collection.compact()
        collection.wait_for_compaction_completed()
        t0 = time.time()
        collection.load()
        log.info(f"load collection cost time {time.time() - t0}")
        num = collection.num_entities
        log.info(f"collection {collection_name} loaded, num_entities: {num}")

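        # Expressions expected to be rejected under isolation: IN / NOT IN on the
        # partition key, OR-ing the partition key condition with others, '!=' or
        # range filters on the key, or filters that skip the partition key entirely.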
        invalid_expressions = [
            "scalar_6 in ['1', '2']",
            "scalar_6 not in ['1', '2']",
            "scalar_6 == '1' or scalar_3 == '1'",
            "scalar_6 != '1'",
            "scalar_6 > '1'",
            "'1' < scalar_6 < '3'",
            "scalar_3 == '1'"  # scalar_3 is not the partition key
        ]
        false_result = []
        for expr in invalid_expressions:
            try:
                res = collection.search(
                    data=[[random.random() for _ in range(768)]],
                    anns_field="emb",
                    expr=expr,
                    param={"metric_type": "L2", "params": {"nprobe": 16}},
                    limit=10,
                    output_fields=["scalar_6"]
                )
                log.info(f"search with {expr} get res {res}")
                false_result.append(expr)
            except Exception as e:
                log.info(f"search with unsupported expr {expr} get {e}")
        if len(false_result) > 0:
            log.info(f"search with unsupported expr {false_result} did not raise an error\n")
            assert False

    @pytest.mark.tags(CaseLabel.L3)
    def test_par_key_isolation_without_partition_key(self):
        # create
        self._connect()
        collection_name = cf.gen_unique_str(prefix)
        partition_key = "None"
        enable_isolation = "true"
        if collection_name in list_collections():
            log.info(f"collection {collection_name} exists, drop it")
            Collection(name=collection_name).drop()
        fields = [
            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
            FieldSchema(name="scalar_3", dtype=DataType.VARCHAR, max_length=1000,
                        is_partition_key=bool(partition_key == "scalar_3")),
            FieldSchema(name="scalar_6", dtype=DataType.VARCHAR, max_length=1000,
                        is_partition_key=bool(partition_key == "scalar_6")),
            FieldSchema(name="scalar_9", dtype=DataType.VARCHAR, max_length=1000,
                        is_partition_key=bool(partition_key == "scalar_9")),
            FieldSchema(name="scalar_12", dtype=DataType.VARCHAR, max_length=1000,
                        is_partition_key=bool(partition_key == "scalar_12")),
            FieldSchema(name="scalar_5_linear", dtype=DataType.VARCHAR, max_length=1000,
                        is_partition_key=bool(partition_key == "scalar_5_linear")),
            FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=768)
        ]
        schema = CollectionSchema(fields=fields, description="test collection", enable_dynamic_field=True,
                                  num_partitions=1)
        collection = Collection(name=collection_name, schema=schema)
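        # the schema above has no partition key, so enabling isolation should fail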
        try:
            collection.set_properties({"partitionkey.isolation": enable_isolation})
            assert False
        except Exception as e:
            log.info(f"set_properties failed without partition key {e}")
            assert "partition key isolation mode is enabled but no partition key field is set" in str(e)

    @pytest.mark.tags(CaseLabel.L3)
    def test_set_par_key_isolation_after_vector_indexed(self):
        # create
        self._connect()
        collection_name = cf.gen_unique_str(prefix)
        partition_key = "scalar_6"
        enable_isolation = "false"
        if collection_name in list_collections():
            log.info(f"collection {collection_name} exists, drop it")
            Collection(name=collection_name).drop()
        fields = [
            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
            FieldSchema(name="scalar_3", dtype=DataType.VARCHAR, max_length=1000,
                        is_partition_key=bool(partition_key == "scalar_3")),
            FieldSchema(name="scalar_6", dtype=DataType.VARCHAR, max_length=1000,
                        is_partition_key=bool(partition_key == "scalar_6")),
            FieldSchema(name="scalar_9", dtype=DataType.VARCHAR, max_length=1000,
                        is_partition_key=bool(partition_key == "scalar_9")),
            FieldSchema(name="scalar_12", dtype=DataType.VARCHAR, max_length=1000,
                        is_partition_key=bool(partition_key == "scalar_12")),
            FieldSchema(name="scalar_5_linear", dtype=DataType.VARCHAR, max_length=1000,
                        is_partition_key=bool(partition_key == "scalar_5_linear")),
            FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=768)
        ]
        schema = CollectionSchema(fields=fields, description="test collection", enable_dynamic_field=True,
                                  num_partitions=1)
        collection = Collection(name=collection_name, schema=schema, num_partitions=1)

        collection.set_properties({"partitionkey.isolation": enable_isolation})
        log.info(f"collection {collection_name} created: {collection.describe()}")
        index_params = {"metric_type": "L2", "index_type": "HNSW", "params": {"M": 30, "efConstruction": 360}}
        log.info(f"collection {collection_name} created")
        batch_size = 1000
        data_size = 10000
        epoch = data_size // batch_size
        remainder = data_size % batch_size
        for i in range(epoch + 1):
            if i == epoch:
                if remainder == 0:
                    break
                batch_size = remainder
            start_idx = i * batch_size
            end_idx = (i + 1) * batch_size
            t0 = time.time()
            data = {
                "id": [i for i in range(start_idx, end_idx)],
                "scalar_3": [str(i % 3) for i in range(start_idx, end_idx)],
                "scalar_6": [str(i % 6) for i in range(start_idx, end_idx)],
                "scalar_9": [str(i % 9) for i in range(start_idx, end_idx)],
                "scalar_12": [str(i % 12) for i in range(start_idx, end_idx)],
                "scalar_5_linear": [str(i % 5) for i in range(start_idx, end_idx)],
                "emb": [[random.random() for _ in range(768)] for _ in range(batch_size)]
            }
            df = pd.DataFrame(data)
            log.info(f"generate test data {batch_size} cost time {time.time() - t0}")
            collection.insert(df)
        collection.compact()
        collection.wait_for_compaction_completed()
        t0 = time.time()
        collection.create_index("emb", index_params=index_params)
        index_list = utility.list_indexes(collection_name=collection_name)
        for index_name in index_list:
            progress = utility.index_building_progress(collection_name=collection_name, index_name=index_name)
            while progress["pending_index_rows"] > 0:
                time.sleep(30)
                progress = utility.index_building_progress(collection_name=collection_name, index_name=index_name)
                log.info(f"collection {collection_name} index {index_name} progress: {progress}")
            log.info(f"collection {collection_name} index {index_name} progress: {progress}")
        tt = time.time() - t0
        log.info(f"create index cost time {tt}")
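        # changing the isolation property after the vector index is built should be
        # rejected; the index has to be dropped before the property can be changed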
        result = True
        try:
            collection.set_properties({"partitionkey.isolation": "true"})
        except Exception as e:
            result = False
            log.info(f"set_properties after vector indexed {e}")
        assert result is False
        collection.drop_index()
        collection.set_properties({"partitionkey.isolation": "true"})
        collection.create_index("emb", index_params=index_params)
        collection.load()
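        # after re-enabling isolation and rebuilding the index, a search that pins
        # the partition key with '==' should return results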
        res = collection.search(
            data=[[random.random() for _ in range(768)]],
            anns_field="emb",
            expr="scalar_6 == '1' and scalar_3 == '1'",
            param={"metric_type": "L2", "params": {"nprobe": 16}},
            limit=10,
            output_fields=["scalar_6", "scalar_3"]
        )
        log.info(f"search res {res}")
        assert len(res[0]) > 0