test: add custom analyzer testcases (#37781)

Signed-off-by: zhuwenxing <wenxing.zhu@zilliz.com>
zhuwenxing 2024-11-27 16:08:36 +08:00 committed by GitHub
parent 302650ae0e
commit 8188e1472d
3 changed files with 882 additions and 159 deletions


@@ -222,6 +222,18 @@ def manual_check_text_match(df, word, col):
        id_list.append(row["id"])
    return id_list
def get_top_english_tokens(counter, n=10):
english_pattern = re.compile(r'^[a-zA-Z]+$')
english_tokens = {
word: freq
for word, freq in counter.items()
if english_pattern.match(str(word))
}
english_counter = Counter(english_tokens)
return english_counter.most_common(n)
def analyze_documents(texts, language="en"):
    tokenizer = custom_tokenizer(language)
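A minimal usage sketch of the new helper (illustration only, not part of this commit; it assumes get_top_english_tokens from the hunk above is importable):

from collections import Counter

# Token frequencies as an analyzer might produce them; the non-ASCII token
# "数据库" is dropped by the ^[a-zA-Z]+$ filter inside get_top_english_tokens.
freq = Counter(["milvus", "milvus", "milvus", "vector", "vector", "search", "数据库"])

print(get_top_english_tokens(freq, n=2))
# expected: [('milvus', 3), ('vector', 2)]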


@@ -66,6 +66,7 @@ ml-dtypes==0.2.0
# for full text search
bm25s==0.2.0
jieba==0.42.1
Unidecode==1.3.8
# for perf test
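The full-text-search dependencies above (bm25s, jieba, and the newly added Unidecode) point at tokenization and ASCII-folding checks in the analyzer tests. A hedged sketch of how jieba and Unidecode can be combined for such a check (illustrative only, not code from this PR):

import jieba                     # Chinese word segmentation
from unidecode import unidecode  # best-effort ASCII transliteration

text = "Milvus 是一个向量数据库"

# Segment the mixed Chinese/English text into tokens.
tokens = jieba.lcut(text)

# Fold each token to ASCII, e.g. to compare against an ASCII-folding analyzer's output.
ascii_tokens = [unidecode(token) for token in tokens]

print(tokens)
print(ascii_tokens)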

File diff suppressed because it is too large.
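Since the diff containing the custom analyzer test cases themselves is suppressed, here is a rough sketch of the kind of schema setup such tests revolve around, written against the pymilvus 2.5 MilvusClient API; every concrete name, dimension, and analyzer setting below is an illustrative assumption, not taken from the suppressed file:

from pymilvus import MilvusClient, DataType

# Assumed local deployment and collection name -- illustrative only.
client = MilvusClient(uri="http://localhost:19530")

schema = MilvusClient.create_schema()
schema.add_field(field_name="id", datatype=DataType.INT64, is_primary=True)
# A vector field is required by Milvus; dim=8 is a placeholder.
schema.add_field(field_name="emb", datatype=DataType.FLOAT_VECTOR, dim=8)
# enable_analyzer/enable_match plus analyzer_params are the options the
# custom analyzer tests exercise; this particular configuration is just one example.
schema.add_field(
    field_name="text",
    datatype=DataType.VARCHAR,
    max_length=2000,
    enable_analyzer=True,
    enable_match=True,
    analyzer_params={"tokenizer": "standard", "filter": ["lowercase"]},
)

client.create_collection(collection_name="custom_analyzer_demo", schema=schema)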