test: add custom analyzer testcases (#37781)

Signed-off-by: zhuwenxing <wenxing.zhu@zilliz.com>
zhuwenxing 2024-11-27 16:08:36 +08:00 committed by GitHub
parent 302650ae0e
commit 8188e1472d
3 changed files with 882 additions and 159 deletions


@@ -222,6 +222,18 @@ def manual_check_text_match(df, word, col):
        id_list.append(row["id"])
    return id_list
def get_top_english_tokens(counter, n=10):
english_pattern = re.compile(r'^[a-zA-Z]+$')
english_tokens = {
word: freq
for word, freq in counter.items()
if english_pattern.match(str(word))
}
english_counter = Counter(english_tokens)
return english_counter.most_common(n)
def analyze_documents(texts, language="en"):
    tokenizer = custom_tokenizer(language)
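A minimal usage sketch of the new helper (illustration only, not part of this commit; it assumes get_top_english_tokens from the hunk above is importable):

from collections import Counter

# Token frequencies as an analyzer might produce them; the non-ASCII token
# "数据库" is dropped by the ^[a-zA-Z]+$ filter inside get_top_english_tokens.
freq = Counter(["milvus", "milvus", "milvus", "vector", "vector", "search", "数据库"])

print(get_top_english_tokens(freq, n=2))
# expected: [('milvus', 3), ('vector', 2)]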


@@ -66,6 +66,7 @@ ml-dtypes==0.2.0
# for full text search
bm25s==0.2.0
jieba==0.42.1
Unidecode==1.3.8
# for perf test
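The full-text-search dependencies above (bm25s, jieba, and the newly added Unidecode) point at tokenization and ASCII-folding checks in the analyzer tests. A hedged sketch of how jieba and Unidecode can be combined for such a check (illustrative only, not code from this PR):

import jieba                     # Chinese word segmentation
from unidecode import unidecode  # best-effort ASCII transliteration

text = "Milvus 是一个向量数据库"

# Segment the mixed Chinese/English text into tokens.
tokens = jieba.lcut(text)

# Fold each token to ASCII, e.g. to compare against an ASCII-folding analyzer's output.
ascii_tokens = [unidecode(token) for token in tokens]

print(tokens)
print(ascii_tokens)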

File diff suppressed because it is too large.
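Since the diff containing the custom analyzer test cases themselves is suppressed, here is a rough sketch of the kind of schema setup such tests revolve around, written against the pymilvus 2.5 MilvusClient API; every concrete name, dimension, and analyzer setting below is an illustrative assumption, not taken from the suppressed file:

from pymilvus import MilvusClient, DataType

# Assumed local deployment and collection name -- illustrative only.
client = MilvusClient(uri="http://localhost:19530")

schema = MilvusClient.create_schema()
schema.add_field(field_name="id", datatype=DataType.INT64, is_primary=True)
# A vector field is required by Milvus; dim=8 is a placeholder.
schema.add_field(field_name="emb", datatype=DataType.FLOAT_VECTOR, dim=8)
# enable_analyzer/enable_match plus analyzer_params are the options the
# custom analyzer tests exercise; this particular configuration is just one example.
schema.add_field(
    field_name="text",
    datatype=DataType.VARCHAR,
    max_length=2000,
    enable_analyzer=True,
    enable_match=True,
    analyzer_params={"tokenizer": "standard", "filter": ["lowercase"]},
)

client.create_collection(collection_name="custom_analyzer_demo", schema=schema)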