mirror of https://github.com/milvus-io/milvus.git
test: add custom analyzer testcases (#37781)
Signed-off-by: zhuwenxing <wenxing.zhu@zilliz.com>
pull/38049/head
parent
302650ae0e
commit
8188e1472d
|
@ -222,6 +222,18 @@ def manual_check_text_match(df, word, col):
|
||||||
id_list.append(row["id"])
|
id_list.append(row["id"])
|
||||||
return id_list
|
return id_list
|
||||||
|
|
||||||
|
|
||||||
|
def get_top_english_tokens(counter, n=10):
    """Return the most frequent tokens made up solely of ASCII letters.

    Args:
        counter: Mapping of token -> frequency (anything exposing
            ``.items()``, e.g. a ``collections.Counter``). Keys are passed
            through ``str()`` only for the pattern check; the original key
            object is kept in the result.
        n: Maximum number of entries to return. Forwarded unchanged to
            ``Counter.most_common``.

    Returns:
        A list of ``(token, frequency)`` tuples ordered from most to least
        frequent, restricted to tokens consisting entirely of ``a-z``/``A-Z``.
    """
    # fullmatch instead of ``^...$`` with match(): ``$`` also matches just
    # before a trailing newline, which would let a token such as "abc\n"
    # slip through as "English".
    english_pattern = re.compile(r'[a-zA-Z]+')

    english_counter = Counter({
        word: freq
        for word, freq in counter.items()
        if english_pattern.fullmatch(str(word))
    })
    return english_counter.most_common(n)
|
||||||
|
|
||||||
def analyze_documents(texts, language="en"):
|
def analyze_documents(texts, language="en"):
|
||||||
|
|
||||||
tokenizer = custom_tokenizer(language)
|
tokenizer = custom_tokenizer(language)
|
||||||
|
|
|
@ -66,6 +66,7 @@ ml-dtypes==0.2.0
|
||||||
# for full text search
|
# for full text search
|
||||||
bm25s==0.2.0
|
bm25s==0.2.0
|
||||||
jieba==0.42.1
|
jieba==0.42.1
|
||||||
|
Unidecode==1.3.8
|
||||||
|
|
||||||
|
|
||||||
# for perf test
|
# for perf test
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue