mirror of https://github.com/milvus-io/milvus.git
test: add custom analyzer testcases (#37781)
Signed-off-by: zhuwenxing <wenxing.zhu@zilliz.com>
pull/38049/head
parent
302650ae0e
commit
8188e1472d
|
@ -222,6 +222,18 @@ def manual_check_text_match(df, word, col):
|
|||
id_list.append(row["id"])
|
||||
return id_list
|
||||
|
||||
|
||||
def get_top_english_tokens(counter, n=10):
    """Return the ``n`` most frequent purely-English tokens in ``counter``.

    A token is treated as English when its string form consists solely of
    ASCII letters (``a-z`` / ``A-Z``); anything containing digits,
    punctuation, whitespace, or non-ASCII characters is ignored.

    Args:
        counter: Mapping of token -> frequency (for example a
            ``collections.Counter``). Only ``.items()`` is required.
        n: Maximum number of entries to return. Passed straight to
            ``Counter.most_common``.

    Returns:
        A list of ``(token, frequency)`` tuples ordered from most to least
        frequent. Keys are the original tokens, not their ``str()`` form.
    """
    # Use fullmatch instead of match with "^...$": "$" also matches right
    # before a trailing newline, which would let a token like "word\n"
    # slip through as English.
    english_pattern = re.compile(r'[a-zA-Z]+')
    english_counter = Counter({
        word: freq
        for word, freq in counter.items()
        if english_pattern.fullmatch(str(word))
    })
    return english_counter.most_common(n)
|
||||
|
||||
def analyze_documents(texts, language="en"):
|
||||
|
||||
tokenizer = custom_tokenizer(language)
|
||||
|
|
|
@ -66,6 +66,7 @@ ml-dtypes==0.2.0
|
|||
# for full text search
|
||||
bm25s==0.2.0
|
||||
jieba==0.42.1
|
||||
Unidecode==1.3.8
|
||||
|
||||
|
||||
# for perf test
|
||||
|
|
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue