package ctokenizer /* #cgo pkg-config: milvus_core #include // free #include "segcore/tokenizer_c.h" #include "segcore/token_stream_c.h" */ import "C" import ( "encoding/json" "fmt" "path" "unsafe" "github.com/milvus-io/milvus/internal/util/tokenizerapi" "github.com/milvus-io/milvus/pkg/v2/util/merr" "github.com/milvus-io/milvus/pkg/v2/util/paramtable" ) func NewTokenizer(param string) (tokenizerapi.Tokenizer, error) { param, err := CheckAndFillParams(param) if err != nil { return nil, err } paramPtr := C.CString(param) defer C.free(unsafe.Pointer(paramPtr)) var ptr C.CTokenizer status := C.create_tokenizer(paramPtr, &ptr) if err := HandleCStatus(&status, "failed to create tokenizer"); err != nil { return nil, err } return NewCTokenizer(ptr), nil } func CheckAndFillParams(params string) (string, error) { if len(params) == 0 { return "", nil } var paramMaps map[string]any flag := false err := json.Unmarshal([]byte(params), ¶mMaps) if err != nil { return "", merr.WrapErrAsInputError(fmt.Errorf("unmarshal analyzer params failed with json error: %s", err.Error())) } tokenizer, ok := paramMaps["tokenizer"] if !ok { // skip check if no tokenizer params return params, nil } switch value := tokenizer.(type) { case string: // return if use build-in tokenizer return params, nil case map[string]any: flag, err = CheckAndFillTokenizerParams(value) if err != nil { return "", err } default: return "", merr.WrapErrAsInputError(fmt.Errorf("analyzer params set tokenizer with unknown type")) } // remarshal json params if params map was changed. if flag { bytes, err := json.Marshal(paramMaps) if err != nil { return "", merr.WrapErrAsInputError(fmt.Errorf("marshal analyzer params failed with json error: %s", err.Error())) } return string(bytes), nil } return params, nil } // fill some milvus params to tokenizer params func CheckAndFillTokenizerParams(params map[string]any) (bool, error) { v, ok := params["type"] if !ok { return false, merr.WrapErrAsInputError(fmt.Errorf("costom tokenizer must set type")) } tokenizerType, ok := v.(string) if !ok { return false, merr.WrapErrAsInputError(fmt.Errorf("costom tokenizer type must be string")) } switch tokenizerType { case "lindera": cfg := paramtable.Get() if _, ok := params["dict_build_dir"]; ok { return false, merr.WrapErrAsInputError(fmt.Errorf("costom tokenizer dict_build_dir was system params, should not be set")) } // build lindera to LocalResourcePath/lindera/dict_kind params["dict_build_dir"] = path.Join(cfg.FunctionCfg.LocalResourcePath.GetValue(), "lindera") v, ok := params["dict_kind"] if !ok { return false, merr.WrapErrAsInputError(fmt.Errorf("lindera tokenizer must set dict_kind")) } dictKind, ok := v.(string) if !ok { return false, merr.WrapErrAsInputError(fmt.Errorf("lindera tokenizer dict kind must be string")) } dictUrlsMap := cfg.FunctionCfg.LinderaDownloadUrls.GetValue() if _, ok := params["download_urls"]; ok { return false, merr.WrapErrAsInputError(fmt.Errorf("costom tokenizer download_urls was system params, should not be set")) } if value, ok := dictUrlsMap["."+dictKind]; ok { // use download urls set in milvus yaml params["download_urls"] = paramtable.ParseAsStings(value) } return true, nil default: return false, nil } } func ValidateTokenizer(param string) error { param, err := CheckAndFillParams(param) if err != nil { return err } paramPtr := C.CString(param) defer C.free(unsafe.Pointer(paramPtr)) status := C.validate_tokenizer(paramPtr) if err := HandleCStatus(&status, "failed to create tokenizer"); err != nil { return err } return nil }