diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/Cargo.lock b/internal/core/thirdparty/tantivy/tantivy-binding/Cargo.lock index b67dda0955..9b0c70903c 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/Cargo.lock +++ b/internal/core/thirdparty/tantivy/tantivy-binding/Cargo.lock @@ -82,6 +82,12 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "anyhow" +version = "1.0.95" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34ac096ce696dc2fcabef30516bb13c0a68a11d30131d3df6f04711467681b04" + [[package]] name = "arc-swap" version = "1.7.1" @@ -102,15 +108,21 @@ dependencies = [ [[package]] name = "async-trait" -version = "0.1.85" +version = "0.1.86" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f934833b4b7233644e5848f235df3f57ed8c80f1528a26c3dfa13d2147fa056" +checksum = "644dd749086bf3771a2fbc5f256fdb982d53f011c7d5d560304eafeecebce79d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] +[[package]] +name = "atomic-waker" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0" + [[package]] name = "atty" version = "0.2.14" @@ -149,6 +161,15 @@ version = "0.22.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" +[[package]] +name = "bincode" +version = "1.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad" +dependencies = [ + "serde", +] + [[package]] name = "bitflags" version = "1.3.2" @@ -157,9 +178,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "bitflags" -version = "2.7.0" +version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1be3f42a67d6d345ecd59f675f3f012d6974981560836e938c22b424b85ce1be" +checksum = "8f68f53c83ab957f72c32642f3868eec03eb974d1fb82e453128456482613d36" [[package]] name = "bitpacking" @@ -192,15 +213,27 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.96", + "syn 2.0.98", ] +[[package]] +name = "bumpalo" +version = "3.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1628fb46dfa0b37568d12e5edd512553eccf6a22a78e8bde00bb4aed84d5bdbf" + [[package]] name = "byteorder" version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" +[[package]] +name = "bytes" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f61dac84819c6588b558454b194026eb1f09c293b9036ae9b159e74e73ab6cf9" + [[package]] name = "cbindgen" version = "0.26.0" @@ -208,8 +241,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "da6bc11b07529f16944307272d5bd9b22530bc7d05751717c9d416586cedab49" dependencies = [ "clap", - "heck", - "indexmap", + "heck 0.4.1", + "indexmap 1.9.3", "log", "proc-macro2", "quote", @@ -222,9 +255,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.2.9" +version = "1.2.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8293772165d9345bdaaa39b45b2109591e63fe5e6fbc23c6ff930a048aa310b" +checksum = "c7777341816418c02e033934a09f20dc0ccaf65a5201ef8a450ae0105a573fda" dependencies = [ "jobserver", "libc", @@ -261,7 +294,7 @@ dependencies = [ "atty", "bitflags 1.3.2", "clap_lex", - "indexmap", + "indexmap 1.9.3", "strsim 0.10.0", "termcolor", "textwrap", @@ -291,6 +324,22 @@ dependencies = [ "crossbeam-utils", ] +[[package]] +name = "core-foundation" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91e195e091a93c46f7102ec7818a2aa394e1e1771c3ab4825963fa03e45afb8f" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = "core-foundation-sys" +version = "0.8.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" + [[package]] name = "crc32fast" version = "1.4.2" @@ -336,9 +385,30 @@ checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" [[package]] name = "crunchy" -version = "0.2.2" +version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" +checksum = "43da5946c66ffcc7745f48db692ffbb10a83bfe0afd96235c5c2a4fb23994929" + +[[package]] +name = "csv" +version = "1.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "acdc4883a9c96732e4733212c01447ebd805833b7275a73ca3ee080fd77afdaf" +dependencies = [ + "csv-core", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "csv-core" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5efa2b3d7902f4b634a20cae3c9c4e6209dc4779feb6863329607560143efa70" +dependencies = [ + "memchr", +] [[package]] name = "darling" @@ -361,7 +431,7 @@ dependencies = [ "proc-macro2", "quote", "strsim 0.11.1", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -372,7 +442,7 @@ checksum = "d336a2a514f6ccccaa3e09b02d41d35330c07ddf03a62165fcec10bb561c7806" dependencies = [ "darling_core", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -385,6 +455,48 @@ dependencies = [ "serde", ] +[[package]] +name = "derive_builder" +version = "0.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "507dfb09ea8b7fa618fcf76e953f4f5e192547945816d5358edffe39f6f94947" +dependencies = [ + "derive_builder_macro", +] + +[[package]] +name = "derive_builder_core" +version = "0.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d5bcf7b024d6835cfb3d473887cd966994907effbe9227e8c8219824d06c4e8" +dependencies = [ + "darling", + "proc-macro2", + "quote", + "syn 2.0.98", +] + +[[package]] +name = "derive_builder_macro" +version = "0.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab63b0e2bf4d5928aff72e83a7dace85d7bba5fe12dcc3c5a572d78caffd3f3c" +dependencies = [ + "derive_builder_core", + "syn 2.0.98", +] + +[[package]] +name = "displaydoc" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.98", +] + [[package]] name = "downcast-rs" version = "1.2.1" @@ -397,6 +509,88 @@ version = "1.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0" +[[package]] +name = "encoding" +version = "0.2.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b0d943856b990d12d3b55b359144ff341533e516d94098b1d3fc1ac666d36ec" +dependencies = [ + "encoding-index-japanese", + "encoding-index-korean", + "encoding-index-simpchinese", + "encoding-index-singlebyte", + "encoding-index-tradchinese", +] + +[[package]] +name = "encoding-index-japanese" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04e8b2ff42e9a05335dbf8b5c6f7567e5591d0d916ccef4e0b1710d32a0d0c91" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-korean" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4dc33fb8e6bcba213fe2f14275f0963fd16f0a02c878e3095ecfdf5bee529d81" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-simpchinese" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d87a7194909b9118fc707194baa434a4e3b0fb6a5a757c73c3adb07aa25031f7" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-singlebyte" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3351d5acffb224af9ca265f435b859c7c01537c0849754d3db3fdf2bfe2ae84a" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-tradchinese" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd0e20d5688ce3cab59eb3ef3a2083a5c77bf496cb798dc6fcdb75f323890c18" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding_index_tests" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a246d82be1c9d791c5dfde9a2bd045fc3cbba3fa2b11ad558f27d01712f00569" + +[[package]] +name = "encoding_rs" +version = "0.8.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "encoding_rs_io" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1cc3c5651fb62ab8aa3103998dade57efdd028544bd300516baa31840c252a83" +dependencies = [ + "encoding_rs", +] + [[package]] name = "env_filter" version = "0.1.3" @@ -469,6 +663,28 @@ version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" +[[package]] +name = "filetime" +version = "0.2.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35c0522e981e68cbfa8c3f978441a5f34b30b96e146b33cd3359176b50fe8586" +dependencies = [ + "cfg-if", + "libc", + "libredox", + "windows-sys 0.59.0", +] + +[[package]] +name = "flate2" +version = "1.0.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c936bfdafb507ebbf50b8074c54fa31c5be9a1e7e5f467dd659697041407d07c" +dependencies = [ + "crc32fast", + "miniz_oxide", +] + [[package]] name = "fnv" version = "1.0.7" @@ -481,6 +697,30 @@ version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a0d2fde1f7b3d48b8395d5f2de76c18a528bd6a9cdde438df747bfcba3e05d6f" +[[package]] +name = "foreign-types" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1" +dependencies = [ + "foreign-types-shared", +] + +[[package]] +name = "foreign-types-shared" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" + +[[package]] +name = "form_urlencoded" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e13624c2627564efccf4934284bdd98cbaa14e79b0b5a141218e507b3a823456" +dependencies = [ + "percent-encoding", +] + [[package]] name = "fs4" version = "0.8.4" @@ -547,7 +787,7 @@ checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -597,7 +837,19 @@ checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7" dependencies = [ "cfg-if", "libc", - "wasi", + "wasi 0.11.0+wasi-snapshot-preview1", +] + +[[package]] +name = "getrandom" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43a49c392881ce6d5c3b8cb70f98717b7c07aabbdff06687b9030dbfbe2725f8" +dependencies = [ + "cfg-if", + "libc", + "wasi 0.13.3+wasi-0.2.2", + "windows-targets", ] [[package]] @@ -606,6 +858,31 @@ version = "0.31.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f" +[[package]] +name = "glob" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8d1add55171497b4705a648c6b583acafb01d58050a51727785f0b2c8e0a2b2" + +[[package]] +name = "h2" +version = "0.4.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ccae279728d634d083c00f6099cb58f01cc99c145b84b8be2f6c74618d79922e" +dependencies = [ + "atomic-waker", + "bytes", + "fnv", + "futures-core", + "futures-sink", + "http", + "indexmap 2.7.1", + "slab", + "tokio", + "tokio-util", + "tracing", +] + [[package]] name = "hashbrown" version = "0.12.3" @@ -635,6 +912,12 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + [[package]] name = "hermit-abi" version = "0.1.19" @@ -650,12 +933,124 @@ version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e9025058dae765dee5070ec375f591e2ba14638c63feff74f13805a72e523163" +[[package]] +name = "http" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f16ca2af56261c99fba8bac40a10251ce8188205a4c448fbb745a2e4daa76fea" +dependencies = [ + "bytes", + "fnv", + "itoa", +] + +[[package]] +name = "http-body" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184" +dependencies = [ + "bytes", + "http", +] + +[[package]] +name = "http-body-util" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "793429d76616a256bcb62c2a2ec2bed781c8307e797e2598c50010f2bee2544f" +dependencies = [ + "bytes", + "futures-util", + "http", + "http-body", + "pin-project-lite", +] + +[[package]] +name = "httparse" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2d708df4e7140240a16cd6ab0ab65c972d7433ab77819ea693fde9c43811e2a" + [[package]] name = "humantime" version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" +[[package]] +name = "hyper" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc2b571658e38e0c01b1fdca3bbbe93c00d3d71693ff2770043f8c29bc7d6f80" +dependencies = [ + "bytes", + "futures-channel", + "futures-util", + "h2", + "http", + "http-body", + "httparse", + "itoa", + "pin-project-lite", + "smallvec", + "tokio", + "want", +] + +[[package]] +name = "hyper-rustls" +version = "0.27.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d191583f3da1305256f22463b9bb0471acad48a4e534a5218b9963e9c1f59b2" +dependencies = [ + "futures-util", + "http", + "hyper", + "hyper-util", + "rustls", + "rustls-pki-types", + "tokio", + "tokio-rustls", + "tower-service", +] + +[[package]] +name = "hyper-tls" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70206fc6890eaca9fde8a0bf71caa2ddfc9fe045ac9e5c70df101a7dbde866e0" +dependencies = [ + "bytes", + "http-body-util", + "hyper", + "hyper-util", + "native-tls", + "tokio", + "tokio-native-tls", + "tower-service", +] + +[[package]] +name = "hyper-util" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df2dcfbe0677734ab2f3ffa7fa7bfd4706bfdc1ef393f2ee30184aed67e631b4" +dependencies = [ + "bytes", + "futures-channel", + "futures-util", + "http", + "http-body", + "hyper", + "pin-project-lite", + "socket2", + "tokio", + "tower-service", + "tracing", +] + [[package]] name = "hyperloglogplus" version = "0.4.1" @@ -665,12 +1060,151 @@ dependencies = [ "serde", ] +[[package]] +name = "icu_collections" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db2fa452206ebee18c4b5c2274dbf1de17008e874b4dc4f0aea9d01ca79e4526" +dependencies = [ + "displaydoc", + "yoke", + "zerofrom", + "zerovec", +] + +[[package]] +name = "icu_locid" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13acbb8371917fc971be86fc8057c41a64b521c184808a698c02acc242dbf637" +dependencies = [ + "displaydoc", + "litemap", + "tinystr", + "writeable", + "zerovec", +] + +[[package]] +name = "icu_locid_transform" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "01d11ac35de8e40fdeda00d9e1e9d92525f3f9d887cdd7aa81d727596788b54e" +dependencies = [ + "displaydoc", + "icu_locid", + "icu_locid_transform_data", + "icu_provider", + "tinystr", + "zerovec", +] + +[[package]] +name = "icu_locid_transform_data" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fdc8ff3388f852bede6b579ad4e978ab004f139284d7b28715f773507b946f6e" + +[[package]] +name = "icu_normalizer" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19ce3e0da2ec68599d193c93d088142efd7f9c5d6fc9b803774855747dc6a84f" +dependencies = [ + "displaydoc", + "icu_collections", + "icu_normalizer_data", + "icu_properties", + "icu_provider", + "smallvec", + "utf16_iter", + "utf8_iter", + "write16", + "zerovec", +] + +[[package]] +name = "icu_normalizer_data" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8cafbf7aa791e9b22bec55a167906f9e1215fd475cd22adfcf660e03e989516" + +[[package]] +name = "icu_properties" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93d6020766cfc6302c15dbbc9c8778c37e62c14427cb7f6e601d849e092aeef5" +dependencies = [ + "displaydoc", + "icu_collections", + "icu_locid_transform", + "icu_properties_data", + "icu_provider", + "tinystr", + "zerovec", +] + +[[package]] +name = "icu_properties_data" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67a8effbc3dd3e4ba1afa8ad918d5684b8868b3b26500753effea8d2eed19569" + +[[package]] +name = "icu_provider" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ed421c8a8ef78d3e2dbc98a973be2f3770cb42b606e3ab18d6237c4dfde68d9" +dependencies = [ + "displaydoc", + "icu_locid", + "icu_provider_macros", + "stable_deref_trait", + "tinystr", + "writeable", + "yoke", + "zerofrom", + "zerovec", +] + +[[package]] +name = "icu_provider_macros" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.98", +] + [[package]] name = "ident_case" version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" +[[package]] +name = "idna" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "686f825264d630750a544639377bae737628043f20d38bbc029e8f29ea968a7e" +dependencies = [ + "idna_adapter", + "smallvec", + "utf8_iter", +] + +[[package]] +name = "idna_adapter" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "daca1df1c957320b2cf139ac61e7bd64fed304c5040df000a745aa1de3b4ef71" +dependencies = [ + "icu_normalizer", + "icu_properties", +] + [[package]] name = "indexmap" version = "1.9.3" @@ -681,6 +1215,22 @@ dependencies = [ "hashbrown 0.12.3", ] +[[package]] +name = "indexmap" +version = "2.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c9c992b02b5b4c94ea26e32fe5bccb7aa7d9f390ab5c1221ff895bc7ea8b652" +dependencies = [ + "equivalent", + "hashbrown 0.15.2", +] + +[[package]] +name = "ipnet" +version = "2.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "469fb0b9cefa57e3ef31275ee7cacb78f2fdca44e4765491884a2b119d4eb130" + [[package]] name = "is_terminal_polyfill" version = "1.70.1" @@ -726,6 +1276,25 @@ dependencies = [ "libc", ] +[[package]] +name = "js-sys" +version = "0.3.77" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1cfaf33c695fc6e08064efbc1f72ec937429614f25eef83af942d0e227c3a28f" +dependencies = [ + "once_cell", + "wasm-bindgen", +] + +[[package]] +name = "kanaria" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c0f9d9652540055ac4fded998a73aca97d965899077ab1212587437da44196ff" +dependencies = [ + "bitflags 1.3.2", +] + [[package]] name = "lazy_static" version = "1.5.0" @@ -750,6 +1319,137 @@ version = "0.2.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8355be11b20d696c8f18f6cc018c4e372165b1fa8126cef092399c9951984ffa" +[[package]] +name = "libredox" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c0ff37bd590ca25063e35af745c343cb7a0271906fb7b37e4813e8f79f00268d" +dependencies = [ + "bitflags 2.8.0", + "libc", + "redox_syscall", +] + +[[package]] +name = "lindera" +version = "0.38.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fff887f4b98539fb5f879ede50e17eb7eaafa5622c252cffe8280f42cafc6b7d" +dependencies = [ + "anyhow", + "bincode", + "byteorder", + "csv", + "kanaria", + "lindera-cc-cedict", + "lindera-dictionary", + "lindera-ipadic", + "lindera-ipadic-neologd", + "lindera-ko-dic", + "lindera-unidic", + "once_cell", + "regex", + "serde", + "serde_json", + "serde_yaml", + "strum", + "strum_macros", + "unicode-blocks", + "unicode-normalization", + "unicode-segmentation", + "yada", +] + +[[package]] +name = "lindera-cc-cedict" +version = "0.38.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a891e53b4fac346b314dcd60e4337b6deec7d972f338c004cfb9e0fe9868893" +dependencies = [ + "bincode", + "byteorder", + "lindera-dictionary", + "once_cell", + "tokio", +] + +[[package]] +name = "lindera-dictionary" +version = "0.38.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec716483ceb95aa84ac262cb766eef314b24257c343ca230daa71f856a278fe4" +dependencies = [ + "anyhow", + "bincode", + "byteorder", + "csv", + "derive_builder", + "encoding", + "encoding_rs", + "encoding_rs_io", + "flate2", + "glob", + "log", + "once_cell", + "reqwest", + "serde", + "tar", + "thiserror", + "yada", +] + +[[package]] +name = "lindera-ipadic" +version = "0.38.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "74d2b997fa2aeee8adccea2e4cb0ade771132f6e32093ed0beda8409e9a44018" +dependencies = [ + "bincode", + "byteorder", + "lindera-dictionary", + "once_cell", + "tokio", +] + +[[package]] +name = "lindera-ipadic-neologd" +version = "0.38.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "084c4e7b63500b235af2c7cbf8ee23735ae452971ac29bbd9a7f55a10eae50c4" +dependencies = [ + "bincode", + "byteorder", + "lindera-dictionary", + "once_cell", + "tokio", +] + +[[package]] +name = "lindera-ko-dic" +version = "0.38.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d921a5c39a634316125fc1572f00bc78f4351baeacaff2cf39953b2fc8493a55" +dependencies = [ + "bincode", + "byteorder", + "lindera-dictionary", + "once_cell", + "tokio", +] + +[[package]] +name = "lindera-unidic" +version = "0.38.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b95e25975448fa10f2bec61bfd012d8b0a1740d92ac7fc43e725edb1568ff7e" +dependencies = [ + "bincode", + "byteorder", + "lindera-dictionary", + "once_cell", + "tokio", +] + [[package]] name = "linux-raw-sys" version = "0.4.15" @@ -757,10 +1457,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d26c52dbd32dccf2d10cac7725f8eae5296885fb5703b261f7d0a0739ec807ab" [[package]] -name = "log" -version = "0.4.22" +name = "litemap" +version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24" +checksum = "4ee93343901ab17bd981295f2cf0026d4ad018c7c31ba84549a4ddbb47a45104" + +[[package]] +name = "log" +version = "0.4.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04cbf5b083de1c7e0222a7a51dbfdba1cbe1c6ab0b15e29fff3f6c077fd9cd9f" [[package]] name = "lru" @@ -801,6 +1507,12 @@ dependencies = [ "libc", ] +[[package]] +name = "mime" +version = "0.3.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" + [[package]] name = "minimal-lexical" version = "0.2.1" @@ -816,12 +1528,40 @@ dependencies = [ "adler2", ] +[[package]] +name = "mio" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2886843bf800fba2e3377cff24abf6379b4c4d5c6681eaf9ea5b0d15090450bd" +dependencies = [ + "libc", + "wasi 0.11.0+wasi-snapshot-preview1", + "windows-sys 0.52.0", +] + [[package]] name = "murmurhash32" version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2195bf6aa996a481483b29d62a7663eed3fe39600c460e323f8ff41e90bdd89b" +[[package]] +name = "native-tls" +version = "0.2.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0dab59f8e050d5df8e4dd87d9206fb6f65a483e20ac9fda365ade4fab353196c" +dependencies = [ + "libc", + "log", + "openssl", + "openssl-probe", + "openssl-sys", + "schannel", + "security-framework", + "security-framework-sys", + "tempfile", +] + [[package]] name = "nom" version = "7.1.3" @@ -859,15 +1599,59 @@ dependencies = [ [[package]] name = "once_cell" -version = "1.20.2" +version = "1.20.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775" +checksum = "945462a4b81e43c4e3ba96bd7b49d834c6f61198356aa858733bc4acf3cbe62e" [[package]] name = "oneshot" -version = "0.1.8" +version = "0.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e296cf87e61c9cfc1a61c3c63a0f7f286ed4554e0e22be84e8a38e1d264a2a29" +checksum = "79d72a7c0f743d2ebb0a2ad1d219db75fdc799092ed3a884c9144c42a31225bd" + +[[package]] +name = "openssl" +version = "0.10.70" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61cfb4e166a8bb8c9b55c500bc2308550148ece889be90f609377e58140f42c6" +dependencies = [ + "bitflags 2.8.0", + "cfg-if", + "foreign-types", + "libc", + "once_cell", + "openssl-macros", + "openssl-sys", +] + +[[package]] +name = "openssl-macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.98", +] + +[[package]] +name = "openssl-probe" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d05e27ee213611ffe7d6348b942e8f942b37114c00cc03cec254295a4a17852e" + +[[package]] +name = "openssl-sys" +version = "0.9.105" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b22d5b84be05a8d6947c7cb71f7c849aa0f112acd4bf51c2a7c1c988ac0a9dc" +dependencies = [ + "cc", + "libc", + "pkg-config", + "vcpkg", +] [[package]] name = "os_str_bytes" @@ -889,6 +1673,12 @@ version = "2.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f38d5652c16fde515bb1ecef450ab0f6a219d619a7274976324d5e377f7dceba" +[[package]] +name = "percent-encoding" +version = "2.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" + [[package]] name = "phf" version = "0.11.3" @@ -967,7 +1757,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6924ced06e1f7dfe3fa48d57b9f74f55d8915f5036121bef647ef4b204895fac" dependencies = [ "proc-macro2", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -1015,7 +1805,7 @@ version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" dependencies = [ - "getrandom", + "getrandom 0.2.15", ] [[package]] @@ -1048,6 +1838,15 @@ dependencies = [ "crossbeam-utils", ] +[[package]] +name = "redox_syscall" +version = "0.5.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "03a862b389f93e68874fbf580b9de08dd02facb9a788ebadaf4a3fd33cf58834" +dependencies = [ + "bitflags 2.8.0", +] + [[package]] name = "regex" version = "1.11.1" @@ -1077,6 +1876,65 @@ version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" +[[package]] +name = "reqwest" +version = "0.12.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43e734407157c3c2034e0258f5e4473ddb361b1e85f95a66690d67264d7cd1da" +dependencies = [ + "base64", + "bytes", + "encoding_rs", + "futures-core", + "futures-util", + "h2", + "http", + "http-body", + "http-body-util", + "hyper", + "hyper-rustls", + "hyper-tls", + "hyper-util", + "ipnet", + "js-sys", + "log", + "mime", + "native-tls", + "once_cell", + "percent-encoding", + "pin-project-lite", + "rustls-pemfile", + "serde", + "serde_json", + "serde_urlencoded", + "sync_wrapper", + "system-configuration", + "tokio", + "tokio-native-tls", + "tower", + "tower-service", + "url", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", + "windows-registry", +] + +[[package]] +name = "ring" +version = "0.17.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c17fa4cb658e3583423e915b9f3acc01cceaee1860e33d59ebae66adc3a2dc0d" +dependencies = [ + "cc", + "cfg-if", + "getrandom 0.2.15", + "libc", + "spin", + "untrusted", + "windows-sys 0.52.0", +] + [[package]] name = "rust-stemmers" version = "1.2.0" @@ -1095,23 +1953,62 @@ checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" [[package]] name = "rustc-hash" -version = "2.1.0" +version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7fb8039b3032c191086b10f11f319a6e99e1e82889c5cc6046f515c9db1d497" +checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d" [[package]] name = "rustix" -version = "0.38.43" +version = "0.38.44" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a78891ee6bf2340288408954ac787aa063d8e8817e9f53abb37c695c6d834ef6" +checksum = "fdb5bc1ae2baa591800df16c9ca78619bf65c0488b41b96ccec5d11220d8c154" dependencies = [ - "bitflags 2.7.0", + "bitflags 2.8.0", "errno", "libc", "linux-raw-sys", "windows-sys 0.59.0", ] +[[package]] +name = "rustls" +version = "0.23.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9fb9263ab4eb695e42321db096e3b8fbd715a59b154d5c88d82db2175b681ba7" +dependencies = [ + "once_cell", + "rustls-pki-types", + "rustls-webpki", + "subtle", + "zeroize", +] + +[[package]] +name = "rustls-pemfile" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dce314e5fee3f39953d46bb63bb8a46d40c2f8fb7cc5a3b6cab2bde9721d6e50" +dependencies = [ + "rustls-pki-types", +] + +[[package]] +name = "rustls-pki-types" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "917ce264624a4b4db1c364dcc35bfca9ded014d0a958cd47ad3e960e988ea51c" + +[[package]] +name = "rustls-webpki" +version = "0.102.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "64ca1bc8749bd4cf37b5ce386cc146580777b4e8572c7b97baf22c83f444bee9" +dependencies = [ + "ring", + "rustls-pki-types", + "untrusted", +] + [[package]] name = "rustversion" version = "1.0.19" @@ -1120,9 +2017,18 @@ checksum = "f7c45b9784283f1b2e7fb61b42047c2fd678ef0960d4f6f1eba131594cc369d4" [[package]] name = "ryu" -version = "1.0.18" +version = "1.0.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f" +checksum = "6ea1a2d0a644769cc99faa24c3ad26b379b786fe7c36fd3c546254801650e6dd" + +[[package]] +name = "schannel" +version = "0.1.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f29ebaa345f945cec9fbbc532eb307f0fdad8161f281b6369539c8d84876b3d" +dependencies = [ + "windows-sys 0.59.0", +] [[package]] name = "scopeguard" @@ -1130,6 +2036,29 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" +[[package]] +name = "security-framework" +version = "2.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "897b2245f0b511c87893af39b033e5ca9cce68824c4d7e7630b5a1d339658d02" +dependencies = [ + "bitflags 2.8.0", + "core-foundation", + "core-foundation-sys", + "libc", + "security-framework-sys", +] + +[[package]] +name = "security-framework-sys" +version = "2.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49db231d56a190491cb4aeda9527f1ad45345af50b0851622a7adb8c03b01c32" +dependencies = [ + "core-foundation-sys", + "libc", +] + [[package]] name = "serde" version = "1.0.217" @@ -1147,14 +2076,14 @@ checksum = "5a9bf7cf98d04a2b28aead066b7496853d4779c9cc183c440dbac457641e19a0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] name = "serde_json" -version = "1.0.135" +version = "1.0.138" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b0d7ba2887406110130a978386c4e1befb98c674b4fba677954e4db976630d9" +checksum = "d434192e7da787e94a6ea7e9670b26a036d0ca41e0b7efb2676dd32bae872949" dependencies = [ "itoa", "memchr", @@ -1162,6 +2091,31 @@ dependencies = [ "serde", ] +[[package]] +name = "serde_urlencoded" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3491c14715ca2294c4d6a88f15e84739788c1d030eed8c110436aafdaa2f3fd" +dependencies = [ + "form_urlencoded", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "serde_yaml" +version = "0.9.34+deprecated" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a8b1a1a2ebf674015cc02edccce75287f1a0130d394307b36743c2f5d504b47" +dependencies = [ + "indexmap 2.7.1", + "itoa", + "ryu", + "serde", + "unsafe-libyaml", +] + [[package]] name = "shlex" version = "1.3.0" @@ -1198,6 +2152,22 @@ version = "1.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" +[[package]] +name = "socket2" +version = "0.5.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c970269d99b64e60ec3bd6ad27270092a5394c4e309314b18ae3fe575695fbe8" +dependencies = [ + "libc", + "windows-sys 0.52.0", +] + +[[package]] +name = "spin" +version = "0.9.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" + [[package]] name = "stable_deref_trait" version = "1.2.0" @@ -1216,6 +2186,34 @@ version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" +[[package]] +name = "strum" +version = "0.26.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fec0f0aef304996cf250b31b5a10dee7980c85da9d759361292b8bca5a18f06" +dependencies = [ + "strum_macros", +] + +[[package]] +name = "strum_macros" +version = "0.26.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be" +dependencies = [ + "heck 0.5.0", + "proc-macro2", + "quote", + "rustversion", + "syn 2.0.98", +] + +[[package]] +name = "subtle" +version = "2.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" + [[package]] name = "syn" version = "1.0.109" @@ -1229,15 +2227,56 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.96" +version = "2.0.98" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d5d0adab1ae378d7f53bdebc67a39f1f151407ef230f0ce2883572f5d8985c80" +checksum = "36147f1a48ae0ec2b5b3bc5b537d267457555a10dc06f3dbc8cb11ba3006d3b1" dependencies = [ "proc-macro2", "quote", "unicode-ident", ] +[[package]] +name = "sync_wrapper" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0bf256ce5efdfa370213c1dabab5935a12e49f2c58d15e9eac2870d3b4f27263" +dependencies = [ + "futures-core", +] + +[[package]] +name = "synstructure" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.98", +] + +[[package]] +name = "system-configuration" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c879d448e9d986b661742763247d3693ed13609438cf3d006f51f5368a5ba6b" +dependencies = [ + "bitflags 2.8.0", + "core-foundation", + "system-configuration-sys", +] + +[[package]] +name = "system-configuration-sys" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e1d1b10ced5ca923a1fcb8d03e96b8d3268065d724548c0211415ff6ac6bac4" +dependencies = [ + "core-foundation-sys", + "libc", +] + [[package]] name = "tantivy" version = "0.23.0" @@ -1304,6 +2343,7 @@ dependencies = [ "jieba-rs", "lazy_static", "libc", + "lindera", "log", "regex", "scopeguard", @@ -1399,14 +2439,25 @@ dependencies = [ ] [[package]] -name = "tempfile" -version = "3.15.0" +name = "tar" +version = "0.4.43" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a8a559c81686f576e8cd0290cd2a24a2a9ad80c98b3478856500fcbd7acd704" +checksum = "c65998313f8e17d0d553d28f91a0df93e4dbbbf770279c7bc21ca0f09ea1a1f6" +dependencies = [ + "filetime", + "libc", + "xattr", +] + +[[package]] +name = "tempfile" +version = "3.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38c246215d7d24f48ae091a2902398798e05d978b24315d6efbc00ede9a8bb91" dependencies = [ "cfg-if", "fastrand", - "getrandom", + "getrandom 0.3.1", "once_cell", "rustix", "windows-sys 0.59.0", @@ -1444,7 +2495,7 @@ checksum = "26afc1baea8a989337eeb52b6e72a039780ce45c3edfcc9c5b9d112feeb173c2" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -1478,6 +2529,31 @@ dependencies = [ "time-core", ] +[[package]] +name = "tinystr" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9117f5d4db391c1cf6927e7bea3db74b9a1c1add8f7eda9ffd5364f40f57b82f" +dependencies = [ + "displaydoc", + "zerovec", +] + +[[package]] +name = "tinyvec" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "022db8904dfa342efe721985167e9fcd16c29b226db4397ed752a761cfce81e8" +dependencies = [ + "tinyvec_macros", +] + +[[package]] +name = "tinyvec_macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" + [[package]] name = "tokio" version = "1.43.0" @@ -1485,7 +2561,57 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3d61fa4ffa3de412bfea335c6ecff681de2b609ba3c77ef3e00e521813a9ed9e" dependencies = [ "backtrace", + "bytes", + "libc", + "mio", "pin-project-lite", + "socket2", + "tokio-macros", + "windows-sys 0.52.0", +] + +[[package]] +name = "tokio-macros" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e06d43f1345a3bcd39f6a56dbb7dcab2ba47e68e8ac134855e7e2bdbaf8cab8" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.98", +] + +[[package]] +name = "tokio-native-tls" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2" +dependencies = [ + "native-tls", + "tokio", +] + +[[package]] +name = "tokio-rustls" +version = "0.26.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f6d0975eaace0cf0fcadee4e4aaa5da15b5c079146f2cffb67c113be122bf37" +dependencies = [ + "rustls", + "tokio", +] + +[[package]] +name = "tokio-util" +version = "0.7.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7fcaa8d55a2bdd6b83ace262b016eca0d79ee02818c5c1bcdf0305114081078" +dependencies = [ + "bytes", + "futures-core", + "futures-sink", + "pin-project-lite", + "tokio", ] [[package]] @@ -1498,10 +2624,112 @@ dependencies = [ ] [[package]] -name = "unicode-ident" -version = "1.0.14" +name = "tower" +version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83" +checksum = "d039ad9159c98b70ecfd540b2573b97f7f52c3e8d9f8ad57a24b916a536975f9" +dependencies = [ + "futures-core", + "futures-util", + "pin-project-lite", + "sync_wrapper", + "tokio", + "tower-layer", + "tower-service", +] + +[[package]] +name = "tower-layer" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e" + +[[package]] +name = "tower-service" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3" + +[[package]] +name = "tracing" +version = "0.1.41" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "784e0ac535deb450455cbfa28a6f0df145ea1bb7ae51b821cf5e7927fdcfbdd0" +dependencies = [ + "pin-project-lite", + "tracing-core", +] + +[[package]] +name = "tracing-core" +version = "0.1.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e672c95779cf947c5311f83787af4fa8fffd12fb27e4993211a84bdfd9610f9c" +dependencies = [ + "once_cell", +] + +[[package]] +name = "try-lock" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" + +[[package]] +name = "unicode-blocks" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b12e05d9e06373163a9bb6bb8c263c261b396643a99445fe6b9811fd376581b" + +[[package]] +name = "unicode-ident" +version = "1.0.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a210d160f08b701c8721ba1c726c11662f877ea6b7094007e1ca9a1041945034" + +[[package]] +name = "unicode-normalization" +version = "0.1.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5033c97c4262335cded6d6fc3e5c18ab755e1a3dc96376350f3d8e9f009ad956" +dependencies = [ + "tinyvec", +] + +[[package]] +name = "unicode-segmentation" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493" + +[[package]] +name = "unsafe-libyaml" +version = "0.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "673aac59facbab8a9007c7f6108d11f63b603f7cabff99fabf650fea5c32b861" + +[[package]] +name = "untrusted" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" + +[[package]] +name = "url" +version = "2.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32f8b686cadd1473f4bd0117a5d28d36b1ade384ea9b5069a1c40aefed7fda60" +dependencies = [ + "form_urlencoded", + "idna", + "percent-encoding", +] + +[[package]] +name = "utf16_iter" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8232dd3cdaed5356e0f716d285e4b40b932ac434100fe9b7e0e8e935b9e6246" [[package]] name = "utf8-ranges" @@ -1509,6 +2737,12 @@ version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7fcfc827f90e53a02eaef5e535ee14266c1d569214c6aa70133a624d8a3164ba" +[[package]] +name = "utf8_iter" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" + [[package]] name = "utf8parse" version = "0.2.2" @@ -1517,20 +2751,125 @@ checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" [[package]] name = "uuid" -version = "1.11.1" +version = "1.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b913a3b5fe84142e269d63cc62b64319ccaf89b748fc31fe025177f767a756c4" +checksum = "ced87ca4be083373936a67f8de945faa23b6b42384bd5b64434850802c6dccd0" dependencies = [ - "getrandom", + "getrandom 0.3.1", "serde", ] +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + +[[package]] +name = "want" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa7760aed19e106de2c7c0b581b509f2f25d3dacaf737cb82ac61bc6d760b0e" +dependencies = [ + "try-lock", +] + [[package]] name = "wasi" version = "0.11.0+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" +[[package]] +name = "wasi" +version = "0.13.3+wasi-0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26816d2e1a4a36a2940b96c5296ce403917633dff8f3440e9b236ed6f6bacad2" +dependencies = [ + "wit-bindgen-rt", +] + +[[package]] +name = "wasm-bindgen" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1edc8929d7499fc4e8f0be2262a241556cfc54a0bea223790e71446f2aab1ef5" +dependencies = [ + "cfg-if", + "once_cell", + "rustversion", + "wasm-bindgen-macro", +] + +[[package]] +name = "wasm-bindgen-backend" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f0a0651a5c2bc21487bde11ee802ccaf4c51935d0d3d42a6101f98161700bc6" +dependencies = [ + "bumpalo", + "log", + "proc-macro2", + "quote", + "syn 2.0.98", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-futures" +version = "0.4.50" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "555d470ec0bc3bb57890405e5d4322cc9ea83cebb085523ced7be4144dac1e61" +dependencies = [ + "cfg-if", + "js-sys", + "once_cell", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7fe63fc6d09ed3792bd0897b314f53de8e16568c2b3f7982f468c0bf9bd0b407" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ae87ea40c9f689fc23f209965b6fb8a99ad69aeeb0231408be24920604395de" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.98", + "wasm-bindgen-backend", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a05d73b933a847d6cccdda8f838a22ff101ad9bf93e33684f39c1f5f0eece3d" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "web-sys" +version = "0.3.77" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33b6dd2ef9186f1f2072e409e99cd22a975331a6b3591b12c764e0e55c60d5d2" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + [[package]] name = "winapi" version = "0.3.9" @@ -1562,6 +2901,36 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" +[[package]] +name = "windows-registry" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e400001bb720a623c1c69032f8e3e4cf09984deec740f007dd2b03ec864804b0" +dependencies = [ + "windows-result", + "windows-strings", + "windows-targets", +] + +[[package]] +name = "windows-result" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d1043d8214f791817bab27572aaa8af63732e11bf84aa21a45a78d6c317ae0e" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-strings" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4cd9b125c486025df0eabcb585e62173c6c9eddcec5d117d3b6e8c30e2ee4d10" +dependencies = [ + "windows-result", + "windows-targets", +] + [[package]] name = "windows-sys" version = "0.52.0" @@ -1644,6 +3013,68 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" +[[package]] +name = "wit-bindgen-rt" +version = "0.33.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3268f3d866458b787f390cf61f4bbb563b922d091359f9608842999eaee3943c" +dependencies = [ + "bitflags 2.8.0", +] + +[[package]] +name = "write16" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d1890f4022759daae28ed4fe62859b1236caebfc61ede2f63ed4e695f3f6d936" + +[[package]] +name = "writeable" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e9df38ee2d2c3c5948ea468a8406ff0db0b29ae1ffde1bcf20ef305bcc95c51" + +[[package]] +name = "xattr" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e105d177a3871454f754b33bb0ee637ecaaac997446375fd3e5d43a2ed00c909" +dependencies = [ + "libc", + "linux-raw-sys", + "rustix", +] + +[[package]] +name = "yada" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aed111bd9e48a802518765906cbdadf0b45afb72b9c81ab049a3b86252adffdd" + +[[package]] +name = "yoke" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "120e6aef9aa629e3d4f52dc8cc43a015c7724194c97dfaf45180d2daf2b77f40" +dependencies = [ + "serde", + "stable_deref_trait", + "yoke-derive", + "zerofrom", +] + +[[package]] +name = "yoke-derive" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2380878cad4ac9aac1e2435f3eb4020e8374b5f13c296cb75b4620ff8e229154" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.98", + "synstructure", +] + [[package]] name = "zerocopy" version = "0.7.35" @@ -1662,7 +3093,56 @@ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", +] + +[[package]] +name = "zerofrom" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cff3ee08c995dee1859d998dea82f7374f2826091dd9cd47def953cae446cd2e" +dependencies = [ + "zerofrom-derive", +] + +[[package]] +name = "zerofrom-derive" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "595eed982f7d355beb85837f651fa22e90b3c044842dc7f2c2842c086f295808" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.98", + "synstructure", +] + +[[package]] +name = "zeroize" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ced3678a2879b30306d323f4542626697a464a97c0a07c9aebf7ebca65cd4dde" + +[[package]] +name = "zerovec" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa2b893d79df23bfb12d5461018d408ea19dfafe76c2c7ef6d4eba614f8ff079" +dependencies = [ + "yoke", + "zerofrom", + "zerovec-derive", +] + +[[package]] +name = "zerovec-derive" +version = "0.10.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6eafa6dfb17584ea3e2bd6e76e0cc15ad7af12b09abdd1ca55961bed9b1063c6" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.98", ] [[package]] diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/Cargo.toml b/internal/core/thirdparty/tantivy/tantivy-binding/Cargo.toml index 15583270e0..08265d2fa0 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/Cargo.toml +++ b/internal/core/thirdparty/tantivy/tantivy-binding/Cargo.toml @@ -5,8 +5,17 @@ edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html +[features] +default = ["lindera-ipadic", "lindera-ko-dic", "lindera-cc-cedict"] +lindera-ipadic = ["lindera/ipadic"] +lindera-ipadic-neologd = ["lindera/ipadic-neologd"] +lindera-unidic = ["lindera/unidic"] +lindera-ko-dic = ["lindera/ko-dic"] +lindera-cc-cedict = ["lindera/cc-cedict"] + [dependencies] tantivy = { git = "https://github.com/milvus-io/tantivy", tag = "v0.1.0" } # we have make a private fix for milvus, should be removed in future after milvus fixing the bug. +lindera = "0.38.1" futures = "0.3.21" libc = "0.2" scopeguard = "1.2" diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/include/tantivy-binding.h b/internal/core/thirdparty/tantivy/tantivy-binding/include/tantivy-binding.h index 16d378132b..9172570f62 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/include/tantivy-binding.h +++ b/internal/core/thirdparty/tantivy/tantivy-binding/include/tantivy-binding.h @@ -342,11 +342,11 @@ bool tantivy_token_stream_advance(void *token_stream); const char *tantivy_token_stream_get_token(void *token_stream); -RustResult tantivy_create_tokenizer(const char *analyzer_params); +RustResult tantivy_create_analyzer(const char *analyzer_params); -void *tantivy_clone_tokenizer(void *ptr); +void *tantivy_clone_analyzer(void *ptr); -void tantivy_free_tokenizer(void *tokenizer); +void tantivy_free_analyzer(void *tokenizer); bool tantivy_index_exist(const char *path); diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/analyzer.rs similarity index 72% rename from internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer.rs rename to internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/analyzer.rs index 06c440f39b..1b99023117 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/analyzer.rs @@ -1,80 +1,19 @@ -use log::warn; -use serde_json as json; use std::collections::HashMap; -use tantivy::tokenizer::StopWordFilter; use tantivy::tokenizer::*; +use serde_json as json; use crate::error::Result; use crate::error::TantivyBindingError; -use crate::jieba_tokenizer::JiebaTokenizer; -use crate::stop_words; -use crate::tokenizer_filter::*; -use crate::util::*; +use crate::analyzer::{ + build_in_analyzer::*, + tokenizers::get_builder_with_tokenizer, + filter::*, + util::* +}; -// default build-in analyzer -pub(crate) fn standard_analyzer(stop_words: Vec) -> TextAnalyzer { - let builder = standard_builder().filter(LowerCaser); - if stop_words.len() > 0 { - return builder.filter(StopWordFilter::remove(stop_words)).build(); - } - - builder.build() -} - -fn chinese_analyzer(stop_words: Vec) -> TextAnalyzer { - let builder = jieba_builder().filter(CnAlphaNumOnlyFilter); - if stop_words.len() > 0 { - return builder.filter(StopWordFilter::remove(stop_words)).build(); - } - - builder.build() -} - -fn english_analyzer(stop_words: Vec) -> TextAnalyzer { - let builder = standard_builder() - .filter(LowerCaser) - .filter(Stemmer::new(Language::English)) - .filter(StopWordFilter::remove( - stop_words::ENGLISH.iter().map(|&word| word.to_owned()), - )); - - if stop_words.len() > 0 { - return builder.filter(StopWordFilter::remove(stop_words)).build(); - } - - builder.build() -} - -fn standard_builder() -> TextAnalyzerBuilder { - TextAnalyzer::builder(SimpleTokenizer::default()).dynamic() -} - -fn whitespace_builder() -> TextAnalyzerBuilder { - TextAnalyzer::builder(WhitespaceTokenizer::default()).dynamic() -} - -fn jieba_builder() -> TextAnalyzerBuilder { - TextAnalyzer::builder(JiebaTokenizer::new()).dynamic() -} - -fn get_builder_by_name(name: &String) -> Result { - match name.as_str() { - "standard" => Ok(standard_builder()), - "whitespace" => Ok(whitespace_builder()), - "jieba" => Ok(jieba_builder()), - other => { - warn!("unsupported tokenizer: {}", other); - Err(TantivyBindingError::InternalError(format!( - "unsupported tokenizer: {}", - other - ))) - } - } -} struct AnalyzerBuilder<'a> { - // builder: TextAnalyzerBuilder filters: HashMap, params: &'a json::Map, } @@ -87,20 +26,21 @@ impl AnalyzerBuilder<'_> { } } - fn get_tokenizer_name(&self) -> Result{ + fn get_tokenizer_params(&self) -> Result<&json::Value>{ let tokenizer=self.params.get("tokenizer"); if tokenizer.is_none(){ return Err(TantivyBindingError::InternalError(format!( "tokenizer name or type must be set" ))); } - if !tokenizer.unwrap().is_string() { - return Err(TantivyBindingError::InternalError(format!( - "tokenizer name should be string" - ))); + let value = tokenizer.unwrap(); + if value.is_object() || value.is_string() { + return Ok(tokenizer.unwrap()) } - Ok(tokenizer.unwrap().as_str().unwrap().to_string()) + Err(TantivyBindingError::InternalError(format!( + "tokenizer name should be string or dict" + ))) } fn add_custom_filter( @@ -196,7 +136,7 @@ impl AnalyzerBuilder<'_> { let str_list = get_string_list(value, "filter stop_words")?; Ok(get_stop_words_list(str_list)) } - None => Ok(vec![]), + _ => Ok(vec![]), } } @@ -227,8 +167,8 @@ impl AnalyzerBuilder<'_> { }; //build custom analyzer - let tokenizer_name = self.get_tokenizer_name()?; - let mut builder = get_builder_by_name(&tokenizer_name)?; + let tokenizer_params = self.get_tokenizer_params()?; + let mut builder = get_builder_with_tokenizer(&tokenizer_params)?; // build with option builder = self.build_option(builder)?; @@ -236,7 +176,7 @@ impl AnalyzerBuilder<'_> { } } -pub(crate) fn create_tokenizer_with_filter(params: &String) -> Result { +pub(crate) fn create_analyzer_with_filter(params: &String) -> Result { match json::from_str::(¶ms) { Ok(value) => { if value.is_null() { @@ -280,16 +220,16 @@ pub(crate) fn create_tokenizer_with_filter(params: &String) -> Result Result { +pub(crate) fn create_analyzer(params: &str) -> Result { if params.len() == 0 { return Ok(standard_analyzer(vec![])); } - create_tokenizer_with_filter(&format!("{{\"analyzer\":{}}}", params)) + create_analyzer_with_filter(&format!("{{\"analyzer\":{}}}", params)) } #[cfg(test)] mod tests { - use crate::tokenizer::create_tokenizer; + use crate::analyzer::analyzer::create_analyzer; #[test] fn test_standard_analyzer() { @@ -298,7 +238,7 @@ mod tests { "stop_words": ["_english_"] }"#; - let tokenizer = create_tokenizer(¶ms.to_string()); + let tokenizer = create_analyzer(¶ms.to_string()); assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap()); } @@ -308,7 +248,7 @@ mod tests { "type": "chinese" }"#; - let tokenizer = create_tokenizer(¶ms.to_string()); + let tokenizer = create_analyzer(¶ms.to_string()); assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap()); let mut bining = tokenizer.unwrap(); let mut stream = bining.token_stream("系统安全;,'';lxyz密码"); @@ -321,4 +261,28 @@ mod tests { print!("test tokens :{:?}\n", results) } + + #[test] + fn test_lindera_analyzer() { + let params = r#"{ + "tokenizer": { + "type": "lindera", + "dict_kind": "ipadic" + } + }"#; + + let tokenizer = create_analyzer(¶ms.to_string()); + assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap()); + + let mut bining = tokenizer.unwrap(); + let mut stream = bining.token_stream("東京スカイツリーの最寄り駅はとうきょうスカイツリー駅です"); + + let mut results = Vec::::new(); + while stream.advance() { + let token = stream.token(); + results.push(token.text.clone()); + } + + print!("test tokens :{:?}\n", results) + } } diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/build_in_analyzer.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/build_in_analyzer.rs new file mode 100644 index 0000000000..691ce9b6e4 --- /dev/null +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/build_in_analyzer.rs @@ -0,0 +1,40 @@ +use tantivy::tokenizer::*; + +use crate::analyzer::tokenizers::*; +use crate::analyzer::filter::*; +use crate::analyzer::stop_words; + +// default build-in analyzer +pub(crate) fn standard_analyzer(stop_words: Vec) -> TextAnalyzer { + let builder = standard_builder().filter(LowerCaser); + + if stop_words.len() > 0 { + return builder.filter(StopWordFilter::remove(stop_words)).build(); + } + + builder.build() +} + +pub fn chinese_analyzer(stop_words: Vec) -> TextAnalyzer { + let builder = jieba_builder().filter(CnAlphaNumOnlyFilter); + if stop_words.len() > 0 { + return builder.filter(StopWordFilter::remove(stop_words)).build(); + } + + builder.build() +} + +pub fn english_analyzer(stop_words: Vec) -> TextAnalyzer { + let builder = standard_builder() + .filter(LowerCaser) + .filter(Stemmer::new(Language::English)) + .filter(StopWordFilter::remove( + stop_words::ENGLISH.iter().map(|&word| word.to_owned()), + )); + + if stop_words.len() > 0 { + return builder.filter(StopWordFilter::remove(stop_words)).build(); + } + + builder.build() +} \ No newline at end of file diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer_filter.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/filter.rs similarity index 98% rename from internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer_filter.rs rename to internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/filter.rs index 6b67b662a8..fd4c6d7f57 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer_filter.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/filter.rs @@ -2,9 +2,8 @@ use regex; use serde_json as json; use tantivy::tokenizer::*; -use crate::error::Result; -use crate::error::TantivyBindingError; -use crate::util::*; +use crate::error::{Result,TantivyBindingError}; +use crate::analyzer::util::*; pub(crate) enum SystemFilter { Invalid, @@ -79,7 +78,7 @@ fn get_decompounder_filter(params: &json::Map) -> Result str_list.push(word.to_string()), - None => { + _ => { return Err(TantivyBindingError::InternalError( "decompounder word list item should be string".to_string(), )) @@ -114,12 +113,10 @@ fn get_stemmer_filter(params: &json::Map) -> Result Result; } impl LanguageParser for &str { - type Error = TantivyBindingError; fn into_language(self) -> Result { match self.to_lowercase().as_str() { "arabig" => Ok(Language::Arabic), diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/mod.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/mod.rs new file mode 100644 index 0000000000..4a7ba62849 --- /dev/null +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/mod.rs @@ -0,0 +1,9 @@ +mod analyzer; +mod stop_words; +mod tokenizers; +mod build_in_analyzer; +mod filter; +mod util; + +pub(crate) use self::analyzer::create_analyzer; +pub(crate) use self::build_in_analyzer::standard_analyzer; \ No newline at end of file diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/stop_words.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/stop_words.rs similarity index 100% rename from internal/core/thirdparty/tantivy/tantivy-binding/src/stop_words.rs rename to internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/stop_words.rs diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/jieba_tokenizer.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/jieba_tokenizer.rs similarity index 100% rename from internal/core/thirdparty/tantivy/tantivy-binding/src/jieba_tokenizer.rs rename to internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/jieba_tokenizer.rs diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/lindera_tokenizer.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/lindera_tokenizer.rs new file mode 100644 index 0000000000..548d439793 --- /dev/null +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/lindera_tokenizer.rs @@ -0,0 +1,157 @@ + +use core::result::Result::Err; +use log::warn; + +use lindera::mode::Mode; +use lindera::segmenter::Segmenter; +use lindera::token::Token as LToken; +use lindera::tokenizer::{Tokenizer as LTokenizer, TokenizerBuilder}; +use lindera::dictionary::{load_dictionary_from_kind, DictionaryKind}; +use tantivy::tokenizer::{Token, Tokenizer, TokenStream}; + +use serde_json as json; +use crate::error::{Result,TantivyBindingError}; + +pub struct LinderaTokenStream<'a> { + pub tokens: Vec>, + pub token: &'a mut Token, +} + +impl<'a> TokenStream for LinderaTokenStream<'a> { + fn advance(&mut self) -> bool { + if self.tokens.is_empty() { + return false; + } + let token = self.tokens.remove(0); + self.token.text = token.text.to_string(); + self.token.offset_from = token.byte_start; + self.token.offset_to = token.byte_end; + self.token.position = token.position; + self.token.position_length = token.position_length; + + true + } + + fn token(&self) -> &Token { + self.token + } + + fn token_mut(&mut self) -> &mut Token { + self.token + } +} + +#[derive(Clone)] +pub struct LinderaTokenizer { + tokenizer: LTokenizer, + token: Token, +} + +impl LinderaTokenizer { + /// Create a new `LinderaTokenizer`. + /// This function will create a new `LinderaTokenizer` with settings from the YAML file specified in the `LINDERA_CONFIG_PATH` environment variable. + pub fn from_json(params: &json::Map) -> Result { + let kind = fetch_lindera_kind(params)?; + let dictionary = load_dictionary_from_kind(kind); + if dictionary.is_err(){ + return Err(TantivyBindingError::InvalidArgument(format!( + "lindera tokenizer with invalid dict_kind" + ))); + } + let segmenter = Segmenter::new(Mode::Normal, dictionary.unwrap(), None); + Ok(LinderaTokenizer::from_segmenter(segmenter)) + } + + /// Create a new `LinderaTokenizer`. + /// This function will create a new `LinderaTokenizer` with the specified `lindera::segmenter::Segmenter`. + pub fn from_segmenter(segmenter: lindera::segmenter::Segmenter) -> LinderaTokenizer { + LinderaTokenizer { + tokenizer: LTokenizer::new(segmenter), + token: Default::default(), + } + } +} + +impl Tokenizer for LinderaTokenizer { + type TokenStream<'a> = LinderaTokenStream<'a>; + + fn token_stream<'a>(&'a mut self, text: &'a str) -> LinderaTokenStream<'a> { + self.token.reset(); + LinderaTokenStream { + tokens: self.tokenizer.tokenize(text).unwrap(), + token: &mut self.token, + } + } +} + +trait DictionaryKindParser { + fn into_dict_kind(self) -> Result; +} + +impl DictionaryKindParser for &str{ + fn into_dict_kind(self) -> Result { + match self{ + "ipadic" => Ok(DictionaryKind::IPADIC), + "ipadic-neologd" => Ok(DictionaryKind::IPADICNEologd), + "unidic" => Ok(DictionaryKind::UniDic), + "ko-dic" => Ok(DictionaryKind::KoDic), + "cc-cedict" => Ok(DictionaryKind::CcCedict), + other => Err(TantivyBindingError::InvalidArgument(format!( + "unsupported lindera dict type: {}", + other + ))) + } + } +} + +fn fetch_lindera_kind(params:&json::Map) -> Result{ + match params.get("dict_kind"){ + Some(val) => { + if !val.is_string(){ + return Err(TantivyBindingError::InvalidArgument(format!( + "lindera tokenizer dict kind should be string" + ))) + } + val.as_str().unwrap().into_dict_kind() + }, + _ => { + return Err(TantivyBindingError::InvalidArgument(format!( + "lindera tokenizer dict_kind must be set" + ))) + } + } +} + +#[cfg(test)] +mod tests { + use serde_json as json; + + use crate::analyzer::tokenizers::lindera_tokenizer::LinderaTokenizer; + + #[test] + fn test_lindera_tokenizer(){ + let params = r#"{ + "type": "lindera", + "dict_kind": "ipadic" + }"#; + let json_param = json::from_str::>(¶ms); + assert!(json_param.is_ok()); + + let tokenizer = LinderaTokenizer::from_json(&json_param.unwrap()); + assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap()); + } + + #[test] + #[cfg(feature = "lindera-cc-cedict")] + fn test_lindera_tokenizer_cc(){ + let params = r#"{ + "type": "lindera", + "dict_kind": "cc-cedict" + }"#; + let json_param = json::from_str::>(¶ms); + assert!(json_param.is_ok()); + + let tokenizer = LinderaTokenizer::from_json(&json_param.unwrap()); + assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap()); + } +} \ No newline at end of file diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/mod.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/mod.rs new file mode 100644 index 0000000000..3d3c24864c --- /dev/null +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/mod.rs @@ -0,0 +1,7 @@ +mod tokenizer; +mod jieba_tokenizer; +mod lindera_tokenizer; + +pub(crate) use self::tokenizer::*; +use self::jieba_tokenizer::JiebaTokenizer; +use self::lindera_tokenizer::LinderaTokenizer; \ No newline at end of file diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/tokenizer.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/tokenizer.rs new file mode 100644 index 0000000000..1644fbe4fa --- /dev/null +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/tokenizer.rs @@ -0,0 +1,73 @@ +use tantivy::tokenizer::{TextAnalyzer, TextAnalyzerBuilder}; +use lindera::segmenter::Segmenter; +use tantivy::tokenizer::*; +use lindera::mode::Mode; +use serde_json as json; +use log::warn; + +use crate::analyzer::tokenizers::{JiebaTokenizer, LinderaTokenizer}; +use crate::error::{Result,TantivyBindingError}; + + +pub fn standard_builder() -> TextAnalyzerBuilder { + TextAnalyzer::builder(SimpleTokenizer::default()).dynamic() +} + +pub fn whitespace_builder() -> TextAnalyzerBuilder { + TextAnalyzer::builder(WhitespaceTokenizer::default()).dynamic() +} + +pub fn jieba_builder() -> TextAnalyzerBuilder { + TextAnalyzer::builder(JiebaTokenizer::new()).dynamic() +} + +pub fn lindera_builder(params: Option<&json::Map>) -> Result{ + if params.is_none(){ + return Err(TantivyBindingError::InvalidArgument(format!( + "lindera tokenizer must be costum" + ))) + } + let tokenizer = LinderaTokenizer::from_json(params.unwrap())?; + Ok(TextAnalyzer::builder(tokenizer).dynamic()) +} + +pub fn get_builder_with_tokenizer(params: &json::Value) -> Result { + let name; + let params_map; + if params.is_string(){ + name = params.as_str().unwrap(); + params_map = None; + }else{ + let m = params.as_object().unwrap(); + match m.get("type"){ + Some(val) => { + if !val.is_string(){ + return Err(TantivyBindingError::InvalidArgument(format!( + "tokenizer type should be string" + ))) + } + name = val.as_str().unwrap(); + }, + _ => { + return Err(TantivyBindingError::InvalidArgument(format!( + "costum tokenizer must set type" + ))) + }, + } + params_map = Some(m); + } + + match name { + "standard" => Ok(standard_builder()), + "whitespace" => Ok(whitespace_builder()), + "jieba" => Ok(jieba_builder()), + "lindera" => lindera_builder(params_map), + other => { + warn!("unsupported tokenizer: {}", other); + Err(TantivyBindingError::InvalidArgument(format!( + "unsupported tokenizer: {}", + other + ))) + } + } +} diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/util.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/util.rs new file mode 100644 index 0000000000..0ee04eea70 --- /dev/null +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/util.rs @@ -0,0 +1,45 @@ +use serde_json as json; + +use crate::error::{Result,TantivyBindingError}; +use crate::analyzer::stop_words; + +pub(crate) fn get_string_list(value: &json::Value, label: &str) -> Result> { + if !value.is_array() { + return Err(TantivyBindingError::InternalError( + format!("{} should be array", label).to_string(), + )); + } + + let stop_words = value.as_array().unwrap(); + let mut str_list = Vec::::new(); + for element in stop_words { + match element.as_str() { + Some(word) => str_list.push(word.to_string()), + _ => { + return Err(TantivyBindingError::InternalError( + format!("{} list item should be string", label).to_string(), + )) + } + } + } + Ok(str_list) +} + +pub(crate) fn get_stop_words_list(str_list: Vec) -> Vec { + let mut stop_words = Vec::new(); + for str in str_list { + if str.len() > 0 && str.chars().nth(0).unwrap() == '_' { + match str.as_str() { + "_english_" => { + for word in stop_words::ENGLISH { + stop_words.push(word.to_string()); + } + continue; + } + _other => {} + } + } + stop_words.push(str); + } + stop_words +} diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text.rs index e0efce7181..eb1adc3d84 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text.rs @@ -5,7 +5,7 @@ use tantivy::{ }; use crate::error::Result; -use crate::{index_reader::IndexReaderWrapper, tokenizer::standard_analyzer}; +use crate::{index_reader::IndexReaderWrapper, analyzer::standard_analyzer}; impl IndexReaderWrapper { // split the query string into multiple tokens using index's default tokenizer, diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text_c.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text_c.rs index 72e83f9842..f5b80a3cc8 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text_c.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text_c.rs @@ -4,7 +4,7 @@ use libc::{c_char, c_void}; use crate::{ array::RustResult, cstr_to_str, index_reader::IndexReaderWrapper, log::init_log, - tokenizer::create_tokenizer, + analyzer::create_analyzer, }; #[no_mangle] @@ -39,7 +39,7 @@ pub extern "C" fn tantivy_register_tokenizer( let real = ptr as *mut IndexReaderWrapper; let tokenizer_name = cstr_to_str!(tokenizer_name); let params = cstr_to_str!(analyzer_params); - let analyzer = create_tokenizer(params); + let analyzer = create_analyzer(params); match analyzer { Ok(text_analyzer) => unsafe { (*real).register_tokenizer(String::from(tokenizer_name), text_analyzer); diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_text_c.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_text_c.rs index 261dde5c72..3c2a6f8a75 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_text_c.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_text_c.rs @@ -5,7 +5,8 @@ use crate::array::RustResult; use crate::cstr_to_str; use crate::index_writer::IndexWriterWrapper; use crate::log::init_log; -use crate::tokenizer::create_tokenizer; +use crate::string_c::c_str_to_str; +use crate::analyzer::create_analyzer; use crate::util::create_binding; #[no_mangle] @@ -23,7 +24,7 @@ pub extern "C" fn tantivy_create_text_writer( let path_str = cstr_to_str!(path); let tokenizer_name_str = cstr_to_str!(tokenizer_name); let params = cstr_to_str!(analyzer_params); - let analyzer = create_tokenizer(params); + let analyzer = create_analyzer(params); match analyzer { Ok(text_analyzer) => { let wrapper = IndexWriterWrapper::create_text_writer( diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/lib.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/lib.rs index 110c16d6ab..508c8a1448 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/lib.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/lib.rs @@ -12,14 +12,11 @@ mod index_writer; mod index_writer_c; mod index_writer_text; mod index_writer_text_c; -mod jieba_tokenizer; mod log; -mod stop_words; mod string_c; mod token_stream_c; -mod tokenizer; +mod analyzer; mod tokenizer_c; -mod tokenizer_filter; mod util; mod util_c; mod vec_collector; diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer_c.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer_c.rs index 6290591d27..14b2482169 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer_c.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer_c.rs @@ -5,15 +5,15 @@ use crate::{ array::RustResult, log::init_log, string_c::c_str_to_str, - tokenizer::create_tokenizer, + analyzer::create_analyzer, util::{create_binding, free_binding}, }; #[no_mangle] -pub extern "C" fn tantivy_create_tokenizer(analyzer_params: *const c_char) -> RustResult { +pub extern "C" fn tantivy_create_analyzer(analyzer_params: *const c_char) -> RustResult { init_log(); let params = unsafe { c_str_to_str(analyzer_params).to_string() }; - let analyzer = create_tokenizer(¶ms); + let analyzer = create_analyzer(¶ms); match analyzer { Ok(text_analyzer) => RustResult::from_ptr(create_binding(text_analyzer)), Err(err) => RustResult::from_error(format!( @@ -25,13 +25,13 @@ pub extern "C" fn tantivy_create_tokenizer(analyzer_params: *const c_char) -> Ru } #[no_mangle] -pub extern "C" fn tantivy_clone_tokenizer(ptr: *mut c_void) -> *mut c_void { +pub extern "C" fn tantivy_clone_analyzer(ptr: *mut c_void) -> *mut c_void { let analyzer = ptr as *mut TextAnalyzer; let clone = unsafe { (*analyzer).clone() }; create_binding(clone) } #[no_mangle] -pub extern "C" fn tantivy_free_tokenizer(tokenizer: *mut c_void) { +pub extern "C" fn tantivy_free_analyzer(tokenizer: *mut c_void) { free_binding::(tokenizer); } diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/util.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/util.rs index 8fa56898f4..e4d9c9a9ca 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/util.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/util.rs @@ -1,12 +1,7 @@ -use serde_json as json; use std::ffi::c_void; use std::ops::Bound; use tantivy::{directory::MmapDirectory, Index}; -use crate::error::Result; -use crate::error::TantivyBindingError; -use crate::stop_words; - pub fn index_exist(path: &str) -> bool { let dir = MmapDirectory::open(path).unwrap(); Index::exists(&dir).unwrap() @@ -31,45 +26,4 @@ pub fn free_binding(ptr: *mut c_void) { unsafe { drop(Box::from_raw(real)); } -} - -pub(crate) fn get_string_list(value: &json::Value, label: &str) -> Result> { - if !value.is_array() { - return Err(TantivyBindingError::InternalError( - format!("{} should be array", label).to_string(), - )); - } - - let stop_words = value.as_array().unwrap(); - let mut str_list = Vec::::new(); - for element in stop_words { - match element.as_str() { - Some(word) => str_list.push(word.to_string()), - None => { - return Err(TantivyBindingError::InternalError( - format!("{} list item should be string", label).to_string(), - )) - } - } - } - Ok(str_list) -} - -pub(crate) fn get_stop_words_list(str_list: Vec) -> Vec { - let mut stop_words = Vec::new(); - for str in str_list { - if str.len() > 0 && str.chars().nth(0).unwrap() == '_' { - match str.as_str() { - "_english_" => { - for word in stop_words::ENGLISH { - stop_words.push(word.to_string()); - } - continue; - } - _other => {} - } - } - stop_words.push(str); - } - stop_words -} +} \ No newline at end of file diff --git a/internal/core/thirdparty/tantivy/tokenizer.h b/internal/core/thirdparty/tantivy/tokenizer.h index c4be0ee314..2870b01e2e 100644 --- a/internal/core/thirdparty/tantivy/tokenizer.h +++ b/internal/core/thirdparty/tantivy/tokenizer.h @@ -15,7 +15,7 @@ struct Tokenizer { explicit Tokenizer(std::string&& params) { auto shared_params = std::make_shared(std::move(params)); auto res = - RustResultWrapper(tantivy_create_tokenizer(shared_params->c_str())); + RustResultWrapper(tantivy_create_analyzer(shared_params->c_str())); AssertInfo(res.result_->success, "Tokenizer creation failed: {}", res.result_->error); @@ -27,7 +27,7 @@ struct Tokenizer { ~Tokenizer() { if (ptr_ != nullptr) { - tantivy_free_tokenizer(ptr_); + tantivy_free_analyzer(ptr_); } } @@ -41,7 +41,7 @@ struct Tokenizer { std::unique_ptr Clone() { - auto newptr = tantivy_clone_tokenizer(ptr_); + auto newptr = tantivy_clone_analyzer(ptr_); return std::make_unique(newptr); }