mirror of https://github.com/milvus-io/milvus.git
enhance: optimize search performance of inverted index (#29794)
issue: #29793 Use `DocSetCollector` instead of `TopDocsCollector`, which will avoid scoring and sorting. --------- Signed-off-by: longjiquan <jiquan.long@zilliz.com>pull/29871/head
parent
5164d30287
commit
67ab5be15a
|
@ -64,3 +64,10 @@ target_link_libraries(test_tantivy
|
|||
boost_filesystem
|
||||
dl
|
||||
)
|
||||
|
||||
add_executable(bench_tantivy bench.cpp)
|
||||
target_link_libraries(bench_tantivy
|
||||
tantivy_binding
|
||||
boost_filesystem
|
||||
dl
|
||||
)
|
||||
|
|
|
@ -0,0 +1,65 @@
|
|||
#include <cstdint>
|
||||
#include <cassert>
|
||||
#include <boost/filesystem.hpp>
|
||||
#include <iostream>
|
||||
#include <random>
|
||||
|
||||
#include "tantivy-binding.h"
|
||||
#include "tantivy-wrapper.h"
|
||||
#include "time_recorder.h"
|
||||
|
||||
using namespace milvus::tantivy;
|
||||
|
||||
void
|
||||
build_index(size_t n = 1000000) {
|
||||
auto path = "/tmp/inverted-index/test-binding/";
|
||||
boost::filesystem::remove_all(path);
|
||||
boost::filesystem::create_directories(path);
|
||||
|
||||
auto w =
|
||||
TantivyIndexWrapper("test_field_name", TantivyDataType::Keyword, path);
|
||||
|
||||
std::vector<std::string> arr;
|
||||
arr.reserve(n);
|
||||
|
||||
std::default_random_engine er(42);
|
||||
int64_t sample = 10000;
|
||||
for (size_t i = 0; i < n; i++) {
|
||||
auto x = er() % sample;
|
||||
arr.push_back(std::to_string(x));
|
||||
}
|
||||
|
||||
w.add_data<std::string>(arr.data(), arr.size());
|
||||
|
||||
w.finish();
|
||||
assert(w.count() == n);
|
||||
}
|
||||
|
||||
void
|
||||
search(size_t repeat = 10) {
|
||||
TimeRecorder tr("bench-tantivy-search");
|
||||
|
||||
auto path = "/tmp/inverted-index/test-binding/";
|
||||
assert(tantivy_index_exist(path));
|
||||
tr.RecordSection("check if index exist");
|
||||
|
||||
auto w = TantivyIndexWrapper(path);
|
||||
auto cnt = w.count();
|
||||
tr.RecordSection("count num_entities");
|
||||
std::cout << "index already exist, open it, count: " << cnt << std::endl;
|
||||
|
||||
for (size_t i = 0; i < repeat; i++) {
|
||||
w.lower_bound_range_query<std::string>(std::to_string(45), false);
|
||||
tr.RecordSection("query");
|
||||
}
|
||||
|
||||
tr.ElapseFromBegin("done");
|
||||
}
|
||||
|
||||
int
|
||||
main(int argc, char* argv[]) {
|
||||
build_index(1000000);
|
||||
search(10);
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -1,6 +1,5 @@
|
|||
use std::{env, path::PathBuf};
|
||||
|
||||
|
||||
fn main() {
|
||||
let crate_dir = env::var("CARGO_MANIFEST_DIR").unwrap();
|
||||
let package_name = env::var("CARGO_PKG_NAME").unwrap();
|
||||
|
|
|
@ -21,30 +21,6 @@ struct RustArray {
|
|||
|
||||
extern "C" {
|
||||
|
||||
void *tantivy_create_index(const char *field_name, TantivyDataType data_type, const char *path);
|
||||
|
||||
void tantivy_free_index_writer(void *ptr);
|
||||
|
||||
void tantivy_finish_index(void *ptr);
|
||||
|
||||
void tantivy_index_add_int8s(void *ptr, const int8_t *array, uintptr_t len);
|
||||
|
||||
void tantivy_index_add_int16s(void *ptr, const int16_t *array, uintptr_t len);
|
||||
|
||||
void tantivy_index_add_int32s(void *ptr, const int32_t *array, uintptr_t len);
|
||||
|
||||
void tantivy_index_add_int64s(void *ptr, const int64_t *array, uintptr_t len);
|
||||
|
||||
void tantivy_index_add_f32s(void *ptr, const float *array, uintptr_t len);
|
||||
|
||||
void tantivy_index_add_f64s(void *ptr, const double *array, uintptr_t len);
|
||||
|
||||
void tantivy_index_add_bools(void *ptr, const bool *array, uintptr_t len);
|
||||
|
||||
void tantivy_index_add_keyword(void *ptr, const char *s);
|
||||
|
||||
bool tantivy_index_exist(const char *path);
|
||||
|
||||
void free_rust_array(RustArray array);
|
||||
|
||||
void *tantivy_load_index(const char *path);
|
||||
|
@ -97,4 +73,28 @@ RustArray tantivy_range_query_keyword(void *ptr,
|
|||
|
||||
RustArray tantivy_prefix_query_keyword(void *ptr, const char *prefix);
|
||||
|
||||
void *tantivy_create_index(const char *field_name, TantivyDataType data_type, const char *path);
|
||||
|
||||
void tantivy_free_index_writer(void *ptr);
|
||||
|
||||
void tantivy_finish_index(void *ptr);
|
||||
|
||||
void tantivy_index_add_int8s(void *ptr, const int8_t *array, uintptr_t len);
|
||||
|
||||
void tantivy_index_add_int16s(void *ptr, const int16_t *array, uintptr_t len);
|
||||
|
||||
void tantivy_index_add_int32s(void *ptr, const int32_t *array, uintptr_t len);
|
||||
|
||||
void tantivy_index_add_int64s(void *ptr, const int64_t *array, uintptr_t len);
|
||||
|
||||
void tantivy_index_add_f32s(void *ptr, const float *array, uintptr_t len);
|
||||
|
||||
void tantivy_index_add_f64s(void *ptr, const double *array, uintptr_t len);
|
||||
|
||||
void tantivy_index_add_bools(void *ptr, const bool *array, uintptr_t len);
|
||||
|
||||
void tantivy_index_add_keyword(void *ptr, const char *s);
|
||||
|
||||
bool tantivy_index_exist(const char *path);
|
||||
|
||||
} // extern "C"
|
||||
|
|
|
@ -1,6 +1,5 @@
|
|||
use libc::size_t;
|
||||
|
||||
|
||||
#[repr(C)]
|
||||
pub struct RustArray {
|
||||
array: *mut u32,
|
||||
|
@ -23,7 +22,7 @@ impl RustArray {
|
|||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn free_rust_array(array: RustArray) {
|
||||
let RustArray { array, len , cap} = array;
|
||||
let RustArray { array, len, cap } = array;
|
||||
unsafe {
|
||||
Vec::from_raw_parts(array, len, cap);
|
||||
}
|
||||
|
|
|
@ -1,9 +1,9 @@
|
|||
#[repr(u8)]
|
||||
pub enum TantivyDataType {
|
||||
// Text,
|
||||
Keyword,
|
||||
// U64,
|
||||
I64,
|
||||
F64,
|
||||
Bool,
|
||||
// Text,
|
||||
Keyword,
|
||||
// U64,
|
||||
I64,
|
||||
F64,
|
||||
Bool,
|
||||
}
|
||||
|
|
|
@ -0,0 +1,59 @@
|
|||
use std::collections::HashSet;
|
||||
|
||||
use tantivy::{
|
||||
collector::{Collector, SegmentCollector},
|
||||
DocId,
|
||||
};
|
||||
|
||||
pub struct HashSetCollector;
|
||||
|
||||
impl Collector for HashSetCollector {
|
||||
type Fruit = HashSet<DocId>;
|
||||
|
||||
type Child = HashSetChildCollector;
|
||||
|
||||
fn for_segment(
|
||||
&self,
|
||||
_segment_local_id: tantivy::SegmentOrdinal,
|
||||
_segment: &tantivy::SegmentReader,
|
||||
) -> tantivy::Result<Self::Child> {
|
||||
Ok(HashSetChildCollector {
|
||||
docs: HashSet::new(),
|
||||
})
|
||||
}
|
||||
|
||||
fn requires_scoring(&self) -> bool {
|
||||
false
|
||||
}
|
||||
|
||||
fn merge_fruits(&self, segment_fruits: Vec<HashSet<DocId>>) -> tantivy::Result<HashSet<DocId>> {
|
||||
if segment_fruits.len() == 1 {
|
||||
Ok(segment_fruits.into_iter().next().unwrap())
|
||||
} else {
|
||||
let len: usize = segment_fruits.iter().map(|docset| docset.len()).sum();
|
||||
let mut result = HashSet::with_capacity(len);
|
||||
for docs in segment_fruits {
|
||||
for doc in docs {
|
||||
result.insert(doc);
|
||||
}
|
||||
}
|
||||
Ok(result)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct HashSetChildCollector {
|
||||
docs: HashSet<DocId>,
|
||||
}
|
||||
|
||||
impl SegmentCollector for HashSetChildCollector {
|
||||
type Fruit = HashSet<DocId>;
|
||||
|
||||
fn collect(&mut self, doc: DocId, _score: tantivy::Score) {
|
||||
self.docs.insert(doc);
|
||||
}
|
||||
|
||||
fn harvest(self) -> Self::Fruit {
|
||||
self.docs
|
||||
}
|
||||
}
|
|
@ -1,22 +1,19 @@
|
|||
use std::ops::Bound;
|
||||
use std::str::FromStr;
|
||||
|
||||
|
||||
|
||||
use tantivy::collector::TopDocs;
|
||||
use tantivy::directory::MmapDirectory;
|
||||
use tantivy::query::{Query, RangeQuery, TermQuery, RegexQuery};
|
||||
use tantivy::query::{Query, RangeQuery, RegexQuery, TermQuery};
|
||||
use tantivy::schema::{Field, IndexRecordOption};
|
||||
use tantivy::{Index, IndexReader, ReloadPolicy, Term};
|
||||
|
||||
|
||||
use crate::util::make_bounds;
|
||||
use crate::vec_collector::VecCollector;
|
||||
|
||||
pub struct IndexReaderWrapper {
|
||||
pub field_name: String,
|
||||
pub field: Field,
|
||||
pub reader: IndexReader,
|
||||
pub cnt: u32,
|
||||
pub cnt: u32,
|
||||
}
|
||||
|
||||
impl IndexReaderWrapper {
|
||||
|
@ -26,20 +23,18 @@ impl IndexReaderWrapper {
|
|||
.reload_policy(ReloadPolicy::Manual)
|
||||
.try_into()
|
||||
.unwrap();
|
||||
let metas = index
|
||||
.searchable_segment_metas()
|
||||
.unwrap();
|
||||
let metas = index.searchable_segment_metas().unwrap();
|
||||
let mut sum: u32 = 0;
|
||||
for meta in metas {
|
||||
sum += meta.max_doc();
|
||||
}
|
||||
reader.reload().unwrap();
|
||||
reader.reload().unwrap();
|
||||
IndexReaderWrapper {
|
||||
field_name: field_name.to_string(),
|
||||
field,
|
||||
reader,
|
||||
cnt: sum,
|
||||
}
|
||||
field_name: field_name.to_string(),
|
||||
field,
|
||||
reader,
|
||||
cnt: sum,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn load(path: &str) -> IndexReaderWrapper {
|
||||
|
@ -52,18 +47,15 @@ impl IndexReaderWrapper {
|
|||
}
|
||||
|
||||
pub fn count(&self) -> u32 {
|
||||
self.cnt
|
||||
self.cnt
|
||||
}
|
||||
|
||||
fn search(&self, q: &dyn Query) -> Vec<u32> {
|
||||
let searcher = self.reader.searcher();
|
||||
let cnt = self.cnt;
|
||||
let hits = searcher
|
||||
.search(q, &TopDocs::with_limit(cnt as usize))
|
||||
.unwrap();
|
||||
let mut ret = Vec::new();
|
||||
for (_, address) in hits {
|
||||
ret.push(address.doc_id);
|
||||
let hits = searcher.search(q, &VecCollector).unwrap();
|
||||
let mut ret = Vec::with_capacity(hits.len());
|
||||
for address in hits {
|
||||
ret.push(address);
|
||||
}
|
||||
ret
|
||||
}
|
||||
|
@ -193,10 +185,7 @@ impl IndexReaderWrapper {
|
|||
self.search(&q)
|
||||
}
|
||||
|
||||
pub fn prefix_query_keyword(
|
||||
&self,
|
||||
prefix: &str,
|
||||
) -> Vec<u32> {
|
||||
pub fn prefix_query_keyword(&self, prefix: &str) -> Vec<u32> {
|
||||
let pattern = format!("{}(.|\n)*", prefix);
|
||||
let q = RegexQuery::from_pattern(&pattern, self.field).unwrap();
|
||||
self.search(&q)
|
||||
|
|
|
@ -1,7 +1,10 @@
|
|||
use std::ffi::{c_char, c_void, CStr};
|
||||
|
||||
use crate::{
|
||||
index_reader::IndexReaderWrapper, util_c::tantivy_index_exist, util::{create_binding, free_binding}, array::RustArray,
|
||||
array::RustArray,
|
||||
index_reader::IndexReaderWrapper,
|
||||
util::{create_binding, free_binding},
|
||||
util_c::tantivy_index_exist,
|
||||
};
|
||||
|
||||
#[no_mangle]
|
||||
|
@ -196,7 +199,10 @@ pub extern "C" fn tantivy_range_query_keyword(
|
|||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn tantivy_prefix_query_keyword(ptr: *mut c_void, prefix: *const c_char) -> RustArray {
|
||||
pub extern "C" fn tantivy_prefix_query_keyword(
|
||||
ptr: *mut c_void,
|
||||
prefix: *const c_char,
|
||||
) -> RustArray {
|
||||
let real = ptr as *mut IndexReaderWrapper;
|
||||
unsafe {
|
||||
let c_str = CStr::from_ptr(prefix);
|
||||
|
|
|
@ -4,7 +4,6 @@ use tantivy::schema::{Field, IndexRecordOption, Schema, TextFieldIndexing, TextO
|
|||
use tantivy::{doc, tokenizer, Index, IndexWriter};
|
||||
|
||||
use crate::data_type::TantivyDataType;
|
||||
use crate::index_reader::IndexReaderWrapper;
|
||||
|
||||
pub struct IndexWriterWrapper {
|
||||
pub field_name: String,
|
||||
|
|
|
@ -2,7 +2,9 @@ use core::slice;
|
|||
use std::ffi::{c_char, c_void, CStr};
|
||||
|
||||
use crate::{
|
||||
data_type::TantivyDataType, index_writer::IndexWriterWrapper, util::{create_binding, free_binding},
|
||||
data_type::TantivyDataType,
|
||||
index_writer::IndexWriterWrapper,
|
||||
util::{create_binding, free_binding},
|
||||
};
|
||||
|
||||
#[no_mangle]
|
||||
|
@ -31,9 +33,7 @@ pub extern "C" fn tantivy_free_index_writer(ptr: *mut c_void) {
|
|||
#[no_mangle]
|
||||
pub extern "C" fn tantivy_finish_index(ptr: *mut c_void) {
|
||||
let real = ptr as *mut IndexWriterWrapper;
|
||||
unsafe {
|
||||
Box::from_raw(real).finish()
|
||||
}
|
||||
unsafe { Box::from_raw(real).finish() }
|
||||
}
|
||||
|
||||
// -------------------------build--------------------
|
||||
|
|
|
@ -1,11 +1,14 @@
|
|||
mod data_type;
|
||||
mod index_writer;
|
||||
mod index_writer_c;
|
||||
mod util;
|
||||
mod util_c;
|
||||
mod array;
|
||||
mod data_type;
|
||||
mod hashset_collector;
|
||||
mod index_reader;
|
||||
mod index_reader_c;
|
||||
mod index_writer;
|
||||
mod index_writer_c;
|
||||
mod linkedlist_collector;
|
||||
mod util;
|
||||
mod util_c;
|
||||
mod vec_collector;
|
||||
|
||||
pub fn add(left: usize, right: usize) -> usize {
|
||||
left + right
|
||||
|
|
61
internal/core/thirdparty/tantivy/tantivy-binding/src/linkedlist_collector.rs
vendored
Normal file
61
internal/core/thirdparty/tantivy/tantivy-binding/src/linkedlist_collector.rs
vendored
Normal file
|
@ -0,0 +1,61 @@
|
|||
use std::collections::LinkedList;
|
||||
|
||||
use tantivy::{
|
||||
collector::{Collector, SegmentCollector},
|
||||
DocId,
|
||||
};
|
||||
|
||||
pub struct LinkedListCollector;
|
||||
|
||||
impl Collector for LinkedListCollector {
|
||||
type Fruit = LinkedList<DocId>;
|
||||
|
||||
type Child = LinkedListChildCollector;
|
||||
|
||||
fn for_segment(
|
||||
&self,
|
||||
_segment_local_id: tantivy::SegmentOrdinal,
|
||||
_segment: &tantivy::SegmentReader,
|
||||
) -> tantivy::Result<Self::Child> {
|
||||
Ok(LinkedListChildCollector {
|
||||
docs: LinkedList::new(),
|
||||
})
|
||||
}
|
||||
|
||||
fn requires_scoring(&self) -> bool {
|
||||
false
|
||||
}
|
||||
|
||||
fn merge_fruits(
|
||||
&self,
|
||||
segment_fruits: Vec<LinkedList<DocId>>,
|
||||
) -> tantivy::Result<LinkedList<DocId>> {
|
||||
if segment_fruits.len() == 1 {
|
||||
Ok(segment_fruits.into_iter().next().unwrap())
|
||||
} else {
|
||||
let mut result = LinkedList::new();
|
||||
for docs in segment_fruits {
|
||||
for doc in docs {
|
||||
result.push_front(doc);
|
||||
}
|
||||
}
|
||||
Ok(result)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct LinkedListChildCollector {
|
||||
docs: LinkedList<DocId>,
|
||||
}
|
||||
|
||||
impl SegmentCollector for LinkedListChildCollector {
|
||||
type Fruit = LinkedList<DocId>;
|
||||
|
||||
fn collect(&mut self, doc: DocId, _score: tantivy::Score) {
|
||||
self.docs.push_front(doc);
|
||||
}
|
||||
|
||||
fn harvest(self) -> Self::Fruit {
|
||||
self.docs
|
||||
}
|
||||
}
|
|
@ -1,5 +1,5 @@
|
|||
use std::ffi::c_void;
|
||||
use std::ops::Bound;
|
||||
use std::ffi::{c_void};
|
||||
|
||||
use tantivy::{directory::MmapDirectory, Index};
|
||||
|
||||
|
@ -17,9 +17,9 @@ pub fn make_bounds<T>(bound: T, inclusive: bool) -> Bound<T> {
|
|||
}
|
||||
|
||||
pub fn create_binding<T>(wrapper: T) -> *mut c_void {
|
||||
let bp = Box::new(wrapper);
|
||||
let p_heap : *mut T = Box::into_raw(bp);
|
||||
p_heap as *mut c_void
|
||||
let bp = Box::new(wrapper);
|
||||
let p_heap: *mut T = Box::into_raw(bp);
|
||||
p_heap as *mut c_void
|
||||
}
|
||||
|
||||
pub fn free_binding<T>(ptr: *mut c_void) {
|
||||
|
|
|
@ -1,9 +1,9 @@
|
|||
use std::ffi::{c_char, CStr};
|
||||
|
||||
use crate::{util::index_exist};
|
||||
use crate::util::index_exist;
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn tantivy_index_exist(path: *const c_char) -> bool {
|
||||
let path_str = unsafe { CStr::from_ptr(path) };
|
||||
index_exist(path_str.to_str().unwrap())
|
||||
let path_str = unsafe { CStr::from_ptr(path) };
|
||||
index_exist(path_str.to_str().unwrap())
|
||||
}
|
||||
|
|
|
@ -0,0 +1,55 @@
|
|||
use tantivy::{
|
||||
collector::{Collector, SegmentCollector},
|
||||
DocId,
|
||||
};
|
||||
|
||||
pub struct VecCollector;
|
||||
|
||||
impl Collector for VecCollector {
|
||||
type Fruit = Vec<DocId>;
|
||||
|
||||
type Child = VecChildCollector;
|
||||
|
||||
fn for_segment(
|
||||
&self,
|
||||
_segment_local_id: tantivy::SegmentOrdinal,
|
||||
_segment: &tantivy::SegmentReader,
|
||||
) -> tantivy::Result<Self::Child> {
|
||||
Ok(VecChildCollector { docs: Vec::new() })
|
||||
}
|
||||
|
||||
fn requires_scoring(&self) -> bool {
|
||||
false
|
||||
}
|
||||
|
||||
fn merge_fruits(&self, segment_fruits: Vec<Vec<DocId>>) -> tantivy::Result<Vec<DocId>> {
|
||||
if segment_fruits.len() == 1 {
|
||||
Ok(segment_fruits.into_iter().next().unwrap())
|
||||
} else {
|
||||
let len: usize = segment_fruits.iter().map(|docset| docset.len()).sum();
|
||||
let mut result = Vec::with_capacity(len);
|
||||
for docs in segment_fruits {
|
||||
for doc in docs {
|
||||
result.push(doc);
|
||||
}
|
||||
}
|
||||
Ok(result)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct VecChildCollector {
|
||||
docs: Vec<DocId>,
|
||||
}
|
||||
|
||||
impl SegmentCollector for VecChildCollector {
|
||||
type Fruit = Vec<DocId>;
|
||||
|
||||
fn collect(&mut self, doc: DocId, _score: tantivy::Score) {
|
||||
self.docs.push(doc);
|
||||
}
|
||||
|
||||
fn harvest(self) -> Self::Fruit {
|
||||
self.docs
|
||||
}
|
||||
}
|
|
@ -0,0 +1,65 @@
|
|||
#pragma once
|
||||
|
||||
#include <chrono>
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
|
||||
class TimeRecorder {
|
||||
using stdclock = std::chrono::high_resolution_clock;
|
||||
|
||||
public:
|
||||
// trace = 0, debug = 1, info = 2, warn = 3, error = 4, critical = 5
|
||||
explicit TimeRecorder(std::string hdr, int64_t log_level = 0)
|
||||
: header_(std::move(hdr)), log_level_(log_level) {
|
||||
start_ = last_ = stdclock::now();
|
||||
}
|
||||
virtual ~TimeRecorder() = default;
|
||||
|
||||
double
|
||||
RecordSection(const std::string& msg) {
|
||||
stdclock::time_point curr = stdclock::now();
|
||||
double span =
|
||||
(std::chrono::duration<double, std::micro>(curr - last_)).count();
|
||||
last_ = curr;
|
||||
|
||||
PrintTimeRecord(msg, span);
|
||||
return span;
|
||||
}
|
||||
|
||||
double
|
||||
ElapseFromBegin(const std::string& msg) {
|
||||
stdclock::time_point curr = stdclock::now();
|
||||
double span =
|
||||
(std::chrono::duration<double, std::micro>(curr - start_)).count();
|
||||
|
||||
PrintTimeRecord(msg, span);
|
||||
return span;
|
||||
}
|
||||
|
||||
static std::string
|
||||
GetTimeSpanStr(double span) {
|
||||
std::string str_ms = std::to_string(span * 0.001) + " ms";
|
||||
return str_ms;
|
||||
}
|
||||
|
||||
private:
|
||||
void
|
||||
PrintTimeRecord(const std::string& msg, double span) {
|
||||
std::string str_log;
|
||||
if (!header_.empty()) {
|
||||
str_log += header_ + ": ";
|
||||
}
|
||||
str_log += msg;
|
||||
str_log += " (";
|
||||
str_log += TimeRecorder::GetTimeSpanStr(span);
|
||||
str_log += ")";
|
||||
|
||||
std::cout << str_log << std::endl;
|
||||
}
|
||||
|
||||
private:
|
||||
std::string header_;
|
||||
stdclock::time_point start_;
|
||||
stdclock::time_point last_;
|
||||
int64_t log_level_;
|
||||
};
|
Loading…
Reference in New Issue