enhance: add tantivy collector for i64 (#39850)

issue: #39852

Signed-off-by: SpadeA <tangchenjie1210@gmail.com>
pull/39893/head
Spade A 2025-02-14 15:50:15 +08:00 committed by GitHub
parent 36e5b545b5
commit f7d9587720
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 149 additions and 22 deletions

View File

@ -148,7 +148,8 @@ struct ArithCompareOperator {
} else if constexpr (AOp == ArithOpType::Div) {
return CompareOperator<CmpOp>::compare(left / right, value);
} else if constexpr (AOp == ArithOpType::Mod) {
return CompareOperator<CmpOp>::compare(long(left) % long(right), value);
return CompareOperator<CmpOp>::compare(long(left) % long(right),
value);
} else {
// unimplemented
static_assert(always_false_v<T>, "unimplemented");

View File

@ -122,7 +122,8 @@ struct ArithOpElementFunc {
res[i] = (src[offset] / right_operand) == val;
} else if constexpr (arith_op ==
proto::plan::ArithOpType::Mod) {
res[i] = (long(src[offset]) % long(right_operand)) == val;
res[i] =
(long(src[offset]) % long(right_operand)) == val;
} else {
PanicInfo(OpTypeInvalid,
fmt::format("unsupported arith type:{} for "
@ -143,7 +144,8 @@ struct ArithOpElementFunc {
res[i] = (src[offset] / right_operand) != val;
} else if constexpr (arith_op ==
proto::plan::ArithOpType::Mod) {
res[i] = (long(src[offset]) % long(right_operand)) != val;
res[i] =
(long(src[offset]) % long(right_operand)) != val;
} else {
PanicInfo(OpTypeInvalid,
fmt::format("unsupported arith type:{} for "
@ -165,7 +167,8 @@ struct ArithOpElementFunc {
res[i] = (src[offset] / right_operand) > val;
} else if constexpr (arith_op ==
proto::plan::ArithOpType::Mod) {
res[i] = (long(src[offset]) % long(right_operand)) > val;
res[i] =
(long(src[offset]) % long(right_operand)) > val;
} else {
PanicInfo(OpTypeInvalid,
fmt::format("unsupported arith type:{} for "
@ -187,7 +190,8 @@ struct ArithOpElementFunc {
res[i] = (src[offset] / right_operand) >= val;
} else if constexpr (arith_op ==
proto::plan::ArithOpType::Mod) {
res[i] = (long(src[offset]) % long(right_operand)) >= val;
res[i] =
(long(src[offset]) % long(right_operand)) >= val;
} else {
PanicInfo(OpTypeInvalid,
fmt::format("unsupported arith type:{} for "
@ -208,7 +212,8 @@ struct ArithOpElementFunc {
res[i] = (src[offset] / right_operand) < val;
} else if constexpr (arith_op ==
proto::plan::ArithOpType::Mod) {
res[i] = (long(src[offset]) % long(right_operand)) < val;
res[i] =
(long(src[offset]) % long(right_operand)) < val;
} else {
PanicInfo(OpTypeInvalid,
fmt::format("unsupported arith type:{} for "
@ -229,7 +234,8 @@ struct ArithOpElementFunc {
res[i] = (src[offset] / right_operand) <= val;
} else if constexpr (arith_op ==
proto::plan::ArithOpType::Mod) {
res[i] = (long(src[offset]) % long(right_operand)) <= val;
res[i] =
(long(src[offset]) % long(right_operand)) <= val;
} else {
PanicInfo(OpTypeInvalid,
fmt::format("unsupported arith type:{} for "

View File

@ -20,10 +20,17 @@ struct RustArray {
size_t cap;
};
// Raw parts of an array of 64-bit signed integers: data pointer, element
// count and capacity. This header declares free_rust_array_i64() taking this
// struct by value; presumably that is how the buffer is released -- confirm
// against the Rust side.
struct RustArrayI64 {
// Pointer to the first element.
int64_t *array;
// Number of elements.
size_t len;
// Capacity of the buffer, in elements.
size_t cap;
};
struct Value {
enum class Tag {
None,
RustArray,
RustArrayI64,
U32,
Ptr,
};
@ -36,6 +43,10 @@ struct Value {
RustArray _0;
};
struct RustArrayI64_Body {
RustArrayI64 _0;
};
struct U32_Body {
uint32_t _0;
};
@ -48,6 +59,7 @@ struct Value {
union {
None_Body none;
RustArray_Body rust_array;
RustArrayI64_Body rust_array_i64;
U32_Body u32;
Ptr_Body ptr;
};
@ -63,6 +75,8 @@ extern "C" {
void free_rust_array(RustArray array);
void free_rust_array_i64(RustArrayI64 array);
void free_rust_result(RustResult result);
void free_rust_error(const char *error);

View File

@ -54,10 +54,55 @@ pub extern "C" fn free_rust_array(array: RustArray) {
}
}
/// C-layout carrier for the raw parts of a `Vec<i64>`.
///
/// `from_vec` fills it from a vector without freeing the buffer, and
/// `free_rust_array_i64` rebuilds the vector from the same three fields.
#[repr(C)]
pub struct RustArrayI64 {
// Pointer to the first element; null in the `Default` value.
array: *mut i64,
// Number of elements.
len: size_t,
// Capacity of the buffer, in elements.
cap: size_t,
}
impl RustArrayI64 {
    /// Breaks an owned `Vec<i64>` into its raw parts without freeing the buffer.
    ///
    /// The allocation is intentionally not dropped here; the returned pointer,
    /// length and capacity are what `free_rust_array_i64` later uses to rebuild
    /// the vector and release it.
    pub fn from_vec(vec: Vec<i64>) -> RustArrayI64 {
        // Suppress the vector's destructor so the buffer outlives this call.
        let mut raw = std::mem::ManuallyDrop::new(vec);
        RustArrayI64 {
            array: raw.as_mut_ptr(),
            len: raw.len(),
            cap: raw.capacity(),
        }
    }
}
impl std::default::Default for RustArrayI64 {
fn default() -> Self {
RustArrayI64 {
array: std::ptr::null_mut(),
len: 0,
cap: 0,
}
}
}
impl From<Vec<i64>> for RustArrayI64 {
fn from(vec: Vec<i64>) -> Self {
RustArrayI64::from_vec(vec)
}
}
/// Releases the buffer described by `array`.
///
/// A value whose pointer is null (for example `RustArrayI64::default()`) owns
/// no allocation and is ignored; rebuilding a `Vec` from a null pointer would
/// be undefined behaviour.
#[no_mangle]
pub extern "C" fn free_rust_array_i64(array: RustArrayI64) {
    let RustArrayI64 { array, len, cap } = array;
    if array.is_null() {
        return;
    }
    // SAFETY: a non-null `array` is expected to carry exactly the pointer,
    // length and capacity captured by `RustArrayI64::from_vec`, and the caller
    // must not free the same array twice.
    unsafe {
        drop(Vec::from_raw_parts(array, len, cap));
    }
}
/// C-layout enum with one variant per kind of payload.
#[repr(C)]
pub enum Value {
// No payload.
None(()),
// Payload is a `RustArray`.
RustArray(RustArray),
// Payload is a `RustArrayI64` (raw parts of a `Vec<i64>`).
RustArrayI64(RustArrayI64),
// Payload is a single `u32`.
U32(u32),
// Payload is an untyped raw pointer.
Ptr(*mut c_void),
}
@ -74,7 +119,7 @@ macro_rules! impl_from_for_enum {
};
}
impl_from_for_enum!(Value, None => (), RustArray => RustArray, RustArray => Vec<u32>, U32 => u32, Ptr => *mut c_void);
impl_from_for_enum!(Value, None => (), RustArrayI64 => RustArrayI64, RustArray => RustArray, RustArray => Vec<u32>, U32 => u32, Ptr => *mut c_void);
#[repr(C)]
pub struct RustResult {

View File

@ -4,11 +4,19 @@ use tantivy::{
DocId, Score, SegmentOrdinal, SegmentReader,
};
pub(crate) struct DocIdCollector;
/// Collector parameterised by its output element type `T`.
///
/// `Collector` is implemented separately for `DocIdCollector<u32>` and
/// `DocIdCollector<i64>`, yielding `Vec<u32>` and `Vec<i64>` respectively.
#[derive(Default)]
pub(crate) struct DocIdCollector<T> {
// Zero-sized marker; `T` only selects which `Collector` impl applies.
_phantom: std::marker::PhantomData<T>,
}
impl Collector for DocIdCollector {
/// Per-segment collector that accumulates values of type `T` and returns them
/// from `harvest`.
pub(crate) struct DocIdChildCollector<T> {
// Values gathered so far for this segment.
docs: Vec<T>,
// Column the values are read from; the `i64` collector opens the "doc_id"
// fast field of the segment for this.
column: Column<i64>,
}
impl Collector for DocIdCollector<u32> {
type Fruit = Vec<u32>;
type Child = DocIdChildCollector;
type Child = DocIdChildCollector<u32>;
fn for_segment(
&self,
@ -40,12 +48,7 @@ impl Collector for DocIdCollector {
}
}
pub(crate) struct DocIdChildCollector {
docs: Vec<u32>,
column: Column<i64>,
}
impl SegmentCollector for DocIdChildCollector {
impl SegmentCollector for DocIdChildCollector<u32> {
type Fruit = Vec<u32>;
fn collect(&mut self, doc: DocId, _score: Score) {
@ -58,3 +61,51 @@ impl SegmentCollector for DocIdChildCollector {
self.docs
}
}
impl Collector for DocIdCollector<i64> {
    type Fruit = Vec<i64>;
    type Child = DocIdChildCollector<i64>;

    /// Builds the per-segment collector backed by the segment's "doc_id" fast field.
    ///
    /// # Errors
    /// Returns the underlying tantivy error when the "doc_id" column cannot be
    /// opened as `i64`, instead of panicking.
    fn for_segment(
        &self,
        _segment_local_id: SegmentOrdinal,
        segment: &SegmentReader,
    ) -> tantivy::Result<Self::Child> {
        let column = segment.fast_fields().i64("doc_id")?;
        Ok(DocIdChildCollector {
            docs: Vec::new(),
            column,
        })
    }

    /// Scores are never read by this collector.
    fn requires_scoring(&self) -> bool {
        false
    }

    /// Concatenates the per-segment results, in segment order, into one vector.
    fn merge_fruits(
        &self,
        segment_fruits: Vec<<Self::Child as SegmentCollector>::Fruit>,
    ) -> tantivy::Result<Self::Fruit> {
        // Size the output once so the appends below never reallocate.
        let total: usize = segment_fruits.iter().map(|docs| docs.len()).sum();
        let mut merged = Vec::with_capacity(total);
        for docs in segment_fruits {
            merged.extend(docs);
        }
        Ok(merged)
    }
}
impl SegmentCollector for DocIdChildCollector<i64> {
    type Fruit = Vec<i64>;

    /// Appends every value the column holds for `doc`; the score is ignored.
    fn collect(&mut self, doc: DocId, _score: Score) {
        let values = self.column.values_for_doc(doc);
        self.docs.extend(values);
    }

    /// Hands back everything collected for this segment.
    fn harvest(self) -> Self::Fruit {
        self.docs
    }
}

View File

@ -73,7 +73,7 @@ impl IndexReaderWrapper {
Some(_) => {
// newer version with doc_id.
searcher
.search(q, &DocIdCollector {})
.search(q, &DocIdCollector::<u32>::default())
.map_err(TantivyBindingError::TantivyError)
}
None => {
@ -85,6 +85,16 @@ impl IndexReaderWrapper {
}
}
// Generally, [`Self::search`] should be used instead; this variant exists for the
// special scenarios where a doc_id could be beyond the range of u32.
//
// Panics if `id_field` is `None`.
pub(crate) fn search_i64(&self, q: &dyn Query) -> Result<Vec<i64>> {
    assert!(self.id_field.is_some());
    let collector = DocIdCollector::<i64>::default();
    let searcher = self.reader.searcher();
    let hits = searcher.search(q, &collector);
    hits.map_err(TantivyBindingError::TantivyError)
}
pub fn term_query_i64(&self, term: i64) -> Result<Vec<u32>> {
let q = TermQuery::new(
Term::from_field_i64(self.field, term),

View File

@ -2,6 +2,7 @@ mod array;
mod data_type;
mod demo_c;
mod docid_collector;
mod error;
mod hashmap_c;
mod index_reader;
mod index_reader_c;
@ -11,18 +12,17 @@ mod index_writer;
mod index_writer_c;
mod index_writer_text;
mod index_writer_text_c;
mod jieba_tokenizer;
mod log;
mod stop_words;
mod string_c;
mod token_stream_c;
mod tokenizer;
mod tokenizer_filter;
mod tokenizer_c;
mod tokenizer_filter;
mod util;
mod error;
mod util_c;
mod vec_collector;
mod stop_words;
mod jieba_tokenizer;
pub fn add(left: usize, right: usize) -> usize {
left + right