Add sealedSegment (go&c) unittest, fix growingSegment field id check

Signed-off-by: FluorineDog <guilin.gou@zilliz.com>
pull/4973/head^2
FluorineDog 2021-01-22 17:40:58 +08:00 committed by yefu.chen
parent 067c30c422
commit bff208d78c
24 changed files with 233 additions and 230 deletions

View File

@@ -319,14 +319,14 @@ Parser::ParseItemList(const Json& body) {
     std::vector<ExprPtr> results;
     if (body.is_object()) {
         // only one item;
-        auto new_entry = ParseAnyNode(body);
-        results.emplace_back(std::move(new_entry));
+        auto new_expr = ParseAnyNode(body);
+        results.emplace_back(std::move(new_expr));
     } else {
         // item array
         Assert(body.is_array());
         for (auto& item : body) {
-            auto new_entry = ParseAnyNode(item);
-            results.emplace_back(std::move(new_entry));
+            auto new_expr = ParseAnyNode(item);
+            results.emplace_back(std::move(new_expr));
         }
     }
     auto old_size = results.size();

View File

@@ -24,7 +24,7 @@ SubQueryResult
 BinarySearchBruteForceFast(MetricType metric_type,
                            int64_t dim,
                            const uint8_t* binary_chunk,
-                           int64_t chunk_size,
+                           int64_t size_per_chunk,
                            int64_t topk,
                            int64_t num_queries,
                            const uint8_t* query_data,
@@ -34,7 +34,7 @@ BinarySearchBruteForceFast(MetricType metric_type,
     idx_t* result_labels = sub_result.get_labels();
     int64_t code_size = dim / 8;
-    const idx_t block_size = chunk_size;
+    const idx_t block_size = size_per_chunk;
     bool use_heap = true;
     if (metric_type == faiss::METRIC_Jaccard || metric_type == faiss::METRIC_Tanimoto) {
@@ -50,7 +50,7 @@ BinarySearchBruteForceFast(MetricType metric_type,
                                               result_labels + query_base_index * topk, D + query_base_index * topk};
             binary_distence_knn_hc(metric_type, &res, query_data + query_base_index * code_size, binary_chunk,
-                                   chunk_size, code_size,
+                                   size_per_chunk, code_size,
                                    /* ordered = */ true, bitset);
         }
         if (metric_type == faiss::METRIC_Tanimoto) {
@@ -67,7 +67,7 @@ BinarySearchBruteForceFast(MetricType metric_type,
             }
             // only match ids will be chosed, not to use heap
-            binary_distence_knn_mc(metric_type, query_data + s * code_size, binary_chunk, nn, chunk_size, topk,
+            binary_distence_knn_mc(metric_type, query_data + s * code_size, binary_chunk, nn, size_per_chunk, topk,
                                    code_size, D + s * topk, result_labels + s * topk, bitset);
         }
     } else if (metric_type == faiss::METRIC_Hamming) {
@@ -82,10 +82,10 @@ BinarySearchBruteForceFast(MetricType metric_type,
                 faiss::int_maxheap_array_t res = {size_t(nn), size_t(topk), result_labels + s * topk,
                                                   int_distances.data() + s * topk};
-                hammings_knn_hc(&res, query_data + s * code_size, binary_chunk, chunk_size, code_size,
+                hammings_knn_hc(&res, query_data + s * code_size, binary_chunk, size_per_chunk, code_size,
                                 /* ordered = */ true, bitset);
             } else {
-                hammings_knn_mc(query_data + s * code_size, binary_chunk, nn, chunk_size, topk, code_size,
+                hammings_knn_mc(query_data + s * code_size, binary_chunk, nn, size_per_chunk, topk, code_size,
                                 int_distances.data() + s * topk, result_labels + s * topk, bitset);
             }
         }
@@ -101,7 +101,7 @@ BinarySearchBruteForceFast(MetricType metric_type,
 SubQueryResult
 FloatSearchBruteForce(const dataset::FloatQueryDataset& query_dataset,
                       const float* chunk_data,
-                      int64_t chunk_size,
+                      int64_t size_per_chunk,
                       const faiss::BitsetView& bitset) {
     auto metric_type = query_dataset.metric_type;
     auto num_queries = query_dataset.num_queries;
@@ -111,11 +111,11 @@ FloatSearchBruteForce(const dataset::FloatQueryDataset& query_dataset,
     if (metric_type == MetricType::METRIC_L2) {
         faiss::float_maxheap_array_t buf{(size_t)num_queries, (size_t)topk, sub_qr.get_labels(), sub_qr.get_values()};
-        faiss::knn_L2sqr(query_dataset.query_data, chunk_data, dim, num_queries, chunk_size, &buf, bitset);
+        faiss::knn_L2sqr(query_dataset.query_data, chunk_data, dim, num_queries, size_per_chunk, &buf, bitset);
         return sub_qr;
     } else {
         faiss::float_minheap_array_t buf{(size_t)num_queries, (size_t)topk, sub_qr.get_labels(), sub_qr.get_values()};
-        faiss::knn_inner_product(query_dataset.query_data, chunk_data, dim, num_queries, chunk_size, &buf, bitset);
+        faiss::knn_inner_product(query_dataset.query_data, chunk_data, dim, num_queries, size_per_chunk, &buf, bitset);
         return sub_qr;
     }
 }
@@ -123,10 +123,10 @@ FloatSearchBruteForce(const dataset::FloatQueryDataset& query_dataset,
 SubQueryResult
 BinarySearchBruteForce(const dataset::BinaryQueryDataset& query_dataset,
                        const uint8_t* binary_chunk,
-                       int64_t chunk_size,
+                       int64_t size_per_chunk,
                        const faiss::BitsetView& bitset) {
     // TODO: refactor the internal function
-    return BinarySearchBruteForceFast(query_dataset.metric_type, query_dataset.dim, binary_chunk, chunk_size,
+    return BinarySearchBruteForceFast(query_dataset.metric_type, query_dataset.dim, binary_chunk, size_per_chunk,
                                       query_dataset.topk, query_dataset.num_queries, query_dataset.query_data, bitset);
 }
 } // namespace milvus::query
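Note on units in this file: after the rename, size_per_chunk consistently counts rows in the chunk, while byte offsets are derived through code_size = dim / 8. A standalone sketch of that addressing arithmetic (hypothetical values, not part of the commit):

#include <cassert>
#include <cstdint>
#include <vector>

int main() {
    // Geometry mirroring the calls above: binary vectors are packed
    // bitmaps, so one vector of `dim` bits occupies dim / 8 bytes.
    int64_t dim = 512;
    int64_t size_per_chunk = 4;   // rows in this chunk, not bytes
    int64_t code_size = dim / 8;  // 64 bytes per binary vector
    std::vector<uint8_t> binary_chunk(size_per_chunk * code_size);

    // The i-th vector starts at byte i * code_size, mirroring
    // `query_data + query_base_index * code_size` in the diff.
    int64_t i = 3;
    const uint8_t* vec_i = binary_chunk.data() + i * code_size;
    assert(vec_i - binary_chunk.data() == 3 * 64);
    return 0;
}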

View File

@@ -21,13 +21,13 @@ namespace milvus::query {
 SubQueryResult
 BinarySearchBruteForce(const dataset::BinaryQueryDataset& query_dataset,
                        const uint8_t* binary_chunk,
-                       int64_t chunk_size,
+                       int64_t size_per_chunk,
                        const faiss::BitsetView& bitset);
 SubQueryResult
 FloatSearchBruteForce(const dataset::FloatQueryDataset& query_dataset,
                       const float* chunk_data,
-                      int64_t chunk_size,
+                      int64_t size_per_chunk,
                       const faiss::BitsetView& bitset);
 } // namespace milvus::query

View File

@@ -72,46 +72,46 @@ FloatSearch(const segcore::SegmentGrowingImpl& segment,
     dataset::FloatQueryDataset query_dataset{metric_type, num_queries, topK, dim, query_data};
     auto max_indexed_id = indexing_record.get_finished_ack();
-    const auto& indexing_entry = indexing_record.get_vec_entry(vecfield_offset);
-    auto search_conf = indexing_entry.get_search_conf(topK);
+    const auto& field_indexing = indexing_record.get_vec_field_indexing(vecfield_offset);
+    auto search_conf = field_indexing.get_search_conf(topK);
     for (int chunk_id = 0; chunk_id < max_indexed_id; ++chunk_id) {
-        auto chunk_size = indexing_entry.get_chunk_size();
-        auto indexing = indexing_entry.get_indexing(chunk_id);
-        auto sub_view = BitsetSubView(bitset, chunk_id * chunk_size, chunk_size);
+        auto size_per_chunk = field_indexing.get_size_per_chunk();
+        auto indexing = field_indexing.get_chunk_indexing(chunk_id);
+        auto sub_view = BitsetSubView(bitset, chunk_id * size_per_chunk, size_per_chunk);
         auto sub_qr = SearchOnIndex(query_dataset, *indexing, search_conf, sub_view);
         // convert chunk uid to segment uid
         for (auto& x : sub_qr.mutable_labels()) {
             if (x != -1) {
-                x += chunk_id * chunk_size;
+                x += chunk_id * size_per_chunk;
             }
         }
         final_qr.merge(sub_qr);
     }
-    auto vec_ptr = record.get_entity<FloatVector>(vecfield_offset);
+    auto vec_ptr = record.get_field_data<FloatVector>(vecfield_offset);
     // step 4: brute force search where small indexing is unavailable
-    auto vec_chunk_size = vec_ptr->get_chunk_size();
-    Assert(vec_chunk_size == indexing_entry.get_chunk_size());
-    auto max_chunk = upper_div(ins_barrier, vec_chunk_size);
+    auto vec_size_per_chunk = vec_ptr->get_size_per_chunk();
+    Assert(vec_size_per_chunk == field_indexing.get_size_per_chunk());
+    auto max_chunk = upper_div(ins_barrier, vec_size_per_chunk);
     for (int chunk_id = max_indexed_id; chunk_id < max_chunk; ++chunk_id) {
         auto& chunk = vec_ptr->get_chunk(chunk_id);
-        auto element_begin = chunk_id * vec_chunk_size;
-        auto element_end = std::min(ins_barrier, (chunk_id + 1) * vec_chunk_size);
-        auto chunk_size = element_end - element_begin;
-        auto sub_view = BitsetSubView(bitset, element_begin, chunk_size);
-        auto sub_qr = FloatSearchBruteForce(query_dataset, chunk.data(), chunk_size, sub_view);
+        auto element_begin = chunk_id * vec_size_per_chunk;
+        auto element_end = std::min(ins_barrier, (chunk_id + 1) * vec_size_per_chunk);
+        auto size_per_chunk = element_end - element_begin;
+        auto sub_view = BitsetSubView(bitset, element_begin, size_per_chunk);
+        auto sub_qr = FloatSearchBruteForce(query_dataset, chunk.data(), size_per_chunk, sub_view);
         // convert chunk uid to segment uid
         for (auto& x : sub_qr.mutable_labels()) {
             if (x != -1) {
-                x += chunk_id * vec_chunk_size;
+                x += chunk_id * vec_size_per_chunk;
             }
         }
         final_qr.merge(sub_qr);
@@ -160,18 +160,18 @@ BinarySearch(const segcore::SegmentGrowingImpl& segment,
     // step 3: small indexing search
     query::dataset::BinaryQueryDataset query_dataset{metric_type, num_queries, topK, dim, query_data};
-    auto vec_ptr = record.get_entity<BinaryVector>(vecfield_offset);
+    auto vec_ptr = record.get_field_data<BinaryVector>(vecfield_offset);
     auto max_indexed_id = 0;
     // step 4: brute force search where small indexing is unavailable
-    auto vec_chunk_size = vec_ptr->get_chunk_size();
-    auto max_chunk = upper_div(ins_barrier, vec_chunk_size);
+    auto vec_size_per_chunk = vec_ptr->get_size_per_chunk();
+    auto max_chunk = upper_div(ins_barrier, vec_size_per_chunk);
     SubQueryResult final_result(num_queries, topK, metric_type);
     for (int chunk_id = max_indexed_id; chunk_id < max_chunk; ++chunk_id) {
         auto& chunk = vec_ptr->get_chunk(chunk_id);
-        auto element_begin = chunk_id * vec_chunk_size;
-        auto element_end = std::min(ins_barrier, (chunk_id + 1) * vec_chunk_size);
+        auto element_begin = chunk_id * vec_size_per_chunk;
+        auto element_end = std::min(ins_barrier, (chunk_id + 1) * vec_size_per_chunk);
         auto nsize = element_end - element_begin;
         auto sub_view = BitsetSubView(bitset, element_begin, nsize);
@@ -180,7 +180,7 @@ BinarySearch(const segcore::SegmentGrowingImpl& segment,
         // convert chunk uid to segment uid
         for (auto& x : sub_result.mutable_labels()) {
             if (x != -1) {
-                x += chunk_id * vec_chunk_size;
+                x += chunk_id * vec_size_per_chunk;
             }
         }
         final_result.merge(sub_result);
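The loops above are the heart of growing-segment search: chunks below max_indexed_id are answered by the per-chunk small index, the rest by brute force, and every chunk-local label is shifted by chunk_id * size_per_chunk into a segment-level offset. A self-contained sketch of that label conversion (hypothetical SubResult type, not the segcore API):

#include <cassert>
#include <cstdint>
#include <vector>

// Hypothetical stand-in for a per-chunk result: labels are chunk-local,
// -1 marks an empty slot, mirroring the `if (x != -1)` guard in the diff.
struct SubResult {
    std::vector<int64_t> labels;
};

// Shift chunk-local labels to segment-level offsets.
void ToSegmentOffsets(SubResult& sub, int64_t chunk_id, int64_t size_per_chunk) {
    for (auto& x : sub.labels) {
        if (x != -1) {
            x += chunk_id * size_per_chunk;
        }
    }
}

int main() {
    SubResult sub{{0, 5, -1}};
    ToSegmentOffsets(sub, /*chunk_id=*/2, /*size_per_chunk=*/1024);
    assert(sub.labels[0] == 2048 && sub.labels[1] == 2053 && sub.labels[2] == -1);
    return 0;
}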

View File

@@ -62,16 +62,16 @@ SearchOnSealed(const Schema& schema,
     auto dim = field.get_dim();
     Assert(record.is_ready(field_offset));
-    auto indexing_entry = record.get_entry(field_offset);
-    Assert(indexing_entry->metric_type_ == query_info.metric_type_);
+    auto field_indexing = record.get_field_indexing(field_offset);
+    Assert(field_indexing->metric_type_ == query_info.metric_type_);
     auto final = [&] {
         auto ds = knowhere::GenDataset(num_queries, dim, query_data);
         auto conf = query_info.search_params_;
         conf[milvus::knowhere::meta::TOPK] = query_info.topK_;
-        conf[milvus::knowhere::Metric::TYPE] = MetricTypeToName(indexing_entry->metric_type_);
-        return indexing_entry->indexing_->Query(ds, conf, bitset);
+        conf[milvus::knowhere::Metric::TYPE] = MetricTypeToName(field_indexing->metric_type_);
+        return field_indexing->indexing_->Query(ds, conf, bitset);
     }();
     auto ids = final->Get<idx_t*>(knowhere::meta::IDS);

View File

@@ -120,41 +120,33 @@ template <typename T, typename IndexFunc, typename ElementFunc>
 auto
 ExecExprVisitor::ExecRangeVisitorImpl(RangeExprImpl<T>& expr, IndexFunc index_func, ElementFunc element_func)
     -> RetType {
-    auto data_type = expr.data_type_;
     auto& schema = segment_.get_schema();
     auto field_offset = expr.field_offset_;
     auto& field_meta = schema[field_offset];
-    // auto vec_ptr = records.get_entity<T>(field_offset);
-    // auto& vec = *vec_ptr;
-    // const segcore::ScalarIndexingEntry<T>& entry = indexing_record.get_scalar_entry<T>(field_offset);
-    // RetType results(vec.num_chunk());
-    auto indexing_barrier = segment_.num_chunk_index_safe(field_offset);
-    auto chunk_size = segment_.size_per_chunk();
-    auto num_chunk = upper_div(row_count_, chunk_size);
+    auto indexing_barrier = segment_.num_chunk_index(field_offset);
+    auto size_per_chunk = segment_.size_per_chunk();
+    auto num_chunk = upper_div(row_count_, size_per_chunk);
     RetType results;
     using Index = knowhere::scalar::StructuredIndex<T>;
     for (auto chunk_id = 0; chunk_id < indexing_barrier; ++chunk_id) {
-        // auto& result = results[chunk_id];
         const Index& indexing = segment_.chunk_scalar_index<T>(field_offset, chunk_id);
         // NOTE: knowhere is not const-ready
         // This is a dirty workaround
         auto data = index_func(const_cast<Index*>(&indexing));
-        Assert(data->size() == chunk_size);
+        Assert(data->size() == size_per_chunk);
         results.emplace_back(std::move(*data));
     }
     for (auto chunk_id = indexing_barrier; chunk_id < num_chunk; ++chunk_id) {
-        boost::dynamic_bitset<> result(chunk_size);
-        // auto& result = results[chunk_id];
-        result.resize(chunk_size);
+        boost::dynamic_bitset<> result(size_per_chunk);
+        result.resize(size_per_chunk);
         auto chunk = segment_.chunk_data<T>(field_offset, chunk_id);
         const T* data = chunk.data();
-        for (int index = 0; index < chunk_size; ++index) {
+        for (int index = 0; index < size_per_chunk; ++index) {
             result[index] = element_func(data[index]);
         }
-        Assert(result.size() == chunk_size);
+        Assert(result.size() == size_per_chunk);
         results.emplace_back(std::move(result));
     }
     return results;
@@ -282,27 +274,19 @@ template <typename T>
 auto
 ExecExprVisitor::ExecTermVisitorImpl(TermExpr& expr_raw) -> RetType {
     auto& expr = static_cast<TermExprImpl<T>&>(expr_raw);
-    // auto& records = segment_.get_insert_record();
-    auto data_type = expr.data_type_;
     auto& schema = segment_.get_schema();
     auto field_offset = expr_raw.field_offset_;
     auto& field_meta = schema[field_offset];
-    // auto vec_ptr = records.get_entity<T>(field_offset);
-    // auto& vec = *vec_ptr;
-    auto chunk_size = segment_.size_per_chunk();
-    auto num_chunk = upper_div(row_count_, chunk_size);
+    auto size_per_chunk = segment_.size_per_chunk();
+    auto num_chunk = upper_div(row_count_, size_per_chunk);
     RetType bitsets;
-    // auto N = records.ack_responder_.GetAck();
-    // TODO: enable index for term
     for (int64_t chunk_id = 0; chunk_id < num_chunk; ++chunk_id) {
         Span<T> chunk = segment_.chunk_data<T>(field_offset, chunk_id);
-        auto size = chunk_id == num_chunk - 1 ? row_count_ - chunk_id * chunk_size : chunk_size;
-        boost::dynamic_bitset<> bitset(chunk_size);
+        auto size = chunk_id == num_chunk - 1 ? row_count_ - chunk_id * size_per_chunk : size_per_chunk;
+        boost::dynamic_bitset<> bitset(size_per_chunk);
         for (int i = 0; i < size; ++i) {
             auto value = chunk.data()[i];
             bool is_in = std::binary_search(expr.terms_.begin(), expr.terms_.end(), value);
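Both visitors share one chunking scheme: num_chunk = upper_div(row_count_, size_per_chunk), chunks below the indexing barrier are answered from the scalar index, the rest are scanned, and only the last chunk may be partially filled. A compact sketch of the scan path under those assumptions (plain std::vector<bool> standing in for boost::dynamic_bitset<>, not the real visitor):

#include <cstdint>
#include <vector>

// Ceiling division, matching how upper_div is used throughout the diff.
inline int64_t upper_div(int64_t n, int64_t divisor) {
    return (n + divisor - 1) / divisor;
}

// Scan path only; the indexed path would consult the per-chunk scalar
// index instead. Bitsets stay size_per_chunk wide; only the first `size`
// bits of the last chunk are meaningful, as in the diff.
template <typename T, typename Pred>
std::vector<std::vector<bool>>
ScanByChunk(const std::vector<std::vector<T>>& chunks, int64_t row_count,
            int64_t size_per_chunk, Pred pred) {
    auto num_chunk = upper_div(row_count, size_per_chunk);
    std::vector<std::vector<bool>> bitsets;
    for (int64_t chunk_id = 0; chunk_id < num_chunk; ++chunk_id) {
        auto size = chunk_id == num_chunk - 1
                        ? row_count - chunk_id * size_per_chunk
                        : size_per_chunk;
        std::vector<bool> bitset(size_per_chunk, false);
        for (int64_t i = 0; i < size; ++i) {
            bitset[i] = pred(chunks[chunk_id][i]);
        }
        bitsets.push_back(std::move(bitset));
    }
    return bitsets;
}

int main() {
    std::vector<std::vector<int>> chunks = {{1, 2, 3, 4}, {5, 6}};
    auto res = ScanByChunk(chunks, /*row_count=*/6, /*size_per_chunk=*/4,
                           [](int v) { return v % 2 == 0; });
    // res[1] is still 4 bits wide; only its first 2 bits are meaningful.
    return 0;
}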

View File

@@ -6,7 +6,7 @@ set(SEGCORE_FILES
         SegmentGrowing.cpp
         SegmentGrowingImpl.cpp
         SegmentSealedImpl.cpp
-        IndexingEntry.cpp
+        FieldIndexing.cpp
         InsertRecord.cpp
         Reduce.cpp
         plan_c.cpp

View File

@@ -72,7 +72,7 @@ class ThreadSafeVector {
 class VectorBase {
  public:
-    explicit VectorBase(int64_t chunk_size) : chunk_size_(chunk_size) {
+    explicit VectorBase(int64_t size_per_chunk) : size_per_chunk_(size_per_chunk) {
     }
     virtual ~VectorBase() = default;
@@ -86,12 +86,12 @@ class VectorBase {
     get_span_base(int64_t chunk_id) const = 0;
     int64_t
-    get_chunk_size() const {
-        return chunk_size_;
+    get_size_per_chunk() const {
+        return size_per_chunk_;
     }
  protected:
-    const int64_t chunk_size_;
+    const int64_t size_per_chunk_;
 };
 template <typename Type, bool is_scalar = false>
@@ -111,27 +111,28 @@ class ConcurrentVectorImpl : public VectorBase {
         std::conditional_t<is_scalar, Type, std::conditional_t<std::is_same_v<Type, float>, FloatVector, BinaryVector>>;
  public:
-    explicit ConcurrentVectorImpl(ssize_t dim, int64_t chunk_size) : VectorBase(chunk_size), Dim(is_scalar ? 1 : dim) {
+    explicit ConcurrentVectorImpl(ssize_t dim, int64_t size_per_chunk)
+        : VectorBase(size_per_chunk), Dim(is_scalar ? 1 : dim) {
         Assert(is_scalar ? dim == 1 : dim != 1);
     }
     void
     grow_to_at_least(int64_t element_count) override {
-        auto chunk_count = upper_div(element_count, chunk_size_);
-        chunks_.emplace_to_at_least(chunk_count, Dim * chunk_size_);
+        auto chunk_count = upper_div(element_count, size_per_chunk_);
+        chunks_.emplace_to_at_least(chunk_count, Dim * size_per_chunk_);
     }
     Span<TraitType>
     get_span(int64_t chunk_id) const {
         auto& chunk = get_chunk(chunk_id);
         if constexpr (is_scalar) {
-            return Span<TraitType>(chunk.data(), chunk_size_);
+            return Span<TraitType>(chunk.data(), size_per_chunk_);
         } else if constexpr (std::is_same_v<Type, int64_t> || std::is_same_v<Type, int>) {
             // only for testing
             PanicInfo("unimplemented");
         } else {
             static_assert(std::is_same_v<typename TraitType::embedded_type, Type>);
-            return Span<TraitType>(chunk.data(), chunk_size_, Dim);
+            return Span<TraitType>(chunk.data(), size_per_chunk_, Dim);
         }
     }
@@ -151,28 +152,28 @@ class ConcurrentVectorImpl : public VectorBase {
             return;
         }
         this->grow_to_at_least(element_offset + element_count);
-        auto chunk_id = element_offset / chunk_size_;
-        auto chunk_offset = element_offset % chunk_size_;
+        auto chunk_id = element_offset / size_per_chunk_;
+        auto chunk_offset = element_offset % size_per_chunk_;
         ssize_t source_offset = 0;
         // first partition:
-        if (chunk_offset + element_count <= chunk_size_) {
+        if (chunk_offset + element_count <= size_per_chunk_) {
             // only first
             fill_chunk(chunk_id, chunk_offset, element_count, source, source_offset);
             return;
         }
-        auto first_size = chunk_size_ - chunk_offset;
+        auto first_size = size_per_chunk_ - chunk_offset;
         fill_chunk(chunk_id, chunk_offset, first_size, source, source_offset);
-        source_offset += chunk_size_ - chunk_offset;
+        source_offset += size_per_chunk_ - chunk_offset;
         element_count -= first_size;
         ++chunk_id;
         // the middle
-        while (element_count >= chunk_size_) {
-            fill_chunk(chunk_id, 0, chunk_size_, source, source_offset);
-            source_offset += chunk_size_;
-            element_count -= chunk_size_;
+        while (element_count >= size_per_chunk_) {
+            fill_chunk(chunk_id, 0, size_per_chunk_, source, source_offset);
+            source_offset += size_per_chunk_;
+            element_count -= size_per_chunk_;
             ++chunk_id;
         }
@@ -190,16 +191,16 @@ class ConcurrentVectorImpl : public VectorBase {
     // just for fun, don't use it directly
     const Type*
     get_element(ssize_t element_index) const {
-        auto chunk_id = element_index / chunk_size_;
-        auto chunk_offset = element_index % chunk_size_;
+        auto chunk_id = element_index / size_per_chunk_;
+        auto chunk_offset = element_index % size_per_chunk_;
         return get_chunk(chunk_id).data() + chunk_offset * Dim;
     }
     const Type&
     operator[](ssize_t element_index) const {
         Assert(Dim == 1);
-        auto chunk_id = element_index / chunk_size_;
-        auto chunk_offset = element_index % chunk_size_;
+        auto chunk_id = element_index / size_per_chunk_;
+        auto chunk_offset = element_index % size_per_chunk_;
         return get_chunk(chunk_id)[chunk_offset];
     }
@@ -232,24 +233,24 @@ template <typename Type>
 class ConcurrentVector : public ConcurrentVectorImpl<Type, true> {
  public:
     static_assert(std::is_fundamental_v<Type>);
-    explicit ConcurrentVector(int64_t chunk_size)
-        : ConcurrentVectorImpl<Type, true>::ConcurrentVectorImpl(1, chunk_size) {
+    explicit ConcurrentVector(int64_t size_per_chunk)
+        : ConcurrentVectorImpl<Type, true>::ConcurrentVectorImpl(1, size_per_chunk) {
     }
 };
 template <>
 class ConcurrentVector<FloatVector> : public ConcurrentVectorImpl<float, false> {
  public:
-    ConcurrentVector(int64_t dim, int64_t chunk_size)
-        : ConcurrentVectorImpl<float, false>::ConcurrentVectorImpl(dim, chunk_size) {
+    ConcurrentVector(int64_t dim, int64_t size_per_chunk)
+        : ConcurrentVectorImpl<float, false>::ConcurrentVectorImpl(dim, size_per_chunk) {
     }
 };
 template <>
 class ConcurrentVector<BinaryVector> : public ConcurrentVectorImpl<uint8_t, false> {
  public:
-    explicit ConcurrentVector(int64_t dim, int64_t chunk_size)
-        : binary_dim_(dim), ConcurrentVectorImpl(dim / 8, chunk_size) {
+    explicit ConcurrentVector(int64_t dim, int64_t size_per_chunk)
+        : binary_dim_(dim), ConcurrentVectorImpl(dim / 8, size_per_chunk) {
         Assert(dim % 8 == 0);
     }
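The set_data path above splits a write that crosses chunk boundaries into a head fill, whole middle chunks, and a tail. A runnable toy model of that splitting (plain std::vector chunks standing in for the tbb-backed storage; not the real ConcurrentVector):

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstring>
#include <vector>

struct ToyChunkedVector {
    int64_t size_per_chunk;
    std::vector<std::vector<int64_t>> chunks;

    // Same head/middle/tail decomposition as the diff, collapsed into
    // one loop: each iteration fills at most one chunk.
    void set_data(int64_t element_offset, const int64_t* source, int64_t count) {
        grow_to_at_least(element_offset + count);
        auto chunk_id = element_offset / size_per_chunk;
        auto chunk_offset = element_offset % size_per_chunk;
        int64_t source_offset = 0;
        while (count > 0) {
            auto n = std::min(count, size_per_chunk - chunk_offset);
            std::memcpy(chunks[chunk_id].data() + chunk_offset,
                        source + source_offset, n * sizeof(int64_t));
            source_offset += n;
            count -= n;
            chunk_offset = 0;  // after the head, fills start at chunk begin
            ++chunk_id;
        }
    }

    void grow_to_at_least(int64_t element_count) {
        auto chunk_count = (element_count + size_per_chunk - 1) / size_per_chunk;
        while ((int64_t)chunks.size() < chunk_count) {
            chunks.emplace_back(size_per_chunk);
        }
    }
};

int main() {
    ToyChunkedVector v{4, {}};
    std::vector<int64_t> src{1, 2, 3, 4, 5, 6};
    v.set_data(3, src.data(), 6);  // spans chunks 0, 1, and 2
    assert(v.chunks[0][3] == 1 && v.chunks[1][0] == 2 && v.chunks[2][0] == 6);
    return 0;
}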

View File

@@ -29,9 +29,11 @@ struct DeletedRecord {
         std::shared_ptr<TmpBitmap>
         clone(int64_t capacity);
     };
-    static constexpr int64_t deprecated_chunk_size = 32 * 1024;
+    static constexpr int64_t deprecated_size_per_chunk = 32 * 1024;
     DeletedRecord()
-        : lru_(std::make_shared<TmpBitmap>()), timestamps_(deprecated_chunk_size), uids_(deprecated_chunk_size) {
+        : lru_(std::make_shared<TmpBitmap>()),
+          timestamps_(deprecated_size_per_chunk),
+          uids_(deprecated_size_per_chunk) {
         lru_->bitmap_ptr = std::make_shared<faiss::ConcurrentBitset>(0);
     }

View File

@@ -9,14 +9,14 @@
 // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 // or implied. See the License for the specific language governing permissions and limitations under the License
-#include "segcore/IndexingEntry.h"
+#include "segcore/FieldIndexing.h"
 #include <thread>
 #include <knowhere/index/vector_index/IndexIVF.h>
 #include <knowhere/index/vector_index/adapter/VectorAdapter.h>
 namespace milvus::segcore {
 void
-VecIndexingEntry::BuildIndexRange(int64_t ack_beg, int64_t ack_end, const VectorBase* vec_base) {
+VectorFieldIndexing::BuildIndexRange(int64_t ack_beg, int64_t ack_end, const VectorBase* vec_base) {
     assert(field_meta_.get_data_type() == DataType::VECTOR_FLOAT);
     auto dim = field_meta_.get_dim();
@@ -30,7 +30,7 @@ VecIndexingEntry::BuildIndexRange(int64_t ack_beg, int64_t ack_end, const Vector
         const auto& chunk = source->get_chunk(chunk_id);
         // build index for chunk
         auto indexing = std::make_unique<knowhere::IVF>();
-        auto dataset = knowhere::GenDataset(source->get_chunk_size(), dim, chunk.data());
+        auto dataset = knowhere::GenDataset(source->get_size_per_chunk(), dim, chunk.data());
         indexing->Train(dataset, conf);
         indexing->AddWithoutIds(dataset, conf);
         data_[chunk_id] = std::move(indexing);
@@ -38,7 +38,7 @@ VecIndexingEntry::BuildIndexRange(int64_t ack_beg, int64_t ack_end, const Vector
 }
 knowhere::Config
-VecIndexingEntry::get_build_conf() const {
+VectorFieldIndexing::get_build_conf() const {
     return knowhere::Config{{knowhere::meta::DIM, field_meta_.get_dim()},
                             {knowhere::IndexParams::nlist, 100},
                             {knowhere::IndexParams::nprobe, 4},
@@ -47,7 +47,7 @@ VecIndexingEntry::get_build_conf() const {
 }
 knowhere::Config
-VecIndexingEntry::get_search_conf(int top_K) const {
+VectorFieldIndexing::get_search_conf(int top_K) const {
     return knowhere::Config{{knowhere::meta::DIM, field_meta_.get_dim()},
                             {knowhere::meta::TOPK, top_K},
                             {knowhere::IndexParams::nlist, 100},
@@ -71,8 +71,8 @@ IndexingRecord::UpdateResourceAck(int64_t chunk_ack, const InsertRecord& record)
     lck.unlock();
     //    std::thread([this, old_ack, chunk_ack, &record] {
-    for (auto& [field_offset, entry] : entries_) {
-        auto vec_base = record.get_base_entity(field_offset);
+    for (auto& [field_offset, entry] : field_indexings_) {
+        auto vec_base = record.get_field_data_base(field_offset);
         entry->BuildIndexRange(old_ack, chunk_ack, vec_base);
     }
     finished_ack_.AddSegment(old_ack, chunk_ack);
@@ -81,7 +81,7 @@ IndexingRecord::UpdateResourceAck(int64_t chunk_ack, const InsertRecord& record)
 template <typename T>
 void
-ScalarIndexingEntry<T>::BuildIndexRange(int64_t ack_beg, int64_t ack_end, const VectorBase* vec_base) {
+ScalarFieldIndexing<T>::BuildIndexRange(int64_t ack_beg, int64_t ack_end, const VectorBase* vec_base) {
     auto source = dynamic_cast<const ConcurrentVector<T>*>(vec_base);
     Assert(source);
     auto num_chunk = source->num_chunk();
@@ -92,16 +92,16 @@ ScalarIndexingEntry<T>::BuildIndexRange(int64_t ack_beg, int64_t ack_end, const
         // build index for chunk
         // TODO
         auto indexing = std::make_unique<knowhere::scalar::StructuredIndexSort<T>>();
-        indexing->Build(vec_base->get_chunk_size(), chunk.data());
+        indexing->Build(vec_base->get_size_per_chunk(), chunk.data());
         data_[chunk_id] = std::move(indexing);
     }
 }
-std::unique_ptr<IndexingEntry>
-CreateIndex(const FieldMeta& field_meta, int64_t chunk_size) {
+std::unique_ptr<FieldIndexing>
+CreateIndex(const FieldMeta& field_meta, int64_t size_per_chunk) {
     if (field_meta.is_vector()) {
         if (field_meta.get_data_type() == DataType::VECTOR_FLOAT) {
-            return std::make_unique<VecIndexingEntry>(field_meta, chunk_size);
+            return std::make_unique<VectorFieldIndexing>(field_meta, size_per_chunk);
         } else {
             // TODO
             PanicInfo("unsupported");
@@ -109,19 +109,19 @@ CreateIndex(const FieldMeta& field_meta, int64_t chunk_size) {
     }
     switch (field_meta.get_data_type()) {
         case DataType::BOOL:
-            return std::make_unique<ScalarIndexingEntry<bool>>(field_meta, chunk_size);
+            return std::make_unique<ScalarFieldIndexing<bool>>(field_meta, size_per_chunk);
         case DataType::INT8:
-            return std::make_unique<ScalarIndexingEntry<int8_t>>(field_meta, chunk_size);
+            return std::make_unique<ScalarFieldIndexing<int8_t>>(field_meta, size_per_chunk);
         case DataType::INT16:
-            return std::make_unique<ScalarIndexingEntry<int16_t>>(field_meta, chunk_size);
+            return std::make_unique<ScalarFieldIndexing<int16_t>>(field_meta, size_per_chunk);
         case DataType::INT32:
-            return std::make_unique<ScalarIndexingEntry<int32_t>>(field_meta, chunk_size);
+            return std::make_unique<ScalarFieldIndexing<int32_t>>(field_meta, size_per_chunk);
         case DataType::INT64:
-            return std::make_unique<ScalarIndexingEntry<int64_t>>(field_meta, chunk_size);
+            return std::make_unique<ScalarFieldIndexing<int64_t>>(field_meta, size_per_chunk);
         case DataType::FLOAT:
-            return std::make_unique<ScalarIndexingEntry<float>>(field_meta, chunk_size);
+            return std::make_unique<ScalarFieldIndexing<float>>(field_meta, size_per_chunk);
         case DataType::DOUBLE:
-            return std::make_unique<ScalarIndexingEntry<double>>(field_meta, chunk_size);
+            return std::make_unique<ScalarFieldIndexing<double>>(field_meta, size_per_chunk);
         default:
             PanicInfo("unsupported");
     }

View File

@@ -24,14 +24,14 @@ namespace milvus::segcore {
 // this should be concurrent
 // All concurrent
-class IndexingEntry {
+class FieldIndexing {
  public:
-    explicit IndexingEntry(const FieldMeta& field_meta, int64_t chunk_size)
-        : field_meta_(field_meta), chunk_size_(chunk_size) {
+    explicit FieldIndexing(const FieldMeta& field_meta, int64_t size_per_chunk)
+        : field_meta_(field_meta), size_per_chunk_(size_per_chunk) {
     }
-    IndexingEntry(const IndexingEntry&) = delete;
-    IndexingEntry&
-    operator=(const IndexingEntry&) = delete;
+    FieldIndexing(const FieldIndexing&) = delete;
+    FieldIndexing&
+    operator=(const FieldIndexing&) = delete;
     // Do this in parallel
     virtual void
@@ -43,29 +43,29 @@ class IndexingEntry {
     }
     int64_t
-    get_chunk_size() const {
-        return chunk_size_;
+    get_size_per_chunk() const {
+        return size_per_chunk_;
     }
     virtual knowhere::Index*
-    get_indexing(int64_t chunk_id) const = 0;
+    get_chunk_indexing(int64_t chunk_id) const = 0;
  protected:
     // additional info
     const FieldMeta& field_meta_;
-    const int64_t chunk_size_;
+    const int64_t size_per_chunk_;
 };
 template <typename T>
-class ScalarIndexingEntry : public IndexingEntry {
+class ScalarFieldIndexing : public FieldIndexing {
  public:
-    using IndexingEntry::IndexingEntry;
+    using FieldIndexing::FieldIndexing;
     void
     BuildIndexRange(int64_t ack_beg, int64_t ack_end, const VectorBase* vec_base) override;
     // concurrent
     knowhere::scalar::StructuredIndex<T>*
-    get_indexing(int64_t chunk_id) const override {
+    get_chunk_indexing(int64_t chunk_id) const override {
         Assert(!field_meta_.is_vector());
         return data_.at(chunk_id).get();
     }
@@ -74,16 +74,16 @@ class ScalarIndexingEntry : public IndexingEntry {
     tbb::concurrent_vector<std::unique_ptr<knowhere::scalar::StructuredIndex<T>>> data_;
 };
-class VecIndexingEntry : public IndexingEntry {
+class VectorFieldIndexing : public FieldIndexing {
  public:
-    using IndexingEntry::IndexingEntry;
+    using FieldIndexing::FieldIndexing;
     void
     BuildIndexRange(int64_t ack_beg, int64_t ack_end, const VectorBase* vec_base) override;
     // concurrent
     knowhere::VecIndex*
-    get_indexing(int64_t chunk_id) const override {
+    get_chunk_indexing(int64_t chunk_id) const override {
         Assert(field_meta_.is_vector());
         return data_.at(chunk_id).get();
     }
@@ -97,12 +97,13 @@ class VecIndexingEntry : public IndexingEntry {
     tbb::concurrent_vector<std::unique_ptr<knowhere::VecIndex>> data_;
 };
-std::unique_ptr<IndexingEntry>
-CreateIndex(const FieldMeta& field_meta, int64_t chunk_size);
+std::unique_ptr<FieldIndexing>
+CreateIndex(const FieldMeta& field_meta, int64_t size_per_chunk);
 class IndexingRecord {
  public:
-    explicit IndexingRecord(const Schema& schema, int64_t chunk_size) : schema_(schema), chunk_size_(chunk_size) {
+    explicit IndexingRecord(const Schema& schema, int64_t size_per_chunk)
+        : schema_(schema), size_per_chunk_(size_per_chunk) {
         Initialize();
     }
@@ -111,7 +112,7 @@ class IndexingRecord {
         int offset = 0;
         for (auto& field : schema_) {
             if (field.get_data_type() != DataType::VECTOR_BINARY) {
-                entries_.try_emplace(FieldOffset(offset), CreateIndex(field, chunk_size_));
+                field_indexings_.try_emplace(FieldOffset(offset), CreateIndex(field, size_per_chunk_));
             }
             ++offset;
         }
@@ -128,24 +129,24 @@ class IndexingRecord {
         return finished_ack_.GetAck();
     }
-    const IndexingEntry&
-    get_entry(FieldOffset field_offset) const {
-        assert(entries_.count(field_offset));
-        return *entries_.at(field_offset);
+    const FieldIndexing&
+    get_field_indexing(FieldOffset field_offset) const {
+        assert(field_indexings_.count(field_offset));
+        return *field_indexings_.at(field_offset);
     }
-    const VecIndexingEntry&
-    get_vec_entry(FieldOffset field_offset) const {
-        auto& entry = get_entry(field_offset);
-        auto ptr = dynamic_cast<const VecIndexingEntry*>(&entry);
+    const VectorFieldIndexing&
+    get_vec_field_indexing(FieldOffset field_offset) const {
+        auto& field_indexing = get_field_indexing(field_offset);
+        auto ptr = dynamic_cast<const VectorFieldIndexing*>(&field_indexing);
         AssertInfo(ptr, "invalid indexing");
         return *ptr;
     }
     template <typename T>
     auto
-    get_scalar_entry(FieldOffset field_offset) const -> const ScalarIndexingEntry<T>& {
-        auto& entry = get_entry(field_offset);
-        auto ptr = dynamic_cast<const ScalarIndexingEntry<T>*>(&entry);
+    get_scalar_field_indexing(FieldOffset field_offset) const -> const ScalarFieldIndexing<T>& {
+        auto& entry = get_field_indexing(field_offset);
+        auto ptr = dynamic_cast<const ScalarFieldIndexing<T>*>(&entry);
         AssertInfo(ptr, "invalid indexing");
         return *ptr;
     }
@@ -159,11 +160,11 @@ class IndexingRecord {
     //    std::atomic<int64_t> finished_ack_ = 0;
     AckResponder finished_ack_;
     std::mutex mutex_;
-    int64_t chunk_size_;
+    int64_t size_per_chunk_;
  private:
     // field_offset => indexing
-    std::map<FieldOffset, std::unique_ptr<IndexingEntry>> entries_;
+    std::map<FieldOffset, std::unique_ptr<FieldIndexing>> field_indexings_;
 };
 } // namespace milvus::segcore

View File

@@ -13,14 +13,14 @@
 namespace milvus::segcore {
-InsertRecord::InsertRecord(const Schema& schema, int64_t chunk_size) : uids_(1), timestamps_(1) {
+InsertRecord::InsertRecord(const Schema& schema, int64_t size_per_chunk) : uids_(1), timestamps_(1) {
     for (auto& field : schema) {
         if (field.is_vector()) {
             if (field.get_data_type() == DataType::VECTOR_FLOAT) {
-                this->insert_entity<FloatVector>(field.get_dim(), chunk_size);
+                this->append_field_data<FloatVector>(field.get_dim(), size_per_chunk);
                 continue;
             } else if (field.get_data_type() == DataType::VECTOR_BINARY) {
-                this->insert_entity<BinaryVector>(field.get_dim(), chunk_size);
+                this->append_field_data<BinaryVector>(field.get_dim(), size_per_chunk);
                 continue;
             } else {
                 PanicInfo("unsupported");
@@ -28,34 +28,34 @@ InsertRecord::InsertRecord(const Schema& schema, int64_t chunk_size) : uids_(1),
         }
         switch (field.get_data_type()) {
             case DataType::BOOL: {
-                this->insert_entity<bool>(chunk_size);
+                this->append_field_data<bool>(size_per_chunk);
                 break;
             }
             case DataType::INT8: {
-                this->insert_entity<int8_t>(chunk_size);
+                this->append_field_data<int8_t>(size_per_chunk);
                 break;
             }
            case DataType::INT16: {
-                this->insert_entity<int16_t>(chunk_size);
+                this->append_field_data<int16_t>(size_per_chunk);
                 break;
            }
            case DataType::INT32: {
-                this->insert_entity<int32_t>(chunk_size);
+                this->append_field_data<int32_t>(size_per_chunk);
                 break;
            }
            case DataType::INT64: {
-                this->insert_entity<int64_t>(chunk_size);
+                this->append_field_data<int64_t>(size_per_chunk);
                 break;
            }
            case DataType::FLOAT: {
-                this->insert_entity<float>(chunk_size);
+                this->append_field_data<float>(size_per_chunk);
                 break;
            }
            case DataType::DOUBLE: {
-                this->insert_entity<double>(chunk_size);
+                this->append_field_data<double>(size_per_chunk);
                 break;
            }
            default: {

View File

@@ -24,47 +24,53 @@ struct InsertRecord {
     ConcurrentVector<Timestamp> timestamps_;
     ConcurrentVector<idx_t> uids_;
-    explicit InsertRecord(const Schema& schema, int64_t chunk_size);
+    explicit InsertRecord(const Schema& schema, int64_t size_per_chunk);
+    // get field data without knowing the type
+    // return VectorBase type
     auto
-    get_base_entity(FieldOffset field_offset) const {
-        auto ptr = entity_vec_[field_offset.get()].get();
+    get_field_data_base(FieldOffset field_offset) const {
+        auto ptr = field_datas_[field_offset.get()].get();
         return ptr;
     }
+    // get field data in given type, const version
     template <typename Type>
     auto
-    get_entity(FieldOffset field_offset) const {
-        auto base_ptr = get_base_entity(field_offset);
+    get_field_data(FieldOffset field_offset) const {
+        auto base_ptr = get_field_data_base(field_offset);
         auto ptr = dynamic_cast<const ConcurrentVector<Type>*>(base_ptr);
         Assert(ptr);
         return ptr;
     }
+    // get field data in given type, nonconst version
     template <typename Type>
     auto
-    get_entity(FieldOffset field_offset) {
-        auto base_ptr = get_base_entity(field_offset);
+    get_field_data(FieldOffset field_offset) {
+        auto base_ptr = get_field_data_base(field_offset);
         auto ptr = dynamic_cast<ConcurrentVector<Type>*>(base_ptr);
         Assert(ptr);
         return ptr;
     }
+    // append a column of scalar type
     template <typename Type>
     void
-    insert_entity(int64_t chunk_size) {
+    append_field_data(int64_t size_per_chunk) {
         static_assert(std::is_fundamental_v<Type>);
-        entity_vec_.emplace_back(std::make_unique<ConcurrentVector<Type>>(chunk_size));
+        field_datas_.emplace_back(std::make_unique<ConcurrentVector<Type>>(size_per_chunk));
     }
+    // append a column of vector type
     template <typename VectorType>
     void
-    insert_entity(int64_t dim, int64_t chunk_size) {
+    append_field_data(int64_t dim, int64_t size_per_chunk) {
         static_assert(std::is_base_of_v<VectorTrait, VectorType>);
-        entity_vec_.emplace_back(std::make_unique<ConcurrentVector<VectorType>>(dim, chunk_size));
+        field_datas_.emplace_back(std::make_unique<ConcurrentVector<VectorType>>(dim, size_per_chunk));
     }
  private:
-    std::vector<std::unique_ptr<VectorBase>> entity_vec_;
+    std::vector<std::unique_ptr<VectorBase>> field_datas_;
 };
 } // namespace milvus::segcore
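The renames in this header spell out what InsertRecord is: a column store with one type-erased ConcurrentVector per field, recovered through dynamic_cast. A simplified sketch of that dispatch pattern (toy types, not the segcore classes):

#include <cassert>
#include <memory>
#include <vector>

// Simplified stand-ins for VectorBase / ConcurrentVector<Type>.
struct VectorBase { virtual ~VectorBase() = default; };
template <typename Type>
struct TypedColumn : VectorBase { std::vector<Type> data; };

struct ToyInsertRecord {
    // append a column of scalar type, mirroring append_field_data<Type>
    template <typename Type>
    void append_field_data() {
        field_datas_.emplace_back(std::make_unique<TypedColumn<Type>>());
    }
    // typed access via dynamic_cast, mirroring get_field_data<Type>
    template <typename Type>
    TypedColumn<Type>* get_field_data(int field_offset) {
        auto ptr = dynamic_cast<TypedColumn<Type>*>(field_datas_[field_offset].get());
        assert(ptr);  // asking for the wrong Type would yield nullptr
        return ptr;
    }
    std::vector<std::unique_ptr<VectorBase>> field_datas_;
};

int main() {
    ToyInsertRecord record;
    record.append_field_data<int64_t>();
    record.get_field_data<int64_t>(0)->data.push_back(42);
    return 0;
}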

View File

@@ -31,30 +31,30 @@ using SealedIndexingEntryPtr = std::unique_ptr<SealedIndexingEntry>;
 struct SealedIndexingRecord {
     void
-    add_entry(FieldOffset field_offset, MetricType metric_type, knowhere::VecIndexPtr indexing) {
+    append_field_indexing(FieldOffset field_offset, MetricType metric_type, knowhere::VecIndexPtr indexing) {
         auto ptr = std::make_unique<SealedIndexingEntry>();
         ptr->indexing_ = indexing;
         ptr->metric_type_ = metric_type;
         std::unique_lock lck(mutex_);
-        entries_[field_offset] = std::move(ptr);
+        field_indexings_[field_offset] = std::move(ptr);
     }
     const SealedIndexingEntry*
-    get_entry(FieldOffset field_offset) const {
+    get_field_indexing(FieldOffset field_offset) const {
         std::shared_lock lck(mutex_);
-        AssertInfo(entries_.count(field_offset), "field_offset not found");
-        return entries_.at(field_offset).get();
+        AssertInfo(field_indexings_.count(field_offset), "field_offset not found");
+        return field_indexings_.at(field_offset).get();
     }
     bool
     is_ready(FieldOffset field_offset) const {
         std::shared_lock lck(mutex_);
-        return entries_.count(field_offset);
+        return field_indexings_.count(field_offset);
     }
  private:
     // field_offset -> SealedIndexingEntry
-    std::map<FieldOffset, SealedIndexingEntryPtr> entries_;
+    std::map<FieldOffset, SealedIndexingEntryPtr> field_indexings_;
     mutable std::shared_mutex mutex_;
 };
 } // namespace milvus::segcore

View File

@@ -20,8 +20,8 @@ TestABI() {
 }
 std::unique_ptr<SegmentGrowing>
-CreateGrowingSegment(SchemaPtr schema, int64_t chunk_size) {
-    auto segment = std::make_unique<SegmentGrowingImpl>(schema, chunk_size);
+CreateGrowingSegment(SchemaPtr schema, int64_t size_per_chunk) {
+    auto segment = std::make_unique<SegmentGrowingImpl>(schema, size_per_chunk);
     return segment;
 }

View File

@@ -80,7 +80,7 @@ class SegmentGrowing : public SegmentInternalInterface {
 using SegmentGrowingPtr = std::unique_ptr<SegmentGrowing>;
 SegmentGrowingPtr
-CreateGrowingSegment(SchemaPtr schema, int64_t chunk_size = 32 * 1024);
+CreateGrowingSegment(SchemaPtr schema, int64_t size_per_chunk = 32 * 1024);
 } // namespace segcore
 } // namespace milvus

View File

@@ -170,7 +170,7 @@ SegmentGrowingImpl::Insert(int64_t reserved_begin,
     record_.uids_.set_data(reserved_begin, uids.data(), size);
     for (int fid = 0; fid < schema_->size(); ++fid) {
         auto field_offset = FieldOffset(fid);
-        record_.get_base_entity(field_offset)->set_data_raw(reserved_begin, entities[fid].data(), size);
+        record_.get_field_data_base(field_offset)->set_data_raw(reserved_begin, entities[fid].data(), size);
     }
     for (int i = 0; i < uids.size(); ++i) {
@@ -180,7 +180,7 @@ SegmentGrowingImpl::Insert(int64_t reserved_begin,
     }
     record_.ack_responder_.AddSegment(reserved_begin, reserved_begin + size);
-    indexing_record_.UpdateResourceAck(record_.ack_responder_.GetAck() / chunk_size_, record_);
+    indexing_record_.UpdateResourceAck(record_.ack_responder_.GetAck() / size_per_chunk_, record_);
     return Status::OK();
 }
@@ -231,9 +231,9 @@ SegmentGrowingImpl::Close() {
 int64_t
 SegmentGrowingImpl::GetMemoryUsageInBytes() const {
     int64_t total_bytes = 0;
-    int64_t ins_n = upper_align(record_.reserved, chunk_size_);
+    int64_t ins_n = upper_align(record_.reserved, size_per_chunk_);
     total_bytes += ins_n * (schema_->get_total_sizeof() + 16 + 1);
-    int64_t del_n = upper_align(deleted_record_.reserved, chunk_size_);
+    int64_t del_n = upper_align(deleted_record_.reserved, size_per_chunk_);
     total_bytes += del_n * (16 * 2);
     return total_bytes;
 }
@@ -245,20 +245,20 @@ SegmentGrowingImpl::LoadIndexing(const LoadIndexInfo& info) {
     Assert(info.index_params.count("metric_type"));
     auto metric_type_str = info.index_params.at("metric_type");
-    sealed_indexing_record_.add_entry(field_offset, GetMetricType(metric_type_str), info.index);
+    sealed_indexing_record_.append_field_indexing(field_offset, GetMetricType(metric_type_str), info.index);
     return Status::OK();
 }
 SpanBase
 SegmentGrowingImpl::chunk_data_impl(FieldOffset field_offset, int64_t chunk_id) const {
-    auto vec = get_insert_record().get_base_entity(field_offset);
+    auto vec = get_insert_record().get_field_data_base(field_offset);
     return vec->get_span_base(chunk_id);
 }
 int64_t
-SegmentGrowingImpl::num_chunk_data() const {
+SegmentGrowingImpl::num_chunk() const {
     auto size = get_insert_record().ack_responder_.GetAck();
-    return upper_div(size, chunk_size_);
+    return upper_div(size, size_per_chunk_);
 }
 void
 SegmentGrowingImpl::vector_search(int64_t vec_count,
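num_chunk() and GetMemoryUsageInBytes() both lean on ceiling-style rounding helpers. A small sketch of the semantics assumed here (upper_div/upper_align re-implemented for illustration; the real ones live in milvus utils):

#include <cassert>
#include <cstdint>

// Assumed semantics: upper_div is ceiling division, upper_align rounds
// up to the nearest multiple.
inline int64_t upper_div(int64_t n, int64_t k) { return (n + k - 1) / k; }
inline int64_t upper_align(int64_t n, int64_t k) { return upper_div(n, k) * k; }

int main() {
    int64_t size_per_chunk = 32 * 1024;
    // num_chunk(): acked rows rounded up to whole chunks
    assert(upper_div(1, size_per_chunk) == 1);  // a partial chunk counts
    assert(upper_div(32 * 1024, size_per_chunk) == 1);
    assert(upper_div(32 * 1024 + 1, size_per_chunk) == 2);
    // GetMemoryUsageInBytes(): reserved rows rounded up to chunk multiples
    assert(upper_align(100, size_per_chunk) == 32 * 1024);
    return 0;
}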

View File

@@ -27,7 +27,7 @@
 #include "utils/Status.h"
 #include "segcore/DeletedRecord.h"
 #include "utils/EasyAssert.h"
-#include "IndexingEntry.h"
+#include "FieldIndexing.h"
 #include "InsertRecord.h"
 #include <utility>
 #include <memory>
@@ -89,18 +89,18 @@ class SegmentGrowingImpl : public SegmentGrowing {
     // return count of index that has index, i.e., [0, num_chunk_index) have built index
     int64_t
-    num_chunk_index_safe(FieldOffset field_offset) const final {
+    num_chunk_index(FieldOffset field_offset) const final {
         return indexing_record_.get_finished_ack();
     }
     const knowhere::Index*
     chunk_index_impl(FieldOffset field_offset, int64_t chunk_id) const final {
-        return indexing_record_.get_entry(field_offset).get_indexing(chunk_id);
+        return indexing_record_.get_field_indexing(field_offset).get_chunk_indexing(chunk_id);
     }
     int64_t
     size_per_chunk() const final {
-        return chunk_size_;
+        return size_per_chunk_;
     }
  public:
@@ -152,27 +152,27 @@ class SegmentGrowingImpl : public SegmentGrowing {
     void
     bulk_subscript(FieldOffset field_offset, const int64_t* seg_offsets, int64_t count, void* output) const override {
         // TODO: support more types
-        auto vec_ptr = record_.get_base_entity(field_offset);
+        auto vec_ptr = record_.get_field_data_base(field_offset);
         auto data_type = schema_->operator[](field_offset).get_data_type();
         Assert(data_type == DataType::INT64);
         bulk_subscript_impl<int64_t>(*vec_ptr, seg_offsets, count, output);
     }
     int64_t
-    num_chunk_data() const override;
+    num_chunk() const override;
     Status
     LoadIndexing(const LoadIndexInfo& info) override;
  public:
     friend std::unique_ptr<SegmentGrowing>
-    CreateGrowingSegment(SchemaPtr schema, int64_t chunk_size);
-    explicit SegmentGrowingImpl(SchemaPtr schema, int64_t chunk_size)
-        : chunk_size_(chunk_size),
+    CreateGrowingSegment(SchemaPtr schema, int64_t size_per_chunk);
+    explicit SegmentGrowingImpl(SchemaPtr schema, int64_t size_per_chunk)
+        : size_per_chunk_(size_per_chunk),
           schema_(std::move(schema)),
-          record_(*schema_, chunk_size),
-          indexing_record_(*schema_, chunk_size) {
+          record_(*schema_, size_per_chunk),
+          indexing_record_(*schema_, size_per_chunk) {
     }
     void
@@ -192,7 +192,7 @@ class SegmentGrowingImpl : public SegmentGrowing {
     chunk_data_impl(FieldOffset field_offset, int64_t chunk_id) const override;
  private:
-    int64_t chunk_size_;
+    int64_t size_per_chunk_;
     SchemaPtr schema_;
     std::atomic<SegmentState> state_ = SegmentState::Open;

View File

@@ -14,15 +14,18 @@
 #include "common/Schema.h"
 #include "query/Plan.h"
 #include "common/Span.h"
-#include "IndexingEntry.h"
+#include "FieldIndexing.h"
 #include <knowhere/index/vector_index/VecIndex.h>
 #include "common/SystemProperty.h"
 #include "query/PlanNode.h"
 namespace milvus::segcore {
+// common interface of SegmentSealed and SegmentGrowing
+// used by C API
 class SegmentInterface {
  public:
+    // fill results according to target_entries in plan
     void
     FillTargetEntry(const query::Plan* plan, QueryResult& results) const;
@@ -44,14 +47,17 @@ class SegmentInterface {
     virtual ~SegmentInterface() = default;
  protected:
+    // calculate output[i] = Vec[seg_offsets[i]}, where Vec binds to system_type
     virtual void
     bulk_subscript(SystemFieldType system_type, const int64_t* seg_offsets, int64_t count, void* output) const = 0;
+    // calculate output[i] = Vec[seg_offsets[i]}, where Vec binds to field_offset
     virtual void
     bulk_subscript(FieldOffset field_offset, const int64_t* seg_offsets, int64_t count, void* output) const = 0;
 };
 // internal API for DSL calculation
+// only for implementation
 class SegmentInternalInterface : public SegmentInterface {
  public:
     template <typename T>
@@ -80,21 +86,24 @@ class SegmentInternalInterface : public SegmentInterface {
                   const BitsetView& bitset,
                   QueryResult& output) const = 0;
+    // count of chunk that has index available
     virtual int64_t
-    num_chunk_index_safe(FieldOffset field_offset) const = 0;
+    num_chunk_index(FieldOffset field_offset) const = 0;
+    // count of chunks
     virtual int64_t
-    num_chunk_data() const = 0;
-    // return chunk_size for each chunk, renaming against confusion
+    num_chunk() const = 0;
+    // element size in each chunk
     virtual int64_t
     size_per_chunk() const = 0;
  protected:
-    // blob and row_count
+    // internal API: return chunk_data in span
     virtual SpanBase
     chunk_data_impl(FieldOffset field_offset, int64_t chunk_id) const = 0;
+    // internal API: return chunk_index in span, support scalar index only
     virtual const knowhere::Index*
     chunk_index_impl(FieldOffset field_offset, int64_t chunk_id) const = 0;
 };
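With num_chunk(), size_per_chunk() and get_row_count() on the shared interface, callers can iterate chunk geometry identically over growing and sealed segments. A hypothetical caller-side sketch (toy segment type, not the milvus interface):

#include <algorithm>
#include <cassert>
#include <cstdint>

// Toy segment exposing the same chunk geometry as the interface above.
struct ToySegment {
    int64_t rows, chunk;
    int64_t num_chunk() const { return (rows + chunk - 1) / chunk; }
    int64_t size_per_chunk() const { return chunk; }
    int64_t get_row_count() const { return rows; }
};

// Visit every chunk's row range; works for any segment-like type.
template <typename Segment, typename Fn>
void ForEachChunk(const Segment& segment, Fn fn) {
    auto size_per_chunk = segment.size_per_chunk();
    for (int64_t chunk_id = 0; chunk_id < segment.num_chunk(); ++chunk_id) {
        auto begin = chunk_id * size_per_chunk;
        auto end = std::min(begin + size_per_chunk, segment.get_row_count());
        fn(chunk_id, begin, end);  // rows [begin, end) live in this chunk
    }
}

int main() {
    int64_t total = 0;
    ForEachChunk(ToySegment{100, 32},
                 [&](int64_t, int64_t b, int64_t e) { total += e - b; });
    assert(total == 100);  // 32 + 32 + 32 + 4
    return 0;
}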

View File

@@ -27,6 +27,6 @@ class SegmentSealed : public SegmentInternalInterface {
 using SegmentSealedPtr = std::unique_ptr<SegmentSealed>;
 SegmentSealedPtr
-CreateSealedSegment(SchemaPtr schema, int64_t chunk_size = 32 * 1024);
+CreateSealedSegment(SchemaPtr schema, int64_t size_per_chunk = 32 * 1024);
 } // namespace milvus::segcore

View File

@@ -29,7 +29,7 @@ SegmentSealedImpl::LoadIndex(const LoadIndexInfo& info) {
         row_count_opt_ = row_count;
     }
     Assert(!vec_indexings_.is_ready(field_offset));
-    vec_indexings_.add_entry(field_offset, GetMetricType(metric_type_str), info.index);
+    vec_indexings_.append_field_indexing(field_offset, GetMetricType(metric_type_str), info.index);
     ++ready_count_;
 }
@@ -77,13 +77,13 @@ SegmentSealedImpl::LoadFieldData(const LoadFieldDataInfo& info) {
 }
 int64_t
-SegmentSealedImpl::num_chunk_index_safe(FieldOffset field_offset) const {
+SegmentSealedImpl::num_chunk_index(FieldOffset field_offset) const {
     // TODO: support scalar index
     return 0;
 }
 int64_t
-SegmentSealedImpl::num_chunk_data() const {
+SegmentSealedImpl::num_chunk() const {
     return 1;
 }
@@ -142,7 +142,7 @@ SegmentSealedImpl::vector_search(int64_t vec_count,
 }
 SegmentSealedPtr
-CreateSealedSegment(SchemaPtr schema, int64_t chunk_size) {
+CreateSealedSegment(SchemaPtr schema, int64_t size_per_chunk) {
     return std::make_unique<SegmentSealedImpl>(schema);
 }

View File

@@ -37,12 +37,12 @@ class SegmentSealedImpl : public SegmentSealed {
  public:
     int64_t
-    num_chunk_index_safe(FieldOffset field_offset) const override;
+    num_chunk_index(FieldOffset field_offset) const override;
     int64_t
-    num_chunk_data() const override;
-    // return chunk_size for each chunk, renaming against confusion
+    num_chunk() const override;
+    // return size_per_chunk for each chunk, renaming against confusion
     int64_t
     size_per_chunk() const override;

View File

@@ -277,7 +277,7 @@ TEST(Sealed, LoadFieldData) {
         vec_info.index_params["metric_type"] = milvus::knowhere::Metric::L2;
         segment->LoadIndex(vec_info);
     }
-    ASSERT_EQ(segment->num_chunk_data(), 1);
+    ASSERT_EQ(segment->num_chunk(), 1);
     auto chunk_span1 = segment->chunk_data<int64_t>(FieldOffset(1), 0);
     auto chunk_span2 = segment->chunk_data<double>(FieldOffset(2), 0);
     auto ref1 = dataset.get_col<int64_t>(1);

View File

@@ -19,38 +19,38 @@ TEST(Span, Naive) {
     using namespace milvus::query;
     using namespace milvus::segcore;
     int64_t N = 1000 * 1000;
-    constexpr int64_t chunk_size = 32 * 1024;
+    constexpr int64_t size_per_chunk = 32 * 1024;
     auto schema = std::make_shared<Schema>();
     schema->AddDebugField("binaryvec", DataType::VECTOR_BINARY, 512, MetricType::METRIC_Jaccard);
     schema->AddDebugField("age", DataType::FLOAT);
     schema->AddDebugField("floatvec", DataType::VECTOR_FLOAT, 32, MetricType::METRIC_L2);
     auto dataset = DataGen(schema, N);
-    auto segment = CreateGrowingSegment(schema, chunk_size);
+    auto segment = CreateGrowingSegment(schema, size_per_chunk);
     segment->PreInsert(N);
     segment->Insert(0, N, dataset.row_ids_.data(), dataset.timestamps_.data(), dataset.raw_);
     auto vec_ptr = dataset.get_col<uint8_t>(0);
     auto age_ptr = dataset.get_col<float>(1);
     auto float_ptr = dataset.get_col<float>(2);
     SegmentInternalInterface& interface = *segment;
-    auto num_chunk = interface.num_chunk_data();
-    ASSERT_EQ(num_chunk, upper_div(N, chunk_size));
+    auto num_chunk = interface.num_chunk();
+    ASSERT_EQ(num_chunk, upper_div(N, size_per_chunk));
     auto row_count = interface.get_row_count();
     ASSERT_EQ(N, row_count);
     for (auto chunk_id = 0; chunk_id < num_chunk; ++chunk_id) {
         auto vec_span = interface.chunk_data<BinaryVector>(FieldOffset(0), chunk_id);
         auto age_span = interface.chunk_data<float>(FieldOffset(1), chunk_id);
         auto float_span = interface.chunk_data<FloatVector>(FieldOffset(2), chunk_id);
-        auto begin = chunk_id * chunk_size;
-        auto end = std::min((chunk_id + 1) * chunk_size, N);
-        auto chunk_size = end - begin;
-        for (int i = 0; i < chunk_size * 512 / 8; ++i) {
+        auto begin = chunk_id * size_per_chunk;
+        auto end = std::min((chunk_id + 1) * size_per_chunk, N);
+        auto size_per_chunk = end - begin;
+        for (int i = 0; i < size_per_chunk * 512 / 8; ++i) {
             ASSERT_EQ(vec_span.data()[i], vec_ptr[i + begin * 512 / 8]);
         }
-        for (int i = 0; i < chunk_size; ++i) {
+        for (int i = 0; i < size_per_chunk; ++i) {
             ASSERT_EQ(age_span.data()[i], age_ptr[i + begin]);
         }
-        for (int i = 0; i < chunk_size; ++i) {
+        for (int i = 0; i < size_per_chunk; ++i) {
             ASSERT_EQ(float_span.data()[i], float_ptr[i + begin * 32]);
         }
     }