enhance: redefine variable column block size (#35040)

#35013

Signed-off-by: luzhang <luzhang@zilliz.com>
Co-authored-by: luzhang <luzhang@zilliz.com>
pull/34672/head
zhagnlu 2024-07-30 19:23:50 +08:00 committed by GitHub
parent 273e9203f4
commit dd0c26cf58
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 38 additions and 19 deletions

View File

@ -53,7 +53,10 @@ namespace milvus {
*/
constexpr size_t STRING_PADDING = 1;
constexpr size_t ARRAY_PADDING = 1;
constexpr size_t BLOCK_SIZE = 8192;
// Default block sizes for VariableColumn's sparse offset index. One offset is
// kept per block of rows and a lookup walks forward from the block start, so a
// smaller block means fewer entries scanned per access but more offsets kept.
// Used for the primary-key field when loaded in memory mode (one offset per row).
constexpr size_t DEFAULT_PK_VRCOL_BLOCK_SIZE = 1;
// Used for non-primary-key variable-length fields loaded in memory mode.
constexpr size_t DEFAULT_MEM_VRCOL_BLOCK_SIZE = 32;
// Used for STRING/VARCHAR and JSON columns constructed over a mapped file.
constexpr size_t DEFAULT_MMAP_VRCOL_BLOCK_SIZE = 256;
class ColumnBase {
public:
@ -643,13 +646,16 @@ class VariableColumn : public ColumnBase {
std::conditional_t<std::is_same_v<T, std::string>, std::string_view, T>;
// memory mode ctor
VariableColumn(size_t cap, const FieldMeta& field_meta)
: ColumnBase(cap, field_meta) {
VariableColumn(size_t cap, const FieldMeta& field_meta, size_t block_size)
: ColumnBase(cap, field_meta), block_size_(block_size) {
}
// mmap mode ctor
VariableColumn(const File& file, size_t size, const FieldMeta& field_meta)
: ColumnBase(file, size, field_meta) {
VariableColumn(const File& file,
size_t size,
const FieldMeta& field_meta,
size_t block_size)
: ColumnBase(file, size, field_meta), block_size_(block_size) {
}
// mmap with mmap manager
VariableColumn(size_t reserve,
@ -657,8 +663,10 @@ class VariableColumn : public ColumnBase {
const DataType& data_type,
storage::MmapChunkManagerPtr mcm,
storage::MmapChunkDescriptorPtr descriptor,
bool nullable)
: ColumnBase(reserve, dim, data_type, mcm, descriptor, nullable) {
bool nullable,
size_t block_size)
: ColumnBase(reserve, dim, data_type, mcm, descriptor, nullable),
block_size_(block_size) {
}
VariableColumn(VariableColumn&& column) noexcept
@ -708,8 +716,8 @@ class VariableColumn : public ColumnBase {
PanicInfo(ErrorCode::OutOfRange, "index out of range");
}
char* pos = data_ + indices_[start_offset / BLOCK_SIZE];
for (size_t j = 0; j < start_offset % BLOCK_SIZE; j++) {
char* pos = data_ + indices_[start_offset / block_size_];
for (size_t j = 0; j < start_offset % block_size_; j++) {
uint32_t size;
size = *reinterpret_cast<uint32_t*>(pos);
pos += sizeof(uint32_t) + size;
@ -723,8 +731,8 @@ class VariableColumn : public ColumnBase {
if (i < 0 || i > num_rows_) {
PanicInfo(ErrorCode::OutOfRange, "index out of range");
}
size_t batch_id = i / BLOCK_SIZE;
size_t offset = i % BLOCK_SIZE;
size_t batch_id = i / block_size_;
size_t offset = i % block_size_;
// located in batch start location
char* pos = data_ + indices_[batch_id];
@ -801,11 +809,11 @@ class VariableColumn : public ColumnBase {
void
shrink_indice() {
std::vector<uint64_t> tmp_indices;
tmp_indices.reserve((indices_.size() + BLOCK_SIZE - 1) / BLOCK_SIZE);
tmp_indices.reserve((indices_.size() + block_size_ - 1) / block_size_);
for (size_t i = 0; i < indices_.size();) {
tmp_indices.push_back(indices_[i]);
i += BLOCK_SIZE;
i += block_size_;
}
indices_.swap(tmp_indices);
@ -814,8 +822,8 @@ class VariableColumn : public ColumnBase {
private:
// loading states
std::queue<FieldDataPtr> load_buf_{};
// raw data index, record indices located 0, interval, 2 * interval, 3 * interval
// ... just like page index, interval set to 8192 that matches search engine's batch size
// raw data index: indices_ holds the offset into data_ of rows 0, block_size_, 2 * block_size_, 3 * block_size_, ...
size_t block_size_;
std::vector<uint64_t> indices_{};
};

View File

@ -399,6 +399,11 @@ SegmentSealedImpl::LoadFieldData(FieldId field_id, FieldDataInfo& data) {
// Don't allow raw data and index exist at the same time
// AssertInfo(!get_bit(index_ready_bitset_, field_id),
// "field data can't be loaded when indexing exists");
// Chooses the VariableColumn block size for the field being loaded:
// the primary-key default when field_id is the schema's primary key,
// otherwise the in-memory default.
auto get_block_size = [&]() -> size_t {
    const bool is_primary_key =
        (schema_->get_primary_field_id() == field_id);
    if (is_primary_key) {
        return DEFAULT_PK_VRCOL_BLOCK_SIZE;
    }
    return DEFAULT_MEM_VRCOL_BLOCK_SIZE;
};
std::shared_ptr<ColumnBase> column{};
if (IsVariableDataType(data_type)) {
@ -408,7 +413,7 @@ SegmentSealedImpl::LoadFieldData(FieldId field_id, FieldDataInfo& data) {
case milvus::DataType::VARCHAR: {
auto var_column =
std::make_shared<VariableColumn<std::string>>(
num_rows, field_meta);
num_rows, field_meta, get_block_size());
FieldDataPtr field_data;
while (data.channel->pop(field_data)) {
var_column->Append(std::move(field_data));
@ -423,7 +428,7 @@ SegmentSealedImpl::LoadFieldData(FieldId field_id, FieldDataInfo& data) {
case milvus::DataType::JSON: {
auto var_column =
std::make_shared<VariableColumn<milvus::Json>>(
num_rows, field_meta);
num_rows, field_meta, get_block_size());
FieldDataPtr field_data;
while (data.channel->pop(field_data)) {
var_column->Append(std::move(field_data));
@ -572,7 +577,10 @@ SegmentSealedImpl::MapFieldData(const FieldId field_id, FieldDataInfo& data) {
case milvus::DataType::STRING:
case milvus::DataType::VARCHAR: {
auto var_column = std::make_shared<VariableColumn<std::string>>(
file, total_written, field_meta);
file,
total_written,
field_meta,
DEFAULT_MMAP_VRCOL_BLOCK_SIZE);
var_column->Seal(std::move(indices));
column = std::move(var_column);
break;
@ -580,7 +588,10 @@ SegmentSealedImpl::MapFieldData(const FieldId field_id, FieldDataInfo& data) {
case milvus::DataType::JSON: {
auto var_column =
std::make_shared<VariableColumn<milvus::Json>>(
file, total_written, field_meta);
file,
total_written,
field_meta,
DEFAULT_MMAP_VRCOL_BLOCK_SIZE);
var_column->Seal(std::move(indices));
column = std::move(var_column);
break;