milvus/internal/datanode/writebuffer/insert_buffer.go

172 lines
4.1 KiB
Go

package writebuffer
import (
"math"
"github.com/cockroachdb/errors"
"go.uber.org/zap"
"github.com/milvus-io/milvus-proto/go-api/v2/msgpb"
"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
"github.com/milvus-io/milvus/internal/storage"
"github.com/milvus-io/milvus/pkg/log"
"github.com/milvus-io/milvus/pkg/mq/msgstream"
"github.com/milvus-io/milvus/pkg/util/merr"
"github.com/milvus-io/milvus/pkg/util/paramtable"
"github.com/milvus-io/milvus/pkg/util/typeutil"
)
// noLimit is the sentinel stored in a limit field to disable that limit;
// IsFull skips any check whose limit equals it.
const noLimit int64 = -1
// BufferBase holds the bookkeeping shared by buffers in this package: the
// accumulated row count and size, the optional limits on both, and the
// timestamp range and message positions covered by the buffered data.
type BufferBase struct {
// rows is the running total of entryNum values passed to UpdateStatistics.
rows int64
// rowLimit is the row count at which IsFull reports true; noLimit disables the check.
rowLimit int64
// size is the running total of size values passed to UpdateStatistics.
size int64
// sizeLimit is the size at which IsFull reports true; noLimit disables the check.
sizeLimit int64
// TimestampFrom is lowered by UpdateStatistics whenever a smaller timestampMin arrives.
TimestampFrom typeutil.Timestamp
// TimestampTo is raised by UpdateStatistics whenever a larger timestampMax arrives.
TimestampTo typeutil.Timestamp
// startPos is the position with the smallest Timestamp handed to UpdateStatistics.
startPos *msgpb.MsgPosition
// endPos is the position with the largest Timestamp handed to UpdateStatistics.
endPos *msgpb.MsgPosition
}
// UpdateStatistics folds one batch of newly buffered data into the counters.
//
// entryNum and size are added to the row and size totals. tr widens the
// [TimestampFrom, TimestampTo] range when it extends past the current bounds.
// startPos replaces the stored start position when none is stored yet or when
// its timestamp is smaller; endPos replaces the stored end position when none
// is stored yet or when its timestamp is larger.
//
// A nil startPos or endPos is ignored, so a missing position never discards a
// previously recorded one and never causes a nil dereference.
func (b *BufferBase) UpdateStatistics(entryNum, size int64, tr TimeRange, startPos, endPos *msgpb.MsgPosition) {
	b.rows += entryNum
	b.size += size

	if tr.timestampMin < b.TimestampFrom {
		b.TimestampFrom = tr.timestampMin
	}
	if tr.timestampMax > b.TimestampTo {
		b.TimestampTo = tr.timestampMax
	}

	// Guard the incoming pointer first: comparing timestamps is only
	// meaningful when the caller actually supplied a position.
	if startPos != nil && (b.startPos == nil || startPos.GetTimestamp() < b.startPos.GetTimestamp()) {
		b.startPos = startPos
	}
	if endPos != nil && (b.endPos == nil || endPos.GetTimestamp() > b.endPos.GetTimestamp()) {
		b.endPos = endPos
	}
}
// IsFull reports whether the buffer has reached either of its limits.
// A limit set to noLimit is never considered reached.
func (b *BufferBase) IsFull() bool {
	// Row limit first; the size limit is only consulted when this one is
	// disabled or not yet reached.
	if b.rowLimit != noLimit && b.rows >= b.rowLimit {
		return true
	}
	return b.sizeLimit != noLimit && b.size >= b.sizeLimit
}
// IsEmpty reports whether the buffer currently accounts for zero rows.
func (b *BufferBase) IsEmpty() bool {
return b.rows == 0
}
// MinTimestamp returns the timestamp of the recorded start position.
// When no start position has been recorded it returns math.MaxUint64.
func (b *BufferBase) MinTimestamp() typeutil.Timestamp {
	if pos := b.startPos; pos != nil {
		return pos.GetTimestamp()
	}
	return math.MaxUint64
}
// InsertBuffer accumulates insert data for a collection schema and tracks
// its statistics through the embedded BufferBase.
type InsertBuffer struct {
BufferBase
// collSchema is the schema used to create the buffer and to convert
// incoming insert messages.
collSchema *schemapb.CollectionSchema
// buffer is the insert data that incoming messages are merged into.
buffer *storage.InsertData
}
// NewInsertBuffer creates an empty InsertBuffer for the given schema.
//
// The row limit is the configured DataNodeCfg.FlushInsertBufferSize divided
// by the estimated size of one record, rounded up. The size limit is
// disabled (noLimit).
//
// It returns an error when the per-record size cannot be estimated, when the
// estimate is zero, or when the underlying insert data cannot be created.
func NewInsertBuffer(sch *schemapb.CollectionSchema) (*InsertBuffer, error) {
	size, err := typeutil.EstimateSizePerRecord(sch)
	if err != nil {
		log.Warn("failed to estimate size per record", zap.Error(err))
		return nil, err
	}
	// A zero estimate would make the division below panic.
	if size == 0 {
		return nil, errors.New("Invalid schema")
	}

	buffer, err := storage.NewInsertData(sch)
	if err != nil {
		return nil, err
	}

	// Read the parameter once so quotient and remainder are derived from the
	// same value (NOTE(review): assumes the parameter may change between two
	// reads — confirm against paramtable).
	flushSize := paramtable.Get().DataNodeCfg.FlushInsertBufferSize.GetAsInt64()
	recordSize := int64(size)
	limit := flushSize / recordSize
	if flushSize%recordSize != 0 {
		limit++ // round up
	}

	return &InsertBuffer{
		BufferBase: BufferBase{
			rowLimit:      limit,
			sizeLimit:     noLimit,
			TimestampFrom: math.MaxUint64,
			TimestampTo:   0,
		},
		collSchema: sch,
		buffer:     buffer,
	}, nil
}
// Renew hands the accumulated insert data to the caller and installs a fresh,
// empty buffer in its place. It returns nil when the buffer is empty.
//
// Only the row count and the timestamp range are reset; size, startPos and
// endPos keep their current values.
func (ib *InsertBuffer) Renew() *storage.InsertData {
	if ib.IsEmpty() {
		return nil
	}

	// no error since validated in constructor
	fresh, _ := storage.NewInsertData(ib.collSchema)

	drained := ib.buffer
	ib.buffer = fresh

	ib.rows = 0
	ib.TimestampFrom = math.MaxUint64
	ib.TimestampTo = 0

	return drained
}
// Buffer converts the insert messages to insert data, merges them into the
// buffer and updates the statistics. startPos and endPos are forwarded to
// UpdateStatistics for every message.
//
// It returns the primary-key column of each message, in message order.
//
// The whole batch is validated before anything is merged. If any message
// fails conversion, lacks a primary-key or timestamp column, or has a
// primary-key row count that differs from its row count, an error is returned
// and neither the buffer nor the statistics are modified.
func (ib *InsertBuffer) Buffer(msgs []*msgstream.InsertMsg, startPos, endPos *msgpb.MsgPosition) ([]storage.FieldData, error) {
	// staged is one fully validated message waiting to be merged.
	type staged struct {
		data *storage.InsertData
		tr   TimeRange
	}

	pkData := make([]storage.FieldData, 0, len(msgs))
	pending := make([]staged, 0, len(msgs))

	// Phase 1: run every fallible step without mutating the buffer.
	for _, msg := range msgs {
		tmpBuffer, err := storage.InsertMsgToInsertData(msg, ib.collSchema)
		if err != nil {
			log.Warn("failed to transfer insert msg to insert data", zap.Error(err))
			return nil, err
		}

		pkFieldData, err := storage.GetPkFromInsertData(ib.collSchema, tmpBuffer)
		if err != nil {
			return nil, err
		}
		if pkFieldData.RowNum() != tmpBuffer.GetRowNum() {
			return nil, merr.WrapErrServiceInternal("pk column row num not match")
		}

		tsData, err := storage.GetTimestampFromInsertData(tmpBuffer)
		if err != nil {
			log.Warn("no timestamp field found in insert msg", zap.Error(err))
			return nil, err
		}

		pkData = append(pkData, pkFieldData)
		pending = append(pending, staged{data: tmpBuffer, tr: ib.getTimestampRange(tsData)})
	}

	// Phase 2: nothing below can fail, so the buffer and its statistics
	// always change together.
	for _, s := range pending {
		storage.MergeInsertData(ib.buffer, s.data)
		// The size statistic is not tracked for insert buffers; 0 is passed.
		ib.UpdateStatistics(int64(s.data.GetRowNum()), 0, s.tr, startPos, endPos)
	}
	return pkData, nil
}
// getTimestampRange scans the timestamp column and returns the smallest and
// largest values found. For an empty column the result is the inverted range
// {math.MaxUint64, 0}.
func (ib *InsertBuffer) getTimestampRange(tsData *storage.Int64FieldData) TimeRange {
	var (
		lowest  typeutil.Timestamp = math.MaxUint64
		highest typeutil.Timestamp
	)
	for _, raw := range tsData.Data {
		// Values are stored as int64 and compared as unsigned timestamps.
		ts := typeutil.Timestamp(raw)
		if ts < lowest {
			lowest = ts
		}
		if ts > highest {
			highest = ts
		}
	}
	return TimeRange{
		timestampMin: lowest,
		timestampMax: highest,
	}
}