Flush statistics for all int64 fields (#5318)

Resolves: #5262

Signed-off-by: Xiangyu Wang <xiangyu.wang@zilliz.com>
Xiangyu Wang 2021-05-20 18:38:45 +08:00 committed by GitHub
parent 913712fbea
commit 23c4de0eb8
10 changed files with 288 additions and 19 deletions
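
In short: on segment flush, every int64 field now gets a small statistics blob written alongside its insert binlog, under the new StatsBinlogRootPath. Per the tests below, the payload is a JSON min/max summary; a field holding the values 0 through 9 serializes to:

{"max":9,"min":0}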


@ -39,6 +39,10 @@ func NewBinlogMeta(kv kv.TxnKV, idAllocator allocatorInterface) (*binlogMeta, er
return mt, nil
}
func (bm *binlogMeta) allocID() (key UniqueID, err error) {
return bm.idAllocator.allocID()
}
// genKey gives a valid key string for a list of UniqueIDs:
// if alloc is true, the returned key ends with a freshly allocated unique ID;
// if alloc is false, the returned key consists only of the provided IDs.
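
For orientation, a minimal standalone sketch of the key shape genKey is assumed to produce with alloc=false: each ID rendered in decimal and joined with "/". The separator and formatting are assumptions inferred from how keys are composed in this diff, and joinIDKey is a hypothetical stand-in, not this PR's code. With alloc=true the same key simply gains one extra, freshly allocated ID at the end.

package main

import (
	"fmt"
	"strconv"
	"strings"
)

// joinIDKey mimics the assumed alloc=false behavior of genKey: format each
// UniqueID in decimal and join with "/".
func joinIDKey(ids ...int64) string {
	parts := make([]string, len(ids))
	for i, id := range ids {
		parts[i] = strconv.FormatInt(id, 10)
	}
	return strings.Join(parts, "/")
}

func main() {
	// collID=1, partitionID=2, segID=3, fieldID=100, logidx=42
	fmt.Println(joinIDKey(1, 2, 3, 100, 42)) // -> 1/2/3/100/42
}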


@ -596,7 +596,7 @@ func flushSegment(collMeta *etcdpb.CollectionMeta, segID, partitionID, collID Un
return
}
binLogs, err := inCodec.Serialize(partitionID, segID, data.(*InsertData))
binLogs, statsBinlogs, err := inCodec.Serialize(partitionID, segID, data.(*InsertData))
if err != nil {
log.Error("Flush failed ... cannot generate binlog ..", zap.Error(err))
clearFn(false)
@ -607,6 +607,9 @@ func flushSegment(collMeta *etcdpb.CollectionMeta, segID, partitionID, collID Un
field2Path := make(map[UniqueID]string, len(binLogs))
kvs := make(map[string]string, len(binLogs))
paths := make([]string, 0, len(binLogs))
field2Logidx := make(map[UniqueID]UniqueID, len(binLogs))
// write insert binlog
for _, blob := range binLogs {
fieldID, err := strconv.ParseInt(blob.GetKey(), 10, 64)
if err != nil {
@ -615,17 +618,39 @@ func flushSegment(collMeta *etcdpb.CollectionMeta, segID, partitionID, collID Un
return
}
k, err := idAllocator.genKey(true, collID, partitionID, segID, fieldID)
logidx, err := idAllocator.allocID()
if err != nil {
log.Error("Flush failed ... cannot alloc ID ..", zap.Error(err))
clearFn(false)
return
}
// no error is raised when alloc=false
k, _ := idAllocator.genKey(false, collID, partitionID, segID, fieldID, logidx)
key := path.Join(Params.InsertBinlogRootPath, k)
paths = append(paths, key)
kvs[key] = string(blob.Value[:])
field2Path[fieldID] = key
field2Logidx[fieldID] = logidx
}
// write stats binlog
for _, blob := range statsBinlogs {
fieldID, err := strconv.ParseInt(blob.GetKey(), 10, 64)
if err != nil {
log.Error("Flush failed ... cannot parse string to fieldID ..", zap.Error(err))
clearFn(false)
return
}
logidx := field2Logidx[fieldID]
// no error is raised when alloc=false
k, _ := idAllocator.genKey(false, collID, partitionID, segID, fieldID, logidx)
key := path.Join(Params.StatsBinlogRootPath, k)
kvs[key] = string(blob.Value[:])
}
err = kv.MultiSave(kvs)
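
Note the design choice above: the stats binlog for a field reuses the logidx already allocated for its insert binlog, so the two keys differ only in their root prefix and stay correlated. A sketch of that correspondence follows; statsPathFor is a hypothetical helper (not part of this PR), and the "by-dev" roots are example values.

package main

import (
	"fmt"
	"strings"
)

// statsPathFor derives the stats binlog key from an insert binlog key by
// swapping root prefixes; this only works because flushSegment reuses the
// same logidx for both writes.
func statsPathFor(insertPath, insertRoot, statsRoot string) string {
	return statsRoot + strings.TrimPrefix(insertPath, insertRoot)
}

func main() {
	insertKey := "by-dev/insert_log/1/2/3/100/42" // example values throughout
	fmt.Println(statsPathFor(insertKey, "by-dev/insert_log", "by-dev/stats_log"))
	// -> by-dev/stats_log/1/2/3/100/42
}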


@ -13,16 +13,24 @@ package datanode
import (
"context"
"fmt"
"math"
"path"
"sync"
"testing"
"time"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
memkv "github.com/milvus-io/milvus/internal/kv/mem"
"github.com/milvus-io/milvus/internal/msgstream"
"github.com/milvus-io/milvus/internal/proto/commonpb"
"github.com/milvus-io/milvus/internal/proto/etcdpb"
"github.com/milvus-io/milvus/internal/proto/internalpb"
"github.com/milvus-io/milvus/internal/proto/schemapb"
"github.com/milvus-io/milvus/internal/storage"
"github.com/milvus-io/milvus/internal/util/flowgraph"
"github.com/stretchr/testify/assert"
)
func TestFlowGraphInsertBufferNode_Operate(t *testing.T) {
@ -107,3 +115,108 @@ func genInsertMsg() insertMsg {
return *iMsg
}
func TestFlushSegmentTxn(t *testing.T) {
idAllocMock := NewAllocatorFactory(1)
mockMinIO := memkv.NewMemoryKV()
segmentID, _ := idAllocMock.allocID()
partitionID, _ := idAllocMock.allocID()
collectionID, _ := idAllocMock.allocID()
fmt.Printf("generate segmentID, partitionID, collectionID: %v, %v, %v\n",
segmentID, partitionID, collectionID)
collMeta := genCollectionMeta(collectionID, "test_flush_segment_txn")
flushMap := sync.Map{}
flushMeta := newBinlogMeta()
finishCh := make(chan map[UniqueID]string)
insertData := &InsertData{
Data: make(map[storage.FieldID]storage.FieldData),
}
insertData.Data[0] = &storage.Int64FieldData{
NumRows: 10,
Data: []int64{0, 1, 2, 3, 4, 5, 6, 7, 8, 9},
}
insertData.Data[1] = &storage.Int64FieldData{
NumRows: 10,
Data: []int64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
}
insertData.Data[107] = &storage.FloatFieldData{
NumRows: 10,
Data: make([]float32, 10),
}
flushMap.Store(segmentID, insertData)
go func(wait <-chan map[UniqueID]string) {
field2Path := <-wait
assert.NotNil(t, field2Path)
}(finishCh)
flushSegment(collMeta,
segmentID,
partitionID,
collectionID,
&flushMap,
mockMinIO,
finishCh,
idAllocMock)
k, _ := flushMeta.genKey(false, collectionID, partitionID, segmentID, 0)
key := path.Join(Params.StatsBinlogRootPath, k)
_, values, _ := mockMinIO.LoadWithPrefix(key)
assert.Equal(t, len(values), 1)
assert.Equal(t, values[0], `{"max":9,"min":0}`)
}
func genCollectionMeta(collectionID UniqueID, collectionName string) *etcdpb.CollectionMeta {
sch := schemapb.CollectionSchema{
Name: collectionName,
Description: "test collection by meta factory",
AutoID: true,
Fields: []*schemapb.FieldSchema{
{
FieldID: 0,
Name: "RowID",
Description: "RowID field",
DataType: schemapb.DataType_Int64,
TypeParams: []*commonpb.KeyValuePair{
{
Key: "f0_tk1",
Value: "f0_tv1",
},
},
},
{
FieldID: 1,
Name: "Timestamp",
Description: "Timestamp field",
DataType: schemapb.DataType_Int64,
TypeParams: []*commonpb.KeyValuePair{
{
Key: "f1_tk1",
Value: "f1_tv1",
},
},
},
{
FieldID: 107,
Name: "float32_field",
Description: "field 107",
DataType: schemapb.DataType_Float,
TypeParams: []*commonpb.KeyValuePair{},
IndexParams: []*commonpb.KeyValuePair{},
},
},
}
collection := etcdpb.CollectionMeta{
ID: collectionID,
Schema: &sch,
CreateTime: Timestamp(1),
SegmentIDs: make([]UniqueID, 0),
PartitionIDs: []UniqueID{0},
}
return &collection
}


@ -33,6 +33,7 @@ type ParamTable struct {
FlushInsertBufferSize int32
InsertBinlogRootPath string
DdlBinlogRootPath string
StatsBinlogRootPath string
Log log.Config
// === DataNode External Components Configs ===
@ -89,6 +90,7 @@ func (p *ParamTable) Init() {
p.initFlushInsertBufferSize()
p.initInsertBinlogRootPath()
p.initDdlBinlogRootPath()
p.initStatsBinlogRootPath()
p.initLogCfg()
// === DataNode External Components Configs ===
@ -166,6 +168,14 @@ func (p *ParamTable) initDdlBinlogRootPath() {
p.DdlBinlogRootPath = path.Join(rootPath, "data_definition_log")
}
func (p *ParamTable) initStatsBinlogRootPath() {
rootPath, err := p.Load("etcd.rootPath")
if err != nil {
panic(err)
}
p.StatsBinlogRootPath = path.Join(rootPath, "stats_log")
}
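
initStatsBinlogRootPath mirrors the existing initInsertBinlogRootPath/initDdlBinlogRootPath pattern. A tiny sketch of the resulting value, assuming etcd.rootPath resolves to "by-dev" (an example value, not taken from this diff):

package main

import (
	"fmt"
	"path"
)

func main() {
	rootPath := "by-dev" // example; the real value comes from p.Load("etcd.rootPath")
	fmt.Println(path.Join(rootPath, "stats_log")) // -> by-dev/stats_log
}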
// ---- Pulsar ----
func (p *ParamTable) initPulsarAddress() {
url, err := p.Load("_PulsarAddress")


@ -810,7 +810,7 @@ func generateInsertBinLog(collectionID UniqueID, partitionID UniqueID, segmentID
DataType: schemapb.DataType_Int64,
})
inCodec := storage.NewInsertCodec(collMeta)
binLogs, err := inCodec.Serialize(partitionID, segmentID, insertData)
binLogs, _, err := inCodec.Serialize(partitionID, segmentID, insertData)
if err != nil {
return nil, nil, err


@ -138,12 +138,13 @@ func NewInsertCodec(schema *etcdpb.CollectionMeta) *InsertCodec {
return &InsertCodec{Schema: schema}
}
func (insertCodec *InsertCodec) Serialize(partitionID UniqueID, segmentID UniqueID, data *InsertData) ([]*Blob, error) {
func (insertCodec *InsertCodec) Serialize(partitionID UniqueID, segmentID UniqueID, data *InsertData) ([]*Blob, []*Blob, error) {
var blobs []*Blob
var statsBlobs []*Blob
var writer *InsertBinlogWriter
timeFieldData, ok := data.Data[ms.TimeStampField]
if !ok {
return nil, errors.New("data doesn't contain timestamp field")
return nil, nil, errors.New("data doesn't contain timestamp field")
}
ts := timeFieldData.(*Int64FieldData).Data
startTs := ts[0]
@ -157,11 +158,14 @@ func (insertCodec *InsertCodec) Serialize(partitionID UniqueID, segmentID Unique
for _, field := range insertCodec.Schema.Schema.Fields {
singleData := data.Data[field.FieldID]
// encode fields
writer = NewInsertBinlogWriter(field.DataType, insertCodec.Schema.ID, partitionID, segmentID, field.FieldID)
eventWriter, err := writer.NextInsertEventWriter()
if err != nil {
return nil, err
return nil, nil, err
}
eventWriter.SetStartTimestamp(typeutil.Timestamp(startTs))
eventWriter.SetEndTimestamp(typeutil.Timestamp(endTs))
switch field.DataType {
@ -183,7 +187,7 @@ func (insertCodec *InsertCodec) Serialize(partitionID UniqueID, segmentID Unique
for _, singleString := range singleData.(*StringFieldData).Data {
err = eventWriter.AddOneStringToPayload(singleString)
if err != nil {
return nil, err
return nil, nil, err
}
}
case schemapb.DataType_BinaryVector:
@ -191,22 +195,22 @@ func (insertCodec *InsertCodec) Serialize(partitionID UniqueID, segmentID Unique
case schemapb.DataType_FloatVector:
err = eventWriter.AddFloatVectorToPayload(singleData.(*FloatVectorFieldData).Data, singleData.(*FloatVectorFieldData).Dim)
default:
return nil, fmt.Errorf("undefined data type %d", field.DataType)
return nil, nil, fmt.Errorf("undefined data type %d", field.DataType)
}
if err != nil {
return nil, err
return nil, nil, err
}
writer.SetStartTimeStamp(typeutil.Timestamp(startTs))
writer.SetEndTimeStamp(typeutil.Timestamp(endTs))
err = writer.Close()
if err != nil {
return nil, err
return nil, nil, err
}
buffer, err := writer.GetBuffer()
if err != nil {
return nil, err
return nil, nil, err
}
blobKey := fmt.Sprintf("%d", field.FieldID)
blobs = append(blobs, &Blob{
@ -214,9 +218,23 @@ func (insertCodec *InsertCodec) Serialize(partitionID UniqueID, segmentID Unique
Value: buffer,
})
// write field statistics (only int64 fields are summarized for now)
statsWriter := &StatsWriter{}
switch field.DataType {
case schemapb.DataType_Int64:
err = statsWriter.StatsInt64(singleData.(*Int64FieldData).Data)
}
if err != nil {
return nil, nil, err
}
statsBuffer := statsWriter.GetBuffer()
statsBlobs = append(statsBlobs, &Blob{
Key: blobKey,
Value: statsBuffer,
})
}
return blobs, nil
return blobs, statsBlobs, nil
}
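
Callers now unpack three values, as all the test updates below do. The pairing Serialize produces can be modeled in a small standalone sketch: blobs and statsBlobs get one entry per field with the same Key (the fieldID as a string), and only int64 fields carry a non-empty stats payload. The Blob model here is simplified for illustration, not the storage package's full type.

package main

import "fmt"

// Simplified model of what Serialize now returns: per field, an insert Blob
// and a stats Blob that share the same Key (the fieldID as a string).
type Blob struct {
	Key   string
	Value []byte
}

func main() {
	binLogs := []Blob{{Key: "1", Value: []byte("<insert binlog bytes>")}}
	statsBinlogs := []Blob{{Key: "1", Value: []byte(`{"max":9,"min":0}`)}}
	for i := range binLogs {
		fmt.Printf("field %s: insert=%d bytes, stats=%s\n",
			binLogs[i].Key, len(binLogs[i].Value), statsBinlogs[i].Value)
	}
}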
func (insertCodec *InsertCodec) Deserialize(blobs []*Blob) (partitionID UniqueID, segmentID UniqueID, data *InsertData, err error) {
if len(blobs) == 0 {


@ -228,13 +228,13 @@ func TestInsertCodec(t *testing.T) {
},
},
}
firstBlobs, err := insertCodec.Serialize(1, 1, insertDataFirst)
firstBlobs, _, err := insertCodec.Serialize(1, 1, insertDataFirst)
assert.Nil(t, err)
for _, blob := range firstBlobs {
blob.Key = fmt.Sprintf("1/insert_log/2/3/4/5/%d", 100)
assert.Equal(t, blob.GetKey(), blob.Key)
}
secondBlobs, err := insertCodec.Serialize(1, 1, insertDataSecond)
secondBlobs, _, err := insertCodec.Serialize(1, 1, insertDataSecond)
assert.Nil(t, err)
for _, blob := range secondBlobs {
blob.Key = fmt.Sprintf("1/insert_log/2/3/4/5/%d", 99)
@ -353,7 +353,7 @@ func TestIndexCodec(t *testing.T) {
func TestTsError(t *testing.T) {
insertData := &InsertData{}
insertCodec := NewInsertCodec(nil)
blobs, err := insertCodec.Serialize(1, 1, insertData)
blobs, _, err := insertCodec.Serialize(1, 1, insertData)
assert.Nil(t, blobs)
assert.NotNil(t, err)
}
@ -410,7 +410,7 @@ func TestSchemaError(t *testing.T) {
},
}
insertCodec := NewInsertCodec(schema)
blobs, err := insertCodec.Serialize(1, 1, insertData)
blobs, _, err := insertCodec.Serialize(1, 1, insertData)
assert.Nil(t, blobs)
assert.NotNil(t, err)
}


@ -281,7 +281,7 @@ func TestPrintBinlogFiles(t *testing.T) {
},
},
}
firstBlobs, err := insertCodec.Serialize(1, 1, insertDataFirst)
firstBlobs, _, err := insertCodec.Serialize(1, 1, insertDataFirst)
assert.Nil(t, err)
var binlogFiles []string
for index, blob := range firstBlobs {
@ -296,7 +296,7 @@ func TestPrintBinlogFiles(t *testing.T) {
err = fd.Close()
assert.Nil(t, err)
}
secondBlobs, err := insertCodec.Serialize(1, 1, insertDataSecond)
secondBlobs, _, err := insertCodec.Serialize(1, 1, insertDataSecond)
assert.Nil(t, err)
for index, blob := range secondBlobs {
blob.Key = fmt.Sprintf("1/insert_log/2/3/4/5/%d", 99)

internal/storage/stats.go (new file, 62 lines)

@ -0,0 +1,62 @@
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License.
package storage
import (
"encoding/json"
"errors"
)
type Int64Stats struct {
Max int64 `json:"max"`
Min int64 `json:"min"`
}
type StatsWriter struct {
buffer []byte
}
func (sw *StatsWriter) GetBuffer() []byte {
return sw.buffer
}
func (sw *StatsWriter) StatsInt64(msgs []int64) error {
if len(msgs) < 1 {
return errors.New("msgs must have at least one element")
}
// msgs is assumed to be sorted ascending (true for the RowID and Timestamp
// data this is applied to), so min/max can be read from the ends
stats := &Int64Stats{
Max: msgs[len(msgs)-1],
Min: msgs[0],
}
b, err := json.Marshal(stats)
if err != nil {
return err
}
sw.buffer = b
return nil
}
type StatsReader struct {
buffer []byte
}
func (sr *StatsReader) SetBuffer(buffer []byte) {
sr.buffer = buffer
}
func (sr *StatsReader) GetInt64Stats() Int64Stats {
stats := Int64Stats{}
// unmarshal errors are ignored here; malformed input yields zero-value stats
json.Unmarshal(sr.buffer, &stats)
return stats
}
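
One caveat worth flagging: StatsInt64 takes Min from the first element and Max from the last, which is only correct when the slice is sorted ascending — true for the monotonically increasing RowID and Timestamp data the tests feed it, but not for arbitrary int64 fields. If unsorted input ever needs stats, a scan-based variant could look like the following sketch (hypothetical, not part of this commit; Int64Stats is redeclared locally to keep it standalone):

package main

import (
	"errors"
	"fmt"
)

type Int64Stats struct {
	Max int64 `json:"max"`
	Min int64 `json:"min"`
}

// statsInt64Unsorted drops the sortedness assumption and scans for min/max.
func statsInt64Unsorted(msgs []int64) (Int64Stats, error) {
	if len(msgs) == 0 {
		return Int64Stats{}, errors.New("msgs must have at least one element")
	}
	stats := Int64Stats{Max: msgs[0], Min: msgs[0]}
	for _, v := range msgs[1:] {
		if v > stats.Max {
			stats.Max = v
		}
		if v < stats.Min {
			stats.Min = v
		}
	}
	return stats, nil
}

func main() {
	s, _ := statsInt64Unsorted([]int64{5, 1, 9, 3})
	fmt.Printf("%+v\n", s) // -> {Max:9 Min:1}
}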


@ -0,0 +1,37 @@
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License.
package storage
import (
"testing"
"github.com/stretchr/testify/assert"
)
func TestStatsInt64(t *testing.T) {
data := []int64{1, 2, 3, 4, 5, 6, 7, 8, 9}
sw := &StatsWriter{}
err := sw.StatsInt64(data)
assert.NoError(t, err)
b := sw.GetBuffer()
assert.Equal(t, string(b), `{"max":9,"min":1}`)
sr := &StatsReader{}
sr.SetBuffer(b)
stats := sr.GetInt64Stats()
expectedStats := Int64Stats{
Max: 9,
Min: 1,
}
assert.Equal(t, stats, expectedStats)
}