Use proto marshal instead of MarshalTextString in querycoord (#6958)

* use proto.Marshal instead of proto.MarshalTextString

Signed-off-by: xige-16 <xi.ge@zilliz.com>

* log error

Signed-off-by: xige-16 <xi.ge@zilliz.com>

* don't retry when initializing meta/cluster/scheduler fails

Signed-off-by: xige-16 <xi.ge@zilliz.com>

* fix return err

Signed-off-by: xige-16 <xi.ge@zilliz.com>

* log inconsistent task info

Signed-off-by: xige-16 <xi.ge@zilliz.com>
pull/6985/head
xige-16 2021-08-03 22:03:25 +08:00 committed by GitHub
parent 4123deef9e
commit c8a1f780c1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 108 additions and 54 deletions

View File

@ -106,7 +106,7 @@ func (qc *QueryCoord) ShowCollections(ctx context.Context, req *querypb.ShowColl
for _, id := range req.CollectionIDs {
if _, ok := ID2collectionInfo[id]; !ok {
status.ErrorCode = commonpb.ErrorCode_UnexpectedError
err := errors.New("collection not exist or has not been loaded to memory")
err := errors.New("collection has not been loaded to memory or load failed")
status.Reason = err.Error()
return &querypb.ShowCollectionsResponse{
Status: status,
@ -255,7 +255,7 @@ func (qc *QueryCoord) ShowPartitions(ctx context.Context, req *querypb.ShowParti
for _, id := range req.PartitionIDs {
if _, ok := ID2PartitionState[id]; !ok {
status.ErrorCode = commonpb.ErrorCode_UnexpectedError
err := errors.New("partition not exist or has not been loaded to memory")
err := errors.New("partition has not been loaded to memory or load failed")
status.Reason = err.Error()
return &querypb.ShowPartitionsResponse{
Status: status,

View File

@ -85,18 +85,7 @@ func (qc *QueryCoord) Init() error {
}
etcdKV := etcdkv.NewEtcdKV(etcdClient, Params.MetaRootPath)
qc.kvClient = etcdKV
metaKV, err := newMeta(etcdKV)
if err != nil {
return err
}
qc.meta = metaKV
qc.cluster, err = newQueryNodeCluster(metaKV, etcdKV)
if err != nil {
return err
}
qc.scheduler, err = NewTaskScheduler(qc.loopCtx, metaKV, qc.cluster, etcdKV, qc.rootCoordClient, qc.dataCoordClient)
return err
return nil
}
log.Debug("query coordinator try to connect etcd")
err := retry.Do(qc.loopCtx, connectEtcdFn, retry.Attempts(300))
@ -105,6 +94,24 @@ func (qc *QueryCoord) Init() error {
return err
}
log.Debug("query coordinator try to connect etcd success")
qc.meta, err = newMeta(qc.kvClient)
if err != nil {
log.Error("query coordinator init meta failed", zap.Error(err))
return err
}
qc.cluster, err = newQueryNodeCluster(qc.meta, qc.kvClient)
if err != nil {
log.Error("query coordinator init cluster failed", zap.Error(err))
return err
}
qc.scheduler, err = NewTaskScheduler(qc.loopCtx, qc.meta, qc.cluster, qc.kvClient, qc.rootCoordClient, qc.dataCoordClient)
if err != nil {
log.Error("query coordinator init task scheduler failed", zap.Error(err))
return err
}
return nil
}

View File

@ -66,7 +66,7 @@ type task interface {
AddChildTask(t task)
IsValid() bool
Reschedule() ([]task, error)
Marshal() string
Marshal() ([]byte, error)
State() taskState
SetState(state taskState)
}
@ -142,8 +142,8 @@ func (lct *LoadCollectionTask) MsgBase() *commonpb.MsgBase {
return lct.Base
}
func (lct *LoadCollectionTask) Marshal() string {
return proto.MarshalTextString(lct.LoadCollectionRequest)
func (lct *LoadCollectionTask) Marshal() ([]byte, error) {
return proto.Marshal(lct.LoadCollectionRequest)
}
func (lct *LoadCollectionTask) Type() commonpb.MsgType {
@ -370,8 +370,8 @@ func (rct *ReleaseCollectionTask) MsgBase() *commonpb.MsgBase {
return rct.Base
}
func (rct *ReleaseCollectionTask) Marshal() string {
return proto.MarshalTextString(rct.ReleaseCollectionRequest)
func (rct *ReleaseCollectionTask) Marshal() ([]byte, error) {
return proto.Marshal(rct.ReleaseCollectionRequest)
}
func (rct *ReleaseCollectionTask) Type() commonpb.MsgType {
@ -485,8 +485,8 @@ func (lpt *LoadPartitionTask) MsgBase() *commonpb.MsgBase {
return lpt.Base
}
func (lpt *LoadPartitionTask) Marshal() string {
return proto.MarshalTextString(lpt.LoadPartitionsRequest)
func (lpt *LoadPartitionTask) Marshal() ([]byte, error) {
return proto.Marshal(lpt.LoadPartitionsRequest)
}
func (lpt *LoadPartitionTask) Type() commonpb.MsgType {
@ -683,8 +683,8 @@ func (rpt *ReleasePartitionTask) MsgBase() *commonpb.MsgBase {
return rpt.Base
}
func (rpt *ReleasePartitionTask) Marshal() string {
return proto.MarshalTextString(rpt.ReleasePartitionsRequest)
func (rpt *ReleasePartitionTask) Marshal() ([]byte, error) {
return proto.Marshal(rpt.ReleasePartitionsRequest)
}
func (rpt *ReleasePartitionTask) Type() commonpb.MsgType {
@ -777,8 +777,8 @@ func (lst *LoadSegmentTask) MsgBase() *commonpb.MsgBase {
return lst.Base
}
func (lst *LoadSegmentTask) Marshal() string {
return proto.MarshalTextString(lst.LoadSegmentsRequest)
func (lst *LoadSegmentTask) Marshal() ([]byte, error) {
return proto.Marshal(lst.LoadSegmentsRequest)
}
func (lst *LoadSegmentTask) IsValid() bool {
@ -914,8 +914,8 @@ func (rst *ReleaseSegmentTask) MsgBase() *commonpb.MsgBase {
return rst.Base
}
func (rst *ReleaseSegmentTask) Marshal() string {
return proto.MarshalTextString(rst.ReleaseSegmentsRequest)
func (rst *ReleaseSegmentTask) Marshal() ([]byte, error) {
return proto.Marshal(rst.ReleaseSegmentsRequest)
}
func (rst *ReleaseSegmentTask) IsValid() bool {
@ -984,8 +984,8 @@ func (wdt *WatchDmChannelTask) MsgBase() *commonpb.MsgBase {
return wdt.Base
}
func (wdt *WatchDmChannelTask) Marshal() string {
return proto.MarshalTextString(wdt.WatchDmChannelsRequest)
func (wdt *WatchDmChannelTask) Marshal() ([]byte, error) {
return proto.Marshal(wdt.WatchDmChannelsRequest)
}
func (wdt *WatchDmChannelTask) IsValid() bool {
@ -1125,8 +1125,8 @@ func (wqt *WatchQueryChannelTask) MsgBase() *commonpb.MsgBase {
return wqt.Base
}
func (wqt *WatchQueryChannelTask) Marshal() string {
return proto.MarshalTextString(wqt.AddQueryChannelRequest)
func (wqt *WatchQueryChannelTask) Marshal() ([]byte, error) {
return proto.Marshal(wqt.AddQueryChannelRequest)
}
func (wqt *WatchQueryChannelTask) IsValid() bool {
@ -1207,8 +1207,8 @@ func (lbt *LoadBalanceTask) MsgBase() *commonpb.MsgBase {
return lbt.Base
}
func (lbt *LoadBalanceTask) Marshal() string {
return proto.MarshalTextString(lbt.LoadBalanceRequest)
func (lbt *LoadBalanceTask) Marshal() ([]byte, error) {
return proto.Marshal(lbt.LoadBalanceRequest)
}
func (lbt *LoadBalanceTask) Type() commonpb.MsgType {

View File

@ -148,7 +148,7 @@ func NewTaskScheduler(ctx context.Context, meta Meta, cluster *queryNodeCluster,
dataCoord: dataCoord,
}
s.triggerTaskQueue = NewTaskQueue()
idAllocator := allocator.NewGlobalIDAllocator("idTimestamp", tsoutil.NewTSOKVBase(Params.EtcdEndpoints, Params.KvRootPath, "query coordinator task id"))
idAllocator := allocator.NewGlobalIDAllocator("idTimestamp", tsoutil.NewTSOKVBase(Params.EtcdEndpoints, Params.KvRootPath, "queryCoordTaskID"))
if err := idAllocator.Initialize(); err != nil {
log.Debug("query coordinator idAllocator initialize failed", zap.Error(err))
return nil, err
@ -217,7 +217,8 @@ func (scheduler *TaskScheduler) reloadFromKV() error {
state := taskState(value)
taskInfos[taskID] = state
if _, ok := triggerTasks[taskID]; !ok {
return errors.New("taskStateInfo and triggerTaskInfo are inconsistent")
log.Error("reloadFromKV: taskStateInfo and triggerTaskInfo are inconsistent")
continue
}
triggerTasks[taskID].SetState(state)
}
@ -243,7 +244,7 @@ func (scheduler *TaskScheduler) reloadFromKV() error {
func (scheduler *TaskScheduler) unmarshalTask(t string) (task, error) {
header := commonpb.MsgHeader{}
err := proto.UnmarshalText(t, &header)
err := proto.Unmarshal([]byte(t), &header)
if err != nil {
return nil, fmt.Errorf("Failed to unmarshal message header, err %s ", err.Error())
}
@ -251,7 +252,7 @@ func (scheduler *TaskScheduler) unmarshalTask(t string) (task, error) {
switch header.Base.MsgType {
case commonpb.MsgType_LoadCollection:
loadReq := querypb.LoadCollectionRequest{}
err = proto.UnmarshalText(t, &loadReq)
err = proto.Unmarshal([]byte(t), &loadReq)
if err != nil {
log.Error(err.Error())
}
@ -270,7 +271,7 @@ func (scheduler *TaskScheduler) unmarshalTask(t string) (task, error) {
newTask = loadCollectionTask
case commonpb.MsgType_LoadPartitions:
loadReq := querypb.LoadPartitionsRequest{}
err = proto.UnmarshalText(t, &loadReq)
err = proto.Unmarshal([]byte(t), &loadReq)
if err != nil {
log.Error(err.Error())
}
@ -288,7 +289,7 @@ func (scheduler *TaskScheduler) unmarshalTask(t string) (task, error) {
newTask = loadPartitionTask
case commonpb.MsgType_ReleaseCollection:
loadReq := querypb.ReleaseCollectionRequest{}
err = proto.UnmarshalText(t, &loadReq)
err = proto.Unmarshal([]byte(t), &loadReq)
if err != nil {
log.Error(err.Error())
}
@ -306,7 +307,7 @@ func (scheduler *TaskScheduler) unmarshalTask(t string) (task, error) {
newTask = releaseCollectionTask
case commonpb.MsgType_ReleasePartitions:
loadReq := querypb.ReleasePartitionsRequest{}
err = proto.UnmarshalText(t, &loadReq)
err = proto.Unmarshal([]byte(t), &loadReq)
if err != nil {
log.Error(err.Error())
}
@ -322,7 +323,7 @@ func (scheduler *TaskScheduler) unmarshalTask(t string) (task, error) {
newTask = releasePartitionTask
case commonpb.MsgType_LoadSegments:
loadReq := querypb.LoadSegmentsRequest{}
err = proto.UnmarshalText(t, &loadReq)
err = proto.Unmarshal([]byte(t), &loadReq)
if err != nil {
log.Error(err.Error())
}
@ -339,7 +340,7 @@ func (scheduler *TaskScheduler) unmarshalTask(t string) (task, error) {
newTask = loadSegmentTask
case commonpb.MsgType_ReleaseSegments:
loadReq := querypb.ReleaseSegmentsRequest{}
err = proto.UnmarshalText(t, &loadReq)
err = proto.Unmarshal([]byte(t), &loadReq)
if err != nil {
log.Error(err.Error())
}
@ -355,7 +356,7 @@ func (scheduler *TaskScheduler) unmarshalTask(t string) (task, error) {
newTask = releaseSegmentTask
case commonpb.MsgType_WatchDmChannels:
loadReq := querypb.WatchDmChannelsRequest{}
err = proto.UnmarshalText(t, &loadReq)
err = proto.Unmarshal([]byte(t), &loadReq)
if err != nil {
log.Error(err.Error())
}
@ -372,7 +373,7 @@ func (scheduler *TaskScheduler) unmarshalTask(t string) (task, error) {
newTask = watchDmChannelTask
case commonpb.MsgType_WatchQueryChannels:
loadReq := querypb.AddQueryChannelRequest{}
err = proto.UnmarshalText(t, &loadReq)
err = proto.Unmarshal([]byte(t), &loadReq)
if err != nil {
log.Error(err.Error())
}
@ -388,7 +389,7 @@ func (scheduler *TaskScheduler) unmarshalTask(t string) (task, error) {
newTask = watchQueryChannelTask
case commonpb.MsgType_LoadBalanceSegments:
loadReq := querypb.LoadBalanceRequest{}
err = proto.UnmarshalText(t, &loadReq)
err = proto.Unmarshal([]byte(t), &loadReq)
if err != nil {
log.Error(err.Error())
}
@ -423,12 +424,16 @@ func (scheduler *TaskScheduler) Enqueue(tasks []task) {
t.SetID(id)
kvs := make(map[string]string)
taskKey := fmt.Sprintf("%s/%d", triggerTaskPrefix, t.ID())
kvs[taskKey] = t.Marshal()
blobs, err := t.Marshal()
if err != nil {
log.Error("error when save marshal task", zap.Int64("taskID", t.ID()), zap.String("error", err.Error()))
}
kvs[taskKey] = string(blobs)
stateKey := fmt.Sprintf("%s/%d", taskInfoPrefix, t.ID())
kvs[stateKey] = strconv.Itoa(int(taskUndo))
err = scheduler.client.MultiSave(kvs)
if err != nil {
log.Error("error when save trigger task to etcd", zap.Int64("taskID", t.ID()))
log.Error("error when save trigger task to etcd", zap.Int64("taskID", t.ID()), zap.String("error", err.Error()))
}
log.Debug("EnQueue a triggerTask and save to etcd", zap.Int64("taskID", t.ID()))
t.SetState(taskUndo)
@ -450,7 +455,7 @@ func (scheduler *TaskScheduler) processTask(t task) error {
key := fmt.Sprintf("%s/%d", taskInfoPrefix, t.ID())
err := scheduler.client.Save(key, strconv.Itoa(int(taskDoing)))
if err != nil {
log.Debug("processTask: update task state err", zap.String("reason", err.Error()), zap.Int64("taskID", t.ID()))
log.Error("processTask: update task state err", zap.String("reason", err.Error()), zap.Int64("taskID", t.ID()))
trace.LogError(span, err)
return err
}
@ -477,12 +482,18 @@ func (scheduler *TaskScheduler) processTask(t task) error {
childTask.SetID(id)
kvs := make(map[string]string)
taskKey := fmt.Sprintf("%s/%d", activeTaskPrefix, childTask.ID())
kvs[taskKey] = childTask.Marshal()
blobs, err := childTask.Marshal()
if err != nil {
log.Error("processTask: marshal task err", zap.String("reason", err.Error()))
trace.LogError(span, err)
return err
}
kvs[taskKey] = string(blobs)
stateKey := fmt.Sprintf("%s/%d", taskInfoPrefix, childTask.ID())
kvs[stateKey] = strconv.Itoa(int(taskUndo))
err = scheduler.client.MultiSave(kvs)
if err != nil {
log.Debug("processTask: save active task info err", zap.String("reason", err.Error()))
log.Error("processTask: save active task info err", zap.String("reason", err.Error()))
trace.LogError(span, err)
return err
}
@ -491,7 +502,7 @@ func (scheduler *TaskScheduler) processTask(t task) error {
err = scheduler.client.Save(key, strconv.Itoa(int(taskDone)))
if err != nil {
log.Debug("processTask: update task state err", zap.String("reason", err.Error()), zap.Int64("taskID", t.ID()))
log.Error("processTask: update task state err", zap.String("reason", err.Error()), zap.Int64("taskID", t.ID()))
trace.LogError(span, err)
return err
}
@ -586,7 +597,13 @@ func (scheduler *TaskScheduler) waitActivateTaskDone(wg *sync.WaitGroup, t task)
}
rt.SetID(id)
taskKey := fmt.Sprintf("%s/%d", activeTaskPrefix, rt.ID())
saves[taskKey] = rt.Marshal()
blobs, err := rt.Marshal()
if err != nil {
log.Error("waitActivateTaskDone: error when marshal active task")
continue
//TODO::xige-16 deal error when marshal task failed
}
saves[taskKey] = string(blobs)
stateKey := fmt.Sprintf("%s/%d", taskInfoPrefix, rt.ID())
saves[stateKey] = strconv.Itoa(int(taskUndo))
reSchedID = append(reSchedID, rt.ID())
@ -595,6 +612,7 @@ func (scheduler *TaskScheduler) waitActivateTaskDone(wg *sync.WaitGroup, t task)
err = scheduler.client.MultiSaveAndRemove(saves, removes)
if err != nil {
log.Error("waitActivateTaskDone: error when save and remove task from etcd")
//TODO::xige-16 deal error when save meta failed
}
log.Debug("waitActivateTaskDone: delete failed active task and save reScheduled task to etcd", zap.Int64("failed taskID", t.ID()), zap.Int64s("reScheduled taskIDs", reSchedID))

View File

@ -5,6 +5,9 @@ import (
"testing"
"time"
"go.etcd.io/etcd/clientv3"
etcdkv "github.com/milvus-io/milvus/internal/kv/etcd"
"github.com/milvus-io/milvus/internal/log"
"github.com/milvus-io/milvus/internal/proto/commonpb"
"github.com/milvus-io/milvus/internal/proto/querypb"
@ -23,8 +26,8 @@ func (tt *testTask) MsgBase() *commonpb.MsgBase {
return tt.baseMsg
}
func (tt *testTask) Marshal() string {
return ""
func (tt *testTask) Marshal() ([]byte, error) {
return []byte{}, nil
}
func (tt *testTask) Type() commonpb.MsgType {
@ -155,3 +158,29 @@ func TestWatchQueryChannel_ClearEtcdInfoAfterAssignedNodeDown(t *testing.T) {
assert.Equal(t, len(newActiveTaskIDKeys), len(activeTaskIDKeys))
queryCoord.Stop()
}
// TestUnMarshalTask_LoadCollection verifies the binary protobuf round trip of a
// trigger task: a LoadCollectionTask is serialized with Marshal, stored in etcd
// as a string value, loaded back, and handed to TaskScheduler.unmarshalTask,
// which must yield a task whose type is MsgType_LoadCollection.
//
// The test needs a reachable etcd at Params.EtcdEndpoints.
func TestUnMarshalTask_LoadCollection(t *testing.T) {
	const key = "testMarshalLoadCollection"

	etcdClient, err := clientv3.New(clientv3.Config{Endpoints: Params.EtcdEndpoints})
	if !assert.Nil(t, err) {
		return
	}
	// Release the connection when the test ends; deferred first so it runs
	// after the key cleanup registered below.
	defer etcdClient.Close()
	kv := etcdkv.NewEtcdKV(etcdClient, Params.MetaRootPath)

	loadTask := &LoadCollectionTask{
		LoadCollectionRequest: &querypb.LoadCollectionRequest{
			Base: &commonpb.MsgBase{
				MsgType: commonpb.MsgType_LoadCollection,
			},
		},
	}
	blobs, err := loadTask.Marshal()
	if !assert.Nil(t, err) {
		return
	}

	err = kv.Save(key, string(blobs))
	if !assert.Nil(t, err) {
		return
	}
	// Best-effort cleanup of the key written above.
	defer kv.RemoveWithPrefix(key)

	value, err := kv.Load(key)
	if !assert.Nil(t, err) {
		return
	}

	taskScheduler := &TaskScheduler{}
	// Named unmarshaled rather than task so the package-level task interface
	// is not shadowed.
	unmarshaled, err := taskScheduler.unmarshalTask(value)
	// Stop here on failure: calling Type() on a nil task would panic and hide
	// the real error.
	if !assert.Nil(t, err) || !assert.NotNil(t, unmarshaled) {
		return
	}
	assert.Equal(t, commonpb.MsgType_LoadCollection, unmarshaled.Type())
}