mirror of https://github.com/milvus-io/milvus.git
Use proto marshal instead of MarshalTextString in querycoord (#6958)
* use proto marshal instead of marshalToText
* log error
* don't retry after init meta/cluster/scheduler failed
* fix return err
* log inconsistent task info

Signed-off-by: xige-16 <xi.ge@zilliz.com>
pull/6985/head
parent 4123deef9e
commit c8a1f780c1
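The central change: tasks persisted to etcd are now serialized with protobuf binary encoding (proto.Marshal) instead of the text encoding (proto.MarshalTextString). The binary form is more compact and faster to encode, but unlike the text form it returns an error that every caller must handle, which is why the task interface below changes from Marshal() string to Marshal() ([]byte, error). A minimal sketch of the difference, using the golang/protobuf API these files import as proto (this standalone program is illustrative, not part of the diff):

    package main

    import (
    	"fmt"

    	"github.com/golang/protobuf/proto"
    	"github.com/milvus-io/milvus/internal/proto/commonpb"
    )

    func main() {
    	msg := &commonpb.MsgBase{MsgType: commonpb.MsgType_LoadCollection}

    	// Old style: human-readable text format; returns a string and no error.
    	text := proto.MarshalTextString(msg)

    	// New style: compact binary wire format; callers must handle the error.
    	blob, err := proto.Marshal(msg)
    	if err != nil {
    		panic(err)
    	}
    	fmt.Printf("text: %d bytes, binary: %d bytes\n", len(text), len(blob))
    }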
@@ -106,7 +106,7 @@ func (qc *QueryCoord) ShowCollections(ctx context.Context, req *querypb.ShowColl
 	for _, id := range req.CollectionIDs {
 		if _, ok := ID2collectionInfo[id]; !ok {
 			status.ErrorCode = commonpb.ErrorCode_UnexpectedError
-			err := errors.New("collection not exist or has not been loaded to memory")
+			err := errors.New("collection has not been loaded to memory or load failed")
 			status.Reason = err.Error()
 			return &querypb.ShowCollectionsResponse{
 				Status: status,
@@ -255,7 +255,7 @@ func (qc *QueryCoord) ShowPartitions(ctx context.Context, req *querypb.ShowParti
 	for _, id := range req.PartitionIDs {
 		if _, ok := ID2PartitionState[id]; !ok {
 			status.ErrorCode = commonpb.ErrorCode_UnexpectedError
-			err := errors.New("partition not exist or has not been loaded to memory")
+			err := errors.New("partition has not been loaded to memory or load failed")
 			status.Reason = err.Error()
 			return &querypb.ShowPartitionsResponse{
 				Status: status,
@@ -85,18 +85,7 @@ func (qc *QueryCoord) Init() error {
 		}
 		etcdKV := etcdkv.NewEtcdKV(etcdClient, Params.MetaRootPath)
 		qc.kvClient = etcdKV
-		metaKV, err := newMeta(etcdKV)
-		if err != nil {
-			return err
-		}
-		qc.meta = metaKV
-		qc.cluster, err = newQueryNodeCluster(metaKV, etcdKV)
-		if err != nil {
-			return err
-		}
-
-		qc.scheduler, err = NewTaskScheduler(qc.loopCtx, metaKV, qc.cluster, etcdKV, qc.rootCoordClient, qc.dataCoordClient)
-		return err
+		return nil
 	}
 	log.Debug("query coordinator try to connect etcd")
 	err := retry.Do(qc.loopCtx, connectEtcdFn, retry.Attempts(300))
@@ -105,6 +94,24 @@ func (qc *QueryCoord) Init() error {
 		return err
 	}
 	log.Debug("query coordinator try to connect etcd success")
+	qc.meta, err = newMeta(qc.kvClient)
+	if err != nil {
+		log.Error("query coordinator init meta failed", zap.Error(err))
+		return err
+	}
+
+	qc.cluster, err = newQueryNodeCluster(qc.meta, qc.kvClient)
+	if err != nil {
+		log.Error("query coordinator init cluster failed", zap.Error(err))
+		return err
+	}
+
+	qc.scheduler, err = NewTaskScheduler(qc.loopCtx, qc.meta, qc.cluster, qc.kvClient, qc.rootCoordClient, qc.dataCoordClient)
+	if err != nil {
+		log.Error("query coordinator init task scheduler failed", zap.Error(err))
+		return err
+	}
+
 	return nil
 }
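Taken together, the two Init hunks split startup into a retried phase and a fail-fast phase: connectEtcdFn now only establishes the etcd connection (retried up to 300 times), while meta, cluster, and scheduler construction run once afterwards and abort Init on the first error, since re-running a deterministic constructor cannot help. A condensed sketch of the resulting control flow, assuming connectEtcdFn builds the client roughly as the removed lines suggest (logging elided):

    // Condensed sketch of the restructured Init; not the verbatim source.
    func (qc *QueryCoord) Init() error {
    	connectEtcdFn := func() error {
    		etcdClient, err := clientv3.New(clientv3.Config{Endpoints: Params.EtcdEndpoints})
    		if err != nil {
    			return err
    		}
    		qc.kvClient = etcdkv.NewEtcdKV(etcdClient, Params.MetaRootPath)
    		return nil
    	}
    	// Only the connection is retried: etcd outages are transient.
    	if err := retry.Do(qc.loopCtx, connectEtcdFn, retry.Attempts(300)); err != nil {
    		return err
    	}

    	// One-shot initialization: a failure here is not transient, so fail fast.
    	var err error
    	if qc.meta, err = newMeta(qc.kvClient); err != nil {
    		return err
    	}
    	if qc.cluster, err = newQueryNodeCluster(qc.meta, qc.kvClient); err != nil {
    		return err
    	}
    	qc.scheduler, err = NewTaskScheduler(qc.loopCtx, qc.meta, qc.cluster, qc.kvClient, qc.rootCoordClient, qc.dataCoordClient)
    	return err
    }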
@@ -66,7 +66,7 @@ type task interface {
 	AddChildTask(t task)
 	IsValid() bool
 	Reschedule() ([]task, error)
-	Marshal() string
+	Marshal() ([]byte, error)
 	State() taskState
 	SetState(state taskState)
 }
@@ -142,8 +142,8 @@ func (lct *LoadCollectionTask) MsgBase() *commonpb.MsgBase {
 	return lct.Base
 }
 
-func (lct *LoadCollectionTask) Marshal() string {
-	return proto.MarshalTextString(lct.LoadCollectionRequest)
+func (lct *LoadCollectionTask) Marshal() ([]byte, error) {
+	return proto.Marshal(lct.LoadCollectionRequest)
 }
 
 func (lct *LoadCollectionTask) Type() commonpb.MsgType {
@@ -370,8 +370,8 @@ func (rct *ReleaseCollectionTask) MsgBase() *commonpb.MsgBase {
 	return rct.Base
 }
 
-func (rct *ReleaseCollectionTask) Marshal() string {
-	return proto.MarshalTextString(rct.ReleaseCollectionRequest)
+func (rct *ReleaseCollectionTask) Marshal() ([]byte, error) {
+	return proto.Marshal(rct.ReleaseCollectionRequest)
 }
 
 func (rct *ReleaseCollectionTask) Type() commonpb.MsgType {
@@ -485,8 +485,8 @@ func (lpt *LoadPartitionTask) MsgBase() *commonpb.MsgBase {
 	return lpt.Base
 }
 
-func (lpt *LoadPartitionTask) Marshal() string {
-	return proto.MarshalTextString(lpt.LoadPartitionsRequest)
+func (lpt *LoadPartitionTask) Marshal() ([]byte, error) {
+	return proto.Marshal(lpt.LoadPartitionsRequest)
 }
 
 func (lpt *LoadPartitionTask) Type() commonpb.MsgType {
@@ -683,8 +683,8 @@ func (rpt *ReleasePartitionTask) MsgBase() *commonpb.MsgBase {
 	return rpt.Base
 }
 
-func (rpt *ReleasePartitionTask) Marshal() string {
-	return proto.MarshalTextString(rpt.ReleasePartitionsRequest)
+func (rpt *ReleasePartitionTask) Marshal() ([]byte, error) {
+	return proto.Marshal(rpt.ReleasePartitionsRequest)
 }
 
 func (rpt *ReleasePartitionTask) Type() commonpb.MsgType {
@@ -777,8 +777,8 @@ func (lst *LoadSegmentTask) MsgBase() *commonpb.MsgBase {
 	return lst.Base
 }
 
-func (lst *LoadSegmentTask) Marshal() string {
-	return proto.MarshalTextString(lst.LoadSegmentsRequest)
+func (lst *LoadSegmentTask) Marshal() ([]byte, error) {
+	return proto.Marshal(lst.LoadSegmentsRequest)
 }
 
 func (lst *LoadSegmentTask) IsValid() bool {
@@ -914,8 +914,8 @@ func (rst *ReleaseSegmentTask) MsgBase() *commonpb.MsgBase {
 	return rst.Base
 }
 
-func (rst *ReleaseSegmentTask) Marshal() string {
-	return proto.MarshalTextString(rst.ReleaseSegmentsRequest)
+func (rst *ReleaseSegmentTask) Marshal() ([]byte, error) {
+	return proto.Marshal(rst.ReleaseSegmentsRequest)
 }
 
 func (rst *ReleaseSegmentTask) IsValid() bool {
@@ -984,8 +984,8 @@ func (wdt *WatchDmChannelTask) MsgBase() *commonpb.MsgBase {
 	return wdt.Base
 }
 
-func (wdt *WatchDmChannelTask) Marshal() string {
-	return proto.MarshalTextString(wdt.WatchDmChannelsRequest)
+func (wdt *WatchDmChannelTask) Marshal() ([]byte, error) {
+	return proto.Marshal(wdt.WatchDmChannelsRequest)
 }
 
 func (wdt *WatchDmChannelTask) IsValid() bool {
@@ -1125,8 +1125,8 @@ func (wqt *WatchQueryChannelTask) MsgBase() *commonpb.MsgBase {
 	return wqt.Base
 }
 
-func (wqt *WatchQueryChannelTask) Marshal() string {
-	return proto.MarshalTextString(wqt.AddQueryChannelRequest)
+func (wqt *WatchQueryChannelTask) Marshal() ([]byte, error) {
+	return proto.Marshal(wqt.AddQueryChannelRequest)
 }
 
 func (wqt *WatchQueryChannelTask) IsValid() bool {
@@ -1207,8 +1207,8 @@ func (lbt *LoadBalanceTask) MsgBase() *commonpb.MsgBase {
 	return lbt.Base
 }
 
-func (lbt *LoadBalanceTask) Marshal() string {
-	return proto.MarshalTextString(lbt.LoadBalanceRequest)
+func (lbt *LoadBalanceTask) Marshal() ([]byte, error) {
+	return proto.Marshal(lbt.LoadBalanceRequest)
 }
 
 func (lbt *LoadBalanceTask) Type() commonpb.MsgType {
@@ -148,7 +148,7 @@ func NewTaskScheduler(ctx context.Context, meta Meta, cluster *queryNodeCluster,
 		dataCoord: dataCoord,
 	}
 	s.triggerTaskQueue = NewTaskQueue()
-	idAllocator := allocator.NewGlobalIDAllocator("idTimestamp", tsoutil.NewTSOKVBase(Params.EtcdEndpoints, Params.KvRootPath, "query coordinator task id"))
+	idAllocator := allocator.NewGlobalIDAllocator("idTimestamp", tsoutil.NewTSOKVBase(Params.EtcdEndpoints, Params.KvRootPath, "queryCoordTaskID"))
 	if err := idAllocator.Initialize(); err != nil {
 		log.Debug("query coordinator idAllocator initialize failed", zap.Error(err))
 		return nil, err
@@ -217,7 +217,8 @@ func (scheduler *TaskScheduler) reloadFromKV() error {
 		state := taskState(value)
 		taskInfos[taskID] = state
 		if _, ok := triggerTasks[taskID]; !ok {
-			return errors.New("taskStateInfo and triggerTaskInfo are inconsistent")
+			log.Error("reloadFromKV: taskStateInfo and triggerTaskInfo are inconsistent")
+			continue
 		}
 		triggerTasks[taskID].SetState(state)
 	}
@@ -243,7 +244,7 @@ func (scheduler *TaskScheduler) reloadFromKV() error {
 
 func (scheduler *TaskScheduler) unmarshalTask(t string) (task, error) {
 	header := commonpb.MsgHeader{}
-	err := proto.UnmarshalText(t, &header)
+	err := proto.Unmarshal([]byte(t), &header)
 	if err != nil {
 		return nil, fmt.Errorf("Failed to unmarshal message header, err %s ", err.Error())
 	}
@@ -251,7 +252,7 @@ func (scheduler *TaskScheduler) unmarshalTask(t string) (task, error) {
 	switch header.Base.MsgType {
 	case commonpb.MsgType_LoadCollection:
 		loadReq := querypb.LoadCollectionRequest{}
-		err = proto.UnmarshalText(t, &loadReq)
+		err = proto.Unmarshal([]byte(t), &loadReq)
 		if err != nil {
 			log.Error(err.Error())
 		}
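unmarshalTask can decode the same blob twice because every persisted request carries its commonpb.MsgBase in the leading field that commonpb.MsgHeader also declares, and binary protobuf skips fields the target message does not know about. So a first proto.Unmarshal into the small header recovers the MsgType, and a second one into the matching request type recovers the full payload; the remaining MsgType cases in this diff all follow that shape. A sketch of the two-phase dispatch (the function name is illustrative, not from the source):

    // Peek at the shared header to learn the message type, then decode the
    // concrete request from the same bytes.
    func peekThenDecode(blob []byte) (proto.Message, error) {
    	header := commonpb.MsgHeader{}
    	if err := proto.Unmarshal(blob, &header); err != nil {
    		return nil, fmt.Errorf("failed to unmarshal message header, err %s", err.Error())
    	}
    	switch header.Base.MsgType {
    	case commonpb.MsgType_LoadCollection:
    		loadReq := querypb.LoadCollectionRequest{}
    		if err := proto.Unmarshal(blob, &loadReq); err != nil {
    			return nil, err
    		}
    		return &loadReq, nil
    	default:
    		return nil, errors.New("unsupported message type")
    	}
    }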
@@ -270,7 +271,7 @@ func (scheduler *TaskScheduler) unmarshalTask(t string) (task, error) {
 		newTask = loadCollectionTask
 	case commonpb.MsgType_LoadPartitions:
 		loadReq := querypb.LoadPartitionsRequest{}
-		err = proto.UnmarshalText(t, &loadReq)
+		err = proto.Unmarshal([]byte(t), &loadReq)
 		if err != nil {
 			log.Error(err.Error())
 		}
@@ -288,7 +289,7 @@ func (scheduler *TaskScheduler) unmarshalTask(t string) (task, error) {
 		newTask = loadPartitionTask
 	case commonpb.MsgType_ReleaseCollection:
 		loadReq := querypb.ReleaseCollectionRequest{}
-		err = proto.UnmarshalText(t, &loadReq)
+		err = proto.Unmarshal([]byte(t), &loadReq)
 		if err != nil {
 			log.Error(err.Error())
 		}
@@ -306,7 +307,7 @@ func (scheduler *TaskScheduler) unmarshalTask(t string) (task, error) {
 		newTask = releaseCollectionTask
 	case commonpb.MsgType_ReleasePartitions:
 		loadReq := querypb.ReleasePartitionsRequest{}
-		err = proto.UnmarshalText(t, &loadReq)
+		err = proto.Unmarshal([]byte(t), &loadReq)
 		if err != nil {
 			log.Error(err.Error())
 		}
@@ -322,7 +323,7 @@ func (scheduler *TaskScheduler) unmarshalTask(t string) (task, error) {
 		newTask = releasePartitionTask
 	case commonpb.MsgType_LoadSegments:
 		loadReq := querypb.LoadSegmentsRequest{}
-		err = proto.UnmarshalText(t, &loadReq)
+		err = proto.Unmarshal([]byte(t), &loadReq)
 		if err != nil {
 			log.Error(err.Error())
 		}
@@ -339,7 +340,7 @@ func (scheduler *TaskScheduler) unmarshalTask(t string) (task, error) {
 		newTask = loadSegmentTask
 	case commonpb.MsgType_ReleaseSegments:
 		loadReq := querypb.ReleaseSegmentsRequest{}
-		err = proto.UnmarshalText(t, &loadReq)
+		err = proto.Unmarshal([]byte(t), &loadReq)
 		if err != nil {
 			log.Error(err.Error())
 		}
@@ -355,7 +356,7 @@ func (scheduler *TaskScheduler) unmarshalTask(t string) (task, error) {
 		newTask = releaseSegmentTask
 	case commonpb.MsgType_WatchDmChannels:
 		loadReq := querypb.WatchDmChannelsRequest{}
-		err = proto.UnmarshalText(t, &loadReq)
+		err = proto.Unmarshal([]byte(t), &loadReq)
 		if err != nil {
 			log.Error(err.Error())
 		}
@@ -372,7 +373,7 @@ func (scheduler *TaskScheduler) unmarshalTask(t string) (task, error) {
 		newTask = watchDmChannelTask
 	case commonpb.MsgType_WatchQueryChannels:
 		loadReq := querypb.AddQueryChannelRequest{}
-		err = proto.UnmarshalText(t, &loadReq)
+		err = proto.Unmarshal([]byte(t), &loadReq)
 		if err != nil {
 			log.Error(err.Error())
 		}
@@ -388,7 +389,7 @@ func (scheduler *TaskScheduler) unmarshalTask(t string) (task, error) {
 		newTask = watchQueryChannelTask
 	case commonpb.MsgType_LoadBalanceSegments:
 		loadReq := querypb.LoadBalanceRequest{}
-		err = proto.UnmarshalText(t, &loadReq)
+		err = proto.Unmarshal([]byte(t), &loadReq)
 		if err != nil {
 			log.Error(err.Error())
 		}
@@ -423,12 +424,16 @@ func (scheduler *TaskScheduler) Enqueue(tasks []task) {
 		t.SetID(id)
 		kvs := make(map[string]string)
 		taskKey := fmt.Sprintf("%s/%d", triggerTaskPrefix, t.ID())
-		kvs[taskKey] = t.Marshal()
+		blobs, err := t.Marshal()
+		if err != nil {
+			log.Error("error when save marshal task", zap.Int64("taskID", t.ID()), zap.String("error", err.Error()))
+		}
+		kvs[taskKey] = string(blobs)
 		stateKey := fmt.Sprintf("%s/%d", taskInfoPrefix, t.ID())
 		kvs[stateKey] = strconv.Itoa(int(taskUndo))
 		err = scheduler.client.MultiSave(kvs)
 		if err != nil {
-			log.Error("error when save trigger task to etcd", zap.Int64("taskID", t.ID()))
+			log.Error("error when save trigger task to etcd", zap.Int64("taskID", t.ID()), zap.String("error", err.Error()))
 		}
 		log.Debug("EnQueue a triggerTask and save to etcd", zap.Int64("taskID", t.ID()))
 		t.SetState(taskUndo)
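Enqueue persists each trigger task under two keys — the serialized request under triggerTaskPrefix and its state under taskInfoPrefix — and writes them through MultiSave so that reloadFromKV finds them together. The binary blob enters the map via string(blobs); Go strings are arbitrary byte containers, so the conversion is lossless even though the wire format is not valid UTF-8. Schematically (key construction reuses the prefixes from the code; values are illustrative):

    // Two entries per task, written together via MultiSave.
    taskKey := fmt.Sprintf("%s/%d", triggerTaskPrefix, t.ID()) // -> binary request blob
    stateKey := fmt.Sprintf("%s/%d", taskInfoPrefix, t.ID())   // -> task state
    kvs := map[string]string{
    	taskKey:  string(blobs),
    	stateKey: strconv.Itoa(int(taskUndo)),
    }
    err = scheduler.client.MultiSave(kvs)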
@@ -450,7 +455,7 @@ func (scheduler *TaskScheduler) processTask(t task) error {
 	key := fmt.Sprintf("%s/%d", taskInfoPrefix, t.ID())
 	err := scheduler.client.Save(key, strconv.Itoa(int(taskDoing)))
 	if err != nil {
-		log.Debug("processTask: update task state err", zap.String("reason", err.Error()), zap.Int64("taskID", t.ID()))
+		log.Error("processTask: update task state err", zap.String("reason", err.Error()), zap.Int64("taskID", t.ID()))
 		trace.LogError(span, err)
 		return err
 	}
@@ -477,12 +482,18 @@ func (scheduler *TaskScheduler) processTask(t task) error {
 		childTask.SetID(id)
 		kvs := make(map[string]string)
 		taskKey := fmt.Sprintf("%s/%d", activeTaskPrefix, childTask.ID())
-		kvs[taskKey] = childTask.Marshal()
+		blobs, err := childTask.Marshal()
+		if err != nil {
+			log.Error("processTask: marshal task err", zap.String("reason", err.Error()))
+			trace.LogError(span, err)
+			return err
+		}
+		kvs[taskKey] = string(blobs)
 		stateKey := fmt.Sprintf("%s/%d", taskInfoPrefix, childTask.ID())
 		kvs[stateKey] = strconv.Itoa(int(taskUndo))
 		err = scheduler.client.MultiSave(kvs)
 		if err != nil {
-			log.Debug("processTask: save active task info err", zap.String("reason", err.Error()))
+			log.Error("processTask: save active task info err", zap.String("reason", err.Error()))
 			trace.LogError(span, err)
 			return err
 		}
@@ -491,7 +502,7 @@ func (scheduler *TaskScheduler) processTask(t task) error {
 
 	err = scheduler.client.Save(key, strconv.Itoa(int(taskDone)))
 	if err != nil {
-		log.Debug("processTask: update task state err", zap.String("reason", err.Error()), zap.Int64("taskID", t.ID()))
+		log.Error("processTask: update task state err", zap.String("reason", err.Error()), zap.Int64("taskID", t.ID()))
 		trace.LogError(span, err)
 		return err
 	}
@@ -586,7 +597,13 @@ func (scheduler *TaskScheduler) waitActivateTaskDone(wg *sync.WaitGroup, t task)
 			}
 			rt.SetID(id)
 			taskKey := fmt.Sprintf("%s/%d", activeTaskPrefix, rt.ID())
-			saves[taskKey] = rt.Marshal()
+			blobs, err := rt.Marshal()
+			if err != nil {
+				log.Error("waitActivateTaskDone: error when marshal active task")
+				continue
+				//TODO::xige-16 deal error when marshal task failed
+			}
+			saves[taskKey] = string(blobs)
 			stateKey := fmt.Sprintf("%s/%d", taskInfoPrefix, rt.ID())
 			saves[stateKey] = strconv.Itoa(int(taskUndo))
 			reSchedID = append(reSchedID, rt.ID())
@@ -595,6 +612,7 @@ func (scheduler *TaskScheduler) waitActivateTaskDone(wg *sync.WaitGroup, t task)
 	err = scheduler.client.MultiSaveAndRemove(saves, removes)
 	if err != nil {
 		log.Error("waitActivateTaskDone: error when save and remove task from etcd")
+		//TODO::xige-16 deal error when save meta failed
 	}
 	log.Debug("waitActivateTaskDone: delete failed active task and save reScheduled task to etcd", zap.Int64("failed taskID", t.ID()), zap.Int64s("reScheduled taskIDs", reSchedID))
@@ -5,6 +5,9 @@ import (
 	"testing"
 	"time"
 
+	"go.etcd.io/etcd/clientv3"
+
+	etcdkv "github.com/milvus-io/milvus/internal/kv/etcd"
 	"github.com/milvus-io/milvus/internal/log"
 	"github.com/milvus-io/milvus/internal/proto/commonpb"
 	"github.com/milvus-io/milvus/internal/proto/querypb"
@@ -23,8 +26,8 @@ func (tt *testTask) MsgBase() *commonpb.MsgBase {
 	return tt.baseMsg
 }
 
-func (tt *testTask) Marshal() string {
-	return ""
+func (tt *testTask) Marshal() ([]byte, error) {
+	return []byte{}, nil
 }
 
 func (tt *testTask) Type() commonpb.MsgType {
@@ -155,3 +158,29 @@ func TestWatchQueryChannel_ClearEtcdInfoAfterAssignedNodeDown(t *testing.T) {
 	assert.Equal(t, len(newActiveTaskIDKeys), len(activeTaskIDKeys))
 	queryCoord.Stop()
 }
+
+func TestUnMarshalTask_LoadCollection(t *testing.T) {
+	etcdClient, err := clientv3.New(clientv3.Config{Endpoints: Params.EtcdEndpoints})
+	assert.Nil(t, err)
+	kv := etcdkv.NewEtcdKV(etcdClient, Params.MetaRootPath)
+
+	loadTask := &LoadCollectionTask{
+		LoadCollectionRequest: &querypb.LoadCollectionRequest{
+			Base: &commonpb.MsgBase{
+				MsgType: commonpb.MsgType_LoadCollection,
+			},
+		},
+	}
+	blobs, err := loadTask.Marshal()
+	assert.Nil(t, err)
+	err = kv.Save("testMarshalLoadCollection", string(blobs))
+	assert.Nil(t, err)
+	defer kv.RemoveWithPrefix("testMarshalLoadCollection")
+	value, err := kv.Load("testMarshalLoadCollection")
+	assert.Nil(t, err)
+
+	taskScheduler := &TaskScheduler{}
+	task, err := taskScheduler.unmarshalTask(value)
+	assert.Nil(t, err)
+	assert.Equal(t, task.Type(), commonpb.MsgType_LoadCollection)
+}
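The new test exercises exactly the byte-safe round trip the rest of the commit depends on: Marshal produces binary bytes, they pass through etcd as a Go string, and unmarshalTask reconstructs the task and its MsgType. A self-contained version of that round trip without etcd (illustrative only):

    package main

    import (
    	"bytes"
    	"fmt"

    	"github.com/golang/protobuf/proto"
    	"github.com/milvus-io/milvus/internal/proto/commonpb"
    	"github.com/milvus-io/milvus/internal/proto/querypb"
    )

    func main() {
    	req := &querypb.LoadCollectionRequest{
    		Base: &commonpb.MsgBase{MsgType: commonpb.MsgType_LoadCollection},
    	}
    	blobs, err := proto.Marshal(req)
    	if err != nil {
    		panic(err)
    	}

    	// Simulate the etcd round trip: []byte -> string (Save) -> []byte (Load).
    	stored := string(blobs)
    	loaded := []byte(stored)
    	fmt.Println(bytes.Equal(blobs, loaded)) // true: the conversion is lossless

    	out := &querypb.LoadCollectionRequest{}
    	if err := proto.Unmarshal(loaded, out); err != nil {
    		panic(err)
    	}
    	fmt.Println(out.Base.MsgType == commonpb.MsgType_LoadCollection) // true
    }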