mirror of https://github.com/milvus-io/milvus.git
Fix Watch Channel canceled due to revision compacted (#9788)
Signed-off-by: Congqi Xia <congqi.xia@zilliz.com>pull/9804/head
parent
6c88774624
commit
c90a97f95a
|
@ -26,7 +26,9 @@ import (
|
||||||
"sync/atomic"
|
"sync/atomic"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
|
v3rpc "go.etcd.io/etcd/api/v3/v3rpc/rpctypes"
|
||||||
clientv3 "go.etcd.io/etcd/client/v3"
|
clientv3 "go.etcd.io/etcd/client/v3"
|
||||||
|
|
||||||
"go.uber.org/zap"
|
"go.uber.org/zap"
|
||||||
|
|
||||||
"github.com/golang/protobuf/proto"
|
"github.com/golang/protobuf/proto"
|
||||||
|
@ -200,14 +202,21 @@ func (node *DataNode) StartWatchChannels(ctx context.Context) {
|
||||||
// REF MEP#7 watch path should be [prefix]/channel/{node_id}/{channel_name}
|
// REF MEP#7 watch path should be [prefix]/channel/{node_id}/{channel_name}
|
||||||
watchPrefix := fmt.Sprintf("channel/%d", node.NodeID)
|
watchPrefix := fmt.Sprintf("channel/%d", node.NodeID)
|
||||||
evtChan := node.kvClient.WatchWithPrefix(watchPrefix)
|
evtChan := node.kvClient.WatchWithPrefix(watchPrefix)
|
||||||
|
// after watch, first check all exists nodes first
|
||||||
|
node.checkWatchedList()
|
||||||
for {
|
for {
|
||||||
select {
|
select {
|
||||||
case <-ctx.Done():
|
case <-ctx.Done():
|
||||||
log.Debug("watch etcd loop quit")
|
log.Debug("watch etcd loop quit")
|
||||||
return
|
return
|
||||||
case event := <-evtChan:
|
case event := <-evtChan:
|
||||||
if event.Canceled { // failed to watch
|
if event.Canceled { // event canceled
|
||||||
log.Warn("Watch channel failed", zap.Error(event.Err()))
|
log.Warn("watch channel canceled", zap.Error(event.Err()))
|
||||||
|
// https://github.com/etcd-io/etcd/issues/8980
|
||||||
|
if event.Err() == v3rpc.ErrCompacted {
|
||||||
|
go node.StartWatchChannels(ctx)
|
||||||
|
return
|
||||||
|
}
|
||||||
// if watch loop return due to event canceled, the datanode is not functional anymore
|
// if watch loop return due to event canceled, the datanode is not functional anymore
|
||||||
// stop the datanode and wait for restart
|
// stop the datanode and wait for restart
|
||||||
err := node.Stop()
|
err := node.Stop()
|
||||||
|
@ -223,40 +232,27 @@ func (node *DataNode) StartWatchChannels(ctx context.Context) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// checkWatchedList list all nodes under [prefix]/channel/{node_id} and make sure all nodeds are watched
|
||||||
|
// serves the corner case for etcd connection lost and missing some events
|
||||||
|
func (node *DataNode) checkWatchedList() error {
|
||||||
|
// REF MEP#7 watch path should be [prefix]/channel/{node_id}/{channel_name}
|
||||||
|
prefix := fmt.Sprintf("channel/%d", node.NodeID)
|
||||||
|
|
||||||
|
keys, values, err := node.kvClient.LoadWithPrefix(prefix)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
for i, val := range values {
|
||||||
|
node.handleWatchInfo(keys[i], []byte(val))
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
// handleChannelEvt handles event from kv watch event
|
// handleChannelEvt handles event from kv watch event
|
||||||
func (node *DataNode) handleChannelEvt(evt *clientv3.Event) {
|
func (node *DataNode) handleChannelEvt(evt *clientv3.Event) {
|
||||||
switch evt.Type {
|
switch evt.Type {
|
||||||
case clientv3.EventTypePut: // datacoord shall put channels needs to be watched here
|
case clientv3.EventTypePut: // datacoord shall put channels needs to be watched here
|
||||||
watchInfo := datapb.ChannelWatchInfo{}
|
node.handleWatchInfo(string(evt.Kv.Key), evt.Kv.Value)
|
||||||
err := proto.Unmarshal(evt.Kv.Value, &watchInfo)
|
|
||||||
if err != nil {
|
|
||||||
log.Warn("fail to parse ChannelWatchInfo", zap.String("key", string(evt.Kv.Key)), zap.Error(err))
|
|
||||||
return
|
|
||||||
}
|
|
||||||
if watchInfo.State == datapb.ChannelWatchState_Complete {
|
|
||||||
return
|
|
||||||
}
|
|
||||||
if watchInfo.Vchan == nil {
|
|
||||||
log.Warn("found ChannelWatchInfo with nil VChannelInfo", zap.String("key", string(evt.Kv.Key)))
|
|
||||||
return
|
|
||||||
}
|
|
||||||
err = node.NewDataSyncService(watchInfo.Vchan)
|
|
||||||
if err != nil {
|
|
||||||
log.Warn("fail to create DataSyncService", zap.String("key", string(evt.Kv.Key)), zap.Error(err))
|
|
||||||
return
|
|
||||||
}
|
|
||||||
watchInfo.State = datapb.ChannelWatchState_Complete
|
|
||||||
v, err := proto.Marshal(&watchInfo)
|
|
||||||
if err != nil {
|
|
||||||
log.Warn("fail to Marshal watchInfo", zap.String("key", string(evt.Kv.Key)), zap.Error(err))
|
|
||||||
return
|
|
||||||
}
|
|
||||||
err = node.kvClient.Save(fmt.Sprintf("channel/%d/%s", node.NodeID, watchInfo.Vchan.ChannelName), string(v))
|
|
||||||
if err != nil {
|
|
||||||
log.Warn("fail to change WatchState to complete", zap.String("key", string(evt.Kv.Key)), zap.Error(err))
|
|
||||||
node.ReleaseDataSyncService(string(evt.Kv.Key))
|
|
||||||
// TODO GOOSE: maybe retry logic and exit logic
|
|
||||||
}
|
|
||||||
case clientv3.EventTypeDelete:
|
case clientv3.EventTypeDelete:
|
||||||
// guaranteed there is no "/" in channel name
|
// guaranteed there is no "/" in channel name
|
||||||
parts := strings.Split(string(evt.Kv.Key), "/")
|
parts := strings.Split(string(evt.Kv.Key), "/")
|
||||||
|
@ -264,6 +260,38 @@ func (node *DataNode) handleChannelEvt(evt *clientv3.Event) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (node *DataNode) handleWatchInfo(key string, data []byte) {
|
||||||
|
watchInfo := datapb.ChannelWatchInfo{}
|
||||||
|
err := proto.Unmarshal(data, &watchInfo)
|
||||||
|
if err != nil {
|
||||||
|
log.Warn("fail to parse ChannelWatchInfo", zap.String("key", key), zap.Error(err))
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if watchInfo.State == datapb.ChannelWatchState_Complete {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if watchInfo.Vchan == nil {
|
||||||
|
log.Warn("found ChannelWatchInfo with nil VChannelInfo", zap.String("key", key))
|
||||||
|
return
|
||||||
|
}
|
||||||
|
err = node.NewDataSyncService(watchInfo.Vchan)
|
||||||
|
if err != nil {
|
||||||
|
log.Warn("fail to create DataSyncService", zap.String("key", key), zap.Error(err))
|
||||||
|
return
|
||||||
|
}
|
||||||
|
watchInfo.State = datapb.ChannelWatchState_Complete
|
||||||
|
v, err := proto.Marshal(&watchInfo)
|
||||||
|
if err != nil {
|
||||||
|
log.Warn("fail to Marshal watchInfo", zap.String("key", key), zap.Error(err))
|
||||||
|
return
|
||||||
|
}
|
||||||
|
err = node.kvClient.Save(fmt.Sprintf("channel/%d/%s", node.NodeID, watchInfo.Vchan.ChannelName), string(v))
|
||||||
|
if err != nil {
|
||||||
|
log.Warn("fail to change WatchState to complete", zap.String("key", key), zap.Error(err))
|
||||||
|
node.ReleaseDataSyncService(key)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// NewDataSyncService adds a new dataSyncService for new dmlVchannel and starts dataSyncService.
|
// NewDataSyncService adds a new dataSyncService for new dmlVchannel and starts dataSyncService.
|
||||||
func (node *DataNode) NewDataSyncService(vchan *datapb.VchannelInfo) error {
|
func (node *DataNode) NewDataSyncService(vchan *datapb.VchannelInfo) error {
|
||||||
node.chanMut.Lock()
|
node.chanMut.Lock()
|
||||||
|
|
|
@ -530,8 +530,13 @@ func TestWatchChannel(t *testing.T) {
|
||||||
|
|
||||||
kv, err := etcdkv.NewEtcdKV(Params.EtcdEndpoints, Params.MetaRootPath)
|
kv, err := etcdkv.NewEtcdKV(Params.EtcdEndpoints, Params.MetaRootPath)
|
||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
|
oldInvalidCh := "datanode-etcd-test-channel-invalid"
|
||||||
|
path := fmt.Sprintf("channel/%d/%s", node.NodeID, oldInvalidCh)
|
||||||
|
err = kv.Save(path, string([]byte{23}))
|
||||||
|
assert.NoError(t, err)
|
||||||
|
|
||||||
ch := fmt.Sprintf("datanode-etcd-test-channel_%d", rand.Int31())
|
ch := fmt.Sprintf("datanode-etcd-test-channel_%d", rand.Int31())
|
||||||
path := fmt.Sprintf("channel/%d/%s", node.NodeID, ch)
|
path = fmt.Sprintf("channel/%d/%s", node.NodeID, ch)
|
||||||
c := make(chan struct{})
|
c := make(chan struct{})
|
||||||
go func() {
|
go func() {
|
||||||
ec := kv.WatchWithPrefix(fmt.Sprintf("channel/%d", node.NodeID))
|
ec := kv.WatchWithPrefix(fmt.Sprintf("channel/%d", node.NodeID))
|
||||||
|
@ -590,4 +595,43 @@ func TestWatchChannel(t *testing.T) {
|
||||||
assert.Nil(t, err)
|
assert.Nil(t, err)
|
||||||
assert.Equal(t, s.ErrorCode, commonpb.ErrorCode_UnexpectedError)
|
assert.Equal(t, s.ErrorCode, commonpb.ErrorCode_UnexpectedError)
|
||||||
})
|
})
|
||||||
|
|
||||||
|
t.Run("handle watch info failed", func(t *testing.T) {
|
||||||
|
node.handleWatchInfo("test1", []byte{23})
|
||||||
|
|
||||||
|
node.chanMut.RLock()
|
||||||
|
_, has := node.vchan2SyncService["test1"]
|
||||||
|
assert.False(t, has)
|
||||||
|
node.chanMut.RUnlock()
|
||||||
|
|
||||||
|
info := datapb.ChannelWatchInfo{
|
||||||
|
Vchan: nil,
|
||||||
|
State: datapb.ChannelWatchState_Uncomplete,
|
||||||
|
}
|
||||||
|
bs, err := proto.Marshal(&info)
|
||||||
|
assert.NoError(t, err)
|
||||||
|
node.handleWatchInfo("test2", bs)
|
||||||
|
|
||||||
|
node.chanMut.RLock()
|
||||||
|
_, has = node.vchan2SyncService["test2"]
|
||||||
|
assert.False(t, has)
|
||||||
|
node.chanMut.RUnlock()
|
||||||
|
|
||||||
|
info = datapb.ChannelWatchInfo{
|
||||||
|
Vchan: &datapb.VchannelInfo{},
|
||||||
|
State: datapb.ChannelWatchState_Uncomplete,
|
||||||
|
}
|
||||||
|
bs, err = proto.Marshal(&info)
|
||||||
|
assert.NoError(t, err)
|
||||||
|
|
||||||
|
node.msFactory = &FailMessageStreamFactory{
|
||||||
|
node.msFactory,
|
||||||
|
}
|
||||||
|
node.handleWatchInfo("test3", bs)
|
||||||
|
node.chanMut.RLock()
|
||||||
|
_, has = node.vchan2SyncService["test3"]
|
||||||
|
assert.False(t, has)
|
||||||
|
node.chanMut.RUnlock()
|
||||||
|
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue