Fix Watch Channel canceled due to revision compacted (#9788)

Signed-off-by: Congqi Xia <congqi.xia@zilliz.com>
pull/9804/head
congqixia 2021-10-13 17:02:33 +08:00 committed by GitHub
parent 6c88774624
commit c90a97f95a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 105 additions and 33 deletions

View File

@ -26,7 +26,9 @@ import (
"sync/atomic"
"time"
v3rpc "go.etcd.io/etcd/api/v3/v3rpc/rpctypes"
clientv3 "go.etcd.io/etcd/client/v3"
"go.uber.org/zap"
"github.com/golang/protobuf/proto"
@ -200,14 +202,21 @@ func (node *DataNode) StartWatchChannels(ctx context.Context) {
// REF MEP#7 watch path should be [prefix]/channel/{node_id}/{channel_name}
watchPrefix := fmt.Sprintf("channel/%d", node.NodeID)
evtChan := node.kvClient.WatchWithPrefix(watchPrefix)
// after the watch is established, check all existing nodes first
node.checkWatchedList()
for {
select {
case <-ctx.Done():
log.Debug("watch etcd loop quit")
return
case event := <-evtChan:
if event.Canceled { // failed to watch
log.Warn("Watch channel failed", zap.Error(event.Err()))
if event.Canceled { // event canceled
log.Warn("watch channel canceled", zap.Error(event.Err()))
// https://github.com/etcd-io/etcd/issues/8980
if event.Err() == v3rpc.ErrCompacted {
go node.StartWatchChannels(ctx)
return
}
// if watch loop return due to event canceled, the datanode is not functional anymore
// stop the datanode and wait for restart
err := node.Stop()
@ -223,44 +232,63 @@ func (node *DataNode) StartWatchChannels(ctx context.Context) {
}
}
// checkWatchedList lists all nodes under [prefix]/channel/{node_id} and makes sure all nodes are watched.
// It serves the corner case where the etcd connection was lost and some events were missed.
// Every loaded key/value pair is handed to handleWatchInfo; a failure of LoadWithPrefix
// is returned unchanged, otherwise the result is nil.
func (node *DataNode) checkWatchedList() error {
	// REF MEP#7 watch path should be [prefix]/channel/{node_id}/{channel_name}
	watchPath := fmt.Sprintf("channel/%d", node.NodeID)
	keys, values, loadErr := node.kvClient.LoadWithPrefix(watchPath)
	if loadErr != nil {
		return loadErr
	}
	// keys and values are walked in lockstep, driven by the number of values
	for idx := range values {
		node.handleWatchInfo(keys[idx], []byte(values[idx]))
	}
	return nil
}
// handleChannelEvt dispatches a single kv watch event by its type:
// a put event is forwarded to handleWatchInfo, a delete event releases the
// data sync service named by the last "/"-separated segment of the key.
// Any other event type is ignored.
func (node *DataNode) handleChannelEvt(evt *clientv3.Event) {
	switch evt.Type {
	case clientv3.EventTypeDelete:
		// guaranteed there is no "/" in channel name, so everything after the
		// last "/" is the channel name (the whole key when it contains no "/")
		fullKey := string(evt.Kv.Key)
		channelName := fullKey[strings.LastIndex(fullKey, "/")+1:]
		node.ReleaseDataSyncService(channelName)
	case clientv3.EventTypePut:
		// datacoord shall put channels needs to be watched here
		node.handleWatchInfo(string(evt.Kv.Key), evt.Kv.Value)
	}
}
func (node *DataNode) handleWatchInfo(key string, data []byte) {
watchInfo := datapb.ChannelWatchInfo{}
err := proto.Unmarshal(evt.Kv.Value, &watchInfo)
err := proto.Unmarshal(data, &watchInfo)
if err != nil {
log.Warn("fail to parse ChannelWatchInfo", zap.String("key", string(evt.Kv.Key)), zap.Error(err))
log.Warn("fail to parse ChannelWatchInfo", zap.String("key", key), zap.Error(err))
return
}
if watchInfo.State == datapb.ChannelWatchState_Complete {
return
}
if watchInfo.Vchan == nil {
log.Warn("found ChannelWatchInfo with nil VChannelInfo", zap.String("key", string(evt.Kv.Key)))
log.Warn("found ChannelWatchInfo with nil VChannelInfo", zap.String("key", key))
return
}
err = node.NewDataSyncService(watchInfo.Vchan)
if err != nil {
log.Warn("fail to create DataSyncService", zap.String("key", string(evt.Kv.Key)), zap.Error(err))
log.Warn("fail to create DataSyncService", zap.String("key", key), zap.Error(err))
return
}
watchInfo.State = datapb.ChannelWatchState_Complete
v, err := proto.Marshal(&watchInfo)
if err != nil {
log.Warn("fail to Marshal watchInfo", zap.String("key", string(evt.Kv.Key)), zap.Error(err))
log.Warn("fail to Marshal watchInfo", zap.String("key", key), zap.Error(err))
return
}
err = node.kvClient.Save(fmt.Sprintf("channel/%d/%s", node.NodeID, watchInfo.Vchan.ChannelName), string(v))
if err != nil {
log.Warn("fail to change WatchState to complete", zap.String("key", string(evt.Kv.Key)), zap.Error(err))
node.ReleaseDataSyncService(string(evt.Kv.Key))
// TODO GOOSE: maybe retry logic and exit logic
}
case clientv3.EventTypeDelete:
// guaranteed there is no "/" in channel name
parts := strings.Split(string(evt.Kv.Key), "/")
node.ReleaseDataSyncService(parts[len(parts)-1])
log.Warn("fail to change WatchState to complete", zap.String("key", key), zap.Error(err))
node.ReleaseDataSyncService(key)
}
}

View File

@ -530,8 +530,13 @@ func TestWatchChannel(t *testing.T) {
kv, err := etcdkv.NewEtcdKV(Params.EtcdEndpoints, Params.MetaRootPath)
require.NoError(t, err)
oldInvalidCh := "datanode-etcd-test-channel-invalid"
path := fmt.Sprintf("channel/%d/%s", node.NodeID, oldInvalidCh)
err = kv.Save(path, string([]byte{23}))
assert.NoError(t, err)
ch := fmt.Sprintf("datanode-etcd-test-channel_%d", rand.Int31())
path := fmt.Sprintf("channel/%d/%s", node.NodeID, ch)
path = fmt.Sprintf("channel/%d/%s", node.NodeID, ch)
c := make(chan struct{})
go func() {
ec := kv.WatchWithPrefix(fmt.Sprintf("channel/%d", node.NodeID))
@ -590,4 +595,43 @@ func TestWatchChannel(t *testing.T) {
assert.Nil(t, err)
assert.Equal(t, s.ErrorCode, commonpb.ErrorCode_UnexpectedError)
})
t.Run("handle watch info failed", func(t *testing.T) {
node.handleWatchInfo("test1", []byte{23})
node.chanMut.RLock()
_, has := node.vchan2SyncService["test1"]
assert.False(t, has)
node.chanMut.RUnlock()
info := datapb.ChannelWatchInfo{
Vchan: nil,
State: datapb.ChannelWatchState_Uncomplete,
}
bs, err := proto.Marshal(&info)
assert.NoError(t, err)
node.handleWatchInfo("test2", bs)
node.chanMut.RLock()
_, has = node.vchan2SyncService["test2"]
assert.False(t, has)
node.chanMut.RUnlock()
info = datapb.ChannelWatchInfo{
Vchan: &datapb.VchannelInfo{},
State: datapb.ChannelWatchState_Uncomplete,
}
bs, err = proto.Marshal(&info)
assert.NoError(t, err)
node.msFactory = &FailMessageStreamFactory{
node.msFactory,
}
node.handleWatchInfo("test3", bs)
node.chanMut.RLock()
_, has = node.vchan2SyncService["test3"]
assert.False(t, has)
node.chanMut.RUnlock()
})
}