Handle flowgraph releasing properly (#16169)

See also: #15846

Signed-off-by: yangxuan <xuan.yang@zilliz.com>
pull/16552/head
XuanYang-cn 2022-04-20 16:45:41 +08:00 committed by GitHub
parent 69252f812d
commit 93777e0c9d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 67 additions and 55 deletions

View File

@ -313,7 +313,7 @@ func (node *DataNode) handleWatchInfo(e *event, key string, data []byte) {
}
if isEndWatchState(watchInfo.State) {
log.Warn("DataNode received a PUT event with an end State", zap.String("state", watchInfo.State.String()))
log.Debug("DataNode received a PUT event with an end State", zap.String("state", watchInfo.State.String()))
return
}
@ -328,14 +328,13 @@ func (node *DataNode) handleWatchInfo(e *event, key string, data []byte) {
actualManager, loaded := node.eventManagerMap.LoadOrStore(e.vChanName, newChannelEventManager(
node.handlePutEvent, node.handleDeleteEvent, retryWatchInterval,
))
if !loaded {
actualManager.(*channelEventManager).Run()
}
actualManager.(*channelEventManager).handleEvent(*e)
// Whenever a delete event comes, this eventManger will be removed from map
// Whenever a delete event comes, this eventManager will be removed from map
if e.eventType == deleteEventType {
if m, loaded := node.eventManagerMap.LoadAndDelete(e.vChanName); loaded {
m.(*channelEventManager).Close()
@ -371,29 +370,15 @@ func (node *DataNode) handlePutEvent(watchInfo *datapb.ChannelWatchInfo, version
if err := node.flowgraphManager.addAndStart(node, watchInfo.GetVchan()); err != nil {
return fmt.Errorf("fail to add and start flowgraph for vChanName: %s, err: %v", vChanName, err)
}
log.Info("handle put event: new data sync service success", zap.String("vChanName", vChanName))
defer func() {
if err != nil {
node.releaseFlowgraph(vChanName)
}
}()
log.Debug("handle put event: new data sync service success", zap.String("vChanName", vChanName))
watchInfo.State = datapb.ChannelWatchState_WatchSuccess
case datapb.ChannelWatchState_ToRelease:
success := true
func() {
defer func() {
if x := recover(); x != nil {
log.Error("release flowgraph panic", zap.Any("recovered", x))
success = false
}
}()
node.releaseFlowgraph(vChanName)
}()
if !success {
watchInfo.State = datapb.ChannelWatchState_ReleaseFailure
} else {
if node.tryToReleaseFlowgraph(vChanName) {
watchInfo.State = datapb.ChannelWatchState_ReleaseSuccess
} else {
watchInfo.State = datapb.ChannelWatchState_ReleaseFailure
}
}
@ -403,8 +388,8 @@ func (node *DataNode) handlePutEvent(watchInfo *datapb.ChannelWatchInfo, version
}
k := path.Join(Params.DataNodeCfg.ChannelWatchSubPath, fmt.Sprintf("%d", node.NodeID), vChanName)
log.Info("handle put event: try to save result state", zap.String("key", k), zap.String("state", watchInfo.State.String()))
log.Debug("handle put event: try to save result state", zap.String("key", k), zap.String("state", watchInfo.State.String()))
err = node.watchKv.CompareVersionAndSwap(k, version, string(v))
if err != nil {
return fmt.Errorf("fail to update watch state to etcd, vChanName: %s, state: %s, err: %w", vChanName, watchInfo.State.String(), err)
@ -412,22 +397,33 @@ func (node *DataNode) handlePutEvent(watchInfo *datapb.ChannelWatchInfo, version
return nil
}
func (node *DataNode) handleDeleteEvent(vChanName string) {
node.releaseFlowgraph(vChanName)
// handleDeleteEvent is the delete-event callback handed to the channel event
// manager. It delegates to tryToReleaseFlowgraph for the given vchannel and
// reports that call's result: true when the release finished, false otherwise.
func (node *DataNode) handleDeleteEvent(vChanName string) bool {
	released := node.tryToReleaseFlowgraph(vChanName)
	return released
}
func (node *DataNode) releaseFlowgraph(vChanName string) {
// tryToReleaseFlowgraph asks the flowgraph manager to release the flowgraph of
// vChanName and reports whether that completed.
//
// It returns true when the release (and the success log) ran to completion,
// and false when a panic was raised along the way. The panic is recovered and
// logged here, so it never propagates to the caller.
//
// The result is a named return value on purpose: a deferred function can only
// influence what the caller receives by writing to a named result. Assigning
// to an ordinary local from the recover handler would have no effect on the
// returned value.
func (node *DataNode) tryToReleaseFlowgraph(vChanName string) (success bool) {
	defer func() {
		if x := recover(); x != nil {
			log.Error("release flowgraph panic", zap.String("vChanName", vChanName), zap.Any("recovered", x))
			// Explicitly report failure to the caller after a recovered panic.
			success = false
		}
	}()

	node.flowgraphManager.release(vChanName)
	log.Info("try to release flowgraph success", zap.String("vChanName", vChanName))
	return true
}
// BackGroundGC runs in background to release datanode resources
// GOOSE TODO: remove background GC, using ToRelease for drop-collection after #15846
func (node *DataNode) BackGroundGC(vChannelCh <-chan string) {
log.Info("DataNode Background GC Start")
for {
select {
case vchanName := <-vChannelCh:
log.Info("GC flowgraph", zap.String("vChanName", vchanName))
node.releaseFlowgraph(vchanName)
node.tryToReleaseFlowgraph(vchanName)
case <-node.ctx.Done():
log.Warn("DataNode context done, exiting background GC")
return

View File

@ -599,9 +599,10 @@ func TestWatchChannel(t *testing.T) {
chPut <- struct{}{}
return r
},
func(vChan string) {
func(vChan string) bool {
node.handleDeleteEvent(vChan)
chDel <- struct{}{}
return true
}, time.Millisecond*100,
)
node.eventManagerMap.Store(ch, m)
@ -636,9 +637,10 @@ func TestWatchChannel(t *testing.T) {
chPut <- struct{}{}
return r
},
func(vChan string) {
func(vChan string) bool {
node.handleDeleteEvent(vChan)
chDel <- struct{}{}
return true
}, time.Millisecond*100,
)
node.eventManagerMap.Store(ch, m)

View File

@ -40,7 +40,7 @@ type channelEventManager struct {
eventChan chan event
closeChan chan struct{}
handlePutEvent func(watchInfo *datapb.ChannelWatchInfo, version int64) error // node.handlePutEvent
handleDeleteEvent func(vChanName string) // node.handleDeleteEvent
handleDeleteEvent func(vChanName string) bool // node.handleDeleteEvent
retryInterval time.Duration
}
@ -50,7 +50,7 @@ const (
)
func newChannelEventManager(handlePut func(*datapb.ChannelWatchInfo, int64) error,
handleDel func(string), retryInterval time.Duration) *channelEventManager {
handleDel func(string) bool, retryInterval time.Duration) *channelEventManager {
return &channelEventManager{
eventChan: make(chan event, 10),
closeChan: make(chan struct{}),

View File

@ -34,7 +34,7 @@ func TestChannelEventManager(t *testing.T) {
ran = true
ch <- struct{}{}
return nil
}, func(name string) {}, time.Millisecond*10)
}, func(name string) bool { return true }, time.Millisecond*10)
em.Run()
em.handleEvent(event{
@ -56,7 +56,7 @@ func TestChannelEventManager(t *testing.T) {
ran = true
ch <- struct{}{}
return nil
}, func(name string) {}, time.Millisecond*10)
}, func(name string) bool { return true }, time.Millisecond*10)
em.Run()
em.handleEvent(event{
@ -89,7 +89,7 @@ func TestChannelEventManager(t *testing.T) {
}
return errors.New("mocked error")
}, func(name string) {}, time.Millisecond*10)
}, func(name string) bool { return true }, time.Millisecond*10)
em.Run()
em.handleEvent(event{
@ -107,7 +107,7 @@ func TestChannelEventManager(t *testing.T) {
t.Run("retry until timeout", func(t *testing.T) {
em := newChannelEventManager(func(info *datapb.ChannelWatchInfo, version int64) error {
return errors.New("mocked error")
}, func(name string) {}, time.Millisecond*100)
}, func(name string) bool { return true }, time.Millisecond*100)
ch := make(chan struct{}, 1)
@ -136,7 +136,7 @@ func TestChannelEventManager(t *testing.T) {
ch := make(chan struct{}, 1)
em := newChannelEventManager(func(info *datapb.ChannelWatchInfo, version int64) error {
return errors.New("mocked error")
}, func(name string) {}, time.Millisecond*10)
}, func(name string) bool { return true }, time.Millisecond*10)
go func() {
ddl := time.Now().Add(time.Minute)
@ -168,12 +168,17 @@ func TestChannelEventManager(t *testing.T) {
t.Run("cancel by delete event", func(t *testing.T) {
ch := make(chan struct{}, 1)
ran := false
em := newChannelEventManager(func(info *datapb.ChannelWatchInfo, version int64) error {
return errors.New("mocked error")
}, func(name string) {
ran = true
ch <- struct{}{}
}, time.Millisecond*10)
em := newChannelEventManager(
func(info *datapb.ChannelWatchInfo, version int64) error {
return errors.New("mocked error")
},
func(name string) bool {
ran = true
ch <- struct{}{}
return true
},
time.Millisecond*10,
)
em.Run()
em.handleEvent(event{
eventType: putEventType,
@ -198,16 +203,19 @@ func TestChannelEventManager(t *testing.T) {
t.Run("overwrite put event", func(t *testing.T) {
ch := make(chan struct{}, 1)
ran := false
em := newChannelEventManager(func(info *datapb.ChannelWatchInfo, version int64) error {
if version > 0 {
ran = true
ch <- struct{}{}
return nil
}
return errors.New("mocked error")
}, func(name string) {
t.FailNow()
}, time.Millisecond*10)
em := newChannelEventManager(
func(info *datapb.ChannelWatchInfo, version int64) error {
if version > 0 {
ran = true
ch <- struct{}{}
return nil
}
return errors.New("mocked error")
},
func(name string) bool {
return false
},
time.Millisecond*10)
em.Run()
em.handleEvent(event{
eventType: putEventType,
@ -241,9 +249,15 @@ func TestChannelEventManager(t *testing.T) {
}
for _, es := range endStates {
em := newChannelEventManager(func(info *datapb.ChannelWatchInfo, version int64) error {
return errors.New("mocked error")
}, func(name string) { t.FailNow() }, time.Millisecond*100)
em := newChannelEventManager(
func(info *datapb.ChannelWatchInfo, version int64) error {
return errors.New("mocked error")
},
func(name string) bool {
return false
},
time.Millisecond*100,
)
ch := make(chan struct{}, 1)
ddl := time.Now().Add(time.Minute)