fix: ChannelManager concurret Release and Watch bug (#38590)

See also: #38589

Signed-off-by: yangxuan <xuan.yang@zilliz.com>
pull/38602/head
XuanYang-cn 2024-12-19 22:50:47 +08:00 committed by GitHub
parent 8fcb33c21d
commit c0b855dc75
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 115 additions and 52 deletions

View File

@ -191,7 +191,15 @@ func NewStateChannelByWatchInfo(nodeID int64, info *datapb.ChannelWatchInfo) *St
return c
}
func (c *StateChannel) TransitionOnSuccess() {
func (c *StateChannel) TransitionOnSuccess(opID int64) {
if opID != c.Info.GetOpID() {
log.Warn("Try to transit on success but opID not match, stay original state ",
zap.Any("currentState", c.currentState),
zap.String("channel", c.Name),
zap.Int64("target opID", opID),
zap.Int64("channel opID", c.Info.GetOpID()))
return
}
switch c.currentState {
case Standby:
c.setState(ToWatch)
@ -208,7 +216,15 @@ func (c *StateChannel) TransitionOnSuccess() {
}
}
func (c *StateChannel) TransitionOnFailure() {
func (c *StateChannel) TransitionOnFailure(opID int64) {
if opID != c.Info.GetOpID() {
log.Warn("Try to transit on failure but opID not match, stay original state",
zap.Any("currentState", c.currentState),
zap.String("channel", c.Name),
zap.Int64("target opID", opID),
zap.Int64("channel opID", c.Info.GetOpID()))
return
}
switch c.currentState {
case Watching:
c.setState(Standby)

View File

@ -546,8 +546,8 @@ func (m *ChannelManagerImpl) advanceToNotifies(ctx context.Context, toNotifies [
nodeID := nodeAssign.NodeID
var (
succeededChannels = make([]RWChannel, 0, channelCount)
failedChannels = make([]RWChannel, 0, channelCount)
succeededChannels = 0
failedChannels = 0
futures = make([]*conc.Future[any], 0, channelCount)
)
@ -564,31 +564,42 @@ func (m *ChannelManagerImpl) advanceToNotifies(ctx context.Context, toNotifies [
future := getOrCreateIOPool().Submit(func() (any, error) {
err := m.Notify(ctx, nodeID, tmpWatchInfo)
return innerCh, err
return poolResult{
ch: innerCh,
opID: tmpWatchInfo.GetOpID(),
}, err
})
futures = append(futures, future)
}
for _, f := range futures {
ch, err := f.Await()
got, err := f.Await()
res := got.(poolResult)
if err != nil {
failedChannels = append(failedChannels, ch.(RWChannel))
log.Ctx(ctx).Warn("Failed to notify channel operations to datanode",
zap.Int64("assignment", nodeAssign.NodeID),
zap.Int("operation count", channelCount),
zap.String("channel name", res.ch.GetName()),
zap.Error(err),
)
failedChannels++
} else {
succeededChannels = append(succeededChannels, ch.(RWChannel))
succeededChannels++
advanced = true
}
m.mu.Lock()
m.store.UpdateState(err == nil, nodeID, res.ch, res.opID)
m.mu.Unlock()
}
log.Ctx(ctx).Info("Finish to notify channel operations to datanode",
zap.Int64("assignment", nodeAssign.NodeID),
zap.Int("operation count", channelCount),
zap.Int("success count", len(succeededChannels)),
zap.Int("failure count", len(failedChannels)),
zap.Int("success count", succeededChannels),
zap.Int("failure count", failedChannels),
)
m.mu.Lock()
m.store.UpdateState(false, failedChannels...)
m.store.UpdateState(true, succeededChannels...)
m.mu.Unlock()
}
return advanced
@ -597,6 +608,7 @@ func (m *ChannelManagerImpl) advanceToNotifies(ctx context.Context, toNotifies [
type poolResult struct {
successful bool
ch RWChannel
opID int64
}
func (m *ChannelManagerImpl) advanceToChecks(ctx context.Context, toChecks []*NodeChannelInfo) bool {
@ -617,13 +629,15 @@ func (m *ChannelManagerImpl) advanceToChecks(ctx context.Context, toChecks []*No
for _, ch := range nodeAssign.Channels {
innerCh := ch
tmpWatchInfo := typeutil.Clone(innerCh.GetWatchInfo())
future := getOrCreateIOPool().Submit(func() (any, error) {
successful, got := m.Check(ctx, nodeID, innerCh.GetWatchInfo())
successful, got := m.Check(ctx, nodeID, tmpWatchInfo)
if got {
return poolResult{
successful: successful,
ch: innerCh,
opID: tmpWatchInfo.GetOpID(),
}, nil
}
return nil, errors.New("Got results with no progress")
@ -636,7 +650,7 @@ func (m *ChannelManagerImpl) advanceToChecks(ctx context.Context, toChecks []*No
if err == nil {
m.mu.Lock()
result := got.(poolResult)
m.store.UpdateState(result.successful, result.ch)
m.store.UpdateState(result.successful, nodeID, result.ch, result.opID)
m.mu.Unlock()
advanced = true
@ -656,6 +670,7 @@ func (m *ChannelManagerImpl) Notify(ctx context.Context, nodeID int64, info *dat
zap.String("channel", info.GetVchan().GetChannelName()),
zap.Int64("assignment", nodeID),
zap.String("operation", info.GetState().String()),
zap.Int64("opID", info.GetOpID()),
)
log.Info("Notify channel operation")
err := m.subCluster.NotifyChannelOperation(ctx, nodeID, &datapb.ChannelOperationsRequest{Infos: []*datapb.ChannelWatchInfo{info}})
@ -706,6 +721,7 @@ func (m *ChannelManagerImpl) Check(ctx context.Context, nodeID int64, info *data
if resp.GetState() == datapb.ChannelWatchState_ReleaseFailure {
return false, true
}
}
return false, false
}

View File

@ -450,6 +450,50 @@ func (s *ChannelManagerSuite) TestAdvanceChannelState() {
s.checkAssignment(m, 1, "ch1", Watching)
s.checkAssignment(m, 1, "ch2", Watching)
})
s.Run("advance watching channels released during check", func() {
idx := int64(19530)
mockAlloc := globalIDAllocator.NewMockGlobalIDAllocator(s.T())
mockAlloc.EXPECT().AllocOne().
RunAndReturn(func() (int64, error) {
idx++
return idx, nil
})
chNodes := map[string]int64{
"ch1": 1,
"ch2": 1,
}
s.prepareMeta(chNodes, datapb.ChannelWatchState_ToWatch)
s.mockCluster.EXPECT().NotifyChannelOperation(mock.Anything, mock.Anything, mock.Anything).Return(nil).Twice()
m, err := NewChannelManager(s.mockKv, s.mockHandler, s.mockCluster, mockAlloc)
s.Require().NoError(err)
s.checkAssignment(m, 1, "ch1", ToWatch)
s.checkAssignment(m, 1, "ch2", ToWatch)
m.AdvanceChannelState(ctx)
s.checkAssignment(m, 1, "ch1", Watching)
s.checkAssignment(m, 1, "ch2", Watching)
// Release belfore check return
s.mockCluster.EXPECT().CheckChannelOperationProgress(mock.Anything, mock.Anything, mock.Anything).RunAndReturn(func(ctx context.Context, nodeID int64, info *datapb.ChannelWatchInfo) (*datapb.ChannelOperationProgressResponse, error) {
if info.GetVchan().GetChannelName() == "ch1" {
m.Release(1, "ch1")
s.checkAssignment(m, 1, "ch1", ToRelease)
rwChannel, found := m.GetChannel(nodeID, "ch1")
s.True(found)
metaInfo := rwChannel.GetWatchInfo()
s.Require().EqualValues(metaInfo.GetOpID(), 19531)
log.Info("Trying to check this info", zap.Any("meta info", rwChannel.GetWatchInfo()))
}
log.Info("Trying to check this info", zap.Any("rpc info", info))
return &datapb.ChannelOperationProgressResponse{State: datapb.ChannelWatchState_WatchSuccess, Progress: 100}, nil
}).Twice()
m.AdvanceChannelState(ctx)
s.checkAssignment(m, 1, "ch1", ToRelease)
s.checkAssignment(m, 1, "ch2", Watched)
})
s.Run("advance watching channels check ErrNodeNotFound", func() {
chNodes := map[string]int64{
"ch1": 1,

View File

@ -72,7 +72,7 @@ type RWChannelStore interface {
Update(op *ChannelOpSet) error
// UpdateState is used by StateChannelStore only
UpdateState(isSuccessful bool, channels ...RWChannel)
UpdateState(isSuccessful bool, nodeID int64, channel RWChannel, opID int64)
// SegLegacyChannelByNode is used by StateChannelStore only
SetLegacyChannelByNode(nodeIDs ...int64)
@ -375,18 +375,17 @@ func (c *StateChannelStore) AddNode(nodeID int64) {
}
}
func (c *StateChannelStore) UpdateState(isSuccessful bool, channels ...RWChannel) {
lo.ForEach(channels, func(ch RWChannel, _ int) {
for _, cInfo := range c.channelsInfo {
if stateChannel, ok := cInfo.Channels[ch.GetName()]; ok {
if isSuccessful {
stateChannel.(*StateChannel).TransitionOnSuccess()
} else {
stateChannel.(*StateChannel).TransitionOnFailure()
}
func (c *StateChannelStore) UpdateState(isSuccessful bool, nodeID int64, channel RWChannel, opID int64) {
channelName := channel.GetName()
if cInfo, ok := c.channelsInfo[nodeID]; ok {
if stateChannel, ok := cInfo.Channels[channelName]; ok {
if isSuccessful {
stateChannel.(*StateChannel).TransitionOnSuccess(opID)
} else {
stateChannel.(*StateChannel).TransitionOnFailure(opID)
}
}
})
}
}
func (c *StateChannelStore) SetLegacyChannelByNode(nodeIDs ...int64) {

View File

@ -436,14 +436,14 @@ func (s *StateChannelStoreSuite) TestUpdateState() {
ch := "ch-1"
channel := NewStateChannel(getChannel(ch, 1))
channel.setState(test.inChannelState)
store.channelsInfo[1] = &NodeChannelInfo{
store.channelsInfo[bufferID] = &NodeChannelInfo{
NodeID: bufferID,
Channels: map[string]RWChannel{
ch: channel,
},
}
store.UpdateState(test.inSuccess, channel)
store.UpdateState(test.inSuccess, bufferID, channel, 0)
s.Equal(test.outChannelState, channel.currentState)
})
}

View File

@ -566,16 +566,9 @@ func (_c *MockRWChannelStore_Update_Call) RunAndReturn(run func(*ChannelOpSet) e
return _c
}
// UpdateState provides a mock function with given fields: isSuccessful, channels
func (_m *MockRWChannelStore) UpdateState(isSuccessful bool, channels ...RWChannel) {
_va := make([]interface{}, len(channels))
for _i := range channels {
_va[_i] = channels[_i]
}
var _ca []interface{}
_ca = append(_ca, isSuccessful)
_ca = append(_ca, _va...)
_m.Called(_ca...)
// UpdateState provides a mock function with given fields: isSuccessful, nodeID, channel, opID
func (_m *MockRWChannelStore) UpdateState(isSuccessful bool, nodeID int64, channel RWChannel, opID int64) {
_m.Called(isSuccessful, nodeID, channel, opID)
}
// MockRWChannelStore_UpdateState_Call is a *mock.Call that shadows Run/Return methods with type explicit version for method 'UpdateState'
@ -585,21 +578,16 @@ type MockRWChannelStore_UpdateState_Call struct {
// UpdateState is a helper method to define mock.On call
// - isSuccessful bool
// - channels ...RWChannel
func (_e *MockRWChannelStore_Expecter) UpdateState(isSuccessful interface{}, channels ...interface{}) *MockRWChannelStore_UpdateState_Call {
return &MockRWChannelStore_UpdateState_Call{Call: _e.mock.On("UpdateState",
append([]interface{}{isSuccessful}, channels...)...)}
// - nodeID int64
// - channel RWChannel
// - opID int64
func (_e *MockRWChannelStore_Expecter) UpdateState(isSuccessful interface{}, nodeID interface{}, channel interface{}, opID interface{}) *MockRWChannelStore_UpdateState_Call {
return &MockRWChannelStore_UpdateState_Call{Call: _e.mock.On("UpdateState", isSuccessful, nodeID, channel, opID)}
}
func (_c *MockRWChannelStore_UpdateState_Call) Run(run func(isSuccessful bool, channels ...RWChannel)) *MockRWChannelStore_UpdateState_Call {
func (_c *MockRWChannelStore_UpdateState_Call) Run(run func(isSuccessful bool, nodeID int64, channel RWChannel, opID int64)) *MockRWChannelStore_UpdateState_Call {
_c.Call.Run(func(args mock.Arguments) {
variadicArgs := make([]RWChannel, len(args)-1)
for i, a := range args[1:] {
if a != nil {
variadicArgs[i] = a.(RWChannel)
}
}
run(args[0].(bool), variadicArgs...)
run(args[0].(bool), args[1].(int64), args[2].(RWChannel), args[3].(int64))
})
return _c
}
@ -609,7 +597,7 @@ func (_c *MockRWChannelStore_UpdateState_Call) Return() *MockRWChannelStore_Upda
return _c
}
func (_c *MockRWChannelStore_UpdateState_Call) RunAndReturn(run func(bool, ...RWChannel)) *MockRWChannelStore_UpdateState_Call {
func (_c *MockRWChannelStore_UpdateState_Call) RunAndReturn(run func(bool, int64, RWChannel, int64)) *MockRWChannelStore_UpdateState_Call {
_c.Call.Return(run)
return _c
}