mirror of https://github.com/milvus-io/milvus.git
fix: ChannelManager concurret Release and Watch bug (#38590)
See also: #38589 Signed-off-by: yangxuan <xuan.yang@zilliz.com>pull/38602/head
parent
8fcb33c21d
commit
c0b855dc75
|
@ -191,7 +191,15 @@ func NewStateChannelByWatchInfo(nodeID int64, info *datapb.ChannelWatchInfo) *St
|
|||
return c
|
||||
}
|
||||
|
||||
func (c *StateChannel) TransitionOnSuccess() {
|
||||
func (c *StateChannel) TransitionOnSuccess(opID int64) {
|
||||
if opID != c.Info.GetOpID() {
|
||||
log.Warn("Try to transit on success but opID not match, stay original state ",
|
||||
zap.Any("currentState", c.currentState),
|
||||
zap.String("channel", c.Name),
|
||||
zap.Int64("target opID", opID),
|
||||
zap.Int64("channel opID", c.Info.GetOpID()))
|
||||
return
|
||||
}
|
||||
switch c.currentState {
|
||||
case Standby:
|
||||
c.setState(ToWatch)
|
||||
|
@ -208,7 +216,15 @@ func (c *StateChannel) TransitionOnSuccess() {
|
|||
}
|
||||
}
|
||||
|
||||
func (c *StateChannel) TransitionOnFailure() {
|
||||
func (c *StateChannel) TransitionOnFailure(opID int64) {
|
||||
if opID != c.Info.GetOpID() {
|
||||
log.Warn("Try to transit on failure but opID not match, stay original state",
|
||||
zap.Any("currentState", c.currentState),
|
||||
zap.String("channel", c.Name),
|
||||
zap.Int64("target opID", opID),
|
||||
zap.Int64("channel opID", c.Info.GetOpID()))
|
||||
return
|
||||
}
|
||||
switch c.currentState {
|
||||
case Watching:
|
||||
c.setState(Standby)
|
||||
|
|
|
@ -546,8 +546,8 @@ func (m *ChannelManagerImpl) advanceToNotifies(ctx context.Context, toNotifies [
|
|||
nodeID := nodeAssign.NodeID
|
||||
|
||||
var (
|
||||
succeededChannels = make([]RWChannel, 0, channelCount)
|
||||
failedChannels = make([]RWChannel, 0, channelCount)
|
||||
succeededChannels = 0
|
||||
failedChannels = 0
|
||||
futures = make([]*conc.Future[any], 0, channelCount)
|
||||
)
|
||||
|
||||
|
@ -564,31 +564,42 @@ func (m *ChannelManagerImpl) advanceToNotifies(ctx context.Context, toNotifies [
|
|||
|
||||
future := getOrCreateIOPool().Submit(func() (any, error) {
|
||||
err := m.Notify(ctx, nodeID, tmpWatchInfo)
|
||||
return innerCh, err
|
||||
return poolResult{
|
||||
ch: innerCh,
|
||||
opID: tmpWatchInfo.GetOpID(),
|
||||
}, err
|
||||
})
|
||||
futures = append(futures, future)
|
||||
}
|
||||
|
||||
for _, f := range futures {
|
||||
ch, err := f.Await()
|
||||
got, err := f.Await()
|
||||
res := got.(poolResult)
|
||||
|
||||
if err != nil {
|
||||
failedChannels = append(failedChannels, ch.(RWChannel))
|
||||
log.Ctx(ctx).Warn("Failed to notify channel operations to datanode",
|
||||
zap.Int64("assignment", nodeAssign.NodeID),
|
||||
zap.Int("operation count", channelCount),
|
||||
zap.String("channel name", res.ch.GetName()),
|
||||
zap.Error(err),
|
||||
)
|
||||
failedChannels++
|
||||
} else {
|
||||
succeededChannels = append(succeededChannels, ch.(RWChannel))
|
||||
succeededChannels++
|
||||
advanced = true
|
||||
}
|
||||
|
||||
m.mu.Lock()
|
||||
m.store.UpdateState(err == nil, nodeID, res.ch, res.opID)
|
||||
m.mu.Unlock()
|
||||
}
|
||||
|
||||
log.Ctx(ctx).Info("Finish to notify channel operations to datanode",
|
||||
zap.Int64("assignment", nodeAssign.NodeID),
|
||||
zap.Int("operation count", channelCount),
|
||||
zap.Int("success count", len(succeededChannels)),
|
||||
zap.Int("failure count", len(failedChannels)),
|
||||
zap.Int("success count", succeededChannels),
|
||||
zap.Int("failure count", failedChannels),
|
||||
)
|
||||
m.mu.Lock()
|
||||
m.store.UpdateState(false, failedChannels...)
|
||||
m.store.UpdateState(true, succeededChannels...)
|
||||
m.mu.Unlock()
|
||||
}
|
||||
|
||||
return advanced
|
||||
|
@ -597,6 +608,7 @@ func (m *ChannelManagerImpl) advanceToNotifies(ctx context.Context, toNotifies [
|
|||
type poolResult struct {
|
||||
successful bool
|
||||
ch RWChannel
|
||||
opID int64
|
||||
}
|
||||
|
||||
func (m *ChannelManagerImpl) advanceToChecks(ctx context.Context, toChecks []*NodeChannelInfo) bool {
|
||||
|
@ -617,13 +629,15 @@ func (m *ChannelManagerImpl) advanceToChecks(ctx context.Context, toChecks []*No
|
|||
|
||||
for _, ch := range nodeAssign.Channels {
|
||||
innerCh := ch
|
||||
tmpWatchInfo := typeutil.Clone(innerCh.GetWatchInfo())
|
||||
|
||||
future := getOrCreateIOPool().Submit(func() (any, error) {
|
||||
successful, got := m.Check(ctx, nodeID, innerCh.GetWatchInfo())
|
||||
successful, got := m.Check(ctx, nodeID, tmpWatchInfo)
|
||||
if got {
|
||||
return poolResult{
|
||||
successful: successful,
|
||||
ch: innerCh,
|
||||
opID: tmpWatchInfo.GetOpID(),
|
||||
}, nil
|
||||
}
|
||||
return nil, errors.New("Got results with no progress")
|
||||
|
@ -636,7 +650,7 @@ func (m *ChannelManagerImpl) advanceToChecks(ctx context.Context, toChecks []*No
|
|||
if err == nil {
|
||||
m.mu.Lock()
|
||||
result := got.(poolResult)
|
||||
m.store.UpdateState(result.successful, result.ch)
|
||||
m.store.UpdateState(result.successful, nodeID, result.ch, result.opID)
|
||||
m.mu.Unlock()
|
||||
|
||||
advanced = true
|
||||
|
@ -656,6 +670,7 @@ func (m *ChannelManagerImpl) Notify(ctx context.Context, nodeID int64, info *dat
|
|||
zap.String("channel", info.GetVchan().GetChannelName()),
|
||||
zap.Int64("assignment", nodeID),
|
||||
zap.String("operation", info.GetState().String()),
|
||||
zap.Int64("opID", info.GetOpID()),
|
||||
)
|
||||
log.Info("Notify channel operation")
|
||||
err := m.subCluster.NotifyChannelOperation(ctx, nodeID, &datapb.ChannelOperationsRequest{Infos: []*datapb.ChannelWatchInfo{info}})
|
||||
|
@ -706,6 +721,7 @@ func (m *ChannelManagerImpl) Check(ctx context.Context, nodeID int64, info *data
|
|||
if resp.GetState() == datapb.ChannelWatchState_ReleaseFailure {
|
||||
return false, true
|
||||
}
|
||||
|
||||
}
|
||||
return false, false
|
||||
}
|
||||
|
|
|
@ -450,6 +450,50 @@ func (s *ChannelManagerSuite) TestAdvanceChannelState() {
|
|||
s.checkAssignment(m, 1, "ch1", Watching)
|
||||
s.checkAssignment(m, 1, "ch2", Watching)
|
||||
})
|
||||
|
||||
s.Run("advance watching channels released during check", func() {
|
||||
idx := int64(19530)
|
||||
mockAlloc := globalIDAllocator.NewMockGlobalIDAllocator(s.T())
|
||||
mockAlloc.EXPECT().AllocOne().
|
||||
RunAndReturn(func() (int64, error) {
|
||||
idx++
|
||||
return idx, nil
|
||||
})
|
||||
chNodes := map[string]int64{
|
||||
"ch1": 1,
|
||||
"ch2": 1,
|
||||
}
|
||||
s.prepareMeta(chNodes, datapb.ChannelWatchState_ToWatch)
|
||||
s.mockCluster.EXPECT().NotifyChannelOperation(mock.Anything, mock.Anything, mock.Anything).Return(nil).Twice()
|
||||
m, err := NewChannelManager(s.mockKv, s.mockHandler, s.mockCluster, mockAlloc)
|
||||
s.Require().NoError(err)
|
||||
s.checkAssignment(m, 1, "ch1", ToWatch)
|
||||
s.checkAssignment(m, 1, "ch2", ToWatch)
|
||||
|
||||
m.AdvanceChannelState(ctx)
|
||||
s.checkAssignment(m, 1, "ch1", Watching)
|
||||
s.checkAssignment(m, 1, "ch2", Watching)
|
||||
|
||||
// Release belfore check return
|
||||
s.mockCluster.EXPECT().CheckChannelOperationProgress(mock.Anything, mock.Anything, mock.Anything).RunAndReturn(func(ctx context.Context, nodeID int64, info *datapb.ChannelWatchInfo) (*datapb.ChannelOperationProgressResponse, error) {
|
||||
if info.GetVchan().GetChannelName() == "ch1" {
|
||||
m.Release(1, "ch1")
|
||||
s.checkAssignment(m, 1, "ch1", ToRelease)
|
||||
rwChannel, found := m.GetChannel(nodeID, "ch1")
|
||||
s.True(found)
|
||||
metaInfo := rwChannel.GetWatchInfo()
|
||||
s.Require().EqualValues(metaInfo.GetOpID(), 19531)
|
||||
log.Info("Trying to check this info", zap.Any("meta info", rwChannel.GetWatchInfo()))
|
||||
}
|
||||
log.Info("Trying to check this info", zap.Any("rpc info", info))
|
||||
return &datapb.ChannelOperationProgressResponse{State: datapb.ChannelWatchState_WatchSuccess, Progress: 100}, nil
|
||||
}).Twice()
|
||||
m.AdvanceChannelState(ctx)
|
||||
|
||||
s.checkAssignment(m, 1, "ch1", ToRelease)
|
||||
s.checkAssignment(m, 1, "ch2", Watched)
|
||||
})
|
||||
|
||||
s.Run("advance watching channels check ErrNodeNotFound", func() {
|
||||
chNodes := map[string]int64{
|
||||
"ch1": 1,
|
||||
|
|
|
@ -72,7 +72,7 @@ type RWChannelStore interface {
|
|||
Update(op *ChannelOpSet) error
|
||||
|
||||
// UpdateState is used by StateChannelStore only
|
||||
UpdateState(isSuccessful bool, channels ...RWChannel)
|
||||
UpdateState(isSuccessful bool, nodeID int64, channel RWChannel, opID int64)
|
||||
// SegLegacyChannelByNode is used by StateChannelStore only
|
||||
SetLegacyChannelByNode(nodeIDs ...int64)
|
||||
|
||||
|
@ -375,18 +375,17 @@ func (c *StateChannelStore) AddNode(nodeID int64) {
|
|||
}
|
||||
}
|
||||
|
||||
func (c *StateChannelStore) UpdateState(isSuccessful bool, channels ...RWChannel) {
|
||||
lo.ForEach(channels, func(ch RWChannel, _ int) {
|
||||
for _, cInfo := range c.channelsInfo {
|
||||
if stateChannel, ok := cInfo.Channels[ch.GetName()]; ok {
|
||||
if isSuccessful {
|
||||
stateChannel.(*StateChannel).TransitionOnSuccess()
|
||||
} else {
|
||||
stateChannel.(*StateChannel).TransitionOnFailure()
|
||||
}
|
||||
func (c *StateChannelStore) UpdateState(isSuccessful bool, nodeID int64, channel RWChannel, opID int64) {
|
||||
channelName := channel.GetName()
|
||||
if cInfo, ok := c.channelsInfo[nodeID]; ok {
|
||||
if stateChannel, ok := cInfo.Channels[channelName]; ok {
|
||||
if isSuccessful {
|
||||
stateChannel.(*StateChannel).TransitionOnSuccess(opID)
|
||||
} else {
|
||||
stateChannel.(*StateChannel).TransitionOnFailure(opID)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func (c *StateChannelStore) SetLegacyChannelByNode(nodeIDs ...int64) {
|
||||
|
|
|
@ -436,14 +436,14 @@ func (s *StateChannelStoreSuite) TestUpdateState() {
|
|||
ch := "ch-1"
|
||||
channel := NewStateChannel(getChannel(ch, 1))
|
||||
channel.setState(test.inChannelState)
|
||||
store.channelsInfo[1] = &NodeChannelInfo{
|
||||
store.channelsInfo[bufferID] = &NodeChannelInfo{
|
||||
NodeID: bufferID,
|
||||
Channels: map[string]RWChannel{
|
||||
ch: channel,
|
||||
},
|
||||
}
|
||||
|
||||
store.UpdateState(test.inSuccess, channel)
|
||||
store.UpdateState(test.inSuccess, bufferID, channel, 0)
|
||||
s.Equal(test.outChannelState, channel.currentState)
|
||||
})
|
||||
}
|
||||
|
|
|
@ -566,16 +566,9 @@ func (_c *MockRWChannelStore_Update_Call) RunAndReturn(run func(*ChannelOpSet) e
|
|||
return _c
|
||||
}
|
||||
|
||||
// UpdateState provides a mock function with given fields: isSuccessful, channels
|
||||
func (_m *MockRWChannelStore) UpdateState(isSuccessful bool, channels ...RWChannel) {
|
||||
_va := make([]interface{}, len(channels))
|
||||
for _i := range channels {
|
||||
_va[_i] = channels[_i]
|
||||
}
|
||||
var _ca []interface{}
|
||||
_ca = append(_ca, isSuccessful)
|
||||
_ca = append(_ca, _va...)
|
||||
_m.Called(_ca...)
|
||||
// UpdateState provides a mock function with given fields: isSuccessful, nodeID, channel, opID
|
||||
func (_m *MockRWChannelStore) UpdateState(isSuccessful bool, nodeID int64, channel RWChannel, opID int64) {
|
||||
_m.Called(isSuccessful, nodeID, channel, opID)
|
||||
}
|
||||
|
||||
// MockRWChannelStore_UpdateState_Call is a *mock.Call that shadows Run/Return methods with type explicit version for method 'UpdateState'
|
||||
|
@ -585,21 +578,16 @@ type MockRWChannelStore_UpdateState_Call struct {
|
|||
|
||||
// UpdateState is a helper method to define mock.On call
|
||||
// - isSuccessful bool
|
||||
// - channels ...RWChannel
|
||||
func (_e *MockRWChannelStore_Expecter) UpdateState(isSuccessful interface{}, channels ...interface{}) *MockRWChannelStore_UpdateState_Call {
|
||||
return &MockRWChannelStore_UpdateState_Call{Call: _e.mock.On("UpdateState",
|
||||
append([]interface{}{isSuccessful}, channels...)...)}
|
||||
// - nodeID int64
|
||||
// - channel RWChannel
|
||||
// - opID int64
|
||||
func (_e *MockRWChannelStore_Expecter) UpdateState(isSuccessful interface{}, nodeID interface{}, channel interface{}, opID interface{}) *MockRWChannelStore_UpdateState_Call {
|
||||
return &MockRWChannelStore_UpdateState_Call{Call: _e.mock.On("UpdateState", isSuccessful, nodeID, channel, opID)}
|
||||
}
|
||||
|
||||
func (_c *MockRWChannelStore_UpdateState_Call) Run(run func(isSuccessful bool, channels ...RWChannel)) *MockRWChannelStore_UpdateState_Call {
|
||||
func (_c *MockRWChannelStore_UpdateState_Call) Run(run func(isSuccessful bool, nodeID int64, channel RWChannel, opID int64)) *MockRWChannelStore_UpdateState_Call {
|
||||
_c.Call.Run(func(args mock.Arguments) {
|
||||
variadicArgs := make([]RWChannel, len(args)-1)
|
||||
for i, a := range args[1:] {
|
||||
if a != nil {
|
||||
variadicArgs[i] = a.(RWChannel)
|
||||
}
|
||||
}
|
||||
run(args[0].(bool), variadicArgs...)
|
||||
run(args[0].(bool), args[1].(int64), args[2].(RWChannel), args[3].(int64))
|
||||
})
|
||||
return _c
|
||||
}
|
||||
|
@ -609,7 +597,7 @@ func (_c *MockRWChannelStore_UpdateState_Call) Return() *MockRWChannelStore_Upda
|
|||
return _c
|
||||
}
|
||||
|
||||
func (_c *MockRWChannelStore_UpdateState_Call) RunAndReturn(run func(bool, ...RWChannel)) *MockRWChannelStore_UpdateState_Call {
|
||||
func (_c *MockRWChannelStore_UpdateState_Call) RunAndReturn(run func(bool, int64, RWChannel, int64)) *MockRWChannelStore_UpdateState_Call {
|
||||
_c.Call.Return(run)
|
||||
return _c
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue