mirror of https://github.com/milvus-io/milvus.git
enhance: Log error instead of panicking if load lock wait timeout (#39308)
Related to #39205. Previous PR: #39206. This PR changes the wait-timeout behavior to log an error and return instead of panicking, so that a deadlock in only some collections does not cause read failures in other collections. Signed-off-by: Congqi Xia <congqi.xia@zilliz.com> pull/39253/merge
parent
a5a83a0904
commit
57e5652f1a
|
@ -1176,10 +1176,10 @@ func (s *LocalSegment) WarmupChunkCache(ctx context.Context, fieldID int64, mmap
|
|||
}).Await()
|
||||
case "async":
|
||||
task := func() (any, error) {
|
||||
// bad implementation: warmup runs asynchronously in another goroutine and holds the rlock.
|
||||
// the state transition of the segment in the segment loader will be blocked.
|
||||
// add a waiter to avoid it.
|
||||
s.ptrLock.BlockUntilDataLoadedOrReleased()
|
||||
// failed to wait for state update, return directly
|
||||
if !s.ptrLock.BlockUntilDataLoadedOrReleased() {
|
||||
return nil, nil
|
||||
}
|
||||
if s.PinIfNotReleased() != nil {
|
||||
return nil, nil
|
||||
}
|
||||
|
|
|
@ -7,7 +7,9 @@ import (
|
|||
|
||||
"github.com/cockroachdb/errors"
|
||||
"go.uber.org/atomic"
|
||||
"go.uber.org/zap"
|
||||
|
||||
"github.com/milvus-io/milvus/pkg/log"
|
||||
"github.com/milvus-io/milvus/pkg/util/paramtable"
|
||||
)
|
||||
|
||||
|
@ -168,10 +170,12 @@ func (ls *LoadStateLock) StartReleaseAll() (g LoadStateLockGuard) {
|
|||
}
|
||||
|
||||
// BlockUntilDataLoadedOrReleased blocks until the segment is loaded or released.
|
||||
func (ls *LoadStateLock) BlockUntilDataLoadedOrReleased() {
|
||||
func (ls *LoadStateLock) BlockUntilDataLoadedOrReleased() bool {
|
||||
var ok bool
|
||||
ls.waitOrPanic(func(state loadStateEnum) bool {
|
||||
return state == LoadStateDataLoaded || state == LoadStateReleased
|
||||
}, noop)
|
||||
}, func() { ok = true })
|
||||
return ok
|
||||
}
|
||||
|
||||
// waitUntilCanReleaseData waits until the segment's data can be released.
|
||||
|
@ -199,7 +203,7 @@ func (ls *LoadStateLock) waitOrPanic(ready func(state loadStateEnum) bool, then
|
|||
|
||||
select {
|
||||
case <-time.After(maxWaitTime):
|
||||
panic(fmt.Sprintf("max WLock wait time(%v) excceeded", maxWaitTime))
|
||||
log.Error("load state lock wait timeout", zap.Duration("maxWaitTime", maxWaitTime))
|
||||
case <-ch:
|
||||
}
|
||||
}
|
||||
|
|
|
@ -197,12 +197,15 @@ func TestWaitOrPanic(t *testing.T) {
|
|||
defer paramtable.Get().Reset(paramtable.Get().CommonCfg.MaxWLockConditionalWaitTime.Key)
|
||||
|
||||
l := NewLoadStateLock(LoadStateDataLoaded)
|
||||
executed := false
|
||||
|
||||
assert.NotPanics(t, func() {
|
||||
l.waitOrPanic(func(state loadStateEnum) bool {
|
||||
return state == LoadStateDataLoaded
|
||||
}, noop)
|
||||
}, func() { executed = true })
|
||||
})
|
||||
|
||||
assert.True(t, executed)
|
||||
})
|
||||
|
||||
t.Run("timeout_panic", func(t *testing.T) {
|
||||
|
@ -210,12 +213,14 @@ func TestWaitOrPanic(t *testing.T) {
|
|||
defer paramtable.Get().Reset(paramtable.Get().CommonCfg.MaxWLockConditionalWaitTime.Key)
|
||||
|
||||
l := NewLoadStateLock(LoadStateOnlyMeta)
|
||||
executed := false
|
||||
|
||||
assert.Panics(t, func() {
|
||||
assert.NotPanics(t, func() {
|
||||
l.waitOrPanic(func(state loadStateEnum) bool {
|
||||
return state == LoadStateDataLoaded
|
||||
}, noop)
|
||||
})
|
||||
assert.False(t, executed)
|
||||
})
|
||||
}
|
||||
|
||||
|
|
|
@ -51,9 +51,11 @@ int getAllGPUMemoryInfo(GPUMemoryInfo** infos) {
|
|||
}
|
||||
*/
|
||||
import "C"
|
||||
|
||||
import (
|
||||
"github.com/cockroachdb/errors"
|
||||
"unsafe"
|
||||
|
||||
"github.com/cockroachdb/errors"
|
||||
)
|
||||
|
||||
// GPUMemoryInfo represents a single GPU's memory information.
|
||||
|
|
Loading…
Reference in New Issue