enhance: Log error instead of panicking if load lock wait timeout (#39308)

Related to #39205
Previous PR #39206

This PR changes the wait-timeout behavior to log an error and return instead
of panicking, so that a deadlock affecting only some collections does not
cause read failures for other collections.

Signed-off-by: Congqi Xia <congqi.xia@zilliz.com>
pull/39253/merge
congqixia 2025-01-16 02:31:02 +08:00 committed by GitHub
parent a5a83a0904
commit 57e5652f1a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 21 additions and 10 deletions

View File

@ -1176,10 +1176,10 @@ func (s *LocalSegment) WarmupChunkCache(ctx context.Context, fieldID int64, mmap
}).Await()
case "async":
task := func() (any, error) {
// bad implementation: warmup runs asynchronously in another goroutine and holds the rlock,
// so the state transition of the segment in the segment loader would be blocked.
// add a waiter to avoid it.
s.ptrLock.BlockUntilDataLoadedOrReleased()
// failed to wait for state update, return directly
if !s.ptrLock.BlockUntilDataLoadedOrReleased() {
return nil, nil
}
if s.PinIfNotReleased() != nil {
return nil, nil
}

View File

@ -7,7 +7,9 @@ import (
"github.com/cockroachdb/errors"
"go.uber.org/atomic"
"go.uber.org/zap"
"github.com/milvus-io/milvus/pkg/log"
"github.com/milvus-io/milvus/pkg/util/paramtable"
)
@ -168,10 +170,12 @@ func (ls *LoadStateLock) StartReleaseAll() (g LoadStateLockGuard) {
}
// BlockUntilDataLoadedOrReleased blocks until the segment is loaded or released.
func (ls *LoadStateLock) BlockUntilDataLoadedOrReleased() {
func (ls *LoadStateLock) BlockUntilDataLoadedOrReleased() bool {
var ok bool
ls.waitOrPanic(func(state loadStateEnum) bool {
return state == LoadStateDataLoaded || state == LoadStateReleased
}, noop)
}, func() { ok = true })
return ok
}
// waitUntilCanReleaseData waits until the segment's data can be released.
@ -199,7 +203,7 @@ func (ls *LoadStateLock) waitOrPanic(ready func(state loadStateEnum) bool, then
select {
case <-time.After(maxWaitTime):
panic(fmt.Sprintf("max WLock wait time(%v) excceeded", maxWaitTime))
log.Error("load state lock wait timeout", zap.Duration("maxWaitTime", maxWaitTime))
case <-ch:
}
}

View File

@ -197,12 +197,15 @@ func TestWaitOrPanic(t *testing.T) {
defer paramtable.Get().Reset(paramtable.Get().CommonCfg.MaxWLockConditionalWaitTime.Key)
l := NewLoadStateLock(LoadStateDataLoaded)
executed := false
assert.NotPanics(t, func() {
l.waitOrPanic(func(state loadStateEnum) bool {
return state == LoadStateDataLoaded
}, noop)
}, func() { executed = true })
})
assert.True(t, executed)
})
t.Run("timeout_panic", func(t *testing.T) {
@ -210,12 +213,14 @@ func TestWaitOrPanic(t *testing.T) {
defer paramtable.Get().Reset(paramtable.Get().CommonCfg.MaxWLockConditionalWaitTime.Key)
l := NewLoadStateLock(LoadStateOnlyMeta)
executed := false
assert.Panics(t, func() {
assert.NotPanics(t, func() {
l.waitOrPanic(func(state loadStateEnum) bool {
return state == LoadStateDataLoaded
}, noop)
})
assert.False(t, executed)
})
}

View File

@ -51,9 +51,11 @@ int getAllGPUMemoryInfo(GPUMemoryInfo** infos) {
}
*/
import "C"
import (
"github.com/cockroachdb/errors"
"unsafe"
"github.com/cockroachdb/errors"
)
// GPUMemoryInfo represents a single GPU's memory information.