mirror of https://github.com/milvus-io/milvus.git
fix: Wrap init segcore tracing with golang timeout (#33494)
See also #33483. Wrap `C.InitTrace` & `C.SetTrace` with a timeout to prevent otlp initialization from hanging forever when the endpoint is not set correctly. --------- Signed-off-by: Congqi Xia <congqi.xia@zilliz.com> pull/33570/head
parent
34c6a989ab
commit
2b285e5573
|
@ -55,13 +55,13 @@ initTelemetry(const TraceConfig& cfg) {
|
|||
opts.transport_format = jaeger::TransportFormat::kThriftHttp;
|
||||
opts.endpoint = cfg.jaegerURL;
|
||||
exporter = jaeger::JaegerExporterFactory::Create(opts);
|
||||
LOG_INFO("init jaeger exporter, endpoint:", opts.endpoint);
|
||||
LOG_INFO("init jaeger exporter, endpoint: {}", opts.endpoint);
|
||||
} else if (cfg.exporter == "otlp") {
|
||||
auto opts = otlp::OtlpGrpcExporterOptions{};
|
||||
opts.endpoint = cfg.otlpEndpoint;
|
||||
opts.use_ssl_credentials = cfg.oltpSecure;
|
||||
exporter = otlp::OtlpGrpcExporterFactory::Create(opts);
|
||||
LOG_INFO("init otlp exporter, endpoint:", opts.endpoint);
|
||||
LOG_INFO("init otlp exporter, endpoint: {}", opts.endpoint);
|
||||
} else {
|
||||
LOG_INFO("Empty Trace");
|
||||
enable_trace = false;
|
||||
|
|
|
@ -29,6 +29,7 @@ import "C"
|
|||
|
||||
import (
|
||||
"fmt"
|
||||
"time"
|
||||
"unsafe"
|
||||
|
||||
"github.com/cockroachdb/errors"
|
||||
|
@ -61,7 +62,13 @@ func InitTraceConfig(params *paramtable.ComponentParam) {
|
|||
otlpEndpoint: endpoint,
|
||||
nodeID: nodeID,
|
||||
}
|
||||
C.InitTrace(&config)
|
||||
// otlp grpc may hang forever, so add timeout logic on the Go side
|
||||
timeout := params.TraceCfg.InitTimeoutSeconds.GetAsDuration(time.Second)
|
||||
callWithTimeout(func() {
|
||||
C.InitTrace(&config)
|
||||
}, func() {
|
||||
panic("init segcore tracing timeout, See issue #33483")
|
||||
}, timeout)
|
||||
}
|
||||
|
||||
func ResetTraceConfig(params *paramtable.ComponentParam) {
|
||||
|
@ -81,7 +88,31 @@ func ResetTraceConfig(params *paramtable.ComponentParam) {
|
|||
otlpEndpoint: endpoint,
|
||||
nodeID: nodeID,
|
||||
}
|
||||
C.SetTrace(&config)
|
||||
|
||||
// otlp grpc may hang forever, so add timeout logic on the Go side
|
||||
timeout := params.TraceCfg.InitTimeoutSeconds.GetAsDuration(time.Second)
|
||||
callWithTimeout(func() {
|
||||
C.SetTrace(&config)
|
||||
}, func() {
|
||||
panic("set segcore tracing timeout, See issue #33483")
|
||||
}, timeout)
|
||||
}
|
||||
|
||||
// callWithTimeout runs fn and waits for it to return for at most timeout.
//
// When timeout is positive, fn runs in its own goroutine. If fn has not
// returned once timeout elapses, timeoutHandler is invoked on the caller's
// goroutine; fn itself is not interrupted and its goroutine keeps running
// until fn returns on its own. When timeout is zero or negative, fn is called
// synchronously with no time limit and timeoutHandler is never invoked.
func callWithTimeout(fn func(), timeoutHandler func(), timeout time.Duration) {
	if timeout <= 0 {
		fn()
		return
	}

	// done is closed once fn returns, signalling completion to the select below.
	done := make(chan struct{})
	go func() {
		defer close(done)
		fn()
	}()

	// An explicit timer (rather than time.After) lets us release it as soon as
	// fn finishes instead of keeping it alive until the full timeout expires.
	timer := time.NewTimer(timeout)
	defer timer.Stop()

	select {
	case <-done:
	case <-timer.C:
		timeoutHandler()
	}
}
|
||||
|
||||
func InitRemoteChunkManager(params *paramtable.ComponentParam) error {
|
||||
|
|
|
@ -19,6 +19,8 @@ package initcore
|
|||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
|
||||
"github.com/milvus-io/milvus/pkg/util/paramtable"
|
||||
)
|
||||
|
||||
|
@ -29,3 +31,17 @@ func TestTracer(t *testing.T) {
|
|||
paramtable.Get().Save(paramtable.Get().TraceCfg.Exporter.Key, "stdout")
|
||||
ResetTraceConfig(paramtable.Get())
|
||||
}
|
||||
|
||||
func TestOtlpHang(t *testing.T) {
|
||||
paramtable.Init()
|
||||
InitTraceConfig(paramtable.Get())
|
||||
|
||||
paramtable.Get().Save(paramtable.Get().TraceCfg.Exporter.Key, "otlp")
|
||||
paramtable.Get().Save(paramtable.Get().TraceCfg.InitTimeoutSeconds.Key, "1")
|
||||
defer paramtable.Get().Reset(paramtable.Get().TraceCfg.Exporter.Key)
|
||||
defer paramtable.Get().Reset(paramtable.Get().TraceCfg.InitTimeoutSeconds.Key)
|
||||
|
||||
assert.Panics(t, func() {
|
||||
ResetTraceConfig(paramtable.Get())
|
||||
})
|
||||
}
|
||||
|
|
|
@ -787,11 +787,12 @@ func (t *gpuConfig) init(base *BaseTable) {
|
|||
}
|
||||
|
||||
type traceConfig struct {
|
||||
Exporter ParamItem `refreshable:"false"`
|
||||
SampleFraction ParamItem `refreshable:"false"`
|
||||
JaegerURL ParamItem `refreshable:"false"`
|
||||
OtlpEndpoint ParamItem `refreshable:"false"`
|
||||
OtlpSecure ParamItem `refreshable:"false"`
|
||||
Exporter ParamItem `refreshable:"false"`
|
||||
SampleFraction ParamItem `refreshable:"false"`
|
||||
JaegerURL ParamItem `refreshable:"false"`
|
||||
OtlpEndpoint ParamItem `refreshable:"false"`
|
||||
OtlpSecure ParamItem `refreshable:"false"`
|
||||
InitTimeoutSeconds ParamItem `refreshable:"false"`
|
||||
}
|
||||
|
||||
func (t *traceConfig) init(base *BaseTable) {
|
||||
|
@ -839,6 +840,15 @@ Fractions >= 1 will always sample. Fractions < 0 are treated as zero.`,
|
|||
Export: true,
|
||||
}
|
||||
t.OtlpSecure.Init(base.mgr)
|
||||
|
||||
t.InitTimeoutSeconds = ParamItem{
|
||||
Key: "trace.initTimeoutSeconds",
|
||||
Version: "2.4.4",
|
||||
DefaultValue: "10",
|
||||
Export: true,
|
||||
Doc: "segcore initialization timeout in seconds, preventing otlp grpc hangs forever",
|
||||
}
|
||||
t.InitTimeoutSeconds.Init(base.mgr)
|
||||
}
|
||||
|
||||
type logConfig struct {
|
||||
|
|
Loading…
Reference in New Issue