mirror of https://github.com/milvus-io/milvus.git
Add datacoord metricsinfo unit test (#7595)
Signed-off-by: Congqi Xia <congqi.xia@zilliz.com>
pull/7595/merge
parent
762060e670
commit
c96b19a640
|
@ -13,6 +13,7 @@ package datacoord
|
|||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"os"
|
||||
|
||||
"github.com/milvus-io/milvus/internal/util/typeutil"
|
||||
|
@ -26,101 +27,33 @@ import (
|
|||
"github.com/milvus-io/milvus/internal/util/metricsinfo"
|
||||
)
|
||||
|
||||
// TODO(dragondriver): add more detail metrics
|
||||
// getSystemInfoMetrics compose data cluster metrics
|
||||
func (s *Server) getSystemInfoMetrics(
|
||||
ctx context.Context,
|
||||
req *milvuspb.GetMetricsRequest,
|
||||
) (*milvuspb.GetMetricsResponse, error) {
|
||||
// TODO(dragondriver): add more detail metrics
|
||||
|
||||
// get datacoord info
|
||||
nodes := s.cluster.GetNodes()
|
||||
clusterTopology := metricsinfo.DataClusterTopology{
|
||||
Self: metricsinfo.DataCoordInfos{
|
||||
BaseComponentInfos: metricsinfo.BaseComponentInfos{
|
||||
Name: metricsinfo.ConstructComponentName(typeutil.DataCoordRole, Params.NodeID),
|
||||
HardwareInfos: metricsinfo.HardwareMetrics{
|
||||
IP: s.session.Address,
|
||||
CPUCoreCount: metricsinfo.GetCPUCoreCount(false),
|
||||
CPUCoreUsage: metricsinfo.GetCPUUsage(),
|
||||
Memory: metricsinfo.GetMemoryCount(),
|
||||
MemoryUsage: metricsinfo.GetUsedMemoryCount(),
|
||||
Disk: metricsinfo.GetDiskCount(),
|
||||
DiskUsage: metricsinfo.GetDiskUsage(),
|
||||
},
|
||||
SystemInfo: metricsinfo.DeployMetrics{
|
||||
SystemVersion: os.Getenv(metricsinfo.GitCommitEnvKey),
|
||||
DeployMode: os.Getenv(metricsinfo.DeployModeEnvKey),
|
||||
},
|
||||
// TODO(dragondriver): CreatedTime & UpdatedTime, easy but time-costing
|
||||
Type: typeutil.DataCoordRole,
|
||||
},
|
||||
SystemConfigurations: metricsinfo.DataCoordConfiguration{
|
||||
SegmentMaxSize: Params.SegmentMaxSize,
|
||||
},
|
||||
},
|
||||
ConnectedNodes: make([]metricsinfo.DataNodeInfos, 0),
|
||||
Self: s.getDataCoordMetrics(),
|
||||
ConnectedNodes: make([]metricsinfo.DataNodeInfos, 0, len(nodes)),
|
||||
}
|
||||
|
||||
nodes := s.cluster.GetNodes()
|
||||
// for each data node, fetch metrics info
|
||||
log.Debug("datacoord.getSystemInfoMetrics",
|
||||
zap.Int("data nodes num", len(nodes)))
|
||||
for _, node := range nodes {
|
||||
if node == nil {
|
||||
log.Warn("skip invalid data node",
|
||||
zap.String("reason", "datanode is nil"))
|
||||
continue
|
||||
}
|
||||
|
||||
if node.GetClient() == nil {
|
||||
log.Warn("skip invalid data node",
|
||||
zap.String("reason", "datanode client is nil"))
|
||||
continue
|
||||
}
|
||||
|
||||
metrics, err := node.GetClient().GetMetrics(ctx, req)
|
||||
infos, err := s.getDataNodeMetrics(ctx, req, node)
|
||||
if err != nil {
|
||||
log.Warn("invalid metrics of query node was found",
|
||||
zap.Error(err))
|
||||
clusterTopology.ConnectedNodes = append(clusterTopology.ConnectedNodes, metricsinfo.DataNodeInfos{
|
||||
BaseComponentInfos: metricsinfo.BaseComponentInfos{
|
||||
HasError: true,
|
||||
ErrorReason: err.Error(),
|
||||
// Name doesn't matter here cause we can't get it when error occurs, using address as the Name?
|
||||
Name: "",
|
||||
},
|
||||
})
|
||||
continue
|
||||
}
|
||||
|
||||
if metrics.Status.ErrorCode != commonpb.ErrorCode_Success {
|
||||
log.Warn("invalid metrics of query node was found",
|
||||
zap.Any("error_code", metrics.Status.ErrorCode),
|
||||
zap.Any("error_reason", metrics.Status.Reason))
|
||||
clusterTopology.ConnectedNodes = append(clusterTopology.ConnectedNodes, metricsinfo.DataNodeInfos{
|
||||
BaseComponentInfos: metricsinfo.BaseComponentInfos{
|
||||
HasError: true,
|
||||
ErrorReason: metrics.Status.Reason,
|
||||
Name: metrics.ComponentName,
|
||||
},
|
||||
})
|
||||
continue
|
||||
}
|
||||
|
||||
infos := metricsinfo.DataNodeInfos{}
|
||||
err = metricsinfo.UnmarshalComponentInfos(metrics.Response, &infos)
|
||||
if err != nil {
|
||||
log.Warn("invalid metrics of query node was found",
|
||||
zap.Error(err))
|
||||
clusterTopology.ConnectedNodes = append(clusterTopology.ConnectedNodes, metricsinfo.DataNodeInfos{
|
||||
BaseComponentInfos: metricsinfo.BaseComponentInfos{
|
||||
HasError: true,
|
||||
ErrorReason: err.Error(),
|
||||
Name: metrics.ComponentName,
|
||||
},
|
||||
})
|
||||
log.Warn("fails to get datanode metrics", zap.Error(err))
|
||||
continue
|
||||
}
|
||||
clusterTopology.ConnectedNodes = append(clusterTopology.ConnectedNodes, infos)
|
||||
}
|
||||
|
||||
// compose topology struct
|
||||
coordTopology := metricsinfo.DataCoordTopology{
|
||||
Cluster: clusterTopology,
|
||||
Connections: metricsinfo.ConnTopology{
|
||||
|
@ -130,24 +63,92 @@ func (s *Server) getSystemInfoMetrics(
|
|||
},
|
||||
}
|
||||
|
||||
resp, err := metricsinfo.MarshalTopology(coordTopology)
|
||||
resp := &milvuspb.GetMetricsResponse{
|
||||
Status: &commonpb.Status{
|
||||
ErrorCode: commonpb.ErrorCode_UnexpectedError,
|
||||
},
|
||||
Response: "",
|
||||
ComponentName: metricsinfo.ConstructComponentName(typeutil.DataCoordRole, Params.NodeID),
|
||||
}
|
||||
var err error
|
||||
resp.Response, err = metricsinfo.MarshalTopology(coordTopology)
|
||||
if err != nil {
|
||||
return &milvuspb.GetMetricsResponse{
|
||||
Status: &commonpb.Status{
|
||||
ErrorCode: commonpb.ErrorCode_UnexpectedError,
|
||||
Reason: err.Error(),
|
||||
},
|
||||
Response: "",
|
||||
ComponentName: metricsinfo.ConstructComponentName(typeutil.DataCoordRole, Params.NodeID),
|
||||
}, nil
|
||||
resp.Status.Reason = err.Error()
|
||||
return resp, nil
|
||||
}
|
||||
|
||||
return &milvuspb.GetMetricsResponse{
|
||||
Status: &commonpb.Status{
|
||||
ErrorCode: commonpb.ErrorCode_Success,
|
||||
Reason: "",
|
||||
},
|
||||
Response: resp,
|
||||
ComponentName: metricsinfo.ConstructComponentName(typeutil.DataCoordRole, Params.NodeID),
|
||||
}, nil
|
||||
resp.Status.ErrorCode = commonpb.ErrorCode_Success
|
||||
return resp, nil
|
||||
}
|
||||
|
||||
// getDataCoordMetrics composes datacoord infos
|
||||
func (s *Server) getDataCoordMetrics() metricsinfo.DataCoordInfos {
|
||||
return metricsinfo.DataCoordInfos{
|
||||
BaseComponentInfos: metricsinfo.BaseComponentInfos{
|
||||
Name: metricsinfo.ConstructComponentName(typeutil.DataCoordRole, Params.NodeID),
|
||||
HardwareInfos: metricsinfo.HardwareMetrics{
|
||||
IP: s.session.Address,
|
||||
CPUCoreCount: metricsinfo.GetCPUCoreCount(false),
|
||||
CPUCoreUsage: metricsinfo.GetCPUUsage(),
|
||||
Memory: metricsinfo.GetMemoryCount(),
|
||||
MemoryUsage: metricsinfo.GetUsedMemoryCount(),
|
||||
Disk: metricsinfo.GetDiskCount(),
|
||||
DiskUsage: metricsinfo.GetDiskUsage(),
|
||||
},
|
||||
SystemInfo: metricsinfo.DeployMetrics{
|
||||
SystemVersion: os.Getenv(metricsinfo.GitCommitEnvKey),
|
||||
DeployMode: os.Getenv(metricsinfo.DeployModeEnvKey),
|
||||
},
|
||||
// TODO(dragondriver): CreatedTime & UpdatedTime, easy but time-costing
|
||||
Type: typeutil.DataCoordRole,
|
||||
},
|
||||
SystemConfigurations: metricsinfo.DataCoordConfiguration{
|
||||
SegmentMaxSize: Params.SegmentMaxSize,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// getDataNodeMetrics composes data node infos
|
||||
// this function will invoke GetMetrics with data node specified in NodeInfo
|
||||
func (s *Server) getDataNodeMetrics(ctx context.Context, req *milvuspb.GetMetricsRequest, node *NodeInfo) (metricsinfo.DataNodeInfos, error) {
|
||||
infos := metricsinfo.DataNodeInfos{
|
||||
BaseComponentInfos: metricsinfo.BaseComponentInfos{
|
||||
HasError: true,
|
||||
},
|
||||
}
|
||||
if node == nil {
|
||||
return infos, errors.New("datanode is nil")
|
||||
}
|
||||
|
||||
if node.GetClient() == nil {
|
||||
return infos, errors.New("datanode client is nil")
|
||||
}
|
||||
|
||||
metrics, err := node.GetClient().GetMetrics(ctx, req)
|
||||
if err != nil {
|
||||
log.Warn("invalid metrics of data node was found",
|
||||
zap.Error(err))
|
||||
infos.BaseComponentInfos.ErrorReason = err.Error()
|
||||
// err handled, returns nil
|
||||
return infos, nil
|
||||
}
|
||||
infos.BaseComponentInfos.Name = metrics.GetComponentName()
|
||||
|
||||
if metrics.GetStatus().GetErrorCode() != commonpb.ErrorCode_Success {
|
||||
log.Warn("invalid metrics of data node was found",
|
||||
zap.Any("error_code", metrics.Status.ErrorCode),
|
||||
zap.Any("error_reason", metrics.Status.Reason))
|
||||
infos.BaseComponentInfos.ErrorReason = metrics.GetStatus().GetReason()
|
||||
return infos, nil
|
||||
}
|
||||
|
||||
err = metricsinfo.UnmarshalComponentInfos(metrics.GetResponse(), &infos)
|
||||
if err != nil {
|
||||
log.Warn("invalid metrics of data node was found",
|
||||
zap.Error(err))
|
||||
infos.BaseComponentInfos.ErrorReason = err.Error()
|
||||
return infos, nil
|
||||
}
|
||||
infos.BaseComponentInfos.HasError = false
|
||||
return infos, nil
|
||||
}
|
||||
|
|
|
@ -0,0 +1,105 @@
|
|||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the License
|
||||
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
// or implied. See the License for the specific language governing permissions and limitations under the License.
|
||||
|
||||
package datacoord
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"testing"
|
||||
|
||||
"github.com/milvus-io/milvus/internal/proto/commonpb"
|
||||
"github.com/milvus-io/milvus/internal/proto/milvuspb"
|
||||
"github.com/milvus-io/milvus/internal/types"
|
||||
"github.com/milvus-io/milvus/internal/util/metricsinfo"
|
||||
"github.com/milvus-io/milvus/internal/util/typeutil"
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
type mockMetricDataNodeClient struct {
|
||||
types.DataNode
|
||||
mock func() (*milvuspb.GetMetricsResponse, error)
|
||||
}
|
||||
|
||||
func (c *mockMetricDataNodeClient) GetMetrics(ctx context.Context, req *milvuspb.GetMetricsRequest) (*milvuspb.GetMetricsResponse, error) {
|
||||
if c.mock == nil {
|
||||
return c.DataNode.GetMetrics(ctx, req)
|
||||
}
|
||||
return c.mock()
|
||||
}
|
||||
|
||||
func TestGetDataNodeMetrics(t *testing.T) {
|
||||
svr := newTestServer(t, nil)
|
||||
defer closeTestServer(t, svr)
|
||||
|
||||
ctx := context.Background()
|
||||
req := &milvuspb.GetMetricsRequest{}
|
||||
// nil node
|
||||
_, err := svr.getDataNodeMetrics(ctx, req, nil)
|
||||
assert.NotNil(t, err)
|
||||
|
||||
// nil client node
|
||||
_, err = svr.getDataNodeMetrics(ctx, req, &NodeInfo{})
|
||||
assert.NotNil(t, err)
|
||||
|
||||
client, err := newMockDataNodeClient(100, nil)
|
||||
assert.Nil(t, err)
|
||||
// mock datanode client
|
||||
info, err := svr.getDataNodeMetrics(ctx, req, &NodeInfo{
|
||||
client: client,
|
||||
})
|
||||
assert.Nil(t, err)
|
||||
assert.False(t, info.HasError)
|
||||
assert.Equal(t, metricsinfo.ConstructComponentName(typeutil.DataNodeRole, client.id), info.BaseComponentInfos.Name)
|
||||
|
||||
// mock grpc return error
|
||||
mock := &mockMetricDataNodeClient{DataNode: client}
|
||||
mock.mock = func() (*milvuspb.GetMetricsResponse, error) {
|
||||
return nil, errors.New("mocked fail")
|
||||
}
|
||||
info, err = svr.getDataNodeMetrics(ctx, req, &NodeInfo{
|
||||
client: mock,
|
||||
})
|
||||
assert.Nil(t, err)
|
||||
assert.True(t, info.HasError)
|
||||
|
||||
// mock status not success
|
||||
mock.mock = func() (*milvuspb.GetMetricsResponse, error) {
|
||||
return &milvuspb.GetMetricsResponse{
|
||||
Status: &commonpb.Status{
|
||||
ErrorCode: commonpb.ErrorCode_UnexpectedError,
|
||||
Reason: "mocked error",
|
||||
},
|
||||
}, nil
|
||||
}
|
||||
|
||||
info, err = svr.getDataNodeMetrics(ctx, req, &NodeInfo{
|
||||
client: mock,
|
||||
})
|
||||
assert.Nil(t, err)
|
||||
assert.True(t, info.HasError)
|
||||
assert.Equal(t, "mocked error", info.ErrorReason)
|
||||
|
||||
// mock parse error
|
||||
mock.mock = func() (*milvuspb.GetMetricsResponse, error) {
|
||||
return &milvuspb.GetMetricsResponse{
|
||||
Status: &commonpb.Status{
|
||||
ErrorCode: commonpb.ErrorCode_Success,
|
||||
},
|
||||
Response: `{"error_reason": 1}`,
|
||||
}, nil
|
||||
}
|
||||
|
||||
info, err = svr.getDataNodeMetrics(ctx, req, &NodeInfo{
|
||||
client: mock,
|
||||
})
|
||||
assert.Nil(t, err)
|
||||
assert.True(t, info.HasError)
|
||||
|
||||
}
|
|
@ -153,7 +153,7 @@ func (c *mockDataNodeClient) FlushSegments(ctx context.Context, in *datapb.Flush
|
|||
|
||||
func (c *mockDataNodeClient) GetMetrics(ctx context.Context, req *milvuspb.GetMetricsRequest) (*milvuspb.GetMetricsResponse, error) {
|
||||
// TODO(dragondriver): change the id, though it's not important in ut
|
||||
nodeID := UniqueID(20210819)
|
||||
nodeID := UniqueID(c.id)
|
||||
|
||||
nodeInfos := metricsinfo.DataNodeInfos{
|
||||
BaseComponentInfos: metricsinfo.BaseComponentInfos{
|
||||
|
|
Loading…
Reference in New Issue