Add datacoord metricsinfo unit test (#7595)

Signed-off-by: Congqi Xia <congqi.xia@zilliz.com>
pull/7595/merge
congqixia 2021-09-09 10:16:00 +08:00 committed by GitHub
parent 762060e670
commit c96b19a640
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 202 additions and 96 deletions

View File

@ -13,6 +13,7 @@ package datacoord
import (
"context"
"errors"
"os"
"github.com/milvus-io/milvus/internal/util/typeutil"
@ -26,101 +27,33 @@ import (
"github.com/milvus-io/milvus/internal/util/metricsinfo"
)
// TODO(dragondriver): add more detail metrics
// getSystemInfoMetrics compose data cluster metrics
func (s *Server) getSystemInfoMetrics(
ctx context.Context,
req *milvuspb.GetMetricsRequest,
) (*milvuspb.GetMetricsResponse, error) {
// TODO(dragondriver): add more detail metrics
// get datacoord info
nodes := s.cluster.GetNodes()
clusterTopology := metricsinfo.DataClusterTopology{
Self: metricsinfo.DataCoordInfos{
BaseComponentInfos: metricsinfo.BaseComponentInfos{
Name: metricsinfo.ConstructComponentName(typeutil.DataCoordRole, Params.NodeID),
HardwareInfos: metricsinfo.HardwareMetrics{
IP: s.session.Address,
CPUCoreCount: metricsinfo.GetCPUCoreCount(false),
CPUCoreUsage: metricsinfo.GetCPUUsage(),
Memory: metricsinfo.GetMemoryCount(),
MemoryUsage: metricsinfo.GetUsedMemoryCount(),
Disk: metricsinfo.GetDiskCount(),
DiskUsage: metricsinfo.GetDiskUsage(),
},
SystemInfo: metricsinfo.DeployMetrics{
SystemVersion: os.Getenv(metricsinfo.GitCommitEnvKey),
DeployMode: os.Getenv(metricsinfo.DeployModeEnvKey),
},
// TODO(dragondriver): CreatedTime & UpdatedTime, easy but time-costing
Type: typeutil.DataCoordRole,
},
SystemConfigurations: metricsinfo.DataCoordConfiguration{
SegmentMaxSize: Params.SegmentMaxSize,
},
},
ConnectedNodes: make([]metricsinfo.DataNodeInfos, 0),
Self: s.getDataCoordMetrics(),
ConnectedNodes: make([]metricsinfo.DataNodeInfos, 0, len(nodes)),
}
nodes := s.cluster.GetNodes()
// for each data node, fetch metrics info
log.Debug("datacoord.getSystemInfoMetrics",
zap.Int("data nodes num", len(nodes)))
for _, node := range nodes {
if node == nil {
log.Warn("skip invalid data node",
zap.String("reason", "datanode is nil"))
continue
}
if node.GetClient() == nil {
log.Warn("skip invalid data node",
zap.String("reason", "datanode client is nil"))
continue
}
metrics, err := node.GetClient().GetMetrics(ctx, req)
infos, err := s.getDataNodeMetrics(ctx, req, node)
if err != nil {
log.Warn("invalid metrics of query node was found",
zap.Error(err))
clusterTopology.ConnectedNodes = append(clusterTopology.ConnectedNodes, metricsinfo.DataNodeInfos{
BaseComponentInfos: metricsinfo.BaseComponentInfos{
HasError: true,
ErrorReason: err.Error(),
// Name doesn't matter here cause we can't get it when error occurs, using address as the Name?
Name: "",
},
})
continue
}
if metrics.Status.ErrorCode != commonpb.ErrorCode_Success {
log.Warn("invalid metrics of query node was found",
zap.Any("error_code", metrics.Status.ErrorCode),
zap.Any("error_reason", metrics.Status.Reason))
clusterTopology.ConnectedNodes = append(clusterTopology.ConnectedNodes, metricsinfo.DataNodeInfos{
BaseComponentInfos: metricsinfo.BaseComponentInfos{
HasError: true,
ErrorReason: metrics.Status.Reason,
Name: metrics.ComponentName,
},
})
continue
}
infos := metricsinfo.DataNodeInfos{}
err = metricsinfo.UnmarshalComponentInfos(metrics.Response, &infos)
if err != nil {
log.Warn("invalid metrics of query node was found",
zap.Error(err))
clusterTopology.ConnectedNodes = append(clusterTopology.ConnectedNodes, metricsinfo.DataNodeInfos{
BaseComponentInfos: metricsinfo.BaseComponentInfos{
HasError: true,
ErrorReason: err.Error(),
Name: metrics.ComponentName,
},
})
log.Warn("fails to get datanode metrics", zap.Error(err))
continue
}
clusterTopology.ConnectedNodes = append(clusterTopology.ConnectedNodes, infos)
}
// compose topology struct
coordTopology := metricsinfo.DataCoordTopology{
Cluster: clusterTopology,
Connections: metricsinfo.ConnTopology{
@ -130,24 +63,92 @@ func (s *Server) getSystemInfoMetrics(
},
}
resp, err := metricsinfo.MarshalTopology(coordTopology)
resp := &milvuspb.GetMetricsResponse{
Status: &commonpb.Status{
ErrorCode: commonpb.ErrorCode_UnexpectedError,
},
Response: "",
ComponentName: metricsinfo.ConstructComponentName(typeutil.DataCoordRole, Params.NodeID),
}
var err error
resp.Response, err = metricsinfo.MarshalTopology(coordTopology)
if err != nil {
return &milvuspb.GetMetricsResponse{
Status: &commonpb.Status{
ErrorCode: commonpb.ErrorCode_UnexpectedError,
Reason: err.Error(),
},
Response: "",
ComponentName: metricsinfo.ConstructComponentName(typeutil.DataCoordRole, Params.NodeID),
}, nil
resp.Status.Reason = err.Error()
return resp, nil
}
return &milvuspb.GetMetricsResponse{
Status: &commonpb.Status{
ErrorCode: commonpb.ErrorCode_Success,
Reason: "",
},
Response: resp,
ComponentName: metricsinfo.ConstructComponentName(typeutil.DataCoordRole, Params.NodeID),
}, nil
resp.Status.ErrorCode = commonpb.ErrorCode_Success
return resp, nil
}
// getDataCoordMetrics builds the metrics entry that describes this datacoord itself.
// The entry carries the component name derived from Params.NodeID, the session
// address together with the CPU / memory / disk figures reported by the metricsinfo
// helpers, the deploy information read from the environment, and the configured
// segment max size.
func (s *Server) getDataCoordMetrics() metricsinfo.DataCoordInfos {
	componentName := metricsinfo.ConstructComponentName(typeutil.DataCoordRole, Params.NodeID)

	hardware := metricsinfo.HardwareMetrics{
		IP:           s.session.Address,
		CPUCoreCount: metricsinfo.GetCPUCoreCount(false),
		CPUCoreUsage: metricsinfo.GetCPUUsage(),
		Memory:       metricsinfo.GetMemoryCount(),
		MemoryUsage:  metricsinfo.GetUsedMemoryCount(),
		Disk:         metricsinfo.GetDiskCount(),
		DiskUsage:    metricsinfo.GetDiskUsage(),
	}

	// version and deploy mode are taken from environment variables
	deploy := metricsinfo.DeployMetrics{
		SystemVersion: os.Getenv(metricsinfo.GitCommitEnvKey),
		DeployMode:    os.Getenv(metricsinfo.DeployModeEnvKey),
	}

	base := metricsinfo.BaseComponentInfos{
		Name:          componentName,
		HardwareInfos: hardware,
		SystemInfo:    deploy,
		// TODO(dragondriver): CreatedTime & UpdatedTime, easy but time-costing
		Type: typeutil.DataCoordRole,
	}

	return metricsinfo.DataCoordInfos{
		BaseComponentInfos: base,
		SystemConfigurations: metricsinfo.DataCoordConfiguration{
			SegmentMaxSize: Params.SegmentMaxSize,
		},
	}
}
// getDataNodeMetrics asks a single data node for its metrics by calling GetMetrics
// on the client held by node, and turns the outcome into a DataNodeInfos entry.
//
// A non-nil error is returned only when node, or the client it holds, is nil.
// A failed GetMetrics call, a non-success status, or a response that cannot be
// decoded is logged and reported in-band instead: the returned entry keeps
// HasError set, carries the reason in ErrorReason, and the error result is nil.
func (s *Server) getDataNodeMetrics(ctx context.Context, req *milvuspb.GetMetricsRequest, node *NodeInfo) (metricsinfo.DataNodeInfos, error) {
	// start pessimistic; HasError is cleared only after the response is decoded
	result := metricsinfo.DataNodeInfos{
		BaseComponentInfos: metricsinfo.BaseComponentInfos{
			HasError: true,
		},
	}
	base := &result.BaseComponentInfos

	switch {
	case node == nil:
		return result, errors.New("datanode is nil")
	case node.GetClient() == nil:
		return result, errors.New("datanode client is nil")
	}

	resp, rpcErr := node.GetClient().GetMetrics(ctx, req)
	if rpcErr != nil {
		log.Warn("invalid metrics of data node was found",
			zap.Error(rpcErr))
		base.ErrorReason = rpcErr.Error()
		// the failure is carried by result, so no error is propagated
		return result, nil
	}

	base.Name = resp.GetComponentName()

	status := resp.GetStatus()
	if status.GetErrorCode() != commonpb.ErrorCode_Success {
		log.Warn("invalid metrics of data node was found",
			zap.Any("error_code", status.ErrorCode),
			zap.Any("error_reason", status.Reason))
		base.ErrorReason = status.GetReason()
		return result, nil
	}

	if decodeErr := metricsinfo.UnmarshalComponentInfos(resp.GetResponse(), &result); decodeErr != nil {
		log.Warn("invalid metrics of data node was found",
			zap.Error(decodeErr))
		base.ErrorReason = decodeErr.Error()
		return result, nil
	}

	base.HasError = false
	return result, nil
}

View File

@ -0,0 +1,105 @@
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License.
package datacoord
import (
"context"
"errors"
"testing"
"github.com/milvus-io/milvus/internal/proto/commonpb"
"github.com/milvus-io/milvus/internal/proto/milvuspb"
"github.com/milvus-io/milvus/internal/types"
"github.com/milvus-io/milvus/internal/util/metricsinfo"
"github.com/milvus-io/milvus/internal/util/typeutil"
"github.com/stretchr/testify/assert"
)
// mockMetricDataNodeClient wraps a types.DataNode so a test can replace the
// result of GetMetrics while every other method comes from the embedded value.
type mockMetricDataNodeClient struct {
// embedded DataNode; GetMetrics delegates to it when mock is nil
types.DataNode
// mock, when non-nil, supplies the GetMetrics result instead of the embedded DataNode
mock func() (*milvuspb.GetMetricsResponse, error)
}
// GetMetrics returns the result of the configured mock function when one is set;
// otherwise the call is forwarded unchanged to the embedded DataNode.
func (c *mockMetricDataNodeClient) GetMetrics(ctx context.Context, req *milvuspb.GetMetricsRequest) (*milvuspb.GetMetricsResponse, error) {
	if override := c.mock; override != nil {
		return override()
	}
	return c.DataNode.GetMetrics(ctx, req)
}
// TestGetDataNodeMetrics walks getDataNodeMetrics through each of its outcomes:
// nil node, node without client, a healthy mocked data node, a failing GetMetrics
// call, a non-success status, and a response body that cannot be decoded.
func TestGetDataNodeMetrics(t *testing.T) {
	server := newTestServer(t, nil)
	defer closeTestServer(t, server)

	ctx := context.Background()
	request := &milvuspb.GetMetricsRequest{}

	// a nil node yields an error
	_, err := server.getDataNodeMetrics(ctx, request, nil)
	assert.NotNil(t, err)

	// a node that holds no client yields an error
	_, err = server.getDataNodeMetrics(ctx, request, &NodeInfo{})
	assert.NotNil(t, err)

	dataNode, err := newMockDataNodeClient(100, nil)
	assert.Nil(t, err)

	// plain mocked datanode client: metrics decode cleanly
	got, err := server.getDataNodeMetrics(ctx, request, &NodeInfo{client: dataNode})
	assert.Nil(t, err)
	assert.False(t, got.HasError)
	assert.Equal(t, metricsinfo.ConstructComponentName(typeutil.DataNodeRole, dataNode.id), got.BaseComponentInfos.Name)

	// the GetMetrics call itself fails
	stub := &mockMetricDataNodeClient{
		DataNode: dataNode,
		mock: func() (*milvuspb.GetMetricsResponse, error) {
			return nil, errors.New("mocked fail")
		},
	}
	got, err = server.getDataNodeMetrics(ctx, request, &NodeInfo{client: stub})
	assert.Nil(t, err)
	assert.True(t, got.HasError)

	// the response carries a non-success status
	failedResp := &milvuspb.GetMetricsResponse{
		Status: &commonpb.Status{
			ErrorCode: commonpb.ErrorCode_UnexpectedError,
			Reason:    "mocked error",
		},
	}
	stub.mock = func() (*milvuspb.GetMetricsResponse, error) {
		return failedResp, nil
	}
	got, err = server.getDataNodeMetrics(ctx, request, &NodeInfo{client: stub})
	assert.Nil(t, err)
	assert.True(t, got.HasError)
	assert.Equal(t, "mocked error", got.ErrorReason)

	// the status is success but the body cannot be decoded
	malformedResp := &milvuspb.GetMetricsResponse{
		Status: &commonpb.Status{
			ErrorCode: commonpb.ErrorCode_Success,
		},
		Response: `{"error_reason": 1}`,
	}
	stub.mock = func() (*milvuspb.GetMetricsResponse, error) {
		return malformedResp, nil
	}
	got, err = server.getDataNodeMetrics(ctx, request, &NodeInfo{client: stub})
	assert.Nil(t, err)
	assert.True(t, got.HasError)
}

View File

@ -153,7 +153,7 @@ func (c *mockDataNodeClient) FlushSegments(ctx context.Context, in *datapb.Flush
func (c *mockDataNodeClient) GetMetrics(ctx context.Context, req *milvuspb.GetMetricsRequest) (*milvuspb.GetMetricsResponse, error) {
// TODO(dragondriver): change the id, though it's not important in ut
nodeID := UniqueID(20210819)
nodeID := UniqueID(c.id)
nodeInfos := metricsinfo.DataNodeInfos{
BaseComponentInfos: metricsinfo.BaseComponentInfos{