milvus/internal/dataservice/server.go

465 lines
14 KiB
Go
Raw Normal View History

package dataservice
import (
"context"
"fmt"
"log"
"sync"
"github.com/zilliztech/milvus-distributed/internal/msgstream/util"
"github.com/zilliztech/milvus-distributed/internal/msgstream"
"github.com/zilliztech/milvus-distributed/internal/msgstream/pulsarms"
"github.com/zilliztech/milvus-distributed/internal/distributed/masterservice"
"github.com/zilliztech/milvus-distributed/internal/proto/milvuspb"
"github.com/zilliztech/milvus-distributed/internal/timesync"
etcdkv "github.com/zilliztech/milvus-distributed/internal/kv/etcd"
"go.etcd.io/etcd/clientv3"
"github.com/zilliztech/milvus-distributed/internal/proto/commonpb"
"github.com/zilliztech/milvus-distributed/internal/proto/datapb"
"github.com/zilliztech/milvus-distributed/internal/proto/internalpb2"
"github.com/zilliztech/milvus-distributed/internal/util/typeutil"
)
// role is the role name reported for this component by GetComponentStates.
const role = "dataservice"
// DataService is the interface exposed by the data service component.
// It embeds the generic typeutil.Service and typeutil.Component interfaces
// and adds the data-service specific calls below. Server is the
// implementation found in this file.
type DataService interface {
typeutil.Service
typeutil.Component
// RegisterNode registers a data node and returns the init params
// (channel names) that the node needs to start.
RegisterNode(req *datapb.RegisterNodeRequest) (*datapb.RegisterNodeResponse, error)
// Flush seals all segments of the requested collection.
Flush(req *datapb.FlushRequest) (*commonpb.Status, error)
// AssignSegmentID allocates a segment for every entry of the request and
// reports a per-entry status.
AssignSegmentID(req *datapb.AssignSegIDRequest) (*datapb.AssignSegIDResponse, error)
// ShowSegments lists the segment IDs of a collection partition.
ShowSegments(req *datapb.ShowSegmentRequest) (*datapb.ShowSegmentResponse, error)
// GetSegmentStates returns the state and the open/sealed/flushed times of a segment.
GetSegmentStates(req *datapb.SegmentStatesRequest) (*datapb.SegmentStatesResponse, error)
// GetInsertBinlogPaths is not yet implemented by Server.
GetInsertBinlogPaths(req *datapb.InsertBinlogPathRequest) (*datapb.InsertBinlogPathsResponse, error)
// GetSegmentInfoChannel returns the name of the segment info channel.
GetSegmentInfoChannel() (string, error)
// GetInsertChannels returns the insert channel names of a collection,
// allocating them on first request.
GetInsertChannels(req *datapb.InsertChannelRequest) ([]string, error)
// GetCollectionStatistics is a stub in Server (todo).
GetCollectionStatistics(req *datapb.CollectionStatsRequest) (*datapb.CollectionStatsResponse, error)
// GetPartitionStatistics is a stub in Server (todo).
GetPartitionStatistics(req *datapb.PartitionStatsRequest) (*datapb.PartitionStatsResponse, error)
// GetComponentStates reports the state of the service itself and of its data nodes.
GetComponentStates() (*internalpb2.ComponentStates, error)
}
type (
// UniqueID is an alias of typeutil.UniqueID.
UniqueID = typeutil.UniqueID
// Timestamp is an alias of typeutil.Timestamp.
Timestamp = typeutil.Timestamp
// Server is the data service implementation. It is built by CreateServer,
// wired up in Start and torn down in Stop.
Server struct {
// ctx is the context handed to CreateServer; message streams and the
// server loop context are derived from it.
ctx context.Context
// serverLoopCtx / serverLoopCancel are created in startServerLoop and
// cancelled in stopServerLoop.
serverLoopCtx context.Context
serverLoopCancel context.CancelFunc
// serverLoopWg tracks the goroutine started by startServerLoop.
serverLoopWg sync.WaitGroup
// state is INITIALIZING after CreateServer and HEALTHY once Start succeeds.
state internalpb2.StateCode
// client is the etcd-backed KV created in initMeta.
client *etcdkv.EtcdKV
// meta is built on top of client in initMeta.
meta *meta
segAllocator segmentAllocator
// statsHandler processes segment statistics read from the statistics channel.
statsHandler *statsHandler
insertChannelMgr *insertChannelManager
// allocator is created from masterClient in Start and used to allocate IDs.
allocator allocator
// cluster holds the registered data nodes.
cluster *dataNodeCluster
// msgProducer is the time sync message producer started in initMsgProducer.
msgProducer *timesync.MsgProducer
// registerFinishCh is shared with cluster; Start blocks on it in
// waitDataNodeRegister. Presumably the cluster signals it once the
// expected data nodes have registered — confirm in dataNodeCluster.
registerFinishCh chan struct{}
masterClient *masterservice.GrpcClient
// ttMsgStream consumes the time tick channel.
ttMsgStream msgstream.MsgStream
// ddChannelName is fetched from the master on the first RegisterNode call
// and cached afterwards.
ddChannelName string
// segmentInfoStream produces to the segment info channel.
segmentInfoStream msgstream.MsgStream
}
)
// CreateServer builds a Server in the INITIALIZING state.
//
// The returned server keeps ctx and the master client for later use and
// shares one signalling channel between itself and its data node cluster.
// No connection is opened here; that happens in Start. The returned error
// is always nil.
func CreateServer(ctx context.Context, client *masterservice.GrpcClient) (*Server, error) {
	registered := make(chan struct{})
	srv := &Server{
		ctx:              ctx,
		state:            internalpb2.StateCode_INITIALIZING,
		registerFinishCh: registered,
		masterClient:     client,
	}
	srv.insertChannelMgr = newInsertChannelManager()
	// The cluster receives the same channel that Start later waits on.
	srv.cluster = newDataNodeCluster(registered)
	return srv, nil
}
// Init initializes the package-level Params by calling Params.Init.
// It always returns nil.
func (s *Server) Init() error {
Params.Init()
return nil
}
// Start brings the server up and marks it HEALTHY.
//
// The steps run strictly in this order: create the ID allocator, open the
// meta store, create the stats handler and segment allocator, block until
// data nodes have registered, load collection meta from the master, start
// the time tick producer, open the segment info channel and finally launch
// the background server loop. The first failing step aborts Start and its
// error is returned; nothing that was already started is rolled back here.
func (s *Server) Start() error {
	s.allocator = newAllocatorImpl(s.masterClient)

	err := s.initMeta()
	if err != nil {
		return err
	}
	s.statsHandler = newStatsHandler(s.meta)

	allocator, err := newSegmentAllocator(s.meta, s.allocator)
	if err != nil {
		return err
	}
	s.segAllocator = allocator

	// Blocks until registerFinishCh is signalled.
	s.waitDataNodeRegister()

	for _, step := range []func() error{s.loadMetaFromMaster, s.initMsgProducer} {
		if err := step(); err != nil {
			return err
		}
	}

	s.initSegmentInfoChannel()
	s.startServerLoop()
	s.state = internalpb2.StateCode_HEALTHY
	log.Println("start success")
	return nil
}
// initMeta connects to etcd at Params.EtcdAddress, wraps the connection in
// an etcd KV rooted at Params.MetaRootPath and builds the meta on top of it.
// It returns the error of the etcd client or of newMeta, if any.
func (s *Server) initMeta() error {
	cfg := clientv3.Config{Endpoints: []string{Params.EtcdAddress}}
	cli, err := clientv3.New(cfg)
	if err != nil {
		return err
	}
	kv := etcdkv.NewEtcdKV(cli, Params.MetaRootPath)
	s.client = kv
	s.meta, err = newMeta(kv)
	return err
}
// waitDataNodeRegister blocks until registerFinishCh yields (a value is sent
// on it or it is closed), logging before and after the wait. It does not
// watch s.ctx, so cancelling the context does not interrupt the wait.
func (s *Server) waitDataNodeRegister() {
log.Println("waiting data node to register")
<-s.registerFinishCh
log.Println("all data nodes register")
}
// initMsgProducer starts the time tick consumer stream and the time sync
// message producer that is driven by it.
//
// The stream subscribes to Params.TimeTickChannelName; a hard time tick
// barrier over the currently known data node IDs feeds the producer, whose
// only watcher is the data node time tick watcher. An error is returned
// only when the producer cannot be created.
func (s *Server) initMsgProducer() error {
	stream := pulsarms.NewPulsarTtMsgStream(s.ctx, 1024)
	stream.SetPulsarClient(Params.PulsarAddress)
	channels := []string{Params.TimeTickChannelName}
	stream.CreatePulsarConsumers(channels, Params.DataServiceSubscriptionName, util.NewUnmarshalDispatcher(), 1024)
	s.ttMsgStream = stream
	s.ttMsgStream.Start()

	barrier := timesync.NewHardTimeTickBarrier(s.ttMsgStream, s.cluster.GetNodeIDs())
	watcher := newDataNodeTimeTickWatcher(s.meta, s.segAllocator, s.cluster)
	producer, err := timesync.NewTimeSyncMsgProducer(barrier, watcher)
	if err != nil {
		return err
	}
	s.msgProducer = producer
	producer.Start(s.ctx)
	return nil
}
// startServerLoop derives a cancellable context from s.ctx, stores it with
// its cancel func on the server and launches the statistics consumer
// goroutine, which is tracked by serverLoopWg.
func (s *Server) startServerLoop() {
	loopCtx, cancel := context.WithCancel(s.ctx)
	s.serverLoopCtx = loopCtx
	s.serverLoopCancel = cancel

	// Add before `go` so stopServerLoop's Wait always sees the goroutine.
	s.serverLoopWg.Add(1)
	go s.startStatsChannel(loopCtx)
}
// startStatsChannel consumes Params.StatisticsChannelName until ctx is
// cancelled and forwards every segment statistic to the stats handler.
//
// It is meant to run as a goroutine started by startServerLoop: it marks
// serverLoopWg done on exit and closes the stream it opened. Handler errors
// are logged and do not stop the loop. Messages that are not segment
// statistics are logged and skipped instead of panicking.
func (s *Server) startStatsChannel(ctx context.Context) {
	defer s.serverLoopWg.Done()
	statsStream := pulsarms.NewPulsarMsgStream(ctx, 1024)
	statsStream.SetPulsarClient(Params.PulsarAddress)
	statsStream.CreatePulsarConsumers([]string{Params.StatisticsChannelName}, Params.DataServiceSubscriptionName, util.NewUnmarshalDispatcher(), 1024)
	statsStream.Start()
	defer statsStream.Close()
	for {
		select {
		case <-ctx.Done():
			return
		default:
		}
		msgPack := statsStream.Consume()
		if msgPack == nil {
			// NOTE(review): Consume appears to hand back a nil pack when the
			// stream's context is cancelled — confirm. Go back to the ctx
			// check rather than dereferencing nil.
			continue
		}
		for _, msg := range msgPack.Msgs {
			statistics, ok := msg.(*msgstream.SegmentStatisticsMsg)
			if !ok {
				log.Printf("unexpected message type %T on statistics channel, skipped", msg)
				continue
			}
			for _, stat := range statistics.SegStats {
				if err := s.statsHandler.HandleSegmentStat(stat); err != nil {
					log.Println(err.Error())
				}
			}
		}
	}
}
// initSegmentInfoChannel opens a producer stream on
// Params.SegmentInfoChannelName, starts it and keeps it on the server as
// segmentInfoStream.
func (s *Server) initSegmentInfoChannel() {
	stream := pulsarms.NewPulsarMsgStream(s.ctx, 1024)
	stream.SetPulsarClient(Params.PulsarAddress)
	targets := []string{Params.SegmentInfoChannelName}
	stream.CreatePulsarProducers(targets)
	s.segmentInfoStream = stream
	s.segmentInfoStream.Start()
}
// loadMetaFromMaster copies collection meta from the master into s.meta.
//
// It lists all collections, then for each one describes it, lists its
// partitions and adds the result to the meta. A failure to list the
// collections is returned. A failure on an individual collection is only
// logged and that collection is skipped, so the load is best-effort.
func (s *Server) loadMetaFromMaster() error {
	log.Println("loading collection meta from master")

	// msgBase builds the request header; the IDs and timestamp are placeholders.
	msgBase := func(msgType commonpb.MsgType) *commonpb.MsgBase {
		return &commonpb.MsgBase{
			MsgType:   msgType,
			MsgID:     -1, // todo add msg id
			Timestamp: 0,  // todo
			SourceID:  -1, // todo
		}
	}

	listing, err := s.masterClient.ShowCollections(&milvuspb.ShowCollectionRequest{
		Base:   msgBase(commonpb.MsgType_kShowCollections),
		DbName: "",
	})
	if err != nil {
		return err
	}

	for _, name := range listing.CollectionNames {
		described, err := s.masterClient.DescribeCollection(&milvuspb.DescribeCollectionRequest{
			Base:           msgBase(commonpb.MsgType_kDescribeCollection),
			DbName:         "",
			CollectionName: name,
		})
		if err != nil {
			log.Println(err.Error())
			continue
		}

		parts, err := s.masterClient.ShowPartitions(&milvuspb.ShowPartitionRequest{
			Base:           msgBase(commonpb.MsgType_kShowPartitions),
			DbName:         "",
			CollectionName: name,
			CollectionID:   described.CollectionID,
		})
		if err != nil {
			log.Println(err.Error())
			continue
		}

		info := &collectionInfo{
			ID:         described.CollectionID,
			Schema:     described.Schema,
			Partitions: parts.PartitionIDs,
		}
		if err := s.meta.AddCollection(info); err != nil {
			log.Println(err.Error())
		}
	}

	log.Println("load collection meta from master complete")
	return nil
}
// Stop shuts the server down: it closes the time tick stream, the time sync
// producer and the segment info stream, then cancels the server loop and
// waits for it to finish. It always returns nil.
//
// Every component is created lazily in Start, so each one is checked for
// nil first. That makes Stop safe to call after a Start that failed partway
// or was never called, where the unguarded calls would panic.
func (s *Server) Stop() error {
	if s.ttMsgStream != nil {
		s.ttMsgStream.Close()
	}
	if s.msgProducer != nil {
		s.msgProducer.Close()
	}
	if s.segmentInfoStream != nil {
		s.segmentInfoStream.Close()
	}
	// serverLoopCancel is only set once startServerLoop has run.
	if s.serverLoopCancel != nil {
		s.stopServerLoop()
	}
	return nil
}
// stopServerLoop cancels the server loop context and blocks until every
// goroutine tracked by serverLoopWg has returned. It requires that
// startServerLoop has run: serverLoopCancel is nil before that.
func (s *Server) stopServerLoop() {
s.serverLoopCancel()
s.serverLoopWg.Wait()
}
// GetComponentStates reports the state of the data service and of its data
// nodes.
//
// The response always carries this component's own info (node ID, role and
// state code). When the data node states cannot be fetched, the status stays
// UNEXPECTED_ERROR with the failure as reason; otherwise the node states are
// attached and the status is SUCCESS. The returned error is always nil.
func (s *Server) GetComponentStates() (*internalpb2.ComponentStates, error) {
	self := &internalpb2.ComponentInfo{
		NodeID:    Params.NodeID,
		Role:      role,
		StateCode: s.state,
	}
	status := &commonpb.Status{ErrorCode: commonpb.ErrorCode_UNEXPECTED_ERROR}
	resp := &internalpb2.ComponentStates{State: self, Status: status}

	nodeStates, err := s.cluster.GetDataNodeStates()
	if err == nil {
		resp.SubcomponentStates = nodeStates
		status.ErrorCode = commonpb.ErrorCode_SUCCESS
	} else {
		status.Reason = err.Error()
	}
	return resp, nil
}
// GetTimeTickChannel returns Params.TimeTickChannelName. The error is always nil.
func (s *Server) GetTimeTickChannel() (string, error) {
return Params.TimeTickChannelName, nil
}
// GetStatisticsChannel returns Params.StatisticsChannelName. The error is always nil.
func (s *Server) GetStatisticsChannel() (string, error) {
return Params.StatisticsChannelName, nil
}
// RegisterNode adds a data node to the cluster and tells it which channels
// to use.
//
// The node is registered with the address and source ID from req. The DD
// channel name is fetched from the master on first use and cached on the
// server. If that fetch fails, the response carries UNEXPECTED_ERROR with
// the reason, and the error is also returned. On success the response
// carries SUCCESS and the init params (DD, statistics, time tick and
// complete-flush channel names).
func (s *Server) RegisterNode(req *datapb.RegisterNodeRequest) (*datapb.RegisterNodeResponse, error) {
	status := &commonpb.Status{ErrorCode: commonpb.ErrorCode_UNEXPECTED_ERROR}
	resp := &datapb.RegisterNodeResponse{Status: status}

	addr := req.Address
	s.cluster.Register(addr.Ip, addr.Port, req.Base.SourceID)

	if len(s.ddChannelName) == 0 {
		name, err := s.masterClient.GetDdChannel()
		if err != nil {
			status.Reason = err.Error()
			return resp, err
		}
		s.ddChannelName = name
	}

	status.ErrorCode = commonpb.ErrorCode_SUCCESS
	startParams := []*commonpb.KeyValuePair{
		{Key: "DDChannelName", Value: s.ddChannelName},
		{Key: "SegmentStatisticsChannelName", Value: Params.StatisticsChannelName},
		{Key: "TimeTickChannelName", Value: Params.TimeTickChannelName},
		{Key: "CompleteFlushChannelName", Value: Params.SegmentInfoChannelName},
	}
	resp.InitParams = &internalpb2.InitParams{
		NodeID:      Params.NodeID,
		StartParams: startParams,
	}
	return resp, nil
}
// Flush asks the segment allocator to seal all segments of the collection
// named in req and always answers with a SUCCESS status and a nil error.
func (s *Server) Flush(req *datapb.FlushRequest) (*commonpb.Status, error) {
	collectionID := req.CollectionID
	s.segAllocator.SealAllSegments(collectionID)

	status := &commonpb.Status{ErrorCode: commonpb.ErrorCode_SUCCESS}
	return status, nil
}
// AssignSegmentID allocates a segment for every entry in req.SegIDRequests.
//
// The overall response status is always SUCCESS and the returned error is
// always nil; failures are reported per entry. Each entry yields exactly
// one assignment, in request order. When the allocator reports that the
// remaining space is insufficient, a new segment is opened and the
// allocation is retried once. Any other allocation error, a failure to
// open the segment, or a failed retry leaves that entry with
// UNEXPECTED_ERROR and a reason.
func (s *Server) AssignSegmentID(req *datapb.AssignSegIDRequest) (*datapb.AssignSegIDResponse, error) {
	assignments := make([]*datapb.SegIDAssignment, 0)

	for _, r := range req.SegIDRequests {
		// Append up front: every path below contributes exactly this one entry.
		entry := &datapb.SegIDAssignment{
			Status: &commonpb.Status{
				ErrorCode: commonpb.ErrorCode_UNEXPECTED_ERROR,
			},
		}
		assignments = append(assignments, entry)

		segID, granted, expireTs, err := s.segAllocator.AllocSegment(r.CollectionID, r.PartitionID, r.ChannelName, int(r.Count))
		if err != nil {
			if _, insufficient := err.(errRemainInSufficient); !insufficient {
				entry.Status.Reason = fmt.Sprintf("allocation of Collection %d, Partition %d, Channel %s, Count %d error: %s",
					r.CollectionID, r.PartitionID, r.ChannelName, r.Count, err.Error())
				continue
			}

			// Out of room in the open segments: open a new one and retry once.
			log.Printf("no enough space for allocation of Collection %d, Partition %d, Channel %s, Count %d",
				r.CollectionID, r.PartitionID, r.ChannelName, r.Count)
			if err = s.openNewSegment(r.CollectionID, r.PartitionID, r.ChannelName); err != nil {
				entry.Status.Reason = fmt.Sprintf("open new segment of Collection %d, Partition %d, Channel %s, Count %d error: %s",
					r.CollectionID, r.PartitionID, r.ChannelName, r.Count, err.Error())
				continue
			}
			segID, granted, expireTs, err = s.segAllocator.AllocSegment(r.CollectionID, r.PartitionID, r.ChannelName, int(r.Count))
			if err != nil {
				entry.Status.Reason = fmt.Sprintf("retry allocation of Collection %d, Partition %d, Channel %s, Count %d error: %s",
					r.CollectionID, r.PartitionID, r.ChannelName, r.Count, err.Error())
				continue
			}
		}

		entry.Status.ErrorCode = commonpb.ErrorCode_SUCCESS
		entry.SegID = segID
		entry.Count = uint32(granted)
		entry.ExpireTime = expireTs
		entry.CollectionID = r.CollectionID
		entry.PartitionID = r.PartitionID
		entry.ChannelName = r.ChannelName
	}

	return &datapb.AssignSegIDResponse{
		Status: &commonpb.Status{
			ErrorCode: commonpb.ErrorCode_SUCCESS,
		},
		SegIDAssignments: assignments,
	}, nil
}
// openNewSegment creates a new segment for the given collection, partition
// and channel, and opens it in the segment allocator.
//
// It looks up the channel group that contains channelName, allocates a
// fresh ID, builds the segment info, records it in the meta and then opens
// it. The first error from any of these steps is returned as is.
func (s *Server) openNewSegment(collectionID UniqueID, partitionID UniqueID, channelName string) error {
	channelGroup, err := s.insertChannelMgr.GetChannelGroup(collectionID, channelName)
	if err != nil {
		return err
	}

	segmentID, err := s.allocator.allocID()
	if err != nil {
		return err
	}

	info, err := BuildSegment(collectionID, partitionID, segmentID, channelGroup)
	if err != nil {
		return err
	}

	// Persist to meta first; the allocator only sees segments that meta accepted.
	if err := s.meta.AddSegment(info); err != nil {
		return err
	}
	return s.segAllocator.OpenSegment(info)
}
// ShowSegments returns the IDs of the segments that the meta holds for the
// collection and partition named in req. The error is always nil.
func (s *Server) ShowSegments(req *datapb.ShowSegmentRequest) (*datapb.ShowSegmentResponse, error) {
	resp := &datapb.ShowSegmentResponse{}
	resp.SegmentIDs = s.meta.GetSegmentsByCollectionAndPartitionID(req.CollectionID, req.PartitionID)
	return resp, nil
}
// GetSegmentStates returns the state and the open/sealed/flushed timestamps
// of the segment named in req.
//
// When the segment cannot be found in the meta, the response status is
// UNEXPECTED_ERROR with a reason. Otherwise the fields are filled in and the
// status is SUCCESS. The success path previously never set the status, so
// every reply looked like a failure. The returned error is always nil.
func (s *Server) GetSegmentStates(req *datapb.SegmentStatesRequest) (*datapb.SegmentStatesResponse, error) {
	resp := &datapb.SegmentStatesResponse{
		Status: &commonpb.Status{
			ErrorCode: commonpb.ErrorCode_UNEXPECTED_ERROR,
		},
	}
	segmentInfo, err := s.meta.GetSegment(req.SegmentID)
	if err != nil {
		resp.Status.Reason = "get segment states error: " + err.Error()
		return resp, nil
	}
	resp.Status.ErrorCode = commonpb.ErrorCode_SUCCESS
	resp.State = segmentInfo.State
	resp.CreateTime = segmentInfo.OpenTime
	resp.SealedTime = segmentInfo.SealedTime
	resp.FlushedTime = segmentInfo.FlushedTime
	// TODO start/end positions
	return resp, nil
}
// GetInsertBinlogPaths is not implemented yet.
//
// It returns a nil response and a non-nil error instead of panicking, so a
// call to this handler fails the request without panicking in the serving
// goroutine.
func (s *Server) GetInsertBinlogPaths(req *datapb.InsertBinlogPathRequest) (*datapb.InsertBinlogPathsResponse, error) {
	return nil, fmt.Errorf("get insert binlog paths: not implemented")
}
// GetInsertChannels returns the insert channel names of the collection
// named in req.
//
// If the channel manager already knows the collection, its channels are
// returned directly. Otherwise channels are allocated in groups according
// to the current number of data nodes, the cluster is told to watch those
// groups, and the flattened list of names is returned. An allocation
// failure is returned as the error.
func (s *Server) GetInsertChannels(req *datapb.InsertChannelRequest) ([]string, error) {
	contains, ret := s.insertChannelMgr.ContainsCollection(req.CollectionID)
	if contains {
		return ret, nil
	}
	channelGroups, err := s.insertChannelMgr.AllocChannels(req.CollectionID, s.cluster.GetNumOfNodes())
	if err != nil {
		return nil, err
	}
	// Length 0 with reserved capacity: a non-zero length here would leave
	// that many empty strings in front of the appended channel names.
	channels := make([]string, 0, Params.InsertChannelNumPerCollection)
	for _, group := range channelGroups {
		channels = append(channels, group...)
	}
	s.cluster.WatchInsertChannels(channelGroups)
	return channels, nil
}
// GetCollectionStatistics is a stub: req is ignored and both the response
// and the error are nil.
// NOTE(review): callers receive a nil response together with a nil error,
// so they must not dereference the result until this is implemented.
func (s *Server) GetCollectionStatistics(req *datapb.CollectionStatsRequest) (*datapb.CollectionStatsResponse, error) {
// todo implement
return nil, nil
}
// GetPartitionStatistics is a stub: req is ignored and both the response
// and the error are nil.
// NOTE(review): callers receive a nil response together with a nil error,
// so they must not dereference the result until this is implemented.
func (s *Server) GetPartitionStatistics(req *datapb.PartitionStatsRequest) (*datapb.PartitionStatsResponse, error) {
// todo implement
return nil, nil
}
// GetSegmentInfoChannel returns Params.SegmentInfoChannelName. The error is always nil.
func (s *Server) GetSegmentInfoChannel() (string, error) {
return Params.SegmentInfoChannelName, nil
}