// milvus/internal/querynodev2/segments/metricsutil/observer.go
//
// 272 lines
// 12 KiB
// Go

package metricsutil
import (
"strconv"
"sync"
"time"
"github.com/prometheus/client_golang/prometheus"
"go.uber.org/atomic"
"github.com/milvus-io/milvus/pkg/v2/metrics"
"github.com/milvus-io/milvus/pkg/v2/util/paramtable"
"github.com/milvus-io/milvus/pkg/v2/util/typeutil"
)
// labeledRecord is a single sample point that is attributed to a SegmentLabel.
type labeledRecord interface {
	// Label returns the label the sample is accounted under.
	Label() SegmentLabel

	// Finish finishes the record.
	// NOTE(review): err is presumably the outcome of the recorded operation — confirm
	// against the implementations.
	Finish(err error)

	// getError returns the error carried by the record.
	// The metric system currently rejects every record whose error is non-nil.
	getError() error
}
// Package-level singleton state; read it through getGlobalObserver only.
var (
	// once guards the lazy construction of globalObserver.
	once sync.Once
	// globalObserver is the shared segmentsObserver, built on the first
	// getGlobalObserver call.
	globalObserver *segmentsObserver
)
// getGlobalObserver returns the shared segmentsObserver, creating it on first use.
//
// The first call also starts a background goroutine that, every 15 minutes,
// expires the observers that have not been updated during the last 15 minutes.
// That goroutine is never stopped.
func getGlobalObserver() *segmentsObserver {
	once.Do(func() {
		// idleTTL is both the sweep period and the maximum idle time of an observer.
		const idleTTL = 15 * time.Minute

		globalObserver = newSegmentsObserver()

		sweep := func() {
			tick := time.NewTicker(idleTTL)
			defer tick.Stop()
			for {
				<-tick.C
				globalObserver.Expire(time.Now().Add(-idleTTL))
			}
		}
		go sweep()
	})
	return globalObserver
}
// newSegmentsObserver builds an empty segmentsObserver bound to the current node.
// The node ID is read from paramtable once and kept as a decimal string.
func newSegmentsObserver() *segmentsObserver {
	obs := new(segmentsObserver)
	obs.nodeID = strconv.FormatInt(paramtable.GetNodeID(), 10)
	obs.segments = typeutil.NewConcurrentMap[SegmentLabel, *segmentObserver]()
	return obs
}
// segmentsObserver is an observer of the metrics of all segments.
type segmentsObserver struct {
	// nodeID is the node ID rendered as a decimal string.
	nodeID string

	// segments maps a segment label to the observer of that label.
	//
	// A segment can be removed from a query node (for balancing or compacting);
	// no more search operation is performed on it afterwards, so all related
	// metrics should be expired after a while.
	// The map may be huge, with 100000+ entries.
	segments *typeutil.ConcurrentMap[SegmentLabel, *segmentObserver]
}
// Observe records the sample m under the observer registered for m.Label(),
// registering a new observer when the label is seen for the first time.
// Samples that carry an error are dropped.
func (o *segmentsObserver) Observe(m labeledRecord) {
	// Reject error records.
	// TODO: add error as a label of metrics.
	if err := m.getError(); err != nil {
		return
	}

	key := m.Label()

	// Fast path: the label is already known.
	target, found := o.segments.Get(key)
	if !found {
		// Slow path: publish a fresh observer, or adopt the one that a
		// concurrent caller inserted first.
		target, _ = o.segments.GetOrInsert(key, newSegmentObserver(o.nodeID, key))
	}

	target.Observe(m)
}
// Expire drops every segment observer whose last update is before expiredAt
// and deletes the prometheus series that observer owns.
func (o *segmentsObserver) Expire(expiredAt time.Time) {
	o.segments.Range(func(label SegmentLabel, value *segmentObserver) bool {
		if value.IsExpired(expiredAt) {
			// Delete the series BEFORE removing the observer from the map.
			//
			// With the opposite order, an Observe that arrives between the two
			// steps misses the map, builds a new observer whose WithLabelValues
			// calls hand back the still-registered children, and then has those
			// children deleted underneath it by Clear. That new, active observer
			// would keep writing to series that are no longer exported.
			//
			// In this order, a racing Observe can at worst write one sample to
			// the expiring observer; any observer created after Remove gets
			// freshly registered series.
			value.Clear()
			o.segments.Remove(label)
		}
		// Always continue: every entry has to be inspected.
		return true
	})
}
// newSegmentObserver builds the observer for label on node nodeID.
// Its last-update time starts at the moment of creation.
func newSegmentObserver(nodeID string, label SegmentLabel) *segmentObserver {
	createdAt := time.Now()

	obs := &segmentObserver{label: label}
	obs.prom = newPromObserver(nodeID, label)
	obs.lastUpdates = atomic.NewPointer[time.Time](&createdAt)
	return obs
}
// segmentObserver is the observer of the metrics recorded under one SegmentLabel.
type segmentObserver struct {
	// label is set at construction and never updated.
	label SegmentLabel

	// prom is the prometheus metrics observer the samples are written to.
	prom promMetricsObserver

	// lastUpdates is refreshed on every Observe call and is used for expiration.
	lastUpdates *atomic.Pointer[time.Time]
}
// IsExpired reports whether the latest update of the observer happened
// strictly before expireAt.
func (o *segmentObserver) IsExpired(expireAt time.Time) bool {
	last := o.lastUpdates.Load()
	return last.Before(expireAt)
}
// Observe refreshes the last-update time and forwards m to the prometheus
// observer method that matches its concrete type.
// It panics when m is of a type it does not know.
func (o *segmentObserver) Observe(m labeledRecord) {
	// Touch first so that the observer counts as recently used.
	touchedAt := time.Now()
	o.lastUpdates.Store(&touchedAt)

	switch rec := m.(type) {
	case *CacheLoadRecord:
		o.prom.ObserveCacheLoad(rec)
	case *CacheEvictRecord:
		o.prom.ObserveCacheEvict(rec)
	case QuerySegmentAccessRecord:
		o.prom.ObserveQueryAccess(rec)
	case SearchSegmentAccessRecord:
		o.prom.ObserveSearchAccess(rec)
	default:
		panic("unknown segment access metric")
	}
}
// Clear releases the prometheus metrics held by the observer by delegating to
// the Clear method of its prometheus observer.
func (o *segmentObserver) Clear() {
	o.prom.Clear()
}
// newPromObserver resolves every prometheus handle that is needed to record
// samples for label on node nodeID and bundles them in a promMetricsObserver.
func newPromObserver(nodeID string, label SegmentLabel) promMetricsObserver {
	db, rg := label.DatabaseName, label.ResourceGroup

	return promMetricsObserver{
		nodeID: nodeID,
		label:  label,

		// Disk cache load, labeled by node, database and resource group.
		DiskCacheLoadTotal:    metrics.QueryNodeDiskCacheLoadTotal.WithLabelValues(nodeID, db, rg),
		DiskCacheLoadDuration: metrics.QueryNodeDiskCacheLoadDuration.WithLabelValues(nodeID, db, rg),
		DiskCacheLoadBytes:    metrics.QueryNodeDiskCacheLoadBytes.WithLabelValues(nodeID, db, rg),

		// Disk cache evict, labeled by node, database and resource group.
		DiskCacheEvictTotal:    metrics.QueryNodeDiskCacheEvictTotal.WithLabelValues(nodeID, db, rg),
		DiskCacheEvictDuration: metrics.QueryNodeDiskCacheEvictDuration.WithLabelValues(nodeID, db, rg),
		DiskCacheEvictBytes:    metrics.QueryNodeDiskCacheEvictBytes.WithLabelValues(nodeID, db, rg),

		// Segment access through query.
		QuerySegmentAccessTotal:             metrics.QueryNodeSegmentAccessTotal.WithLabelValues(nodeID, db, rg, metrics.QueryLabel),
		QuerySegmentAccessDuration:          metrics.QueryNodeSegmentAccessDuration.WithLabelValues(nodeID, db, rg, metrics.QueryLabel),
		QuerySegmentAccessWaitCacheTotal:    metrics.QueryNodeSegmentAccessWaitCacheTotal.WithLabelValues(nodeID, db, rg, metrics.QueryLabel),
		QuerySegmentAccessWaitCacheDuration: metrics.QueryNodeSegmentAccessWaitCacheDuration.WithLabelValues(nodeID, db, rg, metrics.QueryLabel),

		// Segment access through search.
		SearchSegmentAccessTotal:             metrics.QueryNodeSegmentAccessTotal.WithLabelValues(nodeID, db, rg, metrics.SearchLabel),
		SearchSegmentAccessDuration:          metrics.QueryNodeSegmentAccessDuration.WithLabelValues(nodeID, db, rg, metrics.SearchLabel),
		SearchSegmentAccessWaitCacheTotal:    metrics.QueryNodeSegmentAccessWaitCacheTotal.WithLabelValues(nodeID, db, rg, metrics.SearchLabel),
		SearchSegmentAccessWaitCacheDuration: metrics.QueryNodeSegmentAccessWaitCacheDuration.WithLabelValues(nodeID, db, rg, metrics.SearchLabel),

		// Global series: labeled by node only (plus the access kind), not by
		// database or resource group.
		DiskCacheLoadGlobalDuration:                metrics.QueryNodeDiskCacheLoadGlobalDuration.WithLabelValues(nodeID),
		DiskCacheEvictGlobalDuration:               metrics.QueryNodeDiskCacheEvictGlobalDuration.WithLabelValues(nodeID),
		QuerySegmentAccessGlobalDuration:           metrics.QueryNodeSegmentAccessGlobalDuration.WithLabelValues(nodeID, metrics.QueryLabel),
		SearchSegmentAccessGlobalDuration:          metrics.QueryNodeSegmentAccessGlobalDuration.WithLabelValues(nodeID, metrics.SearchLabel),
		QuerySegmentAccessWaitCacheGlobalDuration:  metrics.QueryNodeSegmentAccessWaitCacheGlobalDuration.WithLabelValues(nodeID, metrics.QueryLabel),
		SearchSegmentAccessWaitCacheGlobalDuration: metrics.QueryNodeSegmentAccessWaitCacheGlobalDuration.WithLabelValues(nodeID, metrics.SearchLabel),
	}
}
// promMetricsObserver bundles the prometheus handles that samples of one
// SegmentLabel are written to.
type promMetricsObserver struct {
	// nodeID and label are kept so that Clear can delete the labeled series.
	nodeID string
	label  SegmentLabel // never updates

	// Disk cache load counters.
	DiskCacheLoadTotal    prometheus.Counter
	DiskCacheLoadDuration prometheus.Counter
	DiskCacheLoadBytes    prometheus.Counter

	// Disk cache evict counters.
	DiskCacheEvictTotal    prometheus.Counter
	DiskCacheEvictBytes    prometheus.Counter
	DiskCacheEvictDuration prometheus.Counter

	// Query access counters.
	QuerySegmentAccessTotal             prometheus.Counter
	QuerySegmentAccessDuration          prometheus.Counter
	QuerySegmentAccessWaitCacheTotal    prometheus.Counter
	QuerySegmentAccessWaitCacheDuration prometheus.Counter

	// Search access counters.
	SearchSegmentAccessTotal             prometheus.Counter
	SearchSegmentAccessDuration          prometheus.Counter
	SearchSegmentAccessWaitCacheTotal    prometheus.Counter
	SearchSegmentAccessWaitCacheDuration prometheus.Counter

	// Global duration observers; these are not deleted by Clear.
	DiskCacheLoadGlobalDuration                prometheus.Observer
	DiskCacheEvictGlobalDuration               prometheus.Observer
	QuerySegmentAccessGlobalDuration           prometheus.Observer
	SearchSegmentAccessGlobalDuration          prometheus.Observer
	QuerySegmentAccessWaitCacheGlobalDuration  prometheus.Observer
	SearchSegmentAccessWaitCacheGlobalDuration prometheus.Observer
}
// ObserveCacheLoad records one cache load: it bumps the load count, adds the
// value of r.getBytes() to the loaded bytes, and reports the value of
// r.getMilliseconds() to both the labeled duration counter and the global
// duration observer.
func (o *promMetricsObserver) ObserveCacheLoad(r *CacheLoadRecord) {
	o.DiskCacheLoadTotal.Inc()
	o.DiskCacheLoadBytes.Add(r.getBytes())

	loadMs := r.getMilliseconds()
	o.DiskCacheLoadDuration.Add(loadMs)
	o.DiskCacheLoadGlobalDuration.Observe(loadMs)
}
// ObserveCacheEvict records one cache eviction: it bumps the evict count, adds
// the value of r.getBytes() to the evicted bytes, and reports the value of
// r.getMilliseconds() to both the labeled duration counter and the global
// duration observer.
func (o *promMetricsObserver) ObserveCacheEvict(r *CacheEvictRecord) {
	o.DiskCacheEvictTotal.Inc()
	o.DiskCacheEvictBytes.Add(r.getBytes())

	evictMs := r.getMilliseconds()
	o.DiskCacheEvictDuration.Add(evictMs)
	o.DiskCacheEvictGlobalDuration.Observe(evictMs)
}
// ObserveQueryAccess records one query access to the segment.
// The access count and duration are always recorded; the wait-cache count and
// duration are recorded only when the record is flagged as a cache miss.
func (o *promMetricsObserver) ObserveQueryAccess(r QuerySegmentAccessRecord) {
	o.QuerySegmentAccessTotal.Inc()
	accessMs := r.getMilliseconds()
	o.QuerySegmentAccessDuration.Add(accessMs)
	o.QuerySegmentAccessGlobalDuration.Observe(accessMs)

	// Nothing was waited for unless the cache was missed.
	if !r.isCacheMiss {
		return
	}

	o.QuerySegmentAccessWaitCacheTotal.Inc()
	waitMs := r.getWaitLoadMilliseconds()
	o.QuerySegmentAccessWaitCacheDuration.Add(waitMs)
	o.QuerySegmentAccessWaitCacheGlobalDuration.Observe(waitMs)
}
// ObserveSearchAccess records one search access to the segment.
// The access count and duration are always recorded; the wait-cache count and
// duration are recorded only when the record is flagged as a cache miss.
func (o *promMetricsObserver) ObserveSearchAccess(r SearchSegmentAccessRecord) {
	o.SearchSegmentAccessTotal.Inc()
	accessMs := r.getMilliseconds()
	o.SearchSegmentAccessDuration.Add(accessMs)
	o.SearchSegmentAccessGlobalDuration.Observe(accessMs)

	// Nothing was waited for unless the cache was missed.
	if !r.isCacheMiss {
		return
	}

	o.SearchSegmentAccessWaitCacheTotal.Inc()
	waitMs := r.getWaitLoadMilliseconds()
	o.SearchSegmentAccessWaitCacheDuration.Add(waitMs)
	o.SearchSegmentAccessWaitCacheGlobalDuration.Observe(waitMs)
}
// Clear deletes the labeled prometheus series of this observer: the disk cache
// load/evict counters and the search and query access counters.
// The global duration observers are left untouched.
func (o *promMetricsObserver) Clear() {
	node := o.nodeID
	db, rg := o.label.DatabaseName, o.label.ResourceGroup

	// Disk cache load.
	metrics.QueryNodeDiskCacheLoadTotal.DeleteLabelValues(node, db, rg)
	metrics.QueryNodeDiskCacheLoadBytes.DeleteLabelValues(node, db, rg)
	metrics.QueryNodeDiskCacheLoadDuration.DeleteLabelValues(node, db, rg)

	// Disk cache evict.
	metrics.QueryNodeDiskCacheEvictTotal.DeleteLabelValues(node, db, rg)
	metrics.QueryNodeDiskCacheEvictBytes.DeleteLabelValues(node, db, rg)
	metrics.QueryNodeDiskCacheEvictDuration.DeleteLabelValues(node, db, rg)

	// Segment access, once per access kind.
	metrics.QueryNodeSegmentAccessTotal.DeleteLabelValues(node, db, rg, metrics.SearchLabel)
	metrics.QueryNodeSegmentAccessTotal.DeleteLabelValues(node, db, rg, metrics.QueryLabel)
	metrics.QueryNodeSegmentAccessDuration.DeleteLabelValues(node, db, rg, metrics.SearchLabel)
	metrics.QueryNodeSegmentAccessDuration.DeleteLabelValues(node, db, rg, metrics.QueryLabel)

	// Segment access that had to wait for the cache, once per access kind.
	metrics.QueryNodeSegmentAccessWaitCacheTotal.DeleteLabelValues(node, db, rg, metrics.SearchLabel)
	metrics.QueryNodeSegmentAccessWaitCacheTotal.DeleteLabelValues(node, db, rg, metrics.QueryLabel)
	metrics.QueryNodeSegmentAccessWaitCacheDuration.DeleteLabelValues(node, db, rg, metrics.SearchLabel)
	metrics.QueryNodeSegmentAccessWaitCacheDuration.DeleteLabelValues(node, db, rg, metrics.QueryLabel)
}