Skip to content

Commit 99df2d9

Browse files
authored
Add binlogSize and ttDelay metrics monitor (milvus-io#19872)
Signed-off-by: bigsheeper <[email protected]>
1 parent 4d1ab28 commit 99df2d9

File tree

7 files changed

+153
-104
lines changed

7 files changed

+153
-104
lines changed

configs/milvus.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -449,7 +449,7 @@ quotaAndLimits:
449449
diskProtection:
450450
# When the total file size of object storage is greater than `diskQuota`, all dml requests would be rejected;
451451
enabled: true
452-
diskQuota: -1 # GB, (0, +inf), default no limit
452+
diskQuota: -1 # MB, (0, +inf), default no limit
453453

454454
# limitReading decides whether dql requests are allowed.
455455
limitReading:

internal/datacoord/meta.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -205,6 +205,7 @@ func (m *meta) GetTotalBinlogSize() int64 {
205205
for _, segment := range segments {
206206
ret += segment.getSegmentSize()
207207
}
208+
metrics.DataCoordStoredBinlogSize.WithLabelValues().Set(float64(ret))
208209
return ret
209210
}
210211

internal/metrics/datacoord_metrics.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,14 @@ var (
8484
Help: "synchronized unix epoch per physical channel",
8585
}, []string{channelNameLabelName})
8686

87+
DataCoordStoredBinlogSize = prometheus.NewGaugeVec(
88+
prometheus.GaugeOpts{
89+
Namespace: milvusNamespace,
90+
Subsystem: typeutil.DataCoordRole,
91+
Name: "stored_binlog_size",
92+
Help: "binlog size of all collections/segments",
93+
}, []string{})
94+
8795
/* hard to implement, commented now
8896
DataCoordSegmentSizeRatio = prometheus.NewHistogramVec(
8997
prometheus.HistogramOpts{
@@ -139,4 +147,5 @@ func RegisterDataCoord(registry *prometheus.Registry) {
139147
registry.MustRegister(DataCoordNumStoredRows)
140148
registry.MustRegister(DataCoordNumStoredRowsCounter)
141149
registry.MustRegister(DataCoordSyncEpoch)
150+
registry.MustRegister(DataCoordStoredBinlogSize)
142151
}

internal/metrics/rootcoord_metrics.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,17 @@ var (
133133
Name: "num_of_roles",
134134
Help: "The number of roles",
135135
})
136+
137+
// RootCoordTtDelay records the max time tick delay of flow graphs in DataNodes and QueryNodes.
138+
RootCoordTtDelay = prometheus.NewGaugeVec(
139+
prometheus.GaugeOpts{
140+
Namespace: milvusNamespace,
141+
Subsystem: typeutil.RootCoordRole,
142+
Name: "time_tick_delay",
143+
Help: "The max time tick delay of flow graphs",
144+
}, []string{
145+
nodeIDLabelName,
146+
})
136147
)
137148

138149
//RegisterRootCoord registers RootCoord metrics
@@ -163,4 +174,5 @@ func RegisterRootCoord(registry *prometheus.Registry) {
163174
registry.MustRegister(RootCoordNumOfCredentials)
164175

165176
registry.MustRegister(RootCoordNumOfRoles)
177+
registry.MustRegister(RootCoordTtDelay)
166178
}

internal/rootcoord/quota_center.go

Lines changed: 44 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ package rootcoord
1919
import (
2020
"context"
2121
"fmt"
22+
"strconv"
2223
"sync"
2324
"time"
2425

@@ -27,14 +28,14 @@ import (
2728

2829
"github.com/milvus-io/milvus-proto/go-api/commonpb"
2930
"github.com/milvus-io/milvus/internal/log"
31+
"github.com/milvus-io/milvus/internal/metrics"
3032
"github.com/milvus-io/milvus/internal/proto/internalpb"
3133
"github.com/milvus-io/milvus/internal/proto/proxypb"
3234
"github.com/milvus-io/milvus/internal/tso"
3335
"github.com/milvus-io/milvus/internal/types"
3436
"github.com/milvus-io/milvus/internal/util/metricsinfo"
3537
"github.com/milvus-io/milvus/internal/util/ratelimitutil"
3638
"github.com/milvus-io/milvus/internal/util/tsoutil"
37-
"github.com/milvus-io/milvus/internal/util/typeutil"
3839
)
3940

4041
const (
@@ -86,9 +87,9 @@ type QuotaCenter struct {
8687
dataCoord types.DataCoord
8788

8889
// metrics
89-
queryNodeMetrics []*metricsinfo.QueryNodeQuotaMetrics
90-
dataNodeMetrics []*metricsinfo.DataNodeQuotaMetrics
91-
proxyMetrics []*metricsinfo.ProxyQuotaMetrics
90+
queryNodeMetrics map[UniqueID]*metricsinfo.QueryNodeQuotaMetrics
91+
dataNodeMetrics map[UniqueID]*metricsinfo.DataNodeQuotaMetrics
92+
proxyMetrics map[UniqueID]*metricsinfo.ProxyQuotaMetrics
9293
dataCoordMetrics *metricsinfo.DataCoordQuotaMetrics
9394

9495
currentRates map[internalpb.RateType]Limit
@@ -152,9 +153,9 @@ func (q *QuotaCenter) stop() {
152153

153154
// clearMetrics removes all metrics stored in QuotaCenter.
154155
func (q *QuotaCenter) clearMetrics() {
155-
q.dataNodeMetrics = make([]*metricsinfo.DataNodeQuotaMetrics, 0)
156-
q.queryNodeMetrics = make([]*metricsinfo.QueryNodeQuotaMetrics, 0)
157-
q.proxyMetrics = make([]*metricsinfo.ProxyQuotaMetrics, 0)
156+
q.dataNodeMetrics = make(map[UniqueID]*metricsinfo.DataNodeQuotaMetrics, 0)
157+
q.queryNodeMetrics = make(map[UniqueID]*metricsinfo.QueryNodeQuotaMetrics, 0)
158+
q.proxyMetrics = make(map[UniqueID]*metricsinfo.ProxyQuotaMetrics, 0)
158159
}
159160

160161
// syncMetrics sends GetMetrics requests to DataCoord and QueryCoord to sync the metrics in DataNodes and QueryNodes.
@@ -185,7 +186,7 @@ func (q *QuotaCenter) syncMetrics() error {
185186
}
186187
for _, queryNodeMetric := range queryCoordTopology.Cluster.ConnectedNodes {
187188
if queryNodeMetric.QuotaMetrics != nil {
188-
q.queryNodeMetrics = append(q.queryNodeMetrics, queryNodeMetric.QuotaMetrics)
189+
q.queryNodeMetrics[queryNodeMetric.ID] = queryNodeMetric.QuotaMetrics
189190
}
190191
}
191192
return nil
@@ -206,7 +207,7 @@ func (q *QuotaCenter) syncMetrics() error {
206207
}
207208
for _, dataNodeMetric := range dataCoordTopology.Cluster.ConnectedNodes {
208209
if dataNodeMetric.QuotaMetrics != nil {
209-
q.dataNodeMetrics = append(q.dataNodeMetrics, dataNodeMetric.QuotaMetrics)
210+
q.dataNodeMetrics[dataNodeMetric.ID] = dataNodeMetric.QuotaMetrics
210211
}
211212
}
212213
if dataCoordTopology.Cluster.Self.QuotaMetrics != nil {
@@ -228,7 +229,7 @@ func (q *QuotaCenter) syncMetrics() error {
228229
return err
229230
}
230231
if proxyMetric.QuotaMetrics != nil {
231-
q.proxyMetrics = append(q.proxyMetrics, proxyMetric.QuotaMetrics)
232+
q.proxyMetrics[proxyMetric.ID] = proxyMetric.QuotaMetrics
232233
}
233234
}
234235
return nil
@@ -339,10 +340,11 @@ func (q *QuotaCenter) calculateWriteRates() error {
339340
}
340341
log.Debug("QuotaCenter check diskQuota done", zap.Bool("exceeded", exceeded))
341342

342-
ttFactor, err := q.timeTickDelay()
343+
ts, err := q.tsoAllocator.GenerateTSO(1)
343344
if err != nil {
344345
return err
345346
}
347+
ttFactor := q.timeTickDelay(ts)
346348
if ttFactor <= 0 {
347349
q.forceDenyWriting(TimeTickLongDelay) // tt protection
348350
return nil
@@ -409,43 +411,46 @@ func (q *QuotaCenter) resetCurrentRates() {
409411

410412
// timeTickDelay gets time tick delay of DataNodes and QueryNodes,
411413
// and return the factor according to max tolerable time tick delay.
412-
func (q *QuotaCenter) timeTickDelay() (float64, error) {
414+
func (q *QuotaCenter) timeTickDelay(ts Timestamp) float64 {
415+
t1, _ := tsoutil.ParseTS(ts)
416+
417+
var maxDelay time.Duration
418+
for nodeID, metric := range q.queryNodeMetrics {
419+
if metric.Fgm.NumFlowGraph > 0 {
420+
t2, _ := tsoutil.ParseTS(metric.Fgm.MinFlowGraphTt)
421+
delay := t1.Sub(t2)
422+
if delay.Nanoseconds() > maxDelay.Nanoseconds() {
423+
maxDelay = delay
424+
}
425+
metrics.RootCoordTtDelay.WithLabelValues(strconv.FormatInt(nodeID, 10)).Set(float64(maxDelay.Milliseconds()))
426+
}
427+
}
428+
for nodeID, metric := range q.dataNodeMetrics {
429+
if metric.Fgm.NumFlowGraph > 0 {
430+
t2, _ := tsoutil.ParseTS(metric.Fgm.MinFlowGraphTt)
431+
delay := t1.Sub(t2)
432+
if delay.Nanoseconds() > maxDelay.Nanoseconds() {
433+
maxDelay = delay
434+
}
435+
metrics.RootCoordTtDelay.WithLabelValues(strconv.FormatInt(nodeID, 10)).Set(float64(maxDelay.Milliseconds()))
436+
}
437+
}
438+
413439
if !Params.QuotaConfig.TtProtectionEnabled {
414-
return 1, nil
440+
return 1
415441
}
416442

417443
maxTt := Params.QuotaConfig.MaxTimeTickDelay
418444
if maxTt < 0 {
419445
// < 0 means disable tt protection
420-
return 1, nil
446+
return 1
421447
}
422448

423-
minTs := typeutil.MaxTimestamp
424-
for _, metric := range q.queryNodeMetrics {
425-
if metric.Fgm.NumFlowGraph > 0 && metric.Fgm.MinFlowGraphTt < minTs {
426-
minTs = metric.Fgm.MinFlowGraphTt
427-
}
428-
}
429-
for _, metric := range q.dataNodeMetrics {
430-
if metric.Fgm.NumFlowGraph > 0 && metric.Fgm.MinFlowGraphTt < minTs {
431-
minTs = metric.Fgm.MinFlowGraphTt
432-
}
433-
}
434-
ts, err := q.tsoAllocator.GenerateTSO(1)
435-
if err != nil {
436-
return 0, err
437-
}
438-
if minTs >= ts {
439-
return 1, nil
440-
}
441-
t1, _ := tsoutil.ParseTS(minTs)
442-
t2, _ := tsoutil.ParseTS(ts)
443-
delay := t2.Sub(t1)
444-
log.Debug("QuotaCenter check timeTick delay", zap.Time("minTs", t1), zap.Time("curTs", t2), zap.Duration("delay", delay))
445-
if delay.Nanoseconds() >= maxTt.Nanoseconds() {
446-
return 0, nil
449+
log.Debug("QuotaCenter check timeTick delay", zap.Time("curTs", t1), zap.Duration("maxDelay", maxDelay))
450+
if maxDelay.Nanoseconds() >= maxTt.Nanoseconds() {
451+
return 0
447452
}
448-
return float64(maxTt.Nanoseconds()-delay.Nanoseconds()) / float64(maxTt.Nanoseconds()), nil
453+
return float64(maxTt.Nanoseconds()-maxDelay.Nanoseconds()) / float64(maxTt.Nanoseconds())
449454
}
450455

451456
// checkNQInQuery checks search&query nq in QueryNode,

0 commit comments

Comments
 (0)