Skip to content

Commit f86e4f9

Browse files
sumingZeroflesher0813
authored andcommitted
[Feat] UCM supports metrics display online via Grafana and Promethues (ModelEngine-Group#414)
* [Feat] Build metrics frame * [Feat]add metrics(ucm_obser.py + metrics_configs.yaml) * [Feat] Implementation of metrics logger on the C++ side for storing and retrieving stats * [Fix] Provide simple grafana and fix bugs * [feat] change the log position of UCM metrics * [fix]modify grafana.json * [Feat] UCM supports metrics display online via Grafana and Promethues * [Fix] Remove configs to examples and add liscense --------- Co-authored-by: flesher0813 <[email protected]> Co-authored-by: hero<[email protected]>
1 parent ed3ee25 commit f86e4f9

File tree

16 files changed

+2054
-1
lines changed

16 files changed

+2054
-1
lines changed

examples/metrics/grafana.json

Lines changed: 1025 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
# Prometheus Metrics Configuration
2+
# This file defines which metrics should be enabled and their configurations
3+
log_interval: 5 # Interval in seconds for logging metrics
4+
5+
prometheus:
6+
multiproc_dir: "/vllm-workspace" # Directory for Prometheus multiprocess mode
7+
8+
metric_prefix: "ucm:"
9+
10+
# Enable/disable metrics by category
11+
enabled_metrics:
12+
counters: true
13+
gauges: true
14+
histograms: true
15+
16+
# Counter metrics configuration
17+
# counters:
18+
# - name: "received_requests"
19+
# documentation: "Total number of requests sent to ucm"
20+
21+
# Gauge metrics configuration
22+
# gauges:
23+
# - name: "lookup_hit_rate"
24+
# documentation: "Hit rate of ucm lookup requests since last log"
25+
# multiprocess_mode: "livemostrecent"
26+
27+
# Histogram metrics configuration
28+
histograms:
29+
- name: "load_requests_num"
30+
documentation: "Number of requests loaded from ucm"
31+
buckets: [1, 5, 10, 20, 50, 100, 200, 500, 1000]
32+
- name: "load_blocks_num"
33+
documentation: "Number of blocks loaded from ucm"
34+
buckets: [0, 50, 100, 150, 200, 250, 300, 350, 400, 550, 600, 750, 800, 850, 900, 950, 1000]
35+
- name: "load_duration"
36+
documentation: "Time to load from ucm (ms)"
37+
buckets: [0, 50, 100, 150, 200, 250, 300, 350, 400, 550, 600, 750, 800, 850, 900, 950, 1000]
38+
- name: "load_speed"
39+
documentation: "Speed of loading from ucm (GB/s)"
40+
buckets: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 50, 60, 70, 80, 90, 100]
41+
- name: "save_requests_num"
42+
documentation: "Number of requests saved to ucm"
43+
buckets: [1, 5, 10, 20, 50, 100, 200, 500, 1000]
44+
- name: "save_blocks_num"
45+
documentation: "Number of blocks saved to ucm"
46+
buckets: [0, 50, 100, 150, 200, 250, 300, 350, 400, 550, 600, 750, 800, 850, 900, 950, 1000]
47+
- name: "save_duration"
48+
documentation: "Time to save to ucm (ms)"
49+
buckets: [0, 50, 100, 150, 200, 250, 300, 350, 400, 550, 600, 750, 800, 850, 900, 950, 1000]
50+
- name: "save_speed"
51+
documentation: "Speed of saving to ucm (GB/s)"
52+
buckets: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 50, 60, 70, 80, 90, 100]
53+
- name: "interval_lookup_hit_rates"
54+
documentation: "Hit rates of ucm lookup requests"
55+
buckets: [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
56+

examples/ucm_config_example.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@ ucm_connectors:
1616

1717
load_only_first_rank: false
1818

19+
metrics_config_path: "/vllm-workspace/metrics_config.yaml"
20+
1921
# Sparse attention configuration
2022
# Format 1: Dictionary format (for methods like ESA, KvComp)
2123
# ucm_sparse_config:

ucm/integration/vllm/ucm_connector.py

Lines changed: 76 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import itertools
33
import os
44
import pickle
5+
import time
56
from dataclasses import dataclass, field
67
from typing import TYPE_CHECKING, Callable, List, Optional
78

@@ -18,6 +19,8 @@
1819
from vllm.v1.request import Request
1920

2021
from ucm.logger import init_logger
22+
from ucm.shared.metrics import ucmmonitor
23+
from ucm.shared.metrics.observability import UCMStatsLogger
2124
from ucm.store.factory import UcmConnectorFactory
2225
from ucm.store.ucmstore import Task, UcmKVStoreBase
2326
from ucm.utils import Config
@@ -128,6 +131,7 @@ def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole):
128131
self.broadcast_fn = self.group_coordinator.broadcast
129132
self.broadcast_stream = torch.cuda.Stream()
130133

134+
logger.info(f"self.launch_config: {self.launch_config}")
131135
connector_configs = self.launch_config.get("ucm_connectors", [])
132136
assert len(connector_configs) > 0, "no storage connector name in config."
133137

@@ -154,6 +158,7 @@ def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole):
154158
1 if self.is_mla else num_head_per_tp
155159
)
156160
self.store = UcmConnectorFactory.create_connector(name, config)
161+
self.block_data_size = config["kv_block_size"]
157162

158163
logger.info("init UCConnectorImpl, connector: %s", name)
159164
logger.info(
@@ -162,6 +167,20 @@ def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole):
162167
config["io_size"] / 1024,
163168
)
164169

170+
self.metrics_config = self.launch_config.get("metrics_config_path", "")
171+
if self.metrics_config:
172+
self.stats_logger = UCMStatsLogger(
173+
vllm_config.model_config.served_model_name,
174+
self.rank,
175+
self.metrics_config,
176+
)
177+
self.monitor = ucmmonitor.StatsMonitor.get_instance()
178+
self.synchronize = (
179+
torch.cuda.synchronize
180+
if current_platform.is_cuda_alike()
181+
else torch.npu.synchronize
182+
)
183+
165184
def generate_hash(self, block_size: int, request: "Request") -> list[str]:
166185
token_ids = request.all_token_ids
167186

@@ -210,6 +229,11 @@ def get_num_new_matched_tokens(
210229
f"hit hbm: {hbm_hit_block_num}, "
211230
f"hit external: {external_hit_blocks}"
212231
)
232+
if self.metrics_config:
233+
self.monitor.update_stats(
234+
"ConnStats",
235+
{"interval_lookup_hit_rates": external_hit_blocks / len(ucm_block_ids)},
236+
)
213237

214238
total_hit_block_num = hbm_hit_block_num + external_hit_blocks
215239

@@ -454,17 +478,23 @@ def _broadcast(self, dst_tensor_addr: list[torch.Tensor]):
454478
tensor.copy_(rec_tensor[i])
455479

456480
def start_load_kv(self, forward_context: "ForwardContext", **kwargs) -> None:
457-
458481
metadata = self._get_connector_metadata()
459482
assert isinstance(metadata, UCMConnectorMetadata)
460483

461484
self._init_kv_caches_from_forward_context(forward_context)
462485

463486
request_to_task: dict[str, Optional[Task]] = {}
464487
req_broadcast_addr = {}
488+
is_load = False
489+
num_loaded_block = 0
490+
num_loaded_request = 0
491+
load_start_time = time.perf_counter() * 1000
465492
for request_id, request in metadata.request_meta.items():
466493
if len(request.load_block_ids[0]) == 0:
467494
continue
495+
is_load = True
496+
num_loaded_block += len(request.load_block_ids[0])
497+
num_loaded_request += 1
468498

469499
ucm_block_ids, vllm_block_ids = request.load_block_ids
470500
if self.global_rank != 0 and not self.is_mla and not self.is_dsa:
@@ -488,6 +518,24 @@ def start_load_kv(self, forward_context: "ForwardContext", **kwargs) -> None:
488518
logger.error(f"request {request_id} load kv cache failed.")
489519
if self.load_only_first_rank:
490520
self._broadcast(req_broadcast_addr[request_id])
521+
load_end_time = time.perf_counter() * 1000
522+
load_speed = (
523+
num_loaded_block
524+
* self.block_data_size
525+
/ (load_end_time - load_start_time)
526+
/ 1024
527+
/ 1024
528+
) # GB/s
529+
if self.metrics_config and is_load:
530+
self.monitor.update_stats(
531+
"ConnStats",
532+
{
533+
"load_requests_num": num_loaded_request,
534+
"load_blocks_num": num_loaded_block,
535+
"load_duration": load_end_time - load_start_time,
536+
"load_speed": load_speed,
537+
},
538+
)
491539

492540
def wait_for_layer_load(self, layer_name: str) -> None:
493541
pass
@@ -506,15 +554,24 @@ def wait_for_save(self) -> None:
506554
# TODO support PP
507555
if (self.is_mla or self.is_dsa) and self.global_rank != 0:
508556
return
557+
if self.metrics_config:
558+
self.synchronize()
509559

510560
metadata = self._get_connector_metadata()
511561
assert isinstance(metadata, UCMConnectorMetadata)
512562

513563
request_to_task: dict[str, Task] = {}
514564
request_to_blocks: dict[str, list[str]] = {}
565+
is_save = False
566+
num_saved_block = 0
567+
num_saved_request = 0
568+
save_start_time = time.perf_counter() * 1000
515569
for request_id, request in metadata.request_meta.items():
516570
if len(request.dump_block_ids[0]) == 0:
517571
continue
572+
is_save = True
573+
num_saved_block += len(request.dump_block_ids[0])
574+
num_saved_request += 1
518575

519576
ucm_block_ids, vllm_block_ids = request.dump_block_ids
520577
if self.global_rank != 0:
@@ -549,6 +606,24 @@ def wait_for_save(self) -> None:
549606
else:
550607
logger.error(f"request {request_id} dump kv cache failed.")
551608
self.store.commit(ucm_block_ids, False)
609+
save_end_time = time.perf_counter() * 1000
610+
save_speed = (
611+
num_saved_block
612+
* self.block_data_size
613+
/ (save_end_time - save_start_time)
614+
/ 1024
615+
/ 1024
616+
) # GB/s
617+
if self.metrics_config and is_save:
618+
self.monitor.update_stats(
619+
"ConnStats",
620+
{
621+
"save_requests_num": num_saved_request,
622+
"save_blocks_num": num_saved_block,
623+
"save_duration": save_end_time - save_start_time,
624+
"save_speed": save_speed,
625+
},
626+
)
552627

553628
def clear_connector_metadata(self) -> None:
554629
super().clear_connector_metadata()

ucm/shared/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
add_subdirectory(vendor)
22
add_subdirectory(infra)
33
add_subdirectory(trans)
4+
add_subdirectory(metrics)
45
add_subdirectory(test)

ucm/shared/metrics/CMakeLists.txt

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
file(GLOB_RECURSE CORE_SRCS CONFIGURE_DEPENDS
2+
"${CMAKE_CURRENT_SOURCE_DIR}/cc/stats/*.cc"
3+
"${CMAKE_CURRENT_SOURCE_DIR}/cc/*.cc")
4+
add_library(monitor_static STATIC ${CORE_SRCS})
5+
set_property(TARGET monitor_static PROPERTY POSITION_INDEPENDENT_CODE ON)
6+
target_include_directories(monitor_static PUBLIC
7+
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/cc>
8+
$<INSTALL_INTERFACE:include>)
9+
set_target_properties(monitor_static PROPERTIES OUTPUT_NAME monitor)
10+
11+
file(GLOB_RECURSE BINDINGS_SRCS CONFIGURE_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/cpy/*.cc")
12+
pybind11_add_module(ucmmonitor ${BINDINGS_SRCS})
13+
target_link_libraries(ucmmonitor PRIVATE -Wl,--whole-archive monitor_static -Wl,--no-whole-archive)
14+
target_include_directories(ucmmonitor PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/cc)
15+
set_target_properties(ucmmonitor PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
/**
2+
* MIT License
3+
*
4+
* Copyright (c) 2025 Huawei Technologies Co., Ltd. All rights reserved.
5+
*
6+
* Permission is hereby granted, free of charge, to any person obtaining a copy
7+
* of this software and associated documentation files (the "Software"), to deal
8+
* in the Software without restriction, including without limitation the rights
9+
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10+
* copies of the Software, and to permit persons to whom the Software is
11+
* furnished to do so, subject to the following conditions:
12+
*
13+
* The above copyright notice and this permission notice shall be included in all
14+
* copies or substantial portions of the Software.
15+
*
16+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17+
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18+
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19+
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20+
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21+
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22+
* SOFTWARE.
23+
* */
24+
#include "conn_stats.h"
25+
26+
namespace UC::Metrics {
27+
28+
ConnStats::ConnStats() = default;
29+
30+
std::string ConnStats::Name() const {
31+
return "ConnStats";
32+
}
33+
34+
void ConnStats::Reset() {
35+
for (auto& v : data_) v.clear();
36+
}
37+
38+
void ConnStats::Update(const std::unordered_map<std::string, double>& params) {
39+
for (const auto& [k, v] : params) {
40+
Key id = KeyFromString(k);
41+
if (id == Key::COUNT) continue;
42+
EmplaceBack(id, v);
43+
}
44+
}
45+
46+
std::unordered_map<std::string, std::vector<double>> ConnStats::Data() {
47+
std::unordered_map<std::string, std::vector<double>> result;
48+
result["save_requests_num"] = data_[static_cast<std::size_t>(Key::save_requests_num)];
49+
result["save_blocks_num"] = data_[static_cast<std::size_t>(Key::save_blocks_num)];
50+
result["save_duration"] = data_[static_cast<std::size_t>(Key::save_duration)];
51+
result["save_speed"] = data_[static_cast<std::size_t>(Key::save_speed)];
52+
result["load_requests_num"] = data_[static_cast<std::size_t>(Key::load_requests_num)];
53+
result["load_blocks_num"] = data_[static_cast<std::size_t>(Key::load_blocks_num)];
54+
result["load_duration"] = data_[static_cast<std::size_t>(Key::load_duration)];
55+
result["load_speed"] = data_[static_cast<std::size_t>(Key::load_speed)];
56+
result["interval_lookup_hit_rates"] = data_[static_cast<std::size_t>(Key::interval_lookup_hit_rates)];
57+
return result;
58+
}
59+
60+
Key ConnStats::KeyFromString(const std::string& k) {
61+
if (k == "save_requests_num") return Key::save_requests_num;
62+
if (k == "save_blocks_num") return Key::save_blocks_num;
63+
if (k == "save_duration") return Key::save_duration;
64+
if (k == "save_speed") return Key::save_speed;
65+
if (k == "load_requests_num") return Key::load_requests_num;
66+
if (k == "load_blocks_num") return Key::load_blocks_num;
67+
if (k == "load_duration") return Key::load_duration;
68+
if (k == "load_speed") return Key::load_speed;
69+
if (k == "interval_lookup_hit_rates")return Key::interval_lookup_hit_rates;
70+
return Key::COUNT;
71+
}
72+
73+
void ConnStats::EmplaceBack(Key id, double value) {
74+
data_[static_cast<std::size_t>(id)].push_back(value);
75+
}
76+
77+
static Registrar registrar;
78+
79+
}

0 commit comments

Comments
 (0)