-
Notifications
You must be signed in to change notification settings - Fork 8
ec connector stats #164
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: v0.11.0
Are you sure you want to change the base?
ec connector stats #164
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
| @@ -0,0 +1,38 @@ | ||||||
| # SPDX-License-Identifier: Apache-2.0 | ||||||
| # SPDX-FileCopyrightText: Copyright contributors to the vLLM project | ||||||
| from dataclasses import dataclass, field | ||||||
| from typing import Any, Union | ||||||
|
|
||||||
|
|
||||||
| @dataclass | ||||||
| class ECConnectorStats: | ||||||
| """ | ||||||
| Base class for EC Connector Stats, a container for transfer performance | ||||||
|
||||||
| Base class for EC Connector Stats, a container for transfer performance | |
| Base class for EC Connector Stats, a container for transfer performance |
Copilot
AI
Nov 28, 2025
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Trailing whitespace at the end of the line. Consider removing it for consistent code style.
| metrics or otherwise important telemetry from the connector. | |
| metrics or otherwise important telemetry from the connector. |
Copilot
AI
Nov 28, 2025
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Trailing whitespace at the end of the line. Consider removing it for consistent code style.
| Reduce the observations collected during a time interval to one or | |
| Reduce the observations collected during a time interval to one or |
Copilot
AI
Nov 28, 2025
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Trailing whitespace at the end of the line. Consider removing it for consistent code style.
| more representative values (eg avg/median/sum of the series). | |
| more representative values (eg avg/median/sum of the series). |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,13 +1,17 @@ | ||
| # SPDX-License-Identifier: Apache-2.0 | ||
| # SPDX-FileCopyrightText: Copyright contributors to the vLLM project | ||
| import copy | ||
| import os | ||
| from contextlib import contextmanager | ||
| from dataclasses import dataclass | ||
| from importlib import import_module | ||
| from time import perf_counter | ||
| from typing import TYPE_CHECKING, Optional, Union | ||
|
|
||
| from vllm.config import VllmConfig | ||
| from vllm.distributed.ec_transfer.ec_connector.base import ( | ||
| ECConnectorBase, ECConnectorMetadata, ECConnectorRole) | ||
| from vllm.distributed.ec_transfer.ec_connector.metrics import ECConnectorStats | ||
| from vllm.logger import init_logger | ||
| from vllm.v1.core.sched.output import SchedulerOutput | ||
|
|
||
|
|
@@ -56,6 +60,7 @@ def __init__(self, vllm_config: "VllmConfig", role: ECConnectorRole): | |
| # mm_hash -> num_tokens | ||
| self._mm_datas_need_loads: dict[str, int] = {} | ||
| self.store = ECMooncakeStore(vllm_config) | ||
| self.stats = MooncakeECConnectorStats() | ||
|
|
||
| def start_load_caches(self, encoder_cache, **kwargs) -> None: | ||
| """ | ||
|
|
@@ -79,18 +84,19 @@ def start_load_caches(self, encoder_cache, **kwargs) -> None: | |
| if not metadata.mm_datas: | ||
| return | ||
|
|
||
| mm_hashes = [ | ||
| mm_data.mm_hash for mm_data in metadata.mm_datas | ||
| if mm_data.mm_hash not in encoder_cache | ||
| ] | ||
| device = self._vllm_config.device_config.device | ||
| tensors = self.store.batch_get(mm_hashes, device) | ||
| with self.stats.load_timer(): | ||
| mm_hashes = [ | ||
| mm_data.mm_hash for mm_data in metadata.mm_datas | ||
| if mm_data.mm_hash not in encoder_cache | ||
| ] | ||
| device = self._vllm_config.device_config.device | ||
| tensors = self.store.batch_get(mm_hashes, device) | ||
|
|
||
| for mm_hash, ec_cache in zip(mm_hashes, tensors): | ||
| encoder_cache[mm_hash] = ec_cache | ||
| if ec_cache is None: | ||
| logger.error("Load failed for %s", mm_hash) | ||
| logger.debug("Load tensor for %s successfully", mm_hash) | ||
| for mm_hash, ec_cache in zip(mm_hashes, tensors): | ||
| encoder_cache[mm_hash] = ec_cache | ||
| if ec_cache is None: | ||
| logger.error("Load failed for %s", mm_hash) | ||
| logger.debug("Load tensor for %s successfully", mm_hash) | ||
|
|
||
| def save_caches(self, encoder_cache, mm_hash, **kwargs) -> None: | ||
| """ | ||
|
|
@@ -113,6 +119,9 @@ def save_caches(self, encoder_cache, mm_hash, **kwargs) -> None: | |
| self.store.batch_put([mm_hash], [encoder_cache[mm_hash]]) | ||
|
|
||
def wait_for_save(self):
    """Call ``self.store.wait_for_put()`` when this connector is a producer.

    Does nothing (and returns ``None``) when ``self.is_producer`` is falsy.
    """
    if self.is_producer:
        self.store.wait_for_put()
|
|
||
| def has_caches( | ||
|
|
@@ -167,3 +176,74 @@ def build_connector_meta( | |
| meta.add_mm_data(MMMeta.make_meta(mm_hash, num_encoder_token)) | ||
| self._mm_datas_need_loads.clear() | ||
| return meta | ||
|
|
||
def get_stats(self) -> ECConnectorStats:
    """Return the value of ``self.stats.clone_and_reset()``.

    With the ``MooncakeECConnectorStats`` assigned to ``self.stats`` in
    ``__init__``, that is a copy of the counters accumulated so far, and the
    live counters are zeroed as a side effect.
    """
    snapshot = self.stats.clone_and_reset()
    return snapshot
|
|
||
|
|
||
@dataclass
class MooncakeECConnectorStats(ECConnectorStats):
    """Running totals of load/save timings for the Mooncake EC connector.

    All counters live in ``self.data`` under the keys ``load_time_ms``,
    ``save_time_ms``, ``num_loads`` and ``num_saves``.
    NOTE(review): ``data`` is presumably a dict field declared on
    ``ECConnectorStats`` -- confirm against the base class.
    """

    def __post_init__(self):
        """Ensure every counter key exists, keeping any value already set."""
        # Only missing keys are filled in, so pre-populated data passed at
        # construction time is left untouched.
        for key, zero in (
            ("load_time_ms", 0.0),
            ("save_time_ms", 0.0),
            ("num_loads", 0),
            ("num_saves", 0),
        ):
            self.data.setdefault(key, zero)

    def reset(self):
        """Rebind ``self.data`` to a fresh dict with every counter at zero."""
        self.data = dict([
            ("load_time_ms", 0.0),
            ("save_time_ms", 0.0),
            ("num_loads", 0),
            ("num_saves", 0),
        ])

    @contextmanager
    def load_timer(self):
        """Time the ``with`` body and record it as one load.

        The measurement is recorded in a ``finally`` clause, so a body that
        raises is still counted as a load.
        """
        began = perf_counter()
        try:
            yield
        finally:
            # perf_counter() is in seconds; the counters are in milliseconds.
            self.record_load((perf_counter() - began) * 1000.0)

    def record_load(self, load_time_ms: float):
        """Add one load taking ``load_time_ms`` milliseconds to the totals."""
        counters = self.data
        counters["load_time_ms"] += load_time_ms
        counters["num_loads"] += 1

    def record_save(self, save_time_ms: float):
        """Add one save taking ``save_time_ms`` milliseconds to the totals."""
        counters = self.data
        counters["save_time_ms"] += save_time_ms
        counters["num_saves"] += 1

    def clone_and_reset(self) -> "MooncakeECConnectorStats":
        """Return a snapshot of the current counters and zero this instance.

        The snapshot is a shallow copy; it keeps the old ``data`` dict because
        ``reset()`` rebinds ``self.data`` to a new dict rather than mutating
        the existing one.
        """
        snapshot = copy.copy(self)
        self.reset()
        return snapshot

    def is_empty(self) -> bool:
        """Return True when no loads and no saves have been recorded."""
        return all(self.data[key] == 0 for key in ("num_loads", "num_saves"))

    def aggregate(self, other: ECConnectorStats) -> ECConnectorStats:
        """Add ``other``'s counters into this instance and return ``self``.

        An ``other`` that reports ``is_empty()`` is ignored.
        """
        if other.is_empty():
            return self
        for key in ("load_time_ms", "save_time_ms", "num_loads", "num_saves"):
            self.data[key] += other.data[key]
        return self

    def reduce(self) -> dict[str, Union[int, float]]:
        """Collapse the totals into average times and operation counts.

        Returns a dict with ``avg_load_time_ms``, ``avg_save_time_ms``,
        ``total_loads`` and ``total_saves``.
        """
        counters = self.data
        load_ms, loads = counters["load_time_ms"], counters["num_loads"]
        save_ms, saves = counters["save_time_ms"], counters["num_saves"]
        # max(1, count) keeps the average at 0.0 instead of dividing by zero
        # when nothing has been recorded yet.
        return {
            "avg_load_time_ms": load_ms / max(1, loads),
            "avg_save_time_ms": save_ms / max(1, saves),
            "total_loads": loads,
            "total_saves": saves,
        }
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The `get_stats` method is marked as `@abstractmethod`, but `ECSharedStorageConnector` (in `shared_storage_connector.py`) does not implement this method. This will cause instantiation failures for `ECSharedStorageConnector`. Either remove the `@abstractmethod` decorator to make it optional, or ensure all subclasses implement this method.