
Commit 9338727

feat(proxy): support absolute RPM/TPM in priority_reservation (#15813)
* feat(proxy): support absolute RPM/TPM in priority_reservation

  Allow priority reservations as absolute values instead of percentages:
  - Float: {'prod': 0.75} (75%, existing)
  - RPM: {'prod': {'type': 'rpm', 'value': 750}}
  - TPM: {'prod': {'type': 'tpm', 'value': 750000}}

  Added _convert_to_percent() that converts absolute values to percentages based on model capacity. Fully backward compatible.

* feat(types): convert priority_reservation Dict to TypedDict

  Add PriorityReservationDict TypedDict to replace the generic Dict type in the priority_reservation configuration.

  Changes:
  - Add PriorityReservationDict to litellm/types/utils.py
  - Update convert_priority_to_percent() signature in rate_limiter_utils.py
  - Update the litellm.priority_reservation type annotation in __init__.py

  Improves IDE autocomplete and type checking for priority reservation configs.

* docs: update dynamic rate limiter priority reservation docs
1 parent c5fee97 commit 9338727
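
For context, a minimal sketch (not part of the commit) of how the three accepted value formats look when set programmatically; the priority names and capacity numbers are illustrative only:

```python
import litellm

# Mixed formats are allowed in the same mapping, per the new
# Dict[str, Union[float, PriorityReservationDict]] annotation in __init__.py.
litellm.priority_reservation = {
    "prod": 0.75,                               # plain float -> 75% of capacity (existing behavior)
    "batch": {"type": "rpm", "value": 750},     # absolute requests per minute
    "dev": {"type": "tpm", "value": 750_000},   # absolute tokens per minute
}
```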

6 files changed: +116 −17 lines changed

docs/my-website/docs/proxy/dynamic_rate_limit.md

Lines changed: 15 additions & 6 deletions
@@ -136,9 +136,16 @@ model_list:
 
 litellm_settings:
   callbacks: ["dynamic_rate_limiter_v3"]
-  priority_reservation:
-    "prod": 0.9  # 90% reserved for production (9 RPM)
-    "dev": 0.1   # 10% reserved for development (1 RPM)
+  priority_reservation:
+    "prod": 0.9  # 90% reserved for production (9 RPM)
+    "dev": 0.1   # 10% reserved for development (1 RPM)
+    # Alternative format:
+    # "prod":
+    #   type: "rpm"   # Reserve based on requests per minute
+    #   value: 9      # 9 RPM = 90% of 10 RPM capacity
+    # "dev":
+    #   type: "tpm"   # Reserve based on tokens per minute
+    #   value: 100    # 100 TPM
   priority_reservation_settings:
     default_priority: 0  # Weight (0%) assigned to keys without explicit priority metadata
     saturation_threshold: 0.50  # A model is saturated if it has hit 50% of its RPM limit

@@ -150,10 +157,12 @@ general_settings:
 
 **Configuration Details:**
 
-`priority_reservation`: Dict[str, float]
+`priority_reservation`: Dict[str, Union[float, PriorityReservationDict]]
 - **Key (str)**: Priority level name (can be any string like "prod", "dev", "critical", etc.)
-- **Value (float)**: Percentage of total TPM/RPM to reserve (0.0 to 1.0)
-- **Note**: Values should sum to 1.0 or less
+- **Value**: Either a float (0.0-1.0) or dict with `type` and `value`
+  - Float: `0.9` = 90% of capacity
+  - Dict: `{"type": "rpm", "value": 9}` = 9 requests/min
+  - Supported types: `"percent"`, `"rpm"`, `"tpm"`
 
 `priority_reservation_settings`: Object (Optional)
 - **default_priority (float)**: Weight/percentage (0.0 to 1.0) assigned to API keys that have no priority metadata set (defaults to 0.5)

litellm/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -28,6 +28,7 @@
     all_litellm_params,
     all_litellm_params as _litellm_completion_params,
     CredentialItem,
+    PriorityReservationDict,
 )  # maintain backwards compatibility for root param
 from litellm._logging import (
     set_verbose,

@@ -369,7 +370,7 @@
 public_model_groups: Optional[List[str]] = None
 public_model_groups_links: Dict[str, str] = {}
 #### REQUEST PRIORITIZATION #######
-priority_reservation: Optional[Dict[str, float]] = None
+priority_reservation: Optional[Dict[str, Union[float, PriorityReservationDict]]] = None
 priority_reservation_settings: "PriorityReservationSettings" = (
     PriorityReservationSettings()
 )

litellm/proxy/hooks/dynamic_rate_limiter.py

Lines changed: 9 additions & 4 deletions
@@ -17,6 +17,8 @@
 from litellm.types.router import ModelGroupInfo
 from litellm.utils import get_utc_datetime
 
+from .rate_limiter_utils import convert_priority_to_percent
+
 
 class DynamicRateLimiterCache:
     """

@@ -99,6 +101,11 @@ async def check_available_usage(
             - active_projects: int or null
         """
         try:
+            # Get model info first for conversion
+            model_group_info: Optional[
+                ModelGroupInfo
+            ] = self.llm_router.get_model_group_info(model_group=model)
+
             weight: float = 1
             if (
                 litellm.priority_reservation is None

@@ -115,7 +122,8 @@
                     "PREMIUM FEATURE: Reserving tpm/rpm by priority is a premium feature. Please add a 'LITELLM_LICENSE' to your .env to enable this.\nGet a license: https://docs.litellm.ai/docs/proxy/enterprise."
                 )
             else:
-                weight = litellm.priority_reservation[priority]
+                value = litellm.priority_reservation[priority]
+                weight = convert_priority_to_percent(value, model_group_info)
 
             active_projects = await self.internal_usage_cache.async_get_cache(
                 model=model

@@ -124,9 +132,6 @@
                 current_model_tpm,
                 current_model_rpm,
             ) = await self.llm_router.get_model_group_usage(model_group=model)
-            model_group_info: Optional[
-                ModelGroupInfo
-            ] = self.llm_router.get_model_group_info(model_group=model)
             total_model_tpm: Optional[int] = None
             total_model_rpm: Optional[int] = None
             if model_group_info is not None:

litellm/proxy/hooks/dynamic_rate_limiter_v3.py

Lines changed: 16 additions & 6 deletions
@@ -18,6 +18,7 @@
     RateLimitDescriptorRateLimitObject,
     _PROXY_MaxParallelRequestsHandler_v3,
 )
+from litellm.proxy.hooks.rate_limiter_utils import convert_priority_to_percent
 from litellm.proxy.utils import InternalUsageCache
 from litellm.types.router import ModelGroupInfo
 

@@ -48,7 +49,7 @@ def __init__(self, internal_usage_cache: DualCache):
     def update_variables(self, llm_router: Router):
         self.llm_router = llm_router
 
-    def _get_priority_weight(self, priority: Optional[str]) -> float:
+    def _get_priority_weight(self, priority: Optional[str], model_info: Optional[ModelGroupInfo] = None) -> float:
         """Get the weight for a given priority from litellm.priority_reservation"""
         weight: float = litellm.priority_reservation_settings.default_priority
         if (

@@ -64,19 +65,25 @@ def _get_priority_weight(self, priority: Optional[str]) -> float:
                 "PREMIUM FEATURE: Reserving tpm/rpm by priority is a premium feature. Please add a 'LITELLM_LICENSE' to your .env to enable this.\nGet a license: https://docs.litellm.ai/docs/proxy/enterprise."
             )
         else:
-            weight = litellm.priority_reservation[priority]
+            value = litellm.priority_reservation[priority]
+            weight = convert_priority_to_percent(value, model_info)
         return weight
 
-    def _normalize_priority_weights(self) -> Dict[str, float]:
+    def _normalize_priority_weights(self, model_info: ModelGroupInfo) -> Dict[str, float]:
         """
         Normalize priority weights if they sum to > 1.0
 
         Handles over-allocation: {key_a: 0.60, key_b: 0.80} -> {key_a: 0.43, key_b: 0.57}
+        Converts absolute rpm/tpm values to percentages based on model capacity.
         """
         if litellm.priority_reservation is None:
             return {}
 
-        weights = dict(litellm.priority_reservation)
+        # Convert all values to percentages first
+        weights: Dict[str, float] = {}
+        for k, v in litellm.priority_reservation.items():
+            weights[k] = convert_priority_to_percent(v, model_info)
+
         total_weight = sum(weights.values())
 
         if total_weight > 1.0:

@@ -93,6 +100,7 @@ def _get_priority_allocation(
         model: str,
         priority: Optional[str],
         normalized_weights: Dict[str, float],
+        model_info: Optional[ModelGroupInfo] = None,
     ) -> tuple[float, str]:
         """
         Get priority weight and pool key for a given priority.

@@ -104,6 +112,7 @@
             model: Model name
             priority: Priority level (None for default)
             normalized_weights: Pre-computed normalized weights
+            model_info: Model configuration (optional, for fallback conversion)
 
         Returns:
             tuple: (priority_weight, priority_key)

@@ -117,7 +126,7 @@
 
         if has_explicit_priority and priority is not None:
             # Explicit priority: get its specific allocation
-            priority_weight = normalized_weights.get(priority, self._get_priority_weight(priority))
+            priority_weight = normalized_weights.get(priority, self._get_priority_weight(priority, model_info))
             # Use unique key per priority level
             priority_key = f"{model}:{priority}"
         else:

@@ -232,11 +241,12 @@ def _create_priority_based_descriptors(
             return descriptors
 
         # Get normalized priority weight and pool key
-        normalized_weights = self._normalize_priority_weights()
+        normalized_weights = self._normalize_priority_weights(model_group_info)
        priority_weight, priority_key = self._get_priority_allocation(
            model=model,
            priority=priority,
            normalized_weights=normalized_weights,
+           model_info=model_group_info,
        )
 
        rate_limit_config: RateLimitDescriptorRateLimitObject = {}
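
The over-allocation example in the `_normalize_priority_weights` docstring ({key_a: 0.60, key_b: 0.80} -> {key_a: 0.43, key_b: 0.57}) is a straight proportional rescale. A standalone sketch of that arithmetic (not the actual implementation, just the math it describes):

```python
def normalize(weights: dict[str, float]) -> dict[str, float]:
    """Rescale weights proportionally when they sum to more than 1.0."""
    total = sum(weights.values())
    if total <= 1.0:
        return dict(weights)
    return {k: v / total for k, v in weights.items()}

print(normalize({"key_a": 0.60, "key_b": 0.80}))
# {'key_a': 0.4285..., 'key_b': 0.5714...}  -> roughly 0.43 / 0.57, as in the docstring
```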
litellm/proxy/hooks/rate_limiter_utils.py

Lines changed: 55 additions & 0 deletions
@@ -0,0 +1,55 @@
+"""
+Shared utility functions for rate limiter hooks.
+"""
+
+from typing import Optional, Union
+
+from litellm.types.router import ModelGroupInfo
+from litellm.types.utils import PriorityReservationDict
+
+
+def convert_priority_to_percent(
+    value: Union[float, PriorityReservationDict], model_info: Optional[ModelGroupInfo]
+) -> float:
+    """
+    Convert priority reservation value to percentage (0.0-1.0).
+
+    Supports three formats:
+    1. Plain float/int: 0.9 -> 0.9 (90%)
+    2. Dict with percent: {"type": "percent", "value": 0.9} -> 0.9
+    3. Dict with rpm: {"type": "rpm", "value": 900} -> 900/model_rpm
+    4. Dict with tpm: {"type": "tpm", "value": 900000} -> 900000/model_tpm
+
+    Args:
+        value: Priority value as float or dict with type/value keys
+        model_info: Model configuration containing rpm/tpm limits
+
+    Returns:
+        float: Percentage value between 0.0 and 1.0
+    """
+    if isinstance(value, (int, float)):
+        return float(value)
+
+    if isinstance(value, dict):
+        val_type = value.get("type", "percent")
+        val_num = value.get("value", 1.0)
+
+        if val_type == "percent":
+            return float(val_num)
+        elif (
+            val_type == "rpm"
+            and model_info
+            and model_info.rpm
+            and model_info.rpm > 0
+        ):
+            return float(val_num) / model_info.rpm
+        elif (
+            val_type == "tpm"
+            and model_info
+            and model_info.tpm
+            and model_info.tpm > 0
+        ):
+            return float(val_num) / model_info.tpm
+
+        # Fallback: treat as percent
+        return float(val_num)
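
A hedged usage sketch of the helper above. It assumes `ModelGroupInfo` (from `litellm.types.router`) accepts `model_group`, `providers`, `rpm`, and `tpm` as constructor fields; the model name and capacity numbers are hypothetical:

```python
from litellm.proxy.hooks.rate_limiter_utils import convert_priority_to_percent
from litellm.types.router import ModelGroupInfo

# Hypothetical model capacity: 10 RPM / 10,000 TPM (constructor fields assumed).
info = ModelGroupInfo(model_group="gpt-4", providers=["openai"], rpm=10, tpm=10_000)

convert_priority_to_percent(0.9, info)                              # -> 0.9  (plain percent)
convert_priority_to_percent({"type": "rpm", "value": 9}, info)      # -> 0.9  (9 of 10 RPM)
convert_priority_to_percent({"type": "tpm", "value": 2_500}, info)  # -> 0.25 (2,500 of 10,000 TPM)
```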

litellm/types/utils.py

Lines changed: 19 additions & 0 deletions
@@ -2784,6 +2784,25 @@ class CallbacksByType(TypedDict):
     ]
 
 
+class PriorityReservationDict(TypedDict, total=False):
+    """
+    Dictionary format for priority reservation values.
+
+    Used in litellm.priority_reservation to specify how much capacity to reserve
+    for each priority level. Supports three formats:
+    1. Percentage-based: {"type": "percent", "value": 0.9} -> 90% of capacity
+    2. RPM-based: {"type": "rpm", "value": 900} -> 900 requests per minute
+    3. TPM-based: {"type": "tpm", "value": 900000} -> 900,000 tokens per minute
+
+    Attributes:
+        type: The type of value - "percent", "rpm", or "tpm". Defaults to "percent".
+        value: The numeric value. For percent (0.0-1.0), for rpm/tpm (absolute value).
+    """
+
+    type: Literal["percent", "rpm", "tpm"]
+    value: float
+
+
 class PriorityReservationSettings(BaseModel):
     """
     Settings for priority-based rate limiting reservation.
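
Because the TypedDict is declared with `total=False`, both keys are optional at the type level; a small illustrative sketch (variable names are made up):

```python
from litellm.types.utils import PriorityReservationDict

explicit: PriorityReservationDict = {"type": "rpm", "value": 900}

# total=False means partial dicts also type-check; convert_priority_to_percent()
# falls back to type="percent" when "type" is omitted.
percent_only: PriorityReservationDict = {"value": 0.5}
```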
