
Commit 9338727

feat(proxy): support absolute RPM/TPM in priority_reservation (#15813)
* feat(proxy): support absolute RPM/TPM in priority_reservation

  Allow priority reservations as absolute values instead of percentages:
  - Float: {'prod': 0.75} (75%, existing)
  - RPM: {'prod': {'type': 'rpm', 'value': 750}}
  - TPM: {'prod': {'type': 'tpm', 'value': 750000}}

  Added _convert_to_percent() that converts absolute values to percentages based on model capacity. Fully backward compatible.

* feat(types): convert priority_reservation Dict to TypedDict

  Add PriorityReservationDict TypedDict to replace the generic Dict type in the priority_reservation configuration.

  Changes:
  - Add PriorityReservationDict to litellm/types/utils.py
  - Update convert_priority_to_percent() signature in rate_limiter_utils.py
  - Update the litellm.priority_reservation type annotation in __init__.py

  Improves IDE autocomplete and type checking for priority reservation configs.

* docs: update dynamic rate limiter priority reservation docs
1 parent c5fee97 commit 9338727
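
For context, a minimal sketch (not part of the commit) of how the three accepted value formats look when set programmatically; the priority names and capacity numbers are illustrative only:

```python
import litellm

# Mixed formats are allowed in the same mapping, per the new
# Dict[str, Union[float, PriorityReservationDict]] annotation in __init__.py.
litellm.priority_reservation = {
    "prod": 0.75,                               # plain float -> 75% of capacity (existing behavior)
    "batch": {"type": "rpm", "value": 750},     # absolute requests per minute
    "dev": {"type": "tpm", "value": 750_000},   # absolute tokens per minute
}
```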

6 files changed: +116 −17 lines changed

docs/my-website/docs/proxy/dynamic_rate_limit.md

Lines changed: 15 additions & 6 deletions
@@ -136,9 +136,16 @@ model_list:
 
 litellm_settings:
   callbacks: ["dynamic_rate_limiter_v3"]
-  priority_reservation:
-    "prod": 0.9  # 90% reserved for production (9 RPM)
-    "dev": 0.1   # 10% reserved for development (1 RPM)
+  priority_reservation:
+    "prod": 0.9  # 90% reserved for production (9 RPM)
+    "dev": 0.1   # 10% reserved for development (1 RPM)
+    # Alternative format:
+    # "prod":
+    #   type: "rpm"   # Reserve based on requests per minute
+    #   value: 9      # 9 RPM = 90% of 10 RPM capacity
+    # "dev":
+    #   type: "tpm"   # Reserve based on tokens per minute
+    #   value: 100    # 100 TPM
   priority_reservation_settings:
     default_priority: 0  # Weight (0%) assigned to keys without explicit priority metadata
     saturation_threshold: 0.50  # A model is saturated if it has hit 50% of its RPM limit

@@ -150,10 +157,12 @@ general_settings:
 
 **Configuration Details:**
 
-`priority_reservation`: Dict[str, float]
+`priority_reservation`: Dict[str, Union[float, PriorityReservationDict]]
 - **Key (str)**: Priority level name (can be any string like "prod", "dev", "critical", etc.)
-- **Value (float)**: Percentage of total TPM/RPM to reserve (0.0 to 1.0)
-- **Note**: Values should sum to 1.0 or less
+- **Value**: Either a float (0.0-1.0) or dict with `type` and `value`
+  - Float: `0.9` = 90% of capacity
+  - Dict: `{"type": "rpm", "value": 9}` = 9 requests/min
+  - Supported types: `"percent"`, `"rpm"`, `"tpm"`
 
 `priority_reservation_settings`: Object (Optional)
 - **default_priority (float)**: Weight/percentage (0.0 to 1.0) assigned to API keys that have no priority metadata set (defaults to 0.5)

litellm/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -28,6 +28,7 @@
     all_litellm_params,
     all_litellm_params as _litellm_completion_params,
     CredentialItem,
+    PriorityReservationDict,
 )  # maintain backwards compatibility for root param
 from litellm._logging import (
     set_verbose,

@@ -369,7 +370,7 @@
 public_model_groups: Optional[List[str]] = None
 public_model_groups_links: Dict[str, str] = {}
 #### REQUEST PRIORITIZATION #######
-priority_reservation: Optional[Dict[str, float]] = None
+priority_reservation: Optional[Dict[str, Union[float, PriorityReservationDict]]] = None
 priority_reservation_settings: "PriorityReservationSettings" = (
     PriorityReservationSettings()
 )

litellm/proxy/hooks/dynamic_rate_limiter.py

Lines changed: 9 additions & 4 deletions
@@ -17,6 +17,8 @@
 from litellm.types.router import ModelGroupInfo
 from litellm.utils import get_utc_datetime
 
+from .rate_limiter_utils import convert_priority_to_percent
+
 
 class DynamicRateLimiterCache:
     """

@@ -99,6 +101,11 @@ async def check_available_usage(
             - active_projects: int or null
         """
         try:
+            # Get model info first for conversion
+            model_group_info: Optional[
+                ModelGroupInfo
+            ] = self.llm_router.get_model_group_info(model_group=model)
+
             weight: float = 1
             if (
                 litellm.priority_reservation is None

@@ -115,7 +122,8 @@
                     "PREMIUM FEATURE: Reserving tpm/rpm by priority is a premium feature. Please add a 'LITELLM_LICENSE' to your .env to enable this.\nGet a license: https://docs.litellm.ai/docs/proxy/enterprise."
                 )
             else:
-                weight = litellm.priority_reservation[priority]
+                value = litellm.priority_reservation[priority]
+                weight = convert_priority_to_percent(value, model_group_info)
 
             active_projects = await self.internal_usage_cache.async_get_cache(
                 model=model

@@ -124,9 +132,6 @@
                 current_model_tpm,
                 current_model_rpm,
             ) = await self.llm_router.get_model_group_usage(model_group=model)
-            model_group_info: Optional[
-                ModelGroupInfo
-            ] = self.llm_router.get_model_group_info(model_group=model)
             total_model_tpm: Optional[int] = None
             total_model_rpm: Optional[int] = None
             if model_group_info is not None:

litellm/proxy/hooks/dynamic_rate_limiter_v3.py

Lines changed: 16 additions & 6 deletions
@@ -18,6 +18,7 @@
     RateLimitDescriptorRateLimitObject,
     _PROXY_MaxParallelRequestsHandler_v3,
 )
+from litellm.proxy.hooks.rate_limiter_utils import convert_priority_to_percent
 from litellm.proxy.utils import InternalUsageCache
 from litellm.types.router import ModelGroupInfo
 

@@ -48,7 +49,7 @@ def __init__(self, internal_usage_cache: DualCache):
     def update_variables(self, llm_router: Router):
         self.llm_router = llm_router
 
-    def _get_priority_weight(self, priority: Optional[str]) -> float:
+    def _get_priority_weight(self, priority: Optional[str], model_info: Optional[ModelGroupInfo] = None) -> float:
         """Get the weight for a given priority from litellm.priority_reservation"""
         weight: float = litellm.priority_reservation_settings.default_priority
         if (

@@ -64,19 +65,25 @@ def _get_priority_weight(self, priority: Optional[str]) -> float:
                 "PREMIUM FEATURE: Reserving tpm/rpm by priority is a premium feature. Please add a 'LITELLM_LICENSE' to your .env to enable this.\nGet a license: https://docs.litellm.ai/docs/proxy/enterprise."
             )
         else:
-            weight = litellm.priority_reservation[priority]
+            value = litellm.priority_reservation[priority]
+            weight = convert_priority_to_percent(value, model_info)
         return weight
 
-    def _normalize_priority_weights(self) -> Dict[str, float]:
+    def _normalize_priority_weights(self, model_info: ModelGroupInfo) -> Dict[str, float]:
         """
         Normalize priority weights if they sum to > 1.0
 
         Handles over-allocation: {key_a: 0.60, key_b: 0.80} -> {key_a: 0.43, key_b: 0.57}
+        Converts absolute rpm/tpm values to percentages based on model capacity.
         """
         if litellm.priority_reservation is None:
             return {}
 
-        weights = dict(litellm.priority_reservation)
+        # Convert all values to percentages first
+        weights: Dict[str, float] = {}
+        for k, v in litellm.priority_reservation.items():
+            weights[k] = convert_priority_to_percent(v, model_info)
+
         total_weight = sum(weights.values())
 
         if total_weight > 1.0:

@@ -93,6 +100,7 @@ def _get_priority_allocation(
         model: str,
         priority: Optional[str],
         normalized_weights: Dict[str, float],
+        model_info: Optional[ModelGroupInfo] = None,
     ) -> tuple[float, str]:
         """
         Get priority weight and pool key for a given priority.

@@ -104,6 +112,7 @@
             model: Model name
             priority: Priority level (None for default)
             normalized_weights: Pre-computed normalized weights
+            model_info: Model configuration (optional, for fallback conversion)
 
         Returns:
             tuple: (priority_weight, priority_key)

@@ -117,7 +126,7 @@
 
         if has_explicit_priority and priority is not None:
             # Explicit priority: get its specific allocation
-            priority_weight = normalized_weights.get(priority, self._get_priority_weight(priority))
+            priority_weight = normalized_weights.get(priority, self._get_priority_weight(priority, model_info))
             # Use unique key per priority level
             priority_key = f"{model}:{priority}"
         else:

@@ -232,11 +241,12 @@ def _create_priority_based_descriptors(
             return descriptors
 
         # Get normalized priority weight and pool key
-        normalized_weights = self._normalize_priority_weights()
+        normalized_weights = self._normalize_priority_weights(model_group_info)
        priority_weight, priority_key = self._get_priority_allocation(
            model=model,
            priority=priority,
            normalized_weights=normalized_weights,
+           model_info=model_group_info,
        )
 
        rate_limit_config: RateLimitDescriptorRateLimitObject = {}
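
The over-allocation example in the `_normalize_priority_weights` docstring ({key_a: 0.60, key_b: 0.80} -> {key_a: 0.43, key_b: 0.57}) is a straight proportional rescale. A standalone sketch of that arithmetic (not the actual implementation, just the math it describes):

```python
def normalize(weights: dict[str, float]) -> dict[str, float]:
    """Rescale weights proportionally when they sum to more than 1.0."""
    total = sum(weights.values())
    if total <= 1.0:
        return dict(weights)
    return {k: v / total for k, v in weights.items()}

print(normalize({"key_a": 0.60, "key_b": 0.80}))
# {'key_a': 0.4285..., 'key_b': 0.5714...}  -> roughly 0.43 / 0.57, as in the docstring
```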
litellm/proxy/hooks/rate_limiter_utils.py

Lines changed: 55 additions & 0 deletions
@@ -0,0 +1,55 @@
+"""
+Shared utility functions for rate limiter hooks.
+"""
+
+from typing import Optional, Union
+
+from litellm.types.router import ModelGroupInfo
+from litellm.types.utils import PriorityReservationDict
+
+
+def convert_priority_to_percent(
+    value: Union[float, PriorityReservationDict], model_info: Optional[ModelGroupInfo]
+) -> float:
+    """
+    Convert priority reservation value to percentage (0.0-1.0).
+
+    Supports three formats:
+    1. Plain float/int: 0.9 -> 0.9 (90%)
+    2. Dict with percent: {"type": "percent", "value": 0.9} -> 0.9
+    3. Dict with rpm: {"type": "rpm", "value": 900} -> 900/model_rpm
+    4. Dict with tpm: {"type": "tpm", "value": 900000} -> 900000/model_tpm
+
+    Args:
+        value: Priority value as float or dict with type/value keys
+        model_info: Model configuration containing rpm/tpm limits
+
+    Returns:
+        float: Percentage value between 0.0 and 1.0
+    """
+    if isinstance(value, (int, float)):
+        return float(value)
+
+    if isinstance(value, dict):
+        val_type = value.get("type", "percent")
+        val_num = value.get("value", 1.0)
+
+        if val_type == "percent":
+            return float(val_num)
+        elif (
+            val_type == "rpm"
+            and model_info
+            and model_info.rpm
+            and model_info.rpm > 0
+        ):
+            return float(val_num) / model_info.rpm
+        elif (
+            val_type == "tpm"
+            and model_info
+            and model_info.tpm
+            and model_info.tpm > 0
+        ):
+            return float(val_num) / model_info.tpm
+
+        # Fallback: treat as percent
+        return float(val_num)
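
A hedged usage sketch of the helper above. It assumes `ModelGroupInfo` (from `litellm.types.router`) accepts `model_group`, `providers`, `rpm`, and `tpm` as constructor fields; the model name and capacity numbers are hypothetical:

```python
from litellm.proxy.hooks.rate_limiter_utils import convert_priority_to_percent
from litellm.types.router import ModelGroupInfo

# Hypothetical model capacity: 10 RPM / 10,000 TPM (constructor fields assumed).
info = ModelGroupInfo(model_group="gpt-4", providers=["openai"], rpm=10, tpm=10_000)

convert_priority_to_percent(0.9, info)                              # -> 0.9  (plain percent)
convert_priority_to_percent({"type": "rpm", "value": 9}, info)      # -> 0.9  (9 of 10 RPM)
convert_priority_to_percent({"type": "tpm", "value": 2_500}, info)  # -> 0.25 (2,500 of 10,000 TPM)
```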

litellm/types/utils.py

Lines changed: 19 additions & 0 deletions
@@ -2784,6 +2784,25 @@ class CallbacksByType(TypedDict):
     ]
 
 
+class PriorityReservationDict(TypedDict, total=False):
+    """
+    Dictionary format for priority reservation values.
+
+    Used in litellm.priority_reservation to specify how much capacity to reserve
+    for each priority level. Supports three formats:
+    1. Percentage-based: {"type": "percent", "value": 0.9} -> 90% of capacity
+    2. RPM-based: {"type": "rpm", "value": 900} -> 900 requests per minute
+    3. TPM-based: {"type": "tpm", "value": 900000} -> 900,000 tokens per minute
+
+    Attributes:
+        type: The type of value - "percent", "rpm", or "tpm". Defaults to "percent".
+        value: The numeric value. For percent (0.0-1.0), for rpm/tpm (absolute value).
+    """
+
+    type: Literal["percent", "rpm", "tpm"]
+    value: float
+
+
 class PriorityReservationSettings(BaseModel):
     """
     Settings for priority-based rate limiting reservation.
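
Because the TypedDict is declared with `total=False`, both keys are optional at the type level; a small illustrative sketch (variable names are made up):

```python
from litellm.types.utils import PriorityReservationDict

explicit: PriorityReservationDict = {"type": "rpm", "value": 900}

# total=False means partial dicts also type-check; convert_priority_to_percent()
# falls back to type="percent" when "type" is omitted.
percent_only: PriorityReservationDict = {"value": 0.5}
```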
