Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pkg/kthena-router/datastore/store.go
Original file line number Diff line number Diff line change
Expand Up @@ -1099,7 +1099,7 @@ func (s *store) DeleteModelRoute(namespacedName string) error {
s.triggerCallbacks("ModelRoute", EventData{
EventType: EventDelete,
ModelName: modelName,
ModelRoute: nil,
ModelRoute: deletedRoute,
})
return nil
}
Expand Down
30 changes: 15 additions & 15 deletions pkg/kthena-router/filters/ratelimit/ratelimit.go
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ func NewTokenRateLimiter() *TokenRateLimiter {
}

// RateLimit checks if the request is within rate limits for both input and output tokens
func (r *TokenRateLimiter) RateLimit(model, prompt string) error {
func (r *TokenRateLimiter) RateLimit(limiterKey, prompt string) error {
// Estimate input tokens
tokens, err := r.tokenizer.CalculateTokenNum(prompt)
if err != nil {
Expand All @@ -107,8 +107,8 @@ func (r *TokenRateLimiter) RateLimit(model, prompt string) error {
}

r.mutex.RLock()
inputLimiter, hasInputLimit := r.inputLimiter[model]
outputLimiter, hasOutputLimit := r.outputLimiter[model]
inputLimiter, hasInputLimit := r.inputLimiter[limiterKey]
outputLimiter, hasOutputLimit := r.outputLimiter[limiterKey]
r.mutex.RUnlock()

// Check input token rate limit
Expand All @@ -126,9 +126,9 @@ func (r *TokenRateLimiter) RateLimit(model, prompt string) error {
}

// RecordOutputTokens records the actual output tokens consumed after response generation
func (r *TokenRateLimiter) RecordOutputTokens(model string, tokenCount int) {
func (r *TokenRateLimiter) RecordOutputTokens(limiterKey string, tokenCount int) {
r.mutex.RLock()
outputLimiter, exists := r.outputLimiter[model]
outputLimiter, exists := r.outputLimiter[limiterKey]
r.mutex.RUnlock()

if exists {
Expand All @@ -137,7 +137,7 @@ func (r *TokenRateLimiter) RecordOutputTokens(model string, tokenCount int) {
}

// AddOrUpdateLimiter adds or updates rate limiter for a model
func (r *TokenRateLimiter) AddOrUpdateLimiter(model string, ratelimit *networkingv1alpha1.RateLimit) error {
func (r *TokenRateLimiter) AddOrUpdateLimiter(limiterKey string, ratelimit *networkingv1alpha1.RateLimit) error {

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To be honest, I don’t really understand what ‘limiterKey’ refers to. Changing the name of this formal parameter isn’t a good idea.

@nXtCyberNet nXtCyberNet Jun 1, 2026

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi @LiZhenCheng9527 ,
Currently the rate limiter is keyed only by the model name, so there is no namespace-level isolation. That causes the following rate-limit bug:

ModelRoute-1 (namespace-a, model: llama-70b, tokenLimit: 100 tokens/min) 
  → Redis key: "llama-70b"

ModelRoute-2 (namespace-b, model: llama-70b, tokenLimit: 50 tokens/min)
  → Redis key: "llama-70b"  ← SAME KEY ❌

Result: ModelRoute-2's update OVERWRITES ModelRoute-1's token bucket
        Both namespaces now share the SAME token bucket with 50 tokens/min
        namespace-a requests get rate-limited incorrectly ❌

This is inconsistent and breaks rate limiting when multiple teams use the same model with different configs.

This change replaces the model-name key with a limiterKey of the form namespace/routeName. Because a ModelRoute is a CRD object, its name is unique within its namespace, so namespace/routeName is a unique key and also gives per-route rate-limit isolation.
Like this:

ModelRoute-1 (namespace-a, model: llama-70b, tokenLimit: 100 tokens/min)
  → Redis key: "namespace-a/modelroute-1"
  → Independent token bucket with 100 tokens/min

ModelRoute-2 (namespace-b, model: llama-70b, tokenLimit: 50 tokens/min)
  → Redis key: "namespace-b/modelroute-2"
  → Independent token bucket with 50 tokens/min ✅

Result: Each ModelRoute has its own isolated token bucket
namespace-a can use 100 tokens/min, namespace-b uses 50 tokens/min

,

It also behaves the same way when multiple routes are created in the same namespace: each route still gets its own token bucket.

The limiterKey change is a code-level refactoring only, so there is no user-facing change. WDYT?

Comment thread
nXtCyberNet marked this conversation as resolved.
Comment on lines 139 to +140
Comment on lines 139 to +140
r.mutex.Lock()
defer r.mutex.Unlock()
Comment thread
hzxuzhonghu marked this conversation as resolved.

Expand All @@ -161,21 +161,21 @@ func (r *TokenRateLimiter) AddOrUpdateLimiter(model string, ratelimit *networkin

// Create global rate limiters
if ratelimit.InputTokensPerUnit != nil {
r.inputLimiter[model] = NewGlobalRateLimiter(
r.inputLimiter[limiterKey] = NewGlobalRateLimiter(
r.redisClient,
"kthena:ratelimit",
model,
limiterKey,
"input",
*ratelimit.InputTokensPerUnit,
ratelimit.Unit,
)
Comment thread
hzxuzhonghu marked this conversation as resolved.
}

if ratelimit.OutputTokensPerUnit != nil {
r.outputLimiter[model] = NewGlobalRateLimiter(
r.outputLimiter[limiterKey] = NewGlobalRateLimiter(
r.redisClient,
"kthena:ratelimit",
model,
limiterKey,
"output",
*ratelimit.OutputTokensPerUnit,
ratelimit.Unit,
Expand All @@ -186,14 +186,14 @@ func (r *TokenRateLimiter) AddOrUpdateLimiter(model string, ratelimit *networkin
duration := getTimeUnitDuration(ratelimit.Unit)

if ratelimit.InputTokensPerUnit != nil {
r.inputLimiter[model] = NewLocalLimiter(
r.inputLimiter[limiterKey] = NewLocalLimiter(
rate.Limit(float64(*ratelimit.InputTokensPerUnit)/duration.Seconds()),
int(*ratelimit.InputTokensPerUnit),
)
}

if ratelimit.OutputTokensPerUnit != nil {
r.outputLimiter[model] = NewLocalLimiter(
r.outputLimiter[limiterKey] = NewLocalLimiter(
rate.Limit(float64(*ratelimit.OutputTokensPerUnit)/duration.Seconds()),
int(*ratelimit.OutputTokensPerUnit),
)
Expand All @@ -204,12 +204,12 @@ func (r *TokenRateLimiter) AddOrUpdateLimiter(model string, ratelimit *networkin
}

// DeleteLimiter deletes rate limiter for a model
func (r *TokenRateLimiter) DeleteLimiter(model string) {
func (r *TokenRateLimiter) DeleteLimiter(limiterKey string) {
Comment thread
nXtCyberNet marked this conversation as resolved.
Comment thread
nXtCyberNet marked this conversation as resolved.
Comment on lines 206 to +207
Comment on lines 206 to +207
r.mutex.Lock()
defer r.mutex.Unlock()

delete(r.inputLimiter, model)
delete(r.outputLimiter, model)
delete(r.inputLimiter, limiterKey)
delete(r.outputLimiter, limiterKey)
}

func getTimeUnitDuration(unit networkingv1alpha1.RateLimitUnit) time.Duration {
Expand Down
Loading
Loading