From dc174c2a9c32faa8377e88cf5fd9f76b9f999973 Mon Sep 17 00:00:00 2001 From: Maroon Ayoub Date: Wed, 30 Apr 2025 17:05:41 +0300 Subject: [PATCH 01/60] - implemented kvcache-aware-scorer - added configuration Signed-off-by: Maroon Ayoub --- Dockerfile | 23 ++- Makefile | 7 +- README.md | 17 +++ cmd/epp/main.go | 1 - go.mod | 19 ++- go.sum | 36 +++-- pkg/epp/backend/metrics/fake.go | 2 + pkg/epp/backend/metrics/metrics.go | 1 - pkg/epp/backend/metrics/metrics_test.go | 3 - pkg/epp/backend/metrics/pod_metrics_test.go | 2 + .../inferencemodel_reconciler_test.go | 1 - pkg/epp/datastore/datastore.go | 4 +- pkg/epp/datastore/datastore_test.go | 2 - pkg/epp/handlers/request.go | 7 + pkg/epp/scheduling/local_config.go | 76 +++++++++- pkg/epp/scheduling/plugins/filter/filter.go | 1 - .../plugins/scorer/kvcache-aware-scorer.go | 141 ++++++++++++++++++ .../{scorers => scorer}/load_based_scorer.go | 11 +- pkg/epp/scheduling/scheduler.go | 1 + pkg/epp/scheduling/scheduler_test.go | 1 - pkg/epp/scheduling/scorers_test.go | 4 +- test/e2e/epp/e2e_test.go | 1 - test/integration/bbr/hermetic_test.go | 6 +- test/integration/epp/hermetic_test.go | 45 ++++-- test/utils/utils.go | 1 - 25 files changed, 348 insertions(+), 65 deletions(-) create mode 100644 pkg/epp/scheduling/plugins/scorer/kvcache-aware-scorer.go rename pkg/epp/scheduling/plugins/{scorers => scorer}/load_based_scorer.go (88%) diff --git a/Dockerfile b/Dockerfile index a92cbb711..5f7631ee6 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,15 +3,26 @@ FROM quay.io/projectquay/golang:1.24 AS builder ARG TARGETOS ARG TARGETARCH -# ENV GOPROXY=https://goproxy.io,direct +# Install build tools +RUN dnf install -y gcc-c++ libstdc++ libstdc++-devel && dnf clean all WORKDIR /workspace + +## NeuralMagic internal repos pull config +ARG GIT_NM_USER +ARG NM_TOKEN +### use git token +RUN echo -e "machine github.com\n\tlogin ${GIT_NM_USER}\n\tpassword ${NM_TOKEN}" >> ~/.netrc +ENV GOPRIVATE=github.com/neuralmagic +ENV GIT_TERMINAL_PROMPT=1 
+ # Copy the Go Modules manifests COPY go.mod go.mod COPY go.sum go.sum # cache deps before building and copying source so that we don't need to re-download as much # and so that source changes don't invalidate our downloaded layer RUN go mod download +RUN rm -rf ~/.netrc # remove git token from the working filesystem only; NOTE it still persists in the earlier builder layer and in the build-arg history, prefer a BuildKit secret mount # Copy the go source COPY cmd ./cmd @@ -19,12 +30,20 @@ COPY pkg ./pkg COPY internal ./internal COPY api ./api +# HuggingFace tokenizer bindings +RUN mkdir -p lib +RUN curl -L https://github.com/daulet/tokenizers/releases/download/v1.20.2/libtokenizers.${TARGETOS}-${TARGETARCH}.tar.gz | tar -xz -C lib +RUN ranlib lib/*.a + # Build # the GOARCH has not a default value to allow the binary be built according to the host where the command # was called. For example, if we call make image-build in a local env which has the Apple Silicon M1 SO # the docker BUILDPLATFORM arg will be linux/arm64 when for Apple x86 it will be linux/amd64. Therefore, # by leaving it empty we can ensure that the container and binary shipped on it will have the same platform. -RUN CGO_ENABLED=0 GOOS=${TARGETOS:-linux} GOARCH=${TARGETARCH} go build -o bin/epp cmd/epp/main.go cmd/epp/health.go +ENV CGO_ENABLED=1 +ENV GOOS=${TARGETOS:-linux} +ENV GOARCH=${TARGETARCH} +RUN go build -o bin/epp -ldflags="-extldflags '-L$(pwd)/lib'" cmd/epp/main.go cmd/epp/health.go # Use distroless as minimal base image to package the manager binary # Refer to https://github.com/GoogleContainerTools/distroless for more details diff --git a/Makefile b/Makefile index 0bfb19fc7..bb4f078d9 100644 --- a/Makefile +++ b/Makefile @@ -489,7 +489,12 @@ buildah-build: check-builder load-version-json ## Build and push image (multi-ar .PHONY: image-build image-build: check-container-tool load-version-json ## Build container image using $(CONTAINER_TOOL) @printf "\033[33;1m==== Building container image $(IMG) ====\033[0m\n" - $(CONTAINER_TOOL) build --build-arg TARGETOS=$(TARGETOS) --build-arg TARGETARCH=$(TARGETARCH) -t $(IMG) . 
+ $(CONTAINER_TOOL) build --platform=$(TARGETOS)/$(TARGETARCH) \ + --build-arg TARGETOS=$(TARGETOS) \ + --build-arg TARGETARCH=$(TARGETARCH) \ + --build-arg GIT_NM_USER=$(GIT_NM_USER)\ + --build-arg NM_TOKEN=$(NM_TOKEN) \ + -t $(IMG) . .PHONY: image-push image-push: check-container-tool load-version-json ## Push container image $(IMG) to registry diff --git a/README.md b/README.md index 4cdb17811..12d4186ee 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,23 @@ This project offers tools for AI Inference, enabling developers to build [Inference Gateways]. +--- +## Temporary Fork Configuration + +To enable KVCacheAwareScorer, the following env vars must be configured: +``` +export ENABLE_KVCACHE_AWARE_SCORER=true +export KVCACHE_AWARE_SCORER_WEIGHT=1 +export KVCACHE_INDEXER_REDIS_ADDR= +export HF_TOKEN= +``` + +To enable LoadAwareScorer, the following env vars must be configured: +``` +export ENABLE_LOAD_AWARE_SCORER=true +export LOAD_AWARE_SCORER_WEIGHT=1 +``` +--- [Inference Gateways]:#concepts-and-definitions ## Concepts and Definitions diff --git a/cmd/epp/main.go b/cmd/epp/main.go index c0a87e62e..3c3832251 100644 --- a/cmd/epp/main.go +++ b/cmd/epp/main.go @@ -314,5 +314,4 @@ func verifyMetricMapping(mapping backendmetrics.MetricMapping, logger logr.Logge if mapping.LoraRequestInfo == nil { logger.Info("Not scraping metric: LoraRequestInfo") } - } diff --git a/go.mod b/go.mod index 7da237678..dff0542e9 100644 --- a/go.mod +++ b/go.mod @@ -1,12 +1,15 @@ module sigs.k8s.io/gateway-api-inference-extension -go 1.24.0 +go 1.24.1 + +toolchain go1.24.2 require ( github.com/elastic/crd-ref-docs v0.1.0 github.com/envoyproxy/go-control-plane/envoy v1.32.4 github.com/go-logr/logr v1.4.2 github.com/google/go-cmp v0.7.0 + github.com/neuralmagic/llm-d-kv-cache-manager v0.0.0-20250430102735-86595011431d github.com/onsi/ginkgo/v2 v2.23.4 github.com/onsi/gomega v1.37.0 github.com/prometheus/client_golang v1.22.0 @@ -41,7 +44,9 @@ require ( github.com/cenkalti/backoff/v4 
v4.3.0 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/cncf/xds/go v0.0.0-20241223141626-cff3c89139a3 // indirect + github.com/daulet/tokenizers v1.20.2 // indirect github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect + github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f // indirect github.com/emicklei/go-restful/v3 v3.11.0 // indirect github.com/envoyproxy/protoc-gen-validate v1.2.1 // indirect github.com/evanphx/json-patch/v5 v5.9.11 // indirect @@ -69,6 +74,7 @@ require ( github.com/google/uuid v1.6.0 // indirect github.com/gorilla/websocket v1.5.0 // indirect github.com/grpc-ecosystem/grpc-gateway/v2 v2.20.0 // indirect + github.com/hashicorp/golang-lru/v2 v2.0.7 // indirect github.com/huandu/xstrings v1.3.3 // indirect github.com/imdario/mergo v0.3.11 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect @@ -90,6 +96,7 @@ require ( github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect github.com/prometheus/procfs v0.15.1 // indirect + github.com/redis/go-redis/v9 v9.7.3 // indirect github.com/spf13/cobra v1.8.1 // indirect github.com/spf13/pflag v1.0.5 // indirect github.com/stoewer/go-strcase v1.3.0 // indirect @@ -104,15 +111,15 @@ require ( go.opentelemetry.io/otel/trace v1.34.0 // indirect go.opentelemetry.io/proto/otlp v1.3.1 // indirect go.uber.org/automaxprocs v1.6.0 // indirect - golang.org/x/crypto v0.36.0 // indirect + golang.org/x/crypto v0.37.0 // indirect golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 // indirect golang.org/x/mod v0.24.0 // indirect - golang.org/x/net v0.38.0 // indirect + golang.org/x/net v0.39.0 // indirect golang.org/x/oauth2 v0.27.0 // indirect - golang.org/x/sync v0.12.0 // indirect + golang.org/x/sync v0.13.0 // indirect golang.org/x/sys v0.32.0 // indirect - golang.org/x/term v0.30.0 // indirect - golang.org/x/text v0.23.0 // indirect + 
golang.org/x/term v0.31.0 // indirect + golang.org/x/text v0.24.0 // indirect golang.org/x/time v0.7.0 // indirect golang.org/x/tools v0.31.0 // indirect golang.org/x/xerrors v0.0.0-20231012003039-104605ab7028 // indirect diff --git a/go.sum b/go.sum index 11c244d44..ea299e2fd 100644 --- a/go.sum +++ b/go.sum @@ -16,6 +16,10 @@ github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM= github.com/blang/semver/v4 v4.0.0/go.mod h1:IbckMUScFkM3pff0VJDNKRiT6TG/YpiHIM2yvyW5YoQ= +github.com/bsm/ginkgo/v2 v2.12.0 h1:Ny8MWAHyOepLGlLKYmXG4IEkioBysk6GpaRTLC8zwWs= +github.com/bsm/ginkgo/v2 v2.12.0/go.mod h1:SwYbGRRDovPVboqFv0tPTcG1sN61LM1Z4ARdbAV9g4c= +github.com/bsm/gomega v1.27.10 h1:yeMWxP2pV2fG3FgAODIY8EiRE3dy0aeFYt4l7wh6yKA= +github.com/bsm/gomega v1.27.10/go.mod h1:JyEr/xRbxbtgWNi8tIEVPUYZ5Dzef52k01W3YH0H+O0= github.com/cenkalti/backoff/v4 v4.3.0 h1:MyRJ/UdXutAwSAT+s3wNd7MfTIcy71VQueUuFK343L8= github.com/cenkalti/backoff/v4 v4.3.0/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE= github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= @@ -24,10 +28,14 @@ github.com/cncf/xds/go v0.0.0-20241223141626-cff3c89139a3 h1:boJj011Hh+874zpIySe github.com/cncf/xds/go v0.0.0-20241223141626-cff3c89139a3/go.mod h1:W+zGtBO5Y1IgJhy4+A9GOqVhqLpfZi+vwmdNXUehLA8= github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= +github.com/daulet/tokenizers v1.20.2 h1:tlq/vIOiBTKDPets3596aFvmJYLn3XI6LFKq4q9LKhQ= +github.com/daulet/tokenizers v1.20.2/go.mod h1:tGnMdZthXdcWY6DGD07IygpwJqiPvG85FQUnhs/wSCs= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1/go.mod 
h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f h1:lO4WD4F/rVNCu3HqELle0jiPLLBs70cWOduZpkS1E78= +github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f/go.mod h1:cuUVRXasLTGF7a8hSLbxyZXjz+1KgoB3wDUb6vlszIc= github.com/elastic/crd-ref-docs v0.1.0 h1:Cr5kz89QB3Iuuj7dhAfLMApCrChEGAaIBTxGk/xuRKw= github.com/elastic/crd-ref-docs v0.1.0/go.mod h1:X83mMBdJt05heJUYiS3T0yJ/JkCuliuhSUNav5Gjo/U= github.com/emicklei/go-restful/v3 v3.11.0 h1:rAQeMHw1c7zTmncogyy8VvRZwtkmkZ4FxERmMY4rD+g= @@ -100,6 +108,8 @@ github.com/gorilla/websocket v1.5.0 h1:PPwGk2jz7EePpoHN/+ClbZu8SPxiqlu12wZP/3sWm github.com/gorilla/websocket v1.5.0/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE= github.com/grpc-ecosystem/grpc-gateway/v2 v2.20.0 h1:bkypFPDjIYGfCYD5mRBvpqxfYX1YCS1PXdKYWi8FsN0= github.com/grpc-ecosystem/grpc-gateway/v2 v2.20.0/go.mod h1:P+Lt/0by1T8bfcF3z737NnSbmxQAppXMRziHUxPOC8k= +github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k= +github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM= github.com/huandu/xstrings v1.3.3 h1:/Gcsuc1x8JVbJ9/rlye4xZnVAbEkGauT8lbebqcQws4= github.com/huandu/xstrings v1.3.3/go.mod h1:y5/lhBue+AyNmUVz9RLU9xbLR0o4KIIExikq4ovT0aE= github.com/imdario/mergo v0.3.11 h1:3tnifQM4i+fbajXKBHXWEH+KvNHqojZ778UH75j3bGA= @@ -147,6 +157,8 @@ github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f h1:y5//uYreIhSUg3J1GEMiLbxo1LJaP8RfCpH6pymGZus= github.com/mxk/go-flowrate 
v0.0.0-20140419014527-cca7078d478f/go.mod h1:ZdcZmHo+o7JKHSa8/e818NopupXU1YMK5fe1lsApnBw= +github.com/neuralmagic/llm-d-kv-cache-manager v0.0.0-20250430102735-86595011431d h1:6YSxvAG4ve5jy0nTLs509OMU5fuiQ3JNQdZxqiu8PgQ= +github.com/neuralmagic/llm-d-kv-cache-manager v0.0.0-20250430102735-86595011431d/go.mod h1:VB+KcEemkO1ZKpz/hgUPQMU9oSLv2uCLW6y6c+r8jkQ= github.com/nxadm/tail v1.4.8 h1:nPr65rt6Y5JFSKQO7qToXr7pePgD6Gwiw05lkbyAQTE= github.com/nxadm/tail v1.4.8/go.mod h1:+ncqLTQzXmGhMZNUePPaPqPvBxHAIsmXswZKocGu+AU= github.com/onsi/ginkgo v1.16.5 h1:8xi0RTUf59SOSfEtZMvwTvXYMzG4gV23XVHOZiXNtnE= @@ -172,6 +184,8 @@ github.com/prometheus/common v0.63.0 h1:YR/EIY1o3mEFP/kZCD7iDMnLPlGyuU2Gb3HIcXnA github.com/prometheus/common v0.63.0/go.mod h1:VVFF/fBIoToEnWRVkYoXEkq3R3paCoxG9PXP74SnV18= github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc= github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk= +github.com/redis/go-redis/v9 v9.7.3 h1:YpPyAayJV+XErNsatSElgRZZVCwXX9QzkKYNvO7x0wM= +github.com/redis/go-redis/v9 v9.7.3/go.mod h1:bGUrSggJ9X9GUmZpZNEOQKaANxSGgOEBRltRTZHSvrA= github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII= github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= @@ -226,8 +240,8 @@ go.uber.org/zap v1.27.0/go.mod h1:GB2qFLM7cTU87MWRP2mPIjqfIDnGu+VIO4V/SdhGo2E= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= -golang.org/x/crypto v0.36.0 h1:AnAEvhDddvBdpY+uR+MyHmuZzzNqXSe/GvuDeob5L34= -golang.org/x/crypto v0.36.0/go.mod 
h1:Y4J0ReaxCR1IMaabaSMugxJES1EpwhBHhv2bDHklZvc= +golang.org/x/crypto v0.37.0 h1:kJNSjF/Xp7kU0iB2Z+9viTPMW4EqqsrywMXLJOOsXSE= +golang.org/x/crypto v0.37.0/go.mod h1:vg+k43peMZ0pUMhYmVAWysMK35e6ioLh3wB8ZCAfbVc= golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 h1:2dVuKD2vS7b0QIHQbpyTISPd0LeHDbnYEryqj5Q1ug8= golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56/go.mod h1:M4RDyNAINzryxdtnbRXRL/OHtkFuWGRjvuhBJpk2IlY= golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= @@ -238,17 +252,15 @@ golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= -golang.org/x/net v0.38.0 h1:vRMAPTMaeGqVhG5QyLJHqNDwecKTomGeqbnfZyKlBI8= -golang.org/x/net v0.38.0/go.mod h1:ivrbrMbzFq5J41QOQh0siUuly180yBYtLp+CKbEaFx8= -golang.org/x/oauth2 v0.25.0 h1:CY4y7XT9v0cRI9oupztF8AgiIu99L/ksR/Xp/6jrZ70= -golang.org/x/oauth2 v0.25.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI= +golang.org/x/net v0.39.0 h1:ZCu7HMWDxpXpaiKdhzIfaltL9Lp31x/3fCP11bc6/fY= +golang.org/x/net v0.39.0/go.mod h1:X7NRbYVEA+ewNkCNyJ513WmMdQ3BineSwVtN2zD/d+E= golang.org/x/oauth2 v0.27.0 h1:da9Vo7/tDv5RH/7nZDz1eMGS/q1Vv1N/7FCrBhI9I3M= golang.org/x/oauth2 v0.27.0/go.mod h1:onh5ek6nERTohokkhCD/y2cV4Do3fxFHFuAejCkRWT8= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.12.0 h1:MHc5BpPuC30uJk597Ri8TV3CNZcTLu6B6z4lJy+g6Jw= -golang.org/x/sync v0.12.0/go.mod 
h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= +golang.org/x/sync v0.13.0 h1:AauUjRAJ9OSnvULf/ARrrVywoJDy0YS2AwQ98I37610= +golang.org/x/sync v0.13.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= @@ -256,13 +268,13 @@ golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.32.0 h1:s77OFDvIQeibCmezSnk/q6iAfkdiQaJi4VzroCFrN20= golang.org/x/sys v0.32.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= -golang.org/x/term v0.30.0 h1:PQ39fJZ+mfadBm0y5WlL4vlM7Sx1Hgf13sMIY2+QS9Y= -golang.org/x/term v0.30.0/go.mod h1:NYYFdzHoI5wRh/h5tDMdMqCqPJZEuNqVR5xJLd/n67g= +golang.org/x/term v0.31.0 h1:erwDkOK1Msy6offm1mOgvspSkslFnIGsFnxOKoufg3o= +golang.org/x/term v0.31.0/go.mod h1:R4BeIy7D95HzImkxGkTW1UQTtP54tio2RyHz7PwK0aw= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= -golang.org/x/text v0.23.0 h1:D71I7dUrlY+VX0gQShAThNGHFxZ13dGLBHQLVl1mJlY= -golang.org/x/text v0.23.0/go.mod h1:/BLNzu4aZCJ1+kcD0DNRotWKage4q2rGVAg4o22unh4= +golang.org/x/text v0.24.0 h1:dd5Bzh4yt5KYA8f9CJHCP4FB4D51c2c6JvN37xJJkJ0= +golang.org/x/text v0.24.0/go.mod h1:L8rBsPeo2pSS+xqN0d5u2ikmjtmoJbDBT1b7nHvFCdU= golang.org/x/time v0.7.0 h1:ntUhktv3OPE6TgYxXWv9vKvUSJyIFJlyohwbkEwPrKQ= golang.org/x/time v0.7.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= diff 
--git a/pkg/epp/backend/metrics/fake.go b/pkg/epp/backend/metrics/fake.go index ec97c6dea..d1b373fdc 100644 --- a/pkg/epp/backend/metrics/fake.go +++ b/pkg/epp/backend/metrics/fake.go @@ -40,9 +40,11 @@ func (fpm *FakePodMetrics) String() string { func (fpm *FakePodMetrics) GetPod() *Pod { return fpm.Pod } + func (fpm *FakePodMetrics) GetMetrics() *Metrics { return fpm.Metrics } + func (fpm *FakePodMetrics) UpdatePod(pod *corev1.Pod) { fpm.Pod = toInternalPod(pod) } diff --git a/pkg/epp/backend/metrics/metrics.go b/pkg/epp/backend/metrics/metrics.go index 96814b4bb..efe847dd1 100644 --- a/pkg/epp/backend/metrics/metrics.go +++ b/pkg/epp/backend/metrics/metrics.go @@ -47,7 +47,6 @@ func (p *PodMetricsClientImpl) FetchMetrics( existing *Metrics, port int32, ) (*Metrics, error) { - // Currently the metrics endpoint is hard-coded, which works with vLLM. // TODO(https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/16): Consume this from InferencePool config. url := "http://" + pod.Address + ":" + strconv.Itoa(int(port)) + "/metrics" diff --git a/pkg/epp/backend/metrics/metrics_test.go b/pkg/epp/backend/metrics/metrics_test.go index e3b45b94a..c69f2c67e 100644 --- a/pkg/epp/backend/metrics/metrics_test.go +++ b/pkg/epp/backend/metrics/metrics_test.go @@ -58,7 +58,6 @@ func makeMetricFamily(name string, metrics ...*dto.Metric) *dto.MetricFamily { // --- Tests --- func TestGetMetric(t *testing.T) { - metricFamilies := map[string]*dto.MetricFamily{ "metric1": makeMetricFamily("metric1", makeMetric(map[string]string{"label1": "value1"}, 1.0, 1000), @@ -166,7 +165,6 @@ func TestGetMetric(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - gotMetric, err := p.getMetric(metricFamilies, tt.spec) if tt.wantError { @@ -240,7 +238,6 @@ func TestLabelsMatch(t *testing.T) { } func TestGetLatestLoraMetric(t *testing.T) { - testCases := []struct { name string metricFamilies map[string]*dto.MetricFamily diff --git 
a/pkg/epp/backend/metrics/pod_metrics_test.go b/pkg/epp/backend/metrics/pod_metrics_test.go index e79c1bf0b..8d5f064ad 100644 --- a/pkg/epp/backend/metrics/pod_metrics_test.go +++ b/pkg/epp/backend/metrics/pod_metrics_test.go @@ -88,10 +88,12 @@ type fakeDataStore struct{} func (f *fakeDataStore) PoolGet() (*v1alpha2.InferencePool, error) { return &v1alpha2.InferencePool{Spec: v1alpha2.InferencePoolSpec{TargetPortNumber: 8000}}, nil } + func (f *fakeDataStore) PodGetAll() []PodMetrics { // Not implemented. return nil } + func (f *fakeDataStore) PodList(func(PodMetrics) bool) []PodMetrics { // Not implemented. return nil diff --git a/pkg/epp/controller/inferencemodel_reconciler_test.go b/pkg/epp/controller/inferencemodel_reconciler_test.go index 80c30e191..024b69016 100644 --- a/pkg/epp/controller/inferencemodel_reconciler_test.go +++ b/pkg/epp/controller/inferencemodel_reconciler_test.go @@ -227,7 +227,6 @@ func TestInferenceModelReconciler(t *testing.T) { if diff := diffStore(ds, diffStoreParams{wantPool: pool, wantModels: test.wantModels}); diff != "" { t.Errorf("Unexpected diff (+got/-want): %s", diff) } - }) } } diff --git a/pkg/epp/datastore/datastore.go b/pkg/epp/datastore/datastore.go index 288c4d7b1..630b71199 100644 --- a/pkg/epp/datastore/datastore.go +++ b/pkg/epp/datastore/datastore.go @@ -38,9 +38,7 @@ const ( ModelNameIndexKey = "spec.modelName" ) -var ( - errPoolNotSynced = errors.New("InferencePool is not initialized in data store") -) +var errPoolNotSynced = errors.New("InferencePool is not initialized in data store") // The datastore is a local cache of relevant data for the given InferencePool (currently all pulled from k8s-api) type Datastore interface { diff --git a/pkg/epp/datastore/datastore_test.go b/pkg/epp/datastore/datastore_test.go index b6466e6b2..248e95b48 100644 --- a/pkg/epp/datastore/datastore_test.go +++ b/pkg/epp/datastore/datastore_test.go @@ -204,7 +204,6 @@ func TestModel(t *testing.T) { existing := 
ds.ModelDelete(types.NamespacedName{Name: model1ts.Name, Namespace: model1ts.Namespace}) got := ds.ModelGet(tsModel) return existing != nil && got == nil - }, wantOpResult: true, wantModels: []*v1alpha2.InferenceModel{model2chat}, @@ -226,7 +225,6 @@ func TestModel(t *testing.T) { if diff := testutil.DiffModelLists(test.wantModels, ds.ModelGetAll()); diff != "" { t.Errorf("Unexpected models diff: %s", diff) } - }) } } diff --git a/pkg/epp/handlers/request.go b/pkg/epp/handlers/request.go index 203afc2f0..4997a8b30 100644 --- a/pkg/epp/handlers/request.go +++ b/pkg/epp/handlers/request.go @@ -31,6 +31,8 @@ import ( logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) +const emptyPrompt = "" + // HandleRequestBody always returns the requestContext even in the error case, as the request context is used in error handling. func (s *StreamingServer) HandleRequestBody( ctx context.Context, @@ -68,6 +70,7 @@ func (s *StreamingServer) HandleRequestBody( Headers: reqCtx.RequestHeaders, ResolvedTargetModel: modelName, Critical: modelObj.Spec.Criticality != nil && *modelObj.Spec.Criticality == v1alpha2.Critical, + Prompt: emptyPrompt, } logger.V(logutil.DEBUG).Info("LLM request assembled", "request", llmReq) @@ -76,6 +79,10 @@ func (s *StreamingServer) HandleRequestBody( if llmReq.Model != llmReq.ResolvedTargetModel { requestBodyMap["model"] = llmReq.ResolvedTargetModel } + // Extract prompt from the request body. + if prompt, ok := requestBodyMap["prompt"].(string); ok { + llmReq.Prompt = prompt + } requestBodyBytes, err = json.Marshal(requestBodyMap) if err != nil { diff --git a/pkg/epp/scheduling/local_config.go b/pkg/epp/scheduling/local_config.go index 87098ae0d..931c28487 100644 --- a/pkg/epp/scheduling/local_config.go +++ b/pkg/epp/scheduling/local_config.go @@ -17,12 +17,78 @@ limitations under the License. 
package scheduling import ( - "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins/scorers" + "context" + "os" + "strconv" + + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins/scorer" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" +) + +const ( + kvCacheScorerEnablementEnvVar = "ENABLE_KVCACHE_AWARE_SCORER" + loadAwareScorerEnablementEnvVar = "ENABLE_LOAD_AWARE_SCORER" + + kvCacheScorerWeightEnvVar = "KVCACHE_AWARE_SCORER_WEIGHT" + loadAwareScorerWeightEnvVar = "LOAD_AWARE_SCORER_WEIGHT" ) -func init() { - defaultConfig.scorers[&scorers.LoadBasedScorer{}] = 1.0 +func setDefaultConfig() { + // since the default config is a global variable, we add this function to minimize rebase conflicts. + // this configuration is a temporary state, it should be better streamlined. + setLoadBasedScorer() + setKVCacheAwareScorer() +} + +func setLoadBasedScorer() { + ctx := context.Background() + loggerDebug := log.FromContext(ctx).WithName("scheduler_config").V(logutil.DEBUG) + + if os.Getenv(loadAwareScorerEnablementEnvVar) != "true" { + loggerDebug.Info("Skipping LoadAwareScorer creation as it is not enabled") + return + } + + loadBasedScorerWeight := 1 + if weightStr := os.Getenv(loadAwareScorerWeightEnvVar); weightStr != "" { + var err error + loadBasedScorerWeightInt64, err := strconv.ParseInt(weightStr, 10, 32) + if err != nil { + loggerDebug.Error(err, "Failed to parse LOAD_BASED_SCORER_WEIGHT") + } + + loadBasedScorerWeight = int(loadBasedScorerWeightInt64) + } + + defaultConfig.scorers[&scorer.LoadAwareScorer{}] = loadBasedScorerWeight +} + +func setKVCacheAwareScorer() { + ctx := context.Background() + loggerDebug := log.FromContext(ctx).WithName("scheduler_config").V(logutil.DEBUG) + + if os.Getenv(kvCacheScorerEnablementEnvVar) != "true" { + loggerDebug.Info("Skipping KVCacheAwareScorer creation as it is not enabled") + return + } + + kvCacheScorer, err := 
scorer.NewKVCacheAwareScorer(ctx) + if err != nil { + loggerDebug.Error(err, "Failed to create KVCacheAwareScorer") + return + } + + kvCacheScorerWeight := 1 + if weightStr := os.Getenv(kvCacheScorerWeightEnvVar); weightStr != "" { + var err error + kvCacheScorerWeightInt64, err := strconv.ParseInt(weightStr, 10, 32) + if err != nil { + loggerDebug.Error(err, "Failed to parse KVCACHE_SCORER_WEIGHT") + } + + kvCacheScorerWeight = int(kvCacheScorerWeightInt64) + } - // Added as a reference - // defaultConfig.filters = []plugins.Filter{filter.PDFilter} + defaultConfig.scorers[kvCacheScorer] = kvCacheScorerWeight } diff --git a/pkg/epp/scheduling/plugins/filter/filter.go b/pkg/epp/scheduling/plugins/filter/filter.go index 86620aa9f..a8c68ea9a 100644 --- a/pkg/epp/scheduling/plugins/filter/filter.go +++ b/pkg/epp/scheduling/plugins/filter/filter.go @@ -214,7 +214,6 @@ var LoRAAffinityFilter = &baseFilter{ // - Filtered slice of pod metrics based on affinity and availability // - Error if any issues occur during filtering func loRASoftAffinityFilterFunc(ctx *types.SchedulingContext, pods []types.Pod) []types.Pod { - // Pre-allocate slices with estimated capacity filtered_affinity := make([]types.Pod, 0, len(pods)) filtered_available := make([]types.Pod, 0, len(pods)) diff --git a/pkg/epp/scheduling/plugins/scorer/kvcache-aware-scorer.go b/pkg/epp/scheduling/plugins/scorer/kvcache-aware-scorer.go new file mode 100644 index 000000000..ce23e2c7b --- /dev/null +++ b/pkg/epp/scheduling/plugins/scorer/kvcache-aware-scorer.go @@ -0,0 +1,141 @@ +/* +Copyright 2025 The Neural Magic Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package scorer + +import ( + "context" + "fmt" + "os" + + kvcache "github.com/neuralmagic/llm-d-kv-cache-manager/pkg/kv-cache" + + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" + + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" +) + +const ( + kvCacheAwareScorerName = "kvcache-aware-scorer" + kvCacheRedisEnvVar = "KVCACHE_INDEXER_REDIS_ADDR" + huggingFaceTokenEnvVar = "HF_TOKEN" +) + +// KVCacheAwareScorer is a concrete implementation of the Scorer interface. +// It uses the KVCacheIndexer to score pods based on KVCache awareness. +type KVCacheAwareScorer struct { + kvCacheIndexer *kvcache.Indexer +} + +// NewKVCacheAwareScorer creates a new KVCacheAwareScorer instance. +// It builds the KVCacheIndexer from the default configuration, overridden by the KVCACHE_INDEXER_REDIS_ADDR +// and HF_TOKEN environment variables, and starts it in a goroutine with the given context. +// +// It returns an error if either environment variable is unset or the indexer cannot be created. 
+func NewKVCacheAwareScorer(ctx context.Context) (plugins.Scorer, error) { + config := kvcache.NewDefaultConfig() + + redisAddr := os.Getenv(kvCacheRedisEnvVar) + if redisAddr != "" { + config.KVBlockIndexerConfig.RedisKVBlockIndexerConfig.RedisAddr = redisAddr + } else { + return nil, fmt.Errorf("environment variable %s is not set", kvCacheRedisEnvVar) + } + + hfToken := os.Getenv(huggingFaceTokenEnvVar) + if hfToken != "" { + config.TokenizersPoolConfig.HFTokenizerConfig.HuggingFaceToken = hfToken + } else { + return nil, fmt.Errorf("environment variable %s is not set", huggingFaceTokenEnvVar) + } + + kvCacheIndexer, err := kvcache.NewKVCacheIndexer(config) + if err != nil { + return nil, fmt.Errorf("failed to create KVCacheIndexer: %w", err) + } + + go kvCacheIndexer.Run(ctx) + + return &KVCacheAwareScorer{ + kvCacheIndexer: kvCacheIndexer, + }, nil +} + +// Name returns the name of the scorer. +func (s *KVCacheAwareScorer) Name() string { + return kvCacheAwareScorerName +} + +// Score scores the provided pod based on the KVCache index state. +// This function is not concurrent-safe and should be called in a +// single-threaded manner. 
+func (s *KVCacheAwareScorer) Score(ctx *types.SchedulingContext, pods []types.Pod) map[types.Pod]float64 { + loggerDebug := log.FromContext(ctx).WithName(kvCacheAwareScorerName).V(logutil.DEBUG) + if ctx.Req == nil { + loggerDebug.Info("Request is nil, skipping scoring") + return nil + } + + scores, err := s.kvCacheIndexer.GetPodScores(ctx.Context, ctx.Req.Prompt, ctx.Req.Model, nil) + if err != nil { + loggerDebug.Error(err, "Failed to get pod scores") + return nil + } + + return indexerScoresToNormalizedScoredPods(pods, scores) +} + +func getMinMax(scores map[string]int) (int, int) { + minScore := int(^uint(0) >> 1) // max int + maxScore := -1 + + for _, score := range scores { + if score < minScore { + minScore = score + } + if score > maxScore { + maxScore = score + } + } + + return minScore, maxScore +} + +func indexerScoresToNormalizedScoredPods(pods []types.Pod, scores map[string]int) map[types.Pod]float64 { + scoredPods := make(map[types.Pod]float64) + minScore, maxScore := getMinMax(scores) + + for _, pod := range pods { + metricsPod := pod.GetPod() + if metricsPod == nil { + continue + } + + if score, ok := scores[metricsPod.Address]; ok { + if minScore == maxScore { + scoredPods[pod] = 1.0 + continue + } + + scoredPods[pod] = float64(score-minScore) / float64(maxScore-minScore) + } else { + scoredPods[pod] = 0.0 + } + } + return scoredPods +} diff --git a/pkg/epp/scheduling/plugins/scorers/load_based_scorer.go b/pkg/epp/scheduling/plugins/scorer/load_based_scorer.go similarity index 88% rename from pkg/epp/scheduling/plugins/scorers/load_based_scorer.go rename to pkg/epp/scheduling/plugins/scorer/load_based_scorer.go index 5bea87c95..d24f49b33 100644 --- a/pkg/epp/scheduling/plugins/scorers/load_based_scorer.go +++ b/pkg/epp/scheduling/plugins/scorer/load_based_scorer.go @@ -13,17 +13,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ -package scorers + +package scorer import ( "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/config" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" ) -type LoadBasedScorer struct{} +type LoadAwareScorer struct{} -func (s LoadBasedScorer) Name() string { - return "load based scorer" +func (s *LoadAwareScorer) Name() string { + return "load-aware-scorer" } // Score scores the given pod in range of 0-1 @@ -33,7 +34,7 @@ func (s LoadBasedScorer) Name() string { // Pod with requests in the queue will get score between 0.5 and 0. // Score 0 will get pod with number of requests in the queue equal to the threshold used in load-based filter (QueueingThresholdLoRA) // In future pods with additional capacity will get score higher than 0.5 -func (s LoadBasedScorer) Score(ctx *types.SchedulingContext, pods []types.Pod) map[types.Pod]float64 { +func (s *LoadAwareScorer) Score(ctx *types.SchedulingContext, pods []types.Pod) map[types.Pod]float64 { scoredPods := make(map[types.Pod]float64) for _, pod := range pods { diff --git a/pkg/epp/scheduling/scheduler.go b/pkg/epp/scheduling/scheduler.go index 9bad61316..f4e1714d4 100644 --- a/pkg/epp/scheduling/scheduler.go +++ b/pkg/epp/scheduling/scheduler.go @@ -69,6 +69,7 @@ var ( ) func NewScheduler(datastore Datastore) *Scheduler { + setDefaultConfig() return NewSchedulerWithConfig(datastore, defaultConfig) } diff --git a/pkg/epp/scheduling/scheduler_test.go b/pkg/epp/scheduling/scheduler_test.go index e6d229aee..cda65496e 100644 --- a/pkg/epp/scheduling/scheduler_test.go +++ b/pkg/epp/scheduling/scheduler_test.go @@ -536,7 +536,6 @@ func (tp *TestPlugin) Filter(ctx *types.SchedulingContext, pods []types.Pod) []t tp.ReceivedRequestHeaders[key] = value } return findPods(ctx, tp.FilterRes...) 
- } func (tp *TestPlugin) Score(ctx *types.SchedulingContext, pods []types.Pod) map[types.Pod]float64 { diff --git a/pkg/epp/scheduling/scorers_test.go b/pkg/epp/scheduling/scorers_test.go index 365b2375b..a98a838b1 100644 --- a/pkg/epp/scheduling/scorers_test.go +++ b/pkg/epp/scheduling/scorers_test.go @@ -25,7 +25,7 @@ import ( backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" // Import config for thresholds "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins/picker" - "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins/scorers" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins/scorer" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" ) @@ -40,7 +40,7 @@ func TestScorers(t *testing.T) { }{ { name: "load based scorer", - scorer: &scorers.LoadBasedScorer{}, + scorer: &scorer.LoadAwareScorer{}, req: &types.LLMRequest{ Model: "critical", ResolvedTargetModel: "critical", diff --git a/test/e2e/epp/e2e_test.go b/test/e2e/epp/e2e_test.go index 7240cebc4..f0220b301 100644 --- a/test/e2e/epp/e2e_test.go +++ b/test/e2e/epp/e2e_test.go @@ -87,7 +87,6 @@ var _ = ginkgo.Describe("InferencePool", func() { return nil }, readyTimeout, curlInterval).Should(gomega.Succeed()) - }) }) }) diff --git a/test/integration/bbr/hermetic_test.go b/test/integration/bbr/hermetic_test.go index b99186db9..07b846de2 100644 --- a/test/integration/bbr/hermetic_test.go +++ b/test/integration/bbr/hermetic_test.go @@ -122,7 +122,8 @@ func TestFullDuplexStreamed_BodyBasedRouting(t *testing.T) { RawValue: []byte("foo"), }, }, - }}, + }, + }, }, }, }, @@ -187,7 +188,8 @@ func TestFullDuplexStreamed_BodyBasedRouting(t *testing.T) { RawValue: []byte("sql-lora-sheddable"), }, }, - }}, + }, + }, }, }, }, diff --git a/test/integration/epp/hermetic_test.go b/test/integration/epp/hermetic_test.go index 
79b619fd6..45e99dec9 100644 --- a/test/integration/epp/hermetic_test.go +++ b/test/integration/epp/hermetic_test.go @@ -121,7 +121,8 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { KVCacheUsagePercent: 0.2, }, }, - wantMetrics: map[string]string{`inference_model_request_total`: ` + wantMetrics: map[string]string{ + `inference_model_request_total`: ` # HELP inference_model_request_total [ALPHA] Counter of inference model requests broken out for each model and target model. # TYPE inference_model_request_total counter inference_model_request_total{model_name="my-model",target_model_name="my-model-12345"} 1 @@ -153,7 +154,8 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { RawValue: []byte(strconv.Itoa(76)), }, }, - }}, + }, + }, }, }, }, @@ -237,7 +239,8 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { RawValue: []byte(strconv.Itoa(76)), }, }, - }}, + }, + }, }, }, }, @@ -321,7 +324,8 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { RawValue: []byte(strconv.Itoa(76)), }, }, - }}, + }, + }, }, }, }, @@ -454,7 +458,8 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { RawValue: []byte(strconv.Itoa(76)), }, }, - }}, + }, + }, }, }, }, @@ -565,7 +570,8 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { RawValue: []byte(strconv.Itoa(76)), }, }, - }}, + }, + }, }, }, }, @@ -676,7 +682,8 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { RawValue: []byte(strconv.Itoa(74)), }, }, - }}, + }, + }, }, }, }, @@ -924,35 +931,40 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { Request: &extProcPb.ProcessingRequest_ResponseBody{ ResponseBody: &extProcPb.HttpBody{ Body: []byte(`data: 
{"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"food-review-1","choices":[{"index":0,"text":"NEVER","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`), - EndOfStream: false}, + EndOfStream: false, + }, }, }, { Request: &extProcPb.ProcessingRequest_ResponseBody{ ResponseBody: &extProcPb.HttpBody{ Body: []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"food-review-1","choices":[{"index":0,"text":"GONNA","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`), - EndOfStream: false}, + EndOfStream: false, + }, }, }, { Request: &extProcPb.ProcessingRequest_ResponseBody{ ResponseBody: &extProcPb.HttpBody{ Body: []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"food-review-1","choices":[{"index":0,"text":"GIVE","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`), - EndOfStream: false}, + EndOfStream: false, + }, }, }, { Request: &extProcPb.ProcessingRequest_ResponseBody{ ResponseBody: &extProcPb.HttpBody{ Body: []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"food-review-1","choices":[{"index":0,"text":"YOU","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`), - EndOfStream: false}, + EndOfStream: false, + }, }, }, { Request: &extProcPb.ProcessingRequest_ResponseBody{ ResponseBody: &extProcPb.HttpBody{ Body: []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"food-review-1","choices":[{"index":0,"text":"UP","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`), - EndOfStream: false}, + EndOfStream: false, + }, }, }, { @@ -961,14 +973,16 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { Body: []byte(`data: 
{"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"food-review-1","choices":[],"usage":{"prompt_tokens":7,"total_tokens":17,"completion_tokens":10}} data: [DONE]`, ), - EndOfStream: false}, + EndOfStream: false, + }, }, }, { Request: &extProcPb.ProcessingRequest_ResponseBody{ ResponseBody: &extProcPb.HttpBody{ Body: []byte(""), - EndOfStream: true}, + EndOfStream: true, + }, }, }, }, @@ -1172,7 +1186,8 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { RawValue: []byte("192.168.1.1:8000"), }, }, - }}, + }, + }, }, }, }, diff --git a/test/utils/utils.go b/test/utils/utils.go index 1ec0fbaae..e6add0b6d 100644 --- a/test/utils/utils.go +++ b/test/utils/utils.go @@ -240,7 +240,6 @@ func ExecCommandInPod( podNamespace, podName, containerName string, cmd []string, ) (string, error) { - parameterCodec := runtime.NewParameterCodec(scheme) req := kubeClient.CoreV1().RESTClient(). From 3476f5949cbdcf5329d161f363d27718f683c83e Mon Sep 17 00:00:00 2001 From: Maroon Ayoub Date: Wed, 30 Apr 2025 17:13:42 +0300 Subject: [PATCH 02/60] undo gofumpt Signed-off-by: Maroon Ayoub --- cmd/epp/main.go | 1 + pkg/epp/backend/metrics/fake.go | 2 - pkg/epp/backend/metrics/metrics.go | 1 + pkg/epp/backend/metrics/metrics_test.go | 3 ++ pkg/epp/backend/metrics/pod_metrics_test.go | 2 - .../inferencemodel_reconciler_test.go | 1 + pkg/epp/datastore/datastore.go | 4 +- pkg/epp/datastore/datastore_test.go | 2 + pkg/epp/scheduling/plugins/filter/filter.go | 1 + pkg/epp/scheduling/scheduler_test.go | 1 + test/e2e/epp/e2e_test.go | 1 + test/integration/bbr/hermetic_test.go | 6 +-- test/integration/epp/hermetic_test.go | 45 +++++++------------ test/utils/utils.go | 1 + 14 files changed, 32 insertions(+), 39 deletions(-) diff --git a/cmd/epp/main.go b/cmd/epp/main.go index 3c3832251..c0a87e62e 100644 --- a/cmd/epp/main.go +++ b/cmd/epp/main.go @@ -314,4 +314,5 @@ func verifyMetricMapping(mapping backendmetrics.MetricMapping, 
logger logr.Logge if mapping.LoraRequestInfo == nil { logger.Info("Not scraping metric: LoraRequestInfo") } + } diff --git a/pkg/epp/backend/metrics/fake.go b/pkg/epp/backend/metrics/fake.go index d1b373fdc..ec97c6dea 100644 --- a/pkg/epp/backend/metrics/fake.go +++ b/pkg/epp/backend/metrics/fake.go @@ -40,11 +40,9 @@ func (fpm *FakePodMetrics) String() string { func (fpm *FakePodMetrics) GetPod() *Pod { return fpm.Pod } - func (fpm *FakePodMetrics) GetMetrics() *Metrics { return fpm.Metrics } - func (fpm *FakePodMetrics) UpdatePod(pod *corev1.Pod) { fpm.Pod = toInternalPod(pod) } diff --git a/pkg/epp/backend/metrics/metrics.go b/pkg/epp/backend/metrics/metrics.go index efe847dd1..96814b4bb 100644 --- a/pkg/epp/backend/metrics/metrics.go +++ b/pkg/epp/backend/metrics/metrics.go @@ -47,6 +47,7 @@ func (p *PodMetricsClientImpl) FetchMetrics( existing *Metrics, port int32, ) (*Metrics, error) { + // Currently the metrics endpoint is hard-coded, which works with vLLM. // TODO(https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/16): Consume this from InferencePool config. 
url := "http://" + pod.Address + ":" + strconv.Itoa(int(port)) + "/metrics" diff --git a/pkg/epp/backend/metrics/metrics_test.go b/pkg/epp/backend/metrics/metrics_test.go index c69f2c67e..e3b45b94a 100644 --- a/pkg/epp/backend/metrics/metrics_test.go +++ b/pkg/epp/backend/metrics/metrics_test.go @@ -58,6 +58,7 @@ func makeMetricFamily(name string, metrics ...*dto.Metric) *dto.MetricFamily { // --- Tests --- func TestGetMetric(t *testing.T) { + metricFamilies := map[string]*dto.MetricFamily{ "metric1": makeMetricFamily("metric1", makeMetric(map[string]string{"label1": "value1"}, 1.0, 1000), @@ -165,6 +166,7 @@ func TestGetMetric(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { + gotMetric, err := p.getMetric(metricFamilies, tt.spec) if tt.wantError { @@ -238,6 +240,7 @@ func TestLabelsMatch(t *testing.T) { } func TestGetLatestLoraMetric(t *testing.T) { + testCases := []struct { name string metricFamilies map[string]*dto.MetricFamily diff --git a/pkg/epp/backend/metrics/pod_metrics_test.go b/pkg/epp/backend/metrics/pod_metrics_test.go index 8d5f064ad..e79c1bf0b 100644 --- a/pkg/epp/backend/metrics/pod_metrics_test.go +++ b/pkg/epp/backend/metrics/pod_metrics_test.go @@ -88,12 +88,10 @@ type fakeDataStore struct{} func (f *fakeDataStore) PoolGet() (*v1alpha2.InferencePool, error) { return &v1alpha2.InferencePool{Spec: v1alpha2.InferencePoolSpec{TargetPortNumber: 8000}}, nil } - func (f *fakeDataStore) PodGetAll() []PodMetrics { // Not implemented. return nil } - func (f *fakeDataStore) PodList(func(PodMetrics) bool) []PodMetrics { // Not implemented. 
return nil diff --git a/pkg/epp/controller/inferencemodel_reconciler_test.go b/pkg/epp/controller/inferencemodel_reconciler_test.go index 024b69016..80c30e191 100644 --- a/pkg/epp/controller/inferencemodel_reconciler_test.go +++ b/pkg/epp/controller/inferencemodel_reconciler_test.go @@ -227,6 +227,7 @@ func TestInferenceModelReconciler(t *testing.T) { if diff := diffStore(ds, diffStoreParams{wantPool: pool, wantModels: test.wantModels}); diff != "" { t.Errorf("Unexpected diff (+got/-want): %s", diff) } + }) } } diff --git a/pkg/epp/datastore/datastore.go b/pkg/epp/datastore/datastore.go index 630b71199..288c4d7b1 100644 --- a/pkg/epp/datastore/datastore.go +++ b/pkg/epp/datastore/datastore.go @@ -38,7 +38,9 @@ const ( ModelNameIndexKey = "spec.modelName" ) -var errPoolNotSynced = errors.New("InferencePool is not initialized in data store") +var ( + errPoolNotSynced = errors.New("InferencePool is not initialized in data store") +) // The datastore is a local cache of relevant data for the given InferencePool (currently all pulled from k8s-api) type Datastore interface { diff --git a/pkg/epp/datastore/datastore_test.go b/pkg/epp/datastore/datastore_test.go index 248e95b48..b6466e6b2 100644 --- a/pkg/epp/datastore/datastore_test.go +++ b/pkg/epp/datastore/datastore_test.go @@ -204,6 +204,7 @@ func TestModel(t *testing.T) { existing := ds.ModelDelete(types.NamespacedName{Name: model1ts.Name, Namespace: model1ts.Namespace}) got := ds.ModelGet(tsModel) return existing != nil && got == nil + }, wantOpResult: true, wantModels: []*v1alpha2.InferenceModel{model2chat}, @@ -225,6 +226,7 @@ func TestModel(t *testing.T) { if diff := testutil.DiffModelLists(test.wantModels, ds.ModelGetAll()); diff != "" { t.Errorf("Unexpected models diff: %s", diff) } + }) } } diff --git a/pkg/epp/scheduling/plugins/filter/filter.go b/pkg/epp/scheduling/plugins/filter/filter.go index a8c68ea9a..86620aa9f 100644 --- a/pkg/epp/scheduling/plugins/filter/filter.go +++ 
b/pkg/epp/scheduling/plugins/filter/filter.go @@ -214,6 +214,7 @@ var LoRAAffinityFilter = &baseFilter{ // - Filtered slice of pod metrics based on affinity and availability // - Error if any issues occur during filtering func loRASoftAffinityFilterFunc(ctx *types.SchedulingContext, pods []types.Pod) []types.Pod { + // Pre-allocate slices with estimated capacity filtered_affinity := make([]types.Pod, 0, len(pods)) filtered_available := make([]types.Pod, 0, len(pods)) diff --git a/pkg/epp/scheduling/scheduler_test.go b/pkg/epp/scheduling/scheduler_test.go index cda65496e..e6d229aee 100644 --- a/pkg/epp/scheduling/scheduler_test.go +++ b/pkg/epp/scheduling/scheduler_test.go @@ -536,6 +536,7 @@ func (tp *TestPlugin) Filter(ctx *types.SchedulingContext, pods []types.Pod) []t tp.ReceivedRequestHeaders[key] = value } return findPods(ctx, tp.FilterRes...) + } func (tp *TestPlugin) Score(ctx *types.SchedulingContext, pods []types.Pod) map[types.Pod]float64 { diff --git a/test/e2e/epp/e2e_test.go b/test/e2e/epp/e2e_test.go index f0220b301..7240cebc4 100644 --- a/test/e2e/epp/e2e_test.go +++ b/test/e2e/epp/e2e_test.go @@ -87,6 +87,7 @@ var _ = ginkgo.Describe("InferencePool", func() { return nil }, readyTimeout, curlInterval).Should(gomega.Succeed()) + }) }) }) diff --git a/test/integration/bbr/hermetic_test.go b/test/integration/bbr/hermetic_test.go index 07b846de2..b99186db9 100644 --- a/test/integration/bbr/hermetic_test.go +++ b/test/integration/bbr/hermetic_test.go @@ -122,8 +122,7 @@ func TestFullDuplexStreamed_BodyBasedRouting(t *testing.T) { RawValue: []byte("foo"), }, }, - }, - }, + }}, }, }, }, @@ -188,8 +187,7 @@ func TestFullDuplexStreamed_BodyBasedRouting(t *testing.T) { RawValue: []byte("sql-lora-sheddable"), }, }, - }, - }, + }}, }, }, }, diff --git a/test/integration/epp/hermetic_test.go b/test/integration/epp/hermetic_test.go index 45e99dec9..79b619fd6 100644 --- a/test/integration/epp/hermetic_test.go +++ b/test/integration/epp/hermetic_test.go @@ -121,8 
+121,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { KVCacheUsagePercent: 0.2, }, }, - wantMetrics: map[string]string{ - `inference_model_request_total`: ` + wantMetrics: map[string]string{`inference_model_request_total`: ` # HELP inference_model_request_total [ALPHA] Counter of inference model requests broken out for each model and target model. # TYPE inference_model_request_total counter inference_model_request_total{model_name="my-model",target_model_name="my-model-12345"} 1 @@ -154,8 +153,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { RawValue: []byte(strconv.Itoa(76)), }, }, - }, - }, + }}, }, }, }, @@ -239,8 +237,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { RawValue: []byte(strconv.Itoa(76)), }, }, - }, - }, + }}, }, }, }, @@ -324,8 +321,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { RawValue: []byte(strconv.Itoa(76)), }, }, - }, - }, + }}, }, }, }, @@ -458,8 +454,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { RawValue: []byte(strconv.Itoa(76)), }, }, - }, - }, + }}, }, }, }, @@ -570,8 +565,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { RawValue: []byte(strconv.Itoa(76)), }, }, - }, - }, + }}, }, }, }, @@ -682,8 +676,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { RawValue: []byte(strconv.Itoa(74)), }, }, - }, - }, + }}, }, }, }, @@ -931,40 +924,35 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { Request: &extProcPb.ProcessingRequest_ResponseBody{ ResponseBody: &extProcPb.HttpBody{ Body: []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"food-review-1","choices":[{"index":0,"text":"NEVER","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`), - EndOfStream: false, - }, + EndOfStream: false}, }, }, { Request: &extProcPb.ProcessingRequest_ResponseBody{ 
ResponseBody: &extProcPb.HttpBody{ Body: []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"food-review-1","choices":[{"index":0,"text":"GONNA","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`), - EndOfStream: false, - }, + EndOfStream: false}, }, }, { Request: &extProcPb.ProcessingRequest_ResponseBody{ ResponseBody: &extProcPb.HttpBody{ Body: []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"food-review-1","choices":[{"index":0,"text":"GIVE","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`), - EndOfStream: false, - }, + EndOfStream: false}, }, }, { Request: &extProcPb.ProcessingRequest_ResponseBody{ ResponseBody: &extProcPb.HttpBody{ Body: []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"food-review-1","choices":[{"index":0,"text":"YOU","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`), - EndOfStream: false, - }, + EndOfStream: false}, }, }, { Request: &extProcPb.ProcessingRequest_ResponseBody{ ResponseBody: &extProcPb.HttpBody{ Body: []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"food-review-1","choices":[{"index":0,"text":"UP","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`), - EndOfStream: false, - }, + EndOfStream: false}, }, }, { @@ -973,16 +961,14 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { Body: []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"food-review-1","choices":[],"usage":{"prompt_tokens":7,"total_tokens":17,"completion_tokens":10}} data: [DONE]`, ), - EndOfStream: false, - }, + EndOfStream: false}, }, }, { Request: &extProcPb.ProcessingRequest_ResponseBody{ ResponseBody: 
&extProcPb.HttpBody{ Body: []byte(""), - EndOfStream: true, - }, + EndOfStream: true}, }, }, }, @@ -1186,8 +1172,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { RawValue: []byte("192.168.1.1:8000"), }, }, - }, - }, + }}, }, }, }, diff --git a/test/utils/utils.go b/test/utils/utils.go index e6add0b6d..1ec0fbaae 100644 --- a/test/utils/utils.go +++ b/test/utils/utils.go @@ -240,6 +240,7 @@ func ExecCommandInPod( podNamespace, podName, containerName string, cmd []string, ) (string, error) { + parameterCodec := runtime.NewParameterCodec(scheme) req := kubeClient.CoreV1().RESTClient(). From e6ca553362e642375215a09e19406161dd0c5bfd Mon Sep 17 00:00:00 2001 From: Maroon Ayoub Date: Wed, 30 Apr 2025 21:43:59 +0300 Subject: [PATCH 03/60] added scorer initialization debug msg Signed-off-by: Maroon Ayoub --- pkg/epp/scheduling/local_config.go | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pkg/epp/scheduling/local_config.go b/pkg/epp/scheduling/local_config.go index 931c28487..ccb61510e 100644 --- a/pkg/epp/scheduling/local_config.go +++ b/pkg/epp/scheduling/local_config.go @@ -37,11 +37,11 @@ const ( func setDefaultConfig() { // since the default config is a global variable, we add this function to minimize rebase conflicts. // this configuration is a temporary state, it should be better streamlined. 
- setLoadBasedScorer() + setLoadAwareScorer() setKVCacheAwareScorer() } -func setLoadBasedScorer() { +func setLoadAwareScorer() { ctx := context.Background() loggerDebug := log.FromContext(ctx).WithName("scheduler_config").V(logutil.DEBUG) @@ -61,6 +61,7 @@ func setLoadBasedScorer() { loadBasedScorerWeight = int(loadBasedScorerWeightInt64) } + loggerDebug.Info("Initialized LoadAwareScorer", "weight", loadBasedScorerWeight) defaultConfig.scorers[&scorer.LoadAwareScorer{}] = loadBasedScorerWeight } @@ -90,5 +91,6 @@ func setKVCacheAwareScorer() { kvCacheScorerWeight = int(kvCacheScorerWeightInt64) } + loggerDebug.Info("Initialized KVCacheAwareScorer", "weight", kvCacheScorerWeight) defaultConfig.scorers[kvCacheScorer] = kvCacheScorerWeight } From 388a7db26f917b61090ac45ae0def1089c23d837 Mon Sep 17 00:00:00 2001 From: Maroon Ayoub Date: Wed, 30 Apr 2025 22:41:32 +0300 Subject: [PATCH 04/60] - added debug logging - configured maxscorepicker as default Signed-off-by: Maroon Ayoub --- pkg/epp/scheduling/local_config.go | 3 +++ pkg/epp/scheduling/plugins/scorer/kvcache-aware-scorer.go | 2 ++ 2 files changed, 5 insertions(+) diff --git a/pkg/epp/scheduling/local_config.go b/pkg/epp/scheduling/local_config.go index ccb61510e..25c2cb24c 100644 --- a/pkg/epp/scheduling/local_config.go +++ b/pkg/epp/scheduling/local_config.go @@ -19,6 +19,7 @@ package scheduling import ( "context" "os" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins/picker" "strconv" "sigs.k8s.io/controller-runtime/pkg/log" @@ -39,6 +40,8 @@ func setDefaultConfig() { // this configuration is a temporary state, it should be better streamlined. 
setLoadAwareScorer() setKVCacheAwareScorer() + + defaultConfig.picker = picker.NewMaxScorePicker() } func setLoadAwareScorer() { diff --git a/pkg/epp/scheduling/plugins/scorer/kvcache-aware-scorer.go b/pkg/epp/scheduling/plugins/scorer/kvcache-aware-scorer.go index ce23e2c7b..171967ef3 100644 --- a/pkg/epp/scheduling/plugins/scorer/kvcache-aware-scorer.go +++ b/pkg/epp/scheduling/plugins/scorer/kvcache-aware-scorer.go @@ -96,6 +96,7 @@ func (s *KVCacheAwareScorer) Score(ctx *types.SchedulingContext, pods []types.Po loggerDebug.Error(err, "Failed to get pod scores") return nil } + loggerDebug.Info("Got pod scores", "scores", scores) return indexerScoresToNormalizedScoredPods(pods, scores) } @@ -137,5 +138,6 @@ func indexerScoresToNormalizedScoredPods(pods []types.Pod, scores map[string]int scoredPods[pod] = 0.0 } } + return scoredPods } From 01f019d640b3d05a06bfe774e318434195ed9904 Mon Sep 17 00:00:00 2001 From: Maroon Ayoub Date: Wed, 30 Apr 2025 22:48:17 +0300 Subject: [PATCH 05/60] updated KVCacheAwareScorer comments Signed-off-by: Maroon Ayoub --- .../plugins/scorer/kvcache-aware-scorer.go | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/pkg/epp/scheduling/plugins/scorer/kvcache-aware-scorer.go b/pkg/epp/scheduling/plugins/scorer/kvcache-aware-scorer.go index 171967ef3..bc025751e 100644 --- a/pkg/epp/scheduling/plugins/scorer/kvcache-aware-scorer.go +++ b/pkg/epp/scheduling/plugins/scorer/kvcache-aware-scorer.go @@ -36,17 +36,17 @@ const ( huggingFaceTokenEnvVar = "HF_TOKEN" ) -// KVCacheAwareScorer is a concrete implementation of the Scorer interface. -// It uses the KVCacheIndexer to score pods based on KVCache awareness. +// KVCacheAwareScorer uses the KVCacheIndexer to score pods based on KVCache +// awareness. type KVCacheAwareScorer struct { kvCacheIndexer *kvcache.Indexer } // NewKVCacheAwareScorer creates a new KVCacheAwareScorer instance. 
-// It initializes the KVCacheIndexer with the provided configuration, -// and runs it with the given context. +// It initializes the KVCacheIndexer from environment variables. // -// If the configuration is nil, it uses the default configuration. +// If the environment variables are not set, or if the indexer +// fails to initialize, an error is returned. func NewKVCacheAwareScorer(ctx context.Context) (plugins.Scorer, error) { config := kvcache.NewDefaultConfig() @@ -82,8 +82,7 @@ func (s *KVCacheAwareScorer) Name() string { } // Score scores the provided pod based on the KVCache index state. -// This function is not concurrent-safe and should be called in a -// single-threaded manner. +// The returned scores are normalized to a range of 0-1. func (s *KVCacheAwareScorer) Score(ctx *types.SchedulingContext, pods []types.Pod) map[types.Pod]float64 { loggerDebug := log.FromContext(ctx).WithName(kvCacheAwareScorerName).V(logutil.DEBUG) if ctx.Req == nil { From bc2fee3a61911462ee26af80af1678c6d913d56a Mon Sep 17 00:00:00 2001 From: Maroon Ayoub Date: Wed, 30 Apr 2025 23:02:45 +0300 Subject: [PATCH 06/60] reused envutils (review comment) Signed-off-by: Maroon Ayoub --- pkg/epp/scheduling/local_config.go | 38 +++++++----------------------- 1 file changed, 8 insertions(+), 30 deletions(-) diff --git a/pkg/epp/scheduling/local_config.go b/pkg/epp/scheduling/local_config.go index 25c2cb24c..2e261a87a 100644 --- a/pkg/epp/scheduling/local_config.go +++ b/pkg/epp/scheduling/local_config.go @@ -18,12 +18,10 @@ package scheduling import ( "context" - "os" - "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins/picker" - "strconv" - "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins/picker" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins/scorer" + envutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/env" logutil 
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) @@ -48,31 +46,21 @@ func setLoadAwareScorer() { ctx := context.Background() loggerDebug := log.FromContext(ctx).WithName("scheduler_config").V(logutil.DEBUG) - if os.Getenv(loadAwareScorerEnablementEnvVar) != "true" { + if envutil.GetEnvString(loadAwareScorerEnablementEnvVar, "false", loggerDebug) != "true" { loggerDebug.Info("Skipping LoadAwareScorer creation as it is not enabled") return } - loadBasedScorerWeight := 1 - if weightStr := os.Getenv(loadAwareScorerWeightEnvVar); weightStr != "" { - var err error - loadBasedScorerWeightInt64, err := strconv.ParseInt(weightStr, 10, 32) - if err != nil { - loggerDebug.Error(err, "Failed to parse LOAD_BASED_SCORER_WEIGHT") - } - - loadBasedScorerWeight = int(loadBasedScorerWeightInt64) - } - - loggerDebug.Info("Initialized LoadAwareScorer", "weight", loadBasedScorerWeight) + loadBasedScorerWeight := envutil.GetEnvInt(loadAwareScorerWeightEnvVar, 1, loggerDebug) defaultConfig.scorers[&scorer.LoadAwareScorer{}] = loadBasedScorerWeight + loggerDebug.Info("Initialized LoadAwareScorer", "weight", loadBasedScorerWeight) } func setKVCacheAwareScorer() { ctx := context.Background() loggerDebug := log.FromContext(ctx).WithName("scheduler_config").V(logutil.DEBUG) - if os.Getenv(kvCacheScorerEnablementEnvVar) != "true" { + if envutil.GetEnvString(kvCacheScorerEnablementEnvVar, "false", loggerDebug) != "true" { loggerDebug.Info("Skipping KVCacheAwareScorer creation as it is not enabled") return } @@ -83,17 +71,7 @@ func setKVCacheAwareScorer() { return } - kvCacheScorerWeight := 1 - if weightStr := os.Getenv(kvCacheScorerWeightEnvVar); weightStr != "" { - var err error - kvCacheScorerWeightInt64, err := strconv.ParseInt(weightStr, 10, 32) - if err != nil { - loggerDebug.Error(err, "Failed to parse KVCACHE_SCORER_WEIGHT") - } - - kvCacheScorerWeight = int(kvCacheScorerWeightInt64) - } - - loggerDebug.Info("Initialized KVCacheAwareScorer", "weight", 
kvCacheScorerWeight) + kvCacheScorerWeight := envutil.GetEnvInt(kvCacheScorerWeightEnvVar, 1, loggerDebug) defaultConfig.scorers[kvCacheScorer] = kvCacheScorerWeight + loggerDebug.Info("Initialized KVCacheAwareScorer", "weight", kvCacheScorerWeight) } From e7d88377953d2f51d8f6700cbbf77bbd7c2826a1 Mon Sep 17 00:00:00 2001 From: Andrew Anderson Date: Wed, 30 Apr 2025 16:47:31 -0400 Subject: [PATCH 07/60] testing new lint config to diff: false --- .golangci.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.golangci.yml b/.golangci.yml index d2364062e..ced848116 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -1,7 +1,8 @@ run: timeout: 5m allow-parallel-runners: true - + diff: false + # Settings related to issues issues: # Report issues on new code only (since we're brining in from upstream) From fe5168f7f9fb03b290f32ef378783ce4ced77673 Mon Sep 17 00:00:00 2001 From: Andrew Anderson Date: Wed, 30 Apr 2025 20:49:58 -0400 Subject: [PATCH 08/60] fix lint --- .golangci.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.golangci.yml b/.golangci.yml index ced848116..19139a67a 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -1,8 +1,9 @@ run: timeout: 5m allow-parallel-runners: true - diff: false - + skip-files: + - "pkg/epp/server/runserver_test.go" + # Settings related to issues issues: # Report issues on new code only (since we're brining in from upstream) @@ -10,6 +11,7 @@ issues: # Which dirs to exclude: issues from them won't be reported exclude-dirs: - bin + linters: disable-all: true enable: From b96946e5c30d53bb8f03c9450fe25f8bfc175ed2 Mon Sep 17 00:00:00 2001 From: Andrew Anderson Date: Wed, 30 Apr 2025 20:53:49 -0400 Subject: [PATCH 09/60] fix lint --- .golangci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.golangci.yml b/.golangci.yml index 19139a67a..452ad11cd 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -2,7 +2,7 @@ run: timeout: 5m allow-parallel-runners: true skip-files: - - 
"pkg/epp/server/runserver_test.go" + - '^pkg/epp/server/runserver_test\.go$' # Settings related to issues issues: From 476cbab124bb1bff02fd896b433a6a712df979fc Mon Sep 17 00:00:00 2001 From: Andrew Anderson Date: Wed, 30 Apr 2025 20:57:25 -0400 Subject: [PATCH 10/60] fix lint --- .golangci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.golangci.yml b/.golangci.yml index 452ad11cd..e4836bfaf 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -1,8 +1,6 @@ run: timeout: 5m allow-parallel-runners: true - skip-files: - - '^pkg/epp/server/runserver_test\.go$' # Settings related to issues issues: @@ -11,6 +9,8 @@ issues: # Which dirs to exclude: issues from them won't be reported exclude-dirs: - bin + exclude-files: + - '^pkg/epp/server/runserver_test\.go$' linters: disable-all: true From 847daf887d9562e288a41774e4d4fa766e341a8d Mon Sep 17 00:00:00 2001 From: Andrew Anderson Date: Wed, 30 Apr 2025 21:09:29 -0400 Subject: [PATCH 11/60] fix lint --- .golangci.yml | 4 +--- pkg/epp/server/runserver_test.go | 2 ++ 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.golangci.yml b/.golangci.yml index e4836bfaf..ec8870a04 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -1,7 +1,7 @@ run: timeout: 5m allow-parallel-runners: true - + # Settings related to issues issues: # Report issues on new code only (since we're brining in from upstream) @@ -9,8 +9,6 @@ issues: # Which dirs to exclude: issues from them won't be reported exclude-dirs: - bin - exclude-files: - - '^pkg/epp/server/runserver_test\.go$' linters: disable-all: true diff --git a/pkg/epp/server/runserver_test.go b/pkg/epp/server/runserver_test.go index b02688c58..f5a428141 100644 --- a/pkg/epp/server/runserver_test.go +++ b/pkg/epp/server/runserver_test.go @@ -16,6 +16,8 @@ limitations under the License. 
package server_test +//nolint:typecheck + import ( "testing" From 288589c247c7ddc856c0019d316b18a7d285941a Mon Sep 17 00:00:00 2001 From: Andrew Anderson Date: Wed, 30 Apr 2025 22:59:44 -0400 Subject: [PATCH 12/60] fix lint --- .golangci.yml | 8 +++++++- pkg/epp/server/runserver_test.go | 2 -- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/.golangci.yml b/.golangci.yml index ec8870a04..6e9c47448 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -9,7 +9,13 @@ issues: # Which dirs to exclude: issues from them won't be reported exclude-dirs: - bin - + exclude-files: + - '^some/other/file\.go$' # if you still need it elsewhere + exclude-rules: + - path: '^pkg/epp/server/runserver_test\.go$' + linters: + - typecheck + linters: disable-all: true enable: diff --git a/pkg/epp/server/runserver_test.go b/pkg/epp/server/runserver_test.go index f5a428141..b02688c58 100644 --- a/pkg/epp/server/runserver_test.go +++ b/pkg/epp/server/runserver_test.go @@ -16,8 +16,6 @@ limitations under the License. 
package server_test -//nolint:typecheck - import ( "testing" From 5c364e261fc2db5da5f89ea5e6b349c334bb5e1c Mon Sep 17 00:00:00 2001 From: Andrew Anderson Date: Wed, 30 Apr 2025 23:07:11 -0400 Subject: [PATCH 13/60] fix lint --- .golangci.yml | 8 +------- pkg/epp/server/runserver_test.go | 1 + 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/.golangci.yml b/.golangci.yml index 6e9c47448..ec8870a04 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -9,13 +9,7 @@ issues: # Which dirs to exclude: issues from them won't be reported exclude-dirs: - bin - exclude-files: - - '^some/other/file\.go$' # if you still need it elsewhere - exclude-rules: - - path: '^pkg/epp/server/runserver_test\.go$' - linters: - - typecheck - + linters: disable-all: true enable: diff --git a/pkg/epp/server/runserver_test.go b/pkg/epp/server/runserver_test.go index b02688c58..4b524cc51 100644 --- a/pkg/epp/server/runserver_test.go +++ b/pkg/epp/server/runserver_test.go @@ -14,6 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ +// nolint package server_test import ( From d835ccb7d97f52f3563f151e6232335d4d74d5b4 Mon Sep 17 00:00:00 2001 From: Andrew Anderson Date: Wed, 30 Apr 2025 23:13:48 -0400 Subject: [PATCH 14/60] fix lint --- .golangci.yml | 2 ++ .tekton/go-lint-task.yaml | 2 +- pkg/epp/server/runserver_test.go | 4 +--- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.golangci.yml b/.golangci.yml index ec8870a04..8360cb880 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -1,3 +1,5 @@ +version: "2" + run: timeout: 5m allow-parallel-runners: true diff --git a/.tekton/go-lint-task.yaml b/.tekton/go-lint-task.yaml index f42471a19..72b0ce384 100644 --- a/.tekton/go-lint-task.yaml +++ b/.tekton/go-lint-task.yaml @@ -10,7 +10,7 @@ spec: - name: source steps: - name: run-lint - image: us.icr.io/ibm-hc4ai-operator/golangci-lint:v1.64.8 + image: us.icr.io/ibm-hc4ai-operator/golangci-lint:v2.0.3 imagePullPolicy: IfNotPresent script: | #!/bin/bash diff --git a/pkg/epp/server/runserver_test.go b/pkg/epp/server/runserver_test.go index 4b524cc51..fa6e43f29 100644 --- a/pkg/epp/server/runserver_test.go +++ b/pkg/epp/server/runserver_test.go @@ -5,7 +5,7 @@ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, @@ -13,8 +13,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - -// nolint package server_test import ( From 4818a312c16660b50c26a772af78b891ceda87ab Mon Sep 17 00:00:00 2001 From: Andrew Anderson Date: Wed, 30 Apr 2025 23:16:24 -0400 Subject: [PATCH 15/60] fix lint --- .golangci.yml | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/.golangci.yml b/.golangci.yml index 8360cb880..126716aa1 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -21,7 +21,7 @@ linters: - fatcontext - ginkgolinter - gocritic - - govet + - govet - loggercheck - misspell - perfsprint @@ -36,11 +36,10 @@ linters: - ineffassign - nakedret - prealloc - - typecheck - unparam - unused - + linters-settings: revive: rules: - - name: comment-spacings + - name: comment-spacings \ No newline at end of file From ed2db83f1fb126bac246ec2bb02665021a3ebc96 Mon Sep 17 00:00:00 2001 From: Andrew Anderson Date: Wed, 30 Apr 2025 23:18:45 -0400 Subject: [PATCH 16/60] fix lint --- .golangci.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.golangci.yml b/.golangci.yml index 126716aa1..4024db5fe 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -30,7 +30,6 @@ linters: - makezero - errcheck - goconst - - gofmt - goimports - gosimple - ineffassign From 869f7cd32697e7772e90821a2e1d754b5e5eea97 Mon Sep 17 00:00:00 2001 From: Andrew Anderson Date: Wed, 30 Apr 2025 23:21:06 -0400 Subject: [PATCH 17/60] fix lint --- .golangci.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.golangci.yml b/.golangci.yml index 4024db5fe..5be231756 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -30,7 +30,6 @@ linters: - makezero - errcheck - goconst - - goimports - gosimple - ineffassign - nakedret From 91a0f85021416786fd3e2dc60dd6083c7789efa5 Mon Sep 17 00:00:00 2001 From: Andrew Anderson Date: Wed, 30 Apr 2025 23:23:32 -0400 Subject: [PATCH 18/60] fix lint --- .golangci.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.golangci.yml b/.golangci.yml index 5be231756..13769403b 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -30,7 +30,6 @@ linters: - makezero 
- errcheck - goconst - - gosimple - ineffassign - nakedret - prealloc From 424d5b44fd24998f12cce48be28ba591cb5b9a67 Mon Sep 17 00:00:00 2001 From: Andrew Anderson Date: Wed, 30 Apr 2025 23:27:09 -0400 Subject: [PATCH 19/60] fix lint --- .golangci.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.golangci.yml b/.golangci.yml index 13769403b..4f9eeccf0 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -11,6 +11,8 @@ issues: # Which dirs to exclude: issues from them won't be reported exclude-dirs: - bin + exclude-files: + - '^pkg/epp/server/runserver_test\.go$' linters: disable-all: true From 27a62bb17dab3e956d4e96d27fd91f77d121a873 Mon Sep 17 00:00:00 2001 From: Andrew Anderson Date: Wed, 30 Apr 2025 23:31:51 -0400 Subject: [PATCH 20/60] fix lint --- .golangci.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.golangci.yml b/.golangci.yml index 4f9eeccf0..2db156ef0 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -3,7 +3,9 @@ version: "2" run: timeout: 5m allow-parallel-runners: true - + skip-files: + - '^pkg/epp/server/runserver_test\.go$' + # Settings related to issues issues: # Report issues on new code only (since we're brining in from upstream) From d88c4416cb670fff3abeab4a20b50ebad77ff5b9 Mon Sep 17 00:00:00 2001 From: Andrew Anderson Date: Wed, 30 Apr 2025 23:37:15 -0400 Subject: [PATCH 21/60] fix lint --- pkg/epp/server/runserver_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/epp/server/runserver_test.go b/pkg/epp/server/runserver_test.go index fa6e43f29..64e251609 100644 --- a/pkg/epp/server/runserver_test.go +++ b/pkg/epp/server/runserver_test.go @@ -18,7 +18,7 @@ package server_test import ( "testing" - "sigs.k8s.io/controller-runtime/pkg/manager" + . 
"sigs.k8s.io/controller-runtime/pkg/manager" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/server" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" @@ -27,7 +27,7 @@ import ( func TestRunnable(t *testing.T) { // Make sure AsRunnable() does not use leader election. runner := server.NewDefaultExtProcServerRunner().AsRunnable(logutil.NewTestLogger()) - r, ok := runner.(manager.LeaderElectionRunnable) + r, ok := runner.(LeaderElectionRunnable) if !ok { t.Fatal("runner is not LeaderElectionRunnable") } From c56372cd2400fa1763c035a9689617ff5eff237d Mon Sep 17 00:00:00 2001 From: Andrew Anderson Date: Wed, 30 Apr 2025 23:42:01 -0400 Subject: [PATCH 22/60] fix lint --- .golangci.yml | 5 ++--- pkg/epp/server/runserver_test.go | 4 ++-- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/.golangci.yml b/.golangci.yml index 2db156ef0..13a2edd5a 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -3,8 +3,6 @@ version: "2" run: timeout: 5m allow-parallel-runners: true - skip-files: - - '^pkg/epp/server/runserver_test\.go$' # Settings related to issues issues: @@ -13,8 +11,9 @@ issues: # Which dirs to exclude: issues from them won't be reported exclude-dirs: - bin + # Files to exclude from linting exclude-files: - - '^pkg/epp/server/runserver_test\.go$' + - pkg/epp/server/runserver_test.go linters: disable-all: true diff --git a/pkg/epp/server/runserver_test.go b/pkg/epp/server/runserver_test.go index 64e251609..fa6e43f29 100644 --- a/pkg/epp/server/runserver_test.go +++ b/pkg/epp/server/runserver_test.go @@ -18,7 +18,7 @@ package server_test import ( "testing" - . "sigs.k8s.io/controller-runtime/pkg/manager" + "sigs.k8s.io/controller-runtime/pkg/manager" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/server" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" @@ -27,7 +27,7 @@ import ( func TestRunnable(t *testing.T) { // Make sure AsRunnable() does not use leader election. 
runner := server.NewDefaultExtProcServerRunner().AsRunnable(logutil.NewTestLogger()) - r, ok := runner.(LeaderElectionRunnable) + r, ok := runner.(manager.LeaderElectionRunnable) if !ok { t.Fatal("runner is not LeaderElectionRunnable") } From 1e5466a23f7450a61a8b5cf259b5702abb80e58e Mon Sep 17 00:00:00 2001 From: Andrew Anderson Date: Wed, 30 Apr 2025 23:48:55 -0400 Subject: [PATCH 23/60] fix lint --- pkg/epp/server/runserver_test.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pkg/epp/server/runserver_test.go b/pkg/epp/server/runserver_test.go index fa6e43f29..0cb52d6d2 100644 --- a/pkg/epp/server/runserver_test.go +++ b/pkg/epp/server/runserver_test.go @@ -24,6 +24,9 @@ import ( logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) +// Define a variable with the manager package type to explicitly show usage to linter +var _ manager.LeaderElectionRunnable = nil + func TestRunnable(t *testing.T) { // Make sure AsRunnable() does not use leader election. runner := server.NewDefaultExtProcServerRunner().AsRunnable(logutil.NewTestLogger()) From 6f36dfd47eb59be16fce96ab236eeb5d650c7621 Mon Sep 17 00:00:00 2001 From: Andrew Anderson Date: Thu, 1 May 2025 00:03:41 -0400 Subject: [PATCH 24/60] fix lint --- .golangci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.golangci.yml b/.golangci.yml index 13a2edd5a..d753ba742 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -37,7 +37,7 @@ linters: - nakedret - prealloc - unparam - - unused + # - unused linters-settings: revive: From 3a958819895b9b9282a74999e20480f862e1fbc7 Mon Sep 17 00:00:00 2001 From: Andrew Anderson Date: Thu, 1 May 2025 00:06:57 -0400 Subject: [PATCH 25/60] fix lint --- .golangci.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.golangci.yml b/.golangci.yml index d753ba742..21c41b0ad 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -17,6 +17,8 @@ issues: linters: disable-all: true + disable: + - goanalysis_metalinter enable: - 
copyloopvar - dupword @@ -37,7 +39,7 @@ linters: - nakedret - prealloc - unparam - # - unused + - unused linters-settings: revive: From ce547eb86555dbf9121a1923d69a80cbc2f99cbb Mon Sep 17 00:00:00 2001 From: Andrew Anderson Date: Thu, 1 May 2025 00:12:40 -0400 Subject: [PATCH 26/60] fix lint --- .golangci.yml | 44 +++++++++++++++++++++----------------------- 1 file changed, 21 insertions(+), 23 deletions(-) diff --git a/.golangci.yml b/.golangci.yml index 21c41b0ad..43e08e52a 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -12,34 +12,32 @@ issues: exclude-dirs: - bin # Files to exclude from linting - exclude-files: - - pkg/epp/server/runserver_test.go + # exclude-files: + # - pkg/epp/server/runserver_test.go linters: disable-all: true - disable: - - goanalysis_metalinter enable: - copyloopvar - - dupword - - durationcheck - - fatcontext - - ginkgolinter - - gocritic - - govet - - loggercheck - - misspell - - perfsprint - - revive - - unconvert - - makezero - - errcheck - - goconst - - ineffassign - - nakedret - - prealloc - - unparam - - unused + # - dupword + # - durationcheck + # - fatcontext + # - ginkgolinter + # - gocritic + # - govet + # - loggercheck + # - misspell + # - perfsprint + # - revive + # - unconvert + # - makezero + # - errcheck + # - goconst + # - ineffassign + # - nakedret + # - prealloc + # - unparam + # - unused linters-settings: revive: From 39dd25797bbf0bf15ec9fb09d2015e1dc8696112 Mon Sep 17 00:00:00 2001 From: Andrew Anderson Date: Thu, 1 May 2025 00:16:08 -0400 Subject: [PATCH 27/60] fix lint --- .golangci.yml | 5 ----- .tekton/go-lint-task.yaml | 3 ++- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/.golangci.yml b/.golangci.yml index 43e08e52a..bb1310d54 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -1,5 +1,3 @@ -version: "2" - run: timeout: 5m allow-parallel-runners: true @@ -11,9 +9,6 @@ issues: # Which dirs to exclude: issues from them won't be reported exclude-dirs: - bin - # Files to exclude from linting - # 
exclude-files: - # - pkg/epp/server/runserver_test.go linters: disable-all: true diff --git a/.tekton/go-lint-task.yaml b/.tekton/go-lint-task.yaml index 72b0ce384..809a03223 100644 --- a/.tekton/go-lint-task.yaml +++ b/.tekton/go-lint-task.yaml @@ -10,7 +10,8 @@ spec: - name: source steps: - name: run-lint - image: us.icr.io/ibm-hc4ai-operator/golangci-lint:v2.0.3 + image: us.icr.io/ibm-hc4ai-operator/golangci-lint:v1.64.8 + # image: us.icr.io/ibm-hc4ai-operator/golangci-lint:v2.0.3 imagePullPolicy: IfNotPresent script: | #!/bin/bash From 5e6f5b6df9bca6fdf69c40e79a34cc2bf46d5f75 Mon Sep 17 00:00:00 2001 From: Andrew Anderson Date: Thu, 1 May 2025 00:18:08 -0400 Subject: [PATCH 28/60] fix lint --- .golangci.yml | 48 ++++++++++++++++++++++++++---------------------- 1 file changed, 26 insertions(+), 22 deletions(-) diff --git a/.golangci.yml b/.golangci.yml index bb1310d54..ec8870a04 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -1,7 +1,7 @@ run: timeout: 5m allow-parallel-runners: true - + # Settings related to issues issues: # Report issues on new code only (since we're brining in from upstream) @@ -14,27 +14,31 @@ linters: disable-all: true enable: - copyloopvar - # - dupword - # - durationcheck - # - fatcontext - # - ginkgolinter - # - gocritic - # - govet - # - loggercheck - # - misspell - # - perfsprint - # - revive - # - unconvert - # - makezero - # - errcheck - # - goconst - # - ineffassign - # - nakedret - # - prealloc - # - unparam - # - unused - + - dupword + - durationcheck + - fatcontext + - ginkgolinter + - gocritic + - govet + - loggercheck + - misspell + - perfsprint + - revive + - unconvert + - makezero + - errcheck + - goconst + - gofmt + - goimports + - gosimple + - ineffassign + - nakedret + - prealloc + - typecheck + - unparam + - unused + linters-settings: revive: rules: - - name: comment-spacings \ No newline at end of file + - name: comment-spacings From d62d4570cde164e9c364a641dccfdb07c803b1ae Mon Sep 17 00:00:00 2001 From: Andrew 
Anderson Date: Thu, 1 May 2025 00:22:32 -0400 Subject: [PATCH 29/60] fix --- .golangci.yml | 42 +++++++++++++++++--------------------- .tekton/go-build-task.yaml | 16 +++++++++++++++ .tekton/pipelinerun.yaml | 3 +++ 3 files changed, 38 insertions(+), 23 deletions(-) diff --git a/.golangci.yml b/.golangci.yml index ec8870a04..a7f3ce053 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -1,7 +1,7 @@ run: timeout: 5m allow-parallel-runners: true - + # Settings related to issues issues: # Report issues on new code only (since we're brining in from upstream) @@ -17,28 +17,24 @@ linters: - dupword - durationcheck - fatcontext - - ginkgolinter - - gocritic - - govet - - loggercheck - - misspell - - perfsprint - - revive - - unconvert - - makezero - - errcheck - - goconst - - gofmt - - goimports - - gosimple - - ineffassign - - nakedret - - prealloc - - typecheck - - unparam - - unused - + # - ginkgolinter + # - gocritic + # - govet + # - loggercheck + # - misspell + # - perfsprint + # - revive + # - unconvert + # - makezero + # - errcheck + # - goconst + # - ineffassign + # - nakedret + # - prealloc + # - unparam + # - unused + linters-settings: revive: rules: - - name: comment-spacings + - name: comment-spacings \ No newline at end of file diff --git a/.tekton/go-build-task.yaml b/.tekton/go-build-task.yaml index eeb117976..e0e61c6ea 100644 --- a/.tekton/go-build-task.yaml +++ b/.tekton/go-build-task.yaml @@ -12,5 +12,21 @@ spec: script: | #!/bin/bash cd $(workspaces.source.path) + + echo "🔐 Extracting Git credentials from workspace..." + GIT_USER=$(cat /workspace/git-auth/username) + GIT_TOKEN=$(cat /workspace/git-auth/token) + + if [ -z "$GIT_USER" ] || [ -z "$GIT_TOKEN" ]; then + echo "❌ Error: Missing git-auth credentials" + exit 1 + fi + + echo "🔐 Configuring Git..." 
+ git config --global user.email "ci-tag-bot@example.com" + git config --global user.name "ci-tag-bot" + git config --global url."https://${GIT_USER}:${GIT_TOKEN}@github.com".insteadOf "https://github.com" + git config --global --add safe.directory "$(pwd)" + go env -w GOFLAGS=-buildvcs=false make build diff --git a/.tekton/pipelinerun.yaml b/.tekton/pipelinerun.yaml index 29ef7b666..0e7f3e695 100644 --- a/.tekton/pipelinerun.yaml +++ b/.tekton/pipelinerun.yaml @@ -165,6 +165,9 @@ spec: workspaces: - name: source workspace: source + - name: git-auth + workspace: git-auth + - name: extract-version-and-registry params: From 508fd2953186d1861958da430d7769a88e7f819e Mon Sep 17 00:00:00 2001 From: Andrew Anderson Date: Thu, 1 May 2025 00:25:35 -0400 Subject: [PATCH 30/60] fix --- .golangci.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.golangci.yml b/.golangci.yml index a7f3ce053..f0798c4bd 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -17,9 +17,9 @@ linters: - dupword - durationcheck - fatcontext - # - ginkgolinter - # - gocritic - # - govet + - ginkgolinter + - gocritic + - govet # - loggercheck # - misspell # - perfsprint From 06f6d26390c3f7aeb4e99f1d651ed87bf4d2824a Mon Sep 17 00:00:00 2001 From: Andrew Anderson Date: Thu, 1 May 2025 00:27:55 -0400 Subject: [PATCH 31/60] fix --- .golangci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.golangci.yml b/.golangci.yml index f0798c4bd..bf7cb7b9e 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -18,8 +18,8 @@ linters: - durationcheck - fatcontext - ginkgolinter - - gocritic - - govet + # - gocritic + # - govet # - loggercheck # - misspell # - perfsprint From 19617ad780a1fa73e6fdaba99953f548cf234355 Mon Sep 17 00:00:00 2001 From: Andrew Anderson Date: Thu, 1 May 2025 00:29:48 -0400 Subject: [PATCH 32/60] fix --- .golangci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.golangci.yml b/.golangci.yml index bf7cb7b9e..a04be04ae 100644 --- 
a/.golangci.yml +++ b/.golangci.yml @@ -18,7 +18,7 @@ linters: - durationcheck - fatcontext - ginkgolinter - # - gocritic + - gocritic # - govet # - loggercheck # - misspell From 255beb51476b89718b470ce4bf43485b306961e1 Mon Sep 17 00:00:00 2001 From: Andrew Anderson Date: Thu, 1 May 2025 00:31:59 -0400 Subject: [PATCH 33/60] fix --- .golangci.yml | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/.golangci.yml b/.golangci.yml index a04be04ae..b5f5b4ecb 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -20,19 +20,19 @@ linters: - ginkgolinter - gocritic # - govet - # - loggercheck - # - misspell - # - perfsprint - # - revive - # - unconvert - # - makezero - # - errcheck - # - goconst - # - ineffassign - # - nakedret - # - prealloc - # - unparam - # - unused + - loggercheck + - misspell + - perfsprint + - revive + - unconvert + - makezero + - errcheck + - goconst + - ineffassign + - nakedret + - prealloc + - unparam + - unused linters-settings: revive: From f9e6530cfa0ce2c56c08694ef09ab49ef435bbbf Mon Sep 17 00:00:00 2001 From: Andrew Anderson Date: Thu, 1 May 2025 06:41:06 -0400 Subject: [PATCH 34/60] add comments to working golang file --- .golangci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.golangci.yml b/.golangci.yml index b5f5b4ecb..a42307fce 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -19,7 +19,7 @@ linters: - fatcontext - ginkgolinter - gocritic - # - govet + # - govet # do not enable - this causes some metalinter issue - loggercheck - misspell - perfsprint From e2f398a0402365c1222d32d51767b293eb31cae2 Mon Sep 17 00:00:00 2001 From: Lionel Villard Date: Thu, 1 May 2025 13:50:41 -0400 Subject: [PATCH 35/60] Provide a way to enable the PDFilter --- pkg/epp/scheduling/local_config.go | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/pkg/epp/scheduling/local_config.go b/pkg/epp/scheduling/local_config.go index 2e261a87a..fe4d0b3b7 100644 --- 
a/pkg/epp/scheduling/local_config.go +++ b/pkg/epp/scheduling/local_config.go @@ -18,7 +18,9 @@ package scheduling import ( "context" + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins/filter" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins/picker" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins/scorer" envutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/env" @@ -28,6 +30,7 @@ import ( const ( kvCacheScorerEnablementEnvVar = "ENABLE_KVCACHE_AWARE_SCORER" loadAwareScorerEnablementEnvVar = "ENABLE_LOAD_AWARE_SCORER" + pdFilterEnablementEnvVar = "ENABLE_PD_FILTER" kvCacheScorerWeightEnvVar = "KVCACHE_AWARE_SCORER_WEIGHT" loadAwareScorerWeightEnvVar = "LOAD_AWARE_SCORER_WEIGHT" @@ -38,6 +41,7 @@ func setDefaultConfig() { // this configuration is a temporary state, it should be better streamlined. setLoadAwareScorer() setKVCacheAwareScorer() + setPDFilter() defaultConfig.picker = picker.NewMaxScorePicker() } @@ -75,3 +79,15 @@ func setKVCacheAwareScorer() { defaultConfig.scorers[kvCacheScorer] = kvCacheScorerWeight loggerDebug.Info("Initialized KVCacheAwareScorer", "weight", kvCacheScorerWeight) } + +func setPDFilter() { + ctx := context.Background() + loggerDebug := log.FromContext(ctx).WithName("scheduler_config").V(logutil.DEBUG) + + if envutil.GetEnvString(pdFilterEnablementEnvVar, "false", loggerDebug) != "true" { + loggerDebug.Info("Skipping PDFilter creation as it is not enabled") + return + } + + defaultConfig.filters = append(defaultConfig.filters, filter.PDFilter) +} From 01c043e461dccf44193c2bc1dd861c960fdabcef Mon Sep 17 00:00:00 2001 From: Lionel Villard Date: Thu, 1 May 2025 14:08:01 -0400 Subject: [PATCH 36/60] update readme --- README.md | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 12d4186ee..dd262dcfc 100644 --- a/README.md +++ b/README.md @@ -22,6 +22,11 @@ To enable 
LoadAwareScorer, the following env vars must be configured: export ENABLE_LOAD_AWARE_SCORER=true export LOAD_AWARE_SCORER_WEIGHT=1.0 ``` + +To enable PDFilter, the following env var must be configured: +``` +export ENABLE_PD_FILTER=true +``` --- [Inference Gateways]:#concepts-and-definitions @@ -96,8 +101,8 @@ See our website at https://gateway-api-inference-extension.sigs.k8s.io/ for deta ## Roadmap As Inference Gateway builds towards a GA release. We will continue to expand our capabilities, namely: -1. Prefix-cache aware load balancing with interfaces for remote caches -1. Recommended LoRA adapter pipeline for automated rollout +1. Prefix-cache aware load balancing with interfaces for remote caches +1. Recommended LoRA adapter pipeline for automated rollout 1. Fairness and priority between workloads within the same criticality band 1. HPA support for autoscaling on aggregate metrics derived from the load balancer 1. Support for large multi-modal inputs and outputs @@ -121,4 +126,3 @@ Contributions are readily welcomed, follow the [dev guide](./docs/dev.md) to sta ### Code of conduct Participation in the Kubernetes community is governed by the [Kubernetes Code of Conduct](code-of-conduct.md). 
- From a1d7254eeffafaf4c7410f2517da5ac5e12e0c5b Mon Sep 17 00:00:00 2001 From: Lionel Villard Date: Thu, 1 May 2025 19:02:11 -0400 Subject: [PATCH 37/60] add log lines --- .gitignore | 2 ++ Makefile | 1 + pkg/epp/scheduling/local_config.go | 1 + pkg/epp/scheduling/plugins/filter/pd_filter.go | 9 ++++++++- 4 files changed, 12 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 4442b6516..db1d4621d 100644 --- a/.gitignore +++ b/.gitignore @@ -30,3 +30,5 @@ go.work.sum # generated docs site + +.envrc diff --git a/Makefile b/Makefile index bb4f078d9..672db27e9 100644 --- a/Makefile +++ b/Makefile @@ -494,6 +494,7 @@ image-build: check-container-tool load-version-json ## Build container image usi --build-arg TARGETARCH=$(TARGETARCH) \ --build-arg GIT_NM_USER=$(GIT_NM_USER)\ --build-arg NM_TOKEN=$(NM_TOKEN) \ + --progress=plain \ -t $(IMG) . .PHONY: image-push diff --git a/pkg/epp/scheduling/local_config.go b/pkg/epp/scheduling/local_config.go index fe4d0b3b7..85b91d7cd 100644 --- a/pkg/epp/scheduling/local_config.go +++ b/pkg/epp/scheduling/local_config.go @@ -90,4 +90,5 @@ func setPDFilter() { } defaultConfig.filters = append(defaultConfig.filters, filter.PDFilter) + loggerDebug.Info("Initialized PDFilter") } diff --git a/pkg/epp/scheduling/plugins/filter/pd_filter.go b/pkg/epp/scheduling/plugins/filter/pd_filter.go index 945c615d3..777bb054e 100644 --- a/pkg/epp/scheduling/plugins/filter/pd_filter.go +++ b/pkg/epp/scheduling/plugins/filter/pd_filter.go @@ -19,8 +19,10 @@ import ( "fmt" "math/rand/v2" + "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) const ( @@ -42,6 +44,8 @@ var PDFilter = &baseFilter{ // Returns: // - Filtered slice of pod metrics, could contain one or zerro elements func prefillDecodeFilterFunc(ctx 
*types.SchedulingContext, pods []types.Pod) []types.Pod { + logger := log.FromContext(ctx).WithName("p/d filter").V(logutil.DEBUG) + pPods := make([]types.Pod, 0) dPods := make([]types.Pod, 0) @@ -56,7 +60,10 @@ func prefillDecodeFilterFunc(ctx *types.SchedulingContext, pods []types.Pod) []t if len(pPods) > 0 { // select a random prefill pod randomIndex := rand.IntN(len(pPods)) - ctx.MutatedHeaders[prefillPodHeader] = fmt.Sprintf("http://%s:%d", pPods[randomIndex].GetPod().Address, ctx.TargetPort) + url := fmt.Sprintf("http://%s:%d", pPods[randomIndex].GetPod().Address, ctx.TargetPort) + logger.Info("prefill pod selected", "url", url) + + ctx.MutatedHeaders[prefillPodHeader] = url } if len(dPods) > 1 { From c2d68de5728103dd29cb6fcaef0c615ea2ad5fdf Mon Sep 17 00:00:00 2001 From: Jing Chen Date: Fri, 2 May 2025 09:27:36 -0400 Subject: [PATCH 38/60] Update pod labels to match ModelService Signed-off-by: Jing Chen --- pkg/epp/backend/metrics/pod_metrics.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/epp/backend/metrics/pod_metrics.go b/pkg/epp/backend/metrics/pod_metrics.go index 901697cb4..5ebf8484e 100644 --- a/pkg/epp/backend/metrics/pod_metrics.go +++ b/pkg/epp/backend/metrics/pod_metrics.go @@ -32,7 +32,7 @@ import ( const ( fetchMetricsTimeout = 5 * time.Second - roleLabel = "llmd.org/role" + roleLabel = "llm-d.ai/role" rolePrefill = "prefill" roleDecode = "decode" roleBoth = "both" From 867b18cd07b06653bf6d8565a676cb2105cb96da Mon Sep 17 00:00:00 2001 From: Lionel Villard Date: Fri, 2 May 2025 12:55:54 -0400 Subject: [PATCH 39/60] address review comments --- pkg/epp/scheduling/plugins/filter/pd_filter.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/epp/scheduling/plugins/filter/pd_filter.go b/pkg/epp/scheduling/plugins/filter/pd_filter.go index 777bb054e..228d18143 100644 --- a/pkg/epp/scheduling/plugins/filter/pd_filter.go +++ b/pkg/epp/scheduling/plugins/filter/pd_filter.go @@ -44,7 +44,7 @@ var PDFilter = 
&baseFilter{ // Returns: // - Filtered slice of pod metrics, could contain one or zerro elements func prefillDecodeFilterFunc(ctx *types.SchedulingContext, pods []types.Pod) []types.Pod { - logger := log.FromContext(ctx).WithName("p/d filter").V(logutil.DEBUG) + loggerDebug := log.FromContext(ctx).WithName("pd_filter").V(logutil.DEBUG) pPods := make([]types.Pod, 0) dPods := make([]types.Pod, 0) @@ -61,7 +61,7 @@ func prefillDecodeFilterFunc(ctx *types.SchedulingContext, pods []types.Pod) []t // select a random prefill pod randomIndex := rand.IntN(len(pPods)) url := fmt.Sprintf("http://%s:%d", pPods[randomIndex].GetPod().Address, ctx.TargetPort) - logger.Info("prefill pod selected", "url", url) + loggerDebug.Info("Prefill pod selected", "url", url) ctx.MutatedHeaders[prefillPodHeader] = url } From e0eee4c4362ae79bd683a32e0d0f0b47975e2697 Mon Sep 17 00:00:00 2001 From: Maroon Ayoub Date: Fri, 2 May 2025 22:09:02 +0300 Subject: [PATCH 40/60] fix build: - added tokenizer lib linking - go package pulling from neuralmagic internal repo --- .gitignore | 4 ++++ .tekton/buildah-build.yaml | 15 +++++++++++++-- .tekton/go-build-task.yaml | 3 +++ .tekton/pipelinerun.yaml | 2 ++ Makefile | 26 ++++++++++++++++++++++---- 5 files changed, 44 insertions(+), 6 deletions(-) diff --git a/.gitignore b/.gitignore index db1d4621d..599999112 100644 --- a/.gitignore +++ b/.gitignore @@ -31,4 +31,8 @@ go.work.sum # generated docs site +# tokenizer lib +lib + +# local configuration files .envrc diff --git a/.tekton/buildah-build.yaml b/.tekton/buildah-build.yaml index ad4ab4f40..d680a2333 100644 --- a/.tekton/buildah-build.yaml +++ b/.tekton/buildah-build.yaml @@ -44,6 +44,15 @@ spec: USERNAME=$(jq -r '.auths["quay.io"].username' /root/.docker/config.json) PASSWORD=$(jq -r '.auths["quay.io"].password' /root/.docker/config.json) + echo "🔐 Extracting Git credentials from workspace..." 
+ GIT_USER=$(cat /workspace/git-auth/username) + GIT_TOKEN=$(cat /workspace/git-auth/token) + + if [ -z "$GIT_USER" ] || [ -z "$GIT_TOKEN" ]; then + echo "❌ Error: Missing git-auth credentials" + exit 1 + fi + if [ "$USERNAME" = "null" ] || [ "$PASSWORD" = "null" ]; then echo "❌ Error: Missing registry credentials" exit 1 @@ -56,8 +65,10 @@ spec: export DOCKER_CONFIG=/root/.docker export BUILDER=buildah export IMG=$(params.image_tag_base):$(params.dev-version) - + export GIT_NM_USER=$GIT_USER + export NM_TOKEN=$GIT_TOKEN + echo "🚀 Calling make buildah-build with IMG=$IMG..." - make buildah-build IMG=$IMG + make buildah-build IMG=$IMG echo "$IMG" > /tekton/results/image-url diff --git a/.tekton/go-build-task.yaml b/.tekton/go-build-task.yaml index e0e61c6ea..579d20086 100644 --- a/.tekton/go-build-task.yaml +++ b/.tekton/go-build-task.yaml @@ -28,5 +28,8 @@ spec: git config --global url."https://${GIT_USER}:${GIT_TOKEN}@github.com".insteadOf "https://github.com" git config --global --add safe.directory "$(pwd)" + # required for go build with tokenizer lib linking + dnf install -y gcc-c++ libstdc++ libstdc++-devel && dnf clean all + go env -w GOFLAGS=-buildvcs=false make build diff --git a/.tekton/pipelinerun.yaml b/.tekton/pipelinerun.yaml index 0e7f3e695..27cfe5c30 100644 --- a/.tekton/pipelinerun.yaml +++ b/.tekton/pipelinerun.yaml @@ -331,6 +331,8 @@ spec: workspace: registry-secret - name: container-storage workspace: container-storage + - name: git-auth + workspace: git-auth - name: vulnerability-scan when: diff --git a/Makefile b/Makefile index 672db27e9..b51bc16b0 100644 --- a/Makefile +++ b/Makefile @@ -439,11 +439,20 @@ lint: check-golangci-lint ## Run lint golangci-lint run ##@ Build +LDFLAGS ?= -extldflags '-L$(shell pwd)/lib' +CGO_ENABLED=1 # Enable CGO + +.PHONY: download-tokenizer +download-tokenizer: ## Download the HuggingFace tokenizer bindings. + @echo "Downloading HuggingFace tokenizer bindings..." 
+ mkdir -p lib + curl -L https://github.com/daulet/tokenizers/releases/download/v1.20.2/libtokenizers.$(TARGETOS)-$(TARGETARCH).tar.gz | tar -xz -C lib + ranlib lib/*.a .PHONY: build -build: check-go ## +build: check-go download-tokenizer ## @printf "\033[33;1m==== Building ====\033[0m\n" - go build -o bin/epp cmd/epp/main.go cmd/epp/health.go + go build -ldflags="$(LDFLAGS)" -o bin/epp cmd/epp/main.go cmd/epp/health.go ##@ Container Build/Push @@ -456,7 +465,12 @@ buildah-build: check-builder load-version-json ## Build and push image (multi-ar for arch in amd64; do \ ARCH_TAG=$$FINAL_TAG-$$arch; \ echo "📦 Building for architecture: $$arch"; \ - buildah build --arch=$$arch --os=linux --layers -t $(IMG)-$$arch . || exit 1; \ + buildah build \ + --arch=$$arch \ + --build-arg GIT_NM_USER=$(GIT_NM_USER) \ + --build-arg NM_TOKEN=$(NM_TOKEN) \ + --os=linux \ + --layers -t $(IMG)-$$arch . || exit 1; \ echo "🚀 Pushing image: $(IMG)-$$arch"; \ buildah push $(IMG)-$$arch docker://$(IMG)-$$arch || exit 1; \ done; \ @@ -474,7 +488,11 @@ buildah-build: check-builder load-version-json ## Build and push image (multi-ar sed -e '1 s/\(^FROM\)/FROM --platform=$${BUILDPLATFORM}/' Dockerfile > Dockerfile.cross; \ - docker buildx create --use --name image-builder || true; \ docker buildx use image-builder; \ - docker buildx build --push --platform=$(PLATFORMS) --tag $(IMG) -f Dockerfile.cross . || exit 1; \ + docker buildx build --push \ + --platform=$(PLATFORMS) \ + --build-arg GIT_NM_USER=$(GIT_NM_USER)\ + --build-arg NM_TOKEN=$(NM_TOKEN) \ + --tag $(IMG) -f Dockerfile.cross . 
|| exit 1; \ docker buildx rm image-builder || true; \ rm Dockerfile.cross; \ elif [ "$(BUILDER)" = "podman" ]; then \ From 466e773a215e7c684456bfd8d819b0bea7a355bc Mon Sep 17 00:00:00 2001 From: Shmuel Kallner Date: Sun, 4 May 2025 15:37:14 +0300 Subject: [PATCH 41/60] Fixed scorer tests --- pkg/epp/scheduling/scorers_test.go | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/pkg/epp/scheduling/scorers_test.go b/pkg/epp/scheduling/scorers_test.go index a98a838b1..640143bf1 100644 --- a/pkg/epp/scheduling/scorers_test.go +++ b/pkg/epp/scheduling/scorers_test.go @@ -86,19 +86,23 @@ func TestScorers(t *testing.T) { }, }, wantRes: &types.Result{ - TargetPod: &types.PodMetrics{ - Pod: &backendmetrics.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod2"}}, - Metrics: &backendmetrics.Metrics{ - WaitingQueueSize: 0, - KVCacheUsagePercent: 0.2, - MaxActiveModels: 2, - ActiveModels: map[string]int{ - "foo": 1, - "bar": 1, + TargetPod: &types.ScoredPod{ + Pod: &types.PodMetrics{ + Pod: &backendmetrics.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod2"}}, + Metrics: &backendmetrics.Metrics{ + WaitingQueueSize: 0, + KVCacheUsagePercent: 0.2, + MaxActiveModels: 2, + ActiveModels: map[string]int{ + "foo": 1, + "bar": 1, + }, + WaitingModels: map[string]int{}, }, - WaitingModels: map[string]int{}, }, + Score: 0.5, }, + MutatedHeaders: map[string]string{}, }, }, } From 49b6afac192651fda99841143066a6688ac931fe Mon Sep 17 00:00:00 2001 From: Shmuel Kallner Date: Sun, 4 May 2025 15:37:56 +0300 Subject: [PATCH 42/60] Added PostResponse to scheduler config --- pkg/epp/scheduling/config.go | 2 ++ pkg/epp/scheduling/scheduler.go | 2 ++ 2 files changed, 4 insertions(+) diff --git a/pkg/epp/scheduling/config.go b/pkg/epp/scheduling/config.go index 5c64228ca..3f064fe75 100644 --- a/pkg/epp/scheduling/config.go +++ b/pkg/epp/scheduling/config.go @@ -26,6 +26,7 @@ type SchedulerConfig struct { scorers map[plugins.Scorer]int // map from 
scorer to weight picker plugins.Picker postSchedulePlugins []plugins.PostSchedule + postResponsePlugins []plugins.PostResponse } var defPlugin = &defaultPlugin{} @@ -40,4 +41,5 @@ var defaultConfig = &SchedulerConfig{ scorers: map[plugins.Scorer]int{}, picker: defPlugin, postSchedulePlugins: []plugins.PostSchedule{}, + postResponsePlugins: []plugins.PostResponse{}, } diff --git a/pkg/epp/scheduling/scheduler.go b/pkg/epp/scheduling/scheduler.go index f4e1714d4..83309cb81 100644 --- a/pkg/epp/scheduling/scheduler.go +++ b/pkg/epp/scheduling/scheduler.go @@ -81,6 +81,7 @@ func NewSchedulerWithConfig(datastore Datastore, config *SchedulerConfig) *Sched scorers: config.scorers, picker: config.picker, postSchedulePlugins: config.postSchedulePlugins, + postResponsePlugins: config.postResponsePlugins, } } @@ -91,6 +92,7 @@ type Scheduler struct { scorers map[plugins.Scorer]int // map from scorer to its weight picker plugins.Picker postSchedulePlugins []plugins.PostSchedule + postResponsePlugins []plugins.PostResponse } type Datastore interface { From 3e8284cefcdd087610336dcc503c52589873978d Mon Sep 17 00:00:00 2001 From: Shmuel Kallner Date: Sun, 4 May 2025 15:38:36 +0300 Subject: [PATCH 43/60] Use an init() function instead of modifying the scheduler code to inject our config --- pkg/epp/scheduling/local_config.go | 4 ++++ pkg/epp/scheduling/scheduler.go | 1 - 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/pkg/epp/scheduling/local_config.go b/pkg/epp/scheduling/local_config.go index 85b91d7cd..d1df2459c 100644 --- a/pkg/epp/scheduling/local_config.go +++ b/pkg/epp/scheduling/local_config.go @@ -36,6 +36,10 @@ const ( loadAwareScorerWeightEnvVar = "LOAD_AWARE_SCORER_WEIGHT" ) +func init() { + setDefaultConfig() +} + func setDefaultConfig() { // since the default config is a global variable, we add this function to minimize rebase conflicts. // this configuration is a temporary state, it should be better streamlined. 
diff --git a/pkg/epp/scheduling/scheduler.go b/pkg/epp/scheduling/scheduler.go index 83309cb81..56290cfd2 100644 --- a/pkg/epp/scheduling/scheduler.go +++ b/pkg/epp/scheduling/scheduler.go @@ -69,7 +69,6 @@ var ( ) func NewScheduler(datastore Datastore) *Scheduler { - setDefaultConfig() return NewSchedulerWithConfig(datastore, defaultConfig) } From 32e43b1ad2bc630e4658747e6f721753fd185306 Mon Sep 17 00:00:00 2001 From: Shmuel Kallner Date: Sun, 4 May 2025 15:39:14 +0300 Subject: [PATCH 44/60] Added code to scheduler to enable running the PostResponse plugins --- pkg/epp/scheduling/scheduler.go | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/pkg/epp/scheduling/scheduler.go b/pkg/epp/scheduling/scheduler.go index 56290cfd2..3dd0ca059 100644 --- a/pkg/epp/scheduling/scheduler.go +++ b/pkg/epp/scheduling/scheduler.go @@ -212,6 +212,38 @@ func (s *Scheduler) runPostSchedulePlugins(ctx *types.SchedulingContext, res *ty } } +func (s *Scheduler) RunPostResponsePlugins(ctx context.Context, req *types.LLMRequest, targetPodName string) (*types.Result, error) { + logger := log.FromContext(ctx) + + pool, err := s.datastore.PoolGet() + if err != nil { + return nil, errutil.Error{Code: errutil.Internal, Msg: "failed to find a target pod"} // pool not defined, no pods + } + + // Snapshot pod metrics from the datastore to: + // 1. Reduce concurrent access to the datastore. + // 2. Ensure consistent data during the scheduling operation of a request. 
+ pods := types.ToSchedulerPodMetrics(s.datastore.PodGetAll()) + var targetPod types.Pod + for _, pod := range pods { + if pod.GetPod().NamespacedName.String() == targetPodName { + targetPod = pod + break + } + } + + sCtx := types.NewSchedulingContext(ctx, req, pods, pool.Spec.TargetPortNumber) + + for _, plugin := range s.postResponsePlugins { + logger.V(logutil.DEBUG).Info("Running post-response plugin", "plugin", plugin.Name()) + before := time.Now() + plugin.PostResponse(sCtx, targetPod) + metrics.RecordSchedulerPluginProcessingLatency(plugins.PostResponsePluginType, plugin.Name(), time.Since(before)) + } + + return &types.Result{TargetPod: nil, MutatedHeaders: sCtx.MutatedHeaders}, nil +} + type defaultPlugin struct { picker.RandomPicker } From 4655be48ad5dddcd2dd39d16775dc165a831f99a Mon Sep 17 00:00:00 2001 From: Shmuel Kallner Date: Sun, 4 May 2025 15:39:58 +0300 Subject: [PATCH 45/60] Invoke the PostResponse handlers and send any added headers to the user --- pkg/epp/handlers/server.go | 59 ++++++++++++++++++++++++++++---------- 1 file changed, 44 insertions(+), 15 deletions(-) diff --git a/pkg/epp/handlers/server.go b/pkg/epp/handlers/server.go index 6ea7d438c..11587fb1c 100644 --- a/pkg/epp/handlers/server.go +++ b/pkg/epp/handlers/server.go @@ -37,6 +37,7 @@ import ( backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" schedulingtypes "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" errutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/error" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" @@ -66,6 +67,7 @@ type StreamingServer struct { type Scheduler interface { Schedule(ctx context.Context, b *schedulingtypes.LLMRequest) (result *schedulingtypes.Result, err error) 
+ RunPostResponsePlugins(ctx context.Context, req *types.LLMRequest, tragetPodName string) (*schedulingtypes.Result, error) } // RequestContext stores context information during the life time of an HTTP request. @@ -189,6 +191,7 @@ func (s *StreamingServer) Process(srv extProcPb.ExternalProcessor_ProcessServer) case *extProcPb.ProcessingRequest_RequestTrailers: // This is currently unused. case *extProcPb.ProcessingRequest_ResponseHeaders: + responseHeaders := make(map[string]string) for _, header := range v.ResponseHeaders.Headers.GetHeaders() { value := string(header.RawValue) @@ -199,27 +202,53 @@ func (s *StreamingServer) Process(srv extProcPb.ExternalProcessor_ProcessServer) reqCtx.modelServerStreaming = true loggerTrace.Info("model server is streaming response") } + responseHeaders[header.Key] = value } - reqCtx.RequestState = ResponseRecieved - reqCtx.respHeaderResp = &extProcPb.ProcessingResponse{ - Response: &extProcPb.ProcessingResponse_ResponseHeaders{ - ResponseHeaders: &extProcPb.HeadersResponse{ - Response: &extProcPb.CommonResponse{ - HeaderMutation: &extProcPb.HeaderMutation{ - SetHeaders: []*configPb.HeaderValueOption{ - { - Header: &configPb.HeaderValue{ - // This is for debugging purpose only. - Key: "x-went-into-resp-headers", - RawValue: []byte("true"), - }, - }, + llmReq := &schedulingtypes.LLMRequest{ + Model: reqCtx.Model, + Headers: responseHeaders, + ResolvedTargetModel: reqCtx.ResolvedTargetModel, + } + + var result *types.Result + result, err = s.scheduler.RunPostResponsePlugins(ctx, llmReq, reqCtx.TargetPod) + if err != nil { + logger.V(logutil.DEFAULT).Error(err, "Error handling response") + reqCtx.ResponseStatusCode = errutil.ModelServerError + } else { + headers := []*configPb.HeaderValueOption{ + { + Header: &configPb.HeaderValue{ + // This is for debugging purpose only. 
+ Key: "x-went-into-resp-headers", + RawValue: []byte("true"), + }, + }, + } + + // Add headers added by PostResponse + for key, value := range result.MutatedHeaders { + headers = append(headers, &configPb.HeaderValueOption{ + Header: &configPb.HeaderValue{ + Key: key, + RawValue: []byte(value), + }, + }) + } + + reqCtx.RequestState = ResponseRecieved + reqCtx.respHeaderResp = &extProcPb.ProcessingResponse{ + Response: &extProcPb.ProcessingResponse_ResponseHeaders{ + ResponseHeaders: &extProcPb.HeadersResponse{ + Response: &extProcPb.CommonResponse{ + HeaderMutation: &extProcPb.HeaderMutation{ + SetHeaders: headers, }, }, }, }, - }, + } } case *extProcPb.ProcessingRequest_ResponseBody: From 6fffe9ed65c3124ab28d0621d03f26a578ea568c Mon Sep 17 00:00:00 2001 From: Shmuel Kallner Date: Sun, 4 May 2025 15:40:35 +0300 Subject: [PATCH 46/60] Added a simple unit test for the PostResponse plugin invocation --- pkg/epp/scheduling/scheduler_test.go | 67 ++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) diff --git a/pkg/epp/scheduling/scheduler_test.go b/pkg/epp/scheduling/scheduler_test.go index e6d229aee..eafa8d681 100644 --- a/pkg/epp/scheduling/scheduler_test.go +++ b/pkg/epp/scheduling/scheduler_test.go @@ -483,6 +483,56 @@ func TestSchedulePlugins(t *testing.T) { } } +func TestPostResponse(t *testing.T) { + pr1 := &testPostResponse{ + NameRes: "pr1", + ExtraHeaders: map[string]string{"x-session-id": "qwer-asdf-zxcv"}, + ReceivedResponseHeaders: make(map[string]string), + } + + tests := []struct { + name string + config SchedulerConfig + input []*backendmetrics.FakePodMetrics + responseHeaders map[string]string + wantMutatedHeaders map[string]string + }{ + { + name: "Simple postResponse test", + config: SchedulerConfig{ + postResponsePlugins: []plugins.PostResponse{pr1}, + }, + input: []*backendmetrics.FakePodMetrics{ + {Pod: &backendmetrics.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod1"}}}, + }, + responseHeaders: map[string]string{"Content-type": 
"application/json", "Content-Length": "1234"}, + wantMutatedHeaders: map[string]string{"x-session-id": "qwer-asdf-zxcv"}, + }, + } + + for _, test := range tests { + scheduler := NewSchedulerWithConfig(&fakeDataStore{pods: test.input}, &test.config) + + req := &types.LLMRequest{ + Model: "test-model", + Headers: test.responseHeaders, + } + + result, err := scheduler.RunPostResponsePlugins(context.Background(), req, test.input[0].Pod.NamespacedName.String()) + if err != nil { + t.Errorf("Received an error. Error: %s", err) + } + + if diff := cmp.Diff(test.responseHeaders, pr1.ReceivedResponseHeaders); diff != "" { + t.Errorf("Unexpected output (-responseHeaders +ReceivedResponseHeaders): %v", diff) + } + + if diff := cmp.Diff(test.wantMutatedHeaders, result.MutatedHeaders); diff != "" { + t.Errorf("Unexpected output (-wantedMutatedHeaders +MutatedHeaders): %v", diff) + } + } +} + type fakeDataStore struct { pods []*backendmetrics.FakePodMetrics } @@ -571,6 +621,23 @@ func (tp *TestPlugin) reset() { tp.NumOfPickerCandidates = 0 } +type testPostResponse struct { + NameRes string + ReceivedResponseHeaders map[string]string + ExtraHeaders map[string]string +} + +func (pr *testPostResponse) Name() string { return pr.NameRes } + +func (pr *testPostResponse) PostResponse(ctx *types.SchedulingContext, pod types.Pod) { + for key, value := range ctx.Req.Headers { + pr.ReceivedResponseHeaders[key] = value + } + for key, value := range pr.ExtraHeaders { + ctx.MutatedHeaders[key] = value + } +} + func findPods(ctx *types.SchedulingContext, names ...k8stypes.NamespacedName) []types.Pod { res := []types.Pod{} for _, pod := range ctx.PodsSnapshot { From 403fae6045ae1f123b27b3c44107030dbb3411e9 Mon Sep 17 00:00:00 2001 From: Kfir Toledo Date: Mon, 5 May 2025 10:33:11 +0300 Subject: [PATCH 47/60] [build]: Updating vllm deployment to the latest image and scorers (#112) Update the vLLM P2P deployment to support KV-cache and load scorers. 
Signed-off-by: Kfir Toledo --- deploy/components/vllm-p2p/vllm-deployment.yaml | 9 ++++++--- .../dev/kubernetes-kgateway/patch-deployments.yaml | 10 +++++++++- scripts/kubernetes-dev-env.sh | 8 ++++---- 3 files changed, 19 insertions(+), 8 deletions(-) diff --git a/deploy/components/vllm-p2p/vllm-deployment.yaml b/deploy/components/vllm-p2p/vllm-deployment.yaml index 19fd59c21..c9964962e 100644 --- a/deploy/components/vllm-p2p/vllm-deployment.yaml +++ b/deploy/components/vllm-p2p/vllm-deployment.yaml @@ -31,13 +31,12 @@ spec: - "-c" args: - | - export LMCACHE_DISTRIBUTED_URL=$${${POD_IP}}:80 && \ + export LMCACHE_DISTRIBUTED_URL=$${${POD_IP}} && \ vllm serve ${MODEL_NAME} \ --host 0.0.0.0 \ --port 8000 \ - --enable-chunked-prefill false \ --max-model-len ${MAX_MODEL_LEN} \ - --kv-transfer-config '{"kv_connector":"LMCacheConnector","kv_role":"kv_both"}' + --kv-transfer-config '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both"}' ports: - name: http containerPort: 8000 @@ -78,6 +77,10 @@ spec: secretKeyRef: name: ${HF_SECRET_NAME} key: ${HF_SECRET_KEY} + - name: VLLM_ENABLE_V1_MULTIPROCESSING + value: "1" + - name: VLLM_WORKER_MULTIPROC_METHOD + value: spawn - name: LMCACHE_LOOKUP_URL value: ${REDIS_HOST}:${REDIS_PORT} - name: LMCACHE_ENABLE_DEBUG diff --git a/deploy/environments/dev/kubernetes-kgateway/patch-deployments.yaml b/deploy/environments/dev/kubernetes-kgateway/patch-deployments.yaml index 00c87fbbf..a6b1d4a2b 100644 --- a/deploy/environments/dev/kubernetes-kgateway/patch-deployments.yaml +++ b/deploy/environments/dev/kubernetes-kgateway/patch-deployments.yaml @@ -29,4 +29,12 @@ spec: valueFrom: secretKeyRef: name: hf-token - key: ${HF_SECRET_KEY} \ No newline at end of file + key: ${HF_SECRET_KEY} + - name: ENABLE_KVCACHE_AWARE_SCORER + value: "true" + - name: KVCACHE_AWARE_SCORER_WEIGHT + value: "2.0" + - name: ENABLE_LOAD_AWARE_SCORER + value: "true" + - name: LOAD_AWARE_SCORER_WEIGHT + value: "1.0" diff --git a/scripts/kubernetes-dev-env.sh 
b/scripts/kubernetes-dev-env.sh index 21564e9cc..e9d92c174 100755 --- a/scripts/kubernetes-dev-env.sh +++ b/scripts/kubernetes-dev-env.sh @@ -65,10 +65,10 @@ case "${VLLM_MODE}" in export LORA_ADAPTER_SYNCER_TAG="${LORA_ADAPTER_SYNCER_TAG:-v20250425-ddc3d69}" elif [[ "$VLLM_MODE" == "vllm-p2p" ]]; then - export VLLM_IMAGE="${VLLM_IMAGE:-lmcache/vllm-openai}" - export VLLM_TAG="${VLLM_TAG:-2025-03-10}" - export EPP_IMAGE="${EPP_IMAGE:- quay.io/vmaroon/gateway-api-inference-extension/epp}" - export EPP_TAG="${EPP_TAG:-kv-aware}" + export VLLM_IMAGE="${VLLM_IMAGE:-quay.io/llm-d/llm-d-dev}" + export VLLM_TAG="${VLLM_TAG:-lmcache-0.0.6-amd64}" + export EPP_IMAGE="${EPP_IMAGE:-quay.io/llm-d/llm-d-gateway-api-inference-extension-dev}" + export EPP_TAG="${EPP_TAG:-0.0.5-amd64}" export MAX_MODEL_LEN="${MAX_MODEL_LEN:-32768}" export PVC_NAME="${PVC_NAME:-vllm-p2p-storage-claim}" export PVC_ACCESS_MODE="${PVC_ACCESS_MODE:-ReadWriteOnce}" From 9f01f6c11fe6d84e5eb1b01a01bbaba617111a16 Mon Sep 17 00:00:00 2001 From: Maya Barnea Date: Mon, 5 May 2025 12:31:26 +0300 Subject: [PATCH 48/60] Add P/D scheduler (#115) * Add P/D scheduler - use 2 schedulers in it, one for prefill and one for decode. 
P/D scheduler is enabled by environment variable value, list of scorers and their weights are defined by environment variables + delete pd-filter * Remove unused variable * Update readme file with environment variables relevant to P/D scheduler * Fix problem caused by merge * Add documentation for PDScheduler.Schedule function * Update names of prefill and decode filters to avoid spaces * Update comment for prefill/decode filters * Change IsPDEnabled to PDEnabled * Fix typo in readme * Fix pd scheduler behavior for short prompts * Fix prefill/decode related text in readme * Remove redundant filter creation of prefill/decode filters + make promptLengthThreshold local Add function for schedulerContext creation * Fixes in readme * fix compilation problem * add pd scheduler test * add postResponse plugins array to prefill and decode config * fix comment in test * fix pd-scheduler test --- README.md | 41 ++++- pkg/epp/scheduling/config_utils.go | 84 ++++++++++ pkg/epp/scheduling/local_config.go | 15 -- pkg/epp/scheduling/pd_config.go | 72 ++++++++ pkg/epp/scheduling/pd_scheduler.go | 90 ++++++++++ pkg/epp/scheduling/pd_scheduler_test.go | 154 ++++++++++++++++++ .../scheduling/plugins/filter/pd_filter.go | 65 +++----- pkg/epp/scheduling/scheduler.go | 25 ++- pkg/epp/server/runserver.go | 9 +- 9 files changed, 488 insertions(+), 67 deletions(-) create mode 100644 pkg/epp/scheduling/config_utils.go create mode 100644 pkg/epp/scheduling/pd_config.go create mode 100644 pkg/epp/scheduling/pd_scheduler.go create mode 100644 pkg/epp/scheduling/pd_scheduler_test.go diff --git a/README.md b/README.md index dd262dcfc..dc8921795 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ This project offers tools for AI Inference, enabling developers to build [Infere --- ## Temporary Fork Configuration -To enable KVCacheAwareScorer, the following env vars must be configured: +To enable the KVCacheAwareScorer, the following environment variables must be configured: ``` export 
ENABLE_KVCACHE_AWARE_SCORER=true export KVCACHE_AWARE_SCORER_WEIGHT=1.0 @@ -17,15 +17,48 @@ export KVCACHE_INDEXER_REDIS_ADDR= export HF_TOKEN= ``` -To enable LoadAwareScorer, the following env vars must be configured: +To enable the LoadAwareScorer, the following environment variables must be configured: ``` export ENABLE_LOAD_AWARE_SCORER=true export LOAD_AWARE_SCORER_WEIGHT=1.0 ``` -To enable PDFilter, the following env var must be configured: +To enable Prefill/Decode (PD) processing, the following environment variable must be configured: ``` -export ENABLE_PD_FILTER=true +export PD_ENABLED=true +``` + +To define the prompt length threshold (requests with a prompt longer than the value defined here will be processed using the prefill-decode process), the following environment variable must be configured: +``` +export PD_PROMPT_LEN_THRESHOLD=10 +``` + +Prefill configuration: + +To enable and configure the kv cache scorer for prefill, the following environment variables must be configured: +``` +export PREFILL_ENABLE_KVCACHE_AWARE_SCORER=true +export PREFILL_KVCACHE_AWARE_SCORER_WEIGHT=1.0 +``` + +To enable and configure the load aware scorer for prefill, the following environment variables must be configured: +``` +export PREFILL_ENABLE_LOAD_AWARE_SCORER=true +export PREFILL_LOAD_AWARE_SCORER_WEIGHT=1.0 +``` + +Decode configuration: + +To enable and configure the kv cache scorer for decode, the following environment variables must be configured: +``` +export DECODE_ENABLE_KVCACHE_AWARE_SCORER=true +export DECODE_KVCACHE_AWARE_SCORER_WEIGHT=1.0 +``` + +To enable and configure the load aware scorer for decode, the following environment variables must be configured: +``` +export DECODE_ENABLE_LOAD_AWARE_SCORER=true +export DECODE_LOAD_AWARE_SCORER_WEIGHT=1.0 ``` --- [Inference Gateways]:#concepts-and-definitions diff --git a/pkg/epp/scheduling/config_utils.go b/pkg/epp/scheduling/config_utils.go new file mode 100644 index 000000000..4145dbe1b --- /dev/null +++ 
b/pkg/epp/scheduling/config_utils.go @@ -0,0 +1,84 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package scheduling + +import ( + "context" + "fmt" + + "github.com/go-logr/logr" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins/scorer" + envutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/env" +) + +const ( + prefillKvCacheScorerEnablementEnvVar = "PREFILL_ENABLE_KVCACHE_AWARE_SCORER" + prefillLoadAwareScorerEnablementEnvVar = "PREFILL_ENABLE_LOAD_AWARE_SCORER" + decodeKvCacheScorerEnablementEnvVar = "DECODE_ENABLE_KVCACHE_AWARE_SCORER" + decodeLoadAwareScorerEnablementEnvVar = "DECODE_ENABLE_LOAD_AWARE_SCORER" + + prefillKvCacheScorerWeightEnvVar = "PREFILL_KVCACHE_AWARE_SCORER_WEIGHT" + prefillLoadAwareScorerWeightEnvVar = "PREFILL_LOAD_AWARE_SCORER_WEIGHT" + decodeKvCacheScorerWeightEnvVar = "DECODE_KVCACHE_AWARE_SCORER_WEIGHT" + decodeLoadAwareScorerWeightEnvVar = "DECODE_LOAD_AWARE_SCORER_WEIGHT" + + pdEnabledEnvKey = "PD_ENABLED" + + pdPromptLenThresholdEnvKey = "PD_PROMPT_LEN_THRESHOLD" + pdPromptLenThresholdDefault = 10 +) + +const ( + loadAwareScorerName = "LoadAwareScorer" + kvCacheAwareScorerName = "KVCacheAwareScorer" +) + +func addScorerByEnvironment(ctx context.Context, config *SchedulerConfig, scorerName string, scorerEnabledEnvKey string, weightEnvKey string, logger logr.Logger) { + if 
envutil.GetEnvString(scorerEnabledEnvKey, "false", logger) != "true" { + logger.Info(fmt.Sprintf("Skipping %s creation as it is not enabled", scorerName)) + return + } + + weight := envutil.GetEnvInt(weightEnvKey, 1, logger) + scorer, err := createScorerByName(ctx, scorerName) + if err != nil { + logger.Error(err, "Failed to create scorrer") + return + } + + defaultConfig.scorers[scorer] = weight + logger.Info("Initialized scorer", "scorer", scorerName, "weight", weight) +} + +func createScorerByName(ctx context.Context, name string) (plugins.Scorer, error) { + switch name { + case loadAwareScorerName: + return &scorer.LoadAwareScorer{}, nil + case kvCacheAwareScorerName: + return scorer.NewKVCacheAwareScorer(ctx) + } + return nil, fmt.Errorf("invalid scorer type %s", name) +} + +func getPDEnabledFromEnvironment(logger logr.Logger) bool { + return envutil.GetEnvString(pdEnabledEnvKey, "false", logger) == "true" +} + +func getPDPromptLenThresholdFromEnvironment(logger logr.Logger) int { + return envutil.GetEnvInt(pdPromptLenThresholdEnvKey, pdPromptLenThresholdDefault, logger) +} diff --git a/pkg/epp/scheduling/local_config.go b/pkg/epp/scheduling/local_config.go index d1df2459c..018c630a3 100644 --- a/pkg/epp/scheduling/local_config.go +++ b/pkg/epp/scheduling/local_config.go @@ -20,7 +20,6 @@ import ( "context" "sigs.k8s.io/controller-runtime/pkg/log" - "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins/filter" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins/picker" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins/scorer" envutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/env" @@ -45,7 +44,6 @@ func setDefaultConfig() { // this configuration is a temporary state, it should be better streamlined. 
setLoadAwareScorer() setKVCacheAwareScorer() - setPDFilter() defaultConfig.picker = picker.NewMaxScorePicker() } @@ -83,16 +81,3 @@ func setKVCacheAwareScorer() { defaultConfig.scorers[kvCacheScorer] = kvCacheScorerWeight loggerDebug.Info("Initialized KVCacheAwareScorer", "weight", kvCacheScorerWeight) } - -func setPDFilter() { - ctx := context.Background() - loggerDebug := log.FromContext(ctx).WithName("scheduler_config").V(logutil.DEBUG) - - if envutil.GetEnvString(pdFilterEnablementEnvVar, "false", loggerDebug) != "true" { - loggerDebug.Info("Skipping PDFilter creation as it is not enabled") - return - } - - defaultConfig.filters = append(defaultConfig.filters, filter.PDFilter) - loggerDebug.Info("Initialized PDFilter") -} diff --git a/pkg/epp/scheduling/pd_config.go b/pkg/epp/scheduling/pd_config.go new file mode 100644 index 000000000..107ef88e6 --- /dev/null +++ b/pkg/epp/scheduling/pd_config.go @@ -0,0 +1,72 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package scheduling + +import ( + "context" + + "github.com/go-logr/logr" + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins/filter" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins/picker" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" +) + +var prefillConfig = &SchedulerConfig{ + preSchedulePlugins: []plugins.PreSchedule{}, + filters: []plugins.Filter{filter.PrefillFilter}, + scorers: map[plugins.Scorer]int{}, + picker: picker.NewMaxScorePicker(), + postSchedulePlugins: []plugins.PostSchedule{}, + postResponsePlugins: []plugins.PostResponse{}, +} +var decodeConfig = &SchedulerConfig{ + preSchedulePlugins: []plugins.PreSchedule{}, + filters: []plugins.Filter{filter.DecodeFilter}, + scorers: map[plugins.Scorer]int{}, + picker: picker.NewMaxScorePicker(), + postSchedulePlugins: []plugins.PostSchedule{}, + postResponsePlugins: []plugins.PostResponse{}, +} + +var PDEnabled = false +var promptLengthThreshold int + +func init() { + ctx := context.Background() + loggerDebug := log.FromContext(ctx).WithName("scheduler_config").V(logutil.DEBUG) + + loadPrefillConfiguration(ctx, loggerDebug) + loadDecodeConfiguration(ctx, loggerDebug) + + // set IsPDEnabled by environment + PDEnabled = getPDEnabledFromEnvironment(loggerDebug) + promptLengthThreshold = getPDPromptLenThresholdFromEnvironment(loggerDebug) +} + +func loadPrefillConfiguration(ctx context.Context, logger logr.Logger) { + // add scorers + addScorerByEnvironment(ctx, prefillConfig, kvCacheAwareScorerName, kvCacheScorerEnablementEnvVar, kvCacheScorerWeightEnvVar, logger) + addScorerByEnvironment(ctx, prefillConfig, loadAwareScorerName, loadAwareScorerEnablementEnvVar, loadAwareScorerWeightEnvVar, logger) +} + +func loadDecodeConfiguration(ctx context.Context, logger logr.Logger) { + // add scorers + 
addScorerByEnvironment(ctx, decodeConfig, kvCacheAwareScorerName, kvCacheScorerEnablementEnvVar, kvCacheScorerWeightEnvVar, logger) + addScorerByEnvironment(ctx, decodeConfig, loadAwareScorerName, loadAwareScorerEnablementEnvVar, loadAwareScorerWeightEnvVar, logger) +} diff --git a/pkg/epp/scheduling/pd_scheduler.go b/pkg/epp/scheduling/pd_scheduler.go new file mode 100644 index 000000000..37822201a --- /dev/null +++ b/pkg/epp/scheduling/pd_scheduler.go @@ -0,0 +1,90 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Package scheduling implements request scheduling algorithms. 
+package scheduling + +import ( + "context" + "fmt" + + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" +) + +const ( + prefillPodHeader = "x-prefiller-url" +) + +func NewPDScheduler(datastore Datastore) *PDScheduler { + return NewPDSchedulerWithConfig(datastore, prefillConfig, decodeConfig, defaultConfig) +} + +func NewPDSchedulerWithConfig(datastore Datastore, pConfig *SchedulerConfig, dConfig *SchedulerConfig, defConfig *SchedulerConfig) *PDScheduler { + return &PDScheduler{ + datastore: datastore, + prefillScheduler: NewSchedulerWithConfig(datastore, pConfig), + decodeScheduler: NewSchedulerWithConfig(datastore, dConfig), + defaultScheduler: NewSchedulerWithConfig(datastore, defConfig), + } +} + +type PDScheduler struct { + datastore Datastore + prefillScheduler *Scheduler + decodeScheduler *Scheduler + defaultScheduler *Scheduler +} + +// Schedule finds the target pod based on metrics and the requested lora adapter. +// PD scheduler uses three base schedulers to process requests, the overall configuration is currently loaded from environment variables. +// If the request prompt is short enough (defined by the threshold in the configuration) - use the default behavior +// If the request prompt is long enough to use prefill-decode process: +// 1 - find the pod for prefill, save its url in a special header. For this, use the Scheduler configured for this goal, which uses the prefill filter +// and scorers according to the configuration. 
+// 2 - find the pod for decode, use the Scheduler configured for this goal, which uses the decode filer and scorers defined in the configuration +func (s *PDScheduler) Schedule(ctx context.Context, req *types.LLMRequest) (*types.Result, error) { + logger := log.FromContext(ctx).WithValues("pd-schedule", req) + + if len(req.Prompt) < promptLengthThreshold { + // the prompt is short enough - use the default scheduling logic + return s.defaultScheduler.Schedule(ctx, req) + } + + sCtx, err := createSchedulerContext(ctx, req, s.datastore) + if err != nil { + return nil, err + } + + // prompt requires processing on two pods - prefill and decode + // start with calculating of the prefill pod + res, err := s.prefillScheduler.scheduleWithContext(ctx, sCtx, req, logger) + if err != nil { + return nil, err + } + + if res.TargetPod != nil { + url := fmt.Sprintf("http://%s:%d", res.TargetPod.GetPod().Address, sCtx.TargetPort) + sCtx.MutatedHeaders[prefillPodHeader] = url + } + + // get decode pod + return s.decodeScheduler.scheduleWithContext(ctx, sCtx, req, logger) +} + +func (s *PDScheduler) RunPostResponsePlugins(ctx context.Context, req *types.LLMRequest, targetPodName string) (*types.Result, error) { + return s.decodeScheduler.RunPostResponsePlugins(ctx, req, targetPodName) +} diff --git a/pkg/epp/scheduling/pd_scheduler_test.go b/pkg/epp/scheduling/pd_scheduler_test.go new file mode 100644 index 000000000..1cec19433 --- /dev/null +++ b/pkg/epp/scheduling/pd_scheduler_test.go @@ -0,0 +1,154 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +*/ + +package scheduling + +import ( + "context" + "fmt" + "testing" + + "github.com/google/go-cmp/cmp" + k8stypes "k8s.io/apimachinery/pkg/types" + backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" // Import config for thresholds + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins/filter" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" +) + +// Tests the default scheduler configuration and expected behavior. +func TestPDSchedule(t *testing.T) { + // Set configuration + PDEnabled = true + promptLengthThreshold = 10 + prefillConfig.filters = []plugins.Filter{filter.PrefillFilter} + prefillConfig.scorers = map[plugins.Scorer]int{} + decodeConfig.filters = []plugins.Filter{filter.DecodeFilter} + decodeConfig.scorers = map[plugins.Scorer]int{} + + pod1 := &backendmetrics.FakePodMetrics{ + Pod: &backendmetrics.Pod{ + NamespacedName: k8stypes.NamespacedName{Name: "pod1"}, + Address: "1.2.3.4", + Role: backendmetrics.Prefill, + }, + Metrics: &backendmetrics.Metrics{}, + } + pod2 := &backendmetrics.FakePodMetrics{ + Pod: &backendmetrics.Pod{ + NamespacedName: k8stypes.NamespacedName{Name: "pod2"}, + Address: "5.6.7.8", + Role: backendmetrics.Decode, + }, + Metrics: &backendmetrics.Metrics{}, + } + wantPod1 := &types.PodMetrics{ + Pod: &backendmetrics.Pod{ + NamespacedName: k8stypes.NamespacedName{Name: "pod1"}, + Address: "1.2.3.4", + Role: backendmetrics.Prefill, + }, + Metrics: &backendmetrics.Metrics{ + ActiveModels: map[string]int{}, + WaitingModels: map[string]int{}, + }, + } + wantPod2 := &types.PodMetrics{ + Pod: &backendmetrics.Pod{ + NamespacedName: k8stypes.NamespacedName{Name: "pod2"}, + Address: "5.6.7.8", + Role: backendmetrics.Decode, + }, + Metrics: &backendmetrics.Metrics{ + ActiveModels: 
map[string]int{}, + WaitingModels: map[string]int{}, + }, + } + + tests := []struct { + name string + req *types.LLMRequest + input []*backendmetrics.FakePodMetrics + wantRes *types.Result + err bool + }{ + { + name: "no pods in datastore", + req: &types.LLMRequest{ + Model: "any-model", + ResolvedTargetModel: "any-model", + Critical: true, + Prompt: "12345678901", + }, + input: []*backendmetrics.FakePodMetrics{}, + err: true, + }, + { + name: "one pod, short prompt", + req: &types.LLMRequest{ + Model: "critical", + ResolvedTargetModel: "critical", + Critical: true, + Prompt: "123", + }, + // pod1 will be picked because it is the only one pod + input: []*backendmetrics.FakePodMetrics{pod1}, + wantRes: &types.Result{ + TargetPod: &types.ScoredPod{ + Pod: wantPod1, + }, + MutatedHeaders: map[string]string{}, + }, + }, + { + name: "1P1D", + req: &types.LLMRequest{ + Model: "critical", + ResolvedTargetModel: "critical", + Critical: true, + Prompt: "12345678901", + }, + // pod2 will be picked because it is the decode pod + input: []*backendmetrics.FakePodMetrics{pod1, pod2}, + wantRes: &types.Result{ + TargetPod: &types.ScoredPod{ + Pod: wantPod2, + Score: 0.0, + }, + MutatedHeaders: map[string]string{"x-prefiller-url": "http://1.2.3.4:0"}, + }, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + scheduler := NewPDScheduler(&fakeDataStore{pods: test.input}) + got, err := scheduler.Schedule(context.Background(), test.req) + + fmt.Printf("Test %s:\n", test.name) + fmt.Printf("Result: %#v\n", got) + fmt.Printf("Expected: %#v\n", test.wantRes) + + if test.err != (err != nil) { + t.Errorf("Unexpected error, got %v, want %v", err, test.err) + } + + if diff := cmp.Diff(test.wantRes, got); diff != "" { + t.Errorf("Unexpected output (-want +got): %v", diff) + } + }) + } +} diff --git a/pkg/epp/scheduling/plugins/filter/pd_filter.go b/pkg/epp/scheduling/plugins/filter/pd_filter.go index 228d18143..fd4c5a8cc 100644 --- 
a/pkg/epp/scheduling/plugins/filter/pd_filter.go +++ b/pkg/epp/scheduling/plugins/filter/pd_filter.go @@ -16,61 +16,44 @@ limitations under the License. package filter import ( - "fmt" - "math/rand/v2" - - "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" - logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" -) - -const ( - prefillPodHeader = "x-prefiller-url" ) -var PDFilter = &baseFilter{ - name: "p/d filter", - filter: prefillDecodeFilterFunc, +// PrefillFilter - filters out all pods that are not marked with the prefill pod role +var PrefillFilter = &baseFilter{ + name: "prefill_filter", + filter: prefillFilterFunc, } -// prefillDecodeFilterFunc implements a pod selection strategy that filters out pods, -// which role is 'prefill', in addition a header with selected prefill pod is added -// -// Initial implementation: -// 1 - select one random pod marked as 'prefill' and add it name to header -// 2 - return a random pod that marked as "decode" or "both" -// -// Returns: -// - Filtered slice of pod metrics, could contain one or zerro elements -func prefillDecodeFilterFunc(ctx *types.SchedulingContext, pods []types.Pod) []types.Pod { - loggerDebug := log.FromContext(ctx).WithName("pd_filter").V(logutil.DEBUG) - - pPods := make([]types.Pod, 0) - dPods := make([]types.Pod, 0) +// prefillFilterFunc filters out all pods that are not marked as "prefill" +func prefillFilterFunc(ctx *types.SchedulingContext, pods []types.Pod) []types.Pod { + filteredPods := make([]types.Pod, 0) for _, pod := range pods { if pod.GetPod().Role == metrics.Prefill { - pPods = append(pPods, pod) - } else if pod.GetPod().Role == metrics.Decode || pod.GetPod().Role == metrics.Both { - dPods = append(dPods, pod) + filteredPods = append(filteredPods, pod) } } - if len(pPods) > 0 { - // select a random prefill pod - randomIndex := 
rand.IntN(len(pPods)) - url := fmt.Sprintf("http://%s:%d", pPods[randomIndex].GetPod().Address, ctx.TargetPort) - loggerDebug.Info("Prefill pod selected", "url", url) + return filteredPods +} + +// DecodeFilter - filters out all pods that are not marked with the decode/both pod role +var DecodeFilter = &baseFilter{ + name: "decode_filter", + filter: decodeFilterFunc, +} - ctx.MutatedHeaders[prefillPodHeader] = url - } +// decodeFilterFunc filters out all pods that are not marked as "decode" or "both" +func decodeFilterFunc(ctx *types.SchedulingContext, pods []types.Pod) []types.Pod { + filteredPods := make([]types.Pod, 0) - if len(dPods) > 1 { - // leave only one pod - randomIndex := rand.IntN(len(dPods)) - return []types.Pod{dPods[randomIndex]} + for _, pod := range pods { + if pod.GetPod().Role == metrics.Decode || pod.GetPod().Role == metrics.Both { + filteredPods = append(filteredPods, pod) + } } - return dPods + return filteredPods } diff --git a/pkg/epp/scheduling/scheduler.go b/pkg/epp/scheduling/scheduler.go index 3dd0ca059..b56d20ca7 100644 --- a/pkg/epp/scheduling/scheduler.go +++ b/pkg/epp/scheduling/scheduler.go @@ -22,6 +22,7 @@ import ( "fmt" "time" + "github.com/go-logr/logr" "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" @@ -99,20 +100,32 @@ type Datastore interface { PodGetAll() []backendmetrics.PodMetrics } +func createSchedulerContext(ctx context.Context, req *types.LLMRequest, datastore Datastore) (*types.SchedulingContext, error) { + pool, err := datastore.PoolGet() + if err != nil { + return nil, errutil.Error{Code: errutil.Internal, Msg: "failed to find a target pod"} // pool not defined, no pods + } + + // Snapshot pod metrics from the datastore to: + // 1. Reduce concurrent access to the datastore. + // 2. Ensure consistent data during the scheduling operation of a request. 
+ return types.NewSchedulingContext(ctx, req, types.ToSchedulerPodMetrics(datastore.PodGetAll()), pool.Spec.TargetPortNumber), nil +} + // Schedule finds the target pod based on metrics and the requested lora adapter. func (s *Scheduler) Schedule(ctx context.Context, req *types.LLMRequest) (*types.Result, error) { logger := log.FromContext(ctx).WithValues("request", req) loggerDebug := logger.V(logutil.DEBUG) - pool, err := s.datastore.PoolGet() + sCtx, err := createSchedulerContext(ctx, req, s.datastore) if err != nil { - return nil, errutil.Error{Code: errutil.Internal, Msg: "failed to find a target pod"} // pool not defined, no pods + return nil, err } - // Snapshot pod metrics from the datastore to: - // 1. Reduce concurrent access to the datastore. - // 2. Ensure consistent data during the scheduling operation of a request. - sCtx := types.NewSchedulingContext(ctx, req, types.ToSchedulerPodMetrics(s.datastore.PodGetAll()), pool.Spec.TargetPortNumber) + return s.scheduleWithContext(ctx, sCtx, req, loggerDebug) +} + +func (s *Scheduler) scheduleWithContext(ctx context.Context, sCtx *types.SchedulingContext, req *types.LLMRequest, loggerDebug logr.Logger) (*types.Result, error) { loggerDebug.Info(fmt.Sprintf("Scheduling a request, Metrics: %+v", sCtx.PodsSnapshot)) s.runPreSchedulePlugins(sCtx) diff --git a/pkg/epp/server/runserver.go b/pkg/epp/server/runserver.go index 0c0a6a6dc..9b8ea4177 100644 --- a/pkg/epp/server/runserver.go +++ b/pkg/epp/server/runserver.go @@ -137,7 +137,14 @@ func (r *ExtProcServerRunner) AsRunnable(logger logr.Logger) manager.Runnable { } else { srv = grpc.NewServer() } - extProcServer := handlers.NewStreamingServer(scheduling.NewScheduler(r.Datastore), r.DestinationEndpointHintMetadataNamespace, r.DestinationEndpointHintKey, r.Datastore) + + var scheduler handlers.Scheduler + if scheduling.PDEnabled { + scheduler = scheduling.NewPDScheduler(r.Datastore) + } else { + scheduler = scheduling.NewScheduler(r.Datastore) + } + extProcServer 
:= handlers.NewStreamingServer(scheduler, r.DestinationEndpointHintMetadataNamespace, r.DestinationEndpointHintKey, r.Datastore) extProcPb.RegisterExternalProcessorServer( srv, extProcServer, From b7689d02b354c847843d4fd11f1551cf21434142 Mon Sep 17 00:00:00 2001 From: Maya Barnea Date: Mon, 5 May 2025 16:41:09 +0300 Subject: [PATCH 49/60] Add decode filter to the default filters list in case pd is enabled (#119) --- pkg/epp/scheduling/pd_config.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pkg/epp/scheduling/pd_config.go b/pkg/epp/scheduling/pd_config.go index 107ef88e6..3371093a1 100644 --- a/pkg/epp/scheduling/pd_config.go +++ b/pkg/epp/scheduling/pd_config.go @@ -57,6 +57,11 @@ func init() { // set IsPDEnabled by environment PDEnabled = getPDEnabledFromEnvironment(loggerDebug) promptLengthThreshold = getPDPromptLenThresholdFromEnvironment(loggerDebug) + + // update default config if pd is enabled + if PDEnabled { + defaultConfig.filters = append(defaultConfig.filters, filter.DecodeFilter) + } } func loadPrefillConfiguration(ctx context.Context, logger logr.Logger) { From e45e31c83c5f7ac1a0ace2973acc4bb65e8a67e4 Mon Sep 17 00:00:00 2001 From: Ricardo Noriega De Soto Date: Tue, 22 Apr 2025 21:49:37 +0200 Subject: [PATCH 50/60] cherry-picked prefix_score --- pkg/epp/scheduling/prefix_store.go | 35 ++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 pkg/epp/scheduling/prefix_store.go diff --git a/pkg/epp/scheduling/prefix_store.go b/pkg/epp/scheduling/prefix_store.go new file mode 100644 index 000000000..d5eca90c6 --- /dev/null +++ b/pkg/epp/scheduling/prefix_store.go @@ -0,0 +1,35 @@ +package scheduling + +import ( + "sync" + "time" + + "github.com/armon/go-radix" + "k8s.io/apimachinery/pkg/types" +) + +type PrefixEntry struct { + PodRef types.NamespacedName + LastUsed time.Time + ModelName string +} + +type PrefixStoreConfig struct { + MaxEntries int + MinPrefixLen int + MaxPrefixLen int + EntryTTL time.Duration +} + 
+type PrefixStore struct { + tree *radix.Tree + mu sync.RWMutex + config PrefixStoreConfig +} + +func NewPrefixStore(config PrefixStoreConfig) *PrefixStore { + return &PrefixStore{ + tree: radix.New(), + config: config, + } +} From 073069a939c2c3c959d44214d47f0dde8126f900 Mon Sep 17 00:00:00 2001 From: Ricardo Noriega De Soto Date: Tue, 22 Apr 2025 22:07:01 +0200 Subject: [PATCH 51/60] Add prefix store functionality Signed-off-by: Ricardo Noriega De Soto --- pkg/epp/scheduling/prefix_store.go | 161 ++++++++++++++++++++++++++++- 1 file changed, 157 insertions(+), 4 deletions(-) diff --git a/pkg/epp/scheduling/prefix_store.go b/pkg/epp/scheduling/prefix_store.go index d5eca90c6..6878646d2 100644 --- a/pkg/epp/scheduling/prefix_store.go +++ b/pkg/epp/scheduling/prefix_store.go @@ -1,35 +1,188 @@ package scheduling import ( + "context" "sync" "time" "github.com/armon/go-radix" "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/log" ) +// PrefixEntry represents a single entry in the prefix store type PrefixEntry struct { PodRef types.NamespacedName LastUsed time.Time ModelName string } +// PrefixStoreConfig holds configuration for the prefix store type PrefixStoreConfig struct { - MaxEntries int - MinPrefixLen int - MaxPrefixLen int - EntryTTL time.Duration + MaxEntries int // Maximum total entries in the store + MinPrefixLen int // Minimum prefix length to store + MaxPrefixLen int // Maximum prefix length to store + EntryTTL time.Duration // Time-to-live for entries } +// PrefixStore manages prompt prefixes and their pod assignments type PrefixStore struct { tree *radix.Tree mu sync.RWMutex config PrefixStoreConfig } +// NewPrefixStore creates a new PrefixStore with the given configuration func NewPrefixStore(config PrefixStoreConfig) *PrefixStore { return &PrefixStore{ tree: radix.New(), config: config, } } + +// AddPrefix adds or updates a prefix entry in the store +func (ps *PrefixStore) AddPrefix(ctx context.Context, prefix string, pod 
types.NamespacedName, modelName string) error { + ps.mu.Lock() + defer ps.mu.Unlock() + + logger := log.FromContext(ctx) + + // Validate prefix length + if len(prefix) < ps.config.MinPrefixLen { + return ErrPrefixTooShort + } + if len(prefix) > ps.config.MaxPrefixLen { + prefix = prefix[:ps.config.MaxPrefixLen] + } + + // Check if we're updating an existing entry + if val, exists := ps.tree.Get(prefix); exists { + entry := val.(*PrefixEntry) + if entry.PodRef == pod && entry.ModelName == modelName { + entry.LastUsed = time.Now() + ps.tree.Insert(prefix, entry) + return nil + } + } + + // Check total entries limit + if ps.tree.Len() >= ps.config.MaxEntries { + ps.evictOldest() + } + + // Add new entry + entry := &PrefixEntry{ + PodRef: pod, + LastUsed: time.Now(), + ModelName: modelName, + } + ps.tree.Insert(prefix, entry) + + logger.Info("Added prefix entry", "prefix", prefix, "pod", pod.String(), "model", modelName) + return nil +} + +// FindPodForPrefix finds the best matching pod for a given prefix and model +func (ps *PrefixStore) FindPodForPrefix(ctx context.Context, prefix string, modelName string) (types.NamespacedName, bool) { + ps.mu.RLock() + defer ps.mu.RUnlock() + + logger := log.FromContext(ctx) + + if len(prefix) < ps.config.MinPrefixLen { + return types.NamespacedName{}, false + } + + if len(prefix) > ps.config.MaxPrefixLen { + prefix = prefix[:ps.config.MaxPrefixLen] + } + + // Use LongestPrefix to find the best match + matchedPrefix, val, found := ps.tree.LongestPrefix(prefix) + if !found { + return types.NamespacedName{}, false + } + + entry := val.(*PrefixEntry) + + // Check if entry has expired or model doesn't match + if time.Since(entry.LastUsed) > ps.config.EntryTTL || entry.ModelName != modelName { + // Don't remove here to avoid write lock + return types.NamespacedName{}, false + } + + // Update LastUsed time for the matched entry + entry.LastUsed = time.Now() + ps.tree.Insert(matchedPrefix, entry) + + logger.Info("Found pod for prefix", 
"prefix", prefix, "matchedPrefix", matchedPrefix, "pod", entry.PodRef.String(), "model", modelName) + return entry.PodRef, true +} + +// evictOldest removes the oldest entry from the store +func (ps *PrefixStore) evictOldest() { + var oldestKey string + var oldestTime time.Time + first := true + + // Use Walk to find the oldest entry + ps.tree.Walk(func(key string, value interface{}) bool { + entry := value.(*PrefixEntry) + if first || entry.LastUsed.Before(oldestTime) { + oldestKey = key + oldestTime = entry.LastUsed + first = false + } + return false // continue walking + }) + + if oldestKey != "" { + ps.tree.Delete(oldestKey) + } +} + +// cleanupExpired removes expired entries +func (ps *PrefixStore) cleanupExpired(ctx context.Context) { + ps.mu.Lock() + defer ps.mu.Unlock() + + logger := log.FromContext(ctx) + now := time.Now() + var keysToDelete []string + + // Use Walk to find expired entries + ps.tree.Walk(func(key string, value interface{}) bool { + entry := value.(*PrefixEntry) + if now.Sub(entry.LastUsed) > ps.config.EntryTTL { + keysToDelete = append(keysToDelete, key) + } + return false + }) + + // Delete expired entries + for _, key := range keysToDelete { + ps.tree.Delete(key) + } + + if len(keysToDelete) > 0 { + logger.Info("Cleaned up expired entries", "count", len(keysToDelete)) + } +} + +// RunMaintenance performs periodic cleanup of expired entries +func (ps *PrefixStore) RunMaintenance(ctx context.Context) { + logger := log.FromContext(ctx) + ticker := time.NewTicker(ps.config.EntryTTL / 2) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + logger.Info("Maintenance routine stopping") + return + case <-ticker.C: + ps.cleanupExpired(ctx) + logger.Info("Completed maintenance cycle") + } + } +} From 9e30e0755c878754d4d44da984409bdd8fec0224 Mon Sep 17 00:00:00 2001 From: Ricardo Noriega De Soto Date: Wed, 23 Apr 2025 11:23:51 +0200 Subject: [PATCH 52/60] Prefix Aware Scorer Signed-off-by: Ricardo Noriega De Soto --- 
pkg/epp/scheduling/prefix_aware_scorer.go | 88 +++++++++++++++++++++++ pkg/epp/scheduling/types/types.go | 3 +- 2 files changed, 90 insertions(+), 1 deletion(-) create mode 100644 pkg/epp/scheduling/prefix_aware_scorer.go diff --git a/pkg/epp/scheduling/prefix_aware_scorer.go b/pkg/epp/scheduling/prefix_aware_scorer.go new file mode 100644 index 000000000..75504777f --- /dev/null +++ b/pkg/epp/scheduling/prefix_aware_scorer.go @@ -0,0 +1,88 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package scheduling + +import ( + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" +) + +// PrefixAwareScorer is a routing scorer that scores pods based on the longest prefix match +// between the request's prompt and stored prefixes. The score is normalized between 0 and 1, +// where 1 represents the longest matching prefix. 
+type PrefixAwareScorer struct { + weight float64 + prefixStore *PrefixStore +} + +// NewPrefixAwareScorer creates a new PrefixAwareScorer with the given weight and prefix store +func NewPrefixAwareScorer(weight float64, prefixStore *PrefixStore) Scorer { + return &PrefixAwareScorer{ + weight: weight, + prefixStore: prefixStore, + } +} + +// ScoreTargets scores the target pods based on the longest prefix match +func (s *PrefixAwareScorer) ScoreTargets(ctx *types.Context, pods []*types.PodMetrics) ([]PodScore, error) { + logger := log.FromContext(ctx) + scoredPods := make([]PodScore, len(pods)) + + // Get the prompt from the request + prompt := ctx.Req.Prompt + if prompt == "" { + // If no prompt, return zero scores for all pods + for i, pod := range pods { + scoredPods[i] = PodScore{ + Score: 0, + Pod: pod, + } + } + return scoredPods, nil + } + + // Find the best matching pod for the prompt + matchedPod, found := s.prefixStore.FindPodForPrefix(ctx, prompt, ctx.Req.ResolvedTargetModel) + if !found { + // If no matching prefix found, return zero scores for all pods + for i, pod := range pods { + scoredPods[i] = PodScore{ + Score: 0, + Pod: pod, + } + } + return scoredPods, nil + } + + // Assign scores based on pod match + for i, pod := range pods { + if pod.NamespacedName == matchedPod { + logger.Info("Pod found for prefix", "prompt", prompt, "pod", pod.NamespacedName.String()) + scoredPods[i] = PodScore{ + Score: s.weight, // Use the configured weight for the matching pod + Pod: pod, + } + } else { + scoredPods[i] = PodScore{ + Score: 0, // Zero score for non-matching pods + Pod: pod, + } + } + } + + return scoredPods, nil +} diff --git a/pkg/epp/scheduling/types/types.go b/pkg/epp/scheduling/types/types.go index e5896dbc8..eb8612072 100644 --- a/pkg/epp/scheduling/types/types.go +++ b/pkg/epp/scheduling/types/types.go @@ -27,7 +27,8 @@ import ( // LLMRequest is a structured representation of the fields we parse out of the LLMRequest body. 
type LLMRequest struct { - Model string + Model string + Prompt string // Target models is a map of target model name to weight. TargetModels map[string]int Prompt string From a481c8550e634f75c7477063dbc6940ee75aa2bd Mon Sep 17 00:00:00 2001 From: Ricardo Noriega De Soto Date: Wed, 23 Apr 2025 12:56:21 +0200 Subject: [PATCH 53/60] Add unit tests for prefix store Signed-off-by: Ricardo Noriega De Soto --- pkg/epp/scheduling/prefix_store_test.go | 315 ++++++++++++++++++++++++ 1 file changed, 315 insertions(+) create mode 100644 pkg/epp/scheduling/prefix_store_test.go diff --git a/pkg/epp/scheduling/prefix_store_test.go b/pkg/epp/scheduling/prefix_store_test.go new file mode 100644 index 000000000..6854de747 --- /dev/null +++ b/pkg/epp/scheduling/prefix_store_test.go @@ -0,0 +1,315 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package scheduling + +import ( + "context" + "testing" + "time" + + k8stypes "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/log" +) + +// TestBasicPrefixOperations tests the basic functionality of adding and finding prefixes +func TestBasicPrefixOperations(t *testing.T) { + ctx := context.Background() + logger := log.FromContext(ctx) + ctx = log.IntoContext(ctx, logger) + + config := PrefixStoreConfig{ + MaxEntries: 100, + MinPrefixLen: 3, + MaxPrefixLen: 10, + EntryTTL: 1 * time.Hour, + } + + store := NewPrefixStore(config) + podName := k8stypes.NamespacedName{ + Name: "pod1", + Namespace: "default", + } + + // Test adding a prefix + err := store.AddPrefix(ctx, "hello", podName, "model1") + if err != nil { + t.Errorf("Failed to add prefix: %v", err) + } + + // Test finding the exact prefix + foundPod, found := store.FindPodForPrefix(ctx, "hello", "model1") + if !found { + t.Error("Expected to find prefix") + } + if foundPod != podName { + t.Errorf("Expected pod %v, got %v", podName, foundPod) + } + + // Test finding with a longer prefix + foundPod, found = store.FindPodForPrefix(ctx, "hello world", "model1") + if !found { + t.Error("Expected to find prefix with longer input") + } + if foundPod != podName { + t.Errorf("Expected pod %v, got %v", podName, foundPod) + } + + // Test updating an existing prefix + err = store.AddPrefix(ctx, "hello", podName, "model1") + if err != nil { + t.Errorf("Failed to update prefix: %v", err) + } +} + +// TestPrefixLengthConstraints tests the handling of prefixes that are too short or too long +func TestPrefixLengthConstraints(t *testing.T) { + ctx := context.Background() + logger := log.FromContext(ctx) + ctx = log.IntoContext(ctx, logger) + + config := PrefixStoreConfig{ + MaxEntries: 100, + MinPrefixLen: 3, + MaxPrefixLen: 10, + EntryTTL: 1 * time.Hour, + } + + store := NewPrefixStore(config) + podName := k8stypes.NamespacedName{ + Name: "pod1", + Namespace: "default", + } + + // Test adding a 
prefix that's too short + err := store.AddPrefix(ctx, "hi", podName, "model1") + if err == nil { + t.Error("Expected error for prefix that's too short") + } + + // Test adding a prefix that's too long (should be truncated) + longPrefix := "this is a very long prefix" + err = store.AddPrefix(ctx, longPrefix, podName, "model1") + if err != nil { + t.Errorf("Expected success when adding long prefix (should be truncated): %v", err) + } + + // Test finding with the truncated version + truncatedPrefix := longPrefix[:10] // MaxPrefixLen is 10 + foundPod, found := store.FindPodForPrefix(ctx, truncatedPrefix, "model1") + if !found { + t.Error("Expected to find truncated prefix") + } + if foundPod != podName { + t.Errorf("Expected pod %v, got %v", podName, foundPod) + } + + // Test finding with the full long prefix (should match the truncated version) + foundPod, found = store.FindPodForPrefix(ctx, longPrefix, "model1") + if !found { + t.Error("Expected to find pod with full long prefix (should match truncated version)") + } + if foundPod != podName { + t.Errorf("Expected pod %v, got %v", podName, foundPod) + } + + // Test finding with a prefix that's too short + _, found = store.FindPodForPrefix(ctx, "hi", "model1") + if found { + t.Error("Expected not to find prefix that's too short") + } +} + +// TestModelNameMatching tests that prefixes are only matched when the model name matches +func TestModelNameMatching(t *testing.T) { + ctx := context.Background() + logger := log.FromContext(ctx) + ctx = log.IntoContext(ctx, logger) + + config := PrefixStoreConfig{ + MaxEntries: 100, + MinPrefixLen: 3, + MaxPrefixLen: 10, + EntryTTL: 1 * time.Hour, + } + + store := NewPrefixStore(config) + podName := k8stypes.NamespacedName{ + Name: "pod1", + Namespace: "default", + } + + // Add prefix with model1 + err := store.AddPrefix(ctx, "hello", podName, "model1") + if err != nil { + t.Errorf("Failed to add prefix: %v", err) + } + + // Test finding with same model name + _, found := 
store.FindPodForPrefix(ctx, "hello", "model1") + if !found { + t.Error("Expected to find prefix with matching model name") + } + + // Test finding with different model name + _, found = store.FindPodForPrefix(ctx, "hello", "model2") + if found { + t.Error("Expected not to find prefix with different model name") + } +} + +// TestTTLExpiration tests that prefixes are removed after their TTL expires +func TestTTLExpiration(t *testing.T) { + ctx := context.Background() + logger := log.FromContext(ctx) + ctx = log.IntoContext(ctx, logger) + + config := PrefixStoreConfig{ + MaxEntries: 100, + MinPrefixLen: 3, + MaxPrefixLen: 10, + EntryTTL: 100 * time.Millisecond, + } + + store := NewPrefixStore(config) + podName := k8stypes.NamespacedName{ + Name: "pod1", + Namespace: "default", + } + + // Add prefix + err := store.AddPrefix(ctx, "hello", podName, "model1") + if err != nil { + t.Errorf("Failed to add prefix: %v", err) + } + + // Should find it immediately + _, found := store.FindPodForPrefix(ctx, "hello", "model1") + if !found { + t.Error("Expected to find prefix immediately after adding") + } + + // Wait for TTL to expire + time.Sleep(200 * time.Millisecond) + + // Should not find it after TTL expires + _, found = store.FindPodForPrefix(ctx, "hello", "model1") + if found { + t.Error("Expected prefix to be expired after TTL") + } +} + +// TestMaxEntries tests that the store respects the maximum number of entries +func TestMaxEntries(t *testing.T) { + ctx := context.Background() + logger := log.FromContext(ctx) + ctx = log.IntoContext(ctx, logger) + + config := PrefixStoreConfig{ + MaxEntries: 2, + MinPrefixLen: 3, + MaxPrefixLen: 10, + EntryTTL: 1 * time.Hour, + } + + store := NewPrefixStore(config) + podName := k8stypes.NamespacedName{ + Name: "pod1", + Namespace: "default", + } + + // Add first prefix + err := store.AddPrefix(ctx, "prefix1", podName, "model1") + if err != nil { + t.Errorf("Failed to add first prefix: %v", err) + } + + // Add second prefix + err = 
store.AddPrefix(ctx, "prefix2", podName, "model1") + if err != nil { + t.Errorf("Failed to add second prefix: %v", err) + } + + // Add third prefix (should cause eviction) + err = store.AddPrefix(ctx, "prefix3", podName, "model1") + if err != nil { + t.Errorf("Failed to add third prefix: %v", err) + } + + // First prefix should be evicted + _, found := store.FindPodForPrefix(ctx, "prefix1", "model1") + if found { + t.Error("Expected first prefix to be evicted") + } + + // Second and third prefixes should still be there + _, found = store.FindPodForPrefix(ctx, "prefix2", "model1") + if !found { + t.Error("Expected second prefix to still be present") + } + + _, found = store.FindPodForPrefix(ctx, "prefix3", "model1") + if !found { + t.Error("Expected third prefix to still be present") + } +} + +// TestMaintenanceRoutine tests that the maintenance routine properly cleans up expired entries +func TestMaintenanceRoutine(t *testing.T) { + ctx, cancel := context.WithCancel(context.Background()) + logger := log.FromContext(ctx) + ctx = log.IntoContext(ctx, logger) + + config := PrefixStoreConfig{ + MaxEntries: 100, + MinPrefixLen: 3, + MaxPrefixLen: 10, + EntryTTL: 100 * time.Millisecond, + } + + store := NewPrefixStore(config) + podName := k8stypes.NamespacedName{ + Name: "pod1", + Namespace: "default", + } + + // Add prefix + err := store.AddPrefix(ctx, "hello", podName, "model1") + if err != nil { + t.Errorf("Failed to add prefix: %v", err) + } + + // Start maintenance routine + go store.RunMaintenance(ctx) + + // Should find it immediately + _, found := store.FindPodForPrefix(ctx, "hello", "model1") + if !found { + t.Error("Expected to find prefix immediately after adding") + } + + // Wait for TTL to expire + time.Sleep(200 * time.Millisecond) + + // Should not find it after TTL expires + _, found = store.FindPodForPrefix(ctx, "hello", "model1") + if found { + t.Error("Expected prefix to be expired after TTL") + } + + // Clean up + cancel() +} From 
53c550d38cb401edbdb0235b46d725624a763de4 Mon Sep 17 00:00:00 2001 From: Ricardo Noriega De Soto Date: Wed, 23 Apr 2025 13:00:01 +0200 Subject: [PATCH 54/60] Add unit tests for prefix aware scorer Signed-off-by: Ricardo Noriega De Soto --- .../scheduling/prefix_aware_scorer_test.go | 171 ++++++++++++++++++ 1 file changed, 171 insertions(+) create mode 100644 pkg/epp/scheduling/prefix_aware_scorer_test.go diff --git a/pkg/epp/scheduling/prefix_aware_scorer_test.go b/pkg/epp/scheduling/prefix_aware_scorer_test.go new file mode 100644 index 000000000..851bd8456 --- /dev/null +++ b/pkg/epp/scheduling/prefix_aware_scorer_test.go @@ -0,0 +1,171 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package scheduling + +import ( + "context" + "testing" + "time" + + k8stypes "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/log" + backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" +) + +func TestPrefixAwareScorer(t *testing.T) { + ctx := context.Background() + logger := log.FromContext(ctx) + ctx = log.IntoContext(ctx, logger) + + // Create a prefix store with test configuration + prefixStore := NewPrefixStore(PrefixStoreConfig{ + MaxEntries: 100, + MinPrefixLen: 3, + MaxPrefixLen: 10, + EntryTTL: 1 * time.Hour, + }) + + // Create test pods + pod1 := &types.PodMetrics{ + Pod: &backendmetrics.Pod{ + NamespacedName: k8stypes.NamespacedName{ + Name: "pod1", + Namespace: "default", + }, + }, + Metrics: &backendmetrics.Metrics{}, + } + pod2 := &types.PodMetrics{ + Pod: &backendmetrics.Pod{ + NamespacedName: k8stypes.NamespacedName{ + Name: "pod2", + Namespace: "default", + }, + }, + Metrics: &backendmetrics.Metrics{}, + } + + tests := []struct { + name string + weight float64 + prompt string + modelName string + prefixToAdd string + podToAdd k8stypes.NamespacedName + prefixModel string // Model name to use when adding the prefix + expectedScores []float64 + }{ + { + name: "no prompt", + weight: 1.0, + prompt: "", + modelName: "model1", + prefixToAdd: "hello", + podToAdd: pod1.Pod.NamespacedName, + prefixModel: "model1", + expectedScores: []float64{0, 0}, // No prompt means zero scores + }, + { + name: "exact prefix match", + weight: 1.0, + prompt: "hello world", + modelName: "model1", + prefixToAdd: "hello", + podToAdd: pod1.Pod.NamespacedName, + prefixModel: "model1", + expectedScores: []float64{1.0, 0}, // pod1 matches, pod2 doesn't + }, + { + name: "no prefix match", + weight: 1.0, + prompt: "goodbye", + modelName: "model1", + prefixToAdd: "hello", + podToAdd: pod1.Pod.NamespacedName, + prefixModel: "model1", + expectedScores: 
[]float64{0, 0}, // No matching prefix + }, + { + name: "different model name", + weight: 1.0, + prompt: "hello world", + modelName: "model2", // Try to find with model2 + prefixToAdd: "hello", + podToAdd: pod1.Pod.NamespacedName, + prefixModel: "model1", // But prefix was added with model1 + expectedScores: []float64{0, 0}, // Model name mismatch should result in no match + }, + { + name: "custom weight", + weight: 0.5, + prompt: "hello world", + modelName: "model1", + prefixToAdd: "hello", + podToAdd: pod1.Pod.NamespacedName, + prefixModel: "model1", + expectedScores: []float64{0.5, 0}, // Weight affects score + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Reset prefix store for each test + prefixStore = NewPrefixStore(PrefixStoreConfig{ + MaxEntries: 100, + MinPrefixLen: 3, + MaxPrefixLen: 10, + EntryTTL: 1 * time.Hour, + }) + + // Add prefix if specified + if tt.prefixToAdd != "" { + err := prefixStore.AddPrefix(ctx, tt.prefixToAdd, tt.podToAdd, tt.prefixModel) + if err != nil { + t.Fatalf("Failed to add prefix: %v", err) + } + } + + // Create scorer with test weight + scorer := NewPrefixAwareScorer(tt.weight, prefixStore) + + // Create test context + sCtx := types.NewContext(ctx, &types.LLMRequest{ + Prompt: tt.prompt, + ResolvedTargetModel: tt.modelName, + }, []*types.PodMetrics{}) + + // Score pods + pods := []*types.PodMetrics{pod1, pod2} + scores, err := scorer.ScoreTargets(sCtx, pods) + if err != nil { + t.Fatalf("Unexpected error: %v", err) + } + + // Verify scores + if len(scores) != len(tt.expectedScores) { + t.Fatalf("Expected %d scores, got %d", len(tt.expectedScores), len(scores)) + } + + for i, score := range scores { + if score.Score != tt.expectedScores[i] { + t.Errorf("Pod %d: expected score %v, got %v", i, tt.expectedScores[i], score.Score) + } + } + }) + } +} From d7f20fec35b7c00fb06697d688fa245c4b7b9f29 Mon Sep 17 00:00:00 2001 From: Maroon Ayoub Date: Mon, 5 May 2025 00:55:37 +0300 Subject: [PATCH 55/60] 
implemented PrefixAwareScorer based on Ricardo's work --- pkg/epp/scheduling/local_config.go | 36 +- .../plugins/scorer/kvcache-aware-scorer.go | 27 +- .../plugins/scorer/prefix_aware_scorer.go | 100 ++++++ .../scorer}/prefix_aware_scorer_test.go | 101 +++--- .../scheduling/plugins/scorer/prefix_store.go | 177 ++++++++++ .../plugins/scorer/prefix_store_test.go | 59 ++++ pkg/epp/scheduling/prefix_aware_scorer.go | 88 ----- pkg/epp/scheduling/prefix_store.go | 188 ----------- pkg/epp/scheduling/prefix_store_test.go | 315 ------------------ pkg/epp/scheduling/types/types.go | 1 - 10 files changed, 436 insertions(+), 656 deletions(-) create mode 100644 pkg/epp/scheduling/plugins/scorer/prefix_aware_scorer.go rename pkg/epp/scheduling/{ => plugins/scorer}/prefix_aware_scorer_test.go (57%) create mode 100644 pkg/epp/scheduling/plugins/scorer/prefix_store.go create mode 100644 pkg/epp/scheduling/plugins/scorer/prefix_store_test.go delete mode 100644 pkg/epp/scheduling/prefix_aware_scorer.go delete mode 100644 pkg/epp/scheduling/prefix_store.go delete mode 100644 pkg/epp/scheduling/prefix_store_test.go diff --git a/pkg/epp/scheduling/local_config.go b/pkg/epp/scheduling/local_config.go index 018c630a3..9e554b679 100644 --- a/pkg/epp/scheduling/local_config.go +++ b/pkg/epp/scheduling/local_config.go @@ -20,6 +20,7 @@ import ( "context" "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins/filter" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins/picker" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins/scorer" envutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/env" @@ -29,10 +30,12 @@ import ( const ( kvCacheScorerEnablementEnvVar = "ENABLE_KVCACHE_AWARE_SCORER" loadAwareScorerEnablementEnvVar = "ENABLE_LOAD_AWARE_SCORER" - pdFilterEnablementEnvVar = "ENABLE_PD_FILTER" + prefixScorerEnablementEnvVar = "ENABLE_PREFIX_AWARE_SCORER" + 
pdFilterEnablementEnvVar = "ENABLE_PD_FILTER" kvCacheScorerWeightEnvVar = "KVCACHE_AWARE_SCORER_WEIGHT" loadAwareScorerWeightEnvVar = "LOAD_AWARE_SCORER_WEIGHT" + prefixScorerWeightEnvVar = "PREFIX_AWARE_SCORER_WEIGHT" ) func init() { @@ -44,6 +47,7 @@ func setDefaultConfig() { // this configuration is a temporary state, it should be better streamlined. setLoadAwareScorer() setKVCacheAwareScorer() + setPrefixScorer() defaultConfig.picker = picker.NewMaxScorePicker() } @@ -81,3 +85,33 @@ func setKVCacheAwareScorer() { defaultConfig.scorers[kvCacheScorer] = kvCacheScorerWeight loggerDebug.Info("Initialized KVCacheAwareScorer", "weight", kvCacheScorerWeight) } + +func setPDFilter() { + ctx := context.Background() + loggerDebug := log.FromContext(ctx).WithName("scheduler_config").V(logutil.DEBUG) + + if envutil.GetEnvString(pdFilterEnablementEnvVar, "false", loggerDebug) != "true" { + loggerDebug.Info("Skipping PDFilter creation as it is not enabled") + return + } + + defaultConfig.filters = append(defaultConfig.filters, filter.PDFilter) + loggerDebug.Info("Initialized PDFilter") +} + +func setPrefixScorer() { + ctx := context.Background() + loggerDebug := log.FromContext(ctx).WithName("scheduler_config").V(logutil.DEBUG) + + if envutil.GetEnvString(prefixScorerEnablementEnvVar, "false", loggerDebug) != "true" { + loggerDebug.Info("Skipping PrefixScorer creation as it is not enabled") + return + } + + prefixScorerWeight := envutil.GetEnvInt(prefixScorerWeightEnvVar, 1, loggerDebug) + prefixScorer := scorer.NewPrefixAwareScorer(nil) + defaultConfig.scorers[prefixScorer] = prefixScorerWeight // TODO: make configurable + defaultConfig.postResponsePlugins = append(defaultConfig.postResponsePlugins, prefixScorer) + + loggerDebug.Info("Initialized PrefixAwareScorer", "weight", prefixScorerWeight) +} diff --git a/pkg/epp/scheduling/plugins/scorer/kvcache-aware-scorer.go b/pkg/epp/scheduling/plugins/scorer/kvcache-aware-scorer.go index bc025751e..47b326f54 100644 --- 
a/pkg/epp/scheduling/plugins/scorer/kvcache-aware-scorer.go +++ b/pkg/epp/scheduling/plugins/scorer/kvcache-aware-scorer.go @@ -97,7 +97,20 @@ func (s *KVCacheAwareScorer) Score(ctx *types.SchedulingContext, pods []types.Po } loggerDebug.Info("Got pod scores", "scores", scores) - return indexerScoresToNormalizedScoredPods(pods, scores) + if len(scores) == 0 { + loggerDebug.Info("No scores found for pods") + return nil + } + + podToKey := func(pod types.Pod) (string, bool) { + metricsPod := pod.GetPod() + if metricsPod == nil { + return "", false + } + return metricsPod.Address, true + } + + return indexedScoresToNormalizedScoredPods(pods, podToKey, scores) } func getMinMax(scores map[string]int) (int, int) { @@ -116,17 +129,21 @@ func getMinMax(scores map[string]int) (int, int) { return minScore, maxScore } -func indexerScoresToNormalizedScoredPods(pods []types.Pod, scores map[string]int) map[types.Pod]float64 { +// podToKey is a function type that converts a Pod to a string key. +// It returns the key and a boolean indicating success. +type podToKeyFunc func(pod types.Pod) (string, bool) + +func indexedScoresToNormalizedScoredPods(pods []types.Pod, podToKey podToKeyFunc, scores map[string]int) map[types.Pod]float64 { scoredPods := make(map[types.Pod]float64) minScore, maxScore := getMinMax(scores) for _, pod := range pods { - metricsPod := pod.GetPod() - if metricsPod == nil { + key, ok := podToKey(pod) + if !ok { continue } - if score, ok := scores[metricsPod.Address]; ok { + if score, ok := scores[key]; ok { if minScore == maxScore { scoredPods[pod] = 1.0 continue diff --git a/pkg/epp/scheduling/plugins/scorer/prefix_aware_scorer.go b/pkg/epp/scheduling/plugins/scorer/prefix_aware_scorer.go new file mode 100644 index 000000000..1ff99ecc6 --- /dev/null +++ b/pkg/epp/scheduling/plugins/scorer/prefix_aware_scorer.go @@ -0,0 +1,100 @@ +/* +Copyright 2025 The Kubernetes Authors. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package scorer + +import ( + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" +) + +const prefixAwareScorerName = "prefix-aware-scorer" + +// PrefixAwareScorer is a routing scorer that scores pods based on the longest prefix match +// between the request's prompt and stored prefixes. The score is normalized between 0 and 1, +// where 1 represents the longest matching prefix. +type PrefixAwareScorer struct { + prefixStore *PrefixStore +} + +var _ plugins.Scorer = &PrefixAwareScorer{} + +// NewPrefixAwareScorer creates a new PrefixAwareScorer with the given +// PrefixStoreConfig. If the config is nil, default is used. +func NewPrefixAwareScorer(config *PrefixStoreConfig) *PrefixAwareScorer { + return &PrefixAwareScorer{ + prefixStore: NewPrefixStore(config), + } +} + +func (s *PrefixAwareScorer) Name() string { + return "prefix-aware-scorer" +} + +// Score scores the target pods based on the longest prefix match. 
+func (s *PrefixAwareScorer) Score(ctx *types.SchedulingContext, pods []types.Pod) map[types.Pod]float64 { + loggerDebug := log.FromContext(ctx).WithName(prefixAwareScorerName).V(logutil.DEBUG) + if ctx.Req == nil { + loggerDebug.Info("Request is nil, skipping scoring") + return nil + } + + scores := s.prefixStore.FindMatchingPods(ctx.Req.Prompt, ctx.Req.Model) + loggerDebug.Info("Got pod scores", "scores", scores) + + if len(scores) == 0 { + loggerDebug.Info("No scores found for pods") + return nil + } + + podToKey := func(pod types.Pod) (string, bool) { + if pod.GetPod() == nil { + return "", false + } + + return pod.GetPod().NamespacedName.String(), true + } + + return indexedScoresToNormalizedScoredPods(pods, podToKey, scores) +} + +// PostResponse implements the PostResponsePlugin interface. +// It adds the prefix to the PrefixStore for the given pod. +func (s *PrefixAwareScorer) PostResponse(ctx *types.SchedulingContext, pod types.Pod) { + debugLogger := log.FromContext(ctx).WithName(prefixAwareScorerName).V(logutil.DEBUG) + + if ctx.Req == nil { + debugLogger.Info("Request is nil, skipping PostResponse") + return + } + + if pod.GetPod() == nil { + debugLogger.Info("Pod is nil, skipping PostResponse", "req", ctx.Req, "pod", pod) + return + } + + if err := s.prefixStore.AddEntry(ctx.Req.Model, ctx.Req.Prompt, &pod.GetPod().NamespacedName); err != nil { + debugLogger.Error(err, "Failed to add entry to prefix store", "req", ctx.Req, "pod", pod) + return + } +} + +// GetPrefixStore returns the scorer's PrefixStore. 
+func (s *PrefixAwareScorer) GetPrefixStore() *PrefixStore { + return s.prefixStore +} diff --git a/pkg/epp/scheduling/prefix_aware_scorer_test.go b/pkg/epp/scheduling/plugins/scorer/prefix_aware_scorer_test.go similarity index 57% rename from pkg/epp/scheduling/prefix_aware_scorer_test.go rename to pkg/epp/scheduling/plugins/scorer/prefix_aware_scorer_test.go index 851bd8456..49318fa47 100644 --- a/pkg/epp/scheduling/prefix_aware_scorer_test.go +++ b/pkg/epp/scheduling/plugins/scorer/prefix_aware_scorer_test.go @@ -14,17 +14,16 @@ See the License for the specific language governing permissions and limitations under the License. */ -package scheduling +package scorer_test import ( "context" - "testing" - "time" - k8stypes "k8s.io/apimachinery/pkg/types" "sigs.k8s.io/controller-runtime/pkg/log" backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins/scorer" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" + "testing" ) func TestPrefixAwareScorer(t *testing.T) { @@ -32,14 +31,6 @@ func TestPrefixAwareScorer(t *testing.T) { logger := log.FromContext(ctx) ctx = log.IntoContext(ctx, logger) - // Create a prefix store with test configuration - prefixStore := NewPrefixStore(PrefixStoreConfig{ - MaxEntries: 100, - MinPrefixLen: 3, - MaxPrefixLen: 10, - EntryTTL: 1 * time.Hour, - }) - // Create test pods pod1 := &types.PodMetrics{ Pod: &backendmetrics.Pod{ @@ -68,7 +59,7 @@ func TestPrefixAwareScorer(t *testing.T) { prefixToAdd string podToAdd k8stypes.NamespacedName prefixModel string // Model name to use when adding the prefix - expectedScores []float64 + expectedScores map[types.Pod]float64 }{ { name: "no prompt", @@ -78,17 +69,20 @@ func TestPrefixAwareScorer(t *testing.T) { prefixToAdd: "hello", podToAdd: pod1.Pod.NamespacedName, prefixModel: "model1", - expectedScores: []float64{0, 0}, // No prompt means zero scores + expectedScores: 
map[types.Pod]float64{}, // No prompt means zero scores }, { - name: "exact prefix match", - weight: 1.0, - prompt: "hello world", - modelName: "model1", - prefixToAdd: "hello", - podToAdd: pod1.Pod.NamespacedName, - prefixModel: "model1", - expectedScores: []float64{1.0, 0}, // pod1 matches, pod2 doesn't + name: "exact prefix match", + weight: 1.0, + prompt: "hello world", + modelName: "model1", + prefixToAdd: "hello", + podToAdd: pod1.Pod.NamespacedName, + prefixModel: "model1", + expectedScores: map[types.Pod]float64{ + pod1: 1.0, + pod2: 0.0, + }, // pod1 matches, pod2 doesn't }, { name: "no prefix match", @@ -98,7 +92,7 @@ func TestPrefixAwareScorer(t *testing.T) { prefixToAdd: "hello", podToAdd: pod1.Pod.NamespacedName, prefixModel: "model1", - expectedScores: []float64{0, 0}, // No matching prefix + expectedScores: map[types.Pod]float64{}, // No matching prefix }, { name: "different model name", @@ -107,63 +101,54 @@ func TestPrefixAwareScorer(t *testing.T) { modelName: "model2", // Try to find with model2 prefixToAdd: "hello", podToAdd: pod1.Pod.NamespacedName, - prefixModel: "model1", // But prefix was added with model1 - expectedScores: []float64{0, 0}, // Model name mismatch should result in no match + prefixModel: "model1", // But prefix was added with model1 + expectedScores: map[types.Pod]float64{}, // Model name mismatch should result in no match }, { - name: "custom weight", - weight: 0.5, - prompt: "hello world", - modelName: "model1", - prefixToAdd: "hello", - podToAdd: pod1.Pod.NamespacedName, - prefixModel: "model1", - expectedScores: []float64{0.5, 0}, // Weight affects score + name: "custom weight", + weight: 0.5, + prompt: "hello world", + modelName: "model1", + prefixToAdd: "hello", + podToAdd: pod1.Pod.NamespacedName, + prefixModel: "model1", + expectedScores: map[types.Pod]float64{ + pod1: 0.5, // Pod1 matches with weight + pod2: 0.0, // Pod2 doesn't match + }, // Weight affects score }, } for _, tt := range tests { t.Run(tt.name, func(t 
*testing.T) { // Reset prefix store for each test - prefixStore = NewPrefixStore(PrefixStoreConfig{ - MaxEntries: 100, - MinPrefixLen: 3, - MaxPrefixLen: 10, - EntryTTL: 1 * time.Hour, - }) + config := scorer.DefaultPrefixStoreConfig() + config.BlockSize = 5 // set small chunking for testing + + s := scorer.NewPrefixAwareScorer(config) // Add prefix if specified if tt.prefixToAdd != "" { - err := prefixStore.AddPrefix(ctx, tt.prefixToAdd, tt.podToAdd, tt.prefixModel) + err := s.GetPrefixStore().AddEntry(tt.prefixModel, + tt.prefixToAdd, &tt.podToAdd) if err != nil { t.Fatalf("Failed to add prefix: %v", err) } } - // Create scorer with test weight - scorer := NewPrefixAwareScorer(tt.weight, prefixStore) - // Create test context - sCtx := types.NewContext(ctx, &types.LLMRequest{ + sCtx := types.NewSchedulingContext(ctx, &types.LLMRequest{ Prompt: tt.prompt, ResolvedTargetModel: tt.modelName, - }, []*types.PodMetrics{}) + }, []types.Pod{}, 0) // Score pods - pods := []*types.PodMetrics{pod1, pod2} - scores, err := scorer.ScoreTargets(sCtx, pods) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } - - // Verify scores - if len(scores) != len(tt.expectedScores) { - t.Fatalf("Expected %d scores, got %d", len(tt.expectedScores), len(scores)) - } + pods := []types.Pod{pod1, pod2} + scores := s.Score(sCtx, pods) - for i, score := range scores { - if score.Score != tt.expectedScores[i] { - t.Errorf("Pod %d: expected score %v, got %v", i, tt.expectedScores[i], score.Score) + for p, score := range scores { + if score != tt.expectedScores[p] { + t.Errorf("Pod %v: expected score %v, got %v", p, tt.expectedScores[p], score) } } }) diff --git a/pkg/epp/scheduling/plugins/scorer/prefix_store.go b/pkg/epp/scheduling/plugins/scorer/prefix_store.go new file mode 100644 index 000000000..7f8f96c38 --- /dev/null +++ b/pkg/epp/scheduling/plugins/scorer/prefix_store.go @@ -0,0 +1,177 @@ +/* +Copyright 2025 The Kubernetes Authors. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package scorer + +import ( + "fmt" + "k8s.io/apimachinery/pkg/types" + "sync" + "time" + + "github.com/cespare/xxhash/v2" + lru "github.com/hashicorp/golang-lru/v2" +) + +const ( + // defaultMaxCacheSize sets the maximum number of blocks the LRU cache can store. + defaultMaxCacheSize = 500000 + // defaultBlockSize defines how many runes each block contains in the prefix cache. + defaultBlockSize = 256 + // defaultMaxBlockCacheSize sets the maximum number of pods a block can store. + defaultMaxBlockCacheSize = 100 +) + +// PrefixStoreConfig contains initialization configuration for PrefixStore. +type PrefixStoreConfig struct { + // CacheSize sets the maximum number of blocks the LRU cache can store. + CacheSize int + // BlockSize defines how many runes each block contains in the prefix cache. + BlockSize int + // BlockCacheSize sets the maximum number of pods a block can store. + BlockCacheSize int +} + +// DefaultPrefixStoreConfig returns an PrefixStoreConfig instance with default +// configuration. +func DefaultPrefixStoreConfig() *PrefixStoreConfig { + return &PrefixStoreConfig{ + CacheSize: defaultMaxCacheSize, + BlockSize: defaultBlockSize, + BlockCacheSize: defaultMaxBlockCacheSize, + } +} + +// block holds the tokens contained in the block. 
+type block struct { + Pods *lru.Cache[types.NamespacedName, time.Time] //TODO: implement Pod eviction based on staleness +} + +// PrefixStore is an in-memory prefix-to-block cache with xxhash keys and LRU +// eviction. +type PrefixStore struct { + sync.RWMutex + + cacheSize int + blockSize int + blockCacheSize int + + store map[string]*lru.Cache[uint64, *block] +} + +// NewPrefixStore initializes the PrefixStore with LRU cache. +// If the configuration is nil, default is used. +func NewPrefixStore(config *PrefixStoreConfig) *PrefixStore { + if config == nil { + config = DefaultPrefixStoreConfig() + } + + return &PrefixStore{ + cacheSize: config.CacheSize, + blockSize: config.BlockSize, + blockCacheSize: config.BlockCacheSize, + store: make(map[string]*lru.Cache[uint64, *block]), + } +} + +// AddEntry adds a new entry to the prefix store. +func (s *PrefixStore) AddEntry(modelName string, prompt string, pod *types.NamespacedName) error { + if prompt == "" || pod == nil { + return nil + } + + s.Lock() + // Get or create the LRU cache for the model + cache, ok := s.store[modelName] + if !ok { + var err error + cache, err = lru.New[uint64, *block](s.cacheSize) + if err != nil { + return fmt.Errorf("failed to create LRU cache for model %s: %w", modelName, err) + } + + s.store[modelName] = cache + } + s.Unlock() + + // Chunk the text into blocks and populate the cache + for start := 0; start < len(prompt); start += s.blockSize { + end := start + s.blockSize + if end > len(prompt) { + end = len(prompt) + } + + // Compute the hash for the current block + digest := xxhash.New() + if _, err := digest.WriteString(prompt[start:end]); err != nil { + return fmt.Errorf("failed to compute chunk hash: %w", err) + } + + blockHash := digest.Sum64() + + b, ok := cache.Get(blockHash) + if !ok { + pods, err := lru.New[types.NamespacedName, time.Time](s.blockCacheSize) + if err != nil { + return fmt.Errorf("failed to create LRU cache for block: %w", err) + } + + b = &block{Pods: pods} + 
cache.Add(blockHash, b) + } + + b.Pods.Add(*pod, time.Now()) // thread-safe + } + + return nil +} + +// FindMatchingPods finds all pods that match the given prompt and model name. +// It returns a map of pods and the number of blocks they match. +func (s *PrefixStore) FindMatchingPods(prompt, modelName string) map[string]int { + s.RLock() + cache, ok := s.store[modelName] // cache is thread-safe + s.RUnlock() + + if !ok { + return nil + } + + matchedPods := make(map[string]int) + for start := 0; start < len(prompt); start += s.blockSize { + end := start + s.blockSize + if end > len(prompt) { + end = len(prompt) + } + + digest := xxhash.New() + if _, err := digest.WriteString(prompt[start:end]); err != nil { + return nil + } + blockHash := digest.Sum64() + + b, ok := cache.Get(blockHash) + if !ok { + break // match consecutive blocks + } + + for _, pod := range b.Pods.Keys() { + matchedPods[pod.String()]++ + } + } + + return matchedPods +} diff --git a/pkg/epp/scheduling/plugins/scorer/prefix_store_test.go b/pkg/epp/scheduling/plugins/scorer/prefix_store_test.go new file mode 100644 index 000000000..c0765b845 --- /dev/null +++ b/pkg/epp/scheduling/plugins/scorer/prefix_store_test.go @@ -0,0 +1,59 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package scorer_test + +import ( + "context" + k8stypes "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins/scorer" + "testing" +) + +// TestBasicPrefixOperations tests the basic functionality of adding and finding prefixes +func TestBasicPrefixOperations(t *testing.T) { + ctx := context.Background() + logger := log.FromContext(ctx) + ctx = log.IntoContext(ctx, logger) + + config := scorer.DefaultPrefixStoreConfig() + config.BlockSize = 5 // set small chunking for testing + store := scorer.NewPrefixStore(config) + + podName := k8stypes.NamespacedName{ + Name: "pod1", + Namespace: "default", + } + + // Test adding a prefix + err := store.AddEntry("model1", "hello", &podName) + if err != nil { + t.Errorf("Failed to add prefix: %v", err) + } + + // Test finding the exact prefix + scores := store.FindMatchingPods("hello", "model1") + if _, ok := scores[podName.String()]; !ok { + t.Errorf("Expected pod %v, scores %v", podName, scores) + } + + // Test finding with a longer prefix + scores = store.FindMatchingPods("hello world", "model1") + if _, ok := scores[podName.String()]; !ok { + t.Errorf("Expected pod %v, scores %v", podName, scores) + } +} diff --git a/pkg/epp/scheduling/prefix_aware_scorer.go b/pkg/epp/scheduling/prefix_aware_scorer.go deleted file mode 100644 index 75504777f..000000000 --- a/pkg/epp/scheduling/prefix_aware_scorer.go +++ /dev/null @@ -1,88 +0,0 @@ -/* -Copyright 2025 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. -*/ - -package scheduling - -import ( - "sigs.k8s.io/controller-runtime/pkg/log" - "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" -) - -// PrefixAwareScorer is a routing scorer that scores pods based on the longest prefix match -// between the request's prompt and stored prefixes. The score is normalized between 0 and 1, -// where 1 represents the longest matching prefix. -type PrefixAwareScorer struct { - weight float64 - prefixStore *PrefixStore -} - -// NewPrefixAwareScorer creates a new PrefixAwareScorer with the given weight and prefix store -func NewPrefixAwareScorer(weight float64, prefixStore *PrefixStore) Scorer { - return &PrefixAwareScorer{ - weight: weight, - prefixStore: prefixStore, - } -} - -// ScoreTargets scores the target pods based on the longest prefix match -func (s *PrefixAwareScorer) ScoreTargets(ctx *types.Context, pods []*types.PodMetrics) ([]PodScore, error) { - logger := log.FromContext(ctx) - scoredPods := make([]PodScore, len(pods)) - - // Get the prompt from the request - prompt := ctx.Req.Prompt - if prompt == "" { - // If no prompt, return zero scores for all pods - for i, pod := range pods { - scoredPods[i] = PodScore{ - Score: 0, - Pod: pod, - } - } - return scoredPods, nil - } - - // Find the best matching pod for the prompt - matchedPod, found := s.prefixStore.FindPodForPrefix(ctx, prompt, ctx.Req.ResolvedTargetModel) - if !found { - // If no matching prefix found, return zero scores for all pods - for i, pod := range pods { - scoredPods[i] = PodScore{ - Score: 0, - Pod: pod, - } - } - return scoredPods, nil - } - - // Assign scores based on pod match - for i, pod := range pods { - if pod.NamespacedName == matchedPod { - logger.Info("Pod found for prefix", "prompt", prompt, "pod", pod.NamespacedName.String()) - scoredPods[i] = PodScore{ - Score: s.weight, // Use the configured weight for the matching pod - 
Pod: pod, - } - } else { - scoredPods[i] = PodScore{ - Score: 0, // Zero score for non-matching pods - Pod: pod, - } - } - } - - return scoredPods, nil -} diff --git a/pkg/epp/scheduling/prefix_store.go b/pkg/epp/scheduling/prefix_store.go deleted file mode 100644 index 6878646d2..000000000 --- a/pkg/epp/scheduling/prefix_store.go +++ /dev/null @@ -1,188 +0,0 @@ -package scheduling - -import ( - "context" - "sync" - "time" - - "github.com/armon/go-radix" - "k8s.io/apimachinery/pkg/types" - "sigs.k8s.io/controller-runtime/pkg/log" -) - -// PrefixEntry represents a single entry in the prefix store -type PrefixEntry struct { - PodRef types.NamespacedName - LastUsed time.Time - ModelName string -} - -// PrefixStoreConfig holds configuration for the prefix store -type PrefixStoreConfig struct { - MaxEntries int // Maximum total entries in the store - MinPrefixLen int // Minimum prefix length to store - MaxPrefixLen int // Maximum prefix length to store - EntryTTL time.Duration // Time-to-live for entries -} - -// PrefixStore manages prompt prefixes and their pod assignments -type PrefixStore struct { - tree *radix.Tree - mu sync.RWMutex - config PrefixStoreConfig -} - -// NewPrefixStore creates a new PrefixStore with the given configuration -func NewPrefixStore(config PrefixStoreConfig) *PrefixStore { - return &PrefixStore{ - tree: radix.New(), - config: config, - } -} - -// AddPrefix adds or updates a prefix entry in the store -func (ps *PrefixStore) AddPrefix(ctx context.Context, prefix string, pod types.NamespacedName, modelName string) error { - ps.mu.Lock() - defer ps.mu.Unlock() - - logger := log.FromContext(ctx) - - // Validate prefix length - if len(prefix) < ps.config.MinPrefixLen { - return ErrPrefixTooShort - } - if len(prefix) > ps.config.MaxPrefixLen { - prefix = prefix[:ps.config.MaxPrefixLen] - } - - // Check if we're updating an existing entry - if val, exists := ps.tree.Get(prefix); exists { - entry := val.(*PrefixEntry) - if entry.PodRef == pod && 
entry.ModelName == modelName { - entry.LastUsed = time.Now() - ps.tree.Insert(prefix, entry) - return nil - } - } - - // Check total entries limit - if ps.tree.Len() >= ps.config.MaxEntries { - ps.evictOldest() - } - - // Add new entry - entry := &PrefixEntry{ - PodRef: pod, - LastUsed: time.Now(), - ModelName: modelName, - } - ps.tree.Insert(prefix, entry) - - logger.Info("Added prefix entry", "prefix", prefix, "pod", pod.String(), "model", modelName) - return nil -} - -// FindPodForPrefix finds the best matching pod for a given prefix and model -func (ps *PrefixStore) FindPodForPrefix(ctx context.Context, prefix string, modelName string) (types.NamespacedName, bool) { - ps.mu.RLock() - defer ps.mu.RUnlock() - - logger := log.FromContext(ctx) - - if len(prefix) < ps.config.MinPrefixLen { - return types.NamespacedName{}, false - } - - if len(prefix) > ps.config.MaxPrefixLen { - prefix = prefix[:ps.config.MaxPrefixLen] - } - - // Use LongestPrefix to find the best match - matchedPrefix, val, found := ps.tree.LongestPrefix(prefix) - if !found { - return types.NamespacedName{}, false - } - - entry := val.(*PrefixEntry) - - // Check if entry has expired or model doesn't match - if time.Since(entry.LastUsed) > ps.config.EntryTTL || entry.ModelName != modelName { - // Don't remove here to avoid write lock - return types.NamespacedName{}, false - } - - // Update LastUsed time for the matched entry - entry.LastUsed = time.Now() - ps.tree.Insert(matchedPrefix, entry) - - logger.Info("Found pod for prefix", "prefix", prefix, "matchedPrefix", matchedPrefix, "pod", entry.PodRef.String(), "model", modelName) - return entry.PodRef, true -} - -// evictOldest removes the oldest entry from the store -func (ps *PrefixStore) evictOldest() { - var oldestKey string - var oldestTime time.Time - first := true - - // Use Walk to find the oldest entry - ps.tree.Walk(func(key string, value interface{}) bool { - entry := value.(*PrefixEntry) - if first || entry.LastUsed.Before(oldestTime) { 
- oldestKey = key - oldestTime = entry.LastUsed - first = false - } - return false // continue walking - }) - - if oldestKey != "" { - ps.tree.Delete(oldestKey) - } -} - -// cleanupExpired removes expired entries -func (ps *PrefixStore) cleanupExpired(ctx context.Context) { - ps.mu.Lock() - defer ps.mu.Unlock() - - logger := log.FromContext(ctx) - now := time.Now() - var keysToDelete []string - - // Use Walk to find expired entries - ps.tree.Walk(func(key string, value interface{}) bool { - entry := value.(*PrefixEntry) - if now.Sub(entry.LastUsed) > ps.config.EntryTTL { - keysToDelete = append(keysToDelete, key) - } - return false - }) - - // Delete expired entries - for _, key := range keysToDelete { - ps.tree.Delete(key) - } - - if len(keysToDelete) > 0 { - logger.Info("Cleaned up expired entries", "count", len(keysToDelete)) - } -} - -// RunMaintenance performs periodic cleanup of expired entries -func (ps *PrefixStore) RunMaintenance(ctx context.Context) { - logger := log.FromContext(ctx) - ticker := time.NewTicker(ps.config.EntryTTL / 2) - defer ticker.Stop() - - for { - select { - case <-ctx.Done(): - logger.Info("Maintenance routine stopping") - return - case <-ticker.C: - ps.cleanupExpired(ctx) - logger.Info("Completed maintenance cycle") - } - } -} diff --git a/pkg/epp/scheduling/prefix_store_test.go b/pkg/epp/scheduling/prefix_store_test.go deleted file mode 100644 index 6854de747..000000000 --- a/pkg/epp/scheduling/prefix_store_test.go +++ /dev/null @@ -1,315 +0,0 @@ -/* -Copyright 2025 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. -*/ - -package scheduling - -import ( - "context" - "testing" - "time" - - k8stypes "k8s.io/apimachinery/pkg/types" - "sigs.k8s.io/controller-runtime/pkg/log" -) - -// TestBasicPrefixOperations tests the basic functionality of adding and finding prefixes -func TestBasicPrefixOperations(t *testing.T) { - ctx := context.Background() - logger := log.FromContext(ctx) - ctx = log.IntoContext(ctx, logger) - - config := PrefixStoreConfig{ - MaxEntries: 100, - MinPrefixLen: 3, - MaxPrefixLen: 10, - EntryTTL: 1 * time.Hour, - } - - store := NewPrefixStore(config) - podName := k8stypes.NamespacedName{ - Name: "pod1", - Namespace: "default", - } - - // Test adding a prefix - err := store.AddPrefix(ctx, "hello", podName, "model1") - if err != nil { - t.Errorf("Failed to add prefix: %v", err) - } - - // Test finding the exact prefix - foundPod, found := store.FindPodForPrefix(ctx, "hello", "model1") - if !found { - t.Error("Expected to find prefix") - } - if foundPod != podName { - t.Errorf("Expected pod %v, got %v", podName, foundPod) - } - - // Test finding with a longer prefix - foundPod, found = store.FindPodForPrefix(ctx, "hello world", "model1") - if !found { - t.Error("Expected to find prefix with longer input") - } - if foundPod != podName { - t.Errorf("Expected pod %v, got %v", podName, foundPod) - } - - // Test updating an existing prefix - err = store.AddPrefix(ctx, "hello", podName, "model1") - if err != nil { - t.Errorf("Failed to update prefix: %v", err) - } -} - -// TestPrefixLengthConstraints tests the handling of prefixes that are too short or too long -func TestPrefixLengthConstraints(t *testing.T) { - ctx := context.Background() - logger := log.FromContext(ctx) - ctx = log.IntoContext(ctx, logger) - - config := PrefixStoreConfig{ - MaxEntries: 100, - MinPrefixLen: 3, - MaxPrefixLen: 10, - EntryTTL: 1 * time.Hour, - } - - store := NewPrefixStore(config) - 
podName := k8stypes.NamespacedName{ - Name: "pod1", - Namespace: "default", - } - - // Test adding a prefix that's too short - err := store.AddPrefix(ctx, "hi", podName, "model1") - if err == nil { - t.Error("Expected error for prefix that's too short") - } - - // Test adding a prefix that's too long (should be truncated) - longPrefix := "this is a very long prefix" - err = store.AddPrefix(ctx, longPrefix, podName, "model1") - if err != nil { - t.Errorf("Expected success when adding long prefix (should be truncated): %v", err) - } - - // Test finding with the truncated version - truncatedPrefix := longPrefix[:10] // MaxPrefixLen is 10 - foundPod, found := store.FindPodForPrefix(ctx, truncatedPrefix, "model1") - if !found { - t.Error("Expected to find truncated prefix") - } - if foundPod != podName { - t.Errorf("Expected pod %v, got %v", podName, foundPod) - } - - // Test finding with the full long prefix (should match the truncated version) - foundPod, found = store.FindPodForPrefix(ctx, longPrefix, "model1") - if !found { - t.Error("Expected to find pod with full long prefix (should match truncated version)") - } - if foundPod != podName { - t.Errorf("Expected pod %v, got %v", podName, foundPod) - } - - // Test finding with a prefix that's too short - _, found = store.FindPodForPrefix(ctx, "hi", "model1") - if found { - t.Error("Expected not to find prefix that's too short") - } -} - -// TestModelNameMatching tests that prefixes are only matched when the model name matches -func TestModelNameMatching(t *testing.T) { - ctx := context.Background() - logger := log.FromContext(ctx) - ctx = log.IntoContext(ctx, logger) - - config := PrefixStoreConfig{ - MaxEntries: 100, - MinPrefixLen: 3, - MaxPrefixLen: 10, - EntryTTL: 1 * time.Hour, - } - - store := NewPrefixStore(config) - podName := k8stypes.NamespacedName{ - Name: "pod1", - Namespace: "default", - } - - // Add prefix with model1 - err := store.AddPrefix(ctx, "hello", podName, "model1") - if err != nil { - 
t.Errorf("Failed to add prefix: %v", err) - } - - // Test finding with same model name - _, found := store.FindPodForPrefix(ctx, "hello", "model1") - if !found { - t.Error("Expected to find prefix with matching model name") - } - - // Test finding with different model name - _, found = store.FindPodForPrefix(ctx, "hello", "model2") - if found { - t.Error("Expected not to find prefix with different model name") - } -} - -// TestTTLExpiration tests that prefixes are removed after their TTL expires -func TestTTLExpiration(t *testing.T) { - ctx := context.Background() - logger := log.FromContext(ctx) - ctx = log.IntoContext(ctx, logger) - - config := PrefixStoreConfig{ - MaxEntries: 100, - MinPrefixLen: 3, - MaxPrefixLen: 10, - EntryTTL: 100 * time.Millisecond, - } - - store := NewPrefixStore(config) - podName := k8stypes.NamespacedName{ - Name: "pod1", - Namespace: "default", - } - - // Add prefix - err := store.AddPrefix(ctx, "hello", podName, "model1") - if err != nil { - t.Errorf("Failed to add prefix: %v", err) - } - - // Should find it immediately - _, found := store.FindPodForPrefix(ctx, "hello", "model1") - if !found { - t.Error("Expected to find prefix immediately after adding") - } - - // Wait for TTL to expire - time.Sleep(200 * time.Millisecond) - - // Should not find it after TTL expires - _, found = store.FindPodForPrefix(ctx, "hello", "model1") - if found { - t.Error("Expected prefix to be expired after TTL") - } -} - -// TestMaxEntries tests that the store respects the maximum number of entries -func TestMaxEntries(t *testing.T) { - ctx := context.Background() - logger := log.FromContext(ctx) - ctx = log.IntoContext(ctx, logger) - - config := PrefixStoreConfig{ - MaxEntries: 2, - MinPrefixLen: 3, - MaxPrefixLen: 10, - EntryTTL: 1 * time.Hour, - } - - store := NewPrefixStore(config) - podName := k8stypes.NamespacedName{ - Name: "pod1", - Namespace: "default", - } - - // Add first prefix - err := store.AddPrefix(ctx, "prefix1", podName, "model1") - if err 
!= nil { - t.Errorf("Failed to add first prefix: %v", err) - } - - // Add second prefix - err = store.AddPrefix(ctx, "prefix2", podName, "model1") - if err != nil { - t.Errorf("Failed to add second prefix: %v", err) - } - - // Add third prefix (should cause eviction) - err = store.AddPrefix(ctx, "prefix3", podName, "model1") - if err != nil { - t.Errorf("Failed to add third prefix: %v", err) - } - - // First prefix should be evicted - _, found := store.FindPodForPrefix(ctx, "prefix1", "model1") - if found { - t.Error("Expected first prefix to be evicted") - } - - // Second and third prefixes should still be there - _, found = store.FindPodForPrefix(ctx, "prefix2", "model1") - if !found { - t.Error("Expected second prefix to still be present") - } - - _, found = store.FindPodForPrefix(ctx, "prefix3", "model1") - if !found { - t.Error("Expected third prefix to still be present") - } -} - -// TestMaintenanceRoutine tests that the maintenance routine properly cleans up expired entries -func TestMaintenanceRoutine(t *testing.T) { - ctx, cancel := context.WithCancel(context.Background()) - logger := log.FromContext(ctx) - ctx = log.IntoContext(ctx, logger) - - config := PrefixStoreConfig{ - MaxEntries: 100, - MinPrefixLen: 3, - MaxPrefixLen: 10, - EntryTTL: 100 * time.Millisecond, - } - - store := NewPrefixStore(config) - podName := k8stypes.NamespacedName{ - Name: "pod1", - Namespace: "default", - } - - // Add prefix - err := store.AddPrefix(ctx, "hello", podName, "model1") - if err != nil { - t.Errorf("Failed to add prefix: %v", err) - } - - // Start maintenance routine - go store.RunMaintenance(ctx) - - // Should find it immediately - _, found := store.FindPodForPrefix(ctx, "hello", "model1") - if !found { - t.Error("Expected to find prefix immediately after adding") - } - - // Wait for TTL to expire - time.Sleep(200 * time.Millisecond) - - // Should not find it after TTL expires - _, found = store.FindPodForPrefix(ctx, "hello", "model1") - if found { - 
t.Error("Expected prefix to be expired after TTL") - } - - // Clean up - cancel() -} diff --git a/pkg/epp/scheduling/types/types.go b/pkg/epp/scheduling/types/types.go index eb8612072..d46b9d063 100644 --- a/pkg/epp/scheduling/types/types.go +++ b/pkg/epp/scheduling/types/types.go @@ -31,7 +31,6 @@ type LLMRequest struct { Prompt string // Target models is a map of target model name to weight. TargetModels map[string]int - Prompt string Headers map[string]string // Resolved target model is the final target model after traffic split. ResolvedTargetModel string From b852c923a2b8717457d337b02a6ecddf69b92549 Mon Sep 17 00:00:00 2001 From: Ricardo Noriega Date: Mon, 5 May 2025 12:21:16 +0200 Subject: [PATCH 56/60] Remove KVcache scorer changes for traceability Signed-off-by: Ricardo Noriega --- .../plugins/scorer/kvcache-aware-scorer.go | 27 ++++--------------- 1 file changed, 5 insertions(+), 22 deletions(-) diff --git a/pkg/epp/scheduling/plugins/scorer/kvcache-aware-scorer.go b/pkg/epp/scheduling/plugins/scorer/kvcache-aware-scorer.go index 47b326f54..bc025751e 100644 --- a/pkg/epp/scheduling/plugins/scorer/kvcache-aware-scorer.go +++ b/pkg/epp/scheduling/plugins/scorer/kvcache-aware-scorer.go @@ -97,20 +97,7 @@ func (s *KVCacheAwareScorer) Score(ctx *types.SchedulingContext, pods []types.Po } loggerDebug.Info("Got pod scores", "scores", scores) - if len(scores) == 0 { - loggerDebug.Info("No scores found for pods") - return nil - } - - podToKey := func(pod types.Pod) (string, bool) { - metricsPod := pod.GetPod() - if metricsPod == nil { - return "", false - } - return metricsPod.Address, true - } - - return indexedScoresToNormalizedScoredPods(pods, podToKey, scores) + return indexerScoresToNormalizedScoredPods(pods, scores) } func getMinMax(scores map[string]int) (int, int) { @@ -129,21 +116,17 @@ func getMinMax(scores map[string]int) (int, int) { return minScore, maxScore } -// podToKey is a function type that converts a Pod to a string key. 
-// It returns the key and a boolean indicating success. -type podToKeyFunc func(pod types.Pod) (string, bool) - -func indexedScoresToNormalizedScoredPods(pods []types.Pod, podToKey podToKeyFunc, scores map[string]int) map[types.Pod]float64 { +func indexerScoresToNormalizedScoredPods(pods []types.Pod, scores map[string]int) map[types.Pod]float64 { scoredPods := make(map[types.Pod]float64) minScore, maxScore := getMinMax(scores) for _, pod := range pods { - key, ok := podToKey(pod) - if !ok { + metricsPod := pod.GetPod() + if metricsPod == nil { continue } - if score, ok := scores[key]; ok { + if score, ok := scores[metricsPod.Address]; ok { if minScore == maxScore { scoredPods[pod] = 1.0 continue From a0e02c0dd17b3d925e4325a62e729e79eb618e23 Mon Sep 17 00:00:00 2001 From: Maroon Ayoub Date: Mon, 5 May 2025 17:34:39 +0300 Subject: [PATCH 57/60] addressed review comments Signed-off-by: Maroon Ayoub --- pkg/epp/scheduling/local_config.go | 16 +--------- .../plugins/scorer/prefix_aware_scorer.go | 30 +++++++++++++++++++ .../scheduling/plugins/scorer/prefix_store.go | 8 +++-- 3 files changed, 37 insertions(+), 17 deletions(-) diff --git a/pkg/epp/scheduling/local_config.go b/pkg/epp/scheduling/local_config.go index 9e554b679..f6beb4787 100644 --- a/pkg/epp/scheduling/local_config.go +++ b/pkg/epp/scheduling/local_config.go @@ -20,7 +20,6 @@ import ( "context" "sigs.k8s.io/controller-runtime/pkg/log" - "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins/filter" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins/picker" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins/scorer" envutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/env" @@ -31,7 +30,7 @@ const ( kvCacheScorerEnablementEnvVar = "ENABLE_KVCACHE_AWARE_SCORER" loadAwareScorerEnablementEnvVar = "ENABLE_LOAD_AWARE_SCORER" prefixScorerEnablementEnvVar = "ENABLE_PREFIX_AWARE_SCORER" - pdFilterEnablementEnvVar = "ENABLE_PD_FILTER" + 
pdFilterEnablementEnvVar = "ENABLE_PD_FILTER" kvCacheScorerWeightEnvVar = "KVCACHE_AWARE_SCORER_WEIGHT" loadAwareScorerWeightEnvVar = "LOAD_AWARE_SCORER_WEIGHT" @@ -86,19 +85,6 @@ func setKVCacheAwareScorer() { loggerDebug.Info("Initialized KVCacheAwareScorer", "weight", kvCacheScorerWeight) } -func setPDFilter() { - ctx := context.Background() - loggerDebug := log.FromContext(ctx).WithName("scheduler_config").V(logutil.DEBUG) - - if envutil.GetEnvString(pdFilterEnablementEnvVar, "false", loggerDebug) != "true" { - loggerDebug.Info("Skipping PDFilter creation as it is not enabled") - return - } - - defaultConfig.filters = append(defaultConfig.filters, filter.PDFilter) - loggerDebug.Info("Initialized PDFilter") -} - func setPrefixScorer() { ctx := context.Background() loggerDebug := log.FromContext(ctx).WithName("scheduler_config").V(logutil.DEBUG) diff --git a/pkg/epp/scheduling/plugins/scorer/prefix_aware_scorer.go b/pkg/epp/scheduling/plugins/scorer/prefix_aware_scorer.go index 1ff99ecc6..0dee156d9 100644 --- a/pkg/epp/scheduling/plugins/scorer/prefix_aware_scorer.go +++ b/pkg/epp/scheduling/plugins/scorer/prefix_aware_scorer.go @@ -98,3 +98,33 @@ func (s *PrefixAwareScorer) PostResponse(ctx *types.SchedulingContext, pod types func (s *PrefixAwareScorer) GetPrefixStore() *PrefixStore { return s.prefixStore } + +// podToKey is a function type that converts a Pod to a string key. +// It returns the key and a boolean indicating success. 
+type podToKeyFunc func(pod types.Pod) (string, bool) + +func indexedScoresToNormalizedScoredPods(pods []types.Pod, podToKey podToKeyFunc, + scores map[string]int) map[types.Pod]float64 { + scoredPods := make(map[types.Pod]float64) + minScore, maxScore := getMinMax(scores) + + for _, pod := range pods { + key, ok := podToKey(pod) + if !ok { + continue + } + + if score, ok := scores[key]; ok { + if minScore == maxScore { + scoredPods[pod] = 1.0 + continue + } + + scoredPods[pod] = float64(score-minScore) / float64(maxScore-minScore) + } else { + scoredPods[pod] = 0.0 + } + } + + return scoredPods +} diff --git a/pkg/epp/scheduling/plugins/scorer/prefix_store.go b/pkg/epp/scheduling/plugins/scorer/prefix_store.go index 7f8f96c38..8c6961647 100644 --- a/pkg/epp/scheduling/plugins/scorer/prefix_store.go +++ b/pkg/epp/scheduling/plugins/scorer/prefix_store.go @@ -89,7 +89,7 @@ func NewPrefixStore(config *PrefixStoreConfig) *PrefixStore { // AddEntry adds a new entry to the prefix store. func (s *PrefixStore) AddEntry(modelName string, prompt string, pod *types.NamespacedName) error { - if prompt == "" || pod == nil { + if prompt == "" || pod == nil || len(prompt) < s.blockSize /* skip if prompt is too short */ { return nil } @@ -111,7 +111,7 @@ func (s *PrefixStore) AddEntry(modelName string, prompt string, pod *types.Names for start := 0; start < len(prompt); start += s.blockSize { end := start + s.blockSize if end > len(prompt) { - end = len(prompt) + break // skip partial blocks } // Compute the hash for the current block @@ -142,6 +142,10 @@ func (s *PrefixStore) AddEntry(modelName string, prompt string, pod *types.Names // FindMatchingPods finds all pods that match the given prompt and model name. // It returns a map of pods and the number of blocks they match. 
func (s *PrefixStore) FindMatchingPods(prompt, modelName string) map[string]int { + if prompt == "" || modelName == "" || len(prompt) < s.blockSize /* skip if prompt is too short */ { + return nil + } + s.RLock() cache, ok := s.store[modelName] // cache is thread-safe s.RUnlock() From f52caa2f288399554d4908bf9450f81dee332d9f Mon Sep 17 00:00:00 2001 From: dmitripikus <46105577+dmitripikus@users.noreply.github.com> Date: Mon, 5 May 2025 22:48:03 +0300 Subject: [PATCH 58/60] Session affinity scorer (#117) * 'session affinity scorer' partial implementation (without headers in response) * Fix in filling request headers * Encoded value of namespaced pod name is sent in response to client * Support of session affinity scorer configuration via environment variables, is added * Go file for session affinity scorer is renamed * Redundant 'sessions' field is removed * Redundant 'ScorerWithPostResponse' struct is removed * - SessionID is renamed to sessionToken - Map fetch is done instead of loop * Session token name is changed to 'x-session-token' * Minor fixes are made in README * Small fix after merge --------- Co-authored-by: Shmuel Kallner --- README.md | 6 ++ pkg/epp/handlers/request.go | 2 +- pkg/epp/scheduling/local_config.go | 36 +++++++-- .../plugins/scorer/session-affinity-scorer.go | 79 +++++++++++++++++++ 4 files changed, 114 insertions(+), 9 deletions(-) create mode 100644 pkg/epp/scheduling/plugins/scorer/session-affinity-scorer.go diff --git a/README.md b/README.md index dc8921795..1948be6bf 100644 --- a/README.md +++ b/README.md @@ -23,6 +23,12 @@ export ENABLE_LOAD_AWARE_SCORER=true export LOAD_AWARE_SCORER_WEIGHT=1.0 ``` +To enable the SessionAwareScorer, the following environment variables must be configured: +``` +export ENABLE_SESSION_AWARE_SCORER=true +export SESSION_AWARE_SCORER_WEIGHT=1.0 +``` + To enable Prefill/Decode (PD) processing, the following environment variable must be configured: ``` export PD_ENABLED=true diff --git 
a/pkg/epp/handlers/request.go b/pkg/epp/handlers/request.go index 4997a8b30..47cd37dee 100644 --- a/pkg/epp/handlers/request.go +++ b/pkg/epp/handlers/request.go @@ -159,7 +159,7 @@ func (s *StreamingServer) HandleRequestHeaders(ctx context.Context, reqCtx *Requ } for _, header := range req.RequestHeaders.Headers.Headers { - reqCtx.RequestHeaders[header.Key] = header.Value + reqCtx.RequestHeaders[header.Key] = string(header.RawValue) } return nil diff --git a/pkg/epp/scheduling/local_config.go b/pkg/epp/scheduling/local_config.go index f6beb4787..6b1c3e481 100644 --- a/pkg/epp/scheduling/local_config.go +++ b/pkg/epp/scheduling/local_config.go @@ -27,14 +27,16 @@ import ( ) const ( - kvCacheScorerEnablementEnvVar = "ENABLE_KVCACHE_AWARE_SCORER" - loadAwareScorerEnablementEnvVar = "ENABLE_LOAD_AWARE_SCORER" - prefixScorerEnablementEnvVar = "ENABLE_PREFIX_AWARE_SCORER" - pdFilterEnablementEnvVar = "ENABLE_PD_FILTER" - - kvCacheScorerWeightEnvVar = "KVCACHE_AWARE_SCORER_WEIGHT" - loadAwareScorerWeightEnvVar = "LOAD_AWARE_SCORER_WEIGHT" - prefixScorerWeightEnvVar = "PREFIX_AWARE_SCORER_WEIGHT" + kvCacheScorerEnablementEnvVar = "ENABLE_KVCACHE_AWARE_SCORER" + loadAwareScorerEnablementEnvVar = "ENABLE_LOAD_AWARE_SCORER" + prefixScorerEnablementEnvVar = "ENABLE_PREFIX_AWARE_SCORER" + sessionAwareScorerEnablementEnvVar = "ENABLE_SESSION_AWARE_SCORER" + pdFilterEnablementEnvVar = "ENABLE_PD_FILTER" + + kvCacheScorerWeightEnvVar = "KVCACHE_AWARE_SCORER_WEIGHT" + loadAwareScorerWeightEnvVar = "LOAD_AWARE_SCORER_WEIGHT" + prefixScorerWeightEnvVar = "PREFIX_AWARE_SCORER_WEIGHT" + sessionAwareScorerWeightEnvVar = "SESSION_AWARE_SCORER_WEIGHT" ) func init() { @@ -45,6 +47,7 @@ func setDefaultConfig() { // since the default config is a global variable, we add this function to minimize rebase conflicts. // this configuration is a temporary state, it should be better streamlined. 
setLoadAwareScorer() + setSessionAwareScorer() setKVCacheAwareScorer() setPrefixScorer() @@ -65,6 +68,23 @@ func setLoadAwareScorer() { loggerDebug.Info("Initialized LoadAwareScorer", "weight", loadBasedScorerWeight) } +func setSessionAwareScorer() { + ctx := context.Background() + loggerDebug := log.FromContext(ctx).WithName("scheduler_config").V(logutil.DEBUG) + + if envutil.GetEnvString(sessionAwareScorerEnablementEnvVar, "false", loggerDebug) != "true" { + loggerDebug.Info("Skipping SessionAwareScorer creation as it is not enabled") + return + } + + sessionBasedScorerWeight := envutil.GetEnvInt(sessionAwareScorerWeightEnvVar, 1, loggerDebug) + sessionAffinity := scorer.NewSessionAffinity() + + defaultConfig.scorers[sessionAffinity] = sessionBasedScorerWeight + defaultConfig.postResponsePlugins = append(defaultConfig.postResponsePlugins, sessionAffinity) + loggerDebug.Info("Initialized SessionAwareScorer", "weight", sessionBasedScorerWeight) +} + func setKVCacheAwareScorer() { ctx := context.Background() loggerDebug := log.FromContext(ctx).WithName("scheduler_config").V(logutil.DEBUG) diff --git a/pkg/epp/scheduling/plugins/scorer/session-affinity-scorer.go b/pkg/epp/scheduling/plugins/scorer/session-affinity-scorer.go new file mode 100644 index 000000000..2431b95a2 --- /dev/null +++ b/pkg/epp/scheduling/plugins/scorer/session-affinity-scorer.go @@ -0,0 +1,79 @@ +/* +Copyright 2025 The Kubernetes Authors. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package scorer + +import ( + "encoding/base64" + "time" + + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" +) + +const ( + sessionKeepAliveTime = 60 * time.Minute // How long should an idle session be kept alive + sessionKeepAliveCheckFrequency = 15 * time.Minute // How often to check for overly idle sessions + sessionTokenHeader = "x-session-token" // name of the session header in request +) + +// sessionAffinity is a routing scorer that routes subsequent +// requests in a session to the same pod as the first request in the +// session was sent to, by giving that pod the specified weight and assigning +// zero score to the rest of the targets +type SessionAffinity struct { +} + +func NewSessionAffinity() *SessionAffinity { + return &SessionAffinity{} +} + +func (s *SessionAffinity) Name() string { + return "session affinity scorer" +} + +func (s *SessionAffinity) Score(ctx *types.SchedulingContext, pods []types.Pod) map[types.Pod]float64 { + scoredPods := make(map[types.Pod]float64) + + reqHeaders := ctx.Req.Headers + + var sessionToken = "" + v, ok := reqHeaders[sessionTokenHeader] + if ok { + sessionToken = v + } + + podName := "" + if sessionToken != "" { + decodedBytes, err := base64.StdEncoding.DecodeString(sessionToken) + if err != nil { + ctx.Logger.Error(err, "Error decoding") + } else { + podName = string(decodedBytes) + } + } + for _, pod := range pods { + if podName == "" { + scoredPods[pod] = 0.0 + } else { + if pod.GetPod().NamespacedName.String() == podName { + scoredPods[pod] = 1.0 + } + } + } + + return scoredPods +} + +func (s *SessionAffinity) PostResponse(ctx *types.SchedulingContext, pod types.Pod) { + ctx.MutatedHeaders[sessionTokenHeader] = base64.StdEncoding.EncodeToString([]byte(pod.GetPod().NamespacedName.String())) +} From b98733f31c31fa8f3f98521bb6a853cdad0c91d5 Mon Sep 17 00:00:00 2001 From: Kfir Toledo Date: Tue, 6 May 2025 08:26:13 +0300 Subject: [PATCH 59/60] [docs]: Add prefix flags to README 
Signed-off-by: Kfir Toledo --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index 1948be6bf..76a333eea 100644 --- a/README.md +++ b/README.md @@ -17,6 +17,12 @@ export KVCACHE_INDEXER_REDIS_ADDR= export HF_TOKEN= ``` +To enable the PrefixAwareScorer, the following environment variables must be configured: +``` +export ENABLE_PREFIX_AWARE_SCORER=true +export PREFIX_AWARE_SCORER_WEIGHT=1.0 +``` + To enable the LoadAwareScorer, the following environment variables must be configured: ``` export ENABLE_LOAD_AWARE_SCORER=true From 1bd3e92ee1ee149e05e8cdfde60ee00f0be1d3e0 Mon Sep 17 00:00:00 2001 From: Maroon Ayoub Date: Tue, 6 May 2025 16:11:20 +0300 Subject: [PATCH 60/60] switch prefix_scorer updates to post-schedule temporarily Signed-off-by: Maroon Ayoub --- pkg/epp/scheduling/local_config.go | 2 +- .../scheduling/plugins/scorer/prefix_aware_scorer.go | 10 +++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/pkg/epp/scheduling/local_config.go b/pkg/epp/scheduling/local_config.go index 6b1c3e481..a1812b0bd 100644 --- a/pkg/epp/scheduling/local_config.go +++ b/pkg/epp/scheduling/local_config.go @@ -117,7 +117,7 @@ func setPrefixScorer() { prefixScorerWeight := envutil.GetEnvInt(prefixScorerWeightEnvVar, 1, loggerDebug) prefixScorer := scorer.NewPrefixAwareScorer(nil) defaultConfig.scorers[prefixScorer] = prefixScorerWeight // TODO: make configurable - defaultConfig.postResponsePlugins = append(defaultConfig.postResponsePlugins, prefixScorer) + defaultConfig.postSchedulePlugins = append(defaultConfig.postSchedulePlugins, prefixScorer) loggerDebug.Info("Initialized PrefixAwareScorer", "weight", prefixScorerWeight) } diff --git a/pkg/epp/scheduling/plugins/scorer/prefix_aware_scorer.go b/pkg/epp/scheduling/plugins/scorer/prefix_aware_scorer.go index 0dee156d9..8c3d673b0 100644 --- a/pkg/epp/scheduling/plugins/scorer/prefix_aware_scorer.go +++ b/pkg/epp/scheduling/plugins/scorer/prefix_aware_scorer.go @@ -73,10 
+73,14 @@ func (s *PrefixAwareScorer) Score(ctx *types.SchedulingContext, pods []types.Pod return indexedScoresToNormalizedScoredPods(pods, podToKey, scores) } -// PostResponse implements the PostResponsePlugin interface. +// PostSchedule implements the PostSchedulePlugin interface. // It adds the prefix to the PrefixStore for the given pod. -func (s *PrefixAwareScorer) PostResponse(ctx *types.SchedulingContext, pod types.Pod) { - debugLogger := log.FromContext(ctx).WithName(prefixAwareScorerName).V(logutil.DEBUG) +// TODO: switch to PostResponse. +func (s *PrefixAwareScorer) PostSchedule(ctx *types.SchedulingContext, res *types.Result) { + pod := res.TargetPod + + debugLogger := log.FromContext(ctx).WithName(prefixAwareScorerName) + debugLogger.Info("PostResponse called", "req", ctx.Req, "pod", pod) if ctx.Req == nil { debugLogger.Info("Request is nil, skipping PostResponse")