From 46199e0c431c2bbd28de5113434ab0701e6cd574 Mon Sep 17 00:00:00 2001 From: Ridwan Sharif <18472685+ridwanmsharif@users.noreply.github.com> Date: Tue, 28 May 2024 22:31:00 +0200 Subject: [PATCH] entrypoint: add liveness probe endpoint to the sidecar (#33) * entrypoint: add liveness probe endpoint to the sidecar This change uses the liveness probes as a way to guarantee a regular period of uninterrupted CPU for the sidecar to complete its Prometheus scrapes and flushes to GMP. Change-Id: Ic6f0ed38ade237d6179dea12268cd2575d221146 * entrypoint: handle serving errors Change-Id: Id31f5294721e6583af84aa4127b8bae21a243267 --- Dockerfile | 1 - entrypoint.go | 24 ++++++++++++++++++ run-service-simple.yaml | 31 ++++++++++++----------- run-service.yaml | 55 ++++++++++++++++++++++------------------- 4 files changed, 70 insertions(+), 41 deletions(-) diff --git a/Dockerfile b/Dockerfile index 2362be3..085eaf8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -11,7 +11,6 @@ RUN make build FROM alpine:latest RUN apk add --no-cache ca-certificates -RUN apk add openssl=3.1.4-r6 && apk upgrade openssl --no-cache COPY --from=builder /sidecar/bin/rungmpcol /rungmpcol COPY --from=builder /sidecar/bin/run-gmp-entrypoint /run-gmp-entrypoint diff --git a/entrypoint.go b/entrypoint.go index 4c6043e..c81c189 100644 --- a/entrypoint.go +++ b/entrypoint.go @@ -19,6 +19,7 @@ import ( "fmt" "io/ioutil" "log" + "net/http" "os" "os/signal" "path/filepath" @@ -34,6 +35,9 @@ var userConfigFile = "/etc/rungmp/config.yaml" var otelConfigFile = "/run/rungmp/otel.yaml" var configRefreshInterval = 20 * time.Second var selfMetricsPort = 0 +var livenessProbePort = 13133 +var livenessProbePath = "/liveness" +var delayLivenessProbe = 5 * time.Second func getRawUserConfig(userConfigFile string) (string, error) { _, err := os.Stat(userConfigFile) @@ -84,6 +88,16 @@ func generateOtelConfig(ctx context.Context, userConfigFile string) error { return nil } +// The container is allocated CPU for the duration of the 
healthcheck. Delaying +// the response to this probe allows the container to complete telemetry flushes +// that may have been throttled. +// +// TODO(b/342463831): Use a more reliable way of checking if telemetry is being +// flushed instead of using a static sleep. +func healthcheckHandler(_ http.ResponseWriter, _ *http.Request) { + time.Sleep(delayLivenessProbe) +} + func main() { // SIGINT handles Ctrl+C locally. // SIGTERM handles Cloud Run termination signal. @@ -101,6 +115,16 @@ func main() { log.Fatal(err) } + entrypointMux := http.NewServeMux() + entrypointMux.HandleFunc(livenessProbePath, healthcheckHandler) + + go func() { + err := http.ListenAndServe(fmt.Sprintf(":%d", livenessProbePort), entrypointMux) + if err != nil && err != http.ErrServerClosed { + log.Fatal(err) + } + }() + // Spin up new-subprocess that runs the OTel collector and store the PID. // This OTel collector should use the generated config. var procAttr os.ProcAttr diff --git a/run-service-simple.yaml b/run-service-simple.yaml index 113e5fe..ec5d63e 100644 --- a/run-service-simple.yaml +++ b/run-service-simple.yaml @@ -23,21 +23,22 @@ spec: metadata: annotations: run.googleapis.com/execution-environment: gen2 - run.googleapis.com/cpu-throttling: 'false' + run.googleapis.com/cpu-throttling: "false" run.googleapis.com/container-dependencies: '{"collector":["app"]}' spec: containers: - - image: "%SAMPLE_APP_IMAGE%" - name: app - startupProbe: - httpGet: - path: /startup - port: 8000 - livenessProbe: - httpGet: - path: /liveness - port: 8000 - ports: - - containerPort: 8000 - - image: us-docker.pkg.dev/cloud-ops-agents-artifacts/cloud-run-gmp-sidecar/cloud-run-gmp-sidecar:1.1.1 - name: collector + - image: "%SAMPLE_APP_IMAGE%" + name: app + startupProbe: + httpGet: + path: /startup + port: 8000 + livenessProbe: + httpGet: + path: /liveness + port: 8000 + ports: + - containerPort: 8000 + - image: us-docker.pkg.dev/cloud-ops-agents-artifacts/cloud-run-gmp-sidecar/cloud-run-gmp-sidecar:1.1.1 + 
name: collector + # TODO(b/342463134): Post release 1.2.0, update the collector container to use the healthcheck endpoint diff --git a/run-service.yaml b/run-service.yaml index 2baed17..83cb62c 100644 --- a/run-service.yaml +++ b/run-service.yaml @@ -23,32 +23,37 @@ spec: metadata: annotations: run.googleapis.com/execution-environment: gen2 - run.googleapis.com/cpu-throttling: 'false' run.googleapis.com/container-dependencies: '{"collector":["app"]}' - run.googleapis.com/secrets: '%SECRET%:projects/%PROJECT%/secrets/%SECRET%' + run.googleapis.com/secrets: "%SECRET%:projects/%PROJECT%/secrets/%SECRET%" spec: containers: - - image: "%SAMPLE_APP_IMAGE%" - name: app - startupProbe: - httpGet: - path: /startup - port: 8000 - livenessProbe: - httpGet: - path: /liveness - port: 8000 - ports: - - containerPort: 8000 - - image: "%OTELCOL_IMAGE%" - name: collector - volumeMounts: - - mountPath: /etc/rungmp/ - name: config + - image: "%SAMPLE_APP_IMAGE%" + name: app + startupProbe: + httpGet: + path: /startup + port: 8000 + livenessProbe: + httpGet: + path: /liveness + port: 8000 + ports: + - containerPort: 8000 + - image: "%OTELCOL_IMAGE%" + name: collector + livenessProbe: + httpGet: + path: /liveness + port: 13133 + timeoutSeconds: 30 + periodSeconds: 30 + volumeMounts: + - mountPath: /etc/rungmp/ + name: config volumes: - - name: config - secret: - items: - - key: latest - path: config.yaml - secretName: '%SECRET%' + - name: config + secret: + items: + - key: latest + path: config.yaml + secretName: "%SECRET%"