Skip to content

Commit

Permalink
prometheus: add start up and shutdown scrape
Browse files Browse the repository at this point in the history
This change does the following:
- Add an entrypoint for the sidecar that can control the collector
- Add start up scraping with a 10s offset
- Add scraping before shutdown
- Add metric adjuster that falls back on collector start time
- Add unit test and hook things up to the prometheus pipeline
- Add sample app that dies after a request and a cloudbuild yaml file to
  run it

Change-Id: I33b4a8af0fd9568c4f0f258e95c600e1b9c6df0d
Signed-off-by: Ridwan Sharif <[email protected]>
  • Loading branch information
ridwanmsharif committed Oct 26, 2023
1 parent 316c815 commit 182598e
Show file tree
Hide file tree
Showing 30 changed files with 1,020 additions and 216 deletions.
2 changes: 2 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -1 +1,3 @@
.git
Dockerfile
.dockerignore
8 changes: 4 additions & 4 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,12 @@ RUN go install github.com/client9/misspell/cmd/[email protected] \
&& go install github.com/golangci/golangci-lint/cmd/[email protected] \
&& go install github.com/google/[email protected]
RUN apt update && apt install -y make
RUN make build-collector
RUN make build

FROM alpine:3
RUN apk add --no-cache ca-certificates
COPY --from=builder /sidecar/bin/rungmpcol /rungmpcol
COPY collector-config.yaml /etc/rungmp/config.yaml
COPY --from=builder /sidecar/bin/run-gmp-entrypoint /run-gmp-entrypoint
COPY collector-config.yaml /etc/rungmp/config.yml

ENTRYPOINT ["/rungmpcol"]
CMD ["--config", "/etc/rungmp/config.yaml"]
ENTRYPOINT ["/run-gmp-entrypoint"]
12 changes: 11 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -103,11 +103,21 @@ GO_BUILD_OUT ?= ./bin/rungmpcol
build-collector:
CGO_ENABLED=0 go build -tags=$(GO_BUILD_TAGS) -o $(GO_BUILD_OUT) $(LD_FLAGS) -buildvcs=false ./collector/cmd/rungmpcol

OTELCOL_BINARY = google-cloud-run-rmp-sidecar-$(GOOS)
OTELCOL_BINARY = google-cloud-run-gmp-sidecar-$(GOOS)
.PHONY: build-collector-full-name
build-collector-full-name:
$(MAKE) GO_BUILD_OUT=./bin/$(OTELCOL_BINARY) build-collector

ENTRYPOINT_BINARY = run-gmp-entrypoint
.PHONY: build-run-gmp-entrypoint
build-run-gmp-entrypoint:
CGO_ENABLED=0 go build -tags=$(GO_BUILD_TAGS) -o ./bin/$(ENTRYPOINT_BINARY) -buildvcs=false entrypoint.go

.PHONY: build
build:
$(MAKE) build-collector
$(MAKE) build-run-gmp-entrypoint

.PHONY: test
test:
go test -tags=$(GO_BUILD_TAGS) $(GO_TEST_VERBOSE) -race ./...
Expand Down
6 changes: 2 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ gcloud auth configure-docker \
Build and push the app with the following commands:

```
pushd app
pushd sample-apps/simple-app
docker build -t us-east1-docker.pkg.dev/$GCP_PROJECT/run-gmp/sample-app .
docker push us-east1-docker.pkg.dev/$GCP_PROJECT/run-gmp/sample-app
popd
Expand Down Expand Up @@ -158,9 +158,7 @@ curl $SERVICE_URL/metrics
This should return the following output on success:

```
Logged request to /logging/sample-app.log
Generated 10 spans!
Updated sidecar-sample-counter metric!
User request received!
```

### Clean up
Expand Down
104 changes: 104 additions & 0 deletions cloudbuild-single-req.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Cloud Build pipeline: builds the single-request sample app and the
# collector sidecar image, rewrites run-service.yaml to point at the freshly
# built images, deploys them as one multicontainer Cloud Run service, and
# opens the service to unauthenticated requests.
steps:
  # Build the sample app that exits after serving a single request.
  - name: "gcr.io/cloud-builders/docker"
    args: ["build", "-t", "${_IMAGE_APP}", "./sample-apps/single-req-app"]
    id: BUILD_SAMPLE_APP
    waitFor: ["-"]  # "-" = start immediately, no dependencies

  - name: "gcr.io/cloud-builders/docker"
    args: ["push", "${_IMAGE_APP}"]
    id: PUSH_SAMPLE_APP
    waitFor:
      - BUILD_SAMPLE_APP

  # Build the collector sidecar image from the repository-root Dockerfile.
  - name: "gcr.io/cloud-builders/docker"
    args: ["build", "-t", "${_IMAGE_COLLECTOR}", "."]
    id: BUILD_COLLECTOR
    waitFor: ["-"]

  - name: "gcr.io/cloud-builders/docker"
    args: ["push", "${_IMAGE_COLLECTOR}"]
    id: PUSH_COLLECTOR
    waitFor:
      - BUILD_COLLECTOR

  # Substitute the image placeholders in the service manifest with the
  # images built above.
  - name: "ubuntu"
    env:
      - "IMAGE_APP=${_IMAGE_APP}"
      - "IMAGE_COLLECTOR=${_IMAGE_COLLECTOR}"
    script: |
      sed -i s@%OTELCOL_IMAGE%@${IMAGE_COLLECTOR}@g run-service.yaml
      sed -i s@%SAMPLE_APP_IMAGE%@${IMAGE_APP}@g run-service.yaml
    id: REPLACE_YAML_VALUE
    waitFor: ["-"]

  # Deploy the app + sidecar as a single multicontainer Cloud Run service.
  - name: "gcr.io/google.com/cloudsdktool/cloud-sdk:slim"
    entrypoint: gcloud
    args:
      [
        "run",
        "services",
        "replace",
        "run-service.yaml",
        "--region",
        "${_REGION}",
      ]
    id: DEPLOY_MULTICONTAINER
    waitFor:
      - PUSH_SAMPLE_APP
      - PUSH_COLLECTOR
      - REPLACE_YAML_VALUE

  # Apply an IAM policy that permits unauthenticated invocations so the
  # deployed sample can be exercised with a plain curl.
  - name: "gcr.io/google.com/cloudsdktool/cloud-sdk:slim"
    entrypoint: gcloud
    args:
      [
        "run",
        "services",
        "set-iam-policy",
        "run-gmp-sidecar-service",
        "policy.yaml",
        "--region",
        "${_REGION}",
        "--quiet",
      ]
    id: ALLOW_UNAUTHENTICATED
    waitFor:
      - DEPLOY_MULTICONTAINER

substitutions:
  _REGION: us-east1
  _REGISTRY: ${_REGION}-docker.pkg.dev/${PROJECT_ID}/run-gmp
  _IMAGE_APP: ${_REGISTRY}/sample-app
  _IMAGE_COLLECTOR: ${_REGISTRY}/collector
  _SA_NAME: run-gmp-sa

images:
  - ${_IMAGE_APP}
  - ${_IMAGE_COLLECTOR}

# Comment out the following line to run Cloud Build with your project's
# existing default service account instead; that account must hold the
# following roles:
# * roles/iam.serviceAccountUser
# * roles/storage.objectViewer
# * roles/logging.logWriter
# * roles/artifactregistry.createOnPushWriter
# * roles/run.admin
serviceAccount: "projects/${PROJECT_ID}/serviceAccounts/${_SA_NAME}@${PROJECT_ID}.iam.gserviceaccount.com"

options:
  dynamic_substitutions: true
  logging: CLOUD_LOGGING_ONLY
4 changes: 2 additions & 2 deletions cloudbuild.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

steps:
- name: "gcr.io/cloud-builders/docker"
args: ["build", "-t", "${_IMAGE_APP}", "./app"]
args: ["build", "-t", "${_IMAGE_APP}", "./sample-apps/simple-app"]
id: BUILD_SAMPLE_APP
waitFor: ["-"]

Expand Down Expand Up @@ -69,7 +69,7 @@ steps:
"run",
"services",
"set-iam-policy",
"opentelemetry-cloud-run-sample",
"run-gmp-sidecar-service",
"policy.yaml",
"--region",
"${_REGION}",
Expand Down
4 changes: 4 additions & 0 deletions collector-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,13 @@

receivers:
prometheus:
use_start_time_metric: true
use_collector_start_time_fallback: true
allow_cumulative_resets: true
config:
scrape_configs:
- job_name: 'run-gmp-sidecar'
scrape_interval: 10s
static_configs:
- targets: ['0.0.0.0:8000']

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,11 @@ func createMetricsExporter(
params exporter.CreateSettings,
cfg component.Config) (exporter.Metrics, error) {
eCfg := cfg.(*Config)
mExp, err := collector.NewGoogleCloudMetricsExporter(ctx, eCfg.GMPConfig.toCollectorConfig(), params.TelemetrySettings.Logger, params.BuildInfo.Version, eCfg.Timeout)

// We turn off normalization for serverless environments.
collectorConfig := eCfg.GMPConfig.toCollectorConfig()
collectorConfig.MetricConfig.CumulativeNormalization = false
mExp, err := collector.NewGoogleCloudMetricsExporter(ctx, collectorConfig, params.TelemetrySettings.Logger, params.BuildInfo.Version, eCfg.Timeout)
if err != nil {
return nil, err
}
Expand Down
43 changes: 35 additions & 8 deletions collector/receiver/prometheusreceiver/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,19 +45,42 @@ const (
targetAllocatorHTTPSDConfigKey = "http_sd_config"
)

type MetricAdjusterOpts struct {
// UseStartTimeMetric enables retrieving the start time of all counter
// metrics from the process_start_time_seconds metric. This is only correct
// if all counters on that endpoint started after the process start time,
// and the process is the only actor exporting the metric after the process
// started. It should not be used in "exporters" which export counters that
// may have started before the process itself. Use only if you know what you
// are doing, as this may result in incorrect rate calculations.
UseStartTimeMetric bool `mapstructure:"use_start_time_metric"`
StartTimeMetricRegex string `mapstructure:"start_time_metric_regex"`

// UseCollectorStartTimeFallback enables using a fallback start time if a
// start time is otherwise unavailable when adjusting metrics. This would
// happen if the UseStartTimeMetric is used but the application doesn't emit
// a process_start_time_seconds metric or a metric that matches the
// StartTimeMetricRegex provided.
//
// If enabled, the fallback start time used for adjusted metrics is an
// approximation of the collector start time.
//
// This option should only be used when we can guarantee that the scraped
// processes that emit metrics that started after the collector has started.
UseCollectorStartTimeFallback bool `mapstructure:"use_collector_start_time_fallback"`
// AllowCumulativeResets enables preserving resets of cumulative points when
// the metric adjuster is used. Should be enabled if we expect cumulative
// point resets AND we want to use the StartTimeMetricAdjuster. Note that
// this will require that we cache the previous point for every timeseries,
// and so can increase memory used by the collector.
AllowCumulativeResets bool `mapstructure:"allow_cumulative_resets"`
}

// Config defines configuration for Prometheus receiver.
type Config struct {
PrometheusConfig *promconfig.Config `mapstructure:"-"`
BufferPeriod time.Duration `mapstructure:"buffer_period"`
BufferCount int `mapstructure:"buffer_count"`
// UseStartTimeMetric enables retrieving the start time of all counter metrics
// from the process_start_time_seconds metric. This is only correct if all counters on that endpoint
// started after the process start time, and the process is the only actor exporting the metric after
// the process started. It should not be used in "exporters" which export counters that may have
// started before the process itself. Use only if you know what you are doing, as this may result
// in incorrect rate calculations.
UseStartTimeMetric bool `mapstructure:"use_start_time_metric"`
StartTimeMetricRegex string `mapstructure:"start_time_metric_regex"`

// PreserveUntyped is a setting that lets the collector preserve the untypedness of
// untyped metrics as a metric attribute. If set, all untyped prometheus metrics from
Expand All @@ -71,6 +94,10 @@ type Config struct {
// that requires that all keys present in the config actually exist on the
// structure, ie.: it will error if an unknown key is present.
ConfigPlaceholder interface{} `mapstructure:"config"`

// Settings for adjusting metrics. Will default to using an InitialPointAdjuster
// which will use the first scraped point to define the start time for the timeseries.
AdjusterOpts MetricAdjusterOpts `mapstructure:",squash"`
}

type targetAllocator struct {
Expand Down
4 changes: 2 additions & 2 deletions collector/receiver/prometheusreceiver/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,8 @@ func TestLoadConfig(t *testing.T) {
r1 := cfg.(*Config)
assert.Equal(t, r1.PrometheusConfig.ScrapeConfigs[0].JobName, "demo")
assert.Equal(t, time.Duration(r1.PrometheusConfig.ScrapeConfigs[0].ScrapeInterval), 5*time.Second)
assert.Equal(t, r1.UseStartTimeMetric, true)
assert.Equal(t, r1.StartTimeMetricRegex, "^(.+_)*process_start_time_seconds$")
assert.Equal(t, r1.AdjusterOpts.UseStartTimeMetric, true)
assert.Equal(t, r1.AdjusterOpts.StartTimeMetricRegex, "^(.+_)*process_start_time_seconds$")

assert.Equal(t, "http://my-targetallocator-service", r1.TargetAllocator.Endpoint)
assert.Equal(t, 30*time.Second, r1.TargetAllocator.Interval)
Expand Down
4 changes: 3 additions & 1 deletion collector/receiver/prometheusreceiver/internal/appendable.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,13 +50,15 @@ func NewAppendable(
preserveUntyped bool,
startTimeMetricRegex *regexp.Regexp,
useCreatedMetric bool,
useCollectorStartTimeFallback bool,
allowCumulativeResets bool,
externalLabels labels.Labels,
registry *featuregate.Registry) (storage.Appendable, error) {
var metricAdjuster MetricsAdjuster
if !useStartTimeMetric {
metricAdjuster = NewInitialPointAdjuster(set.Logger, gcInterval, useCreatedMetric)
} else {
metricAdjuster = NewStartTimeMetricAdjuster(set.Logger, startTimeMetricRegex)
metricAdjuster = NewStartTimeMetricAdjuster(set.Logger, gcInterval, startTimeMetricRegex, useCollectorStartTimeFallback, allowCumulativeResets)
}

obsrecv, err := obsreport.NewReceiver(obsreport.ReceiverSettings{ReceiverID: set.ID, Transport: transport, ReceiverCreateSettings: set})
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -241,7 +241,7 @@ func TestMetricGroupData_toDistributionUnitTest(t *testing.T) {
for i, tv := range tt.scrapes {
var lbls labels.Labels
if tv.extraLabel.Name != "" {
lbls = labels.NewBuilder(tt.labels).Set(tv.extraLabel.Name, tv.extraLabel.Value).Labels(nil)
lbls = labels.NewBuilder(tt.labels).Set(tv.extraLabel.Name, tv.extraLabel.Value).Labels()
} else {
lbls = tt.labels.Copy()
}
Expand Down
24 changes: 21 additions & 3 deletions collector/receiver/prometheusreceiver/internal/metrics_adjuster.go
Original file line number Diff line number Diff line change
Expand Up @@ -246,14 +246,20 @@ type initialPointAdjuster struct {
jobsMap *JobsMap
logger *zap.Logger
useCreatedMetric bool
// usePointTimeForReset forces the adjuster to use the timestamp of the point instead
// of the start timestamp when it detects resets.
// This is useful when this adjuster is used after another adjuster that pre-populated start
// times.
usePointTimeForReset bool
}

// NewInitialPointAdjuster returns a new MetricsAdjuster that adjust metrics' start times based on the initial received points.
func NewInitialPointAdjuster(logger *zap.Logger, gcInterval time.Duration, useCreatedMetric bool) MetricsAdjuster {
return &initialPointAdjuster{
jobsMap: NewJobsMap(gcInterval),
logger: logger,
useCreatedMetric: useCreatedMetric,
jobsMap: NewJobsMap(gcInterval),
logger: logger,
useCreatedMetric: useCreatedMetric,
usePointTimeForReset: false,
}
}

Expand Down Expand Up @@ -342,6 +348,10 @@ func (a *initialPointAdjuster) adjustMetricHistogram(tsm *timeseriesMap, current
if currentDist.Count() < tsi.histogram.previousCount || currentDist.Sum() < tsi.histogram.previousSum {
// reset re-initialize everything.
tsi.histogram.startTime = currentDist.StartTimestamp()
if a.usePointTimeForReset {
tsi.histogram.startTime = currentDist.Timestamp()
currentDist.SetStartTimestamp(tsi.histogram.startTime)
}
tsi.histogram.previousCount = currentDist.Count()
tsi.histogram.previousSum = currentDist.Sum()
continue
Expand Down Expand Up @@ -383,6 +393,10 @@ func (a *initialPointAdjuster) adjustMetricSum(tsm *timeseriesMap, current pmetr
if currentSum.DoubleValue() < tsi.number.previousValue {
// reset re-initialize everything.
tsi.number.startTime = currentSum.StartTimestamp()
if a.usePointTimeForReset {
tsi.number.startTime = currentSum.Timestamp()
currentSum.SetStartTimestamp(tsi.number.startTime)
}
tsi.number.previousValue = currentSum.DoubleValue()
continue
}
Expand Down Expand Up @@ -429,6 +443,10 @@ func (a *initialPointAdjuster) adjustMetricSummary(tsm *timeseriesMap, current p
currentSummary.Sum() < tsi.summary.previousSum) {
// reset re-initialize everything.
tsi.summary.startTime = currentSummary.StartTimestamp()
if a.usePointTimeForReset {
tsi.summary.startTime = currentSummary.Timestamp()
currentSummary.SetStartTimestamp(tsi.summary.startTime)
}
tsi.summary.previousCount = currentSummary.Count()
tsi.summary.previousSum = currentSummary.Sum()
continue
Expand Down
Loading

0 comments on commit 182598e

Please sign in to comment.