-
Notifications
You must be signed in to change notification settings - Fork 203
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
The current logic relies on hardcoded values which are not suitable for large hosts. The new logic takes into account the size of hosts and also tries to be more aggressive with misbehaving frames. Prevent host from entering an OOM state where oom-killer might start killing important OS processes. The kill logic will kick in when one of the following conditions is met: - Host has less than OOM_MEMORY_LEFT_THRESHOLD_PERCENT memory available - A frame is taking more than OOM_FRAME_OVERBOARD_PERCENT of what it had reserved. For frames that are using more than they had reserved but not above the threshold, negotiate expanding the reservations with other frames on the same host. (cherry picked from commit e88a5295f23bd927614de6d5af6a09d496d3e6ac) Signed-off-by: Diego Tavares <[email protected]>
- Loading branch information
1 parent
c1f335d
commit 1f1d67c
Showing
13 changed files
with
628 additions
and
173 deletions.
There are no files selected for viewing
57 changes: 57 additions & 0 deletions
57
cuebot/src/main/java/com/imageworks/spcue/PrometheusMetrics.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
package com.imageworks.spcue; | ||
|
||
import io.prometheus.client.Counter; | ||
import io.prometheus.client.Gauge; | ||
import io.prometheus.client.Histogram; | ||
|
||
public class PrometheusMetrics { | ||
private static final Counter findJobsByShowQueryCountMetric = Counter.build() | ||
.name("cue_find_jobs_by_show_count") | ||
.help("Count the occurrences of the query FIND_JOBS_BY_SHOW.") | ||
.labelNames("env", "cuebot_hosts") | ||
.register(); | ||
private static final Gauge bookingDurationMillisMetric = Gauge.build() | ||
.name("cue_booking_durations_in_millis") | ||
.help("Register duration of booking steps in milliseconds.") | ||
.labelNames("env", "cuebot_host", "stage_desc") | ||
.register(); | ||
private static final Histogram bookingDurationMillisHistogramMetric = Histogram.build() | ||
.name("cue_booking_durations_histogram_in_millis") | ||
.help("Register a summary of duration of booking steps in milliseconds.") | ||
.labelNames("env", "cuebot_host", "stage_desc") | ||
.register(); | ||
|
||
private static final Counter frameOomKilledCounter = Counter.build() | ||
.name("cue_frame_oom_killed_counter") | ||
.help("Number of frames killed for being above memory on a host under OOM") | ||
.labelNames("env", "cuebot_host", "render_node") | ||
.register(); | ||
|
||
private String deployment_environment; | ||
private String cuebot_host; | ||
|
||
public PrometheusMetrics() { | ||
this.cuebot_host = System.getenv("NODE_HOSTNAME"); | ||
if (this.cuebot_host == null) { | ||
this.cuebot_host = "undefined"; | ||
} | ||
// Use the same environment set for SENTRY as the prometheus environment | ||
this.deployment_environment = System.getenv("SENTRY_ENVIRONMENT"); | ||
if (this.deployment_environment == null) { | ||
this.deployment_environment = "undefined"; | ||
} | ||
} | ||
|
||
public void setBookingDurationMetric(String stage_desc, double value) { | ||
bookingDurationMillisMetric.labels(this.deployment_environment, this.cuebot_host, stage_desc).set(value); | ||
bookingDurationMillisHistogramMetric.labels(this.deployment_environment, this.cuebot_host, stage_desc).observe(value); | ||
} | ||
|
||
public void incrementFindJobsByShowQueryCountMetric() { | ||
findJobsByShowQueryCountMetric.labels(this.deployment_environment, this.cuebot_host).inc(); | ||
} | ||
|
||
public void incrementFrameOomKilledCounter(String renderNode) { | ||
frameOomKilledCounter.labels(this.deployment_environment, this.cuebot_host, renderNode).inc(); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.