Skip to content

Commit

Permalink
Change OOM protection logic
Browse files Browse the repository at this point in the history
The current logic relies on hardcoded values which are not suitable for large hosts. The new logic takes into account the size of hosts and also tries to be more aggressive with misbehaving frames.

Prevent host from entering an OOM state where oom-killer might start killing important OS processes.
The kill logic will kick in when one of the following conditions is met:
  - Host has less than OOM_MEMORY_LEFT_THRESHOLD_PERCENT memory available
  - A frame is taking more than OOM_FRAME_OVERBOARD_PERCENT of what it had reserved
For frames that are using more than they had reserved but not above the threshold, negotiate expanding the reservations with other frames on the same host

(cherry picked from commit e88a5295f23bd927614de6d5af6a09d496d3e6ac)
Signed-off-by: Diego Tavares <[email protected]>
  • Loading branch information
DiegoTavares committed Sep 14, 2023
1 parent c1f335d commit 1f1d67c
Show file tree
Hide file tree
Showing 13 changed files with 628 additions and 173 deletions.
57 changes: 57 additions & 0 deletions cuebot/src/main/java/com/imageworks/spcue/PrometheusMetrics.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
package com.imageworks.spcue;

import io.prometheus.client.Counter;
import io.prometheus.client.Gauge;
import io.prometheus.client.Histogram;

public class PrometheusMetrics {
private static final Counter findJobsByShowQueryCountMetric = Counter.build()
.name("cue_find_jobs_by_show_count")
.help("Count the occurrences of the query FIND_JOBS_BY_SHOW.")
.labelNames("env", "cuebot_hosts")
.register();
private static final Gauge bookingDurationMillisMetric = Gauge.build()
.name("cue_booking_durations_in_millis")
.help("Register duration of booking steps in milliseconds.")
.labelNames("env", "cuebot_host", "stage_desc")
.register();
private static final Histogram bookingDurationMillisHistogramMetric = Histogram.build()
.name("cue_booking_durations_histogram_in_millis")
.help("Register a summary of duration of booking steps in milliseconds.")
.labelNames("env", "cuebot_host", "stage_desc")
.register();

private static final Counter frameOomKilledCounter = Counter.build()
.name("cue_frame_oom_killed_counter")
.help("Number of frames killed for being above memory on a host under OOM")
.labelNames("env", "cuebot_host", "render_node")
.register();

private String deployment_environment;
private String cuebot_host;

public PrometheusMetrics() {
this.cuebot_host = System.getenv("NODE_HOSTNAME");
if (this.cuebot_host == null) {
this.cuebot_host = "undefined";
}
// Use the same environment set for SENTRY as the prometheus environment
this.deployment_environment = System.getenv("SENTRY_ENVIRONMENT");
if (this.deployment_environment == null) {
this.deployment_environment = "undefined";
}
}

public void setBookingDurationMetric(String stage_desc, double value) {
bookingDurationMillisMetric.labels(this.deployment_environment, this.cuebot_host, stage_desc).set(value);
bookingDurationMillisHistogramMetric.labels(this.deployment_environment, this.cuebot_host, stage_desc).observe(value);
}

public void incrementFindJobsByShowQueryCountMetric() {
findJobsByShowQueryCountMetric.labels(this.deployment_environment, this.cuebot_host).inc();
}

public void incrementFrameOomKilledCounter(String renderNode) {
frameOomKilledCounter.labels(this.deployment_environment, this.cuebot_host, renderNode).inc();
}
}
9 changes: 0 additions & 9 deletions cuebot/src/main/java/com/imageworks/spcue/dao/HostDao.java
Original file line number Diff line number Diff line change
Expand Up @@ -244,15 +244,6 @@ public interface HostDao {
*/
void updateThreadMode(HostInterface host, ThreadMode mode);

/**
* When a host is in kill mode, that means it's 256MB+ into the swap and
* the worst memory offender is killed.
*
* @param h HostInterface
* @return boolean
*/
boolean isKillMode(HostInterface h);

/**
* Update the specified host's hardware information.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -605,15 +605,6 @@ public void updateHostOs(HostInterface host, String os) {
os, host.getHostId());
}

/**
 * Checks whether the given host currently qualifies for kill mode.
 *
 * <p>Counts the host's host_stat rows whose used swap
 * (int_swap_total - int_swap_free) is above
 * {@code Dispatcher.KILL_MODE_SWAP_THRESHOLD} and whose int_mem_free is below
 * {@code Dispatcher.KILL_MODE_MEM_THRESHOLD}.
 *
 * @param h host to inspect
 * @return true when at least one matching row exists
 */
@Override
public boolean isKillMode(HostInterface h) {
    final String killModeQuery =
            "SELECT COUNT(1) FROM host_stat WHERE pk_host = ? " +
            "AND int_swap_total - int_swap_free > ? AND int_mem_free < ?";

    // Bind order follows the placeholders: host id, swap threshold, free-memory threshold.
    final int matchingRows = getJdbcTemplate().queryForObject(
            killModeQuery,
            Integer.class,
            h.getHostId(),
            Dispatcher.KILL_MODE_SWAP_THRESHOLD,
            Dispatcher.KILL_MODE_MEM_THRESHOLD);

    return matchingRows > 0;
}

@Override
public int getStrandedCoreUnits(HostInterface h) {
try {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -564,7 +564,7 @@ public boolean increaseReservedMemory(ProcInterface p, long value) {
value, p.getProcId(), value) == 1;
} catch (Exception e) {
// check by trigger verify_host_resources
throw new ResourceReservationFailureException("failed to increase memory reserveration for proc "
throw new ResourceReservationFailureException("failed to increase memory reservation for proc "
+ p.getProcId() + " to " + value + ", proc does not have that much memory to spare.");
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -108,13 +108,11 @@ public interface Dispatcher {
// without being penalized for it.
public static final long VIRTUAL_MEM_THRESHHOLD = CueUtil.GB2;

// The amount of swap that must be used before a host can go
// into kill mode.
public static final long KILL_MODE_SWAP_THRESHOLD = CueUtil.MB128;
// Share of memory in use above which the host is considered at risk of
// triggering the oom-killer. Expressed as a ratio (0.95 means 95%), not as
// a 0-100 percentage.
public static final double OOM_MAX_SAFE_USED_MEMORY_THRESHOLD = 0.95;

// When the amount of free memory drops below this point, the
// host can go into kill mode.
public static final long KILL_MODE_MEM_THRESHOLD = CueUtil.MB512;
// How much a frame can exceed its reserved memory. Expressed as a ratio
// (0.25 means 25%); presumably relative to the frame's reservation --
// TODO confirm against the code that applies it.
public static final double OOM_FRAME_OVERBOARD_ALLOWED_THRESHOLD = 0.25;

// A higher number gets more deep booking but less spread on the cue.
public static final int DEFAULT_MAX_FRAMES_PER_PASS = 4;
Expand Down
Loading

0 comments on commit 1f1d67c

Please sign in to comment.