Skip to content

Commit

Permalink
[cuebot] Prevent booking frames on hosts with no temp space. (#1306)
Browse files Browse the repository at this point in the history
  • Loading branch information
ramonfigueiredo authored Sep 27, 2023
1 parent baa122a commit b606a59
Show file tree
Hide file tree
Showing 39 changed files with 489 additions and 81 deletions.
22 changes: 22 additions & 0 deletions cuebot/src/main/java/com/imageworks/spcue/dao/CommentDao.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@
import com.imageworks.spcue.HostInterface;
import com.imageworks.spcue.JobInterface;

import java.util.List;

public interface CommentDao {

/**
Expand All @@ -32,6 +34,26 @@ public interface CommentDao {
*/
public void deleteComment(String id);

/**
* Deletes comments using host, user, and subject
*
* @param host
* @param user
* @param subject
* @return boolean: returns true if one or more comments were deleted
*/
public boolean deleteCommentByHostUserAndSubject(HostInterface host, String user, String subject);

/**
* Get comments using host, user, and subject
*
* @param host
* @param user
* @param subject
* @return List<CommentDetail>
*/
public List<CommentDetail> getCommentsByHostUserAndSubject(HostInterface host, String user, String subject);

/**
* Retrieves the specified comment.
*
Expand Down
8 changes: 8 additions & 0 deletions cuebot/src/main/java/com/imageworks/spcue/dao/HostDao.java
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,14 @@ public interface HostDao {
*/
void updateHostState(HostInterface host, HardwareState state);

/**
* updates a host with the passed free temporary directory
*
* @param host
* @param freeTempDir
*/
void updateHostFreeTempDir(HostInterface host, Long freeTempDir);

/**
* returns a full host detail
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.List;
import java.util.Map;

import org.springframework.jdbc.core.RowMapper;
Expand Down Expand Up @@ -71,6 +72,18 @@ public CommentDetail mapRow(ResultSet rs, int row) throws SQLException {
}
};

/**
 * Removes every comment attached to the given host that matches both the
 * supplied user and subject.
 *
 * @param host    host whose comments are candidates for deletion
 * @param user    value matched against the str_user column
 * @param subject value matched against the str_subject column
 * @return true when the DELETE statement affected at least one row
 */
public boolean deleteCommentByHostUserAndSubject(HostInterface host, String user, String subject) {
    final int deletedRows = getJdbcTemplate().update(
            "DELETE FROM comments WHERE pk_host=? AND str_user=? AND str_subject=?",
            host.getHostId(), user, subject);
    return deletedRows > 0;
}

/**
 * Looks up the comments attached to the given host that match both the
 * supplied user and subject.
 *
 * @param host    host whose comments are searched
 * @param user    value matched against the str_user column
 * @param subject value matched against the str_subject column
 * @return the matching rows mapped through COMMENT_DETAIL_MAPPER
 */
public List<CommentDetail> getCommentsByHostUserAndSubject(HostInterface host, String user, String subject) {
    final String sql = "SELECT * FROM comments WHERE pk_host=? AND str_user=? AND str_subject=?";
    return getJdbcTemplate().query(sql, COMMENT_DETAIL_MAPPER, host.getHostId(), user, subject);
}

public CommentDetail getCommentDetail(String id) {
return getJdbcTemplate().queryForObject(
"SELECT * FROM comments WHERE pk_comment=?",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -523,6 +523,13 @@ public void updateHostState(HostInterface host, HardwareState state) {
state.toString(), host.getHostId());
}

/**
 * Writes the passed free temporary directory value into the int_mcp_free
 * column of the host_stat row belonging to the given host.
 *
 * @param host        host whose host_stat row is updated
 * @param freeTempDir new value stored in int_mcp_free
 */
@Override
public void updateHostFreeTempDir(HostInterface host, Long freeTempDir) {
    final String sql = "UPDATE host_stat SET int_mcp_free=? WHERE pk_host=?";
    getJdbcTemplate().update(sql, freeTempDir, host.getHostId());
}

@Override
public void updateHostSetAllocation(HostInterface host, AllocationInterface alloc) {

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,13 @@

import org.apache.logging.log4j.Logger;
import org.apache.logging.log4j.LogManager;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.core.env.Environment;
import org.springframework.core.task.TaskRejectedException;
import org.springframework.dao.DataAccessException;
import org.springframework.dao.EmptyResultDataAccessException;

import com.imageworks.spcue.CommentDetail;
import com.imageworks.spcue.DispatchHost;
import com.imageworks.spcue.FrameInterface;
import com.imageworks.spcue.JobEntity;
Expand All @@ -57,6 +60,7 @@
import com.imageworks.spcue.rqd.RqdClient;
import com.imageworks.spcue.rqd.RqdClientException;
import com.imageworks.spcue.service.BookingManager;
import com.imageworks.spcue.service.CommentManager;
import com.imageworks.spcue.service.HostManager;
import com.imageworks.spcue.service.JobManager;
import com.imageworks.spcue.service.JobManagerSupport;
Expand All @@ -80,6 +84,14 @@ public class HostReportHandler {
private JobManagerSupport jobManagerSupport;
private JobDao jobDao;
private LayerDao layerDao;
@Autowired
private Environment env;
@Autowired
private CommentManager commentManager;
// Comment constants
private static final String SUBJECT_COMMENT_FULL_TEMP_DIR = "Host set to REPAIR for not having enough storage " +
"space on the temporary directory (mcp)";
private static final String CUEBOT_COMMENT_USER = "cuebot";

/**
* Boolean to toggle if this class is accepting data or not.
Expand Down Expand Up @@ -156,7 +168,7 @@ public void handleHostReport(HostReport report, boolean isBoot) {
rhost.getLoad(), new Timestamp(rhost.getBootTime() * 1000l),
rhost.getAttributesMap().get("SP_OS"));

changeHardwareState(host, report.getHost().getState(), isBoot);
changeHardwareState(host, report.getHost().getState(), isBoot, report.getHost().getFreeMcp());
changeNimbyState(host, report.getHost());

/**
Expand Down Expand Up @@ -221,7 +233,14 @@ public void handleHostReport(HostReport report, boolean isBoot) {
}
}

if (host.idleCores < Dispatcher.CORE_POINTS_RESERVED_MIN) {
// The minimum amount of free space in the temporary directory to book a host
Long minBookableFreeTempDir = env.getRequiredProperty("dispatcher.min_bookable_free_temp_dir_kb", Long.class);

if (minBookableFreeTempDir != -1 && report.getHost().getFreeMcp() < minBookableFreeTempDir) {
msg = String.format("%s doens't have enough free space in the temporary directory (mcp), %dMB needs %dMB",
host.name, (report.getHost().getFreeMcp()/1024), (minBookableFreeTempDir/1024));
}
else if (host.idleCores < Dispatcher.CORE_POINTS_RESERVED_MIN) {
msg = String.format("%s doesn't have enough idle cores, %d needs %d",
host.name, host.idleCores, Dispatcher.CORE_POINTS_RESERVED_MIN);
}
Expand All @@ -231,7 +250,7 @@ else if (host.idleMemory < Dispatcher.MEM_RESERVED_MIN) {
}
else if (report.getHost().getFreeMem() < CueUtil.MB512) {
msg = String.format("%s doens't have enough free system mem, %d needs %d",
host.name, report.getHost().getFreeMem(), Dispatcher.MEM_RESERVED_MIN);
host.name, report.getHost().getFreeMem(), Dispatcher.MEM_RESERVED_MIN);
}
else if(!host.hardwareState.equals(HardwareState.UP)) {
msg = host + " is not in the Up state.";
Expand Down Expand Up @@ -309,13 +328,61 @@ else if (!dispatchSupport.isCueBookable(host)) {
* updated with a boot report. If the state is Repair, then state is
* never updated via RQD.
*
*
* Prevent cue frames from booking on hosts with full temporary directories.
*
* Change host state to REPAIR or UP according to the amount of free space
* in the temporary directory:
* - Set the host state to REPAIR, when the amount of free space in the
* temporary directory is less than the minimum required. Add a comment with
* subject: SUBJECT_COMMENT_FULL_TEMP_DIR
* - Set the host state to UP, when the amount of free space in the temporary directory
* is greater than or equal to the minimum required and the host has a comment with
* subject: SUBJECT_COMMENT_FULL_TEMP_DIR
*
* @param host
* @param reportState
* @param isBoot
* @param freeTempDir
*/
private void changeHardwareState(DispatchHost host,
HardwareState reportState, boolean isBoot) {
private void changeHardwareState(DispatchHost host, HardwareState reportState, boolean isBoot, long freeTempDir) {

// The minimum amount of free space in the temporary directory to book a host
Long minBookableFreeTempDir = env.getRequiredProperty("dispatcher.min_bookable_free_temp_dir_kb", Long.class);

// Prevent cue frames from booking on hosts with full temporary directories
if (minBookableFreeTempDir != -1) {
if (host.hardwareState == HardwareState.UP && freeTempDir < minBookableFreeTempDir) {

// Insert a comment indicating that the Host status = Repair with reason = Full temporary directory
CommentDetail c = new CommentDetail();
c.subject = SUBJECT_COMMENT_FULL_TEMP_DIR;
c.user = CUEBOT_COMMENT_USER;
c.timestamp = null;
c.message = "Host " + host.getName() + " marked as REPAIR. The current amount of free space in the " +
"temporary directory (mcp) is " + (freeTempDir/1024) + "MB. It must have at least "
+ (minBookableFreeTempDir/1024) + "MB of free space in temporary directory";
commentManager.addComment(host, c);

// Set the host state to REPAIR
hostManager.setHostState(host, HardwareState.REPAIR);
host.hardwareState = HardwareState.REPAIR;

return;
} else if (host.hardwareState == HardwareState.REPAIR && freeTempDir >= minBookableFreeTempDir) {
// Check if the host with REPAIR status has comments with subject=SUBJECT_COMMENT_FULL_TEMP_DIR and
// user=CUEBOT_COMMENT_USER and delete the comments, if they exist
boolean commentsDeleted = commentManager.deleteCommentByHostUserAndSubject(host,
CUEBOT_COMMENT_USER, SUBJECT_COMMENT_FULL_TEMP_DIR);

if (commentsDeleted) {
// Set the host state to UP
hostManager.setHostState(host, HardwareState.UP);
host.hardwareState = HardwareState.UP;
return;
}
}
}

// If the states are the same there is no reason to do this update.
if (host.hardwareState.equals(reportState)) {
Expand Down Expand Up @@ -374,7 +441,7 @@ private void changeNimbyState(DispatchHost host, RenderHost rh) {
* locked if all cores are locked.
*
* @param host DispatchHost
* @param renderHost RenderHost
* @param coreInfo CoreDetail
*/
private void changeLockState(DispatchHost host, CoreDetail coreInfo) {
if (host.lockState == LockState.LOCKED) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@
import com.imageworks.spcue.HostInterface;
import com.imageworks.spcue.JobInterface;

import java.util.List;

public interface CommentManager {

/**
Expand All @@ -47,6 +49,26 @@ public interface CommentManager {
*/
public void deleteComment(String id);

/**
* Deletes comments using host, user, and subject
*
* @param host
* @param user
* @param subject
* @return boolean: returns true if one or more comments were deleted
*/
public boolean deleteCommentByHostUserAndSubject(HostInterface host, String user, String subject);

/**
* Get comments using host, user, and subject
*
* @param host
* @param user
* @param subject
* @return List<CommentDetail>
*/
public List<CommentDetail> getCommentsByHostUserAndSubject(HostInterface host, String user, String subject);

/**
*
* @param id
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@
import com.imageworks.spcue.ShowEntity;
import com.imageworks.spcue.dao.CommentDao;

import java.util.List;

@Transactional
public class CommentManagerService implements CommentManager {

Expand Down Expand Up @@ -55,6 +57,16 @@ public void deleteComment(String id) {
commentDao.deleteComment(id);
}

/**
 * Deletes the comments on a host that match the given user and subject by
 * delegating to the comment DAO.
 *
 * @param host    host whose comments are candidates for deletion
 * @param user    comment user to match
 * @param subject comment subject to match
 * @return the DAO's result: true if one or more comments were deleted
 */
@Transactional(propagation = Propagation.REQUIRED)
public boolean deleteCommentByHostUserAndSubject(HostInterface host, String user, String subject) {
    final boolean anyDeleted = commentDao.deleteCommentByHostUserAndSubject(host, user, subject);
    return anyDeleted;
}

/**
 * Fetches the comments on a host that match the given user and subject by
 * delegating to the comment DAO.
 *
 * @param host    host whose comments are searched
 * @param user    comment user to match
 * @param subject comment subject to match
 * @return the list of matching comments returned by the DAO
 */
@Transactional(propagation = Propagation.REQUIRED)
public List<CommentDetail> getCommentsByHostUserAndSubject(HostInterface host, String user, String subject) {
    final List<CommentDetail> matches = commentDao.getCommentsByHostUserAndSubject(host, user, subject);
    return matches;
}

@Transactional(propagation = Propagation.REQUIRED)
public void setCommentSubject(String id, String subject) {
commentDao.updateCommentSubject(id, subject);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,14 @@ public interface HostManager {
*/
void setHostState(HostInterface host, HardwareState state);

/**
* Updates the free temporary directory (mcp) of a host.
*
* @param host HostInterface
* @param freeTempDir Long
*/
void setHostFreeTempDir(HostInterface host, Long freeTempDir);

/**
* Return true if the host is swapping hard enough
* that killing frames will save the entire machine.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,11 @@ public void setHostState(HostInterface host, HardwareState state) {
hostDao.updateHostState(host, state);
}

/**
 * Updates the free temporary directory (mcp) value of a host by delegating
 * to the host DAO.
 *
 * @param host        host to update
 * @param freeTempDir new free temporary directory value
 *                    (NOTE(review): presumably in kB, matching the
 *                    "_kb" dispatcher property - confirm against callers)
 */
@Override
public void setHostFreeTempDir(HostInterface host, Long freeTempDir) {
hostDao.updateHostFreeTempDir(host, freeTempDir);
}

@Override
@Transactional(propagation = Propagation.REQUIRED, readOnly=true)
public boolean isSwapping(HostInterface host) {
Expand Down
6 changes: 6 additions & 0 deletions cuebot/src/main/resources/opencue.properties
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,12 @@ dispatcher.report_queue.max_pool_size=8
# Queue capacity for handling Host Report.
dispatcher.report_queue.queue_capacity=1000

# The minimum amount of free space in the temporary directory (mcp) to book a host.
# E.g: 1G = 1048576 kB => dispatcher.min_bookable_free_temp_dir_kb=1048576
# Default = -1 (deactivated)
# If equal to -1, the feature is turned off
dispatcher.min_bookable_free_temp_dir_kb=-1

# Number of threads to keep in the pool for kill frame operation.
dispatcher.kill_queue.core_pool_size=6
# Maximum number of threads to allow in the pool for kill frame operation.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
import com.imageworks.spcue.service.HostManager;
import com.imageworks.spcue.service.JobLauncher;
import com.imageworks.spcue.service.JobManager;
import com.imageworks.spcue.util.CueUtil;

import static org.assertj.core.api.Assertions.assertThat;
import static org.junit.Assert.assertEquals;
Expand Down Expand Up @@ -209,11 +210,12 @@ private void launchJobs() {
private RenderHost.Builder buildRenderHost() {
return RenderHost.newBuilder()
.setBootTime(1192369572)
.setFreeMcp(76020)
// The minimum amount of free space in the temporary directory to book a host.
.setFreeMcp(CueUtil.GB)
.setFreeMem(53500)
.setFreeSwap(20760)
.setLoad(1)
.setTotalMcp(195430)
.setTotalMcp(CueUtil.GB4)
.setTotalMem(8173264)
.setTotalSwap(20960)
.setNimbyEnabled(false)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -96,11 +96,12 @@ public DispatchHost createHost() {
RenderHost host = RenderHost.newBuilder()
.setName("test_host")
.setBootTime(1192369572)
.setFreeMcp(76020)
// The minimum amount of free space in the temporary directory to book a host.
.setFreeMcp(CueUtil.GB)
.setFreeMem(53500)
.setFreeSwap(20760)
.setLoad(1)
.setTotalMcp(195430)
.setTotalMcp(CueUtil.GB4)
.setTotalMem((int) CueUtil.GB16)
.setTotalSwap((int) CueUtil.GB16)
.setNimbyEnabled(false)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -140,11 +140,12 @@ public void testInsertCommentOnHost() {
RenderHost host = RenderHost.newBuilder()
.setName("boo")
.setBootTime(1192369572)
.setFreeMcp(76020)
// The minimum amount of free space in the temporary directory to book a host.
.setFreeMcp(CueUtil.GB)
.setFreeMem(15290520)
.setFreeSwap(2076)
.setLoad(1)
.setTotalMcp(19543)
.setTotalMcp(CueUtil.GB4)
.setTotalMem(15290520)
.setTotalSwap(2096)
.setNimbyEnabled(false)
Expand Down
Loading

0 comments on commit b606a59

Please sign in to comment.