Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Minimum bookable free mcp #1306

Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions cuebot/src/main/java/com/imageworks/spcue/dao/CommentDao.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@
import com.imageworks.spcue.HostInterface;
import com.imageworks.spcue.JobInterface;

import java.util.List;

public interface CommentDao {

/**
Expand All @@ -32,6 +34,26 @@ public interface CommentDao {
*/
public void deleteComment(String id);

/**
* Deletes comments using host, user, and subject
*
* @param host
* @param user
* @param subject
* @return boolean: returns true if one or more comments where deleted
*/
public boolean deleteCommentByHostUserAndSubject(HostInterface host, String user, String subject);

/**
* Get comments using host, user, and subject
*
* @param host
* @param user
* @param subject
* @return List<Comment>
*/
public List<CommentDetail> getCommentsByHostUserAndSubject(HostInterface host, String user, String subject);

/**
* Retrieves the specified comment.
*
Expand Down
8 changes: 8 additions & 0 deletions cuebot/src/main/java/com/imageworks/spcue/dao/HostDao.java
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,14 @@ public interface HostDao {
*/
void updateHostState(HostInterface host, HardwareState state);

/**
* updates a host with the passed free Mcp
*
* @param host
* @param freeMcp
*/
void updateHostFreeMcp(HostInterface host, Long freeMcp);

/**
* returns a full host detail
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.List;
import java.util.Map;

import org.springframework.jdbc.core.RowMapper;
Expand Down Expand Up @@ -71,6 +72,18 @@ public CommentDetail mapRow(ResultSet rs, int row) throws SQLException {
}
};

public boolean deleteCommentByHostUserAndSubject(HostInterface host, String user, String subject) {
return getJdbcTemplate().update(
"DELETE FROM comments WHERE pk_host=? AND str_user=? AND str_subject=?",
host.getHostId(), user, subject) > 0;
}

public List<CommentDetail> getCommentsByHostUserAndSubject(HostInterface host, String user, String subject) {
return getJdbcTemplate().query(
"SELECT * FROM comments WHERE pk_host=? AND str_user=? AND str_subject=?",
COMMENT_DETAIL_MAPPER, host.getHostId(), user, subject);
}

public CommentDetail getCommentDetail(String id) {
return getJdbcTemplate().queryForObject(
"SELECT * FROM comments WHERE pk_comment=?",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -523,6 +523,13 @@ public void updateHostState(HostInterface host, HardwareState state) {
state.toString(), host.getHostId());
}

@Override
public void updateHostFreeMcp(HostInterface host, Long freeMcp) {
getJdbcTemplate().update(
"UPDATE host_stat SET int_mcp_free=? WHERE pk_host=?",
freeMcp, host.getHostId());
}

@Override
public void updateHostSetAllocation(HostInterface host, AllocationInterface alloc) {

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,12 +27,14 @@
import java.util.Map;
import java.util.concurrent.ThreadPoolExecutor;

import org.apache.logging.log4j.Logger;
import org.apache.logging.log4j.LogManager;
import org.apache.log4j.Logger;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.core.env.Environment;
import org.springframework.core.task.TaskRejectedException;
import org.springframework.dao.DataAccessException;
import org.springframework.dao.EmptyResultDataAccessException;

import com.imageworks.spcue.CommentDetail;
import com.imageworks.spcue.DispatchHost;
import com.imageworks.spcue.FrameInterface;
import com.imageworks.spcue.JobEntity;
Expand All @@ -57,6 +59,7 @@
import com.imageworks.spcue.rqd.RqdClient;
import com.imageworks.spcue.rqd.RqdClientException;
import com.imageworks.spcue.service.BookingManager;
import com.imageworks.spcue.service.CommentManager;
import com.imageworks.spcue.service.HostManager;
import com.imageworks.spcue.service.JobManager;
import com.imageworks.spcue.service.JobManagerSupport;
Expand All @@ -65,7 +68,7 @@

public class HostReportHandler {

private static final Logger logger = LogManager.getLogger(HostReportHandler.class);
private static final Logger logger = Logger.getLogger(HostReportHandler.class);

private BookingManager bookingManager;
private HostManager hostManager;
Expand All @@ -80,6 +83,14 @@ public class HostReportHandler {
private JobManagerSupport jobManagerSupport;
private JobDao jobDao;
private LayerDao layerDao;
@Autowired
private Environment env;
@Autowired
private CommentManager commentManager;
// Comment constants
private static final String SUBJECT_COMMENT_FULL_MCP_DIR = "Host set to REPAIR for not having enough storage " +
"space on /mcp";
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's my understanding that for most users, the thing we call "MCP" will not actually be the /mcp directory:

def getTempPath(self):
"""Returns the correct mcp path for the given machine"""
if os.path.isdir("/mcp/"):
return "/mcp/"
return '%s/' % tempfile.gettempdir()

Ideally we would update all of OpenCue to use more fitting terminology, but that's a big project and not something we need to do right now.

However, in things like user-readable strings, and code comments, is there a different term we can use? This line is one example but there are a few others in this PR.

The current string would seem to be a bit misleading, if users have RQD hosts that aren't actually using the literal /mcp directory.

Copy link
Collaborator Author

@ramonfigueiredo ramonfigueiredo Jul 27, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi @bcipriano

I propose we use the term TempDir to replace Mcp. What do you think?

I will replace all the Mcp by TempDir in my pull resquest.

However, as you said, we can create a new task later on to update all of OpenCue code to fit the terminology.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done!

I changed the comments, variables and methods reference from "mcp" to "temporary directory" or "TempDir".

See commit: 04ec106

private static final String CUEBOT_COMMENT_USER = "cuebot";

/**
* Boolean to toggle if this class is accepting data or not.
Expand Down Expand Up @@ -146,6 +157,7 @@ public void handleHostReport(HostReport report, boolean isBoot) {

DispatchHost host;
RenderHost rhost = report.getHost();
Long minBookableFreeMCP = env.getRequiredProperty("dispatcher.min_bookable_free_mcp_kb", Long.class);
try {
host = hostManager.findDispatchHost(rhost.getName());
hostManager.setHostStatistics(host,
Expand All @@ -156,7 +168,8 @@ public void handleHostReport(HostReport report, boolean isBoot) {
rhost.getLoad(), new Timestamp(rhost.getBootTime() * 1000l),
rhost.getAttributesMap().get("SP_OS"));

changeHardwareState(host, report.getHost().getState(), isBoot);
changeHardwareState(host, report.getHost().getState(), isBoot, report.getHost().getFreeMcp(),
minBookableFreeMCP);
changeNimbyState(host, report.getHost());

/**
Expand Down Expand Up @@ -233,6 +246,10 @@ else if (report.getHost().getFreeMem() < CueUtil.MB512) {
msg = String.format("%s doens't have enough free system mem, %d needs %d",
host.name, report.getHost().getFreeMem(), Dispatcher.MEM_RESERVED_MIN);
}
else if (minBookableFreeMCP != -1 && report.getHost().getFreeMcp() < minBookableFreeMCP) {
msg = String.format("%s doens't have enough free space in the /mcp directory, %dMB needs %dMB",
host.name, (report.getHost().getFreeMcp()/1024), (minBookableFreeMCP/1024));
}
else if(!host.hardwareState.equals(HardwareState.UP)) {
msg = host + " is not in the Up state.";
}
Expand Down Expand Up @@ -309,13 +326,60 @@ else if (!dispatchSupport.isCueBookable(host)) {
* updated with a boot report. If the state is Repair, then state is
* never updated via RQD.
*
*
* Prevent cue frames from booking on hosts with full MCP directories.
*
* Change host state to REPAIR or UP according the amount of free space
* in the /mcp directory:
* - Set the host state to REPAIR, when the amount of free space in the
* /mcp directory is less than the minimum required. Add a comment with
* subject: "Action required. Host status = Repair. Reason: Full /mcp directory".
* - Set the host state to UP, when the amount of free space in the /mcp directory
* is greater or equals to the minimum required and the host has a comment with
* subject: "Action required. Host status = Repair. Reason: Full /mcp directory".
*
* @param host
* @param reportState
* @param isBoot
* @param freeMcp
* @param minBookableFreeMCP: The minimum amount of free space in the MCP directory to book a host
*/
private void changeHardwareState(DispatchHost host,
HardwareState reportState, boolean isBoot) {
private void changeHardwareState(DispatchHost host, HardwareState reportState, boolean isBoot, long freeMcp,
Long minBookableFreeMCP) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: can we call env.getRequiredProperty("dispatcher.min_bookable_free_mcp_kb", Long.class) to fetch this value instead of passing it as a function parameter?

Fetching properties should be cheap.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi Brian.

I updated the code following you recommendation. Now the solution is calling env.getRequiredProperty("dispatcher.min_bookable_free_mcp_kb", Long.class) twice. Inside the changeHardwareState() method and inside handleHostReport() method.

Please note that currently the only place the changeHardwareState() method is called is in the handleHostReport() method. My previous implementation did only one call of env.getRequiredProperty("dispatcher.min_bookable_free_mcp_kb", Long.class) because the minBookableFreeMCP variable was used later on in the handleHostReport() method to log the message of minimum amount of free space.

  • Use of minBookableFreeMCP latter in the handleHostReport() method:
if (minBookableFreeMCP != -1 && report.getHost().getFreeMcp() < minBookableFreeMCP) {
                msg = String.format("%s doens't have enough free space in the /mcp directory, %dMB needs %dMB",
                        host.name, (report.getHost().getFreeMcp()/1024),  (minBookableFreeMCP/1024));
            }

However, I understand your point of view that the changeHardwareState method should know the min_bookable_free_mcp_kb value and that etching properties should be cheap.


// Prevent cue frames from booking on hosts with full MCP directories
if (minBookableFreeMCP != -1) {
if (host.hardwareState == HardwareState.UP && freeMcp < minBookableFreeMCP) {

// Insert a comment indicating that the Host status = Repair with reason = Full /mcp directory
CommentDetail c = new CommentDetail();
c.subject = SUBJECT_COMMENT_FULL_MCP_DIR;
c.user = CUEBOT_COMMENT_USER;
c.timestamp = null;
c.message = "Host " + host.getName() + " marked as REPAIR. The current amount of free space in /mcp is " +
(freeMcp/1024) + "MB. It must have at least " + (minBookableFreeMCP/1024) +
"MB of free space in /mcp";
commentManager.addComment(host, c);

// Set the host state to REPAIR
hostManager.setHostState(host, HardwareState.REPAIR);
host.hardwareState = HardwareState.REPAIR;

return;
} else if (host.hardwareState == HardwareState.REPAIR && freeMcp >= minBookableFreeMCP) {
// Check if the host with REPAIR status has comments with subject=SUBJECT_COMMENT_FULL_MCP_DIR and
// user=CUEBOT_COMMENT_USER and delete the comments, if they exists
boolean commentsDeleted = commentManager.deleteCommentByHostUserAndSubject(host,
CUEBOT_COMMENT_USER, SUBJECT_COMMENT_FULL_MCP_DIR);

if (commentsDeleted) {
// Set the host state to UP
hostManager.setHostState(host, HardwareState.UP);
host.hardwareState = HardwareState.UP;
return;
}
}
}

// If the states are the same there is no reason to do this update.
if (host.hardwareState.equals(reportState)) {
Expand Down Expand Up @@ -374,7 +438,7 @@ private void changeNimbyState(DispatchHost host, RenderHost rh) {
* locked if all cores are locked.
*
* @param host DispatchHost
* @param renderHost RenderHost
* @param coreInfo CoreDetail
*/
private void changeLockState(DispatchHost host, CoreDetail coreInfo) {
if (host.lockState == LockState.LOCKED) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,17 +26,16 @@
import java.util.concurrent.atomic.AtomicBoolean;

import com.imageworks.spcue.grpc.report.HostReport;
import org.apache.log4j.Logger;
import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder;
import org.apache.logging.log4j.Logger;
import org.apache.logging.log4j.LogManager;

import com.imageworks.spcue.dispatcher.commands.DispatchHandleHostReport;
import com.imageworks.spcue.util.CueUtil;

public class HostReportQueue extends ThreadPoolExecutor {

private static final Logger logger = LogManager.getLogger(HostReportQueue.class);
private static final Logger logger = Logger.getLogger(HostReportQueue.class);
private QueueRejectCounter rejectCounter = new QueueRejectCounter();
private AtomicBoolean isShutdown = new AtomicBoolean(false);
private int queueCapacity;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@
import com.imageworks.spcue.HostInterface;
import com.imageworks.spcue.JobInterface;

import java.util.List;

public interface CommentManager {

/**
Expand All @@ -47,6 +49,26 @@ public interface CommentManager {
*/
public void deleteComment(String id);

/**
* Deletes comments using host, user, and subject
*
* @param host
* @param user
* @param subject
* @return boolean: returns true if one or more comments where deleted
*/
public boolean deleteCommentByHostUserAndSubject(HostInterface host, String user, String subject);

/**
* Get comments using host, user, and subject
*
* @param host
* @param user
* @param subject
* @return List<Comment>
*/
public List<CommentDetail> getCommentsByHostUserAndSubject(HostInterface host, String user, String subject);

/**
*
* @param id
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@
import com.imageworks.spcue.ShowEntity;
import com.imageworks.spcue.dao.CommentDao;

import java.util.List;

@Transactional
public class CommentManagerService implements CommentManager {

Expand Down Expand Up @@ -55,6 +57,16 @@ public void deleteComment(String id) {
commentDao.deleteComment(id);
}

@Transactional(propagation = Propagation.REQUIRED)
public boolean deleteCommentByHostUserAndSubject(HostInterface host, String user, String subject) {
return commentDao.deleteCommentByHostUserAndSubject(host, user, subject);
}

@Transactional(propagation = Propagation.REQUIRED)
public List<CommentDetail> getCommentsByHostUserAndSubject(HostInterface host, String user, String subject) {
return commentDao.getCommentsByHostUserAndSubject(host, user, subject);
}

@Transactional(propagation = Propagation.REQUIRED)
public void setCommentSubject(String id, String subject) {
commentDao.updateCommentSubject(id, subject);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,14 @@ public interface HostManager {
*/
void setHostState(HostInterface host, HardwareState state);

/**
* Updates the freeMcp of a host.
*
* @param host HostInterface
* @param freeMcp Long
*/
void setHostFreeMcp(HostInterface host, Long freeMcp);

/**
* Return true if the host is swapping hard enough
* that killing frames will save the entire machine.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,7 @@
import java.sql.Timestamp;
import java.util.List;

import org.apache.logging.log4j.Logger;
import org.apache.logging.log4j.LogManager;
import org.apache.log4j.Logger;
import org.springframework.dao.EmptyResultDataAccessException;
import org.springframework.transaction.annotation.Propagation;
import org.springframework.transaction.annotation.Transactional;
Expand Down Expand Up @@ -58,7 +57,7 @@

@Transactional
public class HostManagerService implements HostManager {
private static final Logger logger = LogManager.getLogger(HostManagerService.class);
private static final Logger logger = Logger.getLogger(HostManagerService.class);

private HostDao hostDao;
private RqdClient rqdClient;
Expand Down Expand Up @@ -93,6 +92,11 @@ public void setHostState(HostInterface host, HardwareState state) {
hostDao.updateHostState(host, state);
}

@Override
public void setHostFreeMcp(HostInterface host, Long freeMcp) {
hostDao.updateHostFreeMcp(host, freeMcp);
}

@Override
@Transactional(propagation = Propagation.REQUIRED, readOnly=true)
public boolean isSwapping(HostInterface host) {
Expand Down
5 changes: 5 additions & 0 deletions cuebot/src/main/resources/opencue.properties
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,11 @@ dispatcher.report_queue.max_pool_size=8
# Queue capacity for handling Host Report.
dispatcher.report_queue.queue_capacity=1000

# The minimum amount of free space in the MCP directory to book a host.
# Default: 1G = 1048576 kB
# If equals to -1, it means the feature is turned off
dispatcher.min_bookable_free_mcp_kb=1048576

# Number of threads to keep in the pool for kill frame operation.
dispatcher.kill_queue.core_pool_size=6
# Maximum number of threads to allow in the pool for kill frame operation.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,8 @@ private void launchJobs() {
private RenderHost.Builder buildRenderHost() {
return RenderHost.newBuilder()
.setBootTime(1192369572)
.setFreeMcp(76020)
// The minimum amount of free space in the /mcp directory to book a host.
.setFreeMcp(1048576)
.setFreeMem(53500)
.setFreeSwap(20760)
.setLoad(1)
Expand Down
Loading