-
Notifications
You must be signed in to change notification settings - Fork 201
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Minimum bookable free mcp #1306
Changes from 9 commits
8ecbc4a
c3a735a
e9f9273
44b1cf3
91e06b2
59c978e
dcf27a5
1aa4ab6
0706d46
4e95928
04ec106
cac5c24
9064c33
d0b763b
629cce9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -29,10 +29,13 @@ | |
|
||
import org.apache.logging.log4j.Logger; | ||
import org.apache.logging.log4j.LogManager; | ||
import org.springframework.beans.factory.annotation.Autowired; | ||
import org.springframework.core.env.Environment; | ||
import org.springframework.core.task.TaskRejectedException; | ||
import org.springframework.dao.DataAccessException; | ||
import org.springframework.dao.EmptyResultDataAccessException; | ||
|
||
import com.imageworks.spcue.CommentDetail; | ||
import com.imageworks.spcue.DispatchHost; | ||
import com.imageworks.spcue.FrameInterface; | ||
import com.imageworks.spcue.JobEntity; | ||
|
@@ -57,6 +60,7 @@ | |
import com.imageworks.spcue.rqd.RqdClient; | ||
import com.imageworks.spcue.rqd.RqdClientException; | ||
import com.imageworks.spcue.service.BookingManager; | ||
import com.imageworks.spcue.service.CommentManager; | ||
import com.imageworks.spcue.service.HostManager; | ||
import com.imageworks.spcue.service.JobManager; | ||
import com.imageworks.spcue.service.JobManagerSupport; | ||
|
@@ -80,6 +84,14 @@ public class HostReportHandler { | |
private JobManagerSupport jobManagerSupport; | ||
private JobDao jobDao; | ||
private LayerDao layerDao; | ||
@Autowired | ||
private Environment env; | ||
@Autowired | ||
private CommentManager commentManager; | ||
// Comment constants | ||
private static final String SUBJECT_COMMENT_FULL_MCP_DIR = "Host set to REPAIR for not having enough storage " + | ||
"space on /mcp"; | ||
private static final String CUEBOT_COMMENT_USER = "cuebot"; | ||
|
||
/** | ||
* Boolean to toggle if this class is accepting data or not. | ||
|
@@ -146,6 +158,7 @@ public void handleHostReport(HostReport report, boolean isBoot) { | |
|
||
DispatchHost host; | ||
RenderHost rhost = report.getHost(); | ||
Long minBookableFreeMCP = env.getRequiredProperty("dispatcher.min_bookable_free_mcp_kb", Long.class); | ||
try { | ||
host = hostManager.findDispatchHost(rhost.getName()); | ||
hostManager.setHostStatistics(host, | ||
|
@@ -156,7 +169,8 @@ public void handleHostReport(HostReport report, boolean isBoot) { | |
rhost.getLoad(), new Timestamp(rhost.getBootTime() * 1000l), | ||
rhost.getAttributesMap().get("SP_OS")); | ||
|
||
changeHardwareState(host, report.getHost().getState(), isBoot); | ||
changeHardwareState(host, report.getHost().getState(), isBoot, report.getHost().getFreeMcp(), | ||
minBookableFreeMCP); | ||
changeNimbyState(host, report.getHost()); | ||
|
||
/** | ||
|
@@ -233,6 +247,10 @@ else if (report.getHost().getFreeMem() < CueUtil.MB512) { | |
msg = String.format("%s doens't have enough free system mem, %d needs %d", | ||
host.name, report.getHost().getFreeMem(), Dispatcher.MEM_RESERVED_MIN); | ||
} | ||
else if (minBookableFreeMCP != -1 && report.getHost().getFreeMcp() < minBookableFreeMCP) { | ||
msg = String.format("%s doens't have enough free space in the /mcp directory, %dMB needs %dMB", | ||
host.name, (report.getHost().getFreeMcp()/1024), (minBookableFreeMCP/1024)); | ||
} | ||
else if(!host.hardwareState.equals(HardwareState.UP)) { | ||
msg = host + " is not in the Up state."; | ||
} | ||
|
@@ -309,13 +327,60 @@ else if (!dispatchSupport.isCueBookable(host)) { | |
* updated with a boot report. If the state is Repair, then state is | ||
* never updated via RQD. | ||
* | ||
* | ||
* Prevent cue frames from booking on hosts with full MCP directories. | ||
* | ||
* Change host state to REPAIR or UP according the amount of free space | ||
* in the /mcp directory: | ||
* - Set the host state to REPAIR, when the amount of free space in the | ||
* /mcp directory is less than the minimum required. Add a comment with | ||
* subject: "Action required. Host status = Repair. Reason: Full /mcp directory". | ||
* - Set the host state to UP, when the amount of free space in the /mcp directory | ||
* is greater or equals to the minimum required and the host has a comment with | ||
* subject: "Action required. Host status = Repair. Reason: Full /mcp directory". | ||
* | ||
* @param host | ||
* @param reportState | ||
* @param isBoot | ||
* @param freeMcp | ||
* @param minBookableFreeMCP: The minimum amount of free space in the MCP directory to book a host | ||
*/ | ||
private void changeHardwareState(DispatchHost host, | ||
HardwareState reportState, boolean isBoot) { | ||
private void changeHardwareState(DispatchHost host, HardwareState reportState, boolean isBoot, long freeMcp, | ||
Long minBookableFreeMCP) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: can we call Fetching properties should be cheap. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hi Brian. I updated the code following you recommendation. Now the solution is calling Please note that currently the only place the
However, I understand your point of view that the changeHardwareState method should know the min_bookable_free_mcp_kb value and that etching properties should be cheap. |
||
|
||
// Prevent cue frames from booking on hosts with full MCP directories | ||
if (minBookableFreeMCP != -1) { | ||
if (host.hardwareState == HardwareState.UP && freeMcp < minBookableFreeMCP) { | ||
|
||
// Insert a comment indicating that the Host status = Repair with reason = Full /mcp directory | ||
CommentDetail c = new CommentDetail(); | ||
c.subject = SUBJECT_COMMENT_FULL_MCP_DIR; | ||
c.user = CUEBOT_COMMENT_USER; | ||
c.timestamp = null; | ||
c.message = "Host " + host.getName() + " marked as REPAIR. The current amount of free space in /mcp is " + | ||
(freeMcp/1024) + "MB. It must have at least " + (minBookableFreeMCP/1024) + | ||
"MB of free space in /mcp"; | ||
commentManager.addComment(host, c); | ||
|
||
// Set the host state to REPAIR | ||
hostManager.setHostState(host, HardwareState.REPAIR); | ||
host.hardwareState = HardwareState.REPAIR; | ||
|
||
return; | ||
} else if (host.hardwareState == HardwareState.REPAIR && freeMcp >= minBookableFreeMCP) { | ||
// Check if the host with REPAIR status has comments with subject=SUBJECT_COMMENT_FULL_MCP_DIR and | ||
// user=CUEBOT_COMMENT_USER and delete the comments, if they exists | ||
boolean commentsDeleted = commentManager.deleteCommentByHostUserAndSubject(host, | ||
CUEBOT_COMMENT_USER, SUBJECT_COMMENT_FULL_MCP_DIR); | ||
|
||
if (commentsDeleted) { | ||
// Set the host state to UP | ||
hostManager.setHostState(host, HardwareState.UP); | ||
host.hardwareState = HardwareState.UP; | ||
return; | ||
} | ||
} | ||
} | ||
|
||
// If the states are the same there is no reason to do this update. | ||
if (host.hardwareState.equals(reportState)) { | ||
|
@@ -374,7 +439,7 @@ private void changeNimbyState(DispatchHost host, RenderHost rh) { | |
* locked if all cores are locked. | ||
* | ||
* @param host DispatchHost | ||
* @param renderHost RenderHost | ||
* @param coreInfo CoreDetail | ||
*/ | ||
private void changeLockState(DispatchHost host, CoreDetail coreInfo) { | ||
if (host.lockState == LockState.LOCKED) { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It's my understanding that for most users, the thing we call "MCP" will not actually be the
/mcp
directory:OpenCue/rqd/rqd/rqmachine.py
Lines 519 to 523 in 6703e07
Ideally we would update all of OpenCue to use more fitting terminology, but that's a big project and not something we need to do right now.
However, in things like user-readable strings, and code comments, is there a different term we can use? This line is one example but there are a few others in this PR.
The current string would seem to be a bit misleading, if users have RQD hosts that aren't actually using the literal
/mcp
directory.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hi @bcipriano
I propose we use the term
TempDir
to replaceMcp
. What do you think?I will replace all the
Mcp
byTempDir
in my pull resquest.However, as you said, we can create a new task later on to update all of OpenCue code to fit the terminology.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done!
I changed the comments, variables and methods reference from "mcp" to "temporary directory" or "TempDir".
See commit: 04ec106