Skip to content

Commit

Permalink
Merge branch 'master' into recover_on_restart_rqd
Browse files Browse the repository at this point in the history
Signed-off-by: Diego Tavares <[email protected]>
  • Loading branch information
DiegoTavares authored Dec 10, 2024
2 parents d7ac144 + 189f17d commit 1d8184b
Show file tree
Hide file tree
Showing 194 changed files with 25,465 additions and 399 deletions.
8 changes: 8 additions & 0 deletions .github/workflows/release-pipeline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,14 @@ jobs:
context: .
push: true

- name: Docker Hub Description
uses: peter-evans/dockerhub-description@v4
with:
username: ${{ secrets.DOCKER_USER }}
password: ${{ secrets.DOCKER_PASS }}
repository: opencue/${{ matrix.component }}
readme-filepath: ./${{ matrix.component }}/README.md

create_release:
needs: preflight
name: Create Release
Expand Down
33 changes: 18 additions & 15 deletions .github/workflows/testing-pipeline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,20 +7,6 @@ on:
branches: [ master ]

jobs:
test_cuebot_2022:
name: Build Cuebot and Run Unit Tests (CY2022)
runs-on: ubuntu-22.04
container:
image: aswf/ci-opencue:2022
env:
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
steps:
- uses: actions/checkout@v3
- name: Build with Gradle
run: |
chown -R aswfuser:aswfgroup .
su -c "cd cuebot && ./gradlew build --stacktrace --info" aswfuser
test_python_2023:
name: Run Python Unit Tests (CY2023)
runs-on: ubuntu-22.04
Expand Down Expand Up @@ -63,10 +49,27 @@ jobs:
chown -R aswfuser:aswfgroup .
su -c "cd cuebot && ./gradlew build --stacktrace --info" aswfuser
integration_test:
name: Run Integration Test
runs-on: ubuntu-22.04
steps:
- name: Checkout
uses: actions/checkout@v3

- name: Run test
run: ci/run_integration_test.sh

- name: Archive log files
uses: actions/upload-artifact@v3
if: ${{ always() }}
with:
name: test-logs
path: /tmp/opencue-test/*.log

lint_python:
name: Lint Python Code
runs-on: ubuntu-22.04
container: aswf/ci-opencue:2022
container: aswf/ci-opencue:2024.1
env:
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
steps:
Expand Down
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,7 @@ rest_gateway/*.tar\.gz
.eggs/*
/cuebot/bin/*
/logs/*
/.gradle/*
/.gradle/*
cuebot/.settings/*
cuebot/.classpath
cuebot/.project
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ OpenCue provides the following features to help manage rendering jobs at scale:
For more information on installing, using, and administering OpenCue, visit
[www.opencue.io](https://www.opencue.io).

Watch YouTube videos on the [OpenCue Playlist](https://www.youtube.com/playlist?list=PL9dZxafYCWmzSBEwVT2AQinmZolYqBzdp) of the Academy Software Foundation (ASWF) to learn more.

# Meeting notes

Starting from May 2024, all OpenCue meeting notes are stored on the [OpenCue Confluence page](http://wiki.aswf.io/display/OPENCUE/OpenCue+Home).
Expand All @@ -51,3 +53,7 @@ For meeting notes before May 2024, please refer to the Opencue repository in the
To join the OpenCue discussion forum for users and admins, join the
[opencue-user mailing list](https://lists.aswf.io/g/opencue-user) or email the
group directly at <[email protected]>.

Join the [OpenCue Slack channel](https://academysoftwarefdn.slack.com/archives/CMFPXV39Q).

The Working Group meets biweekly at 2pm PST on [Zoom](https://zoom-lfx.platform.linuxfoundation.org/meeting/95509555934?password=a8d65f0e-c5f0-44fb-b362-d3ed0c22b7c1).
2 changes: 1 addition & 1 deletion VERSION.in
Original file line number Diff line number Diff line change
@@ -1 +1 @@
1.1
1.4
1 change: 0 additions & 1 deletion ci/pylintrc_test
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,6 @@ disable=arguments-differ,
locally-disabled,
missing-class-docstring,
missing-function-docstring,
no-self-use,
protected-access,
raise-missing-from,
too-many-arguments,
Expand Down
8 changes: 4 additions & 4 deletions ci/run_python_lint.sh
Original file line number Diff line number Diff line change
Expand Up @@ -37,14 +37,14 @@ cd ..

echo "Running lint for cuegui/..."
cd cuegui
PYTHONPATH=../pycue python -m pylint --rcfile=../ci/pylintrc_main cuegui --ignore=cuegui/images,cuegui/images/crystal
PYTHONPATH=../pycue python -m pylint --rcfile=../ci/pylintrc_test tests
PYTHONPATH=../pycue python -m pylint --rcfile=../ci/pylintrc_main cuegui --ignore=cuegui/images,cuegui/images/crystal --disable=no-member
PYTHONPATH=../pycue python -m pylint --rcfile=../ci/pylintrc_test tests --disable=no-member
cd ..

echo "Running lint for cuesubmit/..."
cd cuesubmit
PYTHONPATH=../pycue:../pyoutline python -m pylint --rcfile=../ci/pylintrc_main cuesubmit
PYTHONPATH=../pycue:../pyoutline python -m pylint --rcfile=../ci/pylintrc_test tests
PYTHONPATH=../pycue:../pyoutline python -m pylint --rcfile=../ci/pylintrc_main cuesubmit --disable=no-member
PYTHONPATH=../pycue:../pyoutline python -m pylint --rcfile=../ci/pylintrc_test tests --disable=no-member
cd ..

echo "Running lint for rqd/..."
Expand Down
22 changes: 21 additions & 1 deletion cuebot/src/main/java/com/imageworks/spcue/DispatchFrame.java
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
package com.imageworks.spcue;

import java.util.Optional;

import com.imageworks.spcue.dispatcher.Dispatcher;
import com.imageworks.spcue.grpc.job.FrameState;

public class DispatchFrame extends FrameEntity implements FrameInterface {
Expand All @@ -42,7 +44,6 @@ public class DispatchFrame extends FrameEntity implements FrameInterface {
public int minCores;
public int maxCores;
public boolean threadable;
public long minMemory;
public int minGpus;
public int maxGpus;
public long minGpuMemory;
Expand All @@ -52,5 +53,24 @@ public class DispatchFrame extends FrameEntity implements FrameInterface {

// The operating system this frame is expected to run in
public String os;

// Memory requirement for this frame in bytes
private long minMemory;

// Soft limit to be enforced for this frame in bytes
public long softMemoryLimit;

// Hard limit to be enforced for this frame in bytes
public long hardMemoryLimit;

/**
 * Sets the minimum memory required by this frame and recomputes the
 * soft and hard memory limits derived from it.
 *
 * Each limit is the new minimum scaled by the matching multiplier in
 * {@link Dispatcher} and then truncated back to a {@code long}.
 *
 * @param minMemory minimum memory requirement for this frame
 */
public void setMinMemory(long minMemory) {
    final double requested = (double) minMemory;

    this.minMemory = minMemory;
    // Keep both limits in sync with the requirement they are derived from.
    this.softMemoryLimit = (long) (requested * Dispatcher.SOFT_MEMORY_MULTIPLIER);
    this.hardMemoryLimit = (long) (requested * Dispatcher.HARD_MEMORY_MULTIPLIER);
}

/**
 * Returns the minimum memory requirement last stored for this frame.
 *
 * @return the current minimum memory value
 */
public long getMinMemory() {
    return minMemory;
}
}

8 changes: 4 additions & 4 deletions cuebot/src/main/java/com/imageworks/spcue/VirtualProc.java
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ public static final VirtualProc build(DispatchHost host,
proc.isLocalDispatch = host.isLocalDispatch;

proc.coresReserved = frame.minCores;
proc.memoryReserved = frame.minMemory;
proc.memoryReserved = frame.getMinMemory();
proc.gpusReserved = frame.minGpus;
proc.gpuMemoryReserved = frame.minGpuMemory;

Expand Down Expand Up @@ -156,11 +156,11 @@ else if (proc.coresReserved >= 100) {
proc.coresReserved = wholeCores * 100;
}
else {
if (host.idleMemory - frame.minMemory
if (host.idleMemory - frame.getMinMemory()
<= Dispatcher.MEM_STRANDED_THRESHHOLD) {
proc.coresReserved = wholeCores * 100;
} else {
proc.coresReserved = getCoreSpan(host, frame.minMemory);
proc.coresReserved = getCoreSpan(host, frame.getMinMemory());
}
}
if (host.threadMode == ThreadMode.VARIABLE_VALUE
Expand Down Expand Up @@ -247,7 +247,7 @@ public static final VirtualProc build(DispatchHost host,
proc.isLocalDispatch = host.isLocalDispatch;

proc.coresReserved = lja.getThreads() * 100;
proc.memoryReserved = frame.minMemory;
proc.memoryReserved = frame.getMinMemory();
proc.gpusReserved = frame.minGpus;
proc.gpuMemoryReserved = frame.minGpuMemory;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ public class DispatchQuery {
"AND job.pk_facility = ? " +
"AND " +
"(" +
"job.str_os IS NULL OR job.str_os IN '' " +
"job.str_os IS NULL OR job.str_os = '' " +
"OR " +
"job.str_os IN ? " +
") " +
Expand Down Expand Up @@ -530,7 +530,7 @@ private static final String replaceQueryForFifo(String query) {
"str_user, " +
"int_uid, " +
"str_log_dir, " +
"str_os, " +
"COALESCE(str_os, '') AS str_os, " +
"frame_name, " +
"frame_state, " +
"pk_frame, " +
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -325,7 +325,7 @@ public DispatchFrame mapRow(ResultSet rs, int rowNum) throws SQLException {
frame.minCores = rs.getInt("int_cores_min");
frame.maxCores = rs.getInt("int_cores_max");
frame.threadable = rs.getBoolean("b_threadable");
frame.minMemory = rs.getLong("int_mem_min");
frame.setMinMemory(rs.getLong("int_mem_min"));
frame.minGpus = rs.getInt("int_gpus_min");
frame.maxGpus = rs.getInt("int_gpus_max");
frame.minGpuMemory = rs.getLong("int_gpu_mem_min");
Expand All @@ -348,7 +348,7 @@ public DispatchFrame mapRow(ResultSet rs, int rowNum) throws SQLException {
"job.str_user,"+
"job.int_uid,"+
"job.str_log_dir,"+
"job.str_os,"+
"COALESCE(str_os, '') AS str_os, " +
"frame.str_name AS frame_name, "+
"frame.str_state AS frame_state, "+
"frame.pk_frame, "+
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ public CachedJobWhiteboardMapper(NestedJobWhiteboardMapper result) {
"job.b_paused, " +
"job.b_autoeat, " +
"job.b_comment, " +
"job.str_os, " +
"COALESCE(str_os, '') AS str_os, " +
"job.int_frame_count, " +
"job.int_layer_count, " +
"job_stat.int_waiting_count, " +
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -356,7 +356,7 @@ public VirtualProc mapRow(ResultSet rs, int rowNum) throws SQLException {
"proc.int_virt_max_used,"+
"proc.int_virt_used,"+
"host.str_name AS host_name, " +
"job.str_os " +
"COALESCE(job.str_os, '') AS str_os " +
"FROM " +
"proc, " +
"job, " +
Expand Down Expand Up @@ -389,7 +389,7 @@ public VirtualProc findVirtualProc(FrameInterface frame) {
"proc.*, " +
"host.str_name AS host_name, " +
"host.pk_alloc, " +
"job.str_os, " +
"COALESCE(job.str_os, '') AS str_os, " +
"alloc.pk_facility " +
"FROM " +
"proc, " +
Expand Down Expand Up @@ -530,7 +530,7 @@ public String getCurrentFrameId(ProcInterface p) {
"SELECT " +
"proc.*, " +
"host.str_name AS host_name, " +
"job.str_os, " +
"COALESCE(job.str_os, '') AS str_os, " +
"host.pk_alloc, " +
"alloc.pk_facility " +
"FROM " +
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -286,7 +286,7 @@ public List<VirtualProc> dispatchHost(DispatchHost host, JobInterface job) {
}

if (host.idleCores < host.handleNegativeCoresRequirement(frame.minCores) ||
host.idleMemory < frame.minMemory ||
host.idleMemory < frame.getMinMemory() ||
host.idleGpus < frame.minGpus ||
host.idleGpuMemory < frame.minGpuMemory) {
logger.debug("Cannot dispatch, insufficient resources.");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -396,6 +396,8 @@ public RunFrame prepareRqdRunFrame(VirtualProc proc, DispatchFrame frame) {
.setStartTime(System.currentTimeMillis())
.setIgnoreNimby(proc.isLocalDispatch)
.setOs(proc.os)
.setSoftMemoryLimit(frame.softMemoryLimit)
.setHardMemoryLimit(frame.hardMemoryLimit)
.putAllEnvironment(jobDao.getEnvironment(frame))
.putAllEnvironment(layerDao.getLayerEnvironment(frame))
.putEnvironment("CUE3", "1")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,9 @@ public interface Dispatcher {
// Upgrade the memory on the layer by 1g and retry.
public static final int EXIT_STATUS_MEMORY_FAILURE = 33;

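// Exit status 137 (128 + SIGKILL), reported when a Docker container is
// killed, e.g. by the out-of-memory killer. Handled like
// EXIT_STATUS_MEMORY_FAILURE: raise the layer's memory and retry.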
public static final int DOCKER_EXIT_STATUS_MEMORY_FAILURE = 137;

// max retry time
public static final int FRAME_TIME_NO_RETRY = 3600 * 8;

Expand Down Expand Up @@ -112,6 +115,9 @@ public interface Dispatcher {
// memory
public static final long MINIMUM_MEMORY_INCREASE = CueUtil.GB2;

public static final double SOFT_MEMORY_MULTIPLIER = 1.1;
public static final double HARD_MEMORY_MULTIPLIER = 1.4;

/**
* Dispatch a host to the facility.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -320,14 +320,15 @@ public void handlePostFrameCompleteOperations(VirtualProc proc,
}

/*
* An exit status of 33 indicates that the frame was killed by the
* Some exit statuses indicate that a frame was killed by the
* application due to a memory issue and should be retried. In this
* case, disable the optimizer and raise the memory by what is
* specified in the show's service override, service or 2GB.
*/
if (report.getExitStatus() == Dispatcher.EXIT_STATUS_MEMORY_FAILURE
|| report.getExitSignal() == Dispatcher.EXIT_STATUS_MEMORY_FAILURE
|| frameDetail.exitStatus == Dispatcher.EXIT_STATUS_MEMORY_FAILURE) {
|| frameDetail.exitStatus == Dispatcher.EXIT_STATUS_MEMORY_FAILURE
|| report.getExitStatus() == Dispatcher.DOCKER_EXIT_STATUS_MEMORY_FAILURE) {
long increase = CueUtil.GB2;

// since there can be multiple services, just going for the
Expand Down Expand Up @@ -641,7 +642,8 @@ else if (frame.state.equals(FrameState.DEAD)) {
newState = FrameState.DEAD;
} else if (frame.retries >= job.maxRetries) {
if (!(report.getExitStatus() == Dispatcher.EXIT_STATUS_MEMORY_FAILURE
|| report.getExitSignal() == Dispatcher.EXIT_STATUS_MEMORY_FAILURE))
|| report.getExitSignal() == Dispatcher.EXIT_STATUS_MEMORY_FAILURE
|| report.getExitStatus() == Dispatcher.DOCKER_EXIT_STATUS_MEMORY_FAILURE))
newState = FrameState.DEAD;
}

Expand Down
Loading

0 comments on commit 1d8184b

Please sign in to comment.