Skip to content

Commit

Permalink
Retry with more memory MEM_SIZE/MEM_UNIT Centaur test
Browse files Browse the repository at this point in the history
  • Loading branch information
mcovarr committed Feb 5, 2025
1 parent 4a92bbc commit f87aff2
Show file tree
Hide file tree
Showing 4 changed files with 36 additions and 12 deletions.
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
name: gcpbatch_retry_with_more_memory
testFormat: workflowfailure
testFormat: workflowsuccess
backends: [GCPBATCH]

files {
Expand All @@ -9,13 +9,10 @@ files {

metadata {
workflowName: retry_with_more_memory
status: Failed
"failures.0.message": "Workflow failed"
"failures.0.causedBy.0.message": "stderr for job `retry_with_more_memory.imitate_oom_error:NA:3` contained one of the `memory-retry-error-keys: [OutOfMemory,Killed]` specified in the Cromwell config. Job might have run out of memory."
status: Succeeded
"retry_with_more_memory.imitate_oom_error.-1.1.executionStatus": "RetryableFailure"
"retry_with_more_memory.imitate_oom_error.-1.1.runtimeAttributes.memory": "1 GB"
"retry_with_more_memory.imitate_oom_error.-1.2.executionStatus": "RetryableFailure"
"retry_with_more_memory.imitate_oom_error.-1.2.runtimeAttributes.memory": "1.1 GB"
"retry_with_more_memory.imitate_oom_error.-1.3.executionStatus": "Failed"
"retry_with_more_memory.imitate_oom_error.-1.3.runtimeAttributes.memory": "1.2100000000000002 GB"
"outputs.retry_with_more_memory.memory_output": "1.2100000000000002 GB"
}
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,21 @@ version 1.0

task imitate_oom_error {
command {
printf "Exception in thread "main" java.lang.OutOfMemoryError: testing\n\tat Test.main(Test.java:1)\n" >&2 && (exit 1)
# As a simulation of an OOM condition, do not create the 'foo' file. Cromwell should still be able to delocalize important detritus.
# touch foo
echo "$MEM_SIZE $MEM_UNIT"

# Bash has no built-in floating-point arithmetic, so use Python to do the comparison.
LESS=$(python -c "print($MEM_SIZE < 1.21)")

if [[ "$LESS" = "True" ]]
then
printf "Exception in thread "main" java.lang.OutOfMemoryError: testing\n\tat Test.main(Test.java:1)\n" >&2
exit 1
fi

echo "$MEM_SIZE $MEM_UNIT" > memory_output.txt
}
output {
File foo = "foo"
String memory_output = read_string("memory_output.txt")
}
runtime {
docker: "python:latest"
Expand All @@ -19,4 +28,8 @@ task imitate_oom_error {

# Workflow that runs the single imitate_oom_error task and exposes its result.
workflow retry_with_more_memory {
# Invoke the task with no explicit inputs.
call imitate_oom_error

# Surface the task's memory_output string as a workflow-level output.
output {
String memory_output = imitate_oom_error.memory_output
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import cromwell.backend.google.batch.models.GcpBatchConfigurationAttributes.GcsT
import cromwell.backend.google.batch.models.{BatchParameter, GcpBatchInput, GcpBatchOutput}
import cromwell.core.path.Path
import mouse.all.anySyntaxMouse
import wom.format.MemorySize

import scala.concurrent.duration.{Duration, DurationInt, FiniteDuration}
import scala.jdk.CollectionConverters._
Expand Down Expand Up @@ -147,7 +148,8 @@ object RunnableBuilder {
scriptContainerPath: String,
jobShell: String,
volumes: List[Volume],
dockerhubCredentials: (String, String)
dockerhubCredentials: (String, String),
memory: MemorySize
): Runnable.Builder = {

val container = (dockerhubCredentials._1, dockerhubCredentials._2) match {
Expand All @@ -164,9 +166,20 @@ object RunnableBuilder {
.setEntrypoint(jobShell)
.addCommands(scriptContainerPath)
}

// Expose the memory amount and unit as the MEM_SIZE and MEM_UNIT environment variables so that users can
// read the machine's current memory value (e.g. after a retry with more memory) in their command blocks.
val environment =
Environment
.newBuilder()
.putAllVariables(
Map("MEM_UNIT" -> memory.unit.toString, "MEM_SIZE" -> memory.amount.toString).asJava

Check warning on line 176 in supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/runnable/RunnableBuilder.scala

View check run for this annotation

Codecov / codecov/patch

supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/runnable/RunnableBuilder.scala#L175-L176

Added lines #L175 - L176 were not covered by tests
)

Runnable
.newBuilder()
.setContainer(container)
.setEnvironment(environment)

Check warning on line 182 in supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/runnable/RunnableBuilder.scala

View check run for this annotation

Codecov / codecov/patch

supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/runnable/RunnableBuilder.scala#L182

Added line #L182 was not covered by tests
.withVolumes(volumes)
.putLabels(Key.Tag, Value.UserRunnable)
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@ trait UserRunnable {
scriptContainerPath = createParameters.commandScriptContainerPath.pathAsString,
jobShell = "/bin/bash",
volumes = volumes,
dockerhubCredentials = createParameters.dockerhubCredentials
dockerhubCredentials = createParameters.dockerhubCredentials,
memory = createParameters.runtimeAttributes.memory

Check warning on line 16 in supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/runnable/UserRunnable.scala

View check run for this annotation

Codecov / codecov/patch

supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/runnable/UserRunnable.scala#L15-L16

Added lines #L15 - L16 were not covered by tests
)

val describeRunnable = RunnableBuilder.describeDocker("user runnable", userRunnable)
Expand Down

0 comments on commit f87aff2

Please sign in to comment.