Skip to content

Commit f5b9632

Browse files
committed
Optional retry with more memory for all standard backends.
1 parent 5c2e28b commit f5b9632

File tree

12 files changed

+134
-35
lines changed

12 files changed

+134
-35
lines changed

backend/src/main/scala/cromwell/backend/standard/StandardAsyncExecutionActor.scala

Lines changed: 41 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -241,13 +241,16 @@ trait StandardAsyncExecutionActor
241241
lazy val commandDirectory: Path = jobPaths.callExecutionRoot
242242

243243
lazy val memoryRetryErrorKeys: Option[List[String]] =
244-
configurationDescriptor.globalConfig.as[Option[List[String]]]("system.memory-retry-error-keys")
244+
configurationDescriptor.globalConfig.getAs[List[String]]("system.memory-retry-error-keys")
245+
246+
lazy val memoryRetryStderrLimit: Option[Int] =
247+
configurationDescriptor.globalConfig.getAs[Int]("system.memory-retry-stderr-limit")
245248

246249
lazy val memoryRetryFactor: Option[MemoryRetryMultiplierRefined] =
247250
jobDescriptor.workflowDescriptor.getWorkflowOption(WorkflowOptions.MemoryRetryMultiplier) flatMap { value: String =>
248251
Try(value.toDouble) match {
249252
case Success(v) =>
250-
refineV[MemoryRetryMultiplier](v.toDouble) match {
253+
refineV[MemoryRetryMultiplier](v) match {
251254
case Left(e) =>
252255
// should not happen, this case should have been screened for and fast-failed during workflow materialization.
253256
log.error(
@@ -1197,7 +1200,7 @@ trait StandardAsyncExecutionActor
11971200
val nextKvJobKey =
11981201
KvJobKey(jobDescriptor.key.call.fullyQualifiedName, jobDescriptor.key.index, jobDescriptor.key.attempt + 1)
11991202

1200-
def getNextKvPair[A](key: String, value: String): Map[String, KvPair] = {
1203+
def getNextKvPair(key: String, value: String): Map[String, KvPair] = {
12011204
val nextScopedKey = ScopedKey(jobDescriptor.workflowDescriptor.id, nextKvJobKey, key)
12021205
val nextKvPair = KvPair(nextScopedKey, value)
12031206
Map(key -> nextKvPair)
@@ -1414,36 +1417,44 @@ trait StandardAsyncExecutionActor
14141417

14151418
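// Returns true if the task has written an RC file that indicates OOM or, when no such RC file exists, if stderr (read up to the configured memory-retry-stderr-limit) contains one of the memory-retry-error-keys; false otherwise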
14161419
def memoryRetryRC: Future[Boolean] = {
1417-
def returnCodeAsBoolean(codeAsOption: Option[String]): Boolean =
1418-
codeAsOption match {
1419-
case Some(codeAsString) =>
1420-
Try(codeAsString.trim.toInt) match {
1421-
case Success(code) =>
1422-
code match {
1423-
case StderrContainsRetryKeysCode => true
1424-
case _ => false
1425-
}
1426-
case Failure(e) =>
1427-
log.error(
1428-
s"'CheckingForMemoryRetry' action exited with code '$codeAsString' which couldn't be " +
1429-
s"converted to an Integer. Task will not be retried with more memory. Error: ${ExceptionUtils.getMessage(e)}"
1430-
)
1431-
false
1432-
}
1433-
case None => false
1420+
1421+
def readFile(path: Path, maxBytes: Option[Int]): Future[String] =
1422+
asyncIo.contentAsStringAsync(path, maxBytes, failOnOverflow = false)
1423+
1424+
def checkMemoryRetryRC(): Future[Boolean] =
1425+
readFile(jobPaths.memoryRetryRC, None) map { codeAsString =>
1426+
Try(codeAsString.trim.toInt) match {
1427+
case Success(code) =>
1428+
code match {
1429+
case StderrContainsRetryKeysCode => true
1430+
case _ => false
1431+
}
1432+
case Failure(e) =>
1433+
log.error(
1434+
s"'CheckingForMemoryRetry' action exited with code '$codeAsString' which couldn't be " +
1435+
s"converted to an Integer. Task will not be retried with more memory. Error: ${ExceptionUtils.getMessage(e)}"
1436+
)
1437+
false
1438+
}
14341439
}
14351440

1436-
def readMemoryRetryRCFile(fileExists: Boolean): Future[Option[String]] =
1437-
if (fileExists)
1438-
asyncIo.contentAsStringAsync(jobPaths.memoryRetryRC, None, failOnOverflow = false).map(Option(_))
1439-
else
1440-
Future.successful(None)
1441+
def checkMemoryRetryStderr(errorKeys: List[String], maxBytes: Int): Future[Boolean] =
1442+
readFile(jobPaths.standardPaths.error, Option(maxBytes)) map { errorContent =>
1443+
errorKeys.exists(errorContent.contains)
1444+
}
14411445

1442-
for {
1443-
fileExists <- asyncIo.existsAsync(jobPaths.memoryRetryRC)
1444-
retryCheckRCAsOption <- readMemoryRetryRCFile(fileExists)
1445-
retryWithMoreMemory = returnCodeAsBoolean(retryCheckRCAsOption)
1446-
} yield retryWithMoreMemory
1446+
asyncIo.existsAsync(jobPaths.memoryRetryRC) flatMap {
1447+
case true => checkMemoryRetryRC()
1448+
case false =>
1449+
(memoryRetryErrorKeys, memoryRetryStderrLimit) match {
1450+
case (Some(keys), Some(limit)) =>
1451+
asyncIo.existsAsync(jobPaths.standardPaths.error) flatMap {
1452+
case true => checkMemoryRetryStderr(keys, limit)
1453+
case false => Future.successful(false)
1454+
}
1455+
case _ => Future.successful(false)
1456+
}
1457+
}
14471458
}
14481459

14491460
val stderr = jobPaths.standardPaths.error
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
name: do_not_retry_rc0_tes
2+
testFormat: workflowsuccess
3+
backends: [TES]
4+
5+
files {
6+
workflow: retry_with_more_memory/do_not_retry_rc0.wdl
7+
options: retry_with_more_memory/retry_with_more_memory.options
8+
}
9+
10+
metadata {
11+
workflowName: do_not_retry_rc0
12+
status: Succeeded
13+
"calls.do_not_retry_rc0.imitate_oom_error.executionStatus": "Done"
14+
"calls.do_not_retry_rc0.imitate_oom_error.runtimeAttributes.memory": "1 GB"
15+
}
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
name: do_not_retry_rc1_tes
2+
testFormat: workflowsuccess
3+
backends: [TES]
4+
5+
files {
6+
workflow: retry_with_more_memory/do_not_retry_rc1.wdl
7+
options: retry_with_more_memory/retry_with_more_memory.options
8+
}
9+
10+
metadata {
11+
workflowName: do_not_retry_rc1
12+
status: Succeeded
13+
"calls.do_not_retry_rc1.imitate_oom_error.executionStatus": "Done"
14+
"calls.do_not_retry_rc1.imitate_oom_error.runtimeAttributes.memory": "1 GB"
15+
}
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
name: retry_same_memory_output_failure_tes
2+
testFormat: workflowfailure
3+
backends: [TES]
4+
5+
files {
6+
workflow: retry_with_more_memory/retry_same_memory_output_failure.wdl
7+
options: retry_with_more_memory/retry_with_more_memory.options
8+
}
9+
10+
metadata {
11+
workflowName: retry_same_memory_output_failure
12+
status: Failed
13+
"failures.0.message": "Workflow failed"
14+
"failures.0.causedBy.0.message": ~~"Task retry_same_memory_output_failure.imitate_oom_error:NA:3 failed with SYSTEM_ERROR"
15+
"retry_same_memory_output_failure.imitate_oom_error.-1.1.executionStatus": "RetryableFailure"
16+
"retry_same_memory_output_failure.imitate_oom_error.-1.1.runtimeAttributes.memory": "1 GB"
17+
"retry_same_memory_output_failure.imitate_oom_error.-1.2.executionStatus": "RetryableFailure"
18+
"retry_same_memory_output_failure.imitate_oom_error.-1.2.runtimeAttributes.memory": "1 GB"
19+
"retry_same_memory_output_failure.imitate_oom_error.-1.3.executionStatus": "Failed"
20+
"retry_same_memory_output_failure.imitate_oom_error.-1.3.runtimeAttributes.memory": "1 GB"
21+
}

centaur/src/main/resources/standardTestCases/retry_with_more_memory/do_not_retry_rc0.wdl

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@ task imitate_oom_error {
1212
docker: "python:latest"
1313
memory: "1 GB"
1414
maxRetries: 2
15-
backend: "Papiv2"
1615
}
1716
}
1817

centaur/src/main/resources/standardTestCases/retry_with_more_memory/do_not_retry_rc1.wdl

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@ task imitate_oom_error {
1313
memory: "1 GB"
1414
continueOnReturnCode: true
1515
maxRetries: 2
16-
backend: "Papiv2"
1716
}
1817
}
1918

centaur/src/main/resources/standardTestCases/retry_with_more_memory/retry_same_memory_output_failure.wdl

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@ task imitate_oom_error {
1818
memory: "1 GB"
1919
continueOnReturnCode: true
2020
maxRetries: 2
21-
backend: "Papiv2"
2221
}
2322
}
2423

centaur/src/main/resources/standardTestCases/retry_with_more_memory/retry_with_more_memory.wdl

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@ task imitate_oom_error {
1313
docker: "python:latest"
1414
memory: "1 GB"
1515
maxRetries: 2
16-
backend: "Papiv2"
1716
}
1817
}
1918

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
name: retry_with_more_memory_no_wf_option_tes
2+
testFormat: workflowfailure
3+
backends: [TES]
4+
5+
files {
6+
workflow: retry_with_more_memory/retry_with_more_memory.wdl
7+
}
8+
9+
metadata {
10+
workflowName: retry_with_more_memory
11+
status: Failed
12+
"failures.0.message": "Workflow failed"
13+
"retry_with_more_memory.imitate_oom_error.-1.1.executionStatus": "RetryableFailure"
14+
"retry_with_more_memory.imitate_oom_error.-1.1.runtimeAttributes.memory": "1 GB"
15+
"retry_with_more_memory.imitate_oom_error.-1.2.executionStatus": "RetryableFailure"
16+
"retry_with_more_memory.imitate_oom_error.-1.2.runtimeAttributes.memory": "1 GB"
17+
"retry_with_more_memory.imitate_oom_error.-1.3.executionStatus": "Failed"
18+
"retry_with_more_memory.imitate_oom_error.-1.3.runtimeAttributes.memory": "1 GB"
19+
}
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
name: retry_with_more_memory_tes
2+
testFormat: workflowfailure
3+
backends: [TES]
4+
5+
files {
6+
workflow: retry_with_more_memory/retry_with_more_memory.wdl
7+
options: retry_with_more_memory/retry_with_more_memory.options
8+
}
9+
10+
metadata {
11+
workflowName: retry_with_more_memory
12+
status: Failed
13+
"failures.0.message": "Workflow failed"
14+
"failures.0.causedBy.0.message": "stderr for job `retry_with_more_memory.imitate_oom_error:NA:3` contained one of the `memory-retry-error-keys: [OutOfMemory,Killed]` specified in the Cromwell config. Job might have run out of memory."
15+
"retry_with_more_memory.imitate_oom_error.-1.1.executionStatus": "RetryableFailure"
16+
"retry_with_more_memory.imitate_oom_error.-1.1.runtimeAttributes.memory": "1 GB"
17+
"retry_with_more_memory.imitate_oom_error.-1.2.executionStatus": "RetryableFailure"
18+
"retry_with_more_memory.imitate_oom_error.-1.2.runtimeAttributes.memory": "1.1 GB"
19+
"retry_with_more_memory.imitate_oom_error.-1.3.executionStatus": "Failed"
20+
"retry_with_more_memory.imitate_oom_error.-1.3.runtimeAttributes.memory": "1.2100000000000002 GB"
21+
}

0 commit comments

Comments
 (0)