Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CROM-6920 Add option to retry only known errors. #7456

Open
wants to merge 1 commit into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
import cromwell.core._
import cromwell.core.io.{AsyncIoActorClient, DefaultIoCommandBuilder, IoCommandBuilder}
import cromwell.core.path.Path
import cromwell.core.retry._
import cromwell.services.keyvalue.KeyValueServiceActor._
import cromwell.services.keyvalue.KvClient
import cromwell.services.metadata.CallMetadataKeys
Expand Down Expand Up @@ -273,6 +274,25 @@
}
}

lazy val maxRetriesMode: MaxRetriesMode = {
val maxRetriesModeOption: Option[MaxRetriesMode] =
jobDescriptor.workflowDescriptor.getWorkflowOption(WorkflowOptions.MaxRetriesMode) flatMap { value: String =>
MaxRetriesMode.tryParse(value) match {
case Success(v) => Option(v)
case Failure(e) =>
// should not happen, this case should have been screened for and fast-failed during workflow materialization.
log.error(
e,
s"Programmer error: unexpected failure attempting to convert value for workflow option " +
s"'${WorkflowOptions.MaxRetriesMode.name}' to MaxRetriesMode."
)
Option(MaxRetriesMode.DefaultMode)
}
}

maxRetriesModeOption.getOrElse(MaxRetriesMode.DefaultMode)
}

lazy val memoryRetryRequested: Boolean = memoryRetryFactor.nonEmpty

/**
Expand Down Expand Up @@ -1107,8 +1127,17 @@

failedRetryableOrNonRetryable match {
case failedNonRetryable: FailedNonRetryableExecutionHandle if previousFailedRetries < maxRetries =>
// The user asked us to retry finitely for them, possibly with a memory modification
evaluateFailureRetry(failedNonRetryable, kvsFromPreviousAttempt, kvsForNextAttempt, memoryRetry)
maxRetriesMode match {
case AllErrors =>
// The user asked us to retry finitely for them, possibly with a memory modification
evaluateFailureRetry(failedNonRetryable, kvsFromPreviousAttempt, kvsForNextAttempt, memoryRetry)
case KnownErrors if memoryRetry.oomDetected =>
// The user asked us to retry finitely for them, with a memory modification
evaluateFailureRetry(failedNonRetryable, kvsFromPreviousAttempt, kvsForNextAttempt, memoryRetry)

Check warning on line 1136 in backend/src/main/scala/cromwell/backend/standard/StandardAsyncExecutionActor.scala

View check run for this annotation

Codecov / codecov/patch

backend/src/main/scala/cromwell/backend/standard/StandardAsyncExecutionActor.scala#L1136

Added line #L1136 was not covered by tests
case _ =>
// No reason to retry
Future.successful(failedNonRetryable)
}
case failedNonRetryable: FailedNonRetryableExecutionHandle =>
// No reason to retry
Future.successful(failedNonRetryable)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{
"max_retries_mode" : "AllErrors"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{
"max_retries_mode" : "KnownErrors"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
name: max_retries_mode_allerrors
testFormat: workflowfailure

files {
workflow: max_retries/max_retries.wdl
options: max_retries/max_retries_mode_allerrors.options
}

metadata {
"failures.0.causedBy.0.message": "Job retry_for_me.broken_task:NA:2 exited with return code 1 which has not been declared as a valid return code. See 'continueOnReturnCode' runtime attribute for more details."
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
name: max_retries_mode_knownerrors
testFormat: workflowfailure

files {
workflow: max_retries/max_retries.wdl
options: max_retries/max_retries_mode_knownerrors.options
}

metadata {
"failures.0.causedBy.0.message": "Job retry_for_me.broken_task:NA:1 exited with return code 1 which has not been declared as a valid return code. See 'continueOnReturnCode' runtime attribute for more details."
}
1 change: 1 addition & 0 deletions core/src/main/scala/cromwell/core/WorkflowOptions.scala
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ object WorkflowOptions {
case object WorkflowFailureMode extends WorkflowOption("workflow_failure_mode")
case object UseReferenceDisks extends WorkflowOption("use_reference_disks")
case object MemoryRetryMultiplier extends WorkflowOption("memory_retry_multiplier")
case object MaxRetriesMode extends WorkflowOption("max_retries_mode")
case object WorkflowCallbackUri extends WorkflowOption("workflow_callback_uri")

private lazy val WorkflowOptionsConf = ConfigFactory.load.getConfig("workflow-options")
Expand Down
17 changes: 17 additions & 0 deletions core/src/main/scala/cromwell/core/retry/MaxRetriesMode.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
package cromwell.core.retry

import scala.util.{Failure, Success, Try}

sealed trait MaxRetriesMode
case object AllErrors extends MaxRetriesMode
case object KnownErrors extends MaxRetriesMode

object MaxRetriesMode {
val DefaultMode = AllErrors
private val AllModes = Seq(AllErrors, KnownErrors)

def tryParse(mode: String): Try[MaxRetriesMode] =
AllModes find { _.toString.equalsIgnoreCase(mode) } map { Success(_) } getOrElse Failure(
new Exception(s"Invalid max retries mode: '$mode', supported modes are: ${AllModes.mkString("'", "', '", "'")}")

Check warning on line 15 in core/src/main/scala/cromwell/core/retry/MaxRetriesMode.scala

View check run for this annotation

Codecov / codecov/patch

core/src/main/scala/cromwell/core/retry/MaxRetriesMode.scala#L15

Added line #L15 was not covered by tests
)
}
2 changes: 1 addition & 1 deletion docs/RuntimeAttributes.md
Original file line number Diff line number Diff line change
Expand Up @@ -247,7 +247,7 @@ runtime {

*Default: _0_*

This retry option is introduced to provide a method for tackling transient job failures. For example, if a task fails due to a timeout from accessing an external service, then this option helps re-run the failed the task without having to re-run the entire workflow. It takes an Int as a value that indicates the maximum number of times Cromwell should retry a failed task. This retry is applied towards jobs that fail while executing the task command. This method only applies to transient job failures and is a feeble attempt to retry a job, that is it cannot be used to increase memory in out-of-memory situations.
This retry option is introduced to provide a method for tackling transient job failures. For example, if a task fails due to a timeout from accessing an external service, then this option helps re-run the failed the task without having to re-run the entire workflow. It takes an Int as a value that indicates the maximum number of times Cromwell should retry a failed task. This retry is applied towards jobs that fail while executing the task command. This method only applies to transient job failures and is a feeble attempt to retry a job.

If using the Google backend, it's important to note that The `maxRetries` count is independent from the [preemptible](#preemptible) count. For example, the task below can be retried up to 6 times if it's preempted 3 times AND the command execution fails 3 times.

Expand Down
9 changes: 9 additions & 0 deletions docs/wf_options/Overview.md
Original file line number Diff line number Diff line change
Expand Up @@ -144,3 +144,12 @@ Example `options.json`:
"memory_retry_multiplier" : 1.1
}
```

## Max Retries Mode

The `max_retries_mode` workflow options sets the behavior of retrying failed jobs when the [`maxRetries` runtime
attribute](../RuntimeAttributes.md#maxretries) is specified.

The possible values are `AllErrors` or `KnownErrors`. If set to `AllErrors`, the job will be retried for any error. If
set to `KnownErrors`, the job will only be retried for errors that are known to be retryable, such as increasing memory
in out-of-memory situations. The default value is `AllErrors`.
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
import cromwell.core.labels.{Label, Labels}
import cromwell.core.logging.WorkflowLogging
import cromwell.core.path.{PathBuilder, PathBuilderFactory}
import cromwell.core.retry._
import cromwell.engine._
import cromwell.engine.backend.CromwellBackends
import cromwell.engine.workflow.WorkflowProcessingEventPublishing._
Expand Down Expand Up @@ -184,6 +185,19 @@
s"'$optionName' is specified in workflow options but value is not of expected Double type: ${e.getMessage}".invalidNel
}
}

def validateMaxRetriesMode(workflowOptions: WorkflowOptions): ErrorOr[MaxRetriesMode] = {
val modeString: Try[String] = workflowOptions.get(WorkflowOptions.MaxRetriesMode) match {
case Success(value) => Success(value)
case Failure(_: OptionNotFoundException) => Success(MaxRetriesMode.DefaultMode.toString)
case Failure(e) => Failure(e)

Check warning on line 193 in engine/src/main/scala/cromwell/engine/workflow/lifecycle/materialization/MaterializeWorkflowDescriptorActor.scala

View check run for this annotation

Codecov / codecov/patch

engine/src/main/scala/cromwell/engine/workflow/lifecycle/materialization/MaterializeWorkflowDescriptorActor.scala#L193

Added line #L193 was not covered by tests
}

modeString flatMap MaxRetriesMode.tryParse match {
case Success(mode) => mode.validNel
case Failure(t) => t.getMessage.invalidNel

Check warning on line 198 in engine/src/main/scala/cromwell/engine/workflow/lifecycle/materialization/MaterializeWorkflowDescriptorActor.scala

View check run for this annotation

Codecov / codecov/patch

engine/src/main/scala/cromwell/engine/workflow/lifecycle/materialization/MaterializeWorkflowDescriptorActor.scala#L198

Added line #L198 was not covered by tests
}
}
}

// TODO WOM: need to decide where to draw the line between language specific initialization and WOM
Expand Down Expand Up @@ -499,12 +513,15 @@

val memoryRetryMultiplierValidation: ErrorOr[Unit] = validateMemoryRetryMultiplier(workflowOptions)

val maxRetriesModeValidation: ErrorOr[MaxRetriesMode] = validateMaxRetriesMode(workflowOptions)

(failureModeValidation,
backendAssignmentsValidation,
callCachingModeValidation,
useReferenceDisksValidation,
memoryRetryMultiplierValidation
) mapN { case (failureMode, backendAssignments, callCachingMode, _, _) =>
memoryRetryMultiplierValidation,
maxRetriesModeValidation
) mapN { case (failureMode, backendAssignments, callCachingMode, _, _, _) =>
val callable = womNamespace.executable.entryPoint
val backendDescriptor = BackendWorkflowDescriptor(id,
callable,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,15 @@ class MaterializeWorkflowDescriptorActorSpec
private val invalidMemoryRetryOptions5 =
WorkflowOptions.fromJsonString(""" { "memory_retry_multiplier": true } """).get

private val validMaxRetriesModeOptions1 =
WorkflowOptions.fromJsonString(""" { "max_retries_mode": "AllErrors" } """).get
private val validMaxRetriesModeOptions2 =
WorkflowOptions.fromJsonString(""" { "max_retries_mode": "KnownErrors" } """).get
private val invalidMaxRetriesModeOptions1 =
WorkflowOptions.fromJsonString(""" { "max_retries_mode": "invalid value" } """).get
private val invalidMaxRetriesModeOptions2 =
WorkflowOptions.fromJsonString(""" { "max_retries_mode": true } """).get

before {}

after {
Expand Down Expand Up @@ -699,6 +708,27 @@ class MaterializeWorkflowDescriptorActorSpec
}
}

"accept valid max_retries_mode" in {
List(validMaxRetriesModeOptions1, validMaxRetriesModeOptions2, validOptions) map { options =>
MaterializeWorkflowDescriptorActor.validateMaxRetriesMode(options) match {
case Valid(_) => // good!
case Invalid(_) => fail(s"max_retries_mode validation for $options failed but should have passed!")
}
}
}

"reject invalid max_retries_mode" in {
List(invalidMaxRetriesModeOptions1, invalidMaxRetriesModeOptions2) map { options =>
MaterializeWorkflowDescriptorActor.validateMaxRetriesMode(options) match {
case Invalid(errorsList) =>
errorsList.head should startWith(
"Invalid max retries mode"
)
case Valid(_) => fail(s"max_retries_mode validation for $options succeeded but should have failed!")
}
}
}

"fail materialization if memory_retry_multiplier is invalid" in {
val materializeWfActor = system.actorOf(
MaterializeWorkflowDescriptorActor.props(NoBehaviorActor,
Expand Down
Loading