feat: [GCP Batch] Support passing standard machine types to the Google backend (#1)

javiergaitan · mcovarr · web-flow · commit ae4e6aa55d88 · 2024-10-10T08:15:49.000-07:00
* WX-1810 WX-1830 n1/n2/n2d machine types, cpuPlatform on GCPBATCH (broadinstitute#7518) * feat: [GCP Batch] Support passing standard machine types to the Google backend --------- Co-authored-by: Miguel Covarrubias <mcovarr@users.noreply.github.com>
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -22,6 +22,8 @@ be found [here](https://cromwell.readthedocs.io/en/stable/backends/HPC/#optional
 ### GCP Batch
 
 - The `genomics` configuration entry was renamed to `batch`, see [ReadTheDocs](https://cromwell.readthedocs.io/en/stable/backends/GCPBatch/) for more information.
+- Fixes a bug that prevented jobs from being recovered on Cromwell restart.
+- Fixes machine type selection to match the Google Cloud Life Sciences backend, including default n1 non-shared-core machine types and correct handling of `cpuPlatform` to select n2 or n2d machine types as appropriate.
 - Fixes the preemption error handling, now, the correct error message is printed, this also handles the other potential exit codes.
 - Fixes pulling Docker image metadata from private GCR repositories.
 - Fixed `google_project` and `google_compute_service_account` workflow options not taking effect when using GCP Batch backend
diff --git a/build.sbt b/build.sbt
@@ -237,6 +237,7 @@ lazy val googlePipelinesV2Beta = (project in backendRoot / "google" / "pipelines
 
 lazy val googleBatch = (project in backendRoot / "google" / "batch")
   .withLibrarySettings("cromwell-google-batch-backend")
+  .dependsOn(core)
   .dependsOn(backend)
   .dependsOn(gcsFileSystem)
   .dependsOn(drsFileSystem)
diff --git a/centaur/src/main/resources/standardTestCases/papi_cpu_platform.test b/centaur/src/main/resources/standardTestCases/papi_cpu_platform.test
@@ -1,6 +1,7 @@
 name: papi_cpu_platform
 testFormat: workflowsuccess
-backends: [Papiv2]
+backendsMode: any
+backends: [Papiv2, GCPBATCH]
 
 files {
   workflow: papi_cpu_platform/papi_cpu_platform.wdl
diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/actors/BatchApiRunCreationClient.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/actors/BatchApiRunCreationClient.scala
@@ -53,7 +53,7 @@ trait BatchApiRunCreationClient { this: Actor with ActorLogging with BatchInstru
         backendSingletonActor ! BatchApiRequestManager.BatchRunCreationRequest(
           request.workflowId,
           self,
-          requestFactory.submitRequest(request)
+          requestFactory.submitRequest(request, jobLogger)
         )
         val newPromise = Promise[StandardAsyncJob]()
         runCreationClientPromise = Option(newPromise)
diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/api/GcpBatchRequestFactory.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/api/GcpBatchRequestFactory.scala
@@ -6,13 +6,14 @@ import cromwell.backend.google.batch.io.GcpBatchAttachedDisk
 import cromwell.backend.google.batch.models.GcpBatchConfigurationAttributes.VirtualPrivateCloudConfiguration
 import cromwell.backend.google.batch.models._
 import cromwell.backend.google.batch.monitoring.{CheckpointingConfiguration, MonitoringImage}
+import cromwell.core.logging.JobLogger
 import cromwell.core.path.Path
 import wom.runtime.WomOutputRuntimeExtractor
 
 import scala.concurrent.duration.FiniteDuration
 
 trait GcpBatchRequestFactory {
-  def submitRequest(data: GcpBatchRequest): CreateJobRequest
+  def submitRequest(data: GcpBatchRequest, jobLogger: JobLogger): CreateJobRequest
 
   def queryRequest(jobName: JobName): GetJobRequest
 
diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/api/GcpBatchRequestFactoryImpl.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/api/GcpBatchRequestFactoryImpl.scala
@@ -22,7 +22,8 @@ import cromwell.backend.google.batch.io.GcpBatchAttachedDisk
 import cromwell.backend.google.batch.models.GcpBatchConfigurationAttributes.GcsTransferConfiguration
 import cromwell.backend.google.batch.models.{GcpBatchRequest, VpcAndSubnetworkProjectLabelValues}
 import cromwell.backend.google.batch.runnable._
-import cromwell.backend.google.batch.util.BatchUtilityConversions
+import cromwell.backend.google.batch.util.{BatchUtilityConversions, GcpBatchMachineConstraints}
+import cromwell.core.logging.JobLogger
 
 import scala.jdk.CollectionConverters._
 
@@ -74,14 +75,16 @@ class GcpBatchRequestFactoryImpl()(implicit gcsTransferConfiguration: GcsTransfe
   private def createInstancePolicy(cpuPlatform: String,
                                    spotModel: ProvisioningModel,
                                    accelerators: Option[Accelerator.Builder],
-                                   attachedDisks: List[AttachedDisk]
+                                   attachedDisks: List[AttachedDisk],
+                                   machineType: String
   ): InstancePolicy.Builder = {
 
     // set GPU count to 0 if not included in workflow
     val gpuAccelerators = accelerators.getOrElse(Accelerator.newBuilder.setCount(0).setType("")) // TODO: Driver version
 
     val instancePolicy = InstancePolicy.newBuilder
       .setProvisioningModel(spotModel)
+      .setMachineType(machineType)
       .addAllDisks(attachedDisks.asJava)
       .setMinCpuPlatform(cpuPlatform)
       .buildPartial()
@@ -154,7 +157,7 @@ class GcpBatchRequestFactoryImpl()(implicit gcsTransferConfiguration: GcsTransfe
     }
   }
 
-  override def submitRequest(data: GcpBatchRequest): CreateJobRequest = {
+  override def submitRequest(data: GcpBatchRequest, jobLogger: JobLogger): CreateJobRequest = {
 
     val runtimeAttributes = data.gcpBatchParameters.runtimeAttributes
     val createParameters = data.createParameters
@@ -224,7 +227,15 @@ class GcpBatchRequestFactoryImpl()(implicit gcsTransferConfiguration: GcsTransfe
     val computeResource = createComputeResource(cpuCores, memory, gcpBootDiskSizeMb)
     val taskSpec = createTaskSpec(sortedRunnables, computeResource, retryCount, durationInSeconds, allVolumes)
     val taskGroup: TaskGroup = createTaskGroup(taskCount, taskSpec)
-    val instancePolicy = createInstancePolicy(cpuPlatform, spotModel, accelerators, allDisks)
+    val machineType = GcpBatchMachineConstraints.machineType(runtimeAttributes.memory,
+                                                             runtimeAttributes.cpu,
+                                                             cpuPlatformOption = runtimeAttributes.cpuPlatform,
+                                                             standardMachineTypeOption = runtimeAttributes.standardMachineType,
+                                                             googleLegacyMachineSelection = false,
+                                                             jobLogger = jobLogger
+    )
+    val instancePolicy =
+      createInstancePolicy(cpuPlatform = cpuPlatform, spotModel, accelerators, allDisks, machineType = machineType)
     val locationPolicy = LocationPolicy.newBuilder.addAllowedLocations(zones).build
     val allocationPolicy =
       createAllocationPolicy(data, locationPolicy, instancePolicy.build, networkPolicy, gcpSa, accelerators)
diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/GcpBatchCustomMachineType.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/GcpBatchCustomMachineType.scala
@@ -11,6 +11,8 @@ import wom.format.MemorySize
 
 import scala.math.{log, pow}
 
+case class StandardMachineType(machineType: String) {}
+
 /**
   * Adjusts memory and cpu for custom machine types.
   *
diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/GcpBatchRuntimeAttributes.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/GcpBatchRuntimeAttributes.scala
@@ -49,7 +49,8 @@ final case class GcpBatchRuntimeAttributes(cpu: Int Refined Positive,
                                            continueOnReturnCode: ContinueOnReturnCode,
                                            noAddress: Boolean,
                                            useDockerImageCache: Option[Boolean],
-                                           checkpointFilename: Option[String]
+                                           checkpointFilename: Option[String],
+                                           standardMachineType: Option[String]
 )
 
 object GcpBatchRuntimeAttributes {
@@ -77,13 +78,16 @@ object GcpBatchRuntimeAttributes {
   private val cpuPlatformValidationInstance = new StringRuntimeAttributesValidation(CpuPlatformKey).optional
   // via `gcloud compute zones describe us-central1-a`
   val CpuPlatformIntelCascadeLakeValue = "Intel Cascade Lake"
+  val CpuPlatformIntelIceLakeValue = "Intel Ice Lake"
   val CpuPlatformAMDRomeValue = "AMD Rome"
 
   val UseDockerImageCacheKey = "useDockerImageCache"
   private val useDockerImageCacheValidationInstance = new BooleanRuntimeAttributesValidation(
     UseDockerImageCacheKey
   ).optional
 
+  val StandardMachineTypeKey = "standardMachineType"
+
   val CheckpointFileKey = "checkpointFile"
   private val checkpointFileValidationInstance = new StringRuntimeAttributesValidation(CheckpointFileKey).optional
 
@@ -97,6 +101,8 @@ object GcpBatchRuntimeAttributes {
       )
   private def cpuPlatformValidation(runtimeConfig: Option[Config]): OptionalRuntimeAttributesValidation[String] =
     cpuPlatformValidationInstance
+  private def standardMachineTypeValidation(runtimeConfig: Option[Config]): OptionalRuntimeAttributesValidation[String] =
+    new StringRuntimeAttributesValidation(StandardMachineTypeKey).optional
   private def gpuTypeValidation(runtimeConfig: Option[Config]): OptionalRuntimeAttributesValidation[GpuType] =
     GpuTypeValidation.optional
 
@@ -170,7 +176,8 @@ object GcpBatchRuntimeAttributes {
         bootDiskSizeValidation(runtimeConfig),
         useDockerImageCacheValidation(runtimeConfig),
         checkpointFileValidationInstance,
-        dockerValidation
+        dockerValidation,
+        standardMachineTypeValidation(runtimeConfig)
       )
   }
 
@@ -227,6 +234,10 @@ object GcpBatchRuntimeAttributes {
       useDockerImageCacheValidation(runtimeAttrsConfig).key,
       validatedRuntimeAttributes
     )
+    val standardMachineType: Option[String] = RuntimeAttributesValidation.extractOption(
+      standardMachineTypeValidation(runtimeAttrsConfig).key,
+      validatedRuntimeAttributes
+    )
 
     new GcpBatchRuntimeAttributes(
       cpu = cpu,
@@ -242,7 +253,8 @@ object GcpBatchRuntimeAttributes {
       continueOnReturnCode = continueOnReturnCode,
       noAddress = noAddress,
       useDockerImageCache = useDockerImageCache,
-      checkpointFilename = checkpointFileName
+      checkpointFilename = checkpointFileName,
+      standardMachineType = standardMachineType
     )
   }
 
diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/util/GcpBatchMachineConstraints.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/util/GcpBatchMachineConstraints.scala
@@ -4,29 +4,34 @@ import cromwell.backend.google.batch.models.{
   GcpBatchRuntimeAttributes,
   N1CustomMachineType,
   N2CustomMachineType,
-  N2DCustomMachineType
+  N2DCustomMachineType,
+  StandardMachineType
 }
+import cromwell.core.logging.JobLogger
 import eu.timepit.refined.api.Refined
 import eu.timepit.refined.numeric.Positive
-import org.slf4j.Logger
 import wdl4s.parser.MemoryUnit
 import wom.format.MemorySize
 
 object GcpBatchMachineConstraints {
   def machineType(memory: MemorySize,
                   cpu: Int Refined Positive,
                   cpuPlatformOption: Option[String],
+                  standardMachineTypeOption: Option[String],
                   googleLegacyMachineSelection: Boolean,
-                  jobLogger: Logger
+                  jobLogger: JobLogger
   ): String =
-    if (googleLegacyMachineSelection) {
+    if (standardMachineTypeOption.exists(_.trim.nonEmpty)) {
+      StandardMachineType(standardMachineTypeOption.get).machineType
+    } else if (googleLegacyMachineSelection) {
       s"predefined-$cpu-${memory.to(MemoryUnit.MB).amount.intValue()}"
     } else {
-      // If someone requests Intel Cascade Lake as their CPU platform then switch the machine type to n2.
+      // If someone requests Intel Cascade Lake or Intel Ice Lake as their CPU platform then switch the machine type to n2.
       // Similarly, CPU platform of AMD Rome corresponds to the machine type n2d.
       val customMachineType =
         cpuPlatformOption match {
           case Some(GcpBatchRuntimeAttributes.CpuPlatformIntelCascadeLakeValue) => N2CustomMachineType
+          case Some(GcpBatchRuntimeAttributes.CpuPlatformIntelIceLakeValue) => N2CustomMachineType
           case Some(GcpBatchRuntimeAttributes.CpuPlatformAMDRomeValue) => N2DCustomMachineType
           case _ => N1CustomMachineType
         }
diff --git a/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/actors/GcpBatchAsyncBackendJobExecutionActorSpec.scala b/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/actors/GcpBatchAsyncBackendJobExecutionActorSpec.scala
@@ -131,7 +131,7 @@ class GcpBatchAsyncBackendJobExecutionActorSpec
     val runtimeAttributesBuilder = GcpBatchRuntimeAttributes.runtimeAttributesBuilder(configuration)
 
     val requestFactory: GcpBatchRequestFactory = new GcpBatchRequestFactory {
-      override def submitRequest(data: GcpBatchRequest): CreateJobRequest = null
+      override def submitRequest(data: GcpBatchRequest, jobLogger: JobLogger): CreateJobRequest = null
 
       override def queryRequest(jobName: JobName): GetJobRequest = null
 
diff --git a/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/models/GcpBatchRuntimeAttributesSpec.scala b/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/models/GcpBatchRuntimeAttributesSpec.scala
@@ -286,7 +286,8 @@ trait GcpBatchRuntimeAttributesSpecsMixin {
     continueOnReturnCode = ContinueOnReturnCodeSet(Set(0)),
     noAddress = false,
     useDockerImageCache = None,
-    checkpointFilename = None
+    checkpointFilename = None,
+    standardMachineType = None
   )
 
   def assertBatchRuntimeAttributesSuccessfulCreation(runtimeAttributes: Map[String, WomValue],
diff --git a/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/util/GcpBatchMachineConstraintsSpec.scala b/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/util/GcpBatchMachineConstraintsSpec.scala

Original file line number	Diff line number	Diff line change
`@@ -53,7 +53,7 @@ trait BatchApiRunCreationClient { this: Actor with ActorLogging with BatchInstru`
`53`	`53`	`backendSingletonActor ! BatchApiRequestManager.BatchRunCreationRequest(`
`54`	`54`	`request.workflowId,`
`55`	`55`	`self,`
`56`		`- requestFactory.submitRequest(request)`
	`56`	`+ requestFactory.submitRequest(request, jobLogger)`
`57`	`57`	`)`
`58`	`58`	`val newPromise = Promise[StandardAsyncJob]()`
`59`	`59`	`runCreationClientPromise = Option(newPromise)`
Original file line number	Diff line number	Diff line change
`@@ -11,6 +11,8 @@ import wom.format.MemorySize`
`11`	`11`
`12`	`12`	`import scala.math.{log, pow}`
`13`	`13`
	`14`	`+case class StandardMachineType(machineType: String) {}`
	`15`	`+`
`14`	`16`	`/**`
`15`	`17`	`* Adjusts memory and cpu for custom machine types.`
`16`	`18`	`*`
Original file line number	Diff line number	Diff line change
`@@ -286,7 +286,8 @@ trait GcpBatchRuntimeAttributesSpecsMixin {`
`286`	`286`	`continueOnReturnCode = ContinueOnReturnCodeSet(Set(0)),`
`287`	`287`	`noAddress = false,`
`288`	`288`	`useDockerImageCache = None,`
`289`		`- checkpointFilename = None`
	`289`	`+ checkpointFilename = None,`
	`290`	`+ standardMachineType = None`
`290`	`291`	`)`
`291`	`292`
`292`	`293`	`def assertBatchRuntimeAttributesSuccessfulCreation(runtimeAttributes: Map[String, WomValue],`