diff --git a/goldens/Basic_cluster_create.txt b/goldens/Basic_cluster_create.txt index 13bb26742..c83e91ee2 100644 --- a/goldens/Basic_cluster_create.txt +++ b/goldens/Basic_cluster_create.txt @@ -37,11 +37,11 @@ kubectl wait deployment/coredns --for=condition=Available=true --namespace=kube- [XPK] Task: `Determine current gke master version` is implemented by the following command not running since it is a dry run. gcloud beta container clusters describe golden-cluster --location us-central1 --project golden-project --format="value(currentMasterVersion)" [XPK] Creating 1 node pool or pools of tpu7x-8 -We assume that the underlying system is: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=False, gpu_config=None) +We assume that the underlying system is: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=False, gpu_config=None, parallel_containers=2) [XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run. gcloud beta container node-pools list --cluster golden-cluster --project=golden-project --location=us-central1 --format="csv[no-heading](name)" [XPK] Creating 1 node pool or pools of tpu7x-8 -Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=False, gpu_config=None) +Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=False, gpu_config=None, parallel_containers=2) [XPK] Task: `Get Node Pool Zone` is implemented by the following command not running since it is a dry run. gcloud beta container node-pools describe 0 --cluster golden-cluster --project=golden-project --location=us-central1 --format="value(locations)" [XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. diff --git a/goldens/Cluster_create_for_multi-host_nodepool.txt b/goldens/Cluster_create_for_multi-host_nodepool.txt index e35bd13db..9ca94f20a 100644 --- a/goldens/Cluster_create_for_multi-host_nodepool.txt +++ b/goldens/Cluster_create_for_multi-host_nodepool.txt @@ -37,11 +37,11 @@ kubectl wait deployment/coredns --for=condition=Available=true --namespace=kube- [XPK] Task: `Determine current gke master version` is implemented by the following command not running since it is a dry run. gcloud beta container clusters describe golden-cluster --location us-central1 --project golden-project --format="value(currentMasterVersion)" [XPK] Creating 1 node pool or pools of tpu7x-16 -We assume that the underlying system is: SystemCharacteristics(topology='2x2x2', vms_per_slice=2, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-16', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=True, gpu_config=None) +We assume that the underlying system is: SystemCharacteristics(topology='2x2x2', vms_per_slice=2, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-16', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=True, gpu_config=None, parallel_containers=2) [XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run. gcloud beta container node-pools list --cluster golden-cluster --project=golden-project --location=us-central1 --format="csv[no-heading](name)" [XPK] Creating 1 node pool or pools of tpu7x-16 -Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x2', vms_per_slice=2, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-16', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=True, gpu_config=None) +Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x2', vms_per_slice=2, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-16', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=True, gpu_config=None, parallel_containers=2) [XPK] Task: `Get Node Pool Zone` is implemented by the following command not running since it is a dry run. gcloud beta container node-pools describe 0 --cluster golden-cluster --project=golden-project --location=us-central1 --format="value(locations)" [XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. diff --git a/goldens/Cluster_create_private.txt b/goldens/Cluster_create_private.txt index 81f631140..8875a5088 100644 --- a/goldens/Cluster_create_private.txt +++ b/goldens/Cluster_create_private.txt @@ -41,13 +41,13 @@ kubectl wait deployment/coredns --for=condition=Available=true --namespace=kube- [XPK] Task: `Determine current gke master version` is implemented by the following command not running since it is a dry run. gcloud beta container clusters describe golden-cluster-private --location us-central1 --project golden-project --format="value(currentMasterVersion)" [XPK] Creating 1 node pool or pools of v5p-8 -We assume that the underlying system is: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu-v5p-slice', gce_machine_type='ct5p-hightpu-4t', chips_per_vm=4, accelerator_type=TPU, device_type='v5p-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=False, gpu_config=None) +We assume that the underlying system is: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu-v5p-slice', gce_machine_type='ct5p-hightpu-4t', chips_per_vm=4, accelerator_type=TPU, device_type='v5p-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=False, gpu_config=None, parallel_containers=1) [XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run. gcloud beta container node-pools list --cluster golden-cluster-private --project=golden-project --location=us-central1 --format="csv[no-heading](name)" [XPK] Task: `Describe reservation` is implemented by the following command not running since it is a dry run. gcloud beta compute reservations describe golden-reservation --project=golden-project --zone=us-central1-a [XPK] Creating 1 node pool or pools of v5p-8 -Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu-v5p-slice', gce_machine_type='ct5p-hightpu-4t', chips_per_vm=4, accelerator_type=TPU, device_type='v5p-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=False, gpu_config=None) +Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu-v5p-slice', gce_machine_type='ct5p-hightpu-4t', chips_per_vm=4, accelerator_type=TPU, device_type='v5p-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=False, gpu_config=None, parallel_containers=1) [XPK] Task: `Get Node Pool Zone` is implemented by the following command not running since it is a dry run. gcloud beta container node-pools describe 0 --cluster golden-cluster-private --project=golden-project --location=us-central1 --format="value(locations)" [XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. diff --git a/goldens/Cluster_create_sub-slicing.txt b/goldens/Cluster_create_sub-slicing.txt index 31754d125..101c4b0f7 100644 --- a/goldens/Cluster_create_sub-slicing.txt +++ b/goldens/Cluster_create_sub-slicing.txt @@ -39,13 +39,13 @@ kubectl wait deployment/coredns --for=condition=Available=true --namespace=kube- [XPK] Task: `Determine current gke master version` is implemented by the following command not running since it is a dry run. gcloud beta container clusters describe golden-cluster --location us-central1 --project golden-project --format="value(currentMasterVersion)" [XPK] Creating 1 node pool or pools of v6e-4x4 -We assume that the underlying system is: SystemCharacteristics(topology='4x4', vms_per_slice=4, gke_accelerator='tpu-v6e-slice', gce_machine_type='ct6e-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='v6e-16', supports_sub_slicing=True, supports_super_slicing=False, supports_accelerator_network_profile=True, docker_platform=, requires_workload_policy=False, gpu_config=None) +We assume that the underlying system is: SystemCharacteristics(topology='4x4', vms_per_slice=4, gke_accelerator='tpu-v6e-slice', gce_machine_type='ct6e-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='v6e-16', supports_sub_slicing=True, supports_super_slicing=False, supports_accelerator_network_profile=True, docker_platform=, requires_workload_policy=False, gpu_config=None, parallel_containers=1) [XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run. gcloud beta container node-pools list --cluster golden-cluster --project=golden-project --location=us-central1 --format="csv[no-heading](name)" [XPK] Task: `Describe reservation` is implemented by the following command not running since it is a dry run. gcloud beta compute reservations describe golden-reservation --project=golden-project --zone=us-central1-a [XPK] Creating 1 node pool or pools of v6e-16 -Underlyingly, we assume that means: SystemCharacteristics(topology='4x4', vms_per_slice=4, gke_accelerator='tpu-v6e-slice', gce_machine_type='ct6e-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='v6e-16', supports_sub_slicing=True, supports_super_slicing=False, supports_accelerator_network_profile=True, docker_platform=, requires_workload_policy=False, gpu_config=None) +Underlyingly, we assume that means: SystemCharacteristics(topology='4x4', vms_per_slice=4, gke_accelerator='tpu-v6e-slice', gce_machine_type='ct6e-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='v6e-16', supports_sub_slicing=True, supports_super_slicing=False, supports_accelerator_network_profile=True, docker_platform=, requires_workload_policy=False, gpu_config=None, parallel_containers=1) [XPK] Task: `Get Node Pool Zone` is implemented by the following command not running since it is a dry run. gcloud beta container node-pools describe 0 --cluster golden-cluster --project=golden-project --location=us-central1 --format="value(locations)" [XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. diff --git a/goldens/Cluster_create_super-slicing.txt b/goldens/Cluster_create_super-slicing.txt index 39be2466d..4b11c413e 100644 --- a/goldens/Cluster_create_super-slicing.txt +++ b/goldens/Cluster_create_super-slicing.txt @@ -39,13 +39,13 @@ kubectl wait deployment/coredns --for=condition=Available=true --namespace=kube- [XPK] Task: `Determine current gke master version` is implemented by the following command not running since it is a dry run. gcloud beta container clusters describe golden-cluster --location us-central1 --project golden-project --format="value(currentMasterVersion)" [XPK] Creating 5 node pool or pools of tpu7x-4x4x4 -We assume that the underlying system is: SystemCharacteristics(topology='4x4x4', vms_per_slice=16, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-128', supports_sub_slicing=False, supports_super_slicing=True, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=True, gpu_config=None) +We assume that the underlying system is: SystemCharacteristics(topology='4x4x4', vms_per_slice=16, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-128', supports_sub_slicing=False, supports_super_slicing=True, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=True, gpu_config=None, parallel_containers=2) [XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run. gcloud beta container node-pools list --cluster golden-cluster --project=golden-project --location=us-central1 --format="csv[no-heading](name)" [XPK] Task: `Describe reservation` is implemented by the following command not running since it is a dry run. gcloud beta compute reservations describe golden-reservation --project=golden-project --zone=us-central1-a [XPK] Creating 5 node pool or pools of tpu7x-128 -Underlyingly, we assume that means: SystemCharacteristics(topology='4x4x4', vms_per_slice=16, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-128', supports_sub_slicing=False, supports_super_slicing=True, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=True, gpu_config=None) +Underlyingly, we assume that means: SystemCharacteristics(topology='4x4x4', vms_per_slice=16, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-128', supports_sub_slicing=False, supports_super_slicing=True, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=True, gpu_config=None, parallel_containers=2) [XPK] Task: `Get Node Pool Zone` is implemented by the following command not running since it is a dry run. gcloud beta container node-pools describe 0 --cluster golden-cluster --project=golden-project --location=us-central1 --format="value(locations)" [XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. diff --git a/goldens/Cluster_create_with_CPU_and_memory_limits_above_capacity.txt b/goldens/Cluster_create_with_CPU_and_memory_limits_above_capacity.txt index 1b58622a3..f2b56037f 100644 --- a/goldens/Cluster_create_with_CPU_and_memory_limits_above_capacity.txt +++ b/goldens/Cluster_create_with_CPU_and_memory_limits_above_capacity.txt @@ -37,11 +37,11 @@ kubectl wait deployment/coredns --for=condition=Available=true --namespace=kube- [XPK] Task: `Determine current gke master version` is implemented by the following command not running since it is a dry run. gcloud beta container clusters describe golden-cluster --location us-central1 --project golden-project --format="value(currentMasterVersion)" [XPK] Creating 1 node pool or pools of tpu7x-8 -We assume that the underlying system is: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=False, gpu_config=None) +We assume that the underlying system is: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=False, gpu_config=None, parallel_containers=2) [XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run. gcloud beta container node-pools list --cluster golden-cluster --project=golden-project --location=us-central1 --format="csv[no-heading](name)" [XPK] Creating 1 node pool or pools of tpu7x-8 -Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=False, gpu_config=None) +Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=False, gpu_config=None, parallel_containers=2) [XPK] Task: `Get Node Pool Zone` is implemented by the following command not running since it is a dry run. gcloud beta container node-pools describe 0 --cluster golden-cluster --project=golden-project --location=us-central1 --format="value(locations)" [XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. diff --git a/goldens/Cluster_create_with_CPU_and_memory_limits_below_capacity.txt b/goldens/Cluster_create_with_CPU_and_memory_limits_below_capacity.txt index 462d668c2..2eda93c68 100644 --- a/goldens/Cluster_create_with_CPU_and_memory_limits_below_capacity.txt +++ b/goldens/Cluster_create_with_CPU_and_memory_limits_below_capacity.txt @@ -37,11 +37,11 @@ kubectl wait deployment/coredns --for=condition=Available=true --namespace=kube- [XPK] Task: `Determine current gke master version` is implemented by the following command not running since it is a dry run. gcloud beta container clusters describe golden-cluster --location us-central1 --project golden-project --format="value(currentMasterVersion)" [XPK] Creating 1 node pool or pools of tpu7x-8 -We assume that the underlying system is: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=False, gpu_config=None) +We assume that the underlying system is: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=False, gpu_config=None, parallel_containers=2) [XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run. gcloud beta container node-pools list --cluster golden-cluster --project=golden-project --location=us-central1 --format="csv[no-heading](name)" [XPK] Creating 1 node pool or pools of tpu7x-8 -Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=False, gpu_config=None) +Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=False, gpu_config=None, parallel_containers=2) [XPK] Task: `Get Node Pool Zone` is implemented by the following command not running since it is a dry run. gcloud beta container node-pools describe 0 --cluster golden-cluster --project=golden-project --location=us-central1 --format="value(locations)" [XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. diff --git a/goldens/Cluster_create_with_Managed_Lustre_driver.txt b/goldens/Cluster_create_with_Managed_Lustre_driver.txt index 2705e513f..0d554a48a 100644 --- a/goldens/Cluster_create_with_Managed_Lustre_driver.txt +++ b/goldens/Cluster_create_with_Managed_Lustre_driver.txt @@ -42,11 +42,11 @@ gcloud container clusters update golden-cluster --project=golden-project --locat [XPK] Task: `Determine current gke master version` is implemented by the following command not running since it is a dry run. gcloud beta container clusters describe golden-cluster --location us-central1 --project golden-project --format="value(currentMasterVersion)" [XPK] Creating 1 node pool or pools of tpu7x-8 -We assume that the underlying system is: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=False, gpu_config=None) +We assume that the underlying system is: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=False, gpu_config=None, parallel_containers=2) [XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run. gcloud beta container node-pools list --cluster golden-cluster --project=golden-project --location=us-central1 --format="csv[no-heading](name)" [XPK] Creating 1 node pool or pools of tpu7x-8 -Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=False, gpu_config=None) +Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=False, gpu_config=None, parallel_containers=2) [XPK] Task: `Get Node Pool Zone` is implemented by the following command not running since it is a dry run. gcloud beta container node-pools describe 0 --cluster golden-cluster --project=golden-project --location=us-central1 --format="value(locations)" [XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. diff --git a/goldens/Cluster_create_with_Managed_Lustre_driver_and_legacy_port.txt b/goldens/Cluster_create_with_Managed_Lustre_driver_and_legacy_port.txt index 264373630..d53e1d274 100644 --- a/goldens/Cluster_create_with_Managed_Lustre_driver_and_legacy_port.txt +++ b/goldens/Cluster_create_with_Managed_Lustre_driver_and_legacy_port.txt @@ -42,11 +42,11 @@ gcloud container clusters update golden-cluster --project=golden-project --locat [XPK] Task: `Determine current gke master version` is implemented by the following command not running since it is a dry run. gcloud beta container clusters describe golden-cluster --location us-central1 --project golden-project --format="value(currentMasterVersion)" [XPK] Creating 1 node pool or pools of tpu7x-8 -We assume that the underlying system is: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=False, gpu_config=None) +We assume that the underlying system is: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=False, gpu_config=None, parallel_containers=2) [XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run. gcloud beta container node-pools list --cluster golden-cluster --project=golden-project --location=us-central1 --format="csv[no-heading](name)" [XPK] Creating 1 node pool or pools of tpu7x-8 -Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=False, gpu_config=None) +Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=False, gpu_config=None, parallel_containers=2) [XPK] Task: `Get Node Pool Zone` is implemented by the following command not running since it is a dry run. gcloud beta container node-pools describe 0 --cluster golden-cluster --project=golden-project --location=us-central1 --format="value(locations)" [XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. diff --git a/goldens/Cluster_create_with_gb200-4.txt b/goldens/Cluster_create_with_gb200-4.txt index fe84c0ae6..7c2774b6b 100644 --- a/goldens/Cluster_create_with_gb200-4.txt +++ b/goldens/Cluster_create_with_gb200-4.txt @@ -37,13 +37,13 @@ kubectl wait deployment/coredns --for=condition=Available=true --namespace=kube- [XPK] Task: `Determine current gke master version` is implemented by the following command not running since it is a dry run. gcloud beta container clusters describe golden-cluster --location us-central1 --project golden-project --format="value(currentMasterVersion)" [XPK] Creating 1 node pool or pools of gb200-4 -We assume that the underlying system is: SystemCharacteristics(topology='1x72', vms_per_slice=1, gke_accelerator='nvidia-gb200', gce_machine_type='a4x-highgpu-4g', chips_per_vm=4, accelerator_type=GPU, device_type='gb200-4', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=True, docker_platform=, requires_workload_policy=True, gpu_config=GpuConfig(requires_topology=True, gpu_direct_name='rdma', kjob_decorator_fn=, nccl_installer='https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-rdma/nccl-rdma-installer-a4x.yaml', jobset_decorator_fn=)) +We assume that the underlying system is: SystemCharacteristics(topology='1x72', vms_per_slice=1, gke_accelerator='nvidia-gb200', gce_machine_type='a4x-highgpu-4g', chips_per_vm=4, accelerator_type=GPU, device_type='gb200-4', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=True, docker_platform=, requires_workload_policy=True, gpu_config=GpuConfig(requires_topology=True, gpu_direct_name='rdma', kjob_decorator_fn=, nccl_installer='https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-rdma/nccl-rdma-installer-a4x.yaml', jobset_decorator_fn=), parallel_containers=1) [XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run. gcloud beta container node-pools list --cluster golden-cluster --project=golden-project --location=us-central1 --format="csv[no-heading](name)" [XPK] Task: `Describe reservation` is implemented by the following command not running since it is a dry run. gcloud beta compute reservations describe golden-reservation --project=golden-project --zone=us-central1-a [XPK] Creating 1 node pool with 2 nodes of gb200-4 -Underlyingly, we assume that means: SystemCharacteristics(topology='1x72', vms_per_slice=1, gke_accelerator='nvidia-gb200', gce_machine_type='a4x-highgpu-4g', chips_per_vm=4, accelerator_type=GPU, device_type='gb200-4', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=True, docker_platform=, requires_workload_policy=True, gpu_config=GpuConfig(requires_topology=True, gpu_direct_name='rdma', kjob_decorator_fn=, nccl_installer='https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-rdma/nccl-rdma-installer-a4x.yaml', jobset_decorator_fn=)) +Underlyingly, we assume that means: SystemCharacteristics(topology='1x72', vms_per_slice=1, gke_accelerator='nvidia-gb200', gce_machine_type='a4x-highgpu-4g', chips_per_vm=4, accelerator_type=GPU, device_type='gb200-4', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=True, docker_platform=, requires_workload_policy=True, gpu_config=GpuConfig(requires_topology=True, gpu_direct_name='rdma', kjob_decorator_fn=, nccl_installer='https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-rdma/nccl-rdma-installer-a4x.yaml', jobset_decorator_fn=), parallel_containers=1) [XPK] Task: `Get Node Pool Zone` is implemented by the following command not running since it is a dry run. gcloud beta container node-pools describe 0 --cluster golden-cluster --project=golden-project --location=us-central1 --format="value(locations)" [XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. diff --git a/goldens/Cluster_create_with_shared_reservation.txt b/goldens/Cluster_create_with_shared_reservation.txt index 228a9129d..98e667cd0 100644 --- a/goldens/Cluster_create_with_shared_reservation.txt +++ b/goldens/Cluster_create_with_shared_reservation.txt @@ -37,13 +37,13 @@ kubectl wait deployment/coredns --for=condition=Available=true --namespace=kube- [XPK] Task: `Determine current gke master version` is implemented by the following command not running since it is a dry run. gcloud beta container clusters describe golden-cluster --location us-central1 --project golden-project --format="value(currentMasterVersion)" [XPK] Creating 1 node pool or pools of tpu7x-8 -We assume that the underlying system is: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=False, gpu_config=None) +We assume that the underlying system is: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=False, gpu_config=None, parallel_containers=2) [XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run. gcloud beta container node-pools list --cluster golden-cluster --project=golden-project --location=us-central1 --format="csv[no-heading](name)" [XPK] Task: `Describe reservation` is implemented by the following command not running since it is a dry run. gcloud beta compute reservations describe golden-reservation --project=reservation-project --zone=us-central1-a [XPK] Creating 1 node pool or pools of tpu7x-8 -Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=False, gpu_config=None) +Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=False, gpu_config=None, parallel_containers=2) [XPK] Task: `Get Node Pool Zone` is implemented by the following command not running since it is a dry run. gcloud beta container node-pools describe 0 --cluster golden-cluster --project=golden-project --location=us-central1 --format="value(locations)" [XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. diff --git a/goldens/NAP_cluster-create.txt b/goldens/NAP_cluster-create.txt index d4de70768..9a59482af 100644 --- a/goldens/NAP_cluster-create.txt +++ b/goldens/NAP_cluster-create.txt @@ -37,11 +37,11 @@ kubectl wait deployment/coredns --for=condition=Available=true --namespace=kube- [XPK] Task: `Determine current gke master version` is implemented by the following command not running since it is a dry run. gcloud beta container clusters describe golden-cluster --location us-central1 --project golden-project --format="value(currentMasterVersion)" [XPK] Creating 1 node pool or pools of tpu7x-8 -We assume that the underlying system is: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=False, gpu_config=None) +We assume that the underlying system is: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=False, gpu_config=None, parallel_containers=2) [XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run. gcloud beta container node-pools list --cluster golden-cluster --project=golden-project --location=us-central1 --format="csv[no-heading](name)" [XPK] Creating 1 node pool or pools of tpu7x-8 -Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=False, gpu_config=None) +Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=False, gpu_config=None, parallel_containers=2) [XPK] Task: `Get Node Pool Zone` is implemented by the following command not running since it is a dry run. gcloud beta container node-pools describe 0 --cluster golden-cluster --project=golden-project --location=us-central1 --format="value(locations)" [XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. diff --git a/goldens/NAP_cluster-create_with_pathways.txt b/goldens/NAP_cluster-create_with_pathways.txt index 2db423089..4cb54993d 100644 --- a/goldens/NAP_cluster-create_with_pathways.txt +++ b/goldens/NAP_cluster-create_with_pathways.txt @@ -39,11 +39,11 @@ kubectl wait deployment/coredns --for=condition=Available=true --namespace=kube- [XPK] Task: `Determine current gke master version` is implemented by the following command not running since it is a dry run. gcloud beta container clusters describe golden-cluster --location us-central1 --project golden-project --format="value(currentMasterVersion)" [XPK] Creating 1 node pool or pools of tpu7x-8 -We assume that the underlying system is: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=False, gpu_config=None) +We assume that the underlying system is: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=False, gpu_config=None, parallel_containers=2) [XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run. gcloud beta container node-pools list --cluster golden-cluster --project=golden-project --location=us-central1 --format="csv[no-heading](name)" [XPK] Creating 1 node pool or pools of tpu7x-8 -Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=False, gpu_config=None) +Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=False, gpu_config=None, parallel_containers=2) [XPK] Task: `Get Node Pool Zone` is implemented by the following command not running since it is a dry run. gcloud beta container node-pools describe 0 --cluster golden-cluster --project=golden-project --location=us-central1 --format="value(locations)" [XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. diff --git a/goldens/Workload_create.txt b/goldens/Workload_create.txt index 28da01c73..38b771f59 100644 --- a/goldens/Workload_create.txt +++ b/goldens/Workload_create.txt @@ -35,7 +35,7 @@ docker buildx build --platform=linux/amd64 -f 4b6736a12db8ea0f78ce793fd0d4ee0c94 docker tag dry-run-runner gcr.io/golden-project/dry-run-runner:prefix-current [XPK] Task: `Upload Docker Image` is implemented by the following command not running since it is a dry run. docker push gcr.io/golden-project/dry-run-runner:prefix-current -[XPK] Temp file (bd9665007cde96247406beaa13c9f40ba5636f1f75ea7561749c4472989fd14b) content: +[XPK] Temp file (e46ffcf0bb34f5286529bebd5665b1a0032c62bc14f157d691e96c24b354196a) content: apiVersion: jobset.x-k8s.io/v1alpha2 kind: JobSet metadata: @@ -64,6 +64,7 @@ spec: podFailurePolicy: rules: + - action: FailJob onExitCodes: containerName: jax-tpu @@ -93,14 +94,11 @@ spec: dnsPolicy: ClusterFirstWithHostNet terminationGracePeriodSeconds: 30 containers: + - name: jax-tpu image: gcr.io/golden-project/dry-run-runner:prefix-current env: - ports: - - containerPort: 8471 - - containerPort: 8080 - securityContext: privileged: true command: @@ -145,7 +143,7 @@ spec: [XPK] Task: `Creating Workload` is implemented by the following command not running since it is a dry run. -kubectl apply -f bd9665007cde96247406beaa13c9f40ba5636f1f75ea7561749c4472989fd14b +kubectl apply -f e46ffcf0bb34f5286529bebd5665b1a0032c62bc14f157d691e96c24b354196a [XPK] Task: `GKE Dashboard List` is implemented by the following command not running since it is a dry run. gcloud monitoring dashboards list --project=golden-project --filter="displayName:'GKE - TPU Monitoring Dashboard'" --format="value(name)" --verbosity=error [XPK] Check statistics and outlier mode of GKE metrics here: https://console.cloud.google.com/monitoring/dashboards/builder/0?project=golden-project&f.rlabel.cluster_name.ClusterName=golden-cluster. To view the metric data for your workload, select golden-workload from the JobName filter on the dashboard. diff --git a/goldens/Workload_create_pathways.txt b/goldens/Workload_create_pathways.txt index 3e4870660..ec565da5c 100644 --- a/goldens/Workload_create_pathways.txt +++ b/goldens/Workload_create_pathways.txt @@ -37,7 +37,7 @@ docker buildx build --platform=linux/amd64 -f 4b6736a12db8ea0f78ce793fd0d4ee0c94 docker tag dry-run-runner gcr.io/golden-project/dry-run-runner:prefix-current [XPK] Task: `Upload Docker Image` is implemented by the following command not running since it is a dry run. docker push gcr.io/golden-project/dry-run-runner:prefix-current -[XPK] Temp file (6fb0f350cf4e0dccc71f77392d12db3de6371d5148519657046613794358bfce) content: +[XPK] Temp file (321584e701d68faa848df77a0e87ecbec8ce31e2b2aeb0d1e3ddb7027acc5021) content: apiVersion: pathways-job.pathways.domain/v1 kind: PathwaysJob @@ -74,13 +74,11 @@ docker push gcr.io/golden-project/dry-run-runner:prefix-current metadata: spec: containers: + - name: jax-tpu image: gcr.io/golden-project/dry-run-runner:prefix-current imagePullPolicy: Always env: - ports: - - securityContext: privileged: true command: @@ -126,7 +124,7 @@ docker push gcr.io/golden-project/dry-run-runner:prefix-current [XPK] Task: `Creating Workload` is implemented by the following command not running since it is a dry run. -kubectl apply -f 6fb0f350cf4e0dccc71f77392d12db3de6371d5148519657046613794358bfce +kubectl apply -f 321584e701d68faa848df77a0e87ecbec8ce31e2b2aeb0d1e3ddb7027acc5021 [XPK] Task: `GKE Dashboard List` is implemented by the following command not running since it is a dry run. gcloud monitoring dashboards list --project=golden-project --filter="displayName:'GKE - TPU Monitoring Dashboard'" --format="value(name)" --verbosity=error [XPK] Check statistics and outlier mode of GKE metrics here: https://console.cloud.google.com/monitoring/dashboards/builder/0?project=golden-project&f.rlabel.cluster_name.ClusterName=golden-cluster. To view the metric data for your workload, select golden-workload from the JobName filter on the dashboard. diff --git a/goldens/Workload_create_sub-slicing.txt b/goldens/Workload_create_sub-slicing.txt index 6cc927dec..d365aff78 100644 --- a/goldens/Workload_create_sub-slicing.txt +++ b/goldens/Workload_create_sub-slicing.txt @@ -39,7 +39,7 @@ docker buildx build --platform=linux/amd64 -f 4b6736a12db8ea0f78ce793fd0d4ee0c94 docker tag dry-run-runner gcr.io/golden-project/dry-run-runner:prefix-current [XPK] Task: `Upload Docker Image` is implemented by the following command not running since it is a dry run. docker push gcr.io/golden-project/dry-run-runner:prefix-current -[XPK] Temp file (816caa6f0023876e99f55515fda46cdbb648ff2c609f9d2af7c2354079d0e582) content: +[XPK] Temp file (8bca9f6c310db07ea9ab2461d54b0362f2bf60f95217c6be275f43d08ea37a4e) content: apiVersion: jobset.x-k8s.io/v1alpha2 kind: JobSet metadata: @@ -68,6 +68,7 @@ spec: podFailurePolicy: rules: + - action: FailJob onExitCodes: containerName: jax-tpu @@ -98,14 +99,11 @@ spec: dnsPolicy: ClusterFirstWithHostNet terminationGracePeriodSeconds: 30 containers: + - name: jax-tpu image: gcr.io/golden-project/dry-run-runner:prefix-current env: - ports: - - containerPort: 8471 - - containerPort: 8080 - securityContext: privileged: true command: @@ -150,7 +148,7 @@ spec: [XPK] Task: `Creating Workload` is implemented by the following command not running since it is a dry run. -kubectl apply -f 816caa6f0023876e99f55515fda46cdbb648ff2c609f9d2af7c2354079d0e582 +kubectl apply -f 8bca9f6c310db07ea9ab2461d54b0362f2bf60f95217c6be275f43d08ea37a4e [XPK] Task: `GKE Dashboard List` is implemented by the following command not running since it is a dry run. gcloud monitoring dashboards list --project=golden-project --filter="displayName:'GKE - TPU Monitoring Dashboard'" --format="value(name)" --verbosity=error [XPK] Check statistics and outlier mode of GKE metrics here: https://console.cloud.google.com/monitoring/dashboards/builder/0?project=golden-project&f.rlabel.cluster_name.ClusterName=golden-cluster. To view the metric data for your workload, select golden-workload from the JobName filter on the dashboard. diff --git a/goldens/Workload_create_super-slicing.txt b/goldens/Workload_create_super-slicing.txt index 990d930b0..fc6faff26 100644 --- a/goldens/Workload_create_super-slicing.txt +++ b/goldens/Workload_create_super-slicing.txt @@ -39,7 +39,7 @@ docker buildx build --platform=linux/amd64 -f 4b6736a12db8ea0f78ce793fd0d4ee0c94 docker tag dry-run-runner gcr.io/golden-project/dry-run-runner:prefix-current [XPK] Task: `Upload Docker Image` is implemented by the following command not running since it is a dry run. docker push gcr.io/golden-project/dry-run-runner:prefix-current -[XPK] Temp file (91fb78adbd49f9e2d6c2fec62dc461e724e79d4189af07b99e8f731bf8e2e11d) content: +[XPK] Temp file (c713ef9f055661f174f4a3433922c3d1a9df9d07ec9b483e22bb5abfa28906d6) content: apiVersion: jobset.x-k8s.io/v1alpha2 kind: JobSet metadata: @@ -68,9 +68,15 @@ spec: podFailurePolicy: rules: + + - action: FailJob + onExitCodes: + containerName: jax-tpu-1 + operator: NotIn + values: [42,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255] - action: FailJob onExitCodes: - containerName: jax-tpu + containerName: jax-tpu-2 operator: NotIn values: [42,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255] template: @@ -97,14 +103,46 @@ spec: dnsPolicy: ClusterFirstWithHostNet terminationGracePeriodSeconds: 30 containers: - - name: jax-tpu + + - name: jax-tpu-1 image: gcr.io/golden-project/dry-run-runner:prefix-current env: - ports: - - containerPort: 8471 - - containerPort: 8080 + securityContext: + privileged: true + command: + - bash + - -c + - | + echo XPK Start: $(date); + _sigterm() (kill -SIGTERM $! 2>/dev/null;); + trap _sigterm SIGTERM; + + (bash hello) & PID=$!; + while kill -0 $PID 2>/dev/null; + do sleep 5; + done; + wait $PID; + EXIT_CODE=$?; + + echo XPK End: $(date); + echo EXIT_CODE=$EXIT_CODE; + + + exit $EXIT_CODE + resources: + limits: + google.com/tpu: 2 + + volumeMounts: + - mountPath: /dev/shm + name: dshm-2 + + - name: jax-tpu-2 + image: gcr.io/golden-project/dry-run-runner:prefix-current + + env: securityContext: privileged: true command: @@ -129,7 +167,7 @@ spec: exit $EXIT_CODE resources: limits: - google.com/tpu: 4 + google.com/tpu: 2 volumeMounts: - mountPath: /dev/shm @@ -149,7 +187,7 @@ spec: [XPK] Task: `Creating Workload` is implemented by the following command not running since it is a dry run. -kubectl apply -f 91fb78adbd49f9e2d6c2fec62dc461e724e79d4189af07b99e8f731bf8e2e11d +kubectl apply -f c713ef9f055661f174f4a3433922c3d1a9df9d07ec9b483e22bb5abfa28906d6 [XPK] Task: `GKE Dashboard List` is implemented by the following command not running since it is a dry run. gcloud monitoring dashboards list --project=golden-project --filter="displayName:'GKE - TPU Monitoring Dashboard'" --format="value(name)" --verbosity=error [XPK] Check statistics and outlier mode of GKE metrics here: https://console.cloud.google.com/monitoring/dashboards/builder/0?project=golden-project&f.rlabel.cluster_name.ClusterName=golden-cluster. To view the metric data for your workload, select golden-workload from the JobName filter on the dashboard. diff --git a/goldens/Workload_create_with_output-manifest-file.txt b/goldens/Workload_create_with_output-manifest-file.txt index 9c0ffb4c3..9dbf42eb4 100644 --- a/goldens/Workload_create_with_output-manifest-file.txt +++ b/goldens/Workload_create_with_output-manifest-file.txt @@ -36,7 +36,7 @@ docker tag dry-run-runner gcr.io/golden-project/dry-run-runner:prefix-current [XPK] Task: `Upload Docker Image` is implemented by the following command not running since it is a dry run. docker push gcr.io/golden-project/dry-run-runner:prefix-current [XPK] Workload golden-workload manifest written to /var/tmp/manifest.yaml -[XPK] Temp file (bd9665007cde96247406beaa13c9f40ba5636f1f75ea7561749c4472989fd14b) content: +[XPK] Temp file (e46ffcf0bb34f5286529bebd5665b1a0032c62bc14f157d691e96c24b354196a) content: apiVersion: jobset.x-k8s.io/v1alpha2 kind: JobSet metadata: @@ -65,6 +65,7 @@ spec: podFailurePolicy: rules: + - action: FailJob onExitCodes: containerName: jax-tpu @@ -94,14 +95,11 @@ spec: dnsPolicy: ClusterFirstWithHostNet terminationGracePeriodSeconds: 30 containers: + - name: jax-tpu image: gcr.io/golden-project/dry-run-runner:prefix-current env: - ports: - - containerPort: 8471 - - containerPort: 8080 - securityContext: privileged: true command: @@ -146,7 +144,7 @@ spec: [XPK] Task: `Creating Workload` is implemented by the following command not running since it is a dry run. -kubectl apply -f bd9665007cde96247406beaa13c9f40ba5636f1f75ea7561749c4472989fd14b +kubectl apply -f e46ffcf0bb34f5286529bebd5665b1a0032c62bc14f157d691e96c24b354196a [XPK] Task: `GKE Dashboard List` is implemented by the following command not running since it is a dry run. gcloud monitoring dashboards list --project=golden-project --filter="displayName:'GKE - TPU Monitoring Dashboard'" --format="value(name)" --verbosity=error [XPK] Check statistics and outlier mode of GKE metrics here: https://console.cloud.google.com/monitoring/dashboards/builder/0?project=golden-project&f.rlabel.cluster_name.ClusterName=golden-cluster. To view the metric data for your workload, select golden-workload from the JobName filter on the dashboard. diff --git a/src/xpk/commands/workload.py b/src/xpk/commands/workload.py index 58110781f..b2f5ad511 100644 --- a/src/xpk/commands/workload.py +++ b/src/xpk/commands/workload.py @@ -489,12 +489,20 @@ def workload_create(args) -> None: - PodFailurePolicy""" restart_on_exit_codes_list = get_restart_exit_codes(args) restart_on_exit_codes = ','.join(map(str, restart_on_exit_codes_list)) - pod_failure_policy = f""" + + pod_failure_policy = """ podFailurePolicy: rules: + """ + docker_image = get_main_container_docker_image(args, workload_system) + for i in range(workload_system.parallel_containers): + docker_image_sufix = ( + f'-{i + 1}' if workload_system.parallel_containers > 1 else '' + ) + pod_failure_policy += f""" - action: FailJob onExitCodes: - containerName: {get_main_container_docker_image(args, workload_system)} + containerName: {docker_image}{docker_image_sufix} operator: NotIn values: [{restart_on_exit_codes}]""" diff --git a/src/xpk/commands/workload_test.py b/src/xpk/commands/workload_test.py index 899f86a63..bf397d76b 100644 --- a/src/xpk/commands/workload_test.py +++ b/src/xpk/commands/workload_test.py @@ -23,6 +23,7 @@ from ..core.system_characteristics import DockerPlatform, SystemCharacteristics, AcceleratorType, UserFacingNameToSystemCharacteristics, GpuConfig from .workload import workload_create from .cluster_test import construct_args +from ..core.docker_container import get_user_workload_container as real_get_user_workload_container SYSTEM_CHARACTERISTICS = SystemCharacteristics( @@ -206,3 +207,69 @@ def test_workload_create_dry_run_with_output_file(mocker): written_content = mock_open.return_value.write.call_args[0][0] assert 'test-workload' in written_content assert 'cloud.google.com/gke-tpu-topology: 8x8' in written_content + + +def test_workload_create_multi_container_for_tpu7x( + workload_create_mocks: _WorkloadCreateMocks, + mocker, +): + """Tests that the generated YAML for a multi-container workload has correct pod failure policy and container structure.""" + + # Enable dry_run to prevent external calls like get_storages_to_mount -> gcloud + mocker.patch('xpk.utils.execution_context.dry_run', True) + + # Mock dependencies required by get_user_workload_container -> get_main_container + mocker.patch( + 'xpk.core.docker_container.setup_docker_image', + return_value=(0, 'dummy-image'), + ) + mocker.patch( + 'xpk.core.docker_container.get_gke_debugging_dashboard', return_value=None + ) + + # Use the real get_user_workload_container to test integration + workload_create_mocks.get_user_workload_container.side_effect = ( + real_get_user_workload_container + ) + + args = construct_args( + workload='test-workload', + command='echo hello', + num_nodes=1, + tpu_type='tpu7x-2x2x2', + restart_on_exit_codes=None, + docker_name='test-docker', + deploy_stacktrace_sidecar=False, + enable_debug_logs=False, + scheduler='default-scheduler', + ) + workload_create(args) + + assert workload_create_mocks.write_tmp_file.called + yaml_content = workload_create_mocks.write_tmp_file.call_args[0][0] + jobset = yaml.safe_load(yaml_content) + + # Verify Pod Failure Policy + pod_failure_rules = jobset['spec']['replicatedJobs'][0]['template']['spec'][ + 'podFailurePolicy' + ]['rules'] + # Should have 2 rules for multi_container + assert len(pod_failure_rules) == 2 + assert pod_failure_rules[0]['onExitCodes']['containerName'].endswith('-1') + assert pod_failure_rules[1]['onExitCodes']['containerName'].endswith('-2') + + # Verify Containers + # Navigate to the containers list in the YAML + containers = jobset['spec']['replicatedJobs'][0]['template']['spec'][ + 'template' + ]['spec']['containers'] + + assert len(containers) == 2 + assert containers[0]['name'].endswith('-1') + assert containers[1]['name'].endswith('-2') + assert containers[0]['image'] == 'dummy-image' + assert containers[1]['image'] == 'dummy-image' + + # Check if resources are split correctly (4 chips / 2 containers = 2 chips) + assert containers[0]['resources']['limits']['google.com/tpu'] == 2 + assert containers[1]['resources']['limits']['google.com/tpu'] == 2 diff --git a/src/xpk/core/docker_container.py b/src/xpk/core/docker_container.py index 1708a99da..422d72c13 100644 --- a/src/xpk/core/docker_container.py +++ b/src/xpk/core/docker_container.py @@ -17,9 +17,7 @@ from ..utils.console import xpk_exit, xpk_print from .docker_image import setup_docker_image from .docker_resources import ( - add_container_ports, add_image_pull_policy_for_pw_or_gpu, - add_jax_coordinator_port, get_env_container, get_main_container_resources, get_volume_mounts, @@ -112,13 +110,12 @@ def get_main_container(args, system, docker_image, resource_type) -> str: 'touch /shared-volume/stacktrace_signal; ' ) - yaml = """- name: {docker_name} + containers = [] + container_yaml = """ + - name: {docker_name} image: {docker_image} {image_pull_policy} env: {env} - ports: - {container_ports} - {jax_coordinator_port} securityContext: privileged: true command: @@ -145,29 +142,39 @@ def get_main_container(args, system, docker_image, resource_type) -> str: limits: {resources} """ + docker_name = get_main_container_docker_image(args, system) volume_mounts = get_volume_mounts(args, system) if volume_mounts != '': - yaml += """ + container_yaml += """ volumeMounts: {volume_mounts} """ - return yaml.format( - args=args, - system=system, - image_pull_policy=add_image_pull_policy_for_pw_or_gpu(args, system), - env=get_env_container(args, system), - container_ports=add_container_ports(args, system), - jax_coordinator_port=add_jax_coordinator_port(system), - docker_name=get_main_container_docker_image(args, system), - docker_image=docker_image, - gsutil_test_command=gsutil_test_command, - command=command, - tpu_stacktrace_terminate_command=tpu_stacktrace_terminate_command, - gpu_workload_terminate_command=gpu_workload_terminate_command, - xpk_internal_commands=xpk_internal_commands, - resources=get_main_container_resources(args, system, resource_type), - volume_mounts=volume_mounts, - ) + # pathways job running on 2 parallel containers is not verified yet + if args.use_pathways: + system.parallel_containers = 1 + + env = get_env_container(args, system) + image_pull_policy = add_image_pull_policy_for_pw_or_gpu(args, system) + for i in range(system.parallel_containers): + docker_name_sufix = f'-{i + 1}' if system.parallel_containers > 1 else '' + containers.append( + container_yaml.format( + args=args, + system=system, + image_pull_policy=image_pull_policy, + env=env, + docker_name=f'{docker_name}{docker_name_sufix}', + docker_image=docker_image, + gsutil_test_command=gsutil_test_command, + command=command, + tpu_stacktrace_terminate_command=tpu_stacktrace_terminate_command, + gpu_workload_terminate_command=gpu_workload_terminate_command, + xpk_internal_commands=xpk_internal_commands, + resources=get_main_container_resources(args, system, resource_type), + volume_mounts=volume_mounts, + ) + ) + return ''.join(containers) def get_user_workload_container(args, system: SystemCharacteristics): diff --git a/src/xpk/core/docker_resources.py b/src/xpk/core/docker_resources.py index 031d74fc5..4167fc07a 100644 --- a/src/xpk/core/docker_resources.py +++ b/src/xpk/core/docker_resources.py @@ -53,7 +53,10 @@ def get_main_container_resources( offset_vCPUs = int(system.chips_per_vm) * 0.95 return f'{resource_type}: {offset_vCPUs}' - return f'{resource_type}: {system.chips_per_vm}' + return ( + f'{resource_type}:' + f' {int(system.chips_per_vm / system.parallel_containers)}' + ) def get_env_container(args, system: SystemCharacteristics) -> str: diff --git a/src/xpk/core/system_characteristics.py b/src/xpk/core/system_characteristics.py index d33bb63b8..736073f02 100644 --- a/src/xpk/core/system_characteristics.py +++ b/src/xpk/core/system_characteristics.py @@ -140,6 +140,8 @@ class SystemCharacteristics: supports_super_slicing: Whether the Super-slicing feature is supported. requires_workload_policy: A boolean indicating if a GCE resource workload policy is required. This is automatically set to True for GPUs. + parallel_containers: The number of containers running on a single VM. + """ topology: str @@ -155,6 +157,7 @@ class SystemCharacteristics: docker_platform: DockerPlatform requires_workload_policy: bool = False gpu_config: Optional[GpuConfig] = None + parallel_containers: int = 1 def __post_init__(self): if self.accelerator_type == AcceleratorType.GPU: @@ -248,6 +251,7 @@ def get_tpu_system_characteristics_map( default_topologies: set[str] | None = None, sub_slicing_topologies: set[str] | None = None, super_slicing_topologies: set[str] | None = None, + parallel_containers: int = 1, ) -> dict[str, SystemCharacteristics]: system_characteristics_map = {} default_topologies = default_topologies or set() @@ -272,6 +276,7 @@ def get_tpu_system_characteristics_map( supports_super_slicing=topology in super_slicing_topologies, supports_accelerator_network_profile=supports_accelerator_network_profile, docker_platform=docker_platform, + parallel_containers=parallel_containers, ) system_characteristics_map[f'{prefix}-{topology}'] = system if ( @@ -559,6 +564,7 @@ def compute_vms_per_slice(topology: str) -> int: tpu_type_requires_workload_policy=True, supports_accelerator_network_profile=False, docker_platform=AMD_PLATFORM, + parallel_containers=2, supported_topologies=generate_tpu_topologies(max_cubes=144), super_slicing_topologies=set(['4x4x4']), default_topologies=set([