Skip to content
Open
75 changes: 37 additions & 38 deletions src/xpk/core/docker_image.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import datetime
import os
import random
import re
import string

from ..utils.console import xpk_exit, xpk_print
Expand All @@ -26,6 +27,11 @@
DEFAULT_DOCKER_IMAGE = 'python:3.10'
DEFAULT_SCRIPT_DIR = os.getcwd()
PLATFORM = 'linux/amd64'
CLOUD_PREFIXES = [
r'^gcr\.io',
r'^docker\.pkg\.dev',
r'^([a-z0-9-]+)-docker\.pkg\.dev',
]


def validate_docker_image(docker_image, args) -> int:
Expand Down Expand Up @@ -161,50 +167,43 @@ def setup_docker_image(args) -> tuple[int, str]:
0 if successful and 1 otherwise.
Name of the docker image to use.
"""
use_base_docker_image = use_base_docker_image_or_docker_image(args)

docker_image = args.base_docker_image
if use_base_docker_image:
validate_docker_image_code = validate_docker_image(docker_image, args)
if validate_docker_image_code != 0:
xpk_exit(validate_docker_image_code)
build_docker_image_code, docker_image = build_docker_image_from_base_image(
args
)
if build_docker_image_code != 0:
xpk_exit(build_docker_image_code)
else:
docker_image = args.docker_image
validate_docker_image_code = validate_docker_image(args.docker_image, args)
if validate_docker_image_code != 0:
xpk_exit(validate_docker_image_code)

return 0, docker_image
docker_image = args.docker_image

if not docker_image and args.base_docker_image:
docker_image = args.base_docker_image # fallback for legacy users

def use_base_docker_image_or_docker_image(args) -> bool:
"""Checks for correct docker image arguments.
if not docker_image:
xpk_print(
f'No docker image specified, using default: {DEFAULT_DOCKER_IMAGE}'
)
docker_image = DEFAULT_DOCKER_IMAGE

Args:
args: user provided arguments for running the command.
is_cloud_image = any(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do we need distinction between cloud and local images? What if I push my local ubuntu image to gcr and want to use it as a local image? It will be treated as a cloud image.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You are right. It should be based on the flag --script-dir
I will update it and verify again
@SujeethJinesh

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is updated to check --script-dir and is_cloud_image to determine if building a new image is needed.
I attached the commands and logs below

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do we need this is_cloud_image check at all? IMO this is completely wrong, as if I as a researcher store my image outside of GCP I might still want to build scripts right?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The original purpose of is_cloud_image is to check if it is a local image, if so, we need to push the local image to cloud for pods to pull.

  if (
      args.script_dir and args.script_dir != DEFAULT_SCRIPT_DIR
  ) or not is_cloud_image:
    validate_code = validate_docker_image(docker_image, args)
    if validate_code != 0:
      xpk_exit(validate_code)
    build_code, docker_image = build_docker_image_from_base_image(args)
    if build_code != 0:
      xpk_exit(build_code)

And you are right, it will force the user to build and push the image if the image is outside of GCP.
I will update it. Thanks for the review!

Copy link
Collaborator Author

@ycchenzheng ycchenzheng Sep 26, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@scaliby I updated the logic and please review it one more time.
Following is the logs:
Local image(maxtext_base_image):
XPK will build and push the image

[XPK] Docker image: maxtext_base_image
[XPK] Local image: True, build and push
[XPK] Building /usr/local/google/home/chzheng/maxtext into docker image.
[XPK] Task: `Building script_dir into docker image` is implemented by `docker buildx build --platform=linux/amd64 -f /tmp/tmplf2ae_b1 -t chzheng-runner /usr/local/google/home/chzheng/maxtext`, streaming output live.
Waiting for `chz-pw-llama3--3-e1g`, for 9 secondsr image`, for 0 seconds...
[+] Building 0.8s (1/2)                                                                                                                                  docker:default
[+] Building 1.7s (6/8)                                                                                                                                  docker:default
[+] Building 2.5s (9/9) FINISHED                                                                                                                         docker:default
 => [internal] load build definition from tmplf2ae_b1                                                                                                              0.0s
 => => transferring dockerfile: 212B                                                                                                                               0.0s
 => [internal] load metadata for docker.io/library/python:3.10                                                                                                     1.3s
 => [internal] load .dockerignore                                                                                                                                  0.0s
 => => transferring context: 45B                                                                                                                                   0.0s
 => [1/4] FROM docker.io/library/python:3.10@sha256:b568817037b2e14fb0d5d0deb972556ac15a915eb85e254bbc66e4eae446aa54                                               0.0s
 => [internal] load build context                                                                                                                                  0.1s
 => => transferring context: 66.74kB                                                                                                                               0.1s
 => CACHED [2/4] WORKDIR /app                                                                                                                                      0.0s
 => [3/4] COPY . .                                                                                                                                                 0.6s
 => [4/4] WORKDIR /app                                                                                                                                             0.0s
 => exporting to image                                                                                                                                             0.5s
 => => exporting layers                                                                                                                                            0.5s
 => => writing image sha256:caf3e50099b114251c19a2481570758515b5051fec14654144b9ff865c500ed6                                                                       0.0s
 => => naming to docker.io/library/chzheng-runner                                                                                                                  0.0s
[XPK] Task: `Building script_dir into docker image` terminated with code `0`
[XPK] Adding Docker Image: gcr.io/cloud-tpu-multipod-dev/chzheng-runner:adsc-2025-09-26-20-04-54 to cloud-tpu-multipod-dev
[XPK] Task: `Tag Docker Image` is implemented by `docker tag chzheng-runner gcr.io/cloud-tpu-multipod-dev/chzheng-runner:adsc-2025-09-26-20-04-54`, streaming output live.
Waiting for `chz-pw-llama3--3-e1g`, for 12 secondss...
[XPK] Task: `Tag Docker Image` terminated with code `0`
[XPK] Task: `Upload Docker Image` is implemented by `docker push gcr.io/cloud-tpu-multipod-dev/chzheng-runner:adsc-2025-09-26-20-04-54`, streaming output live.
Waiting for `chz-pw-llama3--3-e1g`, for 13 secondsonds...
The push refers to repository [gcr.io/cloud-tpu-multipod-dev/chzheng-runner]
5f70bf18a086: Preparing 
5f70bf18a086: Layer already exists 
ad25f8cf280d: Pushing [=======>                                           ]  11.03MB/72.57MB
ad25f8cf280d: Pushing [================>                                  ]  24.29MB/72.57MB
ad25f8cf280d: Pushing [=========================>                         ]  37.45MB/72.57MB
ad25f8cf280d: Pushing [=================================>                 ]  48.58MB/72.57MB
ad25f8cf280d: Pushing [========================================>          ]   58.6MB/72.57MB
ad25f8cf280d: Pushing [================================================>  ]   70.3MB/72.57MB
ad25f8cf280d: Pushed 
607ddfe5f3c3: Layer already exists 
185e04da9d94: Layer already exists 
Waiting for `chz-pw-llama3--3-e1g`, for 15 secondsonds...
Waiting for `chz-pw-llama3--3-e1g`, for 16 secondsonds...
Waiting for `chz-pw-llama3--3-e1g`, for 17 secondsonds...
Waiting for `chz-pw-llama3--3-e1g`, for 18 secondsonds...
Waiting for `chz-pw-llama3--3-e1g`, for 19 secondsonds...
Waiting for `chz-pw-llama3--3-e1g`, for 20 secondsonds...
Waiting for `chz-pw-llama3--3-e1g`, for 21 secondsonds...
Waiting for `chz-pw-llama3--3-e1g`, for 22 secondsonds...
adsc-2025-09-26-20-04-54: digest: sha256:dd5b481a3ad529d67fb0d24c0d025293b4df6858383b3b8a0662fb61ff267bb5 size: 2420
[XPK] Task: `Upload Docker Image` terminated with code `0`

Events:
  Type    Reason     Age   From               Message
  ----    ------     ----  ----               -------
  Normal  Scheduled  12s   default-scheduler  Successfully assigned default/chz-pw-llama3--3-e1g-pathways-head-0-0-jq8rh to gke-pw-scale-test-v5e-32-cpu-np-157a9698-zl4g
  Normal  Pulling    12s   kubelet            Pulling image "us-docker.pkg.dev/cloud-tpu-v2-images-dev/pathways/unsanitized_server:latest"
  Normal  Pulled     12s   kubelet            Successfully pulled image "us-docker.pkg.dev/cloud-tpu-v2-images-dev/pathways/unsanitized_server:latest" in 271ms (271ms including waiting). Image size: 189202517 bytes.
  Normal  Created    12s   kubelet            Created container: pathways-rm
  Normal  Started    12s   kubelet            Started container pathways-rm
  Normal  Pulling    11s   kubelet            Pulling image "us-docker.pkg.dev/cloud-tpu-v2-images-dev/pathways/unsanitized_proxy_server:latest"
  Normal  Pulled     11s   kubelet            Successfully pulled image "us-docker.pkg.dev/cloud-tpu-v2-images-dev/pathways/unsanitized_proxy_server:latest" in 315ms (315ms including waiting). Image size: 176032745 bytes.
  Normal  Created    11s   kubelet            Created container: pathways-proxy
  Normal  Started    11s   kubelet            Started container pathways-proxy
  Normal  Pulling    10s   kubelet            Pulling image "gcr.io/cloud-tpu-multipod-dev/chzheng-runner:adsc-2025-09-26-20-04-54"
  Normal  Pulled     10s   kubelet            Successfully pulled image "gcr.io/cloud-tpu-multipod-dev/chzheng-runner:adsc-2025-09-26-20-04-54" in 330ms (330ms including waiting). Image size: 432209311 bytes.
  Normal  Created    10s   kubelet            Created container: jax-tpu
  Normal  Started    10s   kubelet            Started container jax-tpu

Cloud image outside of GCP (python:3.12):
XPK will not build image if no --script-dir, since it is outside of GCP, validate_docker_image will return 0 and output no logs

[XPK] Docker image: python:3.12
Waiting for `chz-pw-llama3--3-7xs`, for 9 seconds
[XPK] Remote image, move forward

Events:
  Type    Reason     Age   From               Message
  ----    ------     ----  ----               -------
  Normal  Scheduled  37s   default-scheduler  Successfully assigned default/chz-pw-llama3--3-7xs-pathways-head-0-0-8vl6r to gke-pw-scale-test-v5e-32-cpu-np-157a9698-zl4g
  Normal  Pulling    38s   kubelet            Pulling image "us-docker.pkg.dev/cloud-tpu-v2-images-dev/pathways/unsanitized_server:latest"
  Normal  Pulled     38s   kubelet            Successfully pulled image "us-docker.pkg.dev/cloud-tpu-v2-images-dev/pathways/unsanitized_server:latest" in 367ms (367ms including waiting). Image size: 189202517 bytes.
  Normal  Created    38s   kubelet            Created container: pathways-rm
  Normal  Started    38s   kubelet            Started container pathways-rm
  Normal  Pulling    37s   kubelet            Pulling image "us-docker.pkg.dev/cloud-tpu-v2-images-dev/pathways/unsanitized_proxy_server:latest"
  Normal  Pulled     36s   kubelet            Successfully pulled image "us-docker.pkg.dev/cloud-tpu-v2-images-dev/pathways/unsanitized_proxy_server:latest" in 247ms (247ms including waiting). Image size: 176032745 bytes.
  Normal  Created    36s   kubelet            Created container: pathways-proxy
  Normal  Started    36s   kubelet            Started container pathways-proxy
  Normal  Pulling    36s   kubelet            Pulling image "python:3.12"
  Normal  Pulled     35s   kubelet            Successfully pulled image "python:3.12" in 225ms (225ms including waiting). Image size: 410344954 bytes.
  Normal  Created    35s   kubelet            Created container: jax-tpu
  Normal  Started    35s   kubelet            Started container jax-tpu

Cloud image(gcr.io/tpu-prod-env-one-vm/chzheng_latest:latest):
XPK will not build image if no --script-dir

[XPK] Docker image: gcr.io/tpu-prod-env-one-vm/chzheng_latest:latest
[XPK] Remote image, move forward
[XPK] Task: `Validate Docker Image` is implemented by `gcloud container images describe gcr.io/tpu-prod-env-one-vm/chzheng_latest:latest --project cloud-tpu-multipod-dev`, hiding output unless there is an error.
Waiting for `chz-pw-llama3--3-866`, for 10 seconds
Waiting for `chz-pw-llama3--3-866`, for 11 seconds
[XPK] Task: `Validate Docker Image` succeeded.

Events:
  Type    Reason     Age   From               Message
  ----    ------     ----  ----               -------
  Normal  Scheduled  119s  default-scheduler  Successfully assigned default/chz-pw-llama3--3-866-pathways-head-0-0-9rqjk to gke-pw-scale-test-v5e-32-cpu-np-157a9698-zl4g
  Normal  Pulling    119s  kubelet            Pulling image "us-docker.pkg.dev/cloud-tpu-v2-images-dev/pathways/unsanitized_server:latest"
  Normal  Pulled     119s  kubelet            Successfully pulled image "us-docker.pkg.dev/cloud-tpu-v2-images-dev/pathways/unsanitized_server:latest" in 357ms (357ms including waiting). Image size: 189202517 bytes.
  Normal  Created    119s  kubelet            Created container: pathways-rm
  Normal  Started    119s  kubelet            Started container pathways-rm
  Normal  Pulling    119s  kubelet            Pulling image "us-docker.pkg.dev/cloud-tpu-v2-images-dev/pathways/unsanitized_proxy_server:latest"
  Normal  Pulled     118s  kubelet            Successfully pulled image "us-docker.pkg.dev/cloud-tpu-v2-images-dev/pathways/unsanitized_proxy_server:latest" in 375ms (375ms including waiting). Image size: 176032745 bytes.
  Normal  Created    118s  kubelet            Created container: pathways-proxy
  Normal  Started    118s  kubelet            Started container pathways-proxy
  Normal  Pulling    118s  kubelet            Pulling image "gcr.io/tpu-prod-env-one-vm/chzheng_latest:latest"
  Normal  Pulled     117s  kubelet            Successfully pulled image "gcr.io/tpu-prod-env-one-vm/chzheng_latest:latest" in 420ms (420ms including waiting). Image size: 1573876003 bytes.
  Normal  Created    117s  kubelet            Created container: jax-tpu
  Normal  Started    117s  kubelet            Started container jax-tpu

re.match(prefix, docker_image) for prefix in CLOUD_PREFIXES
)

Returns:
True if intended to use base docker image, False to use docker image.
"""
use_base_docker_image = True
# Check if (base_docker_image and script_dir) or (docker_image) is set.
if args.docker_image is not None:
if is_cloud_image:
if args.script_dir is not DEFAULT_SCRIPT_DIR:
xpk_print(
'`--script-dir` and --docker-image can not be used together. Please'
' see `--help` command for more details.'
'Error: `--script-dir` cannot be used with a cloud docker'
' image.\nHint: If you need to customize the image with local'
' scripts, use a local base image (e.g., `ubuntu:20.04`) instead of a'
' prebuilt cloud image.'
)
xpk_exit(1)
if args.base_docker_image is not DEFAULT_DOCKER_IMAGE:
xpk_print(
'`--base-docker-image` and --docker-image can not be used together.'
' Please see `--help` command for more details.'
)
xpk_exit(1)
use_base_docker_image = False
return use_base_docker_image

validate_code = validate_docker_image(docker_image, args)
if validate_code != 0:
xpk_exit(validate_code)

else:
validate_code = validate_docker_image(docker_image, args)
if validate_code != 0:
xpk_exit(validate_code)

build_code, docker_image = build_docker_image_from_base_image(args)
if build_code != 0:
xpk_exit(build_code)

return 0, docker_image
Loading