Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
KUEUE_REPO=https://github.com/kubernetes-sigs/kueue.git

KUBECTL_VERSION := $(shell curl -L -s https://dl.k8s.io/release/stable.txt)
KUEUE_VERSION=v0.12.2
KUEUE_VERSION=v0.14.2
KJOB_VERSION=v0.1.0

OS := $(shell uname -s | tr A-Z a-z)
Expand Down
8 changes: 4 additions & 4 deletions goldens/Basic_cluster_create.txt
Original file line number Diff line number Diff line change
Expand Up @@ -74,10 +74,10 @@ kubectl apply --server-side -f https://github.com/google/pathways-job/releases/d
[XPK] Enabling Kueue on the cluster
[XPK] Task: `Get kueue version on server` is implemented by the following command not running since it is a dry run.
kubectl get deployment kueue-controller-manager -n kueue-system -o jsonpath='{.spec.template.spec.containers[0].image}'
[XPK] Installing Kueue version v0.12.2...
[XPK] Try 1: Install Kueue
[XPK] Task: `Install Kueue` is implemented by the following command not running since it is a dry run.
kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/kueue/releases/download/v0.12.2/manifests.yaml
[XPK] Installing Kueue version v0.14.2...
[XPK] Try 1: Installing Kueue v0.14.2
[XPK] Task: `Installing Kueue v0.14.2` is implemented by the following command not running since it is a dry run.
kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/kueue/releases/download/v0.14.2/manifests.yaml
[XPK] Task: `Wait for Kueue to be available` is implemented by the following command not running since it is a dry run.
kubectl wait deploy/kueue-controller-manager -nkueue-system --for=condition=available --timeout=10m
[XPK] Applying following Kueue resources:
Expand Down
8 changes: 4 additions & 4 deletions goldens/Cluster_create_private.txt
Original file line number Diff line number Diff line change
Expand Up @@ -79,10 +79,10 @@ kubectl apply --server-side -f https://github.com/google/pathways-job/releases/d
[XPK] Enabling Kueue on the cluster
[XPK] Task: `Get kueue version on server` is implemented by the following command not running since it is a dry run.
kubectl get deployment kueue-controller-manager -n kueue-system -o jsonpath='{.spec.template.spec.containers[0].image}'
[XPK] Installing Kueue version v0.12.2...
[XPK] Try 1: Install Kueue
[XPK] Task: `Install Kueue` is implemented by the following command not running since it is a dry run.
kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/kueue/releases/download/v0.12.2/manifests.yaml
[XPK] Installing Kueue version v0.14.2...
[XPK] Try 1: Installing Kueue v0.14.2
[XPK] Task: `Installing Kueue v0.14.2` is implemented by the following command not running since it is a dry run.
kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/kueue/releases/download/v0.14.2/manifests.yaml
[XPK] Task: `Wait for Kueue to be available` is implemented by the following command not running since it is a dry run.
kubectl wait deploy/kueue-controller-manager -nkueue-system --for=condition=available --timeout=10m
[XPK] Applying following Kueue resources:
Expand Down
8 changes: 4 additions & 4 deletions goldens/Cluster_create_with_gb200-4.txt
Original file line number Diff line number Diff line change
Expand Up @@ -78,10 +78,10 @@ kubectl apply --server-side -f https://github.com/google/pathways-job/releases/d
[XPK] Enabling Kueue on the cluster
[XPK] Task: `Get kueue version on server` is implemented by the following command not running since it is a dry run.
kubectl get deployment kueue-controller-manager -n kueue-system -o jsonpath='{.spec.template.spec.containers[0].image}'
[XPK] Installing Kueue version v0.12.2...
[XPK] Try 1: Install Kueue
[XPK] Task: `Install Kueue` is implemented by the following command not running since it is a dry run.
kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/kueue/releases/download/v0.12.2/manifests.yaml
[XPK] Installing Kueue version v0.14.2...
[XPK] Try 1: Installing Kueue v0.14.2
[XPK] Task: `Installing Kueue v0.14.2` is implemented by the following command not running since it is a dry run.
kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/kueue/releases/download/v0.14.2/manifests.yaml
[XPK] Task: `Wait for Kueue to be available` is implemented by the following command not running since it is a dry run.
kubectl wait deploy/kueue-controller-manager -nkueue-system --for=condition=available --timeout=10m
[XPK] Applying following Kueue resources:
Expand Down
8 changes: 4 additions & 4 deletions goldens/NAP_cluster-create.txt
Original file line number Diff line number Diff line change
Expand Up @@ -85,10 +85,10 @@ kubectl apply --server-side -f https://github.com/google/pathways-job/releases/d
[XPK] Enabling Kueue on the cluster
[XPK] Task: `Get kueue version on server` is implemented by the following command not running since it is a dry run.
kubectl get deployment kueue-controller-manager -n kueue-system -o jsonpath='{.spec.template.spec.containers[0].image}'
[XPK] Installing Kueue version v0.12.2...
[XPK] Try 1: Install Kueue
[XPK] Task: `Install Kueue` is implemented by the following command not running since it is a dry run.
kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/kueue/releases/download/v0.12.2/manifests.yaml
[XPK] Installing Kueue version v0.14.2...
[XPK] Try 1: Installing Kueue v0.14.2
[XPK] Task: `Installing Kueue v0.14.2` is implemented by the following command not running since it is a dry run.
kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/kueue/releases/download/v0.14.2/manifests.yaml
[XPK] Task: `Wait for Kueue to be available` is implemented by the following command not running since it is a dry run.
kubectl wait deploy/kueue-controller-manager -nkueue-system --for=condition=available --timeout=10m
[XPK] Applying following Kueue resources:
Expand Down
8 changes: 4 additions & 4 deletions goldens/NAP_cluster-create_with_pathways.txt
Original file line number Diff line number Diff line change
Expand Up @@ -86,10 +86,10 @@ kubectl apply --server-side -f https://github.com/google/pathways-job/releases/d
[XPK] Enabling Kueue on the cluster
[XPK] Task: `Get kueue version on server` is implemented by the following command not running since it is a dry run.
kubectl get deployment kueue-controller-manager -n kueue-system -o jsonpath='{.spec.template.spec.containers[0].image}'
[XPK] Installing Kueue version v0.12.2...
[XPK] Try 1: Install Kueue
[XPK] Task: `Install Kueue` is implemented by the following command not running since it is a dry run.
kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/kueue/releases/download/v0.12.2/manifests.yaml
[XPK] Installing Kueue version v0.14.2...
[XPK] Try 1: Installing Kueue v0.14.2
[XPK] Task: `Installing Kueue v0.14.2` is implemented by the following command not running since it is a dry run.
kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/kueue/releases/download/v0.14.2/manifests.yaml
[XPK] Task: `Wait for Kueue to be available` is implemented by the following command not running since it is a dry run.
kubectl wait deploy/kueue-controller-manager -nkueue-system --for=condition=available --timeout=10m
[XPK] Applying following Kueue resources:
Expand Down
28 changes: 11 additions & 17 deletions src/xpk/core/kueue_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,11 @@
from typing import Optional, List, Dict, Any
import json
from jinja2 import Environment, FileSystemLoader

from .kueue_migrations import install_kueue_manifest_upgrading
from ..utils.execution_context import is_dry_run
from ..utils.kueue import is_queued_cluster
from packaging.version import Version

from .capacity import B200_DEVICE_TYPE, H100_MEGA_DEVICE_TYPE, H200_DEVICE_TYPE
from .scheduling import (
Expand All @@ -40,7 +43,7 @@
from ..utils.file import write_tmp_file
from ..utils.console import xpk_print, xpk_exit
from ..utils.templates import TEMPLATE_PATH, get_templates_absolute_path
from packaging.version import Version


WAIT_FOR_KUEUE_TIMEOUT = "10m"
CLUSTER_QUEUE_NAME = "cluster-queue"
Expand All @@ -52,7 +55,7 @@
KUEUE_SUB_SLICING_TOPOLOGY_JINJA_FILE = "kueue_sub_slicing_topology.yaml.j2"
MEMORY_SIZE_PER_VM = 1.2
MIN_MEMORY_LIMIT_SIZE = 4096
KUEUE_VERSION = Version("v0.12.2")
KUEUE_VERSION = Version("v0.14.2")


@dataclass
Expand Down Expand Up @@ -116,7 +119,7 @@ def install_or_upgrade(
else:
xpk_print(f"Installing Kueue version v{self.kueue_version}...")

install_return_code = self.__install(tolerations)
install_return_code = self.__install(installed_version, tolerations)
if install_return_code != 0:
return install_return_code

Expand All @@ -142,6 +145,7 @@ def get_installed_kueue_version(self) -> tuple[int, Version | None]:

def __install(
self,
current_kueue_version: Version | None,
tolerations: Optional[List[Dict[str, Any]]] = None,
) -> int:
"""
Expand All @@ -150,7 +154,10 @@ def __install(
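Args:
current_kueue_version: The Kueue version currently installed, or None; forwarded as from_version to install_kueue_manifest_upgrading.
tolerations: An optional list of tolerations to apply to the kueue-controller-manager.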
"""
return_code = self.__install_kueue_crs()
return_code = install_kueue_manifest_upgrading(
from_version=current_kueue_version,
to_version=self.kueue_version,
)
if return_code != 0:
return return_code

Expand All @@ -161,19 +168,6 @@ def __install(

return self.__wait_for_kueue_available()

def __install_kueue_crs(self) -> int:
manifest_url = f"https://github.com/kubernetes-sigs/kueue/releases/download/v{self.kueue_version}/manifests.yaml"
install_command = (
f"kubectl apply --server-side --force-conflicts -f {manifest_url}"
)
task = "Installing Kueue Custom Resources"
return_code = run_command_with_updates_retry(
install_command, "Install Kueue"
)
if return_code != 0:
xpk_print(f"{task} returned ERROR {return_code}")
return return_code

def __patch_tolerations(self, tolerations: List[Dict[str, Any]]) -> int:
patch = {"spec": {"template": {"spec": {"tolerations": tolerations}}}}
patch_str = json.dumps(patch)
Expand Down
6 changes: 3 additions & 3 deletions src/xpk/core/kueue_manager_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ def test_install_or_upgrade_when_newer_version_already_installed(
self, mock_configure, mock_install, mock_get_version
):
"""Test install_or_upgrade when Kueue is already up to date."""
mock_get_version.return_value = (0, Version("v0.12.3"))
mock_get_version.return_value = (0, Version("v0.99.9"))
kueue_config = MagicMock(spec=KueueConfig)

result = self.kueue_manager.install_or_upgrade(kueue_config)
Expand Down Expand Up @@ -154,7 +154,7 @@ def test_installation_with_tolerations(self):
return_value=(1, None),
),
patch(
"xpk.core.kueue_manager.KueueManager._KueueManager__install_kueue_crs",
"xpk.core.kueue_manager.install_kueue_manifest_upgrading",
return_value=0,
),
patch(
Expand Down Expand Up @@ -198,7 +198,7 @@ def test_installation_without_tolerations(self):
return_value=(1, None),
),
patch(
"xpk.core.kueue_manager.KueueManager._KueueManager__install_kueue_crs",
"xpk.core.kueue_manager.install_kueue_manifest_upgrading",
return_value=0,
),
patch(
Expand Down
Loading
Loading