From fc56d14924cb15a8ba7a048dcb7b54407dc9cf1e Mon Sep 17 00:00:00 2001 From: Carlos Eduardo Arango Gutierrez Date: Thu, 24 Apr 2025 14:12:40 +0200 Subject: [PATCH 1/6] [no-relnote] Update E2E test suite Signed-off-by: Carlos Eduardo Arango Gutierrez --- .github/workflows/e2e.yaml | 22 +- .gitignore | 3 +- testdata/job-1.yaml | 25 + tests/e2e/Makefile | 43 +- tests/e2e/common/gpu_job.go | 60 -- tests/e2e/common/kubernetes.go | 216 ----- tests/e2e/common/taints.go | 45 - tests/e2e/device-plugin_test.go | 261 +++--- tests/e2e/e2e_test.go | 868 +++++++++++++++++- tests/e2e/framework.go | 26 - tests/e2e/framework/framework.go | 268 ------ tests/e2e/framework/test_context.go | 57 -- tests/e2e/framework/util.go | 156 ---- tests/e2e/gomega.go | 191 ---- tests/e2e/gpu-feature-discovery_test.go | 238 +++-- tests/e2e/infra/aws.yaml | 17 +- tests/e2e/internal/kube.go | 288 ++++++ .../k8s.io/apimachinery/pkg/util/uuid/uuid.go | 27 - tests/vendor/modules.txt | 1 - 19 files changed, 1436 insertions(+), 1376 deletions(-) create mode 100644 testdata/job-1.yaml delete mode 100644 tests/e2e/common/gpu_job.go delete mode 100644 tests/e2e/common/kubernetes.go delete mode 100644 tests/e2e/common/taints.go delete mode 100644 tests/e2e/framework.go delete mode 100644 tests/e2e/framework/framework.go delete mode 100644 tests/e2e/framework/test_context.go delete mode 100644 tests/e2e/framework/util.go delete mode 100644 tests/e2e/gomega.go create mode 100644 tests/e2e/internal/kube.go delete mode 100644 tests/vendor/k8s.io/apimachinery/pkg/util/uuid/uuid.go diff --git a/.github/workflows/e2e.yaml b/.github/workflows/e2e.yaml index 0c6172318..64159ac70 100644 --- a/.github/workflows/e2e.yaml +++ b/.github/workflows/e2e.yaml @@ -58,11 +58,14 @@ jobs: - name: Run e2e tests env: KUBECONFIG: ${{ github.workspace }}/kubeconfig + HELM_CHART: ${{ github.workspace }}/deployments/helm/nvidia-device-plugin E2E_IMAGE_REPO: ghcr.io/nvidia/k8s-device-plugin E2E_IMAGE_TAG: ${{ inputs.version }} + E2E_IMAGE_PULL_POLICY: Always + NVIDIA_DRIVER_ENABLED: true LOG_ARTIFACTS: ${{ github.workspace }}/e2e_logs run: | - make test-e2e + make -f tests/e2e/Makefile test - name: Archive test logs if: ${{ failure() }} @@ -72,6 +75,13 @@ jobs: path: ./e2e_logs/ retention-days: 15 + - name: Archive Ginkgo logs + uses: actions/upload-artifact@v4 + with: + name: ginkgo-logs + path: ginkgo.json + retention-days: 15 + - name: Send Slack alert notification id: slack if: false @@ -80,8 +90,10 @@ jobs: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} SUMMARY_URL: https://github.com/${{github.repository}}/actions/runs/${{github.run_id}} with: - channel-id: ${{ secrets.SLACK_CHANNEL_ID }} - slack-message: | - :x: On repository ${{ github.repository }} the Workflow *${{ github.workflow }}* has failed. + method: chat.postMessage + token: ${{ secrets.SLACK_BOT_TOKEN }} + payload: | + channel: ${{ secrets.SLACK_CHANNEL_ID }} + text: ":x: On repository ${{ github.repository }} the Workflow *${{ github.workflow }}* has failed. 
- Details: ${{ env.SUMMARY_URL }} + Details: ${{ env.SUMMARY_URL }}" diff --git a/.gitignore b/.gitignore index f0ef8b39a..4c5a9150c 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,7 @@ deployments/helm/gpu-feature-discovery cmd/gpu-feature-discovery/gfd-test-loop e2e_logs - +bin *.out *.log +ginkgo.json diff --git a/testdata/job-1.yaml b/testdata/job-1.yaml new file mode 100644 index 000000000..1ac7c853a --- /dev/null +++ b/testdata/job-1.yaml @@ -0,0 +1,25 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: j-e2e-1 + labels: + app.nvidia.com: k8s-device-plugin-test-app +spec: + template: + metadata: + name: gpu-pod + spec: + restartPolicy: Never + containers: + - name: cuda-container + image: nvcr.io/nvidia/k8s/cuda-sample:nbody-cuda11.7.1-ubuntu18.04 + args: + - "--benchmark" + - "--numbodies=10000" + resources: + limits: + nvidia.com/gpu: "1" + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" diff --git a/tests/e2e/Makefile b/tests/e2e/Makefile index 03a802a64..03c15c1fd 100644 --- a/tests/e2e/Makefile +++ b/tests/e2e/Makefile @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -20,7 +20,7 @@ GO_TEST_TIMEOUT ?= 30m include $(CURDIR)/versions.mk -DRIVER_ENABLED ?= true +NVIDIA_DRIVER_ENABLED ?= true E2E_IMAGE_REPO ?= $(REGISTRY)/$(DRIVER_NAME) E2E_IMAGE_TAG ?= $(VERSION) @@ -28,21 +28,24 @@ E2E_IMAGE_PULL_POLICY ?= IfNotPresent HELM_CHART ?= $(CURDIR)/deployments/helm/nvidia-device-plugin LOG_ARTIFACTS ?= $(CURDIR)/e2e_logs -.PHONY: test -test: - @if [ -z ${KUBECONFIG} ]; then \ - echo "[ERR] KUBECONFIG missing, must be defined"; \ - exit 1; \ - fi - cd $(CURDIR)/tests/e2e && $(GO_CMD) test -timeout $(GO_TEST_TIMEOUT) -v . -args \ - -kubeconfig=$(KUBECONFIG) \ - -driver-enabled=$(DRIVER_ENABLED) \ - -image.repo=$(E2E_IMAGE_REPO) \ - -image.tag=$(E2E_IMAGE_TAG) \ - -image.pull-policy=$(E2E_IMAGE_PULL_POLICY) \ - -log-artifacts=$(LOG_ARTIFACTS) \ - -helm-chart=$(HELM_CHART) \ - -helm-log-file=$(LOG_ARTIFACTS)/helm.log \ - -ginkgo.focus="\[nvidia\]" \ - -test.timeout=1h \ - -ginkgo.v +# Test configuration +GINKGO_COMMON_ARGS := -v --fail-on-pending --randomize-all --trace +GINKGO_REPORT_ARGS := --json-report=$(LOG_ARTIFACTS)/report.json --junit-report=$(LOG_ARTIFACTS)/junit.xml + +.PHONY: ginkgo test clean-artifacts + +ginkgo: + mkdir -p $(CURDIR)/bin + GOBIN=$(CURDIR)/bin go install github.com/onsi/ginkgo/v2/ginkgo@latest + +# Create artifacts directory +$(LOG_ARTIFACTS): + mkdir -p $(LOG_ARTIFACTS) + +# Clean artifacts +clean-artifacts: + rm -rf $(LOG_ARTIFACTS) + +# Run tests +test: ginkgo $(LOG_ARTIFACTS) + $(CURDIR)/bin/ginkgo $(GINKGO_COMMON_ARGS) $(GINKGO_REPORT_ARGS) $(GINKGO_ARGS) ./tests/e2e/... diff --git a/tests/e2e/common/gpu_job.go b/tests/e2e/common/gpu_job.go deleted file mode 100644 index fb512f36f..000000000 --- a/tests/e2e/common/gpu_job.go +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package common - -import ( - batchv1 "k8s.io/api/batch/v1" - v1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/resource" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" -) - -// Define the Job -var GPUJob = &batchv1.Job{ - ObjectMeta: metav1.ObjectMeta{ - Name: "gpu-job", - }, - Spec: batchv1.JobSpec{ - Template: v1.PodTemplateSpec{ - ObjectMeta: metav1.ObjectMeta{ - Name: "gpu-pod", - }, - Spec: v1.PodSpec{ - RestartPolicy: "Never", - Containers: []v1.Container{ - { - Name: "cuda-container", - Image: "nvcr.io/nvidia/k8s/cuda-sample:nbody-cuda11.7.1-ubuntu18.04", - Args: []string{"--benchmark", "--numbodies=10000"}, - Resources: v1.ResourceRequirements{ - Limits: v1.ResourceList{ - "nvidia.com/gpu": resource.MustParse("1"), - }, - }, - }, - }, - Tolerations: []v1.Toleration{ - { - Key: "nvidia.com/gpu", - Operator: "Exists", - Effect: "NoSchedule", - }, - }, - }, - }, - }, -} diff --git a/tests/e2e/common/kubernetes.go b/tests/e2e/common/kubernetes.go deleted file mode 100644 index 0c72d7751..000000000 --- a/tests/e2e/common/kubernetes.go +++ /dev/null @@ -1,216 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package common - -import ( - "context" - "fmt" - "strings" - "time" - - . "github.com/onsi/ginkgo/v2" - . 
"github.com/onsi/gomega" - - corev1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/errors" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - clientset "k8s.io/client-go/kubernetes" - nfdclient "sigs.k8s.io/node-feature-discovery/api/generated/clientset/versioned" - nfdv1alpha1 "sigs.k8s.io/node-feature-discovery/api/nfd/v1alpha1" -) - -// GetNonControlPlaneNodes gets the nodes that are not tainted for exclusive control-plane usage -func GetNonControlPlaneNodes(ctx context.Context, cli clientset.Interface) ([]corev1.Node, error) { - nodeList, err := cli.CoreV1().Nodes().List(ctx, metav1.ListOptions{}) - if err != nil { - return nil, err - } - if len(nodeList.Items) == 0 { - return nil, fmt.Errorf("no nodes found in the cluster") - } - - controlPlaneTaint := corev1.Taint{ - Effect: corev1.TaintEffectNoSchedule, - Key: "node-role.kubernetes.io/control-plane", - } - out := []corev1.Node{} - for _, node := range nodeList.Items { - if !TaintExists(node.Spec.Taints, &controlPlaneTaint) { - out = append(out, node) - } - } - - if len(out) == 0 { - return nil, fmt.Errorf("no non-control-plane nodes found in the cluster") - } - return out, nil -} - -func GetNode(nodes []corev1.Node, nodeName string) corev1.Node { - for _, node := range nodes { - if node.Name == nodeName { - return node - } - } - return corev1.Node{} -} - -// CleanupNode deletes all NFD/GFD related metadata from the Node object, i.e. -// labels and annotations -func CleanupNode(ctx context.Context, cs clientset.Interface) { - // Per-node cleanup function - cleanup := func(nodeName string) error { - node, err := cs.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) - Expect(err).NotTo(HaveOccurred()) - - update := false - updateStatus := false - // Gather info about all NFD-managed node assets outside the default prefix - nfdLabels := map[string]struct{}{} - for _, name := range strings.Split(node.Annotations[nfdv1alpha1.FeatureLabelsAnnotation], ",") { - if strings.Contains(name, "/") { - nfdLabels[name] = struct{}{} - } - } - nfdERs := map[string]struct{}{} - for _, name := range strings.Split(node.Annotations[nfdv1alpha1.ExtendedResourceAnnotation], ",") { - if strings.Contains(name, "/") { - nfdERs[name] = struct{}{} - } - } - - // Remove labels - for key := range node.Labels { - _, ok := nfdLabels[key] - if ok || strings.HasPrefix(key, nfdv1alpha1.FeatureLabelNs) { - delete(node.Labels, key) - update = true - } - } - - // Remove annotations - for key := range node.Annotations { - if strings.HasPrefix(key, nfdv1alpha1.AnnotationNs) { - delete(node.Annotations, key) - update = true - } - } - - // Remove taints - for _, taint := range node.Spec.Taints { - taint := taint - if strings.HasPrefix(taint.Key, nfdv1alpha1.TaintNs) { - newTaints, removed := DeleteTaint(node.Spec.Taints, &taint) - if removed { - node.Spec.Taints = newTaints - update = true - } - } - } - - // Remove extended resources - for key := range node.Status.Capacity { - // We check for FeatureLabelNs as -resource-labels can create ERs there - _, ok := nfdERs[string(key)] - if ok || strings.HasPrefix(string(key), nfdv1alpha1.FeatureLabelNs) { - delete(node.Status.Capacity, key) - delete(node.Status.Allocatable, key) - updateStatus = true - } - } - - if updateStatus { - By("[Cleanup]\tDeleting NFD extended resources from node " + nodeName) - if _, err := cs.CoreV1().Nodes().UpdateStatus(ctx, node, metav1.UpdateOptions{}); err != nil { - return err - } - } - - if update { - By("[Cleanup]\tDeleting NFD labels, annotations and taints from node " + node.Name) - if 
_, err := cs.CoreV1().Nodes().Update(ctx, node, metav1.UpdateOptions{}); err != nil { - return err - } - } - - return nil - } - - // Cleanup all nodes - nodeList, err := cs.CoreV1().Nodes().List(ctx, metav1.ListOptions{}) - Expect(err).NotTo(HaveOccurred()) - - for _, n := range nodeList.Items { - var err error - for retry := 0; retry < 5; retry++ { - if err = cleanup(n.Name); err == nil { - break - } - time.Sleep(100 * time.Millisecond) - } - Expect(err).NotTo(HaveOccurred()) - } -} - -func CleanupNFDObjects(ctx context.Context, cli *nfdclient.Clientset, namespace string) { - cleanupNodeFeatureRules(ctx, cli) - cleanupNodeFeatures(ctx, cli, namespace) -} - -// cleanupNodeFeatures deletes all NodeFeature objects in the given namespace -func cleanupNodeFeatures(ctx context.Context, cli *nfdclient.Clientset, namespace string) { - nfs, err := cli.NfdV1alpha1().NodeFeatures(namespace).List(ctx, metav1.ListOptions{}) - if errors.IsNotFound(err) { - // Omitted error, nothing to do. - return - } - Expect(err).NotTo(HaveOccurred()) - - if len(nfs.Items) != 0 { - By("[Cleanup]\tDeleting NodeFeature objects from namespace " + namespace) - for _, nf := range nfs.Items { - err = cli.NfdV1alpha1().NodeFeatures(namespace).Delete(ctx, nf.Name, metav1.DeleteOptions{}) - if errors.IsNotFound(err) { - // Omitted error - continue - } - Expect(err).NotTo(HaveOccurred()) - } - } -} - -// cleanupNodeFeatureRules deletes all NodeFeatureRule objects -func cleanupNodeFeatureRules(ctx context.Context, cli *nfdclient.Clientset) { - nfrs, err := cli.NfdV1alpha1().NodeFeatureRules().List(ctx, metav1.ListOptions{}) - if errors.IsNotFound(err) { - // Omitted error, nothing to do. - return - } - Expect(err).NotTo(HaveOccurred()) - - if len(nfrs.Items) != 0 { - By("[Cleanup]\tDeleting NodeFeatureRule objects from the cluster") - for _, nfr := range nfrs.Items { - err = cli.NfdV1alpha1().NodeFeatureRules().Delete(ctx, nfr.Name, metav1.DeleteOptions{}) - if errors.IsNotFound(err) { - // Omitted error - continue - } - Expect(err).NotTo(HaveOccurred()) - } - } -} diff --git a/tests/e2e/common/taints.go b/tests/e2e/common/taints.go deleted file mode 100644 index 217814eda..000000000 --- a/tests/e2e/common/taints.go +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package common - -import ( - corev1 "k8s.io/api/core/v1" -) - -// TaintExists checks if the given taint exists in list of taints. Returns true if exists false otherwise. -func TaintExists(taints []corev1.Taint, taintToFind *corev1.Taint) bool { - for _, taint := range taints { - if taint.MatchTaint(taintToFind) { - return true - } - } - return false -} - -// DeleteTaint removes all the taints that have the same key and effect to given taintToDelete. 
-func DeleteTaint(taints []corev1.Taint, taintToDelete *corev1.Taint) ([]corev1.Taint, bool) { - newTaints := []corev1.Taint{} - deleted := false - for i := range taints { - if taintToDelete.MatchTaint(&taints[i]) { - deleted = true - continue - } - newTaints = append(newTaints, taints[i]) - } - return newTaints, deleted -} diff --git a/tests/e2e/device-plugin_test.go b/tests/e2e/device-plugin_test.go index d171bfb00..1d8540333 100644 --- a/tests/e2e/device-plugin_test.go +++ b/tests/e2e/device-plugin_test.go @@ -1,5 +1,6 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,7 +18,6 @@ package e2e import ( - "context" "fmt" "strings" "time" @@ -27,14 +27,10 @@ import ( helm "github.com/mittwald/go-helm-client" helmValues "github.com/mittwald/go-helm-client/values" - apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" - extclient "k8s.io/apiextensions-apiserver/pkg/client/clientset/clientset" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/util/rand" - "github.com/NVIDIA/k8s-device-plugin/tests/e2e/common" "github.com/NVIDIA/k8s-device-plugin/tests/e2e/common/diagnostics" - "github.com/NVIDIA/k8s-device-plugin/tests/e2e/framework" + "github.com/NVIDIA/k8s-device-plugin/tests/e2e/internal" ) const ( @@ -42,153 +38,140 @@ const ( ) // Actual test suite -var _ = NVDescribe("GPU Device Plugin", func() { - f := framework.NewFramework("k8s-device-plugin") - - Context("When deploying k8s-device-plugin", Ordered, func() { - // helm-chart is required - if *HelmChart == "" { - Fail("No helm-chart for k8s-device-plugin specified") +var _ = Describe("GPU Device Plugin", Ordered, Label("gpu", "e2e", "device-plugin"), func() { + // Init global suite vars vars + var ( + helmReleaseName string + chartSpec helm.ChartSpec + + collectLogsFrom []string + diagnosticsCollector *diagnostics.Diagnostic + ) + + collectLogsFrom = []string{ + "pods", + "nodes", + "namespaces", + "deployments", + "daemonsets", + "jobs", + } + if CollectLogsFrom != "" && CollectLogsFrom != "default" { + collectLogsFrom = strings.Split(CollectLogsFrom, ",") + } + + values := helmValues.Options{ + Values: []string{ + fmt.Sprintf("image.repository=%s", ImageRepo), + fmt.Sprintf("image.tag=%s", ImageTag), + fmt.Sprintf("image.pullPolicy=%s", ImagePullPolicy), + "devicePlugin.enabled=true", + // We need to make affinity null, if not deploying NFD/GFD + // test will fail if not run on a GPU node + "affinity=", + }, + } + + BeforeAll(func(ctx SpecContext) { + // Create clients for apiextensions and our CRD api + helmReleaseName = "nvdp-e2e-test-" + randomSuffix() + + chartSpec = helm.ChartSpec{ + ReleaseName: helmReleaseName, + ChartName: HelmChart, + Namespace: testNamespace.Name, + Wait: true, + Timeout: 1 * time.Minute, + ValuesOptions: values, + CleanupOnFail: true, } - // Init global suite vars vars - var ( - crds []*apiextensionsv1.CustomResourceDefinition - extClient *extclient.Clientset - - helmReleaseName string - chartSpec helm.ChartSpec - - collectLogsFrom []string - diagnosticsCollector *diagnostics.Diagnostic - ) - - defaultCollectorObjects := []string{ - "pods", - "nodes", - "namespaces", - "deployments", - "daemonsets", - "jobs", - } + By("Installing k8s-device-plugin Helm chart") + _, 
err := helmClient.InstallChart(ctx, &chartSpec, nil) + Expect(err).NotTo(HaveOccurred()) - values := helmValues.Options{ - Values: []string{ - fmt.Sprintf("image.repository=%s", *ImageRepo), - fmt.Sprintf("image.tag=%s", *ImageTag), - fmt.Sprintf("image.pullPolicy=%s", *ImagePullPolicy), - "devicePlugin.enabled=true", - // We need to make affinity is none if not deploying NFD/GFD - // test will fail if not run on a GPU node - "affinity=", - }, - } + // Wait for all DaemonSets to be ready + // Note: DaemonSet names are dynamically generated with the Helm release prefix, + // so we wait for all DaemonSets in the namespace rather than specific names + By("Waiting for all DaemonSets to be ready") + err = internal.WaitForDaemonSetsReady(ctx, clientSet, testNamespace.Name, "app.kubernetes.io/name=nvidia-device-plugin") + Expect(err).NotTo(HaveOccurred()) + }) - // check Collector objects - collectLogsFrom = defaultCollectorObjects - if *CollectLogsFrom != "" && *CollectLogsFrom != "default" { - collectLogsFrom = strings.Split(*CollectLogsFrom, ",") + AfterAll(func(ctx SpecContext) { + By("Uninstalling k8s-device-plugin Helm chart") + err := helmClient.UninstallReleaseByName(helmReleaseName) + if err != nil { + GinkgoWriter.Printf("Failed to uninstall helm release %s: %v\n", helmReleaseName, err) } + }) - BeforeAll(func(ctx context.Context) { - // Create clients for apiextensions and our CRD api - extClient = extclient.NewForConfigOrDie(f.ClientConfig()) - helmReleaseName = "nvdp-e2e-test" + rand.String(5) - - chartSpec = helm.ChartSpec{ - ReleaseName: helmReleaseName, - ChartName: *HelmChart, - Namespace: f.Namespace.Name, - Wait: true, - Timeout: 1 * time.Minute, - ValuesOptions: values, - CleanupOnFail: true, - } - - By("Installing k8s-device-plugin Helm chart") - _, err := f.HelmClient.InstallChart(ctx, &chartSpec, nil) + AfterEach(func(ctx SpecContext) { + // Run diagnostic collector if test failed + if CurrentSpecReport().Failed() { + var err error + diagnosticsCollector, err = diagnostics.New( + diagnostics.WithNamespace(testNamespace.Name), + diagnostics.WithArtifactDir(LogArtifactDir), + diagnostics.WithKubernetesClient(clientSet), + diagnostics.WithObjects(collectLogsFrom...), + ) Expect(err).NotTo(HaveOccurred()) - }) - JustBeforeEach(func(ctx context.Context) { - }) + err = diagnosticsCollector.Collect(ctx) + Expect(err).NotTo(HaveOccurred()) + } + }) - AfterEach(func(ctx context.Context) { - // Run diagnostic collector if test failed - if CurrentSpecReport().Failed() { - var err error - diagnosticsCollector, err = diagnostics.New( - diagnostics.WithNamespace(f.Namespace.Name), - diagnostics.WithArtifactDir(*LogArtifactDir), - diagnostics.WithKubernetesClient(f.ClientSet), - diagnostics.WithObjects(collectLogsFrom...), - ) - Expect(err).NotTo(HaveOccurred()) - - err = diagnosticsCollector.Collect(ctx) - Expect(err).NotTo(HaveOccurred()) - } - }) + When("When deploying k8s-device-plugin", Ordered, Label("serial"), func() { + It("it should create nvidia.com/gpu resource", Label("gpu-resource"), func(ctx SpecContext) { + nodeList, err := clientSet.CoreV1().Nodes().List(ctx, metav1.ListOptions{}) + Expect(err).NotTo(HaveOccurred()) + Expect(len(nodeList.Items)).ToNot(BeZero()) - AfterAll(func(ctx context.Context) { - // Delete Helm release - err := f.HelmClient.UninstallReleaseByName(helmReleaseName) + // We pick one node + nodes, err := getNonControlPlaneNodes(ctx, clientSet) Expect(err).NotTo(HaveOccurred()) - for _, crd := range crds { - err := 
extClient.ApiextensionsV1().CustomResourceDefinitions().Delete(ctx, crd.Name, metav1.DeleteOptions{}) - Expect(err).NotTo(HaveOccurred()) - } + targetNodeName := nodes[0].Name + Expect(targetNodeName).ToNot(BeEmpty(), "No suitable worker node found") - // TODO: Add a check for a zero node capacity. + By("Checking the node capacity") + capacityChecker := map[string]k8sLabels{ + targetNodeName: { + "nvidia.com/gpu": "^[1-9]$", + }} + eventuallyNonControlPlaneNodes(ctx, clientSet).Should(MatchCapacity(capacityChecker, nodes), "Node capacity does not match") }) - - Context("and NV Driver is installed", func() { - It("it should create nvidia.com/gpu resource", func(ctx context.Context) { - nodeList, err := f.ClientSet.CoreV1().Nodes().List(ctx, metav1.ListOptions{}) - Expect(err).NotTo(HaveOccurred()) - Expect(len(nodeList.Items)).ToNot(BeZero()) - - // We pick one node - nodes, err := common.GetNonControlPlaneNodes(ctx, f.ClientSet) - Expect(err).NotTo(HaveOccurred()) - - targetNodeName := nodes[0].Name - Expect(targetNodeName).ToNot(BeEmpty(), "No suitable worker node found") - - By("Checking the node capacity") - capacityChecker := map[string]k8sLabels{ - targetNodeName: { - "nvidia.com/gpu": "^[1-9]$", - }} - eventuallyNonControlPlaneNodes(ctx, f.ClientSet).Should(MatchCapacity(capacityChecker, nodes), "Node capacity does not match") - - // TODO: As a workaround to installing and reinstalling client causing - // the required resources to not be available, we merge the two tests. - // }) - // It("it should run GPU jobs", func(ctx context.Context) { - // By("Creating a GPU job") - job := common.GPUJob.DeepCopy() - job.Namespace = f.Namespace.Name - - _, err = f.ClientSet.BatchV1().Jobs(f.Namespace.Name).Create(ctx, job, metav1.CreateOptions{}) - Expect(err).NotTo(HaveOccurred()) - - By("Waiting for job to complete") - Eventually(func() error { - job, err := f.ClientSet.BatchV1().Jobs(f.Namespace.Name).Get(ctx, job.Name, metav1.GetOptions{}) - if err != nil { - return err - } - if job.Status.Succeeded != 1 { - return fmt.Errorf("job %s/%s failed", job.Namespace, job.Name) - } - if job.Status.Succeeded == 1 { - return nil - } - return fmt.Errorf("job %s/%s not completed yet", job.Namespace, job.Name) - }, devicePluginEventuallyTimeout, 5*time.Second).Should(BeNil()) + It("it should run GPU jobs", Label("gpu-job"), func(ctx SpecContext) { + By("Creating a GPU job") + jobNames, err := CreateOrUpdateJobsFromFile(ctx, clientSet, "job-1.yaml", testNamespace.Name) + Expect(err).NotTo(HaveOccurred()) + Expect(jobNames).NotTo(BeEmpty()) + + // Defer cleanup for the job + DeferCleanup(func(ctx SpecContext) { + By("Deleting the GPU job") + err := clientSet.BatchV1().Jobs(testNamespace.Name).Delete(ctx, jobNames[0], metav1.DeleteOptions{}) + if err != nil { + GinkgoWriter.Printf("Failed to delete job %s: %v\n", jobNames[0], err) + } }) + + By("Waiting for job to complete") + Eventually(func(g Gomega) error { + job, err := clientSet.BatchV1().Jobs(testNamespace.Name).Get(ctx, jobNames[0], metav1.GetOptions{}) + if err != nil { + return err + } + if job.Status.Failed > 0 { + return fmt.Errorf("job %s/%s has failed pods: %d", job.Namespace, job.Name, job.Status.Failed) + } + if job.Status.Succeeded != 1 { + return fmt.Errorf("job %s/%s not completed yet: %d succeeded", job.Namespace, job.Name, job.Status.Succeeded) + } + return nil + }).WithContext(ctx).WithPolling(5 * time.Second).WithTimeout(devicePluginEventuallyTimeout).Should(Succeed()) }) }) }) diff --git a/tests/e2e/e2e_test.go b/tests/e2e/e2e_test.go 
index fb0020f5d..54b192cf0 100644 --- a/tests/e2e/e2e_test.go +++ b/tests/e2e/e2e_test.go @@ -1,5 +1,6 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,46 +18,859 @@ package e2e import ( - "flag" + "bytes" + "context" + "errors" + "fmt" + "io" "log" "os" + "path/filepath" + "regexp" + "runtime" + "strconv" + "strings" "testing" + "time" - "github.com/onsi/ginkgo/v2" - "github.com/onsi/gomega" + helm "github.com/mittwald/go-helm-client" + nfdclient "sigs.k8s.io/node-feature-discovery/api/generated/clientset/versioned" + nfdv1alpha1 "sigs.k8s.io/node-feature-discovery/api/nfd/v1alpha1" - "github.com/NVIDIA/k8s-device-plugin/tests/e2e/framework" + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + gomegatypes "github.com/onsi/gomega/types" + + batchv1 "k8s.io/api/batch/v1" + corev1 "k8s.io/api/core/v1" + extclient "k8s.io/apiextensions-apiserver/pkg/client/clientset/clientset" + k8serrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + apiruntime "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/util/rand" + "k8s.io/apimachinery/pkg/util/wait" + "k8s.io/apimachinery/pkg/util/yaml" + clientset "k8s.io/client-go/kubernetes" + k8sscheme "k8s.io/client-go/kubernetes/scheme" + "k8s.io/client-go/rest" + "k8s.io/client-go/tools/clientcmd" +) + +const ( + // DefaultNamespaceDeletionTimeout is timeout duration for waiting for a namespace deletion. + DefaultNamespaceDeletionTimeout = 10 * time.Minute + + // PollInterval is how often to Poll pods, nodes and claims. + PollInterval = 2 * time.Second ) var ( - NVIDIA_DRIVER_ENABLED = flag.Bool("driver-enabled", false, "NVIDIA driver is installed on test infra") - HelmChart = flag.String("helm-chart", "", "Helm chart to use") - LogArtifactDir = flag.String("log-artifacts", "", "Directory to store logs") - ImageRepo = flag.String("image.repo", "", "Image repository to fetch image from") - ImageTag = flag.String("image.tag", "", "Image tag to use") - ImagePullPolicy = flag.String("image.pull-policy", "IfNotPresent", "Image pull policy") - CollectLogsFrom = flag.String("collect-logs-from", "default", "Comma separated list of objects to collect logs from after test") + Kubeconfig string + Timeout time.Duration + HelmChart string + LogArtifactDir string + ImageRepo string + ImageTag string + ImagePullPolicy string + CollectLogsFrom string + cwd string + NVIDIA_DRIVER_ENABLED bool + + // k8s clients + clientConfig *rest.Config + clientSet clientset.Interface + extClient *extclient.Clientset + nfdClient *nfdclient.Clientset + + testNamespace *corev1.Namespace // Every test has at least one namespace unless creation is skipped + + // Helm + helmClient helm.Client + helmLogFile *os.File + helmArtifactDir string + helmLogger *log.Logger + helmReleaseName string + + ctx context.Context + packagePath string ) -func TestMain(m *testing.M) { - // Register test flags, then parse flags. 
- framework.RegisterClusterFlags(flag.CommandLine) - flag.Parse() +func TestMain(t *testing.T) { + suiteName := "E2E K8s Device Plugin" + + RegisterFailHandler(Fail) + + // get the package path + _, thisFile, _, _ := runtime.Caller(0) + packagePath = filepath.Dir(thisFile) + + ctx = context.Background() + getTestEnv() + + // Log random seed for reproducibility + GinkgoWriter.Printf("Random seed: %d\n", GinkgoRandomSeed()) + + RunSpecs(t, + suiteName, + Label("e2e"), + ) +} + +// BeforeSuite runs before the test suite +var _ = BeforeSuite(func(ctx SpecContext) { + var err error + + cwd, err = os.Getwd() + Expect(err).NotTo(HaveOccurred()) + + // Get k8s clients + getK8sClients() + + // Create clients for apiextensions and our CRD api + extClient = extclient.NewForConfigOrDie(clientConfig) + + // Create a namespace for the test + testNamespace, err = CreateTestingNS("k8s-device-plugin-e2e-test", clientSet, nil) + Expect(err).NotTo(HaveOccurred()) + + // Get Helm client + helmReleaseName = "k8s-device-plugin-e2e-test" + rand.String(5) + getHelmClient() +}) + +var _ = AfterSuite(func(ctx SpecContext) { + By("Cleaning up namespace resources") + cleanupNamespaceResources(testNamespace.Name) + + By("Deleting the test namespace") + deleteTestNamespace() +}) + +// Add ReportAfterSuite for logging test summary and random seed +var _ = ReportAfterSuite("", func(report Report) { + // Log test summary + failedCount := 0 + for _, specReport := range report.SpecReports { + if specReport.Failed() { + failedCount++ + } + } + + GinkgoWriter.Printf("\nTest Summary:\n") + GinkgoWriter.Printf(" Total Specs: %d\n", len(report.SpecReports)) + GinkgoWriter.Printf(" Random Seed: %d\n", report.SuiteConfig.RandomSeed) + GinkgoWriter.Printf(" Failed: %d\n", failedCount) + GinkgoWriter.Printf(" Duration: %.2fs\n", report.RunTime.Seconds()) +}) + +// getK8sClients creates the k8s clients +func getK8sClients() { + var err error + + // get config from kubeconfig + c, err := clientcmd.LoadFromFile(Kubeconfig) + Expect(err).NotTo(HaveOccurred()) + + // get client config + clientConfig, err = clientcmd.NewDefaultClientConfig(*c, &clientcmd.ConfigOverrides{}).ClientConfig() + Expect(err).NotTo(HaveOccurred()) + + clientSet, err = clientset.NewForConfig(clientConfig) + Expect(err).NotTo(HaveOccurred()) + + // Create clients for apiextensions and our CRD api + nfdClient = nfdclient.NewForConfigOrDie(clientConfig) +} + +// getHelmClient creates a new Helm client +func getHelmClient() { + var err error + + // Set Helm log file + helmArtifactDir = filepath.Join(LogArtifactDir, "helm") + + // Create a Helm client + err = os.MkdirAll(helmArtifactDir, 0755) + Expect(err).NotTo(HaveOccurred()) + + helmLogFile, err = os.OpenFile(filepath.Join(LogArtifactDir, "helm_logs"), os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666) + Expect(err).NotTo(HaveOccurred()) + + helmLogger = log.New(helmLogFile, fmt.Sprintf("%s\t", testNamespace.Name), log.Ldate|log.Ltime) + + helmRestConf := &helm.RestConfClientOptions{ + Options: &helm.Options{ + Namespace: testNamespace.Name, + RepositoryCache: "/tmp/.helmcache", + RepositoryConfig: "/tmp/.helmrepo", + Debug: true, + DebugLog: helmLogger.Printf, + }, + RestConfig: clientConfig, + } + + helmClient, err = helm.NewClientFromRestConf(helmRestConf) + Expect(err).NotTo(HaveOccurred()) +} + +// getTestEnv gets the test environment variables +func getTestEnv() { + defer GinkgoRecover() + var err error + + Kubeconfig = getRequiredEnvvar[string]("KUBECONFIG") + + Timeout = 
time.Duration(getEnvVarOrDefault("E2E_TIMEOUT_SECONDS", 1800)) * time.Second + + HelmChart = getRequiredEnvvar[string]("HELM_CHART") + + LogArtifactDir = getEnvVarOrDefault("LOG_ARTIFACTS_DIR", "e2e_logs") + + ImageRepo = getRequiredEnvvar[string]("E2E_IMAGE_REPO") + + ImageTag = getRequiredEnvvar[string]("E2E_IMAGE_TAG") + + ImagePullPolicy = getRequiredEnvvar[string]("E2E_IMAGE_PULL_POLICY") + + CollectLogsFrom = getEnvVarOrDefault("COLLECT_LOGS_FROM", "") + + NVIDIA_DRIVER_ENABLED = getEnvVarOrDefault("NVIDIA_DRIVER_ENABLED", false) + + // Get current working directory + cwd, err = os.Getwd() + Expect(err).NotTo(HaveOccurred()) +} + +// CreateTestingNS should be used by every test, note that we append a common prefix to the provided test name. +// Please see NewFramework instead of using this directly. +func CreateTestingNS(baseName string, c clientset.Interface, labels map[string]string) (*corev1.Namespace, error) { + uid := rand.String(5) + if labels == nil { + labels = map[string]string{} + } + labels["e2e-run"] = uid + + // We don't use ObjectMeta.GenerateName feature, as in case of API call + // failure we don't know whether the namespace was created and what is its + // name. + name := fmt.Sprintf("%v-%v", baseName, uid) + + namespaceObj := &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: "", + Labels: labels, + }, + Status: corev1.NamespaceStatus{}, + } + // Be robust about making the namespace creation call. + var got *corev1.Namespace + if err := wait.PollUntilContextTimeout(ctx, PollInterval, 30*time.Second, true, func(ctx context.Context) (bool, error) { + var err error + got, err = c.CoreV1().Namespaces().Create(ctx, namespaceObj, metav1.CreateOptions{}) + if err != nil { + if k8serrors.IsAlreadyExists(err) { + // regenerate on conflict + namespaceObj.Name = fmt.Sprintf("%v-%v", baseName, uid) + } + return false, nil + } + return true, nil + }); err != nil { + return nil, err + } + + return got, nil +} + +type k8sLabels map[string]string + +// eventuallyNonControlPlaneNodes is a helper for asserting node properties +// +//nolint:unused +func eventuallyNonControlPlaneNodes(ctx context.Context, cli clientset.Interface) AsyncAssertion { + return Eventually(func(g Gomega) ([]corev1.Node, error) { + return getNonControlPlaneNodes(ctx, cli) + }).WithPolling(1 * time.Second).WithTimeout(1 * time.Minute).WithContext(ctx) +} + +// MatchLabels returns a specialized Gomega matcher for checking if a list of +// nodes are labeled as expected. +func MatchLabels(expectedNew map[string]k8sLabels, oldNodes []corev1.Node) gomegatypes.GomegaMatcher { + return &nodeListPropertyRegexpMatcher[k8sLabels]{ + propertyName: "labels", + expected: expectedNew, + oldNodes: oldNodes, + } +} + +// MatchCapacity returns a specialized Gomega matcher for checking if a list of +// nodes are configured as expected. +func MatchCapacity(expectedNew map[string]k8sLabels, oldNodes []corev1.Node) gomegatypes.GomegaMatcher { + return &nodeListPropertyRegexpMatcher[k8sLabels]{ + propertyName: "capacity", + expected: expectedNew, + oldNodes: oldNodes, + } +} + +// nodeListPropertyRegexpMatcher is a generic Gomega matcher for asserting one property a group of nodes. +type nodeListPropertyRegexpMatcher[T any] struct { + expected map[string]k8sLabels + oldNodes []corev1.Node + + propertyName string + node *corev1.Node //nolint:unused + missing []string //nolint:unused + invalidValue []string //nolint:unused +} + +// Match method of the GomegaMatcher interface. 
+func (m *nodeListPropertyRegexpMatcher[T]) Match(actual interface{}) (bool, error) { + nodes, ok := actual.([]corev1.Node) + if !ok { + return false, fmt.Errorf("expected []corev1.Node, got: %T", actual) + } + + switch m.propertyName { + case "labels": + return m.matchLabels(nodes), nil + case "capacity": + return m.matchCapacity(nodes), nil + default: + return true, nil + } + +} + +func (m *nodeListPropertyRegexpMatcher[T]) matchLabels(nodes []corev1.Node) bool { + targetNode := corev1.Node{} + for _, node := range nodes { + _, ok := m.expected[node.Name] + if !ok { + continue + } + targetNode = node + break + } + + m.node = &targetNode + + for labelKey, labelValue := range m.expected[targetNode.Name] { + // missing key + if _, ok := targetNode.Labels[labelKey]; !ok { + m.missing = append(m.missing, labelKey) + continue + } + // invalid value + regexMatcher := regexp.MustCompile(labelValue) + if !regexMatcher.MatchString(targetNode.Labels[labelKey]) { + m.invalidValue = append(m.invalidValue, fmt.Sprintf("%s: %s", labelKey, targetNode.Labels[labelKey])) + return false + } + } + + return true +} + +func (m *nodeListPropertyRegexpMatcher[T]) matchCapacity(nodes []corev1.Node) bool { + targetNode := corev1.Node{} + for _, node := range nodes { + _, ok := m.expected[node.Name] + if !ok { + continue + } + targetNode = node + break + } + + m.node = &targetNode + + for labelKey, labelValue := range m.expected[targetNode.Name] { + // missing key + rn := corev1.ResourceName(labelKey) + if _, ok := targetNode.Status.Capacity[rn]; !ok { + m.missing = append(m.missing, labelKey) + continue + } + // invalid value + capacity := targetNode.Status.Capacity[rn] + regexMatcher := regexp.MustCompile(labelValue) + if !regexMatcher.MatchString(capacity.String()) { + m.invalidValue = append(m.invalidValue, fmt.Sprintf("%s: %s", labelKey, capacity.String())) + return false + } + } + + return true +} + +// FailureMessage method of the GomegaMatcher interface. +func (m *nodeListPropertyRegexpMatcher[T]) FailureMessage(actual interface{}) string { + return m.message() +} + +// NegatedFailureMessage method of the GomegaMatcher interface. +func (m *nodeListPropertyRegexpMatcher[T]) NegatedFailureMessage(actual interface{}) string { + return fmt.Sprintf("Node %q matched unexpectedly", m.node.Name) +} + +// TODO remove nolint when golangci-lint is able to cope with generics +// +//nolint:unused +func (m *nodeListPropertyRegexpMatcher[T]) message() string { + msg := fmt.Sprintf("Node %q %s did not match:", m.node.Name, m.propertyName) + if len(m.missing) > 0 { + msg += fmt.Sprintf("\n missing:\n %s", strings.Join(m.missing, "\n ")) + } + if len(m.invalidValue) > 0 { + msg += fmt.Sprintf("\n invalid value:\n %s", strings.Join(m.invalidValue, "\n ")) + } + return msg +} + +// jobIsCompleted checks if a job is completed +// +//nolint:unused +func jobIsCompleted(ctx context.Context, cli clientset.Interface, namespace, podName string) bool { + pod, err := cli.CoreV1().Pods(namespace).Get(ctx, podName, metav1.GetOptions{}) + if err != nil { + return false + } + // Check if the pod's phase is Succeeded. 
+ if pod.Status.Phase == "Succeeded" { + return true + } + return false +} + +// getNonControlPlaneNodes gets the nodes that are not tainted for exclusive control-plane usage +// +//nolint:unused +func getNonControlPlaneNodes(ctx context.Context, cli clientset.Interface) ([]corev1.Node, error) { + nodeList, err := cli.CoreV1().Nodes().List(ctx, metav1.ListOptions{}) + if err != nil { + return nil, err + } + if len(nodeList.Items) == 0 { + return nil, fmt.Errorf("no nodes found in the cluster") + } + + controlPlaneTaint := corev1.Taint{ + Effect: corev1.TaintEffectNoSchedule, + Key: "node-role.kubernetes.io/control-plane", + } + out := []corev1.Node{} + for _, node := range nodeList.Items { + if !taintExists(node.Spec.Taints, &controlPlaneTaint) { + out = append(out, node) + } + } + + if len(out) == 0 { + return nil, fmt.Errorf("no non-control-plane nodes found in the cluster") + } + return out, nil +} + +// taintExists checks if the given taint exists in list of taints. Returns true if exists false otherwise. +// +//nolint:unused +func taintExists(taints []corev1.Taint, taintToFind *corev1.Taint) bool { + for _, taint := range taints { + if taint.MatchTaint(taintToFind) { + return true + } + } + return false +} + +// getNode returns the node object from the list of nodes +// +//nolint:unused +func getNode(nodes []corev1.Node, nodeName string) corev1.Node { + for _, node := range nodes { + if node.Name == nodeName { + return node + } + } + return corev1.Node{} +} + +// CreateOrUpdateJobsFromFile creates or updates jobs from a file +func CreateOrUpdateJobsFromFile(ctx context.Context, cli clientset.Interface, filename, namespace string) ([]string, error) { + jobs, err := newJobFromfile(filepath.Join(packagePath, "..", "..", "testdata", filename)) + if err != nil { + return nil, fmt.Errorf("failed to create Job from file: %w", err) + } + + names := make([]string, len(jobs)) + for i, job := range jobs { + job.Namespace = namespace + + names[i] = job.Name + + // create or update the job + _, err = cli.BatchV1().Jobs(namespace).Get(ctx, job.Name, metav1.GetOptions{}) + if !k8serrors.IsNotFound(err) { + // update the job + _, err = cli.BatchV1().Jobs(namespace).Update(ctx, job, metav1.UpdateOptions{}) + if err != nil { + return nil, fmt.Errorf("failed to update job: %w", err) + } + continue + } + // create the job + _, err = cli.BatchV1().Jobs(namespace).Create(ctx, job, metav1.CreateOptions{}) + if err != nil { + return nil, fmt.Errorf("failed to create job: %w", err) + } + + } + + return names, nil +} + +func newJobFromfile(path string) ([]*batchv1.Job, error) { + objs, err := apiObjsFromFile(path, k8sscheme.Codecs.UniversalDeserializer()) + if err != nil { + return nil, err + } + + jobs := make([]*batchv1.Job, len(objs)) + + for i, obj := range objs { + var ok bool + jobs[i], ok = obj.(*batchv1.Job) + if !ok { + return nil, fmt.Errorf("unexpected type %t when reading %q", obj, path) + } + } + + return jobs, nil +} +func apiObjsFromFile(path string, decoder apiruntime.Decoder) ([]apiruntime.Object, error) { + data, err := os.ReadFile(path) + if err != nil { + return nil, err + } + + // Use Kubernetes' YAML decoder that properly handles multiple documents + // separated by "---", similar to how kubectl processes multi-document YAML files + yamlDecoder := yaml.NewYAMLOrJSONDecoder(bytes.NewReader(data), 4096) + objs := []apiruntime.Object{} + + for { + // Decode into raw extension first + raw := apiruntime.RawExtension{} + if err := yamlDecoder.Decode(&raw); err != nil { + if err == io.EOF { + 
break + } + return nil, err + } + + // Skip empty documents + raw.Raw = bytes.TrimSpace(raw.Raw) + if len(raw.Raw) == 0 { + continue + } + + // Now decode the actual object using the provided decoder + obj, _, err := decoder.Decode(raw.Raw, nil, nil) + if err != nil { + return nil, err + } + objs = append(objs, obj) + } + + return objs, nil +} + +// cleanupNamespaceResources removes all resources in the specified namespace. +func cleanupNamespaceResources(namespace string) { + err := cleanupTestPods(namespace) + Expect(err).NotTo(HaveOccurred()) + + err = cleanupHelmDeployments(namespace) + Expect(err).NotTo(HaveOccurred()) + + cleanupNode(clientSet) + cleanupNFDObjects(nfdClient, testNamespace.Name) + cleanupCRDs() +} + +// waitForDeletion polls the provided checkFunc until a NotFound error is returned, +// confirming that the resource is deleted. +func waitForDeletion(resourceName string, checkFunc func() error) error { + EventuallyWithOffset(1, func(g Gomega) error { + err := checkFunc() + if err != nil && k8serrors.IsNotFound(err) { + return nil + } + if err != nil { + return err + } + return fmt.Errorf("%s still exists", resourceName) + }).WithPolling(5 * time.Second).WithTimeout(2 * time.Minute).WithContext(ctx).Should(Succeed()) + return nil +} - // check if flags are set and if not cancel the test run - if *ImageRepo == "" || *ImageTag == "" || *HelmChart == "" { - log.Fatal("Required flags not set. Please set -image.repo, -image.tag and -helm-chart") +// cleanupTestPods deletes all test Pods in the namespace that have the label "app.nvidia.com=k8s-dra-driver-gpu-test-app". +func cleanupTestPods(namespace string) error { + labelSelector := "app.nvidia.com=k8s-device-plugin-test-app" + podList, err := clientSet.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{ + LabelSelector: labelSelector, + }) + if err != nil { + return err } - os.Exit(m.Run()) + zero := int64(0) + deleteOptions := metav1.DeleteOptions{GracePeriodSeconds: &zero} + for _, pod := range podList.Items { + if err = clientSet.CoreV1().Pods(namespace).Delete(ctx, pod.Name, deleteOptions); err != nil { + return err + } + if err = waitForDeletion(pod.Name, func() error { + _, err := clientSet.CoreV1().Pods(namespace).Get(ctx, pod.Name, metav1.GetOptions{}) + return err + }); err != nil { + return err + } + } + return nil } -func TestE2E(t *testing.T) { - gomega.RegisterFailHandler(ginkgo.Fail) - // Run tests through the Ginkgo runner with output to console + JUnit for Jenkins - suiteConfig, reporterConfig := ginkgo.GinkgoConfiguration() - // Randomize specs as well as suites - suiteConfig.RandomizeAllSpecs = true +// cleanupHelmDeployments uninstalls all deployed Helm releases in the specified namespace. +func cleanupHelmDeployments(namespace string) error { + releases, err := helmClient.ListDeployedReleases() + if err != nil { + return fmt.Errorf("failed to list deployed releases: %w", err) + } + + for _, release := range releases { + // Check if the release is deployed in the target namespace. + // Depending on your helmClient configuration the release might carry the namespace information. + if release.Namespace == namespace { + if err := helmClient.UninstallReleaseByName(release.Name); err != nil { + return fmt.Errorf("failed to uninstall release %q: %w", release.Name, err) + } + } + } + return nil +} + +// deleteTestNamespace deletes the test namespace and waits for its deletion. 
+func deleteTestNamespace() { + defer func() { + err := clientSet.CoreV1().Namespaces().Delete(ctx, testNamespace.Name, metav1.DeleteOptions{}) + if err != nil && !k8serrors.IsNotFound(err) { + Expect(err).NotTo(HaveOccurred()) + } + err = waitForDeletion(testNamespace.Name, func() error { + _, err := clientSet.CoreV1().Namespaces().Get(ctx, testNamespace.Name, metav1.GetOptions{}) + return err + }) + Expect(err).NotTo(HaveOccurred()) + }() +} + +// cleanupCRDs deletes specific CRDs used during testing. +func cleanupCRDs() { + crds := []string{ + "nodefeatures.nfd.k8s-sigs.io", + "nodefeaturegroups.nfd.k8s-sigs.io", + "nodefeaturerules.nfd.k8s-sigs.io", + } + + for _, crd := range crds { + err := extClient.ApiextensionsV1().CustomResourceDefinitions().Delete(ctx, crd, metav1.DeleteOptions{}) + Expect(err).NotTo(HaveOccurred()) + + _ = waitForDeletion(crd, func() error { + _, err := extClient.ApiextensionsV1().CustomResourceDefinitions().Get(ctx, crd, metav1.GetOptions{}) + return err + }) + } +} + +// cleanupNode deletes all NFD/GFD related metadata from the Node object, i.e. +// labels and annotations +func cleanupNode(cs clientset.Interface) { + // Per-node cleanup function + cleanup := func(nodeName string) error { + node, err := cs.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + Expect(err).NotTo(HaveOccurred()) + + update := false + updateStatus := false + // Gather info about all NFD-managed node assets outside the default prefix + nfdLabels := map[string]struct{}{} + for _, name := range strings.Split(node.Annotations[nfdv1alpha1.FeatureLabelsAnnotation], ",") { + if strings.Contains(name, "/") { + nfdLabels[name] = struct{}{} + } + } + nfdERs := map[string]struct{}{} + for _, name := range strings.Split(node.Annotations[nfdv1alpha1.ExtendedResourceAnnotation], ",") { + if strings.Contains(name, "/") { + nfdERs[name] = struct{}{} + } + } + + // Remove labels + for key := range node.Labels { + _, ok := nfdLabels[key] + if ok || strings.HasPrefix(key, nfdv1alpha1.FeatureLabelNs) { + delete(node.Labels, key) + update = true + } + } + + // Remove annotations + for key := range node.Annotations { + if strings.HasPrefix(key, nfdv1alpha1.AnnotationNs) { + delete(node.Annotations, key) + update = true + } + } + + // Remove nvidia.com/ labels + for key := range node.Labels { + if strings.HasPrefix(key, "nvidia.com/") { + delete(node.Labels, key) + update = true + } + } + + // Remove extended resources + for key := range node.Status.Capacity { + // We check for FeatureLabelNs as -resource-labels can create ERs there + _, ok := nfdERs[string(key)] + if ok || strings.HasPrefix(string(key), nfdv1alpha1.FeatureLabelNs) { + delete(node.Status.Capacity, key) + delete(node.Status.Allocatable, key) + updateStatus = true + } + } + + if updateStatus { + By("[Cleanup]\tDeleting NFD extended resources from node " + nodeName) + if _, err := cs.CoreV1().Nodes().UpdateStatus(ctx, node, metav1.UpdateOptions{}); err != nil { + return err + } + } + + if update { + By("[Cleanup]\tDeleting NFD labels, annotations and taints from node " + node.Name) + if _, err := cs.CoreV1().Nodes().Update(ctx, node, metav1.UpdateOptions{}); err != nil { + return err + } + } + + return nil + } - ginkgo.RunSpecs(t, "nvidia k8s-device-plugin e2e suite", suiteConfig, reporterConfig) + // Cleanup all nodes + nodeList, err := cs.CoreV1().Nodes().List(ctx, metav1.ListOptions{}) + Expect(err).NotTo(HaveOccurred()) + + for _, n := range nodeList.Items { + nodeName := n.Name + Eventually(func(g Gomega) error { + return 
cleanup(nodeName) + }).WithPolling(100 * time.Millisecond).WithTimeout(500 * time.Millisecond).Should(Succeed()) + } +} + +func cleanupNFDObjects(cli *nfdclient.Clientset, namespace string) { + cleanupNodeFeatureRules(cli) + cleanupNodeFeatures(cli, namespace) +} + +// cleanupNodeFeatures deletes all NodeFeature objects in the given namespace +func cleanupNodeFeatures(cli *nfdclient.Clientset, namespace string) { + nfs, err := cli.NfdV1alpha1().NodeFeatures(namespace).List(ctx, metav1.ListOptions{}) + if k8serrors.IsNotFound(err) { + // Omitted error, nothing to do. + return + } + Expect(err).NotTo(HaveOccurred()) + + if len(nfs.Items) != 0 { + By("[Cleanup]\tDeleting NodeFeature objects from namespace " + namespace) + for _, nf := range nfs.Items { + err = cli.NfdV1alpha1().NodeFeatures(namespace).Delete(ctx, nf.Name, metav1.DeleteOptions{}) + if k8serrors.IsNotFound(err) { + // Omitted error + continue + } + Expect(err).NotTo(HaveOccurred()) + } + } +} + +// cleanupNodeFeatureRules deletes all NodeFeatureRule objects +func cleanupNodeFeatureRules(cli *nfdclient.Clientset) { + nfrs, err := cli.NfdV1alpha1().NodeFeatureRules().List(ctx, metav1.ListOptions{}) + if k8serrors.IsNotFound(err) { + // Omitted error, nothing to do. + return + } + Expect(err).NotTo(HaveOccurred()) + + if len(nfrs.Items) != 0 { + By("[Cleanup]\tDeleting NodeFeatureRule objects from the cluster") + for _, nfr := range nfrs.Items { + err = cli.NfdV1alpha1().NodeFeatureRules().Delete(ctx, nfr.Name, metav1.DeleteOptions{}) + if k8serrors.IsNotFound(err) { + // Omitted error + continue + } + Expect(err).NotTo(HaveOccurred()) + } + } +} + +// getRequiredEnvvar returns the specified envvar if set or raises an error. +func getRequiredEnvvar[T any](key string) T { + v, err := getEnvVarAs[T](key) + Expect(err).To(BeNil(), "required environement variable not set", key) + return v +} + +func getEnvVarAs[T any](key string) (T, error) { + var zero T + value := os.Getenv(key) + if value == "" { + return zero, errors.New("env var not set") + } + + switch any(zero).(type) { + case bool: + v, err := strconv.ParseBool(value) + if err != nil { + return zero, err + } + return any(v).(T), nil + case int: + v, err := strconv.Atoi(value) + if err != nil { + return zero, err + } + return any(v).(T), nil + case string: + return any(value).(T), nil + default: + return zero, errors.New("unsupported type") + } +} + +func getEnvVarOrDefault[T any](key string, defaultValue T) T { + val, err := getEnvVarAs[T](key) + if err != nil { + return defaultValue + } + return val } diff --git a/tests/e2e/framework.go b/tests/e2e/framework.go deleted file mode 100644 index 7a870548f..000000000 --- a/tests/e2e/framework.go +++ /dev/null @@ -1,26 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package e2e - -import ( - "github.com/onsi/ginkgo/v2" -) - -// NVDescribe annotates the test with the NVIDIA label. 
-func NVDescribe(text string, body func()) bool { - return ginkgo.Describe("[nvidia] "+text, body) -} diff --git a/tests/e2e/framework/framework.go b/tests/e2e/framework/framework.go deleted file mode 100644 index c371a1384..000000000 --- a/tests/e2e/framework/framework.go +++ /dev/null @@ -1,268 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// Package framework contains provider-independent helper code for -// building and running E2E tests with Ginkgo. The actual Ginkgo test -// suites gets assembled by combining this framework, the optional -// provider support code and specific tests via a separate .go file -// like Kubernetes' test/e2e.go. -package framework - -import ( - "context" - "errors" - "fmt" - "log" - "math/rand" - "os" - "path/filepath" - "time" - - helm "github.com/mittwald/go-helm-client" - "github.com/onsi/ginkgo/v2" - "github.com/onsi/gomega" - corev1 "k8s.io/api/core/v1" - apierrors "k8s.io/apimachinery/pkg/api/errors" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/runtime/schema" - clientset "k8s.io/client-go/kubernetes" - "k8s.io/client-go/rest" -) - -const ( - // DefaultNamespaceDeletionTimeout is timeout duration for waiting for a namespace deletion. - DefaultNamespaceDeletionTimeout = 5 * time.Minute -) - -// Options is a struct for managing test framework options. -type Options struct { - ClientQPS float32 - ClientBurst int - GroupVersion *schema.GroupVersion -} - -// Framework supports common operations used by e2e tests; it will keep a client & a namespace for you. -// Eventual goal is to merge this with integration test framework. -// -// You can configure the pod security level for your test by setting the `NamespacePodSecurityLevel` -// which will set all three of pod security admission enforce, warn and audit labels on the namespace. -// The default pod security profile is "restricted". -// Each of the labels can be overridden by using more specific NamespacePodSecurity* attributes of this -// struct. -type Framework struct { - BaseName string - - // Set together with creating the ClientSet and the namespace. - // Guaranteed to be unique in the cluster even when running the same - // test multiple times in parallel. - UniqueName string - - clientConfig *rest.Config - ClientSet clientset.Interface - - // Helm - HelmClient helm.Client - HelmLogFile *os.File - HelmLogger *log.Logger - - // configuration for framework's client - Options Options - - SkipNamespaceCreation bool // Whether to skip creating a namespace - Namespace *corev1.Namespace // Every test has at least one namespace unless creation is skipped - NamespaceDeletionTimeout time.Duration - - namespacesToDelete []*corev1.Namespace // Some tests have more than one. -} - -// NewFramework creates a test framework. 
-func NewFramework(baseName string) *Framework { - f := &Framework{ - BaseName: baseName, - } - - // The order is important here: if the extension calls ginkgo.BeforeEach - // itself, then it can be sure that f.BeforeEach already ran when its - // own callback gets invoked. - ginkgo.BeforeEach(f.BeforeEach) - - return f -} - -// ClientConfig an externally accessible method for reading the kube client config. -func (f *Framework) ClientConfig() *rest.Config { - ret := rest.CopyConfig(f.clientConfig) - // json is the least common denominator - ret.ContentType = runtime.ContentTypeJSON - ret.AcceptContentTypes = runtime.ContentTypeJSON - return ret -} - -// BeforeEach gets a client and makes a namespace. -func (f *Framework) BeforeEach(ctx context.Context) { - // DeferCleanup, in contrast to AfterEach, triggers execution in - // first-in-last-out order. This ensures that the framework instance - // remains valid as long as possible. - // - // In addition, AfterEach will not be called if a test never gets here. - ginkgo.DeferCleanup(f.AfterEach) - - ginkgo.By("Creating a kubernetes client") - config, err := LoadConfig() - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - - config.QPS = f.Options.ClientQPS - config.Burst = f.Options.ClientBurst - if f.Options.GroupVersion != nil { - config.GroupVersion = f.Options.GroupVersion - } - f.clientConfig = rest.CopyConfig(config) - f.ClientSet, err = clientset.NewForConfig(config) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - - if !f.SkipNamespaceCreation { - ginkgo.By(fmt.Sprintf("Building a namespace with basename %s", f.BaseName)) - namespace, err := f.CreateNamespace(ctx, f.BaseName, map[string]string{ - "e2e-framework": f.BaseName, - }) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - - f.Namespace = namespace - - f.UniqueName = f.Namespace.GetName() - } else { - // not guaranteed to be unique, but very likely - f.UniqueName = fmt.Sprintf("%s-%08x", f.BaseName, rand.Int31()) - } - - // Create a Helm client - ginkgo.By("Creating a Helm client") - - err = os.MkdirAll(filepath.Dir(TestContext.HelmLogFile), 0755) - gomega.Expect(err).To(gomega.BeNil()) - - f.HelmLogFile, err = os.OpenFile(TestContext.HelmLogFile, os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666) - gomega.Expect(err).To(gomega.BeNil()) - - f.HelmLogger = log.New(f.HelmLogFile, fmt.Sprintf("%s\t", f.UniqueName), log.Ldate|log.Ltime) - helmRestConf := &helm.RestConfClientOptions{ - Options: &helm.Options{ - Namespace: f.Namespace.Name, - RepositoryCache: "/tmp/.helmcache", - RepositoryConfig: "/tmp/.helmrepo", - Debug: true, - DebugLog: f.HelmLogger.Printf, - }, - RestConfig: config, - } - - f.HelmClient, err = helm.NewClientFromRestConf(helmRestConf) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) -} - -// AfterEach deletes the namespace, after reading its events. -func (f *Framework) AfterEach(ctx context.Context) { - // This should not happen. Given ClientSet is a public field a test must have updated it! - // Error out early before any API calls during cleanup. - gomega.Expect(f.ClientSet).NotTo(gomega.BeNil()) - - // DeleteNamespace at the very end in defer, to avoid any - // expectation failures preventing deleting the namespace. - defer func() { - var nsDeletionErrors error - // Whether to delete namespace is determined by 3 factors: delete-namespace flag, delete-namespace-on-failure flag and the test result - // if delete-namespace set to false, namespace will always be preserved. 
- // if delete-namespace is true and delete-namespace-on-failure is false, namespace will be preserved if test failed. - if TestContext.DeleteNamespace && (TestContext.DeleteNamespaceOnFailure || !ginkgo.CurrentSpecReport().Failed()) { - for _, ns := range f.namespacesToDelete { - ginkgo.By(fmt.Sprintf("[Cleanup]\tDeleting testing namespace %q.", ns.Name)) - if err := f.ClientSet.CoreV1().Namespaces().Delete(ctx, ns.Name, metav1.DeleteOptions{}); err != nil { - if !apierrors.IsNotFound(err) { - nsDeletionErrors = errors.Join(nsDeletionErrors, fmt.Errorf("error deleting %v: %w", ns.Name, err)) - } - } - // remove the namespace from the list of namespaces to delete - // so that it is not deleted again in the defer block - f.namespacesToDelete = f.namespacesToDelete[1:] - } - } - - // Unsetting this is relevant for a following test that uses - // the same instance because it might not reach f.BeforeEach - // when some other BeforeEach skips the test first. - f.Namespace = nil - f.clientConfig = nil - f.ClientSet = nil - - // if we had errors deleting, report them now. - gomega.Expect(nsDeletionErrors).NotTo(gomega.HaveOccurred()) - }() - - // Close helm log file - err := f.HelmLogFile.Close() - gomega.Expect(err).To(gomega.BeNil()) -} - -// CreateNamespace creates a namespace for e2e testing. -func (f *Framework) CreateNamespace(ctx context.Context, baseName string, labels map[string]string) (*corev1.Namespace, error) { - createTestingNS := TestContext.CreateTestingNS - if createTestingNS == nil { - createTestingNS = CreateTestingNS - } - - if labels == nil { - labels = make(map[string]string) - } else { - labelsCopy := make(map[string]string) - for k, v := range labels { - labelsCopy[k] = v - } - labels = labelsCopy - } - - ns, err := createTestingNS(ctx, baseName, f.ClientSet, labels) - - // check ns instead of err to see if it's nil as we may - // fail to create serviceAccount in it. - f.AddNamespacesToDelete(ns) - - return ns, err -} - -// DeleteNamespace can be used to delete a namespace -func (f *Framework) DeleteNamespace(ctx context.Context, name string) { - defer func() { - err := f.ClientSet.CoreV1().Namespaces().Delete(ctx, name, metav1.DeleteOptions{}) - if err != nil && !apierrors.IsNotFound(err) { - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - } - err = WaitForNamespacesDeleted(ctx, f.ClientSet, []string{name}, DefaultNamespaceDeletionTimeout) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - }() -} - -// AddNamespacesToDelete adds one or more namespaces to be deleted when the test -// completes. -func (f *Framework) AddNamespacesToDelete(namespaces ...*corev1.Namespace) { - for _, ns := range namespaces { - if ns == nil { - continue - } - f.namespacesToDelete = append(f.namespacesToDelete, ns) - - } -} diff --git a/tests/e2e/framework/test_context.go b/tests/e2e/framework/test_context.go deleted file mode 100644 index 739b9aa6e..000000000 --- a/tests/e2e/framework/test_context.go +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package framework - -import ( - "context" - "flag" - "os" - - corev1 "k8s.io/api/core/v1" - clientset "k8s.io/client-go/kubernetes" - "k8s.io/client-go/tools/clientcmd" -) - -// CreateTestingNSFn is a func that is responsible for creating namespace used for executing e2e tests. -type CreateTestingNSFn func(ctx context.Context, baseName string, c clientset.Interface, labels map[string]string) (*corev1.Namespace, error) - -// TestContextType contains test settings and global state -type TestContextType struct { - KubeConfig string - KubeContext string - DeleteNamespace bool - DeleteNamespaceOnFailure bool - - HelmLogFile string - - // CreateTestingNS is responsible for creating namespace used for executing e2e tests. - // It accepts namespace base name, which will be prepended with e2e prefix, kube client - // and labels to be applied to a namespace. - CreateTestingNS CreateTestingNSFn -} - -// TestContext should be used by all tests to access common context data. -var TestContext = TestContextType{} - -// RegisterClusterFlags registers flags specific to the cluster e2e test suite. -func RegisterClusterFlags(flags *flag.FlagSet) { - flags.BoolVar(&TestContext.DeleteNamespace, "delete-namespace", true, "If true tests will delete namespace after completion. It is only designed to make debugging easier, DO NOT turn it off by default.") - flags.BoolVar(&TestContext.DeleteNamespaceOnFailure, "delete-namespace-on-failure", true, "If true, framework will delete test namespace on failure. Used only during test debugging.") - flags.StringVar(&TestContext.KubeConfig, clientcmd.RecommendedConfigPathFlag, os.Getenv(clientcmd.RecommendedConfigPathEnvVar), "Path to kubeconfig containing embedded authinfo.") - flags.StringVar(&TestContext.KubeContext, clientcmd.FlagContext, "", "kubeconfig context to use/override. If unset, will use value from 'current-context'") - flags.StringVar(&TestContext.HelmLogFile, "helm-log-file", "e2e-helm", "Path to the file where helm logs will be written.") -} diff --git a/tests/e2e/framework/util.go b/tests/e2e/framework/util.go deleted file mode 100644 index 78e06fd69..000000000 --- a/tests/e2e/framework/util.go +++ /dev/null @@ -1,156 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package framework - -import ( - "context" - "fmt" - "math/rand" - "strconv" - "strings" - "time" - - "github.com/onsi/ginkgo/v2" - corev1 "k8s.io/api/core/v1" - apierrors "k8s.io/apimachinery/pkg/api/errors" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/util/uuid" - "k8s.io/apimachinery/pkg/util/wait" - clientset "k8s.io/client-go/kubernetes" - restclient "k8s.io/client-go/rest" - "k8s.io/client-go/tools/clientcmd" - clientcmdapi "k8s.io/client-go/tools/clientcmd/api" -) - -const ( - // PollInterval is how often to Poll pods, nodes and claims. 
- PollInterval = 2 * time.Second -) - -// RunID is a unique identifier of the e2e run. -// Beware that this ID is not the same for all tests in the e2e run, because each Ginkgo node creates it separately. -var RunID = uuid.NewUUID() - -// RandomSuffix provides a random sequence to append to pods,services,rcs. -func RandomSuffix() string { - return strconv.Itoa(rand.Intn(10000)) -} - -// LoadConfig returns a config for a rest client with the UserAgent set to include the current test name. -func LoadConfig() (config *restclient.Config, err error) { - defer func() { - if err == nil && config != nil { - testDesc := ginkgo.CurrentSpecReport() - if len(testDesc.ContainerHierarchyTexts) > 0 { - testName := strings.Join(testDesc.ContainerHierarchyTexts, " ") - if len(testDesc.LeafNodeText) > 0 { - testName = testName + " " + testDesc.LeafNodeText - } - config.UserAgent = fmt.Sprintf("%s -- %s", restclient.DefaultKubernetesUserAgent(), testName) - } - } - }() - - c, err := restclientConfig(TestContext.KubeContext) - if err != nil { - if TestContext.KubeConfig == "" { - return restclient.InClusterConfig() - } - return nil, err - } - - return clientcmd.NewDefaultClientConfig(*c, &clientcmd.ConfigOverrides{}).ClientConfig() -} - -// restclientConfig returns a config holds the information needed to build connection to kubernetes clusters. -func restclientConfig(kubeContext string) (*clientcmdapi.Config, error) { - if TestContext.KubeConfig == "" { - return nil, fmt.Errorf("KubeConfig must be specified to load client config") - } - c, err := clientcmd.LoadFromFile(TestContext.KubeConfig) - if err != nil { - return nil, fmt.Errorf("error loading KubeConfig: %v", err.Error()) - } - if kubeContext != "" { - c.CurrentContext = kubeContext - } - return c, nil -} - -// CreateTestingNS should be used by every test, note that we append a common prefix to the provided test name. -// Please see NewFramework instead of using this directly. -func CreateTestingNS(ctx context.Context, baseName string, c clientset.Interface, labels map[string]string) (*corev1.Namespace, error) { - if labels == nil { - labels = map[string]string{} - } - labels["e2e-run"] = string(RunID) - - // We don't use ObjectMeta.GenerateName feature, as in case of API call - // failure we don't know whether the namespace was created and what is its - // name. - name := fmt.Sprintf("%v-%v", baseName, RandomSuffix()) - - namespaceObj := &corev1.Namespace{ - ObjectMeta: metav1.ObjectMeta{ - Name: name, - Namespace: "", - Labels: labels, - }, - Status: corev1.NamespaceStatus{}, - } - // Be robust about making the namespace creation call. - var got *corev1.Namespace - if err := wait.PollUntilContextTimeout(ctx, PollInterval, 30*time.Second, true, func(ctx context.Context) (bool, error) { - var err error - got, err = c.CoreV1().Namespaces().Create(ctx, namespaceObj, metav1.CreateOptions{}) - if err != nil { - if apierrors.IsAlreadyExists(err) { - // regenerate on conflict - namespaceObj.Name = fmt.Sprintf("%v-%v", baseName, RandomSuffix()) - } - return false, nil - } - return true, nil - }); err != nil { - return nil, err - } - - return got, nil -} - -// WaitForNamespacesDeleted waits for the namespaces to be deleted. 
-func WaitForNamespacesDeleted(ctx context.Context, c clientset.Interface, namespaces []string, timeout time.Duration) error { - ginkgo.By(fmt.Sprintf("Waiting for namespaces %+v to vanish", namespaces)) - nsMap := map[string]bool{} - for _, ns := range namespaces { - nsMap[ns] = true - } - // Now POLL until all namespaces have been eradicated. - return wait.PollUntilContextTimeout(ctx, 2*time.Second, timeout, true, - func(ctx context.Context) (bool, error) { - nsList, err := c.CoreV1().Namespaces().List(ctx, metav1.ListOptions{}) - if err != nil { - return false, err - } - for _, item := range nsList.Items { - if _, ok := nsMap[item.Name]; ok { - return false, nil - } - } - return true, nil - }) -} diff --git a/tests/e2e/gomega.go b/tests/e2e/gomega.go deleted file mode 100644 index 1fa4364e3..000000000 --- a/tests/e2e/gomega.go +++ /dev/null @@ -1,191 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package e2e - -import ( - "context" - "fmt" - "regexp" - "strings" - "time" - - . "github.com/onsi/gomega" - gomegatypes "github.com/onsi/gomega/types" - - corev1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - clientset "k8s.io/client-go/kubernetes" - - "github.com/NVIDIA/k8s-device-plugin/tests/e2e/common" -) - -type k8sLabels map[string]string - -// eventuallyNonControlPlaneNodes is a helper for asserting node properties -func eventuallyNonControlPlaneNodes(ctx context.Context, cli clientset.Interface) AsyncAssertion { - return Eventually(func(g Gomega, ctx context.Context) ([]corev1.Node, error) { - return common.GetNonControlPlaneNodes(ctx, cli) - }).WithPolling(1 * time.Second).WithTimeout(10 * time.Second).WithContext(ctx) -} - -// MatchLabels returns a specialized Gomega matcher for checking if a list of -// nodes are labeled as expected. -func MatchLabels(expectedNew map[string]k8sLabels, oldNodes []corev1.Node) gomegatypes.GomegaMatcher { - return &nodeListPropertyRegexpMatcher[k8sLabels]{ - propertyName: "labels", - expected: expectedNew, - oldNodes: oldNodes, - } -} - -// MatchCapacity returns a specialized Gomega matcher for checking if a list of -// nodes are configured as expected. -func MatchCapacity(expectedNew map[string]k8sLabels, oldNodes []corev1.Node) gomegatypes.GomegaMatcher { - return &nodeListPropertyRegexpMatcher[k8sLabels]{ - propertyName: "capacity", - expected: expectedNew, - oldNodes: oldNodes, - } -} - -// nodeListPropertyRegexpMatcher is a generic Gomega matcher for asserting one property a group of nodes. -type nodeListPropertyRegexpMatcher[T any] struct { - expected map[string]k8sLabels - oldNodes []corev1.Node - - propertyName string - node *corev1.Node //nolint:unused - missing []string //nolint:unused - invalidValue []string //nolint:unused -} - -// Match method of the GomegaMatcher interface. 
-func (m *nodeListPropertyRegexpMatcher[T]) Match(actual interface{}) (bool, error) { - nodes, ok := actual.([]corev1.Node) - if !ok { - return false, fmt.Errorf("expected []corev1.Node, got: %T", actual) - } - - switch m.propertyName { - case "labels": - return m.matchLabels(nodes), nil - case "capacity": - return m.matchCapacity(nodes), nil - default: - return true, nil - } - -} - -func (m *nodeListPropertyRegexpMatcher[T]) matchLabels(nodes []corev1.Node) bool { - targetNode := corev1.Node{} - for _, node := range nodes { - _, ok := m.expected[node.Name] - if !ok { - continue - } - targetNode = node - break - } - - m.node = &targetNode - - for labelKey, labelValue := range m.expected[targetNode.Name] { - // missing key - if _, ok := targetNode.Labels[labelKey]; !ok { - m.missing = append(m.missing, labelKey) - continue - } - // invalid value - regexMatcher := regexp.MustCompile(labelValue) - if !regexMatcher.MatchString(targetNode.Labels[labelKey]) { - m.invalidValue = append(m.invalidValue, fmt.Sprintf("%s: %s", labelKey, targetNode.Labels[labelKey])) - return false - } - } - - return true -} - -func (m *nodeListPropertyRegexpMatcher[T]) matchCapacity(nodes []corev1.Node) bool { - targetNode := corev1.Node{} - for _, node := range nodes { - _, ok := m.expected[node.Name] - if !ok { - continue - } - targetNode = node - break - } - - m.node = &targetNode - - for labelKey, labelValue := range m.expected[targetNode.Name] { - // missing key - rn := corev1.ResourceName(labelKey) - if _, ok := targetNode.Status.Capacity[rn]; !ok { - m.missing = append(m.missing, labelKey) - continue - } - // invalid value - capacity := targetNode.Status.Capacity[rn] - regexMatcher := regexp.MustCompile(labelValue) - if !regexMatcher.MatchString(capacity.String()) { - m.invalidValue = append(m.invalidValue, fmt.Sprintf("%s: %s", labelKey, capacity.String())) - return false - } - } - - return true -} - -// FailureMessage method of the GomegaMatcher interface. -func (m *nodeListPropertyRegexpMatcher[T]) FailureMessage(actual interface{}) string { - return m.message() -} - -// NegatedFailureMessage method of the GomegaMatcher interface. -func (m *nodeListPropertyRegexpMatcher[T]) NegatedFailureMessage(actual interface{}) string { - return fmt.Sprintf("Node %q matched unexpectedly", m.node.Name) -} - -// TODO remove nolint when golangci-lint is able to cope with generics -// -//nolint:unused -func (m *nodeListPropertyRegexpMatcher[T]) message() string { - msg := fmt.Sprintf("Node %q %s did not match:", m.node.Name, m.propertyName) - if len(m.missing) > 0 { - msg += fmt.Sprintf("\n missing:\n %s", strings.Join(m.missing, "\n ")) - } - if len(m.invalidValue) > 0 { - msg += fmt.Sprintf("\n invalid value:\n %s", strings.Join(m.invalidValue, "\n ")) - } - return msg -} - -// JobIsCompleted checks if a job is completed -func JobIsCompleted(ctx context.Context, cli clientset.Interface, namespace, podName string) bool { - pod, err := cli.CoreV1().Pods(namespace).Get(ctx, podName, metav1.GetOptions{}) - if err != nil { - return false - } - // Check if the pod's phase is Succeeded. - if pod.Status.Phase == "Succeeded" { - return true - } - return false -} diff --git a/tests/e2e/gpu-feature-discovery_test.go b/tests/e2e/gpu-feature-discovery_test.go index 5bc8aecfd..f66a935a5 100644 --- a/tests/e2e/gpu-feature-discovery_test.go +++ b/tests/e2e/gpu-feature-discovery_test.go @@ -1,5 +1,6 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -27,21 +28,15 @@ import ( helm "github.com/mittwald/go-helm-client" helmValues "github.com/mittwald/go-helm-client/values" - apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" - extclient "k8s.io/apiextensions-apiserver/pkg/client/clientset/clientset" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/rand" - nfdclient "sigs.k8s.io/node-feature-discovery/api/generated/clientset/versioned" - "github.com/NVIDIA/k8s-device-plugin/tests/e2e/common" "github.com/NVIDIA/k8s-device-plugin/tests/e2e/common/diagnostics" - "github.com/NVIDIA/k8s-device-plugin/tests/e2e/framework" + "github.com/NVIDIA/k8s-device-plugin/tests/e2e/internal" ) // Actual test suite -var _ = NVDescribe("GPU Feature Discovery", func() { - f := framework.NewFramework("gpu-feature-discovery") - +var _ = Describe("GPU Feature Discovery", Ordered, Label("gfd", "gpu", "e2e"), func() { expectedLabelPatterns := k8sLabels{ "nvidia.com/gfd.timestamp": "[0-9]{10}", "nvidia.com/cuda.driver.major": "[0-9]+", @@ -71,113 +66,108 @@ var _ = NVDescribe("GPU Feature Discovery", func() { "nodeFeature", } - Context("When deploying GFD", Ordered, func() { - // helm-chart is required - if *HelmChart == "" { - Fail("No helm-chart for GPU-Feature-Discovery specified") - } + // Init global suite vars + var ( + helmReleaseName string + chartSpec helm.ChartSpec + + collectLogsFrom []string + diagnosticsCollector diagnostics.Collector + ) + + values := helmValues.Options{ + Values: []string{ + fmt.Sprintf("image.repository=%s", ImageRepo), + fmt.Sprintf("image.tag=%s", ImageTag), + fmt.Sprintf("image.pullPolicy=%s", ImagePullPolicy), + "gfd.enabled=true", + "devicePlugin.enabled=false", + }, + } - // Init global suite vars vars - var ( - crds []*apiextensionsv1.CustomResourceDefinition - extClient *extclient.Clientset - nfdClient *nfdclient.Clientset - - chartSpec helm.ChartSpec - helmReleaseName string - - collectLogsFrom []string - diagnosticsCollector diagnostics.Collector - ) - - values := helmValues.Options{ - Values: []string{ - fmt.Sprintf("image.repository=%s", *ImageRepo), - fmt.Sprintf("image.tag=%s", *ImageTag), - fmt.Sprintf("image.pullPolicy=%s", *ImagePullPolicy), - "gfd.enabled=true", - "devicePlugin.enabled=false", - }, - } + // checkNodeFeatureObject is a helper function to check if NodeFeature object was created + checkNodeFeatureObject := func(ctx context.Context, name string) bool { + gfdNodeFeature := fmt.Sprintf("nvidia-features-for-%s", name) + _, err := nfdClient.NfdV1alpha1().NodeFeatures(testNamespace.Name).Get(ctx, gfdNodeFeature, metav1.GetOptions{}) + return err == nil + } - // checkNodeFeatureObject is a helper function to check if NodeFeature object was created - checkNodeFeatureObject := func(ctx context.Context, name string) bool { - gfdNodeFeature := fmt.Sprintf("nvidia-features-for-%s", name) - _, err := nfdClient.NfdV1alpha1().NodeFeatures(f.Namespace.Name).Get(ctx, gfdNodeFeature, metav1.GetOptions{}) - return err == nil - } + // check Collector objects + collectLogsFrom = defaultCollectorObjects + if CollectLogsFrom != "" && CollectLogsFrom != "default" { + collectLogsFrom = strings.Split(CollectLogsFrom, ",") + } - // check Collector objects - collectLogsFrom = defaultCollectorObjects - if 
*CollectLogsFrom != "" && *CollectLogsFrom != "default" { - collectLogsFrom = strings.Split(*CollectLogsFrom, ",") + BeforeAll(func(ctx SpecContext) { + helmReleaseName = "gfd-e2e-test" + rand.String(5) + + // reset Helm Client + chartSpec = helm.ChartSpec{ + ReleaseName: helmReleaseName, + ChartName: HelmChart, + Namespace: testNamespace.Name, + Wait: true, + Timeout: 1 * time.Minute, + ValuesOptions: values, + CleanupOnFail: true, } - BeforeAll(func(ctx context.Context) { - // Create clients for apiextensions and our CRD api - extClient = extclient.NewForConfigOrDie(f.ClientConfig()) - nfdClient = nfdclient.NewForConfigOrDie(f.ClientConfig()) - helmReleaseName = "gfd-e2e-test" + rand.String(5) - }) + By("Installing GFD Helm chart") + _, err := helmClient.InstallChart(ctx, &chartSpec, nil) + Expect(err).NotTo(HaveOccurred()) - JustBeforeEach(func(ctx context.Context) { - // reset Helm Client - chartSpec = helm.ChartSpec{ - ReleaseName: helmReleaseName, - ChartName: *HelmChart, - Namespace: f.Namespace.Name, - Wait: true, - Timeout: 1 * time.Minute, - ValuesOptions: values, - CleanupOnFail: true, - } - - By("Installing GFD Helm chart") - _, err := f.HelmClient.InstallChart(ctx, &chartSpec, nil) - Expect(err).NotTo(HaveOccurred()) - }) + // Wait for all DaemonSets to be ready + // Note: DaemonSet names are dynamically generated with the Helm release prefix, + // so we wait for all DaemonSets in the namespace rather than specific names + By("Waiting for all DaemonSets to be ready") + err = internal.WaitForDaemonSetsReady(ctx, clientSet, testNamespace.Name, "app.kubernetes.io/name=nvidia-device-plugin") + Expect(err).NotTo(HaveOccurred()) + }) - // Cleanup before next test run - AfterEach(func(ctx context.Context) { - // Run diagnostic collector if test failed - if CurrentSpecReport().Failed() { - var err error - diagnosticsCollector, err = diagnostics.New( - diagnostics.WithNamespace(f.Namespace.Name), - diagnostics.WithArtifactDir(*LogArtifactDir), - diagnostics.WithKubernetesClient(f.ClientSet), - diagnostics.WithNFDClient(nfdClient), - diagnostics.WithObjects(collectLogsFrom...), - ) - Expect(err).NotTo(HaveOccurred()) + AfterAll(func(ctx SpecContext) { + By("Uninstalling GFD Helm chart") + err := helmClient.UninstallReleaseByName(helmReleaseName) + if err != nil { + GinkgoWriter.Printf("Failed to uninstall helm release %s: %v\n", helmReleaseName, err) + } + }) - err = diagnosticsCollector.Collect(ctx) - Expect(err).NotTo(HaveOccurred()) - } - // Delete Helm release - err := f.HelmClient.UninstallReleaseByName(helmReleaseName) + // Cleanup before next test run + AfterEach(func(ctx SpecContext) { + // Run diagnostic collector if test failed + if CurrentSpecReport().Failed() { + var err error + diagnosticsCollector, err = diagnostics.New( + diagnostics.WithNamespace(testNamespace.Name), + diagnostics.WithArtifactDir(LogArtifactDir), + diagnostics.WithKubernetesClient(clientSet), + diagnostics.WithNFDClient(nfdClient), + diagnostics.WithObjects(collectLogsFrom...), + ) Expect(err).NotTo(HaveOccurred()) - // Cleanup environment - By("[Cleanup]\tCleaning up environment") - common.CleanupNode(ctx, f.ClientSet) - common.CleanupNFDObjects(ctx, nfdClient, f.Namespace.Name) - }) - AfterAll(func(ctx context.Context) { - for _, crd := range crds { - err := extClient.ApiextensionsV1().CustomResourceDefinitions().Delete(ctx, crd.Name, metav1.DeleteOptions{}) - Expect(err).NotTo(HaveOccurred()) - } - }) + err = diagnosticsCollector.Collect(ctx) + Expect(err).NotTo(HaveOccurred()) + } + }) - Context("and 
NV Driver is not installed", func() { - It("it should create nvidia.com timestamp label", func(ctx context.Context) { - nodeList, err := f.ClientSet.CoreV1().Nodes().List(ctx, metav1.ListOptions{}) + When("When deploying GFD", Ordered, Label("serial"), func() { + Context("NV Driver is not installed", func() { + BeforeEach(func() { + // Skip this context when driver is enabled since "NV Driver is installed" + // context provides more comprehensive testing + if NVIDIA_DRIVER_ENABLED { + Skip("Skipping driver-not-installed tests when NVIDIA_DRIVER_ENABLED is true") + } + }) + + It("it should create nvidia.com timestamp label", Label("timestamp"), func(ctx SpecContext) { + nodeList, err := clientSet.CoreV1().Nodes().List(ctx, metav1.ListOptions{}) Expect(err).NotTo(HaveOccurred()) Expect(len(nodeList.Items)).ToNot(BeZero()) // We pick one node targeted for our NodeFeature objects - nodes, err := common.GetNonControlPlaneNodes(ctx, f.ClientSet) + nodes, err := getNonControlPlaneNodes(ctx, clientSet) Expect(err).NotTo(HaveOccurred()) targetNodeName := nodes[0].Name @@ -188,52 +178,51 @@ var _ = NVDescribe("GPU Feature Discovery", func() { targetNodeName: { "nvidia.com/gfd.timestamp": "[0-9]{10}", }} - eventuallyNonControlPlaneNodes(ctx, f.ClientSet).Should(MatchLabels(labelChecker, nodes)) + eventuallyNonControlPlaneNodes(ctx, clientSet).Should(MatchLabels(labelChecker, nodes)) }) Context("and the NodeFeature API is enabled", func() { - It("gfd should create node feature object", func(ctx context.Context) { + It("gfd should create node feature object", Label("nodefeature"), func(ctx SpecContext) { By("Updating GFD Helm chart values") newValues := values newValues.Values = append(newValues.Values, "nfd.enableNodeFeatureApi=true") chartSpec.ValuesOptions = newValues chartSpec.Replace = true - _, err := f.HelmClient.UpgradeChart(ctx, &chartSpec, nil) + _, err := helmClient.UpgradeChart(ctx, &chartSpec, nil) Expect(err).NotTo(HaveOccurred()) By("Checking if NodeFeature CR object is created") - nodes, err := common.GetNonControlPlaneNodes(ctx, f.ClientSet) + nodes, err := getNonControlPlaneNodes(ctx, clientSet) Expect(err).NotTo(HaveOccurred()) targetNodeName := nodes[0].Name Expect(targetNodeName).ToNot(BeEmpty()) - Eventually(func() bool { + Eventually(func(g Gomega) bool { return checkNodeFeatureObject(ctx, targetNodeName) - }, 2*time.Minute, 5*time.Second).Should(BeTrue()) + }).WithContext(ctx).WithPolling(5 * time.Second).WithTimeout(2 * time.Minute).Should(BeTrue()) By("Checking that node labels are created from NodeFeature object") labelChecker := map[string]k8sLabels{ targetNodeName: { "nvidia.com/gfd.timestamp": "[0-9]{10}", }} - eventuallyNonControlPlaneNodes(ctx, f.ClientSet).Should(MatchLabels(labelChecker, nodes)) + eventuallyNonControlPlaneNodes(ctx, clientSet).Should(MatchLabels(labelChecker, nodes)) }) }) }) - Context("and NV Driver is installed", func() { - BeforeEach(func(ctx context.Context) { - // Skip test if NVIDIA_DRIVER_ENABLED is not set - if !*NVIDIA_DRIVER_ENABLED { + When("NV Driver is installed", func() { + It("it should create nvidia.com labels", Label("driver", "labels"), func(ctx SpecContext) { + if !NVIDIA_DRIVER_ENABLED { Skip("NVIDIA_DRIVER_ENABLED is not set") } - }) - It("it should create nvidia.com labels", func(ctx context.Context) { - nodeList, err := f.ClientSet.CoreV1().Nodes().List(ctx, metav1.ListOptions{}) + + By("Checking the node labels") + nodeList, err := clientSet.CoreV1().Nodes().List(ctx, metav1.ListOptions{}) Expect(err).NotTo(HaveOccurred()) 
Expect(len(nodeList.Items)).ToNot(BeZero()) // We pick one node targeted for our NodeFeature objects - nodes, err := common.GetNonControlPlaneNodes(ctx, f.ClientSet) + nodes, err := getNonControlPlaneNodes(ctx, clientSet) Expect(err).NotTo(HaveOccurred()) targetNodeName := nodes[0].Name @@ -242,32 +231,35 @@ var _ = NVDescribe("GPU Feature Discovery", func() { By("Checking the node labels") labelChecker := map[string]k8sLabels{ targetNodeName: expectedLabelPatterns} - eventuallyNonControlPlaneNodes(ctx, f.ClientSet).Should(MatchLabels(labelChecker, nodes)) + eventuallyNonControlPlaneNodes(ctx, clientSet).Should(MatchLabels(labelChecker, nodes)) }) Context("and the NodeFeature API is enabled", func() { - It("gfd should create node feature object", func(ctx context.Context) { + It("gfd should create node feature object", Label("driver", "nodefeature"), func(ctx SpecContext) { + if !NVIDIA_DRIVER_ENABLED { + Skip("NVIDIA_DRIVER_ENABLED is not set") + } By("Updating GFD Helm chart values") newValues := values newValues.Values = append(newValues.Values, "nfd.enableNodeFeatureApi=true") chartSpec.ValuesOptions = newValues chartSpec.Replace = true - _, err := f.HelmClient.UpgradeChart(ctx, &chartSpec, nil) + _, err := helmClient.UpgradeChart(ctx, &chartSpec, nil) Expect(err).NotTo(HaveOccurred()) By("Checking if NodeFeature CR object is created") - nodes, err := common.GetNonControlPlaneNodes(ctx, f.ClientSet) + nodes, err := getNonControlPlaneNodes(ctx, clientSet) Expect(err).NotTo(HaveOccurred()) targetNodeName := nodes[0].Name Expect(targetNodeName).ToNot(BeEmpty()) - Eventually(func() bool { + Eventually(func(g Gomega) bool { return checkNodeFeatureObject(ctx, targetNodeName) - }, 2*time.Minute, 5*time.Second).Should(BeTrue()) + }).WithContext(ctx).WithPolling(5 * time.Second).WithTimeout(2 * time.Minute).Should(BeTrue()) By("Checking that node labels are created from NodeFeature CR object") checkForLabels := map[string]k8sLabels{ targetNodeName: expectedLabelPatterns} - eventuallyNonControlPlaneNodes(ctx, f.ClientSet).Should(MatchLabels(checkForLabels, nodes)) + eventuallyNonControlPlaneNodes(ctx, clientSet).Should(MatchLabels(checkForLabels, nodes)) }) }) }) diff --git a/tests/e2e/infra/aws.yaml b/tests/e2e/infra/aws.yaml index 251d9ba2a..6de73f56e 100644 --- a/tests/e2e/infra/aws.yaml +++ b/tests/e2e/infra/aws.yaml @@ -11,27 +11,16 @@ spec: instance: type: g4dn.xlarge region: us-west-1 - ingressIpRanges: - - 18.190.12.32/32 - - 3.143.46.93/32 - - 52.15.119.136/32 - - 35.155.108.162/32 - - 35.162.190.51/32 - - 54.201.61.24/32 - - 52.24.205.48/32 - - 44.235.4.62/32 - - 44.230.241.223/32 image: architecture: amd64 - imageId: ami-0ce2cb35386fc22e9 containerRuntime: install: true name: containerd - nvidiaContainerToolkit: - install: true nvidiaDriver: install: true + nvidiaContainerToolkit: + install: true + enableCDI: true kubernetes: install: true installer: kubeadm - version: v1.28.5 diff --git a/tests/e2e/internal/kube.go b/tests/e2e/internal/kube.go new file mode 100644 index 000000000..41031f135 --- /dev/null +++ b/tests/e2e/internal/kube.go @@ -0,0 +1,288 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package internal + +import ( + "context" + "fmt" + "time" + + . "github.com/onsi/gomega" + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/labels" + "k8s.io/client-go/kubernetes" +) + +const ( + // DefaultPollInterval for Eventually checks + DefaultPollInterval = 2 * time.Second + // DefaultTimeout for Eventually checks + DefaultTimeout = 5 * time.Minute +) + +// WaitForDaemonSetRollout waits for a DaemonSet to complete its rollout +func WaitForDaemonSetRollout(ctx context.Context, client kubernetes.Interface, namespace, name string) error { + EventuallyWithOffset(1, func(g Gomega) error { + ds, err := client.AppsV1().DaemonSets(namespace).Get(ctx, name, metav1.GetOptions{}) + if err != nil { + return err + } + + // Check if rollout is complete + if ds.Status.DesiredNumberScheduled == 0 { + return fmt.Errorf("daemonset %s/%s has 0 desired pods", namespace, name) + } + + if ds.Status.NumberReady != ds.Status.DesiredNumberScheduled { + return fmt.Errorf("daemonset %s/%s rollout incomplete: %d/%d pods ready", + namespace, name, ds.Status.NumberReady, ds.Status.DesiredNumberScheduled) + } + + if ds.Status.UpdatedNumberScheduled != ds.Status.DesiredNumberScheduled { + return fmt.Errorf("daemonset %s/%s update incomplete: %d/%d pods updated", + namespace, name, ds.Status.UpdatedNumberScheduled, ds.Status.DesiredNumberScheduled) + } + + // Check generation to ensure we're looking at the latest spec + if ds.Generation != ds.Status.ObservedGeneration { + return fmt.Errorf("daemonset %s/%s generation mismatch: %d != %d", + namespace, name, ds.Generation, ds.Status.ObservedGeneration) + } + + return nil + }).WithContext(ctx).WithPolling(DefaultPollInterval).WithTimeout(DefaultTimeout).Should(Succeed()) + return nil +} + +// WaitForAllDaemonSetsReady waits for all DaemonSets in a namespace to be ready +func WaitForAllDaemonSetsReady(ctx context.Context, client kubernetes.Interface, namespace string) error { + return WaitForDaemonSetsReady(ctx, client, namespace, "") +} + +// WaitForDaemonSetsReady waits for DaemonSets in a namespace to be ready, optionally filtered by label selector +func WaitForDaemonSetsReady(ctx context.Context, client kubernetes.Interface, namespace, labelSelector string) error { + EventuallyWithOffset(1, func(g Gomega) error { + dsList, err := client.AppsV1().DaemonSets(namespace).List(ctx, metav1.ListOptions{ + LabelSelector: labelSelector, + }) + if err != nil { + return err + } + + if len(dsList.Items) == 0 { + return fmt.Errorf("no daemonsets found in namespace %s with selector '%s'", namespace, labelSelector) + } + + for _, ds := range dsList.Items { + // Skip if no pods are desired + if ds.Status.DesiredNumberScheduled == 0 { + continue + } + + if ds.Status.NumberReady != ds.Status.DesiredNumberScheduled { + return fmt.Errorf("daemonset %s/%s rollout incomplete: %d/%d pods ready", + namespace, ds.Name, ds.Status.NumberReady, ds.Status.DesiredNumberScheduled) + } + + if ds.Status.UpdatedNumberScheduled != ds.Status.DesiredNumberScheduled { + return 
fmt.Errorf("daemonset %s/%s update incomplete: %d/%d pods updated", + namespace, ds.Name, ds.Status.UpdatedNumberScheduled, ds.Status.DesiredNumberScheduled) + } + + // Check generation to ensure we're looking at the latest spec + if ds.Generation != ds.Status.ObservedGeneration { + return fmt.Errorf("daemonset %s/%s generation mismatch: %d != %d", + namespace, ds.Name, ds.Generation, ds.Status.ObservedGeneration) + } + } + + return nil + }).WithContext(ctx).WithPolling(DefaultPollInterval).WithTimeout(DefaultTimeout).Should(Succeed()) + return nil +} + +// WaitForDaemonSetPodsReady waits for all pods of a DaemonSet to be ready +func WaitForDaemonSetPodsReady(ctx context.Context, client kubernetes.Interface, namespace, name string) error { + EventuallyWithOffset(1, func(g Gomega) error { + ds, err := client.AppsV1().DaemonSets(namespace).Get(ctx, name, metav1.GetOptions{}) + if err != nil { + return err + } + + selector, err := metav1.LabelSelectorAsSelector(ds.Spec.Selector) + if err != nil { + return fmt.Errorf("invalid selector: %v", err) + } + + pods, err := client.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{ + LabelSelector: selector.String(), + }) + if err != nil { + return err + } + + if len(pods.Items) == 0 { + return fmt.Errorf("no pods found for daemonset %s/%s", namespace, name) + } + + for _, pod := range pods.Items { + if !isPodReady(&pod) { + return fmt.Errorf("pod %s/%s is not ready", pod.Namespace, pod.Name) + } + } + + return nil + }).WithContext(ctx).WithPolling(DefaultPollInterval).WithTimeout(DefaultTimeout).Should(Succeed()) + return nil +} + +// WaitForNodeLabels waits for specific labels to appear on nodes +func WaitForNodeLabels(ctx context.Context, client kubernetes.Interface, labelSelector string, expectedLabels map[string]string) error { + EventuallyWithOffset(1, func(g Gomega) error { + nodes, err := client.CoreV1().Nodes().List(ctx, metav1.ListOptions{ + LabelSelector: labelSelector, + }) + if err != nil { + return err + } + + if len(nodes.Items) == 0 { + return fmt.Errorf("no nodes found with selector: %s", labelSelector) + } + + // Check each node has the expected labels + for _, node := range nodes.Items { + for key, expectedValue := range expectedLabels { + actualValue, exists := node.Labels[key] + if !exists { + return fmt.Errorf("node %s missing label: %s", node.Name, key) + } + if expectedValue != "" && actualValue != expectedValue { + return fmt.Errorf("node %s label %s=%s, expected %s", + node.Name, key, actualValue, expectedValue) + } + } + } + + return nil + }).WithContext(ctx).WithPolling(DefaultPollInterval).WithTimeout(DefaultTimeout).Should(Succeed()) + return nil +} + +// WaitForGFDLabels waits for GPU Feature Discovery labels on nodes +func WaitForGFDLabels(ctx context.Context, client kubernetes.Interface, nodeName string) error { + gfdLabels := []string{ + "nvidia.com/gfd.timestamp", + "nvidia.com/cuda.driver.major", + "nvidia.com/cuda.driver.minor", + "nvidia.com/gpu.family", + "nvidia.com/gpu.machine", + "nvidia.com/gpu.memory", + "nvidia.com/gpu.product", + } + + EventuallyWithOffset(1, func(g Gomega) error { + node, err := client.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + if err != nil { + return err + } + + for _, label := range gfdLabels { + if _, exists := node.Labels[label]; !exists { + return fmt.Errorf("node %s missing GFD label: %s", nodeName, label) + } + } + + return nil + }).WithContext(ctx).WithPolling(DefaultPollInterval).WithTimeout(DefaultTimeout).Should(Succeed()) + return nil +} + +// 
WaitForPodsRunning waits for pods matching a selector to be running +func WaitForPodsRunning(ctx context.Context, client kubernetes.Interface, namespace string, selector labels.Selector) error { + EventuallyWithOffset(1, func(g Gomega) error { + pods, err := client.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{ + LabelSelector: selector.String(), + }) + if err != nil { + return err + } + + if len(pods.Items) == 0 { + return fmt.Errorf("no pods found matching selector: %s", selector.String()) + } + + for _, pod := range pods.Items { + if pod.Status.Phase != corev1.PodRunning { + return fmt.Errorf("pod %s/%s is %s, not Running", pod.Namespace, pod.Name, pod.Status.Phase) + } + } + + return nil + }).WithContext(ctx).WithPolling(DefaultPollInterval).WithTimeout(DefaultTimeout).Should(Succeed()) + return nil +} + +// WaitForDeploymentRollout waits for a deployment to complete its rollout +func WaitForDeploymentRollout(ctx context.Context, client kubernetes.Interface, namespace, name string) error { + EventuallyWithOffset(1, func(g Gomega) error { + deployment, err := client.AppsV1().Deployments(namespace).Get(ctx, name, metav1.GetOptions{}) + if err != nil { + return err + } + + // Check if the deployment is complete + for _, condition := range deployment.Status.Conditions { + if condition.Type == appsv1.DeploymentProgressing { + if condition.Status != corev1.ConditionTrue { + return fmt.Errorf("deployment %s/%s is not progressing: %s", namespace, name, condition.Message) + } + } + if condition.Type == appsv1.DeploymentAvailable { + if condition.Status != corev1.ConditionTrue { + return fmt.Errorf("deployment %s/%s is not available: %s", namespace, name, condition.Message) + } + } + } + + if deployment.Status.UpdatedReplicas != *deployment.Spec.Replicas { + return fmt.Errorf("deployment %s/%s update incomplete: %d/%d replicas updated", + namespace, name, deployment.Status.UpdatedReplicas, *deployment.Spec.Replicas) + } + + if deployment.Status.ReadyReplicas != *deployment.Spec.Replicas { + return fmt.Errorf("deployment %s/%s not ready: %d/%d replicas ready", + namespace, name, deployment.Status.ReadyReplicas, *deployment.Spec.Replicas) + } + + return nil + }).WithContext(ctx).WithPolling(DefaultPollInterval).WithTimeout(DefaultTimeout).Should(Succeed()) + return nil +} + +// isPodReady checks if a pod is ready +func isPodReady(pod *corev1.Pod) bool { + for _, condition := range pod.Status.Conditions { + if condition.Type == corev1.PodReady { + return condition.Status == corev1.ConditionTrue + } + } + return false +} diff --git a/tests/vendor/k8s.io/apimachinery/pkg/util/uuid/uuid.go b/tests/vendor/k8s.io/apimachinery/pkg/util/uuid/uuid.go deleted file mode 100644 index 1fa351aab..000000000 --- a/tests/vendor/k8s.io/apimachinery/pkg/util/uuid/uuid.go +++ /dev/null @@ -1,27 +0,0 @@ -/* -Copyright 2014 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/
-
-package uuid
-
-import (
-	"github.com/google/uuid"
-
-	"k8s.io/apimachinery/pkg/types"
-)
-
-func NewUUID() types.UID {
-	return types.UID(uuid.New().String())
-}
diff --git a/tests/vendor/modules.txt b/tests/vendor/modules.txt
index 4a7a8d3b5..5f36422dd 100644
--- a/tests/vendor/modules.txt
+++ b/tests/vendor/modules.txt
@@ -840,7 +840,6 @@ k8s.io/apimachinery/pkg/util/remotecommand
 k8s.io/apimachinery/pkg/util/runtime
 k8s.io/apimachinery/pkg/util/sets
 k8s.io/apimachinery/pkg/util/strategicpatch
-k8s.io/apimachinery/pkg/util/uuid
 k8s.io/apimachinery/pkg/util/validation
 k8s.io/apimachinery/pkg/util/validation/field
 k8s.io/apimachinery/pkg/util/version

From 67e0b073072965dccf7aae656a923990c2b345f4 Mon Sep 17 00:00:00 2001
From: Evan Lezar
Date: Mon, 15 Sep 2025 16:54:00 +0200
Subject: [PATCH 2/6] [no-relnote] Minor cleanups

Signed-off-by: Evan Lezar
---
 tests/e2e/device-plugin_test.go         |  10 +-
 tests/e2e/e2e_test.go                   |   8 +-
 tests/e2e/gpu-feature-discovery_test.go | 206 +++++++++--------------
 3 files changed, 87 insertions(+), 137 deletions(-)

diff --git a/tests/e2e/device-plugin_test.go b/tests/e2e/device-plugin_test.go
index 1d8540333..1e1fc7688 100644
--- a/tests/e2e/device-plugin_test.go
+++ b/tests/e2e/device-plugin_test.go
@@ -19,6 +19,7 @@ package e2e
 
 import (
 	"fmt"
+	"path/filepath"
 	"strings"
 	"time"
 
@@ -28,6 +29,7 @@ import (
 	helm "github.com/mittwald/go-helm-client"
 	helmValues "github.com/mittwald/go-helm-client/values"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/util/rand"
 
 	"github.com/NVIDIA/k8s-device-plugin/tests/e2e/common/diagnostics"
 	"github.com/NVIDIA/k8s-device-plugin/tests/e2e/internal"
@@ -39,7 +41,7 @@ const (
 
 // Actual test suite
 var _ = Describe("GPU Device Plugin", Ordered, Label("gpu", "e2e", "device-plugin"), func() {
-	// Init global suite vars vars
+	// Init global suite vars
 	var (
 		helmReleaseName string
 		chartSpec       helm.ChartSpec
@@ -74,7 +76,7 @@ var _ = Describe("GPU Device Plugin", Ordered, Label("gpu", "e2e", "device-plugi
 
 	BeforeAll(func(ctx SpecContext) {
 		// Create clients for apiextensions and our CRD api
-		helmReleaseName = "nvdp-e2e-test-" + randomSuffix()
+		helmReleaseName = "nvdp-e2e-test-" + rand.String(5)
 
 		chartSpec = helm.ChartSpec{
 			ReleaseName:     helmReleaseName,
@@ -145,9 +147,9 @@ var _ = Describe("GPU Device Plugin", Ordered, Label("gpu", "e2e", "device-plugi
 		})
 		It("it should run GPU jobs", Label("gpu-job"), func(ctx SpecContext) {
 			By("Creating a GPU job")
-			jobNames, err := CreateOrUpdateJobsFromFile(ctx, clientSet, "job-1.yaml", testNamespace.Name)
+			jobNames, err := CreateOrUpdateJobsFromFile(ctx, clientSet, testNamespace.Name, filepath.Join(projectRoot, "testdata", "job-1.yaml"))
 			Expect(err).NotTo(HaveOccurred())
-			Expect(jobNames).NotTo(BeEmpty())
+			Expect(jobNames).To(HaveLen(1))
 
 			// Defer cleanup for the job
 			DeferCleanup(func(ctx SpecContext) {
diff --git a/tests/e2e/e2e_test.go b/tests/e2e/e2e_test.go
index 54b192cf0..97b0c685e 100644
--- a/tests/e2e/e2e_test.go
+++ b/tests/e2e/e2e_test.go
@@ -93,6 +93,7 @@ var (
 	ctx         context.Context
 
 	packagePath string
+	projectRoot string
 )
 
 func TestMain(t *testing.T) {
@@ -103,6 +104,7 @@ func TestMain(t *testing.T) {
 	// get the package path
 	_, thisFile, _, _ := runtime.Caller(0)
 	packagePath = filepath.Dir(thisFile)
+	projectRoot = filepath.Join(packagePath, "..", "..")
 
 	ctx = context.Background()
 	getTestEnv()
@@ -286,8 +288,6 @@ func CreateTestingNS(baseName string, c clientset.Interface, labels map[string]s
 type k8sLabels map[string]string
 
 //
eventuallyNonControlPlaneNodes is a helper for asserting node properties -// -//nolint:unused func eventuallyNonControlPlaneNodes(ctx context.Context, cli clientset.Interface) AsyncAssertion { return Eventually(func(g Gomega) ([]corev1.Node, error) { return getNonControlPlaneNodes(ctx, cli) @@ -498,8 +498,8 @@ func getNode(nodes []corev1.Node, nodeName string) corev1.Node { } // CreateOrUpdateJobsFromFile creates or updates jobs from a file -func CreateOrUpdateJobsFromFile(ctx context.Context, cli clientset.Interface, filename, namespace string) ([]string, error) { - jobs, err := newJobFromfile(filepath.Join(packagePath, "..", "..", "testdata", filename)) +func CreateOrUpdateJobsFromFile(ctx context.Context, cli clientset.Interface, namespace string, filename string) ([]string, error) { + jobs, err := newJobFromfile(filename) if err != nil { return nil, fmt.Errorf("failed to create Job from file: %w", err) } diff --git a/tests/e2e/gpu-feature-discovery_test.go b/tests/e2e/gpu-feature-discovery_test.go index f66a935a5..9eb1399c5 100644 --- a/tests/e2e/gpu-feature-discovery_test.go +++ b/tests/e2e/gpu-feature-discovery_test.go @@ -35,37 +35,28 @@ import ( "github.com/NVIDIA/k8s-device-plugin/tests/e2e/internal" ) +var expectedLabelPatterns = k8sLabels{ + "nvidia.com/gfd.timestamp": "[0-9]{10}", + "nvidia.com/cuda.driver.major": "[0-9]+", + "nvidia.com/cuda.driver.minor": "[0-9]+", + "nvidia.com/cuda.driver.rev": "[0-9]*", + "nvidia.com/cuda.runtime.major": "[0-9]+", + "nvidia.com/cuda.runtime.minor": "[0-9]+", + "nvidia.com/gpu.machine": ".*", + "nvidia.com/gpu.count": "[0-9]+", + "nvidia.com/gpu.replicas": "[0-9]+", + "nvidia.com/gpu.sharing-strategy": "[none|mps|time-slicing]", + "nvidia.com/gpu.product": "[A-Za-z_-]+", + "nvidia.com/gpu.memory": "[0-9]+", + "nvidia.com/gpu.family": "[a-z]+", + "nvidia.com/mig.capable": "[true|false]", + "nvidia.com/gpu.compute.major": "[0-9]+", + "nvidia.com/gpu.compute.minor": "[0-9]+", + "nvidia.com/mps.capable": "[true|false]", +} + // Actual test suite var _ = Describe("GPU Feature Discovery", Ordered, Label("gfd", "gpu", "e2e"), func() { - expectedLabelPatterns := k8sLabels{ - "nvidia.com/gfd.timestamp": "[0-9]{10}", - "nvidia.com/cuda.driver.major": "[0-9]+", - "nvidia.com/cuda.driver.minor": "[0-9]+", - "nvidia.com/cuda.driver.rev": "[0-9]*", - "nvidia.com/cuda.runtime.major": "[0-9]+", - "nvidia.com/cuda.runtime.minor": "[0-9]+", - "nvidia.com/gpu.machine": ".*", - "nvidia.com/gpu.count": "[0-9]+", - "nvidia.com/gpu.replicas": "[0-9]+", - "nvidia.com/gpu.sharing-strategy": "[none|mps|time-slicing]", - "nvidia.com/gpu.product": "[A-Za-z_-]+", - "nvidia.com/gpu.memory": "[0-9]+", - "nvidia.com/gpu.family": "[a-z]+", - "nvidia.com/mig.capable": "[true|false]", - "nvidia.com/gpu.compute.major": "[0-9]+", - "nvidia.com/gpu.compute.minor": "[0-9]+", - "nvidia.com/mps.capable": "[true|false]", - } - - defaultCollectorObjects := []string{ - "pods", - "nodes", - "namespaces", - "deployments", - "daemonsets", - "nodeFeature", - } - // Init global suite vars var ( helmReleaseName string @@ -75,6 +66,18 @@ var _ = Describe("GPU Feature Discovery", Ordered, Label("gfd", "gpu", "e2e"), f diagnosticsCollector diagnostics.Collector ) + collectLogsFrom = []string{ + "pods", + "nodes", + "namespaces", + "deployments", + "daemonsets", + "nodeFeature", + } + if CollectLogsFrom != "" && CollectLogsFrom != "default" { + collectLogsFrom = strings.Split(CollectLogsFrom, ",") + } + values := helmValues.Options{ Values: []string{ fmt.Sprintf("image.repository=%s", 
ImageRepo), @@ -92,12 +95,6 @@ var _ = Describe("GPU Feature Discovery", Ordered, Label("gfd", "gpu", "e2e"), f return err == nil } - // check Collector objects - collectLogsFrom = defaultCollectorObjects - if CollectLogsFrom != "" && CollectLogsFrom != "default" { - collectLogsFrom = strings.Split(CollectLogsFrom, ",") - } - BeforeAll(func(ctx SpecContext) { helmReleaseName = "gfd-e2e-test" + rand.String(5) @@ -152,115 +149,66 @@ var _ = Describe("GPU Feature Discovery", Ordered, Label("gfd", "gpu", "e2e"), f }) When("When deploying GFD", Ordered, Label("serial"), func() { - Context("NV Driver is not installed", func() { - BeforeEach(func() { - // Skip this context when driver is enabled since "NV Driver is installed" - // context provides more comprehensive testing - if NVIDIA_DRIVER_ENABLED { - Skip("Skipping driver-not-installed tests when NVIDIA_DRIVER_ENABLED is true") - } - }) - - It("it should create nvidia.com timestamp label", Label("timestamp"), func(ctx SpecContext) { - nodeList, err := clientSet.CoreV1().Nodes().List(ctx, metav1.ListOptions{}) - Expect(err).NotTo(HaveOccurred()) - Expect(len(nodeList.Items)).ToNot(BeZero()) - - // We pick one node targeted for our NodeFeature objects - nodes, err := getNonControlPlaneNodes(ctx, clientSet) - Expect(err).NotTo(HaveOccurred()) - - targetNodeName := nodes[0].Name - Expect(targetNodeName).ToNot(BeEmpty()) + It("it should create nvidia.com labels", func(ctx SpecContext) { + nodeList, err := clientSet.CoreV1().Nodes().List(ctx, metav1.ListOptions{}) + Expect(err).NotTo(HaveOccurred()) + Expect(len(nodeList.Items)).ToNot(BeZero()) - By("Checking the node labels") - labelChecker := map[string]k8sLabels{ - targetNodeName: { - "nvidia.com/gfd.timestamp": "[0-9]{10}", - }} - eventuallyNonControlPlaneNodes(ctx, clientSet).Should(MatchLabels(labelChecker, nodes)) - }) - Context("and the NodeFeature API is enabled", func() { - It("gfd should create node feature object", Label("nodefeature"), func(ctx SpecContext) { - By("Updating GFD Helm chart values") - newValues := values - newValues.Values = append(newValues.Values, "nfd.enableNodeFeatureApi=true") - chartSpec.ValuesOptions = newValues - chartSpec.Replace = true - _, err := helmClient.UpgradeChart(ctx, &chartSpec, nil) - Expect(err).NotTo(HaveOccurred()) + // We pick one node targeted for our NodeFeature objects + nodes, err := getNonControlPlaneNodes(ctx, clientSet) + Expect(err).NotTo(HaveOccurred()) - By("Checking if NodeFeature CR object is created") - nodes, err := getNonControlPlaneNodes(ctx, clientSet) - Expect(err).NotTo(HaveOccurred()) + targetNodeName := nodes[0].Name + Expect(targetNodeName).ToNot(BeEmpty()) - targetNodeName := nodes[0].Name - Expect(targetNodeName).ToNot(BeEmpty()) - Eventually(func(g Gomega) bool { - return checkNodeFeatureObject(ctx, targetNodeName) - }).WithContext(ctx).WithPolling(5 * time.Second).WithTimeout(2 * time.Minute).Should(BeTrue()) + By("Checking the node labels") - By("Checking that node labels are created from NodeFeature object") - labelChecker := map[string]k8sLabels{ - targetNodeName: { - "nvidia.com/gfd.timestamp": "[0-9]{10}", - }} - eventuallyNonControlPlaneNodes(ctx, clientSet).Should(MatchLabels(labelChecker, nodes)) - }) - }) - }) - - When("NV Driver is installed", func() { - It("it should create nvidia.com labels", Label("driver", "labels"), func(ctx SpecContext) { - if !NVIDIA_DRIVER_ENABLED { - Skip("NVIDIA_DRIVER_ENABLED is not set") + labelChecker := map[string]k8sLabels{ + targetNodeName: expectedLabelPatterns, + } + if 
!NVIDIA_DRIVER_ENABLED { + // If the NVIDIA driver is not installed, we only check the + // timestamp label to allow for local testing on non-GPU + // systems. + labelChecker[targetNodeName] = k8sLabels{ + "nvidia.com/gfd.timestamp": "[0-9]{10}", } - - By("Checking the node labels") - nodeList, err := clientSet.CoreV1().Nodes().List(ctx, metav1.ListOptions{}) + } + eventuallyNonControlPlaneNodes(ctx, clientSet).Should(MatchLabels(labelChecker, nodes)) + }) + Context("and the NodeFeature API is enabled", func() { + It("gfd should create node feature object", Label("nodefeature"), func(ctx SpecContext) { + By("Updating GFD Helm chart values") + newValues := values + newValues.Values = append(newValues.Values, "nfd.enableNodeFeatureApi=true") + chartSpec.ValuesOptions = newValues + chartSpec.Replace = true + _, err := helmClient.UpgradeChart(ctx, &chartSpec, nil) Expect(err).NotTo(HaveOccurred()) - Expect(len(nodeList.Items)).ToNot(BeZero()) - // We pick one node targeted for our NodeFeature objects + By("Checking if NodeFeature CR object is created") nodes, err := getNonControlPlaneNodes(ctx, clientSet) Expect(err).NotTo(HaveOccurred()) targetNodeName := nodes[0].Name Expect(targetNodeName).ToNot(BeEmpty()) + Eventually(func(g Gomega) bool { + return checkNodeFeatureObject(ctx, targetNodeName) + }).WithContext(ctx).WithPolling(5 * time.Second).WithTimeout(2 * time.Minute).Should(BeTrue()) - By("Checking the node labels") + By("Checking that node labels are created from NodeFeature object") labelChecker := map[string]k8sLabels{ - targetNodeName: expectedLabelPatterns} - eventuallyNonControlPlaneNodes(ctx, clientSet).Should(MatchLabels(labelChecker, nodes)) - }) - Context("and the NodeFeature API is enabled", func() { - It("gfd should create node feature object", Label("driver", "nodefeature"), func(ctx SpecContext) { - if !NVIDIA_DRIVER_ENABLED { - Skip("NVIDIA_DRIVER_ENABLED is not set") + targetNodeName: expectedLabelPatterns, + } + if !NVIDIA_DRIVER_ENABLED { + // If the NVIDIA driver is not installed, we only check the + // timestamp label to allow for local testing on non-GPU + // systems. 
+ labelChecker[targetNodeName] = k8sLabels{ + "nvidia.com/gfd.timestamp": "[0-9]{10}", } - By("Updating GFD Helm chart values") - newValues := values - newValues.Values = append(newValues.Values, "nfd.enableNodeFeatureApi=true") - chartSpec.ValuesOptions = newValues - chartSpec.Replace = true - _, err := helmClient.UpgradeChart(ctx, &chartSpec, nil) - Expect(err).NotTo(HaveOccurred()) - - By("Checking if NodeFeature CR object is created") - nodes, err := getNonControlPlaneNodes(ctx, clientSet) - Expect(err).NotTo(HaveOccurred()) - - targetNodeName := nodes[0].Name - Expect(targetNodeName).ToNot(BeEmpty()) - Eventually(func(g Gomega) bool { - return checkNodeFeatureObject(ctx, targetNodeName) - }).WithContext(ctx).WithPolling(5 * time.Second).WithTimeout(2 * time.Minute).Should(BeTrue()) - - By("Checking that node labels are created from NodeFeature CR object") - checkForLabels := map[string]k8sLabels{ - targetNodeName: expectedLabelPatterns} - eventuallyNonControlPlaneNodes(ctx, clientSet).Should(MatchLabels(checkForLabels, nodes)) - }) + } + eventuallyNonControlPlaneNodes(ctx, clientSet).Should(MatchLabels(labelChecker, nodes)) }) }) }) From df67aa8b191390dc8065ca76c3ac8e56c0cf5dcb Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Wed, 17 Sep 2025 14:40:11 +0200 Subject: [PATCH 3/6] TOFIX: reduce diff Signed-off-by: Evan Lezar --- tests/e2e/device-plugin_test.go | 3 +- tests/e2e/e2e_test.go | 42 ++++ tests/e2e/gpu-feature-discovery_test.go | 3 +- tests/e2e/internal/kube.go | 288 ------------------------ 4 files changed, 44 insertions(+), 292 deletions(-) delete mode 100644 tests/e2e/internal/kube.go diff --git a/tests/e2e/device-plugin_test.go b/tests/e2e/device-plugin_test.go index 1e1fc7688..917d68ca4 100644 --- a/tests/e2e/device-plugin_test.go +++ b/tests/e2e/device-plugin_test.go @@ -32,7 +32,6 @@ import ( "k8s.io/apimachinery/pkg/util/rand" "github.com/NVIDIA/k8s-device-plugin/tests/e2e/common/diagnostics" - "github.com/NVIDIA/k8s-device-plugin/tests/e2e/internal" ) const ( @@ -96,7 +95,7 @@ var _ = Describe("GPU Device Plugin", Ordered, Label("gpu", "e2e", "device-plugi // Note: DaemonSet names are dynamically generated with the Helm release prefix, // so we wait for all DaemonSets in the namespace rather than specific names By("Waiting for all DaemonSets to be ready") - err = internal.WaitForDaemonSetsReady(ctx, clientSet, testNamespace.Name, "app.kubernetes.io/name=nvidia-device-plugin") + err = waitForDaemonSetsReady(ctx, clientSet, testNamespace.Name, "app.kubernetes.io/name=nvidia-device-plugin") Expect(err).NotTo(HaveOccurred()) }) diff --git a/tests/e2e/e2e_test.go b/tests/e2e/e2e_test.go index 97b0c685e..2c6c1661c 100644 --- a/tests/e2e/e2e_test.go +++ b/tests/e2e/e2e_test.go @@ -874,3 +874,45 @@ func getEnvVarOrDefault[T any](key string, defaultValue T) T { } return val } + +// waitForDaemonSetsReady waits for DaemonSets in a namespace to be ready, optionally filtered by label selector +func waitForDaemonSetsReady(ctx context.Context, client kubernetes.Interface, namespace, labelSelector string) error { + EventuallyWithOffset(1, func(g Gomega) error { + dsList, err := client.AppsV1().DaemonSets(namespace).List(ctx, metav1.ListOptions{ + LabelSelector: labelSelector, + }) + if err != nil { + return err + } + + if len(dsList.Items) == 0 { + return fmt.Errorf("no daemonsets found in namespace %s with selector '%s'", namespace, labelSelector) + } + + for _, ds := range dsList.Items { + // Skip if no pods are desired + if ds.Status.DesiredNumberScheduled == 0 { + continue + } + + 
if ds.Status.NumberReady != ds.Status.DesiredNumberScheduled { + return fmt.Errorf("daemonset %s/%s rollout incomplete: %d/%d pods ready", + namespace, ds.Name, ds.Status.NumberReady, ds.Status.DesiredNumberScheduled) + } + + if ds.Status.UpdatedNumberScheduled != ds.Status.DesiredNumberScheduled { + return fmt.Errorf("daemonset %s/%s update incomplete: %d/%d pods updated", + namespace, ds.Name, ds.Status.UpdatedNumberScheduled, ds.Status.DesiredNumberScheduled) + } + + // Check generation to ensure we're looking at the latest spec + if ds.Generation != ds.Status.ObservedGeneration { + return fmt.Errorf("daemonset %s/%s generation mismatch: %d != %d", + namespace, ds.Name, ds.Generation, ds.Status.ObservedGeneration) + } + } + + return nil + }).WithContext(ctx).WithPolling(2 * time.Second).WithTimeout(5 * time.Minute).Should(Succeed()) + return nil +} diff --git a/tests/e2e/gpu-feature-discovery_test.go b/tests/e2e/gpu-feature-discovery_test.go index 9eb1399c5..8b2170107 100644 --- a/tests/e2e/gpu-feature-discovery_test.go +++ b/tests/e2e/gpu-feature-discovery_test.go @@ -32,7 +32,6 @@ import ( "k8s.io/apimachinery/pkg/util/rand" "github.com/NVIDIA/k8s-device-plugin/tests/e2e/common/diagnostics" - "github.com/NVIDIA/k8s-device-plugin/tests/e2e/internal" ) var expectedLabelPatterns = k8sLabels{ @@ -117,7 +116,7 @@ var _ = Describe("GPU Feature Discovery", Ordered, Label("gfd", "gpu", "e2e"), f // Note: DaemonSet names are dynamically generated with the Helm release prefix, // so we wait for all DaemonSets in the namespace rather than specific names By("Waiting for all DaemonSets to be ready") - err = internal.WaitForDaemonSetsReady(ctx, clientSet, testNamespace.Name, "app.kubernetes.io/name=nvidia-device-plugin") + err = waitForDaemonSetsReady(ctx, clientSet, testNamespace.Name, "app.kubernetes.io/name=nvidia-device-plugin") Expect(err).NotTo(HaveOccurred()) }) diff --git a/tests/e2e/internal/kube.go b/tests/e2e/internal/kube.go deleted file mode 100644 index 41031f135..000000000 --- a/tests/e2e/internal/kube.go +++ /dev/null @@ -1,288 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package internal - -import ( - "context" - "fmt" - "time" - - . 
"github.com/onsi/gomega" - appsv1 "k8s.io/api/apps/v1" - corev1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/labels" - "k8s.io/client-go/kubernetes" -) - -const ( - // DefaultPollInterval for Eventually checks - DefaultPollInterval = 2 * time.Second - // DefaultTimeout for Eventually checks - DefaultTimeout = 5 * time.Minute -) - -// WaitForDaemonSetRollout waits for a DaemonSet to complete its rollout -func WaitForDaemonSetRollout(ctx context.Context, client kubernetes.Interface, namespace, name string) error { - EventuallyWithOffset(1, func(g Gomega) error { - ds, err := client.AppsV1().DaemonSets(namespace).Get(ctx, name, metav1.GetOptions{}) - if err != nil { - return err - } - - // Check if rollout is complete - if ds.Status.DesiredNumberScheduled == 0 { - return fmt.Errorf("daemonset %s/%s has 0 desired pods", namespace, name) - } - - if ds.Status.NumberReady != ds.Status.DesiredNumberScheduled { - return fmt.Errorf("daemonset %s/%s rollout incomplete: %d/%d pods ready", - namespace, name, ds.Status.NumberReady, ds.Status.DesiredNumberScheduled) - } - - if ds.Status.UpdatedNumberScheduled != ds.Status.DesiredNumberScheduled { - return fmt.Errorf("daemonset %s/%s update incomplete: %d/%d pods updated", - namespace, name, ds.Status.UpdatedNumberScheduled, ds.Status.DesiredNumberScheduled) - } - - // Check generation to ensure we're looking at the latest spec - if ds.Generation != ds.Status.ObservedGeneration { - return fmt.Errorf("daemonset %s/%s generation mismatch: %d != %d", - namespace, name, ds.Generation, ds.Status.ObservedGeneration) - } - - return nil - }).WithContext(ctx).WithPolling(DefaultPollInterval).WithTimeout(DefaultTimeout).Should(Succeed()) - return nil -} - -// WaitForAllDaemonSetsReady waits for all DaemonSets in a namespace to be ready -func WaitForAllDaemonSetsReady(ctx context.Context, client kubernetes.Interface, namespace string) error { - return WaitForDaemonSetsReady(ctx, client, namespace, "") -} - -// WaitForDaemonSetsReady waits for DaemonSets in a namespace to be ready, optionally filtered by label selector -func WaitForDaemonSetsReady(ctx context.Context, client kubernetes.Interface, namespace, labelSelector string) error { - EventuallyWithOffset(1, func(g Gomega) error { - dsList, err := client.AppsV1().DaemonSets(namespace).List(ctx, metav1.ListOptions{ - LabelSelector: labelSelector, - }) - if err != nil { - return err - } - - if len(dsList.Items) == 0 { - return fmt.Errorf("no daemonsets found in namespace %s with selector '%s'", namespace, labelSelector) - } - - for _, ds := range dsList.Items { - // Skip if no pods are desired - if ds.Status.DesiredNumberScheduled == 0 { - continue - } - - if ds.Status.NumberReady != ds.Status.DesiredNumberScheduled { - return fmt.Errorf("daemonset %s/%s rollout incomplete: %d/%d pods ready", - namespace, ds.Name, ds.Status.NumberReady, ds.Status.DesiredNumberScheduled) - } - - if ds.Status.UpdatedNumberScheduled != ds.Status.DesiredNumberScheduled { - return fmt.Errorf("daemonset %s/%s update incomplete: %d/%d pods updated", - namespace, ds.Name, ds.Status.UpdatedNumberScheduled, ds.Status.DesiredNumberScheduled) - } - - // Check generation to ensure we're looking at the latest spec - if ds.Generation != ds.Status.ObservedGeneration { - return fmt.Errorf("daemonset %s/%s generation mismatch: %d != %d", - namespace, ds.Name, ds.Generation, ds.Status.ObservedGeneration) - } - } - - return nil - 
}).WithContext(ctx).WithPolling(DefaultPollInterval).WithTimeout(DefaultTimeout).Should(Succeed()) - return nil -} - -// WaitForDaemonSetPodsReady waits for all pods of a DaemonSet to be ready -func WaitForDaemonSetPodsReady(ctx context.Context, client kubernetes.Interface, namespace, name string) error { - EventuallyWithOffset(1, func(g Gomega) error { - ds, err := client.AppsV1().DaemonSets(namespace).Get(ctx, name, metav1.GetOptions{}) - if err != nil { - return err - } - - selector, err := metav1.LabelSelectorAsSelector(ds.Spec.Selector) - if err != nil { - return fmt.Errorf("invalid selector: %v", err) - } - - pods, err := client.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{ - LabelSelector: selector.String(), - }) - if err != nil { - return err - } - - if len(pods.Items) == 0 { - return fmt.Errorf("no pods found for daemonset %s/%s", namespace, name) - } - - for _, pod := range pods.Items { - if !isPodReady(&pod) { - return fmt.Errorf("pod %s/%s is not ready", pod.Namespace, pod.Name) - } - } - - return nil - }).WithContext(ctx).WithPolling(DefaultPollInterval).WithTimeout(DefaultTimeout).Should(Succeed()) - return nil -} - -// WaitForNodeLabels waits for specific labels to appear on nodes -func WaitForNodeLabels(ctx context.Context, client kubernetes.Interface, labelSelector string, expectedLabels map[string]string) error { - EventuallyWithOffset(1, func(g Gomega) error { - nodes, err := client.CoreV1().Nodes().List(ctx, metav1.ListOptions{ - LabelSelector: labelSelector, - }) - if err != nil { - return err - } - - if len(nodes.Items) == 0 { - return fmt.Errorf("no nodes found with selector: %s", labelSelector) - } - - // Check each node has the expected labels - for _, node := range nodes.Items { - for key, expectedValue := range expectedLabels { - actualValue, exists := node.Labels[key] - if !exists { - return fmt.Errorf("node %s missing label: %s", node.Name, key) - } - if expectedValue != "" && actualValue != expectedValue { - return fmt.Errorf("node %s label %s=%s, expected %s", - node.Name, key, actualValue, expectedValue) - } - } - } - - return nil - }).WithContext(ctx).WithPolling(DefaultPollInterval).WithTimeout(DefaultTimeout).Should(Succeed()) - return nil -} - -// WaitForGFDLabels waits for GPU Feature Discovery labels on nodes -func WaitForGFDLabels(ctx context.Context, client kubernetes.Interface, nodeName string) error { - gfdLabels := []string{ - "nvidia.com/gfd.timestamp", - "nvidia.com/cuda.driver.major", - "nvidia.com/cuda.driver.minor", - "nvidia.com/gpu.family", - "nvidia.com/gpu.machine", - "nvidia.com/gpu.memory", - "nvidia.com/gpu.product", - } - - EventuallyWithOffset(1, func(g Gomega) error { - node, err := client.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) - if err != nil { - return err - } - - for _, label := range gfdLabels { - if _, exists := node.Labels[label]; !exists { - return fmt.Errorf("node %s missing GFD label: %s", nodeName, label) - } - } - - return nil - }).WithContext(ctx).WithPolling(DefaultPollInterval).WithTimeout(DefaultTimeout).Should(Succeed()) - return nil -} - -// WaitForPodsRunning waits for pods matching a selector to be running -func WaitForPodsRunning(ctx context.Context, client kubernetes.Interface, namespace string, selector labels.Selector) error { - EventuallyWithOffset(1, func(g Gomega) error { - pods, err := client.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{ - LabelSelector: selector.String(), - }) - if err != nil { - return err - } - - if len(pods.Items) == 0 { - return fmt.Errorf("no pods 
found matching selector: %s", selector.String()) - } - - for _, pod := range pods.Items { - if pod.Status.Phase != corev1.PodRunning { - return fmt.Errorf("pod %s/%s is %s, not Running", pod.Namespace, pod.Name, pod.Status.Phase) - } - } - - return nil - }).WithContext(ctx).WithPolling(DefaultPollInterval).WithTimeout(DefaultTimeout).Should(Succeed()) - return nil -} - -// WaitForDeploymentRollout waits for a deployment to complete its rollout -func WaitForDeploymentRollout(ctx context.Context, client kubernetes.Interface, namespace, name string) error { - EventuallyWithOffset(1, func(g Gomega) error { - deployment, err := client.AppsV1().Deployments(namespace).Get(ctx, name, metav1.GetOptions{}) - if err != nil { - return err - } - - // Check if the deployment is complete - for _, condition := range deployment.Status.Conditions { - if condition.Type == appsv1.DeploymentProgressing { - if condition.Status != corev1.ConditionTrue { - return fmt.Errorf("deployment %s/%s is not progressing: %s", namespace, name, condition.Message) - } - } - if condition.Type == appsv1.DeploymentAvailable { - if condition.Status != corev1.ConditionTrue { - return fmt.Errorf("deployment %s/%s is not available: %s", namespace, name, condition.Message) - } - } - } - - if deployment.Status.UpdatedReplicas != *deployment.Spec.Replicas { - return fmt.Errorf("deployment %s/%s update incomplete: %d/%d replicas updated", - namespace, name, deployment.Status.UpdatedReplicas, *deployment.Spec.Replicas) - } - - if deployment.Status.ReadyReplicas != *deployment.Spec.Replicas { - return fmt.Errorf("deployment %s/%s not ready: %d/%d replicas ready", - namespace, name, deployment.Status.ReadyReplicas, *deployment.Spec.Replicas) - } - - return nil - }).WithContext(ctx).WithPolling(DefaultPollInterval).WithTimeout(DefaultTimeout).Should(Succeed()) - return nil -} - -// isPodReady checks if a pod is ready -func isPodReady(pod *corev1.Pod) bool { - for _, condition := range pod.Status.Conditions { - if condition.Type == corev1.PodReady { - return condition.Status == corev1.ConditionTrue - } - } - return false -} From 189747b3d5d37e3046fcd4fc870eadc0d0f6ee56 Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Wed, 17 Sep 2025 15:34:12 +0200 Subject: [PATCH 4/6] [no-relnote] Set ginkgo version on install Signed-off-by: Evan Lezar --- tests/e2e/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/e2e/Makefile b/tests/e2e/Makefile index 03c15c1fd..711654215 100644 --- a/tests/e2e/Makefile +++ b/tests/e2e/Makefile @@ -34,9 +34,10 @@ GINKGO_REPORT_ARGS := --json-report=$(LOG_ARTIFACTS)/report.json --junit-report= .PHONY: ginkgo test clean-artifacts +GINKGO_VERSION = $(shell grep -Eo "github.com/onsi/ginkgo/v2.*$$" ./tests/go.mod | sed -e 's&github.com/onsi/ginkgo/v2[[:space:]]&&g') ginkgo: mkdir -p $(CURDIR)/bin - GOBIN=$(CURDIR)/bin go install github.com/onsi/ginkgo/v2/ginkgo@latest + GOBIN=$(CURDIR)/bin go install github.com/onsi/ginkgo/v2/ginkgo@$(GINKGO_VERSION) # Create artifacts directory $(LOG_ARTIFACTS): From d2dde98928d33d7a087266a6360f7ca92b59da48 Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Wed, 17 Sep 2025 16:34:43 +0200 Subject: [PATCH 5/6] fixup! 
TOFIX: reduce diff Signed-off-by: Evan Lezar --- tests/e2e/e2e_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/e2e/e2e_test.go b/tests/e2e/e2e_test.go index 2c6c1661c..ecf363a59 100644 --- a/tests/e2e/e2e_test.go +++ b/tests/e2e/e2e_test.go @@ -876,7 +876,7 @@ func getEnvVarOrDefault[T any](key string, defaultValue T) T { } // waitForDaemonSetsReady waits for DaemonSets in a namespace to be ready, optionally filtered by label selector -func waitForDaemonSetsReady(ctx context.Context, client kubernetes.Interface, namespace, labelSelector string) error { +func waitForDaemonSetsReady(ctx context.Context, client clientset.Interface, namespace, labelSelector string) error { EventuallyWithOffset(1, func(g Gomega) error { dsList, err := client.AppsV1().DaemonSets(namespace).List(ctx, metav1.ListOptions{ LabelSelector: labelSelector, From 49680b4b8ec4fa53a99e453c884162b0fd3553ba Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Wed, 17 Sep 2025 17:01:22 +0200 Subject: [PATCH 6/6] fixup! [no-relnote] Minor cleanups Signed-off-by: Evan Lezar --- tests/e2e/device-plugin_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/e2e/device-plugin_test.go b/tests/e2e/device-plugin_test.go index 917d68ca4..d6f1ea298 100644 --- a/tests/e2e/device-plugin_test.go +++ b/tests/e2e/device-plugin_test.go @@ -148,7 +148,7 @@ var _ = Describe("GPU Device Plugin", Ordered, Label("gpu", "e2e", "device-plugi By("Creating a GPU job") jobNames, err := CreateOrUpdateJobsFromFile(ctx, clientSet, testNamespace.Name, filepath.Join(projectRoot, "testdata", "job-1.yaml")) Expect(err).NotTo(HaveOccurred()) - Expect(jobNames).NotTo(HaveLen(1)) + Expect(jobNames).To(HaveLen(1)) // Defer cleanup for the job DeferCleanup(func(ctx SpecContext) {