Skip to content

Commit 30b4508

Browse files
ArangoGutierrezelezar
authored andcommitted
[no-relnote] Update E2E test suite
Signed-off-by: Carlos Eduardo Arango Gutierrez <[email protected]>
1 parent b0508d6 commit 30b4508

File tree

44 files changed

+2122
-3261
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

44 files changed

+2122
-3261
lines changed

.github/workflows/e2e.yaml

Lines changed: 19 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ jobs:
4848
go-version: ${{ inputs.golang_version }}
4949

5050
- name: Set up Holodeck
51-
uses: NVIDIA/[email protected].14
51+
uses: NVIDIA/[email protected].15
5252
with:
5353
aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
5454
aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
@@ -58,11 +58,14 @@ jobs:
5858
- name: Run e2e tests
5959
env:
6060
KUBECONFIG: ${{ github.workspace }}/kubeconfig
61+
HELM_CHART: ${{ github.workspace }}/deployments/helm/nvidia-device-plugin
6162
E2E_IMAGE_REPO: ghcr.io/nvidia/k8s-device-plugin
62-
E2E_IMAGE_TAG: ${{ inputs.version }}-ubi9
63+
E2E_IMAGE_TAG: ${{ inputs.version }}
64+
E2E_IMAGE_PULL_POLICY: Always
65+
NVIDIA_DRIVER_ENABLED: true
6366
LOG_ARTIFACTS: ${{ github.workspace }}/e2e_logs
6467
run: |
65-
make test-e2e
68+
make -f tests/e2e/Makefile test-e2e
6669
6770
- name: Archive test logs
6871
if: ${{ failure() }}
@@ -71,6 +74,13 @@ jobs:
7174
name: e2e-test-logs
7275
path: ./e2e_logs/
7376
retention-days: 15
77+
78+
- name: Archive Ginkgo logs
79+
uses: actions/upload-artifact@v4
80+
with:
81+
name: ginkgo-logs
82+
path: ginkgo.json
83+
retention-days: 15
7484

7585
- name: Send Slack alert notification
7686
id: slack
@@ -80,8 +90,10 @@ jobs:
8090
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
8191
SUMMARY_URL: https://github.com/${{github.repository}}/actions/runs/${{github.run_id}}
8292
with:
83-
channel-id: ${{ secrets.SLACK_CHANNEL_ID }}
84-
slack-message: |
85-
:x: On repository ${{ github.repository }} the Workflow *${{ github.workflow }}* has failed.
93+
method: chat.postMessage
94+
token: ${{ secrets.SLACK_BOT_TOKEN }}
95+
payload: |
96+
channel: ${{ secrets.SLACK_CHANNEL_ID }}
97+
text: ":x: On repository ${{ github.repository }} the Workflow *${{ github.workflow }}* has failed.
8698
87-
Details: ${{ env.SUMMARY_URL }}
99+
Details: ${{ env.SUMMARY_URL }}"

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
deployments/helm/gpu-feature-discovery
77
cmd/gpu-feature-discovery/gfd-test-loop
88
e2e_logs
9-
9+
bin
1010
*.out
1111
*.log
12+
ginkgo.json

testdata/job-1.yaml

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
apiVersion: batch/v1
2+
kind: Job
3+
metadata:
4+
name: j-e2e-1
5+
labels:
6+
app.nvidia.com: k8s-device-plugin-test-app
7+
spec:
8+
template:
9+
metadata:
10+
name: gpu-pod
11+
spec:
12+
restartPolicy: Never
13+
containers:
14+
- name: cuda-container
15+
image: nvcr.io/nvidia/k8s/cuda-sample:nbody-cuda11.7.1-ubuntu18.04
16+
args:
17+
- "--benchmark"
18+
- "--numbodies=10000"
19+
resources:
20+
limits:
21+
nvidia.com/gpu: "1"
22+
tolerations:
23+
- key: "nvidia.com/gpu"
24+
operator: "Exists"
25+
effect: "NoSchedule"

tests/e2e/Makefile

Lines changed: 26 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
1+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
23
#
34
# Licensed under the Apache License, Version 2.0 (the "License");
45
# you may not use this file except in compliance with the License.
@@ -20,29 +21,34 @@ GO_TEST_TIMEOUT ?= 30m
2021

2122
include $(CURDIR)/versions.mk
2223

23-
DRIVER_ENABLED ?= true
24+
NVIDIA_DRIVER_ENABLED ?= true
2425

2526
E2E_IMAGE_REPO ?= $(REGISTRY)/$(DRIVER_NAME)
2627
E2E_IMAGE_TAG ?= $(VERSION)
2728
E2E_IMAGE_PULL_POLICY ?= IfNotPresent
2829
HELM_CHART ?= $(CURDIR)/deployments/helm/nvidia-device-plugin
2930
LOG_ARTIFACTS ?= $(CURDIR)/e2e_logs
3031

31-
.PHONY: test
32-
test:
33-
@if [ -z ${KUBECONFIG} ]; then \
34-
echo "[ERR] KUBECONFIG missing, must be defined"; \
35-
exit 1; \
36-
fi
37-
cd $(CURDIR)/tests/e2e && $(GO_CMD) test -timeout $(GO_TEST_TIMEOUT) -v . -args \
38-
-kubeconfig=$(KUBECONFIG) \
39-
-driver-enabled=$(DRIVER_ENABLED) \
40-
-image.repo=$(E2E_IMAGE_REPO) \
41-
-image.tag=$(E2E_IMAGE_TAG) \
42-
-image.pull-policy=$(E2E_IMAGE_PULL_POLICY) \
43-
-log-artifacts=$(LOG_ARTIFACTS) \
44-
-helm-chart=$(HELM_CHART) \
45-
-helm-log-file=$(LOG_ARTIFACTS)/helm.log \
46-
-ginkgo.focus="\[nvidia\]" \
47-
-test.timeout=1h \
48-
-ginkgo.v
32+
# Test configuration
33+
GINKGO_PARALLEL_PROCS ?= 4
34+
GINKGO_FLAKE_ATTEMPTS ?= 2
35+
GINKGO_COMMON_ARGS := -v --fail-on-pending --randomize-all --trace
36+
GINKGO_REPORT_ARGS := --json-report=$(LOG_ARTIFACTS)/report.json --junit-report=$(LOG_ARTIFACTS)/junit.xml
37+
38+
.PHONY: ginkgo e2e-test e2e-test-parallel e2e-test-serial e2e-test-all clean-artifacts
39+
40+
ginkgo:
41+
mkdir -p $(CURDIR)/bin
42+
GOBIN=$(CURDIR)/bin go install github.com/onsi/ginkgo/v2/ginkgo@latest
43+
44+
# Create artifacts directory
45+
$(LOG_ARTIFACTS):
46+
mkdir -p $(LOG_ARTIFACTS)
47+
48+
# Clean artifacts
49+
clean-artifacts:
50+
rm -rf $(LOG_ARTIFACTS)
51+
52+
# Run all tests (default)
53+
test-e2e: ginkgo $(LOG_ARTIFACTS)
54+
$(CURDIR)/bin/ginkgo $(GINKGO_COMMON_ARGS) $(GINKGO_REPORT_ARGS) $(GINKGO_ARGS) ./tests/e2e/...

tests/e2e/README.md

Lines changed: 184 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,184 @@
1+
<!--
2+
SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3+
SPDX-License-Identifier: Apache-2.0
4+
5+
Licensed under the Apache License, Version 2.0 (the "License");
6+
you may not use this file except in compliance with the License.
7+
You may obtain a copy of the License at
8+
9+
http://www.apache.org/licenses/LICENSE-2.0
10+
11+
Unless required by applicable law or agreed to in writing, software
12+
distributed under the License is distributed on an "AS IS" BASIS,
13+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
See the License for the specific language governing permissions and
15+
limitations under the License.
16+
-->
17+
18+
# README – End‑to‑End (Ginkgo/Gomega) Test Suite for the NVIDIA K8s Device Plugin
19+
20+
---
21+
22+
## 1  Purpose
23+
This repository contains a self‑contained Ginkgo v2 / Gomega end‑to‑end (E2E) test suite that:
24+
25+
1. Creates an **isolated namespace** per run.
26+
2. Deploys the **NVIDIA k8s‑device‑plugin Helm chart** under a random release name.
27+
3. Executes a **CUDA “*n‑body*” benchmark job** to validate GPU scheduling.
28+
29+
On test failure the suite gathers logs and **ensures full cleanup** (namespace deletion, finalizer removal).
30+
The suite targets CI pipelines and developers validating chart or driver changes before promotion.
31+
32+
### Key Features
33+
- **Ginkgo v2 idioms**: Uses `SpecContext`, `DeferCleanup`, `Label`, and `ReportAfterSuite`
34+
- **Parallel execution**: Label-based test sharding for faster CI runs
35+
- **Deterministic cleanup**: LIFO order cleanup with proper context handling
36+
- **No blind sleeps**: All waits use `Eventually` with explicit timeouts and context
37+
- **CI-ready reporting**: Built-in JSON and JUnit report generation
38+
- **GPU Feature Discovery**: Tests GFD label creation and NodeFeature API integration
39+
40+
---
41+
42+
## 2  Prerequisites
43+
44+
| Requirement | Notes |
45+
|----------------------|-------------------------------------------------------------------------------|
46+
| **Go ≥ 1.22** | Needed for building helper binaries. |
47+
| **Kubernetes cluster** | Must be reachable via `kubectl`; worker nodes require NVIDIA GPUs. |
48+
| **Helm v3 CLI** | Only required for manual debugging; the suite uses a programmatic client. |
49+
| **Linux/macOS host** | The Makefile assumes a POSIX‑compatible shell. |
50+
51+
---
52+
53+
## 3  Environment variables
54+
55+
| Variable | Required | Default | Description |
56+
|----------|----------|---------|-------------|
57+
| `KUBECONFIG` | Yes | — | Path to the target‑cluster kubeconfig. |
58+
| `HELM_CHART` | Yes | — | Helm chart reference (e.g. `oci://ghcr.io/nvidia/k8s-device-plugin`). |
59+
| `E2E_IMAGE_REPO` | Yes | — | Repository hosting the image under test. |
60+
| `E2E_IMAGE_TAG` | Yes | — | Image tag to test. |
61+
| `E2E_IMAGE_PULL_POLICY` | Yes | — | Image pull policy (`Always`, `IfNotPresent`, …). |
62+
| `E2E_TIMEOUT_SECONDS` | No | `1800` | Global timeout (s). |
63+
| `LOG_ARTIFACTS_DIR` | No | `./artifacts` | Directory for Helm & test logs. |
64+
| `COLLECT_LOGS_FROM` | No | (unset) | Comma‑separated node list or `all` for log collection. |
65+
| `NVIDIA_DRIVER_ENABLED` | No | `false` | Skip GPU job when driver is unavailable. |
66+
| `GINKGO_PARALLEL_PROCS` | No | `4` | Number of parallel Ginkgo processes for parallel test execution. |
67+
| `GINKGO_FLAKE_ATTEMPTS` | No | `2` | Number of retry attempts for flaky tests (should be reduced to 1 when stable). |
68+
69+
> *Unset variables fall back to defaults via `getIntEnvVar` / `getBoolEnvVar`.*
70+
71+
---
72+
73+
## 4  Build helper binaries
74+
75+
```bash
76+
make ginkgo
77+
# → ./bin/ginkgo (latest v2 CLI)
78+
```
79+
80+
---
81+
82+
## 5  Run the suite
83+
84+
### 5.1  Default invocation
85+
```bash
86+
make test-e2e
87+
```
88+
Installs the Ginkgo CLI into `./bin`, executes all specs under `./tests/e2e`, and writes a JSON report (`report.json`) and a JUnit report (`junit.xml`) to the log‑artifacts directory.
89+
90+
### 5.2  Focused run / extra flags
91+
```bash
92+
GINKGO_ARGS='--focus="[GPU Job]" --keep-going' make test-e2e
93+
```
94+
Any flag accepted by `ginkgo run` can be forwarded through `GINKGO_ARGS`.
95+
96+
---
97+
98+
## 6  Execution flow
99+
100+
| Phase | Key functions / objects | Description |
101+
|-------|-------------------------|-------------|
102+
| **Init** | `TestMain`, `getTestEnv` | Validates env vars, sets global timeout. |
103+
| **Client setup** | `getK8sClients`, `getHelmClient` | Creates REST clients (core, CRD, NFD) and a Helm client that shares the same `rest.Config`. |
104+
| **Namespace** | `CreateTestingNS` | Generates a unique namespace labelled `e2e-run=<uid>`. |
105+
| **Chart deploy** | `helmClient.InstallRelease` | Installs the chart in the test namespace with a random release name. |
106+
| **Workload** | `newGPUJob` | Launches `nvcr.io/nvidia/k8s/cuda-sample:nbody` requesting `nvidia.com/gpu=1`. |
107+
| **Assertions** | Gomega matchers | Waits for `JobSucceeded == 1` and validates pod logs. |
108+
| **Cleanup** | `cleanupNamespaceResources`, `AfterSuite` | Removes finalizers, deletes namespace, closes Helm log file. |
109+
110+
---
111+
112+
## 7  Artifacts & logs
113+
114+
```
115+
${LOG_ARTIFACTS_DIR}/
116+
└── helm/
117+
├── helm_logs # Release operations, one per test namespace
118+
└── ...
119+
120+
ginkgo.json # Structured test outcome for CI parsing
121+
```
122+
If `COLLECT_LOGS_FROM` is set, additional node‑level or container logs are archived in the same directory.
123+
124+
---
125+
126+
## 8 Extending the suite
127+
128+
### 8.1 Creating additional spec files
129+
130+
1. Add a new `_test.go` file under `tests/e2e`.
131+
2. Import the Ginkgo/Gomega DSL:
132+
```go
133+
import (
134+
. "github.com/onsi/ginkgo/v2"
135+
. "github.com/onsi/gomega"
136+
)
137+
```
138+
3. Wrap your tests with `Describe`, `Context`, `When`, `It`, etc.
139+
4. Scope all resources to `testNamespace` and always guard API calls with `Expect(err).NotTo(HaveOccurred())`.
140+
5. Use helpers such as `wait.PollUntilContextTimeout` for custom waits and back‑off loops.
141+
142+
### 8.2 Adding additional *When* blocks to `device-plugin_test.go`
143+
The suite already contains a high‑level file, `tests/e2e/device-plugin_test.go`, which drives most GPU‑focused checks. To extend it:
144+
145+
1. **Open** `tests/e2e/device-plugin_test.go`.
146+
2. **Locate** the outer `Describe("GPU Device Plugin", Ordered, func() { … })` wrapper.
147+
3. **Add a sibling `When` container** under this `Describe` for each new behaviour you want to validate:
148+
```go
149+
When("....", func() {
150+
It("should ......", func(ctx context.Context) {
151+
//
152+
//
153+
// ...
154+
})
155+
})
156+
```
157+
4. **Use `Ordered`** on the `When` block *only* if its order relative to other tests is significant (e.g. upgrade/downgrade flows). Otherwise omit it for independent execution.
158+
5. **Share helpers**: you can reference `helmClient`, `clientSet`, `randomSuffix()`, `eventuallyNonControlPlaneNodes`, etc., directly because they are package‑level variables/functions exposed by `e2e`.
159+
6. **Diagnostics on failure** are automatic – `AfterEach` will collect logs whenever `CurrentSpecReport().Failed()` is `true`.
160+
161+
> Keep each `When` block focused on one behaviour. If it spawns multiple `It` tests, make sure they are idempotent and leave no residual resources so that later blocks start from a clean state.
162+
163+
---
164+
165+
## 9  Troubleshooting
166+
167+
| Symptom | Possible fix |
168+
|---------|--------------|
169+
| **`ErrImagePull` for CUDA job** | Validate `E2E_IMAGE_REPO` / `E2E_IMAGE_TAG` and registry access. |
170+
| Job stuck in **`Pending`** | Ensure nodes advertise `nvidia.com/gpu` and tolerations match taints. |
171+
| Helm install failure | Render manifests locally via `helm template $HELM_CHART` to inspect errors. |
172+
173+
---
174+
175+
## 10  License
176+
This test code is released under the same license as the NVIDIA k8s‑device‑plugin project (Apache‑2.0).
177+
178+
---
179+
180+
## 11  References
181+
* [Ginkgo v2](https://github.com/onsi/ginkgo)
182+
* [mittwald/go‑helm‑client](https://github.com/mittwald/go-helm-client)
183+
* [kubernetes‑sigs/node-feature-discovery](https://github.com/kubernetes-sigs/node-feature-discovery)
184+
* [Kubernetes blog – *End‑to‑End Testing for Everyone*](https://kubernetes.io/blog/2020/07/27/kubernetes-e2e-testing-for-everyone/)

tests/e2e/common/gpu_job.go

Lines changed: 0 additions & 60 deletions
This file was deleted.

0 commit comments

Comments
 (0)