Skip to content

Commit 21bb4ef

Browse files
[no-relnote] enhance E2E
Signed-off-by: Carlos Eduardo Arango Gutierrez <[email protected]>
1 parent 02d5773 commit 21bb4ef

File tree

5 files changed

+121
-59
lines changed

5 files changed

+121
-59
lines changed

tests/e2e/README.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,14 @@ This repository contains a self‑contained Ginkgo v2 / Gomega end‑to‑end (
2929
On test failure the suite gathers logs and **ensures full cleanup** (namespace deletion, finalizer removal).
3030
The suite targets CI pipelines and developers validating chart or driver changes before promotion.
3131

32+
### Key Features
33+
- **Ginkgo v2 idioms**: Uses `SpecContext`, `DeferCleanup`, `Label`, and `ReportAfterSuite`
34+
- **Parallel execution**: Label-based test sharding for faster CI runs
35+
- **Deterministic cleanup**: Cleanup runs in LIFO order with proper context handling
36+
- **No blind sleeps**: All waits use `Eventually` with explicit timeouts and context
37+
- **CI-ready reporting**: Built-in JSON and JUnit report generation
38+
- **GPU Feature Discovery**: Tests GFD label creation and NodeFeature API integration
39+
3240
---
3341

3442
## 2  Prerequisites
@@ -55,6 +63,8 @@ The suite targets CI pipelines and developers validating chart or driver changes
5563
| `LOG_ARTIFACTS_DIR` || `./artifacts` | Directory for Helm & test logs. |
5664
| `COLLECT_LOGS_FROM` || (unset) | Comma‑separated node list or `all` for log collection. |
5765
| `NVIDIA_DRIVER_ENABLED` || `false` | Skip GPU job when driver is unavailable. |
66+
| `GINKGO_PARALLEL_PROCS` || `4` | Number of Ginkgo processes to run in parallel. |
67+
| `GINKGO_FLAKE_ATTEMPTS` || `2` | Number of attempts for flaky tests (reduce to 1 once the suite is stable). |
5868

5969
> *Unset variables fall back to defaults via `getIntEnvVar` / `getBoolEnvVar`.*
6070

tests/e2e/cleanup_test.go

Lines changed: 10 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -48,19 +48,17 @@ func cleanupNamespaceResources(namespace string) {
4848
// waitForDeletion polls the provided checkFunc until a NotFound error is returned,
4949
// confirming that the resource is deleted.
5050
func waitForDeletion(resourceName string, checkFunc func() error) error {
51-
timeout := 2 * time.Minute
52-
interval := 5 * time.Second
53-
start := time.Now()
54-
for {
51+
EventuallyWithOffset(1, func(g Gomega) error {
5552
err := checkFunc()
5653
if err != nil && errors.IsNotFound(err) {
5754
return nil
5855
}
59-
if time.Since(start) > timeout {
60-
return fmt.Errorf("timed out waiting for deletion of %s", resourceName)
56+
if err != nil {
57+
return err
6158
}
62-
time.Sleep(interval)
63-
}
59+
return fmt.Errorf("%s still exists", resourceName)
60+
}).WithPolling(5 * time.Second).WithTimeout(2 * time.Minute).WithContext(ctx).Should(Succeed())
61+
return nil
6462
}
6563

6664
// cleanupTestPods deletes all test Pods in the namespace that have the label "app.nvidia.com=k8s-dra-driver-gpu-test-app".
@@ -224,14 +222,10 @@ func cleanupNode(cs clientset.Interface) {
224222
Expect(err).NotTo(HaveOccurred())
225223

226224
for _, n := range nodeList.Items {
227-
var err error
228-
for retry := 0; retry < 5; retry++ {
229-
if err = cleanup(n.Name); err == nil {
230-
break
231-
}
232-
time.Sleep(100 * time.Millisecond)
233-
}
234-
Expect(err).NotTo(HaveOccurred())
225+
nodeName := n.Name
226+
Eventually(func(g Gomega) error {
227+
return cleanup(nodeName)
228+
}).WithPolling(100 * time.Millisecond).WithTimeout(500 * time.Millisecond).Should(Succeed())
235229
}
236230
}
237231

tests/e2e/device-plugin_test.go

Lines changed: 41 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@
1818
package e2e
1919

2020
import (
21-
"context"
2221
"fmt"
2322
"strings"
2423
"time"
@@ -30,6 +29,7 @@ import (
3029
helmValues "github.com/mittwald/go-helm-client/values"
3130
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3231

32+
"github.com/NVIDIA/k8s-device-plugin/tests/e2e/internal"
3333
"github.com/NVIDIA/k8s-test-infra/pkg/diagnostics"
3434
)
3535

@@ -38,7 +38,7 @@ const (
3838
)
3939

4040
// Actual test suite
41-
var _ = Describe("GPU Device Plugin", Ordered, func() {
41+
var _ = Describe("GPU Device Plugin", Ordered, Label("gpu", "e2e", "device-plugin"), func() {
4242
// Init global suite vars
4343
var (
4444
helmReleaseName string
@@ -75,7 +75,7 @@ var _ = Describe("GPU Device Plugin", Ordered, func() {
7575
collectLogsFrom = strings.Split(CollectLogsFrom, ",")
7676
}
7777

78-
BeforeAll(func(ctx context.Context) {
78+
BeforeAll(func(ctx SpecContext) {
7979
// Create clients for apiextensions and our CRD api
8080
helmReleaseName = "nvdp-e2e-test-" + randomSuffix()
8181

@@ -92,9 +92,24 @@ var _ = Describe("GPU Device Plugin", Ordered, func() {
9292
By("Installing k8s-device-plugin Helm chart")
9393
_, err := helmClient.InstallChart(ctx, &chartSpec, nil)
9494
Expect(err).NotTo(HaveOccurred())
95+
96+
// Wait for all DaemonSets to be ready
97+
// Note: DaemonSet names are dynamically generated with the Helm release prefix,
98+
// so we wait for all DaemonSets in the namespace rather than specific names
99+
By("Waiting for all DaemonSets to be ready")
100+
err = internal.WaitForAllDaemonSetsReady(ctx, clientSet, testNamespace.Name)
101+
Expect(err).NotTo(HaveOccurred())
102+
})
103+
104+
AfterAll(func(ctx SpecContext) {
105+
By("Uninstalling k8s-device-plugin Helm chart")
106+
err := helmClient.UninstallReleaseByName(helmReleaseName)
107+
if err != nil {
108+
GinkgoWriter.Printf("Failed to uninstall helm release %s: %v\n", helmReleaseName, err)
109+
}
95110
})
96111

97-
AfterEach(func(ctx context.Context) {
112+
AfterEach(func(ctx SpecContext) {
98113
// Run diagnostic collector if test failed
99114
if CurrentSpecReport().Failed() {
100115
var err error
@@ -111,18 +126,8 @@ var _ = Describe("GPU Device Plugin", Ordered, func() {
111126
}
112127
})
113128

114-
AfterAll(func(ctx context.Context) {
115-
By("Deleting the job")
116-
job, err := clientSet.BatchV1().Jobs(testNamespace.Name).List(ctx, metav1.ListOptions{})
117-
Expect(err).NotTo(HaveOccurred())
118-
Expect(len(job.Items)).ToNot(BeZero())
119-
120-
err = clientSet.BatchV1().Jobs(testNamespace.Name).Delete(ctx, job.Items[0].Name, metav1.DeleteOptions{})
121-
Expect(err).NotTo(HaveOccurred())
122-
})
123-
124-
When("When deploying k8s-device-plugin", Ordered, func() {
125-
It("it should create nvidia.com/gpu resource", func(ctx context.Context) {
129+
When("When deploying k8s-device-plugin", Ordered, Label("serial"), func() {
130+
It("it should create nvidia.com/gpu resource", Label("gpu-resource"), func(ctx SpecContext) {
126131
nodeList, err := clientSet.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
127132
Expect(err).NotTo(HaveOccurred())
128133
Expect(len(nodeList.Items)).ToNot(BeZero())
@@ -141,25 +146,35 @@ var _ = Describe("GPU Device Plugin", Ordered, func() {
141146
}}
142147
eventuallyNonControlPlaneNodes(ctx, clientSet).Should(MatchCapacity(capacityChecker, nodes), "Node capacity does not match")
143148
})
144-
It("it should run GPU jobs", func(ctx context.Context) {
149+
It("it should run GPU jobs", Label("gpu-job"), func(ctx SpecContext) {
145150
By("Creating a GPU job")
146-
job, err := CreateOrUpdateJobsFromFile(ctx, clientSet, "job-1.yaml", testNamespace.Name)
151+
jobNames, err := CreateOrUpdateJobsFromFile(ctx, clientSet, "job-1.yaml", testNamespace.Name)
147152
Expect(err).NotTo(HaveOccurred())
153+
Expect(jobNames).NotTo(BeEmpty())
154+
155+
// Defer cleanup for the job
156+
DeferCleanup(func(ctx SpecContext) {
157+
By("Deleting the GPU job")
158+
err := clientSet.BatchV1().Jobs(testNamespace.Name).Delete(ctx, jobNames[0], metav1.DeleteOptions{})
159+
if err != nil {
160+
GinkgoWriter.Printf("Failed to delete job %s: %v\n", jobNames[0], err)
161+
}
162+
})
148163

149164
By("Waiting for job to complete")
150-
Eventually(func() error {
151-
job, err := clientSet.BatchV1().Jobs(testNamespace.Name).Get(ctx, job[0], metav1.GetOptions{})
165+
Eventually(func(g Gomega) error {
166+
job, err := clientSet.BatchV1().Jobs(testNamespace.Name).Get(ctx, jobNames[0], metav1.GetOptions{})
152167
if err != nil {
153168
return err
154169
}
155-
if job.Status.Succeeded != 1 {
156-
return fmt.Errorf("job %s/%s failed", job.Namespace, job.Name)
170+
if job.Status.Failed > 0 {
171+
return fmt.Errorf("job %s/%s has failed pods: %d", job.Namespace, job.Name, job.Status.Failed)
157172
}
158-
if job.Status.Succeeded == 1 {
159-
return nil
173+
if job.Status.Succeeded != 1 {
174+
return fmt.Errorf("job %s/%s not completed yet: %d succeeded", job.Namespace, job.Name, job.Status.Succeeded)
160175
}
161-
return fmt.Errorf("job %s/%s not completed yet", job.Namespace, job.Name)
162-
}, devicePluginEventuallyTimeout, 5*time.Second).Should(BeNil())
176+
return nil
177+
}).WithContext(ctx).WithPolling(5 * time.Second).WithTimeout(devicePluginEventuallyTimeout).Should(Succeed())
163178
})
164179
})
165180
})

tests/e2e/e2e_test.go

Lines changed: 24 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -103,13 +103,17 @@ func TestMain(t *testing.T) {
103103
ctx = context.Background()
104104
getTestEnv()
105105

106+
// Log random seed for reproducibility
107+
GinkgoWriter.Printf("Random seed: %d\n", GinkgoRandomSeed())
108+
106109
RunSpecs(t,
107110
suiteName,
111+
Label("e2e"),
108112
)
109113
}
110114

111115
// BeforeSuite runs before the test suite
112-
var _ = BeforeSuite(func() {
116+
var _ = BeforeSuite(func(ctx SpecContext) {
113117
var err error
114118

115119
cwd, err = os.Getwd()
@@ -130,16 +134,31 @@ var _ = BeforeSuite(func() {
130134
getHelmClient()
131135
})
132136

133-
var _ = AfterSuite(func() {
137+
var _ = AfterSuite(func(ctx SpecContext) {
134138
By("Cleaning up namespace resources")
135-
// Remove finalizers and force delete resourceclaims, resourceclaimtemplates, daemonsets, and pods.
136139
cleanupNamespaceResources(testNamespace.Name)
137140

138141
By("Deleting the test namespace")
139-
// Delete the test namespace to remove any remaining objects.
140142
deleteTestNamespace()
141143
})
142144

145+
// ReportAfterSuite logs a test summary and the random seed
146+
var _ = ReportAfterSuite("", func(report Report) {
147+
// Log test summary
148+
failedCount := 0
149+
for _, specReport := range report.SpecReports {
150+
if specReport.Failed() {
151+
failedCount++
152+
}
153+
}
154+
155+
GinkgoWriter.Printf("\nTest Summary:\n")
156+
GinkgoWriter.Printf(" Total Specs: %d\n", len(report.SpecReports))
157+
GinkgoWriter.Printf(" Random Seed: %d\n", report.SuiteConfig.RandomSeed)
158+
GinkgoWriter.Printf(" Failed: %d\n", failedCount)
159+
GinkgoWriter.Printf(" Duration: %.2fs\n", report.RunTime.Seconds())
160+
})
161+
143162
// getK8sClients creates the k8s clients
144163
func getK8sClients() {
145164
var err error
@@ -297,7 +316,7 @@ type k8sLabels map[string]string
297316
//
298317
//nolint:unused
299318
func eventuallyNonControlPlaneNodes(ctx context.Context, cli clientset.Interface) AsyncAssertion {
300-
return Eventually(func(g Gomega, ctx context.Context) ([]corev1.Node, error) {
319+
return Eventually(func(g Gomega) ([]corev1.Node, error) {
301320
return getNonControlPlaneNodes(ctx, cli)
302321
}).WithPolling(1 * time.Second).WithTimeout(1 * time.Minute).WithContext(ctx)
303322
}

tests/e2e/gpu-feature-discovery_test.go

Lines changed: 36 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -31,11 +31,12 @@ import (
3131
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3232
"k8s.io/apimachinery/pkg/util/rand"
3333

34+
"github.com/NVIDIA/k8s-device-plugin/tests/e2e/internal"
3435
"github.com/NVIDIA/k8s-test-infra/pkg/diagnostics"
3536
)
3637

3738
// Actual test suite
38-
var _ = Describe("GPU Feature Discovery", Ordered, func() {
39+
var _ = Describe("GPU Feature Discovery", Ordered, Label("gfd", "gpu", "e2e"), func() {
3940
expectedLabelPatterns := k8sLabels{
4041
"nvidia.com/gfd.timestamp": "[0-9]{10}",
4142
"nvidia.com/cuda.driver.major": "[0-9]+",
@@ -97,7 +98,7 @@ var _ = Describe("GPU Feature Discovery", Ordered, func() {
9798
collectLogsFrom = strings.Split(CollectLogsFrom, ",")
9899
}
99100

100-
BeforeAll(func(ctx context.Context) {
101+
BeforeAll(func(ctx SpecContext) {
101102
helmReleaseName = "gfd-e2e-test" + rand.String(5)
102103

103104
// reset Helm Client
@@ -114,10 +115,25 @@ var _ = Describe("GPU Feature Discovery", Ordered, func() {
114115
By("Installing GFD Helm chart")
115116
_, err := helmClient.InstallChart(ctx, &chartSpec, nil)
116117
Expect(err).NotTo(HaveOccurred())
118+
119+
// Wait for all DaemonSets to be ready
120+
// Note: DaemonSet names are dynamically generated with the Helm release prefix,
121+
// so we wait for all DaemonSets in the namespace rather than specific names
122+
By("Waiting for all DaemonSets to be ready")
123+
err = internal.WaitForAllDaemonSetsReady(ctx, clientSet, testNamespace.Name)
124+
Expect(err).NotTo(HaveOccurred())
125+
})
126+
127+
AfterAll(func(ctx SpecContext) {
128+
By("Uninstalling GFD Helm chart")
129+
err := helmClient.UninstallReleaseByName(helmReleaseName)
130+
if err != nil {
131+
GinkgoWriter.Printf("Failed to uninstall helm release %s: %v\n", helmReleaseName, err)
132+
}
117133
})
118134

119135
// Cleanup before next test run
120-
AfterEach(func(ctx context.Context) {
136+
AfterEach(func(ctx SpecContext) {
121137
// Run diagnostic collector if test failed
122138
if CurrentSpecReport().Failed() {
123139
var err error
@@ -135,9 +151,17 @@ var _ = Describe("GPU Feature Discovery", Ordered, func() {
135151
}
136152
})
137153

138-
When("When deploying GFD", Ordered, func() {
154+
When("When deploying GFD", Ordered, Label("serial"), func() {
139155
Context("NV Driver is not installed", func() {
140-
It("it should create nvidia.com timestamp label", func(ctx context.Context) {
156+
BeforeEach(func() {
157+
// Skip this context when driver is enabled since "NV Driver is installed"
158+
// context provides more comprehensive testing
159+
if NVIDIA_DRIVER_ENABLED {
160+
Skip("Skipping driver-not-installed tests when NVIDIA_DRIVER_ENABLED is true")
161+
}
162+
})
163+
164+
It("it should create nvidia.com timestamp label", Label("timestamp"), func(ctx SpecContext) {
141165
nodeList, err := clientSet.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
142166
Expect(err).NotTo(HaveOccurred())
143167
Expect(len(nodeList.Items)).ToNot(BeZero())
@@ -157,7 +181,7 @@ var _ = Describe("GPU Feature Discovery", Ordered, func() {
157181
eventuallyNonControlPlaneNodes(ctx, clientSet).Should(MatchLabels(labelChecker, nodes))
158182
})
159183
Context("and the NodeFeature API is enabled", func() {
160-
It("gfd should create node feature object", func(ctx context.Context) {
184+
It("gfd should create node feature object", Label("nodefeature"), func(ctx SpecContext) {
161185
By("Updating GFD Helm chart values")
162186
newValues := values
163187
newValues.Values = append(newValues.Values, "nfd.enableNodeFeatureApi=true")
@@ -172,9 +196,9 @@ var _ = Describe("GPU Feature Discovery", Ordered, func() {
172196

173197
targetNodeName := nodes[0].Name
174198
Expect(targetNodeName).ToNot(BeEmpty())
175-
Eventually(func() bool {
199+
Eventually(func(g Gomega) bool {
176200
return checkNodeFeatureObject(ctx, targetNodeName)
177-
}, 2*time.Minute, 5*time.Second).Should(BeTrue())
201+
}).WithContext(ctx).WithPolling(5 * time.Second).WithTimeout(2 * time.Minute).Should(BeTrue())
178202

179203
By("Checking that node labels are created from NodeFeature object")
180204
labelChecker := map[string]k8sLabels{
@@ -187,7 +211,7 @@ var _ = Describe("GPU Feature Discovery", Ordered, func() {
187211
})
188212

189213
When("NV Driver is installed", func() {
190-
It("it should create nvidia.com labels", func(ctx context.Context) {
214+
It("it should create nvidia.com labels", Label("driver", "labels"), func(ctx SpecContext) {
191215
if !NVIDIA_DRIVER_ENABLED {
192216
Skip("NVIDIA_DRIVER_ENABLED is not set")
193217
}
@@ -210,7 +234,7 @@ var _ = Describe("GPU Feature Discovery", Ordered, func() {
210234
eventuallyNonControlPlaneNodes(ctx, clientSet).Should(MatchLabels(labelChecker, nodes))
211235
})
212236
Context("and the NodeFeature API is enabled", func() {
213-
It("gfd should create node feature object", func(ctx context.Context) {
237+
It("gfd should create node feature object", Label("driver", "nodefeature"), func(ctx SpecContext) {
214238
if !NVIDIA_DRIVER_ENABLED {
215239
Skip("NVIDIA_DRIVER_ENABLED is not set")
216240
}
@@ -228,9 +252,9 @@ var _ = Describe("GPU Feature Discovery", Ordered, func() {
228252

229253
targetNodeName := nodes[0].Name
230254
Expect(targetNodeName).ToNot(BeEmpty())
231-
Eventually(func() bool {
255+
Eventually(func(g Gomega) bool {
232256
return checkNodeFeatureObject(ctx, targetNodeName)
233-
}, 2*time.Minute, 5*time.Second).Should(BeTrue())
257+
}).WithContext(ctx).WithPolling(5 * time.Second).WithTimeout(2 * time.Minute).Should(BeTrue())
234258

235259
By("Checking that node labels are created from NodeFeature CR object")
236260
checkForLabels := map[string]k8sLabels{

0 commit comments

Comments
 (0)