Skip to content

Commit e4d84ae

Browse files
committed
implement NRI plugin server to inject management CDI devices
Signed-off-by: Tariq Ibrahim <[email protected]>
1 parent f845983 commit e4d84ae

File tree

589 files changed

+165221
-23
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

589 files changed

+165221
-23
lines changed

cmd/nvidia-ctk-installer/container/container.go

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -49,12 +49,15 @@ type Options struct {
4949
// mount.
5050
ExecutablePath string
5151
// EnableCDI indicates whether CDI should be enabled.
52-
EnableCDI bool
53-
RuntimeName string
54-
RuntimeDir string
55-
SetAsDefault bool
56-
RestartMode string
57-
HostRootMount string
52+
EnableCDI bool
53+
EnableNRI bool
54+
RuntimeName string
55+
RuntimeDir string
56+
SetAsDefault bool
57+
RestartMode string
58+
HostRootMount string
59+
NRIPluginIndex string
60+
NRISocket string
5861

5962
ConfigSources []string
6063
}
@@ -128,6 +131,10 @@ func (o Options) UpdateConfig(cfg engine.Interface) error {
128131
cfg.EnableCDI()
129132
}
130133

134+
if o.EnableNRI {
135+
cfg.EnableNRI()
136+
}
137+
131138
return nil
132139
}
133140

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
package nri
2+
3+
import (
4+
"context"
5+
"fmt"
6+
7+
"github.com/containerd/nri/pkg/api"
8+
"github.com/containerd/nri/pkg/stub"
9+
"sigs.k8s.io/yaml"
10+
11+
"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
12+
)
13+
14+
const (
15+
// nodeResourceCDIDeviceKey is the prefix of the key used for CDI device annotations.
16+
nodeResourceCDIDeviceKey = "cdi-devices.noderesource.dev"
17+
// nriCDIDeviceKey is the older prefix of the key used for CDI device annotations; it is consulted after nodeResourceCDIDeviceKey.
18+
nriCDIDeviceKey = "cdi-devices.nri.io"
19+
)
20+
21+
// Plugin is the NRI plugin whose CreateContainer handler injects the CDI
// devices requested through pod annotations into the container adjustment.
type Plugin struct {
22+
// logger receives the plugin's debug and info messages.
logger logger.Interface
23+
24+
// Stub is the NRI stub associated with this plugin. It is not created by
// the plugin itself; it is exported so that it can be assigned externally.
Stub stub.Stub
25+
}
26+
27+
// CreateContainer handles container creation requests.
28+
func (p *Plugin) CreateContainer(_ context.Context, pod *api.PodSandbox, ctr *api.Container) (*api.ContainerAdjustment, []*api.ContainerUpdate, error) {
29+
adjust := &api.ContainerAdjustment{}
30+
31+
if err := p.injectCDIDevices(pod, ctr, adjust); err != nil {
32+
return nil, nil, err
33+
}
34+
35+
return adjust, nil, nil
36+
}
37+
38+
func (p *Plugin) injectCDIDevices(pod *api.PodSandbox, ctr *api.Container, a *api.ContainerAdjustment) error {
39+
devices, err := parseCDIDevices(ctr.Name, pod.Annotations)
40+
if err != nil {
41+
return err
42+
}
43+
44+
if len(devices) == 0 {
45+
p.logger.Debugf("%s: no CDI devices annotated...", containerName(pod, ctr))
46+
return nil
47+
}
48+
49+
for _, name := range devices {
50+
a.AddCDIDevice(
51+
&api.CDIDevice{
52+
Name: name,
53+
},
54+
)
55+
p.logger.Infof("%s: injected CDI device %q...", containerName(pod, ctr), name)
56+
}
57+
58+
return nil
59+
}
60+
61+
func parseCDIDevices(ctr string, annotations map[string]string) ([]string, error) {
62+
var (
63+
cdiDevices []string
64+
)
65+
66+
annotation := getAnnotation(annotations, nodeResourceCDIDeviceKey, nriCDIDeviceKey, ctr)
67+
if len(annotation) == 0 {
68+
return nil, nil
69+
}
70+
71+
if err := yaml.Unmarshal(annotation, &cdiDevices); err != nil {
72+
return nil, fmt.Errorf("invalid CDI device annotation %q: %w", string(annotation), err)
73+
}
74+
75+
return cdiDevices, nil
76+
}
77+
78+
// getAnnotation returns the value of the most specific annotation present for
// the container named ctr, or nil when none is set.
//
// Scopes are tried from most to least specific: "<key>/container.<ctr>",
// then "<key>/pod", then the bare key. Within each scope mainKey takes
// precedence over oldKey.
func getAnnotation(annotations map[string]string, mainKey, oldKey, ctr string) []byte {
	scopes := [...]string{"/container." + ctr, "/pod", ""}
	prefixes := [...]string{mainKey, oldKey}

	for _, scope := range scopes {
		for _, prefix := range prefixes {
			value, found := annotations[prefix+scope]
			if !found {
				continue
			}
			return []byte(value)
		}
	}

	return nil
}
94+
95+
// Construct a container name for log messages.
96+
func containerName(pod *api.PodSandbox, container *api.Container) string {
97+
if pod != nil {
98+
return pod.Name + "/" + container.Name
99+
}
100+
return container.Name
101+
}

cmd/nvidia-ctk-installer/container/runtime/runtime.go

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,14 +17,19 @@
1717
package runtime
1818

1919
import (
20+
"context"
21+
"errors"
2022
"fmt"
23+
"strings"
2124

25+
"github.com/containerd/nri/pkg/stub"
2226
"github.com/urfave/cli/v3"
2327

2428
"github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-ctk-installer/container"
2529
"github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-ctk-installer/container/runtime/containerd"
2630
"github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-ctk-installer/container/runtime/crio"
2731
"github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-ctk-installer/container/runtime/docker"
32+
"github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-ctk-installer/container/runtime/nri"
2833
"github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-ctk-installer/toolkit"
2934
"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
3035
)
@@ -34,6 +39,8 @@ const (
3439
// defaultRuntimeName specifies the NVIDIA runtime to be used as the default runtime if setting the default runtime is enabled
3540
defaultRuntimeName = "nvidia"
3641
defaultHostRootMount = "/host"
42+
defaultNRIPluginIdx = "10"
43+
defaultNRISocket = "/var/run/nri/nri.sock"
3744

3845
runtimeSpecificDefault = "RUNTIME_SPECIFIC_DEFAULT"
3946
)
@@ -94,6 +101,27 @@ func Flags(opts *Options) []cli.Flag {
94101
Destination: &opts.EnableCDI,
95102
Sources: cli.EnvVars("RUNTIME_ENABLE_CDI"),
96103
},
104+
&cli.BoolFlag{
105+
Name: "enable-nri-in-runtime",
106+
Usage: "Enable NRI in the configured runtime",
107+
Destination: &opts.EnableNRI,
108+
Value: true,
109+
Sources: cli.EnvVars("RUNTIME_ENABLE_NRI"),
110+
},
111+
&cli.StringFlag{
112+
Name: "nri-plugin-index",
113+
Usage: "Specify the plugin index to register to NRI",
114+
Value: defaultNRIPluginIdx,
115+
Destination: &opts.NRIPluginIndex,
116+
Sources: cli.EnvVars("RUNTIME_NRI_PLUGIN_INDEX"),
117+
},
118+
&cli.StringFlag{
119+
Name: "nri-socket",
120+
Usage: "Specify the path to the NRI socket file to register the NRI plugin server",
121+
Value: defaultNRISocket,
122+
Destination: &opts.NRISocket,
123+
Sources: cli.EnvVars("RUNTIME_NRI_SOCKET"),
124+
},
97125
&cli.StringFlag{
98126
Name: "host-root",
99127
Usage: "Specify the path to the host root to be used when restarting the runtime using systemd",
@@ -250,3 +278,57 @@ func GetLowlevelRuntimePaths(opts *Options, runtime string) ([]string, error) {
250278
return nil, fmt.Errorf("undefined runtime %v", runtime)
251279
}
252280
}
281+
282+
func StartNRIPlugin(ctx context.Context, opts *Options) (*nri.Plugin, error) {
283+
284+
socketPaths := getNRISocketPaths(opts)
285+
p := &nri.Plugin{}
286+
var errs []error
287+
var nriSocketConnSuccess bool
288+
for _, socketPath := range socketPaths {
289+
var pluginOpts []stub.Option
290+
pluginOpts = append(pluginOpts, stub.WithPluginIdx(opts.NRIPluginIndex))
291+
pluginOpts = append(pluginOpts, stub.WithSocketPath(opts.HostRootMount+socketPath))
292+
var err error
293+
if p.Stub, err = stub.New(p, pluginOpts...); err != nil {
294+
errs = append(errs, fmt.Errorf("failed to initialise plugin at %s: %w", socketPath, err))
295+
continue
296+
}
297+
298+
fmt.Printf("Attempting to connect to %s\n", opts.HostRootMount+socketPath)
299+
err = p.Stub.Run(ctx)
300+
if err != nil {
301+
errs = append(errs, fmt.Errorf("plugin exited with error %w", err))
302+
} else {
303+
nriSocketConnSuccess = true
304+
break
305+
}
306+
}
307+
308+
if !nriSocketConnSuccess {
309+
return nil, errors.Join(errs...)
310+
}
311+
312+
return p, nil
313+
}
314+
315+
func getNRISocketPaths(opts *Options) []string {
316+
var socketPaths []string
317+
318+
origSocketPath := opts.NRISocket
319+
if len(origSocketPath) == 0 {
320+
origSocketPath = defaultNRISocket
321+
}
322+
323+
socketPaths = append(socketPaths, origSocketPath)
324+
socketPathSuffix, found := strings.CutPrefix(origSocketPath, "/var/run/")
325+
if found {
326+
fallbackSocketPath := fmt.Sprintf("%s/%s", "/run", socketPathSuffix)
327+
socketPaths = append(socketPaths, fallbackSocketPath)
328+
}
329+
return socketPaths
330+
}
331+
332+
func StopNRIPlugin(plugin *nri.Plugin) {
333+
plugin.Stub.Stop()
334+
}

cmd/nvidia-ctk-installer/main.go

Lines changed: 29 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import (
88
"path/filepath"
99
"syscall"
1010

11+
"github.com/containerd/nri/pkg/stub"
1112
"github.com/urfave/cli/v3"
1213
"golang.org/x/sys/unix"
1314

@@ -70,7 +71,8 @@ func main() {
7071
type app struct {
7172
logger logger.Interface
7273

73-
toolkit *toolkit.Installer
74+
pluginStub stub.Stub
75+
toolkit *toolkit.Installer
7476
}
7577

7678
// NewApp creates the CLI app for the specified options.
@@ -93,8 +95,8 @@ func (a app) build() *cli.Command {
9395
Before: func(ctx context.Context, cmd *cli.Command) (context.Context, error) {
9496
return ctx, a.Before(cmd, &options)
9597
},
96-
Action: func(_ context.Context, cmd *cli.Command) error {
97-
return a.Run(cmd, &options)
98+
Action: func(ctx context.Context, cmd *cli.Command) error {
99+
return a.Run(ctx, cmd, &options)
98100
},
99101
Flags: []cli.Flag{
100102
&cli.BoolFlag{
@@ -194,7 +196,7 @@ func (a *app) validateFlags(c *cli.Command, o *options) error {
194196
// Run installs the NVIDIA Container Toolkit and updates the requested runtime.
195197
// If the application is run as a daemon, the application waits and unconfigures
196198
// the runtime on termination.
197-
func (a *app) Run(c *cli.Command, o *options) error {
199+
func (a *app) Run(ctx context.Context, c *cli.Command, o *options) error {
198200
err := a.initialize(o.pidFile)
199201
if err != nil {
200202
return fmt.Errorf("unable to initialize: %v", err)
@@ -216,6 +218,13 @@ func (a *app) Run(c *cli.Command, o *options) error {
216218
return fmt.Errorf("unable to install toolkit: %v", err)
217219
}
218220

221+
if o.runtimeOptions.EnableNRI {
222+
err = a.startNRIPluginServer(ctx, &o.runtimeOptions)
223+
if err != nil {
224+
return fmt.Errorf("unable to start runtime plugin server: %w", err)
225+
}
226+
}
227+
219228
err = runtime.Setup(c, &o.runtimeOptions, o.runtime)
220229
if err != nil {
221230
return fmt.Errorf("unable to setup runtime: %v", err)
@@ -290,6 +299,11 @@ func (a *app) waitForSignal() error {
290299
func (a *app) shutdown(pidFile string) {
291300
a.logger.Infof("Shutting Down")
292301

302+
if a.pluginStub != nil {
303+
a.logger.Infof("Stopping NRI plugin server...")
304+
a.pluginStub.Stop()
305+
}
306+
293307
err := os.Remove(pidFile)
294308
if err != nil {
295309
a.logger.Warningf("Unable to remove pidfile: %v", err)
@@ -327,3 +341,14 @@ func (a *app) resolvePackageType(hostRoot string, packageType string) (rPackageT
327341

328342
return "deb", nil
329343
}
344+
345+
func (a *app) startNRIPluginServer(ctx context.Context, opts *runtime.Options) error {
346+
a.logger.Info("Starting NRI Plugin server...")
347+
plugin, err := runtime.StartNRIPlugin(ctx, opts)
348+
if plugin == nil || err != nil {
349+
a.logger.Errorf("Failed to start NRI plugin server: %v", err)
350+
return fmt.Errorf("unable to setup NRI plugin server: %w", err)
351+
}
352+
a.pluginStub = plugin.Stub
353+
return nil
354+
}

cmd/nvidia-ctk-installer/main_test.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -444,6 +444,7 @@ version = 2
444444
"--pid-file=" + filepath.Join(testRoot, "toolkit.pid"),
445445
"--restart-mode=none",
446446
"--toolkit-source-root=" + filepath.Join(artifactRoot, "deb"),
447+
"--enable-nri-in-runtime=false",
447448
}
448449

449450
err := app.Run(context.Background(), append(testArgs, tc.args...))

go.mod

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ go 1.25.0
55
require (
66
github.com/NVIDIA/go-nvlib v0.8.1
77
github.com/NVIDIA/go-nvml v0.13.0-1
8+
github.com/containerd/nri v0.10.1-0.20251120153915-7d8611f87ad7
89
github.com/google/uuid v1.6.0
910
github.com/moby/sys/mountinfo v0.7.2
1011
github.com/moby/sys/reexec v0.1.0
@@ -19,24 +20,31 @@ require (
1920
github.com/urfave/cli/v3 v3.6.1
2021
golang.org/x/mod v0.30.0
2122
golang.org/x/sys v0.38.0
23+
sigs.k8s.io/yaml v1.4.0
2224
tags.cncf.io/container-device-interface v1.0.2-0.20251114135136-1b24d969689f
2325
tags.cncf.io/container-device-interface/specs-go v1.0.0
2426
)
2527

2628
require (
2729
cyphar.com/go-pathrs v0.2.1 // indirect
30+
github.com/containerd/log v0.1.0 // indirect
31+
github.com/containerd/ttrpc v1.2.7 // indirect
2832
github.com/cyphar/filepath-securejoin v0.6.0 // indirect
2933
github.com/davecgh/go-spew v1.1.1 // indirect
3034
github.com/fsnotify/fsnotify v1.7.0 // indirect
35+
github.com/golang/protobuf v1.5.3 // indirect
3136
github.com/hashicorp/errwrap v1.1.0 // indirect
32-
github.com/kr/pretty v0.3.1 // indirect
37+
github.com/knqyf263/go-plugin v0.9.0 // indirect
38+
github.com/kr/text v0.2.0 // indirect
3339
github.com/moby/sys/capability v0.4.0 // indirect
3440
github.com/opencontainers/cgroups v0.0.4 // indirect
3541
github.com/opencontainers/runtime-tools v0.9.1-0.20251114084447-edf4cb3d2116 // indirect
3642
github.com/pmezard/go-difflib v1.0.0 // indirect
3743
github.com/rogpeppe/go-internal v1.11.0 // indirect
44+
github.com/tetratelabs/wazero v1.9.0 // indirect
3845
github.com/xeipuuv/gojsonpointer v0.0.0-20190905194746-02993c407bfb // indirect
39-
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c // indirect
46+
google.golang.org/genproto/googleapis/rpc v0.0.0-20230731190214-cbb8c96f2d6d // indirect
47+
google.golang.org/grpc v1.57.1 // indirect
48+
google.golang.org/protobuf v1.36.5 // indirect
4049
gopkg.in/yaml.v3 v3.0.1 // indirect
41-
sigs.k8s.io/yaml v1.4.0 // indirect
4250
)

0 commit comments

Comments
 (0)