Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 38 additions & 19 deletions controllers/object_controls.go
Original file line number Diff line number Diff line change
Expand Up @@ -1379,11 +1379,46 @@ func TransformToolkit(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n
func transformForRuntime(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, runtime string, container *corev1.Container) error {
setContainerEnv(container, "RUNTIME", runtime)

if runtime == gpuv1.Containerd.String() {
// Set the runtime class name that is to be configured for containerd
setContainerEnv(container, "CONTAINERD_RUNTIME_CLASS", getRuntimeClassName(config))
// In NRI plugin mode the toolkit installer runs the noop runtime configurer
// (see nvidia-ctk-installer's NewConfigurer): it does not read or modify the
// container runtime configuration and does not restart the runtime. Mounting the
// runtime config files and socket is therefore unnecessary in this mode, and the
// hostPath mounts (e.g. /etc/containerd/conf.d with DirectoryOrCreate) break on
// read-only hosts such as Talos. Only the NRI socket mount (below) is required.
if config.CDI.IsNRIPluginEnabled() {
Comment thread
frezbo marked this conversation as resolved.
// setup mounts for the runtime NRI socket file
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not move L1389-L1401 to a private the same way as you have done in transformRuntimeConfigAndSocketMounts?

nriSocketFile := getContainerEnv(container, "NRI_SOCKET")
if nriSocketFile == "" {
nriSocketFile = DefaultRuntimeNRISocketFile
}
setContainerEnv(container, "NRI_SOCKET", path.Join(DefaultRuntimeNRISocketTargetDir, path.Base(nriSocketFile))

nriVolMountSocketName := "nri-socket"
nriVolMountSocket := corev1.VolumeMount{Name: nriVolMountSocketName, MountPath: DefaultRuntimeNRISocketTargetDir}
container.VolumeMounts = append(container.VolumeMounts, nriVolMountSocket)

nriSocketVol := corev1.Volume{Name: nriVolMountSocketName, VolumeSource: corev1.VolumeSource{HostPath: &corev1.HostPathVolumeSource{Path: path.Dir(nriSocketFile), Type: ptr.To(corev1.HostPathDirectoryOrCreate)}}}
obj.Spec.Template.Spec.Volumes = append(obj.Spec.Template.Spec.Volumes, nriSocketVol)
Comment thread
frezbo marked this conversation as resolved.
} else {
if runtime == gpuv1.Containerd.String() {
// Set the runtime class name that is to be configured for containerd
setContainerEnv(container, "CONTAINERD_RUNTIME_CLASS", getRuntimeClassName(config))
}

if err := transformRuntimeConfigAndSocketMounts(obj, runtime, container); err != nil {
return err
}
}

return nil
}

// transformRuntimeConfigAndSocketMounts configures the toolkit container with the
// mounts and environment required for the toolkit installer to update the container
// runtime configuration (top-level config, drop-in config, and runtime socket). This
// is not required when the NRI plugin is enabled, since the installer does not modify
// the runtime configuration in that mode.
Comment on lines +1416 to +1420
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's leave out the parts that are more or less obvious.

Suggested change
// transformRuntimeConfigAndSocketMounts configures the toolkit container with the
// mounts and environment required for the toolkit installer to update the container
// runtime configuration (top-level config, drop-in config, and runtime socket). This
// is not required when the NRI plugin is enabled, since the installer does not modify
// the runtime configuration in that mode.
// transformRuntimeConfigAndSocketMounts configures the toolkit container with the
// mounts and environment required for the toolkit installer to update the container
// runtime configuration (top-level config, drop-in config, and runtime socket).

func transformRuntimeConfigAndSocketMounts(obj *appsv1.DaemonSet, runtime string, container *corev1.Container) error {
// For runtime config files we have top-level configs and drop-in files.
// These are supported as follows:
// * Docker only supports top-level config files.
Expand Down Expand Up @@ -1489,22 +1524,6 @@ func transformForRuntime(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec,
obj.Spec.Template.Spec.Volumes = append(obj.Spec.Template.Spec.Volumes, socketVol)
}

if config.CDI.IsNRIPluginEnabled() {
// setup mounts for the runtime NRI socket file
nriSocketFile := getContainerEnv(container, "NRI_SOCKET")
if nriSocketFile == "" {
nriSocketFile = DefaultRuntimeNRISocketFile
}
setContainerEnv(container, "NRI_SOCKET", DefaultRuntimeNRISocketTargetDir+path.Base(nriSocketFile))

nriVolMountSocketName := "nri-socket"
nriVolMountSocket := corev1.VolumeMount{Name: nriVolMountSocketName, MountPath: DefaultRuntimeNRISocketTargetDir}
container.VolumeMounts = append(container.VolumeMounts, nriVolMountSocket)

nriSocketVol := corev1.Volume{Name: nriVolMountSocketName, VolumeSource: corev1.VolumeSource{HostPath: &corev1.HostPathVolumeSource{Path: path.Dir(nriSocketFile), Type: ptr.To(corev1.HostPathDirectoryOrCreate)}}}
obj.Spec.Template.Spec.Volumes = append(obj.Spec.Template.Spec.Volumes, nriSocketVol)
}

return nil
}

Expand Down
36 changes: 35 additions & 1 deletion controllers/transforms_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
package controllers

import (
"path"
"path/filepath"
"strings"
"testing"
Expand Down Expand Up @@ -390,6 +391,7 @@ func TestTransformForRuntime(t *testing.T) {
testCases := []struct {
description string
runtime gpuv1.Runtime
clusterPolicy *gpuv1.ClusterPolicySpec
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead of modifying the test case struct here, you can add a new test case to the TestTransformToolkit method in object_controls_test.go. The test case struct over there already has the ClusterPolicySpec object as a struct field.

input Daemonset
expectedOutput Daemonset
}{
Expand Down Expand Up @@ -558,11 +560,43 @@ func TestTransformForRuntime(t *testing.T) {
},
}),
},
{
// With the NRI plugin enabled, the toolkit installer does not modify the
// container runtime configuration, so no runtime config / drop-in / socket
// mounts are added -- only the NRI socket mount.
description: "containerd with NRI plugin enabled",
runtime: gpuv1.Containerd,
clusterPolicy: &gpuv1.ClusterPolicySpec{
Operator: gpuv1.OperatorSpec{RuntimeClass: DefaultRuntimeClass},
CDI: gpuv1.CDIConfigSpec{
Enabled: ptr.To(true),
NRIPluginEnabled: ptr.To(true),
},
},
input: NewDaemonset().
WithContainer(corev1.Container{Name: "test-ctr"}),
expectedOutput: NewDaemonset().
WithHostPathVolume("nri-socket", filepath.Dir(DefaultRuntimeNRISocketFile), ptr.To(corev1.HostPathDirectoryOrCreate)).
WithContainer(corev1.Container{
Name: "test-ctr",
Env: []corev1.EnvVar{
{Name: "RUNTIME", Value: gpuv1.Containerd.String()},
{Name: "NRI_SOCKET", Value: path.Join(DefaultRuntimeNRISocketTargetDir, path.Base(DefaultRuntimeNRISocketFile))},
},
VolumeMounts: []corev1.VolumeMount{
{Name: "nri-socket", MountPath: DefaultRuntimeNRISocketTargetDir},
Comment thread
frezbo marked this conversation as resolved.
},
}),
},
}

cp := &gpuv1.ClusterPolicySpec{Operator: gpuv1.OperatorSpec{RuntimeClass: DefaultRuntimeClass}}
defaultCP := &gpuv1.ClusterPolicySpec{Operator: gpuv1.OperatorSpec{RuntimeClass: DefaultRuntimeClass}}
for _, tc := range testCases {
t.Run(tc.description, func(t *testing.T) {
cp := defaultCP
if tc.clusterPolicy != nil {
cp = tc.clusterPolicy
}
Comment thread
frezbo marked this conversation as resolved.
// pass pointer to the target container
err := transformForRuntime(tc.input.DaemonSet, cp, tc.runtime.String(), &tc.input.Spec.Template.Spec.Containers[0])
require.NoError(t, err)
Expand Down