diff --git a/controllers/object_controls.go b/controllers/object_controls.go index 0e9767a20..558fbb704 100644 --- a/controllers/object_controls.go +++ b/controllers/object_controls.go @@ -1379,11 +1379,46 @@ func TransformToolkit(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n func transformForRuntime(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, runtime string, container *corev1.Container) error { setContainerEnv(container, "RUNTIME", runtime) - if runtime == gpuv1.Containerd.String() { - // Set the runtime class name that is to be configured for containerd - setContainerEnv(container, "CONTAINERD_RUNTIME_CLASS", getRuntimeClassName(config)) + // In NRI plugin mode the toolkit installer runs the noop runtime configurer + // (see nvidia-ctk-installer's NewConfigurer): it does not read or modify the + // container runtime configuration and does not restart the runtime. Mounting the + // runtime config files and socket is therefore unnecessary in this mode, and the + // hostPath mounts (e.g. /etc/containerd/conf.d with DirectoryOrCreate) break on + // read-only hosts such as Talos. Only the NRI socket mount (below) is required. + if config.CDI.IsNRIPluginEnabled() { + // setup mounts for the runtime NRI socket file + nriSocketFile := getContainerEnv(container, "NRI_SOCKET") + if nriSocketFile == "" { + nriSocketFile = DefaultRuntimeNRISocketFile + } + setContainerEnv(container, "NRI_SOCKET", path.Join(DefaultRuntimeNRISocketTargetDir, path.Base(nriSocketFile)) + + nriVolMountSocketName := "nri-socket" + nriVolMountSocket := corev1.VolumeMount{Name: nriVolMountSocketName, MountPath: DefaultRuntimeNRISocketTargetDir} + container.VolumeMounts = append(container.VolumeMounts, nriVolMountSocket) + + nriSocketVol := corev1.Volume{Name: nriVolMountSocketName, VolumeSource: corev1.VolumeSource{HostPath: &corev1.HostPathVolumeSource{Path: path.Dir(nriSocketFile), Type: ptr.To(corev1.HostPathDirectoryOrCreate)}}} + obj.Spec.Template.Spec.Volumes = append(obj.Spec.Template.Spec.Volumes, nriSocketVol) + } else { + if runtime == gpuv1.Containerd.String() { + // Set the runtime class name that is to be configured for containerd + setContainerEnv(container, "CONTAINERD_RUNTIME_CLASS", getRuntimeClassName(config)) + } + + if err := transformRuntimeConfigAndSocketMounts(obj, runtime, container); err != nil { + return err + } } + return nil +} + +// transformRuntimeConfigAndSocketMounts configures the toolkit container with the +// mounts and environment required for the toolkit installer to update the container +// runtime configuration (top-level config, drop-in config, and runtime socket). This +// is not required when the NRI plugin is enabled, since the installer does not modify +// the runtime configuration in that mode. +func transformRuntimeConfigAndSocketMounts(obj *appsv1.DaemonSet, runtime string, container *corev1.Container) error { // For runtime config files we have top-level configs and drop-in files. // These are supported as follows: // * Docker only supports top-level config files. @@ -1489,22 +1524,6 @@ func transformForRuntime(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, obj.Spec.Template.Spec.Volumes = append(obj.Spec.Template.Spec.Volumes, socketVol) } - if config.CDI.IsNRIPluginEnabled() { - // setup mounts for the runtime NRI socket file - nriSocketFile := getContainerEnv(container, "NRI_SOCKET") - if nriSocketFile == "" { - nriSocketFile = DefaultRuntimeNRISocketFile - } - setContainerEnv(container, "NRI_SOCKET", DefaultRuntimeNRISocketTargetDir+path.Base(nriSocketFile)) - - nriVolMountSocketName := "nri-socket" - nriVolMountSocket := corev1.VolumeMount{Name: nriVolMountSocketName, MountPath: DefaultRuntimeNRISocketTargetDir} - container.VolumeMounts = append(container.VolumeMounts, nriVolMountSocket) - - nriSocketVol := corev1.Volume{Name: nriVolMountSocketName, VolumeSource: corev1.VolumeSource{HostPath: &corev1.HostPathVolumeSource{Path: path.Dir(nriSocketFile), Type: ptr.To(corev1.HostPathDirectoryOrCreate)}}} - obj.Spec.Template.Spec.Volumes = append(obj.Spec.Template.Spec.Volumes, nriSocketVol) - } - return nil } diff --git a/controllers/transforms_test.go b/controllers/transforms_test.go index 30310c1b2..3ce43674f 100644 --- a/controllers/transforms_test.go +++ b/controllers/transforms_test.go @@ -17,6 +17,7 @@ package controllers import ( + "path" "path/filepath" "strings" "testing" @@ -390,6 +391,7 @@ func TestTransformForRuntime(t *testing.T) { testCases := []struct { description string runtime gpuv1.Runtime + clusterPolicy *gpuv1.ClusterPolicySpec input Daemonset expectedOutput Daemonset }{ @@ -558,11 +560,43 @@ func TestTransformForRuntime(t *testing.T) { }, }), }, + { + // With the NRI plugin enabled, the toolkit installer does not modify the + // container runtime configuration, so no runtime config / drop-in / socket + // mounts are added -- only the NRI socket mount. + description: "containerd with NRI plugin enabled", + runtime: gpuv1.Containerd, + clusterPolicy: &gpuv1.ClusterPolicySpec{ + Operator: gpuv1.OperatorSpec{RuntimeClass: DefaultRuntimeClass}, + CDI: gpuv1.CDIConfigSpec{ + Enabled: ptr.To(true), + NRIPluginEnabled: ptr.To(true), + }, + }, + input: NewDaemonset(). + WithContainer(corev1.Container{Name: "test-ctr"}), + expectedOutput: NewDaemonset(). + WithHostPathVolume("nri-socket", filepath.Dir(DefaultRuntimeNRISocketFile), ptr.To(corev1.HostPathDirectoryOrCreate)). + WithContainer(corev1.Container{ + Name: "test-ctr", + Env: []corev1.EnvVar{ + {Name: "RUNTIME", Value: gpuv1.Containerd.String()}, + {Name: "NRI_SOCKET", Value: path.Join(DefaultRuntimeNRISocketTargetDir, path.Base(DefaultRuntimeNRISocketFile))}, + }, + VolumeMounts: []corev1.VolumeMount{ + {Name: "nri-socket", MountPath: DefaultRuntimeNRISocketTargetDir}, + }, + }), + }, } - cp := &gpuv1.ClusterPolicySpec{Operator: gpuv1.OperatorSpec{RuntimeClass: DefaultRuntimeClass}} + defaultCP := &gpuv1.ClusterPolicySpec{Operator: gpuv1.OperatorSpec{RuntimeClass: DefaultRuntimeClass}} for _, tc := range testCases { t.Run(tc.description, func(t *testing.T) { + cp := defaultCP + if tc.clusterPolicy != nil { + cp = tc.clusterPolicy + } // pass pointer to the target container err := transformForRuntime(tc.input.DaemonSet, cp, tc.runtime.String(), &tc.input.Spec.Template.Spec.Containers[0]) require.NoError(t, err)