diff --git a/api/nvidia/v1/clusterpolicy_types.go b/api/nvidia/v1/clusterpolicy_types.go index 12f8e9dcd..6f676f5f1 100644 --- a/api/nvidia/v1/clusterpolicy_types.go +++ b/api/nvidia/v1/clusterpolicy_types.go @@ -1539,6 +1539,12 @@ type GDRCopySpec struct { // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch" Enabled *bool `json:"enabled,omitempty"` + // UsePrecompiled indicates if deployment of GDRCopy using pre-compiled modules is enabled + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Enable GDRCopy deployment using pre-compiled modules" + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch" + UsePrecompiled *bool `json:"usePrecompiled,omitempty"` + // NVIDIA GDRCopy driver image repository // +kubebuilder:validation:Optional Repository string `json:"repository,omitempty"` @@ -2428,6 +2434,14 @@ func (gdrcopy *GDRCopySpec) IsEnabled() bool { return *gdrcopy.Enabled } +// UsePrecompiledDrivers returns true if usePrecompiled option is enabled in spec +func (gdrcopy *GDRCopySpec) UsePrecompiledDrivers() bool { + if gdrcopy.UsePrecompiled == nil { + return false + } + return *gdrcopy.UsePrecompiled +} + // IsEnabled returns true if DCGM hostengine as a separate Pod is enabled through gpu-perator func (dcgm *DCGMSpec) IsEnabled() bool { if dcgm.Enabled == nil { diff --git a/api/nvidia/v1/zz_generated.deepcopy.go b/api/nvidia/v1/zz_generated.deepcopy.go index 9e936de60..e4a4d79eb 100644 --- a/api/nvidia/v1/zz_generated.deepcopy.go +++ b/api/nvidia/v1/zz_generated.deepcopy.go @@ -846,6 +846,11 @@ func (in *GDRCopySpec) DeepCopyInto(out *GDRCopySpec) { *out = new(bool) **out = **in } + if in.UsePrecompiled != nil { + in, out := &in.UsePrecompiled, &out.UsePrecompiled + *out = new(bool) + **out = **in + } if in.ImagePullSecrets != nil { in, out := &in.ImagePullSecrets, &out.ImagePullSecrets *out = make([]string, len(*in)) diff --git a/api/nvidia/v1alpha1/nvidiadriver_types.go b/api/nvidia/v1alpha1/nvidiadriver_types.go index 613355ca6..ba4dab7c6 100644 --- a/api/nvidia/v1alpha1/nvidiadriver_types.go +++ b/api/nvidia/v1alpha1/nvidiadriver_types.go @@ -363,6 +363,12 @@ type GDRCopySpec struct { // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch" Enabled *bool `json:"enabled,omitempty"` + // UsePrecompiled indicates if deployment of GDRCopy using pre-compiled modules is enabled + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Enable GDRCopy deployment using pre-compiled modules" + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch" + UsePrecompiled *bool `json:"usePrecompiled,omitempty"` + // GDRCopy diver image repository // +kubebuilder:validation:Optional Repository string `json:"repository,omitempty"` @@ -599,6 +605,14 @@ func (d *GPUDirectStorageSpec) GetImagePath(osVersion string) (string, error) { return image, nil } +// UsePrecompiledDrivers returns true if usePrecompiled option is enabled in spec +func (d *GDRCopySpec) UsePrecompiledDrivers() bool { + if d.UsePrecompiled == nil { + return false + } + return *d.UsePrecompiled +} + // GetImagePath returns the gdrcopy driver image path given the information // provided in GDRCopySpec and the osVersion passed as an argument. // The driver image path will be in the following format unless the spec @@ -624,6 +638,32 @@ func (d *GDRCopySpec) GetImagePath(osVersion string) (string, error) { return image, nil } +// GetPrecompiledImagePath returns the precompiled gdrcopy image path for a +// given os version and kernel version. Precompiled gdrcopy images follow +// the following format: +// /:-- +func (d *GDRCopySpec) GetPrecompiledImagePath(osVersion string, kernelVersion string) (string, error) { + image, err := image.ImagePath(d.Repository, d.Image, d.Version, "") + if err != nil { + return "", fmt.Errorf("failed to get image path from crd: %w", err) + } + + // specifying a digest in the spec is not supported when using precompiled + if strings.Contains(image, "sha256:") { + return "", fmt.Errorf("specifying image digest is not supported when precompiled is enabled") + } + + // append '--' to the gdrcopy tag + image = fmt.Sprintf("%s-%s-%s", image, kernelVersion, osVersion) + + _, err = ref.New(image) + if err != nil { + return "", fmt.Errorf("failed to parse gdrcopy image path: %w", err) + } + + return image, nil +} + // GetPrecompiledImagePath returns the precompiled driver image path for a // given os version and kernel version. Precompiled driver images follow // the following format: diff --git a/api/nvidia/v1alpha1/nvidiadriver_types_test.go b/api/nvidia/v1alpha1/nvidiadriver_types_test.go index ba592f494..62b1303c7 100644 --- a/api/nvidia/v1alpha1/nvidiadriver_types_test.go +++ b/api/nvidia/v1alpha1/nvidiadriver_types_test.go @@ -402,3 +402,127 @@ func TestGDRCopyGetImagePath(t *testing.T) { }) } } + +func TestGDRCopyGetPrecompiledImagePath(t *testing.T) { + testCases := []struct { + description string + spec *GDRCopySpec + osVersion string + kernelVersion string + errorExpected bool + expectedImage string + }{ + { + description: "malformed repository", + spec: &GDRCopySpec{ + Repository: "malformed?/repo", + }, + errorExpected: true, + expectedImage: "", + }, + { + description: "malformed image", + spec: &GDRCopySpec{ + Image: "malformed?image", + }, + errorExpected: true, + expectedImage: "", + }, + { + description: "only image provided with no tag or digest", + spec: &GDRCopySpec{ + Image: "nvcr.io/nvidia/cloud-native/gdrdrv", + }, + errorExpected: true, + expectedImage: "", + }, + { + description: "only image provided with tag", + spec: &GDRCopySpec{ + Image: "nvcr.io/nvidia/cloud-native/gdrdrv:v2.5.2", + }, + osVersion: "ubuntu22.04", + kernelVersion: "5.4.0-150-generic", + expectedImage: "nvcr.io/nvidia/cloud-native/gdrdrv:v2.5.2-5.4.0-150-generic-ubuntu22.04", + }, + { + description: "only image provided with digest", + spec: &GDRCopySpec{ + Image: "nvcr.io/nvidia/cloud-native/gdrdrv@sha256:" + testDigest, + }, + osVersion: "ubuntu22.04", + kernelVersion: "5.4.0-150-generic", + errorExpected: true, + expectedImage: "", + }, + { + description: "repository, image, and version set but image contains a tag", + spec: &GDRCopySpec{ + Repository: "nvcr.io/nvidia/cloud-native", + Image: "nvcr.io/nvidia/cloud-native/gdrdrv:v2.4.1", + Version: "v2.5.2", + }, + osVersion: "ubuntu22.04", + kernelVersion: "5.4.0-150-generic", + errorExpected: true, + expectedImage: "", + }, + { + description: "repository, image, and version set but image contains a digest", + spec: &GDRCopySpec{ + Repository: "nvcr.io/nvidia/cloud-native", + Image: "nvcr.io/nvidia/cloud-native/gdrdrv@sha256:" + testDigest, + Version: "v2.5.2", + }, + osVersion: "ubuntu22.04", + kernelVersion: "5.4.0-150-generic", + errorExpected: true, + expectedImage: "", + }, + { + description: "missing version", + spec: &GDRCopySpec{ + Repository: "nvcr.io/nvidia/cloud-native", + Image: "gdrdrv", + }, + osVersion: "ubuntu22.04", + kernelVersion: "5.4.0-150-generic", + errorExpected: true, + expectedImage: "", + }, + { + description: "repository, image, and version set; version is a tag", + spec: &GDRCopySpec{ + Repository: "nvcr.io/nvidia/cloud-native", + Image: "gdrdrv", + Version: "v2.5.2", + }, + osVersion: "ubuntu22.04", + kernelVersion: "5.4.0-150-generic", + expectedImage: "nvcr.io/nvidia/cloud-native/gdrdrv:v2.5.2-5.4.0-150-generic-ubuntu22.04", + }, + { + description: "repository, image, and version set; version is a digest", + spec: &GDRCopySpec{ + Repository: "nvcr.io/nvidia/cloud-native", + Image: "gdrdrv", + Version: "sha256:" + testDigest, + }, + osVersion: "ubuntu22.04", + kernelVersion: "5.4.0-150-generic", + errorExpected: true, + }, + } + + for _, tc := range testCases { + t.Run(tc.description, func(t *testing.T) { + image, err := tc.spec.GetPrecompiledImagePath(tc.osVersion, tc.kernelVersion) + if tc.errorExpected { + require.Error(t, err) + } else { + require.NoError(t, err) + } + require.Equal(t, image, tc.expectedImage) + }) + } +} diff --git a/api/nvidia/v1alpha1/zz_generated.deepcopy.go b/api/nvidia/v1alpha1/zz_generated.deepcopy.go index 466ecebbe..ebd1b082e 100644 --- a/api/nvidia/v1alpha1/zz_generated.deepcopy.go +++ b/api/nvidia/v1alpha1/zz_generated.deepcopy.go @@ -139,6 +139,11 @@ func (in *GDRCopySpec) DeepCopyInto(out *GDRCopySpec) { *out = new(bool) **out = **in } + if in.UsePrecompiled != nil { + in, out := &in.UsePrecompiled, &out.UsePrecompiled + *out = new(bool) + **out = **in + } if in.ImagePullSecrets != nil { in, out := &in.ImagePullSecrets, &out.ImagePullSecrets *out = make([]string, len(*in)) diff --git a/bundle/manifests/nvidia.com_clusterpolicies.yaml b/bundle/manifests/nvidia.com_clusterpolicies.yaml index ce48355b4..9d8cc1f57 100644 --- a/bundle/manifests/nvidia.com_clusterpolicies.yaml +++ b/bundle/manifests/nvidia.com_clusterpolicies.yaml @@ -1386,6 +1386,10 @@ spec: repository: description: NVIDIA GDRCopy driver image repository type: string + usePrecompiled: + description: UsePrecompiled indicates if deployment of GDRCopy + using pre-compiled modules is enabled + type: boolean version: description: NVIDIA GDRCopy driver image tag type: string diff --git a/bundle/manifests/nvidia.com_nvidiadrivers.yaml b/bundle/manifests/nvidia.com_nvidiadrivers.yaml index 81c234157..b74084fab 100644 --- a/bundle/manifests/nvidia.com_nvidiadrivers.yaml +++ b/bundle/manifests/nvidia.com_nvidiadrivers.yaml @@ -141,6 +141,10 @@ spec: repository: description: GDRCopy diver image repository type: string + usePrecompiled: + description: UsePrecompiled indicates if deployment of GDRCopy + using pre-compiled modules is enabled + type: boolean version: description: GDRCopy driver image tag type: string diff --git a/config/crd/bases/nvidia.com_clusterpolicies.yaml b/config/crd/bases/nvidia.com_clusterpolicies.yaml index ce48355b4..9d8cc1f57 100644 --- a/config/crd/bases/nvidia.com_clusterpolicies.yaml +++ b/config/crd/bases/nvidia.com_clusterpolicies.yaml @@ -1386,6 +1386,10 @@ spec: repository: description: NVIDIA GDRCopy driver image repository type: string + usePrecompiled: + description: UsePrecompiled indicates if deployment of GDRCopy + using pre-compiled modules is enabled + type: boolean version: description: NVIDIA GDRCopy driver image tag type: string diff --git a/config/crd/bases/nvidia.com_nvidiadrivers.yaml b/config/crd/bases/nvidia.com_nvidiadrivers.yaml index 81c234157..b74084fab 100644 --- a/config/crd/bases/nvidia.com_nvidiadrivers.yaml +++ b/config/crd/bases/nvidia.com_nvidiadrivers.yaml @@ -141,6 +141,10 @@ spec: repository: description: GDRCopy diver image repository type: string + usePrecompiled: + description: UsePrecompiled indicates if deployment of GDRCopy + using pre-compiled modules is enabled + type: boolean version: description: GDRCopy driver image tag type: string diff --git a/controllers/object_controls.go b/controllers/object_controls.go index 0e9767a20..61285a6f1 100644 --- a/controllers/object_controls.go +++ b/controllers/object_controls.go @@ -3140,8 +3140,8 @@ func transformGDRCopyContainer(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolic obj.Spec.Template.Spec.Containers = append(obj.Spec.Template.Spec.Containers[:i], obj.Spec.Template.Spec.Containers[i+1:]...) return nil } - if config.Driver.UsePrecompiledDrivers() { - return fmt.Errorf("GDRCopy is not supported along with pre-compiled NVIDIA drivers") + if config.Driver.UsePrecompiledDrivers() && !config.GDRCopy.UsePrecompiledDrivers() { + return fmt.Errorf("GDRCopy is not supported along with pre-compiled NVIDIA drivers unless gdrcopy.usePrecompiled is also enabled") } gdrcopyContainer := &obj.Spec.Template.Spec.Containers[i] @@ -3425,9 +3425,14 @@ func resolveDriverTag(n ClusterPolicyController, driverSpec interface{}) (string } case *gpuv1.GDRCopySpec: spec := driverSpec.(*gpuv1.GDRCopySpec) - image, err = gpuv1.ImagePath(spec) - if err != nil { - return "", err + if spec.UsePrecompiledDrivers() { + // use per kernel version tag + image = spec.Repository + "/" + spec.Image + ":" + spec.Version + "-" + n.currentKernelVersion + } else { + image, err = gpuv1.ImagePath(spec) + if err != nil { + return "", err + } } default: return "", fmt.Errorf("invalid type to construct image path: %v", v) diff --git a/controllers/object_controls_test.go b/controllers/object_controls_test.go index 692486b37..fd9717a79 100644 --- a/controllers/object_controls_test.go +++ b/controllers/object_controls_test.go @@ -2292,3 +2292,62 @@ func TestDriverPrecompiledLibModulesSuse(t *testing.T) { }) } } + +// TestGDRCopyResolveDriverTag tests that resolveDriverTag returns the correct +// image path for GDRCopy in both non-precompiled and precompiled modes. +func TestGDRCopyResolveDriverTag(t *testing.T) { + const ( + repo = "nvcr.io/nvidia/cloud-native" + image = "gdrdrv" + version = "v2.5.2" + kernel = "5.4.0-150-generic" + osTag = "ubuntu22.04" + ) + + n := ClusterPolicyController{ + currentKernelVersion: kernel, + gpuNodeOSTag: osTag, + singleton: &gpuv1.ClusterPolicy{}, + } + + t.Run("non-precompiled", func(t *testing.T) { + spec := &gpuv1.GDRCopySpec{Repository: repo, Image: image, Version: version} + got, err := resolveDriverTag(n, spec) + require.NoError(t, err) + require.Equal(t, repo+"/"+image+":"+version+"-"+osTag, got) + }) + + t.Run("precompiled", func(t *testing.T) { + spec := &gpuv1.GDRCopySpec{Repository: repo, Image: image, Version: version, UsePrecompiled: ptr.To(true)} + got, err := resolveDriverTag(n, spec) + require.NoError(t, err) + require.Equal(t, repo+"/"+image+":"+version+"-"+kernel+"-"+osTag, got) + }) +} + +// TestGDRCopyPrecompiledDriverIncompatibility tests that enabling precompiled +// drivers without enabling precompiled GDRCopy returns an error. +func TestGDRCopyPrecompiledDriverIncompatibility(t *testing.T) { + cp := getDriverTestInput("precompiled") + enabled := true + cp.Spec.GDRCopy = &gpuv1.GDRCopySpec{ + Enabled: &enabled, + Repository: "nvcr.io/nvidia/cloud-native", + Image: "gdrdrv", + Version: "v2.5.2", + UsePrecompiled: ptr.To(false), + } + + err := updateClusterPolicy(&clusterPolicyController, cp) + require.NoError(t, err) + + addState(&clusterPolicyController, filepath.Join(cfg.root, driverAssetsPath)) + // step() returns an error without incrementing idx, so the state is still at idx + defer func() { + _ = removeState(&clusterPolicyController, clusterPolicyController.idx) + }() + + _, err = clusterPolicyController.step() + require.Error(t, err, "expected error when driver is precompiled but GDRCopy is not") + require.Contains(t, err.Error(), "GDRCopy is not supported") +} diff --git a/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml b/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml index ce48355b4..9d8cc1f57 100644 --- a/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml +++ b/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml @@ -1386,6 +1386,10 @@ spec: repository: description: NVIDIA GDRCopy driver image repository type: string + usePrecompiled: + description: UsePrecompiled indicates if deployment of GDRCopy + using pre-compiled modules is enabled + type: boolean version: description: NVIDIA GDRCopy driver image tag type: string diff --git a/deployments/gpu-operator/crds/nvidia.com_nvidiadrivers.yaml b/deployments/gpu-operator/crds/nvidia.com_nvidiadrivers.yaml index 81c234157..b74084fab 100644 --- a/deployments/gpu-operator/crds/nvidia.com_nvidiadrivers.yaml +++ b/deployments/gpu-operator/crds/nvidia.com_nvidiadrivers.yaml @@ -141,6 +141,10 @@ spec: repository: description: GDRCopy diver image repository type: string + usePrecompiled: + description: UsePrecompiled indicates if deployment of GDRCopy + using pre-compiled modules is enabled + type: boolean version: description: GDRCopy driver image tag type: string diff --git a/deployments/gpu-operator/values.yaml b/deployments/gpu-operator/values.yaml index a0ad8bdd4..f19077b40 100644 --- a/deployments/gpu-operator/values.yaml +++ b/deployments/gpu-operator/values.yaml @@ -450,6 +450,7 @@ gds: gdrcopy: enabled: false + usePrecompiled: false repository: nvcr.io/nvidia/cloud-native image: gdrdrv version: "v2.5.2" diff --git a/internal/state/driver.go b/internal/state/driver.go index 355910a8e..503842889 100644 --- a/internal/state/driver.go +++ b/internal/state/driver.go @@ -623,13 +623,23 @@ func getGDSSpec(spec *nvidiav1alpha1.NVIDIADriverSpec, pool nodePool) (*gdsDrive }, nil } +func getGDRCopyImagePath(spec *nvidiav1alpha1.GDRCopySpec, pool nodePool) (string, error) { + os := pool.osTag + + if spec.UsePrecompiledDrivers() { + return spec.GetPrecompiledImagePath(os, pool.kernel) + } + + return spec.GetImagePath(os) +} + func getGDRCopySpec(spec *nvidiav1alpha1.NVIDIADriverSpec, pool nodePool) (*gdrcopyDriverSpec, error) { if spec == nil || !spec.IsGDRCopyEnabled() { // note: GDRCopy is optional in the NvidiaDriver CRD return nil, nil } gdrcopySpec := spec.GDRCopy - imagePath, err := gdrcopySpec.GetImagePath(pool.osTag) + imagePath, err := getGDRCopyImagePath(gdrcopySpec, pool) if err != nil { return nil, err }