Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions api/nvidia/v1/clusterpolicy_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -1539,6 +1539,12 @@ type GDRCopySpec struct {
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch"
Enabled *bool `json:"enabled,omitempty"`

// UsePrecompiled indicates if deployment of GDRCopy using pre-compiled modules is enabled
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Enable GDRCopy deployment using pre-compiled modules"
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch"
UsePrecompiled *bool `json:"usePrecompiled,omitempty"`

// NVIDIA GDRCopy driver image repository
// +kubebuilder:validation:Optional
Repository string `json:"repository,omitempty"`
Expand Down Expand Up @@ -2428,6 +2434,14 @@ func (gdrcopy *GDRCopySpec) IsEnabled() bool {
return *gdrcopy.Enabled
}

// UsePrecompiledDrivers reports whether the usePrecompiled option is set to
// true in the spec. An unset (nil) option is treated as disabled.
func (gdrcopy *GDRCopySpec) UsePrecompiledDrivers() bool {
	return gdrcopy.UsePrecompiled != nil && *gdrcopy.UsePrecompiled
}

// IsEnabled returns true if DCGM hostengine as a separate Pod is enabled through gpu-operator
func (dcgm *DCGMSpec) IsEnabled() bool {
if dcgm.Enabled == nil {
Expand Down
5 changes: 5 additions & 0 deletions api/nvidia/v1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

40 changes: 40 additions & 0 deletions api/nvidia/v1alpha1/nvidiadriver_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -363,6 +363,12 @@ type GDRCopySpec struct {
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch"
Enabled *bool `json:"enabled,omitempty"`

// UsePrecompiled indicates if deployment of GDRCopy using pre-compiled modules is enabled
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Enable GDRCopy deployment using pre-compiled modules"
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch"
UsePrecompiled *bool `json:"usePrecompiled,omitempty"`

// GDRCopy driver image repository
// +kubebuilder:validation:Optional
Repository string `json:"repository,omitempty"`
Expand Down Expand Up @@ -599,6 +605,14 @@ func (d *GPUDirectStorageSpec) GetImagePath(osVersion string) (string, error) {
return image, nil
}

// UsePrecompiledDrivers reports whether the usePrecompiled option is set to
// true in the spec. An unset (nil) option is treated as disabled.
func (d *GDRCopySpec) UsePrecompiledDrivers() bool {
	return d.UsePrecompiled != nil && *d.UsePrecompiled
}

// GetImagePath returns the gdrcopy driver image path given the information
// provided in GDRCopySpec and the osVersion passed as an argument.
// The driver image path will be in the following format unless the spec
Expand All @@ -624,6 +638,32 @@ func (d *GDRCopySpec) GetImagePath(osVersion string) (string, error) {
return image, nil
}

// GetPrecompiledImagePath builds the precompiled gdrcopy image path for the
// given os version and kernel version. The result has the form:
// <repository>/<image>:<gdrcopy-ver>-<kernel-ver>-<os-ver>
//
// An error is returned when the base image path cannot be built from the
// spec, when the spec refers to the image by digest, or when the final path
// does not parse as an image reference.
func (d *GDRCopySpec) GetPrecompiledImagePath(osVersion string, kernelVersion string) (string, error) {
	basePath, err := image.ImagePath(d.Repository, d.Image, d.Version, "")
	if err != nil {
		return "", fmt.Errorf("failed to get image path from crd: %w", err)
	}

	// a digest cannot be combined with precompiled images, so reject it
	if strings.Contains(basePath, "sha256:") {
		return "", fmt.Errorf("specifying image digest is not supported when precompiled is enabled")
	}

	// the gdrcopy tag gets a '-<kernelVersion>-<osVersion>' suffix
	precompiledPath := basePath + "-" + kernelVersion + "-" + osVersion

	// make sure the suffixed path is still a well-formed image reference
	if _, err := ref.New(precompiledPath); err != nil {
		return "", fmt.Errorf("failed to parse gdrcopy image path: %w", err)
	}

	return precompiledPath, nil
}

// GetPrecompiledImagePath returns the precompiled driver image path for a
// given os version and kernel version. Precompiled driver images follow
// the following format:
Expand Down
124 changes: 124 additions & 0 deletions api/nvidia/v1alpha1/nvidiadriver_types_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -402,3 +402,127 @@ func TestGDRCopyGetImagePath(t *testing.T) {
})
}
}

// TestGDRCopyGetPrecompiledImagePath is a table-driven test for
// GDRCopySpec.GetPrecompiledImagePath. Each case supplies a spec together
// with an os version and kernel version, and states either the expected
// image path or that an error is expected. For error cases the returned
// path must be empty.
func TestGDRCopyGetPrecompiledImagePath(t *testing.T) {
	testCases := []struct {
		description   string
		spec          *GDRCopySpec
		osVersion     string
		kernelVersion string
		errorExpected bool
		expectedImage string
	}{
		{
			description: "malformed repository",
			spec: &GDRCopySpec{
				Repository: "malformed?/repo",
			},
			errorExpected: true,
			expectedImage: "",
		},
		{
			description: "malformed image",
			spec: &GDRCopySpec{
				Image: "malformed?image",
			},
			errorExpected: true,
			expectedImage: "",
		},
		{
			description: "only image provided with no tag or digest",
			spec: &GDRCopySpec{
				Image: "nvcr.io/nvidia/cloud-native/gdrdrv",
			},
			errorExpected: true,
			expectedImage: "",
		},
		{
			description: "only image provided with tag",
			spec: &GDRCopySpec{
				Image: "nvcr.io/nvidia/cloud-native/gdrdrv:v2.5.2",
			},
			osVersion:     "ubuntu22.04",
			kernelVersion: "5.4.0-150-generic",
			expectedImage: "nvcr.io/nvidia/cloud-native/gdrdrv:v2.5.2-5.4.0-150-generic-ubuntu22.04",
		},
		{
			description: "only image provided with digest",
			spec: &GDRCopySpec{
				Image: "nvcr.io/nvidia/cloud-native/gdrdrv@sha256:" + testDigest,
			},
			osVersion:     "ubuntu22.04",
			kernelVersion: "5.4.0-150-generic",
			errorExpected: true,
			expectedImage: "",
		},
		{
			description: "repository, image, and version set but image contains a tag",
			spec: &GDRCopySpec{
				Repository: "nvcr.io/nvidia/cloud-native",
				Image:      "nvcr.io/nvidia/cloud-native/gdrdrv:v2.4.1",
				Version:    "v2.5.2",
			},
			osVersion:     "ubuntu22.04",
			kernelVersion: "5.4.0-150-generic",
			errorExpected: true,
			expectedImage: "",
		},
		{
			description: "repository, image, and version set but image contains a digest",
			spec: &GDRCopySpec{
				Repository: "nvcr.io/nvidia/cloud-native",
				Image:      "nvcr.io/nvidia/cloud-native/gdrdrv@sha256:" + testDigest,
				Version:    "v2.5.2",
			},
			osVersion:     "ubuntu22.04",
			kernelVersion: "5.4.0-150-generic",
			errorExpected: true,
			expectedImage: "",
		},
		{
			description: "missing version",
			spec: &GDRCopySpec{
				Repository: "nvcr.io/nvidia/cloud-native",
				Image:      "gdrdrv",
			},
			osVersion:     "ubuntu22.04",
			kernelVersion: "5.4.0-150-generic",
			errorExpected: true,
			expectedImage: "",
		},
		{
			description: "repository, image, and version set; version is a tag",
			spec: &GDRCopySpec{
				Repository: "nvcr.io/nvidia/cloud-native",
				Image:      "gdrdrv",
				Version:    "v2.5.2",
			},
			osVersion:     "ubuntu22.04",
			kernelVersion: "5.4.0-150-generic",
			expectedImage: "nvcr.io/nvidia/cloud-native/gdrdrv:v2.5.2-5.4.0-150-generic-ubuntu22.04",
		},
		{
			description: "repository, image, and version set; version is a digest",
			spec: &GDRCopySpec{
				Repository: "nvcr.io/nvidia/cloud-native",
				Image:      "gdrdrv",
				Version:    "sha256:" + testDigest,
			},
			osVersion:     "ubuntu22.04",
			kernelVersion: "5.4.0-150-generic",
			errorExpected: true,
		},
	}

	for _, tc := range testCases {
		t.Run(tc.description, func(t *testing.T) {
			got, err := tc.spec.GetPrecompiledImagePath(tc.osVersion, tc.kernelVersion)
			if tc.errorExpected {
				require.Error(t, err)
			} else {
				require.NoError(t, err)
			}
			// require.Equal takes (expected, actual); keeping that order makes
			// the failure output label the two values correctly.
			require.Equal(t, tc.expectedImage, got)
		})
	}
}
5 changes: 5 additions & 0 deletions api/nvidia/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions bundle/manifests/nvidia.com_clusterpolicies.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1386,6 +1386,10 @@ spec:
repository:
description: NVIDIA GDRCopy driver image repository
type: string
usePrecompiled:
description: UsePrecompiled indicates if deployment of GDRCopy
using pre-compiled modules is enabled
type: boolean
version:
description: NVIDIA GDRCopy driver image tag
type: string
Expand Down
4 changes: 4 additions & 0 deletions bundle/manifests/nvidia.com_nvidiadrivers.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,10 @@ spec:
repository:
description: GDRCopy driver image repository
type: string
usePrecompiled:
description: UsePrecompiled indicates if deployment of GDRCopy
using pre-compiled modules is enabled
type: boolean
version:
description: GDRCopy driver image tag
type: string
Expand Down
4 changes: 4 additions & 0 deletions config/crd/bases/nvidia.com_clusterpolicies.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1386,6 +1386,10 @@ spec:
repository:
description: NVIDIA GDRCopy driver image repository
type: string
usePrecompiled:
description: UsePrecompiled indicates if deployment of GDRCopy
using pre-compiled modules is enabled
type: boolean
version:
description: NVIDIA GDRCopy driver image tag
type: string
Expand Down
4 changes: 4 additions & 0 deletions config/crd/bases/nvidia.com_nvidiadrivers.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,10 @@ spec:
repository:
description: GDRCopy driver image repository
type: string
usePrecompiled:
description: UsePrecompiled indicates if deployment of GDRCopy
using pre-compiled modules is enabled
type: boolean
version:
description: GDRCopy driver image tag
type: string
Expand Down
15 changes: 10 additions & 5 deletions controllers/object_controls.go
Original file line number Diff line number Diff line change
Expand Up @@ -3140,8 +3140,8 @@ func transformGDRCopyContainer(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolic
obj.Spec.Template.Spec.Containers = append(obj.Spec.Template.Spec.Containers[:i], obj.Spec.Template.Spec.Containers[i+1:]...)
return nil
}
if config.Driver.UsePrecompiledDrivers() {
return fmt.Errorf("GDRCopy is not supported along with pre-compiled NVIDIA drivers")
if config.Driver.UsePrecompiledDrivers() && !config.GDRCopy.UsePrecompiledDrivers() {
return fmt.Errorf("GDRCopy is not supported along with pre-compiled NVIDIA drivers unless gdrcopy.usePrecompiled is also enabled")
}

gdrcopyContainer := &obj.Spec.Template.Spec.Containers[i]
Expand Down Expand Up @@ -3425,9 +3425,14 @@ func resolveDriverTag(n ClusterPolicyController, driverSpec interface{}) (string
}
case *gpuv1.GDRCopySpec:
spec := driverSpec.(*gpuv1.GDRCopySpec)
image, err = gpuv1.ImagePath(spec)
if err != nil {
return "", err
if spec.UsePrecompiledDrivers() {
// use per kernel version tag
image = spec.Repository + "/" + spec.Image + ":" + spec.Version + "-" + n.currentKernelVersion
} else {
image, err = gpuv1.ImagePath(spec)
if err != nil {
return "", err
}
}
default:
return "", fmt.Errorf("invalid type to construct image path: %v", v)
Expand Down
59 changes: 59 additions & 0 deletions controllers/object_controls_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2292,3 +2292,62 @@ func TestDriverPrecompiledLibModulesSuse(t *testing.T) {
})
}
}

// TestGDRCopyResolveDriverTag checks the image path that resolveDriverTag
// returns for a GDRCopySpec, once with usePrecompiled left unset and once
// with it set to true.
func TestGDRCopyResolveDriverTag(t *testing.T) {
	const (
		repo    = "nvcr.io/nvidia/cloud-native"
		image   = "gdrdrv"
		version = "v2.5.2"
		kernel  = "5.4.0-150-generic"
		osTag   = "ubuntu22.04"
	)

	controller := ClusterPolicyController{
		currentKernelVersion: kernel,
		gpuNodeOSTag:         osTag,
		singleton:            &gpuv1.ClusterPolicy{},
	}

	// both expectations share the '<repo>/<image>:<version>' prefix
	taggedImage := repo + "/" + image + ":" + version

	testCases := []struct {
		name           string
		usePrecompiled *bool
		want           string
	}{
		{
			name: "non-precompiled",
			want: taggedImage + "-" + osTag,
		},
		{
			name:           "precompiled",
			usePrecompiled: ptr.To(true),
			want:           taggedImage + "-" + kernel + "-" + osTag,
		},
	}

	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			spec := &gpuv1.GDRCopySpec{
				Repository:     repo,
				Image:          image,
				Version:        version,
				UsePrecompiled: tc.usePrecompiled,
			}
			got, err := resolveDriverTag(controller, spec)
			require.NoError(t, err)
			require.Equal(t, tc.want, got)
		})
	}
}

// TestGDRCopyPrecompiledDriverIncompatibility verifies that a cluster policy
// built from the "precompiled" driver test input, with GDRCopy enabled but
// gdrcopy.usePrecompiled set to false, makes the controller step fail with a
// "GDRCopy is not supported" error.
func TestGDRCopyPrecompiledDriverIncompatibility(t *testing.T) {
	policy := getDriverTestInput("precompiled")
	policy.Spec.GDRCopy = &gpuv1.GDRCopySpec{
		Enabled:        ptr.To(true),
		Repository:     "nvcr.io/nvidia/cloud-native",
		Image:          "gdrdrv",
		Version:        "v2.5.2",
		UsePrecompiled: ptr.To(false),
	}

	require.NoError(t, updateClusterPolicy(&clusterPolicyController, policy))

	addState(&clusterPolicyController, filepath.Join(cfg.root, driverAssetsPath))
	// step() returns an error without incrementing idx, so the state added
	// above is still the one at idx when the cleanup runs
	defer func() {
		_ = removeState(&clusterPolicyController, clusterPolicyController.idx)
	}()

	_, stepErr := clusterPolicyController.step()
	require.Error(t, stepErr, "expected error when driver is precompiled but GDRCopy is not")
	require.Contains(t, stepErr.Error(), "GDRCopy is not supported")
}
Original file line number Diff line number Diff line change
Expand Up @@ -1386,6 +1386,10 @@ spec:
repository:
description: NVIDIA GDRCopy driver image repository
type: string
usePrecompiled:
description: UsePrecompiled indicates if deployment of GDRCopy
using pre-compiled modules is enabled
type: boolean
version:
description: NVIDIA GDRCopy driver image tag
type: string
Expand Down
4 changes: 4 additions & 0 deletions deployments/gpu-operator/crds/nvidia.com_nvidiadrivers.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,10 @@ spec:
repository:
description: GDRCopy driver image repository
type: string
usePrecompiled:
description: UsePrecompiled indicates if deployment of GDRCopy
using pre-compiled modules is enabled
type: boolean
version:
description: GDRCopy driver image tag
type: string
Expand Down
Loading