diff --git a/controllers/clusterpolicy_controller.go b/controllers/clusterpolicy_controller.go index d16d2d445..fedae1ba6 100644 --- a/controllers/clusterpolicy_controller.go +++ b/controllers/clusterpolicy_controller.go @@ -42,6 +42,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/source" gpuv1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1" + nvidiav1alpha1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1alpha1" "github.com/NVIDIA/gpu-operator/internal/conditions" ) @@ -143,6 +144,20 @@ func (r *ClusterPolicyReconciler) Reconcile(ctx context.Context, req ctrl.Reques } clusterPolicyCtrl.operatorMetrics.reconciliationTotal.Inc() + + nvidiaDriverEnabled := nvidiaDriverCRDEnabled(instance) + if nvidiaDriverEnabled { + if err := assignNVIDIADriverOwners(ctx, r.Client); err != nil { + clusterPolicyCtrl.operatorMetrics.reconciliationStatus.Set(reconciliationStatusNotReady) + clusterPolicyCtrl.operatorMetrics.reconciliationFailed.Inc() + updateCRState(ctx, r, req.NamespacedName, gpuv1.NotReady) + if condErr := r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, err.Error()); condErr != nil { + r.Log.Error(condErr, "failed to set condition") + } + return ctrl.Result{}, err + } + } + overallStatus := gpuv1.Ready statesNotReady := []string{} for { @@ -170,6 +185,12 @@ func (r *ClusterPolicyReconciler) Reconcile(ctx context.Context, req ctrl.Reques } } + if nvidiaDriverEnabled { + if err := clusterPolicyCtrl.labelNodesWithOrphanedDriverPods(ctx); err != nil { + r.Log.Error(err, "failed to label nodes with orphaned NVIDIA driver pods") + } + } + // if any state is not ready, requeue for reconcile after 5 seconds if overallStatus != gpuv1.Ready { clusterPolicyCtrl.operatorMetrics.reconciliationStatus.Set(reconciliationStatusNotReady) @@ -371,6 +392,29 @@ func (r *ClusterPolicyReconciler) SetupWithManager(ctx context.Context, mgr ctrl return err } + err = c.Watch( + source.Kind(mgr.GetCache(), + &nvidiav1alpha1.NVIDIADriver{}, + handler.TypedEnqueueRequestsFromMapFunc(func(ctx context.Context, _ *nvidiav1alpha1.NVIDIADriver) []reconcile.Request { + list := &gpuv1.ClusterPolicyList{} + if err := mgr.GetClient().List(ctx, list); err != nil { + return []reconcile.Request{} + } + requests := make([]reconcile.Request, 0, len(list.Items)) + for _, item := range list.Items { + requests = append(requests, reconcile.Request{ + NamespacedName: types.NamespacedName{Name: item.GetName()}, + }) + } + return requests + }), + nvidiaDriverGenerationOrDefaultLabelChangedPredicate(), + ), + ) + if err != nil { + return err + } + // TODO(user): Modify this to be the types you create that are owned by the primary resource // Watch for changes to secondary resource Daemonsets and requeue the owner ClusterPolicy err = c.Watch( diff --git a/controllers/nvidiadriver_controller.go b/controllers/nvidiadriver_controller.go index 637b9cec9..af8042448 100644 --- a/controllers/nvidiadriver_controller.go +++ b/controllers/nvidiadriver_controller.go @@ -110,6 +110,9 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request } if len(clusterPolicyList.Items) == 0 { + if handled, err := r.handleDefaultNVIDIADriverDeletion(ctx, instance); handled || err != nil { + return reconcile.Result{}, err + } err := fmt.Errorf("no ClusterPolicy object found in the cluster") logger.Error(err, "failed to get ClusterPolicy object") instance.Status.State = nvidiav1alpha1.NotReady @@ -120,8 +123,12 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request } clusterPolicyInstance := clusterPolicyList.Items[0] + if handled, err := r.handleDefaultNVIDIADriverDeletion(ctx, instance); handled || err != nil { + return reconcile.Result{}, err + } + // Ensure that ClusterPolicy is configured to use NVIDIADriver CRD - if !clusterPolicyInstance.Spec.Driver.UseNvidiaDriverCRDType() { + if !nvidiaDriverCRDEnabled(&clusterPolicyInstance) { msg := "useNvidiaDriverCRD is not enabled in ClusterPolicy" logger.V(consts.LogLevelWarning).Info("NVIDIADriver reconciliation skipped", "reason", msg) instance.Status.State = nvidiav1alpha1.Disabled @@ -152,6 +159,15 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request return reconcile.Result{}, nil } + if err := assignNVIDIADriverOwners(ctx, r.Client); err != nil { + logger.Error(err, "failed to assign NVIDIADriver owners to nodes") + instance.Status.State = nvidiav1alpha1.NotReady + if condErr := r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, err.Error()); condErr != nil { + logger.Error(condErr, "failed to set condition") + } + return reconcile.Result{}, err + } + if instance.Spec.UsePrecompiledDrivers() && (instance.Spec.IsGDSEnabled() || instance.Spec.IsGDRCopyEnabled()) { err := errors.New("GPUDirect Storage driver (nvidia-fs) and/or GDRCopy driver is not supported along with pre-compiled NVIDIA drivers") logger.Error(err, "unsupported driver combination detected") @@ -221,6 +237,14 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request return reconcile.Result{}, nil } +func (r *NVIDIADriverReconciler) handleDefaultNVIDIADriverDeletion(ctx context.Context, driver *nvidiav1alpha1.NVIDIADriver) (bool, error) { + if !isDefaultNVIDIADriver(driver) || driver.GetDeletionTimestamp() == nil { + return false, nil + } + log.FromContext(ctx).Info("Default NVIDIADriver delete requested; skipping reconciliation") + return true, nil +} + func (r *NVIDIADriverReconciler) updateCrStatus( ctx context.Context, cr *nvidiav1alpha1.NVIDIADriver, status state.Results) error { reqLogger := log.FromContext(ctx) @@ -316,7 +340,7 @@ func (r *NVIDIADriverReconciler) SetupWithManager(ctx context.Context, mgr ctrl. mgr.GetCache(), &nvidiav1alpha1.NVIDIADriver{}, handler.TypedEnqueueRequestsFromMapFunc(nvidiaDriverMapFn), - predicate.TypedGenerationChangedPredicate[*nvidiav1alpha1.NVIDIADriver]{}, + nvidiaDriverGenerationOrDefaultLabelChangedPredicate(), ), ) if err != nil { @@ -413,3 +437,30 @@ func (r *NVIDIADriverReconciler) SetupWithManager(ctx context.Context, mgr ctrl. return nil } + +// nvidiaDriverGenerationOrDefaultLabelChangedPredicate reconciles NVIDIADrivers on spec changes and +// on changes to the default-driver label, which is metadata but affects ownership semantics. +func nvidiaDriverGenerationOrDefaultLabelChangedPredicate() predicate.TypedPredicate[*nvidiav1alpha1.NVIDIADriver] { + return predicate.TypedFuncs[*nvidiav1alpha1.NVIDIADriver]{ + CreateFunc: func(event.TypedCreateEvent[*nvidiav1alpha1.NVIDIADriver]) bool { + return true + }, + DeleteFunc: func(event.TypedDeleteEvent[*nvidiav1alpha1.NVIDIADriver]) bool { + return true + }, + UpdateFunc: func(e event.TypedUpdateEvent[*nvidiav1alpha1.NVIDIADriver]) bool { + if e.ObjectOld == nil || e.ObjectNew == nil { + return true + } + if e.ObjectOld.GetGeneration() != e.ObjectNew.GetGeneration() { + return true + } + oldLabels := e.ObjectOld.GetLabels() + newLabels := e.ObjectNew.GetLabels() + return oldLabels[consts.DefaultNVIDIADriverLabel] != newLabels[consts.DefaultNVIDIADriverLabel] + }, + GenericFunc: func(event.TypedGenericEvent[*nvidiav1alpha1.NVIDIADriver]) bool { + return false + }, + } +} diff --git a/controllers/nvidiadriver_controller_test.go b/controllers/nvidiadriver_controller_test.go index 7630e7e4b..489f50ce4 100644 --- a/controllers/nvidiadriver_controller_test.go +++ b/controllers/nvidiadriver_controller_test.go @@ -28,15 +28,19 @@ import ( "github.com/stretchr/testify/require" "go.uber.org/zap" "go.uber.org/zap/zapcore" + corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" "k8s.io/utils/ptr" ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" + "sigs.k8s.io/controller-runtime/pkg/event" gpuv1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1" nvidiav1alpha1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1alpha1" + "github.com/NVIDIA/gpu-operator/internal/consts" "github.com/NVIDIA/gpu-operator/internal/validator" ) @@ -70,6 +74,15 @@ func (f *FakeNodeSelectorValidator) Validate(ctx context.Context, cr *nvidiav1al return f.CustomError } +type patchFailingClient struct { + client.Client + patchErr error +} + +func (c *patchFailingClient) Patch(ctx context.Context, obj client.Object, patch client.Patch, opts ...client.PatchOption) error { + return c.patchErr +} + // newTestLogger creates a zap.Logger that writes to an in-memory buffer for testing func newTestLogger() (logr.Logger, *bytes.Buffer) { buf := &bytes.Buffer{} @@ -97,6 +110,7 @@ func TestReconcile(t *testing.T) { scheme := runtime.NewScheme() require.NoError(t, nvidiav1alpha1.AddToScheme(scheme)) require.NoError(t, gpuv1.AddToScheme(scheme)) + require.NoError(t, corev1.AddToScheme(scheme)) tests := []struct { name string @@ -247,6 +261,59 @@ func TestReconcileConflictSetsNotReadyState(t *testing.T) { require.Equal(t, nvidiav1alpha1.NotReady, updater.LastErrorState) } +func TestReconcileReturnsErrorWhenOwnerAssignmentFails(t *testing.T) { + scheme := runtime.NewScheme() + require.NoError(t, nvidiav1alpha1.AddToScheme(scheme)) + require.NoError(t, gpuv1.AddToScheme(scheme)) + require.NoError(t, corev1.AddToScheme(scheme)) + + driver := &nvidiav1alpha1.NVIDIADriver{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-driver", + Namespace: "default", + }, + Spec: nvidiav1alpha1.NVIDIADriverSpec{ + NodeSelector: map[string]string{"nodepool": "a"}, + }, + } + cp := &gpuv1.ClusterPolicy{ + ObjectMeta: metav1.ObjectMeta{Name: "default"}, + Spec: gpuv1.ClusterPolicySpec{ + Driver: gpuv1.DriverSpec{ + UseNvidiaDriverCRD: ptr.To(true), + }, + }, + } + node := &corev1.Node{ObjectMeta: metav1.ObjectMeta{ + Name: "gpu-node", + Labels: map[string]string{ + "nodepool": "a", + "nvidia.com/gpu.present": "true", + }, + }} + patchErr := errors.New("patch failed") + k8sClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(cp, driver, node).Build() + updater := &FakeConditionUpdater{} + + reconciler := &NVIDIADriverReconciler{ + Client: &patchFailingClient{Client: k8sClient, patchErr: patchErr}, + Scheme: scheme, + conditionUpdater: updater, + nodeSelectorValidator: &FakeNodeSelectorValidator{}, + } + + _, err := reconciler.Reconcile(context.Background(), ctrl.Request{ + NamespacedName: types.NamespacedName{ + Name: driver.Name, + Namespace: driver.Namespace, + }, + }) + + require.Error(t, err) + require.ErrorContains(t, err, patchErr.Error()) + require.Equal(t, nvidiav1alpha1.NotReady, updater.LastErrorState) +} + func TestEnqueueAllNVIDIADrivers(t *testing.T) { scheme := runtime.NewScheme() require.NoError(t, nvidiav1alpha1.AddToScheme(scheme)) @@ -267,3 +334,45 @@ func TestEnqueueAllNVIDIADrivers(t *testing.T) { sort.Strings(got) require.Equal(t, []string{"default/driver-a", "default/driver-b"}, got) } + +func TestNVIDIADriverGenerationOrDefaultLabelChangedPredicate(t *testing.T) { + p := nvidiaDriverGenerationOrDefaultLabelChangedPredicate() + + require.True(t, p.Update(event.TypedUpdateEvent[*nvidiav1alpha1.NVIDIADriver]{ + ObjectOld: &nvidiav1alpha1.NVIDIADriver{ObjectMeta: metav1.ObjectMeta{Name: "driver", Generation: 1}}, + ObjectNew: &nvidiav1alpha1.NVIDIADriver{ObjectMeta: metav1.ObjectMeta{Name: "driver", Generation: 2}}, + })) + + require.True(t, p.Update(event.TypedUpdateEvent[*nvidiav1alpha1.NVIDIADriver]{ + ObjectOld: &nvidiav1alpha1.NVIDIADriver{ + ObjectMeta: metav1.ObjectMeta{ + Name: "driver", + Generation: 1, + }, + }, + ObjectNew: &nvidiav1alpha1.NVIDIADriver{ + ObjectMeta: metav1.ObjectMeta{ + Name: "driver", + Generation: 1, + Labels: map[string]string{consts.DefaultNVIDIADriverLabel: "true"}, + }, + }, + })) + + require.False(t, p.Update(event.TypedUpdateEvent[*nvidiav1alpha1.NVIDIADriver]{ + ObjectOld: &nvidiav1alpha1.NVIDIADriver{ + ObjectMeta: metav1.ObjectMeta{ + Name: "driver", + Generation: 1, + Labels: map[string]string{"example.com/label": "old"}, + }, + }, + ObjectNew: &nvidiav1alpha1.NVIDIADriver{ + ObjectMeta: metav1.ObjectMeta{ + Name: "driver", + Generation: 1, + Labels: map[string]string{"example.com/label": "new"}, + }, + }, + })) +} diff --git a/controllers/nvidiadriver_default.go b/controllers/nvidiadriver_default.go new file mode 100644 index 000000000..d6f7098ee --- /dev/null +++ b/controllers/nvidiadriver_default.go @@ -0,0 +1,136 @@ +/* +Copyright NVIDIA CORPORATION & AFFILIATES + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package controllers + +import ( + "context" + "fmt" + "strings" + + corev1 "k8s.io/api/core/v1" + "sigs.k8s.io/controller-runtime/pkg/client" + + gpuv1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1" + nvidiav1alpha1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1alpha1" + "github.com/NVIDIA/gpu-operator/internal/consts" +) + +// isDefaultNVIDIADriver returns true when the NVIDIADriver is marked as the fallback driver. +func isDefaultNVIDIADriver(driver *nvidiav1alpha1.NVIDIADriver) bool { + return driver != nil && driver.Labels[consts.DefaultNVIDIADriverLabel] == "true" +} + +// nvidiaDriverCRDEnabled returns true when ClusterPolicy driver management is enabled through NVIDIADriver CRs. +func nvidiaDriverCRDEnabled(clusterPolicy *gpuv1.ClusterPolicy) bool { + return clusterPolicy != nil && + clusterPolicy.Spec.Driver.IsEnabled() && + clusterPolicy.Spec.Driver.UseNvidiaDriverCRDType() +} + +// validateNVIDIADriverNodeSelector rejects selectors that use operator-managed routing labels. +func validateNVIDIADriverNodeSelector(driver *nvidiav1alpha1.NVIDIADriver) error { + if driver == nil || driver.Spec.NodeSelector == nil { + return nil + } + if _, ok := driver.Spec.NodeSelector[consts.NVIDIADriverOwnerLabel]; ok { + return fmt.Errorf("NVIDIADriver %q nodeSelector cannot use reserved label %q", driver.Name, consts.NVIDIADriverOwnerLabel) + } + return nil +} + +// assignNVIDIADriverOwners labels GPU nodes with the NVIDIADriver that should manage their driver pods. +// Non-default NVIDIADrivers take precedence over the default fallback, and conflicts fail closed before +// node owner labels are changed. +func assignNVIDIADriverOwners(ctx context.Context, c client.Client) error { + drivers := &nvidiav1alpha1.NVIDIADriverList{} + if err := c.List(ctx, drivers); err != nil { + return fmt.Errorf("failed to list NVIDIADriver CRs: %w", err) + } + + var defaultDriver *nvidiav1alpha1.NVIDIADriver + defaultDriverNames := []string{} + specificDrivers := make([]nvidiav1alpha1.NVIDIADriver, 0, len(drivers.Items)) + for i := range drivers.Items { + if err := validateNVIDIADriverNodeSelector(&drivers.Items[i]); err != nil { + return err + } + if isDefaultNVIDIADriver(&drivers.Items[i]) { + defaultDriverNames = append(defaultDriverNames, drivers.Items[i].Name) + defaultDriver = &drivers.Items[i] + continue + } + specificDrivers = append(specificDrivers, drivers.Items[i]) + } + if len(defaultDriverNames) > 1 { + return fmt.Errorf("multiple default NVIDIADrivers found: %s", strings.Join(defaultDriverNames, ", ")) + } + nodes := &corev1.NodeList{} + if err := c.List(ctx, nodes, client.MatchingLabels{consts.GPUPresentLabel: "true"}); err != nil { + return fmt.Errorf("failed to list GPU nodes: %w", err) + } + + for i := range nodes.Items { + matchingDrivers := []string{} + for _, driver := range specificDrivers { + if nodeMatchesSelector(&nodes.Items[i], driver.GetNodeSelector()) { + matchingDrivers = append(matchingDrivers, driver.Name) + } + } + if len(matchingDrivers) > 1 { + return fmt.Errorf("conflicting NVIDIADriver NodeSelectors found for node %s: %s", nodes.Items[i].Name, strings.Join(matchingDrivers, ", ")) + } + } + + for i := range nodes.Items { + node := &nodes.Items[i] + originalNode := node.DeepCopy() + owner := "" + for _, driver := range specificDrivers { + if nodeMatchesSelector(node, driver.GetNodeSelector()) { + owner = driver.Name + } + } + if owner == "" && defaultDriver != nil && nodeMatchesSelector(node, defaultDriver.GetNodeSelector()) { + owner = defaultDriver.Name + } + if owner == "" { + if node.Labels == nil { + continue + } + if _, ok := node.Labels[consts.NVIDIADriverOwnerLabel]; !ok { + continue + } + delete(node.Labels, consts.NVIDIADriverOwnerLabel) + if err := c.Patch(ctx, node, client.MergeFrom(originalNode)); err != nil { + return fmt.Errorf("failed to remove NVIDIADriver owner label for node %s: %w", node.Name, err) + } + continue + } + if node.Labels != nil && node.Labels[consts.NVIDIADriverOwnerLabel] == owner { + continue + } + if node.Labels == nil { + node.Labels = map[string]string{} + } + node.Labels[consts.NVIDIADriverOwnerLabel] = owner + if err := c.Patch(ctx, node, client.MergeFrom(originalNode)); err != nil { + return fmt.Errorf("failed to update NVIDIADriver owner label for node %s: %w", node.Name, err) + } + } + + return nil +} diff --git a/controllers/nvidiadriver_default_test.go b/controllers/nvidiadriver_default_test.go new file mode 100644 index 000000000..173c6a9e8 --- /dev/null +++ b/controllers/nvidiadriver_default_test.go @@ -0,0 +1,373 @@ +/* +Copyright NVIDIA CORPORATION & AFFILIATES + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package controllers + +import ( + "context" + "testing" + + "github.com/stretchr/testify/require" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + + gpuv1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1" + nvidiav1alpha1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1alpha1" + "github.com/NVIDIA/gpu-operator/internal/consts" +) + +func TestNvidiaDriverCRDEnabled(t *testing.T) { + driverEnabled := true + driverDisabled := false + crdEnabled := true + crdDisabled := false + + tests := []struct { + name string + driverEnabled *bool + crdEnabled *bool + expected bool + }{ + { + name: "driver enabled by default and CRD enabled", + crdEnabled: &crdEnabled, + expected: true, + }, + { + name: "CRD disabled", + crdEnabled: &crdDisabled, + expected: false, + }, + { + name: "driver disabled", + driverEnabled: &driverDisabled, + crdEnabled: &crdEnabled, + expected: false, + }, + { + name: "driver explicitly enabled and CRD enabled", + driverEnabled: &driverEnabled, + crdEnabled: &crdEnabled, + expected: true, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + clusterPolicy := &gpuv1.ClusterPolicy{ + Spec: gpuv1.ClusterPolicySpec{ + Driver: gpuv1.DriverSpec{ + Enabled: tc.driverEnabled, + UseNvidiaDriverCRD: tc.crdEnabled, + }, + }, + } + + require.Equal(t, tc.expected, nvidiaDriverCRDEnabled(clusterPolicy)) + }) + } +} + +func TestAssignNVIDIADriverOwnersGivesSpecificDriversPrecedence(t *testing.T) { + scheme := runtime.NewScheme() + require.NoError(t, nvidiav1alpha1.AddToScheme(scheme)) + require.NoError(t, corev1.AddToScheme(scheme)) + + defaultDriver := &nvidiav1alpha1.NVIDIADriver{ + ObjectMeta: metav1.ObjectMeta{ + Name: consts.DefaultNVIDIADriverName, + Labels: map[string]string{consts.DefaultNVIDIADriverLabel: "true"}, + }, + } + specificDriver := &nvidiav1alpha1.NVIDIADriver{ + ObjectMeta: metav1.ObjectMeta{Name: "h100-driver"}, + Spec: nvidiav1alpha1.NVIDIADriverSpec{ + NodeSelector: map[string]string{"nodepool": "h100"}, + }, + } + defaultNode := &corev1.Node{ObjectMeta: metav1.ObjectMeta{ + Name: "default-node", + Labels: map[string]string{consts.GPUPresentLabel: "true"}, + }} + specificNode := &corev1.Node{ObjectMeta: metav1.ObjectMeta{ + Name: "specific-node", + Labels: map[string]string{consts.GPUPresentLabel: "true", "nodepool": "h100"}, + }} + + k8sClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(defaultDriver, specificDriver, defaultNode, specificNode).Build() + + require.NoError(t, assignNVIDIADriverOwners(context.Background(), k8sClient)) + + require.NoError(t, k8sClient.Get(context.Background(), client.ObjectKey{Name: "default-node"}, defaultNode)) + require.NoError(t, k8sClient.Get(context.Background(), client.ObjectKey{Name: "specific-node"}, specificNode)) + require.Equal(t, consts.DefaultNVIDIADriverName, defaultNode.Labels[consts.NVIDIADriverOwnerLabel]) + require.Equal(t, "h100-driver", specificNode.Labels[consts.NVIDIADriverOwnerLabel]) +} + +func TestAssignNVIDIADriverOwnersAllowsMissingDefaultDriver(t *testing.T) { + scheme := runtime.NewScheme() + require.NoError(t, nvidiav1alpha1.AddToScheme(scheme)) + require.NoError(t, corev1.AddToScheme(scheme)) + + specificDriver := &nvidiav1alpha1.NVIDIADriver{ + ObjectMeta: metav1.ObjectMeta{Name: "h100-driver"}, + Spec: nvidiav1alpha1.NVIDIADriverSpec{ + NodeSelector: map[string]string{"nodepool": "h100"}, + }, + } + unmatchedNode := &corev1.Node{ObjectMeta: metav1.ObjectMeta{ + Name: "unmatched-node", + Labels: map[string]string{consts.GPUPresentLabel: "true", consts.NVIDIADriverOwnerLabel: consts.DefaultNVIDIADriverName}, + }} + specificNode := &corev1.Node{ObjectMeta: metav1.ObjectMeta{ + Name: "specific-node", + Labels: map[string]string{consts.GPUPresentLabel: "true", "nodepool": "h100"}, + }} + + k8sClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(specificDriver, unmatchedNode, specificNode).Build() + + require.NoError(t, assignNVIDIADriverOwners(context.Background(), k8sClient)) + + require.NoError(t, k8sClient.Get(context.Background(), client.ObjectKey{Name: "unmatched-node"}, unmatchedNode)) + require.NoError(t, k8sClient.Get(context.Background(), client.ObjectKey{Name: "specific-node"}, specificNode)) + require.NotContains(t, unmatchedNode.Labels, consts.NVIDIADriverOwnerLabel) + require.Equal(t, "h100-driver", specificNode.Labels[consts.NVIDIADriverOwnerLabel]) +} + +func TestAssignNVIDIADriverOwnersUsesLabeledDefaultDriverWithArbitraryName(t *testing.T) { + scheme := runtime.NewScheme() + require.NoError(t, nvidiav1alpha1.AddToScheme(scheme)) + require.NoError(t, corev1.AddToScheme(scheme)) + + defaultDriver := &nvidiav1alpha1.NVIDIADriver{ + ObjectMeta: metav1.ObjectMeta{ + Name: "fallback-driver", + Labels: map[string]string{consts.DefaultNVIDIADriverLabel: "true"}, + }, + } + node := &corev1.Node{ObjectMeta: metav1.ObjectMeta{ + Name: "gpu-node", + Labels: map[string]string{consts.GPUPresentLabel: "true"}, + }} + + k8sClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(defaultDriver, node).Build() + + require.NoError(t, assignNVIDIADriverOwners(context.Background(), k8sClient)) + + require.NoError(t, k8sClient.Get(context.Background(), client.ObjectKey{Name: "gpu-node"}, node)) + require.Equal(t, "fallback-driver", node.Labels[consts.NVIDIADriverOwnerLabel]) +} + +func TestAssignNVIDIADriverOwnersErrorsOnMultipleDefaultDrivers(t *testing.T) { + scheme := runtime.NewScheme() + require.NoError(t, nvidiav1alpha1.AddToScheme(scheme)) + require.NoError(t, corev1.AddToScheme(scheme)) + + defaultDriverA := &nvidiav1alpha1.NVIDIADriver{ + ObjectMeta: metav1.ObjectMeta{ + Name: "fallback-a", + Labels: map[string]string{consts.DefaultNVIDIADriverLabel: "true"}, + }, + } + defaultDriverB := &nvidiav1alpha1.NVIDIADriver{ + ObjectMeta: metav1.ObjectMeta{ + Name: "fallback-b", + Labels: map[string]string{consts.DefaultNVIDIADriverLabel: "true"}, + }, + } + node := &corev1.Node{ObjectMeta: metav1.ObjectMeta{ + Name: "gpu-node", + Labels: map[string]string{consts.GPUPresentLabel: "true"}, + }} + + k8sClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(defaultDriverA, defaultDriverB, node).Build() + + err := assignNVIDIADriverOwners(context.Background(), k8sClient) + require.Error(t, err) + require.Contains(t, err.Error(), "multiple default NVIDIADrivers found") + + require.NoError(t, k8sClient.Get(context.Background(), client.ObjectKey{Name: "gpu-node"}, node)) + require.NotContains(t, node.Labels, consts.NVIDIADriverOwnerLabel) +} + +func TestAssignNVIDIADriverOwnersRejectsReservedOwnerLabelSelector(t *testing.T) { + scheme := runtime.NewScheme() + require.NoError(t, nvidiav1alpha1.AddToScheme(scheme)) + require.NoError(t, corev1.AddToScheme(scheme)) + + driver := &nvidiav1alpha1.NVIDIADriver{ + ObjectMeta: metav1.ObjectMeta{Name: "bad-driver"}, + Spec: nvidiav1alpha1.NVIDIADriverSpec{ + NodeSelector: map[string]string{consts.NVIDIADriverOwnerLabel: "other-driver"}, + }, + } + node := &corev1.Node{ObjectMeta: metav1.ObjectMeta{ + Name: "gpu-node", + Labels: map[string]string{ + consts.GPUPresentLabel: "true", + consts.NVIDIADriverOwnerLabel: "existing-driver", + }, + }} + + k8sClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(driver, node).Build() + + err := assignNVIDIADriverOwners(context.Background(), k8sClient) + require.Error(t, err) + require.Contains(t, err.Error(), "reserved label") + require.Contains(t, err.Error(), consts.NVIDIADriverOwnerLabel) + + require.NoError(t, k8sClient.Get(context.Background(), client.ObjectKey{Name: "gpu-node"}, node)) + require.Equal(t, "existing-driver", node.Labels[consts.NVIDIADriverOwnerLabel]) +} + +func TestAssignNVIDIADriverOwnersHonorsDefaultDriverNodeSelector(t *testing.T) { + scheme := runtime.NewScheme() + require.NoError(t, nvidiav1alpha1.AddToScheme(scheme)) + require.NoError(t, corev1.AddToScheme(scheme)) + + defaultDriver := &nvidiav1alpha1.NVIDIADriver{ + ObjectMeta: metav1.ObjectMeta{ + Name: consts.DefaultNVIDIADriverName, + Labels: map[string]string{consts.DefaultNVIDIADriverLabel: "true"}, + }, + Spec: nvidiav1alpha1.NVIDIADriverSpec{ + NodeSelector: map[string]string{"driver-default": "true"}, + }, + } + specificDriver := &nvidiav1alpha1.NVIDIADriver{ + ObjectMeta: metav1.ObjectMeta{Name: "h100-driver"}, + Spec: nvidiav1alpha1.NVIDIADriverSpec{ + NodeSelector: map[string]string{"nodepool": "h100"}, + }, + } + defaultNode := &corev1.Node{ObjectMeta: metav1.ObjectMeta{ + Name: "default-node", + Labels: map[string]string{consts.GPUPresentLabel: "true", "driver-default": "true"}, + }} + unmatchedNode := &corev1.Node{ObjectMeta: metav1.ObjectMeta{ + Name: "unmatched-node", + Labels: map[string]string{consts.GPUPresentLabel: "true", consts.NVIDIADriverOwnerLabel: consts.DefaultNVIDIADriverName}, + }} + specificNode := &corev1.Node{ObjectMeta: metav1.ObjectMeta{ + Name: "specific-node", + Labels: map[string]string{consts.GPUPresentLabel: "true", "driver-default": "true", "nodepool": "h100"}, + }} + + k8sClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(defaultDriver, specificDriver, defaultNode, unmatchedNode, specificNode).Build() + + require.NoError(t, assignNVIDIADriverOwners(context.Background(), k8sClient)) + + require.NoError(t, k8sClient.Get(context.Background(), client.ObjectKey{Name: "default-node"}, defaultNode)) + require.NoError(t, k8sClient.Get(context.Background(), client.ObjectKey{Name: "unmatched-node"}, unmatchedNode)) + require.NoError(t, k8sClient.Get(context.Background(), client.ObjectKey{Name: "specific-node"}, specificNode)) + require.Equal(t, consts.DefaultNVIDIADriverName, defaultNode.Labels[consts.NVIDIADriverOwnerLabel]) + require.NotContains(t, unmatchedNode.Labels, consts.NVIDIADriverOwnerLabel) + require.Equal(t, "h100-driver", specificNode.Labels[consts.NVIDIADriverOwnerLabel]) +} + +func TestAssignNVIDIADriverOwnersDoesNotFallbackToDefaultOnUserDriverConflict(t *testing.T) { + scheme := runtime.NewScheme() + require.NoError(t, nvidiav1alpha1.AddToScheme(scheme)) + require.NoError(t, corev1.AddToScheme(scheme)) + + defaultDriver := &nvidiav1alpha1.NVIDIADriver{ + ObjectMeta: metav1.ObjectMeta{ + Name: consts.DefaultNVIDIADriverName, + Labels: map[string]string{consts.DefaultNVIDIADriverLabel: "true"}, + }, + } + driverA := &nvidiav1alpha1.NVIDIADriver{ + ObjectMeta: metav1.ObjectMeta{Name: "driver-a"}, + Spec: nvidiav1alpha1.NVIDIADriverSpec{ + NodeSelector: map[string]string{"nodepool": "shared"}, + }, + } + driverB := &nvidiav1alpha1.NVIDIADriver{ + ObjectMeta: metav1.ObjectMeta{Name: "driver-b"}, + Spec: nvidiav1alpha1.NVIDIADriverSpec{ + NodeSelector: map[string]string{"nodepool": "shared"}, + }, + } + conflictedNode := &corev1.Node{ObjectMeta: metav1.ObjectMeta{ + Name: "conflicted-node", + Labels: map[string]string{ + consts.GPUPresentLabel: "true", + consts.NVIDIADriverOwnerLabel: "driver-a", + "nodepool": "shared", + }, + }} + + k8sClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(defaultDriver, driverA, driverB, conflictedNode).Build() + + err := assignNVIDIADriverOwners(context.Background(), k8sClient) + require.Error(t, err) + require.Contains(t, err.Error(), "conflicting NVIDIADriver NodeSelectors found") + + require.NoError(t, k8sClient.Get(context.Background(), client.ObjectKey{Name: "conflicted-node"}, conflictedNode)) + require.Equal(t, "driver-a", conflictedNode.Labels[consts.NVIDIADriverOwnerLabel]) +} + +func TestAssignNVIDIADriverOwnersDoesNotChangeOwnersWhenAnyUserDriverConflicts(t *testing.T) { + scheme := runtime.NewScheme() + require.NoError(t, nvidiav1alpha1.AddToScheme(scheme)) + require.NoError(t, corev1.AddToScheme(scheme)) + + defaultDriver := &nvidiav1alpha1.NVIDIADriver{ + ObjectMeta: metav1.ObjectMeta{ + Name: consts.DefaultNVIDIADriverName, + Labels: map[string]string{consts.DefaultNVIDIADriverLabel: "true"}, + }, + } + goldDriver := &nvidiav1alpha1.NVIDIADriver{ + ObjectMeta: metav1.ObjectMeta{Name: "demo-gold"}, + Spec: nvidiav1alpha1.NVIDIADriverSpec{ + NodeSelector: map[string]string{"region": "us-east-1"}, + }, + } + silverDriver := &nvidiav1alpha1.NVIDIADriver{ + ObjectMeta: metav1.ObjectMeta{Name: "demo-silver"}, + } + goldNode := &corev1.Node{ObjectMeta: metav1.ObjectMeta{ + Name: "gold-node", + Labels: map[string]string{ + consts.GPUPresentLabel: "true", + consts.NVIDIADriverOwnerLabel: "demo-gold", + "region": "us-east-1", + }, + }} + defaultNode := &corev1.Node{ObjectMeta: metav1.ObjectMeta{ + Name: "default-node", + Labels: map[string]string{ + consts.GPUPresentLabel: "true", + consts.NVIDIADriverOwnerLabel: consts.DefaultNVIDIADriverName, + "region": "us-east-2", + }, + }} + + k8sClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(defaultDriver, goldDriver, silverDriver, goldNode, defaultNode).Build() + + err := assignNVIDIADriverOwners(context.Background(), k8sClient) + require.Error(t, err) + require.Contains(t, err.Error(), "conflicting NVIDIADriver NodeSelectors found") + + require.NoError(t, k8sClient.Get(context.Background(), client.ObjectKey{Name: "gold-node"}, goldNode)) + require.NoError(t, k8sClient.Get(context.Background(), client.ObjectKey{Name: "default-node"}, defaultNode)) + require.Equal(t, "demo-gold", goldNode.Labels[consts.NVIDIADriverOwnerLabel]) + require.Equal(t, consts.DefaultNVIDIADriverName, defaultNode.Labels[consts.NVIDIADriverOwnerLabel]) +} diff --git a/controllers/object_controls.go b/controllers/object_controls.go index 0e9767a20..137a48e8a 100644 --- a/controllers/object_controls.go +++ b/controllers/object_controls.go @@ -27,6 +27,7 @@ import ( "strconv" "strings" + "github.com/NVIDIA/k8s-operator-libs/pkg/upgrade" apiconfigv1 "github.com/openshift/api/config/v1" apiimagev1 "github.com/openshift/api/image/v1" secv1 "github.com/openshift/api/security/v1" @@ -48,6 +49,7 @@ import ( "sigs.k8s.io/yaml" gpuv1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1" + nvidiav1alpha1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1alpha1" driverconfig "github.com/NVIDIA/gpu-operator/internal/config" "github.com/NVIDIA/gpu-operator/internal/consts" "github.com/NVIDIA/gpu-operator/internal/utils" @@ -4241,7 +4243,7 @@ func ocpHasDriverToolkitImageStream(n *ClusterPolicyController) (bool, error) { return true, nil } -func (n ClusterPolicyController) cleanupAllDriverDaemonSets(ctx context.Context) error { +func (n ClusterPolicyController) cleanupAllDriverDaemonSets(ctx context.Context, orphanPods bool) error { // Get all DaemonSets owned by ClusterPolicy // // (cdesiniotis) There is a limitation with the controller-runtime client where only a single field selector @@ -4257,8 +4259,16 @@ func (n ClusterPolicyController) cleanupAllDriverDaemonSets(ctx context.Context) ds := ds // filter out DaemonSets which are not the NVIDIA driver/vgpu-manager if strings.HasPrefix(ds.Name, commonDriverDaemonsetName) || strings.HasPrefix(ds.Name, commonVGPUManagerDaemonsetName) { - n.logger.Info("Deleting NVIDIA driver daemonset owned by ClusterPolicy", "Name", ds.Name) - err = n.client.Delete(ctx, &ds) + if orphanPods { + n.logger.Info("Deleting NVIDIA driver daemonset owned by ClusterPolicy with cascade=orphan", "Name", ds.Name) + propagationPolicy := metav1.DeletePropagationOrphan + err = n.client.Delete(ctx, &ds, &client.DeleteOptions{ + PropagationPolicy: &propagationPolicy, + }) + } else { + n.logger.Info("Deleting NVIDIA driver daemonset owned by ClusterPolicy", "Name", ds.Name) + err = n.client.Delete(ctx, &ds) + } if err != nil { return fmt.Errorf("error deleting NVIDIA driver daemonset: %w", err) } @@ -4268,6 +4278,76 @@ func (n ClusterPolicyController) cleanupAllDriverDaemonSets(ctx context.Context) return nil } +// labelNodesWithOrphanedDriverPods marks nodes that still have orphaned ClusterPolicy driver pods +// so the driver upgrade controller can replace them with NVIDIADriver-owned pods. +func (n ClusterPolicyController) labelNodesWithOrphanedDriverPods(ctx context.Context) error { + nvidiaDrivers := &nvidiav1alpha1.NVIDIADriverList{} + if err := n.client.List(ctx, nvidiaDrivers); err != nil { + return fmt.Errorf("failed to list NVIDIADriver CRs: %w", err) + } + if len(nvidiaDrivers.Items) == 0 { + return nil + } + + pods := &corev1.PodList{} + if err := n.client.List(ctx, pods, + client.InNamespace(n.operatorNamespace), + client.MatchingLabels{AppComponentLabelKey: AppComponentLabelValue}, + ); err != nil { + return fmt.Errorf("failed to list NVIDIA driver pods: %w", err) + } + + for i := range pods.Items { + pod := &pods.Items[i] + if len(pod.OwnerReferences) > 0 || pod.Status.Phase != corev1.PodRunning || pod.Spec.NodeName == "" { + continue + } + + node := &corev1.Node{} + if err := n.client.Get(ctx, types.NamespacedName{Name: pod.Spec.NodeName}, node); err != nil { + n.logger.Error(err, "failed to get node for orphaned NVIDIA driver pod", "pod", pod.Name, "node", pod.Spec.NodeName) + continue + } + if !nodeMatchesAnyNVIDIADriver(node, nvidiaDrivers.Items) { + continue + } + + upgradeStateLabel := upgrade.GetUpgradeStateLabelKey() + if node.Labels != nil && node.Labels[upgradeStateLabel] == upgrade.UpgradeStateUpgradeRequired { + continue + } + if node.Labels == nil { + node.Labels = map[string]string{} + } + node.Labels[upgradeStateLabel] = upgrade.UpgradeStateUpgradeRequired + if err := n.client.Update(ctx, node); err != nil { + n.logger.Error(err, "failed to label node with orphaned NVIDIA driver pod", "pod", pod.Name, "node", node.Name) + } + } + + return nil +} + +// nodeMatchesAnyNVIDIADriver returns true when the node matches at least one NVIDIADriver selector. +func nodeMatchesAnyNVIDIADriver(node *corev1.Node, nvidiaDrivers []nvidiav1alpha1.NVIDIADriver) bool { + for _, nvidiaDriver := range nvidiaDrivers { + if nodeMatchesSelector(node, nvidiaDriver.GetNodeSelector()) { + return true + } + } + return false +} + +// nodeMatchesSelector returns true when all selector labels are present on the node. +func nodeMatchesSelector(node *corev1.Node, selector map[string]string) bool { + for key, value := range selector { + if node.Labels[key] != value { + return false + } + } + return true +} + // cleanupStalePrecompiledDaemonsets deletes stale driver daemonsets which can happen // 1. If all nodes upgraded to the latest kernel // 2. no GPU nodes are present diff --git a/controllers/state_manager.go b/controllers/state_manager.go index 29eef5cf2..bba4f9498 100644 --- a/controllers/state_manager.go +++ b/controllers/state_manager.go @@ -1058,12 +1058,23 @@ func (n *ClusterPolicyController) step() (gpuv1.State, error) { // - In object_controls.go, check the OwnerRef for existing objects // before managing them. Clusterpolicy controller should not be creating / // updating / deleting objects owned by another controller. - if (n.stateNames[n.idx] == "state-driver" || n.stateNames[n.idx] == "state-vgpu-manager") && - n.singleton.Spec.Driver.UseNvidiaDriverCRDType() { + if n.stateNames[n.idx] == "state-driver" && nvidiaDriverCRDEnabled(n.singleton) { n.logger.Info("NVIDIADriver CRD is enabled, cleaning up all NVIDIA driver daemonsets owned by ClusterPolicy") n.idx++ - // Cleanup all driver daemonsets owned by ClusterPolicy. - err := n.cleanupAllDriverDaemonSets(n.ctx) + // Cleanup all driver daemonsets owned by ClusterPolicy while keeping the + // running driver pods available until NVIDIADriver rolls replacements. + err := n.cleanupAllDriverDaemonSets(n.ctx, true) + if err != nil { + return gpuv1.NotReady, fmt.Errorf("failed to cleanup all NVIDIA driver daemonsets owned by ClusterPolicy: %w", err) + } + return gpuv1.Disabled, nil + } + if n.stateNames[n.idx] == "state-vgpu-manager" && nvidiaDriverCRDEnabled(n.singleton) { + n.logger.Info("NVIDIADriver CRD is enabled, cleaning up all NVIDIA driver daemonsets owned by ClusterPolicy") + n.idx++ + // Cleanup all driver daemonsets owned by ClusterPolicy while keeping the + // running driver pods available until NVIDIADriver rolls replacements. + err := n.cleanupAllDriverDaemonSets(n.ctx, true) if err != nil { return gpuv1.NotReady, fmt.Errorf("failed to cleanup all NVIDIA driver daemonsets owned by ClusterPolicy: %w", err) } diff --git a/controllers/upgrade_controller.go b/controllers/upgrade_controller.go index e82e2de35..bd3955b5b 100644 --- a/controllers/upgrade_controller.go +++ b/controllers/upgrade_controller.go @@ -132,7 +132,7 @@ func (r *UpgradeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct driverLabelKey := DriverLabelKey driverLabelValue := DriverLabelValue - if clusterPolicy.Spec.Driver.UseNvidiaDriverCRDType() { + if nvidiaDriverCRDEnabled(clusterPolicy) { // app component label is added for all new driver daemonsets deployed by NVIDIADriver controller driverLabelKey = AppComponentLabelKey driverLabelValue = AppComponentLabelValue diff --git a/deployments/gpu-operator/templates/clusterpolicy.yaml b/deployments/gpu-operator/templates/clusterpolicy.yaml index 121a00f15..38189b8a3 100644 --- a/deployments/gpu-operator/templates/clusterpolicy.yaml +++ b/deployments/gpu-operator/templates/clusterpolicy.yaml @@ -167,6 +167,7 @@ spec: driver: enabled: {{ .Values.driver.enabled }} useNvidiaDriverCRD: {{ .Values.driver.nvidiaDriverCRD.enabled }} + {{- if not .Values.driver.nvidiaDriverCRD.enabled }} kernelModuleType: {{ .Values.driver.kernelModuleType }} usePrecompiled: {{ .Values.driver.usePrecompiled }} {{- if .Values.driver.repository }} @@ -239,6 +240,7 @@ spec: {{- if .Values.driver.args }} args: {{ toYaml .Values.driver.args | nindent 6 }} {{- end }} + {{- end }} {{- if .Values.driver.upgradePolicy }} upgradePolicy: autoUpgrade: {{ .Values.driver.upgradePolicy.autoUpgrade | default false }} diff --git a/deployments/gpu-operator/templates/nvidiadriver.yaml b/deployments/gpu-operator/templates/nvidiadriver.yaml index 1a059a490..b9b350066 100644 --- a/deployments/gpu-operator/templates/nvidiadriver.yaml +++ b/deployments/gpu-operator/templates/nvidiadriver.yaml @@ -4,6 +4,8 @@ apiVersion: nvidia.com/v1alpha1 kind: NVIDIADriver metadata: name: default + labels: + nvidia.com/gpu-operator.default-driver: "true" spec: repository: {{ .Values.driver.repository }} image: {{ .Values.driver.image }} diff --git a/internal/consts/consts.go b/internal/consts/consts.go index c2850f419..3adbffbe2 100644 --- a/internal/consts/consts.go +++ b/internal/consts/consts.go @@ -65,6 +65,13 @@ const ( // NVIDIADriverControllerIndexKey provides quick lookups for DaemonSets owned by an NVIDIADriver instance NVIDIADriverControllerIndexKey = "metadata.nvidiadriver.controller" + // DefaultNVIDIADriverName is the Helm-managed fallback NVIDIADriver. + DefaultNVIDIADriverName = "default" + // DefaultNVIDIADriverLabel marks the fallback NVIDIADriver. + DefaultNVIDIADriverLabel = "nvidia.com/gpu-operator.default-driver" + // NVIDIADriverOwnerLabel is an operator-managed node label used to route each GPU node to one NVIDIADriver. + NVIDIADriverOwnerLabel = "nvidia.com/gpu.driver.owner" + // MinimumGDSVersionForOpenRM indicates the minimum GDS version that is supported only with OpenRM driver MinimumGDSVersionForOpenRM = "v2.17.5" ) diff --git a/internal/state/driver.go b/internal/state/driver.go index 355910a8e..5fe5e322d 100644 --- a/internal/state/driver.go +++ b/internal/state/driver.go @@ -270,7 +270,7 @@ func (s *stateDriver) getManifestObjects(ctx context.Context, cr *nvidiav1alpha1 } isOpenshift := runtimeSpec.OpenshiftVersion != "" - nodePools, err := getNodePools(ctx, s.client, cr.Spec.NodeSelector, cr.Spec.UsePrecompiledDrivers(), isOpenshift) + nodePools, err := getNodePools(ctx, s.client, cr.Name, cr.Spec.NodeSelector, cr.Spec.UsePrecompiledDrivers(), isOpenshift) if err != nil { return nil, fmt.Errorf("failed to get node pools: %w", err) } diff --git a/internal/state/driver_test.go b/internal/state/driver_test.go index 02b2736f2..0e47499bf 100644 --- a/internal/state/driver_test.go +++ b/internal/state/driver_test.go @@ -603,6 +603,34 @@ func TestDriverOpenshiftDriverToolkit(t *testing.T) { require.Equal(t, string(o), actual) } +func TestGetNodePoolsDoesNotAllowSelectorToOverrideOwnerLabel(t *testing.T) { + require.NoError(t, corev1.AddToScheme(scheme.Scheme)) + + node := &corev1.Node{ObjectMeta: metav1.ObjectMeta{ + Name: "gpu-node", + Labels: map[string]string{ + consts.GPUPresentLabel: "true", + consts.NVIDIADriverOwnerLabel: "driver-a", + nfdOSReleaseIDLabelKey: "ubuntu", + nfdOSVersionIDLabelKey: "22.04", + }, + }} + k8sClient := fake.NewClientBuilder().WithScheme(scheme.Scheme).WithObjects(node).Build() + + nodePools, err := getNodePools( + context.Background(), + k8sClient, + "driver-a", + map[string]string{consts.NVIDIADriverOwnerLabel: "driver-b"}, + false, + false, + ) + + require.NoError(t, err) + require.Len(t, nodePools, 1) + require.Equal(t, "driver-a", nodePools[0].nodeSelector[consts.NVIDIADriverOwnerLabel]) +} + func TestDriverPrecompiled(t *testing.T) { const ( testName = "driver-precompiled" diff --git a/internal/state/nodepool.go b/internal/state/nodepool.go index c9781cce0..8523e98a7 100644 --- a/internal/state/nodepool.go +++ b/internal/state/nodepool.go @@ -26,6 +26,8 @@ import ( corev1 "k8s.io/api/core/v1" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/log" + + "github.com/NVIDIA/gpu-operator/internal/consts" ) const ( @@ -55,16 +57,15 @@ type nodePool struct { // // Each nodePool object contains information needed to identify the corresonding node pool. // Most importantly, it contains a nodeSelector used to identify the node pool. -func getNodePools(ctx context.Context, k8sClient client.Client, selector map[string]string, precompiled bool, openshift bool) ([]nodePool, error) { +func getNodePools(ctx context.Context, k8sClient client.Client, driverName string, selector map[string]string, precompiled bool, openshift bool) ([]nodePool, error) { nodePoolMap := make(map[string]nodePool) logger := log.FromContext(ctx) - nodeSelector := map[string]string{ - "nvidia.com/gpu.present": "true", - } - + nodeSelector := map[string]string{} maps.Copy(nodeSelector, selector) + nodeSelector[consts.GPUPresentLabel] = "true" + nodeSelector[consts.NVIDIADriverOwnerLabel] = driverName nodeList := &corev1.NodeList{} err := k8sClient.List(ctx, nodeList, client.MatchingLabels(nodeSelector)) diff --git a/internal/validator/validator.go b/internal/validator/validator.go index e43c8127c..ab8eed344 100644 --- a/internal/validator/validator.go +++ b/internal/validator/validator.go @@ -19,12 +19,15 @@ package validator import ( "context" "fmt" + "sort" + "strings" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/labels" "sigs.k8s.io/controller-runtime/pkg/client" nvidiav1alpha1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1alpha1" + "github.com/NVIDIA/gpu-operator/internal/consts" ) // Validator provides interface to validate NVIDIADriver fields @@ -45,25 +48,37 @@ func NewNodeSelectorValidator(c client.Client) Validator { // Check returns error when nodes matching with the selector labels of current instance of NVIDIADriver // are conflicting with other instances of NVIDIADriver func (nsv *nodeSelectorValidator) Validate(ctx context.Context, cr *nvidiav1alpha1.NVIDIADriver) error { + if err := validateNVIDIADriverNodeSelector(cr); err != nil { + return err + } + drivers := &nvidiav1alpha1.NVIDIADriverList{} err := nsv.client.List(ctx, drivers) if err != nil { return err } - names := map[string]struct{}{} + selectedNodeOwners := map[string][]string{} for di := range drivers.Items { + if err := validateNVIDIADriverNodeSelector(&drivers.Items[di]); err != nil { + return err + } + if isDefaultNVIDIADriver(&drivers.Items[di]) { + continue + } + driverName := drivers.Items[di].Name nodeList, err := nsv.getNVIDIADriverSelectedNodes(ctx, &drivers.Items[di]) if err != nil { return err } for ni := range nodeList.Items { - if _, ok := names[nodeList.Items[ni].Name]; ok { - return fmt.Errorf("conflicting NVIDIADriver NodeSelectors found for resource: %s, nodeSelector: %q", cr.Name, cr.Spec.NodeSelector) + nodeName := nodeList.Items[ni].Name + selectedNodeOwners[nodeName] = append(selectedNodeOwners[nodeName], driverName) + if len(selectedNodeOwners[nodeName]) > 1 { + sort.Strings(selectedNodeOwners[nodeName]) + return fmt.Errorf("conflicting NVIDIADriver NodeSelectors found for node %s: %s", nodeName, strings.Join(selectedNodeOwners[nodeName], ", ")) } - - names[nodeList.Items[ni].Name] = struct{}{} } } @@ -71,6 +86,22 @@ func (nsv *nodeSelectorValidator) Validate(ctx context.Context, cr *nvidiav1alph return nil } +// validateNVIDIADriverNodeSelector rejects selectors that use operator-managed routing labels. +func validateNVIDIADriverNodeSelector(cr *nvidiav1alpha1.NVIDIADriver) error { + if cr == nil || cr.Spec.NodeSelector == nil { + return nil + } + if _, ok := cr.Spec.NodeSelector[consts.NVIDIADriverOwnerLabel]; ok { + return fmt.Errorf("NVIDIADriver %q nodeSelector cannot use reserved label %q", cr.Name, consts.NVIDIADriverOwnerLabel) + } + return nil +} + +// isDefaultNVIDIADriver returns true when the NVIDIADriver is marked as the fallback driver. +func isDefaultNVIDIADriver(cr *nvidiav1alpha1.NVIDIADriver) bool { + return cr != nil && cr.Labels[consts.DefaultNVIDIADriverLabel] == "true" +} + // getNVIDIADriverSelectedNodes returns selected nodes based on the nodeselector labels set for a given NVIDIADriver instance func (nsv *nodeSelectorValidator) getNVIDIADriverSelectedNodes(ctx context.Context, cr *nvidiav1alpha1.NVIDIADriver) (*corev1.NodeList, error) { nodeList := &corev1.NodeList{} diff --git a/internal/validator/validator_test.go b/internal/validator/validator_test.go index 13fc899e4..28ae7536f 100644 --- a/internal/validator/validator_test.go +++ b/internal/validator/validator_test.go @@ -29,6 +29,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client/fake" nvidiav1alpha1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1alpha1" + "github.com/NVIDIA/gpu-operator/internal/consts" ) const ( @@ -73,6 +74,12 @@ func nodeSelector(labels map[string]string) driverOptions { } } +func defaultDriver() driverOptions { + return func(c *nvidiav1alpha1.NVIDIADriver) { + c.Labels = map[string]string{consts.DefaultNVIDIADriverLabel: "true"} + } +} + func labelled(labels map[string]string) nodeOptions { return func(n *corev1.Node) { n.Labels = labels @@ -123,8 +130,77 @@ func TestCheckNodeSelector(t *testing.T) { err = nsv.Validate(context.Background(), tc.requestedDriver) if tc.shouldError { assert.Error(t, err) + assert.Contains(t, err.Error(), "conflicting NVIDIADriver NodeSelectors found for node my-test-node") + assert.Contains(t, err.Error(), "conflictingDriver") + assert.Contains(t, err.Error(), testDriverName) } else { assert.NoError(t, err) } } } + +func TestCheckNodeSelectorIgnoresDefaultDriver(t *testing.T) { + node := makeTestNode(labelled(map[string]string{ + "nvidia.com/gpu.present": "true", + "nodepool": "a", + })) + defaultDriver := makeTestDriver(defaultDriver()) + requestedDriver := makeTestDriver(named("specificDriver"), nodeSelector(map[string]string{"nodepool": "a"})) + + s := scheme.Scheme + err := nvidiav1alpha1.AddToScheme(s) + require.NoError(t, err) + c := fake. + NewClientBuilder(). + WithScheme(s). + WithObjects(node, defaultDriver, requestedDriver). + Build() + nsv := NewNodeSelectorValidator(c) + + err = nsv.Validate(context.Background(), requestedDriver) + assert.NoError(t, err) +} + +func TestCheckNodeSelectorRejectsReservedOwnerLabel(t *testing.T) { + driver := makeTestDriver(nodeSelector(map[string]string{consts.NVIDIADriverOwnerLabel: "other-driver"})) + + s := scheme.Scheme + err := nvidiav1alpha1.AddToScheme(s) + require.NoError(t, err) + c := fake. + NewClientBuilder(). + WithScheme(s). + WithObjects(driver). + Build() + nsv := NewNodeSelectorValidator(c) + + err = nsv.Validate(context.Background(), driver) + assert.Error(t, err) + assert.Contains(t, err.Error(), "reserved label") + assert.Contains(t, err.Error(), consts.NVIDIADriverOwnerLabel) +} + +func TestCheckNodeSelectorDoesNotIgnoreDefaultNameWithoutLabel(t *testing.T) { + node := makeTestNode(labelled(map[string]string{ + "nvidia.com/gpu.present": "true", + "nodepool": "a", + })) + unlabeledDefaultNameDriver := makeTestDriver(named(consts.DefaultNVIDIADriverName), nodeSelector(map[string]string{"nodepool": "a"})) + requestedDriver := makeTestDriver(named("specificDriver"), nodeSelector(map[string]string{"nodepool": "a"})) + + s := scheme.Scheme + err := nvidiav1alpha1.AddToScheme(s) + require.NoError(t, err) + c := fake. + NewClientBuilder(). + WithScheme(s). + WithObjects(node, unlabeledDefaultNameDriver, requestedDriver). + Build() + nsv := NewNodeSelectorValidator(c) + + err = nsv.Validate(context.Background(), requestedDriver) + assert.Error(t, err) + assert.Contains(t, err.Error(), "conflicting NVIDIADriver NodeSelectors found for node my-test-node") + assert.Contains(t, err.Error(), consts.DefaultNVIDIADriverName) + assert.Contains(t, err.Error(), "specificDriver") +} diff --git a/tests/scripts/checks.sh b/tests/scripts/checks.sh index c0b2a0b7b..5f785d5a6 100755 --- a/tests/scripts/checks.sh +++ b/tests/scripts/checks.sh @@ -200,6 +200,46 @@ check_no_driver_pod_restarts() { return 0 } +print_driver_upgrade_debug() { + echo "current state of driver upgrade" + kubectl get node -l nvidia.com/gpu.present \ + -o custom-columns=NODE:.metadata.name,OWNER:.metadata.labels.nvidia\\.com/gpu\\.driver\\.owner,UPGRADE_STATE:.metadata.labels.nvidia\\.com/gpu-driver-upgrade-state --no-headers + + echo "" + echo "driver pods" + kubectl get pods -l "app.kubernetes.io/component=nvidia-driver" -n ${TEST_NAMESPACE} -o wide || true + + echo "" + echo "gpu operator operands" + kubectl get pods -n ${TEST_NAMESPACE} -o wide || true + + echo "" + echo "driver daemonsets" + kubectl get daemonsets -l "app.kubernetes.io/component=nvidia-driver" -n ${TEST_NAMESPACE} -o wide || true + + echo "" + echo "NVIDIADriver status" + local nvidiadriver_status + if nvidiadriver_status=$(kubectl get nvidiadriver -o json 2>/dev/null); then + echo "${nvidiadriver_status}" | jq -r ' + (["NAME", "DEFAULT", "STATE", "REASON", "MESSAGE"] | @tsv), + ( + .items[] + | (.status.conditions // []) as $conditions + | ($conditions[-1] // {}) as $latest + | [ + .metadata.name, + (.metadata.labels["nvidia.com/gpu-operator.default-driver"] // "-"), + (.status.state // "-"), + ($latest.reason // "-"), + ($latest.message // "-") + ] + | @tsv + ) + ' + fi +} + wait_for_driver_upgrade_done() { gpu_node_count=$(kubectl get node -l nvidia.com/gpu.present --no-headers | wc -l) local current_time=0 @@ -221,13 +261,16 @@ wait_for_driver_upgrade_done() { if [[ "${current_time}" -gt $((60 * 45)) ]]; then echo "timeout reached" + print_driver_upgrade_debug exit 1; fi - echo "current state of driver upgrade" - kubectl get node -l nvidia.com/gpu.present \ - -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.metadata.labels.nvidia\.com/gpu-driver-upgrade-state}{"\n"}{end}' - + if [[ $((current_time % 30)) -eq 0 ]]; then + print_driver_upgrade_debug + else + kubectl get node -l nvidia.com/gpu.present \ + -o custom-columns=NODE:.metadata.name,OWNER:.metadata.labels.nvidia\\.com/gpu\\.driver\\.owner,UPGRADE_STATE:.metadata.labels.nvidia\\.com/gpu-driver-upgrade-state --no-headers + fi echo "Sleeping 5 seconds" current_time=$((${current_time} + 5)) diff --git a/tests/scripts/end-to-end-nvidia-driver.sh b/tests/scripts/end-to-end-nvidia-driver.sh index bab6e9ac0..e7838ed8c 100755 --- a/tests/scripts/end-to-end-nvidia-driver.sh +++ b/tests/scripts/end-to-end-nvidia-driver.sh @@ -3,12 +3,50 @@ SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" source "${SCRIPT_DIR}"/.definitions.sh +test_nvidiadriver_helm_render_options() { + local render_file + render_file=$(mktemp) + + ${HELM} template gpu-operator "${PROJECT_DIR}/deployments/gpu-operator" \ + -n "${TEST_NAMESPACE}" \ + --set driver.nvidiaDriverCRD.enabled=true \ + --set driver.nvidiaDriverCRD.deployDefaultCR=true > "${render_file}" + grep -q "kind: NVIDIADriver" "${render_file}" + grep -q "nvidia.com/gpu-operator.default-driver: \"true\"" "${render_file}" + + ${HELM} template gpu-operator "${PROJECT_DIR}/deployments/gpu-operator" \ + -n "${TEST_NAMESPACE}" \ + --set driver.nvidiaDriverCRD.enabled=true \ + --set driver.nvidiaDriverCRD.deployDefaultCR=false > "${render_file}" + if grep -q "kind: NVIDIADriver" "${render_file}"; then + echo "NVIDIADriver rendered when driver.nvidiaDriverCRD.deployDefaultCR=false" + exit 1 + fi + + ${HELM} template gpu-operator "${PROJECT_DIR}/deployments/gpu-operator" \ + -n "${TEST_NAMESPACE}" \ + --set driver.nvidiaDriverCRD.enabled=false \ + --set driver.nvidiaDriverCRD.deployDefaultCR=true > "${render_file}" + if grep -q "kind: NVIDIADriver" "${render_file}"; then + echo "NVIDIADriver rendered when driver.nvidiaDriverCRD.enabled=false" + exit 1 + fi + + rm -f "${render_file}" +} + +echo "" +echo "" +echo "------------------------Checking NvidiaDriver Helm Rendering------------------------" +echo "-----------------------------------------------------------------------------------" +test_nvidiadriver_helm_render_options + echo "" echo "" echo "--------------Installing the GPU Operator with NvidiaDriverCR Enabled--------------" echo "-----------------------------------------------------------------------------------" # Install the operator with nvidiaDriver mode set to true -OPERATOR_OPTIONS="--set driver.nvidiaDriverCRD.enabled=true" ${SCRIPT_DIR}/install-operator.sh +OPERATOR_OPTIONS="${OPERATOR_OPTIONS:-} --set driver.nvidiaDriverCRD.enabled=true --set driver.nvidiaDriverCRD.deployDefaultCR=true" ${SCRIPT_DIR}/install-operator.sh USE_NVIDIA_DRIVER_CR="true" "${SCRIPT_DIR}"/verify-operator.sh USE_NVIDIA_DRIVER_CR="true" "${SCRIPT_DIR}"/verify-operand-restarts.sh diff --git a/tests/scripts/end-to-end.sh b/tests/scripts/end-to-end.sh index 46fceb949..03ce90d18 100755 --- a/tests/scripts/end-to-end.sh +++ b/tests/scripts/end-to-end.sh @@ -43,7 +43,18 @@ test_restart_operator ${TEST_NAMESPACE} ${CONTAINER_RUNTIME} "${SCRIPT_DIR}"/enable-operands.sh "${SCRIPT_DIR}"/verify-operator.sh -# Uninstall the workload and operator +echo "" +echo "" +echo "--------------ClusterPolicy to NVIDIADriver Migration Test--------------------------" +echo "------------------------------------------------------------------------------------" +"${SCRIPT_DIR}"/migrate-clusterpolicy-to-nvidiadriver.sh + +echo "Run GPU test workload after migration" +"${SCRIPT_DIR}"/uninstall-workload.sh +"${SCRIPT_DIR}"/install-workload.sh +"${SCRIPT_DIR}"/verify-workload.sh + +# Remove the completed workload before the next workload check. "${SCRIPT_DIR}"/uninstall-workload.sh "${SCRIPT_DIR}"/uninstall-operator.sh diff --git a/tests/scripts/migrate-clusterpolicy-to-nvidiadriver.sh b/tests/scripts/migrate-clusterpolicy-to-nvidiadriver.sh new file mode 100755 index 000000000..e971ca076 --- /dev/null +++ b/tests/scripts/migrate-clusterpolicy-to-nvidiadriver.sh @@ -0,0 +1,193 @@ +#!/bin/bash + +if [[ "${SKIP_MIGRATION}" == "true" ]]; then + echo "Skipping migration: SKIP_MIGRATION=${SKIP_MIGRATION}" + exit 0 +fi + +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +source ${SCRIPT_DIR}/.definitions.sh + +# Import the check definitions +source ${SCRIPT_DIR}/checks.sh + +OPERATOR_REPOSITORY=$(dirname "${OPERATOR_IMAGE}") +OPERATOR_OPTIONS="${OPERATOR_OPTIONS:-} --set operator.repository=${OPERATOR_REPOSITORY} --set validator.repository=${OPERATOR_REPOSITORY}" +if [[ -n "${OPERATOR_VERSION}" ]]; then + OPERATOR_OPTIONS="${OPERATOR_OPTIONS} --set operator.version=${OPERATOR_VERSION} --set validator.version=${OPERATOR_VERSION}" +fi +OPERATOR_OPTIONS="${OPERATOR_OPTIONS} --set operator.defaultRuntime=${CONTAINER_RUNTIME}" + +get_helm_release_name() { + ${HELM} list -n "${TEST_NAMESPACE}" | grep gpu-operator | awk '{print $1}' +} + +wait_for_legacy_driver_daemonset_deleted() { + local elapsed_time=0 + + echo "Waiting for ClusterPolicy-owned driver DaemonSet to be deleted" + while :; do + daemonset_count=$(kubectl get daemonset -l app=nvidia-driver-daemonset -n "${TEST_NAMESPACE}" --no-headers 2>/dev/null | wc -l) + if [[ "${daemonset_count}" -eq 0 ]]; then + break + fi + + if [[ "${elapsed_time}" -gt 300 ]]; then + echo "timeout reached waiting for legacy driver DaemonSet deletion" + kubectl get daemonset -n "${TEST_NAMESPACE}" -o wide + exit 1 + fi + + sleep 5 + elapsed_time=$((${elapsed_time} + 5)) + done +} + +wait_for_orphaned_legacy_driver_pod() { + local pod_name=$1 + local elapsed_time=0 + + echo "Waiting for legacy driver pod/${pod_name} to become orphaned" + while :; do + owner_count=$(kubectl get pod "${pod_name}" -n "${TEST_NAMESPACE}" -o json | jq '.metadata.ownerReferences | length') + if [[ "${owner_count}" -eq 0 ]]; then + echo "legacy driver pod/${pod_name} is orphaned" + break + fi + + if [[ "${elapsed_time}" -gt 300 ]]; then + echo "timeout reached waiting for legacy driver pod to become orphaned" + kubectl get pod "${pod_name}" -n "${TEST_NAMESPACE}" -o yaml + exit 1 + fi + + sleep 5 + elapsed_time=$((${elapsed_time} + 5)) + done +} + +wait_for_default_nvidiadriver() { + local elapsed_time=0 + + echo "Waiting for default NVIDIADriver to be rendered" + while :; do + default_count=$(kubectl get nvidiadriver -l nvidia.com/gpu-operator.default-driver=true --no-headers 2>/dev/null | wc -l) + if [[ "${default_count}" -eq 1 ]]; then + break + fi + + if [[ "${elapsed_time}" -gt 300 ]]; then + echo "timeout reached waiting for default NVIDIADriver" + kubectl get nvidiadriver --show-labels || true + exit 1 + fi + + sleep 5 + elapsed_time=$((${elapsed_time} + 5)) + done +} + +wait_for_nvidiadriver_owner_labels() { + local driver_name=$1 + local elapsed_time=0 + local gpu_node_count + + gpu_node_count=$(kubectl get node -l nvidia.com/gpu.present=true --no-headers | wc -l) + echo "Waiting for ${gpu_node_count} GPU node(s) to be owned by NVIDIADriver/${driver_name}" + + while :; do + owned_count=$(kubectl get nodes -l "nvidia.com/gpu.present=true,nvidia.com/gpu.driver.owner=${driver_name}" --no-headers | wc -l) + if [[ "${owned_count}" -eq "${gpu_node_count}" ]]; then + break + fi + + if [[ "${elapsed_time}" -gt 300 ]]; then + echo "timeout reached waiting for NVIDIADriver owner labels" + kubectl get nodes -l nvidia.com/gpu.present=true \ + -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.metadata.labels.nvidia\.com/gpu\.driver\.owner}{"\n"}{end}' + exit 1 + fi + + sleep 5 + elapsed_time=$((${elapsed_time} + 5)) + done +} + +wait_for_nvidiadriver_daemonset() { + local driver_name=$1 + local elapsed_time=0 + + echo "Waiting for NVIDIADriver-owned driver DaemonSet" + while :; do + daemonset_count=$(kubectl get daemonset -l "app.kubernetes.io/component=nvidia-driver" -n "${TEST_NAMESPACE}" -o json | + jq --arg driver_name "${driver_name}" '[.items[] | select(.spec.template.spec.nodeSelector["nvidia.com/gpu.driver.owner"] == $driver_name)] | length') + if [[ "${daemonset_count}" -gt 0 ]]; then + break + fi + + if [[ "${elapsed_time}" -gt 300 ]]; then + echo "timeout reached waiting for NVIDIADriver-owned driver DaemonSet" + kubectl get daemonset -n "${TEST_NAMESPACE}" -o yaml + exit 1 + fi + + sleep 5 + elapsed_time=$((${elapsed_time} + 5)) + done +} + +wait_for_legacy_driver_pod_deleted() { + local pod_name=$1 + local elapsed_time=0 + + echo "Waiting for orphaned legacy driver pod/${pod_name} to be deleted by the upgrade flow" + while :; do + if ! kubectl get pod "${pod_name}" -n "${TEST_NAMESPACE}" >/dev/null 2>&1; then + break + fi + + if [[ "${elapsed_time}" -gt 300 ]]; then + echo "timeout reached waiting for orphaned legacy driver pod deletion" + print_driver_upgrade_debug + kubectl get pod "${pod_name}" -n "${TEST_NAMESPACE}" -o yaml || true + exit 1 + fi + + print_driver_upgrade_debug + sleep 5 + elapsed_time=$((${elapsed_time} + 5)) + done +} + +legacy_driver_pod=$(kubectl get pod -l app=nvidia-driver-daemonset -n "${TEST_NAMESPACE}" -o jsonpath='{.items[0].metadata.name}') +if [[ -z "${legacy_driver_pod}" ]]; then + echo "legacy ClusterPolicy driver pod not found" + kubectl get pods -n "${TEST_NAMESPACE}" -o wide + exit 1 +fi + +operator_name=$(get_helm_release_name) +if [[ -z "${operator_name}" ]]; then + echo "GPU Operator Helm release not found in namespace ${TEST_NAMESPACE}" + ${HELM} list -n "${TEST_NAMESPACE}" + exit 1 +fi + +echo "Migrating Helm release/${operator_name} from ClusterPolicy driver management to NVIDIADriver" +${HELM} upgrade "${operator_name}" "${PROJECT_DIR}/deployments/gpu-operator" \ + -n "${TEST_NAMESPACE}" \ + --reuse-values \ + ${OPERATOR_OPTIONS:-} \ + --set driver.nvidiaDriverCRD.enabled=true \ + --set driver.nvidiaDriverCRD.deployDefaultCR=true \ + --wait + +wait_for_legacy_driver_daemonset_deleted +wait_for_orphaned_legacy_driver_pod "${legacy_driver_pod}" +wait_for_default_nvidiadriver +wait_for_nvidiadriver_owner_labels default +wait_for_nvidiadriver_daemonset default + +wait_for_legacy_driver_pod_deleted "${legacy_driver_pod}" +wait_for_driver_upgrade_done +check_nvidia_driver_pods_ready diff --git a/tests/scripts/update-nvidiadriver.sh b/tests/scripts/update-nvidiadriver.sh index 70a49ff5b..0b599ebe2 100755 --- a/tests/scripts/update-nvidiadriver.sh +++ b/tests/scripts/update-nvidiadriver.sh @@ -11,23 +11,196 @@ source ${SCRIPT_DIR}/.definitions.sh # Import the check definitions source ${SCRIPT_DIR}/checks.sh +NVIDIA_DRIVER_NAME="${NVIDIA_DRIVER_NAME:-e2e-driver}" +DEFAULT_NVIDIA_DRIVER_NAME="${DEFAULT_NVIDIA_DRIVER_NAME:-e2e-default-driver}" +SECOND_DEFAULT_NVIDIA_DRIVER_NAME="${SECOND_DEFAULT_NVIDIA_DRIVER_NAME:-e2e-second-default-driver}" +DEFAULT_NVIDIA_DRIVER_LABEL="nvidia.com/gpu-operator.default-driver" + +get_default_nvidiadriver_name() { + kubectl get nvidiadriver -l "${DEFAULT_NVIDIA_DRIVER_LABEL}=true" -o json | + jq -r '.items[0].metadata.name // empty' +} + +get_default_nvidiadriver_count() { + kubectl get nvidiadriver -l "${DEFAULT_NVIDIA_DRIVER_LABEL}=true" -o json | + jq '.items | length' +} + +create_nvidiadriver_from() { + local source_name=$1 + local target_name=$2 + local default_label=${3:-false} + + kubectl get nvidiadriver/"${source_name}" -o json | jq --arg name "${target_name}" --arg default_label "${default_label}" --arg default_driver_label "${DEFAULT_NVIDIA_DRIVER_LABEL}" ' + { + apiVersion: .apiVersion, + kind: .kind, + metadata: { + name: $name, + labels: (if $default_label == "true" then {($default_driver_label): "true"} else {} end) + }, + spec: .spec + } + ' | kubectl apply -f - +} + +remove_default_label() { + local driver_name=$1 + + kubectl label nvidiadriver/"${driver_name}" "${DEFAULT_NVIDIA_DRIVER_LABEL}-" --overwrite +} + +restore_default_label() { + local driver_name=$1 + + kubectl label nvidiadriver/"${driver_name}" "${DEFAULT_NVIDIA_DRIVER_LABEL}=true" --overwrite +} + +wait_for_default_nvidiadriver() { + local expected_name=$1 + local current_time=0 + + echo "Waiting for NVIDIADriver/${expected_name} to be the only default" + while :; do + default_count=$(get_default_nvidiadriver_count) + default_name=$(get_default_nvidiadriver_name) + if [[ "${default_count}" -eq 1 && "${default_name}" == "${expected_name}" ]]; then + echo "NVIDIADriver/${expected_name} is the only default" + break + fi + + if [[ "${current_time}" -gt 120 ]]; then + echo "timeout reached waiting for NVIDIADriver/${expected_name} to be the only default" + kubectl get nvidiadriver --show-labels + exit 1 + fi + + sleep 5 + current_time=$((${current_time} + 5)) + done +} + +test_arbitrary_name_default_nvidiadriver() { + local current_default + current_default=$(get_default_nvidiadriver_name) + if [[ -z "${current_default}" ]]; then + echo "default NVIDIADriver not found" + kubectl get nvidiadriver --show-labels + exit 1 + fi + + if [[ "${current_default}" == "${DEFAULT_NVIDIA_DRIVER_NAME}" ]]; then + wait_for_default_nvidiadriver "${DEFAULT_NVIDIA_DRIVER_NAME}" + return + fi + + echo "Moving default label from NVIDIADriver/${current_default} to NVIDIADriver/${DEFAULT_NVIDIA_DRIVER_NAME}" + remove_default_label "${current_default}" + if ! create_nvidiadriver_from "${current_default}" "${DEFAULT_NVIDIA_DRIVER_NAME}" true; then + echo "failed to create NVIDIADriver/${DEFAULT_NVIDIA_DRIVER_NAME}; restoring NVIDIADriver/${current_default} default label before failing" + restore_default_label "${current_default}" + exit 1 + fi + kubectl delete nvidiadriver/"${current_default}" + wait_for_default_nvidiadriver "${DEFAULT_NVIDIA_DRIVER_NAME}" + wait_for_nvidiadriver_owner "${DEFAULT_NVIDIA_DRIVER_NAME}" + wait_for_nvidiadriver_daemonsets "${DEFAULT_NVIDIA_DRIVER_NAME}" +} + +create_nvidiadriver() { + local default_name + default_name=$(get_default_nvidiadriver_name) + if [[ -z "${default_name}" ]]; then + echo "default NVIDIADriver not found" + kubectl get nvidiadriver --show-labels + exit 1 + fi + + echo "Creating user-defined NVIDIADriver/${NVIDIA_DRIVER_NAME} from NVIDIADriver/${default_name}" + create_nvidiadriver_from "${default_name}" "${NVIDIA_DRIVER_NAME}" false +} + +wait_for_nvidiadriver_owner() { + local driver_name=$1 + local current_time=0 + local gpu_node_count + + gpu_node_count=$(kubectl get node -l nvidia.com/gpu.present=true --no-headers | wc -l) + echo "Waiting for ${gpu_node_count} GPU node(s) to be owned by NVIDIADriver/${driver_name}" + + while :; do + owned_count=$(kubectl get nodes -l "nvidia.com/gpu.present=true,nvidia.com/gpu.driver.owner=${driver_name}" --no-headers | wc -l) + if [[ "${owned_count}" -eq "${gpu_node_count}" ]]; then + echo "All GPU nodes are owned by NVIDIADriver/${driver_name}" + break + fi + + if [[ "${current_time}" -gt $((60 * 15)) ]]; then + echo "timeout reached waiting for NVIDIADriver/${driver_name} ownership" + kubectl get nodes -l nvidia.com/gpu.present=true \ + -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.metadata.labels.nvidia\.com/gpu\.driver\.owner}{"\n"}{end}' + exit 1 + fi + + echo "NVIDIADriver/${driver_name} owns ${owned_count}/${gpu_node_count} GPU node(s)" + sleep 5 + current_time=$((${current_time} + 5)) + done +} + +get_nvidiadriver_daemonsets() { + local driver_name=$1 + kubectl get daemonset -l "app.kubernetes.io/component=nvidia-driver" -n "$TEST_NAMESPACE" -o json | + jq --arg driver_name "${driver_name}" '.items | map(select(.spec.template.spec.nodeSelector["nvidia.com/gpu.driver.owner"] == $driver_name))' +} + +wait_for_nvidiadriver_daemonsets() { + local driver_name=$1 + local current_time=0 + + echo "Waiting for daemonsets owned by NVIDIADriver/${driver_name}" + while :; do + daemonset_count=$(get_nvidiadriver_daemonsets "${driver_name}" | jq length) + if [[ "${daemonset_count}" -gt 0 ]]; then + echo "Found ${daemonset_count} daemonset(s) owned by NVIDIADriver/${driver_name}" + break + fi + + if [[ "${current_time}" -gt $((60 * 15)) ]]; then + echo "timeout reached waiting for daemonsets owned by NVIDIADriver/${driver_name}" + kubectl get daemonset -l "app.kubernetes.io/component=nvidia-driver" -n "$TEST_NAMESPACE" -o yaml + exit 1 + fi + + sleep 5 + current_time=$((${current_time} + 5)) + done +} + test_driver_image_updates() { # Update driver image version - kubectl patch nvidiadriver/default --type='json' -p='[{"op": "replace", "path": "/spec/version", "value": '"$TARGET_DRIVER_VERSION"'}]' + kubectl patch nvidiadriver/"${NVIDIA_DRIVER_NAME}" --type='json' -p='[{"op": "replace", "path": "/spec/version", "value": '"$TARGET_DRIVER_VERSION"'}]' if [ "$?" -ne 0 ]; then echo "cannot update driver image with version $TARGET_DRIVER_VERSION for driver-daemonset" exit 1 fi - # Wait for 10 seconds for the change to be applied by operator - sleep 10 - # Verify update is applied to Driver Daemonset - UPDATED_IMAGE=$(kubectl get daemonset -l "app.kubernetes.io/component=nvidia-driver" -n "$TEST_NAMESPACE" -o json | jq '.items[0].spec.template.spec.containers[0].image') - if [[ "$UPDATED_IMAGE" != *"$TARGET_DRIVER_VERSION"* ]]; then - echo "Image update failed for driver daemonset to version $TARGET_DRIVER_VERSION" - exit 1 - fi + local current_time=0 + while :; do + if get_nvidiadriver_daemonsets "${NVIDIA_DRIVER_NAME}" | jq -e --arg version "${TARGET_DRIVER_VERSION}" 'length > 0 and all(.[]; .spec.template.spec.containers[0].image | contains($version))' >/dev/null; then + break + fi + + if [[ "${current_time}" -gt 120 ]]; then + echo "Image update failed for driver daemonset to version $TARGET_DRIVER_VERSION" + get_nvidiadriver_daemonsets "${NVIDIA_DRIVER_NAME}" + exit 1 + fi + + sleep 5 + current_time=$((${current_time} + 5)) + done echo "driver daemonset image updated successfully to version $TARGET_DRIVER_VERSION" # Delete driver pod to trigger update due to OnDelete policy @@ -44,7 +217,7 @@ test_driver_image_updates() { } test_custom_labels_override() { - if ! kubectl patch nvidiadriver/default --type='json' -p='[{"op": "add", "path": "/spec/labels", "value": {"cloudprovider": "aws", "platform": "kubernetes"}}]'; + if ! kubectl patch nvidiadriver/"${NVIDIA_DRIVER_NAME}" --type='json' -p='[{"op": "add", "path": "/spec/labels", "value": {"cloudprovider": "aws", "platform": "kubernetes"}}]'; then echo "cannot update the labels of the NVIDIADriver resource" exit 1 @@ -52,11 +225,21 @@ test_custom_labels_override() { # Wait for the operator to update the pod template with new labels echo "Waiting for DaemonSet pod template to be updated with new labels..." - kubectl wait daemonset \ - -l "app.kubernetes.io/component=nvidia-driver" \ - -n "$TEST_NAMESPACE" \ - --for=jsonpath='{.spec.template.metadata.labels.cloudprovider}'=aws \ - --timeout=120s + local current_time=0 + while :; do + if get_nvidiadriver_daemonsets "${NVIDIA_DRIVER_NAME}" | jq -e 'length > 0 and all(.[]; .spec.template.metadata.labels.cloudprovider == "aws" and .spec.template.metadata.labels.platform == "kubernetes")' >/dev/null; then + break + fi + + if [[ "${current_time}" -gt 120 ]]; then + echo "timeout reached waiting for DaemonSet pod template labels" + get_nvidiadriver_daemonsets "${NVIDIA_DRIVER_NAME}" + exit 1 + fi + + sleep 5 + current_time=$((${current_time} + 5)) + done # Delete driver pod to force recreation with updated labels. Existing pods are not automatically restarted due to the DaemonSet's 'OnDelete` updateStrategy. echo "Deleting driver pod to trigger recreation with updated labels..." @@ -68,20 +251,111 @@ test_custom_labels_override() { check_nvidia_driver_pods_ready echo "checking nvidia-driver-daemonset labels" - for pod in $(kubectl get pods -n "$TEST_NAMESPACE" -l "app.kubernetes.io/component=nvidia-driver" --output=jsonpath={.items..metadata.name}) - do - cp_label_value=$(kubectl get pod -n "$TEST_NAMESPACE" "$pod" --output jsonpath={.metadata.labels.cloudprovider}) - if [ "$cp_label_value" != "aws" ]; then - echo "Custom Label cloudprovider is incorrect when nvidiadriver labels are overridden - $pod" + labeled_pod_count=$(kubectl get pods -n "$TEST_NAMESPACE" -l "app.kubernetes.io/component=nvidia-driver,cloudprovider=aws,platform=kubernetes" --no-headers | wc -l) + gpu_node_count=$(kubectl get node -l nvidia.com/gpu.present=true --no-headers | wc -l) + if [[ "${labeled_pod_count}" -ne "${gpu_node_count}" ]]; then + echo "Custom labels are missing from one or more NVIDIADriver/${NVIDIA_DRIVER_NAME} pods" + kubectl get pods -n "$TEST_NAMESPACE" -l "app.kubernetes.io/component=nvidia-driver" --show-labels + exit 1 + fi +} + +assert_nvidiadriver_owner_count() { + local driver_name=$1 + local gpu_node_count + local owned_count + + gpu_node_count=$(kubectl get node -l nvidia.com/gpu.present=true --no-headers | wc -l) + owned_count=$(kubectl get nodes -l "nvidia.com/gpu.present=true,nvidia.com/gpu.driver.owner=${driver_name}" --no-headers | wc -l) + if [[ "${owned_count}" -ne "${gpu_node_count}" ]]; then + echo "Expected ${gpu_node_count} GPU node(s) to remain owned by NVIDIADriver/${driver_name}, found ${owned_count}" + kubectl get nodes -l nvidia.com/gpu.present=true \ + -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.metadata.labels.nvidia\.com/gpu\.driver\.owner}{"\n"}{end}' exit 1 fi - platform_label_value=$(kubectl get pod -n "$TEST_NAMESPACE" "$pod" --output jsonpath={.metadata.labels.platform}) - if [ "$platform_label_value" != "kubernetes" ]; then - echo "Custom Label platform is incorrect when nvidiadriver labels are overridden - $pod" - exit 1 +} + +wait_for_nvidiadriver_condition_message() { + local driver_name=$1 + local message=$2 + local current_time=0 + + echo "Waiting for NVIDIADriver/${driver_name} status message to contain: ${message}" + while :; do + if kubectl get nvidiadriver/"${driver_name}" -o json | jq -e --arg message "${message}" ' + (.status.state // "") == "notReady" and + ([.status.conditions[]?.message // ""] | any(contains($message))) + ' >/dev/null; then + break + fi + + if [[ "${current_time}" -gt 120 ]]; then + echo "timeout reached waiting for NVIDIADriver/${driver_name} status message" + kubectl get nvidiadriver/"${driver_name}" -o yaml + exit 1 + fi + + sleep 5 + current_time=$((${current_time} + 5)) + done +} + +wait_for_clusterpolicy_condition_message() { + local message=$1 + local current_time=0 + + echo "Waiting for ClusterPolicy/cluster-policy status message to contain: ${message}" + while :; do + if kubectl get clusterpolicy/cluster-policy -o json | jq -e --arg message "${message}" ' + (.status.state // "") == "notReady" and + ([.status.conditions[]?.message // ""] | any(contains($message))) + ' >/dev/null; then + break + fi + + if [[ "${current_time}" -gt 120 ]]; then + echo "timeout reached waiting for ClusterPolicy/cluster-policy status message" + kubectl get clusterpolicy/cluster-policy -o yaml + exit 1 + fi + + sleep 5 + current_time=$((${current_time} + 5)) + done +} + +test_removed_default_label_conflict_preserves_owners() { + echo "Testing that removing the default label makes the CR a normal conflicting NVIDIADriver" + remove_default_label "${DEFAULT_NVIDIA_DRIVER_NAME}" + wait_for_nvidiadriver_condition_message "${NVIDIA_DRIVER_NAME}" "conflicting NVIDIADriver NodeSelectors found" + wait_for_nvidiadriver_condition_message "${DEFAULT_NVIDIA_DRIVER_NAME}" "conflicting NVIDIADriver NodeSelectors found" + wait_for_clusterpolicy_condition_message "conflicting NVIDIADriver NodeSelectors found" + assert_nvidiadriver_owner_count "${NVIDIA_DRIVER_NAME}" + restore_default_label "${DEFAULT_NVIDIA_DRIVER_NAME}" + wait_for_default_nvidiadriver "${DEFAULT_NVIDIA_DRIVER_NAME}" + wait_for_nvidiadriver_owner "${NVIDIA_DRIVER_NAME}" +} + +test_multiple_default_labels() { + echo "Testing multiple default-labeled NVIDIADrivers" + if ! create_nvidiadriver_from "${DEFAULT_NVIDIA_DRIVER_NAME}" "${SECOND_DEFAULT_NVIDIA_DRIVER_NAME}" true; then + echo "creation of second default-labeled NVIDIADriver was rejected" + return fi - done + + wait_for_nvidiadriver_condition_message "${SECOND_DEFAULT_NVIDIA_DRIVER_NAME}" "multiple default NVIDIADrivers found" + assert_nvidiadriver_owner_count "${NVIDIA_DRIVER_NAME}" + kubectl delete nvidiadriver/"${SECOND_DEFAULT_NVIDIA_DRIVER_NAME}" + wait_for_default_nvidiadriver "${DEFAULT_NVIDIA_DRIVER_NAME}" + wait_for_nvidiadriver_owner "${NVIDIA_DRIVER_NAME}" } +test_arbitrary_name_default_nvidiadriver +create_nvidiadriver +wait_for_nvidiadriver_owner "${NVIDIA_DRIVER_NAME}" +wait_for_nvidiadriver_daemonsets "${NVIDIA_DRIVER_NAME}" +check_nvidia_driver_pods_ready test_driver_image_updates test_custom_labels_override +test_removed_default_label_conflict_preserves_owners +test_multiple_default_labels