Skip to content

Commit 7bdde96

Browse files
vamsi-resolve and claude committed
fix: recover from warm pool pod deletion instead of permanent error loop
Cherry-picks two upstream fixes:

1. kubernetes-sigs#521 — When an adopted warm pool pod is deleted (node failure, drain, eviction), the controller returned a hard error because the agents.x-k8s.io/pod-name annotation pointed to a non-existent pod. This left the Sandbox stuck in a permanent reconcile error loop. Now the controller clears the stale annotation and falls through to create a replacement pod (which remounts the existing PVC).

2. kubernetes-sigs#469 — During warm pool adoption, ensure the pod-name annotation is correct before the sandbox can be observed as Ready. Prevents stale annotations from being set in the first place.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent c1158de commit 7bdde96

3 files changed

Lines changed: 39 additions & 19 deletions

File tree

controllers/sandbox_controller.go

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -472,8 +472,12 @@ func (r *SandboxReconciler) reconcilePod(ctx context.Context, sandbox *sandboxv1
472472
return nil, fmt.Errorf("pod get failed: %w", err)
473473
}
474474
if podNameAnnotationExists {
475-
log.Error(err, "Pod not found")
476-
return nil, fmt.Errorf("pod in annotation get failed: %w", err)
475+
log.Info("Tracked pod not found, clearing stale annotation", "podName", podName)
476+
patch := client.MergeFrom(sandbox.DeepCopy())
477+
delete(sandbox.Annotations, sandboxv1alpha1.SandboxPodNameAnnotation)
478+
if patchErr := r.Patch(ctx, sandbox, patch); patchErr != nil {
479+
return nil, fmt.Errorf("failed to clear stale pod name annotation: %w", patchErr)
480+
}
477481
}
478482
pod = nil
479483
}

controllers/sandbox_controller_test.go

Lines changed: 26 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1130,31 +1130,40 @@ func TestReconcilePod(t *testing.T) {
11301130
expectErr: true,
11311131
},
11321132
{
1133-
name: "error when annotated pod does not exist",
1134-
initialObjs: []runtime.Object{},
1135-
sandbox: &sandboxv1alpha1.Sandbox{
1133+
name: "clears stale annotation and creates replacement pod when annotated pod does not exist",
1134+
sandbox: func() *sandboxv1alpha1.Sandbox {
1135+
s := sandboxObj.DeepCopy()
1136+
s.Annotations = map[string]string{
1137+
sandboxv1alpha1.SandboxPodNameAnnotation: "non-existent-pod",
1138+
}
1139+
return s
1140+
}(),
1141+
wantPod: &corev1.Pod{
11361142
ObjectMeta: metav1.ObjectMeta{
1137-
Name: sandboxName,
1138-
Namespace: sandboxNs,
1143+
Name: sandboxName,
1144+
Namespace: sandboxNs,
1145+
ResourceVersion: "1",
1146+
Labels: map[string]string{
1147+
"agents.x-k8s.io/sandbox-name-hash": nameHash,
1148+
"custom-label": "label-val",
1149+
},
11391150
Annotations: map[string]string{
1140-
sandboxv1alpha1.SandboxPodNameAnnotation: "non-existent-pod",
1151+
"custom-annotation": "anno-val",
11411152
},
1153+
OwnerReferences: []metav1.OwnerReference{sandboxControllerRef(sandboxName)},
11421154
},
1143-
Spec: sandboxv1alpha1.SandboxSpec{
1144-
Replicas: ptr.To(int32(1)),
1145-
PodTemplate: sandboxv1alpha1.PodTemplate{
1146-
Spec: corev1.PodSpec{
1147-
Containers: []corev1.Container{
1148-
{
1149-
Name: "test-container",
1150-
},
1151-
},
1155+
Spec: corev1.PodSpec{
1156+
Containers: []corev1.Container{
1157+
{
1158+
Name: "test-container",
11521159
},
11531160
},
11541161
},
11551162
},
1156-
wantPod: nil,
1157-
expectErr: true,
1163+
expectErr: false,
1164+
wantSandboxAnnotations: map[string]string{
1165+
sandboxv1alpha1.SandboxPodNameAnnotation: sandboxName,
1166+
},
11581167
},
11591168
{
11601169
name: "refuses to delete annotated pod owned by a different controller",

extensions/controllers/sandboxclaim_controller.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -466,6 +466,13 @@ func (r *SandboxClaimReconciler) adoptSandboxFromCandidates(ctx context.Context,
466466
if adopted.Annotations == nil {
467467
adopted.Annotations = make(map[string]string)
468468
}
469+
// Ensure the adopted sandbox records its pod name before it can be observed Ready.
470+
if podName := adopted.Annotations[v1alpha1.SandboxPodNameAnnotation]; podName != adopted.Name {
471+
if podName != "" {
472+
logger.Info("Correcting adopted sandbox pod-name annotation", "sandbox", adopted.Name, "oldPodName", podName, "newPodName", adopted.Name)
473+
}
474+
adopted.Annotations[v1alpha1.SandboxPodNameAnnotation] = adopted.Name
475+
}
469476
if traceContext, ok := claim.Annotations[asmetrics.TraceContextAnnotation]; ok {
470477
adopted.Annotations[asmetrics.TraceContextAnnotation] = traceContext
471478
}

0 commit comments

Comments
 (0)