From 6812a1e1db68e2b8ad4a3607dc9396c96287d37f Mon Sep 17 00:00:00 2001 From: Matthew Booth Date: Sat, 20 Jun 2026 17:06:19 +0100 Subject: [PATCH] Don't set Progressing=False until all pods available We don't want the ClusterOperator to report Progressing=True whenever a Node reboots. However, we also don't want to report Progressing=False during a CNI rollout until all pods are available. This change ensures both are covered. --- pkg/controller/statusmanager/pod_status.go | 42 +- .../statusmanager/status_manager_test.go | 688 +++++++++++++++++- 2 files changed, 708 insertions(+), 22 deletions(-) diff --git a/pkg/controller/statusmanager/pod_status.go b/pkg/controller/statusmanager/pod_status.go index 6e83c910f9..64b79112bb 100644 --- a/pkg/controller/statusmanager/pod_status.go +++ b/pkg/controller/statusmanager/pod_status.go @@ -104,11 +104,19 @@ func (status *StatusManager) SetFromPods() { } else if ds.Status.UpdatedNumberScheduled < ds.Status.CurrentNumberScheduled { progressing = append(progressing, fmt.Sprintf("DaemonSet %q update is rolling out (%d out of %d updated)", dsName.String(), ds.Status.UpdatedNumberScheduled, ds.Status.CurrentNumberScheduled)) dsProgressing = true - } else if ds.Status.NumberUnavailable > 0 { - if dsRolloutActive { + } else if ds.Status.NumberUnavailable > 0 && (hadState || dsRolloutActive) { + // Rollout in progress: either continuing a tracked rollout or a new/initial rollout + if hadState { + progressing = append(progressing, fmt.Sprintf("DaemonSet %q rollout is waiting for %d pods to become available", dsName.String(), ds.Status.NumberUnavailable)) + } else { progressing = append(progressing, fmt.Sprintf("DaemonSet %q is not available (awaiting %d nodes)", dsName.String(), ds.Status.NumberUnavailable)) - dsProgressing = true } + dsProgressing = true + if !isNonCritical(ds) { + clbo = append(clbo, status.CheckCrashLoopBackOffPods(dsName, ds.Spec.Selector.MatchLabels, "DaemonSet")...) 
+ } + } else if ds.Status.NumberUnavailable > 0 { + // Reboot churn: unavailable pods but no active rollout and no tracked state if !isNonCritical(ds) { clbo = append(clbo, status.CheckCrashLoopBackOffPods(dsName, ds.Spec.Selector.MatchLabels, "DaemonSet")...) } @@ -161,15 +169,23 @@ func (status *StatusManager) SetFromPods() { } else if ss.Status.UpdatedReplicas < ss.Status.Replicas { progressing = append(progressing, fmt.Sprintf("StatefulSet %q update is rolling out (%d out of %d updated)", ssName.String(), ss.Status.UpdatedReplicas, ss.Status.Replicas)) ssProgressing = true - } else if ss.Status.ReadyReplicas > 0 && ss.Status.ReadyReplicas < ss.Status.Replicas { - if ssRolloutActive { + } else if ss.Status.ReadyReplicas < ss.Status.Replicas && (hadState || ssRolloutActive) { + // Rollout in progress: either continuing a tracked rollout or a new/initial rollout + if hadState { + progressing = append(progressing, fmt.Sprintf("StatefulSet %q rollout is waiting for %d pods to become available", ssName.String(), (ss.Status.Replicas-ss.Status.ReadyReplicas))) + } else { progressing = append(progressing, fmt.Sprintf("StatefulSet %q is not available (awaiting %d nodes)", ssName.String(), (ss.Status.Replicas-ss.Status.ReadyReplicas))) - ssProgressing = true } + ssProgressing = true // Check for any pods in CrashLoopBackOff state and mark the operator as degraded if so. if !isNonCritical(ss) { clbo = append(clbo, status.CheckCrashLoopBackOffPods(ssName, ss.Spec.Selector.MatchLabels, "StatefulSet")...) } + } else if ss.Status.ReadyReplicas < ss.Status.Replicas { + // Reboot churn: unavailable pods but no active rollout and no tracked state + if !isNonCritical(ss) { + clbo = append(clbo, status.CheckCrashLoopBackOffPods(ssName, ss.Spec.Selector.MatchLabels, "StatefulSet")...) 
+ } } else if ss.Status.AvailableReplicas == 0 && ssRolloutActive { progressing = append(progressing, fmt.Sprintf("StatefulSet %q is not yet scheduled on any nodes", ssName.String())) ssProgressing = true @@ -218,15 +234,23 @@ func (status *StatusManager) SetFromPods() { } else if dep.Status.UpdatedReplicas < dep.Status.Replicas { progressing = append(progressing, fmt.Sprintf("Deployment %q update is rolling out (%d out of %d updated)", depName.String(), dep.Status.UpdatedReplicas, dep.Status.Replicas)) depProgressing = true - } else if dep.Status.UnavailableReplicas > 0 { - if depRolloutActive { + } else if dep.Status.UnavailableReplicas > 0 && (hadState || depRolloutActive) { + // Rollout in progress: either continuing a tracked rollout or a new/initial rollout + if hadState { + progressing = append(progressing, fmt.Sprintf("Deployment %q rollout is waiting for %d pods to become available", depName.String(), dep.Status.UnavailableReplicas)) + } else { progressing = append(progressing, fmt.Sprintf("Deployment %q is not available (awaiting %d nodes)", depName.String(), dep.Status.UnavailableReplicas)) - depProgressing = true } + depProgressing = true // Check for any pods in CrashLoopBackOff state and mark the operator as degraded if so. if !isNonCritical(dep) { clbo = append(clbo, status.CheckCrashLoopBackOffPods(depName, dep.Spec.Selector.MatchLabels, "Deployment")...) } + } else if dep.Status.UnavailableReplicas > 0 { + // Reboot churn: unavailable pods but no active rollout and no tracked state + if !isNonCritical(dep) { + clbo = append(clbo, status.CheckCrashLoopBackOffPods(depName, dep.Spec.Selector.MatchLabels, "Deployment")...) 
+ } } else if dep.Status.AvailableReplicas == 0 && depRolloutActive { progressing = append(progressing, fmt.Sprintf("Deployment %q is not yet scheduled on any nodes", depName.String())) depProgressing = true diff --git a/pkg/controller/statusmanager/status_manager_test.go b/pkg/controller/statusmanager/status_manager_test.go index 6911e0c0e5..919d669743 100644 --- a/pkg/controller/statusmanager/status_manager_test.go +++ b/pkg/controller/statusmanager/status_manager_test.go @@ -29,6 +29,8 @@ import ( crclient "sigs.k8s.io/controller-runtime/pkg/client" ) +const testReleaseVersion = "v1.0.0" + var ( masterMachineConfigIPsecExtName = "80-ipsec-master-extensions" workerMachineConfigIPsecExtName = "80-ipsec-worker-extensions" @@ -1296,7 +1298,19 @@ func TestStatusManagerSetFromDaemonSets(t *testing.T) { t.Fatalf("unexpected Status.Versions: %#v", co.Status.Versions) } - // Next update: updatedNumberScheduled -> 1 + // Rollout completes: all pods available, state cleared + dsA.Status = appsv1.DaemonSetStatus{ + CurrentNumberScheduled: 1, + DesiredNumberScheduled: 1, + NumberAvailable: 1, + NumberReady: 1, + ObservedGeneration: 2, + UpdatedNumberScheduled: 1, + } + setStatus(t, client, dsA) + status.SetFromPods() + + // Simulate reboot churn: updatedNumberScheduled -> 1, but pods are unavailable after rollout completed dsA.Status = appsv1.DaemonSetStatus{ CurrentNumberScheduled: 1, DesiredNumberScheduled: 1, @@ -1314,8 +1328,7 @@ func TestStatusManagerSetFromDaemonSets(t *testing.T) { if err != nil { t.Fatalf("error getting ClusterOperator: %v", err) } - // With the simplified rollout detection logic, once UpdatedNumberScheduled >= CurrentNumberScheduled, - // the rollout is complete. Unavailability after rollout completion is treated as + // Unavailability without tracked rollout state (hadState=false) is treated as // reboot churn, not a network rollout, so Progressing should be False. 
if !conditionsInclude(oc.Status.Conditions, []operv1.OperatorCondition{ { @@ -1845,11 +1858,17 @@ func TestStatusManagerSetFromDeployments(t *testing.T) { t.Fatalf("Didn't find %s in pod state", nsn) } + // Complete the rollout so state is cleared before simulating reboot depB.Status.UpdatedReplicas = depB.Status.Replicas - depB.Status.UnavailableReplicas = 1 - depB.Status.AvailableReplicas = 0 + depB.Status.AvailableReplicas = depB.Status.Replicas + depB.Status.UnavailableReplicas = 0 depB.Status.ObservedGeneration = depB.Generation + setStatus(t, client, depB) + status.SetFromPods() + // Simulate node reboot: pods become unavailable after rollout completion + depB.Status.UnavailableReplicas = 1 + depB.Status.AvailableReplicas = 0 setStatus(t, client, depB) status.SetFromPods() @@ -1861,8 +1880,7 @@ func TestStatusManagerSetFromDeployments(t *testing.T) { if err != nil { t.Fatalf("error getting ClusterOperator: %v", err) } - // With the simplified rollout detection logic, once UpdatedReplicas >= Replicas, - // the rollout is complete. Unavailability after rollout completion is treated as + // Unavailability without tracked rollout state (hadState=false) is treated as // reboot churn, not a network rollout, so Progressing should be False. 
if !conditionsInclude(oc.Status.Conditions, []operv1.OperatorCondition{ { @@ -2184,10 +2202,17 @@ func TestStatusManagerRestoresActiveRolloutAfterRestart(t *testing.T) { set(t, client, depB) status.SetFromPods() + // Complete the rollout so state is cleared before simulating reboot depB.Status.UpdatedReplicas = depB.Status.Replicas + depB.Status.AvailableReplicas = depB.Status.Replicas + depB.Status.UnavailableReplicas = 0 + depB.Status.ObservedGeneration = depB.Generation + setStatus(t, client, depB) + status.SetFromPods() + + // Simulate node reboot: pods become unavailable after rollout completion depB.Status.AvailableReplicas = 0 depB.Status.UnavailableReplicas = 1 - depB.Status.ObservedGeneration = depB.Generation setStatus(t, client, depB) restarted := New(client, "testing", names.StandAloneClusterName) @@ -2199,8 +2224,7 @@ func TestStatusManagerRestoresActiveRolloutAfterRestart(t *testing.T) { if err != nil { t.Fatalf("error getting ClusterOperator: %v", err) } - // With the simplified rollout detection logic, once UpdatedReplicas >= Replicas, - // the rollout is complete. Unavailability after rollout completion is treated as + // Unavailability without tracked rollout state (hadState=false) is treated as // reboot churn, not a network rollout, so Progressing should be False. 
if !conditionsInclude(oc.Status.Conditions, []operv1.OperatorCondition{ { @@ -2263,10 +2287,17 @@ func TestStatusManagerRestoresStatefulSetActiveRolloutAfterRestart(t *testing.T) set(t, client, ssB) status.SetFromPods() + // Complete the rollout so state is cleared before simulating reboot ssB.Status.UpdatedReplicas = ssB.Status.Replicas + ssB.Status.ReadyReplicas = ssB.Status.Replicas + ssB.Status.AvailableReplicas = ssB.Status.Replicas + ssB.Status.ObservedGeneration = ssB.Generation + setStatus(t, client, ssB) + status.SetFromPods() + + // Simulate node reboot: pods become unready after rollout completion ssB.Status.ReadyReplicas = ssB.Status.Replicas - 1 ssB.Status.AvailableReplicas = ssB.Status.Replicas - 1 - ssB.Status.ObservedGeneration = ssB.Generation setStatus(t, client, ssB) restarted := New(client, "testing", names.StandAloneClusterName) @@ -2278,8 +2309,7 @@ func TestStatusManagerRestoresStatefulSetActiveRolloutAfterRestart(t *testing.T) if err != nil { t.Fatalf("error getting ClusterOperator: %v", err) } - // With the simplified rollout detection logic, once UpdatedReplicas >= Replicas, - // the rollout is complete. Unready replicas after rollout completion are treated as + // Unready replicas without tracked rollout state (hadState=false) are treated as // reboot churn, not a network rollout, so Progressing should be False. if !conditionsInclude(oc.Status.Conditions, []operv1.OperatorCondition{ { @@ -2338,6 +2368,638 @@ func setLastPodState(t *testing.T, client cnoclient.Client, name string, ps podS } } +// TestDaemonSetRolloutWaitsForAvailability verifies that when a DaemonSet rollout +// completes scheduling (UpdatedNumberScheduled == CurrentNumberScheduled) but pods +// are still becoming available (NumberUnavailable > 0), the operator reports +// Progressing=True with a "waiting for pods to become available" message. 
+func TestDaemonSetRolloutWaitsForAvailability(t *testing.T) { + t.Setenv("RELEASE_VERSION", testReleaseVersion) + client := fake.NewFakeClient() + status := New(client, "testing", names.StandAloneClusterName) + status.clock = testingclock.NewFakeClock(time.Now()) + setFakeListers(status) + no := &operv1.Network{ObjectMeta: metav1.ObjectMeta{Name: names.OPERATOR_CONFIG}} + setOC(t, client, no) + setCO(t, client, "testing") + + ds := &appsv1.DaemonSet{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "one", + Name: "alpha", + Generation: 1, + Labels: sl, + Annotations: map[string]string{ + "release.openshift.io/version": testReleaseVersion, + }, + }, + Spec: appsv1.DaemonSetSpec{ + Selector: &metav1.LabelSelector{ + MatchLabels: map[string]string{"app": "alpha"}, + }, + }, + Status: appsv1.DaemonSetStatus{ + CurrentNumberScheduled: 3, + DesiredNumberScheduled: 3, + NumberAvailable: 3, + NumberReady: 3, + ObservedGeneration: 1, + UpdatedNumberScheduled: 3, + }, + } + set(t, client, ds) + status.SetFromPods() + + // Verify initial state is not progressing + _, oc, err := getStatuses(client, "testing") + if err != nil { + t.Fatalf("error getting ClusterOperator: %v", err) + } + if !conditionsInclude(oc.Status.Conditions, []operv1.OperatorCondition{ + { + Type: operv1.OperatorStatusTypeProgressing, + Status: operv1.ConditionFalse, + }, + }) { + t.Fatalf("unexpected Status.Conditions: %#v", oc.Status.Conditions) + } + + // Phase 1: Start rollout - increment generation, pods not yet updated + ds.Generation = 2 + ds.Status.ObservedGeneration = 2 + ds.Status.UpdatedNumberScheduled = 0 + ds.Status.NumberUnavailable = 3 + ds.Status.NumberAvailable = 0 + ds.Status.NumberReady = 0 + setStatus(t, client, ds) + status.SetFromPods() + + _, oc, err = getStatuses(client, "testing") + if err != nil { + t.Fatalf("error getting ClusterOperator: %v", err) + } + if !conditionsInclude(oc.Status.Conditions, []operv1.OperatorCondition{ + { + Type: operv1.OperatorStatusTypeProgressing, + 
Status: operv1.ConditionTrue, + }, + }) { + t.Fatalf("Phase 1: Expected Progressing=True, got: %#v", oc.Status.Conditions) + } + + // Verify state is tracked + ps := getLastPodState(t, client, "testing") + nsn := ClusteredName{Namespace: "one", Name: "alpha"} + found := false + for _, dsState := range ps.DaemonsetStates { + if dsState.ClusteredName == nsn { + found = true + break + } + } + if !found { + t.Fatalf("Phase 1: DaemonSet state should be tracked during rollout") + } + + // Phase 2: Scheduling completes - UpdatedNumberScheduled catches up + // but pods are still becoming available (NumberUnavailable > 0) + ds.Status.UpdatedNumberScheduled = 3 + ds.Status.NumberUnavailable = 3 + ds.Status.NumberAvailable = 0 + ds.Status.NumberReady = 0 + setStatus(t, client, ds) + status.SetFromPods() + + _, oc, err = getStatuses(client, "testing") + if err != nil { + t.Fatalf("error getting ClusterOperator: %v", err) + } + if !conditionsInclude(oc.Status.Conditions, []operv1.OperatorCondition{ + { + Type: operv1.OperatorStatusTypeProgressing, + Status: operv1.ConditionTrue, + }, + }) { + t.Fatalf("Phase 2: Expected Progressing=True while waiting for availability, got: %#v", oc.Status.Conditions) + } + + // Verify state is still tracked + ps = getLastPodState(t, client, "testing") + found = false + for _, dsState := range ps.DaemonsetStates { + if dsState.ClusteredName == nsn { + found = true + break + } + } + if !found { + t.Fatalf("Phase 2: DaemonSet state should still be tracked while waiting for availability") + } + + // Phase 3: Partial availability - some pods become available + ds.Status.NumberAvailable = 1 + ds.Status.NumberReady = 1 + ds.Status.NumberUnavailable = 2 + setStatus(t, client, ds) + status.SetFromPods() + + _, oc, err = getStatuses(client, "testing") + if err != nil { + t.Fatalf("error getting ClusterOperator: %v", err) + } + if !conditionsInclude(oc.Status.Conditions, []operv1.OperatorCondition{ + { + Type: operv1.OperatorStatusTypeProgressing, + 
Status: operv1.ConditionTrue, + }, + }) { + t.Fatalf("Phase 3: Expected Progressing=True with partial availability, got: %#v", oc.Status.Conditions) + } + + // Phase 4: Full availability - all pods available + ds.Status.NumberAvailable = 3 + ds.Status.NumberReady = 3 + ds.Status.NumberUnavailable = 0 + setStatus(t, client, ds) + status.SetFromPods() + + _, oc, err = getStatuses(client, "testing") + if err != nil { + t.Fatalf("error getting ClusterOperator: %v", err) + } + if !conditionsInclude(oc.Status.Conditions, []operv1.OperatorCondition{ + { + Type: operv1.OperatorStatusTypeProgressing, + Status: operv1.ConditionFalse, + }, + }) { + t.Fatalf("Phase 4: Expected Progressing=False when fully available, got: %#v", oc.Status.Conditions) + } + + // Verify state is cleared + ps = getLastPodState(t, client, "testing") + for _, dsState := range ps.DaemonsetStates { + if dsState.ClusteredName == nsn { + t.Fatalf("Phase 4: DaemonSet state should not be tracked when rollout is complete") + } + } +} + +// TestDaemonSetRolloutWaitsForAvailabilityAcrossRestart verifies that hadState +// is preserved across controller restarts via the annotation, allowing the +// "awaiting availability" phase to continue reporting Progressing=True. 
+func TestDaemonSetRolloutWaitsForAvailabilityAcrossRestart(t *testing.T) { + t.Setenv("RELEASE_VERSION", testReleaseVersion) + client := fake.NewFakeClient() + status := New(client, "testing", names.StandAloneClusterName) + status.clock = testingclock.NewFakeClock(time.Now()) + setFakeListers(status) + no := &operv1.Network{ObjectMeta: metav1.ObjectMeta{Name: names.OPERATOR_CONFIG}} + setOC(t, client, no) + setCO(t, client, "testing") + + ds := &appsv1.DaemonSet{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "one", + Name: "alpha", + Generation: 2, + Labels: sl, + Annotations: map[string]string{ + "release.openshift.io/version": testReleaseVersion, + }, + }, + Spec: appsv1.DaemonSetSpec{ + Selector: &metav1.LabelSelector{ + MatchLabels: map[string]string{"app": "alpha"}, + }, + }, + Status: appsv1.DaemonSetStatus{ + CurrentNumberScheduled: 3, + DesiredNumberScheduled: 3, + NumberUnavailable: 3, + NumberAvailable: 0, + NumberReady: 0, + ObservedGeneration: 2, + UpdatedNumberScheduled: 0, + }, + } + set(t, client, ds) + status.SetFromPods() + + // Verify rollout is progressing and state is tracked + _, oc, err := getStatuses(client, "testing") + if err != nil { + t.Fatalf("error getting ClusterOperator: %v", err) + } + if !conditionsInclude(oc.Status.Conditions, []operv1.OperatorCondition{ + { + Type: operv1.OperatorStatusTypeProgressing, + Status: operv1.ConditionTrue, + }, + }) { + t.Fatalf("Expected Progressing=True during scheduling, got: %#v", oc.Status.Conditions) + } + + ps := getLastPodState(t, client, "testing") + nsn := ClusteredName{Namespace: "one", Name: "alpha"} + found := false + for _, dsState := range ps.DaemonsetStates { + if dsState.ClusteredName == nsn { + found = true + break + } + } + if !found { + t.Fatalf("DaemonSet state should be tracked during scheduling") + } + + // Scheduling completes, but pods still unavailable + ds.Status.UpdatedNumberScheduled = 3 + setStatus(t, client, ds) + + // Simulate controller restart: create new StatusManager 
pointing at same client + restarted := New(client, "testing", names.StandAloneClusterName) + restarted.clock = testingclock.NewFakeClock(time.Now()) + setFakeListers(restarted) + restarted.SetFromPods() + + // Verify Progressing=True is preserved after restart (hadState=true from annotation) + _, oc, err = getStatuses(client, "testing") + if err != nil { + t.Fatalf("error getting ClusterOperator: %v", err) + } + if !conditionsInclude(oc.Status.Conditions, []operv1.OperatorCondition{ + { + Type: operv1.OperatorStatusTypeProgressing, + Status: operv1.ConditionTrue, + }, + }) { + t.Fatalf("Expected Progressing=True after restart while waiting for availability, got: %#v", oc.Status.Conditions) + } +} + +// TestDeploymentRolloutWaitsForAvailability verifies the same rollout phases +// for Deployments using UpdatedReplicas, Replicas, and UnavailableReplicas. +func TestDeploymentRolloutWaitsForAvailability(t *testing.T) { + t.Setenv("RELEASE_VERSION", testReleaseVersion) + client := fake.NewFakeClient() + status := New(client, "testing", names.StandAloneClusterName) + status.clock = testingclock.NewFakeClock(time.Now()) + setFakeListers(status) + no := &operv1.Network{ObjectMeta: metav1.ObjectMeta{Name: names.OPERATOR_CONFIG}} + setOC(t, client, no) + setCO(t, client, "testing") + + dep := &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "one", + Name: "alpha", + Generation: 1, + Labels: sl, + Annotations: map[string]string{ + "release.openshift.io/version": testReleaseVersion, + }, + }, + Spec: appsv1.DeploymentSpec{ + Selector: &metav1.LabelSelector{ + MatchLabels: map[string]string{"app": "alpha"}, + }, + }, + Status: appsv1.DeploymentStatus{ + Replicas: 3, + UpdatedReplicas: 3, + AvailableReplicas: 3, + UnavailableReplicas: 0, + ObservedGeneration: 1, + }, + } + set(t, client, dep) + status.SetFromPods() + + // Verify initial state + _, oc, err := getStatuses(client, "testing") + if err != nil { + t.Fatalf("error getting ClusterOperator: %v", err) + } 
+ if !conditionsInclude(oc.Status.Conditions, []operv1.OperatorCondition{ + { + Type: operv1.OperatorStatusTypeProgressing, + Status: operv1.ConditionFalse, + }, + }) { + t.Fatalf("unexpected Status.Conditions: %#v", oc.Status.Conditions) + } + + // Phase 1: Start rollout + dep.Generation = 2 + dep.Status.ObservedGeneration = 2 + dep.Status.UpdatedReplicas = 0 + dep.Status.UnavailableReplicas = 3 + dep.Status.AvailableReplicas = 0 + setStatus(t, client, dep) + status.SetFromPods() + + _, oc, err = getStatuses(client, "testing") + if err != nil { + t.Fatalf("error getting ClusterOperator: %v", err) + } + if !conditionsInclude(oc.Status.Conditions, []operv1.OperatorCondition{ + { + Type: operv1.OperatorStatusTypeProgressing, + Status: operv1.ConditionTrue, + }, + }) { + t.Fatalf("Phase 1: Expected Progressing=True, got: %#v", oc.Status.Conditions) + } + + // Phase 2: Rollout completes, awaiting availability + dep.Status.UpdatedReplicas = 3 + dep.Status.UnavailableReplicas = 3 + dep.Status.AvailableReplicas = 0 + setStatus(t, client, dep) + status.SetFromPods() + + _, oc, err = getStatuses(client, "testing") + if err != nil { + t.Fatalf("error getting ClusterOperator: %v", err) + } + if !conditionsInclude(oc.Status.Conditions, []operv1.OperatorCondition{ + { + Type: operv1.OperatorStatusTypeProgressing, + Status: operv1.ConditionTrue, + }, + }) { + t.Fatalf("Phase 2: Expected Progressing=True while waiting for availability, got: %#v", oc.Status.Conditions) + } + + // Phase 3: Full availability + dep.Status.AvailableReplicas = 3 + dep.Status.UnavailableReplicas = 0 + setStatus(t, client, dep) + status.SetFromPods() + + _, oc, err = getStatuses(client, "testing") + if err != nil { + t.Fatalf("error getting ClusterOperator: %v", err) + } + if !conditionsInclude(oc.Status.Conditions, []operv1.OperatorCondition{ + { + Type: operv1.OperatorStatusTypeProgressing, + Status: operv1.ConditionFalse, + }, + }) { + t.Fatalf("Phase 3: Expected Progressing=False when fully 
available, got: %#v", oc.Status.Conditions) + } +} + +// TestStatefulSetRolloutWaitsForAvailability verifies the same rollout phases +// for StatefulSets using UpdatedReplicas, Replicas, ReadyReplicas, and AvailableReplicas. +func TestStatefulSetRolloutWaitsForAvailability(t *testing.T) { + t.Setenv("RELEASE_VERSION", testReleaseVersion) + client := fake.NewFakeClient() + status := New(client, "testing", names.StandAloneClusterName) + status.clock = testingclock.NewFakeClock(time.Now()) + setFakeListers(status) + no := &operv1.Network{ObjectMeta: metav1.ObjectMeta{Name: names.OPERATOR_CONFIG}} + setOC(t, client, no) + setCO(t, client, "testing") + + ss := &appsv1.StatefulSet{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "one", + Name: "alpha", + Generation: 1, + Labels: sl, + Annotations: map[string]string{ + "release.openshift.io/version": testReleaseVersion, + }, + }, + Spec: appsv1.StatefulSetSpec{ + Selector: &metav1.LabelSelector{ + MatchLabels: map[string]string{"app": "alpha"}, + }, + }, + Status: appsv1.StatefulSetStatus{ + Replicas: 3, + UpdatedReplicas: 3, + ReadyReplicas: 3, + AvailableReplicas: 3, + ObservedGeneration: 1, + }, + } + set(t, client, ss) + status.SetFromPods() + + // Verify initial state + _, oc, err := getStatuses(client, "testing") + if err != nil { + t.Fatalf("error getting ClusterOperator: %v", err) + } + if !conditionsInclude(oc.Status.Conditions, []operv1.OperatorCondition{ + { + Type: operv1.OperatorStatusTypeProgressing, + Status: operv1.ConditionFalse, + }, + }) { + t.Fatalf("unexpected Status.Conditions: %#v", oc.Status.Conditions) + } + + // Phase 1: Start rollout + ss.Generation = 2 + ss.Status.ObservedGeneration = 2 + ss.Status.UpdatedReplicas = 0 + ss.Status.ReadyReplicas = 0 + ss.Status.AvailableReplicas = 0 + setStatus(t, client, ss) + status.SetFromPods() + + _, oc, err = getStatuses(client, "testing") + if err != nil { + t.Fatalf("error getting ClusterOperator: %v", err) + } + if !conditionsInclude(oc.Status.Conditions, 
[]operv1.OperatorCondition{ + { + Type: operv1.OperatorStatusTypeProgressing, + Status: operv1.ConditionTrue, + }, + }) { + t.Fatalf("Phase 1: Expected Progressing=True, got: %#v", oc.Status.Conditions) + } + + // Phase 2: All pods updated but none ready yet (ReadyReplicas == 0). + ss.Status.UpdatedReplicas = 3 + ss.Status.ReadyReplicas = 0 + ss.Status.AvailableReplicas = 0 + setStatus(t, client, ss) + status.SetFromPods() + + _, oc, err = getStatuses(client, "testing") + if err != nil { + t.Fatalf("error getting ClusterOperator: %v", err) + } + if !conditionsInclude(oc.Status.Conditions, []operv1.OperatorCondition{ + { + Type: operv1.OperatorStatusTypeProgressing, + Status: operv1.ConditionTrue, + }, + }) { + t.Fatalf("Phase 2: Expected Progressing=True when all pods updated but none ready yet, got: %#v", oc.Status.Conditions) + } + + // Phase 3: Partial availability - some pods ready, still awaiting the rest (0 < ReadyReplicas < Replicas) + ss.Status.UpdatedReplicas = 3 + ss.Status.ReadyReplicas = 1 + ss.Status.AvailableReplicas = 1 + setStatus(t, client, ss) + status.SetFromPods() + + _, oc, err = getStatuses(client, "testing") + if err != nil { + t.Fatalf("error getting ClusterOperator: %v", err) + } + if !conditionsInclude(oc.Status.Conditions, []operv1.OperatorCondition{ + { + Type: operv1.OperatorStatusTypeProgressing, + Status: operv1.ConditionTrue, + }, + }) { + t.Fatalf("Phase 3: Expected Progressing=True while waiting for availability, got: %#v", oc.Status.Conditions) + } + + // Phase 4: Full availability + ss.Status.ReadyReplicas = 3 + ss.Status.AvailableReplicas = 3 + setStatus(t, client, ss) + status.SetFromPods() + + _, oc, err = getStatuses(client, "testing") + if err != nil { + t.Fatalf("error getting ClusterOperator: %v", err) + } + if !conditionsInclude(oc.Status.Conditions, []operv1.OperatorCondition{ + { + Type: operv1.OperatorStatusTypeProgressing, + Status: operv1.ConditionFalse, + }, + }) { + t.Fatalf("Phase 4: Expected Progressing=False when fully available, got: 
%#v", oc.Status.Conditions) + } +} + +// TestDaemonSetHungRolloutDuringAvailabilityWait verifies that if a rollout +// gets stuck during the awaiting-availability phase (hadState=true, NumberUnavailable>0) +// for longer than ProgressTimeout, the operator reports Degraded=True with a hung +// rollout message while Progressing=True persists. +func TestDaemonSetHungRolloutDuringAvailabilityWait(t *testing.T) { + t.Setenv("RELEASE_VERSION", testReleaseVersion) + client := fake.NewFakeClient() + fakeClock := testingclock.NewFakeClock(time.Now()) + status := New(client, "testing", names.StandAloneClusterName) + status.clock = fakeClock + setFakeListers(status) + no := &operv1.Network{ObjectMeta: metav1.ObjectMeta{Name: names.OPERATOR_CONFIG}} + setOC(t, client, no) + setCO(t, client, "testing") + + ds := &appsv1.DaemonSet{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "one", + Name: "alpha", + Generation: 2, + Labels: sl, + Annotations: map[string]string{ + "release.openshift.io/version": testReleaseVersion, + }, + }, + Spec: appsv1.DaemonSetSpec{ + Selector: &metav1.LabelSelector{ + MatchLabels: map[string]string{"app": "alpha"}, + }, + }, + Status: appsv1.DaemonSetStatus{ + CurrentNumberScheduled: 3, + DesiredNumberScheduled: 3, + NumberUnavailable: 3, + NumberAvailable: 0, + NumberReady: 0, + ObservedGeneration: 2, + UpdatedNumberScheduled: 0, + }, + } + set(t, client, ds) + status.SetFromPods() + + // Start rollout: scheduling phase + _, oc, err := getStatuses(client, "testing") + if err != nil { + t.Fatalf("error getting ClusterOperator: %v", err) + } + if !conditionsInclude(oc.Status.Conditions, []operv1.OperatorCondition{ + { + Type: operv1.OperatorStatusTypeProgressing, + Status: operv1.ConditionTrue, + }, + { + Type: operv1.OperatorStatusTypeDegraded, + Status: operv1.ConditionFalse, + }, + }) { + t.Fatalf("Expected Progressing=True, Degraded=False during scheduling, got: %#v", oc.Status.Conditions) + } + + // Enter awaiting-availability phase: scheduling 
completes but pods stuck unavailable + ds.Status.UpdatedNumberScheduled = 3 + ds.Status.NumberUnavailable = 3 + ds.Status.NumberAvailable = 0 + setStatus(t, client, ds) + status.SetFromPods() + + _, oc, err = getStatuses(client, "testing") + if err != nil { + t.Fatalf("error getting ClusterOperator: %v", err) + } + if !conditionsInclude(oc.Status.Conditions, []operv1.OperatorCondition{ + { + Type: operv1.OperatorStatusTypeProgressing, + Status: operv1.ConditionTrue, + }, + { + Type: operv1.OperatorStatusTypeDegraded, + Status: operv1.ConditionFalse, + }, + }) { + t.Fatalf("Expected Progressing=True, Degraded=False while waiting for availability, got: %#v", oc.Status.Conditions) + } + + // Simulate hung rollout: manually set LastChangeTime in the past + ps := getLastPodState(t, client, "testing") + nsn := ClusteredName{Namespace: "one", Name: "alpha"} + for idx, dsState := range ps.DaemonsetStates { + if dsState.ClusteredName == nsn { + ps.DaemonsetStates[idx].LastChangeTime = time.Now().Add(-(ProgressTimeout + time.Minute)) + break + } + } + setLastPodState(t, client, "testing", ps) + status.SetFromPods() + + _, oc, err = getStatuses(client, "testing") + if err != nil { + t.Fatalf("error getting ClusterOperator: %v", err) + } + if !conditionsInclude(oc.Status.Conditions, []operv1.OperatorCondition{ + { + Type: operv1.OperatorStatusTypeProgressing, + Status: operv1.ConditionTrue, + }, + { + Type: operv1.OperatorStatusTypeDegraded, + Status: operv1.ConditionTrue, + }, + }) { + t.Fatalf("Expected Progressing=True, Degraded=True after timeout, got: %#v", oc.Status.Conditions) + } +} + func TestStatusManagerCheckCrashLoopBackOffPods(t *testing.T) { client := fake.NewFakeClient() status := New(client, "testing", names.StandAloneClusterName)