From 6a77175989a47bb6d14af38156dffef78da32d8c Mon Sep 17 00:00:00 2001
From: Marcelo de Castro Loebens
Date: Thu, 21 Aug 2025 11:00:33 -0400
Subject: [PATCH] Wait for helm-controller pod on fluxcd rollback

After kube-apiserver is restarted during a rollback, in some instances
an issue with the portieris webhook can happen when the helm rollback
is performed while the helm-controller pod is still restarting
(portieris webhook detects a change to an unhealthy resource and fails).

To account for that, added a wait for the helm-controller pod status
to become 'Ready' before proceeding with the rollback. If the wait
times out, the method will warn (instead of failing), since there
might be causes for the pod to not become Ready that can be fixed
after the rollback.

Test plan:
PASS: Perform activation rollback with portieris applied.

Story: 2011357
Task: 52711

Change-Id: Id50016dfccd75eabcc62f7f54bf274ba6b593af3
Signed-off-by: Marcelo de Castro Loebens
---
Reviewer note (not part of the commit message): this patch was rebuilt
from a copy whose line breaks and indentation had been collapsed. The
indentation and blank lines of the unchanged context lines are a best
guess; confirm them against the target file, or apply with
`git apply --ignore-whitespace`. Relative to the original patch, the
kubectl output is now captured and logged on failure, and the log calls
use lazy %-style arguments like the rest of the script.

 .../22-rollback-fluxcd-controllers.py         | 37 +++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/software/upgrade-scripts/22-rollback-fluxcd-controllers.py b/software/upgrade-scripts/22-rollback-fluxcd-controllers.py
index 1722ac3b..aa051010 100644
--- a/software/upgrade-scripts/22-rollback-fluxcd-controllers.py
+++ b/software/upgrade-scripts/22-rollback-fluxcd-controllers.py
@@ -222,6 +222,42 @@ def rollback_fluxcd_controllers(revision):
     LOG.info("Flux release successfully rolled back")
 
 
+# Workaround for portieris issue when helm-controller is restarting
+@test_k8s_health
+def wait_helm_controller_pod_ready():
+    """Wait for the helm-controller pod to become Ready.
+
+    Runs 'kubectl wait' (60s timeout) on the pods labelled
+    app=helm-controller in RELEASE_NAMESPACE. This is best effort: a
+    failure of the wait is logged as a warning and is not re-raised.
+    """
+
+    LOG.info("Waiting for helm-controller pod to be Ready")
+
+    cmd = ["kubectl", "wait", "--for=condition=Ready", "pods",
+           "-l", "app=helm-controller",
+           "-n", RELEASE_NAMESPACE,
+           "--timeout=60s",
+           "--kubeconfig", KUBECONFIG]
+
+    try:
+        # Capture the output so that the reason reported by kubectl ends
+        # up in the log instead of only on the script's stderr.
+        subprocess.run(cmd, check=True, stdout=subprocess.PIPE,
+                       stderr=subprocess.PIPE, universal_newlines=True)
+    except subprocess.CalledProcessError as e:
+        # Warning and proceeding with the rollback, as the issue might
+        # be fixed by it
+        LOG.warning("helm-controller pod is not Ready (rc=%s): %s",
+                    e.returncode, (e.stderr or "").strip())
+    except Exception as e:
+        # kubectl could not be run at all; still proceed with the rollback
+        LOG.warning("Error waiting for helm-controller pod to be Ready: %s",
+                    e)
+    else:
+        LOG.info("helm-controller pod is Ready. Proceeding.")
+
+
 def main():
     action = None
 
@@ -264,6 +300,7 @@ def main():
 
         if target_revision:
             delete_incompatible_crd()
+            wait_helm_controller_pod_ready()
             rollback_fluxcd_controllers(target_revision)
         else:
             LOG.error("Version %s is not available in revision history", previous_version)