Files
update/software/upgrade-scripts/21-k8s-app-upgrade.py
edias 9fb805b63f Add logic to script 21 to support app rollback during the
activate-rollback process.

This change introduces adjustments to support application rollback
after the activate step has been executed. It adds the necessary logic
to trigger the rollback operation at the appframework level, using the
AppUpdateManager introduced in:
https://review.opendev.org/c/starlingx/config/+/954298

Test plan:
PASS: build-pkgs && build-image.
PASS: AIO-SX master fresh install.
PASS: check if all apps were correctly installed.

PASS: build-pkgs && build-image.
PASS: AIO-SX 10 fresh install.
PASS: upgrade to starlingx master until deploy done step.

PASS: build-pkgs && build-image.
PASS: AIO-SX 10 fresh install.
PASS: upgrade to starlingx master until activate-done step
PASS: check if all apps were correctly updated.
PASS: run the activate-rollback step and check if all apps are correctly
      rolled back.
PASS: If any app fails to update, it must recover to the previous
      version.
PASS: If the app fails to update and fails to recover to the previous
      version and is left in apply-failed, the script will fail and
      require manual intervention.
PASS: If the app fails to update and is configured to not perform
      recovery during the update (update_failure_no_rollback: True),
      the script will fail and manual intervention will be required.

PASS: build-pkgs && build-image.
PASS: AIO-SX 10 fresh install.
PASS: install different apps and force two to stay in an apply-failed
      state, and two others in an upload-failed state.
PASS: upgrade to starlingx master until activate-done step
PASS: run the activate-rollback step and check if the process fails.
      The activate-rollback process cannot occur if any app is in
      the apply-failed state.
PASS: If any app fails to rollback, recovery will not be triggered
      and the app will remain in apply-failed status, requiring manual
      intervention

Depends-on: https://review.opendev.org/c/starlingx/config/+/954298

Story: 2011357
Task: 52492

Change-Id: I64a32aef07471a1ff82abf5827995740abea6775
Signed-off-by: edias <edson.dias@windriver.com>
2025-07-25 17:27:41 -03:00

157 lines
4.9 KiB
Python

#!/usr/bin/python
# Copyright (c) 2022-2025 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
import logging
import os
import sys
from time import sleep
from cgtsclient import client as cgts_client
from software.utilities.utils import configure_logging
# Script-wide logger.
# NOTE(review): presumably configured by configure_logging(), which main()
# calls before any progress is logged -- confirm.
LOG = logging.getLogger('main_logger')
# Together with the interval below this fixes the number of progress checks
# made by check_apps_update_progress(): limit * 60 / interval attempts.
TIMEOUT_LIMIT_IN_MINUTES = 30
# Pause between two consecutive progress checks; also quoted in log messages.
PROGRESS_CHECK_INTERVAL_IN_SECONDS = 20
# Values compared against the 'status' field of the response returned by
# client.kube_app.get_apps_update_status().
IN_PROGRESS_STATUS = 'in_progress'
FAILED_STATUS = 'failed'
COMPLETED_STATUS = 'completed'
# Values set by this script itself: empty response, exception raised while
# checking, and all attempts used up without a final result.
NO_INFO_STATUS = 'no_info'
ERROR_STATUS = 'error'
TIMEOUT_STATUS = 'timeout'
def get_sysinv_client():
    """Return a cgtsclient (API version "1") built from the environment.

    The auth token and the endpoint are read from the ``OS_AUTH_TOKEN`` and
    ``SYSTEM_URL`` environment variables; a missing variable is passed on
    as ``None``.
    """
    connection_args = {
        "os_auth_token": os.environ.get("OS_AUTH_TOKEN"),
        "system_url": os.environ.get("SYSTEM_URL"),
    }
    return cgts_client.get_client("1", **connection_args)
def log_progress(
    max_attempts,
    currently_attempt,
    status,
    failed_apps=None,
    updated_apps=None,
    error_msg=None,
    action='update'
):
    """Log one progress report of the application update/revert polling.

    Args:
        max_attempts: Total number of progress checks that will be made.
        currently_attempt: Zero-based index of the current check; it is
            displayed one-based in the message.
        status: One of the ``*_STATUS`` constants; selects the message.
        failed_apps: Names of the apps that failed, if any.
        updated_apps: Names of the apps already processed, if any.
        error_msg: Error detail, only used with ``ERROR_STATUS``.
        action: Verb naming the operation (e.g. 'update' or 'revert').

    Raises:
        KeyError: If ``status`` is not one of the known status constants.
    """
    attempt_msg = f"{action.capitalize()} checking {currently_attempt + 1}/{max_attempts}"
    interval_msg = f"Checking again in {PROGRESS_CHECK_INTERVAL_IN_SECONDS} second(s)."

    # Long messages use adjacent string literals rather than a backslash
    # inside the literal, so source indentation never leaks into the log.
    status_to_msg = {
        IN_PROGRESS_STATUS: f'{attempt_msg}: Application {action} still in progress. {interval_msg}',
        FAILED_STATUS: f'{attempt_msg}: The application {action} process failed',
        COMPLETED_STATUS: f'{attempt_msg}: Application {action} successfully finished.',
        NO_INFO_STATUS: (
            f'{attempt_msg}: No info from the Application Framework regarding '
            f'application {action}. {interval_msg}'
        ),
        ERROR_STATUS: f'{attempt_msg} failed with error: {error_msg}',
        TIMEOUT_STATUS: (
            f'{attempt_msg}: Application {action} failed due to a timeout. '
            'For more details, check the sysinv logs at /var/log/sysinv.log'
        ),
    }
    progress_log = status_to_msg[status]

    # Past tense of the action: 'update' -> 'Updated', 'revert' -> 'Reverted'.
    # Blindly appending 'd' produced 'Revertd' for the rollback flow.
    suffix = 'd' if action.endswith('e') else 'ed'
    done_label = f"{action}{suffix}".capitalize()

    apps_parts = []
    if updated_apps and status == IN_PROGRESS_STATUS:
        apps_parts.append(f"{done_label} apps up to now: {', '.join(updated_apps)}.")
    elif updated_apps and status == COMPLETED_STATUS:
        apps_parts.append(f"{done_label} apps: {', '.join(updated_apps)}.")
    if failed_apps:
        apps_parts.append(
            f"The following apps did not {action} correctly and require manual "
            f"intervention: {', '.join(failed_apps)}."
        )
    # Joined with a space so the two sentences do not run together.
    apps_msg = ' '.join(apps_parts)

    # Failure-like statuses go to the error level, everything else to info;
    # the per-app summary is always informational.
    if status in (FAILED_STATUS, ERROR_STATUS, TIMEOUT_STATUS):
        LOG.error(progress_log)
    else:
        LOG.info(progress_log)
    if apps_msg:
        LOG.info(apps_msg)
def check_apps_update_progress(client, action='update'):
    """Poll the application framework until the bulk operation finishes.

    The status is requested through ``client.kube_app.get_apps_update_status()``
    up to ``TIMEOUT_LIMIT_IN_MINUTES * 60 / PROGRESS_CHECK_INTERVAL_IN_SECONDS``
    times, pausing ``PROGRESS_CHECK_INTERVAL_IN_SECONDS`` between checks.

    Args:
        client: Client object exposing ``kube_app.get_apps_update_status()``.
        action: Verb used in the log messages (e.g. 'update' or 'revert').

    Returns:
        True when the reported status is ``COMPLETED_STATUS``; False when it
        is ``FAILED_STATUS`` or when all attempts are used up.
    """
    max_attempts = int(TIMEOUT_LIMIT_IN_MINUTES * 60 / PROGRESS_CHECK_INTERVAL_IN_SECONDS)

    for attempt in range(max_attempts):
        try:
            response = client.kube_app.get_apps_update_status()
            if response:
                status = response['status']
                failed_apps = response['failed_apps']
                updated_apps = response['updated_apps']
            else:
                # An empty response carries no app lists; report "no info"
                # instead of failing while subscripting it.
                status = NO_INFO_STATUS
                failed_apps = []
                updated_apps = []

            log_progress(
                max_attempts,
                attempt,
                status,
                failed_apps,
                updated_apps,
                action=action
            )

            if status == FAILED_STATUS:
                return False
            if status == COMPLETED_STATUS:
                return True
        except Exception as e:
            # A failed check (API error, malformed response, unknown status)
            # is logged and retried rather than aborting the whole wait.
            log_progress(
                max_attempts,
                attempt,
                ERROR_STATUS,
                error_msg=e,
                action=action
            )

        # Every non-final outcome waits before the next check, so the
        # attempts are always spread over the configured time limit.
        sleep(PROGRESS_CHECK_INTERVAL_IN_SECONDS)

    # Report the timeout against the last attempt actually made and with the
    # caller's action, so a revert is not logged as an update.
    log_progress(max_attempts, max_attempts - 1, TIMEOUT_STATUS, action=action)
    return False
def main():
    """Run the application update or rollback for the given upgrade action.

    The action is read from the third command-line argument. 'activate'
    triggers ``kube_app.update_all()``; 'activate-rollback' triggers
    ``kube_app.rollback_all_apps()`` unless some app is in 'apply-failed'
    status. Any other action is ignored.

    Returns:
        0 on success or when the action is not handled by this script,
        1 on any failure (including a missing action argument).
    """
    if len(sys.argv) < 4:
        # Logging is not configured yet at this point, so report on stderr
        # instead of dying with a bare IndexError traceback.
        print(
            f"{sys.argv[0]}: missing arguments, the action is expected "
            "as the third argument",
            file=sys.stderr
        )
        return 1

    action = sys.argv[3]
    if action not in ('activate', 'activate-rollback'):
        return 0

    configure_logging()
    try:
        client = get_sysinv_client()
        update_operation_result = False
        if action == 'activate':
            client.kube_app.update_all()
            # Short pause before the first status check.
            sleep(5)
            update_operation_result = check_apps_update_progress(client)
        else:
            # Rollback is refused while any app is in 'apply-failed' status.
            if client.kube_app.get_all_apps_by_status('apply-failed'):
                LOG.error(
                    "One or more applications are in 'apply-failed' status. "
                    "Manual intervention is required."
                )
                return 1
            client.kube_app.rollback_all_apps()
            sleep(5)
            update_operation_result = check_apps_update_progress(client, 'revert')
        return 0 if update_operation_result else 1
    except Exception as e:
        # Top-level boundary: keep the traceback in the log and signal
        # failure through the exit code.
        LOG.exception(e)
        return 1
# Script entry point: the value returned by main() becomes the exit status.
if __name__ == "__main__":
    exit_status = main()
    sys.exit(exit_status)