Files
utilities/tools/nocloud-factory-install/factory-install/backup/10-system-backup
Enzo Candotti f13b13f2d4 Wait for rook-ceph to be applied in factory backup stage
If rook-ceph is configured in a standalone system, during a factory
install the backup script fails when it tries to create the ceph
backup, since the rook-ceph application is not yet applied.

This commit updates the 10-system-backup script to wait for the
rook-ceph application to reach the "applied" state before running the
backup.

When rook-ceph is configured, this check adds between 7 and 11 minutes
to the factory-install process, depending on how long rook-ceph takes
to reach the applied state.

Test Plan:
PASS: Run a factory-install in a standalone host with rook-ceph
configured. Verify that the setup backup waits until the app is applied
and the factory-install completes successfully.
PASS: Run a factory-install in a standalone host without rook-ceph
configured. Verify that the backup stage does not wait for the app.

Closes-bug: 2125236

Change-Id: I67b18426b1acc3f3389f2aae71a93b7bb0a66175
Signed-off-by: Enzo Candotti <Enzo.Candotti@windriver.com>
2025-09-23 17:59:28 -03:00

156 lines
5.2 KiB
Bash

#!/bin/bash
#
# Copyright (c) 2025 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
# Factory install backup triggered during the backup stage
#
# Flag files that need to be created if backup is successful.
# All three live under the same factory-install directory.
factory_install_root="/var/lib/factory-install"
BACKUP_FLAG="${factory_install_root}/state/backup"
FINAL_FLAG="${factory_install_root}/stage/final"
COMPLETE_FLAG="${factory_install_root}/complete"
# Remove flag files on failure.
# Reads BACKUP_FLAG, FINAL_FLAG and COMPLETE_FLAG; logs through log_warn.
# "rm -f" makes the cleanup idempotent, so it is harmless if both traps
# below end up invoking it for the same failure.
cleanup_on_failure() {
    log_warn "Backup failed, cleaning up flags..."
    rm -f "$BACKUP_FLAG" "$FINAL_FLAG" "$COMPLETE_FLAG"
}
# Trap to cleanup flags on any failure.
# ERR covers commands that return non-zero outside of a condition.
trap cleanup_on_failure ERR
# log_fatal terminates the script with "exit 1", which does not raise ERR.
# The EXIT trap covers that path; a zero exit status leaves the flags alone.
# The script's exit status is not altered because the handler never calls exit.
trap 'exit_rc=$?; if [ "$exit_rc" -ne 0 ]; then cleanup_on_failure; fi' EXIT
# Software version: value of the SW_VERSION line in /etc/build.info with the
# double quotes stripped.
SW_VERSION=$(
    awk -F= '/^SW_VERSION/ {print $2}' /etc/build.info \
        | tr -d '"'
)
# Everything produced by this script is stored per software version.
BACKUP_DIR="/opt/platform-backup/factory/${SW_VERSION}"
# Source locations (both live under the release feed directory)
feed_release_dir="/var/www/pages/feed/rel-${SW_VERSION}"
OSTREE_REPO_SRC="${feed_release_dir}/ostree_repo"
MINIBOOT_CFG_SRC="${feed_release_dir}/kickstart/miniboot.cfg"
# Target locations
OSTREE_REPO_DEST="${BACKUP_DIR}/ostree_repo"
MINIBOOT_CFG_DEST="${BACKUP_DIR}/miniboot.cfg"
# Abort through log_fatal when a return code is non-zero.
# Arguments: $1 - return code to check
#            $2 - message to log on failure ("[rc=N]" is appended)
check_rc_die() {
    local -i status=${1}
    local description="${2}"
    # Success: nothing to report.
    (( status == 0 )) && return 0
    log_fatal "${description} [rc=${status}]"
}
# Print a timestamped FATAL line on stdout and terminate the script with
# exit status 1.
# Arguments: message words, joined into a single line
log_fatal() {
    local prefix
    prefix="$(date +"%Y-%m-%d %H:%M:%S,%3N - factory-backup -")"
    echo "${prefix} FATAL: ${*}"
    exit 1
}
# Print a timestamped WARN line on stdout.
# Arguments: message words, joined into a single line
log_warn() {
    printf '%s WARN: %s\n' \
        "$(date +"%Y-%m-%d %H:%M:%S,%3N - factory-backup -")" "${*}"
}
# Print a timestamped INFO line on stdout.
# Arguments: message words, joined into a single line
log_info() {
    local stamp
    stamp=$(date +"%Y-%m-%d %H:%M:%S,%3N - factory-backup -")
    printf '%s\n' "${stamp} INFO: $*"
}
# Create flags at the beginning so they are included in the backup.
# Each entry is "<label>:<path>"; the label only feeds the error message.
log_info "Creating factory install flags..."
for flag_entry in \
    "backup:${BACKUP_FLAG}" \
    "final:${FINAL_FLAG}" \
    "complete:${COMPLETE_FLAG}"; do
    # touch stays a plain command so a failure is still seen by the ERR trap
    # before check_rc_die aborts the script.
    touch "${flag_entry#*:}"
    check_rc_die $? "Failed to create ${flag_entry%%:*} flag"
done
# Only simplex systems take a factory backup. For any other system_mode
# (e.g. duplex) the script stops here with exit status 0.
system_mode=$(awk -F= '/system_mode/ {print $2}' /etc/platform/platform.conf)
case "$system_mode" in
    simplex)
        # Continue with the backup.
        ;;
    *)
        log_info "Skipping factory backup for non-simplex system mode"
        exit 0
        ;;
esac
# TODO(ecandotti): Remove when https://bugs.launchpad.net/starlingx/+bug/2116270
# is resolved
fm_api_unit="fm-api.service"
if systemctl is-failed --quiet "$fm_api_unit"; then
    # A failed fm-api makes the backup health-query fail, so bring the
    # service back up before the backup runs.
    log_info "${fm_api_unit} is in failed state, restarting..."
    systemctl restart "$fm_api_unit"
fi
# Load the platform credentials; a failure here is deliberately ignored.
source /etc/platform/openrc >/dev/null 2>&1 || true
# When the ceph-rook-store storage backend is listed, hold the backup until
# the rook-ceph application reports "applied". Polls every 15 seconds.
if system storage-backend-list 2>/dev/null | grep -q ceph-rook-store; then
    log_info "Waiting for rook-ceph application to reach applied state..."
    wait_started=$(date +%s)
    wait_limit=$((15 * 60)) # 15 minutes
    while true; do
        # "|| true" keeps a failing query from triggering the ERR trap; the
        # status is then simply empty and the loop keeps polling.
        rook_status=$(system application-show rook-ceph --column status --format value 2>/dev/null || true)
        case "$rook_status" in
            applied)
                log_info "Ready - rook-ceph application applied"
                break
                ;;
            apply-failed)
                log_fatal "rook-ceph application failed to apply"
                ;;
        esac
        now=$(date +%s)
        waited=$((now - wait_started))
        if (( waited >= wait_limit )); then
            # NOTE(review): on timeout the script exits 0 without running the
            # backup and without removing the flags - confirm this is intended.
            log_warn "Timeout after $((waited/60)) minutes. Current rook-ceph status: $rook_status."
            exit 0
        fi
        sleep 15
    done
fi
# Make sure the per-version backup directory exists.
log_info "Creating backup directory: $BACKUP_DIR"
mkdir -p "$BACKUP_DIR"
check_rc_die $? "Failed to create backup directory $BACKUP_DIR"
# Run StarlingX platform backup playbook, writing into BACKUP_DIR with the
# "factory" filename prefix.
log_info "Running platform backup playbook..."
backup_playbook="/usr/share/ansible/stx-ansible/playbooks/backup.yml"
backup_extra_vars=(
    -e "platform_backup_filename_prefix=factory"
    -e "backup_dir=$BACKUP_DIR"
    -e "backup_registry_filesystem=true"
)
ansible-playbook "$backup_playbook" "${backup_extra_vars[@]}"
check_rc_die $? "Failed to run backup playbook"
# Rename the generated backup tarball to the fixed name factory_backup.tgz.
# The search pattern 'factory_*.tgz' also matches the destination name, so a
# factory_backup.tgz left by an earlier run is excluded explicitly; otherwise
# it could be picked by "head -n1" and the mv onto itself would fail.
final_backup_tar="${BACKUP_DIR}/factory_backup.tgz"
log_info "Looking for backup tarball in $BACKUP_DIR"
BACKUP_TAR=$(find "$BACKUP_DIR" -maxdepth 1 -type f -name 'factory_*.tgz' \
    ! -name 'factory_backup.tgz' | head -n1)
if [ -n "$BACKUP_TAR" ]; then
    log_info "Renaming $BACKUP_TAR to ${final_backup_tar}"
    mv -f "$BACKUP_TAR" "$final_backup_tar"
    check_rc_die $? "Failed to rename $BACKUP_TAR file"
else
    log_fatal "No backup tarball found in $BACKUP_DIR"
fi
# Copy ostree_repo directory to factory backup dir
log_info "Copying $OSTREE_REPO_SRC to $OSTREE_REPO_DEST"
cp -r "$OSTREE_REPO_SRC" "$OSTREE_REPO_DEST"
check_rc_die $? "Unable to backup $OSTREE_REPO_SRC"
# Generate ostree_repo directory checksum and copy it to the factory backup dir.
# The checksum is the md5 of the sorted per-file md5sum listing, taken with
# paths relative to the parent of the repo (hence the cd + basename).
# The pipeline runs in a subshell so that:
#   - pipefail makes a failing find/md5sum/sort fail the whole step, instead
#     of only the trailing awk deciding the status;
#   - a failed cd aborts the step rather than hashing the wrong directory;
#   - the working directory of the main script is left untouched.
log_info "Generating ostree_repo checksum file..."
(
    set -o pipefail
    cd "$(dirname "$OSTREE_REPO_SRC")" || exit 1
    find "$(basename "$OSTREE_REPO_SRC")" -type f -exec md5sum {} + \
        | LC_ALL=C sort \
        | md5sum \
        | awk '{ print $1; }' > "${BACKUP_DIR}/.ostree_repo_checksum"
)
check_rc_die $? "Unable to generate ostree_repo checksum file"
# Copy miniboot.cfg to factory backup dir
log_info "Copying $MINIBOOT_CFG_SRC to $MINIBOOT_CFG_DEST"
cp "$MINIBOOT_CFG_SRC" "$MINIBOOT_CFG_DEST"
check_rc_die $? "Unable to backup $MINIBOOT_CFG_SRC"
log_info "Factory backup completed at $BACKUP_DIR"