
If rook-ceph is configured in a standalone system, the backup script fails during a factory install when it tries to create the ceph backup, because the rook-ceph application has not yet been applied. This commit updates the 10-system-backup script to wait for the rook-ceph application to reach the "applied" state before running the backup. When rook-ceph is configured, this check adds between 7 and 11 minutes to the factory-install process, depending on how long rook-ceph takes to reach the applied state.

Test Plan:
PASS: Run a factory-install in a standalone host with rook-ceph configured. Verify that the setup backup waits until the app is applied and the factory-install completes successfully.
PASS: Run a factory-install in a standalone host without rook-ceph configured. Verify that the backup stage does not wait for the app.

Closes-bug: 2125236
Change-Id: I67b18426b1acc3f3389f2aae71a93b7bb0a66175
Signed-off-by: Enzo Candotti <Enzo.Candotti@windriver.com>
156 lines
5.2 KiB
Bash
156 lines
5.2 KiB
Bash
#!/bin/bash
|
|
#
|
|
# Copyright (c) 2025 Wind River Systems, Inc.
|
|
#
|
|
# SPDX-License-Identifier: Apache-2.0
|
|
#
|
|
# Factory install backup triggered during the backup stage
|
|
#
|
|
|
|
# Flag files that mark a successful factory backup. They are created before
# the backup runs (so that they are part of it) and removed again by
# cleanup_on_failure if a later step fails.
BACKUP_FLAG="/var/lib/factory-install/state/backup"
FINAL_FLAG="/var/lib/factory-install/stage/final"
COMPLETE_FLAG="/var/lib/factory-install/complete"
|
|
|
|
# Remove the factory-install flag files after a failure.
# Globals:   BACKUP_FLAG, FINAL_FLAG, COMPLETE_FLAG (read)
# Outputs:   a warning line via log_warn
# Notes:     missing flag files are not an error (rm -f).
cleanup_on_failure() {
    log_warn "Backup failed, cleaning up flags..."
    rm -f -- "$BACKUP_FLAG" "$FINAL_FLAG" "$COMPLETE_FLAG"
}
|
|
|
|
# Run cleanup_on_failure whenever a top-level command returns non-zero.
# The ERR trap does not fire for an explicit `exit`, and without `set -E`
# it is not inherited by functions or subshells.
trap cleanup_on_failure ERR
|
|
|
|
# Get software version from build info (value of the SW_VERSION line with
# the double quotes stripped)
SW_VERSION=$(awk -F= '/^SW_VERSION/ {print $2}' /etc/build.info | tr -d '"')
BACKUP_DIR="/opt/platform-backup/factory/${SW_VERSION}"

# Source locations
OSTREE_REPO_SRC="/var/www/pages/feed/rel-${SW_VERSION}/ostree_repo"
MINIBOOT_CFG_SRC="/var/www/pages/feed/rel-${SW_VERSION}/kickstart/miniboot.cfg"

# Target locations
OSTREE_REPO_DEST="${BACKUP_DIR}/ostree_repo"
MINIBOOT_CFG_DEST="${BACKUP_DIR}/miniboot.cfg"
|
|
|
|
# Abort through log_fatal when a return code is non-zero.
# Arguments: $1 - return code to check (integer)
#            $2 - message to log; " [rc=<code>]" is appended
# Returns:   0 when the code is zero; otherwise does not return because
#            log_fatal exits the script.
function check_rc_die {
    local -i rc=${1}
    local msg="${2}"
    if [ "${rc}" -ne 0 ]; then
        log_fatal "${msg} [rc=${rc}]"
    fi
}
|
|
|
|
# Print a timestamped FATAL line to stdout and terminate the script.
# Arguments: all arguments are joined into the message
# Returns:   never; exits with status 1
function log_fatal {
    echo "$(date +"%Y-%m-%d %H:%M:%S,%3N - factory-backup -") FATAL: ${*}"
    exit 1
}
|
|
|
|
# Print a timestamped WARN line to stdout.
# Arguments: all arguments are joined into the message
function log_warn {
    echo "$(date +"%Y-%m-%d %H:%M:%S,%3N - factory-backup -") WARN: ${*}"
}
|
|
|
|
# Print a timestamped INFO line to stdout.
# Arguments: all arguments are joined into the message
function log_info {
    echo "$(date +"%Y-%m-%d %H:%M:%S,%3N - factory-backup -") INFO: ${*}"
}
|
|
|
|
# Create flags at the beginning so they are included in the backup.
# Each touch is checked on its own so the log names the flag that failed.
log_info "Creating factory install flags..."
touch "$BACKUP_FLAG"
check_rc_die $? "Failed to create backup flag"

touch "$FINAL_FLAG"
check_rc_die $? "Failed to create final flag"

touch "$COMPLETE_FLAG"
check_rc_die $? "Failed to create complete flag"
|
|
|
|
# Skip the execution of the system-backup script unless system_mode is
# simplex. The flags created above are left in place in that case.
# Match the key exactly so that comments or other keys that merely contain
# "system_mode" cannot leak into the value; stop at the first match.
system_mode=$(awk -F= '$1 == "system_mode" {print $2; exit}' /etc/platform/platform.conf)
if [ "$system_mode" != "simplex" ]; then
    log_info "Skipping factory backup for non-simplex system mode"
    exit 0
fi
|
|
|
|
# TODO(ecandotti): Remove when https://bugs.launchpad.net/starlingx/+bug/2116270
# is resolved
if systemctl is-failed --quiet fm-api.service; then
    # Restart fm-api if it is in failed state,
    # otherwise the backup health-query will fail
    log_info "fm-api.service is in failed state, restarting..."
    # NOTE(review): the restart result is not checked. If it returns non-zero
    # the ERR trap removes the flag files, but the script keeps running --
    # confirm whether this should abort via check_rc_die instead.
    systemctl restart fm-api.service
fi
|
|
|
|
# Best effort: presumably sets up the environment for the `system` CLI used
# below (TODO confirm). Failures are ignored on purpose.
source /etc/platform/openrc >/dev/null 2>&1 || true

# Check if rook-ceph is configured in the node via ceph-rook-store backend.
# If so, poll the rook-ceph application every 15 seconds until it is
# "applied", fails to apply, or the timeout expires.
if system storage-backend-list 2>/dev/null | grep -q ceph-rook-store; then
    log_info "Waiting for rook-ceph application to reach applied state..."

    start_time=$(date +%s)
    timeout=$((15 * 60))  # 15 minutes

    while true; do
        # Prevent non-zero rc from triggering the ERR trap: swallow the error with || true
        rook_status=$(system application-show rook-ceph --column status --format value 2>/dev/null || true)

        if [ "$rook_status" = "applied" ]; then
            log_info "Ready - rook-ceph application applied"
            break
        elif [ "$rook_status" = "apply-failed" ]; then
            # log_fatal exits directly and an explicit exit does not fire the
            # ERR trap, so remove the flag files here before bailing out.
            cleanup_on_failure
            log_fatal "rook-ceph application failed to apply"
        fi

        now=$(date +%s)
        elapsed=$((now - start_time))

        if [ "$elapsed" -ge "$timeout" ]; then
            log_warn "Timeout after $((elapsed/60)) minutes. Current rook-ceph status: $rook_status."
            # NOTE(review): exiting 0 here skips the backup entirely while
            # leaving the flag files in place -- confirm this is the intended
            # outcome of a timeout.
            exit 0
        fi
        sleep 15
    done
fi
|
|
|
|
# Make sure the per-release backup directory exists
log_info "Creating backup directory: $BACKUP_DIR"
mkdir -p "$BACKUP_DIR"
check_rc_die $? "Failed to create backup directory $BACKUP_DIR"

# Run StarlingX platform backup playbook, writing into BACKUP_DIR with the
# "factory" filename prefix that the rename step looks for afterwards
log_info "Running platform backup playbook..."
ansible-playbook /usr/share/ansible/stx-ansible/playbooks/backup.yml \
    -e "platform_backup_filename_prefix=factory" \
    -e "backup_dir=$BACKUP_DIR" \
    -e "backup_registry_filesystem=true"
check_rc_die $? "Failed to run backup playbook"
|
|
|
|
# Rename the generated backup tarball to the fixed name factory_backup.tgz.
# The target name itself also matches 'factory_*.tgz', so it is excluded from
# the search: otherwise a tarball left by a previous run could be selected
# and the mv onto itself would fail.
log_info "Looking for backup tarball in $BACKUP_DIR"
BACKUP_TAR=$(find "$BACKUP_DIR" -maxdepth 1 -type f -name 'factory_*.tgz' \
    ! -name 'factory_backup.tgz' | head -n1)
if [ -n "$BACKUP_TAR" ]; then
    log_info "Renaming $BACKUP_TAR to ${BACKUP_DIR}/factory_backup.tgz"
    mv "$BACKUP_TAR" "${BACKUP_DIR}/factory_backup.tgz"
    check_rc_die $? "Failed to rename $BACKUP_TAR file"
else
    # No command failed here, so the ERR trap has not run; remove the flag
    # files explicitly before log_fatal exits.
    cleanup_on_failure
    log_fatal "No backup tarball found in $BACKUP_DIR"
fi
|
|
|
|
# Copy ostree_repo directory to factory backup dir
log_info "Copying $OSTREE_REPO_SRC to $OSTREE_REPO_DEST"
cp -r "$OSTREE_REPO_SRC" "$OSTREE_REPO_DEST"
check_rc_die $? "Unable to backup $OSTREE_REPO_SRC"
|
|
|
|
# Generate the ostree_repo checksum and store it in the factory backup dir.
# The checksum is the md5 of the byte-wise sorted list of per-file md5sums,
# computed with paths relative to the parent directory of the repo.
log_info "Generating ostree_repo checksum file..."
cd "$(dirname "$OSTREE_REPO_SRC")"
check_rc_die $? "Unable to change directory to $(dirname "$OSTREE_REPO_SRC")"

# Run the pipeline in a subshell with pipefail so that a failure in find or
# md5sum is reported instead of being masked by the exit status of awk,
# without changing shell options for the rest of the script.
(
    set -o pipefail
    find "$(basename "$OSTREE_REPO_SRC")" -type f -exec md5sum {} + \
        | LC_ALL=C sort \
        | md5sum \
        | awk '{ print $1; }' > "${BACKUP_DIR}/.ostree_repo_checksum"
)
check_rc_die $? "Unable to generate ostree_repo checksum file"
|
|
|
|
# Copy miniboot.cfg to factory backup dir
log_info "Copying $MINIBOOT_CFG_SRC to $MINIBOOT_CFG_DEST"
cp "$MINIBOOT_CFG_SRC" "$MINIBOOT_CFG_DEST"
check_rc_die $? "Unable to backup $MINIBOOT_CFG_SRC"

# Every step above either succeeded or already exited through log_fatal
log_info "Factory backup completed at $BACKUP_DIR"
|