Files
update/sw-patch/bin/sw-patch-init.sh
Eric MacDonald 78694d4ff9 Reduce sw-patch init script controller ping timeout for simplex systems
The DELAY_SEC variable in the sw-patch init script controls how
long the service waits for an active controller to respond before
proceeding. The default is 120 seconds.

On simplex (AIO-SX) systems, this wait always times out since there is
no active 'controller', resulting in an unnecessary 2-minute delay
during reboot or unlock operations.

This update significantly reduces the timeout delay on simplex systems,
improving boot and unlock responsiveness without affecting duplex
systems.

Test Plan:

PASS: Verify build and install on AIO-SX
PASS: Verify build and install on AIO-DX
PASS: Verify unlock of AIO-SX is not delayed by sw-patch timeout
PASS: Verify sw-patch times out after ~10s with no controller ping (SX)
PASS: Verify sw-patch still uses 120s timeout on AIO-DX

Closes-Bug: 2111941
Change-Id: I46d33e87897e2ae67011241f9f83ae0d153f19ce
Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
2025-06-03 20:28:40 +00:00

137 lines
4.1 KiB
Bash

#!/bin/bash
#
# Copyright (c) 2014-2020 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
# StarlingX Patching
# chkconfig: 345 20 23
# description: StarlingX Patching init script
### BEGIN INIT INFO
# Provides: sw-patch
# Required-Start: $syslog
# Required-Stop: $syslog
# Default-Start: 2 3 5
# Default-Stop: 0 1 6
# Short-Description: sw-patch
# Description: Provides the StarlingX Patching
### END INIT INFO
# Script name; LOG_TO_FILE uses it to tag each log line.
# "$0" is quoted so a script path containing whitespace is not word-split.
NAME=$(basename "$0")

# Platform settings. Code further down reads variables that are not assigned
# in this script (e.g. SW_VERSION, http_port, INSTALL_UUID,
# INITIAL_CONTROLLER_CONFIG_COMPLETE) - presumably they are provided by
# these two files; confirm against the platform.
. /usr/bin/tsconfig
. /etc/platform/platform.conf

# Log file appended to by LOG_TO_FILE.
logfile=/var/log/patching.log
# Flag file created when patching during init is considered failed.
patch_failed_file=/var/run/patch_install_failed
# Marker recording that init already rebooted the node after patching.
patched_during_init=/etc/patching/.patched_during_init

# if the system has never been bootstrapped, system_mode is not set
# treat a non bootstrapped system like it is simplex
# and manually manage lighttpd, etc..
: "${system_mode:=simplex}"
# Append one timestamped line to the patching log.
#
# Globals:
#   NAME    (read) tag written after the timestamp
#   logfile (read) path of the file appended to
# Arguments:
#   $* - message words; they are joined into a single line
# Returns:
#   status of the write to ${logfile}
function LOG_TO_FILE {
    # The redirect target is quoted so a log path containing whitespace
    # does not fail with "ambiguous redirect".
    printf '%s: %s: %s\n' "$(date "+%FT%T.%3N")" "$NAME" "$*" >> "$logfile"
}
# Reboot the node once when /var/run/node_is_patched_rr is present.
#
# Behaviour:
#   - flag absent: remove the ${patched_during_init} marker and return.
#   - flag present, marker absent: announce and log the reboot, create the
#     marker, then call /sbin/reboot.
#   - flag present, marker present (second consecutive time): announce and
#     log the problem, create ${patch_failed_file}, remove the marker and
#     exit the script with status 1 instead of rebooting again.
#
# Globals:
#   patched_during_init (read) path of the "already rebooted" marker
#   patch_failed_file   (read) path of the failure flag
# Outputs:
#   status messages on stdout and, via LOG_TO_FILE, in the log
# Returns:
#   status of the last command run; does not return in the failure case
function check_for_rr_patch {
    if [ ! -f /var/run/node_is_patched_rr ]; then
        # Nothing pending, so a stale marker must not suppress a later reboot.
        rm -f "${patched_during_init}"
        return
    fi

    if [ ! -f "${patched_during_init}" ]; then
        echo
        echo "Node has been patched and requires an immediate reboot."
        echo
        LOG_TO_FILE "Node has been patched, with reboot-required flag set. Rebooting"
        # Leave the marker behind so a repeat on the next init is detected.
        touch "${patched_during_init}"
        /sbin/reboot
        return
    fi

    # The marker already exists: a reboot was requested on the previous init
    # too, so stop here rather than rebooting again.
    echo
    echo "Node has been patched during init a second consecutive time. Skipping reboot due to possible error"
    echo
    LOG_TO_FILE "Node has been patched during init a second consecutive time. Skipping reboot due to possible error"
    touch "${patch_failed_file}"
    rm -f "${patched_during_init}"
    exit 1
}
# Check whether our installed load matches the active controller.
#
# Fetches install_uuid for this release from the controller's feed and
# compares it with the local INSTALL_UUID.
#
# Globals:
#   http_port, SW_VERSION (read)  used to build the feed URL
#   INSTALL_UUID          (read)  local value to compare
#   HOSTNAME              (read)  controller-1 bypasses a failed fetch
#   CONTROLLER_UUID       (write) value returned by the controller
# Outputs:
#   failure reasons on stdout and, via LOG_TO_FILE, in the log
# Returns:
#   0 if the values match or the check is bypassed, 1 otherwise
function check_install_uuid {
    # The URL is quoted so whitespace in http_port or SW_VERSION cannot split
    # it into several curl arguments.
    if ! CONTROLLER_UUID=$(curl -sf "http://controller:${http_port}/feed/rel-${SW_VERSION}/install_uuid"); then
        if [ "$HOSTNAME" = "controller-1" ]; then
            # If we're on controller-1, controller-0 may not have the install_uuid
            # matching this release, if we're in an upgrade. If the file doesn't exist,
            # bypass this check
            return 0
        fi

        LOG_TO_FILE "Unable to retrieve installation uuid from active controller"
        echo "Unable to retrieve installation uuid from active controller"
        return 1
    fi

    if [ "$INSTALL_UUID" != "$CONTROLLER_UUID" ]; then
        LOG_TO_FILE "This node is running a different load than the active controller and must be reinstalled"
        echo "This node is running a different load than the active controller and must be reinstalled"
        return 1
    fi

    return 0
}
# Check for installation failure
if [ -f /etc/platform/installation_failed ]; then
    LOG_TO_FILE "/etc/platform/installation_failed flag is set. Aborting."
    echo "$(basename "$0"): Detected installation failure. Aborting."
    exit 1
fi

# For AIO-SX, abort if config is not yet applied and this is running in init.
# The conditions are separate tests joined with && (the -a operator of test
# is obsolescent and ambiguous), and the flag path is quoted so an empty or
# whitespace-containing value cannot turn the test into a malformed
# expression.
if [ "${system_mode}" = "simplex" ] \
    && [ ! -f "${INITIAL_CONTROLLER_CONFIG_COMPLETE}" ] \
    && [ "$1" = "start" ]; then
    LOG_TO_FILE "Config is not yet applied. Skipping init patching"
    exit 0
fi
# If the management interface is bonded, it may take some time
# before communications can be properly setup.
# Poll 'controller' for up to $DELAY_SEC seconds before giving up.
case "${system_mode}" in
    simplex)
        # Make the delay for simplex systems smaller.
        # There is no active controller during reboot.
        DELAY_SEC=10
        ;;
    *)
        DELAY_SEC=120
        ;;
esac

START=$(date +%s)
DEADLINE=$(( START + DELAY_SEC ))
FOUND=0

# NOTE(review): ping is called without -W, so one unanswered attempt may
# block for ping's own default timeout and overrun DELAY_SEC - confirm
# this is acceptable.
while [ "$(date +%s)" -lt "${DEADLINE}" ]; do
    LOG_TO_FILE "Waiting for controller to be pingable"

    # Try IPv4 first and fall back to IPv6; either answer counts.
    if ping -c 1 controller > /dev/null 2>&1 || ping6 -c 1 controller > /dev/null 2>&1; then
        LOG_TO_FILE "controller is pingable"
        FOUND=1
        break
    fi

    sleep 1
done

if [ "${FOUND}" -ne 1 ]; then
    # 'controller' is not available, just exit
    LOG_TO_FILE "Unable to contact active controller (controller). Boot will continue."
    exit 1
fi

RC=0
exit "${RC}"