Files
update/software/service-files/software-init.sh
Matheus Guilhermino 301e2e5cae Decrease software service init delay for Simplex
The DELAY_SEC variable in the software init script controls how
long the service waits for an active controller to respond before
proceeding. The default is 120 seconds.

On simplex (AIO-SX) systems, there is a possibility that this could
result in an unnecessary 2-minute delay during reboot or unlock
operations.

This update reduces the timeout to 10 seconds on simplex systems; duplex
systems keep the 120-second default.

Test-Plan:
PASS: Verify build and install on AIO-SX
PASS: Verify build and install on AIO-DX
PASS: Verify unlock of AIO-SX is not delayed by software timeout
PASS: Verify software still uses 120s timeout on AIO-DX

Closes-bug: 2118354

Change-Id: I5a56b9d750d4607b6df7ede65e7a2370b141deee
Signed-off-by: Matheus Guilhermino <mmachado@windriver.com>
2025-07-24 17:03:19 +00:00

172 lines
5.4 KiB
Bash

#!/bin/bash
#
# Copyright (c) 2023 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
# Unified Software Management
# chkconfig: 345 20 23
# description: StarlingX Unified Software Management init script
### BEGIN INIT INFO
# Provides: software
# Required-Start: $syslog
# Required-Stop: $syslog
# Default-Start: 2 3 5
# Default-Stop: 0 1 6
# Short-Description: software
# Description: Provides the Unified Software Management component
### END INIT INFO
# Script name, used as the prefix of every log entry. "$0" is quoted so an
# invocation path containing whitespace is not word-split before basename
# sees it.
NAME=$(basename "$0")

# Load platform settings. This script reads several variables it never
# assigns itself (e.g. SW_VERSION, INSTALL_UUID, system_mode,
# INITIAL_CONTROLLER_CONFIG_COMPLETE) -- presumably they are defined by
# these two files; verify against the files themselves.
. /usr/bin/tsconfig
. /etc/platform/platform.conf

# Log file that receives this script's messages and software-agent stderr.
logfile=/var/log/software.log
# Flag file whose presence after "software-agent --install" marks the
# operation as failed.
software_install_failed_file=/var/run/software_install_failed
# NOTE(review): the next two paths are not referenced anywhere else in the
# visible part of this script -- confirm whether they are still needed.
software_updated_during_init_file=/etc/software/.software_updated_during_init
node_is_software_updated_rr_file=/var/persist/software-agent/node_is_software_updated_rr

# If the system has never been bootstrapped, system_mode is not set.
# Treat a non-bootstrapped system like it is simplex and manually manage
# lighttpd, etc.
if [ -z "${system_mode}" ]; then
    system_mode="simplex"
fi
# Append one timestamped entry to the software log.
# Globals:   NAME (read), logfile (read)
# Arguments: $* - message words, joined into a single entry
# Outputs:   appends "<date +%FT%T.%3N>: <NAME>: <message>" to $logfile
function LOG_TO_FILE {
    # The redirect target is quoted so that a log path containing
    # whitespace is not rejected as an ambiguous redirect.
    printf '%s: %s: %s\n' "$(date "+%FT%T.%3N")" "$NAME" "$*" >> "$logfile"
}
# Check whether our installed load matches the active controller.
# Globals:   http_port, SW_VERSION, INSTALL_UUID, HOSTNAME (read);
#            CONTROLLER_UUID (written)
# Outputs:   on failure, the reason is logged via LOG_TO_FILE and echoed
# Returns:   0 when the UUIDs match, or when the lookup fails on
#            controller-1; 1 otherwise
function check_install_uuid {
    local reason

    if ! CONTROLLER_UUID=$(curl -sf "http://controller:${http_port}/feed/rel-${SW_VERSION}/install_uuid"); then
        # If we're on controller-1, controller-0 may not have the
        # install_uuid matching this release, if we're in an upgrade.
        # If the file doesn't exist, bypass this check.
        if [[ "$HOSTNAME" == "controller-1" ]]; then
            return 0
        fi
        reason="Unable to retrieve installation uuid from active controller"
        LOG_TO_FILE "$reason"
        echo "$reason"
        return 1
    fi

    if [[ "$INSTALL_UUID" == "$CONTROLLER_UUID" ]]; then
        return 0
    fi

    reason="This node is running a different load than the active controller and must be reinstalled"
    LOG_TO_FILE "$reason"
    echo "$reason"
    return 1
}
# Check for installation failure: abort (exit 1) when the flag file exists.
if [[ -f /etc/platform/installation_failed ]]; then
    LOG_TO_FILE "/etc/platform/installation_failed flag is set. Aborting."
    echo "$(basename "$0"): Detected installation failure. Aborting."
    exit 1
fi

# For AIO-SX, skip (exit 0) if config is not yet applied and this is
# running in init, i.e. invoked with "start".
# [[ ... && ... ]] replaces the obsolescent "[ ... -a ... ]" form, and the
# flag path is quoted so that an empty or whitespace-containing value
# cannot turn the test into a parse error.
if [[ "${system_mode}" == "simplex" \
        && ! -f "${INITIAL_CONTROLLER_CONFIG_COMPLETE}" \
        && "$1" == "start" ]]; then
    LOG_TO_FILE "Config is not yet applied. Skipping init software"
    exit 0
fi
# If the management interface is bonded, it may take some time before
# communications can be properly set up, so allow up to DELAY_SEC seconds
# to reach the controller.
case "${system_mode}" in
    simplex)
        # Make the delay for simplex systems smaller.
        # There is no active controller during reboot.
        DELAY_SEC=10
        ;;
    *)
        DELAY_SEC=120
        ;;
esac

START=$(date +%s)
FOUND=0

# Try an IPv4 ping, then an IPv6 ping, pausing one second between rounds,
# until one succeeds or DELAY_SEC seconds have elapsed since START.
while (( $(date +%s) < START + DELAY_SEC )); do
    LOG_TO_FILE "Waiting for controller to be pingable"
    if ping -c 1 controller > /dev/null 2>&1 \
            || ping6 -c 1 controller > /dev/null 2>&1; then
        LOG_TO_FILE "controller is pingable"
        FOUND=1
        break
    fi
    sleep 1
done

if (( FOUND == 0 )); then
    # 'controller' is not available, just exit
    LOG_TO_FILE "Unable to contact active controller (controller). Boot will continue."
    exit 1
fi
# Skip the service execution if booting in the rollback deployment, which
# is recognised by "ostree=/ostree/2" appearing on the kernel command line.
if [[ "$(< /proc/cmdline)" == *"ostree=/ostree/2"* ]]; then
    LOG_TO_FILE "System is booted from rollback deployment. Skipping execution..."
    exit 0
fi
# Run the software agent install step and record a failure in RC.
# Globals:   logfile, software_install_failed_file (read); RC (written)
# Outputs:   progress markers written through LOG_TO_FILE; stderr of
#            software-agent appended to $logfile
function run_software_install {
    LOG_TO_FILE "***** Starting software operation *****"
    /usr/bin/software-agent --install 2>>"${logfile}"
    # Failure is detected through the flag file, not the agent's exit
    # status. The path is quoted because an unquoted empty value would
    # collapse the test to "[ -f ]", which is always true.
    if [ -f "${software_install_failed_file}" ]; then
        RC=1
        LOG_TO_FILE "***** Software operation failed *****"
    fi
    LOG_TO_FILE "***** Finished software operation *****"
}

# Exit status of the script; set to 1 when the software operation fails.
RC=0
case "$1" in
    start)
        if [ "${system_mode}" = "simplex" ]; then
            # On a simplex CPE, we need to launch the http server first,
            # before we can do the software installation
            LOG_TO_FILE "***** Launching lighttpd *****"
            /etc/init.d/lighttpd start
            run_software_install
            LOG_TO_FILE "***** Shutting down lighttpd *****"
            /etc/init.d/lighttpd stop
        else
            if ! check_install_uuid; then
                # The INSTALL_UUID doesn't match the active controller, so exit
                exit 1
            fi
            run_software_install
        fi
        ;;
    stop)
        # Nothing to do here
        ;;
    restart)
        run_software_install
        ;;
    *)
        echo "Usage: $0 {start|stop|restart}"
        exit 1
        ;;
esac
exit $RC