Merge "Build retry loop for screen sessions"
This commit is contained in:
commit
002ab9374c
102
functions-common
102
functions-common
@ -1058,31 +1058,24 @@ function run_process {
|
|||||||
echo $!
|
echo $!
|
||||||
}
|
}
|
||||||
|
|
||||||
# Helper to launch a service in a named screen
|
function _start_in_screen {
|
||||||
# screen_it service "command-line"
|
local service=$1
|
||||||
function screen_it {
|
local cmd=$2
|
||||||
SCREEN_NAME=${SCREEN_NAME:-stack}
|
local screen_name=${SCREEN_NAME:-stack}
|
||||||
SERVICE_DIR=${SERVICE_DIR:-${DEST}/status}
|
local status_dir=${SERVICE_DIR:-${DEST}/status}
|
||||||
USE_SCREEN=$(trueorfalse True $USE_SCREEN)
|
local service_dir="$status_dir/$screen_name"
|
||||||
|
local pid="$service_dir/$service.pid"
|
||||||
if is_service_enabled $1; then
|
local failure="$service_dir/$service.failure"
|
||||||
# Append the service to the screen rc file
|
|
||||||
screen_rc "$1" "$2"
|
|
||||||
|
|
||||||
if [[ "$USE_SCREEN" = "True" ]]; then
|
|
||||||
screen -S $SCREEN_NAME -X screen -t $1
|
|
||||||
|
|
||||||
if [[ -n ${SCREEN_LOGDIR} ]]; then
|
if [[ -n ${SCREEN_LOGDIR} ]]; then
|
||||||
screen -S $SCREEN_NAME -p $1 -X logfile ${SCREEN_LOGDIR}/screen-${1}.${CURRENT_LOG_TIME}.log
|
local logfile=${SCREEN_LOGDIR}/screen-${service}.${CURRENT_LOG_TIME}.log
|
||||||
screen -S $SCREEN_NAME -p $1 -X log on
|
local shortlog=${SCREEN_LOGDIR}/screen-${service}.log
|
||||||
ln -sf ${SCREEN_LOGDIR}/screen-${1}.${CURRENT_LOG_TIME}.log ${SCREEN_LOGDIR}/screen-${1}.log
|
# this whole dance is done because of slow nodes
|
||||||
|
screen -S $screen_name -p $service -X logfile ${logfile}
|
||||||
|
screen -S $screen_name -p $service -X log on
|
||||||
|
ln -sf ${logfile} ${shortlog}
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# sleep to allow bash to be ready to be sent the command - we are
|
|
||||||
# creating a new window in screen and then sending characters, so if
|
|
||||||
# bash isn't running by the time we send the command, nothing happens
|
|
||||||
sleep 3
|
|
||||||
|
|
||||||
NL=`echo -ne '\015'`
|
NL=`echo -ne '\015'`
|
||||||
# This fun command does the following:
|
# This fun command does the following:
|
||||||
# - the passed server command is backgrounded
|
# - the passed server command is backgrounded
|
||||||
@ -1092,10 +1085,73 @@ function screen_it {
|
|||||||
# and a message is written to stdout and the service failure file
|
# and a message is written to stdout and the service failure file
|
||||||
# The pid saved can be used in screen_stop() as a process group
|
# The pid saved can be used in screen_stop() as a process group
|
||||||
# id to kill off all child processes
|
# id to kill off all child processes
|
||||||
screen -S $SCREEN_NAME -p $1 -X stuff "$2 & echo \$! >$SERVICE_DIR/$SCREEN_NAME/$1.pid; fg || echo \"$1 failed to start\" | tee \"$SERVICE_DIR/$SCREEN_NAME/$1.failure\"$NL"
|
echo "Running: $cmd & echo \$! >$pid; fg || echo \"$service failed to start\" | tee \"$failure\"$NL"
|
||||||
|
screen -S $screen_name -p $service -X stuff "$cmd & echo \$! >$pid; fg || echo \"$service failed to start\" | tee \"$failure\"$NL"
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# Check whether the command line for a service appears to have been
# delivered to its screen window.
# _is_running_in_screen service
#
# Globals read:
#   SCREEN_NAME       - screen session name (default: stack)
#   SERVICE_DIR       - status directory (default: ${DEST}/status)
#   SCREEN_LOGDIR     - optional; when non-empty a per-service log file
#                       is also required to exist
#   CURRENT_LOG_TIME  - timestamp component of the log file name
# Outputs: a "Warning: ..." line on stdout for each failed check
# Returns: 0 if the evidence is present, 1 otherwise
#
# Note that a lone .failure file also yields 0: this function detects
# that the command reached the window, not that the service is healthy.
function _is_running_in_screen {
    local service=$1
    local screen_name=${SCREEN_NAME:-stack}
    local status_dir=${SERVICE_DIR:-${DEST}/status}
    local service_dir="$status_dir/$screen_name"
    local pid="$service_dir/$service.pid"
    local failure="$service_dir/$service.failure"

    if [[ ! -e "$pid" && ! -e "$failure" ]]; then
        # The command stuffed into the window writes either a pid file or
        # a failure file; if neither exists the command line probably
        # never made it into the window.
        echo "Warning: neither $pid nor $failure exist, $service didn't seem to start"
        return 1
    fi

    # SCREEN_LOGDIR is optional, so default it to empty to stay safe
    # under "set -o nounset".
    if [[ -n "${SCREEN_LOGDIR:-}" ]]; then
        # if we should be logging, but we don't have a log file, something is wrong
        local logfile="${SCREEN_LOGDIR}/screen-${service}.${CURRENT_LOG_TIME}.log"
        if [[ ! -e "$logfile" ]]; then
            echo "Warning: expected logfile $logfile not found, something wrong with starting $service"
            return 1
        fi
    fi

    return 0
}
|
||||||
|
|
||||||
|
# Helper to launch a service in a named screen
# screen_it service "command-line"
#
# Globals read:
#   SCREEN_NAME  - screen session name (default: stack)
#   SERVICE_DIR  - status directory (default: ${DEST}/status)
#   USE_SCREEN   - normalized through trueorfalse, default True
#
# Does nothing unless is_service_enabled accepts the service.  With screen
# enabled, a window is created and the command is stuffed into it, retrying
# up to 10 times until _is_running_in_screen reports success.  If every
# attempt fails the function calls "exit 1", which terminates the calling
# shell rather than just returning.  Without screen, the service is started
# through run_process and its stdout is saved as the service pid file.
function screen_it {
    local service=$1
    local cmd=$2
    local screen_name=${SCREEN_NAME:-stack}
    local status_dir=${SERVICE_DIR:-${DEST}/status}
    local service_dir="$status_dir/$screen_name"
    # $USE_SCREEN is intentionally unquoted, as in the original call, so an
    # unset value passes no second argument to trueorfalse.
    local use_screen=$(trueorfalse True $USE_SCREEN)
    local pid="$service_dir/$service.pid"

    if is_service_enabled "$service"; then
        # Append the service to the screen rc file
        screen_rc "$service" "$cmd"

        if [[ "$use_screen" = "True" ]]; then
            screen -S "$screen_name" -X screen -t "$service"

            # this retry loop brought to you by slow compute nodes, screen raciness,
            # and frustration in upgrading.
            local max_tries=10
            local screen_tries=0
            while (( screen_tries < max_tries )); do
                _start_in_screen "$service" "$cmd"
                if _is_running_in_screen "$service"; then
                    break
                fi

                screen_tries=$((screen_tries + 1))
                if (( screen_tries >= max_tries )); then
                    # Out of attempts: do not claim another retry is coming.
                    echo "Too many retries, giving up"
                    exit 1
                fi

                echo "Failed to start service after $screen_tries attempt(s), retrying"
                sleep 1
            done
        else
            # Spawn directly without screen
            run_process "$service" "$cmd" >"$pid"
        fi
    fi
}
|
||||||
|
Loading…
Reference in New Issue
Block a user