Change ceph-init-wrapper wait logic

The stop, start and restart commands are waiting for any status
commands to finish before attempting the actual command

This caused issues, as commands that relate only to OSDs
would wait for monitor status commands, and vice versa.

Depending on the number of OSDs, the OSD status command could take
a long time to finish, causing a "stop mon" command to
wait just as long, even though it did not need to.

Changes in this commit:
- commands related to OSD and monitors have their own wait times
  and separate flag files
- add improved logging to better see if the script is waiting
  for a certain function to finish

Change-Id: Ia03981b2b49f999e8a96aa12361209a418da4c50
Closes-bug: 1836075
Depends-On: I3ace73650e4fe9aafc84c82e2ffe048f2039305e
Signed-off-by: Stefan Dinescu <stefan.dinescu@windriver.com>
This commit is contained in:
Stefan Dinescu 2019-07-25 15:00:21 +03:00
parent 52807307ce
commit 12f604b4dd

View File

@ -40,7 +40,8 @@ source /etc/platform/platform.conf
CEPH_SCRIPT="/etc/init.d/ceph" CEPH_SCRIPT="/etc/init.d/ceph"
CEPH_FILE="$VOLATILE_PATH/.ceph_started" CEPH_FILE="$VOLATILE_PATH/.ceph_started"
CEPH_RESTARTING_FILE="$VOLATILE_PATH/.ceph_restarting" CEPH_RESTARTING_FILE="$VOLATILE_PATH/.ceph_restarting"
CEPH_GET_STATUS_FILE="$VOLATILE_PATH/.ceph_getting_status" CEPH_GET_MON_STATUS_FILE="$VOLATILE_PATH/.ceph_getting_mon_status"
CEPH_GET_OSD_STATUS_FILE="$VOLATILE_PATH/.ceph_getting_osd_status"
CEPH_STATUS_FAILURE_TEXT_FILE="/tmp/ceph_status_failure.txt" CEPH_STATUS_FAILURE_TEXT_FILE="/tmp/ceph_status_failure.txt"
BINDIR=/usr/bin BINDIR=/usr/bin
@ -59,10 +60,13 @@ mkdir -p $DATA_PATH # make sure folder exists
MONITORING_INTERVAL=15 MONITORING_INTERVAL=15
TRACE_LOOP_INTERVAL=5 TRACE_LOOP_INTERVAL=5
GET_STATUS_TIMEOUT=120 GET_OSD_STATUS_TIMEOUT=120
GET_MONITOR_STATUS_TIMEOUT=30
CEPH_STATUS_TIMEOUT=20 CEPH_STATUS_TIMEOUT=20
WAIT_FOR_CMD=1 WAIT_FOR_CMD=1
MONITOR_COMMAND=0
OSD_COMMAND=0
RC=0 RC=0
@ -73,24 +77,65 @@ if [ ! -z $ARGS ]; then
args+=("${new_args[@]}") args+=("${new_args[@]}")
fi fi
# Record which kind of daemon the requested command targets by setting
# the global flags MONITOR_COMMAND and OSD_COMMAND.
#
# Arguments:
#   $1 - optional target; matched by prefix against "osd" and "mon"
#
# With no argument both flags are set to 1. A target starting with "osd"
# sets only OSD_COMMAND, one starting with "mon" sets only
# MONITOR_COMMAND. Any other target terminates the script with status 1.
check_command_type ()
{
    # No target at all: the command covers monitors and OSDs alike.
    if [[ $# -eq 0 ]]; then
        MONITOR_COMMAND=1
        OSD_COMMAND=1
        return 0
    fi

    case "$1" in
        osd*)
            OSD_COMMAND=1
            ;;
        mon*)
            MONITOR_COMMAND=1
            ;;
        *)
            # Unrecognised target; abort the whole script.
            exit 1
            ;;
    esac
}
# Wait for in-flight "status" invocations to finish before a
# start/stop/restart command is carried out.
#
# Globals read:
#   MONITOR_COMMAND, OSD_COMMAND  - which daemon kinds the current command
#                                   targets (1 = targeted)
#   GET_OSD_STATUS_TIMEOUT,
#   GET_MONITOR_STATUS_TIMEOUT    - maximum wait, in seconds
#   CEPH_GET_MON_STATUS_FILE,
#   CEPH_GET_OSD_STATUS_FILE      - flag files that exist while the
#                                   corresponding status command is running
#
# Only the flag files relevant to the current command are considered. If
# any of them still exists once the timeout has elapsed, a warning is
# logged through wlog and those flag files are removed so the command can
# continue.
#
# Returns: 0 always.
wait_for_status ()
{
    local status_timeout=0
    local deadline
    local flag_file
    local pending
    local -a flag_files=()

    # For a general "ceph status" command which includes checks
    # for both monitors and OSDS, we use the OSD timeout.
    if [[ $OSD_COMMAND == 1 ]]; then
        status_timeout=$GET_OSD_STATUS_TIMEOUT
    elif [[ $MONITOR_COMMAND == 1 ]]; then
        status_timeout=$GET_MONITOR_STATUS_TIMEOUT
    fi

    if [[ $MONITOR_COMMAND == 1 ]]; then
        flag_files+=("${CEPH_GET_MON_STATUS_FILE}")
    fi
    if [[ $OSD_COMMAND == 1 ]]; then
        flag_files+=("${CEPH_GET_OSD_STATUS_FILE}")
    fi

    deadline=$((SECONDS + status_timeout))

    while true; do
        # Keep waiting while ANY relevant flag file is present, so a
        # general command waits for both monitor and OSD status.
        pending=0
        for flag_file in "${flag_files[@]}"; do
            if [[ -f "${flag_file}" ]]; then
                pending=1
                break
            fi
        done

        if [[ $pending -eq 0 ]]; then
            return 0
        fi

        if (( SECONDS >= deadline )); then
            break
        fi

        sleep 1
    done

    # Reaching this point means a status command was still flagged as
    # running at the deadline; drop the flag(s) and carry on.
    wlog "-" "WARN" "Getting status takes more than ${status_timeout}s, continuing"
    rm -f -- "${flag_files[@]}"
    return 0
}
start () start ()
{ {
if [ -f ${CEPH_FILE} ]; then if [ -f ${CEPH_FILE} ]; then
wlog "-" INFO "Ceph START $1 command received"
wait_for_status wait_for_status
${CEPH_SCRIPT} start $1 ${CEPH_SCRIPT} start $1
wlog "-" INFO "Ceph START $1 command finished."
RC=$? RC=$?
else else
# Ceph is not running on this node, return success # Ceph is not running on this node, return success
@ -100,17 +145,21 @@ start ()
# Stop ceph daemons through the underlying init script.
#
# Arguments:
#   $1 - optional target, forwarded to "${CEPH_SCRIPT} stop"
#
# Logs the request through wlog, waits for in-flight status commands via
# wait_for_status, runs the init script, then logs completion.
#
# Returns: the exit status of "${CEPH_SCRIPT} stop", so the trailing log
# call does not mask it.
stop ()
{
    local rc

    wlog "-" INFO "Ceph STOP $1 command received."
    wait_for_status
    # ${1:+"$1"} forwards the target as one word when given and passes
    # nothing at all when it is absent or empty.
    "${CEPH_SCRIPT}" stop ${1:+"$1"}
    rc=$?
    wlog "-" INFO "Ceph STOP $1 command finished."
    return $rc
}
restart () restart ()
{ {
if [ -f ${CEPH_FILE} ]; then if [ -f ${CEPH_FILE} ]; then
wlog "-" INFO "Ceph RESTART $1 command received."
wait_for_status wait_for_status
touch $CEPH_RESTARTING_FILE touch $CEPH_RESTARTING_FILE
${CEPH_SCRIPT} restart $1 ${CEPH_SCRIPT} restart $1
rm -f $CEPH_RESTARTING_FILE rm -f $CEPH_RESTARTING_FILE
wlog "-" INFO "Ceph RESTART $1 command finished."
else else
# Ceph is not running on this node, return success # Ceph is not running on this node, return success
exit 0 exit 0
@ -170,14 +219,13 @@ log_and_kill_hung_procs ()
done done
} }
status () status ()
{ {
if [[ "$system_type" == "All-in-one" ]] && [[ "$system_mode" != "simplex" ]] && [[ "$1" == "osd" ]]; then if [[ "$system_type" == "All-in-one" ]] && [[ "$system_mode" != "simplex" ]] && [[ "$1" == "osd" ]]; then
timeout $CEPH_STATUS_TIMEOUT ceph -s timeout $CEPH_STATUS_TIMEOUT ceph -s
if [ "$?" -ne 0 ]; then if [ "$?" -ne 0 ]; then
# Ceph cluster is not accessible. Don't panic, controller swact # Ceph cluster is not accessible. Don't panic, controller swact
# may be in progress. # may be in progress.
wlog "-" INFO "Ceph is down, ignoring OSD status." wlog "-" INFO "Ceph is down, ignoring OSD status."
exit 0 exit 0
fi fi
@ -191,7 +239,14 @@ status ()
if [ -f ${CEPH_FILE} ]; then if [ -f ${CEPH_FILE} ]; then
# Make sure the script does not 'exit' between here and the 'rm -f' below # Make sure the script does not 'exit' between here and the 'rm -f' below
# or the checkpoint file will be left behind # or the checkpoint file will be left behind
touch -f ${CEPH_GET_STATUS_FILE} if [[ $MONITOR_COMMAND == 1 ]]; then
touch -f ${CEPH_GET_MON_STATUS_FILE}
fi
if [[ $OSD_COMMAND == 1 ]]; then
touch -f ${CEPH_GET_OSD_STATUS_FILE}
fi
result=`${CEPH_SCRIPT} status $1` result=`${CEPH_SCRIPT} status $1`
RC=$? RC=$?
if [ "$RC" -ne 0 ]; then if [ "$RC" -ne 0 ]; then
@ -236,7 +291,13 @@ status ()
fi fi
fi fi
rm -f ${CEPH_GET_STATUS_FILE} if [[ $MONITOR_COMMAND == 1 ]]; then
rm -f ${CEPH_GET_MON_STATUS_FILE}
fi
if [[ $OSD_COMMAND == 1 ]]; then
rm -f ${CEPH_GET_OSD_STATUS_FILE}
fi
if [[ $RC == 0 ]] && [[ "$1" == "mon" ]] && [[ "$system_type" == "All-in-one" ]] && [[ "$system_mode" != "simplex" ]]; then if [[ $RC == 0 ]] && [[ "$1" == "mon" ]] && [[ "$system_type" == "All-in-one" ]] && [[ "$system_mode" != "simplex" ]]; then
# SM needs exit code != 0 from 'status mon' argument of the init script on # SM needs exit code != 0 from 'status mon' argument of the init script on
@ -262,15 +323,19 @@ status ()
case "${args[0]}" in case "${args[0]}" in
start) start)
check_command_type ${args[1]}
start ${args[1]} start ${args[1]}
;; ;;
stop) stop)
check_command_type ${args[1]}
stop ${args[1]} stop ${args[1]}
;; ;;
restart) restart)
check_command_type ${args[1]}
restart ${args[1]} restart ${args[1]}
;; ;;
status) status)
check_command_type ${args[1]}
status ${args[1]} status ${args[1]}
;; ;;
*) *)