Add --timeout option to collect tool

This update adds a new --timeout command line option to the collect
tool so that users can extend collect's global timeout.

Prior to this update the collect tool had a fixed 1000 second
(about 16.7 minute) timeout. Collecting from hosts in large, busy
systems can take an unpredictably long time, sometimes longer than
1000 seconds. This is particularly true when collecting from an active
controller that is deploying and managing many pods across many hosts.

This new timeout option allows the user to specify the timeout
in minutes, between 10 and 120; the default is 20 minutes.
The default or user-specified global timeout is also passed to
subclouds for subcloud collect.

Test Plan:

PASS: Verify new --timeout or -t options at command line arg level
PASS: Verify --timeout <minutes> parse; error, in and out of bounds
PASS: Verify timeout option is described in collect help
PASS: Verify 110 minute collect with --timeout 120
PASS: Verify 45 minute collect times out with --timeout 40
PASS: Verify 2 minute collect with --timeout 10
PASS: Verify default timeout is 20 minutes
PASS: Verify default or specified timeout is displayed
PASS: Verify default or specified timeout is shared with the subcloud
PASS: Verify timeout error handling.
PASS: Verify collect error handling behavior if --timeout or -t is
      specified but the number of minutes is missing.

Regression:

PASS: Verify collect system and subcloud handling
PASS: Verify system and subcloud dated collects ; verified content
PASS: Verify collect with a variety of options

Closes-Bug: 2004666
Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
Change-Id: Ib68b78f7c810f43fc8d13cbf291ac00f08c3c4f4
This commit is contained in:
Eric MacDonald 2023-02-12 15:15:43 -05:00
parent db7144f1cf
commit 0b079b4804
2 changed files with 40 additions and 3 deletions

View File

@ -342,6 +342,11 @@ function print_help()
echo " collect -a -sc [--inline | -in] ... collect logs for all subclouds one after the other"
echo " collect --subcloud --continue ... continue a suspended subcloud collect"
echo ""
echo "Collect Timeout"
echo ""
echo "collect [--timeout | -t] <minutes> ... collect with user specified timeout"
echo " valid change range is 10-120 minutes"
echo " default: 20 mins"
echo "Dated Collect:"
echo ""
echo "collect [--start-date | -s] YYYYMMDD ... collection of logs on and after this date"
@ -415,10 +420,16 @@ COLLECT_CONTINUE_MSG_NEEDED=false
SUBCLOUD_COLLECT_CONTINUE=false
SUBCLOUD_COLLECT_CONTINUE_LIST_FILE="/tmp/collect_continue.lst"
# Limits and default for the global collect timeout.
# The *_MINS values are expressed in minutes; each *_SECS value is the
# matching *_MINS value converted to seconds.
declare -i TIMEOUT_MIN_MINS=10
declare -i TIMEOUT_MAX_MINS=120
declare -i TIMEOUT_DEF_MINS=20
declare -i TIMEOUT_MIN_SECS=$((TIMEOUT_MIN_MINS*60)) # 10 minutes
declare -i TIMEOUT_MAX_SECS=$((TIMEOUT_MAX_MINS*60)) # 120 minutes
declare -i TIMEOUT_DEF_SECS=$((TIMEOUT_DEF_MINS*60)) # 20 minutes
# overall collect timeout
TIMEOUT=1000
declare -i TIMEOUT=${TIMEOUT_DEF_SECS}
SECONDS=0
let UNTIL=${SECONDS}+${TIMEOUT}
COLLECT_NAME=""
@ -707,6 +718,22 @@ while [[ ${#} -gt 0 ]] ; do
clear_variable_args
;;
-t|--timeout)
    # User specified global timeout, given in minutes as the next
    # argument. A missing or non-numeric value fails the digits-only
    # match below and is reported as a non-integer.
    if [[ ${2} =~ ^[0-9]+$ ]] ; then
        # Evaluate the value as base 10 ('10#') in both the range check
        # and the conversion. Without it a leading zero makes shell
        # arithmetic treat the value as octal, so '010' would become
        # 8 minutes and '018' would be an arithmetic error.
        # Digit strings longer than 9 characters are rejected up front
        # as out of range so they never reach integer arithmetic, where
        # they could overflow.
        if [[ ${#2} -gt 9 ]] || \
           (( 10#${2} < TIMEOUT_MIN_MINS || 10#${2} > TIMEOUT_MAX_MINS )) ; then
            elog "timeout must be between ${TIMEOUT_MIN_MINS} and ${TIMEOUT_MAX_MINS} minutes"
            collect_exit ${FAIL_TIMEOUT_ARG}
        else
            # store the timeout in seconds
            TIMEOUT=$((10#${2}*60))
        fi
    else
        elog "timeout value must be an integer"
        collect_exit ${FAIL_TIMEOUT_ARG}
    fi
    # consume the option's value
    shift
    ;;
--skip-mask)
SKIP_MASK=true
shift
@ -758,6 +785,9 @@ while [[ ${#} -gt 0 ]] ; do
shift # past argument or value
done
# The default TIMEOUT may have been revised with the --timeout option.
# Update UNTIL with updated global timeout time in secs.
let UNTIL=${SECONDS}+${TIMEOUT}
date -d $STARTDATE > /dev/null 2>/dev/null
rc_start_date=${?}
@ -1093,6 +1123,8 @@ pw=${pw/\[/\\\[} # replace '[' with '\['
pw=${pw/$/\\$} # replace '$' with '\$'
pw=${pw/\"/\\\"} # replace '"' with '\"'
ilog "collect bundle timeout set to $((${TIMEOUT}/60)) minutes"
###########################################################################
#
# Name : passwordless_sudo_test
@ -1908,6 +1940,10 @@ function collect_subcloud_run()
collect_cmd+=("-v")
fi
# pass the timeout to the subcloud
collect_cmd+=("-t $((${TIMEOUT}/60))")
# pass the date range to the subcloud
collect_cmd+=("--start-date ${STARTDATE}")
collect_cmd+=("--end-date $ENDDATE")
@ -3068,7 +3104,7 @@ if [ "${SUBCLOUD_COLLECT}" = true ] ; then
if [ ${SUBCLOUDS} -gt ${TIMEOUT_THRESHOLD_FACTOR} -a "${PARALLEL_COLLECT_MODE}" = true ] ; then
# adjust overall timeout to account for the large number of subclouds
let UNTIL=$(((SUBCLOUDS*SUBCLOUDS_TIMEOUT_BOOST)+TIMEOUT))
ilog "adjusted subcloud collect timout from ${TIMEOUT} to ${UNTIL} secs to account for ${SUBCLOUDS} subclouds"
ilog "adjusted subcloud collect timeout from ${TIMEOUT} to ${UNTIL} secs to account for ${SUBCLOUDS} subclouds"
fi
if [ "${ALLHOSTS}" = true ] ; then
if [ ${SUBCLOUDS} -gt ${MAX_LIST_PRINT} ] ; then

View File

@ -58,6 +58,7 @@ FAIL_NAME_TOO_LONG=55
FAIL_INVALID_START_DATE=56
FAIL_INVALID_END_DATE=57
FAIL_INVALID_DATE_RANGE=58
FAIL_TIMEOUT_ARG=59
# Warnings are above 200
WARN_WARNING=200