Add --timeout option to collect tool

This update adds a new --timeout command line option to the collect tool so that users can extend collect's global timeout. Prior to this update the collect tool had a fixed 1000 second (roughly 16.7 minute) timeout. Collecting from hosts in large, busy systems can take an unpredictably long time, sometimes longer than 1000 seconds. This can be particularly true when collecting from an active controller that is deploying and managing lots of pods across many hosts. The new timeout option allows the user to specify a timeout in minutes, between 10 and 120, with a default of 20 minutes. The default or user-specified global timeout is also passed to subclouds for subcloud collect. Test Plan: PASS: Verify new --timeout or -t options at command line arg level PASS: Verify --timeout <minutes> parse; error, in and out of bounds PASS: Verify timeout option is described in collect help PASS: Verify 110 minute collect with --timeout 120 PASS: Verify 45 minute collect times out with --timeout 40 PASS: Verify 2 minute collect with --timeout 10 PASS: Verify default timeout is 20 minutes PASS: Verify default or specified timeout is displayed PASS: Verify default or specified timeout is shared with the subcloud PASS: Verify timeout error handling. PASS: Verify collect error handling behavior if --timeout or -t is specified but the number of minutes is missing. Regression: PASS: Verify collect system and subcloud handling PASS: Verify system and subcloud dated collects ; verified content PASS: Verify collect with a variety of options Closes-Bug: 2004666 Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com> Change-Id: Ib68b78f7c810f43fc8d13cbf291ac00f08c3c4f4
2023-02-12 15:15:43 -05:00 · 2023-02-12 15:15:43 -05:00 · 0b079b4804
commit 0b079b4804
parent db7144f1cf
2 changed files with 40 additions and 3 deletions
--- a/tools/collector/debian-scripts/collect
+++ b/tools/collector/debian-scripts/collect
@ -342,6 +342,11 @@ function print_help()
    echo " collect -a -sc [--inline | -in]                ... collect logs for all subclouds one after the other"
    echo " collect --subcloud --continue                  ... continue a suspended subcloud collect"
    echo ""
+    echo "Collect Timeout"
+    echo ""
+    echo "collect [--timeout | -t] <minutes>              ... collect with user specified timeout"
+    echo "                                                    valid change range is 10-120 minutes"
+    echo "                                                    default: 20 mins"
    echo "Dated Collect:"
    echo ""
    echo "collect [--start-date | -s] YYYYMMDD            ... collection of logs on and  after this date"
@ -415,10 +420,16 @@ COLLECT_CONTINUE_MSG_NEEDED=false
 SUBCLOUD_COLLECT_CONTINUE=false
 SUBCLOUD_COLLECT_CONTINUE_LIST_FILE="/tmp/collect_continue.lst"

+declare -i TIMEOUT_MIN_MINS=10    # smallest --timeout value accepted, in minutes
+declare -i TIMEOUT_MAX_MINS=120   # largest --timeout value accepted, in minutes
+declare -i TIMEOUT_DEF_MINS=20    # timeout applied when --timeout is not given
+declare -i TIMEOUT_MIN_SECS=$((TIMEOUT_MIN_MINS*60)) # 10 minutes
+declare -i TIMEOUT_MAX_SECS=$((TIMEOUT_MAX_MINS*60)) # 120 minutes
+declare -i TIMEOUT_DEF_SECS=$((TIMEOUT_DEF_MINS*60)) # 20 minutes
+
 # overall collect timeout
-TIMEOUT=1000
+declare -i TIMEOUT=${TIMEOUT_DEF_SECS}
 SECONDS=0
-let UNTIL=${SECONDS}+${TIMEOUT}

 COLLECT_NAME=""

@ -707,6 +718,22 @@ while [[ ${#} -gt 0 ]] ; do
        clear_variable_args
        ;;

+        -t|--timeout)
+        # ${2} = timeout in minutes; values over 9 digits are rejected before any arithmetic
+        if [[ ${2} =~ ^[0-9]+$ ]] ; then
+            if (( ${#2} > 9 || 10#${2} < TIMEOUT_MIN_MINS || 10#${2} > TIMEOUT_MAX_MINS )) ; then
+                elog "timeout must be between ${TIMEOUT_MIN_MINS} and ${TIMEOUT_MAX_MINS} minutes"
+                collect_exit ${FAIL_TIMEOUT_ARG}
+            else
+                TIMEOUT="$((10#${2}*60))"   # 10# forces base 10 so 010 is not read as octal
+            fi
+        else
+            elog "timeout value must be an integer"   # also reached when the value is missing
+            collect_exit ${FAIL_TIMEOUT_ARG}
+        fi
+        shift   # consume the value; the option word is consumed by the loop's own shift
+        ;;
+
        --skip-mask)
        SKIP_MASK=true
        shift
@ -758,6 +785,9 @@ while [[ ${#} -gt 0 ]] ; do
    shift # past argument or value
 done

+# The default TIMEOUT may have been revised with the --timeout option.
+# Update UNTIL with updated global timeout time in secs.
+let UNTIL=${SECONDS}+${TIMEOUT}

 date -d $STARTDATE > /dev/null 2>/dev/null
 rc_start_date=${?}
@ -1093,6 +1123,8 @@ pw=${pw/\[/\\\[} # replace '[' with '\['
 pw=${pw/$/\\$}   # replace '$' with '\$'
 pw=${pw/\"/\\\"} # replace '"' with '\"'

+ilog "collect bundle timeout set to $((${TIMEOUT}/60)) minutes"
+
 ###########################################################################
 #
 # Name       : passwordless_sudo_test
@ -1908,6 +1940,10 @@ function collect_subcloud_run()
        collect_cmd+=("-v")
    fi

+    # pass the timeout to the subcloud
+    collect_cmd+=("-t $((${TIMEOUT}/60))")
+
+    # pass the date range to the subcloud
    collect_cmd+=("--start-date ${STARTDATE}")
    collect_cmd+=("--end-date $ENDDATE")

@ -3068,7 +3104,7 @@ if [ "${SUBCLOUD_COLLECT}" = true ] ; then
    if [ ${SUBCLOUDS} -gt ${TIMEOUT_THRESHOLD_FACTOR} -a "${PARALLEL_COLLECT_MODE}" = true ] ; then
        # adjust overall timeout to account for the large number of subclouds
        let UNTIL=$(((SUBCLOUDS*SUBCLOUDS_TIMEOUT_BOOST)+TIMEOUT))
-        ilog "adjusted subcloud collect timout from ${TIMEOUT} to ${UNTIL} secs to account for ${SUBCLOUDS} subclouds"
+        ilog "adjusted subcloud collect timeout from ${TIMEOUT} to ${UNTIL} secs to account for ${SUBCLOUDS} subclouds"
    fi
    if [ "${ALLHOSTS}" = true ] ; then
        if [ ${SUBCLOUDS} -gt ${MAX_LIST_PRINT} ] ; then
--- a/tools/collector/debian-scripts/collect_utils
+++ b/tools/collector/debian-scripts/collect_utils
@ -58,6 +58,7 @@ FAIL_NAME_TOO_LONG=55
 FAIL_INVALID_START_DATE=56
 FAIL_INVALID_END_DATE=57
 FAIL_INVALID_DATE_RANGE=58
+FAIL_TIMEOUT_ARG=59

 # Warnings are above 200
 WARN_WARNING=200