Enhance collect to support collecting from subclouds
This update introduces a new collect command-line
option, --subcloud (or -sc), to support collecting
subclouds from the system controller.
This update also
- defaults to a 1 month dated collect
- defaults to parallel collect instead of one by one
- introduces an --inline (or -in) option to collect hosts
or subclouds one by one, i.e. the legacy collect mode
used before parallel collect was introduced.
- adds a check_host_reachable access test to each host
or subcloud to verify access before trying to collect.
- adds collect --continue support for collecting a large
number of subclouds when there is not enough scratch
space to hold them all in one go.
- scales the subcloud collect timeout with the number of subclouds
- shows early initial progress for subcloud collect
- improves the speed of subcloud name verification for a large
number of listed subclouds
- improves scratch space management on final tar creation
Test Plan:
Orchestrated subcloud(s) collect ; parallel and inline:
PASS: Verify single subcloud collect
PASS: Verify listed subcloud collect ; one and several
PASS: Verify named subcloud collect (-sc -a -p -n <name>)
PASS: Verify all subcloud collect in parallel
PASS: Verify subcloud collect continue option handling
Active controller host(s) collect ; parallel and inline:
PASS: Verify single host collect
PASS: Verify listed host collect ; one and several
PASS: Verify named collect
PASS: Verify all hosts collect
Misc New Features:
PASS: Verify new defaulted 1 month dated collect
PASS: Verify new --file option for subcloud collect
PASS: Verify collect --clean for local and remote hosts
and subclouds
PASS: Verify collect tar cleanup on hosts and subclouds
following collect
PASS: Verify parallel collect early progress with .'s
PASS: Verify subcloud collect continue warning message
Failure Cases:
PASS: Verify subcloud collect with failing dcmanager process
PASS: Verify subcloud collect with no provisioned subclouds
PASS: Verify fault handling surrounding use of new --file option
PASS: Verify partial collect after one or more subcloud collect
errors or timeouts
PASS: Verify subcloud collect is only accepted on a system controller
PASS: Verify handling of unreachable host or subcloud
PASS: Verify handling of host or subcloud that reboots during collect
PASS: Verify collect of subcloud with a lot of /var/log
PASS: Verify collect handling when remote host or subcloud
runs out of space
PASS: Verify subcloud collect handling when system controller
runs out of space
PASS: Verify host collect handling when active controller
runs out of space
PASS: Verify all report_error case handling for collect subcloud
PASS: Verify subcloud collect timeout on remote subcloud is
reported as a subcloud timeout
PASS: Verify host or subcloud collect with no valid hosts or
subclouds found or specified
PASS: Verify collect continue option failure handling
Regression:
PASS: Verify host and subcloud specification options (-a -l … , … )
PASS: Verify --all option overrides --list option
PASS: Verify collect drops duplicate or unknown host/subclouds
PASS: Verify host or subcloud collect clean option behavior
PASS: Verify host or subcloud collect reject with -lt 25% free
scratch space
PASS: Verify permission and incorrect password error handling
PASS: Verify collect handling for unresponsive host or subcloud
PASS: Verify subcloud collect clean of unresponsive host or subcloud
PASS: Verify handling of Ctrl-C during collect
PASS: Verify collect logging on all hosts and subclouds
PASS: Verify shellcheck static analysis
PASS: Verify bashate static analysis
Change-Id: Ie76bfc86b1ee5eab83f42b65b643ccdf13ad7580
Story: 2009055
Task: 42836
Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
This commit is contained in:
File diff suppressed because it is too large
Load Diff
@@ -27,6 +27,8 @@ FAIL_TIMEOUT7=17
|
||||
FAIL_TIMEOUT8=18
|
||||
FAIL_TIMEOUT9=19
|
||||
|
||||
FAIL_SUBCLOUD_TIMEOUT=20
|
||||
|
||||
FAIL_PASSWORD=30
|
||||
FAIL_PERMISSION=31
|
||||
FAIL_CLEANUP=32
|
||||
@@ -39,10 +41,29 @@ FAIL_INSUFFICIENT_SPACE=38
|
||||
FAIL_INTERNAL=39
|
||||
FAIL_NO_TARDIR=40
|
||||
FAIL_NO_TARBALLS=41
|
||||
FAIL_NO_FILE_SPECIFIED=42
|
||||
FAIL_FILE_NOT_FOUND=43
|
||||
FAIL_FILE_EMPTY=44
|
||||
FAIL_PASSWORD_PROMPT=45
|
||||
FAIL_MISSING_PARAMETER=46
|
||||
FAIL_DATE_FORMAT=47
|
||||
FAIL_NO_HOSTS=48
|
||||
FAIL_FILE_COPY=49
|
||||
FAIL_SUBCLOUD=50
|
||||
FAIL_CONTINUE=51
|
||||
FAIL_SUBCLOUDNAME=52
|
||||
FAIL_NO_SUBCLOUDS=53
|
||||
FAIL_NOT_SYSTEMCONTROLLER=54
|
||||
|
||||
|
||||
# Warnings are above 200
|
||||
WARN_WARNING=200
|
||||
WARN_HOSTNAME=201
|
||||
WARN_SUBCLOUD=202
|
||||
|
||||
COLLECT_ERROR="Error:"
|
||||
COLLECT_DEBUG="Debug:"
|
||||
COLLECT_WARN="Warning:"
|
||||
|
||||
# Failure Strings
|
||||
FAIL_NOT_ENOUGH_SPACE_STR="Not enough /scratch filesystem space"
|
||||
@@ -51,12 +72,38 @@ FAIL_TAR_OUT_OF_SPACE_STR="tar: Error is not recoverable"
|
||||
FAIL_INSUFFICIENT_SPACE_STR="Not enough space on device"
|
||||
FAIL_UNREACHABLE_STR="Unreachable"
|
||||
|
||||
FAIL_TIMEOUT_STR="operation timeout"
|
||||
FAIL_SUBCLOUD_TIMEOUT_STR="subcloud collect timeout"
|
||||
|
||||
FAIL_NO_FILE_SPECIFIED_STR="no file specified"
|
||||
FAIL_FILE_NOT_FOUND_STR="no such file or directory"
|
||||
FAIL_FILE_EMPTY_STR="file is empty"
|
||||
FAIL_PASSWORD_PROMPT_STR="password for"
|
||||
|
||||
FAIL_DATE_FORMAT_STR="date format"
|
||||
FAIL_INACTIVE_STR="not active"
|
||||
FAIL_NO_HOSTS_STR="empty host list"
|
||||
FAIL_NO_SUBCLOUDS_STR="empty subcloud list"
|
||||
FAIL_MISSING_PARAMETER_STR="missing parameter"
|
||||
FAIL_FILE_COPY_STR="failed to copy"
|
||||
FAIL_CONTINUE_STR="cannot continue"
|
||||
|
||||
# The minimum amount of % free space on /scratch to allow collect to proceed
|
||||
MIN_PERCENT_SPACE_REQUIRED=75
|
||||
|
||||
# Subcloud collect stops when available scratch space drops below this threshold.
|
||||
# Use collect -sc --continue to tell collect to continue collecting subclouds
|
||||
# from where it left off.
|
||||
# 2 GiB in K blocks, rounded up
|
||||
declare -i COLLECT_BASE_DIR_FULL_THRESHOLD=2147484 # 2Gib in K blocks rounded up
|
||||
|
||||
# Log file path/names
|
||||
COLLECT_LOG=/var/log/collect.log
|
||||
COLLECT_ERROR_LOG=/tmp/collect_error.log
|
||||
HOST_COLLECT_ERROR_LOG="/tmp/host_collect_error.log"
|
||||
|
||||
DCROLE_SYSTEMCONTROLLER="systemcontroller"
|
||||
DCROLE_SUBCLOUD="subcloud"
|
||||
|
||||
function source_openrc_if_needed
|
||||
{
|
||||
@@ -79,7 +126,7 @@ function source_openrc_if_needed
|
||||
OPENRC="/etc/platform/openrc"
|
||||
if [ -e "${OPENRC}" ] ; then
|
||||
OS_PASSWORD=""
|
||||
source ${OPENRC}
|
||||
source ${OPENRC} 2>/dev/null 1>/dev/null
|
||||
if [ "${OS_PASSWORD}" != "" ] ; then
|
||||
ACTIVE=true
|
||||
fi
|
||||
@@ -99,6 +146,7 @@ cmd_done_file="/usr/local/sbin/expect_done"
|
||||
TAR_ZIP_CMD="tar -cvzf"
|
||||
TAR_UZIP_CMD="tar -xvzf"
|
||||
TAR_CMD="tar -cvhf"
|
||||
TAR_CMD_APPEND="tar -rvhf"
|
||||
UNTAR_CMD="tar -xvf"
|
||||
ZIP_CMD="gzip"
|
||||
NICE_CMD="/usr/bin/nice -n19"
|
||||
@@ -128,14 +176,14 @@ function ilog
|
||||
|
||||
function elog
|
||||
{
|
||||
echo "Error: $@"
|
||||
logger -t ${COLLECT_TAG} "Error: $@"
|
||||
echo "${COLLECT_ERROR} $@"
|
||||
logger -t ${COLLECT_TAG} "${COLLECT_ERROR} $@"
|
||||
}
|
||||
|
||||
function wlog
|
||||
{
|
||||
echo "Warning: $@"
|
||||
logger -t ${COLLECT_TAG} "Warning: $@"
|
||||
echo "${COLLECT_WARN} $@"
|
||||
logger -t ${COLLECT_TAG} "${COLLECT_WARN} $@"
|
||||
}
|
||||
|
||||
function set_debug_mode()
|
||||
@@ -146,8 +194,8 @@ function set_debug_mode()
|
||||
function dlog()
|
||||
{
|
||||
if [ "$DEBUG" == true ] ; then
|
||||
logger -t ${COLLECT_TAG} "Debug: $@"
|
||||
echo "$(date) Debug: $@"
|
||||
logger -t ${COLLECT_TAG} "${COLLECT_DEBUG} $@"
|
||||
echo "$(date) ${COLLECT_DEBUG} $@"
|
||||
fi
|
||||
}
|
||||
|
||||
@@ -214,8 +262,7 @@ function collect_errors()
|
||||
|
||||
## now loop through known space related error strings
|
||||
index=0
|
||||
while [ "x${listOfOutOfSpaceErrors[index]}" != "x" ]
|
||||
do
|
||||
while [ "x${listOfOutOfSpaceErrors[index]}" != "x" ] ; do
|
||||
grep -q "${listOfOutOfSpaceErrors[index]}" ${COLLECT_ERROR_LOG}
|
||||
if [ "$?" == "0" ] ; then
|
||||
|
||||
|
||||
Reference in New Issue
Block a user