
This change adds arbitrary parameters, to reduce or avoid the need to patchback when new features are added. The snapshot parameter was removed to use this new change with -o snapshot=true. This commit also add a sanitize function to allow only letters, numbers, space, - and _. Usage: software deploy start/precheck <major release> -o snapshot=true --options test=test Test Plan: PASS: Build-pkgs, Build-image, Install/Bootstrap/Unlock PASS: Check precheck script receives the parameters (by injecting logs manually) PASS: Check deploy start receives the parameters by injecting manual logs. PASS: Check software.json stores the parameters under options field. PASS: Deploy start blocked with reserved key in options. PASS: Check activate-rollback is using deploy options for snapshot. PASS: Major release deployment stx10 -> stx11 in sx. Story: 2011357 Task: 52264 Change-Id: I53b8863ceeec74c45831d959fe6cea00990cc156 Signed-off-by: Luis Eduardo Bonatti <luizeduardo.bonatti@windriver.com>
510 lines
21 KiB
Python
510 lines
21 KiB
Python
#!/usr/bin/python3
|
|
# -*- encoding: utf-8 -*-
|
|
#
|
|
# vim: tabstop=4 shiftwidth=4 softtabstop=4
|
|
#
|
|
# Copyright (c) 2023-2024 Wind River Systems, Inc.
|
|
#
|
|
# SPDX-License-Identifier: Apache-2.0
|
|
#
|
|
|
|
"""
|
|
Run platform upgrade deploy precheck as a standalone executable
|
|
"""
|
|
|
|
import argparse
|
|
import base64
|
|
import json
|
|
import logging
|
|
import re
|
|
import requests
|
|
import subprocess
|
|
import sys
|
|
import tempfile
|
|
|
|
from lxml import etree as ElementTree
|
|
from tsconfig.tsconfig import SW_VERSION
|
|
|
|
import upgrade_utils
|
|
|
|
|
|
# TODO(heitormatsui) keep updated for every release
|
|
SUPPORTED_K8S_VERSIONS = [
|
|
"v1.24.4",
|
|
"v1.25.3",
|
|
"v1.26.1",
|
|
"v1.27.5",
|
|
"v1.28.4",
|
|
"v1.29.2",
|
|
]
|
|
|
|
RC_SUCCESS = 0
|
|
RC_UNHEALTHY = 3
|
|
STATE_AVAILABLE = 'available'
|
|
STATE_DEPLOYED = 'deployed'
|
|
SYSTEM_MODE_SIMPLEX = "simplex"
|
|
# the space needed is derived from the sum of snapshot sizes
|
|
# defined in lvm_snapshot.LvmSnapshotManager.LOGICAL_VOLUMES
|
|
FREE_SPACE_NEEDED_LVM_SNAPSHOTS_GIB = 24
|
|
|
|
LOG = logging.getLogger('main_logger')
|
|
|
|
class HealthCheck(object):
|
|
"""This class represents a general health check object
|
|
that uses sysinv-client to run system health checks"""
|
|
|
|
SUCCESS_MSG = 'OK'
|
|
FAIL_MSG = 'Fail'
|
|
|
|
def __init__(self, config):
|
|
self._config = config
|
|
|
|
# get target release from script directory location
|
|
self._target_release = re.match("^.*/rel-(\d\d.\d\d.\d+)/", __file__).group(1)
|
|
self._major_release = self._target_release.rsplit(".", 1)[0]
|
|
|
|
# get sysinv token, endpoint and client
|
|
self._sysinv_token, self._sysinv_endpoint = \
|
|
upgrade_utils.get_token_endpoint(config, service_type="platform")
|
|
self._sysinv_client = upgrade_utils.get_sysinv_client(self._sysinv_token,
|
|
self._sysinv_endpoint)
|
|
|
|
def _check_license(self, version):
|
|
"""
|
|
Validates the current license is valid for the specified version
|
|
:param version: version to be checked against installed license
|
|
:return: True is license is valid for version, False otherwise
|
|
"""
|
|
license_dict = self._sysinv_client.license.show()
|
|
if license_dict["error"]:
|
|
return False
|
|
|
|
# create temp file with license content to run verify-license binary against it
|
|
with tempfile.NamedTemporaryFile(mode="w", delete=True) as license_file:
|
|
try:
|
|
license_file.write(license_dict["content"])
|
|
license_file.flush()
|
|
subprocess.check_call(["/usr/bin/verify-license", # pylint: disable=not-callable
|
|
license_file.name,
|
|
version],
|
|
stdout=subprocess.DEVNULL,
|
|
stderr=subprocess.STDOUT)
|
|
except subprocess.CalledProcessError:
|
|
return False
|
|
return True
|
|
|
|
# TODO(heitormatsui): implement patch precheck targeted against USM
|
|
# and implement patch precheck for subcloud
|
|
def _check_required_patches_state(self, required_patches, patch_health_check=False):
|
|
"""
|
|
Check if the required patches are in 'deployed' state, if patch_health_check is
|
|
True, the required_patches can be in 'available' state as well.
|
|
:param required_patches: list of patches to be checked
|
|
:param patch_health_check: boolean if is a patch or upgrage health check
|
|
:return: boolean indicating success/failure and list of patches
|
|
that are not in the 'deployed' or 'available' state
|
|
"""
|
|
success = True
|
|
releases = self._config.get("releases", "")
|
|
releases_in_allowed_states = []
|
|
preinstalled_patches = []
|
|
|
|
for release in json.loads(releases):
|
|
if release['state'] == STATE_DEPLOYED:
|
|
releases_in_allowed_states.append(release)
|
|
|
|
if release['prepatched_iso']:
|
|
preinstalled_patches = release.get('preinstalled_patches', [])
|
|
|
|
# TODO(lvieira): remove this after 24.09 major release
|
|
# treat special cases for prepatched already released
|
|
# (they do not have preinstalled_patches list)
|
|
if not preinstalled_patches:
|
|
component = release['release_id'].split('-')[0]
|
|
already_released_prepatched_isos = {
|
|
"24.09.100": ["24.09.0"],
|
|
"24.09.200": ["24.09.100", "24.09.0"],
|
|
"24.09.1": ["24.09.0"],
|
|
"24.09.2": ["24.09.1", "24.09.0"],
|
|
"24.09.3": ["24.09.2", "24.09.1", "24.09.0"]
|
|
}
|
|
released = already_released_prepatched_isos.get(release['sw_version'], [])
|
|
preinstalled_patches.extend([f"{component}-{release}" for release in released])
|
|
|
|
if patch_health_check and release['state'] == STATE_AVAILABLE:
|
|
releases_in_allowed_states.append(release)
|
|
|
|
allowed_patches = [release['release_id'] for release in releases_in_allowed_states]
|
|
allowed_patches.extend(preinstalled_patches)
|
|
missing_patches = list(set(required_patches) - set(allowed_patches))
|
|
|
|
if missing_patches:
|
|
success = False
|
|
|
|
return success, missing_patches
|
|
|
|
def run_general_health_check(self):
|
|
"""Run general health check using sysinv client"""
|
|
force = self._config.get("force", False)
|
|
health_ok = success = True
|
|
|
|
alarm_ignore_list = ["280.001", # subcloud offline
|
|
"280.002", # subcloud resource out-of-sync
|
|
"280.003", # subcloud backup failure
|
|
"280.004", # subcloud peer group in disconnected state
|
|
"280.005", # subcloud peer group managed with lower priority
|
|
"900.201", # Software upgrade auto apply in progress
|
|
]
|
|
api_cmd = self._sysinv_endpoint + "/health/kube-upgrade"
|
|
|
|
if force:
|
|
api_cmd += '/relaxed'
|
|
|
|
if alarm_ignore_list:
|
|
api_cmd += f'?alarm_ignore_list={alarm_ignore_list}'
|
|
|
|
method = 'GET'
|
|
output = upgrade_utils.call_api(self._sysinv_token, method, api_cmd)
|
|
|
|
# check hosts are patch current
|
|
deploy_in_progress = self._config.get("deploy_in_progress", "{}")
|
|
deploy_in_progress = json.loads(deploy_in_progress)
|
|
if deploy_in_progress:
|
|
success = False
|
|
from_load = deploy_in_progress["from_release"]
|
|
to_load = deploy_in_progress["to_release"]
|
|
output += ('All hosts are patch current: [%s]\n') \
|
|
% (HealthCheck.FAIL_MSG)
|
|
output += ('Deployment in progress: %s to %s\n' % (from_load, to_load))
|
|
else:
|
|
success = True
|
|
output += ('All hosts are patch current: [%s]\n') \
|
|
% (HealthCheck.SUCCESS_MSG)
|
|
health_ok = health_ok and success
|
|
|
|
if HealthCheck.FAIL_MSG in output:
|
|
success = False
|
|
health_ok = health_ok and success
|
|
|
|
return health_ok, output
|
|
|
|
|
|
class UpgradeHealthCheck(HealthCheck):
|
|
"""This class represents an upgrade-specific health check object
|
|
that verifies if system is in a valid state for upgrade"""
|
|
|
|
# TODO(heitormatsui): switch from using upgrade metadata xml to
|
|
# the new USM metadata format
|
|
def _check_valid_upgrade_path(self):
|
|
"""Checks if active release to specified release is a valid upgrade path"""
|
|
# Get active release
|
|
isystem = self._sysinv_client.isystem.list()[0]
|
|
active_release = isystem.software_version
|
|
|
|
# supported_release is a dict with {release: required_patch}
|
|
supported_releases = dict()
|
|
|
|
# Parse upgrade metadata file for supported upgrade paths
|
|
root = ElementTree.parse("/var/www/pages/feed/rel-%s/upgrades/metadata.xml" % self._major_release)
|
|
upgrade_root = root.find("supported_upgrades").findall("upgrade")
|
|
for upgrade in upgrade_root:
|
|
version = upgrade.find("version")
|
|
required_patch = upgrade.find("required_patch")
|
|
supported_releases.update({version.text: [required_patch.text] if
|
|
required_patch is not None else []})
|
|
success = active_release in supported_releases
|
|
return success, active_release, supported_releases.get(active_release, [])
|
|
|
|
# TODO(heitormatsui) do we need this check on USM? Remove if we don't
|
|
def _check_active_is_controller_0(self):
|
|
"""Checks that active controller is controller-0"""
|
|
controllers = self._sysinv_client.ihost.list()
|
|
for controller in controllers:
|
|
if controller.hostname == "controller-0" and \
|
|
"Controller-Active" in controller.capabilities["Personality"]:
|
|
return True
|
|
return False
|
|
|
|
def _check_kube_version(self, supported_versions):
|
|
"""
|
|
Check if active k8s version is in a list of supported versions
|
|
:param supported_versions: list of supported k8s versions
|
|
:return: boolean indicating success/failure and active k8s version
|
|
"""
|
|
kube_versions = self._sysinv_client.kube_version.list()
|
|
active_version = None
|
|
for kv in kube_versions:
|
|
if kv.state == "active":
|
|
active_version = kv.version
|
|
break
|
|
success = active_version in supported_versions
|
|
return success, active_version
|
|
|
|
@staticmethod
|
|
def _check_system_simplex():
|
|
system_mode = upgrade_utils.get_platform_conf("system_mode")
|
|
return system_mode == SYSTEM_MODE_SIMPLEX
|
|
|
|
@staticmethod
|
|
def _check_free_space_for_snapshots():
|
|
success = True
|
|
available_gib = upgrade_utils.get_available_gib_in_vg()
|
|
if available_gib < FREE_SPACE_NEEDED_LVM_SNAPSHOTS_GIB:
|
|
success = False
|
|
msg = (("Insufficient free space on cgts-vg to create snapshot\n"
|
|
" Current free space is %.2fGiB but %.2fGiB is needed")
|
|
% (available_gib, FREE_SPACE_NEEDED_LVM_SNAPSHOTS_GIB))
|
|
return success, msg
|
|
return success, None
|
|
|
|
def _check_snapshot_option(self):
|
|
options = json.loads(self._config.get("options", {}))
|
|
snapshot = options.get("snapshot", False)
|
|
return upgrade_utils.to_bool(snapshot)
|
|
|
|
def run_health_check(self):
|
|
"""Run specific upgrade health checks"""
|
|
|
|
health_ok = True
|
|
output = ""
|
|
|
|
# check if k8s version is valid
|
|
success, active_version = self._check_kube_version(SUPPORTED_K8S_VERSIONS)
|
|
output += 'Active kubernetes version [%s] is a valid supported version: [%s]\n' \
|
|
% (active_version, HealthCheck.SUCCESS_MSG if success else HealthCheck.FAIL_MSG)
|
|
|
|
if not active_version:
|
|
output += ('-> Failed to get version info. Upgrade kubernetes to one of the '
|
|
'supported versions [%s] and ensure that the kubernetes version '
|
|
'information is available in the kubeadm configmap.\n'
|
|
'See "system kube-version-list"\n' % ", ".join(SUPPORTED_K8S_VERSIONS))
|
|
elif not success:
|
|
output += ('-> Upgrade active kubernetes version [%s] to one of the '
|
|
'supported versions [%s]. See "system kube-version-list"\n' %
|
|
(active_version, ", ".join(SUPPORTED_K8S_VERSIONS)))
|
|
health_ok = health_ok and success
|
|
|
|
# TODO(heitormatsui) Do we need the following check on USM?
|
|
# The load is only imported to controller-0. An upgrade can only
|
|
# be started when controller-0 is active.
|
|
is_controller_0 = self._check_active_is_controller_0()
|
|
success = is_controller_0
|
|
output += \
|
|
'Active controller is controller-0: [%s]\n' \
|
|
% (HealthCheck.SUCCESS_MSG if success else HealthCheck.FAIL_MSG)
|
|
health_ok = health_ok and success
|
|
|
|
# check installed license
|
|
# NOTE(nicodemos): We just need to check the license for major release
|
|
success = self._check_license(self._major_release)
|
|
output += 'Installed license is valid: [%s]\n' \
|
|
% (HealthCheck.SUCCESS_MSG if success else HealthCheck.FAIL_MSG)
|
|
health_ok = health_ok and success
|
|
|
|
# TODO(mdecastr) Plan is to add support to ECC key, this verification need to
|
|
# be re evaluated in future releases if the support is implemented.
|
|
# Check if system-local-ca's private key is RSA
|
|
if upgrade_utils.get_distributed_cloud_role() == 'subcloud':
|
|
# system-local-ca in subclouds either match the systemcontroller's,
|
|
# or it will be changed to match in upgrade activation
|
|
LOG.info("Checking system-local-ca's private key is not required for subclouds.")
|
|
success = True
|
|
else:
|
|
success = self._check_local_issuer_rsa_private_key()
|
|
|
|
if not success:
|
|
LOG.error("system-local-ca's private key is either not RSA or not valid.")
|
|
output += 'Platform Issuer: [%s]\n' % HealthCheck.FAIL_MSG
|
|
output += ('-> Platform Issuer (system-local-ca) TLS private key is not valid. '
|
|
'Only RSA keys are supported.\n'
|
|
' Please perform the \'Update system-local-ca or Migrate Platform '
|
|
'Certificates to use Cert Manager\'\n'
|
|
' procedure to update the Platform Issuer, providing a valid RSA '
|
|
'cert/key to be used by the issuer.\n')
|
|
else:
|
|
LOG.info("system-local-ca has a valid private key.")
|
|
health_ok = health_ok and success
|
|
|
|
# check if it is a valid upgrade path
|
|
success, active_release, required_patches = self._check_valid_upgrade_path()
|
|
output += 'Valid upgrade path from release %s to %s: [%s]\n' \
|
|
% (active_release, self._major_release,
|
|
HealthCheck.SUCCESS_MSG if success else HealthCheck.FAIL_MSG)
|
|
health_ok = health_ok and success
|
|
|
|
# check if required patches are deployed
|
|
success, missing_patches = self._check_required_patches_state(required_patches)
|
|
output += 'Required patches are applied: [%s]\n' \
|
|
% (HealthCheck.SUCCESS_MSG if success else HealthCheck.FAIL_MSG)
|
|
if not success:
|
|
output += '-> Patches not applied: [%s]\n' \
|
|
% ', '.join(missing_patches)
|
|
health_ok = health_ok and success
|
|
|
|
# check for LVM snapshot health checks
|
|
use_lvm_snapshot = self._check_snapshot_option()
|
|
if use_lvm_snapshot:
|
|
success = self._check_system_simplex()
|
|
output += '(LVM snapshots) System is AIO-SX: [%s]\n' \
|
|
% (HealthCheck.SUCCESS_MSG if success else HealthCheck.FAIL_MSG)
|
|
health_ok = health_ok and success
|
|
|
|
success, msg = self._check_free_space_for_snapshots()
|
|
output += '(LVM snapshots) Disk space available: [%s]\n' \
|
|
% (HealthCheck.SUCCESS_MSG if success else HealthCheck.FAIL_MSG)
|
|
if not success:
|
|
output += "-> %s" % msg
|
|
health_ok = health_ok and success
|
|
|
|
return health_ok, output
|
|
|
|
def _check_required_patches(self, required_patches):
|
|
"""
|
|
Check if required patches are applied using the patching API
|
|
:return: tuple (success, missing_patches)
|
|
"""
|
|
try:
|
|
patch_token, patch_endpoint = upgrade_utils.get_token_endpoint(
|
|
self._config, service_type="patching")
|
|
patch_endpoint += "/v1/query/"
|
|
response = requests.get(patch_endpoint, headers={
|
|
"X-Auth-Token": patch_token}, timeout=10)
|
|
except Exception as e:
|
|
return False, "Failed to connect to patching API: %s" % e
|
|
|
|
query_patches = response.json()['pd']
|
|
applied_patches = []
|
|
for patch_key, patch in query_patches.items():
|
|
if patch.get('patchstate') in {'Applied', 'Committed'}:
|
|
applied_patches.append(patch_key)
|
|
|
|
missing_patches = [patch for patch in required_patches if patch not in applied_patches]
|
|
success = not missing_patches
|
|
|
|
return success, missing_patches
|
|
|
|
def _check_local_issuer_rsa_private_key(self):
|
|
secret = upgrade_utils.get_secret_data_yaml('system-local-ca', 'cert-manager')
|
|
if secret is None or secret == '':
|
|
LOG.error("Error while retrieving system-local-ca's secret data.")
|
|
return False
|
|
|
|
key_b64 = secret.get('data', {}).get('tls.key', None)
|
|
if key_b64 is None:
|
|
LOG.error("Could not retrieve system-local-ca private key.")
|
|
return False
|
|
return upgrade_utils.is_tls_key_rsa(base64.b64decode(key_b64).decode('utf-8'))
|
|
|
|
|
|
class PatchHealthCheck(HealthCheck):
|
|
"""This class represents a patch-specific health check object
|
|
that verifies if system is in valid state to apply a patch"""
|
|
|
|
def _get_required_patches(self):
|
|
"""Get required patches for a target release"""
|
|
releases = self._config.get("releases")
|
|
required_patches = []
|
|
for release in json.loads(releases):
|
|
if release["sw_version"] == self._target_release:
|
|
required_patches.extend(release["requires"])
|
|
break
|
|
|
|
return required_patches
|
|
|
|
def run_health_check(self):
|
|
"""Run specific patch health checks"""
|
|
health_ok = True
|
|
output = ""
|
|
|
|
# check required patches for target release
|
|
required_patches = self._get_required_patches()
|
|
success, missing_patches = self._check_required_patches_state(required_patches, True)
|
|
output += 'Required patches are deployed or available: [%s]\n' \
|
|
% (HealthCheck.SUCCESS_MSG if success else HealthCheck.FAIL_MSG)
|
|
if not success:
|
|
output += '-> Patches not deployed or available: [%s]\n' \
|
|
% ', '.join(missing_patches)
|
|
health_ok = health_ok and success
|
|
|
|
return health_ok, output
|
|
|
|
|
|
def parse_config(args=None):
|
|
"""Parse the parameters passed to the script"""
|
|
parser = argparse.ArgumentParser(description="Run health checks to verify if the system "
|
|
"meets the requirements to deploy a specific "
|
|
"release.")
|
|
parser.add_argument("--auth_url",
|
|
help="Authentication URL",
|
|
required=True)
|
|
parser.add_argument("--username",
|
|
help="Username",
|
|
required=True)
|
|
parser.add_argument("--password",
|
|
help="Password",
|
|
required=True)
|
|
parser.add_argument("--project_name",
|
|
help="Project Name",
|
|
required=True)
|
|
parser.add_argument("--user_domain_name",
|
|
help="User Domain Name",
|
|
required=True)
|
|
parser.add_argument("--project_domain_name",
|
|
help="Project Domain Name",
|
|
required=True)
|
|
parser.add_argument("--region_name",
|
|
help="Region Name",
|
|
default="RegionOne")
|
|
parser.add_argument("--force",
|
|
help="Ignore non-critical health checks",
|
|
action="store_true")
|
|
parser.add_argument("--patch",
|
|
help="Set precheck to run against a patch release",
|
|
action="store_true")
|
|
parser.add_argument("--options",
|
|
help="Additional parameters in key=value format.")
|
|
parser.add_argument("--releases",
|
|
help="Releases",
|
|
default="[]")
|
|
parser.add_argument("--deploy_in_progress",
|
|
help="check hosts are patch current",
|
|
default="{}")
|
|
|
|
# if args was not passed will use sys.argv by default
|
|
parsed_args = parser.parse_args(args)
|
|
return vars(parsed_args)
|
|
|
|
|
|
def main(argv=None):
|
|
config = parse_config(argv)
|
|
patch_release = config.get("patch", False)
|
|
|
|
health_ok = True
|
|
output = ""
|
|
|
|
if patch_release:
|
|
health_check = PatchHealthCheck(config)
|
|
else:
|
|
health_check = UpgradeHealthCheck(config)
|
|
|
|
# execute general health check
|
|
general_health_ok, general_output = health_check.run_general_health_check()
|
|
# execute release-specific health check
|
|
specific_health_ok, specific_output = health_check.run_health_check()
|
|
|
|
# combine health check results removing extra line breaks/blank spaces from the output
|
|
health_ok = general_health_ok and specific_health_ok
|
|
output = general_output.strip() + "\n" + specific_output.strip()
|
|
|
|
# print health check output and exit
|
|
print(output)
|
|
if health_ok:
|
|
return RC_SUCCESS
|
|
else:
|
|
return RC_UNHEALTHY
|
|
|
|
|
|
if __name__ == "__main__":
|
|
upgrade_utils.configure_logging("/var/log/software.log", log_level=logging.INFO)
|
|
sys.exit(main())
|