update/software/scripts/deploy-precheck

#!/usr/bin/python3
# -*- encoding: utf-8 -*-
#
# vim: tabstop=4 shiftwidth=4 softtabstop=4
#
# Copyright (c) 2023-2025 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#

"""
Run platform upgrade deploy precheck as a standalone executable
"""

import argparse
import base64
import json
import logging
import os
import re
import requests
import subprocess
import sys
import tempfile

from lxml import etree as ElementTree

import upgrade_utils


# return codes of main(): RC_SUCCESS when every health check passes,
# RC_UNHEALTHY when at least one of them fails
RC_SUCCESS = 0
RC_UNHEALTHY = 3
# values compared against the 'state' field of the release entries
STATE_AVAILABLE = 'available'
STATE_DEPLOYED = 'deployed'
STATE_UNAVAILABLE = 'unavailable'
# value of "system_mode" in platform.conf that the LVM snapshot check expects
SYSTEM_MODE_SIMPLEX = "simplex"
# the space needed is derived from the sum of snapshot sizes
# defined in lvm_snapshot.LvmSnapshotManager.LOGICAL_VOLUMES
FREE_SPACE_NEEDED_LVM_SNAPSHOTS_GIB = 24
# flag file whose absence is interpreted as a pre-bootstrap scenario
INITIAL_CONFIG_COMPLETE_FLAG = "/etc/platform/.initial_config_complete"

# logger shared by the health check classes in this script
LOG = logging.getLogger('main_logger')

class HealthCheck(object):
    """This class represents a general health check object
    that uses sysinv-client to run system health checks"""

    SUCCESS_MSG = 'OK'
    FAIL_MSG = 'Fail'

    def __init__(self, config):
        """
        Store the configuration and derive the target release from the
        location of this script; unless running pre-bootstrap, also
        acquire the sysinv token, endpoint and client.
        :param config: dict with the parameters parsed from the command line
        :raises ValueError: if the script is not located under a
                            'rel-<MM.mm.p>' directory
        """
        self._config = config

        # get target release from script directory location
        # (raw string so that '\d' is a regex class, dots are matched literally)
        release_match = re.match(r"^.*/rel-(\d\d\.\d\d\.\d+)/", __file__)
        if release_match is None:
            raise ValueError("Unable to determine target release from "
                             "script location: %s" % __file__)
        self._target_release = release_match.group(1)
        # drop the last version component, e.g. "24.09.300" -> "24.09"
        self._major_release = self._target_release.rsplit(".", 1)[0]
        if not self._config.get("pre_bootstrap"):
            # get sysinv token, endpoint and client
            self._sysinv_token, self._sysinv_endpoint = \
                upgrade_utils.get_token_endpoint(config, service_type="platform")
            self._sysinv_client = upgrade_utils.get_sysinv_client(self._sysinv_token,
                                                                  self._sysinv_endpoint)

    def _check_license(self, version):
        """
        Validates the current license is valid for the specified version
        :param version: version to be checked against installed license
        :return: True if license is valid for version, False otherwise
        """
        license_dict = self._sysinv_client.license.show()
        if license_dict["error"]:
            return False

        # create temp file with license content to run verify-license binary against it
        with tempfile.NamedTemporaryFile(mode="w", delete=True) as license_file:
            try:
                license_file.write(license_dict["content"])
                # make sure the content is on disk before the binary reads it
                license_file.flush()
                subprocess.check_call(["/usr/bin/verify-license",  # pylint: disable=not-callable
                                       license_file.name,
                                       version],
                                      stdout=subprocess.DEVNULL,
                                      stderr=subprocess.STDOUT)
            except subprocess.CalledProcessError:
                # non-zero exit code from verify-license
                return False
        return True

    # TODO(heitormatsui): implement patch precheck targeted against USM
    #  and implement patch precheck for subcloud
    def _check_required_patches_state(self, required_patches, allow_available=False):
        """
        Check if the required patches are in 'deployed' or 'unavailable' state,
        if allow_available is True, the required_patches can be in 'available' state as well.
        :param required_patches: list of patches to be checked
        :param allow_available: boolean if allow available patches
        :return: boolean indicating success/failure and list of patches
                 (without duplicates, in the order they were requested)
                 that are not in the 'deployed', 'unavailable' or 'available' state
        """
        # a missing/empty "releases" entry means no release is known;
        # json.loads() would reject an empty string, so fall back to "[]"
        releases = json.loads(self._config.get("releases") or "[]")

        allowed_patches = set()
        for release in releases:
            state = release['state']
            if state == STATE_DEPLOYED or \
                    state == STATE_UNAVAILABLE or \
                    (allow_available and state == STATE_AVAILABLE):
                allowed_patches.add(release['release_id'])

        # dict.fromkeys() removes duplicates while keeping the requested
        # order, so the report is the same on every run
        missing_patches = [patch for patch in dict.fromkeys(required_patches)
                           if patch not in allowed_patches]

        return not missing_patches, missing_patches

    def run_general_health_check(self):
        """
        Run general health check using sysinv client
        :return: boolean indicating if the system is healthy and
                 the text report of the checks
        """
        force = self._config.get("force", False)

        alarm_ignore_list = ["280.001", # subcloud offline
                             "280.002", # subcloud resource out-of-sync
                             "280.003", # subcloud backup failure
                             "280.004", # subcloud peer group in disconnected state
                             "280.005", # subcloud peer group managed with lower priority
                             "900.201", # Software upgrade auto apply in progress
                             ]
        # Don't call api if pre bootstrap scenario
        if not self._config.get("pre_bootstrap"):
            api_cmd = self._sysinv_endpoint + "/health/kube-upgrade"

            if force:
                api_cmd += '/relaxed'

            if alarm_ignore_list:
                api_cmd += f'?alarm_ignore_list={alarm_ignore_list}'

            output = upgrade_utils.call_api(self._sysinv_token, 'GET', api_cmd)
        else:
            output = 'System Health:\n'

        # check hosts are patch current
        deploy_in_progress = json.loads(self._config.get("deploy_in_progress", "{}"))
        if deploy_in_progress:
            from_load = deploy_in_progress["from_release"]
            to_load = deploy_in_progress["to_release"]
            output += 'All hosts are patch current: [%s]\n' % HealthCheck.FAIL_MSG
            output += 'Deployment in progress: %s to %s\n' % (from_load, to_load)
        else:
            output += 'All hosts are patch current: [%s]\n' % HealthCheck.SUCCESS_MSG

        # unhealthy if a deployment is in progress or if any line of the
        # report (including the API output) carries the failure marker
        health_ok = not deploy_in_progress and HealthCheck.FAIL_MSG not in output

        return health_ok, output


class UpgradeHealthCheck(HealthCheck):
    """This class represents an upgrade-specific health check object
    that verifies if system is in a valid state for upgrade"""

    def __init__(self, config):
        """
        Initialize the base health check and compute the feed directory
        of the target major release.
        :param config: dict with the parameters parsed from the command line
        """
        super().__init__(config)
        self._ostree_feed_path = f"/var/www/pages/feed/rel-{self._major_release}"

    # TODO(heitormatsui): switch from using upgrade metadata xml to
    #  the new USM metadata format
    def _check_valid_upgrade_path(self):
        """
        Checks if active release to specified release is a valid upgrade path
        :return: boolean indicating success/failure,
                 active release,
                 list of patches required to upgrade from the active release
        """
        # Get active release
        isystem = self._sysinv_client.isystem.list()[0]
        active_release = isystem.software_version

        # supported_release is a dict with {release: required_patch}
        supported_releases = dict()

        # Parse upgrade metadata file for supported upgrade paths
        root = ElementTree.parse(f"{self._ostree_feed_path}/upgrades/metadata.xml")
        upgrade_root = root.find("supported_upgrades").findall("upgrade")
        for upgrade in upgrade_root:
            version = upgrade.find("version")
            required_patch = upgrade.find("required_patch")
            # the required_patch element is optional
            supported_releases[version.text] = \
                [required_patch.text] if required_patch is not None else []
        success = active_release in supported_releases
        return success, active_release, supported_releases.get(active_release, [])

    # TODO(heitormatsui) do we need this check on USM? Remove if we don't
    def _check_active_is_controller_0(self):
        """Checks that active controller is controller-0"""
        controllers = self._sysinv_client.ihost.list()
        for controller in controllers:
            if controller.hostname == "controller-0" and \
                    "Controller-Active" in controller.capabilities["Personality"]:
                return True
        return False

    def _get_supported_k8s_versions(self):
        """
        List the kubernetes versions found under /usr/local/kubernetes
        in the ostree repo of the target release feed
        :return: list of versions prefixed with 'v', empty if the
                 ostree command fails
        """
        supported_versions = []
        # TODO(heitormatsui): is this the most appropriate way
        #  to fetch valid k8s versions for a given release?
        cmd = ["ostree", f"--repo={self._ostree_feed_path}/ostree_repo",
               "ls", "starlingx", "/usr/local/kubernetes"]
        try:
            proc = subprocess.run(cmd, check=True, text=True, capture_output=True)
        except subprocess.CalledProcessError as e:
            # keep the best-effort behavior (empty list), but leave a trace
            # of why the list could not be retrieved
            LOG.error("Failed to list kubernetes versions from %s: %s",
                      self._ostree_feed_path, e.stderr)
            return []

        for line in proc.stdout.strip().split("\n"):
            match = re.match(r"^.*/kubernetes/([\d.]+)", line)
            if match:
                supported_versions.append(f"v{match.group(1)}")
        return supported_versions

    def _check_kube_version(self):
        """
        Check if active k8s version is in the list of versions
        supported by the target release
        :return: boolean indicating success/failure,
                 active k8s version (None if no version is active),
                 list of supported k8s versions
        """
        kube_versions = self._sysinv_client.kube_version.list()
        active_version = None
        for kv in kube_versions:
            if kv.state == "active":
                active_version = kv.version
                break
        supported_versions = self._get_supported_k8s_versions()
        success = active_version in supported_versions
        return success, active_version, supported_versions

    @staticmethod
    def _check_system_simplex():
        """Checks that system_mode in platform.conf is simplex"""
        system_mode = upgrade_utils.get_platform_conf("system_mode")
        return system_mode == SYSTEM_MODE_SIMPLEX

    @staticmethod
    def _check_free_space_for_snapshots():
        """
        Check if the volume group has enough free space for the LVM snapshots
        :return: boolean indicating success/failure and
                 an error message (None on success)
        """
        available_gib = upgrade_utils.get_available_gib_in_vg()
        if available_gib < FREE_SPACE_NEEDED_LVM_SNAPSHOTS_GIB:
            msg = (("Insufficient free space on cgts-vg to create snapshot\n"
                    "   Current free space is %.2fGiB but %.2fGiB is needed")
                   % (available_gib, FREE_SPACE_NEEDED_LVM_SNAPSHOTS_GIB))
            return False, msg
        return True, None

    def _check_snapshot_option(self):
        """
        Check if the 'snapshot' option was requested
        :return: result of upgrade_utils.to_bool() on the 'snapshot' value
                 of the JSON-encoded "options" config entry (False when the
                 entry or the key is absent)
        """
        # "--options" has no argparse default, so the entry is None when the
        # argument is not given; json.loads() only accepts str/bytes, so an
        # absent or empty value is treated as an empty JSON object
        raw_options = self._config.get("options") or "{}"
        options = json.loads(raw_options)
        snapshot = options.get("snapshot", False)
        return upgrade_utils.to_bool(snapshot)

    def run_health_check(self):
        """
        Run specific upgrade health checks
        :return: boolean indicating if all checks passed and
                 the text report of the checks
        """

        health_ok = True
        output = ""

        # check if k8s version is valid
        success, active_version, supported_versions = self._check_kube_version()
        output += 'Active kubernetes version [%s] is a valid supported version: [%s]\n' \
                  % (active_version, HealthCheck.SUCCESS_MSG if success else HealthCheck.FAIL_MSG)

        if not active_version:
            output += ('-> Failed to get version info. Upgrade kubernetes to one of the '
                       'supported versions [%s] and ensure that the kubernetes version '
                       'information is available in the kubeadm configmap.\n'
                       'See "system kube-version-list"\n' % ", ".join(supported_versions))
        elif not supported_versions:
            output += ('-> Failed to get supported kubernetes version list from the to-release '
                       f'feed in {self._ostree_feed_path}/ostree_repo\n')
        elif not success:
            output += ('-> Upgrade active kubernetes version [%s] to one of the '
                       'supported versions [%s]. See "system kube-version-list"\n' %
                       (active_version, ", ".join(supported_versions)))
        health_ok = health_ok and success

        # TODO(heitormatsui) Do we need the following check on USM?
        # The load is only imported to controller-0. An upgrade can only
        # be started when controller-0 is active.
        success = self._check_active_is_controller_0()
        output += \
            'Active controller is controller-0: [%s]\n' \
            % (HealthCheck.SUCCESS_MSG if success else HealthCheck.FAIL_MSG)
        health_ok = health_ok and success

        # check installed license
        # NOTE(nicodemos): We just need to check the license for major release
        success = self._check_license(self._major_release)
        output += 'Installed license is valid: [%s]\n' \
                  % (HealthCheck.SUCCESS_MSG if success else HealthCheck.FAIL_MSG)
        health_ok = health_ok and success

        # TODO(mdecastr) Plan is to add support to ECC key, this verification need to
        # be re evaluated in future releases if the support is implemented.
        # Check if system-local-ca's private key is RSA
        if upgrade_utils.get_distributed_cloud_role() == 'subcloud':
            # system-local-ca in subclouds either match the systemcontroller's,
            # or it will be changed to match in upgrade activation
            LOG.info("Checking system-local-ca's private key is not required for subclouds.")
            success = True
        else:
            success = self._check_local_issuer_rsa_private_key()

        if not success:
            LOG.error("system-local-ca's private key is either not RSA or not valid.")
            output += 'Platform Issuer: [%s]\n' % HealthCheck.FAIL_MSG
            output += ('-> Platform Issuer (system-local-ca) TLS private key is not valid. '
                       'Only RSA keys are supported.\n'
                       '   Please perform the \'Update system-local-ca or Migrate Platform '
                       'Certificates to use Cert Manager\'\n'
                       '   procedure to update the Platform Issuer, providing a valid RSA '
                       'cert/key to be used by the issuer.\n')
        else:
            LOG.info("system-local-ca has a valid private key.")
        health_ok = health_ok and success

        # check if it is a valid upgrade path
        success, active_release, required_patches = self._check_valid_upgrade_path()
        output += 'Valid upgrade path from release %s to %s: [%s]\n' \
                  % (active_release, self._major_release,
                     HealthCheck.SUCCESS_MSG if success else HealthCheck.FAIL_MSG)
        health_ok = health_ok and success

        # check if required patches are deployed
        success, missing_patches = self._check_required_patches_state(required_patches)
        output += 'Required patches are applied: [%s]\n' \
                  % (HealthCheck.SUCCESS_MSG if success else HealthCheck.FAIL_MSG)
        if not success:
            output += '-> Patches not applied: [%s]\n' \
                % ', '.join(missing_patches)
        health_ok = health_ok and success

        # check for LVM snapshot health checks
        use_lvm_snapshot = self._check_snapshot_option()
        if use_lvm_snapshot:
            success = self._check_system_simplex()
            output += '(LVM snapshots) System is AIO-SX: [%s]\n' \
                       % (HealthCheck.SUCCESS_MSG if success else HealthCheck.FAIL_MSG)
            health_ok = health_ok and success

            success, msg = self._check_free_space_for_snapshots()
            output += '(LVM snapshots) Disk space available: [%s]\n' \
                       % (HealthCheck.SUCCESS_MSG if success else HealthCheck.FAIL_MSG)
            if not success:
                output += "-> %s" % msg
            health_ok = health_ok and success

        return health_ok, output

    def _check_required_patches(self, required_patches):
        """
        Check if required patches are applied using the patching API
        :param required_patches: list of patches to be checked
        :return: tuple (success, missing_patches); NOTE: when the request to
                 the patching API raises, the second element is an error
                 message string instead of a list
        """
        try:
            patch_token, patch_endpoint = upgrade_utils.get_token_endpoint(
                self._config, service_type="patching")
            patch_endpoint += "/v1/query/"
            response = requests.get(patch_endpoint, headers={
                                    "X-Auth-Token": patch_token}, timeout=10)
        except Exception as e:
            return False, "Failed to connect to patching API: %s" % e

        # 'pd' maps each patch id to its data
        query_patches = response.json()['pd']
        applied_patches = []
        for patch_key, patch in query_patches.items():
            if patch.get('patchstate') in {'Applied', 'Committed'}:
                applied_patches.append(patch_key)

        missing_patches = [patch for patch in required_patches if patch not in applied_patches]
        success = not missing_patches

        return success, missing_patches

    def _check_local_issuer_rsa_private_key(self):
        """
        Check if the private key stored in the system-local-ca secret
        (cert-manager namespace) is an RSA key
        :return: False if the secret or its 'tls.key' entry cannot be
                 retrieved, otherwise the result of upgrade_utils.is_tls_key_rsa()
        """
        secret = upgrade_utils.get_secret_data_yaml('system-local-ca', 'cert-manager')
        if secret is None or secret == '':
            LOG.error("Error while retrieving system-local-ca's secret data.")
            return False

        # the key is stored base64-encoded in the secret data
        key_b64 = secret.get('data', {}).get('tls.key', None)
        if key_b64 is None:
            LOG.error("Could not retrieve system-local-ca private key.")
            return False
        return upgrade_utils.is_tls_key_rsa(base64.b64decode(key_b64).decode('utf-8'))


class PatchHealthCheck(HealthCheck):
    """Health check object specific to patch releases: verifies that
    the system is in a valid state to apply a patch"""

    def _get_required_patches(self):
        """
        Look up the target release in the "releases" config entry
        :return: list with the content of the 'requires' field of the first
                 release whose 'sw_version' equals the target release,
                 empty list if there is no such release
        """
        for entry in json.loads(self._config.get("releases")):
            if entry["sw_version"] == self._target_release:
                return list(entry["requires"])
        return []

    def run_health_check(self):
        """
        Run specific patch health checks
        :return: boolean indicating success/failure and the text report
        """
        # check required patches for target release
        required_patches = self._get_required_patches()

        # 24.09.300 patch requires all previous patches to be deployed
        if self._target_release == "24.09.300":
            allow_available, states_label = False, "deployed or unavailable"
        else:
            allow_available, states_label = True, "deployed, unavailable or available"

        success, missing_patches = self._check_required_patches_state(
            required_patches, allow_available)

        verdict = HealthCheck.SUCCESS_MSG if success else HealthCheck.FAIL_MSG
        report = ['Required patches are %s: [%s]\n' % (states_label, verdict)]
        if not success:
            report.append('-> Patches not %s: [%s]\n'
                          % (states_label, ', '.join(missing_patches)))

        return success, "".join(report)


def set_pre_bootstrap_info(config):
    """Set config["pre_bootstrap"] to True when the initial config complete
    flag file does not exist, False otherwise"""
    initial_config_done = os.path.exists(INITIAL_CONFIG_COMPLETE_FLAG)
    config["pre_bootstrap"] = not initial_config_done


def parse_config(args=None):
    """
    Parse the parameters passed to the script
    :param args: list of arguments, sys.argv is used when None
    :return: dict mapping each argument name to its parsed value
    """
    parser = argparse.ArgumentParser(
        description="Run health checks to verify if the system "
                    "meets the requirements to deploy a specific "
                    "release.")

    # mandatory authentication parameters
    mandatory = (
        ("--auth_url", "Authentication URL"),
        ("--username", "Username"),
        ("--password", "Password"),
        ("--project_name", "Project Name"),
        ("--user_domain_name", "User Domain Name"),
        ("--project_domain_name", "Project Domain Name"),
    )
    for flag, description in mandatory:
        parser.add_argument(flag, help=description, required=True)

    parser.add_argument("--region_name", help="Region Name", default="RegionOne")

    # boolean switches, False unless given
    switches = (
        ("--force", "Ignore non-critical health checks"),
        ("--patch", "Set precheck to run against a patch release"),
    )
    for flag, description in switches:
        parser.add_argument(flag, help=description, action="store_true")

    parser.add_argument("--options",
                        help="Additional parameters in key=value format.")
    parser.add_argument("--releases", help="Releases", default="[]")
    parser.add_argument("--deploy_in_progress",
                        help="check hosts are patch current",
                        default="{}")

    # parse_args falls back to sys.argv when args is None
    namespace = parser.parse_args(args)
    return vars(namespace)


def main(argv=None):
    """
    Run the general and the release-specific health checks and print
    the combined report
    :param argv: list of arguments, sys.argv is used when None
    :return: RC_SUCCESS if all checks passed, RC_UNHEALTHY otherwise
    """
    config = parse_config(argv)
    set_pre_bootstrap_info(config)

    # patch releases and major releases have different specific checks
    checker_class = PatchHealthCheck if config.get("patch", False) else UpgradeHealthCheck
    health_check = checker_class(config)

    # general checks first, then the release-specific ones
    general_ok, general_report = health_check.run_general_health_check()
    specific_ok, specific_report = health_check.run_health_check()

    # join both reports dropping surrounding line breaks/blank spaces
    print("\n".join((general_report.strip(), specific_report.strip())))

    return RC_SUCCESS if general_ok and specific_ok else RC_UNHEALTHY


if __name__ == "__main__":
    # set up logging through upgrade_utils (log file path and level given
    # here), then use the return code of main() as the process exit status
    upgrade_utils.configure_logging("/var/log/software.log", log_level=logging.INFO)
    sys.exit(main())