Merge "Fix software deploy start: precheck before deploy start"

This commit is contained in:
Zuul
2024-10-09 15:29:07 +00:00
committed by Gerrit Code Review
2 changed files with 82 additions and 17 deletions

View File

@@ -34,6 +34,8 @@ SOFTWARE_CONFIG_FILE_LOCAL = "/etc/software/software.conf"
RC_SUCCESS = 0 RC_SUCCESS = 0
RC_UNHEALTHY = 3 RC_UNHEALTHY = 3
PRECHECK_RESULT_VALID_PERIOD = 300
DEPLOY_PRECHECK_SCRIPT = "deploy-precheck" DEPLOY_PRECHECK_SCRIPT = "deploy-precheck"
UPGRADE_UTILS_SCRIPT = "upgrade_utils.py" UPGRADE_UTILS_SCRIPT = "upgrade_utils.py"
DEPLOY_START_SCRIPT = "software-deploy-start" DEPLOY_START_SCRIPT = "software-deploy-start"

View File

@@ -23,6 +23,7 @@ import subprocess
import tempfile import tempfile
import threading import threading
import time import time
import typing
from wsgiref import simple_server from wsgiref import simple_server
from fm_api import fm_api from fm_api import fm_api
@@ -2626,7 +2627,7 @@ class PatchController(PatchService):
return release return release
def _deploy_precheck(self, release_version: str, force: bool = False, def _deploy_precheck(self, release_version: str, force: bool = False,
region_name: str = None, patch: bool = False) -> dict: region_name: typing.Optional[str] = None, patch: bool = False) -> dict:
""" """
Verify if system satisfy the requisites to upgrade to a specified deployment. Verify if system satisfy the requisites to upgrade to a specified deployment.
:param release_version: full release name, e.g. starlingx-MM.mm.pp :param release_version: full release name, e.g. starlingx-MM.mm.pp
@@ -2647,6 +2648,7 @@ class PatchController(PatchService):
if not os.path.isfile(precheck_script) and patch: if not os.path.isfile(precheck_script) and patch:
# Precheck script may not be available for some patches # Precheck script may not be available for some patches
# In that case, report system as healthy with info message to proceed # In that case, report system as healthy with info message to proceed
self._save_precheck_result(release_version, healthy=True)
msg_info = f"No deploy-precheck script available for patch version {release_version}" msg_info = f"No deploy-precheck script available for patch version {release_version}"
return dict(info=msg_info, warning=msg_warning, error=msg_error, system_healthy=True) return dict(info=msg_info, warning=msg_warning, error=msg_error, system_healthy=True)
@@ -2657,6 +2659,7 @@ class PatchController(PatchService):
msg_error = "Fail to perform deploy precheck. " \ msg_error = "Fail to perform deploy precheck. " \
"Uploaded release may have been damaged. " \ "Uploaded release may have been damaged. " \
"Try delete and re-upload the release.\n" "Try delete and re-upload the release.\n"
self._save_precheck_result(release_version, healthy=False)
return dict(info=msg_info, warning=msg_warning, error=msg_error) return dict(info=msg_info, warning=msg_warning, error=msg_error)
if self.pre_bootstrap and not force: if self.pre_bootstrap and not force:
@@ -2664,6 +2667,7 @@ class PatchController(PatchService):
# script access any of services like sysinv, keystone, etc. # script access any of services like sysinv, keystone, etc.
msg_warning = "Pre-bootstrap environment may not support deploy precheck.\n" \ msg_warning = "Pre-bootstrap environment may not support deploy precheck.\n" \
"Use --force option to execute deploy precheck script.\n" "Use --force option to execute deploy precheck script.\n"
self._save_precheck_result(release_version, healthy=True)
return dict(info=msg_info, warning=msg_warning, error=msg_error, system_healthy=True) return dict(info=msg_info, warning=msg_warning, error=msg_error, system_healthy=True)
deploy_in_progress = self._get_software_upgrade() deploy_in_progress = self._get_software_upgrade()
@@ -2684,6 +2688,7 @@ class PatchController(PatchService):
LOG.error(msg) LOG.error(msg)
msg_error = "Fail to perform deploy precheck. Internal error has occured." \ msg_error = "Fail to perform deploy precheck. Internal error has occured." \
"Try lock and unlock the controller for recovery.\n" "Try lock and unlock the controller for recovery.\n"
self._save_precheck_result(release_version, healthy=False)
return dict(info=msg_info, warning=msg_warning, error=msg_error) return dict(info=msg_info, warning=msg_warning, error=msg_error)
# TODO(heitormatsui) if different region was passed as parameter then # TODO(heitormatsui) if different region was passed as parameter then
@@ -2727,8 +2732,10 @@ class PatchController(PatchService):
system_healthy = None system_healthy = None
if precheck_return.returncode in [constants.RC_SUCCESS, constants.RC_UNHEALTHY]: if precheck_return.returncode in [constants.RC_SUCCESS, constants.RC_UNHEALTHY]:
system_healthy = precheck_return.returncode == constants.RC_SUCCESS system_healthy = precheck_return.returncode == constants.RC_SUCCESS
self._save_precheck_result(release_version, healthy=system_healthy)
msg_info += precheck_return.stdout msg_info += precheck_return.stdout
else: else:
self._save_precheck_result(release_version, healthy=False)
msg_error += precheck_return.stdout msg_error += precheck_return.stdout
return dict(info=msg_info, warning=msg_warning, error=msg_error, system_healthy=system_healthy) return dict(info=msg_info, warning=msg_warning, error=msg_error, system_healthy=system_healthy)
@@ -2959,6 +2966,60 @@ class PatchController(PatchService):
thread = threading.Thread(target=run) thread = threading.Thread(target=run)
thread.start() thread.start()
def _precheck_before_start(self, deployment, release_version, is_patch, force=False):
    """
    Run the deploy precheck and report whether deploy start must be aborted.

    :param deployment: deployment name, only used in the error message
    :param release_version: release version handed to the precheck
    :param is_patch: forwarded to the precheck as its ``patch`` argument
    :param force: forwarded to the precheck as its ``force`` argument
    :return: the precheck result dict with an amended "error" entry when the
             precheck could not determine health or reported the system as
             unhealthy; None when deploy start may proceed
    """
    LOG.info("Running deploy precheck.")
    outcome = self._deploy_precheck(release_version, force=force, patch=is_patch)
    health = outcome.get('system_healthy')

    # No verdict at all: the precheck itself failed to run to completion.
    if health is None:
        outcome["error"] = (
            f"Fail to perform deploy precheck. Internal error has occurred.\n"
            f"{outcome['error']}"
        )
        return outcome

    # Explicitly unhealthy: surface the precheck output as the error.
    if health is False:
        outcome["error"] = (
            f"The following issues have been detected, which prevent deploying {deployment}\n"
            f"{outcome['info']}\n"
            "Please fix above issues then retry the deploy.\n"
        )
        return outcome

    return None
def _get_precheck_result_file_path(self, release_version):
return os.path.join("/opt/software/", f"rel-{release_version}", "precheck-result.json")
def _safe_remove_precheck_result_file(self, release_version):
precheck_result_file = self._get_precheck_result_file_path(release_version)
if os.path.isfile(precheck_result_file):
os.remove(precheck_result_file)
def _save_precheck_result(self, release_version, healthy):
precheck_result_file = self._get_precheck_result_file_path(release_version)
with open(precheck_result_file, "w") as f:
json.dump({"healthy": healthy, "timestamp": time.time()}, f)
def _should_run_precheck_prior_deploy_start(self, release_version, force, is_patch):
# there is not precheck script in this state
if self.pre_bootstrap:
return False
# we should be able to patch an unhealthy system ignoring the unhealthy state
if is_patch and force:
return False
file_path = self._get_precheck_result_file_path(release_version)
if not os.path.isfile(file_path):
LOG.info("The precheck result file %s does not exist." % file_path)
return True
with open(file_path) as f:
last_result = json.load(f)
if time.time() - last_result["timestamp"] > constants.PRECHECK_RESULT_VALID_PERIOD:
LOG.info("The precheck result expired.")
return True
return not last_result["healthy"]
@require_deploy_state([None], @require_deploy_state([None],
"There is already a deployment in progress ({state.value}). " "There is already a deployment in progress ({state.value}). "
"Please complete/delete the current deployment.") "Please complete/delete the current deployment.")
@@ -2971,7 +3032,8 @@ class PatchController(PatchService):
The operation includes steps: The operation includes steps:
1. find all undeployed dependency releases 1. find all undeployed dependency releases
2. ensure all releases (dependency and specified release) are ready to deployed 2. ensure all releases (dependency and specified release) are ready to deployed
3. precheck 3. precheck, if last precheck was not executed or if was executed and failed or
if precheck result expired
4. transform all involved releases to deploying state 4. transform all involved releases to deploying state
5. start the deploy subprocess 5. start the deploy subprocess
""" """
@@ -3000,25 +3062,26 @@ class PatchController(PatchService):
if hostname not in valid_hostnames: if hostname not in valid_hostnames:
LOG.warning("Using unknown hostname for local install: %s", hostname) LOG.warning("Using unknown hostname for local install: %s", hostname)
patch_release = True to_release = deploy_release.sw_release
if utils.is_upgrade_deploy(SW_VERSION, deploy_release.sw_release): is_upgrade_deploy = utils.is_upgrade_deploy(SW_VERSION, deploy_release.sw_release)
is_patch = not is_upgrade_deploy
if self._should_run_precheck_prior_deploy_start(to_release, force, is_patch):
LOG.info("Executing software deploy precheck prior to software deploy start")
if precheck_result := self._precheck_before_start(
deployment,
to_release,
is_patch=is_patch,
force=force
):
return precheck_result
self._safe_remove_precheck_result_file(to_release)
if is_upgrade_deploy:
# TODO(bqian) remove default latest commit when a commit-id is built into GA metadata # TODO(bqian) remove default latest commit when a commit-id is built into GA metadata
if commit_id is None: if commit_id is None:
commit_id = ostree_utils.get_feed_latest_commit(deploy_sw_version) commit_id = ostree_utils.get_feed_latest_commit(deploy_sw_version)
patch_release = False
to_release = deploy_release.sw_release
ret = self._deploy_precheck(to_release, force, patch=patch_release)
if ret["system_healthy"] is None:
ret["error"] = "Fail to perform deploy precheck. Internal error has occurred.\n" + \
ret["error"]
return ret
elif not ret["system_healthy"]:
ret["error"] = "The following issues have been detected, which prevent " \
"deploying %s\n" % deployment + ret["info"] + \
"Please fix above issues then retry the deploy.\n"
return ret
if self._deploy_upgrade_start(to_release, commit_id): if self._deploy_upgrade_start(to_release, commit_id):
collect_current_load_for_hosts(deploy_sw_version) collect_current_load_for_hosts(deploy_sw_version)
create_deploy_hosts() create_deploy_hosts()