Merge "Fix software deploy start: precheck before deploy start"

This commit is contained in:
Zuul
2024-10-09 15:29:07 +00:00
committed by Gerrit Code Review
2 changed files with 82 additions and 17 deletions

View File

@@ -34,6 +34,8 @@ SOFTWARE_CONFIG_FILE_LOCAL = "/etc/software/software.conf"
RC_SUCCESS = 0
RC_UNHEALTHY = 3
PRECHECK_RESULT_VALID_PERIOD = 300
DEPLOY_PRECHECK_SCRIPT = "deploy-precheck"
UPGRADE_UTILS_SCRIPT = "upgrade_utils.py"
DEPLOY_START_SCRIPT = "software-deploy-start"

View File

@@ -23,6 +23,7 @@ import subprocess
import tempfile
import threading
import time
import typing
from wsgiref import simple_server
from fm_api import fm_api
@@ -2626,7 +2627,7 @@ class PatchController(PatchService):
return release
def _deploy_precheck(self, release_version: str, force: bool = False,
region_name: str = None, patch: bool = False) -> dict:
region_name: typing.Optional[str] = None, patch: bool = False) -> dict:
"""
Verify if the system satisfies the requisites to upgrade to a specified deployment.
:param release_version: full release name, e.g. starlingx-MM.mm.pp
@@ -2647,6 +2648,7 @@ class PatchController(PatchService):
if not os.path.isfile(precheck_script) and patch:
# Precheck script may not be available for some patches
# In that case, report system as healthy with info message to proceed
self._save_precheck_result(release_version, healthy=True)
msg_info = f"No deploy-precheck script available for patch version {release_version}"
return dict(info=msg_info, warning=msg_warning, error=msg_error, system_healthy=True)
@@ -2657,6 +2659,7 @@ class PatchController(PatchService):
msg_error = "Fail to perform deploy precheck. " \
"Uploaded release may have been damaged. " \
"Try delete and re-upload the release.\n"
self._save_precheck_result(release_version, healthy=False)
return dict(info=msg_info, warning=msg_warning, error=msg_error)
if self.pre_bootstrap and not force:
@@ -2664,6 +2667,7 @@ class PatchController(PatchService):
# script access any of services like sysinv, keystone, etc.
msg_warning = "Pre-bootstrap environment may not support deploy precheck.\n" \
"Use --force option to execute deploy precheck script.\n"
self._save_precheck_result(release_version, healthy=True)
return dict(info=msg_info, warning=msg_warning, error=msg_error, system_healthy=True)
deploy_in_progress = self._get_software_upgrade()
@@ -2684,6 +2688,7 @@ class PatchController(PatchService):
LOG.error(msg)
msg_error = "Fail to perform deploy precheck. Internal error has occured." \
"Try lock and unlock the controller for recovery.\n"
self._save_precheck_result(release_version, healthy=False)
return dict(info=msg_info, warning=msg_warning, error=msg_error)
# TODO(heitormatsui) if different region was passed as parameter then
@@ -2727,8 +2732,10 @@ class PatchController(PatchService):
system_healthy = None
if precheck_return.returncode in [constants.RC_SUCCESS, constants.RC_UNHEALTHY]:
system_healthy = precheck_return.returncode == constants.RC_SUCCESS
self._save_precheck_result(release_version, healthy=system_healthy)
msg_info += precheck_return.stdout
else:
self._save_precheck_result(release_version, healthy=False)
msg_error += precheck_return.stdout
return dict(info=msg_info, warning=msg_warning, error=msg_error, system_healthy=system_healthy)
@@ -2959,6 +2966,60 @@ class PatchController(PatchService):
thread = threading.Thread(target=run)
thread.start()
def _precheck_before_start(self, deployment, release_version, is_patch, force=False):
    """
    Run the deploy precheck and translate a failing outcome into an error result.
    :param deployment: deployment name, only used in the error message
    :param release_version: release version handed to the deploy precheck
    :param is_patch: forwarded to the deploy precheck as its ``patch`` argument
    :param force: forwarded to the deploy precheck as its ``force`` argument
    :return: the precheck result dict with its "error" entry rewritten when
             'system_healthy' is None or False; None otherwise
    """
    LOG.info("Running deploy precheck.")
    outcome = self._deploy_precheck(release_version, force=force, patch=is_patch)
    health = outcome.get('system_healthy')

    # No verdict at all: the precheck itself could not be carried out
    if health is None:
        outcome["error"] = "".join([
            "Fail to perform deploy precheck. Internal error has occurred.\n",
            f"{outcome['error']}",
        ])
        return outcome

    # Explicitly unhealthy: surface the precheck output as the blocking issues
    if health is False:
        outcome["error"] = "".join([
            f"The following issues have been detected, which prevent deploying {deployment}\n",
            f"{outcome['info']}\n",
            "Please fix above issues then retry the deploy.\n",
        ])
        return outcome

    return None
def _get_precheck_result_file_path(self, release_version):
    """
    Build the path of the precheck result file of a release.
    :param release_version: release version used to name the release directory
    :return: path of precheck-result.json under /opt/software/rel-<release_version>
    """
    release_dir = f"rel-{release_version}"
    return os.path.join("/opt/software/", release_dir, "precheck-result.json")
def _safe_remove_precheck_result_file(self, release_version):
    """
    Delete the precheck result file of a release if it is present.
    :param release_version: release version whose precheck result file is removed
    """
    result_path = self._get_precheck_result_file_path(release_version)
    if not os.path.isfile(result_path):
        # Nothing to clean up
        return
    os.remove(result_path)
def _save_precheck_result(self, release_version, healthy):
    """
    Record the outcome of a deploy precheck together with the time it was taken.
    The result is written as JSON ({"healthy": ..., "timestamp": ...}) to the
    precheck result file of the release, replacing any previous content.
    :param release_version: release version the precheck result belongs to
    :param healthy: precheck verdict to store
    """
    precheck_result_file = self._get_precheck_result_file_path(release_version)

    # open(..., "w") raises FileNotFoundError when the release directory is
    # missing, which would abort the precheck that is trying to record its
    # result. Make sure the directory is there first.
    result_dir = os.path.dirname(precheck_result_file)
    if result_dir:
        os.makedirs(result_dir, exist_ok=True)

    with open(precheck_result_file, "w") as f:
        json.dump({"healthy": healthy, "timestamp": time.time()}, f)
def _should_run_precheck_prior_deploy_start(self, release_version, force, is_patch):
    """
    Decide whether the deploy precheck has to run before starting a deployment.
    The precheck is skipped in pre-bootstrap, for a forced patch, and when a
    stored precheck result for the release is still within
    constants.PRECHECK_RESULT_VALID_PERIOD seconds and reports a healthy system.
    A missing, unreadable, malformed, expired or unhealthy stored result
    requires the precheck to run.
    :param release_version: release version about to be deployed
    :param force: whether the deployment was requested with force
    :param is_patch: whether the deployment is a patch
    :return: True if the precheck must be executed, False otherwise
    """
    # there is no precheck script in this state
    if self.pre_bootstrap:
        return False

    # we should be able to patch an unhealthy system ignoring the unhealthy state
    if is_patch and force:
        return False

    file_path = self._get_precheck_result_file_path(release_version)
    if not os.path.isfile(file_path):
        LOG.info("The precheck result file %s does not exist.", file_path)
        return True

    # The stored result is only a cache. If it cannot be read or does not have
    # the expected content, re-run the precheck instead of failing the deploy
    # start with an unhandled exception.
    try:
        with open(file_path) as f:
            last_result = json.load(f)
        healthy = last_result["healthy"]
        age = time.time() - last_result["timestamp"]
    except (OSError, ValueError, KeyError, TypeError) as e:
        LOG.warning("The precheck result file %s is not usable: %s", file_path, e)
        return True

    if age > constants.PRECHECK_RESULT_VALID_PERIOD:
        LOG.info("The precheck result expired.")
        return True

    return not healthy
@require_deploy_state([None],
"There is already a deployment in progress ({state.value}). "
"Please complete/delete the current deployment.")
@@ -2971,7 +3032,8 @@ class PatchController(PatchService):
The operation includes steps:
1. find all undeployed dependency releases
2. ensure all releases (dependency and specified release) are ready to be deployed
3. precheck
3. precheck, if the last precheck was not executed, was executed and failed, or
its result has expired
4. transform all involved releases to deploying state
5. start the deploy subprocess
"""
@@ -3000,25 +3062,26 @@ class PatchController(PatchService):
if hostname not in valid_hostnames:
LOG.warning("Using unknown hostname for local install: %s", hostname)
patch_release = True
if utils.is_upgrade_deploy(SW_VERSION, deploy_release.sw_release):
to_release = deploy_release.sw_release
is_upgrade_deploy = utils.is_upgrade_deploy(SW_VERSION, deploy_release.sw_release)
is_patch = not is_upgrade_deploy
if self._should_run_precheck_prior_deploy_start(to_release, force, is_patch):
LOG.info("Executing software deploy precheck prior to software deploy start")
if precheck_result := self._precheck_before_start(
deployment,
to_release,
is_patch=is_patch,
force=force
):
return precheck_result
self._safe_remove_precheck_result_file(to_release)
if is_upgrade_deploy:
# TODO(bqian) remove default latest commit when a commit-id is built into GA metadata
if commit_id is None:
commit_id = ostree_utils.get_feed_latest_commit(deploy_sw_version)
patch_release = False
to_release = deploy_release.sw_release
ret = self._deploy_precheck(to_release, force, patch=patch_release)
if ret["system_healthy"] is None:
ret["error"] = "Fail to perform deploy precheck. Internal error has occurred.\n" + \
ret["error"]
return ret
elif not ret["system_healthy"]:
ret["error"] = "The following issues have been detected, which prevent " \
"deploying %s\n" % deployment + ret["info"] + \
"Please fix above issues then retry the deploy.\n"
return ret
if self._deploy_upgrade_start(to_release, commit_id):
collect_current_load_for_hosts(deploy_sw_version)
create_deploy_hosts()