Handle deployment interruption
This commit enhances the handling of interruptions during a USM upgrade. If an interruption, such as a host reboot, occurs while the USM upgrade is in the deploy-start, deploy-host, activate or activate-rollback stage, the deploy state is set to failed for that stage. Once the deployment is in the failed state, the USM upgrade can be retried.
Test Plan:
PASS: build and deploy iso
PASS: SX upgrade with deploy start interruption
PASS: DX upgrade with deploy start interruption
Task: 2011357
Story: 51849
Change-Id: I37341d9be5c17d1da7161e08c7b46fd86f28f589
Signed-off-by: junfeng-li <junfeng.li@windriver.com>
This commit is contained in:
@@ -8,6 +8,7 @@ oslo.policy
|
|||||||
oslo.serialization
|
oslo.serialization
|
||||||
netaddr
|
netaddr
|
||||||
pecan
|
pecan
|
||||||
|
psutil
|
||||||
psycopg2-binary
|
psycopg2-binary
|
||||||
pycryptodomex
|
pycryptodomex
|
||||||
PyGObject
|
PyGObject
|
||||||
|
@@ -35,9 +35,12 @@ import software.apt_utils as apt_utils
|
|||||||
import software.ostree_utils as ostree_utils
|
import software.ostree_utils as ostree_utils
|
||||||
from software.api import app
|
from software.api import app
|
||||||
from software.authapi import app as auth_app
|
from software.authapi import app as auth_app
|
||||||
|
from software.constants import CONTROLLER_0_HOSTNAME
|
||||||
|
from software.constants import CONTROLLER_1_HOSTNAME
|
||||||
from software.constants import INSTALL_LOCAL_FLAG
|
from software.constants import INSTALL_LOCAL_FLAG
|
||||||
from software.states import DEPLOY_STATES
|
|
||||||
from software.states import DEPLOY_HOST_STATES
|
from software.states import DEPLOY_HOST_STATES
|
||||||
|
from software.states import DEPLOY_STATES
|
||||||
|
from software.states import INTERRUPTION_RECOVERY_STATES
|
||||||
from software.base import PatchService
|
from software.base import PatchService
|
||||||
from software.dc_utils import get_subcloud_groupby_version
|
from software.dc_utils import get_subcloud_groupby_version
|
||||||
from software.deploy_state import require_deploy_state
|
from software.deploy_state import require_deploy_state
|
||||||
@@ -88,6 +91,7 @@ from software.software_functions import get_release_from_patch
|
|||||||
from software.software_functions import clean_up_deployment_data
|
from software.software_functions import clean_up_deployment_data
|
||||||
from software.software_functions import run_remove_temporary_data_script
|
from software.software_functions import run_remove_temporary_data_script
|
||||||
from software.release_state import ReleaseState
|
from software.release_state import ReleaseState
|
||||||
|
from software.utilities.deploy_set_failed import start_set_fail
|
||||||
from software.deploy_host_state import DeployHostState
|
from software.deploy_host_state import DeployHostState
|
||||||
from software.deploy_state import DeployState
|
from software.deploy_state import DeployState
|
||||||
from software.release_verify import verify_files
|
from software.release_verify import verify_files
|
||||||
@@ -996,6 +1000,7 @@ class PatchController(PatchService):
|
|||||||
|
|
||||||
self.hosts = {}
|
self.hosts = {}
|
||||||
self.controller_neighbours = {}
|
self.controller_neighbours = {}
|
||||||
|
self.host_mgmt_ip = []
|
||||||
|
|
||||||
self.db_api_instance = get_instance()
|
self.db_api_instance = get_instance()
|
||||||
|
|
||||||
@@ -4385,6 +4390,42 @@ class PatchController(PatchService):
|
|||||||
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
def is_host_active_controller(self):
    """
    Report whether this host is the active controller.

    The host is treated as active when the address resolved for the
    controller floating hostname is one of the addresses found on the
    local management interface.

    :return: True if it is active controller, False otherwise
    """
    # Nothing can be determined before the initial configuration is complete.
    if not os.path.exists(INITIAL_CONFIG_COMPLETE_FLAG):
        return False

    floating_ip = utils.gethostbyname(constants.CONTROLLER_FLOATING_HOSTNAME)
    if not floating_ip:
        return False

    family = utils.get_management_family()
    iface = cfg.get_mgmt_iface()
    local_ips = utils.get_iface_ip(iface, family)

    # An empty/None address list means the floating IP cannot be local.
    if not local_ips:
        return False
    return floating_ip in local_ips
|
||||||
|
|
||||||
|
def set_interruption_fail_state(self):
    """
    Move an interrupted deployment to its failed state.

    Only acts when this host is the active controller, the initial
    configuration is complete and a software upgrade is in progress.
    The host passed to start_set_fail depends on the current deploy state.
    """
    upgrade_status = self.get_software_upgrade()

    eligible = (self.is_host_active_controller()
                and os.path.exists(INITIAL_CONFIG_COMPLETE_FLAG)
                and upgrade_status)
    if not eligible:
        return

    deploy_state = upgrade_status.get('state')

    if deploy_state == DEPLOY_STATES.HOST.value and not is_simplex():
        # In DX, when it is in deploy-host state, only the standby
        # controller (i.e. the peer of this host) can be set to fail
        if self.hostname == CONTROLLER_1_HOSTNAME:
            peer_hostname = CONTROLLER_0_HOSTNAME
        else:
            peer_hostname = CONTROLLER_1_HOSTNAME
        start_set_fail(True, peer_hostname)
        return

    if deploy_state in INTERRUPTION_RECOVERY_STATES:
        # The deployment was interrupted; the deployment state has to be
        # updated first
        start_set_fail(True, self.hostname)
|
||||||
|
|
||||||
|
|
||||||
class PatchControllerApiThread(threading.Thread):
|
class PatchControllerApiThread(threading.Thread):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
@@ -4531,6 +4572,12 @@ class PatchControllerMainThread(threading.Thread):
|
|||||||
sc.ignore_errors = os.environ.get('IGNORE_ERRORS', 'False')
|
sc.ignore_errors = os.environ.get('IGNORE_ERRORS', 'False')
|
||||||
LOG.info("IGNORE_ERRORS execution flag is set: %s", sc.ignore_errors)
|
LOG.info("IGNORE_ERRORS execution flag is set: %s", sc.ignore_errors)
|
||||||
|
|
||||||
|
LOG.info("software-controller-daemon is starting")
|
||||||
|
|
||||||
|
LOG.info("%s is active controller: %s", sc.hostname, sc.is_host_active_controller())
|
||||||
|
|
||||||
|
sc.set_interruption_fail_state()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if sc.pre_bootstrap and cfg.get_mgmt_ip():
|
if sc.pre_bootstrap and cfg.get_mgmt_ip():
|
||||||
sc.pre_bootstrap = False
|
sc.pre_bootstrap = False
|
||||||
|
@@ -149,3 +149,13 @@ VALID_HOST_DEPLOY_STATE = [
|
|||||||
DEPLOY_HOST_STATES.ROLLBACK_FAILED,
|
DEPLOY_HOST_STATES.ROLLBACK_FAILED,
|
||||||
DEPLOY_HOST_STATES.ROLLBACK_PENDING,
|
DEPLOY_HOST_STATES.ROLLBACK_PENDING,
|
||||||
]
|
]
|
||||||
|
|
||||||
|
# Deploy states that represent an operation still in progress: only when
|
||||||
|
# the deployment is in one of these states is it set to failed after an interruption
|
||||||
|
INTERRUPTION_RECOVERY_STATES = [
|
||||||
|
DEPLOY_STATES.START.value,
|
||||||
|
DEPLOY_STATES.HOST.value,
|
||||||
|
DEPLOY_STATES.HOST_ROLLBACK.value,
|
||||||
|
DEPLOY_STATES.ACTIVATE.value,
|
||||||
|
DEPLOY_STATES.ACTIVATE_ROLLBACK.value,
|
||||||
|
]
|
||||||
|
@@ -13,6 +13,7 @@ from netaddr import IPAddress
|
|||||||
import os
|
import os
|
||||||
from oslo_config import cfg as oslo_cfg
|
from oslo_config import cfg as oslo_cfg
|
||||||
from packaging import version
|
from packaging import version
|
||||||
|
import psutil
|
||||||
import re
|
import re
|
||||||
import shutil
|
import shutil
|
||||||
import socket
|
import socket
|
||||||
@@ -114,7 +115,8 @@ def get_component_and_versions(release_name):
|
|||||||
match = pattern.match(release_name)
|
match = pattern.match(release_name)
|
||||||
if match:
|
if match:
|
||||||
component = match.group(2) or None
|
component = match.group(2) or None
|
||||||
release_version = f"{match.group(3)}.{match.group(4)}" + (f".{match.group(5)}" if match.group(5) else ".0")
|
release_version = f"{match.group(3)}.{match.group(4)}" + (f".{match.group(5)}"
|
||||||
|
if match.group(5) else ".0")
|
||||||
software_version = f"{match.group(3)}.{match.group(4)}"
|
software_version = f"{match.group(3)}.{match.group(4)}"
|
||||||
patch_version = match.group(5) or '0'
|
patch_version = match.group(5) or '0'
|
||||||
return component, release_version, software_version, patch_version
|
return component, release_version, software_version, patch_version
|
||||||
@@ -274,7 +276,8 @@ def save_temp_file(file_item, temp_dir=constants.SCRATCH_DIR):
|
|||||||
LOG.error("Not enough space to save file %s in %s \n \
|
LOG.error("Not enough space to save file %s in %s \n \
|
||||||
Available %s bytes. File size %s", file_name, temp_dir, avail_space, file_size)
|
Available %s bytes. File size %s", file_name, temp_dir, avail_space, file_size)
|
||||||
except Exception:
|
except Exception:
|
||||||
msg = "Failed to get file size in bytes for {} or disk space for {}".format(file_item, temp_dir)
|
msg = "Failed to get file size in bytes for {} or disk space for {}".format(
|
||||||
|
file_item, temp_dir)
|
||||||
LOG.exception(msg)
|
LOG.exception(msg)
|
||||||
raise Exception(msg)
|
raise Exception(msg)
|
||||||
|
|
||||||
@@ -519,3 +522,42 @@ def find_file_by_regex(dir_path, pattern):
|
|||||||
except Exception:
|
except Exception:
|
||||||
LOG.error("Can't find files by regex pattern in directory %s." % dir_path)
|
LOG.error("Can't find files by regex pattern in directory %s." % dir_path)
|
||||||
return []
|
return []
|
||||||
|
|
||||||
|
|
||||||
|
def get_iface_ip(iface_name: str, ip_family: int = socket.AF_INET) -> list[str]:
    """Get IP addresses for a network interface filtered by address family.

    Addresses reported under alias labels of the interface (secondary IP
    configuration, e.g. ``enp0s8:2`` for ``enp0s8``) are included. Other
    interfaces that merely share a name prefix (e.g. ``enp0s80``) are not.

    :param iface_name: Name of the network interface to query
    :param ip_family: Address family to filter by (socket.AF_INET or socket.AF_INET6)

    :return: List of IP addresses matching the specified family; an empty
             list if the interface is not found or the lookup fails
    :raises ValueError: if iface_name is empty or not a string
    :raises TypeError: if ip_family is neither AF_INET nor AF_INET6
    """
    # Input validation
    if not iface_name or not isinstance(iface_name, str):
        raise ValueError("Interface name must be a non-empty string")

    if ip_family not in (socket.AF_INET, socket.AF_INET6):
        raise TypeError(f"Invalid address family: {ip_family}")

    try:
        # Get network interface addresses
        interface_addresses = psutil.net_if_addrs()

        # Return early if interface not found
        if iface_name not in interface_addresses:
            LOG.error("Interface %s not found", iface_name)
            return []

        # Secondary IP config e.g. enp0s8:2 needs to be handled accordingly.
        # Match the exact name or "<name>:<label>" only: a bare prefix match
        # would also pick up unrelated interfaces (eth1 -> eth10, eth11, ...).
        alias_prefix = iface_name + ":"
        return [
            addr.address
            for name, addrs in interface_addresses.items()
            if name == iface_name or name.startswith(alias_prefix)
            for addr in addrs
            if addr.family == ip_family
        ]

    except Exception as e:
        # Best effort: callers treat an empty list as "no address found"
        LOG.error("Error getting IP for interface %s: %s", iface_name, str(e))
        return []
|
||||||
|
Reference in New Issue
Block a user