From 4aa5b7759c86e48284fba1b1bd8cf752e35a927f Mon Sep 17 00:00:00 2001 From: Caio Bruchert Date: Mon, 21 Jul 2025 17:09:33 -0300 Subject: [PATCH] Upgrade/rollback fixes for mgmt addr reduction The following issues started after the mgmt address reduction feature was merged: 1. upgrade: on some occasions fm-api and sm-api fail to start. This happened because the processes can start before the dnsmasq.addn_hosts file is updated. The fix consists in restarting both services during upgrade activate. 2. rollback: on some occasions agent-hooks 'ceph mon start' command times out. This happened because ceph mon start needs the correct IP address to be created in Linux and it will be created only after host unlock. The fix consists in adding the IP temporarily before running the 'ceph mon start' command. 3. ceph alarm after finishing the rollback process. This happened because the last ceph reconfiguration command was not running for rollback. The fix consists in checking the correct state to run the command. 
Test plan: - SX upgrade - SX rollback Story: 2011191 Task: 52558 Change-Id: Ibfc8d3aaf8f95f5a3250b0826fe40d888bb6d760 Signed-off-by: Caio Bruchert --- software/software/agent_hooks.py | 21 ++++++++---------- .../03-ceph-mon-enable-msgr2.sh | 8 ++++++- .../04-remove-mgmt-node-addresses.py | 22 +++++++++++++++++-- 3 files changed, 36 insertions(+), 15 deletions(-) diff --git a/software/software/agent_hooks.py b/software/software/agent_hooks.py index 473335aa..0d71f808 100644 --- a/software/software/agent_hooks.py +++ b/software/software/agent_hooks.py @@ -29,10 +29,6 @@ from ipaddress import ip_address from ipaddress import IPv6Address import psycopg2 -import software.constants as constants -import software.utils as utils - - log_format = ('%(asctime)s: ' + '[%(process)s]: ' '%(filename)s(%(lineno)s): %(levelname)s: %(message)s') LOG.basicConfig(filename="/var/log/software.log", @@ -877,24 +873,21 @@ class ReconfigureCephMonHook(BaseHook): fields = line.split() if fields[1] == host: ip = fields[0] - if isinstance(ip_address(ip), IPv6Address): - ip = f"[{ip}]" - return ip + mon_ip = f"[{ip}]" if isinstance(ip_address(ip), IPv6Address) else ip + return mon_ip, ip return None def run(self): # Handle both upgrade to 25.09 and rollback to 24.09 if self._to_release == "24.09" or self._to_release == "25.09": - system_type = utils.get_platform_conf("system_type") - system_mode = utils.get_platform_conf("system_mode") - if (system_type == constants.SYSTEM_TYPE_ALL_IN_ONE and - system_mode == constants.SYSTEM_MODE_SIMPLEX): + system_mode = self.get_platform_conf("system_mode") + if (system_mode == self.SIMPLEX): if not self.is_ceph_configured(): LOG.info("ceph-mon: skipping reconfiguration, bare metal ceph not configured for mgmt") return fsid = self.get_fsid() mon_name = "controller-0" - mon_ip = self.get_mon_ip() + mon_ip, ip = self.get_mon_ip() if not fsid or not mon_ip: LOG.exception("Invalid fsid or mon_ip") raise ValueError("Invalid params") @@ -911,6 +904,10 @@ class 
ReconfigureCephMonHook(BaseHook): ["/etc/init.d/ceph", "start", "mon"], ["ln", "-s", "/etc/ceph/ceph.conf.pmon", "/etc/pmon.d/ceph.conf"], ] + if self._to_release == "24.09": + # For /etc/init.d/ceph start mon to work during rollback, need to add mon_ip temporarily + # to the loopback. This will corrected permanently after host unlock and reboot. + cmds.insert(0, ["ip", "address", "replace", f"{ip}", "dev", "lo"]) try: for cmd in cmds: diff --git a/software/upgrade-scripts/03-ceph-mon-enable-msgr2.sh b/software/upgrade-scripts/03-ceph-mon-enable-msgr2.sh index 0bdd9ac6..9e794c28 100755 --- a/software/upgrade-scripts/03-ceph-mon-enable-msgr2.sh +++ b/software/upgrade-scripts/03-ceph-mon-enable-msgr2.sh @@ -17,6 +17,11 @@ FROM_RELEASE=$1 TO_RELEASE=$2 ACTION=$3 +FROM_RELEASE_ARR=(${FROM_RELEASE//./ }) +FROM_RELEASE_MAJOR=${FROM_RELEASE_ARR[0]} +TO_RELEASE_ARR=(${TO_RELEASE//./ }) +TO_RELEASE_MAJOR=${TO_RELEASE_ARR[0]} + SOFTWARE_LOG_PATH="/var/log/software.log" function log { @@ -26,7 +31,8 @@ function log { log "ceph-mon: enable ceph-mon msgr2"\ "from $FROM_RELEASE to $TO_RELEASE with action $ACTION" -if [[ "$ACTION" == "activate" && "$FROM_RELEASE" == "24.09" ]] || [[ "$ACTION" == "delete" && "$TO_RELEASE" == "24.09" ]]; then +if [[ "$ACTION" == "activate" && ${TO_RELEASE_MAJOR} -eq 25 ]] || \ + [[ "$ACTION" == "delete" && ${TO_RELEASE_MAJOR} -eq 24 ]]; then source /etc/platform/platform.conf if [[ "${system_mode}" == "simplex" ]]; then if [[ -f /etc/platform/.node_ceph_configured ]]; then diff --git a/software/upgrade-scripts/04-remove-mgmt-node-addresses.py b/software/upgrade-scripts/04-remove-mgmt-node-addresses.py index c2b8174a..c594330e 100644 --- a/software/upgrade-scripts/04-remove-mgmt-node-addresses.py +++ b/software/upgrade-scripts/04-remove-mgmt-node-addresses.py @@ -15,6 +15,7 @@ # import logging +import subprocess import sys from packaging import version @@ -55,8 +56,8 @@ def main(): res = 0 to_release_version = version.Version(to_release) 
target_version = version.Version("25.09") - if action == 'migrate' and to_release_version == target_version: - if get_system_mode() == "simplex": + if get_system_mode() == "simplex": + if action == 'migrate' and to_release_version == target_version: try: conn = psycopg2.connect("dbname=sysinv user=postgres port=%s" % postgres_port) @@ -65,6 +66,8 @@ def main(): except Exception as e: LOG.exception("Error: {}".format(e)) res = 1 + elif action == 'activate' and to_release_version == target_version: + restart_services_bound_to_controller0_address() return res @@ -140,6 +143,21 @@ def db_update(conn, query): conn.commit() +def restart_services_bound_to_controller0_address(): + services = ( + 'sm-api', + 'fm-api', + ) + for service in services: + LOG.info(f"Restarting {service}...") + try: + subprocess.run(['systemctl', 'restart', service], check=True, timeout=15) + except subprocess.TimeoutExpired: + LOG.error(f"Restarting {service} timed out.") + except subprocess.CalledProcessError as e: + LOG.error(f"Restarting {service} failed: {e.stderr}") + + def get_system_mode(): ini_str = '[DEFAULT]\n' + open('/etc/platform/platform.conf', 'r').read()