From 7abbc8b537cc43d3b4f2ae43928c2df5d1d61fd2 Mon Sep 17 00:00:00 2001 From: Caio Bruchert Date: Tue, 5 Aug 2025 13:41:20 -0300 Subject: [PATCH] More upgrade/rollback fixes for mgmt address reduction Two issues were fixed: 1. Increase ceph mon agent-hooks commands time out value In some cases during rollback the timeout value was too short and some commands aborted before completion. 2. Fix fm-api not binding after deploy-host, unlock and reboot The previous solution consisted in restarting the fm-api service during deploy-activate, but still caused the fm-api to be unavailable between boot up and activate and if doing rollback at this point. The new solution consists of fixing dnsmasq.addn_hosts during deploy-host. Test plan: - SX upgrade - SX rollback Story: 2011191 Task: 52632 Change-Id: I2705a97500427031c2a0a69a6fd59f42b2e239f8 Signed-off-by: Caio Bruchert --- software/software/agent_hooks.py | 37 +++++++++++++------ .../04-remove-mgmt-node-addresses.py | 18 --------- 2 files changed, 26 insertions(+), 29 deletions(-) diff --git a/software/software/agent_hooks.py b/software/software/agent_hooks.py index ee9e5935..2982e957 100644 --- a/software/software/agent_hooks.py +++ b/software/software/agent_hooks.py @@ -18,6 +18,7 @@ from abc import ABC from abc import abstractmethod import configparser import filecmp +import fileinput import glob import logging as LOG import os @@ -857,7 +858,7 @@ class LogPermissionRestorerHook(BaseHook): self.restore_cron_permissions() -class ReconfigureCephMonHook(BaseHook): +class FixSimplexAddressesHook(BaseHook): """ Reconfigure ceph-mon with the mgmt floating address """ @@ -935,16 +936,30 @@ class ReconfigureCephMonHook(BaseHook): if self._to_release == "24.09" or self._to_release == "25.09": system_mode = self.get_platform_conf("system_mode") if (system_mode == self.SIMPLEX): + mon_ip, ip = self.get_mon_ip() + + # fix dnsmasq.addn_hosts until sysinv conductor fixes it definitely + if self._to_release == "25.09": + 
LOG.info("fix-sx-addr: fixing dnsmasq.addn_hosts") + addn_hosts = "/opt/platform/config/25.09/dnsmasq.addn_hosts" + for line in fileinput.input(files=addn_hosts, inplace=True): + cols = line.split() + if "controller-0.internal" in cols[1]: + line = line.replace(cols[0], ip) + elif "controller-1.internal" in cols[1]: + continue + print(line, end="") + if not self.is_ceph_configured(): - LOG.info("ceph-mon: skipping reconfiguration, bare metal ceph not configured for mgmt") + LOG.info("fix-sx-addr: skipping ceph mon reconfig, bare metal ceph not configured for mgmt") return + fsid = self.get_fsid() mon_name = "controller-0" - mon_ip, ip = self.get_mon_ip() if not fsid or not mon_ip: LOG.exception("Invalid fsid or mon_ip") raise ValueError("Invalid params") - LOG.info("ceph-mon: using fsid=%s, mon_name=%s, mon_ip=%s" % (fsid, mon_name, mon_ip)) + LOG.info("fix-sx-addr: ceph mon: using fsid=%s, mon_name=%s, mon_ip=%s" % (fsid, mon_name, mon_ip)) cmds = [ ["rm", "-f", "/etc/pmon.d/ceph.conf"], @@ -964,14 +979,14 @@ class ReconfigureCephMonHook(BaseHook): try: for cmd in cmds: - LOG.info("ceph-mon: exec: '%s'" % ' '.join(cmd)) - subprocess.check_call(cmd, timeout=8) - LOG.info("ceph-mon: reconfiguration finished") + LOG.info("fix-sx-addr: exec: '%s'" % ' '.join(cmd)) + subprocess.check_call(cmd, timeout=60) + LOG.info("fix-sx-addr: reconfiguration finished") except subprocess.CalledProcessError as e: - LOG.exception("ceph-mon: failed executing the command '%s': %s" % (' '.join(cmd), str(e))) + LOG.exception("fix-sx-addr: failed executing the command '%s': %s" % (' '.join(cmd), str(e))) raise else: - LOG.info("ceph-mon: skipping reconfiguration, system_mode is not simplex") + LOG.info("fix-sx-addr: skipping reconfiguration, system_mode is not simplex") class AbstractSysctlFlagHook(BaseHook, ABC): @@ -1151,7 +1166,7 @@ class HookManager(object): FixPSQLPermissionHook, DeleteControllerFeedRemoteHook, RestartKubeApiServer, - ReconfigureCephMonHook, + FixSimplexAddressesHook, 
CISSysctlFlagHookUpgrade, # enable usm-initialize service for next # reboot only if everything else is done @@ -1168,7 +1183,7 @@ class HookManager(object): RevertUmaskHook, RevertCrtPermissionsHook, LogPermissionRestorerHook, - ReconfigureCephMonHook, + FixSimplexAddressesHook, CISSysctlFlagHookRollback, # enable usm-initialize service for next # reboot only if everything else is done diff --git a/software/upgrade-scripts/04-remove-mgmt-node-addresses.py b/software/upgrade-scripts/04-remove-mgmt-node-addresses.py index c594330e..f0ba3cef 100644 --- a/software/upgrade-scripts/04-remove-mgmt-node-addresses.py +++ b/software/upgrade-scripts/04-remove-mgmt-node-addresses.py @@ -15,7 +15,6 @@ # import logging -import subprocess import sys from packaging import version @@ -66,8 +65,6 @@ def main(): except Exception as e: LOG.exception("Error: {}".format(e)) res = 1 - elif action == 'activate' and to_release_version == target_version: - restart_services_bound_to_controller0_address() return res @@ -143,21 +140,6 @@ def db_update(conn, query): conn.commit() -def restart_services_bound_to_controller0_address(): - services = ( - 'sm-api', - 'fm-api', - ) - for service in services: - LOG.info(f"Restarting {service}...") - try: - subprocess.run(['systemctl', 'restart', service], check=True, timeout=15) - except subprocess.TimeoutExpired: - LOG.error(f"Restarting {service} timed out.") - except subprocess.CalledProcessError as e: - LOG.error(f"Restarting {service} failed: {e.stderr}") - - def get_system_mode(): ini_str = '[DEFAULT]\n' + open('/etc/platform/platform.conf', 'r').read()