More upgrade/rollback fixes for mgmt address reduction

Two issues were fixed:

1. Increase ceph mon agent-hooks commands timeout value
   In some cases during rollback the timeout value was too short and
   some commands were aborted before completion.
2. Fix fm-api not binding after deploy-host, unlock and reboot
   The previous solution consisted of restarting the fm-api service
   during deploy-activate, but that still left fm-api unavailable
   between boot-up and activate, and when doing a rollback at that point.
   The new solution consists of fixing dnsmasq.addn_hosts during
   deploy-host.

Test plan:
- SX upgrade
- SX rollback

Story: 2011191
Task: 52632

Change-Id: I2705a97500427031c2a0a69a6fd59f42b2e239f8
Signed-off-by: Caio Bruchert <caio.bruchert@windriver.com>
This commit is contained in:
Caio Bruchert
2025-08-05 13:41:20 -03:00
parent 25923e0c4e
commit 7abbc8b537
2 changed files with 26 additions and 29 deletions

View File

@@ -18,6 +18,7 @@ from abc import ABC
from abc import abstractmethod from abc import abstractmethod
import configparser import configparser
import filecmp import filecmp
import fileinput
import glob import glob
import logging as LOG import logging as LOG
import os import os
@@ -857,7 +858,7 @@ class LogPermissionRestorerHook(BaseHook):
self.restore_cron_permissions() self.restore_cron_permissions()
class ReconfigureCephMonHook(BaseHook): class FixSimplexAddressesHook(BaseHook):
""" """
Reconfigure ceph-mon with the mgmt floating address Reconfigure ceph-mon with the mgmt floating address
""" """
@@ -935,16 +936,30 @@ class ReconfigureCephMonHook(BaseHook):
if self._to_release == "24.09" or self._to_release == "25.09": if self._to_release == "24.09" or self._to_release == "25.09":
system_mode = self.get_platform_conf("system_mode") system_mode = self.get_platform_conf("system_mode")
if (system_mode == self.SIMPLEX): if (system_mode == self.SIMPLEX):
mon_ip, ip = self.get_mon_ip()
# fix dnsmasq.addn_hosts until sysinv conductor fixes it definitely
if self._to_release == "25.09":
LOG.info("fix-sx-addr: fixing dnsmasq.addn_hosts")
addn_hosts = "/opt/platform/config/25.09/dnsmasq.addn_hosts"
for line in fileinput.input(files=addn_hosts, inplace=True):
cols = line.split()
if "controller-0.internal" in cols[1]:
line = line.replace(cols[0], ip)
elif "controller-1.internal" in cols[1]:
continue
print(line, end="")
if not self.is_ceph_configured(): if not self.is_ceph_configured():
LOG.info("ceph-mon: skipping reconfiguration, bare metal ceph not configured for mgmt") LOG.info("fix-sx-addr: skipping ceph mon reconfig, bare metal ceph not configured for mgmt")
return return
fsid = self.get_fsid() fsid = self.get_fsid()
mon_name = "controller-0" mon_name = "controller-0"
mon_ip, ip = self.get_mon_ip()
if not fsid or not mon_ip: if not fsid or not mon_ip:
LOG.exception("Invalid fsid or mon_ip") LOG.exception("Invalid fsid or mon_ip")
raise ValueError("Invalid params") raise ValueError("Invalid params")
LOG.info("ceph-mon: using fsid=%s, mon_name=%s, mon_ip=%s" % (fsid, mon_name, mon_ip)) LOG.info("fix-sx-addr: ceph mon: using fsid=%s, mon_name=%s, mon_ip=%s" % (fsid, mon_name, mon_ip))
cmds = [ cmds = [
["rm", "-f", "/etc/pmon.d/ceph.conf"], ["rm", "-f", "/etc/pmon.d/ceph.conf"],
@@ -964,14 +979,14 @@ class ReconfigureCephMonHook(BaseHook):
try: try:
for cmd in cmds: for cmd in cmds:
LOG.info("ceph-mon: exec: '%s'" % ' '.join(cmd)) LOG.info("fix-sx-addr: exec: '%s'" % ' '.join(cmd))
subprocess.check_call(cmd, timeout=8) subprocess.check_call(cmd, timeout=60)
LOG.info("ceph-mon: reconfiguration finished") LOG.info("fix-sx-addr: reconfiguration finished")
except subprocess.CalledProcessError as e: except subprocess.CalledProcessError as e:
LOG.exception("ceph-mon: failed executing the command '%s': %s" % (' '.join(cmd), str(e))) LOG.exception("fix-sx-addr: failed executing the command '%s': %s" % (' '.join(cmd), str(e)))
raise raise
else: else:
LOG.info("ceph-mon: skipping reconfiguration, system_mode is not simplex") LOG.info("fix-sx-addr: skipping reconfiguration, system_mode is not simplex")
class AbstractSysctlFlagHook(BaseHook, ABC): class AbstractSysctlFlagHook(BaseHook, ABC):
@@ -1151,7 +1166,7 @@ class HookManager(object):
FixPSQLPermissionHook, FixPSQLPermissionHook,
DeleteControllerFeedRemoteHook, DeleteControllerFeedRemoteHook,
RestartKubeApiServer, RestartKubeApiServer,
ReconfigureCephMonHook, FixSimplexAddressesHook,
CISSysctlFlagHookUpgrade, CISSysctlFlagHookUpgrade,
# enable usm-initialize service for next # enable usm-initialize service for next
# reboot only if everything else is done # reboot only if everything else is done
@@ -1168,7 +1183,7 @@ class HookManager(object):
RevertUmaskHook, RevertUmaskHook,
RevertCrtPermissionsHook, RevertCrtPermissionsHook,
LogPermissionRestorerHook, LogPermissionRestorerHook,
ReconfigureCephMonHook, FixSimplexAddressesHook,
CISSysctlFlagHookRollback, CISSysctlFlagHookRollback,
# enable usm-initialize service for next # enable usm-initialize service for next
# reboot only if everything else is done # reboot only if everything else is done

View File

@@ -15,7 +15,6 @@
# #
import logging import logging
import subprocess
import sys import sys
from packaging import version from packaging import version
@@ -66,8 +65,6 @@ def main():
except Exception as e: except Exception as e:
LOG.exception("Error: {}".format(e)) LOG.exception("Error: {}".format(e))
res = 1 res = 1
elif action == 'activate' and to_release_version == target_version:
restart_services_bound_to_controller0_address()
return res return res
@@ -143,21 +140,6 @@ def db_update(conn, query):
conn.commit() conn.commit()
def restart_services_bound_to_controller0_address():
services = (
'sm-api',
'fm-api',
)
for service in services:
LOG.info(f"Restarting {service}...")
try:
subprocess.run(['systemctl', 'restart', service], check=True, timeout=15)
except subprocess.TimeoutExpired:
LOG.error(f"Restarting {service} timed out.")
except subprocess.CalledProcessError as e:
LOG.error(f"Restarting {service} failed: {e.stderr}")
def get_system_mode(): def get_system_mode():
ini_str = '[DEFAULT]\n' + open('/etc/platform/platform.conf', 'r').read() ini_str = '[DEFAULT]\n' + open('/etc/platform/platform.conf', 'r').read()