More upgrade/rollback fixes for mgmt address reduction

Two issues were fixed:

1. Increase ceph mon agent-hooks commands time out value
   In some cases during rollback the timeout value as too short and
   some commands aborted before completion.
2. Fix fm-api not binding after deploy-host, unlock and reboot
   The previous solution consisted in restarting the fm-api service
   during deploy-activate, but still caused the fm-api to be unavailable
   between boot up and activate and if doing rollback at this point.
   The new solution consists of fixing dnsmasq.addn_hosts during
   deploy-host.

Test plan:
- SX upgrade
- SX rollback

Story: 2011191
Task: 52632

Change-Id: I2705a97500427031c2a0a69a6fd59f42b2e239f8
Signed-off-by: Caio Bruchert <caio.bruchert@windriver.com>
This commit is contained in:
Caio Bruchert
2025-08-05 13:41:20 -03:00
parent 25923e0c4e
commit 7abbc8b537
2 changed files with 26 additions and 29 deletions

View File

@@ -18,6 +18,7 @@ from abc import ABC
from abc import abstractmethod
import configparser
import filecmp
import fileinput
import glob
import logging as LOG
import os
@@ -857,7 +858,7 @@ class LogPermissionRestorerHook(BaseHook):
self.restore_cron_permissions()
class ReconfigureCephMonHook(BaseHook):
class FixSimplexAddressesHook(BaseHook):
"""
Reconfigure ceph-mon with the mgmt floating address
"""
@@ -935,16 +936,30 @@ class ReconfigureCephMonHook(BaseHook):
if self._to_release == "24.09" or self._to_release == "25.09":
system_mode = self.get_platform_conf("system_mode")
if (system_mode == self.SIMPLEX):
mon_ip, ip = self.get_mon_ip()
# fix dnsmasq.addn_hosts until sysinv conductor fixes it definitely
if self._to_release == "25.09":
LOG.info("fix-sx-addr: fixing dnsmasq.addn_hosts")
addn_hosts = "/opt/platform/config/25.09/dnsmasq.addn_hosts"
for line in fileinput.input(files=addn_hosts, inplace=True):
cols = line.split()
if "controller-0.internal" in cols[1]:
line = line.replace(cols[0], ip)
elif "controller-1.internal" in cols[1]:
continue
print(line, end="")
if not self.is_ceph_configured():
LOG.info("ceph-mon: skipping reconfiguration, bare metal ceph not configured for mgmt")
LOG.info("fix-sx-addr: skipping ceph mon reconfig, bare metal ceph not configured for mgmt")
return
fsid = self.get_fsid()
mon_name = "controller-0"
mon_ip, ip = self.get_mon_ip()
if not fsid or not mon_ip:
LOG.exception("Invalid fsid or mon_ip")
raise ValueError("Invalid params")
LOG.info("ceph-mon: using fsid=%s, mon_name=%s, mon_ip=%s" % (fsid, mon_name, mon_ip))
LOG.info("fix-sx-addr: ceph mon: using fsid=%s, mon_name=%s, mon_ip=%s" % (fsid, mon_name, mon_ip))
cmds = [
["rm", "-f", "/etc/pmon.d/ceph.conf"],
@@ -964,14 +979,14 @@ class ReconfigureCephMonHook(BaseHook):
try:
for cmd in cmds:
LOG.info("ceph-mon: exec: '%s'" % ' '.join(cmd))
subprocess.check_call(cmd, timeout=8)
LOG.info("ceph-mon: reconfiguration finished")
LOG.info("fix-sx-addr: exec: '%s'" % ' '.join(cmd))
subprocess.check_call(cmd, timeout=60)
LOG.info("fix-sx-addr: reconfiguration finished")
except subprocess.CalledProcessError as e:
LOG.exception("ceph-mon: failed executing the command '%s': %s" % (' '.join(cmd), str(e)))
LOG.exception("fix-sx-addr: failed executing the command '%s': %s" % (' '.join(cmd), str(e)))
raise
else:
LOG.info("ceph-mon: skipping reconfiguration, system_mode is not simplex")
LOG.info("fix-sx-addr: skipping reconfiguration, system_mode is not simplex")
class AbstractSysctlFlagHook(BaseHook, ABC):
@@ -1151,7 +1166,7 @@ class HookManager(object):
FixPSQLPermissionHook,
DeleteControllerFeedRemoteHook,
RestartKubeApiServer,
ReconfigureCephMonHook,
FixSimplexAddressesHook,
CISSysctlFlagHookUpgrade,
# enable usm-initialize service for next
# reboot only if everything else is done
@@ -1168,7 +1183,7 @@ class HookManager(object):
RevertUmaskHook,
RevertCrtPermissionsHook,
LogPermissionRestorerHook,
ReconfigureCephMonHook,
FixSimplexAddressesHook,
CISSysctlFlagHookRollback,
# enable usm-initialize service for next
# reboot only if everything else is done

View File

@@ -15,7 +15,6 @@
#
import logging
import subprocess
import sys
from packaging import version
@@ -66,8 +65,6 @@ def main():
except Exception as e:
LOG.exception("Error: {}".format(e))
res = 1
elif action == 'activate' and to_release_version == target_version:
restart_services_bound_to_controller0_address()
return res
@@ -143,21 +140,6 @@ def db_update(conn, query):
conn.commit()
def restart_services_bound_to_controller0_address():
services = (
'sm-api',
'fm-api',
)
for service in services:
LOG.info(f"Restarting {service}...")
try:
subprocess.run(['systemctl', 'restart', service], check=True, timeout=15)
except subprocess.TimeoutExpired:
LOG.error(f"Restarting {service} timed out.")
except subprocess.CalledProcessError as e:
LOG.error(f"Restarting {service} failed: {e.stderr}")
def get_system_mode():
ini_str = '[DEFAULT]\n' + open('/etc/platform/platform.conf', 'r').read()