Upgrade/rollback fixes for mgmt addr reduction

The following issues started after mgmt address reduction feature
was merged:

1. upgrade: on some occasions fm-api and sm-api fail to start
  This happened because the processes can start before the
  dnsmasq.addn_hosts file is updated.
  The fix consists in restarting both services during upgrade activate.

2. rollback: on some occasions the agent-hooks 'ceph mon start' command
  times out.
  This happened because ceph mon start needs the correct IP address to
  be created in Linux and it will be created only after host unlock.
  The fix consists in adding the IP temporarily before running the
  'ceph mon start' command.

3. ceph alarm after finishing the rollback process.
  This happened because the last ceph reconfiguration command was not
  running for rollback.
  The fix consists in checking the correct state to run the command.

Test plan:
- SX upgrade
- SX rollback

Story: 2011191
Task: 52558

Change-Id: Ibfc8d3aaf8f95f5a3250b0826fe40d888bb6d760
Signed-off-by: Caio Bruchert <caio.bruchert@windriver.com>
This commit is contained in:
Caio Bruchert
2025-07-21 17:09:33 -03:00
parent 248a8911a0
commit 4aa5b7759c
3 changed files with 36 additions and 15 deletions

View File

@@ -29,10 +29,6 @@ from ipaddress import ip_address
from ipaddress import IPv6Address
import psycopg2
import software.constants as constants
import software.utils as utils
log_format = ('%(asctime)s: ' + '[%(process)s]: '
'%(filename)s(%(lineno)s): %(levelname)s: %(message)s')
LOG.basicConfig(filename="/var/log/software.log",
@@ -877,24 +873,21 @@ class ReconfigureCephMonHook(BaseHook):
fields = line.split()
if fields[1] == host:
ip = fields[0]
if isinstance(ip_address(ip), IPv6Address):
ip = f"[{ip}]"
return ip
mon_ip = f"[{ip}]" if isinstance(ip_address(ip), IPv6Address) else ip
return mon_ip, ip
return None
def run(self):
# Handle both upgrade to 25.09 and rollback to 24.09
if self._to_release == "24.09" or self._to_release == "25.09":
system_type = utils.get_platform_conf("system_type")
system_mode = utils.get_platform_conf("system_mode")
if (system_type == constants.SYSTEM_TYPE_ALL_IN_ONE and
system_mode == constants.SYSTEM_MODE_SIMPLEX):
system_mode = self.get_platform_conf("system_mode")
if (system_mode == self.SIMPLEX):
if not self.is_ceph_configured():
LOG.info("ceph-mon: skipping reconfiguration, bare metal ceph not configured for mgmt")
return
fsid = self.get_fsid()
mon_name = "controller-0"
mon_ip = self.get_mon_ip()
mon_ip, ip = self.get_mon_ip()
if not fsid or not mon_ip:
LOG.exception("Invalid fsid or mon_ip")
raise ValueError("Invalid params")
@@ -911,6 +904,10 @@ class ReconfigureCephMonHook(BaseHook):
["/etc/init.d/ceph", "start", "mon"],
["ln", "-s", "/etc/ceph/ceph.conf.pmon", "/etc/pmon.d/ceph.conf"],
]
if self._to_release == "24.09":
# For /etc/init.d/ceph start mon to work during rollback, need to add mon_ip temporarily
# to the loopback. This will be corrected permanently after host unlock and reboot.
cmds.insert(0, ["ip", "address", "replace", f"{ip}", "dev", "lo"])
try:
for cmd in cmds:

View File

@@ -17,6 +17,11 @@ FROM_RELEASE=$1
TO_RELEASE=$2
ACTION=$3
FROM_RELEASE_ARR=(${FROM_RELEASE//./ })
FROM_RELEASE_MAJOR=${FROM_RELEASE_ARR[0]}
TO_RELEASE_ARR=(${TO_RELEASE//./ })
TO_RELEASE_MAJOR=${TO_RELEASE_ARR[0]}
SOFTWARE_LOG_PATH="/var/log/software.log"
function log {
@@ -26,7 +31,8 @@ function log {
log "ceph-mon: enable ceph-mon msgr2"\
"from $FROM_RELEASE to $TO_RELEASE with action $ACTION"
if [[ "$ACTION" == "activate" && "$FROM_RELEASE" == "24.09" ]] || [[ "$ACTION" == "delete" && "$TO_RELEASE" == "24.09" ]]; then
if [[ "$ACTION" == "activate" && ${TO_RELEASE_MAJOR} -eq 25 ]] || \
[[ "$ACTION" == "delete" && ${TO_RELEASE_MAJOR} -eq 24 ]]; then
source /etc/platform/platform.conf
if [[ "${system_mode}" == "simplex" ]]; then
if [[ -f /etc/platform/.node_ceph_configured ]]; then

View File

@@ -15,6 +15,7 @@
#
import logging
import subprocess
import sys
from packaging import version
@@ -55,8 +56,8 @@ def main():
res = 0
to_release_version = version.Version(to_release)
target_version = version.Version("25.09")
if action == 'migrate' and to_release_version == target_version:
if get_system_mode() == "simplex":
if get_system_mode() == "simplex":
if action == 'migrate' and to_release_version == target_version:
try:
conn = psycopg2.connect("dbname=sysinv user=postgres port=%s"
% postgres_port)
@@ -65,6 +66,8 @@ def main():
except Exception as e:
LOG.exception("Error: {}".format(e))
res = 1
elif action == 'activate' and to_release_version == target_version:
restart_services_bound_to_controller0_address()
return res
@@ -140,6 +143,21 @@ def db_update(conn, query):
conn.commit()
def restart_services_bound_to_controller0_address():
    """Restart the services listed below via ``systemctl restart``.

    Each service is restarted in turn with a 15 second timeout. A timeout
    or a non-zero exit status is logged and the loop moves on to the next
    service, so one failing restart does not prevent the others from being
    attempted. Nothing is returned and neither of those two failure modes
    is propagated to the caller.
    """
    services = (
        'sm-api',
        'fm-api',
    )
    for service in services:
        LOG.info(f"Restarting {service}...")
        try:
            # stderr must be captured explicitly; otherwise
            # CalledProcessError.stderr is None and the failure reason
            # would never reach the log.
            subprocess.run(
                ['systemctl', 'restart', service],
                check=True,
                timeout=15,
                stderr=subprocess.PIPE,
                universal_newlines=True,
            )
        except subprocess.TimeoutExpired:
            LOG.error(f"Restarting {service} timed out.")
        except subprocess.CalledProcessError as e:
            LOG.error(f"Restarting {service} failed: {(e.stderr or '').strip()}")
def get_system_mode():
ini_str = '[DEFAULT]\n' + open('/etc/platform/platform.conf', 'r').read()