From 28ab0e7aa19a8c1b4fc1ce0eedbdc953af200522 Mon Sep 17 00:00:00 2001 From: Heitor Matsui Date: Thu, 22 May 2025 18:19:21 -0300 Subject: [PATCH] Replace etcd cloning for symlink Previously, the etcd to-release directory was a clone of the from-release database; with the current code a symlink is created instead, so the etcd database is the same throughout the upgrade procedure, and the symlink is removed and from-release directory is renamed to to-release during deploy delete. This change is proposed to fix an issue occurring in multinode systems where there is a mismatch between etcd (kubectl) and the pods effectively running in a host (crictl), ultimately leading to deploy activate failures. Test Plan PASS: AIO-SX e2e stx-10 -> stx-11 upgrade PASS: AIO-DX e2e stx-10 -> stx-11 upgrade PASS: AIO-DX orchestrated stx-10 -> stx-11 upgrade Closes-bug: 2111588 Change-Id: I19bdffbbe7325e3edd9c45751dcac4af66acdf97 Signed-off-by: Heitor Matsui --- software/scripts/prepare-data-migration | 9 ++++++--- software/software/software_functions.py | 24 +++++++++++++++++++++++- 2 files changed, 29 insertions(+), 4 deletions(-) diff --git a/software/scripts/prepare-data-migration b/software/scripts/prepare-data-migration index 22339928..ca170ab1 100644 --- a/software/scripts/prepare-data-migration +++ b/software/scripts/prepare-data-migration @@ -157,11 +157,14 @@ class DataMigration(object): etcd_to_dir = os.path.join(ETCD_PATH, self.to_release) etcd_from_dir = os.path.join(ETCD_PATH, self.from_release) - shutil.rmtree(etcd_to_dir, ignore_errors=True) + if os.path.islink(etcd_to_dir): + os.unlink(etcd_to_dir) + elif os.path.isdir(etcd_to_dir): + shutil.rmtree(etcd_to_dir, ignore_errors=True) try: - shutil.copytree(etcd_from_dir, etcd_to_dir) - LOG.info("Copied etcd from %s to %s completed", etcd_from_dir, etcd_to_dir) + os.symlink(etcd_from_dir, etcd_to_dir, target_is_directory=True) + LOG.info("Symlink etcd from %s to %s completed", etcd_from_dir, etcd_to_dir) except Exception as e: 
LOG.exception("Failed to copy etcd from %s to %s. Error: %s.", etcd_from_dir, etcd_to_dir, e.output) diff --git a/software/software/software_functions.py b/software/software/software_functions.py index dc73facf..b0e8eec5 100644 --- a/software/software/software_functions.py +++ b/software/software/software_functions.py @@ -1553,11 +1553,33 @@ def clean_up_deployment_data(major_release): os.path.join(constants.POSTGRES_PATH, constants.UPGRADE), os.path.join(constants.POSTGRES_PATH, major_release), os.path.join(constants.RABBIT_PATH, major_release), - os.path.join(constants.ETCD_PATH, major_release), ] for folder in upgrade_folders: shutil.rmtree(folder, ignore_errors=True) + # etcd has different cleanup procedure: + # - remove the to-release symlink + # - rename from-release directory to to-release + # - restart etcd process + etcd_from_path = os.path.join(constants.ETCD_PATH, major_release) + etcd_to_path = os.path.join(constants.ETCD_PATH, SW_VERSION) + if utils.compare_release_version(SW_VERSION, major_release): + if os.path.islink(etcd_to_path): + os.unlink(etcd_to_path) + LOG.info("Removed %s symlink", etcd_to_path) + os.rename(etcd_from_path, etcd_to_path) + LOG.info("Renamed %s directory to %s", etcd_from_path, etcd_to_path) + try: + subprocess.run(["/usr/bin/sm-restart-safe", "service", "etcd"], check=True) + LOG.info("Restarted etcd service") + except subprocess.CalledProcessError as e: + LOG.error("Error restarting etcd: %s", str(e)) + # on rollback, only the symlink needs to be removed + else: + if os.path.islink(etcd_from_path): + os.unlink(etcd_from_path) + LOG.info("Removed %s symlink", etcd_from_path) + def remove_major_release_deployment_flags(): """