Integrate lvm snapshot feature with upgrade process

This commit integrates the previously created lvm snapshot
code in [1], [2] and [3] with the upgrade/rollback process:

[1] https://review.opendev.org/c/starlingx/update/+/946371
[2] https://review.opendev.org/c/starlingx/update/+/946716
[3] https://review.opendev.org/c/starlingx/update/+/946835

NOTE: the feature is supported ONLY for AIO-SX for now
NOTE: the 'Depends-on' commit ensures that deploy precheck
      enforces the feature requirements are satisfied before
      allowing it to be enabled on deploy start
TODO: the snapshot removal after upgrade is successful will be
      implemented after the action 'delete' is included for
      the upgrade scripts; currently this is still in progress

- New functions are added to the lvm_snapshot module, to
  create and restore lvm snapshots safely, i.e.:
  - For snapshot creation, if it fails to create any snapshot
    then it will delete the others and proceed with the feature
    disabled
  - For snapshot restoration, it validates if all expected
    snapshots exist, if they aren't expired (not older than
    a time limit) and if they are valid (not 100% full) and
    only after all these conditions are satisfied the snapshots
    are restored. If any of them fail, rollback proceeds with
    the feature disabled
- Now, software controller effectively uses the --snapshot option
  to trigger snapshot creation during deploy start step
- lvm_snapshot.py module can be called as a standalone executable
  to allow it to be called from the deploy start script, reducing the
  dependency of the feature on the from-release code
- software controller restores the lvm snapshots during
  activate-rollback, so that the workflow is not changed (which
  is better from the point of view of orchestration)
- If lvm snapshot restoration fails, it will fall back to
  the standard activate-rollback procedure (upgrade scripts
  with action 'activate-rollback')
- Snapshots are now an object, and can override the default
  behavior of create/restore; one case was introduced to var-lv,
  to update software.json so that after host reboots the deployment
  data is correct

This commit also improves some log messages and removes deprecated code.

Test Plan
PASS: AIO-SX stx-10 -> stx-11 e2e upgrade with snapshot enabled
PASS: AIO-SX stx-10 -> stx-11 e2e rollback with snapshot enabled
PASS: AIO-SX stx-10 -> stx-11 e2e upgrade with snapshot disabled
PASS: AIO-SX stx-10 -> stx-11 e2e rollback with snapshot disabled
PASS: AIO-DX stx-10 -> stx-11, attempt to enable snapshot and
      verify deploy precheck blocks it
PASS: AIO-DX stx-10 -> stx-11 e2e upgrade with snapshot disabled

Depends-on: https://review.opendev.org/c/starlingx/update/+/946672

Story: 2011357
Task: 51981

Change-Id: I0759c424f9590947b349263a181a16e9d277741b
Signed-off-by: Heitor Matsui <heitorvieira.matsui@windriver.com>
This commit is contained in:
Heitor Matsui
2025-04-11 16:33:24 -03:00
parent 4f3d32a9e9
commit c86d830872
7 changed files with 379 additions and 65 deletions

View File

@@ -89,6 +89,8 @@ override_dh_install:
${SCRIPTDIR}/sync-controllers-feed
install -m 755 scripts/remove-temporary-data \
${SCRIPTDIR}/remove-temporary-data
install -m 755 software/lvm_snapshot.py \
${SCRIPTDIR}/manage-lvm-snapshots
install -d -m 755 $(ROOT)/usr/local/share/upgrade.d
install -p -D -m 755 upgrade-scripts/* $(ROOT)/usr/local/share/upgrade.d
install -d -m 755 $(ROOT)/etc/update.d

View File

@@ -166,10 +166,25 @@ class DeployStart:
raise
LOG.info("Cleanup complete")
def _take_lvm_snapshots(self):
    """
    Take LVM snapshots when the current deployment asks for them

    Reads the deployment data and, only when its "snapshot" entry is
    exactly True, runs the manage-lvm-snapshots script with --create.

    :raises subprocess.CalledProcessError: if the script exits with a
        non-zero return code (its stderr is logged before re-raising)
    """
    snapshot_enabled = upgrade_utils.get_deployment_data().get("snapshot") is True
    if not snapshot_enabled:
        LOG.info("LVM snapshot option is not enabled, skipping...")
        return

    LOG.info("LVM snapshot option enabled, proceeding to take snapshots...")
    create_cmd = [os.path.join(self.SCRIPT_DIR, "manage-lvm-snapshots"), "--create"]
    try:
        subprocess.run(create_cmd, check=True, text=True, capture_output=True)
    except subprocess.CalledProcessError as e:
        LOG.error("Error taking LVM snapshots: %s", e.stderr)
        raise
def run(self):
try:
self._check_directories()
self._checkout_ostree_repo()
self._take_lvm_snapshots()
self._prepare_mount_points()
self._prepare_data_migration()
self._create_postgres_database()

View File

@@ -273,3 +273,10 @@ def get_available_gib_in_vg():
raise Exception(msg)
return vfree
def get_deployment_data(software_json="/opt/software/software.json"):
    """
    Get the current deployment data

    :param software_json: path of the software.json file to read; defaults
        to the file used by the running system
    :returns: the first entry of the "deploy" list stored in the file
    :raises OSError: if the file cannot be opened
    :raises ValueError: if the file content is not valid JSON
    :raises TypeError: if the file has no "deploy" entry
    :raises IndexError: if the "deploy" list is empty
    """
    with open(software_json, "r") as fp:
        deployment = json.load(fp)
    return deployment.get("deploy")[0]

View File

@@ -52,7 +52,7 @@ DC_VAULT_LOADS_DIR = "%s/loads" % DC_VAULT_DIR
# Certificate
ENABLE_DEV_CERTIFICATE_PATCH_IDENTIFIER = 'ENABLE_DEV_CERTIFICATE'
# Software path's
# Software paths
SOFTWARE_STORAGE_DIR = "/opt/software"
SOFTWARE_CONFIG_FILE_LOCAL = "/etc/software/software.conf"
SOFTWARE_DEPLOY_FOLDER = "software-deploy"

View File

@@ -1,3 +1,4 @@
#!/usr/bin/python3
"""
Copyright (c) 2025 Wind River Systems, Inc.
@@ -5,10 +6,177 @@ SPDX-License-Identifier: Apache-2.0
"""
import argparse
import contextlib
from datetime import datetime
from datetime import timezone
import json
import logging
from pathlib import Path
import shutil
import subprocess
import sys
import tempfile
from software.software_functions import LOG
LOG = logging.getLogger("main_logger")
class LVMSnapshot:
    """
    Snapshot of a single logical volume

    Wraps the LVM commands used to create, inspect, mount, restore and
    delete the snapshot named "<lv_name>_snapshot" inside a volume group.
    """
    # report fields requested by get_attributes(), in output order
    ATTRIBUTES = ["lv_time", "lv_snapshot_invalid"]
    SEPARATOR = ","
    # format used to parse the lv_time field
    CREATE_DATE_MASK = "%Y-%m-%d %H:%M:%S %z"

    def __init__(self, vg_name, lv_name, lv_size=None):
        """
        :param vg_name: volume group that holds the logical volume
        :param lv_name: logical volume the snapshot belongs to
        :param lv_size: snapshot size passed to lvcreate; only needed
            by create()
        """
        self._vg_name = vg_name
        self._lv_name = lv_name
        self._lv_size = lv_size
        self._name = f"{lv_name}_snapshot"

    @property
    def lv_name(self):
        return self._lv_name

    @property
    def name(self):
        return self._name

    @staticmethod
    def get_command_abs_path(command):
        """
        Return the path of an LVM command under /usr/sbin
        """
        return Path("/usr/sbin") / command

    @staticmethod
    def run_command(command, shell=False, check=True):
        """
        Helper function to run shell commands and capture output
        :param command: command to be executed (can be list or string)
        :param shell: if command must run in a shell (command should be string)
        :param check: if subprocess.CalledProcessError must be raised when rc != 0
        :returns: the subprocess.CompletedProcess of the executed command
        """
        try:
            return subprocess.run(command, shell=shell, check=check,
                                  text=True, capture_output=True)
        except subprocess.CalledProcessError as e:
            LOG.error("Error executing command: %s\n%s", command, e.stderr)
            raise
        except Exception as e:
            LOG.error("Error executing command: %s", str(e))
            raise

    def to_json(self):
        """
        Return snapshot object in a json serializable format
        """
        return {
            "name": self._name,
            "vg_name": self._vg_name,
            "lv_name": self._lv_name,
        }

    def get_dev_path(self):
        """
        Return snapshot path under /dev
        """
        return Path("/dev") / self._vg_name / self._name

    @contextlib.contextmanager
    def mount(self):
        """
        Mount the snapshot in a temporary directory, so that its
        content can be manipulated to cover specific scenarios

        Yields the mount directory; on exit the directory is lazily
        unmounted and removed.
        """
        mount_dir = tempfile.mkdtemp(prefix=f"{self._lv_name}-", dir="/tmp")
        try:
            self.run_command(["/usr/bin/mount", self.get_dev_path(), mount_dir])
        except Exception as e:
            LOG.error("Error mounting snapshot: %s", str(e))
            # nothing was mounted: only the empty directory must be removed,
            # calling umount here would fail and hide the mount error
            shutil.rmtree(mount_dir, ignore_errors=True)
            raise
        LOG.info("Mounted %s under %s", self._name, mount_dir)
        try:
            yield mount_dir
        finally:
            # if umount raises, rmtree is not reached, so the content of a
            # still-mounted snapshot is never deleted through mount_dir
            self.run_command(["/usr/bin/umount", "-l", mount_dir])
            shutil.rmtree(mount_dir, ignore_errors=True)
            LOG.info("Directory %s unmounted and removed", mount_dir)

    def exists(self):
        """
        Check if a snapshot volume exists in the local filesystem

        :returns: True if an LV named exactly like the snapshot is listed
            for the volume group, False otherwise (including when the
            listing command returns a non-zero code)
        """
        command = [self.get_command_abs_path("lvs"), "--noheadings",
                   "-o", "lv_name", self._vg_name]
        result = self.run_command(command, check=False)
        if result.returncode != 0:
            return False
        # compare whole names: a substring/word match would also accept
        # LVs such as "foo-<name>"
        listed_names = {line.strip() for line in result.stdout.splitlines()}
        return self._name in listed_names

    def create(self):
        """
        Run the command to create a snapshot

        :raises ValueError: if the snapshot was instantiated without lv_size
        """
        if self._lv_size is None:
            raise ValueError(f"Snapshot size not defined for {self._name}")
        command = [self.get_command_abs_path("lvcreate"), "-y", "-L", self._lv_size, "-s", "-n",
                   self._name, Path("/dev") / self._vg_name / self._lv_name]
        self.run_command(command)

    def restore(self):
        """
        Run the command to restore a snapshot
        """
        command = [self.get_command_abs_path("lvconvert"), "-y", "--merge",
                   Path("/dev") / self._vg_name / self._name]
        self.run_command(command)

    def delete(self):
        """
        Run the command to delete a snapshot
        """
        command = [self.get_command_abs_path("lvremove"), "-f",
                   Path("/dev") / self._vg_name / self._name]
        self.run_command(command)

    def get_attributes(self):
        """
        Get the creation date and status for a snapshot

        :returns: tuple (creation datetime parsed with CREATE_DATE_MASK,
            True if the status field does not report "invalid")
        """
        command = [self.get_command_abs_path("lvdisplay"), self._vg_name, "--select",
                   f"lv_name={self._name}", "--noheadings",
                   f"--separator={self.SEPARATOR}",
                   "-C", "-o", ",".join(self.ATTRIBUTES)]
        process = self.run_command(command)
        output = process.stdout.strip()
        attributes = output.split(self.SEPARATOR)
        create_date = attributes[0]
        valid_state = "invalid" not in attributes[1].lower()
        return datetime.strptime(create_date, self.CREATE_DATE_MASK), valid_state
class VarSnapshot(LVMSnapshot):
    """
    Snapshot of var-lv, which needs its software.json refreshed
    before the default restore is executed
    """
    # path of software.json relative to the mounted snapshot
    SOFTWARE_JSON_SNAPSHOT = "rootdirs/opt/software/software.json"
    # path of software.json in the running system
    SOFTWARE_JSON_CURRENT = "/opt/software/software.json"

    def restore(self):
        """
        Override default restore behavior for var-lv; which has
        the following specific scenarios to treat on restore:
        - software.json needs to be updated to the current status, otherwise
          will be restored with the pre-deploy start content incorrectly

        Any failure while updating the file is logged and re-raised, in
        which case the snapshot restore command is not executed.
        """
        # bound before the try so the error log below is always able to
        # name the file, even when mounting the snapshot is what failed
        software_json = self.SOFTWARE_JSON_SNAPSHOT
        try:
            with self.mount() as mount_dir:
                software_json = Path(mount_dir) / self.SOFTWARE_JSON_SNAPSHOT
                shutil.copy2(self.SOFTWARE_JSON_CURRENT, software_json)
                LOG.info("Copied current deployment to %s", software_json)
                with open(software_json, "r") as fp:
                    content = json.load(fp)
                for host in content.get("deploy_host"):
                    host["state"] = "rollback-deployed"
                for deploy in content.get("deploy"):
                    deploy["state"] = "host-rollback-done"
                with open(software_json, "w") as fp:
                    json.dump(content, fp)
                LOG.info("Deployment data updated")
        except Exception as e:
            LOG.error("Failure updating %s: %s", software_json, str(e))
            raise
        super().restore()
class LVMSnapshotManager:
@@ -17,6 +185,9 @@ class LVMSnapshotManager:
"""
# LVM snapshot default constants
VOLUME_GROUP = "cgts-vg"
# NOTE: snapshots store the changes between the state when they were taken and
# the current state of the LV, so how much and how fast it fills up relates to
# how much and how fast data is changing in each LV during the upgrade
LOGICAL_VOLUMES = { # lv_name: snapshot_size
"docker-lv": "12G",
"etcd-lv": "2G",
@@ -26,6 +197,9 @@ class LVMSnapshotManager:
"rabbit-lv": "1G",
"var-lv": "3G",
}
# TODO(heitormatsui) revisit this value soon to check
# if it matches the feature requirements and expectations
SNAPSHOT_EXPIRE_TIME = 86400 # 24 hours
def __init__(self, vg_name=None, lvs=None):
self._vg_name = vg_name if vg_name is not None else self.VOLUME_GROUP
@@ -39,72 +213,178 @@ class LVMSnapshotManager:
def lvs(self):
return self._lvs
@staticmethod
def run_command(command, shell=False, check=True):
"""Helper function to run shell commands and capture output."""
try:
result = subprocess.run(command, shell=shell, check=check,
text=True, capture_output=True)
return result
except subprocess.CalledProcessError as e:
LOG.error("Error executing command: %s\n%s" % (command, e.stderr))
raise
def create_instance(self, lv_name):
    """
    Factory method to create snapshot instance; LVs that need to
    override the default snapshot behavior must inherit the base
    snapshot class and be registered in the mapping below
    """
    # LVs with a specific snapshot class; any other LV gets the generic one
    specific_classes = {"var-lv": VarSnapshot}
    snapshot_class = specific_classes.get(lv_name, LVMSnapshot)
    return snapshot_class(self.vg_name, lv_name)
def snapshot_exists(self, snapshot_name):
    """Tell whether snapshot_name shows up in the LV listing of the volume group."""
    lookup = f"lvs --noheadings -o lv_name {self.vg_name} | grep -w {snapshot_name}"
    # check=False: a non-zero return code means "not found", not an error
    return self.run_command(lookup, shell=True, check=False).returncode == 0
def create_snapshots(self):
"""Create snapshots for the specified logical volumes."""
def _create_snapshots(self):
"""Create snapshots for the specified logical volumes"""
LOG.info("Creating snapshots...")
for lv_name, lv_size in self.lvs.items():
snapshot_name = f"{lv_name}_snapshot"
if self.snapshot_exists(snapshot_name):
LOG.info("Snapshot already exists for %s. Skipping" % lv_name)
continue
snapshot = LVMSnapshot(self.vg_name, lv_name, lv_size)
if snapshot.exists():
LOG.info("Snapshot %s already exists, deleting snapshot...", snapshot.name)
snapshot.delete()
LOG.info("Creating snapshot for %s in volume group %s" % (lv_name, self.vg_name))
command = ["lvcreate", "-y", "-L", lv_size, "-s", "-n",
snapshot_name, Path("/dev") / self.vg_name / lv_name]
self.run_command(command)
snapshot.create()
LOG.info("Snapshots created successfully")
def restore_snapshots(self):
"""Activate LVM snapshots and prepare the system for rollback."""
def create_snapshots(self):
    """
    Create snapshots and return success only if all expected snapshots are created,
    if any snapshot creation returns error, all snapshots are cleared

    :returns: True if every snapshot was created, False otherwise
    """
    try:
        self._create_snapshots()
    except Exception:
        # LOG.exception keeps the original failure details in the log
        LOG.exception("Error creating snapshots, existing snapshots will be deleted")
        try:
            self.delete_snapshots()
        except Exception:
            # a failed cleanup must not break the True/False contract
            LOG.exception("Error deleting snapshots after failed creation")
        return False
    return True
def _restore_snapshots(self):
"""Activate LVM snapshots and prepare the system for rollback"""
LOG.info("Restoring all active snapshots...")
for lv_name in self.lvs.keys():
snapshot_name = f"{lv_name}_snapshot"
if not self.snapshot_exists(snapshot_name):
LOG.info("Snapshot %s for %s does not exist. Skipping" % (snapshot_name, lv_name))
snapshot = self.create_instance(lv_name)
if not snapshot.exists():
LOG.info("Snapshot %s for %s does not exist. Skipping", snapshot.name, lv_name)
continue
LOG.info("Restoring snapshot for %s: %s" % (lv_name, snapshot_name))
command = ["lvconvert", "-y", "--merge", Path("/dev") / self.vg_name / snapshot_name]
self.run_command(command)
LOG.info("Snapshots restored, please reboot to apply changes")
LOG.info("Restoring snapshot for %s: %s", lv_name, snapshot.name)
snapshot.restore()
LOG.info("Snapshots restored, reboot is needed to apply the changes")
def restore_snapshots(self, force=False):
    """
    Restore snapshots, but only after doing sanity checks:
    - If all expected snapshots exist
    - If snapshots are valid (if a snapshot reaches its maximum size it is invalidated)
    - If snapshots haven't expired (a snapshot expires if it is older than a determined period)

    :param force: if True, expired snapshots are still restored; missing
        or invalid snapshots always block the restore
    :returns: True if the snapshots were restored, False otherwise
    """
    # check for snapshot existence
    # TODO(heitormatsui) optimize by calling one single command and check all
    # existing snapshots from its output instead of calling it for each LV
    snapshots = self.list_snapshots()
    found_lv_names = {snapshot.lv_name for snapshot in snapshots}
    missing_lv_names = set(self._lvs) - found_lv_names
    if missing_lv_names:
        LOG.error("Cannot proceed with snapshot restore, missing snapshots for %s",
                  missing_lv_names)
        return False
    LOG.info("All expected snapshots found")

    # check for invalid or expired snapshots
    now = datetime.now(tz=timezone.utc)
    expired_snapshots = []
    invalid_snapshots = []
    for snapshot in snapshots:
        create_date, valid_state = snapshot.get_attributes()
        # total_seconds() accounts for whole days; timedelta.seconds is only
        # the 0..86399 remainder and would never exceed the expire time
        age = int((now - create_date).total_seconds())
        if valid_state is False:
            LOG.error("Snapshot %s is invalid", snapshot.name)
            invalid_snapshots.append(snapshot.name)
        elif age > self.SNAPSHOT_EXPIRE_TIME:
            LOG.error("Snapshot %s is expired, age: %s seconds", snapshot.name, age)
            expired_snapshots.append(snapshot.name)
        else:
            LOG.info("Snapshot %s can be used", snapshot.name)

    if invalid_snapshots:
        LOG.error("Cannot proceed with snapshot restore, "
                  "invalid snapshots: %s", invalid_snapshots)
        return False

    if expired_snapshots:
        if force:
            LOG.warning("Force restore requested, allowing to proceed with "
                        "expired snapshots: %s", expired_snapshots)
        else:
            LOG.error("Cannot proceed with snapshot restore, "
                      "expired snapshots: %s", expired_snapshots)
            return False

    # restore snapshots
    LOG.info("All snapshots validated")
    try:
        self._restore_snapshots()
    except Exception:
        # keep the failure reason in the log instead of dropping it silently
        LOG.exception("Error restoring snapshots")
        return False
    return True
def delete_snapshots(self):
"""Deactivate and delete any active snapshots and remove the rollback marker."""
"""Delete any active snapshots"""
LOG.info("Deleting all active snapshots...")
for lv_name in self.lvs.keys():
snapshot_name = f"{lv_name}_snapshot"
if self.snapshot_exists(snapshot_name):
LOG.info("Deleting snapshot for %s: %s" % (lv_name, snapshot_name))
command = ["lvremove", "-f", Path("/dev") / self.vg_name / snapshot_name]
self.run_command(command)
snapshot = LVMSnapshot(self.vg_name, lv_name)
if snapshot.exists():
LOG.info("Deleting snapshot for %s: %s" % (lv_name, snapshot.name))
snapshot.delete()
else:
LOG.info("Snapshot %s does not exist or is already deleted" % snapshot_name)
LOG.info("Snapshot %s does not exist or was already deleted", snapshot.name)
LOG.info("Snapshots deleted successfully")
def check_snapshots(self):
def list_snapshots(self):
"""Check if any snapshots exist for the specified logical volumes."""
LOG.info("Checking for existing LVM snapshots...")
snapshots_found = []
snapshots = []
for lv_name in self.lvs.keys():
snapshot_name = f"{lv_name}_snapshot"
if self.snapshot_exists(snapshot_name):
LOG.info("Snapshot exists for %s: %s" % (lv_name, snapshot_name))
snapshots_found.append(snapshot_name)
snapshot = LVMSnapshot(self.vg_name, lv_name)
if snapshot.exists():
LOG.info("Snapshot exists for %s: %s", lv_name, snapshot.name)
snapshots.append(snapshot)
else:
LOG.info("No snapshot found for %s: %s" % (lv_name, snapshot_name))
return snapshots_found
LOG.info("No snapshot found for %s", lv_name)
return snapshots
def main(argv=None):
    """
    Main function to be executed when called as an executable

    :param argv: list of command line arguments; when None the
        arguments are taken from sys.argv by argparse
    :returns: 0 on success (or when no action is requested), 1 on failure
    """
    parser = argparse.ArgumentParser()
    # only one action is executed per call, so conflicting options are
    # rejected instead of silently running just the first one
    actions = parser.add_mutually_exclusive_group()
    actions.add_argument("-c", "--create",
                         action="store_true",
                         help="Create LVM snapshots")
    actions.add_argument("-r", "--restore",
                         action="store_true",
                         help="Restore LVM snapshots")
    actions.add_argument("-d", "--delete",
                         action="store_true",
                         help="Delete LVM snapshots")
    actions.add_argument("-l", "--list",
                         action="store_true",
                         help="List existing snapshots")

    args = parser.parse_args(argv)
    if not (args.create or args.restore or args.delete or args.list):
        parser.print_usage()
        return 0

    success = True
    try:
        manager = LVMSnapshotManager()
        if args.create:
            success = manager.create_snapshots()
        elif args.restore:
            success = manager.restore_snapshots()
        elif args.delete:
            manager.delete_snapshots()
        else:
            snapshots = [snapshot.to_json() for snapshot in manager.list_snapshots()]
            print(json.dumps(snapshots, indent=4))
    except Exception as e:
        LOG.exception("Error: %s", str(e))
        success = False
    return 0 if success else 1
# Entry point used when the module is executed directly as a script;
# the process exit code is the value returned by main()
if __name__ == "__main__":
# NOTE(review): imported here instead of at module level, presumably because
# upgrade_utils is only importable next to the installed standalone script
# (pylint cannot resolve it) - confirm
import upgrade_utils # pylint: disable=E0401
# send the module log messages to the software log file at INFO level
upgrade_utils.configure_logging('/var/log/software.log', log_level=logging.INFO)
sys.exit(main())

View File

@@ -16,7 +16,6 @@ import sys
import time
import software.ostree_utils as ostree_utils
from software.lvm_snapshot import LVMSnapshotManager
from software.software_functions import configure_logging
from software.software_functions import execute_agent_hooks
from software.software_functions import LOG
@@ -378,20 +377,11 @@ class SoftwareMessageDeployDeleteCleanupReq(messages.PatchMessage):
# undeploy the from-release ostree deployment to free sysroot disk space
success_ostree_undeploy_from_release = ostree_utils.undeploy_inactive_deployments()
# remove the lvm snapshots created during the upgrade process
success_remove_lvm_snapshots = True
try:
lsm = LVMSnapshotManager()
lsm.delete_snapshots()
except Exception:
success_remove_lvm_snapshots = False
cleanup_results = [
(success_ostree_remote_cleanup, "cleaning temporary refs/remotes"),
(success_ostree_remote_update, "updating default remote"),
(success_remove_upgrade_flags, "removing local upgrade flags"),
(success_ostree_undeploy_from_release, "undeploying from-release ostree deployment"),
(success_remove_lvm_snapshots, "removing LVM snapshots"),
]
for result, log_msg in cleanup_results:
if result not in [None, False]:

View File

@@ -33,6 +33,7 @@ from fm_api import constants as fm_constants
from oslo_config import cfg as oslo_cfg
import software.apt_utils as apt_utils
import software.lvm_snapshot as lvm_snapshot
import software.ostree_utils as ostree_utils
from software.api import app
from software.authapi import app as auth_app
@@ -3279,10 +3280,8 @@ class PatchController(PatchService):
if is_patch:
deploy_state.start(running_release, to_release, feed_repo, None, reboot_required)
else:
# TODO(bqian) remove default latest commit when a commit-id is built into GA metadata
if commit_id is None:
commit_id = ostree_utils.get_feed_latest_commit(deploy_sw_version)
deploy_state.start(running_release, to_release, feed_repo, commit_id, reboot_required)
deploy_state.start(running_release, to_release, feed_repo, commit_id,
reboot_required, snapshot=snapshot)
# Start applying the releases
upgrade = not is_patch
@@ -3805,6 +3804,27 @@ class PatchController(PatchService):
upgrade_activate_rollback_cmd = [
"source", "/etc/platform/openrc;", cmd_path, from_release, to_release]
# check if LVM snapshots are enabled and try to restore them
# TODO(heitormatsui): we don't really need to verify the system mode
# as LVM snapshots will only be allowed if the system is AIO-SX
system_mode = utils.get_platform_conf("system_mode")
if system_mode == constants.SYSTEM_MODE_SIMPLEX:
deploy = self.db_api_instance.get_deploy_all()[0]
enabled_lvm_snapshots = deploy.get("snapshot")
if enabled_lvm_snapshots:
LOG.info("LVM snapshots are enabled")
manager = lvm_snapshot.LVMSnapshotManager()
success = manager.restore_snapshots()
if success:
LOG.info("LVM snapshots were restored, upgrade scripts with "
"action=activate-rollback will be skipped")
deploy_state = DeployState.get_instance()
deploy_state.activate_rollback_done()
return
else:
LOG.warning("Failure restoring LVM snapshots, falling back "
"to standard activate-rollback procedure")
try:
LOG.info("starting subprocess %s" % ' '.join(upgrade_activate_rollback_cmd))
subprocess.Popen(' '.join(upgrade_activate_rollback_cmd), start_new_session=True, shell=True, env=env)