Files
update/software/scripts/deploy-start
Heitor Matsui c86d830872 Integrate lvm snapshot feature with upgrade process
This commit integrates the previously created lvm snapshot
code in [1], [2] and [3] with the upgrade/rollback process:

[1] https://review.opendev.org/c/starlingx/update/+/946371
[2] https://review.opendev.org/c/starlingx/update/+/946716
[3] https://review.opendev.org/c/starlingx/update/+/946835

NOTE: the feature is supported ONLY for AIO-SX for now
NOTE: the 'Depends-on' commit ensures that deploy precheck
      enforces the feature requirements are satisfied before
      allowing it to be enabled on deploy start
TODO: the snapshot removal after upgrade is successful will be
      implemented after the action 'delete' is included for
      the upgrade scripts; currently this is still in progress

- New functions are added to the lvm_snapshot module, to
  create and restore lvm snapshots safely, i.e.:
  - For snapshot creation, if it fails to create any snapshot
    then it will delete the others and proceed with the feature
    disabled
  - For snapshot restoration, it validates if all expected
    snapshots exist, if they aren't expired (not older than
    a time limit) and if they are valid (not 100% full) and
    only after all these conditions are satisfied the snapshots
    are restored. If any of them fail, rollback proceeds with
    the feature disabled
- Now, software controller effectively uses the --snapshot option
  to trigger snapshot creation during deploy start step
- lvm_snapshot.py module can be called as a standalone executable
  to allow it to be called from deploy start script, reducing the
  dependency of the feature on the from-release code
- software controller restores the lvm snapshots during
  activate-rollback, so that the workflow is not changed (which
  is better from the point of view of orchestration)
- If lvm snapshot restoration fails, it will fall back to
  the standard activate-rollback procedure (upgrade scripts
  with action 'activate-rollback')
- Snapshots are now an object, and can override the default
  behavior of create/restore; one case was introduced to var-lv,
  to update software.json so that after host reboots the deployment
  data is correct

This commit also improves some log messages and removes deprecated code.

Test Plan
PASS: AIO-SX stx-10 -> stx-11 e2e upgrade with snapshot enabled
PASS: AIO-SX stx-10 -> stx-11 e2e rollback with snapshot enabled
PASS: AIO-SX stx-10 -> stx-11 e2e upgrade with snapshot disabled
PASS: AIO-SX stx-10 -> stx-11 e2e rollback with snapshot disabled
PASS: AIO-DX stx-10 -> stx-11, attempt to enable snapshot and
      verify deploy precheck blocks it
PASS: AIO-DX stx-10 -> stx-11 e2e upgrade with snapshot disabled

Depends-on: https://review.opendev.org/c/starlingx/update/+/946672

Story: 2011357
Task: 51981

Change-Id: I0759c424f9590947b349263a181a16e9d277741b
Signed-off-by: Heitor Matsui <heitorvieira.matsui@windriver.com>
2025-05-23 17:19:39 -03:00

237 lines
9.7 KiB
Python
Executable File

#!/usr/bin/python3
#
# Copyright (c) 2023-2025 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
# This script is to start a major release deployment. It does the following:
# 1. collect data from running system for migration,
# 2. create bind mounts to deployment directory
# 3. copy necessary data to deployment directory
# 4. in chroot, start 2nd instance of PostgreSQL database service
# 5. perform data migration
#
import logging
import os
import shutil
import subprocess
import sys
import upgrade_utils
LOG = logging.getLogger('main_logger')
class DeployStart:
    """Run the "deploy start" step of a major release deployment.

    ``run()`` executes, in order: a sanity check of the staging area, the
    ostree checkout of the target commit, the optional LVM snapshot creation,
    mount point preparation, data migration preparation, creation of the
    temporary PostgreSQL database, the data migration itself and the feed
    sync between controllers. The outcome is reported through
    ``/usr/bin/software-deploy-update`` and temporary data is removed
    afterwards regardless of the outcome.
    """

    STAGING_DIR = "/sysroot/upgrade"
    SYSROOT_DIR = os.path.join(STAGING_DIR, "sysroot")
    OSTREE_BRANCH = "starlingx"
    REPORT_AGENT = "deploy-start"
    SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))  # this script location
    DEPLOY_STATE_START_DONE = "start-done"
    DEPLOY_STATE_START_FAILED = "start-failed"

    def __init__(self, from_version, to_version, k8s_version, postgres_port,
                 feed_ostree_repo_dir, commit_id=None, ignore_errors=False):
        """Store the deployment parameters.

        :param from_version: release being upgraded from
        :param to_version: release being upgraded to
        :param k8s_version: kubernetes version; leading "v" characters are stripped
        :param postgres_port: port passed to the temporary PostgreSQL instance
        :param feed_ostree_repo_dir: path of the feed ostree repository
        :param commit_id: ostree commit to check out; when None the latest
                          commit of OSTREE_BRANCH is looked up
        :param ignore_errors: stored as-is; not read by any method of this class
        """
        self._from_version = from_version
        self._to_version = to_version
        self._k8s_version = k8s_version.lstrip("v")
        self._postgres_port = postgres_port
        self._feed_ostree_repo_dir = feed_ostree_repo_dir
        self._feed_ostree_repo_url = f"file://{feed_ostree_repo_dir}"
        self._commit_id = commit_id
        self._ignore_errors = ignore_errors

    def _update_deploy_state(self, state):
        """Report ``state`` for this agent; a failing report is only logged."""
        try:
            script_path = "/usr/bin/software-deploy-update"
            cmd = [script_path, "-s", state, self.REPORT_AGENT]
            subprocess.run(cmd, check=True)
        except subprocess.CalledProcessError as e:
            # The command output is not captured, so e.stderr is always None
            # here; log the exception itself (command and exit status).
            LOG.error(f"Failed updating deploy state: {e}")

    def _check_directories(self):
        """Raise OSError if the checkout target directory already exists."""
        for directory in [self.SYSROOT_DIR]:
            if os.path.isdir(directory):
                error_msg = (f"{directory} already exists. Please ensure to "
                             f"clean up the environment before proceeding")
                LOG.error(error_msg)
                raise OSError(error_msg)

    def _checkout_ostree_repo(self):
        """Check out the target commit of the feed repo into SYSROOT_DIR.

        When no commit-id was given, the latest commit of OSTREE_BRANCH is
        resolved first. Re-raises subprocess.CalledProcessError on failure.
        """
        # TODO(bqian) make commit_id mandatory once the commit-id is built to metadata.xml for major releases
        if self._commit_id is None:
            LOG.info("Retrieving commit-id...")
            # get commit id, only latest for now
            try:
                cmd = ["ostree", "rev-parse", f"--repo={self._feed_ostree_repo_dir}", self.OSTREE_BRANCH]
                process = subprocess.run(cmd, check=True, text=True, capture_output=True)
                self._commit_id = process.stdout.strip()
            except subprocess.CalledProcessError as e:
                LOG.error(f"Failed to retrieve commit-id: {e.stderr}")
                raise
            LOG.info(f"Latest commit-id: {self._commit_id}")

        LOG.info(f"Checking out ostree repo, commit-id: {self._commit_id}")
        os.makedirs(self.STAGING_DIR, exist_ok=True)
        try:
            cmd = ["ostree", "checkout", f"--repo={self._feed_ostree_repo_dir}", self._commit_id, self.SYSROOT_DIR]
            subprocess.run(cmd, check=True, text=True, capture_output=True)
        except subprocess.CalledProcessError as e:
            LOG.error(f"Failed to checkout commit-id {self._commit_id}: {e.stderr}")
            raise
        LOG.info(f"Checked out ostree repo in {self.SYSROOT_DIR}")

    def _prepare_mount_points(self):
        """Run prepare-chroot-mounts on SYSROOT_DIR and copy admin.conf into it.

        Re-raises subprocess.CalledProcessError if the mount script fails; an
        error raised by the file copy propagates unchanged.
        """
        # create proper mounts on deploy file system
        LOG.info("Creating mount points...")
        try:
            script_path = os.path.join(self.SCRIPT_DIR, "prepare-chroot-mounts")
            cmd = [script_path, self.SYSROOT_DIR, "-m"]
            subprocess.run(cmd, check=True, text=True, capture_output=True)
            shutil.copy2("/etc/kubernetes/admin.conf", os.path.join(self.SYSROOT_DIR, "etc/kubernetes/"))
        except subprocess.CalledProcessError as e:
            LOG.error(f"Failed to mount required mount points: {e.stderr}")
            raise
        LOG.info("Mount points created successfully")

    def _prepare_data_migration(self):
        """Run prepare-data-migration with credentials taken from the environment.

        Re-raises subprocess.CalledProcessError on failure.
        """
        LOG.info("Preparing for data migration...")
        # OS_AUTH_URL, OS_USERNAME, OS_PASSWORD, OS_PROJECT_NAME, OS_USER_DOMAIN_NAME,
        # OS_PROJECT_DOMAIN_NAME, OS_REGION_NAME are in environment variables
        # A variable missing from the environment is rendered as the literal
        # string "None" in the corresponding option.
        # NOTE(review): the password is passed as a command-line argument and
        # may therefore be visible in the process list - confirm this is acceptable.
        try:
            script_path = os.path.join(self.SCRIPT_DIR, "prepare-data-migration")
            cmd = [script_path,
                   f"--rootdir={self.SYSROOT_DIR}",
                   f"--from_release={self._from_version}",
                   f"--to_release={self._to_version}",
                   f"--auth_url={os.environ.get('OS_AUTH_URL')}",
                   f"--username={os.environ.get('OS_USERNAME')}",
                   f"--password={os.environ.get('OS_PASSWORD')}",
                   f"--project_name={os.environ.get('OS_PROJECT_NAME')}",
                   f"--user_domain_name={os.environ.get('OS_USER_DOMAIN_NAME')}",
                   f"--project_domain_name={os.environ.get('OS_PROJECT_DOMAIN_NAME')}",
                   f"--region_name={os.environ.get('OS_REGION_NAME')}",
                   ]
            subprocess.run(cmd, check=True)
        except subprocess.CalledProcessError:
            LOG.error("Failed to extract data for migration")
            raise
        LOG.info("Data migration preparations complete")

    def _create_postgres_database(self):
        """Run create-postgres-database inside a chroot of SYSROOT_DIR.

        Re-raises subprocess.CalledProcessError on failure.
        """
        LOG.info("Creating temporary database...")
        try:
            script_path = "/usr/sbin/software-deploy/create-postgres-database"
            cmd = ["/usr/sbin/chroot", self.SYSROOT_DIR, script_path, self._postgres_port]
            subprocess.run(cmd, check=True)
        except subprocess.CalledProcessError:
            LOG.error("Failed to start 2nd instance of postgresql")
            raise
        LOG.info("Database creation complete")

    def _run_data_migration(self):
        """Run software-migrate inside a chroot of SYSROOT_DIR.

        Re-raises subprocess.CalledProcessError on failure.
        """
        LOG.info("Starting data migration...")
        try:
            script_path = "/usr/bin/software-migrate"
            cmd = ["/usr/sbin/chroot", self.SYSROOT_DIR, script_path,
                   self._from_version, self._to_version, self._postgres_port]
            subprocess.run(cmd, check=True)
        except subprocess.CalledProcessError:
            LOG.error("Failed to migrate data")
            raise
        LOG.info("Data migration completed")

    def _sync_controllers_feed(self):
        """Run sync-controllers-feed on the parent directory of the feed repo.

        Re-raises subprocess.CalledProcessError on failure.
        """
        LOG.info("Syncing feed between controllers...")
        try:
            script_path = os.path.join(self.SCRIPT_DIR, "sync-controllers-feed")
            cmd = [script_path, f"--feed={os.path.dirname(self._feed_ostree_repo_dir)}"]
            subprocess.run(cmd, check=True)
        except subprocess.CalledProcessError:
            LOG.error("Failed to sync feeds")
            raise
        LOG.info("Feed sync complete")

    def _remove_temporary_data(self):
        """Run remove-temporary-data on SYSROOT_DIR.

        Re-raises subprocess.CalledProcessError on failure.
        """
        LOG.info("Starting cleanup...")
        try:
            script_path = os.path.join(self.SCRIPT_DIR, "remove-temporary-data")
            cmd = [script_path, self.SYSROOT_DIR]
            subprocess.run(cmd, check=True)
        except subprocess.CalledProcessError:
            LOG.error("Failed cleaning up temporary data")
            raise
        LOG.info("Cleanup complete")

    def _take_lvm_snapshots(self):
        """Run "manage-lvm-snapshots --create" when the deployment enables it.

        The option is read from the "snapshot" key of the deployment data and
        must be exactly True. Re-raises subprocess.CalledProcessError on failure.
        """
        deployment = upgrade_utils.get_deployment_data()
        if deployment.get("snapshot") is True:
            LOG.info("LVM snapshot option enabled, proceeding to take snapshots...")
            script_path = os.path.join(self.SCRIPT_DIR, "manage-lvm-snapshots")
            cmd = [script_path, "--create"]
            try:
                subprocess.run(cmd, check=True, text=True, capture_output=True)
            except subprocess.CalledProcessError as e:
                LOG.error("Error taking LVM snapshots: %s", e.stderr)
                raise
        else:
            LOG.info("LVM snapshot option is not enabled, skipping...")

    def run(self):
        """Execute every deploy start step and report the resulting state.

        :returns: 0 when all steps succeed, 1 when any step raises.

        The temporary data cleanup always runs; an exception raised by the
        cleanup itself propagates to the caller.
        """
        try:
            self._check_directories()
            self._checkout_ostree_repo()
            self._take_lvm_snapshots()
            self._prepare_mount_points()
            self._prepare_data_migration()
            self._create_postgres_database()
            self._run_data_migration()
            self._sync_controllers_feed()
            self._update_deploy_state(self.DEPLOY_STATE_START_DONE)
        except Exception:
            # Not every failure is logged by the step that raised it (only
            # CalledProcessError is), so record the traceback here.
            LOG.exception("Deploy start failed")
            self._update_deploy_state(self.DEPLOY_STATE_START_FAILED)
            return 1
        finally:
            self._remove_temporary_data()
        return 0
if __name__ == "__main__":
    upgrade_utils.configure_logging("/var/log/software.log", log_level=logging.INFO)

    # Positional arguments, in order; only the sixth (commit_id) is optional.
    # Missing positions are padded with None, extra arguments are ignored.
    cli_values = sys.argv[1:7]
    cli_values += [None] * (6 - len(cli_values))
    (from_version, to_version, k8s_version,
     postgres_port, feed_ostree_repo_dir, commit_id) = cli_values

    # Raw environment value: a string when IGNORE_ERRORS is set, False otherwise.
    ignore_errors = os.environ.get("IGNORE_ERRORS", False)

    mandatory = (from_version, to_version, k8s_version, postgres_port, feed_ostree_repo_dir)
    if not all(value is not None for value in mandatory):
        usage_msg = (f"usage: {sys.argv[0]} <from_version> <to_version> <k8s_version> "
                     f"<postgresql_port> <feed_ostree_repo_dir> [commit_id]")
        print(usage_msg)
        LOG.info(usage_msg)
        sys.exit(1)

    deploy_start = DeployStart(from_version, to_version, k8s_version, postgres_port, feed_ostree_repo_dir,
                               commit_id=commit_id, ignore_errors=ignore_errors)
    sys.exit(deploy_start.run())