DCManager update for Redfish subcloud restore

In this commit, the dcmanager API and manager are updated to
support restoring a Redfish-capable subcloud from backup data.

Unit tests will be added in a separate commit.

Tests:
  - Restore request without install (not yet supported)
  - Restore request for a subcloud that is currently in 'managed'
    state
  - Load is not in dc-vault
  - Mandatory restore value (backup_filename) is not present
  - Specified backup file cannot be found
  - Subcloud restore with backup data on the target (i.e.
    backup tarball is under /opt/platform-backup)
  - Subcloud restore with backup data on the system controller
    (i.e. on_box_data = false)
  - Simulate install failure
  - Simulate check target failure
  - Batch subcloud restore
  - Subcloud restore retry
  - Subcloud restore with patches

Task: 41725
Story: 2008573
Depends-On: https://review.opendev.org/c/starlingx/ansible-playbooks/+/777046
Change-Id: I8134c535e39231837727811475b0f01b2ccddb63
Signed-off-by: Tee Ngo <tee.ngo@windriver.com>
Author: Tee Ngo <tee.ngo@windriver.com>
Date:   2021-02-23 00:09:57 -05:00
Commit: af14a6f65c (parent c446fb2f2f)
9 changed files with 384 additions and 29 deletions
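
The new restore API consumes a small restore_values YAML file. Based on the
parameter list added to the subclouds controller below, a minimal example of
what the controller sees after yaml.safe_load is sketched here; the backup
filename and all values are placeholders, not taken from this commit.

    # Illustrative restore_values content, shown as the dict produced by
    # yaml.safe_load in _get_restore_payload().
    restore_values = {
        'backup_filename': 'subcloud1_platform_backup.tgz',  # mandatory
        'initial_backup_dir': '/opt/platform-backup',        # optional, default shown
        'on_box_data': True,          # backup tarball resides on the subcloud itself
        'wipe_ceph_osds': False,      # preserve Ceph OSD data by default
        'ansible_remote_tmp': '/tmp',
    }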


@@ -13,7 +13,7 @@
# License for the specific language governing permissions and limitations
# under the License.
#
# Copyright (c) 2017-2020 Wind River Systems, Inc.
# Copyright (c) 2017-2021 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
@@ -63,6 +63,7 @@ LOCK_NAME = 'SubcloudsController'
BOOTSTRAP_VALUES = 'bootstrap_values'
INSTALL_VALUES = 'install_values'
RESTORE_VALUES = 'restore_values'
SUBCLOUD_ADD_MANDATORY_FILE = [
BOOTSTRAP_VALUES,
@@ -72,11 +73,28 @@ SUBCLOUD_RECONFIG_MANDATORY_FILE = [
consts.DEPLOY_CONFIG,
]
SUBCLOUD_RESTORE_MANDATORY_FILE = [
RESTORE_VALUES,
]
SUBCLOUD_ADD_GET_FILE_CONTENTS = [
BOOTSTRAP_VALUES,
INSTALL_VALUES,
]
# The following parameters can be provided by the user for
# remote subcloud restore
# - initial_backup_dir (default to /opt/platform-backup)
# - backup_filename (mandatory parameter)
# - ansible_ssh_pass (sysadmin_password)
# - ansible_become_pass (sysadmin_password)
# - on_box_data (default to true)
# - wipe_ceph_osds (default to false)
# - ansible_remote_tmp (default to /tmp)
MANDATORY_RESTORE_VALUES = [
'backup_filename',
]
class SubcloudsController(object):
VERSION_ALIASES = {
@@ -192,6 +210,30 @@ class SubcloudsController(object):
self._get_common_deploy_files(payload)
return payload
@staticmethod
def _get_restore_payload(request):
payload = dict()
for f in SUBCLOUD_RESTORE_MANDATORY_FILE:
if f not in request.POST:
pecan.abort(400, _("Missing required file for %s") % f)
multipart_data = decoder.MultipartDecoder(request.body,
pecan.request.headers.get('Content-Type'))
for f in SUBCLOUD_RESTORE_MANDATORY_FILE:
for part in multipart_data.parts:
header = part.headers.get('Content-Disposition')
if f in header:
file_item = request.POST[f]
file_item.file.seek(0, os.SEEK_SET)
data = yaml.safe_load(file_item.file.read().decode('utf8'))
payload.update({RESTORE_VALUES: data})
elif "sysadmin_password" in header:
payload.update({'sysadmin_password': part.content})
elif "with_install" in header:
payload.update({'with_install': part.content})
return payload
def _get_config_file_path(self, subcloud_name, config_file_type=None):
if config_file_type == consts.DEPLOY_CONFIG:
file_path = os.path.join(
@@ -432,14 +474,8 @@ class SubcloudsController(object):
if k not in install_values:
if k == 'image':
# check for the image at load vault load location
matching_iso, matching_sig = utils.get_vault_load_files(tsc.SW_VERSION)
if not os.path.isfile(matching_iso):
msg = ('Failed to get active load image. Provide '
'active load image via '
'"system --os-region-name SystemController '
'load-import --active"')
pecan.abort(400, _(msg))
matching_iso, matching_sig = \
SubcloudsController.verify_active_load_in_vault()
LOG.info("image was not in install_values: will reference %s" %
matching_iso)
else:
@@ -511,6 +547,15 @@ class SubcloudsController(object):
return True
@staticmethod
def _validate_restore_values(payload):
"""Validate the restore values to ensure parameters for remote restore are present"""
restore_values = payload.get(RESTORE_VALUES)
for p in MANDATORY_RESTORE_VALUES:
if p not in restore_values:
pecan.abort(400, _('Mandatory restore value %s not present') % p)
def _get_subcloud_users(self):
"""Get the subcloud users and passwords from keyring"""
DEFAULT_SERVICE_PROJECT_NAME = 'services'
@@ -618,6 +663,22 @@ class SubcloudsController(object):
data_install=data_install)
return subcloud
@staticmethod
def verify_active_load_in_vault():
try:
matching_iso, matching_sig = utils.get_vault_load_files(tsc.SW_VERSION)
if not matching_iso:
msg = _('Failed to get active load image. Provide '
'active load image via '
'"system --os-region-name SystemController '
'load-import --active"')
LOG.exception(msg)
pecan.abort(400, msg)
return matching_iso, matching_sig
except Exception as e:
LOG.exception(str(e))
pecan.abort(400, str(e))
@index.when(method='GET', template='json')
def get(self, subcloud_ref=None, detail=None):
"""Get details about subcloud.
@@ -997,6 +1058,86 @@ class SubcloudsController(object):
except Exception:
LOG.exception("Unable to reinstall subcloud %s" % subcloud.name)
pecan.abort(500, _('Unable to reinstall subcloud'))
elif verb == "restore":
payload = self._get_restore_payload(request)
if not payload:
pecan.abort(400, _('Body required'))
if subcloud.management_state != consts.MANAGEMENT_UNMANAGED:
pecan.abort(400, _('Subcloud can not be restored while it is still '
'in managed state. Please unmanage the subcloud '
'and try again.'))
elif subcloud.deploy_status in [consts.DEPLOY_STATE_INSTALLING,
consts.DEPLOY_STATE_BOOTSTRAPPING,
consts.DEPLOY_STATE_DEPLOYING]:
pecan.abort(400, _('This operation is not allowed while subcloud install, '
'bootstrap or deploy is in progress.'))
sysadmin_password = \
payload.get('sysadmin_password')
if not sysadmin_password:
pecan.abort(400, _('subcloud sysadmin_password required'))
try:
payload['sysadmin_password'] = base64.b64decode(
sysadmin_password).decode('utf-8')
except Exception:
msg = _('Failed to decode subcloud sysadmin_password, '
'verify the password is base64 encoded')
LOG.exception(msg)
pecan.abort(400, msg)
with_install = payload.get('with_install')
if with_install is not None:
if with_install == 'true' or with_install == 'True':
payload.update({'with_install': True})
elif with_install == 'false' or with_install == 'False':
payload.update({'with_install': False})
else:
pecan.abort(400, _('Invalid with_install value'))
self._validate_restore_values(payload)
if with_install:
# Request to remote install as part of subcloud restore. Confirm the
# subcloud install data in the db still contain the required parameters
# for remote install.
install_values = self._get_subcloud_db_install_values(subcloud)
payload.update({
'install_values': install_values,
})
# Confirm the active system controller load is still in dc-vault
SubcloudsController.verify_active_load_in_vault()
else:
# Not Redfish capable subcloud. The subcloud has been reinstalled
# and required patches have been applied.
#
# Pseudo code:
# - Retrieve install_values of the subcloud from the database.
# If it does not exist, try to retrieve the bootstrap address
# from its ansible inventory file (/opt/dc/ansible).
# - If the bootstrap address can be obtained, add install_values
# to the payload and continue.
# - If the bootstrap address cannot be obtained, abort with an
# error message advising the user to run "dcmanager subcloud
# update --bootstrap-address <bootstrap_address>" command
msg = _('This operation is not yet supported for subclouds without '
'remote install capability.')
LOG.exception(msg)
pecan.abort(400, msg)
try:
self.rpc_client.restore_subcloud(context, subcloud_id,
payload)
# Return deploy_status as pre-restore
subcloud.deploy_status = consts.DEPLOY_STATE_PRE_RESTORE
return db_api.subcloud_db_model_to_dict(subcloud)
except RemoteError as e:
pecan.abort(422, e.value)
except Exception:
LOG.exception("Unable to restore subcloud %s" % subcloud.name)
pecan.abort(500, _('Unable to restore subcloud'))
elif verb == 'update_status':
res = self.updatestatus(subcloud.name)
return res
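
For reference, a minimal client-side sketch of the multipart request that
_get_restore_payload() above expects. The route, port, HTTP method and token
handling are assumptions for illustration only; the part names
('restore_values', 'sysadmin_password', 'with_install'), the base64-encoded
password and the 'true'/'false' string for with_install match the parsing
above.

    import base64

    import requests
    import yaml

    # Mandatory key only; see the fuller restore_values example near the top.
    restore_values = {'backup_filename': 'subcloud1_platform_backup.tgz'}

    files = {
        # Part names must match what the MultipartDecoder looks for.
        'restore_values': ('restore_values.yml', yaml.safe_dump(restore_values)),
        'sysadmin_password': (None, base64.b64encode(b'sysadmin-password')),
        'with_install': (None, 'true'),
    }

    # Assumed route and port for the dcmanager API.
    resp = requests.patch(
        'http://dcmanager.example:8119/v1.0/subclouds/subcloud1/restore',
        headers={'X-Auth-Token': 'TOKEN'},
        files=files)
    resp.raise_for_status()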


@@ -162,6 +162,10 @@ DEPLOY_STATE_DEPLOY_FAILED = 'deploy-failed'
DEPLOY_STATE_MIGRATING_DATA = 'migrating-data'
DEPLOY_STATE_DATA_MIGRATION_FAILED = 'data-migration-failed'
DEPLOY_STATE_MIGRATED = 'migrated'
DEPLOY_STATE_PRE_RESTORE = 'pre-restore'
DEPLOY_STATE_RESTORE_PREP_FAILED = 'restore-prep-failed'
DEPLOY_STATE_RESTORING = 'restoring'
DEPLOY_STATE_RESTORE_FAILED = 'restore-failed'
DEPLOY_STATE_DONE = 'complete'
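
The new deploy states map onto the restore flow implemented in the subcloud
manager further down. A short sketch of the transitions; the final success
transition is truncated in this view, so 'complete' is an assumption:

    # Happy path, as driven by the API and the restore thread.
    RESTORE_FLOW = [
        'pre-restore',   # set when the restore request is accepted
        'restoring',     # set just before the restore playbook runs
        'complete',      # assumed final state on success
    ]

    # Failure states.
    RESTORE_FAILURES = {
        'prep error or check-target failure': 'restore-prep-failed',
        'restore playbook failure': 'restore-failed',
    }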


@@ -248,20 +248,22 @@ def get_vault_load_files(target_version):
matching_iso = None
matching_sig = None
for a_file in os.listdir(vault_dir):
if a_file.lower().endswith(".iso"):
matching_iso = os.path.join(vault_dir, a_file)
continue
elif a_file.lower().endswith(".sig"):
matching_sig = os.path.join(vault_dir, a_file)
continue
# If no .iso or .sig is found, raise an exception
if matching_iso is None:
raise exceptions.VaultLoadMissingError(
file_type='.iso', vault_dir=vault_dir)
if matching_sig is None:
raise exceptions.VaultLoadMissingError(
file_type='.sig', vault_dir=vault_dir)
if os.path.isdir(vault_dir):
for a_file in os.listdir(vault_dir):
if a_file.lower().endswith(".iso"):
matching_iso = os.path.join(vault_dir, a_file)
continue
elif a_file.lower().endswith(".sig"):
matching_sig = os.path.join(vault_dir, a_file)
continue
# If no .iso or .sig is found, raise an exception
if matching_iso is None:
raise exceptions.VaultLoadMissingError(
file_type='.iso', vault_dir=vault_dir)
if matching_sig is None:
raise exceptions.VaultLoadMissingError(
file_type='.sig', vault_dir=vault_dir)
# return the iso and sig for this load
return (matching_iso, matching_sig)
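
With this change get_vault_load_files() returns (None, None) when the vault
directory for the requested version does not exist, and still raises
VaultLoadMissingError when the directory exists but the .iso or .sig file is
missing. A minimal caller sketch, mirroring the checks added elsewhere in this
commit:

    from dcmanager.common import exceptions
    from dcmanager.common import utils
    from tsconfig.tsconfig import SW_VERSION

    try:
        matching_iso, matching_sig = utils.get_vault_load_files(SW_VERSION)
    except exceptions.VaultLoadMissingError:
        # The version directory exists in the dc-vault but the .iso or the
        # .sig file is missing from it.
        raise
    if not matching_iso:
        # No dc-vault directory for this software version: the active load
        # has not been imported with "load-import --active".
        raise RuntimeError("Active load is not available in the dc-vault")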


@@ -10,7 +10,7 @@
# License for the specific language governing permissions and limitations
# under the License.
#
# Copyright (c) 2017-2020 Wind River Systems, Inc.
# Copyright (c) 2017-2021 Wind River Systems, Inc.
#
# The right to copy, distribute, modify, or otherwise make use
# of this software may be licensed only pursuant to the terms
@@ -148,6 +148,14 @@ class DCManagerService(service.Service):
subcloud_id,
payload)
@request_context
def restore_subcloud(self, context, subcloud_id, payload):
# Restore a subcloud
LOG.info("Handling restore_subcloud request for: %s" % subcloud_id)
return self.subcloud_manager.restore_subcloud(context,
subcloud_id,
payload)
@request_context
def update_subcloud_endpoint_status(self, context, subcloud_name=None,
endpoint_type=None,


@@ -13,7 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Copyright (c) 2017-2020 Wind River Systems, Inc.
# Copyright (c) 2017-2021 Wind River Systems, Inc.
#
# The right to copy, distribute, modify, or otherwise make use
# of this software may be licensed only pursuant to the terms
@@ -71,6 +71,10 @@ ANSIBLE_SUBCLOUD_PLAYBOOK = \
'/usr/share/ansible/stx-ansible/playbooks/bootstrap.yml'
ANSIBLE_SUBCLOUD_INSTALL_PLAYBOOK = \
'/usr/share/ansible/stx-ansible/playbooks/install.yml'
ANSIBLE_SUBCLOUD_RESTORE_PLAYBOOK = \
'/usr/share/ansible/stx-ansible/playbooks/restore_platform.yml'
ANSIBLE_HOST_VALIDATION_PLAYBOOK = \
'/usr/share/ansible/stx-ansible/playbooks/validate_host.yml'
USERS_TO_REPLICATE = [
'sysinv',
@@ -219,6 +223,28 @@ class SubcloudManager(manager.Manager):
]
return deploy_command
def compose_check_target_command(self, subcloud_name,
ansible_subcloud_inventory_file, payload):
check_target_command = [
"ansible-playbook", ANSIBLE_HOST_VALIDATION_PLAYBOOK,
"-i", ansible_subcloud_inventory_file,
"--limit", subcloud_name,
"-e", "@%s" % consts.ANSIBLE_OVERRIDES_PATH + "/" +
subcloud_name + "_check_target_values.yml"]
return check_target_command
def compose_restore_command(self, subcloud_name,
ansible_subcloud_inventory_file, payload):
restore_command = [
"ansible-playbook", ANSIBLE_SUBCLOUD_RESTORE_PLAYBOOK,
"-i", ansible_subcloud_inventory_file,
"--limit", subcloud_name,
"-e", "@%s" % consts.ANSIBLE_OVERRIDES_PATH + "/" +
subcloud_name + "_restore_values.yml"]
return restore_command
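# Illustrative only: for a subcloud named "subcloud1", the two helpers above
# produce ansible-playbook invocations along these lines. The /opt/dc/ansible
# overrides directory and the "_inventory.yml" suffix are assumptions based on
# the inventory comments elsewhere in this commit.
#
#   check_target_command = [
#       "ansible-playbook",
#       "/usr/share/ansible/stx-ansible/playbooks/validate_host.yml",
#       "-i", "/opt/dc/ansible/subcloud1_inventory.yml",
#       "--limit", "subcloud1",
#       "-e", "@/opt/dc/ansible/subcloud1_check_target_values.yml"]
#
#   restore_command = [
#       "ansible-playbook",
#       "/usr/share/ansible/stx-ansible/playbooks/restore_platform.yml",
#       "-i", "/opt/dc/ansible/subcloud1_inventory.yml",
#       "--limit", "subcloud1",
#       "-e", "@/opt/dc/ansible/subcloud1_restore_values.yml"]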
def add_subcloud(self, context, payload):
"""Add subcloud and notify orchestrators.
@@ -544,10 +570,131 @@ class SubcloudManager(manager.Manager):
context, subcloud_id,
deploy_status=consts.DEPLOY_STATE_DEPLOY_PREP_FAILED)
def _create_check_target_override_file(self, payload, subcloud_name):
check_target_override_file = os.path.join(
consts.ANSIBLE_OVERRIDES_PATH, subcloud_name +
'_check_target_values.yml')
with open(check_target_override_file, 'w') as f_out:
f_out.write(
'---\n'
)
for k, v in payload['check_target_values'].items():
f_out.write("%s: %s\n" % (k, json.dumps(v)))
def _create_restore_override_file(self, payload, subcloud_name):
restore_override_file = os.path.join(
consts.ANSIBLE_OVERRIDES_PATH, subcloud_name +
'_restore_values.yml')
with open(restore_override_file, 'w') as f_out:
f_out.write(
'---\n'
)
for k, v in payload['restore_values'].items():
f_out.write("%s: %s\n" % (k, json.dumps(v)))
def _prepare_for_restore(self, payload, subcloud_name):
payload['check_target_values'] = dict()
payload['check_target_values']['ansible_ssh_pass'] = \
payload['sysadmin_password']
payload['check_target_values']['software_version'] = SW_VERSION
payload['check_target_values']['bootstrap_address'] = \
payload['bootstrap-address']
payload['check_target_values']['check_bootstrap_address'] = 'true'
payload['check_target_values']['check_patches'] = 'false'
self._create_check_target_override_file(payload, subcloud_name)
payload['restore_values']['ansible_ssh_pass'] = \
payload['sysadmin_password']
payload['restore_values']['ansible_become_pass'] = \
payload['sysadmin_password']
payload['restore_values']['admin_password'] = \
str(keyring.get_password('CGCS', 'admin'))
payload['restore_values']['skip_patches_restore'] = 'true'
self._create_restore_override_file(payload, subcloud_name)
def restore_subcloud(self, context, subcloud_id, payload):
"""Restore subcloud
:param context: request context object
:param subcloud_id: subcloud id from db
:param payload: subcloud restore detail
"""
# Retrieve the subcloud details from the database
subcloud = db_api.subcloud_get(context, subcloud_id)
if subcloud.management_state != consts.MANAGEMENT_UNMANAGED:
raise exceptions.SubcloudNotUnmanaged()
db_api.subcloud_update(context, subcloud_id,
deploy_status=consts.DEPLOY_STATE_PRE_RESTORE)
try:
# Ansible inventory filename for the specified subcloud
ansible_subcloud_inventory_file = self._get_ansible_filename(
subcloud.name, INVENTORY_FILE_POSTFIX)
# Add parameters used to generate inventory
payload['name'] = subcloud.name
payload['bootstrap-address'] = \
payload['install_values']['bootstrap_address']
payload['software_version'] = SW_VERSION
install_command = None
if payload['with_install']:
# Redfish capable subclouds
LOG.info("Reinstalling subcloud %s." % subcloud.name)
# Disregard the current 'image' config. Always reinstall with
# the system controller active image in dc-vault.
matching_iso, matching_sig = utils.get_vault_load_files(SW_VERSION)
payload['install_values'].update({'image': matching_iso})
payload['install_values']['ansible_ssh_pass'] = \
payload['sysadmin_password']
utils.create_subcloud_inventory(payload,
ansible_subcloud_inventory_file)
install_command = self.compose_install_command(
subcloud.name, ansible_subcloud_inventory_file)
else:
# Non Redfish capable subcloud
# Shouldn't get here as the API has already rejected the request.
return
# Prepare for restore
self._prepare_for_restore(payload, subcloud.name)
check_target_command = self.compose_check_target_command(
subcloud.name, ansible_subcloud_inventory_file, payload)
restore_command = self.compose_restore_command(
subcloud.name, ansible_subcloud_inventory_file, payload)
apply_thread = threading.Thread(
target=self.run_deploy,
args=(subcloud, payload, context,
install_command, None, None, check_target_command, restore_command))
apply_thread.start()
return db_api.subcloud_db_model_to_dict(subcloud)
except Exception:
LOG.exception("Failed to restore subcloud %s" % subcloud.name)
db_api.subcloud_update(
context, subcloud_id,
deploy_status=consts.DEPLOY_STATE_RESTORE_PREP_FAILED)
@staticmethod
def run_deploy(subcloud, payload, context,
install_command=None, apply_command=None,
deploy_command=None):
deploy_command=None, check_target_command=None,
restore_command=None):
log_file = os.path.join(consts.DC_ANSIBLE_LOG_DIR, subcloud.name) + \
'_playbook_output.log'
@@ -584,6 +731,27 @@ class SubcloudManager(manager.Manager):
install.cleanup()
LOG.info("Successfully installed subcloud %s" % subcloud.name)
# Leave the following block here in case there is another use
# case besides subcloud restore where validating host post
# fresh install is necessary.
if check_target_command:
try:
run_playbook(log_file, check_target_command)
except PlaybookExecutionFailed:
msg = "Failed to run the validate host playbook" \
" for subcloud %s, check individual log at " \
"%s for detailed output." % (
subcloud.name,
log_file)
LOG.error(msg)
if restore_command:
db_api.subcloud_update(
context, subcloud.id,
deploy_status=consts.DEPLOY_STATE_RESTORE_PREP_FAILED)
return
LOG.info("Successfully checked subcloud %s" % subcloud.name)
if apply_command:
try:
# Update the subcloud to bootstrapping
@@ -632,6 +800,27 @@ class SubcloudManager(manager.Manager):
return
LOG.info("Successfully deployed subcloud %s" %
subcloud.name)
elif restore_command:
db_api.subcloud_update(
context, subcloud.id,
deploy_status=consts.DEPLOY_STATE_RESTORING)
# Run the restore platform playbook
try:
run_playbook(log_file, restore_command)
except PlaybookExecutionFailed:
msg = "Failed to run the subcloud restore playbook" \
" for subcloud %s, check individual log at " \
"%s for detailed output." % (
subcloud.name,
log_file)
LOG.error(msg)
db_api.subcloud_update(
context, subcloud.id,
deploy_status=consts.DEPLOY_STATE_RESTORE_FAILED)
return
LOG.info("Successfully restored controller-0 of subcloud %s" %
subcloud.name)
db_api.subcloud_update(
context, subcloud.id,


@@ -110,6 +110,11 @@ class ImportingLoadState(BaseState):
else:
# ISO and SIG files are found in the vault under a version directory
iso_path, sig_path = utils.get_vault_load_files(target_version)
if not iso_path:
message = ("Failed to get upgrade load info for subcloud %s" %
strategy_step.subcloud.name)
raise Exception(message)
# Call the API. import_load blocks until the load state is 'importing'
new_load = self.subcloud_sysinv.import_load(iso_path, sig_path)
if new_load.software_version != target_version:


@@ -181,7 +181,7 @@ class UpgradingSimplexState(BaseState):
# The 'software_version' is the active running load on SystemController
matching_iso, _ = utils.get_vault_load_files(SW_VERSION)
if not os.path.isfile(matching_iso):
if not matching_iso:
message = ("Failed to get upgrade load info for subcloud %s" %
strategy_step.subcloud.name)
raise Exception(message)


@@ -10,7 +10,7 @@
# License for the specific language governing permissions and limitations
# under the License.
#
# Copyright (c) 2017-2020 Wind River Systems, Inc.
# Copyright (c) 2017-2021 Wind River Systems, Inc.
#
# The right to copy, distribute, modify, or otherwise make use
# of this software may be licensed only pursuant to the terms
@@ -105,6 +105,11 @@ class ManagerClient(RPCClient):
subcloud_id=subcloud_id,
payload=payload))
def restore_subcloud(self, ctxt, subcloud_id, payload):
return self.cast(ctxt, self.make_msg('restore_subcloud',
subcloud_id=subcloud_id,
payload=payload))
def update_subcloud_endpoint_status(self, ctxt, subcloud_name=None,
endpoint_type=None,
sync_status=consts.
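
restore_subcloud() here is dispatched with cast(), so the API layer does not
wait for the manager: it returns immediately with deploy_status set to
pre-restore and progress is tracked through that field. A small usage sketch;
the no-argument constructor is an assumption for illustration:

    from dcmanager.rpc import client as rpc_client

    # ctxt, subcloud_id and payload are prepared by the API controller (see
    # the restore handler earlier in this commit).
    mgr = rpc_client.ManagerClient()                  # assumed default constructor
    mgr.restore_subcloud(ctxt, subcloud_id, payload)  # cast(): returns immediately
    # Restore progress then shows up in the subcloud's deploy_status:
    # pre-restore -> restoring -> ... (or restore-prep-failed / restore-failed).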


@@ -462,7 +462,7 @@ class TestSubcloudPost(testroot.DCManagerApiTest,
"""Test POST operation with install values fails if data missing."""
# todo(abailey): add a new unit test with no image and no vault files
mock_vault_files.return_value = ('fake_iso', 'fake_sig')
mock_vault_files.return_value = (None, None)
params = self.get_post_params()
# add bmc_password to params
@@ -483,6 +483,7 @@ class TestSubcloudPost(testroot.DCManagerApiTest,
expect_errors=True)
self._verify_post_failure(response, key, None)
mock_vault_files.return_value = ('fake_iso', 'fake_sig')
# try with nothing removed and verify it works
self.install_data = copy.copy(self.FAKE_INSTALL_DATA)
upload_files = self.get_post_upload_files()
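
Since the unit tests land in a follow-up commit, here is a rough sketch of how
the new restore-values validation could be exercised; the test name, the mock
library usage and the patch target are assumptions, not part of this change:

    import mock

    from dcmanager.api.controllers.v1 import subclouds

    def test_restore_missing_backup_filename():
        # _validate_restore_values() should abort when the mandatory
        # backup_filename key is absent from restore_values.
        payload = {'restore_values': {'initial_backup_dir': '/opt/platform-backup'}}
        with mock.patch.object(subclouds.pecan, 'abort') as mock_abort:
            subclouds.SubcloudsController._validate_restore_values(payload)
            mock_abort.assert_called_once_with(400, mock.ANY)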