diff --git a/distributedcloud/dccommon/subcloud_install.py b/distributedcloud/dccommon/subcloud_install.py index 5e1f438af..101d47d4e 100644 --- a/distributedcloud/dccommon/subcloud_install.py +++ b/distributedcloud/dccommon/subcloud_install.py @@ -31,6 +31,7 @@ from dccommon.drivers.openstack.sysinv_v1 import SysinvClient from dccommon import exceptions from dccommon import install_consts from dccommon import utils as common_utils +from dcmanager.common import consts as common_consts from dcmanager.common import utils LOG = logging.getLogger(__name__) @@ -618,7 +619,7 @@ class SubcloudInstall(object): # for cleanup on process restart/SWACT. common_utils.run_playbook(log_file, install_command) except exceptions.PlaybookExecutionFailed: - msg = ("Failed to install the subcloud %s, check individual " - "log at %s for detailed output." - % (self.name, log_file)) + msg = ("Failed to install %s, check individual " + "log at %s or run %s for details" + % (self.name, log_file, common_consts.ERROR_DESC_CMD)) raise Exception(msg) diff --git a/distributedcloud/dcmanager/common/consts.py b/distributedcloud/dcmanager/common/consts.py index fa2d958b2..b4fac362d 100644 --- a/distributedcloud/dcmanager/common/consts.py +++ b/distributedcloud/dcmanager/common/consts.py @@ -184,6 +184,7 @@ DEPLOY_STATE_DONE = 'complete' # Subcloud errors ERROR_DESC_EMPTY = 'No errors present' +ERROR_DESC_CMD = 'dcmanager subcloud errors ' # error_description max length ERROR_DESCRIPTION_LENGTH = 2048 diff --git a/distributedcloud/dcmanager/common/exceptions.py b/distributedcloud/dcmanager/common/exceptions.py index 710bd31c3..5b37d184e 100644 --- a/distributedcloud/dcmanager/common/exceptions.py +++ b/distributedcloud/dcmanager/common/exceptions.py @@ -178,7 +178,7 @@ class CertificateUploadError(DCManagerException): class LicenseInstallError(DCManagerException): - message = _("Error while installing license on subcloud: %(subcloud_id)s") + message = _("Error while installing license on subcloud: %(subcloud_id)s. %(error_message)s") class LicenseMissingError(DCManagerException): diff --git a/distributedcloud/dcmanager/orchestrator/states/upgrade/completing.py b/distributedcloud/dcmanager/orchestrator/states/upgrade/completing.py index 366040e56..665b701e7 100644 --- a/distributedcloud/dcmanager/orchestrator/states/upgrade/completing.py +++ b/distributedcloud/dcmanager/orchestrator/states/upgrade/completing.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2020-2021 Wind River Systems, Inc. +# Copyright (c) 2020-2022 Wind River Systems, Inc. # # SPDX-License-Identifier: Apache-2.0 # @@ -97,8 +97,15 @@ class CompletingUpgradeState(BaseState): # invoke the API 'upgrade-complete' # This is a partially blocking call that raises exception on failure. # We will re-attempt even if that failure is encountered - self._upgrade_complete(strategy_step) - + try: + message = self._upgrade_complete(strategy_step) + except Exception as e: + msg = ("Failed to complete upgrade. %s" % + str(e)) + db_api.subcloud_update( + self.context, strategy_step.subcloud_id, + error_description=msg[0:consts.ERROR_DESCRIPTION_LENGTH]) + raise # 'completion' deletes the upgrade. Need to loop until it is deleted counter = 0 while True: @@ -112,7 +119,12 @@ class CompletingUpgradeState(BaseState): break counter += 1 if counter >= self.max_queries: - raise Exception("Timeout waiting for completion to complete") + msg = ("Timeout waiting for completion to complete: %s:" % + message) + db_api.subcloud_update( + self.context, strategy_step.subcloud_id, + error_description=msg[0:consts.ERROR_DESCRIPTION_LENGTH]) + raise Exception(msg) time.sleep(self.sleep_duration) # When we return from this method without throwing an exception, the diff --git a/distributedcloud/dcmanager/orchestrator/states/upgrade/importing_load.py b/distributedcloud/dcmanager/orchestrator/states/upgrade/importing_load.py index 14c58e714..95132f940 100644 --- a/distributedcloud/dcmanager/orchestrator/states/upgrade/importing_load.py +++ b/distributedcloud/dcmanager/orchestrator/states/upgrade/importing_load.py @@ -11,6 +11,7 @@ from dcmanager.common import utils from dcmanager.common.exceptions import StrategyStoppedException from dcmanager.common.exceptions import VaultLoadMissingError +from dcmanager.db import api as db_api from dcmanager.orchestrator.states.base import BaseState from dcmanager.orchestrator.states.upgrade.cache.cache_specifications import \ REGION_ONE_SYSTEM_INFO_CACHE_TYPE @@ -155,11 +156,20 @@ class ImportingLoadState(BaseState): # Send only the required fields creation_keys = ['software_version', 'compatible_version', 'required_patches'] target_load = {key: target_load[key] for key in creation_keys} - load = self.get_sysinv_client( - strategy_step.subcloud.name).import_load_metadata(target_load) - self.info_log(strategy_step, - "Load: %s is now: %s" % ( - load.software_version, load.state)) + try: + load = self.get_sysinv_client( + strategy_step.subcloud.name).import_load_metadata(target_load) + self.info_log(strategy_step, + "Load: %s is now: %s" % ( + load.software_version, load.state)) + except Exception as e: + msg = ("Failed to import load metadata. %s" % + str(e)) + db_api.subcloud_update( + self.context, strategy_step.subcloud_id, + error_description=msg[0:consts.ERROR_DESCRIPTION_LENGTH]) + self.error_log(strategy_step, msg) + raise else: while True: # If event handler stop has been triggered, fail the state diff --git a/distributedcloud/dcmanager/orchestrator/states/upgrade/installing_license.py b/distributedcloud/dcmanager/orchestrator/states/upgrade/installing_license.py index ec98fe29c..eb4b60bd0 100644 --- a/distributedcloud/dcmanager/orchestrator/states/upgrade/installing_license.py +++ b/distributedcloud/dcmanager/orchestrator/states/upgrade/installing_license.py @@ -6,6 +6,7 @@ from dccommon import consts as dccommon_consts from dcmanager.common import consts from dcmanager.common import exceptions +from dcmanager.db import api as db_api from dcmanager.orchestrator.states.base import BaseState from dcmanager.orchestrator.states.upgrade.cache.cache_specifications import \ REGION_ONE_LICENSE_CACHE_TYPE @@ -51,8 +52,15 @@ class InstallingLicenseState(BaseState): return self.next_state else: # An unexpected error occurred querying the license + message = ('An unexpected error occurred querying the license %s. Detail: %s' % + (dccommon_consts.SYSTEM_CONTROLLER_NAME, + target_error)) + db_api.subcloud_update( + self.context, strategy_step.subcloud_id, + error_description=message[0:consts.ERROR_DESCRIPTION_LENGTH]) raise exceptions.LicenseInstallError( - subcloud_id=dccommon_consts.SYSTEM_CONTROLLER_NAME) + subcloud_id=dccommon_consts.SYSTEM_CONTROLLER_NAME, + error_message=target_error) # retrieve the keystone session for the subcloud and query its license subcloud_sysinv_client = \ @@ -76,8 +84,17 @@ class InstallingLicenseState(BaseState): install_rc = subcloud_sysinv_client.install_license(target_license) install_error = install_rc.get('error') if len(install_error) != 0: + # Save error response from sysinv into subcloud error description. + # Provide exception with sysinv error response to strategy_step details + message = ('Error installing license on subcloud %s. Detail: %s' % + (strategy_step.subcloud.name, + install_error)) + db_api.subcloud_update( + self.context, strategy_step.subcloud_id, + error_description=message[0:consts.ERROR_DESCRIPTION_LENGTH]) raise exceptions.LicenseInstallError( - subcloud_id=strategy_step.subcloud_id) + subcloud_id=strategy_step.subcloud_id, + error_message=install_error) # The license has been successfully installed. Move to the next stage self.info_log(strategy_step, "License installed.") diff --git a/distributedcloud/dcmanager/orchestrator/states/upgrade/migrating_data.py b/distributedcloud/dcmanager/orchestrator/states/upgrade/migrating_data.py index 251c8266d..e354cf16e 100644 --- a/distributedcloud/dcmanager/orchestrator/states/upgrade/migrating_data.py +++ b/distributedcloud/dcmanager/orchestrator/states/upgrade/migrating_data.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2020-2021 Wind River Systems, Inc. +# Copyright (c) 2020-2022 Wind River Systems, Inc. # # SPDX-License-Identifier: Apache-2.0 # @@ -10,6 +10,7 @@ from dccommon.exceptions import PlaybookExecutionFailed from dccommon.utils import run_playbook from dcmanager.common import consts from dcmanager.common.exceptions import StrategyStoppedException +from dcmanager.common import utils from dcmanager.db import api as db_api from dcmanager.orchestrator.states.base import BaseState @@ -32,16 +33,14 @@ DEFAULT_API_SLEEP = 60 DEFAULT_ANSIBLE_SLEEP = 180 -def migrate_subcloud_data(subcloud_name, migrate_command): - log_file = os.path.join(consts.DC_ANSIBLE_LOG_DIR, subcloud_name) + \ - '_playbook_output.log' +def migrate_subcloud_data(migrate_command, log_file): try: run_playbook(log_file, migrate_command) except PlaybookExecutionFailed: - msg = ("Failed to migrate data for subcloud %s, check individual " - "log at %s for detailed output." - % (subcloud_name, log_file)) - raise Exception(msg) + msg_orch = ("Failed to migrate data, check individual " + "log at %s or run %s for details" + % (log_file, consts.ERROR_DESC_CMD)) + raise Exception(msg_orch) class MigratingDataState(BaseState): @@ -142,7 +141,8 @@ class MigratingDataState(BaseState): ansible_subcloud_inventory_file = os.path.join( consts.ANSIBLE_OVERRIDES_PATH, strategy_step.subcloud.name + consts.INVENTORY_FILE_POSTFIX) - + log_file = os.path.join(consts.DC_ANSIBLE_LOG_DIR, subcloud.name) + \ + '_playbook_output.log' # Send skip_patching=true to prevent the playbook from applying any patches present in the # upgrade_data. All the required patches will be included in the generated install iso. data_migrating_cmd = [ @@ -152,12 +152,17 @@ class MigratingDataState(BaseState): % (consts.TEMP_SYSADMIN_PASSWORD, consts.TEMP_SYSADMIN_PASSWORD)] try: - migrate_subcloud_data(strategy_step.subcloud.name, - data_migrating_cmd) + migrate_subcloud_data(data_migrating_cmd, log_file) except Exception as e: + # Two error messages: one for subcloud error description and logs and + # one for orchestrator strategy_step detail (shorter than the previous). + msg_subcloud = utils.find_ansible_error_msg( + strategy_step.subcloud.name, log_file, consts.DEPLOY_STATE_MIGRATING_DATA) db_api.subcloud_update( self.context, strategy_step.subcloud_id, - deploy_status=consts.DEPLOY_STATE_DATA_MIGRATION_FAILED) + deploy_status=consts.DEPLOY_STATE_DATA_MIGRATION_FAILED, + error_description=msg_subcloud[0:consts.ERROR_DESCRIPTION_LENGTH]) + self.error_log(strategy_step, msg_subcloud) self.error_log(strategy_step, str(e)) raise diff --git a/distributedcloud/dcmanager/orchestrator/states/upgrade/pre_check.py b/distributedcloud/dcmanager/orchestrator/states/upgrade/pre_check.py index b777e8136..9c41c798c 100644 --- a/distributedcloud/dcmanager/orchestrator/states/upgrade/pre_check.py +++ b/distributedcloud/dcmanager/orchestrator/states/upgrade/pre_check.py @@ -90,6 +90,10 @@ class PreCheckState(BaseState): if (host.administrative == consts.ADMIN_LOCKED and upgrades): alarm_ignore_list.append(HOST_ADMINISTRATIVELY_LOCKED_ALARM) + # Clean old error messages + db_api.subcloud_update( + self.context, strategy_step.subcloud_id, + error_description=consts.ERROR_DESC_EMPTY) # The health conditions acceptable for upgrade are: # a) subcloud is completely healthy (i.e. no failed checks) # b) subcloud only fails alarm check and it only has non-management @@ -106,8 +110,14 @@ class PreCheckState(BaseState): # # These could be Kubernetes or other related failure(s) which has not been been # converted into an alarm condition. - details = "System health check failed. Please run 'system health-query' " \ - "command on the subcloud for more details." + error_desc_msg = ("System health check failed. \n %s" % + fails) + db_api.subcloud_update( + self.context, strategy_step.subcloud_id, + error_description=error_desc_msg[0:consts.ERROR_DESCRIPTION_LENGTH]) + details = ("System health check failed. Please run 'system health-query' " + "command on the subcloud or %s on central for details" + % (consts.ERROR_DESC_CMD)) self.error_log(strategy_step, "\n" + system_health) raise PreCheckFailedException( subcloud=strategy_step.subcloud.name, @@ -125,9 +135,16 @@ class PreCheckState(BaseState): for alarm in alarms: if alarm.alarm_id not in alarm_ignore_list: if alarm.mgmt_affecting == "True": - details = "System health check failed due to alarm %s. " \ - "Please run 'system health-query' " \ - "command on the subcloud for more details." % alarm.alarm_id + error_desc_msg = ("System health check failed due to alarm %s. " + "System health: \n %s" % + (alarm.alarm_id, system_health)) + db_api.subcloud_update( + self.context, strategy_step.subcloud_id, + error_description=error_desc_msg[0:consts.ERROR_DESCRIPTION_LENGTH]) + details = ("System health check failed due to alarm %s. " + "Please run 'system health-query' " + "command on the subcloud or %s on central for details." % + (alarm.alarm_id, consts.ERROR_DESC_CMD)) self.error_log(strategy_step, "\n" + system_health) raise PreCheckFailedException( subcloud=strategy_step.subcloud.name, @@ -135,9 +152,16 @@ class PreCheckState(BaseState): ) else: # Multiple failures - details = "System health check failed due to multiple failures. " \ - "Please run 'system health-query' command on the " \ - "subcloud for more details." + error_desc_msg = ("System health check failed due to multiple failures. " + "Health: \n %s" % + (system_health)) + db_api.subcloud_update( + self.context, strategy_step.subcloud_id, + error_description=error_desc_msg[0:consts.ERROR_DESCRIPTION_LENGTH]) + details = ("System health check failed due to multiple failures. " + "Please run 'system health-query' command on the " + "subcloud or %s on central for details." % + (consts.ERROR_DESC_CMD)) self.error_log(strategy_step, "\n" + system_health) raise PreCheckFailedException( subcloud=strategy_step.subcloud.name, diff --git a/distributedcloud/dcmanager/orchestrator/states/upgrade/upgrading_simplex.py b/distributedcloud/dcmanager/orchestrator/states/upgrade/upgrading_simplex.py index 11cb2eb17..9053cbe2b 100644 --- a/distributedcloud/dcmanager/orchestrator/states/upgrade/upgrading_simplex.py +++ b/distributedcloud/dcmanager/orchestrator/states/upgrade/upgrading_simplex.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2020-2021 Wind River Systems, Inc. +# Copyright (c) 2020-2022 Wind River Systems, Inc. # # SPDX-License-Identifier: Apache-2.0 # @@ -201,11 +201,12 @@ class UpgradingSimplexState(BaseState): if not subcloud.data_install: # Set the deploy status to pre-install-failed so it can be # handled accordingly in pre check step. + message = ("Failed to get upgrade data from install") db_api.subcloud_update( self.context, strategy_step.subcloud_id, - deploy_status=consts.DEPLOY_STATE_PRE_INSTALL_FAILED) + deploy_status=consts.DEPLOY_STATE_PRE_INSTALL_FAILED, + error_description=message) - message = ("Failed to get upgrade data from install") self.warn_log(strategy_step, message) raise Exception(message) @@ -337,6 +338,8 @@ class UpgradingSimplexState(BaseState): def perform_subcloud_install(self, strategy_step, session, install_values): + log_file = os.path.join(consts.DC_ANSIBLE_LOG_DIR, strategy_step.subcloud.name) + \ + '_playbook_output.log' db_api.subcloud_update( self.context, strategy_step.subcloud_id, deploy_status=consts.DEPLOY_STATE_PRE_INSTALL) @@ -350,7 +353,8 @@ class UpgradingSimplexState(BaseState): except Exception as e: db_api.subcloud_update( self.context, strategy_step.subcloud_id, - deploy_status=consts.DEPLOY_STATE_PRE_INSTALL_FAILED) + deploy_status=consts.DEPLOY_STATE_PRE_INSTALL_FAILED, + error_description=str(e)[0:consts.ERROR_DESCRIPTION_LENGTH]) self.error_log(strategy_step, str(e)) # TODO(jkung): cleanup to be implemented within SubcloudInstall install.cleanup() @@ -379,9 +383,15 @@ class UpgradingSimplexState(BaseState): try: install.install(consts.DC_ANSIBLE_LOG_DIR, install_command) except Exception as e: + # Detailed error message for subcloud error description field. + # Exception message for strategy_step detail. + msg = utils.find_ansible_error_msg( + strategy_step.subcloud.name, log_file, consts.DEPLOY_STATE_INSTALLING) db_api.subcloud_update( self.context, strategy_step.subcloud_id, - deploy_status=consts.DEPLOY_STATE_INSTALL_FAILED) + deploy_status=consts.DEPLOY_STATE_INSTALL_FAILED, + error_description=msg[0:consts.ERROR_DESCRIPTION_LENGTH]) + self.error_log(strategy_step, msg) self.error_log(strategy_step, str(e)) install.cleanup() raise