From d150aa287f6dc3630765b73d6d510e7b54ff31aa Mon Sep 17 00:00:00 2001 From: Dmitry Tantsur Date: Tue, 7 Apr 2020 16:59:12 +0200 Subject: [PATCH] Refactor AgentBase.heartbeat and process_next_step These methods are reaching the complexity limits, split them. Also update messages to be more human-friendly. The complexity limit is returned back to 18 as before hacking 3.0. Change-Id: I918af9508f3c99321625f6df96b1bf12ead26243 Story: #2006963 --- ironic/drivers/modules/agent_base.py | 270 ++++++++++-------- .../unit/drivers/modules/test_agent_base.py | 18 +- tox.ini | 2 +- 3 files changed, 158 insertions(+), 132 deletions(-) diff --git a/ironic/drivers/modules/agent_base.py b/ironic/drivers/modules/agent_base.py index 70ea04411b..eaf9b4e8aa 100644 --- a/ironic/drivers/modules/agent_base.py +++ b/ironic/drivers/modules/agent_base.py @@ -459,6 +459,85 @@ class HeartbeatMixin(object): 'maintenance mode; not taking any action.', {'node': node.uuid}) + def _heartbeat_deploy_wait(self, task): + msg = _('Failed checking if deploy is done') + node = task.node + try: + # NOTE(mgoddard): Only handle heartbeats during DEPLOYWAIT if we + # are currently in the core deploy.deploy step. Other deploy steps + # may cause the agent to boot, but we should not trigger deployment + # at that point if the driver is polling for completion of a step. + if self.in_core_deploy_step(task): + if not self.deploy_has_started(task): + msg = _('Node failed to deploy') + self.continue_deploy(task) + elif self.deploy_is_done(task): + msg = _('Node failed to move to active state') + self.reboot_to_instance(task) + else: + node.touch_provisioning() + else: + # The exceptions from RPC are not possible as we using cast + # here + # Check if the driver is polling for completion of a step, + # via the 'deployment_polling' flag. + polling = node.driver_internal_info.get( + 'deployment_polling', False) + if not polling: + manager_utils.notify_conductor_resume_deploy(task) + node.touch_provisioning() + except Exception as e: + last_error = _('%(msg)s. Error: %(exc)s') % {'msg': msg, 'exc': e} + LOG.exception('Asynchronous exception for node %(node)s: %(err)s', + {'node': task.node.uuid, 'err': last_error}) + # Do not call the error handler is the node is already DEPLOYFAIL + if node.provision_state in (states.DEPLOYING, states.DEPLOYWAIT): + deploy_utils.set_failed_state( + task, last_error, collect_logs=bool(self._client)) + + def _heartbeat_clean_wait(self, task): + node = task.node + msg = _('Failed checking if cleaning is done') + try: + node.touch_provisioning() + if not node.clean_step: + LOG.debug('Node %s just booted to start cleaning.', + node.uuid) + msg = _('Node failed to start the first cleaning step') + # First, cache the clean steps + self.refresh_clean_steps(task) + # Then set/verify node clean steps and start cleaning + conductor_steps.set_node_cleaning_steps(task) + # The exceptions from RPC are not possible as we using cast + # here + manager_utils.notify_conductor_resume_clean(task) + else: + msg = _('Node failed to check cleaning progress') + # Check if the driver is polling for completion of a step, + # via the 'cleaning_polling' flag. + polling = node.driver_internal_info.get( + 'cleaning_polling', False) + if not polling: + self.continue_cleaning(task) + except Exception as e: + last_error = _('%(msg)s. Error: %(exc)s') % {'msg': msg, 'exc': e} + LOG.exception('Asynchronous exception for node %(node)s: %(err)s', + {'node': task.node.uuid, 'err': last_error}) + if node.provision_state in (states.CLEANING, states.CLEANWAIT): + manager_utils.cleaning_error_handler(task, last_error) + + def _heartbeat_rescue_wait(self, task): + msg = _('Node failed to perform rescue operation') + try: + self._finalize_rescue(task) + except Exception as e: + last_error = _('%(msg)s. Error: %(exc)s') % {'msg': msg, 'exc': e} + LOG.exception('Asynchronous exception for node %(node)s: %(err)s', + {'node': task.node.uuid, 'err': last_error}) + if task.node.provision_state in (states.RESCUING, + states.RESCUEWAIT): + manager_utils.rescuing_error_handler(task, last_error) + @METRICS.timer('HeartbeatMixin.heartbeat') def heartbeat(self, task, callback_url, agent_version): """Process a heartbeat. @@ -508,71 +587,12 @@ class HeartbeatMixin(object): if node.maintenance: return self._heartbeat_in_maintenance(task) - # Async call backs don't set error state on their own - # TODO(jimrollenhagen) improve error messages here - msg = _('Failed checking if deploy is done.') - try: - # NOTE(mgoddard): Only handle heartbeats during DEPLOYWAIT if we - # are currently in the core deploy.deploy step. Other deploy steps - # may cause the agent to boot, but we should not trigger deployment - # at that point if the driver is polling for completion of a step. - if node.provision_state == states.DEPLOYWAIT: - if self.in_core_deploy_step(task): - if not self.deploy_has_started(task): - msg = _('Node failed to deploy.') - self.continue_deploy(task) - elif self.deploy_is_done(task): - msg = _('Node failed to move to active state.') - self.reboot_to_instance(task) - else: - node.touch_provisioning() - else: - # The exceptions from RPC are not possible as we using cast - # here - # Check if the driver is polling for completion of a step, - # via the 'deployment_polling' flag. - polling = node.driver_internal_info.get( - 'deployment_polling', False) - if not polling: - manager_utils.notify_conductor_resume_deploy(task) - node.touch_provisioning() - elif node.provision_state == states.CLEANWAIT: - node.touch_provisioning() - if not node.clean_step: - LOG.debug('Node %s just booted to start cleaning.', - node.uuid) - msg = _('Node failed to start the first cleaning step.') - # First, cache the clean steps - self.refresh_clean_steps(task) - # Then set/verify node clean steps and start cleaning - conductor_steps.set_node_cleaning_steps(task) - # The exceptions from RPC are not possible as we using cast - # here - manager_utils.notify_conductor_resume_clean(task) - else: - msg = _('Node failed to check cleaning progress.') - # Check if the driver is polling for completion of a step, - # via the 'cleaning_polling' flag. - polling = node.driver_internal_info.get( - 'cleaning_polling', False) - if not polling: - self.continue_cleaning(task) - elif (node.provision_state == states.RESCUEWAIT): - msg = _('Node failed to perform rescue operation.') - self._finalize_rescue(task) - except Exception as e: - err_info = {'msg': msg, 'e': e} - last_error = _('Asynchronous exception: %(msg)s ' - 'Exception: %(e)s for node') % err_info - errmsg = last_error + ' %(node)s' - LOG.exception(errmsg, {'node': node.uuid}) - if node.provision_state in (states.CLEANING, states.CLEANWAIT): - manager_utils.cleaning_error_handler(task, last_error) - elif node.provision_state in (states.DEPLOYING, states.DEPLOYWAIT): - deploy_utils.set_failed_state( - task, last_error, collect_logs=bool(self._client)) - elif node.provision_state in (states.RESCUING, states.RESCUEWAIT): - manager_utils.rescuing_error_handler(task, last_error) + if node.provision_state == states.DEPLOYWAIT: + self._heartbeat_deploy_wait(task) + elif node.provision_state == states.CLEANWAIT: + self._heartbeat_clean_wait(task) + elif node.provision_state == states.RESCUEWAIT: + self._heartbeat_rescue_wait(task) def _finalize_rescue(self, task): """Call ramdisk to prepare rescue mode and verify result. @@ -704,6 +724,67 @@ class AgentDeployMixin(HeartbeatMixin): """ return execute_step(task, step, 'clean') + def _process_version_mismatch(self, task, step_type): + node = task.node + # For manual clean, the target provision state is MANAGEABLE, whereas + # for automated cleaning, it is (the default) AVAILABLE. + manual_clean = node.target_provision_state == states.MANAGEABLE + + # Cache the new clean steps (and 'hardware_manager_version') + try: + self.refresh_steps(task, step_type) + except exception.NodeCleaningFailure as e: + msg = (_('Could not continue cleaning on node ' + '%(node)s: %(err)s.') % + {'node': node.uuid, 'err': e}) + LOG.exception(msg) + return manager_utils.cleaning_error_handler(task, msg) + except exception.InstanceDeployFailure as e: + msg = (_('Could not continue deployment on node ' + '%(node)s: %(err)s.') % + {'node': node.uuid, 'err': e}) + LOG.exception(msg) + return manager_utils.deploying_error_handler(task, msg) + + if manual_clean: + # Don't restart manual cleaning if agent reboots to a new + # version. Both are operator actions, unlike automated + # cleaning. Manual clean steps are not necessarily idempotent + # like automated clean steps and can be even longer running. + LOG.info('During manual cleaning, node %(node)s detected ' + 'a clean version mismatch. Re-executing and ' + 'continuing from current step %(step)s.', + {'node': node.uuid, 'step': node.clean_step}) + + driver_internal_info = node.driver_internal_info + driver_internal_info['skip_current_clean_step'] = False + node.driver_internal_info = driver_internal_info + node.save() + else: + # Restart the process, agent must have rebooted to new version + LOG.info('During %(type)s, node %(node)s detected a ' + '%(type)s version mismatch. Resetting %(type)s steps ' + 'and rebooting the node.', + {'type': step_type, 'node': node.uuid}) + try: + conductor_steps.set_node_cleaning_steps(task) + except exception.NodeCleaningFailure as e: + msg = (_('Could not restart automated cleaning on node ' + '%(node)s after step %(step)s: %(err)s.') % + {'node': node.uuid, 'err': e, + 'step': node.clean_step}) + LOG.exception(msg) + return manager_utils.cleaning_error_handler(task, msg) + except exception.InstanceDeployFailure as e: + msg = (_('Could not restart deployment on node ' + '%(node)s after step %(step)s: %(err)s.') % + {'node': node.uuid, 'err': e, + 'step': node.deploy_step}) + LOG.exception(msg) + return manager_utils.deploying_error_handler(task, msg) + + manager_utils.notify_conductor_resume_operation(task, step_type) + @METRICS.timer('AgentDeployMixin.process_next_step') def process_next_step(self, task, step_type, **kwargs): """Start the next clean/deploy step if the previous one is complete. @@ -724,9 +805,6 @@ class AgentDeployMixin(HeartbeatMixin): assert step_type in ('clean', 'deploy') node = task.node - # For manual clean, the target provision state is MANAGEABLE, whereas - # for automated cleaning, it is (the default) AVAILABLE. - manual_clean = node.target_provision_state == states.MANAGEABLE agent_commands = self._client.get_commands_status(task.node) if not agent_commands: @@ -770,61 +848,7 @@ class AgentDeployMixin(HeartbeatMixin): return manager_utils.cleaning_error_handler(task, msg) elif command.get('command_status') in ('CLEAN_VERSION_MISMATCH', 'DEPLOY_VERSION_MISMATCH'): - # Cache the new clean steps (and 'hardware_manager_version') - try: - self.refresh_steps(task, step_type) - except exception.NodeCleaningFailure as e: - msg = (_('Could not continue cleaning on node ' - '%(node)s: %(err)s.') % - {'node': node.uuid, 'err': e}) - LOG.exception(msg) - return manager_utils.cleaning_error_handler(task, msg) - except exception.InstanceDeployFailure as e: - msg = (_('Could not continue deployment on node ' - '%(node)s: %(err)s.') % - {'node': node.uuid, 'err': e}) - LOG.exception(msg) - return manager_utils.deploying_error_handler(task, msg) - - if manual_clean: - # Don't restart manual cleaning if agent reboots to a new - # version. Both are operator actions, unlike automated - # cleaning. Manual clean steps are not necessarily idempotent - # like automated clean steps and can be even longer running. - LOG.info('During manual cleaning, node %(node)s detected ' - 'a clean version mismatch. Re-executing and ' - 'continuing from current step %(step)s.', - {'node': node.uuid, 'step': node.clean_step}) - - driver_internal_info = node.driver_internal_info - driver_internal_info['skip_current_clean_step'] = False - node.driver_internal_info = driver_internal_info - node.save() - else: - # Restart the process, agent must have rebooted to new version - LOG.info('During %(type)s, node %(node)s detected a ' - '%(type)s version mismatch. Resetting %(type)s steps ' - 'and rebooting the node.', - {'type': step_type, 'node': node.uuid}) - try: - conductor_steps.set_node_cleaning_steps(task) - except exception.NodeCleaningFailure as e: - msg = (_('Could not restart automated cleaning on node ' - '%(node)s after step %(step)s: %(err)s.') % - {'node': node.uuid, 'err': e, - 'step': node.clean_step}) - LOG.exception(msg) - return manager_utils.cleaning_error_handler(task, msg) - except exception.InstanceDeployFailure as e: - msg = (_('Could not restart deployment on node ' - '%(node)s after step %(step)s: %(err)s.') % - {'node': node.uuid, 'err': e, - 'step': node.deploy_step}) - LOG.exception(msg) - return manager_utils.deploying_error_handler(task, msg) - - manager_utils.notify_conductor_resume_operation(task, step_type) - + self._process_version_mismatch(task, step_type) elif command.get('command_status') == 'SUCCEEDED': step_hook = _get_post_step_hook(node, step_type) if step_hook is not None: diff --git a/ironic/tests/unit/drivers/modules/test_agent_base.py b/ironic/tests/unit/drivers/modules/test_agent_base.py index 62b2e892f6..e5f4e7b8a7 100644 --- a/ironic/tests/unit/drivers/modules/test_agent_base.py +++ b/ironic/tests/unit/drivers/modules/test_agent_base.py @@ -384,9 +384,10 @@ class HeartbeatMixinTest(AgentDeployMixinBaseTest): failed_mock.assert_called_once_with( task, mock.ANY, collect_logs=True) log_mock.assert_called_once_with( - 'Asynchronous exception: Failed checking if deploy is done. ' - 'Exception: LlamaException for node %(node)s', - {'node': task.node.uuid}) + 'Asynchronous exception for node %(node)s: %(err)s', + {'err': 'Failed checking if deploy is done. ' + 'Error: LlamaException', + 'node': task.node.uuid}) @mock.patch.object(agent_base.HeartbeatMixin, 'in_core_deploy_step', autospec=True) @@ -420,9 +421,10 @@ class HeartbeatMixinTest(AgentDeployMixinBaseTest): # deploy_utils.set_failed_state anymore self.assertFalse(failed_mock.called) log_mock.assert_called_once_with( - 'Asynchronous exception: Failed checking if deploy is done. ' - 'Exception: LlamaException for node %(node)s', - {'node': task.node.uuid}) + 'Asynchronous exception for node %(node)s: %(err)s', + {'err': 'Failed checking if deploy is done. ' + 'Error: LlamaException', + 'node': task.node.uuid}) @mock.patch.object(objects.node.Node, 'touch_provisioning', autospec=True) @mock.patch.object(agent_base.HeartbeatMixin, @@ -574,8 +576,8 @@ class HeartbeatMixinTest(AgentDeployMixinBaseTest): mock_finalize.assert_called_once_with(mock.ANY, task) mock_rescue_err_handler.assert_called_once_with( - task, 'Asynchronous exception: Node failed to perform ' - 'rescue operation. Exception: some failure for node') + task, 'Node failed to perform ' + 'rescue operation. Error: some failure') @mock.patch.object(agent_base.HeartbeatMixin, 'in_core_deploy_step', autospec=True) diff --git a/tox.ini b/tox.ini index b5980e41df..62fac742e6 100644 --- a/tox.ini +++ b/tox.ini @@ -116,7 +116,7 @@ filename = *.py,app.wsgi exclude = .venv,.git,.tox,dist,doc,*lib/python*,*egg,build import-order-style = pep8 application-import-names = ironic -max-complexity=20 +max-complexity=18 # [H106] Don't put vim configuration in source files. # [H203] Use assertIs(Not)None to check for None. # [H204] Use assert(Not)Equal to check for equality.