Refactor AgentBase.heartbeat and process_next_step

These methods are reaching the complexity limits, split them.
Also update messages to be more human-friendly.

The complexity limit is returned back to 18 as before hacking 3.0.

Change-Id: I918af9508f3c99321625f6df96b1bf12ead26243
Story: #2006963
This commit is contained in:
Dmitry Tantsur 2020-04-07 16:59:12 +02:00
parent 45d9390187
commit d150aa287f
3 changed files with 158 additions and 132 deletions

View File

@ -459,6 +459,85 @@ class HeartbeatMixin(object):
'maintenance mode; not taking any action.', 'maintenance mode; not taking any action.',
{'node': node.uuid}) {'node': node.uuid})
def _heartbeat_deploy_wait(self, task):
msg = _('Failed checking if deploy is done')
node = task.node
try:
# NOTE(mgoddard): Only handle heartbeats during DEPLOYWAIT if we
# are currently in the core deploy.deploy step. Other deploy steps
# may cause the agent to boot, but we should not trigger deployment
# at that point if the driver is polling for completion of a step.
if self.in_core_deploy_step(task):
if not self.deploy_has_started(task):
msg = _('Node failed to deploy')
self.continue_deploy(task)
elif self.deploy_is_done(task):
msg = _('Node failed to move to active state')
self.reboot_to_instance(task)
else:
node.touch_provisioning()
else:
# The exceptions from RPC are not possible as we using cast
# here
# Check if the driver is polling for completion of a step,
# via the 'deployment_polling' flag.
polling = node.driver_internal_info.get(
'deployment_polling', False)
if not polling:
manager_utils.notify_conductor_resume_deploy(task)
node.touch_provisioning()
except Exception as e:
last_error = _('%(msg)s. Error: %(exc)s') % {'msg': msg, 'exc': e}
LOG.exception('Asynchronous exception for node %(node)s: %(err)s',
{'node': task.node.uuid, 'err': last_error})
# Do not call the error handler is the node is already DEPLOYFAIL
if node.provision_state in (states.DEPLOYING, states.DEPLOYWAIT):
deploy_utils.set_failed_state(
task, last_error, collect_logs=bool(self._client))
def _heartbeat_clean_wait(self, task):
node = task.node
msg = _('Failed checking if cleaning is done')
try:
node.touch_provisioning()
if not node.clean_step:
LOG.debug('Node %s just booted to start cleaning.',
node.uuid)
msg = _('Node failed to start the first cleaning step')
# First, cache the clean steps
self.refresh_clean_steps(task)
# Then set/verify node clean steps and start cleaning
conductor_steps.set_node_cleaning_steps(task)
# The exceptions from RPC are not possible as we using cast
# here
manager_utils.notify_conductor_resume_clean(task)
else:
msg = _('Node failed to check cleaning progress')
# Check if the driver is polling for completion of a step,
# via the 'cleaning_polling' flag.
polling = node.driver_internal_info.get(
'cleaning_polling', False)
if not polling:
self.continue_cleaning(task)
except Exception as e:
last_error = _('%(msg)s. Error: %(exc)s') % {'msg': msg, 'exc': e}
LOG.exception('Asynchronous exception for node %(node)s: %(err)s',
{'node': task.node.uuid, 'err': last_error})
if node.provision_state in (states.CLEANING, states.CLEANWAIT):
manager_utils.cleaning_error_handler(task, last_error)
def _heartbeat_rescue_wait(self, task):
msg = _('Node failed to perform rescue operation')
try:
self._finalize_rescue(task)
except Exception as e:
last_error = _('%(msg)s. Error: %(exc)s') % {'msg': msg, 'exc': e}
LOG.exception('Asynchronous exception for node %(node)s: %(err)s',
{'node': task.node.uuid, 'err': last_error})
if task.node.provision_state in (states.RESCUING,
states.RESCUEWAIT):
manager_utils.rescuing_error_handler(task, last_error)
@METRICS.timer('HeartbeatMixin.heartbeat') @METRICS.timer('HeartbeatMixin.heartbeat')
def heartbeat(self, task, callback_url, agent_version): def heartbeat(self, task, callback_url, agent_version):
"""Process a heartbeat. """Process a heartbeat.
@ -508,71 +587,12 @@ class HeartbeatMixin(object):
if node.maintenance: if node.maintenance:
return self._heartbeat_in_maintenance(task) return self._heartbeat_in_maintenance(task)
# Async call backs don't set error state on their own if node.provision_state == states.DEPLOYWAIT:
# TODO(jimrollenhagen) improve error messages here self._heartbeat_deploy_wait(task)
msg = _('Failed checking if deploy is done.') elif node.provision_state == states.CLEANWAIT:
try: self._heartbeat_clean_wait(task)
# NOTE(mgoddard): Only handle heartbeats during DEPLOYWAIT if we elif node.provision_state == states.RESCUEWAIT:
# are currently in the core deploy.deploy step. Other deploy steps self._heartbeat_rescue_wait(task)
# may cause the agent to boot, but we should not trigger deployment
# at that point if the driver is polling for completion of a step.
if node.provision_state == states.DEPLOYWAIT:
if self.in_core_deploy_step(task):
if not self.deploy_has_started(task):
msg = _('Node failed to deploy.')
self.continue_deploy(task)
elif self.deploy_is_done(task):
msg = _('Node failed to move to active state.')
self.reboot_to_instance(task)
else:
node.touch_provisioning()
else:
# The exceptions from RPC are not possible as we using cast
# here
# Check if the driver is polling for completion of a step,
# via the 'deployment_polling' flag.
polling = node.driver_internal_info.get(
'deployment_polling', False)
if not polling:
manager_utils.notify_conductor_resume_deploy(task)
node.touch_provisioning()
elif node.provision_state == states.CLEANWAIT:
node.touch_provisioning()
if not node.clean_step:
LOG.debug('Node %s just booted to start cleaning.',
node.uuid)
msg = _('Node failed to start the first cleaning step.')
# First, cache the clean steps
self.refresh_clean_steps(task)
# Then set/verify node clean steps and start cleaning
conductor_steps.set_node_cleaning_steps(task)
# The exceptions from RPC are not possible as we using cast
# here
manager_utils.notify_conductor_resume_clean(task)
else:
msg = _('Node failed to check cleaning progress.')
# Check if the driver is polling for completion of a step,
# via the 'cleaning_polling' flag.
polling = node.driver_internal_info.get(
'cleaning_polling', False)
if not polling:
self.continue_cleaning(task)
elif (node.provision_state == states.RESCUEWAIT):
msg = _('Node failed to perform rescue operation.')
self._finalize_rescue(task)
except Exception as e:
err_info = {'msg': msg, 'e': e}
last_error = _('Asynchronous exception: %(msg)s '
'Exception: %(e)s for node') % err_info
errmsg = last_error + ' %(node)s'
LOG.exception(errmsg, {'node': node.uuid})
if node.provision_state in (states.CLEANING, states.CLEANWAIT):
manager_utils.cleaning_error_handler(task, last_error)
elif node.provision_state in (states.DEPLOYING, states.DEPLOYWAIT):
deploy_utils.set_failed_state(
task, last_error, collect_logs=bool(self._client))
elif node.provision_state in (states.RESCUING, states.RESCUEWAIT):
manager_utils.rescuing_error_handler(task, last_error)
def _finalize_rescue(self, task): def _finalize_rescue(self, task):
"""Call ramdisk to prepare rescue mode and verify result. """Call ramdisk to prepare rescue mode and verify result.
@ -704,6 +724,67 @@ class AgentDeployMixin(HeartbeatMixin):
""" """
return execute_step(task, step, 'clean') return execute_step(task, step, 'clean')
def _process_version_mismatch(self, task, step_type):
node = task.node
# For manual clean, the target provision state is MANAGEABLE, whereas
# for automated cleaning, it is (the default) AVAILABLE.
manual_clean = node.target_provision_state == states.MANAGEABLE
# Cache the new clean steps (and 'hardware_manager_version')
try:
self.refresh_steps(task, step_type)
except exception.NodeCleaningFailure as e:
msg = (_('Could not continue cleaning on node '
'%(node)s: %(err)s.') %
{'node': node.uuid, 'err': e})
LOG.exception(msg)
return manager_utils.cleaning_error_handler(task, msg)
except exception.InstanceDeployFailure as e:
msg = (_('Could not continue deployment on node '
'%(node)s: %(err)s.') %
{'node': node.uuid, 'err': e})
LOG.exception(msg)
return manager_utils.deploying_error_handler(task, msg)
if manual_clean:
# Don't restart manual cleaning if agent reboots to a new
# version. Both are operator actions, unlike automated
# cleaning. Manual clean steps are not necessarily idempotent
# like automated clean steps and can be even longer running.
LOG.info('During manual cleaning, node %(node)s detected '
'a clean version mismatch. Re-executing and '
'continuing from current step %(step)s.',
{'node': node.uuid, 'step': node.clean_step})
driver_internal_info = node.driver_internal_info
driver_internal_info['skip_current_clean_step'] = False
node.driver_internal_info = driver_internal_info
node.save()
else:
# Restart the process, agent must have rebooted to new version
LOG.info('During %(type)s, node %(node)s detected a '
'%(type)s version mismatch. Resetting %(type)s steps '
'and rebooting the node.',
{'type': step_type, 'node': node.uuid})
try:
conductor_steps.set_node_cleaning_steps(task)
except exception.NodeCleaningFailure as e:
msg = (_('Could not restart automated cleaning on node '
'%(node)s after step %(step)s: %(err)s.') %
{'node': node.uuid, 'err': e,
'step': node.clean_step})
LOG.exception(msg)
return manager_utils.cleaning_error_handler(task, msg)
except exception.InstanceDeployFailure as e:
msg = (_('Could not restart deployment on node '
'%(node)s after step %(step)s: %(err)s.') %
{'node': node.uuid, 'err': e,
'step': node.deploy_step})
LOG.exception(msg)
return manager_utils.deploying_error_handler(task, msg)
manager_utils.notify_conductor_resume_operation(task, step_type)
@METRICS.timer('AgentDeployMixin.process_next_step') @METRICS.timer('AgentDeployMixin.process_next_step')
def process_next_step(self, task, step_type, **kwargs): def process_next_step(self, task, step_type, **kwargs):
"""Start the next clean/deploy step if the previous one is complete. """Start the next clean/deploy step if the previous one is complete.
@ -724,9 +805,6 @@ class AgentDeployMixin(HeartbeatMixin):
assert step_type in ('clean', 'deploy') assert step_type in ('clean', 'deploy')
node = task.node node = task.node
# For manual clean, the target provision state is MANAGEABLE, whereas
# for automated cleaning, it is (the default) AVAILABLE.
manual_clean = node.target_provision_state == states.MANAGEABLE
agent_commands = self._client.get_commands_status(task.node) agent_commands = self._client.get_commands_status(task.node)
if not agent_commands: if not agent_commands:
@ -770,61 +848,7 @@ class AgentDeployMixin(HeartbeatMixin):
return manager_utils.cleaning_error_handler(task, msg) return manager_utils.cleaning_error_handler(task, msg)
elif command.get('command_status') in ('CLEAN_VERSION_MISMATCH', elif command.get('command_status') in ('CLEAN_VERSION_MISMATCH',
'DEPLOY_VERSION_MISMATCH'): 'DEPLOY_VERSION_MISMATCH'):
# Cache the new clean steps (and 'hardware_manager_version') self._process_version_mismatch(task, step_type)
try:
self.refresh_steps(task, step_type)
except exception.NodeCleaningFailure as e:
msg = (_('Could not continue cleaning on node '
'%(node)s: %(err)s.') %
{'node': node.uuid, 'err': e})
LOG.exception(msg)
return manager_utils.cleaning_error_handler(task, msg)
except exception.InstanceDeployFailure as e:
msg = (_('Could not continue deployment on node '
'%(node)s: %(err)s.') %
{'node': node.uuid, 'err': e})
LOG.exception(msg)
return manager_utils.deploying_error_handler(task, msg)
if manual_clean:
# Don't restart manual cleaning if agent reboots to a new
# version. Both are operator actions, unlike automated
# cleaning. Manual clean steps are not necessarily idempotent
# like automated clean steps and can be even longer running.
LOG.info('During manual cleaning, node %(node)s detected '
'a clean version mismatch. Re-executing and '
'continuing from current step %(step)s.',
{'node': node.uuid, 'step': node.clean_step})
driver_internal_info = node.driver_internal_info
driver_internal_info['skip_current_clean_step'] = False
node.driver_internal_info = driver_internal_info
node.save()
else:
# Restart the process, agent must have rebooted to new version
LOG.info('During %(type)s, node %(node)s detected a '
'%(type)s version mismatch. Resetting %(type)s steps '
'and rebooting the node.',
{'type': step_type, 'node': node.uuid})
try:
conductor_steps.set_node_cleaning_steps(task)
except exception.NodeCleaningFailure as e:
msg = (_('Could not restart automated cleaning on node '
'%(node)s after step %(step)s: %(err)s.') %
{'node': node.uuid, 'err': e,
'step': node.clean_step})
LOG.exception(msg)
return manager_utils.cleaning_error_handler(task, msg)
except exception.InstanceDeployFailure as e:
msg = (_('Could not restart deployment on node '
'%(node)s after step %(step)s: %(err)s.') %
{'node': node.uuid, 'err': e,
'step': node.deploy_step})
LOG.exception(msg)
return manager_utils.deploying_error_handler(task, msg)
manager_utils.notify_conductor_resume_operation(task, step_type)
elif command.get('command_status') == 'SUCCEEDED': elif command.get('command_status') == 'SUCCEEDED':
step_hook = _get_post_step_hook(node, step_type) step_hook = _get_post_step_hook(node, step_type)
if step_hook is not None: if step_hook is not None:

View File

@ -384,9 +384,10 @@ class HeartbeatMixinTest(AgentDeployMixinBaseTest):
failed_mock.assert_called_once_with( failed_mock.assert_called_once_with(
task, mock.ANY, collect_logs=True) task, mock.ANY, collect_logs=True)
log_mock.assert_called_once_with( log_mock.assert_called_once_with(
'Asynchronous exception: Failed checking if deploy is done. ' 'Asynchronous exception for node %(node)s: %(err)s',
'Exception: LlamaException for node %(node)s', {'err': 'Failed checking if deploy is done. '
{'node': task.node.uuid}) 'Error: LlamaException',
'node': task.node.uuid})
@mock.patch.object(agent_base.HeartbeatMixin, @mock.patch.object(agent_base.HeartbeatMixin,
'in_core_deploy_step', autospec=True) 'in_core_deploy_step', autospec=True)
@ -420,9 +421,10 @@ class HeartbeatMixinTest(AgentDeployMixinBaseTest):
# deploy_utils.set_failed_state anymore # deploy_utils.set_failed_state anymore
self.assertFalse(failed_mock.called) self.assertFalse(failed_mock.called)
log_mock.assert_called_once_with( log_mock.assert_called_once_with(
'Asynchronous exception: Failed checking if deploy is done. ' 'Asynchronous exception for node %(node)s: %(err)s',
'Exception: LlamaException for node %(node)s', {'err': 'Failed checking if deploy is done. '
{'node': task.node.uuid}) 'Error: LlamaException',
'node': task.node.uuid})
@mock.patch.object(objects.node.Node, 'touch_provisioning', autospec=True) @mock.patch.object(objects.node.Node, 'touch_provisioning', autospec=True)
@mock.patch.object(agent_base.HeartbeatMixin, @mock.patch.object(agent_base.HeartbeatMixin,
@ -574,8 +576,8 @@ class HeartbeatMixinTest(AgentDeployMixinBaseTest):
mock_finalize.assert_called_once_with(mock.ANY, task) mock_finalize.assert_called_once_with(mock.ANY, task)
mock_rescue_err_handler.assert_called_once_with( mock_rescue_err_handler.assert_called_once_with(
task, 'Asynchronous exception: Node failed to perform ' task, 'Node failed to perform '
'rescue operation. Exception: some failure for node') 'rescue operation. Error: some failure')
@mock.patch.object(agent_base.HeartbeatMixin, @mock.patch.object(agent_base.HeartbeatMixin,
'in_core_deploy_step', autospec=True) 'in_core_deploy_step', autospec=True)

View File

@ -116,7 +116,7 @@ filename = *.py,app.wsgi
exclude = .venv,.git,.tox,dist,doc,*lib/python*,*egg,build exclude = .venv,.git,.tox,dist,doc,*lib/python*,*egg,build
import-order-style = pep8 import-order-style = pep8
application-import-names = ironic application-import-names = ironic
max-complexity=20 max-complexity=18
# [H106] Don't put vim configuration in source files. # [H106] Don't put vim configuration in source files.
# [H203] Use assertIs(Not)None to check for None. # [H203] Use assertIs(Not)None to check for None.
# [H204] Use assert(Not)Equal to check for equality. # [H204] Use assert(Not)Equal to check for equality.