Asynchronous out of band deploy steps fails to execute

Asynchronous out of band steps in a deploy template fails to
execute. This commit fixes that issue. Asynchronous steps can
set 'skip_current_deploy_step' flag to False in
'driver_internal_info' to make sure that upon reboot same step
is re-executed. Also it can set 'deployment_reboot' flag to True
in 'driver_internal_info' to signal that it has rebooted the node.

Co-Authored-By: Mark Goddard <mark@stackhpc.com>
Change-Id: If6217afb5453c311d5ca71ba37458a9b97c18395
Story: 2006342
Task: 36095
This commit is contained in:
Shivanand Tendulker 2019-08-01 08:20:00 -04:00
parent 71b7441b78
commit 8f907886a1
11 changed files with 213 additions and 10 deletions

View File

@ -875,8 +875,9 @@ class ConductorManager(base_manager.BaseConductorManager):
action=event, node=task.node.uuid,
state=task.node.provision_state)
def _get_node_next_deploy_steps(self, task):
return self._get_node_next_steps(task, 'deploy')
def _get_node_next_deploy_steps(self, task, skip_current_step=True):
return self._get_node_next_steps(task, 'deploy',
skip_current_step=skip_current_step)
@METRICS.timer('ConductorManager.continue_node_deploy')
def continue_node_deploy(self, context, node_id):
@ -914,7 +915,17 @@ class ConductorManager(base_manager.BaseConductorManager):
'state': node.provision_state,
'deploy_state': ', '.join(expected_states)})
next_step_index = self._get_node_next_deploy_steps(task)
info = node.driver_internal_info
try:
skip_current_step = info.pop('skip_current_deploy_step')
except KeyError:
skip_current_step = True
else:
node.driver_internal_info = info
node.save()
next_step_index = self._get_node_next_deploy_steps(
task, skip_current_step=skip_current_step)
# TODO(rloo): When deprecation period is over and node is in
# states.DEPLOYWAIT only, delete the 'if' and always 'resume'.
@ -3861,6 +3872,16 @@ def _do_next_deploy_step(task, step_index, conductor_id):
try:
result = interface.execute_deploy_step(task, step)
except exception.IronicException as e:
if isinstance(e, exception.AgentConnectionFailed):
if task.node.driver_internal_info.get('deployment_reboot'):
LOG.info('Agent is not yet running on node %(node)s after '
'deployment reboot, waiting for agent to come up '
'to run next deploy step %(step)s.',
{'node': node.uuid, 'step': step})
driver_internal_info['skip_current_deploy_step'] = False
node.driver_internal_info = driver_internal_info
task.process_event('wait')
return
log_msg = ('Node %(node)s failed deploy step %(step)s. Error: '
'%(err)s' %
{'node': node.uuid, 'step': node.deploy_step, 'err': e})
@ -3885,7 +3906,7 @@ def _do_next_deploy_step(task, step_index, conductor_id):
node.save()
# Check if the step is done or not. The step should return
# states.CLEANWAIT if the step is still being executed, or
# states.DEPLOYWAIT if the step is still being executed, or
# None if the step is done.
# NOTE(deva): Some drivers may return states.DEPLOYWAIT
# eg. if they are waiting for a callback
@ -3915,6 +3936,7 @@ def _do_next_deploy_step(task, step_index, conductor_id):
driver_internal_info = node.driver_internal_info
driver_internal_info['deploy_steps'] = None
driver_internal_info.pop('deploy_step_index', None)
driver_internal_info.pop('deployment_reboot', None)
node.driver_internal_info = driver_internal_info
node.save()

View File

@ -461,6 +461,8 @@ def deploying_error_handler(task, logmsg, errmsg, traceback=False,
node.deploy_step = {}
info = node.driver_internal_info
info.pop('deploy_step_index', None)
# Clear any leftover metadata about deployment reboots
info.pop('deployment_reboot', None)
node.driver_internal_info = info
if cleanup_err:

View File

@ -463,7 +463,14 @@ class AgentDeploy(AgentDeployMixin, base.DeployInterface):
task.process_event('wait')
self.continue_deploy(task)
elif task.driver.storage.should_write_image(task):
# Check if the driver has already performed a reboot in a previous
# deploy step.
if not task.node.driver_internal_info.get('deployment_reboot'):
manager_utils.node_power_action(task, states.REBOOT)
info = task.node.driver_internal_info
info.pop('deployment_reboot', None)
task.node.driver_internal_info = info
task.node.save()
return states.DEPLOYWAIT
else:
# TODO(TheJulia): At some point, we should de-dupe this code

View File

@ -366,6 +366,9 @@ class HeartbeatMixin(object):
else:
node.touch_provisioning()
else:
# The exceptions from RPC are not possible as we using cast
# here
manager_utils.notify_conductor_resume_deploy(task)
node.touch_provisioning()
elif node.provision_state == states.CLEANWAIT:
node.touch_provisioning()

View File

@ -421,7 +421,16 @@ class ISCSIDeploy(AgentDeployMixin, base.DeployInterface):
# Standard deploy process
deploy_utils.cache_instance_image(task.context, node)
check_image_size(task)
# Check if the driver has already performed a reboot in a previous
# deploy step.
if not task.node.driver_internal_info.get('deployment_reboot',
False):
manager_utils.node_power_action(task, states.REBOOT)
info = task.node.driver_internal_info
info.pop('deployment_reboot', None)
task.node.driver_internal_info = info
task.node.save()
return states.DEPLOYWAIT
else:
# Boot to an Storage Volume

View File

@ -1845,6 +1845,109 @@ class ContinueNodeDeployTestCase(mgr_utils.ServiceSetUpMixin,
mock.ANY, 1, mock.ANY)
self.assertFalse(mock_event.called)
@mock.patch('ironic.conductor.manager.ConductorManager._spawn_worker',
autospec=True)
def _continue_node_deploy_skip_step(self, mock_spawn, skip=True):
# test that skipping current step mechanism works
driver_info = {'deploy_steps': self.deploy_steps,
'deploy_step_index': 0}
if not skip:
driver_info['skip_current_deploy_step'] = skip
node = obj_utils.create_test_node(
self.context, driver='fake-hardware',
provision_state=states.DEPLOYWAIT,
target_provision_state=states.MANAGEABLE,
driver_internal_info=driver_info, deploy_step=self.deploy_steps[0])
self._start_service()
self.service.continue_node_deploy(self.context, node.uuid)
self._stop_service()
node.refresh()
if skip:
expected_step_index = 1
else:
self.assertNotIn(
'skip_current_deploy_step', node.driver_internal_info)
expected_step_index = 0
mock_spawn.assert_called_with(mock.ANY, manager._do_next_deploy_step,
mock.ANY, expected_step_index, mock.ANY)
def test_continue_node_deploy_skip_step(self):
self._continue_node_deploy_skip_step()
def test_continue_node_deploy_no_skip_step(self):
self._continue_node_deploy_skip_step(skip=False)
@mock.patch('ironic.drivers.modules.fake.FakeDeploy.execute_deploy_step',
autospec=True)
def test_do_next_deploy_step_oob_reboot(self, mock_execute):
# When a deploy step fails, go to DEPLOYWAIT
tgt_prov_state = states.ACTIVE
self._start_service()
node = obj_utils.create_test_node(
self.context, driver='fake-hardware',
provision_state=states.DEPLOYING,
target_provision_state=tgt_prov_state,
last_error=None,
driver_internal_info={'deploy_steps': self.deploy_steps,
'deploy_step_index': None,
'deployment_reboot': True},
clean_step={})
mock_execute.side_effect = exception.AgentConnectionFailed(
reason='failed')
with task_manager.acquire(
self.context, node.uuid, shared=False) as task:
manager._do_next_deploy_step(task, 0, mock.ANY)
self._stop_service()
node.refresh()
# Make sure we go to CLEANWAIT
self.assertEqual(states.DEPLOYWAIT, node.provision_state)
self.assertEqual(tgt_prov_state, node.target_provision_state)
self.assertEqual(self.deploy_steps[0], node.deploy_step)
self.assertEqual(0, node.driver_internal_info['deploy_step_index'])
self.assertFalse(node.driver_internal_info['skip_current_deploy_step'])
mock_execute.assert_called_once_with(
mock.ANY, mock.ANY, self.deploy_steps[0])
@mock.patch('ironic.drivers.modules.fake.FakeDeploy.execute_deploy_step',
autospec=True)
def test_do_next_clean_step_oob_reboot_fail(self,
mock_execute):
# When a deploy step fails with no reboot requested go to DEPLOYFAIL
tgt_prov_state = states.ACTIVE
self._start_service()
node = obj_utils.create_test_node(
self.context, driver='fake-hardware',
provision_state=states.DEPLOYING,
target_provision_state=tgt_prov_state,
last_error=None,
driver_internal_info={'deploy_steps': self.deploy_steps,
'deploy_step_index': None},
deploy_step={})
mock_execute.side_effect = exception.AgentConnectionFailed(
reason='failed')
with task_manager.acquire(
self.context, node.uuid, shared=False) as task:
manager._do_next_deploy_step(task, 0, mock.ANY)
self._stop_service()
node.refresh()
# Make sure we go to DEPLOYFAIL, clear deploy_steps
self.assertEqual(states.DEPLOYFAIL, node.provision_state)
self.assertEqual(tgt_prov_state, node.target_provision_state)
self.assertEqual({}, node.deploy_step)
self.assertNotIn('deploy_step_index', node.driver_internal_info)
self.assertNotIn('skip_current_deploy_step', node.driver_internal_info)
self.assertIsNotNone(node.last_error)
mock_execute.assert_called_once_with(
mock.ANY, mock.ANY, self.deploy_steps[0])
@mgr_utils.mock_record_keepalive
class DoNodeDeployTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
@ -2641,7 +2744,7 @@ class DoNextDeployStepTestCase(mgr_utils.ServiceSetUpMixin,
mock_execute.assert_called_once_with(mock.ANY, mock.ANY,
self.deploy_steps[0])
def test__get_node_next_deploy_steps(self):
def _test__get_node_next_deploy_steps(self, skip=True):
driver_internal_info = {'deploy_steps': self.deploy_steps,
'deploy_step_index': 0}
node = obj_utils.create_test_node(
@ -2653,8 +2756,16 @@ class DoNextDeployStepTestCase(mgr_utils.ServiceSetUpMixin,
deploy_step=self.deploy_steps[0])
with task_manager.acquire(self.context, node.uuid) as task:
step_index = self.service._get_node_next_deploy_steps(task)
self.assertEqual(1, step_index)
step_index = self.service._get_node_next_deploy_steps(
task, skip_current_step=skip)
expected_index = 1 if skip else 0
self.assertEqual(expected_index, step_index)
def test__get_node_next_deploy_steps(self):
self._test__get_node_next_deploy_steps()
def test__get_node_next_deploy_steps_no_skip(self):
self._test__get_node_next_deploy_steps(skip=False)
def test__get_node_next_deploy_steps_unset_deploy_step(self):
driver_internal_info = {'deploy_steps': self.deploy_steps,

View File

@ -917,6 +917,7 @@ class DeployingErrorHandlerTestCase(tests_base.TestCase):
self.assertEqual(self.errmsg, self.node.last_error)
self.assertEqual({}, self.node.deploy_step)
self.assertNotIn('deploy_step_index', self.node.driver_internal_info)
self.assertNotIn('deployment_reboot', self.node.driver_internal_info)
self.task.process_event.assert_called_once_with('fail')
def _test_deploying_error_handler_cleanup(self, exc, expected_str):

View File

@ -321,6 +321,23 @@ class TestAgentDeploy(db_base.DbTestCase):
power_mock.assert_called_once_with(task, states.REBOOT)
self.assertFalse(mock_pxe_instance.called)
@mock.patch.object(pxe.PXEBoot, 'prepare_instance', autospec=True)
@mock.patch('ironic.conductor.utils.node_power_action', autospec=True)
def test_deploy_with_deployment_reboot(self, power_mock,
mock_pxe_instance):
driver_internal_info = self.node.driver_internal_info
driver_internal_info['deployment_reboot'] = True
self.node.driver_internal_info = driver_internal_info
self.node.save()
with task_manager.acquire(
self.context, self.node['uuid'], shared=False) as task:
driver_return = self.driver.deploy(task)
self.assertEqual(driver_return, states.DEPLOYWAIT)
self.assertFalse(power_mock.called)
self.assertFalse(mock_pxe_instance.called)
self.assertNotIn(
'deployment_reboot', task.node.driver_internal_info)
@mock.patch.object(pxe.PXEBoot, 'prepare_instance', autospec=True)
@mock.patch.object(noop_storage.NoopStorage, 'should_write_image',
autospec=True)

View File

@ -138,6 +138,8 @@ class HeartbeatMixinTest(AgentDeployMixinBaseTest):
self.assertFalse(cd_mock.called)
rti_mock.assert_called_once_with(self.deploy, task)
@mock.patch.object(manager_utils,
'notify_conductor_resume_deploy', autospec=True)
@mock.patch.object(agent_base_vendor.HeartbeatMixin,
'in_core_deploy_step', autospec=True)
@mock.patch.object(agent_base_vendor.HeartbeatMixin,
@ -151,7 +153,8 @@ class HeartbeatMixinTest(AgentDeployMixinBaseTest):
def test_heartbeat_not_in_core_deploy_step(self, rti_mock, cd_mock,
deploy_is_done_mock,
deploy_started_mock,
in_deploy_mock):
in_deploy_mock,
in_resume_deploy_mock):
# Check that heartbeats do not trigger deployment actions when not in
# the deploy.deploy step.
in_deploy_mock.return_value = False
@ -170,6 +173,7 @@ class HeartbeatMixinTest(AgentDeployMixinBaseTest):
self.assertFalse(deploy_is_done_mock.called)
self.assertFalse(cd_mock.called)
self.assertFalse(rti_mock.called)
self.assertTrue(in_resume_deploy_mock.called)
@mock.patch.object(agent_base_vendor.HeartbeatMixin, 'continue_deploy',
autospec=True)

View File

@ -795,6 +795,27 @@ class ISCSIDeployTestCase(db_base.DbTestCase):
mock_check_image_size.assert_called_once_with(task)
mock_node_power_action.assert_called_once_with(task, states.REBOOT)
@mock.patch.object(manager_utils, 'node_power_action', autospec=True)
@mock.patch.object(iscsi_deploy, 'check_image_size', autospec=True)
@mock.patch.object(deploy_utils, 'cache_instance_image', autospec=True)
def test_deploy_with_deployment_reboot(self, mock_cache_instance_image,
mock_check_image_size,
mock_node_power_action):
driver_internal_info = self.node.driver_internal_info
driver_internal_info['deployment_reboot'] = True
self.node.driver_internal_info = driver_internal_info
self.node.save()
with task_manager.acquire(self.context,
self.node.uuid, shared=False) as task:
state = task.driver.deploy.deploy(task)
self.assertEqual(state, states.DEPLOYWAIT)
mock_cache_instance_image.assert_called_once_with(
self.context, task.node)
mock_check_image_size.assert_called_once_with(task)
self.assertFalse(mock_node_power_action.called)
self.assertNotIn(
'deployment_reboot', task.node.driver_internal_info)
@mock.patch.object(noop_storage.NoopStorage, 'should_write_image',
autospec=True)
@mock.patch.object(flat_network.FlatNetwork,

View File

@ -0,0 +1,6 @@
---
fixes:
- |
Fixes an issue wherein asynchronous out-of-band deploy steps in
deployment template fails to execute. See `story 2006342
<https://storyboard.openstack.org/#!/story/2006342>`__ for details.