Asynchronous out of band deploy steps fails to execute
Asynchronous out of band steps in a deploy template fails to execute. This commit fixes that issue. Asynchronous steps can set 'skip_current_deploy_step' flag to False in 'driver_internal_info' to make sure that upon reboot same step is re-executed. Also it can set 'deployment_reboot' flag to True in 'driver_internal_info' to signal that it has rebooted the node. Co-Authored-By: Mark Goddard <mark@stackhpc.com> Change-Id: If6217afb5453c311d5ca71ba37458a9b97c18395 Story: 2006342 Task: 36095
This commit is contained in:
parent
71b7441b78
commit
8f907886a1
@ -875,8 +875,9 @@ class ConductorManager(base_manager.BaseConductorManager):
|
||||
action=event, node=task.node.uuid,
|
||||
state=task.node.provision_state)
|
||||
|
||||
def _get_node_next_deploy_steps(self, task):
|
||||
return self._get_node_next_steps(task, 'deploy')
|
||||
def _get_node_next_deploy_steps(self, task, skip_current_step=True):
|
||||
return self._get_node_next_steps(task, 'deploy',
|
||||
skip_current_step=skip_current_step)
|
||||
|
||||
@METRICS.timer('ConductorManager.continue_node_deploy')
|
||||
def continue_node_deploy(self, context, node_id):
|
||||
@ -914,7 +915,17 @@ class ConductorManager(base_manager.BaseConductorManager):
|
||||
'state': node.provision_state,
|
||||
'deploy_state': ', '.join(expected_states)})
|
||||
|
||||
next_step_index = self._get_node_next_deploy_steps(task)
|
||||
info = node.driver_internal_info
|
||||
try:
|
||||
skip_current_step = info.pop('skip_current_deploy_step')
|
||||
except KeyError:
|
||||
skip_current_step = True
|
||||
else:
|
||||
node.driver_internal_info = info
|
||||
node.save()
|
||||
|
||||
next_step_index = self._get_node_next_deploy_steps(
|
||||
task, skip_current_step=skip_current_step)
|
||||
|
||||
# TODO(rloo): When deprecation period is over and node is in
|
||||
# states.DEPLOYWAIT only, delete the 'if' and always 'resume'.
|
||||
@ -3861,6 +3872,16 @@ def _do_next_deploy_step(task, step_index, conductor_id):
|
||||
try:
|
||||
result = interface.execute_deploy_step(task, step)
|
||||
except exception.IronicException as e:
|
||||
if isinstance(e, exception.AgentConnectionFailed):
|
||||
if task.node.driver_internal_info.get('deployment_reboot'):
|
||||
LOG.info('Agent is not yet running on node %(node)s after '
|
||||
'deployment reboot, waiting for agent to come up '
|
||||
'to run next deploy step %(step)s.',
|
||||
{'node': node.uuid, 'step': step})
|
||||
driver_internal_info['skip_current_deploy_step'] = False
|
||||
node.driver_internal_info = driver_internal_info
|
||||
task.process_event('wait')
|
||||
return
|
||||
log_msg = ('Node %(node)s failed deploy step %(step)s. Error: '
|
||||
'%(err)s' %
|
||||
{'node': node.uuid, 'step': node.deploy_step, 'err': e})
|
||||
@ -3885,7 +3906,7 @@ def _do_next_deploy_step(task, step_index, conductor_id):
|
||||
node.save()
|
||||
|
||||
# Check if the step is done or not. The step should return
|
||||
# states.CLEANWAIT if the step is still being executed, or
|
||||
# states.DEPLOYWAIT if the step is still being executed, or
|
||||
# None if the step is done.
|
||||
# NOTE(deva): Some drivers may return states.DEPLOYWAIT
|
||||
# eg. if they are waiting for a callback
|
||||
@ -3915,6 +3936,7 @@ def _do_next_deploy_step(task, step_index, conductor_id):
|
||||
driver_internal_info = node.driver_internal_info
|
||||
driver_internal_info['deploy_steps'] = None
|
||||
driver_internal_info.pop('deploy_step_index', None)
|
||||
driver_internal_info.pop('deployment_reboot', None)
|
||||
node.driver_internal_info = driver_internal_info
|
||||
node.save()
|
||||
|
||||
|
@ -461,6 +461,8 @@ def deploying_error_handler(task, logmsg, errmsg, traceback=False,
|
||||
node.deploy_step = {}
|
||||
info = node.driver_internal_info
|
||||
info.pop('deploy_step_index', None)
|
||||
# Clear any leftover metadata about deployment reboots
|
||||
info.pop('deployment_reboot', None)
|
||||
node.driver_internal_info = info
|
||||
|
||||
if cleanup_err:
|
||||
|
@ -463,7 +463,14 @@ class AgentDeploy(AgentDeployMixin, base.DeployInterface):
|
||||
task.process_event('wait')
|
||||
self.continue_deploy(task)
|
||||
elif task.driver.storage.should_write_image(task):
|
||||
# Check if the driver has already performed a reboot in a previous
|
||||
# deploy step.
|
||||
if not task.node.driver_internal_info.get('deployment_reboot'):
|
||||
manager_utils.node_power_action(task, states.REBOOT)
|
||||
info = task.node.driver_internal_info
|
||||
info.pop('deployment_reboot', None)
|
||||
task.node.driver_internal_info = info
|
||||
task.node.save()
|
||||
return states.DEPLOYWAIT
|
||||
else:
|
||||
# TODO(TheJulia): At some point, we should de-dupe this code
|
||||
|
@ -366,6 +366,9 @@ class HeartbeatMixin(object):
|
||||
else:
|
||||
node.touch_provisioning()
|
||||
else:
|
||||
# The exceptions from RPC are not possible as we using cast
|
||||
# here
|
||||
manager_utils.notify_conductor_resume_deploy(task)
|
||||
node.touch_provisioning()
|
||||
elif node.provision_state == states.CLEANWAIT:
|
||||
node.touch_provisioning()
|
||||
|
@ -421,7 +421,16 @@ class ISCSIDeploy(AgentDeployMixin, base.DeployInterface):
|
||||
# Standard deploy process
|
||||
deploy_utils.cache_instance_image(task.context, node)
|
||||
check_image_size(task)
|
||||
# Check if the driver has already performed a reboot in a previous
|
||||
# deploy step.
|
||||
if not task.node.driver_internal_info.get('deployment_reboot',
|
||||
False):
|
||||
manager_utils.node_power_action(task, states.REBOOT)
|
||||
info = task.node.driver_internal_info
|
||||
info.pop('deployment_reboot', None)
|
||||
task.node.driver_internal_info = info
|
||||
task.node.save()
|
||||
|
||||
return states.DEPLOYWAIT
|
||||
else:
|
||||
# Boot to an Storage Volume
|
||||
|
@ -1845,6 +1845,109 @@ class ContinueNodeDeployTestCase(mgr_utils.ServiceSetUpMixin,
|
||||
mock.ANY, 1, mock.ANY)
|
||||
self.assertFalse(mock_event.called)
|
||||
|
||||
@mock.patch('ironic.conductor.manager.ConductorManager._spawn_worker',
|
||||
autospec=True)
|
||||
def _continue_node_deploy_skip_step(self, mock_spawn, skip=True):
|
||||
# test that skipping current step mechanism works
|
||||
driver_info = {'deploy_steps': self.deploy_steps,
|
||||
'deploy_step_index': 0}
|
||||
if not skip:
|
||||
driver_info['skip_current_deploy_step'] = skip
|
||||
node = obj_utils.create_test_node(
|
||||
self.context, driver='fake-hardware',
|
||||
provision_state=states.DEPLOYWAIT,
|
||||
target_provision_state=states.MANAGEABLE,
|
||||
driver_internal_info=driver_info, deploy_step=self.deploy_steps[0])
|
||||
self._start_service()
|
||||
self.service.continue_node_deploy(self.context, node.uuid)
|
||||
self._stop_service()
|
||||
node.refresh()
|
||||
if skip:
|
||||
expected_step_index = 1
|
||||
else:
|
||||
self.assertNotIn(
|
||||
'skip_current_deploy_step', node.driver_internal_info)
|
||||
expected_step_index = 0
|
||||
mock_spawn.assert_called_with(mock.ANY, manager._do_next_deploy_step,
|
||||
mock.ANY, expected_step_index, mock.ANY)
|
||||
|
||||
def test_continue_node_deploy_skip_step(self):
|
||||
self._continue_node_deploy_skip_step()
|
||||
|
||||
def test_continue_node_deploy_no_skip_step(self):
|
||||
self._continue_node_deploy_skip_step(skip=False)
|
||||
|
||||
@mock.patch('ironic.drivers.modules.fake.FakeDeploy.execute_deploy_step',
|
||||
autospec=True)
|
||||
def test_do_next_deploy_step_oob_reboot(self, mock_execute):
|
||||
# When a deploy step fails, go to DEPLOYWAIT
|
||||
tgt_prov_state = states.ACTIVE
|
||||
|
||||
self._start_service()
|
||||
node = obj_utils.create_test_node(
|
||||
self.context, driver='fake-hardware',
|
||||
provision_state=states.DEPLOYING,
|
||||
target_provision_state=tgt_prov_state,
|
||||
last_error=None,
|
||||
driver_internal_info={'deploy_steps': self.deploy_steps,
|
||||
'deploy_step_index': None,
|
||||
'deployment_reboot': True},
|
||||
clean_step={})
|
||||
mock_execute.side_effect = exception.AgentConnectionFailed(
|
||||
reason='failed')
|
||||
|
||||
with task_manager.acquire(
|
||||
self.context, node.uuid, shared=False) as task:
|
||||
manager._do_next_deploy_step(task, 0, mock.ANY)
|
||||
|
||||
self._stop_service()
|
||||
node.refresh()
|
||||
|
||||
# Make sure we go to CLEANWAIT
|
||||
self.assertEqual(states.DEPLOYWAIT, node.provision_state)
|
||||
self.assertEqual(tgt_prov_state, node.target_provision_state)
|
||||
self.assertEqual(self.deploy_steps[0], node.deploy_step)
|
||||
self.assertEqual(0, node.driver_internal_info['deploy_step_index'])
|
||||
self.assertFalse(node.driver_internal_info['skip_current_deploy_step'])
|
||||
mock_execute.assert_called_once_with(
|
||||
mock.ANY, mock.ANY, self.deploy_steps[0])
|
||||
|
||||
@mock.patch('ironic.drivers.modules.fake.FakeDeploy.execute_deploy_step',
|
||||
autospec=True)
|
||||
def test_do_next_clean_step_oob_reboot_fail(self,
|
||||
mock_execute):
|
||||
# When a deploy step fails with no reboot requested go to DEPLOYFAIL
|
||||
tgt_prov_state = states.ACTIVE
|
||||
|
||||
self._start_service()
|
||||
node = obj_utils.create_test_node(
|
||||
self.context, driver='fake-hardware',
|
||||
provision_state=states.DEPLOYING,
|
||||
target_provision_state=tgt_prov_state,
|
||||
last_error=None,
|
||||
driver_internal_info={'deploy_steps': self.deploy_steps,
|
||||
'deploy_step_index': None},
|
||||
deploy_step={})
|
||||
mock_execute.side_effect = exception.AgentConnectionFailed(
|
||||
reason='failed')
|
||||
|
||||
with task_manager.acquire(
|
||||
self.context, node.uuid, shared=False) as task:
|
||||
manager._do_next_deploy_step(task, 0, mock.ANY)
|
||||
|
||||
self._stop_service()
|
||||
node.refresh()
|
||||
|
||||
# Make sure we go to DEPLOYFAIL, clear deploy_steps
|
||||
self.assertEqual(states.DEPLOYFAIL, node.provision_state)
|
||||
self.assertEqual(tgt_prov_state, node.target_provision_state)
|
||||
self.assertEqual({}, node.deploy_step)
|
||||
self.assertNotIn('deploy_step_index', node.driver_internal_info)
|
||||
self.assertNotIn('skip_current_deploy_step', node.driver_internal_info)
|
||||
self.assertIsNotNone(node.last_error)
|
||||
mock_execute.assert_called_once_with(
|
||||
mock.ANY, mock.ANY, self.deploy_steps[0])
|
||||
|
||||
|
||||
@mgr_utils.mock_record_keepalive
|
||||
class DoNodeDeployTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
|
||||
@ -2641,7 +2744,7 @@ class DoNextDeployStepTestCase(mgr_utils.ServiceSetUpMixin,
|
||||
mock_execute.assert_called_once_with(mock.ANY, mock.ANY,
|
||||
self.deploy_steps[0])
|
||||
|
||||
def test__get_node_next_deploy_steps(self):
|
||||
def _test__get_node_next_deploy_steps(self, skip=True):
|
||||
driver_internal_info = {'deploy_steps': self.deploy_steps,
|
||||
'deploy_step_index': 0}
|
||||
node = obj_utils.create_test_node(
|
||||
@ -2653,8 +2756,16 @@ class DoNextDeployStepTestCase(mgr_utils.ServiceSetUpMixin,
|
||||
deploy_step=self.deploy_steps[0])
|
||||
|
||||
with task_manager.acquire(self.context, node.uuid) as task:
|
||||
step_index = self.service._get_node_next_deploy_steps(task)
|
||||
self.assertEqual(1, step_index)
|
||||
step_index = self.service._get_node_next_deploy_steps(
|
||||
task, skip_current_step=skip)
|
||||
expected_index = 1 if skip else 0
|
||||
self.assertEqual(expected_index, step_index)
|
||||
|
||||
def test__get_node_next_deploy_steps(self):
|
||||
self._test__get_node_next_deploy_steps()
|
||||
|
||||
def test__get_node_next_deploy_steps_no_skip(self):
|
||||
self._test__get_node_next_deploy_steps(skip=False)
|
||||
|
||||
def test__get_node_next_deploy_steps_unset_deploy_step(self):
|
||||
driver_internal_info = {'deploy_steps': self.deploy_steps,
|
||||
|
@ -917,6 +917,7 @@ class DeployingErrorHandlerTestCase(tests_base.TestCase):
|
||||
self.assertEqual(self.errmsg, self.node.last_error)
|
||||
self.assertEqual({}, self.node.deploy_step)
|
||||
self.assertNotIn('deploy_step_index', self.node.driver_internal_info)
|
||||
self.assertNotIn('deployment_reboot', self.node.driver_internal_info)
|
||||
self.task.process_event.assert_called_once_with('fail')
|
||||
|
||||
def _test_deploying_error_handler_cleanup(self, exc, expected_str):
|
||||
|
@ -321,6 +321,23 @@ class TestAgentDeploy(db_base.DbTestCase):
|
||||
power_mock.assert_called_once_with(task, states.REBOOT)
|
||||
self.assertFalse(mock_pxe_instance.called)
|
||||
|
||||
@mock.patch.object(pxe.PXEBoot, 'prepare_instance', autospec=True)
|
||||
@mock.patch('ironic.conductor.utils.node_power_action', autospec=True)
|
||||
def test_deploy_with_deployment_reboot(self, power_mock,
|
||||
mock_pxe_instance):
|
||||
driver_internal_info = self.node.driver_internal_info
|
||||
driver_internal_info['deployment_reboot'] = True
|
||||
self.node.driver_internal_info = driver_internal_info
|
||||
self.node.save()
|
||||
with task_manager.acquire(
|
||||
self.context, self.node['uuid'], shared=False) as task:
|
||||
driver_return = self.driver.deploy(task)
|
||||
self.assertEqual(driver_return, states.DEPLOYWAIT)
|
||||
self.assertFalse(power_mock.called)
|
||||
self.assertFalse(mock_pxe_instance.called)
|
||||
self.assertNotIn(
|
||||
'deployment_reboot', task.node.driver_internal_info)
|
||||
|
||||
@mock.patch.object(pxe.PXEBoot, 'prepare_instance', autospec=True)
|
||||
@mock.patch.object(noop_storage.NoopStorage, 'should_write_image',
|
||||
autospec=True)
|
||||
|
@ -138,6 +138,8 @@ class HeartbeatMixinTest(AgentDeployMixinBaseTest):
|
||||
self.assertFalse(cd_mock.called)
|
||||
rti_mock.assert_called_once_with(self.deploy, task)
|
||||
|
||||
@mock.patch.object(manager_utils,
|
||||
'notify_conductor_resume_deploy', autospec=True)
|
||||
@mock.patch.object(agent_base_vendor.HeartbeatMixin,
|
||||
'in_core_deploy_step', autospec=True)
|
||||
@mock.patch.object(agent_base_vendor.HeartbeatMixin,
|
||||
@ -151,7 +153,8 @@ class HeartbeatMixinTest(AgentDeployMixinBaseTest):
|
||||
def test_heartbeat_not_in_core_deploy_step(self, rti_mock, cd_mock,
|
||||
deploy_is_done_mock,
|
||||
deploy_started_mock,
|
||||
in_deploy_mock):
|
||||
in_deploy_mock,
|
||||
in_resume_deploy_mock):
|
||||
# Check that heartbeats do not trigger deployment actions when not in
|
||||
# the deploy.deploy step.
|
||||
in_deploy_mock.return_value = False
|
||||
@ -170,6 +173,7 @@ class HeartbeatMixinTest(AgentDeployMixinBaseTest):
|
||||
self.assertFalse(deploy_is_done_mock.called)
|
||||
self.assertFalse(cd_mock.called)
|
||||
self.assertFalse(rti_mock.called)
|
||||
self.assertTrue(in_resume_deploy_mock.called)
|
||||
|
||||
@mock.patch.object(agent_base_vendor.HeartbeatMixin, 'continue_deploy',
|
||||
autospec=True)
|
||||
|
@ -795,6 +795,27 @@ class ISCSIDeployTestCase(db_base.DbTestCase):
|
||||
mock_check_image_size.assert_called_once_with(task)
|
||||
mock_node_power_action.assert_called_once_with(task, states.REBOOT)
|
||||
|
||||
@mock.patch.object(manager_utils, 'node_power_action', autospec=True)
|
||||
@mock.patch.object(iscsi_deploy, 'check_image_size', autospec=True)
|
||||
@mock.patch.object(deploy_utils, 'cache_instance_image', autospec=True)
|
||||
def test_deploy_with_deployment_reboot(self, mock_cache_instance_image,
|
||||
mock_check_image_size,
|
||||
mock_node_power_action):
|
||||
driver_internal_info = self.node.driver_internal_info
|
||||
driver_internal_info['deployment_reboot'] = True
|
||||
self.node.driver_internal_info = driver_internal_info
|
||||
self.node.save()
|
||||
with task_manager.acquire(self.context,
|
||||
self.node.uuid, shared=False) as task:
|
||||
state = task.driver.deploy.deploy(task)
|
||||
self.assertEqual(state, states.DEPLOYWAIT)
|
||||
mock_cache_instance_image.assert_called_once_with(
|
||||
self.context, task.node)
|
||||
mock_check_image_size.assert_called_once_with(task)
|
||||
self.assertFalse(mock_node_power_action.called)
|
||||
self.assertNotIn(
|
||||
'deployment_reboot', task.node.driver_internal_info)
|
||||
|
||||
@mock.patch.object(noop_storage.NoopStorage, 'should_write_image',
|
||||
autospec=True)
|
||||
@mock.patch.object(flat_network.FlatNetwork,
|
||||
|
@ -0,0 +1,6 @@
|
||||
---
|
||||
fixes:
|
||||
- |
|
||||
Fixes an issue wherein asynchronous out-of-band deploy steps in
|
||||
deployment template fails to execute. See `story 2006342
|
||||
<https://storyboard.openstack.org/#!/story/2006342>`__ for details.
|
Loading…
Reference in New Issue
Block a user