Avoid RPC notify_conductor_resume_{deploy,clean} in agent_base

Currently we use an RPC call to the conductor itself to proceed to the next clean or deploy step. This is unnecessary and requires temporary lifting the lock, potentially causing race conditions. This change makes the agent code use continue_node_{deploy,clean} directly. The drivers still need updating, it will be done later. Story: #2008167 Task: #40922 Change-Id: If4763d542029b9021432425532f24a0228f04c25
2020-10-06 17:44:07 +02:00 · 2020-10-06 17:44:07 +02:00 · 18d016f796
commit 18d016f796
parent e51c6b930e
5 changed files with 386 additions and 355 deletions
--- a/ironic/conductor/manager.py
+++ b/ironic/conductor/manager.py
@ -1206,10 +1206,6 @@ class ConductorManager(base_manager.BaseConductorManager):
        with task_manager.acquire(context, node_id, shared=False, patient=True,
                                  purpose='continue node cleaning') as task:
            node = task.node
-            if node.target_provision_state == states.MANAGEABLE:
-                target_state = states.MANAGEABLE
-            else:
-                target_state = None

            if node.provision_state != states.CLEANWAIT:
                raise exception.InvalidStateRequested(_(
@ -1219,7 +1215,7 @@ class ConductorManager(base_manager.BaseConductorManager):
                     'state': node.provision_state,
                     'clean_state': states.CLEANWAIT})

-            task.process_event('resume', target_state=target_state)
+            task.resume_cleaning()

            task.set_spawn_error_hook(utils.spawn_cleaning_error_handler,
                                      task.node)
--- a/ironic/conductor/task_manager.py
+++ b/ironic/conductor/task_manager.py
@ -604,6 +604,14 @@ class TaskManager(object):
            # emitted at __exit__().
            self._saved_node = self.node

+    def resume_cleaning(self):
+        """A helper to resume cleaning with the right target state."""
+        if self.node.target_provision_state == states.MANAGEABLE:
+            target_state = states.MANAGEABLE
+        else:
+            target_state = None
+        self.process_event('resume', target_state=target_state)
+
    def __enter__(self):
        return self

--- a/ironic/drivers/modules/agent_base.py
+++ b/ironic/drivers/modules/agent_base.py
@ -31,6 +31,8 @@ from ironic.common.i18n import _
 from ironic.common import image_service
 from ironic.common import states
 from ironic.common import utils
+from ironic.conductor import cleaning
+from ironic.conductor import deployments
 from ironic.conductor import steps as conductor_steps
 from ironic.conductor import task_manager
 from ironic.conductor import utils as manager_utils
@ -400,6 +402,15 @@ def _step_failure_handler(task, msg, step_type, traceback=False):
        manager_utils.deploying_error_handler(task, msg, traceback=traceback)


+def _continue_steps(task, step_type):
+    if step_type == 'clean':
+        task.resume_cleaning()
+        cleaning.continue_node_clean(task)
+    else:
+        task.process_event('resume')
+        deployments.continue_node_deploy(task)
+
+
 class HeartbeatMixin(object):
    """Mixin class implementing heartbeat processing."""

@ -510,13 +521,12 @@ class HeartbeatMixin(object):
                LOG.debug('Node %s just booted to start cleaning.',
                          node.uuid)
                msg = _('Node failed to start the first cleaning step')
+                task.resume_cleaning()
                # First, cache the clean steps
                self.refresh_clean_steps(task)
                # Then set/verify node clean steps and start cleaning
                conductor_steps.set_node_cleaning_steps(task)
-                # The exceptions from RPC are not possible as we using cast
-                # here
-                manager_utils.notify_conductor_resume_clean(task)
+                cleaning.continue_node_clean(task)
            else:
                msg = _('Node failed to check cleaning progress')
                # Check if the driver is polling for completion of a step,
@ -910,7 +920,7 @@ class AgentBaseMixin(object):
                return manager_utils.deploying_error_handler(task, msg,
                                                             traceback=True)

-        manager_utils.notify_conductor_resume_operation(task, step_type)
+        _continue_steps(task, step_type)

    @METRICS.timer('AgentBaseMixin.process_next_step')
    def process_next_step(self, task, step_type, **kwargs):
@ -940,8 +950,7 @@ class AgentBaseMixin(object):
                     else 'deployment_reboot')
            utils.pop_node_nested_field(node, 'driver_internal_info', field)
            node.save()
-            manager_utils.notify_conductor_resume_operation(task, step_type)
-            return
+            return _continue_steps(task, step_type)

        current_step = (node.clean_step if step_type == 'clean'
                        else node.deploy_step)
@ -1000,7 +1009,7 @@ class AgentBaseMixin(object):
            LOG.info('Agent on node %(node)s returned %(type)s command '
                     'success, moving to next step',
                     {'node': node.uuid, 'type': step_type})
-            manager_utils.notify_conductor_resume_operation(task, step_type)
+            _continue_steps(task, step_type)
        else:
            msg = (_('Agent returned unknown status for %(type)s step %(step)s'
                     ' on node %(node)s : %(err)s.') %
@ -1202,24 +1211,6 @@ class AgentDeployMixin(HeartbeatMixin, AgentOobStepsMixin):
                    'error': e})
            log_and_raise_deployment_error(task, msg, exc=e)

-    # TODO(dtantsur): remove in W
-    @METRICS.timer('AgentDeployMixin.reboot_and_finish_deploy')
-    def reboot_and_finish_deploy(self, task):
-        """Helper method to trigger reboot on the node and finish deploy.
-
-        This method initiates a reboot on the node. On success, it
-        marks the deploy as complete. On failure, it logs the error
-        and marks deploy as failure.
-
-        :param task: a TaskManager object containing the node
-        :raises: InstanceDeployFailure, if node reboot failed.
-        """
-        # NOTE(dtantsur): do nothing here, the new deploy steps tear_down_agent
-        # and boot_instance will be picked up and finish the deploy (even for
-        # legacy deploy interfaces without decomposed steps).
-        task.process_event('wait')
-        manager_utils.notify_conductor_resume_deploy(task)
-
    @METRICS.timer('AgentDeployMixin.prepare_instance_to_boot')
    def prepare_instance_to_boot(self, task, root_uuid, efi_sys_uuid,
                                 prep_boot_part_uuid=None):
--- a/ironic/tests/unit/drivers/modules/test_agent_base.py
+++ b/ironic/tests/unit/drivers/modules/test_agent_base.py
@ -24,6 +24,7 @@ from ironic.common import boot_devices
 from ironic.common import exception
 from ironic.common import image_service
 from ironic.common import states
+from ironic.conductor import cleaning
 from ironic.conductor import steps as conductor_steps
 from ironic.conductor import task_manager
 from ironic.conductor import utils as manager_utils
@ -315,9 +316,8 @@ class HeartbeatMixinTest(AgentDeployMixinBaseTest):
                       'refresh_steps', autospec=True)
    @mock.patch.object(conductor_steps, 'set_node_cleaning_steps',
                       autospec=True)
-    @mock.patch.object(manager_utils, 'notify_conductor_resume_operation',
-                       autospec=True)
-    def test_heartbeat_resume_clean(self, mock_notify, mock_set_steps,
+    @mock.patch.object(cleaning, 'continue_node_clean', autospec=True)
+    def test_heartbeat_resume_clean(self, mock_clean, mock_set_steps,
                                    mock_refresh, mock_touch):
        self.node.clean_step = {}
        self.node.provision_state = states.CLEANWAIT
@ -328,7 +328,7 @@ class HeartbeatMixinTest(AgentDeployMixinBaseTest):

        mock_touch.assert_called_once_with(mock.ANY)
        mock_refresh.assert_called_once_with(mock.ANY, task, 'clean')
-        mock_notify.assert_called_once_with(task, 'clean')
+        mock_clean.assert_called_once_with(task)
        mock_set_steps.assert_called_once_with(task)

    @mock.patch.object(manager_utils, 'cleaning_error_handler', autospec=True)
@ -337,16 +337,16 @@ class HeartbeatMixinTest(AgentDeployMixinBaseTest):
                       'refresh_steps', autospec=True)
    @mock.patch.object(conductor_steps, 'set_node_cleaning_steps',
                       autospec=True)
-    @mock.patch.object(manager_utils, 'notify_conductor_resume_operation',
-                       autospec=True)
-    def test_heartbeat_resume_clean_fails(self, mock_notify, mock_set_steps,
+    @mock.patch.object(cleaning, 'continue_node_clean', autospec=True)
+    def test_heartbeat_resume_clean_fails(self, mock_clean, mock_set_steps,
                                          mock_refresh, mock_touch,
                                          mock_handler):
-        mocks = [mock_refresh, mock_set_steps, mock_notify]
-        self.node.clean_step = {}
-        self.node.provision_state = states.CLEANWAIT
-        self.node.save()
+        mocks = [mock_refresh, mock_set_steps, mock_clean]
        for i in range(len(mocks)):
+            self.node.clean_step = {}
+            self.node.provision_state = states.CLEANWAIT
+            self.node.save()
+
            before_failed_mocks = mocks[:i]
            failed_mock = mocks[i]
            after_failed_mocks = mocks[i + 1:]
@ -1528,31 +1528,6 @@ class AgentDeployMixinTest(AgentDeployMixinBaseTest):
            self.assertFalse(prepare_mock.called)
            self.assertFalse(failed_state_mock.called)

-    @mock.patch.object(manager_utils, 'notify_conductor_resume_operation',
-                       autospec=True)
-    @mock.patch.object(agent_client.AgentClient, 'get_commands_status',
-                       autospec=True)
-    def test_continue_cleaning(self, status_mock, notify_mock):
-        # Test a successful execute clean step on the agent
-        self.node.clean_step = {
-            'priority': 10,
-            'interface': 'deploy',
-            'step': 'erase_devices',
-            'reboot_requested': False
-        }
-        self.node.save()
-        status_mock.return_value = [{
-            'command_status': 'SUCCEEDED',
-            'command_name': 'execute_clean_step',
-            'command_result': {
-                'clean_step': self.node.clean_step
-            }
-        }]
-        with task_manager.acquire(self.context, self.node['uuid'],
-                                  shared=False) as task:
-            self.deploy.continue_cleaning(task)
-            notify_mock.assert_called_once_with(task, 'clean')
-
    @mock.patch.object(deploy_utils, 'build_agent_options', autospec=True)
    @mock.patch.object(pxe.PXEBoot, 'prepare_ramdisk', spec_set=True,
                       autospec=True)
@ -1647,295 +1622,6 @@ class AgentDeployMixinTest(AgentDeployMixinBaseTest):
            self.assertNotIn('deployment_reboot',
                             task.node.driver_internal_info)

-    @mock.patch.object(deploy_utils, 'build_agent_options', autospec=True)
-    @mock.patch.object(pxe.PXEBoot, 'prepare_ramdisk', spec_set=True,
-                       autospec=True)
-    @mock.patch.object(manager_utils, 'node_power_action', autospec=True)
-    @mock.patch.object(agent_client.AgentClient, 'get_commands_status',
-                       autospec=True)
-    def test_continue_cleaning_reboot(
-            self, status_mock, reboot_mock, mock_prepare, mock_build_opt):
-        # Test a successful execute clean step on the agent, with reboot
-        self.node.clean_step = {
-            'priority': 42,
-            'interface': 'deploy',
-            'step': 'reboot_me_afterwards',
-            'reboot_requested': True
-        }
-        self.node.save()
-        status_mock.return_value = [{
-            'command_status': 'SUCCEEDED',
-            'command_name': 'execute_clean_step',
-            'command_result': {
-                'clean_step': self.node.clean_step
-            }
-        }]
-        with task_manager.acquire(self.context, self.node['uuid'],
-                                  shared=False) as task:
-            self.deploy.continue_cleaning(task)
-            reboot_mock.assert_called_once_with(task, states.REBOOT)
-
-    @mock.patch.object(manager_utils, 'notify_conductor_resume_operation',
-                       autospec=True)
-    @mock.patch.object(agent_client.AgentClient, 'get_commands_status',
-                       autospec=True)
-    def test_continue_cleaning_after_reboot(self, status_mock, notify_mock):
-        # Test a successful execute clean step on the agent, with reboot
-        self.node.clean_step = {
-            'priority': 42,
-            'interface': 'deploy',
-            'step': 'reboot_me_afterwards',
-            'reboot_requested': True
-        }
-        driver_internal_info = self.node.driver_internal_info
-        driver_internal_info['cleaning_reboot'] = True
-        self.node.driver_internal_info = driver_internal_info
-        self.node.save()
-        # Represents a freshly booted agent with no commands
-        status_mock.return_value = []
-
-        with task_manager.acquire(self.context, self.node['uuid'],
-                                  shared=False) as task:
-            self.deploy.continue_cleaning(task)
-            notify_mock.assert_called_once_with(task, 'clean')
-            self.assertNotIn('cleaning_reboot',
-                             task.node.driver_internal_info)
-
-    @mock.patch.object(agent_base,
-                       '_get_post_step_hook', autospec=True)
-    @mock.patch.object(manager_utils, 'notify_conductor_resume_operation',
-                       autospec=True)
-    @mock.patch.object(agent_client.AgentClient, 'get_commands_status',
-                       autospec=True)
-    def test_continue_cleaning_with_hook(
-            self, status_mock, notify_mock, get_hook_mock):
-        self.node.clean_step = {
-            'priority': 10,
-            'interface': 'raid',
-            'step': 'create_configuration',
-        }
-        self.node.save()
-        command_status = {
-            'command_status': 'SUCCEEDED',
-            'command_name': 'execute_clean_step',
-            'command_result': {'clean_step': self.node.clean_step}}
-        status_mock.return_value = [command_status]
-        hook_mock = mock.MagicMock(spec=types.FunctionType, __name__='foo')
-        get_hook_mock.return_value = hook_mock
-        with task_manager.acquire(self.context, self.node.uuid,
-                                  shared=False) as task:
-            self.deploy.continue_cleaning(task)
-
-            get_hook_mock.assert_called_once_with(task.node, 'clean')
-            hook_mock.assert_called_once_with(task, command_status)
-            notify_mock.assert_called_once_with(task, 'clean')
-
-    @mock.patch.object(driver_utils, 'collect_ramdisk_logs', autospec=True)
-    @mock.patch.object(manager_utils, 'notify_conductor_resume_operation',
-                       autospec=True)
-    @mock.patch.object(agent_base,
-                       '_get_post_step_hook', autospec=True)
-    @mock.patch.object(manager_utils, 'cleaning_error_handler', autospec=True)
-    @mock.patch.object(agent_client.AgentClient, 'get_commands_status',
-                       autospec=True)
-    def test_continue_cleaning_with_hook_fails(
-            self, status_mock, error_handler_mock, get_hook_mock,
-            notify_mock, collect_logs_mock):
-        self.node.clean_step = {
-            'priority': 10,
-            'interface': 'raid',
-            'step': 'create_configuration',
-        }
-        self.node.save()
-        command_status = {
-            'command_status': 'SUCCEEDED',
-            'command_name': 'execute_clean_step',
-            'command_result': {'clean_step': self.node.clean_step}}
-        status_mock.return_value = [command_status]
-        hook_mock = mock.MagicMock(spec=types.FunctionType, __name__='foo')
-        hook_mock.side_effect = RuntimeError('error')
-        get_hook_mock.return_value = hook_mock
-        with task_manager.acquire(self.context, self.node.uuid,
-                                  shared=False) as task:
-            self.deploy.continue_cleaning(task)
-
-            get_hook_mock.assert_called_once_with(task.node, 'clean')
-            hook_mock.assert_called_once_with(task, command_status)
-            error_handler_mock.assert_called_once_with(task, mock.ANY,
-                                                       traceback=True)
-            self.assertFalse(notify_mock.called)
-            collect_logs_mock.assert_called_once_with(task.node,
-                                                      label='cleaning')
-
-    @mock.patch.object(manager_utils, 'notify_conductor_resume_operation',
-                       autospec=True)
-    @mock.patch.object(agent_client.AgentClient, 'get_commands_status',
-                       autospec=True)
-    def test_continue_cleaning_old_command(self, status_mock, notify_mock):
-        # Test when a second execute_clean_step happens to the agent, but
-        # the new step hasn't started yet.
-        self.node.clean_step = {
-            'priority': 10,
-            'interface': 'deploy',
-            'step': 'erase_devices',
-            'reboot_requested': False
-        }
-        self.node.save()
-        status_mock.return_value = [{
-            'command_status': 'SUCCEEDED',
-            'command_name': 'execute_clean_step',
-            'command_result': {
-                'priority': 20,
-                'interface': 'deploy',
-                'step': 'update_firmware',
-                'reboot_requested': False
-            }
-        }]
-        with task_manager.acquire(self.context, self.node['uuid'],
-                                  shared=False) as task:
-            self.deploy.continue_cleaning(task)
-            self.assertFalse(notify_mock.called)
-
-    @mock.patch.object(manager_utils, 'notify_conductor_resume_operation',
-                       autospec=True)
-    @mock.patch.object(agent_client.AgentClient, 'get_commands_status',
-                       autospec=True)
-    def test_continue_cleaning_running(self, status_mock, notify_mock):
-        # Test that no action is taken while a clean step is executing
-        status_mock.return_value = [{
-            'command_status': 'RUNNING',
-            'command_name': 'execute_clean_step',
-            'command_result': None
-        }]
-        with task_manager.acquire(self.context, self.node['uuid'],
-                                  shared=False) as task:
-            self.deploy.continue_cleaning(task)
-            self.assertFalse(notify_mock.called)
-
-    @mock.patch.object(manager_utils, 'notify_conductor_resume_operation',
-                       autospec=True)
-    @mock.patch.object(agent_client.AgentClient, 'get_commands_status',
-                       autospec=True)
-    def test_continue_cleaning_no_step_running(self, status_mock, notify_mock):
-        status_mock.return_value = [{
-            'command_status': 'SUCCEEDED',
-            'command_name': 'get_clean_steps',
-            'command_result': []
-        }]
-        with task_manager.acquire(self.context, self.node['uuid'],
-                                  shared=False) as task:
-            self.deploy.continue_cleaning(task)
-            notify_mock.assert_called_once_with(task, 'clean')
-
-    @mock.patch.object(driver_utils, 'collect_ramdisk_logs', autospec=True)
-    @mock.patch.object(manager_utils, 'cleaning_error_handler', autospec=True)
-    @mock.patch.object(agent_client.AgentClient, 'get_commands_status',
-                       autospec=True)
-    def test_continue_cleaning_fail(self, status_mock, error_mock,
-                                    collect_logs_mock):
-        # Test that a failure puts the node in CLEANFAIL
-        status_mock.return_value = [{
-            'command_status': 'FAILED',
-            'command_name': 'execute_clean_step',
-            'command_result': {}
-        }]
-        with task_manager.acquire(self.context, self.node['uuid'],
-                                  shared=False) as task:
-            task.node.clean_step = {
-                'step': 'erase_devices',
-                'interface': 'deploy',
-            }
-            self.deploy.continue_cleaning(task)
-            error_mock.assert_called_once_with(task, mock.ANY, traceback=False)
-            collect_logs_mock.assert_called_once_with(task.node,
-                                                      label='cleaning')
-
-    @mock.patch.object(conductor_steps, 'set_node_cleaning_steps',
-                       autospec=True)
-    @mock.patch.object(manager_utils, 'notify_conductor_resume_operation',
-                       autospec=True)
-    @mock.patch.object(agent_base.AgentBaseMixin, 'refresh_steps',
-                       autospec=True)
-    @mock.patch.object(agent_client.AgentClient, 'get_commands_status',
-                       autospec=True)
-    def _test_continue_cleaning_clean_version_mismatch(
-            self, status_mock, refresh_steps_mock, notify_mock, steps_mock,
-            manual=False):
-        status_mock.return_value = [{
-            'command_status': 'CLEAN_VERSION_MISMATCH',
-            'command_name': 'execute_clean_step',
-        }]
-        tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE
-        self.node.provision_state = states.CLEANWAIT
-        self.node.target_provision_state = tgt_prov_state
-        self.node.save()
-        with task_manager.acquire(self.context, self.node.uuid,
-                                  shared=False) as task:
-            self.deploy.continue_cleaning(task)
-            notify_mock.assert_called_once_with(task, 'clean')
-            refresh_steps_mock.assert_called_once_with(mock.ANY, task, 'clean')
-            if manual:
-                self.assertFalse(
-                    task.node.driver_internal_info['skip_current_clean_step'])
-                self.assertFalse(steps_mock.called)
-            else:
-                steps_mock.assert_called_once_with(task)
-                self.assertNotIn('skip_current_clean_step',
-                                 task.node.driver_internal_info)
-
-    def test_continue_cleaning_automated_clean_version_mismatch(self):
-        self._test_continue_cleaning_clean_version_mismatch()
-
-    def test_continue_cleaning_manual_clean_version_mismatch(self):
-        self._test_continue_cleaning_clean_version_mismatch(manual=True)
-
-    @mock.patch.object(manager_utils, 'cleaning_error_handler', autospec=True)
-    @mock.patch.object(conductor_steps, 'set_node_cleaning_steps',
-                       autospec=True)
-    @mock.patch.object(manager_utils, 'notify_conductor_resume_operation',
-                       autospec=True)
-    @mock.patch.object(agent_base.AgentBaseMixin, 'refresh_steps',
-                       autospec=True)
-    @mock.patch.object(agent_client.AgentClient, 'get_commands_status',
-                       autospec=True)
-    def test_continue_cleaning_clean_version_mismatch_fail(
-            self, status_mock, refresh_steps_mock, notify_mock, steps_mock,
-            error_mock, manual=False):
-        status_mock.return_value = [{
-            'command_status': 'CLEAN_VERSION_MISMATCH',
-            'command_name': 'execute_clean_step',
-            'command_result': {'hardware_manager_version': {'Generic': '1'}}
-        }]
-        refresh_steps_mock.side_effect = exception.NodeCleaningFailure("boo")
-        tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE
-        self.node.provision_state = states.CLEANWAIT
-        self.node.target_provision_state = tgt_prov_state
-        self.node.save()
-        with task_manager.acquire(self.context, self.node.uuid,
-                                  shared=False) as task:
-            self.deploy.continue_cleaning(task)
-
-            status_mock.assert_called_once_with(mock.ANY, task.node)
-            refresh_steps_mock.assert_called_once_with(mock.ANY, task, 'clean')
-            error_mock.assert_called_once_with(task, mock.ANY, traceback=True)
-            self.assertFalse(notify_mock.called)
-            self.assertFalse(steps_mock.called)
-
-    @mock.patch.object(manager_utils, 'cleaning_error_handler', autospec=True)
-    @mock.patch.object(agent_client.AgentClient, 'get_commands_status',
-                       autospec=True)
-    def test_continue_cleaning_unknown(self, status_mock, error_mock):
-        # Test that unknown commands are treated as failures
-        status_mock.return_value = [{
-            'command_status': 'UNKNOWN',
-            'command_name': 'execute_clean_step',
-            'command_result': {}
-        }]
-        with task_manager.acquire(self.context, self.node['uuid'],
-                                  shared=False) as task:
-            self.deploy.continue_cleaning(task)
-            error_mock.assert_called_once_with(task, mock.ANY, traceback=False)
-
    def _test_clean_step_hook(self):
        """Helper method for unit tests related to clean step hooks."""
        some_function_mock = mock.MagicMock()
@ -1985,6 +1671,351 @@ class AgentDeployMixinTest(AgentDeployMixinBaseTest):
        self.assertIsNone(hook_returned)


+class ContinueCleaningTest(AgentDeployMixinBaseTest):
+
+    def setUp(self):
+        super().setUp()
+        self.node.provision_state = states.CLEANWAIT
+        self.node.target_provision_state = states.AVAILABLE
+        self.node.save()
+
+    @mock.patch.object(cleaning, 'continue_node_clean', autospec=True)
+    @mock.patch.object(agent_client.AgentClient, 'get_commands_status',
+                       autospec=True)
+    def test_continue_cleaning(self, status_mock, clean_mock):
+        # Test a successful execute clean step on the agent
+        self.node.clean_step = {
+            'priority': 10,
+            'interface': 'deploy',
+            'step': 'erase_devices',
+            'reboot_requested': False
+        }
+        self.node.save()
+        status_mock.return_value = [{
+            'command_status': 'SUCCEEDED',
+            'command_name': 'execute_clean_step',
+            'command_result': {
+                'clean_step': self.node.clean_step
+            }
+        }]
+        with task_manager.acquire(self.context, self.node['uuid'],
+                                  shared=False) as task:
+            self.deploy.continue_cleaning(task)
+            clean_mock.assert_called_once_with(task)
+            self.assertEqual(states.CLEANING, task.node.provision_state)
+            self.assertEqual(states.AVAILABLE,
+                             task.node.target_provision_state)
+
+    @mock.patch.object(cleaning, 'continue_node_clean', autospec=True)
+    @mock.patch.object(agent_client.AgentClient, 'get_commands_status',
+                       autospec=True)
+    def test_continue_manual_cleaning(self, status_mock, clean_mock):
+        self.node.target_provision_state = states.MANAGEABLE
+        self.node.clean_step = {
+            'priority': 10,
+            'interface': 'deploy',
+            'step': 'erase_devices',
+            'reboot_requested': False
+        }
+        self.node.save()
+        status_mock.return_value = [{
+            'command_status': 'SUCCEEDED',
+            'command_name': 'execute_clean_step',
+            'command_result': {
+                'clean_step': self.node.clean_step
+            }
+        }]
+        with task_manager.acquire(self.context, self.node['uuid'],
+                                  shared=False) as task:
+            self.deploy.continue_cleaning(task)
+            clean_mock.assert_called_once_with(task)
+            self.assertEqual(states.CLEANING, task.node.provision_state)
+            self.assertEqual(states.MANAGEABLE,
+                             task.node.target_provision_state)
+
+    @mock.patch.object(deploy_utils, 'build_agent_options', autospec=True)
+    @mock.patch.object(pxe.PXEBoot, 'prepare_ramdisk', spec_set=True,
+                       autospec=True)
+    @mock.patch.object(manager_utils, 'node_power_action', autospec=True)
+    @mock.patch.object(agent_client.AgentClient, 'get_commands_status',
+                       autospec=True)
+    def test_continue_cleaning_reboot(
+            self, status_mock, reboot_mock, mock_prepare, mock_build_opt):
+        # Test a successful execute clean step on the agent, with reboot
+        self.node.clean_step = {
+            'priority': 42,
+            'interface': 'deploy',
+            'step': 'reboot_me_afterwards',
+            'reboot_requested': True
+        }
+        self.node.save()
+        status_mock.return_value = [{
+            'command_status': 'SUCCEEDED',
+            'command_name': 'execute_clean_step',
+            'command_result': {
+                'clean_step': self.node.clean_step
+            }
+        }]
+        with task_manager.acquire(self.context, self.node['uuid'],
+                                  shared=False) as task:
+            self.deploy.continue_cleaning(task)
+            reboot_mock.assert_called_once_with(task, states.REBOOT)
+
+    @mock.patch.object(cleaning, 'continue_node_clean', autospec=True)
+    @mock.patch.object(agent_client.AgentClient, 'get_commands_status',
+                       autospec=True)
+    def test_continue_cleaning_after_reboot(self, status_mock, clean_mock):
+        # Test a successful execute clean step on the agent, with reboot
+        self.node.clean_step = {
+            'priority': 42,
+            'interface': 'deploy',
+            'step': 'reboot_me_afterwards',
+            'reboot_requested': True
+        }
+        driver_internal_info = self.node.driver_internal_info
+        driver_internal_info['cleaning_reboot'] = True
+        self.node.driver_internal_info = driver_internal_info
+        self.node.save()
+        # Represents a freshly booted agent with no commands
+        status_mock.return_value = []
+
+        with task_manager.acquire(self.context, self.node['uuid'],
+                                  shared=False) as task:
+            self.deploy.continue_cleaning(task)
+            clean_mock.assert_called_once_with(task)
+            self.assertEqual(states.CLEANING, task.node.provision_state)
+            self.assertNotIn('cleaning_reboot',
+                             task.node.driver_internal_info)
+
+    @mock.patch.object(agent_base,
+                       '_get_post_step_hook', autospec=True)
+    @mock.patch.object(cleaning, 'continue_node_clean', autospec=True)
+    @mock.patch.object(agent_client.AgentClient, 'get_commands_status',
+                       autospec=True)
+    def test_continue_cleaning_with_hook(
+            self, status_mock, clean_mock, get_hook_mock):
+        self.node.clean_step = {
+            'priority': 10,
+            'interface': 'raid',
+            'step': 'create_configuration',
+        }
+        self.node.save()
+        command_status = {
+            'command_status': 'SUCCEEDED',
+            'command_name': 'execute_clean_step',
+            'command_result': {'clean_step': self.node.clean_step}}
+        status_mock.return_value = [command_status]
+        hook_mock = mock.MagicMock(spec=types.FunctionType, __name__='foo')
+        get_hook_mock.return_value = hook_mock
+        with task_manager.acquire(self.context, self.node.uuid,
+                                  shared=False) as task:
+            self.deploy.continue_cleaning(task)
+
+            get_hook_mock.assert_called_once_with(task.node, 'clean')
+            hook_mock.assert_called_once_with(task, command_status)
+            clean_mock.assert_called_once_with(task)
+
+    @mock.patch.object(driver_utils, 'collect_ramdisk_logs', autospec=True)
+    @mock.patch.object(cleaning, 'continue_node_clean', autospec=True)
+    @mock.patch.object(agent_base,
+                       '_get_post_step_hook', autospec=True)
+    @mock.patch.object(manager_utils, 'cleaning_error_handler', autospec=True)
+    @mock.patch.object(agent_client.AgentClient, 'get_commands_status',
+                       autospec=True)
+    def test_continue_cleaning_with_hook_fails(
+            self, status_mock, error_handler_mock, get_hook_mock,
+            clean_mock, collect_logs_mock):
+        self.node.clean_step = {
+            'priority': 10,
+            'interface': 'raid',
+            'step': 'create_configuration',
+        }
+        self.node.save()
+        command_status = {
+            'command_status': 'SUCCEEDED',
+            'command_name': 'execute_clean_step',
+            'command_result': {'clean_step': self.node.clean_step}}
+        status_mock.return_value = [command_status]
+        hook_mock = mock.MagicMock(spec=types.FunctionType, __name__='foo')
+        hook_mock.side_effect = RuntimeError('error')
+        get_hook_mock.return_value = hook_mock
+        with task_manager.acquire(self.context, self.node.uuid,
+                                  shared=False) as task:
+            self.deploy.continue_cleaning(task)
+
+            get_hook_mock.assert_called_once_with(task.node, 'clean')
+            hook_mock.assert_called_once_with(task, command_status)
+            error_handler_mock.assert_called_once_with(task, mock.ANY,
+                                                       traceback=True)
+            self.assertFalse(clean_mock.called)
+            collect_logs_mock.assert_called_once_with(task.node,
+                                                      label='cleaning')
+
+    @mock.patch.object(cleaning, 'continue_node_clean', autospec=True)
+    @mock.patch.object(agent_client.AgentClient, 'get_commands_status',
+                       autospec=True)
+    def test_continue_cleaning_old_command(self, status_mock, clean_mock):
+        # Test when a second execute_clean_step happens to the agent, but
+        # the new step hasn't started yet.
+        self.node.clean_step = {
+            'priority': 10,
+            'interface': 'deploy',
+            'step': 'erase_devices',
+            'reboot_requested': False
+        }
+        self.node.save()
+        status_mock.return_value = [{
+            'command_status': 'SUCCEEDED',
+            'command_name': 'execute_clean_step',
+            'command_result': {
+                'priority': 20,
+                'interface': 'deploy',
+                'step': 'update_firmware',
+                'reboot_requested': False
+            }
+        }]
+        with task_manager.acquire(self.context, self.node['uuid'],
+                                  shared=False) as task:
+            self.deploy.continue_cleaning(task)
+            self.assertFalse(clean_mock.called)
+
+    @mock.patch.object(cleaning, 'continue_node_clean', autospec=True)
+    @mock.patch.object(agent_client.AgentClient, 'get_commands_status',
+                       autospec=True)
+    def test_continue_cleaning_running(self, status_mock, clean_mock):
+        # Test that no action is taken while a clean step is executing
+        status_mock.return_value = [{
+            'command_status': 'RUNNING',
+            'command_name': 'execute_clean_step',
+            'command_result': None
+        }]
+        with task_manager.acquire(self.context, self.node['uuid'],
+                                  shared=False) as task:
+            self.deploy.continue_cleaning(task)
+            self.assertFalse(clean_mock.called)
+
+    @mock.patch.object(cleaning, 'continue_node_clean', autospec=True)
+    @mock.patch.object(agent_client.AgentClient, 'get_commands_status',
+                       autospec=True)
+    def test_continue_cleaning_no_step_running(self, status_mock, clean_mock):
+        status_mock.return_value = [{
+            'command_status': 'SUCCEEDED',
+            'command_name': 'get_clean_steps',
+            'command_result': []
+        }]
+        with task_manager.acquire(self.context, self.node['uuid'],
+                                  shared=False) as task:
+            self.deploy.continue_cleaning(task)
+            clean_mock.assert_called_once_with(task)
+
+    @mock.patch.object(driver_utils, 'collect_ramdisk_logs', autospec=True)
+    @mock.patch.object(manager_utils, 'cleaning_error_handler', autospec=True)
+    @mock.patch.object(agent_client.AgentClient, 'get_commands_status',
+                       autospec=True)
+    def test_continue_cleaning_fail(self, status_mock, error_mock,
+                                    collect_logs_mock):
+        # Test that a failure puts the node in CLEANFAIL
+        status_mock.return_value = [{
+            'command_status': 'FAILED',
+            'command_name': 'execute_clean_step',
+            'command_result': {}
+        }]
+        with task_manager.acquire(self.context, self.node['uuid'],
+                                  shared=False) as task:
+            task.node.clean_step = {
+                'step': 'erase_devices',
+                'interface': 'deploy',
+            }
+            self.deploy.continue_cleaning(task)
+            error_mock.assert_called_once_with(task, mock.ANY, traceback=False)
+            collect_logs_mock.assert_called_once_with(task.node,
+                                                      label='cleaning')
+
+    @mock.patch.object(conductor_steps, 'set_node_cleaning_steps',
+                       autospec=True)
+    @mock.patch.object(cleaning, 'continue_node_clean', autospec=True)
+    @mock.patch.object(agent_base.AgentBaseMixin, 'refresh_steps',
+                       autospec=True)
+    @mock.patch.object(agent_client.AgentClient, 'get_commands_status',
+                       autospec=True)
+    def _test_continue_cleaning_clean_version_mismatch(
+            self, status_mock, refresh_steps_mock, clean_mock, steps_mock,
+            manual=False):
+        status_mock.return_value = [{
+            'command_status': 'CLEAN_VERSION_MISMATCH',
+            'command_name': 'execute_clean_step',
+        }]
+        tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE
+        self.node.provision_state = states.CLEANWAIT
+        self.node.target_provision_state = tgt_prov_state
+        self.node.save()
+        with task_manager.acquire(self.context, self.node.uuid,
+                                  shared=False) as task:
+            self.deploy.continue_cleaning(task)
+            clean_mock.assert_called_once_with(task)
+            refresh_steps_mock.assert_called_once_with(mock.ANY, task, 'clean')
+            if manual:
+                self.assertFalse(
+                    task.node.driver_internal_info['skip_current_clean_step'])
+                self.assertFalse(steps_mock.called)
+            else:
+                steps_mock.assert_called_once_with(task)
+                self.assertNotIn('skip_current_clean_step',
+                                 task.node.driver_internal_info)
+
+    def test_continue_cleaning_automated_clean_version_mismatch(self):
+        self._test_continue_cleaning_clean_version_mismatch()
+
+    def test_continue_cleaning_manual_clean_version_mismatch(self):
+        self._test_continue_cleaning_clean_version_mismatch(manual=True)
+
+    @mock.patch.object(manager_utils, 'cleaning_error_handler', autospec=True)
+    @mock.patch.object(conductor_steps, 'set_node_cleaning_steps',
+                       autospec=True)
+    @mock.patch.object(cleaning, 'continue_node_clean', autospec=True)
+    @mock.patch.object(agent_base.AgentBaseMixin, 'refresh_steps',
+                       autospec=True)
+    @mock.patch.object(agent_client.AgentClient, 'get_commands_status',
+                       autospec=True)
+    def test_continue_cleaning_clean_version_mismatch_fail(
+            self, status_mock, refresh_steps_mock, clean_mock, steps_mock,
+            error_mock, manual=False):
+        status_mock.return_value = [{
+            'command_status': 'CLEAN_VERSION_MISMATCH',
+            'command_name': 'execute_clean_step',
+            'command_result': {'hardware_manager_version': {'Generic': '1'}}
+        }]
+        refresh_steps_mock.side_effect = exception.NodeCleaningFailure("boo")
+        tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE
+        self.node.provision_state = states.CLEANWAIT
+        self.node.target_provision_state = tgt_prov_state
+        self.node.save()
+        with task_manager.acquire(self.context, self.node.uuid,
+                                  shared=False) as task:
+            self.deploy.continue_cleaning(task)
+
+            status_mock.assert_called_once_with(mock.ANY, task.node)
+            refresh_steps_mock.assert_called_once_with(mock.ANY, task, 'clean')
+            error_mock.assert_called_once_with(task, mock.ANY, traceback=True)
+            self.assertFalse(clean_mock.called)
+            self.assertFalse(steps_mock.called)
+
+    @mock.patch.object(manager_utils, 'cleaning_error_handler', autospec=True)
+    @mock.patch.object(agent_client.AgentClient, 'get_commands_status',
+                       autospec=True)
+    def test_continue_cleaning_unknown(self, status_mock, error_mock):
+        # Test that unknown commands are treated as failures
+        status_mock.return_value = [{
+            'command_status': 'UNKNOWN',
+            'command_name': 'execute_clean_step',
+            'command_result': {}
+        }]
+        with task_manager.acquire(self.context, self.node['uuid'],
+                                  shared=False) as task:
+            self.deploy.continue_cleaning(task)
+            error_mock.assert_called_once_with(task, mock.ANY, traceback=False)
+
+
 class TestRefreshCleanSteps(AgentDeployMixinBaseTest):

    def setUp(self):
--- a/releasenotes/notes/no-deploy-rpc-dec8ee1d0326d1ad.yaml
+++ b/releasenotes/notes/no-deploy-rpc-dec8ee1d0326d1ad.yaml
@ -0,0 +1,5 @@
+---
+other:
+  - |
+    The agent deploy and cleaning code no longer uses an RPC call to the same
+    conductor when proceeding to the next deploy or clean step.