From a0dddb19a6e94b2df64235eb229ca3fd9027c0fc Mon Sep 17 00:00:00 2001
From: Dmitry Tantsur
Date: Mon, 11 Nov 2024 12:02:17 +0100
Subject: [PATCH] Agent deploy: account for disable_power_off

* In tear_down_agent: lock down the agent and keep the node powered on.
* In boot_instance: reboot the node instead of powering it on.
* In tear_down_inband_cleaning/service: avoid powering off the node.
* In tear_down_service: lock down the agent before tearing down.
* In tear_down and clean_up: do not power off, reboot after clean-up.
* In rescue/unrescue: use one reboot instead of power off/on.

Not locking down in tear_down_inband_cleaning because the node will not
end up on a tenant network (and lockdown may disrupt fast-track).

Depends-On: https://review.opendev.org/c/openstack/ironic-python-agent/+/934234
Partial-Bug: #2077432
Change-Id: Ib2297ec5b69df6b2ecd11942fb8fdc9d640de6bc
---
 ironic/drivers/modules/agent.py               |  69 +++---
 ironic/drivers/modules/agent_base.py          |  23 +-
 ironic/drivers/modules/agent_client.py        |  74 ++++++
 ironic/drivers/modules/deploy_utils.py        |  51 ++--
 ironic/drivers/utils.py                       |  16 ++
 .../tests/unit/drivers/modules/test_agent.py  | 226 +++++++++++++++---
 .../unit/drivers/modules/test_agent_client.py |  75 ++++++
 .../unit/drivers/modules/test_deploy_utils.py |  43 +++-
 8 files changed, 482 insertions(+), 95 deletions(-)

diff --git a/ironic/drivers/modules/agent.py b/ironic/drivers/modules/agent.py
index 105454d655..2cc63c5a09 100644
--- a/ironic/drivers/modules/agent.py
+++ b/ironic/drivers/modules/agent.py
@@ -412,7 +412,11 @@ class CustomAgentDeploy(agent_base.AgentBaseMixin,
         client = agent_client.get_client(task)
         try:
-            if not can_power_on:
+            if node.disable_power_off:
+                LOG.info("Node %s does not support power off, locking "
+                         "down the agent", node.uuid)
+                client.lockdown(node)
+            elif not can_power_on:
                 LOG.info('Power interface of node %s does not support '
                          'power on, using reboot to switch to the instance',
                          node.uuid)
@@ -486,7 +490,7 @@ class CustomAgentDeploy(agent_base.AgentBaseMixin,
                 LOG.debug('The agent for node %(node)s has recently checked '
                           'in, and the node power will remain unmodified.',
                           {'node': task.node.uuid})
-            else:
+            elif not node.disable_power_off:
                 # Powering off node to setup networking for port and
                 # ensure that the state is reset if it is inadvertently
                 # on for any unknown reason.
@@ -1136,24 +1140,23 @@ class AgentRescue(base.RescueInterface):
         :raises: any boot interface's prepare_ramdisk exceptions.
         :returns: Returns states.RESCUEWAIT
         """
-        manager_utils.node_power_action(task, states.POWER_OFF)
-        # NOTE(TheJulia): Revealing that the power is off at any time can
-        # cause external power sync to decide that the node must be off.
-        # This may result in a post-rescued instance being turned off
-        # unexpectedly after rescue has started.
-        # TODO(TheJulia): Once we have power/state callbacks to nova,
-        # the reset of the power_state can be removed.
-        task.node.power_state = states.POWER_ON
-        task.node.save()
+        with driver_utils.power_off_and_on(task):
+            # NOTE(TheJulia): Revealing that the power is off at any time can
+            # cause external power sync to decide that the node must be off.
+            # This may result in a post-rescued instance being turned off
+            # unexpectedly after rescue has started.
+            # TODO(TheJulia): Once we have power/state callbacks to nova,
+            # the reset of the power_state can be removed.
+            task.node.power_state = states.POWER_ON
+            task.node.save()
 
-        task.driver.boot.clean_up_instance(task)
-        with manager_utils.power_state_for_network_configuration(task):
-            task.driver.network.unconfigure_tenant_networks(task)
-            task.driver.network.add_rescuing_network(task)
-        if CONF.agent.manage_agent_boot:
-            # prepare_ramdisk will set the boot device
-            deploy_utils.prepare_agent_boot(task)
-        manager_utils.node_power_action(task, states.POWER_ON)
+            task.driver.boot.clean_up_instance(task)
+            with manager_utils.power_state_for_network_configuration(task):
+                task.driver.network.unconfigure_tenant_networks(task)
+                task.driver.network.add_rescuing_network(task)
+            if CONF.agent.manage_agent_boot:
+                # prepare_ramdisk will set the boot device
+                deploy_utils.prepare_agent_boot(task)
 
         return states.RESCUEWAIT
 
@@ -1171,22 +1174,20 @@ class AgentRescue(base.RescueInterface):
         :raises: any boot interface's prepare_instance exceptions.
         :returns: Returns states.ACTIVE
         """
-        manager_utils.node_power_action(task, states.POWER_OFF)
+        with driver_utils.power_off_and_on(task):
+            # NOTE(TheJulia): Revealing that the power is off at any time can
+            # cause external power sync to decide that the node must be off.
+            # This may result in a post-rescued instance being turned off
+            # unexpectedly after unrescue.
+            # TODO(TheJulia): Once we have power/state callbacks to nova,
+            # the reset of the power_state can be removed.
+            task.node.power_state = states.POWER_ON
+            task.node.save()
 
-        # NOTE(TheJulia): Revealing that the power is off at any time can
-        # cause external power sync to decide that the node must be off.
-        # This may result in a post-rescued instance being turned off
-        # unexpectedly after unrescue.
-        # TODO(TheJulia): Once we have power/state callbacks to nova,
-        # the reset of the power_state can be removed.
-        task.node.power_state = states.POWER_ON
-        task.node.save()
-
-        self.clean_up(task)
-        with manager_utils.power_state_for_network_configuration(task):
-            task.driver.network.configure_tenant_networks(task)
-        task.driver.boot.prepare_instance(task)
-        manager_utils.node_power_action(task, states.POWER_ON)
+            self.clean_up(task)
+            with manager_utils.power_state_for_network_configuration(task):
+                task.driver.network.configure_tenant_networks(task)
+            task.driver.boot.prepare_instance(task)
 
         return states.ACTIVE
 
diff --git a/ironic/drivers/modules/agent_base.py b/ironic/drivers/modules/agent_base.py
index a937abd1c8..9315446b12 100644
--- a/ironic/drivers/modules/agent_base.py
+++ b/ironic/drivers/modules/agent_base.py
@@ -755,7 +755,8 @@ class AgentBaseMixin(object):
         :raises: other exceptions by the node's power driver if
                  something wrong occurred during the power action.
         """
-        manager_utils.node_power_action(task, states.POWER_OFF)
+        if not task.node.disable_power_off:
+            manager_utils.node_power_action(task, states.POWER_OFF)
         task.driver.storage.detach_volumes(task)
         deploy_utils.tear_down_storage_configuration(task)
         with manager_utils.power_state_for_network_configuration(task):
@@ -779,6 +780,11 @@ class AgentBaseMixin(object):
             task.driver.boot.clean_up_instance(task)
         provider = dhcp_factory.DHCPFactory()
         provider.clean_dhcp(task)
+        # NOTE(dtantsur): in a regular case, this is handled by tear_down, but
+        # we cannot power off the node there, so making sure it no longer runs
+        # the instance image by rebooting after boot.clean_up_instance.
+        if task.node.disable_power_off:
+            manager_utils.node_power_action(task, states.REBOOT)
 
     def take_over(self, task):
         """Take over management of this node from a dead conductor.
@@ -841,8 +847,14 @@ class AgentBaseMixin(object):
         :raises: NodeServiceFailure, NetworkError if the servicing ports
                  cannot be removed
         """
-        deploy_utils.tear_down_inband_service(
-            task)
+        if task.node.disable_power_off:
+            LOG.debug('Locking down the agent on node %s before exiting '
+                      'servicing', task.node.uuid)
+            agent_client.get_client(task).lockdown(
+                # NOTE(dtantsur): we cannot assume the agent is still running
+                # hence fail_if_unavailable=False.
+                task.node, fail_if_unavailable=False)
+        deploy_utils.tear_down_inband_service(task)
 
     @METRICS.timer('AgentBaseMixin.get_clean_steps')
     def get_clean_steps(self, task):
@@ -1178,7 +1190,10 @@ class AgentOobStepsMixin(object):
         can_power_on = (states.POWER_ON in
                         task.driver.power.get_supported_power_states(task))
         try:
-            if can_power_on:
+            if task.node.disable_power_off:
+                # We haven't powered off the node yet - reset it now.
+                manager_utils.node_power_action(task, states.REBOOT)
+            elif can_power_on:
                 manager_utils.node_power_action(task, states.POWER_ON)
             else:
                 LOG.debug('Not trying to power on node %s that does not '
diff --git a/ironic/drivers/modules/agent_client.py b/ironic/drivers/modules/agent_client.py
index 8e3e4e5ebe..74d17f5f0f 100644
--- a/ironic/drivers/modules/agent_client.py
+++ b/ironic/drivers/modules/agent_client.py
@@ -15,10 +15,12 @@
 from http import client as http_client
 import os
 import ssl
+import time
 
 from ironic_lib import metrics_utils
 from oslo_log import log
 from oslo_serialization import jsonutils
+from oslo_utils import excutils
 from oslo_utils import strutils
 import requests
 import tenacity
@@ -839,3 +841,75 @@ class AgentClient(object):
         return self._command(node=node,
                              method='rescue.finalize_rescue',
                              params=params)
+
+    @METRICS.timer('AgentClient.lockdown')
+    def lockdown(self, node, fail_if_unavailable=True):
+        """Lock down the agent so that it's not usable any more.
+
+        :param node: A Node object.
+        :param fail_if_unavailable: Whether to fail this call if the agent is
+            already unavailable.
+        :raises: IronicException when failed to issue the request or there
+            was a malformed response from the agent.
+        :raises: AgentAPIError when agent failed to execute specified command.
+        :raises: AgentInProgress when the command fails to execute as the agent
+            is presently executing the prior command.
+ """ + if not fail_if_unavailable: + try: + self.get_commands_status(node, expect_errors=True) + except exception.AgentConnectionFailed: + LOG.debug('Agent is already down when trying to lock down ' + 'node %s', node.uuid) + return + except Exception: + LOG.exception('Unexpected exception when checking agent ' + 'status on node %s, proceeding with lockdown', + node.uuid) + + wait = CONF.agent.post_deploy_get_power_state_retry_interval + attempts = CONF.agent.post_deploy_get_power_state_retries + 1 + + @tenacity.retry(stop=tenacity.stop_after_attempt(attempts), + retry=tenacity.retry_unless_exception_type( + exception.AgentConnectionFailed), + wait=tenacity.wait_fixed(wait), + sleep=time.sleep, # for unit testing + reraise=True) + def _wait_until_locked_down(node): + self.get_commands_status(node, expect_errors=True) + LOG.debug('Agent is still available on node %s, waiting for the ' + 'lockdown command to take effect', node.uuid) + + self.sync(node) + + try: + self._command(node=node, method='system.lockdown', params={}) + except Exception as e: + with excutils.save_and_raise_exception(): + LOG.error('Failed to lock down node %(node_uuid)s. ' + '%(cls)s: %(error)s', + {'node_uuid': node.uuid, + 'cls': e.__class__.__name__, 'error': e}, + exc_info=not isinstance( + e, exception.IronicException)) + + try: + _wait_until_locked_down(node) + except exception.AgentConnectionFailed: + pass # expected + except tenacity.RetryError: + LOG.error('Failed to lock down node %(node_uuid)s in at least ' + '%(timeout)d seconds: agent is still available', + {'node_uuid': node.uuid, + 'timeout': (wait * (attempts - 1))}) + raise exception.AgentCommandTimeout(command='system.lockdown', + node=node.uuid) + except Exception as e: + with excutils.save_and_reraise_exception(): + LOG.error('Failed to lock down node %(node_uuid)s. ' + '%(cls)s: %(error)s', + {'node_uuid': node.uuid, + 'cls': e.__class__.__name__, 'error': e}, + exc_info=not isinstance( + e, exception.IronicException)) diff --git a/ironic/drivers/modules/deploy_utils.py b/ironic/drivers/modules/deploy_utils.py index 4848e2fb7e..aeb0385b9e 100644 --- a/ironic/drivers/modules/deploy_utils.py +++ b/ironic/drivers/modules/deploy_utils.py @@ -772,7 +772,7 @@ def tear_down_inband_cleaning(task, manage_boot=True): node = task.node cleaning_failure = (node.fault == faults.CLEAN_FAILURE) - if not (fast_track or cleaning_failure): + if not (fast_track or cleaning_failure or node.disable_power_off): manager_utils.node_power_action(task, states.POWER_OFF) if manage_boot: @@ -781,8 +781,11 @@ def tear_down_inband_cleaning(task, manage_boot=True): power_state_to_restore = manager_utils.power_on_node_if_needed(task) task.driver.network.remove_cleaning_network(task) if not (fast_track or cleaning_failure): - manager_utils.restore_power_state_if_needed( - task, power_state_to_restore) + if node.disable_power_off: + manager_utils.node_power_action(task, states.REBOOT) + else: + manager_utils.restore_power_state_if_needed( + task, power_state_to_restore) def prepare_inband_service(task): @@ -797,24 +800,23 @@ def prepare_inband_service(task): :raises: any boot interface's prepare_ramdisk exceptions. :returns: Returns states.SERVICEWAIT """ - manager_utils.node_power_action(task, states.POWER_OFF) - # NOTE(TheJulia): Revealing that the power is off at any time can - # cause external power sync to decide that the node must be off. - # This may result in a post-rescued instance being turned off - # unexpectedly after rescue has started. 
-    # TODO(TheJulia): Once we have power/state callbacks to nova,
-    # the reset of the power_state can be removed.
-    task.node.power_state = states.POWER_ON
-    task.node.save()
+    with driver_utils.power_off_and_on(task):
+        # NOTE(TheJulia): Revealing that the power is off at any time can
+        # cause external power sync to decide that the node must be off.
+        # This may result in a post-rescued instance being turned off
+        # unexpectedly after rescue has started.
+        # TODO(TheJulia): Once we have power/state callbacks to nova,
+        # the reset of the power_state can be removed.
+        task.node.power_state = states.POWER_ON
+        task.node.save()
 
-    task.driver.boot.clean_up_instance(task)
-    with manager_utils.power_state_for_network_configuration(task):
-        task.driver.network.unconfigure_tenant_networks(task)
-        task.driver.network.add_servicing_network(task)
-    if CONF.agent.manage_agent_boot:
-        # prepare_ramdisk will set the boot device
-        prepare_agent_boot(task)
-    manager_utils.node_power_action(task, states.POWER_ON)
+        task.driver.boot.clean_up_instance(task)
+        with manager_utils.power_state_for_network_configuration(task):
+            task.driver.network.unconfigure_tenant_networks(task)
+            task.driver.network.add_servicing_network(task)
+        if CONF.agent.manage_agent_boot:
+            # prepare_ramdisk will set the boot device
+            prepare_agent_boot(task)
 
     return states.SERVICEWAIT
 
@@ -838,7 +840,7 @@
     node = task.node
     service_failure = (node.fault == faults.SERVICE_FAILURE)
 
-    if not service_failure:
+    if not service_failure and not node.disable_power_off:
         manager_utils.node_power_action(task, states.POWER_OFF)
 
     task.driver.boot.clean_up_ramdisk(task)
@@ -851,7 +853,9 @@
     task.driver.boot.prepare_instance(task)
     # prepare_instance does not power on the node, the deploy interface is
     # normally responsible for that.
-    manager_utils.node_power_action(task, states.POWER_ON)
+    next_state = (states.REBOOT if task.node.disable_power_off
+                  else states.POWER_ON)
+    manager_utils.node_power_action(task, next_state)
 
 
 def get_image_instance_info(node):
@@ -1652,7 +1656,8 @@
     disable_ramdisk = task.node.driver_internal_info.get(
         'cleaning_disable_ramdisk')
     if not disable_ramdisk:
-        if manager_utils.is_fast_track(task):
+        if (manager_utils.is_fast_track(task)
+                and not task.node.disable_power_off):
             LOG.debug('Forcing power off on node %s for a clean reboot into '
                       'the agent image', task.node)
             manager_utils.node_power_action(task, states.POWER_OFF)
diff --git a/ironic/drivers/utils.py b/ironic/drivers/utils.py
index 2d9bdcf5a3..e85b033f99 100644
--- a/ironic/drivers/utils.py
+++ b/ironic/drivers/utils.py
@@ -12,6 +12,7 @@
 # License for the specific language governing permissions and limitations
 # under the License.
 
+import contextlib
 import os
 import tempfile
 
@@ -504,3 +505,18 @@
                                    states.INSPECTWAIT,
                                    states.SERVICING,
                                    states.SERVICEWAIT)
+
+
+@contextlib.contextmanager
+def power_off_and_on(task):
+    """A context manager that powers the node off and back on.
+
+    If disable_power_off is False, the node is powered off before yielding
+    and powered back on afterwards. Otherwise, a single reboot is issued at
+    the end.
+ """ + if not task.node.disable_power_off: + utils.node_power_action(task, states.POWER_OFF) + yield + next_state = (states.REBOOT if task.node.disable_power_off + else states.POWER_ON) + utils.node_power_action(task, next_state) diff --git a/ironic/tests/unit/drivers/modules/test_agent.py b/ironic/tests/unit/drivers/modules/test_agent.py index 66024ccdb8..546b2f9919 100644 --- a/ironic/tests/unit/drivers/modules/test_agent.py +++ b/ironic/tests/unit/drivers/modules/test_agent.py @@ -342,6 +342,29 @@ class CommonTestsMixin: dhcp_factor_mock.assert_called_once_with() destroy_images_mock.assert_called_once_with(task.node) + @mock.patch.object(manager_utils, 'node_power_action', autospec=True) + @mock.patch.object(deploy_utils, 'destroy_http_instance_images', + autospec=True) + @mock.patch('ironic.common.dhcp_factory.DHCPFactory', autospec=True) + @mock.patch.object(pxe.PXEBoot, 'clean_up_instance', autospec=True) + @mock.patch.object(pxe.PXEBoot, 'clean_up_ramdisk', autospec=True) + def test_clean_up_disable_power_off(self, pxe_clean_up_ramdisk_mock, + pxe_clean_up_instance_mock, + dhcp_factor_mock, + destroy_images_mock, + node_power_action_mock): + with task_manager.acquire( + self.context, self.node['uuid'], shared=False) as task: + task.node.disable_power_off = True + self.driver.clean_up(task) + pxe_clean_up_ramdisk_mock.assert_called_once_with( + task.driver.boot, task) + pxe_clean_up_instance_mock.assert_called_once_with( + task.driver.boot, task) + dhcp_factor_mock.assert_called_once_with() + destroy_images_mock.assert_called_once_with(task.node) + node_power_action_mock.assert_called_once_with(task, states.REBOOT) + class TestCustomAgentDeploy(CommonTestsMixin, db_base.DbTestCase): def setUp(self): @@ -388,6 +411,7 @@ class TestCustomAgentDeploy(CommonTestsMixin, db_base.DbTestCase): show_mock.assert_not_called() validate_http_mock.assert_not_called() + @mock.patch('ironic.conductor.utils.node_power_action', autospec=True) @mock.patch.object(noop_storage.NoopStorage, 'attach_volumes', autospec=True) @mock.patch.object(deploy_utils, 'populate_storage_driver_internal_info', @@ -408,7 +432,7 @@ class TestCustomAgentDeploy(CommonTestsMixin, db_base.DbTestCase): unconfigure_tenant_net_mock, add_provisioning_net_mock, build_instance_info_mock, build_options_mock, pxe_prepare_ramdisk_mock, storage_driver_info_mock, - storage_attach_volumes_mock): + storage_attach_volumes_mock, node_power_action_mock): node = self.node node.network_interface = 'flat' node.save() @@ -427,7 +451,10 @@ class TestCustomAgentDeploy(CommonTestsMixin, db_base.DbTestCase): build_options_mock.assert_called_once_with(task.node) pxe_prepare_ramdisk_mock.assert_called_once_with( task.driver.boot, task, {'a': 'b'}) + node_power_action_mock.assert_called_once_with( + task, states.POWER_OFF) + @mock.patch('ironic.conductor.utils.node_power_action', autospec=True) @mock.patch('ironic.conductor.utils.is_fast_track', autospec=True) @mock.patch.object(noop_storage.NoopStorage, 'attach_volumes', autospec=True) @@ -449,7 +476,8 @@ class TestCustomAgentDeploy(CommonTestsMixin, db_base.DbTestCase): unconfigure_tenant_net_mock, add_provisioning_net_mock, build_instance_info_mock, build_options_mock, pxe_prepare_ramdisk_mock, storage_driver_info_mock, - storage_attach_volumes_mock, is_fast_track_mock): + storage_attach_volumes_mock, is_fast_track_mock, + node_power_action_mock): # TODO(TheJulia): We should revisit this test. 
         # Smartnic support didn't wire in tightly on testing for power in
         # these tests, and largely fast_track impacts power operations.
@@ -474,6 +502,50 @@
             # to work, reboots as part of deployment would need the ramdisk
             # present and ready.
             self.assertFalse(build_options_mock.called)
+            node_power_action_mock.assert_not_called()
+
+    @mock.patch('ironic.conductor.utils.node_power_action', autospec=True)
+    @mock.patch.object(noop_storage.NoopStorage, 'attach_volumes',
+                       autospec=True)
+    @mock.patch.object(deploy_utils, 'populate_storage_driver_internal_info',
+                       autospec=True)
+    @mock.patch.object(pxe.PXEBoot, 'prepare_ramdisk', autospec=True)
+    @mock.patch.object(deploy_utils, 'build_agent_options', autospec=True)
+    @mock.patch.object(deploy_utils, 'build_instance_info_for_deploy',
+                       autospec=True)
+    @mock.patch.object(flat_network.FlatNetwork, 'add_provisioning_network',
+                       spec_set=True, autospec=True)
+    @mock.patch.object(flat_network.FlatNetwork,
+                       'unconfigure_tenant_networks',
+                       spec_set=True, autospec=True)
+    @mock.patch.object(flat_network.FlatNetwork, 'validate',
+                       spec_set=True, autospec=True)
+    def test_prepare_disable_power_off(
+            self, validate_net_mock,
+            unconfigure_tenant_net_mock, add_provisioning_net_mock,
+            build_instance_info_mock, build_options_mock,
+            pxe_prepare_ramdisk_mock, storage_driver_info_mock,
+            storage_attach_volumes_mock, node_power_action_mock):
+        node = self.node
+        node.network_interface = 'flat'
+        node.disable_power_off = True
+        node.save()
+        with task_manager.acquire(
+                self.context, self.node['uuid'], shared=False) as task:
+            task.node.provision_state = states.DEPLOYING
+            build_options_mock.return_value = {'a': 'b'}
+            self.driver.prepare(task)
+            storage_driver_info_mock.assert_called_once_with(task)
+            validate_net_mock.assert_called_once_with(mock.ANY, task)
+            add_provisioning_net_mock.assert_called_once_with(mock.ANY, task)
+            unconfigure_tenant_net_mock.assert_called_once_with(mock.ANY,
+                                                                task)
+            storage_attach_volumes_mock.assert_called_once_with(
+                task.driver.storage, task)
+            build_instance_info_mock.assert_not_called()
+            build_options_mock.assert_called_once_with(task.node)
+            pxe_prepare_ramdisk_mock.assert_called_once_with(
+                task.driver.boot, task, {'a': 'b'})
+            node_power_action_mock.assert_not_called()
 
 
 class TestAgentDeploy(CommonTestsMixin, db_base.DbTestCase):
@@ -758,6 +830,41 @@
                 self.context, self.node['uuid'], shared=False) as task:
             self.assertEqual(0, len(task.volume_targets))
 
+    @mock.patch.object(noop_storage.NoopStorage, 'detach_volumes',
+                       autospec=True)
+    @mock.patch.object(flat_network.FlatNetwork,
+                       'remove_provisioning_network',
+                       spec_set=True, autospec=True)
+    @mock.patch.object(flat_network.FlatNetwork,
+                       'unconfigure_tenant_networks',
+                       spec_set=True, autospec=True)
+    @mock.patch('ironic.conductor.utils.node_power_action', autospec=True)
+    def test_tear_down_disable_power_off(self, power_mock,
+                                         unconfigure_tenant_nets_mock,
+                                         remove_provisioning_net_mock,
+                                         storage_detach_volumes_mock):
+        object_utils.create_test_volume_target(
+            self.context, node_id=self.node.id)
+        node = self.node
+        node.network_interface = 'flat'
+        node.disable_power_off = True
+        node.save()
+        with task_manager.acquire(
+                self.context, self.node['uuid'], shared=False) as task:
+            driver_return = self.driver.tear_down(task)
+            power_mock.assert_not_called()
+            self.assertEqual(driver_return, states.DELETED)
+            unconfigure_tenant_nets_mock.assert_called_once_with(mock.ANY,
+                                                                 task)
+            remove_provisioning_net_mock.assert_called_once_with(mock.ANY,
+                                                                 task)
+            storage_detach_volumes_mock.assert_called_once_with(
+                task.driver.storage, task)
+        # Verify no volumes exist for new task instances.
+        with task_manager.acquire(
+                self.context, self.node['uuid'], shared=False) as task:
+            self.assertEqual(0, len(task.volume_targets))
+
 
 @mock.patch('ironic.drivers.modules.agent.check_image_size', autospec=True)
 @mock.patch.object(noop_storage.NoopStorage, 'attach_volumes',
@@ -1193,8 +1300,19 @@
     def test_tear_down_service(self, tear_down_service_mock):
         with task_manager.acquire(self.context, self.node.uuid) as task:
             self.driver.tear_down_service(task)
-            tear_down_service_mock.assert_called_once_with(
-                task)
+            tear_down_service_mock.assert_called_once_with(task)
+
+    @mock.patch.object(agent_client.AgentClient, 'lockdown', autospec=True)
+    @mock.patch.object(deploy_utils, 'tear_down_inband_service',
+                       autospec=True)
+    def test_tear_down_service_disable_power_off(self, tear_down_service_mock,
+                                                 lockdown_mock):
+        with task_manager.acquire(self.context, self.node.uuid) as task:
+            task.node.disable_power_off = True
+            self.driver.tear_down_service(task)
+            tear_down_service_mock.assert_called_once_with(task)
+            lockdown_mock.assert_called_once_with(mock.ANY, task.node,
+                                                  fail_if_unavailable=False)
 
     @mock.patch.object(deploy_utils, 'tear_down_inband_cleaning',
                        autospec=True)
@@ -2080,6 +2198,32 @@ class AgentRescueTestCase(db_base.DbTestCase):
             self.assertFalse(mock_prepare_ramdisk.called)
         self.assertEqual(states.RESCUEWAIT, result)
 
+    @mock.patch.object(flat_network.FlatNetwork, 'add_rescuing_network',
+                       spec_set=True, autospec=True)
+    @mock.patch.object(flat_network.FlatNetwork,
+                       'unconfigure_tenant_networks',
+                       spec_set=True, autospec=True)
+    @mock.patch.object(fake.FakeBoot, 'prepare_ramdisk', autospec=True)
+    @mock.patch.object(fake.FakeBoot, 'clean_up_instance', autospec=True)
+    @mock.patch.object(deploy_utils, 'build_agent_options', autospec=True)
+    @mock.patch.object(manager_utils, 'node_power_action', autospec=True)
+    def test_agent_rescue_disable_power_off(
+            self, mock_node_power_action, mock_build_agent_opts,
+            mock_clean_up_instance, mock_prepare_ramdisk,
+            mock_unconf_tenant_net, mock_add_rescue_net):
+        self.config(manage_agent_boot=True, group='agent')
+        mock_build_agent_opts.return_value = {'ipa-api-url': 'fake-api'}
+        with task_manager.acquire(self.context, self.node.uuid) as task:
+            task.node.disable_power_off = True
+            result = task.driver.rescue.rescue(task)
+            mock_node_power_action.assert_called_once_with(task,
+                                                           states.REBOOT)
+            mock_clean_up_instance.assert_called_once_with(mock.ANY, task)
+            mock_unconf_tenant_net.assert_called_once_with(mock.ANY, task)
+            mock_add_rescue_net.assert_called_once_with(mock.ANY, task)
+            mock_build_agent_opts.assert_called_once_with(task.node)
+            mock_prepare_ramdisk.assert_called_once_with(
+                mock.ANY, task, {'ipa-api-url': 'fake-api'})
+        self.assertEqual(states.RESCUEWAIT, result)
+
     @mock.patch.object(flat_network.FlatNetwork, 'remove_rescuing_network',
                        spec_set=True, autospec=True)
     @mock.patch.object(flat_network.FlatNetwork, 'configure_tenant_networks',
                        spec_set=True, autospec=True)
@@ -2128,6 +2272,30 @@ class AgentRescueTestCase(db_base.DbTestCase):
             mock_prepare_instance.assert_called_once_with(mock.ANY, task)
         self.assertEqual(states.ACTIVE, result)
 
+    @mock.patch.object(flat_network.FlatNetwork, 'remove_rescuing_network',
+                       spec_set=True,
+                       autospec=True)
+    @mock.patch.object(flat_network.FlatNetwork, 'configure_tenant_networks',
+                       spec_set=True, autospec=True)
+    @mock.patch.object(fake.FakeBoot, 'prepare_instance', autospec=True)
+    @mock.patch.object(fake.FakeBoot, 'clean_up_ramdisk', autospec=True)
+    @mock.patch.object(manager_utils, 'node_power_action', autospec=True)
+    def test_agent_unrescue_disable_power_off(
+            self, mock_node_power_action, mock_clean_ramdisk,
+            mock_prepare_instance, mock_conf_tenant_net,
+            mock_remove_rescue_net):
+        """Test unrescue with disable_power_off using a single reboot."""
+        self.config(manage_agent_boot=True, group='agent')
+        with task_manager.acquire(self.context, self.node.uuid) as task:
+            task.node.disable_power_off = True
+            result = task.driver.rescue.unrescue(task)
+            mock_node_power_action.assert_called_once_with(task,
+                                                           states.REBOOT)
+            mock_clean_ramdisk.assert_called_once_with(
+                mock.ANY, task)
+            mock_remove_rescue_net.assert_called_once_with(mock.ANY, task)
+            mock_conf_tenant_net.assert_called_once_with(mock.ANY, task)
+            mock_prepare_instance.assert_called_once_with(mock.ANY, task)
+        self.assertEqual(states.ACTIVE, result)
+
     @mock.patch.object(fake.FakeBoot, 'clean_up_instance', autospec=True)
     @mock.patch.object(manager_utils, 'node_power_action', autospec=True)
     def test_agent_rescue_power_on(self, mock_node_power_action,
@@ -2343,6 +2511,9 @@ class TearDownAgentTest(test_agent_base.AgentDeployMixinBaseTest):
     def setUp(self):
         super().setUp()
         self.deploy = agent.CustomAgentDeploy()
+        self.node.provision_state = states.DEPLOYING
+        self.node.target_provision_state = states.ACTIVE
+        self.node.save()
 
     @mock.patch.object(manager_utils, 'power_on_node_if_needed',
                        autospec=True)
     @mock.patch.object(time, 'sleep', lambda seconds: None)
     @mock.patch.object(manager_utils, 'node_power_action', autospec=True)
     @mock.patch.object(fake.FakePower, 'get_power_state',
                        spec=types.FunctionType)
     @mock.patch.object(agent_client.AgentClient, 'power_off',
                        spec=types.FunctionType)
     def test_tear_down_agent(self, power_off_mock, get_power_state_mock,
                              node_power_action_mock, collect_mock,
                              power_on_node_if_needed_mock):
         cfg.CONF.set_override('deploy_logs_collect', 'always', 'agent')
-        self.node.provision_state = states.DEPLOYING
-        self.node.target_provision_state = states.ACTIVE
-        self.node.save()
         with task_manager.acquire(self.context, self.node.uuid) as task:
             get_power_state_mock.side_effect = [states.POWER_ON,
                                                 states.POWER_OFF]
@@ -2383,9 +2551,6 @@
     def test_tear_down_agent_soft_poweroff_doesnt_complete(
             self, power_off_mock, get_power_state_mock,
             node_power_action_mock, mock_collect):
-        self.node.provision_state = states.DEPLOYING
-        self.node.target_provision_state = states.ACTIVE
-        self.node.save()
         with task_manager.acquire(self.context, self.node.uuid) as task:
             get_power_state_mock.return_value = states.POWER_ON
             self.deploy.tear_down_agent(task)
@@ -2408,9 +2573,6 @@
             self, power_off_mock, get_power_state_mock,
             node_power_action_mock, mock_collect):
         power_off_mock.side_effect = RuntimeError("boom")
-        self.node.provision_state = states.DEPLOYING
-        self.node.target_provision_state = states.ACTIVE
-        self.node.save()
         with task_manager.acquire(self.context, self.node.uuid) as task:
             get_power_state_mock.return_value = states.POWER_ON
             self.deploy.tear_down_agent(task)
@@ -2434,9 +2596,6 @@
         # Test the situation when soft power off works, but ironic doesn't
         # learn about it.
         power_off_mock.side_effect = RuntimeError("boom")
-        self.node.provision_state = states.DEPLOYING
-        self.node.target_provision_state = states.ACTIVE
-        self.node.save()
         with task_manager.acquire(self.context, self.node.uuid) as task:
             get_power_state_mock.side_effect = [states.POWER_ON,
                                                 states.POWER_OFF]
@@ -2458,9 +2617,6 @@
     def test_tear_down_agent_get_power_state_fails(
             self, power_off_mock, get_power_state_mock,
             node_power_action_mock, mock_collect,
             power_on_node_if_needed_mock):
-        self.node.provision_state = states.DEPLOYING
-        self.node.target_provision_state = states.ACTIVE
-        self.node.save()
         with task_manager.acquire(self.context, self.node.uuid) as task:
             get_power_state_mock.return_value = RuntimeError("boom")
             power_on_node_if_needed_mock.return_value = None
@@ -2483,9 +2639,6 @@
     def test_tear_down_agent_power_off_fails(
             self, power_off_mock, get_power_state_mock,
             node_power_action_mock, mock_collect):
-        self.node.provision_state = states.DEPLOYING
-        self.node.target_provision_state = states.ACTIVE
-        self.node.save()
         with task_manager.acquire(self.context, self.node.uuid) as task:
             get_power_state_mock.return_value = states.POWER_ON
             node_power_action_mock.side_effect = RuntimeError("boom")
@@ -2509,10 +2662,8 @@
         driver_info = self.node.driver_info
         driver_info['deploy_forces_oob_reboot'] = True
         self.node.driver_info = driver_info
-
-        self.node.provision_state = states.DEPLOYING
-        self.node.target_provision_state = states.ACTIVE
         self.node.save()
+
         with task_manager.acquire(self.context, self.node.uuid) as task:
             self.deploy.tear_down_agent(task)
@@ -2534,10 +2685,8 @@
         driver_info = self.node.driver_info
         driver_info['deploy_forces_oob_reboot'] = True
         self.node.driver_info = driver_info
-
-        self.node.provision_state = states.DEPLOYING
-        self.node.target_provision_state = states.ACTIVE
         self.node.save()
+
         with task_manager.acquire(self.context, self.node.uuid) as task:
             log_mock.reset_mock()
@@ -2568,9 +2717,6 @@
             self, sync_mock, node_power_action_mock, collect_mock,
             power_on_node_if_needed_mock):
         cfg.CONF.set_override('deploy_logs_collect', 'always', 'agent')
-        self.node.provision_state = states.DEPLOYING
-        self.node.target_provision_state = states.ACTIVE
-        self.node.save()
         with task_manager.acquire(self.context, self.node.uuid) as task:
             self.deploy.tear_down_agent(task)
             node_power_action_mock.assert_called_once_with(task, states.REBOOT)
@@ -2581,6 +2727,24 @@
             sync_mock.assert_called_once_with(agent_client.get_client(task),
                                               task.node)
 
+    @mock.patch.object(manager_utils, 'power_on_node_if_needed',
+                       autospec=True)
+    @mock.patch.object(time, 'sleep', lambda seconds: None)
+    @mock.patch.object(manager_utils, 'node_power_action', autospec=True)
+    @mock.patch.object(agent_client.AgentClient, 'lockdown', autospec=True)
+    def test_tear_down_agent_disable_power_off(
+            self, lockdown_mock, node_power_action_mock,
+            power_on_node_if_needed_mock):
+        self.node.disable_power_off = True
+        self.node.save()
+        with task_manager.acquire(self.context, self.node.uuid) as task:
+            power_on_node_if_needed_mock.return_value = None
+            self.deploy.tear_down_agent(task)
+            lockdown_mock.assert_called_once_with(mock.ANY,
+                                                  task.node)
+            node_power_action_mock.assert_not_called()
+        self.assertEqual(states.DEPLOYING, task.node.provision_state)
+        self.assertEqual(states.ACTIVE, task.node.target_provision_state)
+        self.assertIsNone(task.node.last_error)
+
 
 class SwitchToTenantNetworkTest(test_agent_base.AgentDeployMixinBaseTest):
diff --git a/ironic/tests/unit/drivers/modules/test_agent_client.py b/ironic/tests/unit/drivers/modules/test_agent_client.py
index 89a95a66ab..33516292bc 100644
--- a/ironic/tests/unit/drivers/modules/test_agent_client.py
+++ b/ironic/tests/unit/drivers/modules/test_agent_client.py
@@ -894,3 +894,78 @@ class TestAgentClientAttempts(base.TestCase):
                                           params={'wait': 'false'},
                                           timeout=60,
                                           verify=True)
+
+
+@mock.patch('time.sleep', autospec=True)
+@mock.patch.object(agent_client.AgentClient, 'get_commands_status',
+                   autospec=True)
+@mock.patch.object(agent_client.AgentClient, '_command', autospec=True)
+class TestLockDown(base.TestCase):
+    def setUp(self):
+        super().setUp()
+        self.client = agent_client.AgentClient()
+        self.node = MockNode()
+
+    def test_okay(self, mock_command, mock_status, mock_sleep):
+        mock_status.side_effect = [
+            {},
+            {},
+            exception.AgentConnectionFailed(reason=''),
+        ]
+
+        self.client.lockdown(self.node)
+
+        mock_command.assert_has_calls([
+            mock.call(self.client, node=self.node,
+                      method='standby.sync', params={}, wait=True),
+            mock.call(self.client, node=self.node,
+                      method='system.lockdown', params={}),
+        ])
+        mock_status.assert_called_with(self.client, self.node,
+                                       expect_errors=True)
+        self.assertEqual(3, mock_status.call_count)
+        mock_sleep.assert_called_with(5)
+
+    def test_okay_with_fail_if_unavailable(self, mock_command, mock_status,
+                                           mock_sleep):
+        mock_status.side_effect = [
+            {},
+            {},
+            exception.AgentConnectionFailed(reason=''),
+        ]
+
+        self.client.lockdown(self.node, fail_if_unavailable=False)
+
+        mock_command.assert_has_calls([
+            mock.call(self.client, node=self.node,
+                      method='standby.sync', params={}, wait=True),
+            mock.call(self.client, node=self.node,
+                      method='system.lockdown', params={}),
+        ])
+        mock_status.assert_called_with(self.client, self.node,
+                                       expect_errors=True)
+        self.assertEqual(3, mock_status.call_count)
+        mock_sleep.assert_called_with(5)
+
+    def test_agent_already_down(self, mock_command, mock_status, mock_sleep):
+        mock_status.side_effect = exception.AgentConnectionFailed(reason='')
+
+        self.client.lockdown(self.node, fail_if_unavailable=False)
+
+        mock_command.assert_not_called()
+        mock_status.assert_called_once_with(self.client, self.node,
+                                            expect_errors=True)
+
+    def test_timeout(self, mock_command, mock_status, mock_sleep):
+        self.assertRaises(exception.AgentCommandTimeout,
+                          self.client.lockdown, self.node)
+
+        mock_command.assert_has_calls([
+            mock.call(self.client, node=self.node,
+                      method='standby.sync', params={}, wait=True),
+            mock.call(self.client, node=self.node,
+                      method='system.lockdown', params={}),
+        ])
+        mock_status.assert_called_with(self.client, self.node,
+                                       expect_errors=True)
+        mock_sleep.assert_called_with(5)
diff --git a/ironic/tests/unit/drivers/modules/test_deploy_utils.py b/ironic/tests/unit/drivers/modules/test_deploy_utils.py
index 171bdcae9e..c8940ff5ec 100644
--- a/ironic/tests/unit/drivers/modules/test_deploy_utils.py
+++ b/ironic/tests/unit/drivers/modules/test_deploy_utils.py
@@ -1049,17 +1049,22 @@ class AgentMethodsTestCase(db_base.DbTestCase):
     def _test_tear_down_inband_cleaning(
             self, power_mock, remove_cleaning_network_mock,
             clean_up_ramdisk_mock, is_fast_track_mock,
-            manage_boot=True,
-            fast_track=False, cleaning_error=False):
+            manage_boot=True, fast_track=False, cleaning_error=False,
+            disable_power_off=False):
         is_fast_track_mock.return_value = fast_track
         with task_manager.acquire(
                 self.context, self.node.uuid, shared=False) as task:
+            task.node.disable_power_off = disable_power_off
             if cleaning_error:
                 task.node.fault = faults.CLEAN_FAILURE
             utils.tear_down_inband_cleaning(task, manage_boot=manage_boot)
         if not (fast_track or cleaning_error):
-            power_mock.assert_called_once_with(task, states.POWER_OFF)
+            if disable_power_off:
+                power_mock.assert_called_once_with(task, states.REBOOT)
+            else:
+                power_mock.assert_called_once_with(task, states.POWER_OFF)
         else:
-            self.assertFalse(power_mock.called)
+            power_mock.assert_not_called()
         remove_cleaning_network_mock.assert_called_once_with(
             task.driver.network, task)
         if manage_boot:
@@ -1080,6 +1085,13 @@ class AgentMethodsTestCase(db_base.DbTestCase):
     def test_tear_down_inband_cleaning_cleaning_error(self):
         self._test_tear_down_inband_cleaning(cleaning_error=True)
 
+    def test_tear_down_inband_cleaning_disable_power_off(self):
+        self._test_tear_down_inband_cleaning(disable_power_off=True)
+
+    def test_tear_down_inband_cleaning_disable_power_off_and_fast_track(self):
+        self._test_tear_down_inband_cleaning(disable_power_off=True,
+                                             fast_track=True)
+
     @mock.patch.object(pxe.PXEBoot, 'prepare_instance', autospec=True)
     @mock.patch.object(pxe.PXEBoot, 'clean_up_ramdisk', autospec=True)
     @mock.patch('ironic.drivers.modules.network.flat.FlatNetwork.'
@@ -1123,6 +1135,31 @@ class AgentMethodsTestCase(db_base.DbTestCase):
             clean_up_ramdisk_mock.assert_called_once_with(
                 task.driver.boot, task)
 
+    @mock.patch.object(pxe.PXEBoot, 'prepare_instance', autospec=True)
+    @mock.patch.object(pxe.PXEBoot, 'clean_up_ramdisk', autospec=True)
+    @mock.patch('ironic.drivers.modules.network.flat.FlatNetwork.'
+                'remove_servicing_network', autospec=True)
+    @mock.patch('ironic.conductor.utils.node_power_action', autospec=True)
+    def test_tear_down_inband_service_disable_power_off(
+            self, power_mock, remove_service_network_mock,
+            clean_up_ramdisk_mock, prepare_instance_mock):
+        # NOTE(TheJulia): This should be back to servicing upon a heartbeat
+        # operation, before we go back to WAIT. We wouldn't know to teardown
+        # in a wait state anyway.
+        self.node.provision_state = states.SERVICING
+        self.node.disable_power_off = True
+        self.node.save()
+        with task_manager.acquire(
+                self.context, self.node.uuid, shared=False) as task:
+            utils.tear_down_inband_service(task)
+            power_mock.assert_called_once_with(task, states.REBOOT)
+            remove_service_network_mock.assert_called_once_with(
+                task.driver.network, task)
+            clean_up_ramdisk_mock.assert_called_once_with(
+                task.driver.boot, task)
+            prepare_instance_mock.assert_called_once_with(task.driver.boot,
+                                                          task)
+
     def test_build_agent_options_conf(self):
         self.config(endpoint_override='https://api-url',
                     group='service_catalog')
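
Illustration: the core behavioural change is the power_off_and_on context
manager added to ironic/drivers/utils.py. Below is a minimal standalone
sketch of its semantics that can be run outside of ironic; the FakeNode,
FakeTask and node_power_action stubs are invented for this example and are
not part of the patch.

    import contextlib


    class FakeNode:
        # Stand-in for ironic.objects.Node; only the field used here.
        def __init__(self, disable_power_off):
            self.disable_power_off = disable_power_off


    class FakeTask:
        # Stand-in for a TaskManager task holding a node.
        def __init__(self, node):
            self.node = node


    def node_power_action(task, state):
        # Stand-in for ironic.conductor.utils.node_power_action.
        print('power action: %s' % state)


    @contextlib.contextmanager
    def power_off_and_on(task):
        # Mirrors the patch: power off before the work and power on after,
        # or issue a single reboot at the end when power off is not allowed.
        if not task.node.disable_power_off:
            node_power_action(task, 'power off')
        yield
        next_state = ('rebooting' if task.node.disable_power_off
                      else 'power on')
        node_power_action(task, next_state)


    with power_off_and_on(FakeTask(FakeNode(disable_power_off=True))):
        print('reconfiguring networks')
    # With disable_power_off=True this prints only 'rebooting' at the end;
    # with False it prints 'power off' first and 'power on' afterwards.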
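The retry predicate in AgentClient.lockdown can look inverted on first
reading: a *successful* get_commands_status means the agent is still up and
the wait must continue, while AgentConnectionFailed is the success signal.
Here is a toy reproduction of that loop, assuming only the tenacity library;
the AgentConnectionFailed class and the get_commands_status stub are
placeholders, not ironic code.

    import tenacity


    class AgentConnectionFailed(Exception):
        """Placeholder for ironic.common.exception.AgentConnectionFailed."""


    # Pretend the agent answers two polls, then goes away after the lockdown.
    _responses = iter([{}, {}, AgentConnectionFailed()])


    def get_commands_status():
        outcome = next(_responses)
        if isinstance(outcome, Exception):
            raise outcome
        return outcome


    @tenacity.retry(stop=tenacity.stop_after_attempt(5),
                    # Keep retrying on success (agent still answering) and
                    # stop as soon as the connection fails.
                    retry=tenacity.retry_unless_exception_type(
                        AgentConnectionFailed),
                    wait=tenacity.wait_fixed(0.1),
                    reraise=True)
    def wait_until_locked_down():
        get_commands_status()


    try:
        wait_until_locked_down()
    except AgentConnectionFailed:
        print('lockdown confirmed: agent no longer reachable')
    except tenacity.RetryError:
        # Raised when all attempts returned normally, i.e. the agent kept
        # answering; the patch maps this case to AgentCommandTimeout.
        print('timed out: agent still reachable')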