diff --git a/ironic/common/automated_clean_methods.py b/ironic/common/automated_clean_methods.py new file mode 100644 index 0000000000..0e31402eff --- /dev/null +++ b/ironic/common/automated_clean_methods.py @@ -0,0 +1,28 @@ +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + + +""" +Mapping of values for CONF.conductor.automated_clean_step_source, representing +different possible sources for cleaning steps. +""" + +AUTOGENERATED = 'autogenerated' +"Use traditional Ironic method with automatically determined cleaning steps" + +RUNBOOK = 'runbook' +"Use a runbook for automated cleaning; fail if no runbook is configured." + +HYBRID = 'hybrid' +"Use a runbook if configured; use autogenerated steps if not." diff --git a/ironic/conductor/cleaning.py b/ironic/conductor/cleaning.py index 9c8b052805..cb17a6c62c 100644 --- a/ironic/conductor/cleaning.py +++ b/ironic/conductor/cleaning.py @@ -29,7 +29,8 @@ LOG = log.getLogger(__name__) @task_manager.require_exclusive_lock -def do_node_clean(task, clean_steps=None, disable_ramdisk=False): +def do_node_clean(task, clean_steps=None, disable_ramdisk=False, + automated_with_steps=False): """Internal RPC method to perform cleaning of a node. :param task: a TaskManager instance with an exclusive lock on its node @@ -38,9 +39,11 @@ def do_node_clean(task, clean_steps=None, disable_ramdisk=False): For more information, see the clean_steps parameter of :func:`ConductorManager.do_node_clean`. :param disable_ramdisk: Whether to skip booting ramdisk for cleaning. + :param automated_with_steps: Indicates this is a declarative automated + clean powered by a runbook. """ node = task.node - manual_clean = clean_steps is not None + manual_clean = clean_steps is not None and automated_with_steps is False clean_type = 'manual' if manual_clean else 'automated' LOG.debug('Starting %(type)s cleaning for node %(node)s', {'type': clean_type, 'node': node.uuid}) @@ -77,10 +80,11 @@ def do_node_clean(task, clean_steps=None, disable_ramdisk=False): return utils.cleaning_error_handler(task, msg) utils.wipe_cleaning_internal_info(task) - if manual_clean: + if clean_steps: node.set_driver_internal_info('clean_steps', clean_steps) node.set_driver_internal_info('cleaning_disable_ramdisk', disable_ramdisk) + node.set_driver_internal_info('declarative_cleaning', True) task.node.save() utils.node_update_cache(task) @@ -120,7 +124,9 @@ def do_node_clean(task, clean_steps=None, disable_ramdisk=False): try: conductor_steps.set_node_cleaning_steps( - task, disable_ramdisk=disable_ramdisk) + task, disable_ramdisk=disable_ramdisk, + use_existing_steps=bool(clean_steps) + ) except Exception as e: # Catch all exceptions and follow the error handling # path so things are cleaned up properly. diff --git a/ironic/conductor/manager.py b/ironic/conductor/manager.py index 08a2e33af7..38865e2c40 100644 --- a/ironic/conductor/manager.py +++ b/ironic/conductor/manager.py @@ -51,6 +51,7 @@ from oslo_utils import excutils from oslo_utils import timeutils from oslo_utils import uuidutils +from ironic.common import automated_clean_methods from ironic.common import boot_devices from ironic.common import driver_factory from ironic.common import exception @@ -1181,9 +1182,74 @@ class ConductorManager(base_manager.BaseConductorManager): # instance_uuid, refresh the node to get these changes. node.refresh() + # Get steps for automated cleaning, if we are configured to use + # a cleaning runbook for this node. clean_steps will be None + # in the case we need autogenerated cleaning steps + clean_steps, dis_rd = self._get_steps_for_automated_cleaning(task) + # Begin cleaning task.process_event('clean') - cleaning.do_node_clean(task) + cleaning.do_node_clean(task, + clean_steps=clean_steps, + disable_ramdisk=dis_rd, + automated_with_steps=bool(clean_steps)) + + def _get_steps_for_automated_cleaning(self, task): + """Return steps for automated cleaning, if configured + + Automated cleaning steps are determined two ways: + - Configured explicitly via runbook + - Autogenerated based on driver and step priorities -- usually + from IPA hardware managers + + This method returns the steps used if the node is configured + for automated cleaning via runbook. If the default, autogenerated + steps are to be used, we return none. + + :param node: A node object + :returns: tuple of: None or list of cleaning steps, disable_ramdisk + """ + node = task.node + rb_source = CONF.conductor.automated_cleaning_step_source + rb_validate = CONF.conductor.automated_cleaning_runbook_validate_traits + if rb_source == automated_clean_methods.AUTOGENERATED: + # Do not attempt to use a cleaning runbook, just return None + return None, False + + runbook_ident = ( + CONF.conductor.automated_cleaning_runbook_by_resource_class.get( + node.resource_class, CONF.conductor.automated_cleaning_runbook + ) + ) + + if CONF.conductor.automated_cleaning_runbook_from_node: + runbook_ident = node.driver_info.get('cleaning_runbook', + runbook_ident) + + if runbook_ident: + runbook = self.get_runbook(task.context, runbook_ident) + if rb_validate and (runbook.name not in + node.traits.get_trait_names()): + msg = _( + "Automated cleaning runbook %(rb)s is not valid for " + "node %(node)s. Runbook name must match a node trait." + ) % {'rb': runbook.name, 'node': node.uuid} + # NOTE(JayF): Cleaning failures due to misconfiguration + # logged at error, here and below. + LOG.error(msg) + raise exception.NodeCleaningFailure(node=node, message=msg) + + return convert_steps(runbook.steps), runbook.disable_ramdisk + elif rb_source == automated_clean_methods.RUNBOOK: + msg = _("Unable to perform cleaning for node %(node)s: " + "configuration requires runbook for automated " + "cleaning and none was configured." % {'node': node.uuid}) + LOG.error(msg) + raise exception.NodeCleaningFailure(node=node, message=msg) + else: + # Node is using automated_clean_methods.HYBRID and does not have + # a configured runbook; return "None" to signal normal cleaning. + return None, False @METRICS.timer('ConductorManager.do_node_clean') @messaging.expected_exceptions(exception.InvalidParameterValue, @@ -1249,8 +1315,8 @@ class ConductorManager(base_manager.BaseConductorManager): task.process_event( 'clean', callback=self._spawn_worker, - call_args=(cleaning.do_node_clean, task, clean_steps, - disable_ramdisk), + call_args=(cleaning.do_node_clean, task, + clean_steps, disable_ramdisk, False), err_handler=utils.provisioning_error_handler, target_state=states.MANAGEABLE) except exception.InvalidState: @@ -1332,10 +1398,15 @@ class ConductorManager(base_manager.BaseConductorManager): if node.retired: raise exception.NodeIsRetired(op=_('providing'), node=node.uuid) + steps, dis_rd = self._get_steps_for_automated_cleaning(task) + + # Begin cleaning task.process_event( 'provide', callback=self._spawn_worker, - call_args=(cleaning.do_node_clean, task), + call_args=( + cleaning.do_node_clean, + task, steps, dis_rd, bool(steps)), err_handler=utils.provisioning_error_handler) return @@ -3995,11 +4066,28 @@ class ConductorManager(base_manager.BaseConductorManager): "%(device_types)s from node %(node)s: %(exc)s", device_types=device_types) + @messaging.expected_exceptions(exception.NotFound) + def get_runbook(self, context, runbook_ident): + """Get runbook from the UUID or logical name. + + :param runbook_ident: the UUID or logical name of a runbook. + + :returns: object.Runbook + :raises: InvalidUuidOrName if the name or uuid provided is not valid. + :raises: RunbookNotFound if the runbook is not found. + """ + # If runbook_ident is instead a valid UUID, treat it as a UUID. + if uuidutils.is_uuid_like(runbook_ident): + return objects.Runbook.get_by_uuid( + context, runbook_ident) + else: + return objects.Runbook.get_by_name( + context, runbook_ident) + # NOTE(TheJulia): This is the end of the class definition for the # conductor manager. Methods for RPC and stuffs should go above this # point in the File. Everything below is a helper or periodic. - @METRICS.timer('get_vendor_passthru_metadata') def get_vendor_passthru_metadata(route_dict): d = {} @@ -4230,3 +4318,22 @@ def do_detach_virtual_media(task, device_types): for device_type in device_types: file_name = _virtual_media_file_name(task, device_type) image_utils.cleanup_remote_image(task, file_name) + + +def convert_steps(steps): + """Removes nonserializable fields from steps.""" + fixed_steps = [] + for step in steps: + result = { + 'interface': step['interface'], + 'step': step['step'], + 'args': step['args'], + } + + if 'priority' in step: + result['priority'] = step['priority'] + elif 'order' in step: + result['order'] = step['order'] + + fixed_steps.append(result) + return fixed_steps diff --git a/ironic/conductor/steps.py b/ironic/conductor/steps.py index 79a23edeff..39f75fbe5c 100644 --- a/ironic/conductor/steps.py +++ b/ironic/conductor/steps.py @@ -293,15 +293,22 @@ def _get_verify_steps(task, enabled=False, sort=True): return verify_steps -def set_node_cleaning_steps(task, disable_ramdisk=False): +def set_node_cleaning_steps(task, disable_ramdisk=False, + use_existing_steps=False): """Set up the node with clean step information for cleaning. For automated cleaning, get the clean steps from the driver. For manual cleaning, the user's clean steps are known but need to be validated against the driver's clean steps. + :param task: A TaskManager object :param disable_ramdisk: If `True`, only steps with requires_ramdisk=False are accepted. + :param use_existing_steps: If `True`, validate existing steps from + driver_internal_info['clean_steps'] which comes from manual cleaning + API call or a runbook-based declarative automated cleaning. If + `False`, retrieve cleaning steps from the driver (implicit + automated cleaning). :raises: InvalidParameterValue if there is a problem with the user's clean steps. :raises: NodeCleaningFailure if there was a problem getting the @@ -309,14 +316,9 @@ def set_node_cleaning_steps(task, disable_ramdisk=False): """ node = task.node - # For manual cleaning, the target provision state is MANAGEABLE, whereas - # for automated cleaning, it is AVAILABLE. - manual_clean = node.target_provision_state == states.MANAGEABLE - - if not manual_clean: - # Get the prioritized steps for automated cleaning - steps = _get_cleaning_steps(task, enabled=True) - else: + # NOTE(JayF): This is true when doing manual cleaning or when doing + # cleaning via runbook. + if use_existing_steps is True: # For manual cleaning, the list of cleaning steps was specified by the # user and already saved in node.driver_internal_info['clean_steps']. # Now that we know what the driver's available clean steps are, we can @@ -324,7 +326,13 @@ def set_node_cleaning_steps(task, disable_ramdisk=False): steps = _validate_user_clean_steps( task, node.driver_internal_info['clean_steps'], disable_ramdisk=disable_ramdisk) + else: + # Get the prioritized steps for automated cleaning + steps = _get_cleaning_steps(task, enabled=True) + # For manual cleaning, the target provision state is MANAGEABLE, whereas + # for automated cleaning, it is AVAILABLE. + manual_clean = node.target_provision_state == states.MANAGEABLE LOG.debug('List of the steps for %(type)s cleaning of node %(node)s: ' '%(steps)s', {'type': 'manual' if manual_clean else 'automated', 'node': node.uuid, diff --git a/ironic/conductor/utils.py b/ironic/conductor/utils.py index 0c7164e9fd..eeb4e1d759 100644 --- a/ironic/conductor/utils.py +++ b/ironic/conductor/utils.py @@ -676,6 +676,7 @@ def wipe_cleaning_internal_info(task): node.del_driver_internal_info('clean_step_index') node.del_driver_internal_info('cleaning_disable_ramdisk') node.del_driver_internal_info('steps_validated') + node.del_driver_internal_info('declarative_cleaning') async_steps.remove_node_flags(node) diff --git a/ironic/conf/conductor.py b/ironic/conf/conductor.py index b9634fae40..3575b865b4 100644 --- a/ironic/conf/conductor.py +++ b/ironic/conf/conductor.py @@ -18,6 +18,7 @@ from oslo_config import cfg from oslo_config import types +from ironic.common import automated_clean_methods from ironic.common import boot_modes from ironic.common.i18n import _ from ironic.common import lessee_sources @@ -146,12 +147,71 @@ opts = [ 'state. When enabled, the particular steps ' 'performed to clean a node depend on which driver ' 'that node is managed by; see the individual ' - 'driver\'s documentation for details. ' - 'NOTE: The introduction of the cleaning operation ' - 'causes instance deletion to take significantly ' - 'longer. In an environment where all tenants are ' - 'trusted (eg, because there is only one tenant), ' - 'this option could be safely disabled.')), + 'driver\'s documentation for details.')), + cfg.StrOpt('automated_cleaning_step_source', + default='autogenerated', + mutable=True, + choices=[ + (automated_clean_methods.AUTOGENERATED, _( + 'Collects steps from hardware interfaces and orders ' + 'by priority. This provides the original Ironic ' + 'cleaning behavior originally implemented in Kilo.')), + (automated_clean_methods.RUNBOOK, _( + 'Runs cleaning via a runbook specified in ' + 'configuration or node driver_info. If a runbook ' + 'is not specified while automated_clean is enabled, ' + 'cleaning will fail.')), + (automated_clean_methods.HYBRID, _( + 'Runs cleaning via a runbook if one is specified in ' + 'configuration or node driver_info. If a runbook ' + 'is not specified while automated_clean is enabled, ' + 'Ironic will fallback to \'autogenerated\' ' + 'cleaning steps.')) + ], + help=_('Determines how automated_cleaning is performed; the ' + 'default, \'autogenerated\' collects steps from ' + 'hardware interfaces, then ordering by priority; ' + '\'runbook\' requires a runbook to be specified in ' + 'config or driver_info, which is then used to clean the' + 'node; \'hybrid\' uses a runbook if available, ' + 'and falls-back to autogenerated cleaning steps if not.' + )), + cfg.StrOpt('automated_cleaning_runbook', + default=None, + mutable=True, + help=_('If set and [conductor]/automated_clean_step_source ' + 'is set to \'hybrid\' or \'runbook\', the runbook ' + 'UUID or name provided here will be used during ' + 'automated_cleaning for nodes which do not have a ' + 'resource_class-specific runbook or runbook set in ' + 'driver_info.')), + cfg.DictOpt('automated_cleaning_runbook_by_resource_class', + default={}, + mutable=True, + help=_('A dictionary of key-value pairs of node ' + 'resource_class and runbook UUID or name which will be ' + 'used to clean the node if ' + '[conductor]automated_clean_step_source is set to ' + '\'hybrid\' or \'runbook\' and a more specific runbook ' + 'has not been configured in driver_info.')), + cfg.BoolOpt('automated_cleaning_runbook_from_node', + default=False, + mutable=True, + help=_('When enabled, allows an administrator to configure ' + 'a runbook in ' + 'node[\'driver_info\'][\'cleaning_runbook\'] to ' + 'use for that node when ' + '[conductor]automated_clean_step_source is set to ' + '\'hybrid\' or \'runbook\'. ' + 'NOTE: This will permit any user with access to edit ' + 'node[\'driver_info\'] to circumvent cleaning.')), + cfg.BoolOpt('automated_cleaning_runbook_validate_traits', + default=True, + mutable=True, + help=_('When enabled, this option requires validation of a ' + 'runbook before it\'s used for automated cleaning. ' + 'Nodes configured with a runbook that is not validated ' + 'for use via trait matching will fail to clean.')), cfg.BoolOpt('allow_provisioning_in_maintenance', default=True, mutable=True, diff --git a/ironic/drivers/modules/agent_base.py b/ironic/drivers/modules/agent_base.py index 62e04f1596..e6c1c56667 100644 --- a/ironic/drivers/modules/agent_base.py +++ b/ironic/drivers/modules/agent_base.py @@ -570,8 +570,11 @@ class HeartbeatMixin(object): task.resume_cleaning() # First, cache the clean steps self.refresh_clean_steps(task) + use_existing_steps = node.driver_internal_info.get( + 'declarative_cleaning', False) # Then set/verify node clean steps and start cleaning - conductor_steps.set_node_cleaning_steps(task) + conductor_steps.set_node_cleaning_steps( + task, use_existing_steps=use_existing_steps) cleaning.continue_node_clean(task) else: msg = _('Node failed to check cleaning progress') diff --git a/ironic/tests/unit/conductor/mgr_utils.py b/ironic/tests/unit/conductor/mgr_utils.py index 40c98dff2f..1f1bc0cbe1 100644 --- a/ironic/tests/unit/conductor/mgr_utils.py +++ b/ironic/tests/unit/conductor/mgr_utils.py @@ -51,7 +51,8 @@ class CommonMixIn(object): if node is None: node = self._create_node(**node_attrs) task = mock.Mock(spec_set=['node', 'release_resources', - 'spawn_after', 'process_event', 'driver']) + 'spawn_after', 'process_event', + 'driver', 'context']) task.node = node return task diff --git a/ironic/tests/unit/conductor/test_cleaning.py b/ironic/tests/unit/conductor/test_cleaning.py index 6f17585706..45d9f058e6 100644 --- a/ironic/tests/unit/conductor/test_cleaning.py +++ b/ironic/tests/unit/conductor/test_cleaning.py @@ -439,7 +439,8 @@ class DoNodeCleanTestCase(db_base.DbTestCase): node.refresh() self.assertEqual(states.CLEANFAIL, node.provision_state) self.assertEqual(tgt_prov_state, node.target_provision_state) - mock_steps.assert_called_once_with(mock.ANY, disable_ramdisk=False) + mock_steps.assert_called_once_with(mock.ANY, disable_ramdisk=False, + use_existing_steps=mock.ANY) self.assertFalse(node.maintenance) self.assertIsNone(node.fault) @@ -470,7 +471,8 @@ class DoNodeCleanTestCase(db_base.DbTestCase): node.refresh() self.assertEqual(states.CLEANFAIL, node.provision_state) self.assertEqual(tgt_prov_state, node.target_provision_state) - mock_steps.assert_called_once_with(mock.ANY, disable_ramdisk=False) + mock_steps.assert_called_once_with(mock.ANY, disable_ramdisk=False, + use_existing_steps=mock.ANY) self.assertTrue(mock_power.called) def test__do_node_clean_automated_steps_fail(self): @@ -491,13 +493,13 @@ class DoNodeCleanTestCase(db_base.DbTestCase): autospec=True) def __do_node_clean(self, mock_power_valid, mock_network_valid, mock_next_step, mock_steps, clean_steps=None, - disable_ramdisk=False): - if clean_steps: + disable_ramdisk=False, automated_with_steps=False): + if clean_steps and not automated_with_steps: tgt_prov_state = states.MANAGEABLE else: tgt_prov_state = states.AVAILABLE - def set_steps(task, disable_ramdisk=None): + def set_steps(task, disable_ramdisk=None, use_existing_steps=None): dii = task.node.driver_internal_info dii['clean_steps'] = self.clean_steps task.node.driver_internal_info = dii @@ -516,7 +518,8 @@ class DoNodeCleanTestCase(db_base.DbTestCase): with task_manager.acquire( self.context, node.uuid, shared=False) as task: cleaning.do_node_clean(task, clean_steps=clean_steps, - disable_ramdisk=disable_ramdisk) + disable_ramdisk=disable_ramdisk, + automated_with_steps=automated_with_steps) node.refresh() @@ -529,7 +532,8 @@ class DoNodeCleanTestCase(db_base.DbTestCase): mock_next_step.assert_called_once_with( task, 0, disable_ramdisk=disable_ramdisk) mock_steps.assert_called_once_with( - task, disable_ramdisk=disable_ramdisk) + task, disable_ramdisk=disable_ramdisk, + use_existing_steps=mock.ANY) if clean_steps: self.assertEqual(clean_steps, node.driver_internal_info['clean_steps']) @@ -1204,6 +1208,12 @@ class DoNodeCleanTestCase(db_base.DbTestCase): def test_continue_node_clean_no_skip_step(self): self._continue_node_clean(skip=False) + def test__do_node_clean_automated_with_steps(self): + """Test automated cleaning with steps from runbook.""" + self.__do_node_clean( + clean_steps=self.clean_steps, + automated_with_steps=True) + class DoNodeCleanAbortTestCase(db_base.DbTestCase): @mock.patch.object(fake.FakeDeploy, 'tear_down_cleaning', autospec=True) diff --git a/ironic/tests/unit/conductor/test_manager.py b/ironic/tests/unit/conductor/test_manager.py index c39b34028e..6d470293eb 100644 --- a/ironic/tests/unit/conductor/test_manager.py +++ b/ironic/tests/unit/conductor/test_manager.py @@ -2551,7 +2551,7 @@ class DoNodeTearDownTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase): self.assertNotIn('automatic_lessee', node.driver_internal_info) self.assertNotIn('is_source_a_path', node.driver_internal_info) mock_tear_down.assert_called_once_with(task.driver.deploy, task) - mock_clean.assert_called_once_with(task) + mock_clean.assert_called_once_with(task, mock.ANY, mock.ANY, mock.ANY) self.assertEqual({}, port.internal_info) mock_unbind.assert_called_once_with('foo', context=mock.ANY) if enabled_console: @@ -2600,7 +2600,8 @@ class DoNodeTearDownTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase): self.assertIsNone(node.last_error) self.assertEqual({}, node.instance_info) mock_tear_down.assert_called_once_with(mock.ANY, mock.ANY) - mock_clean.assert_called_once_with(mock.ANY) + mock_clean.assert_called_once_with(mock.ANY, mock.ANY, mock.ANY, + mock.ANY) if is_rescue_state: mock_rescue_clean.assert_called_once_with(mock.ANY, mock.ANY) else: @@ -2735,7 +2736,8 @@ class DoProvisioningActionTestCase(mgr_utils.ServiceSetUpMixin, self.assertEqual(states.AVAILABLE, node.target_provision_state) self.assertIsNone(node.last_error) mock_spawn.assert_called_with(self.service, - cleaning.do_node_clean, mock.ANY) + cleaning.do_node_clean, + mock.ANY, mock.ANY, mock.ANY, mock.ANY) @mock.patch('ironic.conductor.manager.ConductorManager._spawn_worker', autospec=True) @@ -2974,7 +2976,8 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase): mock_power_valid.assert_called_once_with(mock.ANY, mock.ANY) mock_network_valid.assert_called_once_with(mock.ANY, mock.ANY) mock_spawn.assert_called_with( - self.service, cleaning.do_node_clean, mock.ANY, clean_steps, False) + self.service, cleaning.do_node_clean, mock.ANY, clean_steps, + False, False) node.refresh() # Node will be moved to CLEANING self.assertEqual(states.CLEANING, node.provision_state) @@ -3007,7 +3010,8 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase): mock_power_valid.assert_called_once_with(mock.ANY, mock.ANY) mock_network_valid.assert_called_once_with(mock.ANY, mock.ANY) mock_spawn.assert_called_with( - self.service, cleaning.do_node_clean, mock.ANY, clean_steps, False) + self.service, cleaning.do_node_clean, mock.ANY, clean_steps, + False, False) node.refresh() # Make sure states were rolled back self.assertEqual(prv_state, node.provision_state) @@ -6732,6 +6736,194 @@ class ParallelPowerSyncTestCase(mgr_utils.CommonMixIn, db_base.DbTestCase): queue_mock.return_value.put.assert_has_calls(expected_calls) +@mgr_utils.mock_record_keepalive +@mock.patch.object(task_manager, 'acquire', autospec=True) +class GetStepsForAutomatedCleaningTestCase(mgr_utils.ServiceSetUpMixin, + mgr_utils.CommonMixIn, + db_base.DbTestCase): + def _set_node_trait(self, name): + self.node.traits = objects.TraitList.create(context=self.context, + node_id=self.node.id, + traits=[name]) + + def setUp(self): + super(GetStepsForAutomatedCleaningTestCase, self).setUp() + self._start_service() + self.node = obj_utils.create_test_node( + self.context, + driver='fake-hardware', + uuid=uuidutils.generate_uuid()) + self.node.create() + self.task = self._create_task(node=self.node) + self.rb_name = 'CUSTOM_BM_RB' + self._set_node_trait(self.rb_name) + + @mock.patch.object(manager.ConductorManager, 'get_runbook', autospec=True) + def test_gsfac_runbook_by_resource_class(self, mock_get_runbook, mt): + CONF.set_override('automated_cleaning_step_source', 'runbook', + group='conductor') + CONF.set_override('automated_cleaning_runbook_by_resource_class', + {'baremetal': self.rb_name}, + group='conductor') + self.node.resource_class = 'baremetal' + rb = mock.Mock(spec_set=objects.Runbook) + rb.name = self.rb_name + exp_steps = [{'interface': 'deploy', 'step': 'test_step', 'args': {}}] + rb.steps = exp_steps + rb.disable_ramdisk = True + mock_get_runbook.return_value = rb + + steps, dr = self.service._get_steps_for_automated_cleaning(self.task) + + mock_get_runbook.assert_called_once_with(mock.ANY, mock.ANY, + self.rb_name) + self.assertEqual(exp_steps, steps) + self.assertTrue(dr) + + @mock.patch.object(manager.ConductorManager, 'get_runbook', autospec=True) + def test_gsfac_fallback_to_default_runbook(self, mock_get_runbook, mt): + rb_to_use = 'CUSTOM_DEFAULT' + CONF.set_override('automated_cleaning_step_source', 'runbook', + group='conductor') + CONF.set_override('automated_cleaning_runbook', rb_to_use, + group='conductor') + CONF.set_override('automated_cleaning_runbook_by_resource_class', + {'other': 'CUSTOM_OTHER'}, group='conductor') + self.node.resource_class = 'baremetal' + self._set_node_trait(rb_to_use) + rb = mock.Mock(spec_set=objects.Runbook) + rb.name = rb_to_use + exp_steps = [{'interface': 'deploy', 'step': 'test_step', 'args': {}}] + rb.steps = exp_steps + rb.disable_ramdisk = True + mock_get_runbook.return_value = rb + steps, dr = self.service._get_steps_for_automated_cleaning(self.task) + + mock_get_runbook.assert_called_once_with(mock.ANY, mock.ANY, rb_to_use) + self.assertEqual(exp_steps, steps) + self.assertTrue(dr) + + @mock.patch.object(manager.ConductorManager, 'get_runbook', autospec=True) + def test_gsfac_autogenerated(self, mock_get_runbook, mt): + CONF.set_override('automated_cleaning_step_source', 'autogenerated', + group='conductor') + CONF.set_override('automated_cleaning_runbook_by_resource_class', + {'baremetal': self.rb_name}, + group='conductor') + self.node.resource_class = 'baremetal' + + steps, dr = self.service._get_steps_for_automated_cleaning(self.task) + + self.assertIsNone(steps) + self.assertFalse(dr) + mock_get_runbook.assert_not_called() + + @mock.patch.object(manager.ConductorManager, 'get_runbook', autospec=True) + def test_gsfac_runbook_from_node(self, mock_get_runbook, mt): + rb_to_use = 'CUSTOM_NODE_RB' + CONF.set_override('automated_cleaning_step_source', 'runbook', + group='conductor') + CONF.set_override('automated_cleaning_runbook_from_node', True, + group='conductor') + self.node.driver_info = {'cleaning_runbook': rb_to_use} + self._set_node_trait(rb_to_use) + rb = mock.Mock(spec_set=objects.Runbook) + rb.name = rb_to_use + exp_steps = [{'interface': 'deploy', 'step': 'test_step', 'args': {}}] + rb.steps = exp_steps + rb.disable_ramdisk = True + mock_get_runbook.return_value = rb + + steps, dr = self.service._get_steps_for_automated_cleaning(self.task) + + mock_get_runbook.assert_called_once_with(mock.ANY, mock.ANY, + rb_to_use) + self.assertEqual(steps, exp_steps) + self.assertTrue(dr) + + @mock.patch.object(manager.ConductorManager, 'get_runbook', autospec=True) + def test_gsfac_hybrid_no_runbook(self, mock_get_runbook, mt): + CONF.set_override('automated_cleaning_step_source', 'hybrid', + group='conductor') + CONF.set_override('automated_cleaning_runbook', None, + group='conductor') + CONF.set_override('automated_cleaning_runbook_by_resource_class', + {}, group='conductor') + + steps, dr = self.service._get_steps_for_automated_cleaning(self.task) + + self.assertFalse(mock_get_runbook.called) + self.assertIsNone(steps) + self.assertFalse(dr) + + @mock.patch.object(manager.ConductorManager, 'get_runbook', autospec=True) + def test_gsfac_no_runbook_required(self, mock_get_runbook, mt): + CONF.set_override('automated_cleaning_step_source', 'runbook', + group='conductor') + CONF.set_override('automated_cleaning_runbook', None, + group='conductor') + CONF.set_override('automated_cleaning_runbook_by_resource_class', + {}, group='conductor') + + self.assertRaises(exception.NodeCleaningFailure, + self.service._get_steps_for_automated_cleaning, + self.task) + self.assertFalse(mock_get_runbook.called) + + @mock.patch.object(manager.ConductorManager, 'get_runbook', autospec=True) + def test_gsfac_runbook_incompatible_with_node(self, mock_get_runbook, mt): + bad_rb_name = "CUSTOM_INCOMPATIBLE_RB" + CONF.set_override('automated_cleaning_step_source', 'runbook', + group='conductor') + CONF.set_override('automated_cleaning_runbook_by_resource_class', + {'baremetal': bad_rb_name}, + group='conductor') + # NOTE(JayF): We specifically are not adding bad_rb_name to traits, + # which means this should be considered incompatible + self.node.resource_class = 'baremetal' + rb = mock.Mock(spec_set=objects.Runbook) + rb.name = bad_rb_name +# exp_steps = [{'interface': 'deploy', 'step': 'test_step', 'args': {}}] +# rb.steps = exp_steps +# rb.disable_ramdisk = True + mock_get_runbook.return_value = rb + + self.assertRaises(exception.NodeCleaningFailure, + self.service._get_steps_for_automated_cleaning, + self.task) + # NOTE(JayF): get_runbook is called in this case because we need to + # fetch the runbook to know it's incompatible + self.assertTrue(mock_get_runbook.called) + + @mock.patch.object(manager.ConductorManager, 'get_runbook', autospec=True) + def test_gsfac_runbook_incompatible_with_node_ignored( + self, mock_get_runbook, mt): + bad_rb_name = "CUSTOM_INCOMPATIBLE_RB" + CONF.set_override('automated_cleaning_runbook_validate_traits', False, + group='conductor') + CONF.set_override('automated_cleaning_step_source', 'runbook', + group='conductor') + CONF.set_override('automated_cleaning_runbook_by_resource_class', + {'baremetal': bad_rb_name}, + group='conductor') + # NOTE(JayF): We specifically are not adding bad_rb_name to traits, + # which means this should be considered incompatible + self.node.resource_class = 'baremetal' + rb = mock.Mock(spec_set=objects.Runbook) + rb.name = bad_rb_name + exp_steps = [{'interface': 'deploy', 'step': 'test_step', 'args': {}}] + rb.steps = exp_steps + rb.disable_ramdisk = True + mock_get_runbook.return_value = rb + + steps, dr = self.service._get_steps_for_automated_cleaning(self.task) + + mock_get_runbook.assert_called_once_with(mock.ANY, mock.ANY, + bad_rb_name) + self.assertEqual(steps, exp_steps) + self.assertTrue(dr) + + @mock.patch.object(task_manager, 'acquire', autospec=True) @mock.patch.object(manager.ConductorManager, '_mapped_to_this_conductor', autospec=True) diff --git a/ironic/tests/unit/conductor/test_steps.py b/ironic/tests/unit/conductor/test_steps.py index 8782c4a62c..19a1f3d09d 100644 --- a/ironic/tests/unit/conductor/test_steps.py +++ b/ironic/tests/unit/conductor/test_steps.py @@ -843,7 +843,8 @@ class NodeCleaningStepsTestCase(db_base.DbTestCase): with task_manager.acquire( self.context, node.uuid, shared=False) as task: - conductor_steps.set_node_cleaning_steps(task) + conductor_steps.set_node_cleaning_steps(task, + use_existing_steps=False) node.refresh() self.assertEqual(self.clean_steps, node.driver_internal_info['clean_steps']) @@ -870,7 +871,8 @@ class NodeCleaningStepsTestCase(db_base.DbTestCase): with task_manager.acquire( self.context, node.uuid, shared=False) as task: - conductor_steps.set_node_cleaning_steps(task) + conductor_steps.set_node_cleaning_steps(task, + use_existing_steps=True) node.refresh() self.assertEqual(clean_steps, node.driver_internal_info['clean_steps']) diff --git a/ironic/tests/unit/drivers/modules/test_agent_base.py b/ironic/tests/unit/drivers/modules/test_agent_base.py index 010d24f40f..825da275b5 100644 --- a/ironic/tests/unit/drivers/modules/test_agent_base.py +++ b/ironic/tests/unit/drivers/modules/test_agent_base.py @@ -334,7 +334,32 @@ class HeartbeatMixinTest(AgentDeployMixinBaseTest): mock_touch.assert_called_once_with(mock.ANY) mock_refresh.assert_called_once_with(mock.ANY, task, 'clean') mock_clean.assert_called_once_with(task) - mock_set_steps.assert_called_once_with(task) + mock_set_steps.assert_called_once_with(task, use_existing_steps=False) + + @mock.patch.object(objects.node.Node, 'touch_provisioning', autospec=True) + @mock.patch.object(agent_base.HeartbeatMixin, + 'refresh_steps', autospec=True) + @mock.patch.object(conductor_steps, 'set_node_cleaning_steps', + autospec=True) + @mock.patch.object(cleaning, 'continue_node_clean', autospec=True) + def test_heartbeat_resume_clean_declarative(self, mock_clean, + mock_set_steps, + mock_refresh, mock_touch): + # Test declarative cleaning case where use_existing_steps=True + self.node.clean_step = {} + self.node.provision_state = states.CLEANWAIT + info = self.node.driver_internal_info + info['declarative_cleaning'] = True + self.node.driver_internal_info = info + self.node.save() + with task_manager.acquire( + self.context, self.node.uuid, shared=False) as task: + self.deploy.heartbeat(task, 'http://127.0.0.1:8080', '1.0.0') + + mock_touch.assert_called_once_with(mock.ANY) + mock_refresh.assert_called_once_with(mock.ANY, task, 'clean') + mock_clean.assert_called_once_with(task) + mock_set_steps.assert_called_once_with(task, use_existing_steps=True) @mock.patch.object(manager_utils, 'cleaning_error_handler', autospec=True) @mock.patch.object(objects.node.Node, 'touch_provisioning', autospec=True) @@ -1217,6 +1242,42 @@ class ContinueCleaningTest(AgentDeployMixinBaseTest): self.deploy.continue_cleaning(task) error_mock.assert_called_once_with(task, mock.ANY, traceback=False) + @mock.patch.object(cleaning, 'continue_node_clean', autospec=True) + @mock.patch.object(agent_client.AgentClient, 'get_commands_status', + autospec=True) + def test_continue_cleaning_declarative_flag_preserved(self, status_mock, + clean_mock): + # Test that declarative_cleaning flag is preserved during cleaning + self.node.clean_step = { + 'priority': 10, + 'interface': 'deploy', + 'step': 'erase_devices', + 'reboot_requested': False + } + info = self.node.driver_internal_info + info['declarative_cleaning'] = True + self.node.driver_internal_info = info + self.node.save() + status_mock.return_value = [{ + 'command_status': 'SUCCEEDED', + 'command_name': 'execute_clean_step', + 'command_result': { + 'clean_step': self.node.clean_step + } + }] + with task_manager.acquire(self.context, self.node['uuid'], + shared=False) as task: + self.deploy.continue_cleaning(task) + clean_mock.assert_called_once_with(task) + self.assertEqual(states.CLEANING, task.node.provision_state) + self.assertEqual(states.AVAILABLE, + task.node.target_provision_state) + # Verify the flag is still present during cleaning + self.assertIn('declarative_cleaning', + task.node.driver_internal_info) + self.assertTrue( + task.node.driver_internal_info['declarative_cleaning']) + class ContinueServiceTest(AgentDeployMixinBaseTest): diff --git a/releasenotes/notes/automated-cleaning-by-runbook-b5b4b1f0f51040b0.yaml b/releasenotes/notes/automated-cleaning-by-runbook-b5b4b1f0f51040b0.yaml new file mode 100644 index 0000000000..79bafe8ac7 --- /dev/null +++ b/releasenotes/notes/automated-cleaning-by-runbook-b5b4b1f0f51040b0.yaml @@ -0,0 +1,43 @@ +features: + - | + Ironic operators can now assign specific steps to run during automated + cleaning instead of relying on the driver-based implicit steps typically + used for automated cleaning. + + To generally opt-in to this behavior, first set + ``[conductor]/automated_cleaning_step_source`` to either 'hybrid' or + 'runbook'. A setting of 'autogenerated' (default), preserves existing + behavior. Setting 'runbook' causes Ironic to attempt to find and use a + configured runbook for automated cleaning, and fails cleaning if one is + not found. The 'hybrid' setting causes Ironic to attempt to find and use a + configured runbook for automated cleaning. If one is not found, we fallback + to the default autogenerated cleaning. + + Configuration for cleaning runbooks is tiered to allow maximum flexibility; + Ironic will try to find a cleaning runbook in the following places. Values + representing the runbook name or the runbook UUID are accepted; and the + most specific value found is used. First, ``node.driver_info['cleaning_runbook']`` + is a node specific override, disabled by default. To enable it, set + ``[conductor]automated_cleaning_runbook_from_node`` to True. Next, + ``[conductor]automated_cleaning_runbook_by_resource_class`` allows you to + map resource classes to the expected runbook for them to use in + automated cleaning. The global default, and final fallback, is + ``[conductor]automated_cleaning_runbook``. + + As with normal runbook usage, the runbook name must match an active trait + in node.traits. This behavior can be changed by setting + ``[conductor]automated_cleaning_runbook_validate_traits`` to False. A cleaning + attempt that resolves to an incompatible runbook will cause an error + and leave the node uncleaned in a clean fail state. + +security: + - | + This change permits declarative cleaning via use of runbooks. Please note + there is no validation that a runbook performs typical, expected cleaning + actions such as a disk wipe. Operators should be careful to ensure they + are using sufficient steps to securely wipe the system. + + Additionally, operators of multitenant Ironic clusters should be careful + before setting ``[conductor]automated_cleaning_runbook_from_node`` to True, + as it may permit a node owner to render cleaning ineffective. This is not + a concern in deployments utilizing node.lessee via automated_lessee.