diff --git a/ironic/conductor/base_manager.py b/ironic/conductor/base_manager.py index a75a5161f0..2aaaab9a5e 100644 --- a/ironic/conductor/base_manager.py +++ b/ironic/conductor/base_manager.py @@ -22,6 +22,7 @@ from futurist import rejection from oslo_db import exception as db_exception from oslo_log import log from oslo_utils import excutils +import six from ironic.common import context as ironic_context from ironic.common import driver_factory @@ -440,7 +441,8 @@ class BaseConductorManager(object): this would process nodes whose provision_updated_at field value was 60 or more seconds before 'now'. :param: provision_state: provision_state that the node is in, - for the provisioning activity to have failed. + for the provisioning activity to have failed, + either one string or a set. :param: sort_key: the nodes are sorted based on this key. :param: callback_method: the callback method to be invoked in a spawned thread, for a failed node. This @@ -457,6 +459,9 @@ class BaseConductorManager(object): fsm. """ + if isinstance(provision_state, six.string_types): + provision_state = {provision_state} + node_iter = self.iter_nodes(filters=filters, sort_key=sort_key, sort_dir='asc') @@ -467,7 +472,7 @@ class BaseConductorManager(object): with task_manager.acquire(context, node_uuid, purpose='node state check') as task: if (task.node.maintenance or - task.node.provision_state != provision_state): + task.node.provision_state not in provision_state): continue target_state = (None if not keep_target_state else diff --git a/ironic/conductor/manager.py b/ironic/conductor/manager.py index 6565513f7e..15c61242f9 100644 --- a/ironic/conductor/manager.py +++ b/ironic/conductor/manager.py @@ -1561,15 +1561,24 @@ class ConductorManager(base_manager.BaseConductorManager): self._fail_if_in_state(context, filters, states.DEPLOYWAIT, sort_key, callback_method, err_handler) - @METRICS.timer('ConductorManager._check_deploying_status') + @METRICS.timer('ConductorManager._check_orphan_nodes') @periodics.periodic(spacing=CONF.conductor.check_provision_state_interval) - def _check_deploying_status(self, context): - """Periodically checks the status of nodes in DEPLOYING state. + def _check_orphan_nodes(self, context): + """Periodically checks the status of nodes that were taken over. - Periodically checks the nodes in DEPLOYING and the state of the - conductor deploying them. If we find out that a conductor that - was provisioning the node has died we then break release the - node and gracefully mark the deployment as failed. + Periodically checks the nodes that are managed by this conductor but + have a reservation from a conductor that went offline. + + 1. Nodes in DEPLOYING state move to DEPLOY FAIL. + + 2. Nodes in CLEANING state move to CLEAN FAIL with maintenance set. + + 3. Nodes in a transient power state get the power operation aborted. + + 4. Reservation is removed. + + The latter operation happens even for nodes in maintenance mode, + otherwise it's not possible to move them out of maintenance. :param context: request context. """ @@ -1578,12 +1587,14 @@ class ConductorManager(base_manager.BaseConductorManager): return node_iter = self.iter_nodes( - fields=['id', 'reservation'], - filters={'provision_state': states.DEPLOYING, - 'maintenance': False, - 'reserved_by_any_of': offline_conductors}) + fields=['id', 'reservation', 'maintenance', 'provision_state', + 'target_power_state'], + filters={'reserved_by_any_of': offline_conductors}) - for node_uuid, driver, node_id, conductor_hostname in node_iter: + state_cleanup_required = [] + + for (node_uuid, driver, node_id, conductor_hostname, + maintenance, provision_state, target_power_state) in node_iter: # NOTE(lucasagomes): Although very rare, this may lead to a # race condition. By the time we release the lock the conductor # that was previously managing the node could be back online. @@ -1604,11 +1615,43 @@ class ConductorManager(base_manager.BaseConductorManager): LOG.warning("During checking for deploying state, when " "releasing the lock of the node %s, it was " "already unlocked.", node_uuid) + else: + LOG.warning('Forcibly removed reservation of conductor %(old)s' + ' on node %(node)s as that conductor went offline', + {'old': conductor_hostname, 'node': node_uuid}) + + # TODO(dtantsur): clean up all states that are not stable and + # are not one of WAIT states. + if not maintenance and (provision_state in (states.DEPLOYING, + states.CLEANING) or + target_power_state is not None): + LOG.debug('Node %(node)s taken over from conductor %(old)s ' + 'requires state clean up: provision state is ' + '%(state)s, target power state is %(pstate)s', + {'node': node_uuid, 'old': conductor_hostname, + 'state': provision_state, + 'pstate': target_power_state}) + state_cleanup_required.append(node_uuid) + + for node_uuid in state_cleanup_required: + with task_manager.acquire(context, node_uuid, + purpose='power state clean up') as task: + if not task.node.maintenance and task.node.target_power_state: + old_state = task.node.target_power_state + task.node.target_power_state = None + task.node.last_error = _('Pending power operation was ' + 'aborted due to conductor take ' + 'over') + task.node.save() + LOG.warning('Aborted pending power operation %(op)s ' + 'on node %(node)s due to conductor take over', + {'op': old_state, 'node': node_uuid}) self._fail_if_in_state( - context, {'id': node_id}, states.DEPLOYING, + context, {'uuid': node_uuid}, + {states.DEPLOYING, states.CLEANING}, 'provision_updated_at', - callback_method=utils.cleanup_after_timeout, + callback_method=utils.abort_on_conductor_take_over, err_handler=utils.provisioning_error_handler) @METRICS.timer('ConductorManager._do_adoption') diff --git a/ironic/conductor/utils.py b/ironic/conductor/utils.py index 057d4dcd5c..a23390d5f9 100644 --- a/ironic/conductor/utils.py +++ b/ironic/conductor/utils.py @@ -377,6 +377,27 @@ def cleaning_error_handler(task, msg, tear_down_cleaning=True, task.process_event('fail', target_state=target_state) +@task_manager.require_exclusive_lock +def abort_on_conductor_take_over(task): + """Set node's state when a task was aborted due to conductor take over. + + :param task: a TaskManager instance. + """ + msg = _('Operation was aborted due to conductor take over') + # By this time the "fail" even was processed, so we cannot end up in + # CLEANING or CLEAN WAIT, only in CLEAN FAIL. + if task.node.provision_state == states.CLEANFAIL: + cleaning_error_handler(task, msg, set_fail_state=False) + else: + # For aborted deployment (and potentially other operations), just set + # the last_error accordingly. + task.node.last_error = msg + task.node.save() + + LOG.warning('Aborted the current operation on node %s due to ' + 'conductor take over', task.node.uuid) + + def rescuing_error_handler(task, msg, set_fail_state=True): """Cleanup rescue task after timeout or failure. diff --git a/ironic/tests/unit/conductor/test_manager.py b/ironic/tests/unit/conductor/test_manager.py index bfb0ba84c4..46ffa8cc77 100644 --- a/ironic/tests/unit/conductor/test_manager.py +++ b/ironic/tests/unit/conductor/test_manager.py @@ -6278,16 +6278,17 @@ class DestroyPortgroupTestCase(mgr_utils.ServiceSetUpMixin, @mock.patch.object(manager.ConductorManager, '_fail_if_in_state') @mock.patch.object(manager.ConductorManager, '_mapped_to_this_conductor') @mock.patch.object(dbapi.IMPL, 'get_offline_conductors') -class ManagerCheckDeployingStatusTestCase(mgr_utils.ServiceSetUpMixin, - db_base.DbTestCase): +class ManagerCheckOrphanNodesTestCase(mgr_utils.ServiceSetUpMixin, + db_base.DbTestCase): def setUp(self): - super(ManagerCheckDeployingStatusTestCase, self).setUp() + super(ManagerCheckOrphanNodesTestCase, self).setUp() self._start_service() self.node = obj_utils.create_test_node( self.context, id=1, uuid=uuidutils.generate_uuid(), driver='fake', provision_state=states.DEPLOYING, - target_provision_state=states.DEPLOYDONE, + target_provision_state=states.ACTIVE, + target_power_state=states.POWER_ON, reservation='fake-conductor') # create a second node in a different state to test the @@ -6297,28 +6298,53 @@ class ManagerCheckDeployingStatusTestCase(mgr_utils.ServiceSetUpMixin, driver='fake', provision_state=states.AVAILABLE, target_provision_state=states.NOSTATE) - def test__check_deploying_status(self, mock_off_cond, mock_mapped, - mock_fail_if): + def test__check_orphan_nodes(self, mock_off_cond, mock_mapped, + mock_fail_if): mock_off_cond.return_value = ['fake-conductor'] - self.service._check_deploying_status(self.context) + self.service._check_orphan_nodes(self.context) self.node.refresh() mock_off_cond.assert_called_once_with() mock_mapped.assert_called_once_with(self.node.uuid, 'fake') mock_fail_if.assert_called_once_with( - mock.ANY, {'id': self.node.id}, states.DEPLOYING, + mock.ANY, {'uuid': self.node.uuid}, + {states.DEPLOYING, states.CLEANING}, 'provision_updated_at', - callback_method=conductor_utils.cleanup_after_timeout, + callback_method=conductor_utils.abort_on_conductor_take_over, err_handler=conductor_utils.provisioning_error_handler) # assert node was released self.assertIsNone(self.node.reservation) + self.assertIsNone(self.node.target_power_state) + self.assertIsNotNone(self.node.last_error) - def test__check_deploying_status_alive(self, mock_off_cond, - mock_mapped, mock_fail_if): + def test__check_orphan_nodes_cleaning(self, mock_off_cond, mock_mapped, + mock_fail_if): + self.node.provision_state = states.CLEANING + self.node.save() + mock_off_cond.return_value = ['fake-conductor'] + + self.service._check_orphan_nodes(self.context) + + self.node.refresh() + mock_off_cond.assert_called_once_with() + mock_mapped.assert_called_once_with(self.node.uuid, 'fake') + mock_fail_if.assert_called_once_with( + mock.ANY, {'uuid': self.node.uuid}, + {states.DEPLOYING, states.CLEANING}, + 'provision_updated_at', + callback_method=conductor_utils.abort_on_conductor_take_over, + err_handler=conductor_utils.provisioning_error_handler) + # assert node was released + self.assertIsNone(self.node.reservation) + self.assertIsNone(self.node.target_power_state) + self.assertIsNotNone(self.node.last_error) + + def test__check_orphan_nodes_alive(self, mock_off_cond, + mock_mapped, mock_fail_if): mock_off_cond.return_value = [] - self.service._check_deploying_status(self.context) + self.service._check_orphan_nodes(self.context) self.node.refresh() mock_off_cond.assert_called_once_with() @@ -6328,7 +6354,7 @@ class ManagerCheckDeployingStatusTestCase(mgr_utils.ServiceSetUpMixin, self.assertIsNotNone(self.node.reservation) @mock.patch.object(objects.Node, 'release') - def test__check_deploying_status_release_exceptions_skipping( + def test__check_orphan_nodes_release_exceptions_skipping( self, mock_release, mock_off_cond, mock_mapped, mock_fail_if): mock_off_cond.return_value = ['fake-conductor'] # Add another node so we can check both exceptions @@ -6341,7 +6367,7 @@ class ManagerCheckDeployingStatusTestCase(mgr_utils.ServiceSetUpMixin, mock_mapped.return_value = True mock_release.side_effect = [exception.NodeNotFound('not found'), exception.NodeLocked('locked')] - self.service._check_deploying_status(self.context) + self.service._check_orphan_nodes(self.context) self.node.refresh() mock_off_cond.assert_called_once_with() @@ -6351,22 +6377,52 @@ class ManagerCheckDeployingStatusTestCase(mgr_utils.ServiceSetUpMixin, # Assert we skipped and didn't try to call _fail_if_in_state self.assertFalse(mock_fail_if.called) - @mock.patch.object(objects.Node, 'release') - def test__check_deploying_status_release_node_not_locked( - self, mock_release, mock_off_cond, mock_mapped, mock_fail_if): + def test__check_orphan_nodes_release_node_not_locked( + self, mock_off_cond, mock_mapped, mock_fail_if): + # this simulates releasing the node elsewhere + count = [0] + + def _fake_release(*args, **kwargs): + self.node.reservation = None + self.node.save() + # raise an exception only the first time release is called + count[0] += 1 + if count[0] == 1: + raise exception.NodeNotLocked('not locked') + mock_off_cond.return_value = ['fake-conductor'] mock_mapped.return_value = True - mock_release.side_effect = exception.NodeNotLocked('not locked') - self.service._check_deploying_status(self.context) + with mock.patch.object(objects.Node, 'release', + side_effect=_fake_release) as mock_release: + self.service._check_orphan_nodes(self.context) + mock_release.assert_called_with(self.context, mock.ANY, + self.node.id) + + mock_off_cond.assert_called_once_with() + mock_mapped.assert_called_once_with(self.node.uuid, 'fake') + mock_fail_if.assert_called_once_with( + mock.ANY, {'uuid': self.node.uuid}, + {states.DEPLOYING, states.CLEANING}, + 'provision_updated_at', + callback_method=conductor_utils.abort_on_conductor_take_over, + err_handler=conductor_utils.provisioning_error_handler) + + def test__check_orphan_nodes_maintenance(self, mock_off_cond, mock_mapped, + mock_fail_if): + self.node.maintenance = True + self.node.save() + mock_off_cond.return_value = ['fake-conductor'] + + self.service._check_orphan_nodes(self.context) self.node.refresh() mock_off_cond.assert_called_once_with() mock_mapped.assert_called_once_with(self.node.uuid, 'fake') - mock_fail_if.assert_called_once_with( - mock.ANY, {'id': self.node.id}, states.DEPLOYING, - 'provision_updated_at', - callback_method=conductor_utils.cleanup_after_timeout, - err_handler=conductor_utils.provisioning_error_handler) + # assert node was released + self.assertIsNone(self.node.reservation) + # not changing states in maintenance + self.assertFalse(mock_fail_if.called) + self.assertIsNotNone(self.node.target_power_state) class TestIndirectionApiConductor(db_base.DbTestCase): diff --git a/ironic/tests/unit/conductor/test_utils.py b/ironic/tests/unit/conductor/test_utils.py index f1d96618c6..e5db5a44af 100644 --- a/ironic/tests/unit/conductor/test_utils.py +++ b/ironic/tests/unit/conductor/test_utils.py @@ -1234,6 +1234,25 @@ class ErrorHandlersTestCase(tests_base.TestCase): conductor_utils.cleaning_error_handler(self.task, 'foo') self.assertTrue(log_mock.exception.called) + def test_abort_on_conductor_take_over_cleaning(self): + self.node.maintenance = False + self.node.provision_state = states.CLEANFAIL + conductor_utils.abort_on_conductor_take_over(self.task) + self.assertTrue(self.node.maintenance) + self.assertIn('take over', self.node.maintenance_reason) + self.assertIn('take over', self.node.last_error) + self.task.driver.deploy.tear_down_cleaning.assert_called_once_with( + self.task) + self.node.save.assert_called_once_with() + + def test_abort_on_conductor_take_over_deploying(self): + self.node.maintenance = False + self.node.provision_state = states.DEPLOYFAIL + conductor_utils.abort_on_conductor_take_over(self.task) + self.assertFalse(self.node.maintenance) + self.assertIn('take over', self.node.last_error) + self.node.save.assert_called_once_with() + @mock.patch.object(conductor_utils, 'LOG') def test_spawn_cleaning_error_handler_no_worker(self, log_mock): exc = exception.NoFreeConductorWorker() diff --git a/releasenotes/notes/orphan-nodes-389cb6d90c2917ec.yaml b/releasenotes/notes/orphan-nodes-389cb6d90c2917ec.yaml new file mode 100644 index 0000000000..c5c3f1e116 --- /dev/null +++ b/releasenotes/notes/orphan-nodes-389cb6d90c2917ec.yaml @@ -0,0 +1,9 @@ +--- +fixes: + - | + On node take over, any locks that are left from the old conductor are + cleared by the new one. Previously it only happened for nodes in + ``DEPLOYING`` state. + - | + On taking over nodes in ``CLEANING`` state, the new conductor moves them + to ``CLEAN FAIL`` and set maintenance.