Rework logic handling reserved orphaned nodes in the conductor

If a conductor dies while holding a reservation, the node can get
stuck in its current state. Currently the conductor that takes
over the node only cleans it up if it's in the DEPLOYING state.

This change applies the same logic for all nodes:

1. Reservation is cleared by the conductor that took over the node
   no matter what provision state.

2. CLEANING is also aborted, nodes are moved to CLEAN FAIL with
   maintenance on.

3. Target power state is cleared as well.

The reservation is cleared even for nodes in maintenance mode,
otherwise it's impossible to move them out of maintenance.

Change-Id: I379c1335692046ca9423fda5ea68d2f10c065cb5
Closes-Bug: #1588901
This commit is contained in:
Dmitry Tantsur 2018-02-20 19:47:52 +01:00
parent f00503c1f2
commit 5694b98fc8
6 changed files with 193 additions and 40 deletions

View File

@ -22,6 +22,7 @@ from futurist import rejection
from oslo_db import exception as db_exception
from oslo_log import log
from oslo_utils import excutils
import six
from ironic.common import context as ironic_context
from ironic.common import driver_factory
@ -440,7 +441,8 @@ class BaseConductorManager(object):
this would process nodes whose provision_updated_at
field value was 60 or more seconds before 'now'.
:param: provision_state: provision_state that the node is in,
for the provisioning activity to have failed.
for the provisioning activity to have failed,
either one string or a set.
:param: sort_key: the nodes are sorted based on this key.
:param: callback_method: the callback method to be invoked in a
spawned thread, for a failed node. This
@ -457,6 +459,9 @@ class BaseConductorManager(object):
fsm.
"""
if isinstance(provision_state, six.string_types):
provision_state = {provision_state}
node_iter = self.iter_nodes(filters=filters,
sort_key=sort_key,
sort_dir='asc')
@ -467,7 +472,7 @@ class BaseConductorManager(object):
with task_manager.acquire(context, node_uuid,
purpose='node state check') as task:
if (task.node.maintenance or
task.node.provision_state != provision_state):
task.node.provision_state not in provision_state):
continue
target_state = (None if not keep_target_state else

View File

@ -1561,15 +1561,24 @@ class ConductorManager(base_manager.BaseConductorManager):
self._fail_if_in_state(context, filters, states.DEPLOYWAIT,
sort_key, callback_method, err_handler)
@METRICS.timer('ConductorManager._check_deploying_status')
@METRICS.timer('ConductorManager._check_orphan_nodes')
@periodics.periodic(spacing=CONF.conductor.check_provision_state_interval)
def _check_deploying_status(self, context):
"""Periodically checks the status of nodes in DEPLOYING state.
def _check_orphan_nodes(self, context):
"""Periodically checks the status of nodes that were taken over.
Periodically checks the nodes in DEPLOYING and the state of the
conductor deploying them. If we find out that a conductor that
was provisioning the node has died we then break release the
node and gracefully mark the deployment as failed.
Periodically checks the nodes that are managed by this conductor but
have a reservation from a conductor that went offline.
1. Nodes in DEPLOYING state move to DEPLOY FAIL.
2. Nodes in CLEANING state move to CLEAN FAIL with maintenance set.
3. Nodes in a transient power state get the power operation aborted.
4. Reservation is removed.
The latter operation happens even for nodes in maintenance mode,
otherwise it's not possible to move them out of maintenance.
:param context: request context.
"""
@ -1578,12 +1587,14 @@ class ConductorManager(base_manager.BaseConductorManager):
return
node_iter = self.iter_nodes(
fields=['id', 'reservation'],
filters={'provision_state': states.DEPLOYING,
'maintenance': False,
'reserved_by_any_of': offline_conductors})
fields=['id', 'reservation', 'maintenance', 'provision_state',
'target_power_state'],
filters={'reserved_by_any_of': offline_conductors})
for node_uuid, driver, node_id, conductor_hostname in node_iter:
state_cleanup_required = []
for (node_uuid, driver, node_id, conductor_hostname,
maintenance, provision_state, target_power_state) in node_iter:
# NOTE(lucasagomes): Although very rare, this may lead to a
# race condition. By the time we release the lock the conductor
# that was previously managing the node could be back online.
@ -1604,11 +1615,43 @@ class ConductorManager(base_manager.BaseConductorManager):
LOG.warning("During checking for deploying state, when "
"releasing the lock of the node %s, it was "
"already unlocked.", node_uuid)
else:
LOG.warning('Forcibly removed reservation of conductor %(old)s'
' on node %(node)s as that conductor went offline',
{'old': conductor_hostname, 'node': node_uuid})
# TODO(dtantsur): clean up all states that are not stable and
# are not one of WAIT states.
if not maintenance and (provision_state in (states.DEPLOYING,
states.CLEANING) or
target_power_state is not None):
LOG.debug('Node %(node)s taken over from conductor %(old)s '
'requires state clean up: provision state is '
'%(state)s, target power state is %(pstate)s',
{'node': node_uuid, 'old': conductor_hostname,
'state': provision_state,
'pstate': target_power_state})
state_cleanup_required.append(node_uuid)
for node_uuid in state_cleanup_required:
with task_manager.acquire(context, node_uuid,
purpose='power state clean up') as task:
if not task.node.maintenance and task.node.target_power_state:
old_state = task.node.target_power_state
task.node.target_power_state = None
task.node.last_error = _('Pending power operation was '
'aborted due to conductor take '
'over')
task.node.save()
LOG.warning('Aborted pending power operation %(op)s '
'on node %(node)s due to conductor take over',
{'op': old_state, 'node': node_uuid})
self._fail_if_in_state(
context, {'id': node_id}, states.DEPLOYING,
context, {'uuid': node_uuid},
{states.DEPLOYING, states.CLEANING},
'provision_updated_at',
callback_method=utils.cleanup_after_timeout,
callback_method=utils.abort_on_conductor_take_over,
err_handler=utils.provisioning_error_handler)
@METRICS.timer('ConductorManager._do_adoption')

View File

@ -377,6 +377,27 @@ def cleaning_error_handler(task, msg, tear_down_cleaning=True,
task.process_event('fail', target_state=target_state)
@task_manager.require_exclusive_lock
def abort_on_conductor_take_over(task):
"""Set node's state when a task was aborted due to conductor take over.
:param task: a TaskManager instance.
"""
msg = _('Operation was aborted due to conductor take over')
# By this time the "fail" even was processed, so we cannot end up in
# CLEANING or CLEAN WAIT, only in CLEAN FAIL.
if task.node.provision_state == states.CLEANFAIL:
cleaning_error_handler(task, msg, set_fail_state=False)
else:
# For aborted deployment (and potentially other operations), just set
# the last_error accordingly.
task.node.last_error = msg
task.node.save()
LOG.warning('Aborted the current operation on node %s due to '
'conductor take over', task.node.uuid)
def rescuing_error_handler(task, msg, set_fail_state=True):
"""Cleanup rescue task after timeout or failure.

View File

@ -6278,16 +6278,17 @@ class DestroyPortgroupTestCase(mgr_utils.ServiceSetUpMixin,
@mock.patch.object(manager.ConductorManager, '_fail_if_in_state')
@mock.patch.object(manager.ConductorManager, '_mapped_to_this_conductor')
@mock.patch.object(dbapi.IMPL, 'get_offline_conductors')
class ManagerCheckDeployingStatusTestCase(mgr_utils.ServiceSetUpMixin,
db_base.DbTestCase):
class ManagerCheckOrphanNodesTestCase(mgr_utils.ServiceSetUpMixin,
db_base.DbTestCase):
def setUp(self):
super(ManagerCheckDeployingStatusTestCase, self).setUp()
super(ManagerCheckOrphanNodesTestCase, self).setUp()
self._start_service()
self.node = obj_utils.create_test_node(
self.context, id=1, uuid=uuidutils.generate_uuid(),
driver='fake', provision_state=states.DEPLOYING,
target_provision_state=states.DEPLOYDONE,
target_provision_state=states.ACTIVE,
target_power_state=states.POWER_ON,
reservation='fake-conductor')
# create a second node in a different state to test the
@ -6297,28 +6298,53 @@ class ManagerCheckDeployingStatusTestCase(mgr_utils.ServiceSetUpMixin,
driver='fake', provision_state=states.AVAILABLE,
target_provision_state=states.NOSTATE)
def test__check_deploying_status(self, mock_off_cond, mock_mapped,
mock_fail_if):
def test__check_orphan_nodes(self, mock_off_cond, mock_mapped,
mock_fail_if):
mock_off_cond.return_value = ['fake-conductor']
self.service._check_deploying_status(self.context)
self.service._check_orphan_nodes(self.context)
self.node.refresh()
mock_off_cond.assert_called_once_with()
mock_mapped.assert_called_once_with(self.node.uuid, 'fake')
mock_fail_if.assert_called_once_with(
mock.ANY, {'id': self.node.id}, states.DEPLOYING,
mock.ANY, {'uuid': self.node.uuid},
{states.DEPLOYING, states.CLEANING},
'provision_updated_at',
callback_method=conductor_utils.cleanup_after_timeout,
callback_method=conductor_utils.abort_on_conductor_take_over,
err_handler=conductor_utils.provisioning_error_handler)
# assert node was released
self.assertIsNone(self.node.reservation)
self.assertIsNone(self.node.target_power_state)
self.assertIsNotNone(self.node.last_error)
def test__check_deploying_status_alive(self, mock_off_cond,
mock_mapped, mock_fail_if):
def test__check_orphan_nodes_cleaning(self, mock_off_cond, mock_mapped,
mock_fail_if):
self.node.provision_state = states.CLEANING
self.node.save()
mock_off_cond.return_value = ['fake-conductor']
self.service._check_orphan_nodes(self.context)
self.node.refresh()
mock_off_cond.assert_called_once_with()
mock_mapped.assert_called_once_with(self.node.uuid, 'fake')
mock_fail_if.assert_called_once_with(
mock.ANY, {'uuid': self.node.uuid},
{states.DEPLOYING, states.CLEANING},
'provision_updated_at',
callback_method=conductor_utils.abort_on_conductor_take_over,
err_handler=conductor_utils.provisioning_error_handler)
# assert node was released
self.assertIsNone(self.node.reservation)
self.assertIsNone(self.node.target_power_state)
self.assertIsNotNone(self.node.last_error)
def test__check_orphan_nodes_alive(self, mock_off_cond,
mock_mapped, mock_fail_if):
mock_off_cond.return_value = []
self.service._check_deploying_status(self.context)
self.service._check_orphan_nodes(self.context)
self.node.refresh()
mock_off_cond.assert_called_once_with()
@ -6328,7 +6354,7 @@ class ManagerCheckDeployingStatusTestCase(mgr_utils.ServiceSetUpMixin,
self.assertIsNotNone(self.node.reservation)
@mock.patch.object(objects.Node, 'release')
def test__check_deploying_status_release_exceptions_skipping(
def test__check_orphan_nodes_release_exceptions_skipping(
self, mock_release, mock_off_cond, mock_mapped, mock_fail_if):
mock_off_cond.return_value = ['fake-conductor']
# Add another node so we can check both exceptions
@ -6341,7 +6367,7 @@ class ManagerCheckDeployingStatusTestCase(mgr_utils.ServiceSetUpMixin,
mock_mapped.return_value = True
mock_release.side_effect = [exception.NodeNotFound('not found'),
exception.NodeLocked('locked')]
self.service._check_deploying_status(self.context)
self.service._check_orphan_nodes(self.context)
self.node.refresh()
mock_off_cond.assert_called_once_with()
@ -6351,22 +6377,52 @@ class ManagerCheckDeployingStatusTestCase(mgr_utils.ServiceSetUpMixin,
# Assert we skipped and didn't try to call _fail_if_in_state
self.assertFalse(mock_fail_if.called)
@mock.patch.object(objects.Node, 'release')
def test__check_deploying_status_release_node_not_locked(
self, mock_release, mock_off_cond, mock_mapped, mock_fail_if):
def test__check_orphan_nodes_release_node_not_locked(
self, mock_off_cond, mock_mapped, mock_fail_if):
# this simulates releasing the node elsewhere
count = [0]
def _fake_release(*args, **kwargs):
self.node.reservation = None
self.node.save()
# raise an exception only the first time release is called
count[0] += 1
if count[0] == 1:
raise exception.NodeNotLocked('not locked')
mock_off_cond.return_value = ['fake-conductor']
mock_mapped.return_value = True
mock_release.side_effect = exception.NodeNotLocked('not locked')
self.service._check_deploying_status(self.context)
with mock.patch.object(objects.Node, 'release',
side_effect=_fake_release) as mock_release:
self.service._check_orphan_nodes(self.context)
mock_release.assert_called_with(self.context, mock.ANY,
self.node.id)
mock_off_cond.assert_called_once_with()
mock_mapped.assert_called_once_with(self.node.uuid, 'fake')
mock_fail_if.assert_called_once_with(
mock.ANY, {'uuid': self.node.uuid},
{states.DEPLOYING, states.CLEANING},
'provision_updated_at',
callback_method=conductor_utils.abort_on_conductor_take_over,
err_handler=conductor_utils.provisioning_error_handler)
def test__check_orphan_nodes_maintenance(self, mock_off_cond, mock_mapped,
mock_fail_if):
self.node.maintenance = True
self.node.save()
mock_off_cond.return_value = ['fake-conductor']
self.service._check_orphan_nodes(self.context)
self.node.refresh()
mock_off_cond.assert_called_once_with()
mock_mapped.assert_called_once_with(self.node.uuid, 'fake')
mock_fail_if.assert_called_once_with(
mock.ANY, {'id': self.node.id}, states.DEPLOYING,
'provision_updated_at',
callback_method=conductor_utils.cleanup_after_timeout,
err_handler=conductor_utils.provisioning_error_handler)
# assert node was released
self.assertIsNone(self.node.reservation)
# not changing states in maintenance
self.assertFalse(mock_fail_if.called)
self.assertIsNotNone(self.node.target_power_state)
class TestIndirectionApiConductor(db_base.DbTestCase):

View File

@ -1234,6 +1234,25 @@ class ErrorHandlersTestCase(tests_base.TestCase):
conductor_utils.cleaning_error_handler(self.task, 'foo')
self.assertTrue(log_mock.exception.called)
def test_abort_on_conductor_take_over_cleaning(self):
self.node.maintenance = False
self.node.provision_state = states.CLEANFAIL
conductor_utils.abort_on_conductor_take_over(self.task)
self.assertTrue(self.node.maintenance)
self.assertIn('take over', self.node.maintenance_reason)
self.assertIn('take over', self.node.last_error)
self.task.driver.deploy.tear_down_cleaning.assert_called_once_with(
self.task)
self.node.save.assert_called_once_with()
def test_abort_on_conductor_take_over_deploying(self):
self.node.maintenance = False
self.node.provision_state = states.DEPLOYFAIL
conductor_utils.abort_on_conductor_take_over(self.task)
self.assertFalse(self.node.maintenance)
self.assertIn('take over', self.node.last_error)
self.node.save.assert_called_once_with()
@mock.patch.object(conductor_utils, 'LOG')
def test_spawn_cleaning_error_handler_no_worker(self, log_mock):
exc = exception.NoFreeConductorWorker()

View File

@ -0,0 +1,9 @@
---
fixes:
- |
On node take over, any locks that are left from the old conductor are
cleared by the new one. Previously it only happened for nodes in
``DEPLOYING`` state.
- |
On taking over nodes in ``CLEANING`` state, the new conductor moves them
to ``CLEAN FAIL`` and set maintenance.