Merge "Add rescuewait timeout periodic task"
This commit is contained in:
commit
2c630d9337
etc/ironic
ironic
@@ -1267,6 +1267,12 @@
 # (integer value)
 #check_provision_state_interval = 60
 
+# Interval (seconds) between checks of rescue timeouts. This
+# option is part of rescue feature work, which is not
+# currently exposed to users. (integer value)
+# Minimum value: 1
+#check_rescue_state_interval = 60
+
 # Timeout (seconds) to wait for a callback from a deploy
 # ramdisk. Set to 0 to disable timeout. (integer value)
 #deploy_callback_timeout = 1800
@@ -1355,6 +1361,14 @@
 # disable timeout. (integer value)
 #clean_callback_timeout = 1800
 
+# Timeout (seconds) to wait for a callback from the rescue
+# ramdisk. If the timeout is reached the node will be put in
+# the "rescue failed" provision state. Set to 0 to disable
+# timeout. This option is part of rescue feature work, which
+# is not currently exposed to users. (integer value)
+# Minimum value: 0
+#rescue_callback_timeout = 1800
+
 # Timeout (in seconds) of soft reboot and soft power off
 # operation. This value always has to be positive. (integer
 # value)
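Note: these two sample-file hunks mirror the new conductor options registered further down. As a quick illustration (the values here are hypothetical, not recommendations), an operator who wanted stuck rescues to fail faster could uncomment and tune the options in ironic.conf:

    [conductor]
    # check twice as often as the default
    check_rescue_state_interval = 30
    # give the rescue ramdisk 15 minutes instead of 30
    rescue_callback_timeout = 900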
@@ -1727,6 +1727,27 @@ class ConductorManager(base_manager.BaseConductorManager):
                                keep_target_state=True,
                                callback_method=utils.cleanup_cleanwait_timeout)
 
+    @METRICS.timer('ConductorManager._check_rescuewait_timeouts')
+    @periodics.periodic(spacing=CONF.conductor.check_rescue_state_interval,
+                        enabled=bool(CONF.conductor.rescue_callback_timeout))
+    def _check_rescuewait_timeouts(self, context):
+        """Periodically checks if rescue has timed out waiting for heartbeat.
+
+        If a rescue call has timed out, fail the rescue and clean up.
+
+        :param context: request context.
+        """
+        callback_timeout = CONF.conductor.rescue_callback_timeout
+        filters = {'reserved': False,
+                   'provision_state': states.RESCUEWAIT,
+                   'maintenance': False,
+                   'provisioned_before': callback_timeout}
+        self._fail_if_in_state(context, filters, states.RESCUEWAIT,
+                               'provision_updated_at',
+                               keep_target_state=True,
+                               callback_method=utils.cleanup_rescuewait_timeout
+                               )
+
     @METRICS.timer('ConductorManager._sync_local_state')
     @periodics.periodic(spacing=CONF.conductor.sync_local_state_interval)
     def _sync_local_state(self, context):
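Note: the decorator pair above follows the conductor's existing periodic-task pattern. `spacing` is read from config when the module loads, and `enabled=bool(CONF.conductor.rescue_callback_timeout)` means a timeout of 0 never schedules the check at all. Below is a self-contained sketch of the underlying futurist periodics machinery; the wiring (worker creation, threading) is illustrative, since ironic's BaseConductorManager does this setup itself:

    import threading
    import time

    from futurist import periodics


    @periodics.periodic(spacing=5, enabled=True)
    def check_rescue_timeouts():
        # Stand-in for _check_rescuewait_timeouts: the real task queries
        # for nodes stuck in RESCUEWAIT past the timeout and fails them.
        print('checking rescue timeouts')


    # PeriodicWorker takes (callable, args, kwargs) tuples and invokes
    # each callable every `spacing` seconds until stopped.
    worker = periodics.PeriodicWorker([(check_rescue_timeouts, (), {})])
    runner = threading.Thread(target=worker.start)
    runner.start()
    time.sleep(12)  # let the task fire a couple of times
    worker.stop()
    runner.join()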
@@ -398,14 +398,48 @@ def cleaning_error_handler(task, msg, tear_down_cleaning=True,
         task.process_event('fail', target_state=target_state)
 
 
+@task_manager.require_exclusive_lock
+def cleanup_rescuewait_timeout(task):
+    """Cleanup rescue task after timeout.
+
+    :param task: a TaskManager instance.
+    """
+    node = task.node
+    msg = _('Timeout reached while waiting for rescue ramdisk callback '
+            'for node')
+    errmsg = msg + ' %(node)s'
+    LOG.error(errmsg, {'node': node.uuid})
+    try:
+        node_power_action(task, states.POWER_OFF)
+        task.driver.rescue.clean_up(task)
+        node.last_error = msg
+        node.save()
+    except Exception as e:
+        if isinstance(e, exception.IronicException):
+            error_msg = _('Cleanup failed for %(node_info)s after rescue '
+                          'timeout: %(error)s')
+            # last_error lives on the node itself, so the uuid is omitted
+            # there; the log line below includes it.
+            node.last_error = error_msg % {'node_info': 'node', 'error': e}
+            node_info = 'node %s' % node.uuid
+            LOG.error(error_msg, {'node_info': node_info, 'error': e})
+        else:
+            node.last_error = _('Rescue timed out, but an unhandled '
+                                'exception was encountered while aborting. '
+                                'More info may be found in the log file.')
+            LOG.exception('Rescue timed out for node %(node)s, an exception '
+                          'was encountered while aborting. Error: %(err)s',
+                          {'node': node.uuid, 'err': e})
+        node.save()
+
+
 def _spawn_error_handler(e, node, state):
     """Handle spawning error for node."""
     if isinstance(e, exception.NoFreeConductorWorker):
         node.last_error = (_("No free conductor workers available"))
         node.save()
         LOG.warning("No free conductor workers available to perform "
-                    "%(state)s on node %(node)s",
-                    {'state': state, 'node': node.uuid})
+                    "%(operation)s on node %(node)s",
+                    {'operation': state, 'node': node.uuid})
 
 
 def spawn_cleaning_error_handler(e, node):
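Note on the error handling above: the first LOG.error fires for every timeout. If powering off or clean_up then raises a known IronicException, a second, more specific error is logged and stored in last_error; any other exception is logged with LOG.exception so the traceback is kept. A minimal standalone sketch of this known-vs-unknown split, with KnownError standing in for exception.IronicException:

    import logging

    logging.basicConfig()
    LOG = logging.getLogger(__name__)


    class KnownError(Exception):
        """Stand-in for exception.IronicException."""


    def abort_rescue(cleanup):
        LOG.error('Timeout reached while waiting for rescue ramdisk callback')
        try:
            cleanup()
        except Exception as e:
            if isinstance(e, KnownError):
                # Expected failure: record the message; no traceback needed.
                LOG.error('Cleanup failed after rescue timeout: %s', e)
            else:
                # Unexpected failure: keep the traceback for debugging.
                LOG.exception('Unhandled exception while aborting: %s', e)


    def failing_cleanup():
        raise KnownError('boom')


    abort_rescue(failing_cleanup)  # logs two errors, as the unit tests expect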
@@ -52,6 +52,12 @@ opts = [
                default=60,
                help=_('Interval between checks of provision timeouts, '
                       'in seconds.')),
+    cfg.IntOpt('check_rescue_state_interval',
+               default=60,
+               min=1,
+               help=_('Interval (seconds) between checks of rescue '
+                      'timeouts. This option is part of rescue feature '
+                      'work, which is not currently exposed to users.')),
     cfg.IntOpt('deploy_callback_timeout',
                default=1800,
                help=_('Timeout (seconds) to wait for a callback from '
@@ -145,6 +151,15 @@ opts = [
                       'ramdisk doing the cleaning. If the timeout is reached '
                       'the node will be put in the "clean failed" provision '
                       'state. Set to 0 to disable timeout.')),
+    cfg.IntOpt('rescue_callback_timeout',
+               default=1800,
+               min=0,
+               help=_('Timeout (seconds) to wait for a callback from the '
+                      'rescue ramdisk. If the timeout is reached the node '
+                      'will be put in the "rescue failed" provision state. '
+                      'Set to 0 to disable timeout. This option is part of '
+                      'rescue feature work, which is not currently exposed '
+                      'to users.')),
     cfg.IntOpt('soft_power_off_timeout',
                default=600,
                min=1,
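Note: these are standard oslo.config IntOpt declarations; the min= bounds mean out-of-range values are rejected when the option is parsed rather than silently accepted. A runnable sketch of the same registration pattern, using a standalone ConfigOpts instance rather than ironic's global CONF:

    from oslo_config import cfg

    opts = [
        cfg.IntOpt('check_rescue_state_interval', default=60, min=1,
                   help='Interval (seconds) between rescue timeout checks.'),
        cfg.IntOpt('rescue_callback_timeout', default=1800, min=0,
                   help='Timeout (seconds) for the rescue ramdisk callback.'),
    ]

    CONF = cfg.ConfigOpts()
    CONF.register_opts(opts, group='conductor')

    print(CONF.conductor.check_rescue_state_interval)  # 60
    print(CONF.conductor.rescue_callback_timeout)      # 1800
    # A config file setting check_rescue_state_interval = 0 would fail
    # validation at parse time because of min=1.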
@@ -1610,6 +1610,32 @@ class DoNodeDeployTearDownTestCase(mgr_utils.ServiceSetUpMixin,
     def test__check_cleanwait_timeouts_manual_clean(self):
         self._check_cleanwait_timeouts(manual=True)
 
+    @mock.patch('ironic.drivers.modules.fake.FakeRescue.clean_up')
+    @mock.patch.object(conductor_utils, 'node_power_action')
+    def test_check_rescuewait_timeouts(self, node_power_mock,
+                                       mock_clean_up):
+        self._start_service()
+        CONF.set_override('rescue_callback_timeout', 1, group='conductor')
+        tgt_prov_state = states.RESCUE
+        node = obj_utils.create_test_node(
+            self.context, driver='fake-hardware',
+            rescue_interface='fake',
+            network_interface='flat',
+            provision_state=states.RESCUEWAIT,
+            target_provision_state=tgt_prov_state,
+            provision_updated_at=datetime.datetime(2000, 1, 1, 0, 0))
+
+        self.service._check_rescuewait_timeouts(self.context)
+        self._stop_service()
+        node.refresh()
+        self.assertEqual(states.RESCUEFAIL, node.provision_state)
+        self.assertEqual(tgt_prov_state, node.target_provision_state)
+        self.assertIsNotNone(node.last_error)
+        self.assertIn('Timeout reached while waiting for rescue ramdisk',
+                      node.last_error)
+        mock_clean_up.assert_called_once_with(mock.ANY)
+        node_power_mock.assert_called_once_with(mock.ANY, states.POWER_OFF)
+
     def test_do_node_tear_down_invalid_state(self):
         self._start_service()
         # test node.provision_state is incorrect for tear_down
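Note: the test forces the timeout path deterministically instead of sleeping. The timeout is overridden to 1 second and the node's provision_updated_at is backdated to the year 2000, so the 'provisioned_before' filter is guaranteed to match. Roughly, assuming the filter compares provision_updated_at against the current time minus the timeout:

    import datetime

    timeout = 1  # seconds, as overridden in the test
    cutoff = datetime.datetime.utcnow() - datetime.timedelta(seconds=timeout)
    provision_updated_at = datetime.datetime(2000, 1, 1, 0, 0)
    print(provision_updated_at < cutoff)  # True -> treated as timed out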
@@ -1075,8 +1075,9 @@ class ErrorHandlersTestCase(tests_base.TestCase):
     def setUp(self):
         super(ErrorHandlersTestCase, self).setUp()
         self.task = mock.Mock(spec=task_manager.TaskManager)
-        self.task.driver = mock.Mock(spec_set=['deploy'])
+        self.task.driver = mock.Mock(spec_set=['deploy', 'network', 'rescue'])
         self.task.node = mock.Mock(spec_set=objects.Node)
+        self.task.shared = False
         self.node = self.task.node
         # NOTE(mariojv) Some of the test cases that use the task below require
         # strict typing of the node power state fields and would fail if passed
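Note: setUp adds 'rescue' (and 'network') to the driver mock's spec_set because cleanup_rescuewait_timeout now touches task.driver.rescue, and a spec_set mock raises for any attribute it was not told about. self.task.shared = False matters as well: @task_manager.require_exclusive_lock refuses to run against a shared lock, and an unset Mock attribute would be truthy. A quick demonstration of the spec_set behavior:

    from unittest import mock

    driver = mock.Mock(spec_set=['deploy'])
    try:
        driver.rescue.clean_up()
    except AttributeError as e:
        print(e)  # Mock object has no attribute 'rescue'

    driver = mock.Mock(spec_set=['deploy', 'network', 'rescue'])
    driver.rescue.clean_up()  # accepted, and recorded for assertions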
@@ -1237,6 +1238,43 @@ class ErrorHandlersTestCase(tests_base.TestCase):
         self.assertFalse(self.node.save.called)
         self.assertFalse(log_mock.warning.called)
 
+    @mock.patch.object(conductor_utils, 'LOG')
+    @mock.patch.object(conductor_utils, 'node_power_action')
+    def test_cleanup_rescuewait_timeout(self, node_power_mock, log_mock):
+        conductor_utils.cleanup_rescuewait_timeout(self.task)
+        self.assertTrue(log_mock.error.called)
+        node_power_mock.assert_called_once_with(mock.ANY, states.POWER_OFF)
+        self.task.driver.rescue.clean_up.assert_called_once_with(self.task)
+        self.assertIn('Timeout reached', self.node.last_error)
+        self.node.save.assert_called_once_with()
+
+    @mock.patch.object(conductor_utils, 'LOG')
+    @mock.patch.object(conductor_utils, 'node_power_action')
+    def test_cleanup_rescuewait_timeout_known_exc(
+            self, node_power_mock, log_mock):
+        clean_up_mock = self.task.driver.rescue.clean_up
+        clean_up_mock.side_effect = exception.IronicException('moocow')
+        conductor_utils.cleanup_rescuewait_timeout(self.task)
+        self.assertEqual(2, log_mock.error.call_count)
+        node_power_mock.assert_called_once_with(mock.ANY, states.POWER_OFF)
+        self.task.driver.rescue.clean_up.assert_called_once_with(self.task)
+        self.assertIn('moocow', self.node.last_error)
+        self.node.save.assert_called_once_with()
+
+    @mock.patch.object(conductor_utils, 'LOG')
+    @mock.patch.object(conductor_utils, 'node_power_action')
+    def test_cleanup_rescuewait_timeout_unknown_exc(
+            self, node_power_mock, log_mock):
+        clean_up_mock = self.task.driver.rescue.clean_up
+        clean_up_mock.side_effect = Exception('moocow')
+        conductor_utils.cleanup_rescuewait_timeout(self.task)
+        self.assertTrue(log_mock.error.called)
+        node_power_mock.assert_called_once_with(mock.ANY, states.POWER_OFF)
+        self.task.driver.rescue.clean_up.assert_called_once_with(self.task)
+        self.assertIn('Rescue timed out', self.node.last_error)
+        self.node.save.assert_called_once_with()
+        self.assertTrue(log_mock.exception.called)
+
 
 class ValidatePortPhysnetTestCase(db_base.DbTestCase):