Merge "Do not enter maintenance if cleaning fails before running the 1st step"
This commit is contained in:
commit
07bdccea58
@ -314,9 +314,15 @@ cleaning.
|
||||
|
||||
Troubleshooting
|
||||
===============
|
||||
If cleaning fails on a node, the node will be put into ``clean failed`` state
|
||||
and placed in maintenance mode, to prevent ironic from taking actions on the
|
||||
node.
|
||||
If cleaning fails on a node, the node will be put into ``clean failed`` state.
|
||||
If the failure happens while running a clean step, the node is also placed in
|
||||
maintenance mode to prevent ironic from taking actions on the node. The
|
||||
operator should validate that no permanent damage has been done to the
|
||||
node and no processes are still running on it before removing the maintenance
|
||||
mode.
|
||||
|
||||
.. note:: Older versions of ironic may put the node to maintenance even when
|
||||
no clean step has been running.
|
||||
|
||||
Nodes in ``clean failed`` will not be powered off, as the node might be in a
|
||||
state such that powering it off could damage the node or remove useful
|
||||
|
@ -407,9 +407,9 @@ def cleanup_cleanwait_timeout(task):
|
||||
|
||||
|
||||
def cleaning_error_handler(task, logmsg, errmsg=None, traceback=False,
|
||||
tear_down_cleaning=True,
|
||||
set_fail_state=True):
|
||||
"""Put a failed node in CLEANFAIL and maintenance.
|
||||
tear_down_cleaning=True, set_fail_state=True,
|
||||
set_maintenance=None):
|
||||
"""Put a failed node in CLEANFAIL and maintenance (if needed).
|
||||
|
||||
:param task: a TaskManager instance.
|
||||
:param logmsg: Message to be logged.
|
||||
@ -420,12 +420,19 @@ def cleaning_error_handler(task, logmsg, errmsg=None, traceback=False,
|
||||
cleaning. Default to True.
|
||||
:param set_fail_state: Whether to set node to failed state. Default to
|
||||
True.
|
||||
:param set_maintenance: Whether to set maintenance mode. If None,
|
||||
maintenance mode will be set if and only if a clean step is being
|
||||
executed on a node.
|
||||
"""
|
||||
if set_maintenance is None:
|
||||
set_maintenance = bool(task.node.clean_step)
|
||||
|
||||
errmsg = errmsg or logmsg
|
||||
LOG.error(logmsg, exc_info=traceback)
|
||||
node = task.node
|
||||
node.fault = faults.CLEAN_FAILURE
|
||||
node.maintenance = True
|
||||
if set_maintenance:
|
||||
node.fault = faults.CLEAN_FAILURE
|
||||
node.maintenance = True
|
||||
|
||||
if tear_down_cleaning:
|
||||
try:
|
||||
@ -457,7 +464,7 @@ def cleaning_error_handler(task, logmsg, errmsg=None, traceback=False,
|
||||
manual_clean = node.target_provision_state == states.MANAGEABLE
|
||||
node.last_error = errmsg
|
||||
# NOTE(dtantsur): avoid overwriting existing maintenance_reason
|
||||
if not node.maintenance_reason:
|
||||
if not node.maintenance_reason and set_maintenance:
|
||||
node.maintenance_reason = errmsg
|
||||
node.save()
|
||||
|
||||
|
@ -18,6 +18,7 @@ from oslo_config import cfg
|
||||
from oslo_utils import uuidutils
|
||||
|
||||
from ironic.common import exception
|
||||
from ironic.common import faults
|
||||
from ironic.common import states
|
||||
from ironic.conductor import cleaning
|
||||
from ironic.conductor import steps as conductor_steps
|
||||
@ -63,6 +64,8 @@ class DoNodeCleanTestCase(db_base.DbTestCase):
|
||||
node.refresh()
|
||||
self.assertEqual(states.CLEANFAIL, node.provision_state)
|
||||
self.assertEqual(tgt_prov_state, node.target_provision_state)
|
||||
self.assertFalse(node.maintenance)
|
||||
self.assertIsNone(node.fault)
|
||||
mock_validate.assert_called_once_with(mock.ANY, mock.ANY)
|
||||
|
||||
@mock.patch('ironic.drivers.modules.fake.FakePower.validate',
|
||||
@ -331,6 +334,7 @@ class DoNodeCleanTestCase(db_base.DbTestCase):
|
||||
self.assertIn('is not allowed', node.last_error)
|
||||
self.assertTrue(node.maintenance)
|
||||
self.assertEqual('Original reason', node.maintenance_reason)
|
||||
self.assertIsNone(node.fault) # no clean step running
|
||||
self.assertFalse(mock_prep.called)
|
||||
self.assertFalse(mock_tear_down.called)
|
||||
|
||||
@ -356,6 +360,8 @@ class DoNodeCleanTestCase(db_base.DbTestCase):
|
||||
self.assertEqual(tgt_prov_state, node.target_provision_state)
|
||||
mock_prep.assert_called_once_with(mock.ANY, task)
|
||||
mock_validate.assert_called_once_with(mock.ANY, task)
|
||||
self.assertFalse(node.maintenance)
|
||||
self.assertIsNone(node.fault)
|
||||
|
||||
def test__do_node_clean_automated_prepare_clean_fail(self):
|
||||
self.__do_node_clean_prepare_clean_fail()
|
||||
@ -413,6 +419,8 @@ class DoNodeCleanTestCase(db_base.DbTestCase):
|
||||
self.assertEqual(states.CLEANFAIL, node.provision_state)
|
||||
self.assertEqual(tgt_prov_state, node.target_provision_state)
|
||||
mock_steps.assert_called_once_with(mock.ANY)
|
||||
self.assertFalse(node.maintenance)
|
||||
self.assertIsNone(node.fault)
|
||||
|
||||
def test__do_node_clean_automated_steps_fail(self):
|
||||
for invalid in (True, False):
|
||||
@ -460,6 +468,7 @@ class DoNodeCleanTestCase(db_base.DbTestCase):
|
||||
if clean_steps:
|
||||
self.assertEqual(clean_steps,
|
||||
node.driver_internal_info['clean_steps'])
|
||||
self.assertFalse(node.maintenance)
|
||||
|
||||
# Check that state didn't change
|
||||
self.assertEqual(states.CLEANING, node.provision_state)
|
||||
@ -751,6 +760,7 @@ class DoNodeCleanTestCase(db_base.DbTestCase):
|
||||
self.assertNotIn('clean_step_index', node.driver_internal_info)
|
||||
self.assertIsNotNone(node.last_error)
|
||||
self.assertTrue(node.maintenance)
|
||||
self.assertEqual(faults.CLEAN_FAILURE, node.fault)
|
||||
mock_execute.assert_called_once_with(
|
||||
mock.ANY, mock.ANY, self.clean_steps[0])
|
||||
mock_collect_logs.assert_called_once_with(mock.ANY, label='cleaning')
|
||||
@ -931,7 +941,7 @@ class DoNodeCleanTestCase(db_base.DbTestCase):
|
||||
self.assertNotIn('clean_step_index', node.driver_internal_info)
|
||||
self.assertIsNotNone(node.last_error)
|
||||
self.assertEqual(1, tear_mock.call_count)
|
||||
self.assertTrue(node.maintenance)
|
||||
self.assertFalse(node.maintenance) # no step is running
|
||||
deploy_exec_calls = [
|
||||
mock.call(mock.ANY, mock.ANY, self.clean_steps[0]),
|
||||
mock.call(mock.ANY, mock.ANY, self.clean_steps[2]),
|
||||
@ -1038,7 +1048,7 @@ class DoNodeCleanTestCase(db_base.DbTestCase):
|
||||
self.assertEqual({}, node.clean_step)
|
||||
self.assertNotIn('clean_step_index', node.driver_internal_info)
|
||||
self.assertIsNotNone(node.last_error)
|
||||
self.assertTrue(node.maintenance)
|
||||
self.assertTrue(node.maintenance) # the 1st clean step was running
|
||||
deploy_exec_mock.assert_called_once_with(mock.ANY, mock.ANY,
|
||||
self.clean_steps[0])
|
||||
# Make sure we don't execute any other step and return
|
||||
|
@ -36,6 +36,7 @@ from ironic.common import boot_devices
|
||||
from ironic.common import components
|
||||
from ironic.common import driver_factory
|
||||
from ironic.common import exception
|
||||
from ironic.common import faults
|
||||
from ironic.common import images
|
||||
from ironic.common import indicator_states
|
||||
from ironic.common import nova
|
||||
@ -1858,7 +1859,7 @@ class CheckTimeoutsTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
|
||||
self.assertIsNotNone(node.last_error)
|
||||
mock_cleanup.assert_called_once_with(mock.ANY, mock.ANY)
|
||||
|
||||
def _check_cleanwait_timeouts(self, manual=False):
|
||||
def _check_cleanwait_timeouts(self, manual=False, with_step=True):
|
||||
self._start_service()
|
||||
CONF.set_override('clean_callback_timeout', 1, group='conductor')
|
||||
tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE
|
||||
@ -1869,7 +1870,7 @@ class CheckTimeoutsTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
|
||||
provision_updated_at=datetime.datetime(2000, 1, 1, 0, 0),
|
||||
clean_step={
|
||||
'interface': 'deploy',
|
||||
'step': 'erase_devices'},
|
||||
'step': 'erase_devices'} if with_step else {},
|
||||
driver_internal_info={
|
||||
'cleaning_reboot': manual,
|
||||
'clean_step_index': 0})
|
||||
@ -1880,6 +1881,9 @@ class CheckTimeoutsTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
|
||||
self.assertEqual(states.CLEANFAIL, node.provision_state)
|
||||
self.assertEqual(tgt_prov_state, node.target_provision_state)
|
||||
self.assertIsNotNone(node.last_error)
|
||||
self.assertEqual(with_step, node.maintenance)
|
||||
self.assertEqual(faults.CLEAN_FAILURE if with_step else None,
|
||||
node.fault)
|
||||
# Test that cleaning parameters have been purged in order
|
||||
# to prevent looping of the cleaning sequence
|
||||
self.assertEqual({}, node.clean_step)
|
||||
@ -1892,6 +1896,9 @@ class CheckTimeoutsTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
|
||||
def test__check_cleanwait_timeouts_manual_clean(self):
|
||||
self._check_cleanwait_timeouts(manual=True)
|
||||
|
||||
def test__check_cleanwait_timeouts_boot_timeout(self):
|
||||
self._check_cleanwait_timeouts(with_step=False)
|
||||
|
||||
@mock.patch('ironic.drivers.modules.fake.FakeRescue.clean_up',
|
||||
autospec=True)
|
||||
@mock.patch.object(conductor_utils, 'node_power_action', autospec=True)
|
||||
|
@ -0,0 +1,5 @@
|
||||
---
|
||||
fixes:
|
||||
- |
|
||||
Failed cleaning no longer results in maintenance mode if no clean step is
|
||||
running, e.g. on PXE timeout or failed clean steps validation.
|
Loading…
Reference in New Issue
Block a user