Nodes in maintenance didn't fail when they should have

In base_manager.py, _fail_if_in_state() [1] does nothing if the
node is in maintenance. This means that when a node in maintenance
is in mid-deployment or cleaning and its conductor dies, the node
won't get put into a failed state [2].

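This fixes it by only skipping a node when a 'maintenance' filter
was given and the node's current maintenance value no longer matches
it. The inspect wait timeout check now passes 'maintenance': False
explicitly, so it continues to ignore nodes in maintenance.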

[1] 8294afa623/ironic/conductor/base_manager.py (L485)
[2] 8294afa623/ironic/conductor/base_manager.py (L235)

Story #2007098
Task #38134

Change-Id: Ide70619271455685d09671ae16d744fc9ae58a02
This commit is contained in:
Ruby Loo 2020-01-16 15:55:29 +00:00
parent c4352b66fe
commit dd18c70543
5 changed files with 41 additions and 4 deletions

View File

@ -476,15 +476,19 @@ class BaseConductorManager(object):
node_iter = self.iter_nodes(filters=filters, node_iter = self.iter_nodes(filters=filters,
sort_key=sort_key, sort_key=sort_key,
sort_dir='asc') sort_dir='asc')
desired_maintenance = filters.get('maintenance')
workers_count = 0 workers_count = 0
for node_uuid, driver, conductor_group in node_iter: for node_uuid, driver, conductor_group in node_iter:
try: try:
with task_manager.acquire(context, node_uuid, with task_manager.acquire(context, node_uuid,
purpose='node state check') as task: purpose='node state check') as task:
if (task.node.maintenance # Check maintenance value since it could have changed
or task.node.provision_state # after the filtering was done.
not in provision_state): if (desired_maintenance is not None
and desired_maintenance != task.node.maintenance):
continue
if task.node.provision_state not in provision_state:
continue continue
target_state = (None if not keep_target_state else target_state = (None if not keep_target_state else

View File

@ -3188,6 +3188,7 @@ class ConductorManager(base_manager.BaseConductorManager):
callback_timeout = CONF.conductor.inspect_wait_timeout callback_timeout = CONF.conductor.inspect_wait_timeout
filters = {'reserved': False, filters = {'reserved': False,
'maintenance': False,
'provision_state': states.INSPECTWAIT, 'provision_state': states.INSPECTWAIT,
'inspection_started_before': callback_timeout} 'inspection_started_before': callback_timeout}
sort_key = 'inspection_started_at' sort_key = 'inspection_started_at'

View File

@ -521,3 +521,26 @@ class StartConsolesTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
self.assertIsNone(test_node.last_error) self.assertIsNone(test_node.last_error)
self.assertTrue(log_mock.warning.called) self.assertTrue(log_mock.warning.called)
self.assertFalse(mock_notify.called) self.assertFalse(mock_notify.called)
class MiscTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
    """Checks of _fail_transient_state() against nodes left in DEPLOYING."""

    def setUp(self):
        super(MiscTestCase, self).setUp()
        # Every test below calls into self.service.
        self._start_service()

    def test__fail_transient_state(self):
        """A node in DEPLOYING is expected to end up in DEPLOYFAIL."""
        node_attrs = {'driver': 'fake-hardware',
                      'provision_state': states.DEPLOYING}
        deploying_node = obj_utils.create_test_node(self.context,
                                                    **node_attrs)

        self.service._fail_transient_state(states.DEPLOYING, 'unknown err')

        deploying_node.refresh()
        self.assertEqual(states.DEPLOYFAIL, deploying_node.provision_state)

    def test__fail_transient_state_maintenance(self):
        """Same expectation when the node also has maintenance=True."""
        node_attrs = {'driver': 'fake-hardware',
                      'maintenance': True,
                      'provision_state': states.DEPLOYING}
        maintenance_node = obj_utils.create_test_node(self.context,
                                                      **node_attrs)

        self.service._fail_transient_state(states.DEPLOYING, 'unknown err')

        maintenance_node.refresh()
        self.assertEqual(states.DEPLOYFAIL,
                         maintenance_node.provision_state)

View File

@ -8103,6 +8103,7 @@ class ManagerCheckInspectWaitTimeoutsTestCase(mgr_utils.CommonMixIn,
self.task2 = self._create_task(node=self.node2) self.task2 = self._create_task(node=self.node2)
self.filters = {'reserved': False, self.filters = {'reserved': False,
'maintenance': False,
'inspection_started_before': 300, 'inspection_started_before': 300,
'provision_state': states.INSPECTWAIT} 'provision_state': states.INSPECTWAIT}
self.columns = ['uuid', 'driver', 'conductor_group'] self.columns = ['uuid', 'driver', 'conductor_group']

View File

@ -0,0 +1,8 @@
---
fixes:
- |
If a node is in mid-deployment or cleaning and its conductor dies, ironic
will move that node into a failed state. However, this wasn't being done
if those nodes were also in maintenance. This has been fixed. See
`story 2007098 <https://storyboard.openstack.org/#!/story/2007098>`_ for
more details.