Nodes in maintenance didn't fail when they should have
In _fail_if_in_state() in base_manager.py [1], nothing is done if the node is in maintenance. As a result, when a node in maintenance is mid-deployment or mid-cleaning and its conductor dies, it is never put into a failed state [2]. This change fixes that.

[1] 8294afa623/ironic/conductor/base_manager.py (L485)
[2] 8294afa623/ironic/conductor/base_manager.py (L235)
Story: #2007098
Task: #38134
Change-Id: Ide70619271455685d09671ae16d744fc9ae58a02
commit dd18c70543
parent c4352b66fe
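Described another way, here is a minimal standalone sketch of the logic change (not the actual Ironic code; `node` and `filters` stand in for the real method's task.node and filter dict, and everything else is omitted):

def should_skip_old(node, provision_state):
    # Before: any node in maintenance was skipped, so this method could
    # never move it to a failed state.
    return (node.maintenance
            or node.provision_state not in provision_state)

def should_skip_new(node, provision_state, filters):
    # After: only skip when the node's maintenance flag no longer matches
    # the value the caller filtered on; a maintenance node whose conductor
    # died still falls through and gets failed.
    desired_maintenance = filters.get('maintenance')
    if (desired_maintenance is not None
            and desired_maintenance != node.maintenance):
        return True
    return node.provision_state not in provision_state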
@@ -476,15 +476,19 @@ class BaseConductorManager(object):
         node_iter = self.iter_nodes(filters=filters,
                                     sort_key=sort_key,
                                     sort_dir='asc')
 
+        desired_maintenance = filters.get('maintenance')
         workers_count = 0
         for node_uuid, driver, conductor_group in node_iter:
             try:
                 with task_manager.acquire(context, node_uuid,
                                           purpose='node state check') as task:
-                    if (task.node.maintenance
-                            or task.node.provision_state
-                            not in provision_state):
+                    # Check maintenance value since it could have changed
+                    # after the filtering was done.
+                    if (desired_maintenance is not None
+                            and desired_maintenance != task.node.maintenance):
+                        continue
+
+                    if task.node.provision_state not in provision_state:
                         continue
 
                     target_state = (None if not keep_target_state else
@@ -3188,6 +3188,7 @@ class ConductorManager(base_manager.BaseConductorManager):
         callback_timeout = CONF.conductor.inspect_wait_timeout
 
         filters = {'reserved': False,
+                   'maintenance': False,
                    'provision_state': states.INSPECTWAIT,
                    'inspection_started_before': callback_timeout}
         sort_key = 'inspection_started_at'
@@ -521,3 +521,26 @@ class StartConsolesTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
         self.assertIsNone(test_node.last_error)
         self.assertTrue(log_mock.warning.called)
         self.assertFalse(mock_notify.called)
+
+
+class MiscTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
+    def setUp(self):
+        super(MiscTestCase, self).setUp()
+        self._start_service()
+
+    def test__fail_transient_state(self):
+        node = obj_utils.create_test_node(self.context,
+                                          driver='fake-hardware',
+                                          provision_state=states.DEPLOYING)
+        self.service._fail_transient_state(states.DEPLOYING, 'unknown err')
+        node.refresh()
+        self.assertEqual(states.DEPLOYFAIL, node.provision_state)
+
+    def test__fail_transient_state_maintenance(self):
+        node = obj_utils.create_test_node(self.context,
+                                          driver='fake-hardware',
+                                          maintenance=True,
+                                          provision_state=states.DEPLOYING)
+        self.service._fail_transient_state(states.DEPLOYING, 'unknown err')
+        node.refresh()
+        self.assertEqual(states.DEPLOYFAIL, node.provision_state)
@@ -8103,6 +8103,7 @@ class ManagerCheckInspectWaitTimeoutsTestCase(mgr_utils.CommonMixIn,
         self.task2 = self._create_task(node=self.node2)
 
         self.filters = {'reserved': False,
+                        'maintenance': False,
                         'inspection_started_before': 300,
                         'provision_state': states.INSPECTWAIT}
         self.columns = ['uuid', 'driver', 'conductor_group']
@@ -0,0 +1,8 @@
+---
+fixes:
+  - |
+    If a node is in mid-deployment or cleaning and its conductor dies, ironic
+    will move that node into a failed state. However, this wasn't being done
+    if those nodes were also in maintenance. This has been fixed. See
+    `story 2007098 <https://storyboard.openstack.org/#!/story/2007098>`_ for
+    more details.