Nodes in maintenance didn't fail, when they should have
In this code in base_manager.py, _fail_if_in_state() [1]: if the node is in maintenance, nothing is done. This means that when a node in maintenance is in mid deployment or cleaning and their conductor dies, it won't get put into a failed state [2]. This fixes it. [1]8294afa623/ironic/conductor/base_manager.py (L485)
[2]8294afa623/ironic/conductor/base_manager.py (L235)
Story #2007098 Task #38134 Change-Id: Ide70619271455685d09671ae16d744fc9ae58a02
This commit is contained in:
parent
c4352b66fe
commit
dd18c70543
@ -476,15 +476,19 @@ class BaseConductorManager(object):
|
||||
node_iter = self.iter_nodes(filters=filters,
|
||||
sort_key=sort_key,
|
||||
sort_dir='asc')
|
||||
|
||||
desired_maintenance = filters.get('maintenance')
|
||||
workers_count = 0
|
||||
for node_uuid, driver, conductor_group in node_iter:
|
||||
try:
|
||||
with task_manager.acquire(context, node_uuid,
|
||||
purpose='node state check') as task:
|
||||
if (task.node.maintenance
|
||||
or task.node.provision_state
|
||||
not in provision_state):
|
||||
# Check maintenance value since it could have changed
|
||||
# after the filtering was done.
|
||||
if (desired_maintenance is not None
|
||||
and desired_maintenance != task.node.maintenance):
|
||||
continue
|
||||
|
||||
if task.node.provision_state not in provision_state:
|
||||
continue
|
||||
|
||||
target_state = (None if not keep_target_state else
|
||||
|
@ -3188,6 +3188,7 @@ class ConductorManager(base_manager.BaseConductorManager):
|
||||
callback_timeout = CONF.conductor.inspect_wait_timeout
|
||||
|
||||
filters = {'reserved': False,
|
||||
'maintenance': False,
|
||||
'provision_state': states.INSPECTWAIT,
|
||||
'inspection_started_before': callback_timeout}
|
||||
sort_key = 'inspection_started_at'
|
||||
|
@ -521,3 +521,26 @@ class StartConsolesTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
|
||||
self.assertIsNone(test_node.last_error)
|
||||
self.assertTrue(log_mock.warning.called)
|
||||
self.assertFalse(mock_notify.called)
|
||||
|
||||
|
||||
class MiscTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
|
||||
def setUp(self):
|
||||
super(MiscTestCase, self).setUp()
|
||||
self._start_service()
|
||||
|
||||
def test__fail_transient_state(self):
|
||||
node = obj_utils.create_test_node(self.context,
|
||||
driver='fake-hardware',
|
||||
provision_state=states.DEPLOYING)
|
||||
self.service._fail_transient_state(states.DEPLOYING, 'unknown err')
|
||||
node.refresh()
|
||||
self.assertEqual(states.DEPLOYFAIL, node.provision_state)
|
||||
|
||||
def test__fail_transient_state_maintenance(self):
|
||||
node = obj_utils.create_test_node(self.context,
|
||||
driver='fake-hardware',
|
||||
maintenance=True,
|
||||
provision_state=states.DEPLOYING)
|
||||
self.service._fail_transient_state(states.DEPLOYING, 'unknown err')
|
||||
node.refresh()
|
||||
self.assertEqual(states.DEPLOYFAIL, node.provision_state)
|
||||
|
@ -8103,6 +8103,7 @@ class ManagerCheckInspectWaitTimeoutsTestCase(mgr_utils.CommonMixIn,
|
||||
self.task2 = self._create_task(node=self.node2)
|
||||
|
||||
self.filters = {'reserved': False,
|
||||
'maintenance': False,
|
||||
'inspection_started_before': 300,
|
||||
'provision_state': states.INSPECTWAIT}
|
||||
self.columns = ['uuid', 'driver', 'conductor_group']
|
||||
|
@ -0,0 +1,8 @@
|
||||
---
|
||||
fixes:
|
||||
- |
|
||||
If a node is in mid-deployment or cleaning and its conductor dies, ironic
|
||||
will move that node into a failed state. However, this wasn't being done
|
||||
if those nodes were also in maintenance. This has been fixed. See
|
||||
`story 2007098 <https://storyboard.openstack.org/#!/story/2007098>`_ for
|
||||
more details.
|
Loading…
Reference in New Issue
Block a user