Node gets stuck in ING state when conductor goes down
If a node is in an ING state such as INSPECTING or RESCUING and the conductor goes down, the node remains stuck in that ING state when the conductor comes back up. The DEPLOYING and CLEANING cases are already handled as expected, but INSPECTING, RESCUING, UNRESCUING, VERIFYING and ADOPTING are not. DELETING cannot be transitioned to a 'fail' state.

Change-Id: Ie6886ea78fac8bae81675dabf467939deb1c4460
Story: #2003147
Task: #23282
parent 0536854118
commit d70db292ae
@@ -234,6 +234,15 @@ UNSTABLE_STATES = (DEPLOYING, DEPLOYWAIT, CLEANING, CLEANWAIT, VERIFYING,
                   RESCUEWAIT, UNRESCUING)
"""States that can be changed without external request."""

STUCK_STATES_TREATED_AS_FAIL = (DEPLOYING, CLEANING, VERIFYING, INSPECTING,
                                ADOPTING, RESCUING, UNRESCUING)
"""States that cannot be resumed once a conductor dies.

If a node gets stuck with one of these states for some reason
(eg. conductor goes down when executing task), node will be moved
to fail state.
"""

##############
# Power states
##############
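The new STUCK_STATES_TREATED_AS_FAIL tuple is plain data, so other code can consume it with a simple membership test. A minimal usage sketch (the helper name is invented for illustration; it only assumes that ironic.common.states exposes the tuple added above):

```python
# Hypothetical usage sketch -- not part of this change.
from ironic.common import states


def needs_forced_failure(provision_state):
    """True if a conductor crash leaves a node in this state unresumable."""
    return provision_state in states.STUCK_STATES_TREATED_AS_FAIL
```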
@@ -179,14 +179,12 @@ class BaseConductorManager(object):
        self._periodic_tasks_worker.add_done_callback(
            self._on_periodic_tasks_stop)

        self._fail_transient_state(
            states.DEPLOYING,
            _("The deployment can't be resumed by conductor "
              "%s. Moving to fail state.") % self.host)
        self._fail_transient_state(
            states.CLEANING,
            _("The cleaning can't be resumed by conductor "
              "%s. Moving to fail state.") % self.host)
        for state in states.STUCK_STATES_TREATED_AS_FAIL:
            self._fail_transient_state(
                state,
                _("The %(state)s state can't be resumed by conductor "
                  "%(host)s. Moving to fail state.") %
                {'state': state, 'host': self.host})

        # Start consoles if it set enabled in a greenthread.
        try:
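The patched init_host() above loops over STUCK_STATES_TREATED_AS_FAIL and delegates the actual work to _fail_transient_state(), whose body is outside this diff. Below is a self-contained sketch of the recovery sweep it performs; the in-memory node store, the field names and the helper signatures are assumptions made for illustration, and the fail-state mapping mirrors the new unit test further down rather than Ironic's state machine code.

```python
# Standalone illustration of the start-up recovery sweep. Nothing below is
# Ironic code; nodes are plain dicts and the targets mirror the new unit test.
STUCK_STATES_TREATED_AS_FAIL = ('DEPLOYING', 'CLEANING', 'VERIFYING',
                                'INSPECTING', 'ADOPTING', 'RESCUING',
                                'UNRESCUING')

# Provision state each stuck state is driven to on recovery
# (VERIFYING simply falls back to ENROLL, as the test asserts).
FAIL_TARGET = {
    'DEPLOYING': 'DEPLOYFAIL',
    'CLEANING': 'CLEANFAIL',
    'VERIFYING': 'ENROLL',
    'INSPECTING': 'INSPECTFAIL',
    'ADOPTING': 'ADOPTFAIL',
    'RESCUING': 'RESCUEFAIL',
    'UNRESCUING': 'UNRESCUEFAIL',
}


def fail_transient_state(nodes, host, state, last_error):
    """Move this conductor's nodes stuck in `state` to the matching fail state."""
    for node in nodes:
        if node['conductor'] == host and node['provision_state'] == state:
            node['provision_state'] = FAIL_TARGET[state]
            node['last_error'] = last_error


def recover_stuck_nodes(nodes, host):
    """What the patched init_host() does, reduced to its essentials."""
    for state in STUCK_STATES_TREATED_AS_FAIL:
        fail_transient_state(
            nodes, host, state,
            "The %(state)s state can't be resumed by conductor "
            "%(host)s. Moving to fail state." % {'state': state, 'host': host})


if __name__ == '__main__':
    nodes = [{'conductor': 'cond-1', 'provision_state': 'RESCUING',
              'last_error': None}]
    recover_stuck_nodes(nodes, 'cond-1')
    print(nodes[0]['provision_state'])  # -> RESCUEFAIL
```

Keeping the transient-to-fail policy in a single tuple means a future transient state only needs to be appended there for the recovery sweep to cover it.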
@@ -13,6 +13,7 @@
"""Test class for Ironic BaseConductorManager."""

import collections
import uuid

import eventlet
import futurist
@@ -24,6 +25,7 @@ from oslo_utils import uuidutils

from ironic.common import driver_factory
from ironic.common import exception
from ironic.common import states
from ironic.conductor import base_manager
from ironic.conductor import manager
from ironic.conductor import notification_utils
@@ -206,6 +208,27 @@ class StartStopTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
        self.assertRaisesRegex(RuntimeError, 'already running',
                               self.service.init_host)

    def test_start_recover_nodes_stuck(self):
        state_trans = [
            (states.DEPLOYING, states.DEPLOYFAIL),
            (states.CLEANING, states.CLEANFAIL),
            (states.VERIFYING, states.ENROLL),
            (states.INSPECTING, states.INSPECTFAIL),
            (states.ADOPTING, states.ADOPTFAIL),
            (states.RESCUING, states.RESCUEFAIL),
            (states.UNRESCUING, states.UNRESCUEFAIL),
        ]
        nodes = [obj_utils.create_test_node(self.context, uuid=uuid.uuid4(),
                                            driver='fake-hardware',
                                            provision_state=state[0])
                 for state in state_trans]

        self._start_service()
        for node, state in zip(nodes, state_trans):
            node.refresh()
            self.assertEqual(state[1], node.provision_state,
                             'Test failed when recovering from %s' % state[0])

    @mock.patch.object(base_manager, 'LOG')
    def test_warning_on_low_workers_pool(self, log_mock):
        CONF.set_override('workers_pool_size', 3, 'conductor')
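Two small observations on the tests. First, the commit message notes that DELETING cannot be transitioned to a fail state, which is why it is deliberately absent from STUCK_STATES_TREATED_AS_FAIL; a tiny guard test along these lines (hypothetical, not part of this change) could pin that invariant down:

```python
# Hypothetical guard test -- not part of this change.
from ironic.common import states


def test_deleting_is_not_treated_as_fail():
    # DELETING has no usable fail target, so the start-up recovery sweep
    # must leave nodes that are in DELETING untouched.
    assert states.DELETING not in states.STUCK_STATES_TREATED_AS_FAIL
```

Second, in the rescue, unrescue and verify tests that follow, self._start_service() moves ahead of the node creation (the later call, after create_test_node, is the one being removed). Presumably this is because init_host() now sweeps nodes stuck in RESCUING, UNRESCUING and VERIFYING, so a node created before the service started would be pushed out of its intended state before the test body ran.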
@@ -4035,11 +4035,11 @@ class DoNodeRescueTestCase(mgr_utils.CommonMixIn, mgr_utils.ServiceSetUpMixin,

    @mock.patch('ironic.drivers.modules.fake.FakeRescue.rescue')
    def test__do_node_rescue_returns_rescuewait(self, mock_rescue):
        self._start_service()
        node = obj_utils.create_test_node(self.context, driver='fake-hardware',
                                          provision_state=states.RESCUING,
                                          instance_info={'rescue_password':
                                                         'password'})
        self._start_service()
        with task_manager.TaskManager(self.context, node.uuid) as task:
            mock_rescue.return_value = states.RESCUEWAIT
            self.service._do_node_rescue(task)
@@ -4050,11 +4050,11 @@ class DoNodeRescueTestCase(mgr_utils.CommonMixIn, mgr_utils.ServiceSetUpMixin,

    @mock.patch('ironic.drivers.modules.fake.FakeRescue.rescue')
    def test__do_node_rescue_returns_rescue(self, mock_rescue):
        self._start_service()
        node = obj_utils.create_test_node(self.context, driver='fake-hardware',
                                          provision_state=states.RESCUING,
                                          instance_info={'rescue_password':
                                                         'password'})
        self._start_service()
        with task_manager.TaskManager(self.context, node.uuid) as task:
            mock_rescue.return_value = states.RESCUE
            self.service._do_node_rescue(task)
@@ -4066,11 +4066,11 @@ class DoNodeRescueTestCase(mgr_utils.CommonMixIn, mgr_utils.ServiceSetUpMixin,
    @mock.patch.object(manager, 'LOG')
    @mock.patch('ironic.drivers.modules.fake.FakeRescue.rescue')
    def test__do_node_rescue_errors(self, mock_rescue, mock_log):
        self._start_service()
        node = obj_utils.create_test_node(self.context, driver='fake-hardware',
                                          provision_state=states.RESCUING,
                                          instance_info={'rescue_password':
                                                         'password'})
        self._start_service()
        mock_rescue.side_effect = exception.InstanceRescueFailure(
            'failed to rescue')
        with task_manager.TaskManager(self.context, node.uuid) as task:
@@ -4086,11 +4086,11 @@ class DoNodeRescueTestCase(mgr_utils.CommonMixIn, mgr_utils.ServiceSetUpMixin,
    @mock.patch.object(manager, 'LOG')
    @mock.patch('ironic.drivers.modules.fake.FakeRescue.rescue')
    def test__do_node_rescue_bad_state(self, mock_rescue, mock_log):
        self._start_service()
        node = obj_utils.create_test_node(self.context, driver='fake-hardware',
                                          provision_state=states.RESCUING,
                                          instance_info={'rescue_password':
                                                         'password'})
        self._start_service()
        mock_rescue.return_value = states.ACTIVE
        with task_manager.TaskManager(self.context, node.uuid) as task:
            self.service._do_node_rescue(task)
@@ -4156,11 +4156,11 @@ class DoNodeRescueTestCase(mgr_utils.CommonMixIn, mgr_utils.ServiceSetUpMixin,

    @mock.patch('ironic.drivers.modules.fake.FakeRescue.unrescue')
    def test__do_node_unrescue(self, mock_unrescue):
        self._start_service()
        node = obj_utils.create_test_node(self.context, driver='fake-hardware',
                                          provision_state=states.UNRESCUING,
                                          target_provision_state=states.ACTIVE,
                                          instance_info={})
        self._start_service()
        with task_manager.TaskManager(self.context, node.uuid) as task:
            mock_unrescue.return_value = states.ACTIVE
            self.service._do_node_unrescue(task)
@@ -4171,11 +4171,11 @@ class DoNodeRescueTestCase(mgr_utils.CommonMixIn, mgr_utils.ServiceSetUpMixin,
    @mock.patch.object(manager, 'LOG')
    @mock.patch('ironic.drivers.modules.fake.FakeRescue.unrescue')
    def test__do_node_unrescue_ironic_error(self, mock_unrescue, mock_log):
        self._start_service()
        node = obj_utils.create_test_node(self.context, driver='fake-hardware',
                                          provision_state=states.UNRESCUING,
                                          target_provision_state=states.ACTIVE,
                                          instance_info={})
        self._start_service()
        mock_unrescue.side_effect = exception.InstanceUnrescueFailure(
            'Unable to unrescue')
        with task_manager.TaskManager(self.context, node.uuid) as task:
@@ -4190,11 +4190,11 @@ class DoNodeRescueTestCase(mgr_utils.CommonMixIn, mgr_utils.ServiceSetUpMixin,
    @mock.patch.object(manager, 'LOG')
    @mock.patch('ironic.drivers.modules.fake.FakeRescue.unrescue')
    def test__do_node_unrescue_other_error(self, mock_unrescue, mock_log):
        self._start_service()
        node = obj_utils.create_test_node(self.context, driver='fake-hardware',
                                          provision_state=states.UNRESCUING,
                                          target_provision_state=states.ACTIVE,
                                          instance_info={})
        self._start_service()
        mock_unrescue.side_effect = RuntimeError('Some failure')
        with task_manager.TaskManager(self.context, node.uuid) as task:
            self.assertRaises(RuntimeError,
@@ -4207,10 +4207,10 @@ class DoNodeRescueTestCase(mgr_utils.CommonMixIn, mgr_utils.ServiceSetUpMixin,

    @mock.patch('ironic.drivers.modules.fake.FakeRescue.unrescue')
    def test__do_node_unrescue_bad_state(self, mock_unrescue):
        self._start_service()
        node = obj_utils.create_test_node(self.context, driver='fake-hardware',
                                          provision_state=states.UNRESCUING,
                                          instance_info={})
        self._start_service()
        mock_unrescue.return_value = states.RESCUEWAIT
        with task_manager.TaskManager(self.context, node.uuid) as task:
            self.service._do_node_unrescue(task)
@@ -4272,6 +4272,7 @@ class DoNodeVerifyTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
    @mock.patch('ironic.drivers.modules.fake.FakePower.validate')
    def test__do_node_verify(self, mock_validate, mock_get_power_state,
                             mock_notif):
        self._start_service()
        mock_get_power_state.return_value = states.POWER_OFF
        # Required for exception handling
        mock_notif.__name__ = 'NodeCorrectedPowerStateNotification'
@@ -4282,7 +4283,6 @@ class DoNodeVerifyTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
            last_error=None,
            power_state=states.NOSTATE)

        self._start_service()
        with task_manager.acquire(
                self.context, node['id'], shared=False) as task:
            self.service._do_node_verify(task)
@@ -4311,6 +4311,7 @@ class DoNodeVerifyTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
    @mock.patch('ironic.drivers.modules.fake.FakePower.validate')
    def test__do_node_verify_validation_fails(self, mock_validate,
                                              mock_get_power_state):
        self._start_service()
        node = obj_utils.create_test_node(
            self.context, driver='fake-hardware',
            provision_state=states.VERIFYING,
@@ -4320,7 +4321,6 @@ class DoNodeVerifyTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):

        mock_validate.side_effect = RuntimeError("boom")

        self._start_service()
        with task_manager.acquire(
                self.context, node['id'], shared=False) as task:
            self.service._do_node_verify(task)
@@ -4339,6 +4339,7 @@ class DoNodeVerifyTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
    @mock.patch('ironic.drivers.modules.fake.FakePower.validate')
    def test__do_node_verify_get_state_fails(self, mock_validate,
                                             mock_get_power_state):
        self._start_service()
        node = obj_utils.create_test_node(
            self.context, driver='fake-hardware',
            provision_state=states.VERIFYING,
@@ -4348,7 +4349,6 @@ class DoNodeVerifyTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):

        mock_get_power_state.side_effect = RuntimeError("boom")

        self._start_service()
        with task_manager.acquire(
                self.context, node['id'], shared=False) as task:
            self.service._do_node_verify(task)
@@ -0,0 +1,7 @@
---
fixes:
  - If a node gets stuck in one of the states ``deploying``, ``cleaning``,
    ``verifying``, ``inspecting``, ``adopting``, ``rescuing`` or ``unrescuing``
    for some reason (e.g. the conductor goes down while executing a task), it
    will be moved to an appropriate failure state the next time the conductor
    starts.