Node gets stuck in ING state when conductor goes down

If a node is in an ING state such as INSPECTING or RESCUING and
the conductor goes down, the node gets stuck in that ING state
when the conductor comes back.

The (DEPLOYING, CLEANING) cases are already handled as expected,
but (INSPECTING, RESCUING, UNRESCUING, VERIFYING, ADOPTING) are
not. DELETING cannot be transitioned to a 'fail' state.

Change-Id: Ie6886ea78fac8bae81675dabf467939deb1c4460
Story: #2003147
Task: #23282
Dao Cong Tien 2018-07-30 17:02:09 +07:00
parent 0536854118
commit d70db292ae
5 changed files with 56 additions and 19 deletions
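
For quick reference, the net effect of the change is the mapping below from a stuck transient state to the state the node ends up in once the conductor restarts. It mirrors the state_trans table in the new unit test further down; the literal strings are intended to match the values of the corresponding ironic.common.states constants, and the helper function is only an illustration, not ironic code.

# Sketch of the recovery outcome, keyed by the node's stuck provision state.
# VERIFYING has no dedicated *FAIL state, so such nodes fall back to ENROLL.
RECOVERY_TARGETS = {
    'deploying': 'deploy failed',     # DEPLOYING  -> DEPLOYFAIL
    'cleaning': 'clean failed',       # CLEANING   -> CLEANFAIL
    'verifying': 'enroll',            # VERIFYING  -> ENROLL
    'inspecting': 'inspect failed',   # INSPECTING -> INSPECTFAIL
    'adopting': 'adopt failed',       # ADOPTING   -> ADOPTFAIL
    'rescuing': 'rescue failed',      # RESCUING   -> RESCUEFAIL
    'unrescuing': 'unrescue failed',  # UNRESCUING -> UNRESCUEFAIL
}

def state_after_conductor_restart(provision_state):
    """Return the provision state a stuck node should end up in."""
    return RECOVERY_TARGETS.get(provision_state, provision_state)

assert state_after_conductor_restart('rescuing') == 'rescue failed'
assert state_after_conductor_restart('active') == 'active'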


@ -234,6 +234,15 @@ UNSTABLE_STATES = (DEPLOYING, DEPLOYWAIT, CLEANING, CLEANWAIT, VERIFYING,
RESCUEWAIT, UNRESCUING)
"""States that can be changed without external request."""
STUCK_STATES_TREATED_AS_FAIL = (DEPLOYING, CLEANING, VERIFYING, INSPECTING,
ADOPTING, RESCUING, UNRESCUING)
"""States that cannot be resumed once a conductor dies.
If a node gets stuck in one of these states for some reason
(e.g. the conductor goes down while executing a task), the node
will be moved to a fail state.
"""
##############
# Power states
##############
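
The snippet below is a minimal, self-contained sketch of how a constant like STUCK_STATES_TREATED_AS_FAIL is consumed when the conductor starts: every node still in one of these states gets an explanatory last_error and is pushed through the state machine's 'fail' transition. FakeNode, its process_event(), and fail_stuck_nodes() are illustrative stand-ins rather than the real ironic objects; the actual conductor change follows in the next hunk.

from dataclasses import dataclass

STUCK_STATES = ('deploying', 'cleaning', 'verifying', 'inspecting',
                'adopting', 'rescuing', 'unrescuing')

@dataclass
class FakeNode:
    uuid: str
    provision_state: str
    last_error: str = ''
    events: tuple = ()

    def process_event(self, event):
        # Stand-in for ironic's provisioning state machine; the real 'fail'
        # event moves the node to the failure state matching its current one.
        self.events += (event,)

def fail_stuck_nodes(nodes, host):
    """Fail every node left in a transient state this conductor cannot resume."""
    for node in nodes:
        if node.provision_state in STUCK_STATES:
            node.last_error = ("The %s state can't be resumed by conductor "
                               "%s. Moving to fail state."
                               % (node.provision_state, host))
            node.process_event('fail')

nodes = [FakeNode('n1', 'rescuing'), FakeNode('n2', 'active')]
fail_stuck_nodes(nodes, 'conductor-1')
assert nodes[0].events == ('fail',) and nodes[0].last_error
assert nodes[1].events == ()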


@ -179,14 +179,12 @@ class BaseConductorManager(object):
self._periodic_tasks_worker.add_done_callback(
self._on_periodic_tasks_stop)
self._fail_transient_state(
states.DEPLOYING,
_("The deployment can't be resumed by conductor "
"%s. Moving to fail state.") % self.host)
self._fail_transient_state(
states.CLEANING,
_("The cleaning can't be resumed by conductor "
"%s. Moving to fail state.") % self.host)
for state in states.STUCK_STATES_TREATED_AS_FAIL:
self._fail_transient_state(
state,
_("The %(state)s state can't be resumed by conductor "
"%(host)s. Moving to fail state.") %
{'state': state, 'host': self.host})
# Start consoles if it set enabled in a greenthread.
try:


@ -13,6 +13,7 @@
"""Test class for Ironic BaseConductorManager."""
import collections
import uuid
import eventlet
import futurist
@ -24,6 +25,7 @@ from oslo_utils import uuidutils
from ironic.common import driver_factory
from ironic.common import exception
from ironic.common import states
from ironic.conductor import base_manager
from ironic.conductor import manager
from ironic.conductor import notification_utils
@ -206,6 +208,27 @@ class StartStopTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
self.assertRaisesRegex(RuntimeError, 'already running',
self.service.init_host)
def test_start_recover_nodes_stuck(self):
state_trans = [
(states.DEPLOYING, states.DEPLOYFAIL),
(states.CLEANING, states.CLEANFAIL),
(states.VERIFYING, states.ENROLL),
(states.INSPECTING, states.INSPECTFAIL),
(states.ADOPTING, states.ADOPTFAIL),
(states.RESCUING, states.RESCUEFAIL),
(states.UNRESCUING, states.UNRESCUEFAIL),
]
nodes = [obj_utils.create_test_node(self.context, uuid=uuid.uuid4(),
driver='fake-hardware',
provision_state=state[0])
for state in state_trans]
self._start_service()
for node, state in zip(nodes, state_trans):
node.refresh()
self.assertEqual(state[1], node.provision_state,
'Test failed when recovering from %s' % state[0])
@mock.patch.object(base_manager, 'LOG')
def test_warning_on_low_workers_pool(self, log_mock):
CONF.set_override('workers_pool_size', 3, 'conductor')


@ -4035,11 +4035,11 @@ class DoNodeRescueTestCase(mgr_utils.CommonMixIn, mgr_utils.ServiceSetUpMixin,
@mock.patch('ironic.drivers.modules.fake.FakeRescue.rescue')
def test__do_node_rescue_returns_rescuewait(self, mock_rescue):
self._start_service()
node = obj_utils.create_test_node(self.context, driver='fake-hardware',
provision_state=states.RESCUING,
instance_info={'rescue_password':
'password'})
self._start_service()
with task_manager.TaskManager(self.context, node.uuid) as task:
mock_rescue.return_value = states.RESCUEWAIT
self.service._do_node_rescue(task)
@ -4050,11 +4050,11 @@ class DoNodeRescueTestCase(mgr_utils.CommonMixIn, mgr_utils.ServiceSetUpMixin,
@mock.patch('ironic.drivers.modules.fake.FakeRescue.rescue')
def test__do_node_rescue_returns_rescue(self, mock_rescue):
self._start_service()
node = obj_utils.create_test_node(self.context, driver='fake-hardware',
provision_state=states.RESCUING,
instance_info={'rescue_password':
'password'})
self._start_service()
with task_manager.TaskManager(self.context, node.uuid) as task:
mock_rescue.return_value = states.RESCUE
self.service._do_node_rescue(task)
@ -4066,11 +4066,11 @@ class DoNodeRescueTestCase(mgr_utils.CommonMixIn, mgr_utils.ServiceSetUpMixin,
@mock.patch.object(manager, 'LOG')
@mock.patch('ironic.drivers.modules.fake.FakeRescue.rescue')
def test__do_node_rescue_errors(self, mock_rescue, mock_log):
self._start_service()
node = obj_utils.create_test_node(self.context, driver='fake-hardware',
provision_state=states.RESCUING,
instance_info={'rescue_password':
'password'})
self._start_service()
mock_rescue.side_effect = exception.InstanceRescueFailure(
'failed to rescue')
with task_manager.TaskManager(self.context, node.uuid) as task:
@ -4086,11 +4086,11 @@ class DoNodeRescueTestCase(mgr_utils.CommonMixIn, mgr_utils.ServiceSetUpMixin,
@mock.patch.object(manager, 'LOG')
@mock.patch('ironic.drivers.modules.fake.FakeRescue.rescue')
def test__do_node_rescue_bad_state(self, mock_rescue, mock_log):
self._start_service()
node = obj_utils.create_test_node(self.context, driver='fake-hardware',
provision_state=states.RESCUING,
instance_info={'rescue_password':
'password'})
self._start_service()
mock_rescue.return_value = states.ACTIVE
with task_manager.TaskManager(self.context, node.uuid) as task:
self.service._do_node_rescue(task)
@ -4156,11 +4156,11 @@ class DoNodeRescueTestCase(mgr_utils.CommonMixIn, mgr_utils.ServiceSetUpMixin,
@mock.patch('ironic.drivers.modules.fake.FakeRescue.unrescue')
def test__do_node_unrescue(self, mock_unrescue):
self._start_service()
node = obj_utils.create_test_node(self.context, driver='fake-hardware',
provision_state=states.UNRESCUING,
target_provision_state=states.ACTIVE,
instance_info={})
self._start_service()
with task_manager.TaskManager(self.context, node.uuid) as task:
mock_unrescue.return_value = states.ACTIVE
self.service._do_node_unrescue(task)
@ -4171,11 +4171,11 @@ class DoNodeRescueTestCase(mgr_utils.CommonMixIn, mgr_utils.ServiceSetUpMixin,
@mock.patch.object(manager, 'LOG')
@mock.patch('ironic.drivers.modules.fake.FakeRescue.unrescue')
def test__do_node_unrescue_ironic_error(self, mock_unrescue, mock_log):
self._start_service()
node = obj_utils.create_test_node(self.context, driver='fake-hardware',
provision_state=states.UNRESCUING,
target_provision_state=states.ACTIVE,
instance_info={})
self._start_service()
mock_unrescue.side_effect = exception.InstanceUnrescueFailure(
'Unable to unrescue')
with task_manager.TaskManager(self.context, node.uuid) as task:
@ -4190,11 +4190,11 @@ class DoNodeRescueTestCase(mgr_utils.CommonMixIn, mgr_utils.ServiceSetUpMixin,
@mock.patch.object(manager, 'LOG')
@mock.patch('ironic.drivers.modules.fake.FakeRescue.unrescue')
def test__do_node_unrescue_other_error(self, mock_unrescue, mock_log):
self._start_service()
node = obj_utils.create_test_node(self.context, driver='fake-hardware',
provision_state=states.UNRESCUING,
target_provision_state=states.ACTIVE,
instance_info={})
self._start_service()
mock_unrescue.side_effect = RuntimeError('Some failure')
with task_manager.TaskManager(self.context, node.uuid) as task:
self.assertRaises(RuntimeError,
@ -4207,10 +4207,10 @@ class DoNodeRescueTestCase(mgr_utils.CommonMixIn, mgr_utils.ServiceSetUpMixin,
@mock.patch('ironic.drivers.modules.fake.FakeRescue.unrescue')
def test__do_node_unrescue_bad_state(self, mock_unrescue):
self._start_service()
node = obj_utils.create_test_node(self.context, driver='fake-hardware',
provision_state=states.UNRESCUING,
instance_info={})
self._start_service()
mock_unrescue.return_value = states.RESCUEWAIT
with task_manager.TaskManager(self.context, node.uuid) as task:
self.service._do_node_unrescue(task)
@ -4272,6 +4272,7 @@ class DoNodeVerifyTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
@mock.patch('ironic.drivers.modules.fake.FakePower.validate')
def test__do_node_verify(self, mock_validate, mock_get_power_state,
mock_notif):
self._start_service()
mock_get_power_state.return_value = states.POWER_OFF
# Required for exception handling
mock_notif.__name__ = 'NodeCorrectedPowerStateNotification'
@ -4282,7 +4283,6 @@ class DoNodeVerifyTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
last_error=None,
power_state=states.NOSTATE)
self._start_service()
with task_manager.acquire(
self.context, node['id'], shared=False) as task:
self.service._do_node_verify(task)
@ -4311,6 +4311,7 @@ class DoNodeVerifyTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
@mock.patch('ironic.drivers.modules.fake.FakePower.validate')
def test__do_node_verify_validation_fails(self, mock_validate,
mock_get_power_state):
self._start_service()
node = obj_utils.create_test_node(
self.context, driver='fake-hardware',
provision_state=states.VERIFYING,
@ -4320,7 +4321,6 @@ class DoNodeVerifyTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
mock_validate.side_effect = RuntimeError("boom")
self._start_service()
with task_manager.acquire(
self.context, node['id'], shared=False) as task:
self.service._do_node_verify(task)
@ -4339,6 +4339,7 @@ class DoNodeVerifyTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
@mock.patch('ironic.drivers.modules.fake.FakePower.validate')
def test__do_node_verify_get_state_fails(self, mock_validate,
mock_get_power_state):
self._start_service()
node = obj_utils.create_test_node(
self.context, driver='fake-hardware',
provision_state=states.VERIFYING,
@ -4348,7 +4349,6 @@ class DoNodeVerifyTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
mock_get_power_state.side_effect = RuntimeError("boom")
self._start_service()
with task_manager.acquire(
self.context, node['id'], shared=False) as task:
self.service._do_node_verify(task)


@ -0,0 +1,7 @@
---
fixes:
- If a node gets stuck in one of the states ``deploying``, ``cleaning``,
``verifying``, ``inspecting``, ``adopting``, ``rescuing``, ``unrescuing``
for some reason (e.g. the conductor goes down while executing a task), it
will be moved to an appropriate failure state the next time the conductor
starts.