Fix issue where server fails to reboot

The iDRAC will return a "The command failed to set RequestedState" error
if the server is powered off and a reboot is requested.  In this
situation, convert the requested reboot into a power on to avoid this
error.

To minimize the chance of a race condition, it is critical to do this
check immediately before sending the power state change command.  This
keeps the window during which the server could change power states
without us knowing about it as small as possible.

Note that this error is returned in other scenarios as well.

Change-Id: I4c95959090423ff1929700f0fce596510447ac1c
Story: 2007487
Task: 39206
This commit is contained in:
Christopher Dearborn 2020-03-23 12:37:38 -04:00
parent 5f34c0e903
commit 95a3e1bdfd
4 changed files with 133 additions and 18 deletions

View File

@ -15,6 +15,8 @@
DRAC power interface
"""
import time
from ironic_lib import metrics_utils
from oslo_log import log as logging
from oslo_utils import importutils
@ -43,6 +45,10 @@ if drac_constants:
REVERSE_POWER_STATES = dict((v, k) for (k, v) in POWER_STATES.items())
POWER_STATE_TRIES = 15
POWER_STATE_SLEEP = 2
POWER_STATE_CHANGE_FAIL = 'The command failed to set RequestedState'
def _get_power_state(node):
"""Returns the current power state of the node.
@ -101,18 +107,60 @@ def _set_power_state(node, power_state):
_commit_boot_list_change(node)
client = drac_common.get_drac_client(node)
target_power_state = REVERSE_POWER_STATES[power_state]
tries = POWER_STATE_TRIES
try:
client.set_power_state(target_power_state)
except drac_exceptions.BaseClientException as exc:
LOG.error('DRAC driver failed to set power state for node '
'%(node_uuid)s to %(power_state)s. '
'Reason: %(error)s.',
{'node_uuid': node.uuid,
'power_state': power_state,
'error': exc})
raise exception.DracOperationError(error=exc)
# Cases have been seen where the iDRAC returns a SYS021 error even when
# the server is in the right power state and a valid power state change
# is attempted. Retry in this case.
while tries > 0:
# The iDRAC will return a SYS021 error if the server is powered off
# and a reboot is requested. In this situation, convert the requested
# reboot into a power on to avoid this error. To minimize the chance
# of a race condition, it is critical to do this check immediately
# before sending the power state change command. This keeps the
# window during which the server could change power states without us
# knowing about it as small as possible.
calc_power_state = power_state
if power_state == states.REBOOT:
current_power_state = _get_power_state(node)
# If the server is not on, then power it on instead of rebooting
if current_power_state != states.POWER_ON:
calc_power_state = states.POWER_ON
target_power_state = REVERSE_POWER_STATES[calc_power_state]
try:
client.set_power_state(target_power_state)
break
except drac_exceptions.BaseClientException as exc:
if (power_state == states.REBOOT
and POWER_STATE_CHANGE_FAIL in str(exc)
and tries > 0):
LOG.warning('DRAC driver failed to set power state for node '
'%(node_uuid)s to %(calc_power_state)s. '
'Reason: %(error)s. Retrying...',
{'node_uuid': node.uuid,
'calc_power_state': calc_power_state,
'error': exc})
tries -= 1
time.sleep(POWER_STATE_SLEEP)
else:
LOG.error('DRAC driver failed to set power state for node '
'%(node_uuid)s to %(calc_power_state)s. '
'Reason: %(error)s.',
{'node_uuid': node.uuid,
'calc_power_state': calc_power_state,
'error': exc})
raise exception.DracOperationError(error=exc)
if tries <= 0:
error_msg = (_('DRAC driver timed out while trying to set the power '
'state for node %(node_uuid)s to '
'%(calc_power_state)s.') %
{'node_uuid': node.uuid,
'calc_power_state': calc_power_state})
LOG.error(error_msg)
raise exception.DracOperationError(error_msg)
class DracRedfishPower(redfish_power.RedfishPower):
@ -199,13 +247,7 @@ class DracWSManPower(base.PowerInterface):
"timeout=%(timeout)s",
{'timeout': timeout})
current_power_state = _get_power_state(task.node)
if current_power_state == states.POWER_ON:
target_power_state = states.REBOOT
else:
target_power_state = states.POWER_ON
_set_power_state(task.node, target_power_state)
_set_power_state(task.node, states.REBOOT)
class DracPower(DracWSManPower):

View File

@ -144,3 +144,58 @@ class DracPowerTestCase(test_utils.BaseDracTest):
drac_power_state = drac_power.REVERSE_POWER_STATES[states.POWER_ON]
mock_client.set_power_state.assert_called_once_with(drac_power_state)
@mock.patch('time.sleep')
def test_reboot_retries_success(self, mock_sleep, mock_get_drac_client):
mock_client = mock_get_drac_client.return_value
mock_client.get_power_state.return_value = drac_constants.POWER_OFF
exc = drac_exceptions.DRACOperationFailed(
drac_messages=['The command failed to set RequestedState'])
mock_client.set_power_state.side_effect = [exc, None]
with task_manager.acquire(self.context, self.node.uuid,
shared=False) as task:
task.driver.power.reboot(task)
drac_power_state = drac_power.REVERSE_POWER_STATES[states.POWER_ON]
self.assertEqual(2, mock_client.set_power_state.call_count)
mock_client.set_power_state.assert_has_calls(
[mock.call(drac_power_state),
mock.call(drac_power_state)])
@mock.patch('time.sleep')
def test_reboot_retries_fail(self, mock_sleep, mock_get_drac_client):
mock_client = mock_get_drac_client.return_value
mock_client.get_power_state.return_value = drac_constants.POWER_OFF
exc = drac_exceptions.DRACOperationFailed(
drac_messages=['The command failed to set RequestedState'])
mock_client.set_power_state.side_effect = exc
with task_manager.acquire(self.context, self.node.uuid,
shared=False) as task:
self.assertRaises(exception.DracOperationError,
task.driver.power.reboot, task)
self.assertEqual(drac_power.POWER_STATE_TRIES,
mock_client.set_power_state.call_count)
@mock.patch('time.sleep')
def test_reboot_retries_power_change_success(self, mock_sleep,
mock_get_drac_client):
mock_client = mock_get_drac_client.return_value
mock_client.get_power_state.side_effect = [drac_constants.POWER_OFF,
drac_constants.POWER_ON]
exc = drac_exceptions.DRACOperationFailed(
drac_messages=['The command failed to set RequestedState'])
mock_client.set_power_state.side_effect = [exc, None]
with task_manager.acquire(self.context, self.node.uuid,
shared=False) as task:
task.driver.power.reboot(task)
self.assertEqual(2, mock_client.set_power_state.call_count)
drac_power_state1 = drac_power.REVERSE_POWER_STATES[states.POWER_ON]
drac_power_state2 = drac_power.REVERSE_POWER_STATES[states.REBOOT]
mock_client.set_power_state.assert_has_calls(
[mock.call(drac_power_state1),
mock.call(drac_power_state2)])

View File

@ -106,6 +106,17 @@ if not dracclient:
sys.modules['dracclient.exceptions'] = dracclient.exceptions
dracclient.exceptions.BaseClientException = type('BaseClientException',
(Exception,), {})
dracclient.exceptions.DRACRequestFailed = type(
'DRACRequestFailed', (dracclient.exceptions.BaseClientException,), {})
class DRACOperationFailed(dracclient.exceptions.DRACRequestFailed):
def __init__(self, **kwargs):
super(DRACOperationFailed, self).__init__(
'DRAC operation failed. Messages: %(drac_messages)s' % kwargs)
dracclient.exceptions.DRACOperationFailed = DRACOperationFailed
# Now that the external library has been mocked, if anything had already
# loaded any of the drivers, reload them.
if 'ironic.drivers.modules.drac' in sys.modules:

View File

@ -0,0 +1,7 @@
---
fixes:
- |
Fixed a bug where rebooting a node managed by the ``idrac`` hardware
type when using the WS-MAN power interface sometimes fails with a
``The command failed to set RequestedState`` error. See bug `2007487
<https://storyboard.openstack.org/#!/story/2007487>`_ for details.