redfish: handle hardware that is unable to set persistent boot

Some hardware has dropped support for continuous boot, see this thread:
http://lists.openstack.org/pipermail/openstack-discuss/2020-April/014543.html

Work around by falling back to one-time boot device, restoring it on
every reboot or power on.

Story: #2007527
Task: #39320
Change-Id: I762056dea4f7b8ccdb8f95b2f21e26b45763905d
This commit is contained in:
Dmitry Tantsur 2020-05-04 13:34:05 +02:00
parent 8562df092f
commit 18a8e2f6e9
5 changed files with 206 additions and 13 deletions
ironic
drivers/modules/redfish
tests/unit/drivers/modules/redfish
releasenotes/notes

@ -24,6 +24,7 @@ from ironic.common import components
from ironic.common import exception
from ironic.common.i18n import _
from ironic.common import indicator_states
from ironic.common import utils
from ironic.conductor import task_manager
from ironic.drivers import base
from ironic.drivers.modules.redfish import utils as redfish_utils
@ -68,6 +69,42 @@ if sushy:
v: k for k, v in INDICATOR_MAP.items()}
def _set_boot_device(task, system, device, enabled=None):
"""An internal routine to set the boot device.
:param task: a task from TaskManager.
:param system: a Redfish System object.
:param device: the Redfish boot device.
:param enabled: Redfish boot device persistence value or None.
:raises: SushyError on an error from the Sushy library
"""
try:
system.set_system_boot_options(device, enabled=enabled)
except sushy.exceptions.SushyError as e:
if enabled == sushy.BOOT_SOURCE_ENABLED_CONTINUOUS:
# NOTE(dtantsur): continuous boot device settings have been
# removed from Redfish, and some vendors stopped supporting
# it before an alternative was provided. As a work around,
# use one-time boot and restore the boot device on every
# reboot via RedfishPower.
LOG.debug('Error %(error)s when trying to set a '
'persistent boot device on node %(node)s, '
'falling back to one-time boot settings',
{'error': e, 'node': task.node.uuid})
system.set_system_boot_options(
device, enabled=sushy.BOOT_SOURCE_ENABLED_ONCE)
LOG.warning('Could not set persistent boot device to '
'%(dev)s for node %(node)s, using one-time '
'boot device instead',
{'dev': device, 'node': task.node.uuid})
utils.set_node_nested_field(
task.node, 'driver_internal_info',
'redfish_boot_device', device)
task.node.save()
else:
raise
class RedfishManagement(base.ManagementInterface):
def __init__(self):
@ -107,6 +144,33 @@ class RedfishManagement(base.ManagementInterface):
"""
return list(BOOT_DEVICE_MAP_REV)
@task_manager.require_exclusive_lock
def restore_boot_device(self, task, system):
"""Restore boot device if needed.
Checks the redfish_boot_device internal flag and sets the one-time
boot device accordingly. A warning is issued if it fails.
This method is supposed to be called from the Redfish power interface
and should be considered private to the Redfish hardware type.
:param task: a task from TaskManager.
:param system: a Redfish System object.
"""
device = task.node.driver_internal_info.get('redfish_boot_device')
if not device:
return
LOG.debug('Restoring boot device %(dev)s on node %(node)s',
{'dev': device, 'node': task.node.uuid})
try:
_set_boot_device(task, system, device)
except sushy.exceptions.SushyError as e:
LOG.warning('Unable to recover boot device %(dev)s for node '
'%(node)s, relying on the pre-configured boot order. '
'Error: %(error)s',
{'dev': device, 'node': task.node.uuid, 'error': e})
@task_manager.require_exclusive_lock
def set_boot_device(self, task, device, persistent=False):
"""Set the boot device for a node.
@ -124,6 +188,10 @@ class RedfishManagement(base.ManagementInterface):
:raises: RedfishConnectionError when it fails to connect to Redfish
:raises: RedfishError on an error from the Sushy library
"""
utils.pop_node_nested_field(
task.node, 'driver_internal_info', 'redfish_boot_device')
task.node.save()
system = redfish_utils.get_system(task.node)
desired_persistence = BOOT_DEVICE_PERSISTENT_MAP_REV[persistent]
@ -132,11 +200,9 @@ class RedfishManagement(base.ManagementInterface):
# NOTE(etingof): this can be racy, esp if BMC is not RESTful
enabled = (desired_persistence
if desired_persistence != current_persistence else None)
try:
system.set_system_boot_options(
BOOT_DEVICE_MAP_REV[device], enabled=enabled)
_set_boot_device(task, system, BOOT_DEVICE_MAP_REV[device],
enabled=enabled)
except sushy.exceptions.SushyError as e:
error_msg = (_('Redfish set boot device failed for node '
'%(node)s. Error: %(error)s') %

@ -22,6 +22,7 @@ from ironic.common import states
from ironic.conductor import task_manager
from ironic.conductor import utils as cond_utils
from ironic.drivers import base
from ironic.drivers.modules.redfish import management as redfish_mgmt
from ironic.drivers.modules.redfish import utils as redfish_utils
LOG = log.getLogger(__name__)
@ -123,6 +124,12 @@ class RedfishPower(base.PowerInterface):
:raises: RedfishError on an error from the Sushy library
"""
system = redfish_utils.get_system(task.node)
if (power_state in (states.POWER_ON, states.SOFT_REBOOT, states.REBOOT)
and isinstance(task.driver.management,
redfish_mgmt.RedfishManagement)):
task.driver.management.restore_boot_device(task, system)
try:
_set_power_state(task, system, power_state, timeout=timeout)
except sushy.exceptions.SushyError as e:
@ -151,6 +158,10 @@ class RedfishPower(base.PowerInterface):
next_state = states.POWER_OFF
_set_power_state(task, system, next_state, timeout=timeout)
if isinstance(task.driver.management,
redfish_mgmt.RedfishManagement):
task.driver.management.restore_boot_device(task, system)
next_state = states.POWER_ON
_set_power_state(task, system, next_state, timeout=timeout)
except sushy.exceptions.SushyError as e:

@ -100,6 +100,8 @@ class RedfishManagementTestCase(db_base.DbTestCase):
fake_system.set_system_boot_options.assert_called_once_with(
expected, enabled=sushy.BOOT_SOURCE_ENABLED_ONCE)
mock_get_system.assert_called_once_with(task.node)
self.assertNotIn('redfish_boot_device',
task.node.driver_internal_info)
# Reset mocks
fake_system.set_system_boot_options.reset_mock()
@ -123,6 +125,8 @@ class RedfishManagementTestCase(db_base.DbTestCase):
fake_system.set_system_boot_options.assert_called_once_with(
sushy.BOOT_SOURCE_TARGET_PXE, enabled=expected)
mock_get_system.assert_called_once_with(task.node)
self.assertNotIn('redfish_boot_device',
task.node.driver_internal_info)
# Reset mocks
fake_system.set_system_boot_options.reset_mock()
@ -170,6 +174,82 @@ class RedfishManagementTestCase(db_base.DbTestCase):
sushy.BOOT_SOURCE_TARGET_PXE,
enabled=sushy.BOOT_SOURCE_ENABLED_ONCE)
mock_get_system.assert_called_once_with(task.node)
self.assertNotIn('redfish_boot_device',
task.node.driver_internal_info)
@mock.patch.object(sushy, 'Sushy', autospec=True)
@mock.patch.object(redfish_utils, 'get_system', autospec=True)
def test_set_boot_device_persistence_fallback(self, mock_get_system,
mock_sushy):
fake_system = mock.Mock()
fake_system.set_system_boot_options.side_effect = [
sushy.exceptions.SushyError(),
None,
]
mock_get_system.return_value = fake_system
with task_manager.acquire(self.context, self.node.uuid,
shared=False) as task:
task.driver.management.set_boot_device(
task, boot_devices.PXE, persistent=True)
fake_system.set_system_boot_options.assert_has_calls([
mock.call(sushy.BOOT_SOURCE_TARGET_PXE,
enabled=sushy.BOOT_SOURCE_ENABLED_CONTINUOUS),
mock.call(sushy.BOOT_SOURCE_TARGET_PXE,
enabled=sushy.BOOT_SOURCE_ENABLED_ONCE),
])
mock_get_system.assert_called_once_with(task.node)
task.node.refresh()
self.assertEqual(
sushy.BOOT_SOURCE_TARGET_PXE,
task.node.driver_internal_info['redfish_boot_device'])
def test_restore_boot_device(self):
fake_system = mock.Mock()
with task_manager.acquire(self.context, self.node.uuid,
shared=False) as task:
task.node.driver_internal_info['redfish_boot_device'] = (
sushy.BOOT_SOURCE_TARGET_HDD
)
task.driver.management.restore_boot_device(task, fake_system)
fake_system.set_system_boot_options.assert_called_once_with(
sushy.BOOT_SOURCE_TARGET_HDD, enabled=None)
# The stored boot device is kept intact
self.assertEqual(
sushy.BOOT_SOURCE_TARGET_HDD,
task.node.driver_internal_info['redfish_boot_device'])
def test_restore_boot_device_noop(self):
fake_system = mock.Mock()
with task_manager.acquire(self.context, self.node.uuid,
shared=False) as task:
task.driver.management.restore_boot_device(task, fake_system)
self.assertFalse(fake_system.set_system_boot_options.called)
@mock.patch.object(redfish_mgmt.LOG, 'warning', autospec=True)
def test_restore_boot_device_failure(self, mock_log):
fake_system = mock.Mock()
fake_system.set_system_boot_options.side_effect = (
sushy.exceptions.SushyError()
)
with task_manager.acquire(self.context, self.node.uuid,
shared=False) as task:
task.node.driver_internal_info['redfish_boot_device'] = (
sushy.BOOT_SOURCE_TARGET_HDD
)
task.driver.management.restore_boot_device(task, fake_system)
fake_system.set_system_boot_options.assert_called_once_with(
sushy.BOOT_SOURCE_TARGET_HDD, enabled=None)
self.assertTrue(mock_log.called)
# The stored boot device is kept intact
self.assertEqual(
sushy.BOOT_SOURCE_TARGET_HDD,
task.node.driver_internal_info['redfish_boot_device'])
@mock.patch.object(redfish_utils, 'get_system', autospec=True)
def test_get_boot_device(self, mock_get_system):

@ -20,6 +20,7 @@ from oslo_utils import importutils
from ironic.common import exception
from ironic.common import states
from ironic.conductor import task_manager
from ironic.drivers.modules.redfish import management as redfish_mgmt
from ironic.drivers.modules.redfish import power as redfish_power
from ironic.drivers.modules.redfish import utils as redfish_utils
from ironic.tests.unit.db import base as db_base
@ -84,19 +85,21 @@ class RedfishPowerTestCase(db_base.DbTestCase):
mock_get_system.assert_called_once_with(task.node)
mock_get_system.reset_mock()
@mock.patch.object(redfish_mgmt.RedfishManagement, 'restore_boot_device',
autospec=True)
@mock.patch.object(redfish_utils, 'get_system', autospec=True)
def test_set_power_state(self, mock_get_system):
def test_set_power_state(self, mock_get_system, mock_restore_bootdev):
with task_manager.acquire(self.context, self.node.uuid,
shared=False) as task:
expected_values = [
(states.POWER_ON, sushy.RESET_ON),
(states.POWER_OFF, sushy.RESET_FORCE_OFF),
(states.REBOOT, sushy.RESET_FORCE_RESTART),
(states.SOFT_REBOOT, sushy.RESET_GRACEFUL_RESTART),
(states.SOFT_POWER_OFF, sushy.RESET_GRACEFUL_SHUTDOWN)
(states.POWER_ON, sushy.RESET_ON, True),
(states.POWER_OFF, sushy.RESET_FORCE_OFF, False),
(states.REBOOT, sushy.RESET_FORCE_RESTART, True),
(states.SOFT_REBOOT, sushy.RESET_GRACEFUL_RESTART, True),
(states.SOFT_POWER_OFF, sushy.RESET_GRACEFUL_SHUTDOWN, False)
]
for target, expected in expected_values:
for target, expected, restore_bootdev in expected_values:
if target in (states.POWER_OFF, states.SOFT_POWER_OFF):
final = sushy.SYSTEM_POWER_STATE_OFF
transient = sushy.SYSTEM_POWER_STATE_ON
@ -115,9 +118,15 @@ class RedfishPowerTestCase(db_base.DbTestCase):
system_result[0].reset_system.assert_called_once_with(expected)
mock_get_system.assert_called_with(task.node)
self.assertEqual(4, mock_get_system.call_count)
if restore_bootdev:
mock_restore_bootdev.assert_called_once_with(
task.driver.management, task, system_result[0])
else:
self.assertFalse(mock_restore_bootdev.called)
# Reset mocks
mock_get_system.reset_mock()
mock_restore_bootdev.reset_mock()
@mock.patch.object(redfish_utils, 'get_system', autospec=True)
def test_set_power_state_not_reached(self, mock_get_system):
@ -166,8 +175,11 @@ class RedfishPowerTestCase(db_base.DbTestCase):
sushy.RESET_ON)
mock_get_system.assert_called_once_with(task.node)
@mock.patch.object(redfish_mgmt.RedfishManagement, 'restore_boot_device',
autospec=True)
@mock.patch.object(redfish_utils, 'get_system', autospec=True)
def test_reboot_from_power_off(self, mock_get_system):
def test_reboot_from_power_off(self, mock_get_system,
mock_restore_bootdev):
with task_manager.acquire(self.context, self.node.uuid,
shared=False) as task:
system_result = [
@ -187,9 +199,13 @@ class RedfishPowerTestCase(db_base.DbTestCase):
sushy.RESET_ON)
mock_get_system.assert_called_with(task.node)
self.assertEqual(3, mock_get_system.call_count)
mock_restore_bootdev.assert_called_once_with(
task.driver.management, task, system_result[0])
@mock.patch.object(redfish_mgmt.RedfishManagement, 'restore_boot_device',
autospec=True)
@mock.patch.object(redfish_utils, 'get_system', autospec=True)
def test_reboot_from_power_on(self, mock_get_system):
def test_reboot_from_power_on(self, mock_get_system, mock_restore_bootdev):
with task_manager.acquire(self.context, self.node.uuid,
shared=False) as task:
system_result = [
@ -211,6 +227,8 @@ class RedfishPowerTestCase(db_base.DbTestCase):
])
mock_get_system.assert_called_with(task.node)
self.assertEqual(3, mock_get_system.call_count)
mock_restore_bootdev.assert_called_once_with(
task.driver.management, task, system_result[0])
@mock.patch.object(redfish_utils, 'get_system', autospec=True)
def test_reboot_not_reached(self, mock_get_system):

@ -0,0 +1,18 @@
---
fixes:
- |
Provides a workaround for hardware that does not support persistent boot
device setting with the ``redfish`` hardware type. When such situation is
detected, ironic will fall back to one-time boot device setting, restoring
it on every reboot.
issues:
- |
Some redfish-enabled hardware is known not to support persistent boot
device setting that is used by the Bare Metal service for deployed
instances. The ``redfish`` hardware type tries to work around this problem,
but rebooting such an instance in-band may cause it to boot incorrectly.
A predictable boot order should be configured in the node's boot firmware
to avoid issues and at least metadata cleaning must be enabled.
See `this mailing list thread
<http://lists.openstack.org/pipermail/openstack-discuss/2020-April/014543.html>`_
for technical details.