From 2a0fd1d13f178c9872330cf6498dc25572a721da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aija=20Jaunt=C4=93va?= Date: Wed, 8 Sep 2021 11:20:20 -0400 Subject: [PATCH] Fix idrac-wsman set_power_state to wait on HW set_power_state has returned to the caller immediately without confirming the system has reached the requested state. This fixes that by synchronously waiting until the target state has been read before returning. That bug can cause instance workload deployments to fail on Dell EMC PowerEdge server models on which IPA ramdisk soft power off fails and ironic employs its OOB fallback strategy. After an otherwise successful deployment, the node is active, but is powered off. No error is reported in last_error. If the subsequent instance workflow expects the system to be powered on into the operating system, it fails. Story: 2009204 Task: 43261 Change-Id: I3112a22149c07e5508f26c79f33d09aeb905c308 --- ironic/drivers/modules/drac/power.py | 45 ++++++++++--------- .../unit/drivers/modules/drac/test_power.py | 14 ++++-- ...set-power-state-wait-cd8f9ff41b19c7a7.yaml | 10 +++++ 3 files changed, 44 insertions(+), 25 deletions(-) create mode 100644 releasenotes/notes/idrac-wsman-set-power-state-wait-cd8f9ff41b19c7a7.yaml diff --git a/ironic/drivers/modules/drac/power.py b/ironic/drivers/modules/drac/power.py index 96f6f99321..468cc64b6a 100644 --- a/ironic/drivers/modules/drac/power.py +++ b/ironic/drivers/modules/drac/power.py @@ -24,6 +24,7 @@ from oslo_utils import importutils from ironic.common import exception from ironic.common import states from ironic.conductor import task_manager +from ironic.conductor import utils as cond_utils from ironic.drivers import base from ironic.drivers.modules.drac import common as drac_common from ironic.drivers.modules.drac import management as drac_management @@ -87,15 +88,17 @@ def _commit_boot_list_change(node): node.save() -def _set_power_state(node, power_state): +def _set_power_state(task, power_state, timeout=None): """Turns the server power on/off or do a reboot. - :param node: an ironic node object. + :param task: a TaskManager instance containing the node to act on. :param power_state: a power state from :mod:`ironic.common.states`. + :param timeout: Time to wait for the node to reach the requested state. + When requested state is reboot, not used as not waiting then. :raises: InvalidParameterValue if required DRAC credentials are missing. :raises: DracOperationError on an error from python-dracclient """ - + node = task.node # NOTE(ifarkas): DRAC interface doesn't allow changing the boot device # multiple times in a row without a reboot. This is # because a change need to be committed via a @@ -131,6 +134,20 @@ def _set_power_state(node, power_state): try: client.set_power_state(target_power_state) + if calc_power_state == states.REBOOT: + # TODO(rloo): Support timeouts! + if timeout is not None: + LOG.warning("The 'idrac-wsman' Power Interface does not " + "support 'timeout' parameter when setting " + "power state to reboot. Ignoring " + "timeout=%(timeout)s", + {'timeout': timeout}) + else: + # Skipped for reboot as can't match reboot with on/off. + # Reboot so far has been part of workflow that is not followed + # by another power state change that could break the flow. + cond_utils.node_wait_for_power_state( + task, calc_power_state, timeout) break except drac_exceptions.BaseClientException as exc: if (power_state == states.REBOOT @@ -214,20 +231,13 @@ class DracWSManPower(base.PowerInterface): :param task: a TaskManager instance containing the node to act on. :param power_state: a power state from :mod:`ironic.common.states`. - :param timeout: timeout (in seconds). Unsupported by this interface. + :param timeout: Time to wait for the node to reach the requested state. + When requested state is reboot, not used as not waiting then. :raises: InvalidParameterValue if required DRAC credentials are missing. :raises: DracOperationError on an error from python-dracclient. """ - # TODO(rloo): Support timeouts! - if timeout is not None: - LOG.warning( - "The 'idrac' Power Interface's 'set_power_state' method " - "doesn't support the 'timeout' parameter. Ignoring " - "timeout=%(timeout)s", - {'timeout': timeout}) - - _set_power_state(task.node, power_state) + _set_power_state(task, power_state, timeout) @METRICS.timer('DracPower.reboot') @task_manager.require_exclusive_lock @@ -240,14 +250,7 @@ class DracWSManPower(base.PowerInterface): missing. :raises: DracOperationError on an error from python-dracclient. """ - # TODO(rloo): Support timeouts! - if timeout is not None: - LOG.warning("The 'idrac' Power Interface's 'reboot' method " - "doesn't support the 'timeout' parameter. Ignoring " - "timeout=%(timeout)s", - {'timeout': timeout}) - - _set_power_state(task.node, states.REBOOT) + _set_power_state(task, states.REBOOT, timeout) class DracPower(DracWSManPower): diff --git a/ironic/tests/unit/drivers/modules/drac/test_power.py b/ironic/tests/unit/drivers/modules/drac/test_power.py index aeb3c038e1..b6ba3e159a 100644 --- a/ironic/tests/unit/drivers/modules/drac/test_power.py +++ b/ironic/tests/unit/drivers/modules/drac/test_power.py @@ -72,7 +72,8 @@ class DracPowerTestCase(test_utils.BaseDracTest): @mock.patch.object(drac_power.LOG, 'warning', autospec=True) def test_set_power_state(self, mock_log, mock_get_drac_client): mock_client = mock_get_drac_client.return_value - + mock_client.get_power_state.side_effect = [drac_constants.POWER_ON, + drac_constants.POWER_OFF] with task_manager.acquire(self.context, self.node.uuid, shared=False) as task: task.driver.power.set_power_state(task, states.POWER_OFF) @@ -98,6 +99,8 @@ class DracPowerTestCase(test_utils.BaseDracTest): @mock.patch.object(drac_power.LOG, 'warning', autospec=True) def test_set_power_state_timeout(self, mock_log, mock_get_drac_client): mock_client = mock_get_drac_client.return_value + mock_client.get_power_state.side_effect = [drac_constants.POWER_ON, + drac_constants.POWER_OFF] with task_manager.acquire(self.context, self.node.uuid, shared=False) as task: @@ -106,7 +109,7 @@ class DracPowerTestCase(test_utils.BaseDracTest): drac_power_state = drac_power.REVERSE_POWER_STATES[states.POWER_OFF] mock_client.set_power_state.assert_called_once_with(drac_power_state) - self.assertTrue(mock_log.called) + self.assertFalse(mock_log.called) @mock.patch.object(drac_power.LOG, 'warning', autospec=True) def test_reboot_while_powered_on(self, mock_log, mock_get_drac_client): @@ -137,7 +140,8 @@ class DracPowerTestCase(test_utils.BaseDracTest): def test_reboot_while_powered_off(self, mock_get_drac_client): mock_client = mock_get_drac_client.return_value - mock_client.get_power_state.return_value = drac_constants.POWER_OFF + mock_client.get_power_state.side_effect = [drac_constants.POWER_OFF, + drac_constants.POWER_ON] with task_manager.acquire(self.context, self.node.uuid, shared=False) as task: @@ -149,7 +153,9 @@ class DracPowerTestCase(test_utils.BaseDracTest): @mock.patch('time.sleep', autospec=True) def test_reboot_retries_success(self, mock_sleep, mock_get_drac_client): mock_client = mock_get_drac_client.return_value - mock_client.get_power_state.return_value = drac_constants.POWER_OFF + mock_client.get_power_state.side_effect = [drac_constants.POWER_OFF, + drac_constants.POWER_OFF, + drac_constants.POWER_ON] exc = drac_exceptions.DRACOperationFailed( drac_messages=['The command failed to set RequestedState']) mock_client.set_power_state.side_effect = [exc, None] diff --git a/releasenotes/notes/idrac-wsman-set-power-state-wait-cd8f9ff41b19c7a7.yaml b/releasenotes/notes/idrac-wsman-set-power-state-wait-cd8f9ff41b19c7a7.yaml new file mode 100644 index 0000000000..4ac895625d --- /dev/null +++ b/releasenotes/notes/idrac-wsman-set-power-state-wait-cd8f9ff41b19c7a7.yaml @@ -0,0 +1,10 @@ +--- +fixes: + - | + Fixes ``idrac-wsman`` power interface to wait for the hardware to reach the + target state before returning. For systems where soft power off at the end + of deployment to boot to instance failed and forced hard power off was + used, this left node successfully deployed in off state without any errors. + This broke other workflows expecting node to be on booted into + OS at the end of deployment. Additional information can be found in + `story 2009204 `_.