From c15ed6a48e36da26576242277aa531720fd15d2d Mon Sep 17 00:00:00 2001 From: Dmitry Tantsur Date: Tue, 24 May 2016 10:04:12 +0200 Subject: [PATCH] Wait for at least one suitable disk to appear on start up Some kernel modules take substantial time to initialize. For example, with the mpt2sas RAID driver, inspection and deployment randomly fail due to IPA starting before the driver finishes initialization. This problem is probably impossible to solve in a generic case, as modern Linux environments do not have a notion of a "hardware is fully initialized" moment. All hardware is essentially hotplug. To solve it at least for the simplest case, this patch adds a wait loop on start up waiting for at least one suitable disk to appear in inventory. Note that root device hints are not considered, as the node might not be known at that moment yet. Change-Id: Id163ca28f7c140c302ea04947ded3f3c58b284de Partial-Bug: #1582797 --- ironic_python_agent/hardware.py | 23 +++++++++++ ironic_python_agent/tests/unit/test_agent.py | 6 +++ .../tests/unit/test_hardware.py | 38 ++++++++++++++++++- .../notes/disk-wait-2e0e85e0947f80e9.yaml | 5 +++ 4 files changed, 71 insertions(+), 1 deletion(-) create mode 100644 releasenotes/notes/disk-wait-2e0e85e0947f80e9.yaml diff --git a/ironic_python_agent/hardware.py b/ironic_python_agent/hardware.py index 3e74d350d..ac7e25d3f 100644 --- a/ironic_python_agent/hardware.py +++ b/ironic_python_agent/hardware.py @@ -16,6 +16,7 @@ import abc import functools import os import shlex +import time import netifaces from oslo_concurrency import processutils @@ -38,6 +39,9 @@ UNIT_CONVERTER = pint.UnitRegistry(filename=None) UNIT_CONVERTER.define('MB = []') UNIT_CONVERTER.define('GB = 1024 MB') +_DISK_WAIT_ATTEMPTS = 10 +_DISK_WAIT_DELAY = 3 + def _get_device_vendor(dev): """Get the vendor name of a given device.""" @@ -394,8 +398,27 @@ class GenericHardwareManager(HardwareManager): self.sys_path = '/sys' def evaluate_hardware_support(self): + # Do some initialization 
before we declare ourselves ready + self._wait_for_disks() return HardwareSupport.GENERIC + def _wait_for_disks(self): + # Wait for at least one suitable disk to show up, otherwise neither + # inspection nor deployment has any chance to succeed. + for attempt in range(_DISK_WAIT_ATTEMPTS): + try: + block_devices = self.list_block_devices() + utils.guess_root_disk(block_devices) + except errors.DeviceNotFound: + LOG.debug('Still waiting for at least one disk to appear, ' + 'attempt %d of %d', attempt + 1, _DISK_WAIT_ATTEMPTS) + time.sleep(_DISK_WAIT_DELAY) + else: + break + else: + LOG.warning('No disks detected in %d seconds', + _DISK_WAIT_DELAY * _DISK_WAIT_ATTEMPTS) + def _get_interface_info(self, interface_name): addr_path = '{0}/class/net/{1}/address'.format(self.sys_path, interface_name) diff --git a/ironic_python_agent/tests/unit/test_agent.py b/ironic_python_agent/tests/unit/test_agent.py index b51e68518..9885014a4 100644 --- a/ironic_python_agent/tests/unit/test_agent.py +++ b/ironic_python_agent/tests/unit/test_agent.py @@ -127,6 +127,8 @@ class TestHeartbeater(test_base.BaseTestCase): self.assertEqual(2.7, self.heartbeater.error_delay) +@mock.patch.object(hardware.GenericHardwareManager, '_wait_for_disks', + lambda self: None) class TestBaseAgent(test_base.BaseTestCase): def setUp(self): @@ -294,6 +296,8 @@ class TestBaseAgent(test_base.BaseTestCase): self.agent.get_node_uuid) +@mock.patch.object(hardware.GenericHardwareManager, '_wait_for_disks', + lambda self: None) class TestAgentStandalone(test_base.BaseTestCase): def setUp(self): @@ -338,6 +342,8 @@ class TestAgentStandalone(test_base.BaseTestCase): self.assertFalse(self.agent.api_client.lookup_node.called) +@mock.patch.object(hardware.GenericHardwareManager, '_wait_for_disks', + lambda self: None) @mock.patch.object(socket, 'gethostbyname', autospec=True) @mock.patch.object(utils, 'execute', autospec=True) class TestAdvertiseAddress(test_base.BaseTestCase): diff --git 
a/ironic_python_agent/tests/unit/test_hardware.py b/ironic_python_agent/tests/unit/test_hardware.py index 5c0b60814..1aa04c624 100644 --- a/ironic_python_agent/tests/unit/test_hardware.py +++ b/ironic_python_agent/tests/unit/test_hardware.py @@ -12,9 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os +import time + import mock import netifaces -import os from oslo_concurrency import processutils from oslo_utils import units from oslotest import base as test_base @@ -1084,6 +1086,40 @@ class TestGenericHardwareManager(test_base.BaseTestCase): self.assertEqual('NEC', self.hardware.get_system_vendor_info().manufacturer) + @mock.patch.object(hardware.GenericHardwareManager, 'list_block_devices', + autospec=True) + @mock.patch.object(time, 'sleep', autospec=True) + @mock.patch.object(utils, 'guess_root_disk', autospec=True) + def test_evaluate_hw_waits_for_disks(self, mocked_root_dev, mocked_sleep, + mocked_block_dev): + mocked_root_dev.side_effect = [ + errors.DeviceNotFound('boom'), + None + ] + + result = self.hardware.evaluate_hardware_support() + + self.assertEqual(hardware.HardwareSupport.GENERIC, result) + mocked_root_dev.assert_called_with(mocked_block_dev.return_value) + self.assertEqual(2, mocked_root_dev.call_count) + mocked_sleep.assert_called_once_with(hardware._DISK_WAIT_DELAY) + + @mock.patch.object(hardware.GenericHardwareManager, 'list_block_devices', + autospec=True) + @mock.patch.object(time, 'sleep', autospec=True) + @mock.patch.object(utils, 'guess_root_disk', autospec=True) + def test_evaluate_hw_disks_timeout(self, mocked_root_dev, mocked_sleep, + mocked_block_dev): + mocked_root_dev.side_effect = errors.DeviceNotFound('boom') + + result = self.hardware.evaluate_hardware_support() + + self.assertEqual(hardware.HardwareSupport.GENERIC, result) + mocked_root_dev.assert_called_with(mocked_block_dev.return_value) + self.assertEqual(hardware._DISK_WAIT_ATTEMPTS, + 
mocked_root_dev.call_count) + mocked_sleep.assert_called_with(hardware._DISK_WAIT_DELAY) + @mock.patch.object(utils, 'execute', autospec=True) class TestModuleFunctions(test_base.BaseTestCase): diff --git a/releasenotes/notes/disk-wait-2e0e85e0947f80e9.yaml b/releasenotes/notes/disk-wait-2e0e85e0947f80e9.yaml new file mode 100644 index 000000000..3b5ea703d --- /dev/null +++ b/releasenotes/notes/disk-wait-2e0e85e0947f80e9.yaml @@ -0,0 +1,5 @@ +--- +fixes: + - On start up, wait up to 30 seconds for the first disk device suitable for + deployment to appear. This fixes both inspection and deployment on + hardware that takes a long time to initialize (e.g. some RAID devices).