Wait for at least one suitable disk to appear on start up
Some kernel modules take substantial time to initialize. For example, with the mpt2sas RAID driver, inspection and deployment randomly fail due to IPA starting before the driver finishes initialization. This problem is probably impossible to solve in the generic case, as modern Linux environments do not have a notion of a "hardware is fully initialized" moment. All hardware is essentially hotplug. To solve it at least for the simplest case, this patch adds a wait loop on start up that waits for at least one suitable disk to appear in the inventory. Note that root device hints are not considered, as the node might not be known at that moment yet. Change-Id: Id163ca28f7c140c302ea04947ded3f3c58b284de Partial-Bug: #1582797
This commit is contained in:
parent
015fad6054
commit
c15ed6a48e
@ -16,6 +16,7 @@ import abc
|
||||
import functools
|
||||
import os
|
||||
import shlex
|
||||
import time
|
||||
|
||||
import netifaces
|
||||
from oslo_concurrency import processutils
|
||||
@ -38,6 +39,9 @@ UNIT_CONVERTER = pint.UnitRegistry(filename=None)
|
||||
UNIT_CONVERTER.define('MB = []')
|
||||
UNIT_CONVERTER.define('GB = 1024 MB')
|
||||
|
||||
# Maximum number of times _wait_for_disks() polls for a suitable disk.
_DISK_WAIT_ATTEMPTS = 10
|
||||
# Delay, in seconds, slept after each unsuccessful poll in _wait_for_disks().
_DISK_WAIT_DELAY = 3
|
||||
|
||||
|
||||
def _get_device_vendor(dev):
|
||||
"""Get the vendor name of a given device."""
|
||||
@ -394,8 +398,27 @@ class GenericHardwareManager(HardwareManager):
|
||||
self.sys_path = '/sys'
|
||||
|
||||
def evaluate_hardware_support(self):
    """Report generic support once the start-up disk wait has finished.

    The call to ``_wait_for_disks`` happens first, so this manager is not
    declared ready until that wait loop has completed.

    :returns: ``HardwareSupport.GENERIC``.
    """
    self._wait_for_disks()
    return HardwareSupport.GENERIC
|
||||
|
||||
def _wait_for_disks(self):
    """Poll until at least one suitable disk shows up.

    Without a suitable disk neither inspection nor deployment has any
    chance to succeed, so the block devices are listed and checked up to
    ``_DISK_WAIT_ATTEMPTS`` times, sleeping ``_DISK_WAIT_DELAY`` seconds
    after every failed check. If no check succeeds, a warning is logged
    and the method returns normally.
    """
    for attempt in range(1, _DISK_WAIT_ATTEMPTS + 1):
        try:
            utils.guess_root_disk(self.list_block_devices())
            # A suitable disk was found; nothing more to wait for.
            return
        except errors.DeviceNotFound:
            LOG.debug('Still waiting for at least one disk to appear, '
                      'attempt %d of %d', attempt, _DISK_WAIT_ATTEMPTS)
            time.sleep(_DISK_WAIT_DELAY)

    # Every attempt failed: report the total time spent waiting.
    LOG.warning('No disks detected in %d seconds',
                _DISK_WAIT_DELAY * _DISK_WAIT_ATTEMPTS)
|
||||
|
||||
def _get_interface_info(self, interface_name):
|
||||
addr_path = '{0}/class/net/{1}/address'.format(self.sys_path,
|
||||
interface_name)
|
||||
|
@ -127,6 +127,8 @@ class TestHeartbeater(test_base.BaseTestCase):
|
||||
self.assertEqual(2.7, self.heartbeater.error_delay)
|
||||
|
||||
|
||||
@mock.patch.object(hardware.GenericHardwareManager, '_wait_for_disks',
|
||||
lambda self: None)
|
||||
class TestBaseAgent(test_base.BaseTestCase):
|
||||
|
||||
def setUp(self):
|
||||
@ -294,6 +296,8 @@ class TestBaseAgent(test_base.BaseTestCase):
|
||||
self.agent.get_node_uuid)
|
||||
|
||||
|
||||
@mock.patch.object(hardware.GenericHardwareManager, '_wait_for_disks',
|
||||
lambda self: None)
|
||||
class TestAgentStandalone(test_base.BaseTestCase):
|
||||
|
||||
def setUp(self):
|
||||
@ -338,6 +342,8 @@ class TestAgentStandalone(test_base.BaseTestCase):
|
||||
self.assertFalse(self.agent.api_client.lookup_node.called)
|
||||
|
||||
|
||||
@mock.patch.object(hardware.GenericHardwareManager, '_wait_for_disks',
|
||||
lambda self: None)
|
||||
@mock.patch.object(socket, 'gethostbyname', autospec=True)
|
||||
@mock.patch.object(utils, 'execute', autospec=True)
|
||||
class TestAdvertiseAddress(test_base.BaseTestCase):
|
||||
|
@ -12,9 +12,11 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import os
|
||||
import time
|
||||
|
||||
import mock
|
||||
import netifaces
|
||||
import os
|
||||
from oslo_concurrency import processutils
|
||||
from oslo_utils import units
|
||||
from oslotest import base as test_base
|
||||
@ -1084,6 +1086,40 @@ class TestGenericHardwareManager(test_base.BaseTestCase):
|
||||
self.assertEqual('NEC',
|
||||
self.hardware.get_system_vendor_info().manufacturer)
|
||||
|
||||
@mock.patch.object(hardware.GenericHardwareManager, 'list_block_devices',
                   autospec=True)
@mock.patch.object(time, 'sleep', autospec=True)
@mock.patch.object(utils, 'guess_root_disk', autospec=True)
def test_evaluate_hw_waits_for_disks(self, mocked_root_dev, mocked_sleep,
                                     mocked_block_dev):
    """A disk appearing on the second check ends the wait after one sleep."""
    # First check fails with DeviceNotFound, second one succeeds.
    outcomes = [errors.DeviceNotFound('boom'), None]
    mocked_root_dev.side_effect = outcomes

    support = self.hardware.evaluate_hardware_support()

    self.assertEqual(hardware.HardwareSupport.GENERIC, support)
    self.assertEqual(2, mocked_root_dev.call_count)
    mocked_root_dev.assert_called_with(mocked_block_dev.return_value)
    mocked_sleep.assert_called_once_with(hardware._DISK_WAIT_DELAY)
|
||||
|
||||
@mock.patch.object(hardware.GenericHardwareManager, 'list_block_devices',
                   autospec=True)
@mock.patch.object(time, 'sleep', autospec=True)
@mock.patch.object(utils, 'guess_root_disk', autospec=True)
def test_evaluate_hw_disks_timeout(self, mocked_root_dev, mocked_sleep,
                                   mocked_block_dev):
    """With no disk ever found, every attempt is used and GENERIC is returned."""
    # Every check fails, so the wait loop must exhaust all attempts.
    mocked_root_dev.side_effect = errors.DeviceNotFound('boom')

    support = self.hardware.evaluate_hardware_support()

    self.assertEqual(hardware.HardwareSupport.GENERIC, support)
    self.assertEqual(hardware._DISK_WAIT_ATTEMPTS,
                     mocked_root_dev.call_count)
    mocked_root_dev.assert_called_with(mocked_block_dev.return_value)
    mocked_sleep.assert_called_with(hardware._DISK_WAIT_DELAY)
|
||||
|
||||
|
||||
@mock.patch.object(utils, 'execute', autospec=True)
|
||||
class TestModuleFunctions(test_base.BaseTestCase):
|
||||
|
5
releasenotes/notes/disk-wait-2e0e85e0947f80e9.yaml
Normal file
5
releasenotes/notes/disk-wait-2e0e85e0947f80e9.yaml
Normal file
@ -0,0 +1,5 @@
|
||||
---
|
||||
fixes:
|
||||
- On start up wait up to 30 seconds for the first disk device suitable for
|
||||
deployment to appear. This is to fix both inspection and deployment on
|
||||
hardware that takes a long time to initialize (e.g. some RAID devices).
|
Loading…
Reference in New Issue
Block a user