Wait for the interfaces to get IP addresses before inspection
In the DIB build the DHCP code (provided by the dhcp-all-interfaces element) races with the service starting IPA. It does not matter for deployment itself, as we're waiting for the route to the Ironic API to appear. However, for inspection it may result in reporting back all NIC's without IP addresses. Inspection fails in this case. This change makes inspection wait for *all* NIC's to get their IP addresses up to a small timeout. The timeout is 60 seconds by default and can be changed via the new ipa-inspection-dhcp-wait-timeout kernel option (0 to not wait). After the wait inspection proceedes in any case, so the worst downside is making inspection 60 seconds longer. To avoid waiting for NIC's that are not even connected, this change extends the NetworkInterface class with 'has_carrier' field. Closes-Bug: #1564954 Change-Id: I5bf14de4c1c622f4bf6e3eadbe20c44759da5d66
This commit is contained in:
parent
3cf5369cb6
commit
3deb25a3ce
@ -114,6 +114,13 @@ cli_opts = [
|
||||
help='Comma-separated list of plugins providing additional '
|
||||
'hardware data for inspection, empty value gives '
|
||||
'a minimum required set of plugins.'),
|
||||
|
||||
cfg.IntOpt('inspection_dhcp_wait_timeout',
|
||||
default=APARAMS.get('ipa-inspection-dhcp-wait-timeout',
|
||||
inspector.DEFAULT_DHCP_WAIT_TIMEOUT),
|
||||
help='Maximum time (in seconds) to wait for all NIC\'s '
|
||||
'to get their IP addresses via DHCP before inspection. '
|
||||
'Set to 0 to disable waiting completely.'),
|
||||
]
|
||||
|
||||
CONF.register_cli_opts(cli_opts)
|
||||
|
@ -175,12 +175,14 @@ class BlockDevice(encoding.SerializableComparable):
|
||||
|
||||
class NetworkInterface(encoding.SerializableComparable):
|
||||
serializable_fields = ('name', 'mac_address', 'switch_port_descr',
|
||||
'switch_chassis_descr', 'ipv4_address')
|
||||
'switch_chassis_descr', 'ipv4_address',
|
||||
'has_carrier')
|
||||
|
||||
def __init__(self, name, mac_addr, ipv4_address=None):
|
||||
def __init__(self, name, mac_addr, ipv4_address=None, has_carrier=True):
|
||||
self.name = name
|
||||
self.mac_address = mac_addr
|
||||
self.ipv4_address = ipv4_address
|
||||
self.has_carrier = has_carrier
|
||||
# TODO(russellhaering): Pull these from LLDP
|
||||
self.switch_port_descr = None
|
||||
self.switch_chassis_descr = None
|
||||
@ -402,7 +404,8 @@ class GenericHardwareManager(HardwareManager):
|
||||
|
||||
return NetworkInterface(
|
||||
interface_name, mac_addr,
|
||||
ipv4_address=self.get_ipv4_addr(interface_name))
|
||||
ipv4_address=self.get_ipv4_addr(interface_name),
|
||||
has_carrier=self._interface_has_carrier(interface_name))
|
||||
|
||||
def get_ipv4_addr(self, interface_id):
|
||||
try:
|
||||
@ -412,6 +415,17 @@ class GenericHardwareManager(HardwareManager):
|
||||
# No default IPv4 address found
|
||||
return None
|
||||
|
||||
def _interface_has_carrier(self, interface_name):
|
||||
path = '{0}/class/net/{1}/carrier'.format(self.sys_path,
|
||||
interface_name)
|
||||
try:
|
||||
with open(path, 'rt') as fp:
|
||||
return fp.read().strip() == '1'
|
||||
except EnvironmentError:
|
||||
LOG.debug('No carrier information for interface %s',
|
||||
interface_name)
|
||||
return False
|
||||
|
||||
def _is_device(self, interface_name):
|
||||
device_path = '{0}/class/net/{1}/device'.format(self.sys_path,
|
||||
interface_name)
|
||||
|
@ -17,6 +17,7 @@ import base64
|
||||
import io
|
||||
import json
|
||||
import tarfile
|
||||
import time
|
||||
|
||||
import netaddr
|
||||
from oslo_concurrency import processutils
|
||||
@ -36,6 +37,9 @@ from ironic_python_agent import utils
|
||||
LOG = logging.getLogger(__name__)
|
||||
CONF = cfg.CONF
|
||||
DEFAULT_COLLECTOR = 'default'
|
||||
DEFAULT_DHCP_WAIT_TIMEOUT = 60
|
||||
|
||||
_DHCP_RETRY_INTERVAL = 2
|
||||
_COLLECTOR_NS = 'ironic_python_agent.inspector.collectors'
|
||||
_NO_LOGGING_FIELDS = ('logs',)
|
||||
|
||||
@ -215,6 +219,38 @@ def discover_scheduling_properties(inventory, data, root_disk=None):
|
||||
LOG.info('value for %s is %s', key, data[key])
|
||||
|
||||
|
||||
def wait_for_dhcp():
|
||||
"""Wait until all NIC's get their IP addresses via DHCP or timeout happens.
|
||||
|
||||
Ignores interfaces which do not even have a carrier.
|
||||
|
||||
Note: only supports IPv4 addresses for now.
|
||||
|
||||
:return: True if all NIC's got IP addresses, False if timeout happened.
|
||||
Also returns True if waiting is disabled via configuration.
|
||||
"""
|
||||
if not CONF.inspection_dhcp_wait_timeout:
|
||||
return True
|
||||
|
||||
threshold = time.time() + CONF.inspection_dhcp_wait_timeout
|
||||
while time.time() <= threshold:
|
||||
interfaces = hardware.dispatch_to_managers('list_network_interfaces')
|
||||
missing = [iface.name for iface in interfaces
|
||||
if iface.has_carrier and not iface.ipv4_address]
|
||||
if not missing:
|
||||
return True
|
||||
|
||||
LOG.debug('Still waiting for interfaces %s to get IP addresses',
|
||||
missing)
|
||||
time.sleep(_DHCP_RETRY_INTERVAL)
|
||||
|
||||
LOG.warning('Not all network interfaces received IP addresses in '
|
||||
'%(timeout)d seconds: %(missing)s',
|
||||
{'timeout': CONF.inspection_dhcp_wait_timeout,
|
||||
'missing': missing})
|
||||
return False
|
||||
|
||||
|
||||
def collect_default(data, failures):
|
||||
"""The default inspection collector.
|
||||
|
||||
@ -229,6 +265,7 @@ def collect_default(data, failures):
|
||||
:param data: mutable data that we'll send to inspector
|
||||
:param failures: AccumulatedFailures object
|
||||
"""
|
||||
wait_for_dhcp()
|
||||
inventory = hardware.dispatch_to_managers('list_hardware_info')
|
||||
|
||||
# In the future we will only need the current version of inventory,
|
||||
|
@ -268,7 +268,7 @@ class TestGenericHardwareManager(test_base.BaseTestCase):
|
||||
mocked_open.return_value.__enter__ = lambda s: s
|
||||
mocked_open.return_value.__exit__ = mock.Mock()
|
||||
read_mock = mocked_open.return_value.read
|
||||
read_mock.return_value = '00:0c:29:8c:11:b1\n'
|
||||
read_mock.side_effect = ['00:0c:29:8c:11:b1\n', '1']
|
||||
mocked_ifaddresses.return_value = {
|
||||
netifaces.AF_INET: [{'addr': '192.168.1.2'}]
|
||||
}
|
||||
@ -277,6 +277,32 @@ class TestGenericHardwareManager(test_base.BaseTestCase):
|
||||
self.assertEqual('eth0', interfaces[0].name)
|
||||
self.assertEqual('00:0c:29:8c:11:b1', interfaces[0].mac_address)
|
||||
self.assertEqual('192.168.1.2', interfaces[0].ipv4_address)
|
||||
self.assertTrue(interfaces[0].has_carrier)
|
||||
|
||||
@mock.patch('netifaces.ifaddresses')
|
||||
@mock.patch('os.listdir')
|
||||
@mock.patch('os.path.exists')
|
||||
@mock.patch('six.moves.builtins.open')
|
||||
def test_list_network_interfaces_no_carrier(self,
|
||||
mocked_open,
|
||||
mocked_exists,
|
||||
mocked_listdir,
|
||||
mocked_ifaddresses):
|
||||
mocked_listdir.return_value = ['lo', 'eth0']
|
||||
mocked_exists.side_effect = [False, True]
|
||||
mocked_open.return_value.__enter__ = lambda s: s
|
||||
mocked_open.return_value.__exit__ = mock.Mock()
|
||||
read_mock = mocked_open.return_value.read
|
||||
read_mock.side_effect = ['00:0c:29:8c:11:b1\n', OSError('boom')]
|
||||
mocked_ifaddresses.return_value = {
|
||||
netifaces.AF_INET: [{'addr': '192.168.1.2'}]
|
||||
}
|
||||
interfaces = self.hardware.list_network_interfaces()
|
||||
self.assertEqual(1, len(interfaces))
|
||||
self.assertEqual('eth0', interfaces[0].name)
|
||||
self.assertEqual('00:0c:29:8c:11:b1', interfaces[0].mac_address)
|
||||
self.assertEqual('192.168.1.2', interfaces[0].ipv4_address)
|
||||
self.assertFalse(interfaces[0].has_carrier)
|
||||
|
||||
@mock.patch.object(utils, 'execute')
|
||||
def test_get_os_install_device(self, mocked_execute):
|
||||
|
@ -18,6 +18,7 @@ import collections
|
||||
import copy
|
||||
import io
|
||||
import tarfile
|
||||
import time
|
||||
import unittest
|
||||
|
||||
import mock
|
||||
@ -328,11 +329,13 @@ class TestDiscoverSchedulingProperties(BaseDiscoverTest):
|
||||
|
||||
@mock.patch.object(utils, 'get_agent_params',
|
||||
lambda: {'BOOTIF': 'boot:if'})
|
||||
@mock.patch.object(inspector, 'wait_for_dhcp', autospec=True)
|
||||
@mock.patch.object(inspector, 'discover_scheduling_properties', autospec=True)
|
||||
@mock.patch.object(inspector, 'discover_network_properties', autospec=True)
|
||||
@mock.patch.object(hardware, 'dispatch_to_managers', autospec=True)
|
||||
class TestCollectDefault(BaseDiscoverTest):
|
||||
def test_ok(self, mock_dispatch, mock_discover_net, mock_discover_sched):
|
||||
def test_ok(self, mock_dispatch, mock_discover_net, mock_discover_sched,
|
||||
mock_wait_for_dhcp):
|
||||
mock_dispatch.return_value = self.inventory
|
||||
|
||||
inspector.collect_default(self.data, self.failures)
|
||||
@ -351,9 +354,10 @@ class TestCollectDefault(BaseDiscoverTest):
|
||||
mock_discover_sched.assert_called_once_with(
|
||||
self.inventory, self.data,
|
||||
root_disk=self.inventory['disks'][0])
|
||||
mock_wait_for_dhcp.assert_called_once_with()
|
||||
|
||||
def test_no_root_disk(self, mock_dispatch, mock_discover_net,
|
||||
mock_discover_sched):
|
||||
mock_discover_sched, mock_wait_for_dhcp):
|
||||
mock_dispatch.return_value = self.inventory
|
||||
self.inventory['disks'] = []
|
||||
|
||||
@ -371,6 +375,7 @@ class TestCollectDefault(BaseDiscoverTest):
|
||||
self.failures)
|
||||
mock_discover_sched.assert_called_once_with(
|
||||
self.inventory, self.data, root_disk=None)
|
||||
mock_wait_for_dhcp.assert_called_once_with()
|
||||
|
||||
|
||||
@mock.patch.object(utils, 'execute', autospec=True)
|
||||
@ -453,3 +458,56 @@ class TestCollectExtraHardware(unittest.TestCase):
|
||||
self.assertNotIn('data', self.data)
|
||||
self.assertTrue(self.failures)
|
||||
mock_execute.assert_called_once_with('hardware-detect')
|
||||
|
||||
|
||||
@mock.patch.object(hardware, 'dispatch_to_managers', autospec=True)
|
||||
class TestWaitForDhcp(unittest.TestCase):
|
||||
def setUp(self):
|
||||
super(TestWaitForDhcp, self).setUp()
|
||||
CONF.set_override('inspection_dhcp_wait_timeout',
|
||||
inspector.DEFAULT_DHCP_WAIT_TIMEOUT)
|
||||
|
||||
@mock.patch.object(time, 'sleep', autospec=True)
|
||||
def test_ok(self, mocked_sleep, mocked_dispatch):
|
||||
mocked_dispatch.side_effect = [
|
||||
[hardware.NetworkInterface(name='em0', mac_addr='abcd',
|
||||
ipv4_address=None),
|
||||
hardware.NetworkInterface(name='em1', mac_addr='abcd',
|
||||
ipv4_address='1.2.3.4')],
|
||||
[hardware.NetworkInterface(name='em0', mac_addr='abcd',
|
||||
ipv4_address=None),
|
||||
hardware.NetworkInterface(name='em1', mac_addr='abcd',
|
||||
ipv4_address='1.2.3.4')],
|
||||
[hardware.NetworkInterface(name='em0', mac_addr='abcd',
|
||||
ipv4_address='1.1.1.1'),
|
||||
hardware.NetworkInterface(name='em1', mac_addr='abcd',
|
||||
ipv4_address='1.2.3.4')],
|
||||
]
|
||||
|
||||
self.assertTrue(inspector.wait_for_dhcp())
|
||||
|
||||
mocked_dispatch.assert_called_with('list_network_interfaces')
|
||||
self.assertEqual(2, mocked_sleep.call_count)
|
||||
self.assertEqual(3, mocked_dispatch.call_count)
|
||||
|
||||
@mock.patch.object(inspector, '_DHCP_RETRY_INTERVAL', 0.01)
|
||||
def test_timeout(self, mocked_dispatch):
|
||||
CONF.set_override('inspection_dhcp_wait_timeout', 0.02)
|
||||
|
||||
mocked_dispatch.return_value = [
|
||||
hardware.NetworkInterface(name='em0', mac_addr='abcd',
|
||||
ipv4_address=None),
|
||||
hardware.NetworkInterface(name='em1', mac_addr='abcd',
|
||||
ipv4_address='1.2.3.4'),
|
||||
]
|
||||
|
||||
self.assertFalse(inspector.wait_for_dhcp())
|
||||
|
||||
mocked_dispatch.assert_called_with('list_network_interfaces')
|
||||
|
||||
def test_disabled(self, mocked_dispatch):
|
||||
CONF.set_override('inspection_dhcp_wait_timeout', 0)
|
||||
|
||||
self.assertTrue(inspector.wait_for_dhcp())
|
||||
|
||||
self.assertFalse(mocked_dispatch.called)
|
||||
|
@ -160,14 +160,16 @@ class TestBaseIronicPythonAgent(test_base.BaseTestCase):
|
||||
u'name': u'eth0',
|
||||
u'ipv4_address': None,
|
||||
u'switch_chassis_descr': None,
|
||||
u'switch_port_descr': None
|
||||
u'switch_port_descr': None,
|
||||
u'has_carrier': True,
|
||||
},
|
||||
{
|
||||
u'mac_address': u'00:0c:29:8c:11:b2',
|
||||
u'name': u'eth1',
|
||||
u'ipv4_address': None,
|
||||
u'switch_chassis_descr': None,
|
||||
'switch_port_descr': None
|
||||
u'switch_port_descr': None,
|
||||
u'has_carrier': True,
|
||||
}
|
||||
],
|
||||
u'cpu': {
|
||||
@ -295,14 +297,16 @@ class TestBaseIronicPythonAgent(test_base.BaseTestCase):
|
||||
u'name': u'eth0',
|
||||
u'ipv4_address': None,
|
||||
u'switch_chassis_descr': None,
|
||||
u'switch_port_descr': None
|
||||
u'switch_port_descr': None,
|
||||
u'has_carrier': True,
|
||||
},
|
||||
{
|
||||
u'mac_address': u'00:0c:29:8c:11:b2',
|
||||
u'name': u'eth1',
|
||||
u'ipv4_address': None,
|
||||
u'switch_chassis_descr': None,
|
||||
'switch_port_descr': None
|
||||
u'switch_port_descr': None,
|
||||
u'has_carrier': True,
|
||||
}
|
||||
],
|
||||
u'cpu': {
|
||||
|
@ -0,0 +1,7 @@
|
||||
---
|
||||
features:
|
||||
- The "has_carrier" flag was added to the network interface information.
|
||||
fixes:
|
||||
- Inspection code now waits up to 1 minute for all NICs to get their IP addresses.
|
||||
Otherwise inspection fails for some users due to race between DIB DHCP code
|
||||
and IPA startup. See https://bugs.launchpad.net/bugs/1564954 for details.
|
Loading…
Reference in New Issue
Block a user