Make port binding failure (configurably) fatal

When using drivers like `networking-generic-switch`, there can be issues
with changing the VLAN the server is attached to.

Currently, the instance will still continue to attempt to boot, in some
cases the instance can go into ACTIVE (if the server happens to be on
the provisioning network already), but the server is not attached to the
correct VLAN, generally causing the network to fail to come up.

If we look at the ports, they are marked as binding failed. This is
something that Nova checks for after doing the port update to attach the
binding information. Lets add that check into Ironic, such that the
above build will fail with a good hint at where to look for problems
(namely Neutron).

Change-Id: I3811941a3dff0a9f968258d05cc020e1f52e3e40
This commit is contained in:
John Garbutt
2019-12-19 10:43:55 +00:00
committed by Afonne-CID
parent 5fccd55c9f
commit 876a9d4027
7 changed files with 107 additions and 16 deletions

View File

@@ -362,8 +362,27 @@ def add_ports_to_network(task, network_uuid, security_groups=None):
update_port_attrs)
if CONF.neutron.dhcpv6_stateful_address_count > 1:
_add_ip_addresses_for_ipv6_stateful(task.context, port, client)
if is_smart_nic:
wait_for_port_status(client, port.id, 'ACTIVE')
binding_fail_fatal = False
is_neutron_iface = node.network_interface == 'neutron'
if is_neutron_iface:
binding_fail_fatal = getattr(
CONF.neutron, 'fail_on_port_binding_failure', False)
default_failure_behavior = is_smart_nic or binding_fail_fatal
fail_on_binding_failure = node.driver_info.get(
'fail_on_binding_failure', default_failure_behavior)
# NOTE(cid): Only check port status if it's a smart NIC or if we're
# configured to fail on binding failures. This avoids unnecessary
# failures when using network interfaces where binding may fail but
# we want to continue anyway (like in the case of flat networks).
if fail_on_binding_failure:
wait_for_port_status(
client, port.id, 'ACTIVE',
fail_on_binding_failure=fail_on_binding_failure)
except openstack_exc.OpenStackCloudException as e:
failures.append(ironic_port.uuid)
LOG.warning("Could not create neutron port for node's "
@@ -1016,12 +1035,15 @@ def wait_for_host_agent(client, host_id, target_state='up'):
stop=tenacity.stop_after_attempt(CONF.agent.neutron_agent_max_attempts),
wait=tenacity.wait_fixed(CONF.agent.neutron_agent_status_retry_interval),
reraise=True)
def wait_for_port_status(client, port_id, status):
def wait_for_port_status(client, port_id, status,
fail_on_binding_failure=None):
"""Wait for port status to be the desired status
:param client: A Neutron client object.
:param port_id: Neutron port_id
:param status: Port's target status, can be ACTIVE, DOWN ... etc.
:param fail_on_binding_failure: Whether to raise exception if port
binding fails.
:returns: boolean indicates that the port status matches the
required value passed by param status.
:raises: InvalidParameterValue if the port does not exist.
@@ -1035,6 +1057,14 @@ def wait_for_port_status(client, port_id, status):
{'port_id': port_id, 'status': port.status})
if port.status == status:
return True
# fail early on port binding failure
if port.get('binding:vif_type') == 'binding_failed':
msg = "Binding failed for neutron port %s" % port_id
if fail_on_binding_failure:
LOG.error(msg)
raise openstack_exc.OpenStackCloudException(msg)
LOG.warning(msg)
raise exception.NetworkError(
'Port %(port_id)s failed to reach status %(status)s' % {
'port_id': port_id, 'status': status})

View File

@@ -26,6 +26,10 @@ opts = [
mutable=True,
help=_('Delay value to wait for Neutron agents to setup '
'sufficient DHCP configuration for port.')),
cfg.BoolOpt('fail_on_port_binding_failure',
default=True,
help=_('Whether to fail or continue deployment if neutron '
'port binding fails.')),
cfg.IntOpt('retries',
default=3,
mutable=True,

View File

@@ -294,8 +294,27 @@ def plug_port_to_tenant_network(task, port_like_obj, client=None):
try:
neutron.update_neutron_port(task.context, vif_id, port_attrs)
if is_smart_nic:
neutron.wait_for_port_status(client, vif_id, 'ACTIVE')
binding_fail_fatal = False
is_neutron_iface = node.network_interface == 'neutron'
if is_neutron_iface:
binding_fail_fatal = getattr(
CONF.neutron, 'fail_on_port_binding_failure', False)
default_failure_behavior = is_smart_nic or binding_fail_fatal
fail_on_binding_failure = node.driver_info.get(
'fail_on_binding_failure', default_failure_behavior)
# NOTE(cid): Only check port status if it's a smart NIC or if we're
# configured to fail on binding failures. This avoids unnecessary
# failures when using network interfaces where binding may fail but
# we want to continue anyway (like in the case of flat networks).
if fail_on_binding_failure:
neutron.wait_for_port_status(
client, vif_id, 'ACTIVE',
fail_on_binding_failure=fail_on_binding_failure)
except openstack_exc.OpenStackCloudException as e:
msg = (_('Could not add public network VIF %(vif)s '
'to node %(node)s, possible network issue. %(exc)s') %

View File

@@ -1026,7 +1026,8 @@ class TestNeutronNetworkActions(db_base.DbTestCase):
wait_agent_mock.assert_called_once_with(
self.client_mock, 'hostname')
wait_port_mock.assert_called_once_with(
self.client_mock, self.neutron_port.id, 'ACTIVE')
self.client_mock, self.neutron_port.id, 'ACTIVE',
fail_on_binding_failure=True)
@mock.patch.object(neutron, 'is_smartnic_port', autospec=True)
@mock.patch.object(neutron, 'wait_for_host_agent', autospec=True)

View File

@@ -428,8 +428,11 @@ class TestCommonFunctions(db_base.DbTestCase):
task, self.vif_id, {'physnet2'})
@mock.patch.object(neutron_common, 'update_neutron_port', autospec=True)
@mock.patch.object(neutron_common, 'wait_for_port_status', autospec=True)
@mock.patch.object(neutron_common, 'get_client', autospec=True)
def test_plug_port_to_tenant_network_client(self, mock_gc, mock_update):
def test_plug_port_to_tenant_network_client(self, mock_gc,
wait_mock_status,
mock_update):
self.port.internal_info = {common.TENANT_VIF_KEY: self.vif_id}
self.port.save()
with task_manager.acquire(self.context, self.node.id) as task:
@@ -439,8 +442,11 @@ class TestCommonFunctions(db_base.DbTestCase):
self.assertTrue(mock_update.called)
@mock.patch.object(neutron_common, 'update_neutron_port', autospec=True)
@mock.patch.object(neutron_common, 'wait_for_port_status', autospec=True)
@mock.patch.object(neutron_common, 'get_client', autospec=True)
def test_plug_port_to_tenant_network_no_client(self, mock_gc, mock_update):
def test_plug_port_to_tenant_network_no_client(self, mock_gc,
wait_mock_status,
mock_update):
self.port.internal_info = {common.TENANT_VIF_KEY: self.vif_id}
self.port.save()
with task_manager.acquire(self.context, self.node.id) as task:
@@ -480,7 +486,7 @@ class TestCommonFunctions(db_base.DbTestCase):
wait_agent_mock.assert_called_once_with(
nclient, 'hostname')
wait_port_mock.assert_called_once_with(
nclient, self.vif_id, 'ACTIVE')
nclient, self.vif_id, 'ACTIVE', fail_on_binding_failure=True)
self.assertTrue(mock_update.called)

View File

@@ -560,10 +560,12 @@ class NeutronInterfaceTestCase(db_base.DbTestCase):
@mock.patch.object(neutron_common, 'wait_for_host_agent', autospec=True)
@mock.patch.object(neutron_common, 'update_neutron_port', autospec=True)
@mock.patch.object(neutron_common, 'wait_for_port_status', autospec=True)
@mock.patch.object(neutron_common, 'get_client', autospec=True)
@mock.patch.object(neutron, 'LOG', autospec=True)
def test_configure_tenant_networks_multiple_ports_one_vif_id(
self, log_mock, client_mock, update_mock, wait_agent_mock):
self, log_mock, client_mock, wait_mock_status, update_mock,
wait_agent_mock):
expected_attrs = {
'binding:vnic_type': 'baremetal',
'binding:host_id': self.node.uuid,
@@ -582,8 +584,10 @@ class NeutronInterfaceTestCase(db_base.DbTestCase):
@mock.patch.object(neutron_common, 'wait_for_host_agent', autospec=True)
@mock.patch.object(neutron_common, 'update_neutron_port', autospec=True)
@mock.patch.object(neutron_common, 'wait_for_port_status', autospec=True)
@mock.patch.object(neutron_common, 'get_client', autospec=True)
def test_configure_tenant_networks_update_fail(self, client_mock,
wait_mock_status,
update_mock,
wait_agent_mock):
update_mock.side_effect = openstack_exc.OpenStackCloudException(
@@ -594,11 +598,26 @@ class NeutronInterfaceTestCase(db_base.DbTestCase):
self.interface.configure_tenant_networks, task)
client_mock.assert_called_once_with(context=task.context)
@mock.patch.object(neutron_common, '_get_port_by_uuid', autospec=True)
@mock.patch.object(neutron_common, 'get_client', autospec=True)
def test_configure_tenant_networks_update_binding_fail(self, client_mock,
mock_get_port):
self.config(fail_on_port_binding_failure=True, group='neutron')
port = mock.MagicMock()
port.get.return_value = 'binding_failed'
mock_get_port.return_value = port
with task_manager.acquire(self.context, self.node.id) as task:
self.assertRaisesRegex(
exception.NetworkError, 'Binding failed',
self.interface.configure_tenant_networks, task)
@mock.patch.object(neutron_common, 'wait_for_host_agent', autospec=True)
@mock.patch.object(neutron_common, 'update_neutron_port', autospec=True)
@mock.patch.object(neutron_common, 'wait_for_port_status', autospec=True)
@mock.patch.object(neutron_common, 'get_client', autospec=True)
def _test_configure_tenant_networks(self, client_mock, update_mock,
wait_agent_mock,
def _test_configure_tenant_networks(self, client_mock, wait_mock_status,
update_mock, wait_agent_mock,
is_client_id=False):
# NOTE(TheJulia): Until we have a replacement for infiniband client-id
# storage, extra has to stay put. On a plus side, this would be
@@ -668,12 +687,13 @@ class NeutronInterfaceTestCase(db_base.DbTestCase):
@mock.patch.object(neutron_common, 'get_neutron_port_data', autospec=True)
@mock.patch.object(neutron_common, 'wait_for_host_agent', autospec=True)
@mock.patch.object(neutron_common, 'update_neutron_port', autospec=True)
@mock.patch.object(neutron_common, 'wait_for_port_status', autospec=True)
@mock.patch.object(neutron_common, 'get_client', autospec=True)
@mock.patch.object(neutron_common, 'get_local_group_information',
autospec=True)
def test_configure_tenant_networks_with_portgroups(
self, glgi_mock, client_mock, update_mock, wait_agent_mock,
port_data_mock):
self, glgi_mock, client_mock, wait_mock_status, update_mock,
wait_agent_mock, port_data_mock):
pg = utils.create_test_portgroup(
self.context, node_id=self.node.id, address='ff:54:00:cf:2d:32',
internal_info={'tenant_vif_port_id': uuidutils.generate_uuid()})
@@ -729,12 +749,13 @@ class NeutronInterfaceTestCase(db_base.DbTestCase):
@mock.patch.object(neutron_common, 'get_neutron_port_data', autospec=True)
@mock.patch.object(neutron_common, 'wait_for_host_agent', autospec=True)
@mock.patch.object(neutron_common, 'update_neutron_port', autospec=True)
@mock.patch.object(neutron_common, 'wait_for_port_status', autospec=True)
@mock.patch.object(neutron_common, 'get_client', autospec=True)
@mock.patch.object(neutron_common, 'get_local_group_information',
autospec=True)
def test_configure_tenant_networks_with_portgroups_no_address(
self, glgi_mock, client_mock, update_mock, wait_agent_mock,
port_data_mock):
self, glgi_mock, client_mock, wait_mock_status, update_mock,
wait_agent_mock, port_data_mock):
pg = utils.create_test_portgroup(
self.context, node_id=self.node.id, address=None,
internal_info={'tenant_vif_port_id': uuidutils.generate_uuid()})

View File

@@ -0,0 +1,10 @@
---
features:
- |
Adds a new boolean configuration option ``[neutron]fail_on_port_binding_failure``
and corresponding node ``driver_info`` setting ``fail_on_binding_failure``
to control whether deployment should fail or continue if Neutron port
binding fails. With a default of ``true``, if your network is not
configured properly, this will likely cause deployment failures.
To maintain the previous behavior, explicitly set this option to False in
your configuration.