From 876a9d40275d21bf2a7889a2d864877292f18143 Mon Sep 17 00:00:00 2001 From: John Garbutt Date: Thu, 19 Dec 2019 10:43:55 +0000 Subject: [PATCH] Make port binding failure (configurably) fatal When using drivers like `networking-generic-switch`, there can be issues with changing the VLAN the server is attached to. Currently, the instance will still continue to attempt to boot, in some cases the instance can go into ACTIVE (if the server happens to be on the provisioning network already), but the server is not attached to the correct VLAN, generally causing the network to fail to come up. If we look at the ports, they are marked as binding failed. This is something that Nova checks for after doing the port update to attach the binding information. Lets add that check into Ironic, such that the above build will fail with a good hint at where to look for problems (namely Neutron). Change-Id: I3811941a3dff0a9f968258d05cc020e1f52e3e40 --- ironic/common/neutron.py | 36 +++++++++++++++++-- ironic/conf/neutron.py | 4 +++ ironic/drivers/modules/network/common.py | 23 ++++++++++-- ironic/tests/unit/common/test_neutron.py | 3 +- .../drivers/modules/network/test_common.py | 12 +++++-- .../drivers/modules/network/test_neutron.py | 35 ++++++++++++++---- ...port-binding-failure-e4c9749a84bd947f.yaml | 10 ++++++ 7 files changed, 107 insertions(+), 16 deletions(-) create mode 100644 releasenotes/notes/configure-fail-early-on-port-binding-failure-e4c9749a84bd947f.yaml diff --git a/ironic/common/neutron.py b/ironic/common/neutron.py index c3ca005dc9..57adc04ed8 100644 --- a/ironic/common/neutron.py +++ b/ironic/common/neutron.py @@ -362,8 +362,27 @@ def add_ports_to_network(task, network_uuid, security_groups=None): update_port_attrs) if CONF.neutron.dhcpv6_stateful_address_count > 1: _add_ip_addresses_for_ipv6_stateful(task.context, port, client) - if is_smart_nic: - wait_for_port_status(client, port.id, 'ACTIVE') + + binding_fail_fatal = False + is_neutron_iface = node.network_interface == 'neutron' + + if is_neutron_iface: + binding_fail_fatal = getattr( + CONF.neutron, 'fail_on_port_binding_failure', False) + + default_failure_behavior = is_smart_nic or binding_fail_fatal + + fail_on_binding_failure = node.driver_info.get( + 'fail_on_binding_failure', default_failure_behavior) + + # NOTE(cid): Only check port status if it's a smart NIC or if we're + # configured to fail on binding failures. This avoids unnecessary + # failures when using network interfaces where binding may fail but + # we want to continue anyway (like in the case of flat networks). + if fail_on_binding_failure: + wait_for_port_status( + client, port.id, 'ACTIVE', + fail_on_binding_failure=fail_on_binding_failure) except openstack_exc.OpenStackCloudException as e: failures.append(ironic_port.uuid) LOG.warning("Could not create neutron port for node's " @@ -1016,12 +1035,15 @@ def wait_for_host_agent(client, host_id, target_state='up'): stop=tenacity.stop_after_attempt(CONF.agent.neutron_agent_max_attempts), wait=tenacity.wait_fixed(CONF.agent.neutron_agent_status_retry_interval), reraise=True) -def wait_for_port_status(client, port_id, status): +def wait_for_port_status(client, port_id, status, + fail_on_binding_failure=None): """Wait for port status to be the desired status :param client: A Neutron client object. :param port_id: Neutron port_id :param status: Port's target status, can be ACTIVE, DOWN ... etc. + :param fail_on_binding_failure: Whether to raise exception if port + binding fails. :returns: boolean indicates that the port status matches the required value passed by param status. :raises: InvalidParameterValue if the port does not exist. @@ -1035,6 +1057,14 @@ def wait_for_port_status(client, port_id, status): {'port_id': port_id, 'status': port.status}) if port.status == status: return True + + # fail early on port binding failure + if port.get('binding:vif_type') == 'binding_failed': + msg = "Binding failed for neutron port %s" % port_id + if fail_on_binding_failure: + LOG.error(msg) + raise openstack_exc.OpenStackCloudException(msg) + LOG.warning(msg) raise exception.NetworkError( 'Port %(port_id)s failed to reach status %(status)s' % { 'port_id': port_id, 'status': status}) diff --git a/ironic/conf/neutron.py b/ironic/conf/neutron.py index 4d9faeb05c..c36ceee4a4 100644 --- a/ironic/conf/neutron.py +++ b/ironic/conf/neutron.py @@ -26,6 +26,10 @@ opts = [ mutable=True, help=_('Delay value to wait for Neutron agents to setup ' 'sufficient DHCP configuration for port.')), + cfg.BoolOpt('fail_on_port_binding_failure', + default=True, + help=_('Whether to fail or continue deployment if neutron ' + 'port binding fails.')), cfg.IntOpt('retries', default=3, mutable=True, diff --git a/ironic/drivers/modules/network/common.py b/ironic/drivers/modules/network/common.py index 9e790d456e..ff5ed85fd7 100644 --- a/ironic/drivers/modules/network/common.py +++ b/ironic/drivers/modules/network/common.py @@ -294,8 +294,27 @@ def plug_port_to_tenant_network(task, port_like_obj, client=None): try: neutron.update_neutron_port(task.context, vif_id, port_attrs) - if is_smart_nic: - neutron.wait_for_port_status(client, vif_id, 'ACTIVE') + + binding_fail_fatal = False + is_neutron_iface = node.network_interface == 'neutron' + + if is_neutron_iface: + binding_fail_fatal = getattr( + CONF.neutron, 'fail_on_port_binding_failure', False) + + default_failure_behavior = is_smart_nic or binding_fail_fatal + + fail_on_binding_failure = node.driver_info.get( + 'fail_on_binding_failure', default_failure_behavior) + + # NOTE(cid): Only check port status if it's a smart NIC or if we're + # configured to fail on binding failures. This avoids unnecessary + # failures when using network interfaces where binding may fail but + # we want to continue anyway (like in the case of flat networks). + if fail_on_binding_failure: + neutron.wait_for_port_status( + client, vif_id, 'ACTIVE', + fail_on_binding_failure=fail_on_binding_failure) except openstack_exc.OpenStackCloudException as e: msg = (_('Could not add public network VIF %(vif)s ' 'to node %(node)s, possible network issue. %(exc)s') % diff --git a/ironic/tests/unit/common/test_neutron.py b/ironic/tests/unit/common/test_neutron.py index ceb6763b2e..bd7ce87516 100644 --- a/ironic/tests/unit/common/test_neutron.py +++ b/ironic/tests/unit/common/test_neutron.py @@ -1026,7 +1026,8 @@ class TestNeutronNetworkActions(db_base.DbTestCase): wait_agent_mock.assert_called_once_with( self.client_mock, 'hostname') wait_port_mock.assert_called_once_with( - self.client_mock, self.neutron_port.id, 'ACTIVE') + self.client_mock, self.neutron_port.id, 'ACTIVE', + fail_on_binding_failure=True) @mock.patch.object(neutron, 'is_smartnic_port', autospec=True) @mock.patch.object(neutron, 'wait_for_host_agent', autospec=True) diff --git a/ironic/tests/unit/drivers/modules/network/test_common.py b/ironic/tests/unit/drivers/modules/network/test_common.py index e4e0d4cd86..e676a4343d 100644 --- a/ironic/tests/unit/drivers/modules/network/test_common.py +++ b/ironic/tests/unit/drivers/modules/network/test_common.py @@ -428,8 +428,11 @@ class TestCommonFunctions(db_base.DbTestCase): task, self.vif_id, {'physnet2'}) @mock.patch.object(neutron_common, 'update_neutron_port', autospec=True) + @mock.patch.object(neutron_common, 'wait_for_port_status', autospec=True) @mock.patch.object(neutron_common, 'get_client', autospec=True) - def test_plug_port_to_tenant_network_client(self, mock_gc, mock_update): + def test_plug_port_to_tenant_network_client(self, mock_gc, + wait_mock_status, + mock_update): self.port.internal_info = {common.TENANT_VIF_KEY: self.vif_id} self.port.save() with task_manager.acquire(self.context, self.node.id) as task: @@ -439,8 +442,11 @@ class TestCommonFunctions(db_base.DbTestCase): self.assertTrue(mock_update.called) @mock.patch.object(neutron_common, 'update_neutron_port', autospec=True) + @mock.patch.object(neutron_common, 'wait_for_port_status', autospec=True) @mock.patch.object(neutron_common, 'get_client', autospec=True) - def test_plug_port_to_tenant_network_no_client(self, mock_gc, mock_update): + def test_plug_port_to_tenant_network_no_client(self, mock_gc, + wait_mock_status, + mock_update): self.port.internal_info = {common.TENANT_VIF_KEY: self.vif_id} self.port.save() with task_manager.acquire(self.context, self.node.id) as task: @@ -480,7 +486,7 @@ class TestCommonFunctions(db_base.DbTestCase): wait_agent_mock.assert_called_once_with( nclient, 'hostname') wait_port_mock.assert_called_once_with( - nclient, self.vif_id, 'ACTIVE') + nclient, self.vif_id, 'ACTIVE', fail_on_binding_failure=True) self.assertTrue(mock_update.called) diff --git a/ironic/tests/unit/drivers/modules/network/test_neutron.py b/ironic/tests/unit/drivers/modules/network/test_neutron.py index d98c76eb59..3b62573c34 100644 --- a/ironic/tests/unit/drivers/modules/network/test_neutron.py +++ b/ironic/tests/unit/drivers/modules/network/test_neutron.py @@ -560,10 +560,12 @@ class NeutronInterfaceTestCase(db_base.DbTestCase): @mock.patch.object(neutron_common, 'wait_for_host_agent', autospec=True) @mock.patch.object(neutron_common, 'update_neutron_port', autospec=True) + @mock.patch.object(neutron_common, 'wait_for_port_status', autospec=True) @mock.patch.object(neutron_common, 'get_client', autospec=True) @mock.patch.object(neutron, 'LOG', autospec=True) def test_configure_tenant_networks_multiple_ports_one_vif_id( - self, log_mock, client_mock, update_mock, wait_agent_mock): + self, log_mock, client_mock, wait_mock_status, update_mock, + wait_agent_mock): expected_attrs = { 'binding:vnic_type': 'baremetal', 'binding:host_id': self.node.uuid, @@ -582,8 +584,10 @@ class NeutronInterfaceTestCase(db_base.DbTestCase): @mock.patch.object(neutron_common, 'wait_for_host_agent', autospec=True) @mock.patch.object(neutron_common, 'update_neutron_port', autospec=True) + @mock.patch.object(neutron_common, 'wait_for_port_status', autospec=True) @mock.patch.object(neutron_common, 'get_client', autospec=True) def test_configure_tenant_networks_update_fail(self, client_mock, + wait_mock_status, update_mock, wait_agent_mock): update_mock.side_effect = openstack_exc.OpenStackCloudException( @@ -594,11 +598,26 @@ class NeutronInterfaceTestCase(db_base.DbTestCase): self.interface.configure_tenant_networks, task) client_mock.assert_called_once_with(context=task.context) + @mock.patch.object(neutron_common, '_get_port_by_uuid', autospec=True) + @mock.patch.object(neutron_common, 'get_client', autospec=True) + def test_configure_tenant_networks_update_binding_fail(self, client_mock, + mock_get_port): + self.config(fail_on_port_binding_failure=True, group='neutron') + port = mock.MagicMock() + port.get.return_value = 'binding_failed' + mock_get_port.return_value = port + + with task_manager.acquire(self.context, self.node.id) as task: + self.assertRaisesRegex( + exception.NetworkError, 'Binding failed', + self.interface.configure_tenant_networks, task) + @mock.patch.object(neutron_common, 'wait_for_host_agent', autospec=True) @mock.patch.object(neutron_common, 'update_neutron_port', autospec=True) + @mock.patch.object(neutron_common, 'wait_for_port_status', autospec=True) @mock.patch.object(neutron_common, 'get_client', autospec=True) - def _test_configure_tenant_networks(self, client_mock, update_mock, - wait_agent_mock, + def _test_configure_tenant_networks(self, client_mock, wait_mock_status, + update_mock, wait_agent_mock, is_client_id=False): # NOTE(TheJulia): Until we have a replacement for infiniband client-id # storage, extra has to stay put. On a plus side, this would be @@ -668,12 +687,13 @@ class NeutronInterfaceTestCase(db_base.DbTestCase): @mock.patch.object(neutron_common, 'get_neutron_port_data', autospec=True) @mock.patch.object(neutron_common, 'wait_for_host_agent', autospec=True) @mock.patch.object(neutron_common, 'update_neutron_port', autospec=True) + @mock.patch.object(neutron_common, 'wait_for_port_status', autospec=True) @mock.patch.object(neutron_common, 'get_client', autospec=True) @mock.patch.object(neutron_common, 'get_local_group_information', autospec=True) def test_configure_tenant_networks_with_portgroups( - self, glgi_mock, client_mock, update_mock, wait_agent_mock, - port_data_mock): + self, glgi_mock, client_mock, wait_mock_status, update_mock, + wait_agent_mock, port_data_mock): pg = utils.create_test_portgroup( self.context, node_id=self.node.id, address='ff:54:00:cf:2d:32', internal_info={'tenant_vif_port_id': uuidutils.generate_uuid()}) @@ -729,12 +749,13 @@ class NeutronInterfaceTestCase(db_base.DbTestCase): @mock.patch.object(neutron_common, 'get_neutron_port_data', autospec=True) @mock.patch.object(neutron_common, 'wait_for_host_agent', autospec=True) @mock.patch.object(neutron_common, 'update_neutron_port', autospec=True) + @mock.patch.object(neutron_common, 'wait_for_port_status', autospec=True) @mock.patch.object(neutron_common, 'get_client', autospec=True) @mock.patch.object(neutron_common, 'get_local_group_information', autospec=True) def test_configure_tenant_networks_with_portgroups_no_address( - self, glgi_mock, client_mock, update_mock, wait_agent_mock, - port_data_mock): + self, glgi_mock, client_mock, wait_mock_status, update_mock, + wait_agent_mock, port_data_mock): pg = utils.create_test_portgroup( self.context, node_id=self.node.id, address=None, internal_info={'tenant_vif_port_id': uuidutils.generate_uuid()}) diff --git a/releasenotes/notes/configure-fail-early-on-port-binding-failure-e4c9749a84bd947f.yaml b/releasenotes/notes/configure-fail-early-on-port-binding-failure-e4c9749a84bd947f.yaml new file mode 100644 index 0000000000..b40de54263 --- /dev/null +++ b/releasenotes/notes/configure-fail-early-on-port-binding-failure-e4c9749a84bd947f.yaml @@ -0,0 +1,10 @@ +--- +features: + - | + Adds a new boolean configuration option ``[neutron]fail_on_port_binding_failure`` + and corresponding node ``driver_info`` setting ``fail_on_binding_failure`` + to control whether deployment should fail or continue if Neutron port + binding fails. With a default of ``true``, if your network is not + configured properly, this will likely cause deployment failures. + To maintain the previous behavior, explicitly set this option to False in + your configuration.