From 472d84d25cee0694500e583845718a4f377cc75c Mon Sep 17 00:00:00 2001 From: LIU Yulong Date: Mon, 11 Jan 2016 12:02:55 +0800 Subject: [PATCH] Catch PortNotFound after HA router race condition When the neutron server deletes all the resources of an HA router, the L3 agents are not immediately aware of it, so a race can happen in a sequence like this: 1. Neutron server deletes all resources of an HA router. 2. The deletion RPC is fanned out to L3 agent 1, on which the HA router was in master state. 3. On L3 agent 2, the 'backup' router sets itself to master and sends the neutron server an HA router state change notification. 4. PortNotFound is raised while updating the router HA port status. How do steps 2 and 3 happen? Consider that L3 agent 2 has many more HA routers than L3 agent 1, or any other reason that causes L3 agent 2 to get/process the deletion RPC later than L3 agent 1. L3 agent 1 then removes the HA router's keepalived process, which will soon be detected by the backup router on L3 agent 2 via the VRRP protocol. At that moment the router deletion RPC is still in the RouterUpdate queue, or in some step of the HA router deletion procedure, on L3 agent 2, so its router_info still contains the router. So L3 agent 2 will run the state change procedure, i.e. notify the neutron server to update the router state. This patch mainly deals with the race by catching the PortNotFound exception on the neutron-server side. 
Change-Id: I34d7347595bfceb8a70685672a6287e1a44ede6b Closes-Bug: #1533454 Related-Bug: #1523780 --- neutron/db/l3_hamode_db.py | 3 ++- neutron/tests/unit/db/test_l3_hamode_db.py | 24 ++++++++++++++++++++++ 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/neutron/db/l3_hamode_db.py b/neutron/db/l3_hamode_db.py index 5dc92a8233f..e32616381a8 100644 --- a/neutron/db/l3_hamode_db.py +++ b/neutron/db/l3_hamode_db.py @@ -687,6 +687,7 @@ class L3_HA_NAT_db_mixin(l3_dvr_db.L3_NAT_with_dvr_db_mixin, try: self._core_plugin.update_port(admin_ctx, port['id'], {attributes.PORT: port}) - except (orm.exc.StaleDataError, orm.exc.ObjectDeletedError): + except (orm.exc.StaleDataError, orm.exc.ObjectDeletedError, + n_exc.PortNotFound): # Take concurrently deleted interfaces in to account pass diff --git a/neutron/tests/unit/db/test_l3_hamode_db.py b/neutron/tests/unit/db/test_l3_hamode_db.py index c1902645004..15d899c39b9 100644 --- a/neutron/tests/unit/db/test_l3_hamode_db.py +++ b/neutron/tests/unit/db/test_l3_hamode_db.py @@ -642,6 +642,19 @@ class L3HATestCase(L3HATestFramework): self.admin_ctx, self.agent1['host'], self.agent1) self.assertEqual('active', routers[0][constants.HA_ROUTER_STATE_KEY]) + def test_update_routers_states_port_not_found(self): + router1 = self._create_router() + self._bind_router(router1['id']) + port = {'id': 'foo', 'device_id': router1['id']} + with mock.patch.object(self.core_plugin, 'get_ports', + return_value=[port]): + with mock.patch.object( + self.core_plugin, 'update_port', + side_effect=n_exc.PortNotFound(port_id='foo')): + states = {router1['id']: 'active'} + self.plugin.update_routers_states( + self.admin_ctx, states, self.agent1['host']) + def test_exclude_dvr_agents_for_ha_candidates(self): """Test dvr agents configured with "dvr" only, as opposed to "dvr_snat", are excluded. 
@@ -774,6 +787,17 @@ class L3HATestCase(L3HATestFramework): self.assertNotIn('HA network tenant %s' % tenant_id, nets_after) + def test_update_port_status_port_bingding_deleted_concurrently(self): + router1 = self._create_router() + self._bind_router(router1['id']) + states = {router1['id']: 'active'} + with mock.patch.object(self.plugin, 'get_ha_router_port_bindings'): + (self.admin_ctx.session.query( + l3_hamode_db.L3HARouterAgentPortBinding). + filter_by(router_id=router1['id']).delete()) + self.plugin.update_routers_states( + self.admin_ctx, states, self.agent1['host']) + class L3HAModeDbTestCase(L3HATestFramework):