Catch PortNotFound after HA router race condition
When the neutron server deletes all the resources of an HA router, the L3 agents are not immediately aware of it, so a race can happen in a procedure like this: 1. Neutron server deletes all resources of an HA router. 2. The RPC fanout reaches L3 agent 1, on which the HA router was in master state. 3. On L3 agent 2, the 'backup' router sets itself to master and sends the neutron server an HA router state change notification. 4. PortNotFound is raised while updating the router HA port status. How do steps 2 and 3 happen? Consider that L3 agent 2 hosts many more HA routers than L3 agent 1, or any other reason that causes L3 agent 2 to receive/process the delete RPC later than L3 agent 1. When L3 agent 1 removes the HA router's keepalived process, this is soon detected by the backup router on L3 agent 2 via the VRRP protocol. At that moment the router delete RPC is still in the RouterUpdate queue, or at some intermediate step of the HA router deletion procedure, so router_info still holds the router's info. L3 agent 2 will therefore perform the state change procedure, i.e. notify the neutron server to update the router state. This patch mainly deals with the race by catching the PortNotFound exception on the neutron-server side. Change-Id: I34d7347595bfceb8a70685672a6287e1a44ede6b Closes-Bug: #1533454 Related-Bug: #1523780
This commit is contained in:
parent
3fb153b15b
commit
472d84d25c
@ -687,6 +687,7 @@ class L3_HA_NAT_db_mixin(l3_dvr_db.L3_NAT_with_dvr_db_mixin,
|
||||
try:
|
||||
self._core_plugin.update_port(admin_ctx, port['id'],
|
||||
{attributes.PORT: port})
|
||||
except (orm.exc.StaleDataError, orm.exc.ObjectDeletedError):
|
||||
except (orm.exc.StaleDataError, orm.exc.ObjectDeletedError,
|
||||
n_exc.PortNotFound):
|
||||
# Take concurrently deleted interfaces in to account
|
||||
pass
|
||||
|
@ -642,6 +642,19 @@ class L3HATestCase(L3HATestFramework):
|
||||
self.admin_ctx, self.agent1['host'], self.agent1)
|
||||
self.assertEqual('active', routers[0][constants.HA_ROUTER_STATE_KEY])
|
||||
|
||||
def test_update_routers_states_port_not_found(self):
|
||||
router1 = self._create_router()
|
||||
self._bind_router(router1['id'])
|
||||
port = {'id': 'foo', 'device_id': router1['id']}
|
||||
with mock.patch.object(self.core_plugin, 'get_ports',
|
||||
return_value=[port]):
|
||||
with mock.patch.object(
|
||||
self.core_plugin, 'update_port',
|
||||
side_effect=n_exc.PortNotFound(port_id='foo')):
|
||||
states = {router1['id']: 'active'}
|
||||
self.plugin.update_routers_states(
|
||||
self.admin_ctx, states, self.agent1['host'])
|
||||
|
||||
def test_exclude_dvr_agents_for_ha_candidates(self):
|
||||
"""Test dvr agents configured with "dvr" only, as opposed to "dvr_snat",
|
||||
are excluded.
|
||||
@ -774,6 +787,17 @@ class L3HATestCase(L3HATestFramework):
|
||||
self.assertNotIn('HA network tenant %s' % tenant_id,
|
||||
nets_after)
|
||||
|
||||
def test_update_port_status_port_bingding_deleted_concurrently(self):
|
||||
router1 = self._create_router()
|
||||
self._bind_router(router1['id'])
|
||||
states = {router1['id']: 'active'}
|
||||
with mock.patch.object(self.plugin, 'get_ha_router_port_bindings'):
|
||||
(self.admin_ctx.session.query(
|
||||
l3_hamode_db.L3HARouterAgentPortBinding).
|
||||
filter_by(router_id=router1['id']).delete())
|
||||
self.plugin.update_routers_states(
|
||||
self.admin_ctx, states, self.agent1['host'])
|
||||
|
||||
|
||||
class L3HAModeDbTestCase(L3HATestFramework):
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user