NSXv HK: recover broken backup edge appliances

When there is a failure during the recycling of an edge appliance to the
backup pool, the edge at the backend may still be attached to networks
and use IP addresses which are free for reuse by Neutron.
Housekeeping job should address such cases.

Change-Id: I3a8ba622f742064bdc8906ba745da0a54a4576ac
This commit is contained in:
Kobi Samoray 2017-12-21 16:36:15 +02:00
parent b2d3abc66a
commit 84da10cf22
7 changed files with 196 additions and 4 deletions

View File

@ -60,3 +60,6 @@ error_dhcp_edge: scans for DHCP Edge appliances which are in ERROR state.
When in non-readonly mode, the job will attempt recovery of the DHCP edges by
removing stale elements from the Neutron DB and reconfigure the interfaces at
the backend when required.
error_backup_edge: scans for backup Edge appliances which are in ERROR state.
When in non-readonly mode, the job will reset the Edge appliance configuration.

View File

@ -76,6 +76,7 @@ openstack.nsxclient.v2 =
project_plugin_list = vmware_nsx.osc.v2.project_plugin_map:ListProjectPluginMap project_plugin_list = vmware_nsx.osc.v2.project_plugin_map:ListProjectPluginMap
vmware_nsx.neutron.nsxv.housekeeper.jobs = vmware_nsx.neutron.nsxv.housekeeper.jobs =
error_dhcp_edge = vmware_nsx.plugins.nsx_v.housekeeper.error_dhcp_edge:ErrorDhcpEdgeJob error_dhcp_edge = vmware_nsx.plugins.nsx_v.housekeeper.error_dhcp_edge:ErrorDhcpEdgeJob
error_backup_edge = vmware_nsx.plugins.nsx_v.housekeeper.error_backup_edge:ErrorBackupEdgeJob
[build_sphinx] [build_sphinx]
source-dir = doc/source source-dir = doc/source

View File

@ -707,7 +707,7 @@ nsxv_opts = [
help=_("If False, different tenants will not use the same " help=_("If False, different tenants will not use the same "
"DHCP edge or router edge.")), "DHCP edge or router edge.")),
cfg.ListOpt('housekeeping_jobs', cfg.ListOpt('housekeeping_jobs',
default=['error_dhcp_edge'], default=['error_dhcp_edge', 'error_backup_edge'],
help=_("List of the enabled housekeeping jobs")), help=_("List of the enabled housekeeping jobs")),
cfg.BoolOpt('housekeeping_readonly', cfg.BoolOpt('housekeeping_readonly',
default=True, default=True,

View File

@ -0,0 +1,105 @@
# Copyright 2017 VMware, Inc.
# All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
from neutron_lib import constants
from oslo_log import log
from sqlalchemy.orm import exc as sa_exc
from vmware_nsx.common import locking
from vmware_nsx.common import nsxv_constants
from vmware_nsx.db import nsxv_db
from vmware_nsx.plugins.common.housekeeper import base_job
from vmware_nsx.plugins.nsx_v import availability_zones as nsx_az
from vmware_nsx.plugins.nsx_v.vshield.common import constants as vcns_const
# Module-level logger, named after this module; used by the housekeeping job
# below for all of its debug/warning/error reporting.
LOG = log.getLogger(__name__)
class ErrorBackupEdgeJob(base_job.BaseJob):
    """Housekeeping job for backup Edge appliances stuck in ERROR state.

    The job looks up router bindings whose status is ERROR and whose router
    id starts with the backup router prefix. Every such binding is reported
    with a warning. Unless the job is read-only, it then tries to
    reconfigure the edge via ``update_edge`` and, if that does not succeed,
    falls back to deleting the edge.
    """

    def __init__(self, readonly):
        """Create the job.

        :param readonly: forwarded to the base job; when true, ``run`` only
                         reports broken edges and does not modify them.
        """
        super(ErrorBackupEdgeJob, self).__init__(readonly)
        self.azs = nsx_az.NsxVAvailabilityZones()

    def get_name(self):
        """Return the name under which this job is registered."""
        return 'error_backup_edge'

    def get_description(self):
        """Return a short human readable description of the job."""
        return 'revalidate backup Edge appliances in ERROR state'

    def run(self, context):
        """Report, and optionally recover, backup edges in ERROR state.

        :param context: request context; its ``session`` is used for the
                        DB queries and it is passed on to the edge calls.
        """
        super(ErrorBackupEdgeJob, self).run(context)

        status_filter = {'status': [constants.ERROR]}
        backup_id_filter = {
            'router_id': vcns_const.BACKUP_ROUTER_PREFIX + "%"}

        # The backup pool lock is held only while the bindings are read.
        # The result is a snapshot: the pool may change while the job is
        # still working through it.
        with locking.LockManager.get_lock('nsx-edge-backup-pool'):
            broken_bindings = nsxv_db.get_nsxv_router_bindings(
                context.session,
                filters=status_filter,
                like_filters=backup_id_filter)

        if not broken_bindings:
            LOG.debug('Housekeeping: no backup edges in ERROR state detected')
            return

        for binding in broken_bindings:
            LOG.warning('Housekeeping: Backup Edge appliance %s is in ERROR'
                        ' state', binding['edge_id'])

            if self.readonly:
                # Read-only mode: reporting is all we are allowed to do.
                continue

            # Serialize with any other operation on the same edge.
            with locking.LockManager.get_lock(binding['edge_id']):
                self._handle_backup_edge(context, binding)

    def _handle_backup_edge(self, context, binding):
        """Try to recover one broken backup edge, deleting it on failure.

        :param context: request context.
        :param binding: router binding of the broken backup edge.
        """
        is_distributed = (binding['edge_type'] == nsxv_constants.VDR_EDGE)
        zone = self.azs.get_availability_zone(binding['availability_zone'])

        try:
            recovered = self.plugin.nsx_v.update_edge(
                context, binding['router_id'], binding['edge_id'],
                binding['router_id'], None,
                appliance_size=binding['appliance_size'],
                dist=is_distributed, availability_zone=zone)

            if recovered:
                nsxv_db.update_nsxv_router_binding(
                    context.session, binding['router_id'],
                    status=constants.ACTIVE)
        except Exception as err:
            # Any failure here, including one while marking the binding
            # ACTIVE, is treated as an unsuccessful recovery.
            LOG.error('Housekeeping: failed to recover Edge '
                      'appliance %s with exception %s',
                      binding['edge_id'], err)
            recovered = False

        if recovered:
            return

        LOG.warning('Housekeeping: failed to recover Edge '
                    'appliance %s, trying to delete', binding['edge_id'])
        self._delete_edge(context, binding, is_distributed)

    def _delete_edge(self, context, binding, dist):
        """Mark the binding PENDING_DELETE and delete the edge.

        Both steps are best effort: a missing binding is logged at debug
        level and a failed edge deletion is logged as a warning.

        :param context: request context.
        :param binding: router binding of the edge to delete.
        :param dist: True when the edge is a distributed (VDR) edge.
        """
        try:
            nsxv_db.update_nsxv_router_binding(
                context.session, binding['router_id'],
                status=constants.PENDING_DELETE)
        except sa_exc.NoResultFound:
            LOG.debug("Housekeeping: Router binding %s does not exist.",
                      binding['router_id'])

        try:
            self.plugin.nsx_v.delete_edge(
                context, binding['router_id'], binding['edge_id'],
                dist=dist)
        except Exception as err:
            LOG.warning('Housekeeping: Failed to delete edge %s with '
                        'exception %s', binding['edge_id'], err)

View File

@ -355,7 +355,7 @@ class NsxVPluginV2(addr_pair_db.AllowedAddressPairsMixin,
self.housekeeper = housekeeper.NsxvHousekeeper( self.housekeeper = housekeeper.NsxvHousekeeper(
hk_ns='vmware_nsx.neutron.nsxv.housekeeper.jobs', hk_ns='vmware_nsx.neutron.nsxv.housekeeper.jobs',
hk_jobs=['error_dhcp_edge']) hk_jobs=cfg.CONF.nsxv.housekeeping_jobs)
self.init_is_complete = True self.init_is_complete = True

View File

@ -319,11 +319,12 @@ class EdgeManager(object):
def _clean_all_error_edge_bindings(self, context, availability_zone): def _clean_all_error_edge_bindings(self, context, availability_zone):
# Find all backup edges in error state & # Find all backup edges in error state &
# backup edges which are in pending-XXX state for too long # backup edges which are in pending-XXX state for too long
filters = {'status': [constants.ERROR, filters = {'status': [constants.PENDING_CREATE,
constants.PENDING_CREATE,
constants.PENDING_UPDATE, constants.PENDING_UPDATE,
constants.PENDING_DELETE], constants.PENDING_DELETE],
'availability_zone': [availability_zone.name]} 'availability_zone': [availability_zone.name]}
if cfg.CONF.nsxv.housekeeping_readonly:
filters['status'].append(constants.ERROR)
like_filters = {'router_id': vcns_const.BACKUP_ROUTER_PREFIX + "%"} like_filters = {'router_id': vcns_const.BACKUP_ROUTER_PREFIX + "%"}
router_bindings = nsxv_db.get_nsxv_router_bindings( router_bindings = nsxv_db.get_nsxv_router_bindings(
context.session, filters=filters, like_filters=like_filters) context.session, filters=filters, like_filters=like_filters)

View File

@ -0,0 +1,82 @@
# Copyright 2017 VMware, Inc.
# All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import mock
from neutron.tests import base
from neutron_lib import constants
from vmware_nsx.plugins.nsx_v.housekeeper import error_backup_edge
# One fake router binding row, shaped like the rows the job reads from the
# DB: a backup-pool service edge that is in ERROR state.
FAKE_ROUTER_BINDINGS = [
    {
        'router_id': 'backup-3b0b1fe1-c984',
        'status': 'ERROR',
        'availability_zone': 'default',
        'edge_id': 'edge-782',
        'edge_type': 'service',
        'appliance_size': 'compact',
    },
]
class ErrorBackupEdgeTestCaseReadOnly(base.BaseTestCase):
    """Exercise ErrorBackupEdgeJob in read-only mode.

    Subclasses override ``_is_readonly`` to run the same scenarios against
    a job created with a different ``readonly`` flag.
    """

    def _is_readonly(self):
        """Return the ``readonly`` flag the job under test is created with."""
        return True

    def setUp(self):
        def get_plugin_mock(alias=constants.CORE):
            # Serve the same mock plugin for both the core and L3 aliases;
            # any other alias yields None.
            if alias in (constants.CORE, constants.L3):
                return self.plugin

        super(ErrorBackupEdgeTestCaseReadOnly, self).setUp()
        self.plugin = mock.Mock()
        self.context = mock.Mock()
        self.context.session = mock.Mock()
        mock.patch('neutron_lib.plugins.directory.get_plugin',
                   side_effect=get_plugin_mock).start()

        # Swap the module logger through a patcher rather than by plain
        # assignment, so the mock does not stay installed in
        # error_backup_edge after this test and leak into other tests.
        # NOTE(review): like the get_plugin patch above, this relies on the
        # base test case stopping all started patchers - confirm.
        self.log = mock.Mock()
        mock.patch.object(error_backup_edge, 'LOG', self.log).start()

        self.job = error_backup_edge.ErrorBackupEdgeJob(self._is_readonly())

    def test_clean_run(self):
        """No ERROR backup edges in the DB: the job logs no warning."""
        mock.patch('vmware_nsx.db.nsxv_db.get_nsxv_router_bindings',
                   return_value=[]).start()
        self.job.run(self.context)
        self.log.warning.assert_not_called()

    def test_broken_backup_edge(self):
        """One ERROR backup edge: the job logs exactly one warning."""
        mock.patch('vmware_nsx.db.nsxv_db.get_nsxv_router_bindings',
                   return_value=FAKE_ROUTER_BINDINGS).start()
        self.job.run(self.context)
        self.log.warning.assert_called_once()
class ErrorBackupEdgeTestCaseReadWrite(ErrorBackupEdgeTestCaseReadOnly):
    """Run the read-only scenarios against a job that may modify edges."""

    def _is_readonly(self):
        """Create the job under test with ``readonly`` set to False."""
        return False

    def test_broken_backup_edge(self):
        """A broken backup edge is reconfigured and its binding set ACTIVE."""
        binding_update = mock.patch(
            'vmware_nsx.db.nsxv_db.update_nsxv_router_binding').start()
        edge_update = mock.patch.object(
            self.plugin.nsx_v, 'update_edge').start()

        # Replace the availability zone lookup so the job receives a known
        # object that can be matched in the update_edge assertion below.
        fake_az = mock.Mock()
        self.job.azs = mock.Mock()
        mock.patch.object(self.job.azs, 'get_availability_zone',
                          return_value=fake_az).start()

        # The parent scenario runs the job and checks that a single warning
        # was logged for the broken edge.
        super(ErrorBackupEdgeTestCaseReadWrite,
              self).test_broken_backup_edge()

        expected_binding_calls = [
            mock.call(mock.ANY, binding['router_id'], status='ACTIVE')
            for binding in FAKE_ROUTER_BINDINGS]
        binding_update.assert_has_calls(expected_binding_calls)

        edge_update.assert_called_with(
            self.context, 'backup-3b0b1fe1-c984', 'edge-782',
            'backup-3b0b1fe1-c984', None, appliance_size='compact',
            availability_zone=fake_az, dist=False)