From 84da10cf22b432e8d6ff15b014b90ce0d9b7678f Mon Sep 17 00:00:00 2001 From: Kobi Samoray Date: Thu, 21 Dec 2017 16:36:15 +0200 Subject: [PATCH] NSXv HK: recover broken backup edge appliances When there is a failure during the recycling of an edge appliance to the backup pool, the edge at the backend may still be attached to networks and use IP addresses which are free for reuse by Neutron. Housekeeping job should address such cases. Change-Id: I3a8ba622f742064bdc8906ba745da0a54a4576ac --- doc/source/housekeeper.rst | 3 + setup.cfg | 1 + vmware_nsx/common/config.py | 2 +- .../nsx_v/housekeeper/error_backup_edge.py | 105 ++++++++++++++++++ vmware_nsx/plugins/nsx_v/plugin.py | 2 +- .../plugins/nsx_v/vshield/edge_utils.py | 5 +- .../housekeeper/test_error_backup_edge.py | 82 ++++++++++++++ 7 files changed, 196 insertions(+), 4 deletions(-) create mode 100644 vmware_nsx/plugins/nsx_v/housekeeper/error_backup_edge.py create mode 100644 vmware_nsx/tests/unit/nsx_v/housekeeper/test_error_backup_edge.py diff --git a/doc/source/housekeeper.rst b/doc/source/housekeeper.rst index bcdb9486cc..8576b1fbfd 100644 --- a/doc/source/housekeeper.rst +++ b/doc/source/housekeeper.rst @@ -60,3 +60,6 @@ error_dhcp_edge: scans for DHCP Edge appliances which are in ERROR state. When in non-readonly mode, the job will attempt recovery of the DHCP edges by removing stale elements from the Neutron DB and reconfigure the interfaces at the backend when required. + +error_backup_edge: scans for backup Edge appliances which are in ERROR state. +When in non-readonly mode, the job will reset the Edge appliance configuration. 
diff --git a/setup.cfg b/setup.cfg index 273279206f..ebc909569a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -76,6 +76,7 @@ openstack.nsxclient.v2 = project_plugin_list = vmware_nsx.osc.v2.project_plugin_map:ListProjectPluginMap vmware_nsx.neutron.nsxv.housekeeper.jobs = error_dhcp_edge = vmware_nsx.plugins.nsx_v.housekeeper.error_dhcp_edge:ErrorDhcpEdgeJob + error_backup_edge = vmware_nsx.plugins.nsx_v.housekeeper.error_backup_edge:ErrorBackupEdgeJob [build_sphinx] source-dir = doc/source diff --git a/vmware_nsx/common/config.py b/vmware_nsx/common/config.py index 202a41ffc5..da31be99d7 100644 --- a/vmware_nsx/common/config.py +++ b/vmware_nsx/common/config.py @@ -707,7 +707,7 @@ nsxv_opts = [ help=_("If False, different tenants will not use the same " "DHCP edge or router edge.")), cfg.ListOpt('housekeeping_jobs', - default=['error_dhcp_edge'], + default=['error_dhcp_edge', 'error_backup_edge'], help=_("List of the enabled housekeeping jobs")), cfg.BoolOpt('housekeeping_readonly', default=True, diff --git a/vmware_nsx/plugins/nsx_v/housekeeper/error_backup_edge.py b/vmware_nsx/plugins/nsx_v/housekeeper/error_backup_edge.py new file mode 100644 index 0000000000..df780e7bc9 --- /dev/null +++ b/vmware_nsx/plugins/nsx_v/housekeeper/error_backup_edge.py @@ -0,0 +1,105 @@ +# Copyright 2017 VMware, Inc. +# All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
+ +from neutron_lib import constants +from oslo_log import log +from sqlalchemy.orm import exc as sa_exc + +from vmware_nsx.common import locking +from vmware_nsx.common import nsxv_constants +from vmware_nsx.db import nsxv_db +from vmware_nsx.plugins.common.housekeeper import base_job +from vmware_nsx.plugins.nsx_v import availability_zones as nsx_az +from vmware_nsx.plugins.nsx_v.vshield.common import constants as vcns_const + +LOG = log.getLogger(__name__) + + +class ErrorBackupEdgeJob(base_job.BaseJob): + def __init__(self, readonly): + super(ErrorBackupEdgeJob, self).__init__(readonly) + self.azs = nsx_az.NsxVAvailabilityZones() + + def get_name(self): + return 'error_backup_edge' + + def get_description(self): + return 'revalidate backup Edge appliances in ERROR state' + + def run(self, context): + super(ErrorBackupEdgeJob, self).run(context) + + # Gather ERROR state backup edges into dict + filters = {'status': [constants.ERROR]} + like_filters = {'router_id': vcns_const.BACKUP_ROUTER_PREFIX + "%"} + with locking.LockManager.get_lock('nsx-edge-backup-pool'): + error_edge_bindings = nsxv_db.get_nsxv_router_bindings( + context.session, filters=filters, like_filters=like_filters) + + if not error_edge_bindings: + LOG.debug('Housekeeping: no backup edges in ERROR state detected') + return + + # Keep list of current broken backup edges - as it may change while + # HK is running + for binding in error_edge_bindings: + LOG.warning('Housekeeping: Backup Edge appliance %s is in ERROR' + ' state', binding['edge_id']) + + if not self.readonly: + with locking.LockManager.get_lock(binding['edge_id']): + self._handle_backup_edge(context, binding) + + def _handle_backup_edge(self, context, binding): + dist = (binding['edge_type'] == nsxv_constants.VDR_EDGE) + az = self.azs.get_availability_zone( + binding['availability_zone']) + try: + update_result = self.plugin.nsx_v.update_edge( + context, binding['router_id'], binding['edge_id'], + binding['router_id'], None, + 
appliance_size=binding['appliance_size'], + dist=dist, availability_zone=az) + + if update_result: + nsxv_db.update_nsxv_router_binding( + context.session, binding['router_id'], + status=constants.ACTIVE) + except Exception as e: + LOG.error('Housekeeping: failed to recover Edge ' + 'appliance %s with exception %s', + binding['edge_id'], e) + update_result = False + + if not update_result: + LOG.warning('Housekeeping: failed to recover Edge ' + 'appliance %s, trying to delete', binding['edge_id']) + self._delete_edge(context, binding, dist) + + def _delete_edge(self, context, binding, dist): + try: + nsxv_db.update_nsxv_router_binding( + context.session, binding['router_id'], + status=constants.PENDING_DELETE) + except sa_exc.NoResultFound: + LOG.debug("Housekeeping: Router binding %s does not exist.", + binding['router_id']) + + try: + self.plugin.nsx_v.delete_edge(context, binding['router_id'], + binding['edge_id'], dist=dist) + except Exception as e: + LOG.warning('Housekeeping: Failed to delete edge %s with ' + 'exception %s', binding['edge_id'], e) diff --git a/vmware_nsx/plugins/nsx_v/plugin.py b/vmware_nsx/plugins/nsx_v/plugin.py index 7e884b8eb1..f62293405f 100644 --- a/vmware_nsx/plugins/nsx_v/plugin.py +++ b/vmware_nsx/plugins/nsx_v/plugin.py @@ -355,7 +355,7 @@ class NsxVPluginV2(addr_pair_db.AllowedAddressPairsMixin, self.housekeeper = housekeeper.NsxvHousekeeper( hk_ns='vmware_nsx.neutron.nsxv.housekeeper.jobs', - hk_jobs=['error_dhcp_edge']) + hk_jobs=cfg.CONF.nsxv.housekeeping_jobs) self.init_is_complete = True diff --git a/vmware_nsx/plugins/nsx_v/vshield/edge_utils.py b/vmware_nsx/plugins/nsx_v/vshield/edge_utils.py index 5312a381ac..18f68ea96a 100644 --- a/vmware_nsx/plugins/nsx_v/vshield/edge_utils.py +++ b/vmware_nsx/plugins/nsx_v/vshield/edge_utils.py @@ -319,11 +319,12 @@ class EdgeManager(object): def _clean_all_error_edge_bindings(self, context, availability_zone): # Find all backup edges in error state & # backup edges which are in 
pending-XXX state for too long - filters = {'status': [constants.ERROR, - constants.PENDING_CREATE, + filters = {'status': [constants.PENDING_CREATE, constants.PENDING_UPDATE, constants.PENDING_DELETE], 'availability_zone': [availability_zone.name]} + if cfg.CONF.nsxv.housekeeping_readonly: + filters['status'].append(constants.ERROR) like_filters = {'router_id': vcns_const.BACKUP_ROUTER_PREFIX + "%"} router_bindings = nsxv_db.get_nsxv_router_bindings( context.session, filters=filters, like_filters=like_filters) diff --git a/vmware_nsx/tests/unit/nsx_v/housekeeper/test_error_backup_edge.py b/vmware_nsx/tests/unit/nsx_v/housekeeper/test_error_backup_edge.py new file mode 100644 index 0000000000..beba17ee65 --- /dev/null +++ b/vmware_nsx/tests/unit/nsx_v/housekeeper/test_error_backup_edge.py @@ -0,0 +1,82 @@ +# Copyright 2017 VMware, Inc. +# All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
+ +import mock +from neutron.tests import base +from neutron_lib import constants + +from vmware_nsx.plugins.nsx_v.housekeeper import error_backup_edge + +FAKE_ROUTER_BINDINGS = [ + { + 'router_id': 'backup-3b0b1fe1-c984', 'status': 'ERROR', + 'availability_zone': 'default', 'edge_id': 'edge-782', + 'edge_type': 'service', 'appliance_size': 'compact'}] + + +class ErrorBackupEdgeTestCaseReadOnly(base.BaseTestCase): + def _is_readonly(self): + return True + + def setUp(self): + def get_plugin_mock(alias=constants.CORE): + if alias in (constants.CORE, constants.L3): + return self.plugin + + super(ErrorBackupEdgeTestCaseReadOnly, self).setUp() + self.plugin = mock.Mock() + self.context = mock.Mock() + self.context.session = mock.Mock() + mock.patch('neutron_lib.plugins.directory.get_plugin', + side_effect=get_plugin_mock).start() + self.log = mock.Mock() + error_backup_edge.LOG = self.log + self.job = error_backup_edge.ErrorBackupEdgeJob(self._is_readonly()) + + def test_clean_run(self): + mock.patch('vmware_nsx.db.nsxv_db.get_nsxv_router_bindings', + return_value=[]).start() + self.job.run(self.context) + self.log.warning.assert_not_called() + + def test_broken_backup_edge(self): + mock.patch('vmware_nsx.db.nsxv_db.get_nsxv_router_bindings', + return_value=FAKE_ROUTER_BINDINGS).start() + + self.job.run(self.context) + self.log.warning.assert_called_once() + + +class ErrorBackupEdgeTestCaseReadWrite(ErrorBackupEdgeTestCaseReadOnly): + def _is_readonly(self): + return False + + def test_broken_backup_edge(self): + upd_binding = mock.patch( + 'vmware_nsx.db.nsxv_db.update_nsxv_router_binding').start() + upd_edge = mock.patch.object(self.plugin.nsx_v, 'update_edge').start() + self.job.azs = mock.Mock() + az = mock.Mock() + mock.patch.object(self.job.azs, 'get_availability_zone', + return_value=az).start() + super(ErrorBackupEdgeTestCaseReadWrite, self + ).test_broken_backup_edge() + upd_binding.assert_has_calls( + [mock.call(mock.ANY, r['router_id'], status='ACTIVE') + 
for r in FAKE_ROUTER_BINDINGS]) + upd_edge.assert_called_with( + self.context, 'backup-3b0b1fe1-c984', 'edge-782', + 'backup-3b0b1fe1-c984', None, appliance_size='compact', + availability_zone=az, dist=False)