NSXv HK: recover broken backup edge appliances
When there is a failure during the recycling of an edge appliance to the backup pool, the edge at the backend may still be attached to networks and use IP addresses which are free for reuse by Neutron. The housekeeping job should address such cases. Change-Id: I3a8ba622f742064bdc8906ba745da0a54a4576ac
This commit is contained in:
parent
b2d3abc66a
commit
84da10cf22
@ -60,3 +60,6 @@ error_dhcp_edge: scans for DHCP Edge appliances which are in ERROR state.
|
||||
When in non-readonly mode, the job will attempt recovery of the DHCP edges by
|
||||
removing stale elements from the Neutron DB and reconfiguring the interfaces at
|
||||
the backend when required.
|
||||
|
||||
error_backup_edge: scans for backup Edge appliances which are in ERROR state.
|
||||
When in non-readonly mode, the job will reset the Edge appliance configuration.
|
||||
|
@ -76,6 +76,7 @@ openstack.nsxclient.v2 =
|
||||
project_plugin_list = vmware_nsx.osc.v2.project_plugin_map:ListProjectPluginMap
|
||||
vmware_nsx.neutron.nsxv.housekeeper.jobs =
|
||||
error_dhcp_edge = vmware_nsx.plugins.nsx_v.housekeeper.error_dhcp_edge:ErrorDhcpEdgeJob
|
||||
error_backup_edge = vmware_nsx.plugins.nsx_v.housekeeper.error_backup_edge:ErrorBackupEdgeJob
|
||||
|
||||
[build_sphinx]
|
||||
source-dir = doc/source
|
||||
|
@ -707,7 +707,7 @@ nsxv_opts = [
|
||||
help=_("If False, different tenants will not use the same "
|
||||
"DHCP edge or router edge.")),
|
||||
cfg.ListOpt('housekeeping_jobs',
|
||||
default=['error_dhcp_edge'],
|
||||
default=['error_dhcp_edge', 'error_backup_edge'],
|
||||
help=_("List of the enabled housekeeping jobs")),
|
||||
cfg.BoolOpt('housekeeping_readonly',
|
||||
default=True,
|
||||
|
105
vmware_nsx/plugins/nsx_v/housekeeper/error_backup_edge.py
Normal file
105
vmware_nsx/plugins/nsx_v/housekeeper/error_backup_edge.py
Normal file
@ -0,0 +1,105 @@
|
||||
# Copyright 2017 VMware, Inc.
|
||||
# All Rights Reserved
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
# not use this file except in compliance with the License. You may obtain
|
||||
# a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
# License for the specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
from neutron_lib import constants
|
||||
from oslo_log import log
|
||||
from sqlalchemy.orm import exc as sa_exc
|
||||
|
||||
from vmware_nsx.common import locking
|
||||
from vmware_nsx.common import nsxv_constants
|
||||
from vmware_nsx.db import nsxv_db
|
||||
from vmware_nsx.plugins.common.housekeeper import base_job
|
||||
from vmware_nsx.plugins.nsx_v import availability_zones as nsx_az
|
||||
from vmware_nsx.plugins.nsx_v.vshield.common import constants as vcns_const
|
||||
|
||||
LOG = log.getLogger(__name__)
|
||||
|
||||
|
||||
class ErrorBackupEdgeJob(base_job.BaseJob):
    """Housekeeping job for backup Edge appliances stuck in ERROR state.

    The job looks up router bindings whose status is ERROR and whose
    router_id matches the backup router prefix, and logs a warning for
    each of them. When the job is not read-only it additionally tries to
    reconfigure every such edge via ``update_edge``; if that does not
    succeed, the edge is deleted.

    NOTE(review): ``self.readonly`` and ``self.plugin`` are not set in this
    class - presumably they are provided by ``base_job.BaseJob``; confirm.
    """

    def __init__(self, readonly):
        super(ErrorBackupEdgeJob, self).__init__(readonly)
        self.azs = nsx_az.NsxVAvailabilityZones()

    def get_name(self):
        """Return the name identifying this housekeeping job."""
        return 'error_backup_edge'

    def get_description(self):
        """Return a short human readable description of the job."""
        return 'revalidate backup Edge appliances in ERROR state'

    def run(self, context):
        """Report, and optionally recover, backup edges in ERROR state.

        :param context: request context; its ``session`` is used for the
                        DB queries.
        """
        super(ErrorBackupEdgeJob, self).run(context)

        status_filter = {'status': [constants.ERROR]}
        prefix_filter = {
            'router_id': vcns_const.BACKUP_ROUTER_PREFIX + "%"}

        # Snapshot the broken backup edges while holding the backup pool
        # lock; the set of bindings may change while housekeeping runs.
        with locking.LockManager.get_lock('nsx-edge-backup-pool'):
            broken_bindings = nsxv_db.get_nsxv_router_bindings(
                context.session,
                filters=status_filter,
                like_filters=prefix_filter)

        if not broken_bindings:
            LOG.debug('Housekeeping: no backup edges in ERROR state detected')
            return

        for broken in broken_bindings:
            LOG.warning('Housekeeping: Backup Edge appliance %s is in ERROR'
                        ' state', broken['edge_id'])

            if self.readonly:
                # Read-only mode only reports the problem.
                continue

            with locking.LockManager.get_lock(broken['edge_id']):
                self._handle_backup_edge(context, broken)

    def _handle_backup_edge(self, context, binding):
        """Try to reconfigure a broken backup edge; delete it on failure.

        :param context: request context used for DB and backend calls.
        :param binding: router binding of the broken backup edge.
        """
        is_vdr = (binding['edge_type'] == nsxv_constants.VDR_EDGE)
        zone = self.azs.get_availability_zone(binding['availability_zone'])

        try:
            recovered = self.plugin.nsx_v.update_edge(
                context, binding['router_id'], binding['edge_id'],
                binding['router_id'], None,
                appliance_size=binding['appliance_size'],
                dist=is_vdr, availability_zone=zone)

            # Mark the binding ACTIVE only when the backend update
            # reported success. A failure of this DB update is handled
            # like a failed recovery.
            if recovered:
                nsxv_db.update_nsxv_router_binding(
                    context.session, binding['router_id'],
                    status=constants.ACTIVE)
        except Exception as err:
            LOG.error('Housekeeping: failed to recover Edge '
                      'appliance %s with exception %s',
                      binding['edge_id'], err)
            recovered = False

        if recovered:
            return

        LOG.warning('Housekeeping: failed to recover Edge '
                    'appliance %s, trying to delete', binding['edge_id'])
        self._delete_edge(context, binding, is_vdr)

    def _delete_edge(self, context, binding, dist):
        """Flag the binding as PENDING_DELETE and delete the edge.

        Both steps are best effort: a missing binding row and a failing
        backend delete are logged and otherwise ignored.

        :param context: request context used for DB and backend calls.
        :param binding: router binding of the edge to delete.
        :param dist: True when the edge is a distributed (VDR) edge.
        """
        try:
            nsxv_db.update_nsxv_router_binding(
                context.session, binding['router_id'],
                status=constants.PENDING_DELETE)
        except sa_exc.NoResultFound:
            LOG.debug("Housekeeping: Router binding %s does not exist.",
                      binding['router_id'])

        try:
            self.plugin.nsx_v.delete_edge(
                context, binding['router_id'], binding['edge_id'],
                dist=dist)
        except Exception as err:
            LOG.warning('Housekeeping: Failed to delete edge %s with '
                        'exception %s', binding['edge_id'], err)
|
@ -355,7 +355,7 @@ class NsxVPluginV2(addr_pair_db.AllowedAddressPairsMixin,
|
||||
|
||||
self.housekeeper = housekeeper.NsxvHousekeeper(
|
||||
hk_ns='vmware_nsx.neutron.nsxv.housekeeper.jobs',
|
||||
hk_jobs=['error_dhcp_edge'])
|
||||
hk_jobs=cfg.CONF.nsxv.housekeeping_jobs)
|
||||
|
||||
self.init_is_complete = True
|
||||
|
||||
|
@ -319,11 +319,12 @@ class EdgeManager(object):
|
||||
def _clean_all_error_edge_bindings(self, context, availability_zone):
|
||||
# Find all backup edges in error state &
|
||||
# backup edges which are in pending-XXX state for too long
|
||||
filters = {'status': [constants.ERROR,
|
||||
constants.PENDING_CREATE,
|
||||
filters = {'status': [constants.PENDING_CREATE,
|
||||
constants.PENDING_UPDATE,
|
||||
constants.PENDING_DELETE],
|
||||
'availability_zone': [availability_zone.name]}
|
||||
if cfg.CONF.nsxv.housekeeping_readonly:
|
||||
filters['status'].append(constants.ERROR)
|
||||
like_filters = {'router_id': vcns_const.BACKUP_ROUTER_PREFIX + "%"}
|
||||
router_bindings = nsxv_db.get_nsxv_router_bindings(
|
||||
context.session, filters=filters, like_filters=like_filters)
|
||||
|
@ -0,0 +1,82 @@
|
||||
# Copyright 2017 VMware, Inc.
|
||||
# All Rights Reserved
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
# not use this file except in compliance with the License. You may obtain
|
||||
# a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
# License for the specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
import mock
|
||||
from neutron.tests import base
|
||||
from neutron_lib import constants
|
||||
|
||||
from vmware_nsx.plugins.nsx_v.housekeeper import error_backup_edge
|
||||
|
||||
# Single backup edge binding in ERROR state, used as the canned result of
# the router bindings DB query in the tests below.
FAKE_ROUTER_BINDINGS = [
    {
        'router_id': 'backup-3b0b1fe1-c984',
        'status': 'ERROR',
        'availability_zone': 'default',
        'edge_id': 'edge-782',
        'edge_type': 'service',
        'appliance_size': 'compact',
    },
]
|
||||
|
||||
|
||||
class ErrorBackupEdgeTestCaseReadOnly(base.BaseTestCase):
    """Run ErrorBackupEdgeJob in read-only mode against mocked DB results.

    Subclasses may override ``_is_readonly`` to run the same scenarios
    with a job that is allowed to modify state.
    """

    def _is_readonly(self):
        """Return the readonly flag the job under test is created with."""
        return True

    def setUp(self):
        def get_plugin_mock(alias=constants.CORE):
            # Serve the same mocked plugin as both core and L3 plugin;
            # any other alias yields None.
            if alias in (constants.CORE, constants.L3):
                return self.plugin

        super(ErrorBackupEdgeTestCaseReadOnly, self).setUp()
        self.plugin = mock.Mock()
        self.context = mock.Mock()
        self.context.session = mock.Mock()
        mock.patch('neutron_lib.plugins.directory.get_plugin',
                   side_effect=get_plugin_mock).start()

        # Replace the module logger through a patcher instead of a plain
        # assignment, so the real LOG is restored once the test is over
        # and the Mock does not leak into the module.
        self.log = mock.Mock()
        log_patcher = mock.patch.object(error_backup_edge, 'LOG', self.log)
        log_patcher.start()
        self.addCleanup(log_patcher.stop)

        self.job = error_backup_edge.ErrorBackupEdgeJob(self._is_readonly())

    def test_clean_run(self):
        """No broken backup edges: the job must not log any warning."""
        mock.patch('vmware_nsx.db.nsxv_db.get_nsxv_router_bindings',
                   return_value=[]).start()
        self.job.run(self.context)
        self.log.warning.assert_not_called()

    def test_broken_backup_edge(self):
        """One broken backup edge: exactly one warning is logged."""
        mock.patch('vmware_nsx.db.nsxv_db.get_nsxv_router_bindings',
                   return_value=FAKE_ROUTER_BINDINGS).start()

        self.job.run(self.context)
        self.log.warning.assert_called_once()
|
||||
|
||||
|
||||
class ErrorBackupEdgeTestCaseReadWrite(ErrorBackupEdgeTestCaseReadOnly):
    """Re-run the read-only scenarios with a job that may modify state."""

    def _is_readonly(self):
        """Create the job under test in read-write mode."""
        return False

    def test_broken_backup_edge(self):
        """A broken backup edge is reconfigured and set back to ACTIVE."""
        binding_update = mock.patch(
            'vmware_nsx.db.nsxv_db.update_nsxv_router_binding').start()
        edge_update = mock.patch.object(
            self.plugin.nsx_v, 'update_edge').start()

        # Swap the availability zones helper for a mock returning a
        # known zone object, so it can be matched in the call below.
        self.job.azs = mock.Mock()
        fake_az = mock.Mock()
        mock.patch.object(self.job.azs, 'get_availability_zone',
                          return_value=fake_az).start()

        # The parent scenario runs the job and checks the warning count.
        super(ErrorBackupEdgeTestCaseReadWrite,
              self).test_broken_backup_edge()

        expected_binding_calls = [
            mock.call(mock.ANY, fake['router_id'], status='ACTIVE')
            for fake in FAKE_ROUTER_BINDINGS]
        binding_update.assert_has_calls(expected_binding_calls)

        edge_update.assert_called_with(
            self.context, 'backup-3b0b1fe1-c984', 'edge-782',
            'backup-3b0b1fe1-c984', None, appliance_size='compact',
            availability_zone=fake_az, dist=False)
|
Loading…
Reference in New Issue
Block a user