Merge "NetApp SolidFire: Fix failback failing after service restart"

This commit is contained in:
Zuul 2020-04-13 22:08:09 +00:00 committed by Gerrit Code Review
commit e9304a91c3
3 changed files with 85 additions and 53 deletions

View File

@ -159,6 +159,7 @@ class SolidFireVolumeTestCase(test.TestCase):
'login': 'admin'},
'name': 'AutoTest2-6AjG-FOR-TEST-ONLY',
'clusterPairID': 33,
'clusterAPIVersion': '9.4',
'uuid': '9c499d4b-8fff-48b4-b875-27601d5d9889',
'svip': '10.10.23.2',
'mvipNodeID': 1,
@ -3166,7 +3167,17 @@ class SolidFireVolumeTestCase(test.TestCase):
cinder_vols.append(vol)
mock_map_sf_volumes.return_value = sf_vols
mock_create_cluster_reference.return_value = self.cluster_pairs[0]
self.configuration.replication_device = []
reset_mocks()
drv_args = {'active_backend_id': None}
sfv = solidfire.SolidFireDriver(configuration=self.configuration,
**drv_args)
self.assertRaises(exception.UnableToFailOver,
sfv.failover_host, ctx, cinder_vols, 'fake', None)
mock_map_sf_volumes.assert_not_called()
fake_replication_device = {'backend_id': 'fake',
'mvip': '0.0.0.0',
@ -3183,14 +3194,6 @@ class SolidFireVolumeTestCase(test.TestCase):
sfv.failover_host, ctx, cinder_vols, 'default', None)
mock_map_sf_volumes.assert_not_called()
reset_mocks()
drv_args = {'active_backend_id': 'default'}
sfv = solidfire.SolidFireDriver(configuration=self.configuration,
**drv_args)
self.assertRaises(exception.UnableToFailOver,
sfv.failover_host, ctx, cinder_vols, 'default', None)
mock_map_sf_volumes.assert_not_called()
reset_mocks()
drv_args = {'active_backend_id': None}
sfv = solidfire.SolidFireDriver(configuration=self.configuration,
@ -3200,15 +3203,28 @@ class SolidFireVolumeTestCase(test.TestCase):
secondary_id='not_fake_id', groups=None)
mock_map_sf_volumes.assert_not_called()
mock_create_cluster_reference.return_value = self.cluster_pairs[0]
reset_mocks()
drv_args = {'active_backend_id': None}
drv_args = {'active_backend_id': 'secondary'}
sfv = solidfire.SolidFireDriver(configuration=self.configuration,
**drv_args)
sfv.cluster_pairs = [None]
self.assertRaises(exception.UnableToFailOver,
sfv.failover_host, ctx, cinder_vols,
secondary_id='fake', groups=None)
mock_map_sf_volumes.assert_not_called()
sfv.cluster_pairs = self.cluster_pairs
sfv.cluster_pairs[0]['backend_id'] = 'fake'
sfv.replication_enabled = True
cluster_id, updates, _ = sfv.failover_host(
ctx, cinder_vols, secondary_id='default', groups=None)
self.assertEqual(5, len(updates))
for update in updates:
self.assertEqual(fields.ReplicationStatus.ENABLED,
update['updates']['replication_status'])
self.assertEqual('', cluster_id)
mock_get_create_account.assert_called()
mock_failover_volume.assert_called()
mock_map_sf_volumes.assert_called()
mock_update_cluster_status.assert_called()
mock_set_cluster_pairs.assert_called()
mock_create_cluster_reference.assert_called()
reset_mocks()
drv_args = {'active_backend_id': None}
@ -3228,11 +3244,9 @@ class SolidFireVolumeTestCase(test.TestCase):
mock_get_create_account.assert_called()
mock_failover_volume.assert_called()
mock_map_sf_volumes.assert_called()
mock_get_cluster_info.assert_not_called()
mock_update_cluster_status.assert_called()
mock_set_cluster_pairs.assert_called()
mock_create_cluster_reference.assert_called()
mock_issue_api_request.assert_not_called()
@mock.patch.object(solidfire.SolidFireDriver, '_issue_api_request')
@mock.patch.object(solidfire.SolidFireDriver, '_create_cluster_reference')

View File

@ -223,9 +223,11 @@ class SolidFireDriver(san.SanISCSIDriver):
2.0.15 - Fix bug #1834013 NetApp SolidFire replication errors
2.0.16 - Add options for replication mode (Async, Sync and
SnapshotsOnly)
2.0.17 - Fix bug #1859653 SolidFire fails to fail back when the
volume service is restarted
"""
VERSION = '2.0.16'
VERSION = '2.0.17'
# ThirdPartySystems wiki page
CI_WIKI_NAME = "NetApp_SolidFire_CI"
@ -300,15 +302,13 @@ class SolidFireDriver(san.SanISCSIDriver):
self.active_cluster = self._create_cluster_reference(
remote_endpoint)
# When in failed-over state, we have only endpoint info from the
# primary cluster.
self.primary_cluster = {"endpoint": self._build_endpoint_info()}
self.failed_over = True
self.replication_enabled = True
else:
self.primary_cluster = self._create_cluster_reference()
self.active_cluster = self.primary_cluster
self.active_cluster = self._create_cluster_reference()
if self.configuration.replication_device:
self._set_cluster_pairs()
self.replication_enabled = True
LOG.debug("Active cluster: %s", self.active_cluster)
@ -441,9 +441,11 @@ class SolidFireDriver(san.SanISCSIDriver):
# clusterPairID in remote_info for us
self._create_remote_pairing(remote_info)
if self.cluster_pairs:
self.cluster_pairs.clear()
self.cluster_pairs.append(remote_info)
LOG.debug("Available cluster pairs: %s", self.cluster_pairs)
self.replication_enabled = True
def _create_cluster_reference(self, endpoint=None):
cluster_ref = {}
@ -2356,8 +2358,13 @@ class SolidFireDriver(san.SanISCSIDriver):
failback = False
volume_updates = []
LOG.info("Failing over. Secondary ID is: %s",
secondary_id)
if not self.replication_enabled:
LOG.error("SolidFire driver received failover_host "
"request, however replication is NOT "
"enabled.")
raise exception.UnableToFailOver(reason=_("Failover requested "
"on non replicated "
"backend."))
# NOTE(erlon): For now we only support one replication target device.
# So, there are two cases we have to deal with here:
@ -2375,8 +2382,10 @@ class SolidFireDriver(san.SanISCSIDriver):
"state.")
raise exception.InvalidReplicationTarget(msg)
elif secondary_id == "default" and self.failed_over:
remote = self.primary_cluster
LOG.info("Failing back to primary cluster.")
remote = self._create_cluster_reference()
failback = True
else:
repl_configs = self.configuration.replication_device[0]
if secondary_id and repl_configs['backend_id'] != secondary_id:
@ -2384,25 +2393,24 @@ class SolidFireDriver(san.SanISCSIDriver):
"one in cinder.conf.") % secondary_id
raise exception.InvalidReplicationTarget(msg)
LOG.info("Failing over to secondary cluster %s.", secondary_id)
remote = self.cluster_pairs[0]
if not remote or not self.replication_enabled:
LOG.error("SolidFire driver received failover_host "
"request, however replication is NOT "
"enabled, or there are no available "
"targets to fail-over to.")
raise exception.UnableToFailOver(reason=_("Failover requested "
"on non replicated "
"backend."))
LOG.debug("Target cluster to failover: %s.",
{'name': remote['name'],
'mvip': remote['mvip'],
'clusterAPIVersion': remote['clusterAPIVersion']})
target_vols = self._map_sf_volumes(volumes,
endpoint=remote['endpoint'])
LOG.debug("Mapped target_vols: %s", target_vols)
LOG.debug("Total Cinder volumes found in target: %d",
len(target_vols))
primary_vols = None
try:
primary_vols = self._map_sf_volumes(volumes)
LOG.debug("Mapped Primary_vols: %s", target_vols)
LOG.debug("Total Cinder volumes found in primary cluster: %d",
len(primary_vols))
except SolidFireAPIException:
# API Request failed on source. Failover/failback will skip next
# calls to it.
@ -2437,14 +2445,26 @@ class SolidFireDriver(san.SanISCSIDriver):
else:
primary_vol = None
LOG.debug('Failing-over volume %s, target vol %s, '
'primary vol %s', v, target_vol, primary_vol)
LOG.info('Failing-over volume %s.', v.id)
LOG.debug('Target vol: %s',
{'access': target_vol['access'],
'accountID': target_vol['accountID'],
'name': target_vol['name'],
'status': target_vol['status'],
'volumeID': target_vol['volumeID']})
LOG.debug('Primary vol: %s',
{'access': primary_vol['access'],
'accountID': primary_vol['accountID'],
'name': primary_vol['name'],
'status': primary_vol['status'],
'volumeID': primary_vol['volumeID']})
try:
self._failover_volume(target_vol, remote, primary_vol)
sf_account = self._get_create_account(
v.project_id, endpoint=remote['endpoint'])
LOG.debug("Target account: %s", sf_account['accountID'])
conn_info = self._build_connection_info(
sf_account, target_vol, endpoint=remote['endpoint'])
@ -2472,12 +2492,7 @@ class SolidFireDriver(san.SanISCSIDriver):
except Exception as e:
volume_updates.append({'volume_id': v['id'],
'updates': {'status': 'error', }})
if failback:
LOG.error("Error trying to failback volume %s", v.id)
else:
LOG.error("Error trying to failover volume %s", v.id)
LOG.error("Error trying to failover volume %s", v.id)
msg = e.message if hasattr(e, 'message') else e
LOG.exception(msg)
@ -2485,20 +2500,17 @@ class SolidFireDriver(san.SanISCSIDriver):
volume_updates.append({'volume_id': v['id'],
'updates': {'status': 'error', }})
# FIXME(jdg): This introduces a problem for us, up until now our driver
# has been pretty much stateless and has allowed customers to run
# active/active HA c-vol services with SolidFire. The introduction of
# the active_cluster and failed_over attributes is going to break that
# but for now that's going to be the trade off of using replication
self.active_cluster = remote
if failback:
active_cluster_id = None
active_cluster_id = ''
self.failed_over = False
# Recreating cluster pairs after a successful failback
self._set_cluster_pairs()
else:
active_cluster_id = remote['backend_id']
self.failed_over = True
self.active_cluster = remote
return active_cluster_id, volume_updates, []
def freeze_backend(self, context):

View File

@ -0,0 +1,6 @@
---
fixes:
- |
NetApp SolidFire driver: Fixed an issue that caused failback
to fail after the volume service was restarted. This change fixes
bug `1859653 <https://bugs.launchpad.net/cinder/+bug/1859653>`_.