NetApp SolidFire: Fix failback failing after service restart

After a successful failover, if the volume service was restarted,
the driver wouldn't be able to fail back to the primary cluster
due to incomplete cluster data.

This patch addresses this problem by gathering data from the
primary cluster during the failback procedure.

Change-Id: I796fcf8969adae3338861a1b2f3310fafcacfbcb
Closes-bug: #1859653
This commit is contained in:
Fernando Ferraz 2020-02-06 21:22:33 +00:00
parent a039f5e7a5
commit f24eb2fc63
3 changed files with 85 additions and 53 deletions

View File

@ -159,6 +159,7 @@ class SolidFireVolumeTestCase(test.TestCase):
'login': 'admin'}, 'login': 'admin'},
'name': 'AutoTest2-6AjG-FOR-TEST-ONLY', 'name': 'AutoTest2-6AjG-FOR-TEST-ONLY',
'clusterPairID': 33, 'clusterPairID': 33,
'clusterAPIVersion': '9.4',
'uuid': '9c499d4b-8fff-48b4-b875-27601d5d9889', 'uuid': '9c499d4b-8fff-48b4-b875-27601d5d9889',
'svip': '10.10.23.2', 'svip': '10.10.23.2',
'mvipNodeID': 1, 'mvipNodeID': 1,
@ -3166,7 +3167,17 @@ class SolidFireVolumeTestCase(test.TestCase):
cinder_vols.append(vol) cinder_vols.append(vol)
mock_map_sf_volumes.return_value = sf_vols mock_map_sf_volumes.return_value = sf_vols
mock_create_cluster_reference.return_value = self.cluster_pairs[0]
self.configuration.replication_device = []
reset_mocks()
drv_args = {'active_backend_id': None}
sfv = solidfire.SolidFireDriver(configuration=self.configuration,
**drv_args)
self.assertRaises(exception.UnableToFailOver,
sfv.failover_host, ctx, cinder_vols, 'fake', None)
mock_map_sf_volumes.assert_not_called()
fake_replication_device = {'backend_id': 'fake', fake_replication_device = {'backend_id': 'fake',
'mvip': '0.0.0.0', 'mvip': '0.0.0.0',
@ -3183,14 +3194,6 @@ class SolidFireVolumeTestCase(test.TestCase):
sfv.failover_host, ctx, cinder_vols, 'default', None) sfv.failover_host, ctx, cinder_vols, 'default', None)
mock_map_sf_volumes.assert_not_called() mock_map_sf_volumes.assert_not_called()
reset_mocks()
drv_args = {'active_backend_id': 'default'}
sfv = solidfire.SolidFireDriver(configuration=self.configuration,
**drv_args)
self.assertRaises(exception.UnableToFailOver,
sfv.failover_host, ctx, cinder_vols, 'default', None)
mock_map_sf_volumes.assert_not_called()
reset_mocks() reset_mocks()
drv_args = {'active_backend_id': None} drv_args = {'active_backend_id': None}
sfv = solidfire.SolidFireDriver(configuration=self.configuration, sfv = solidfire.SolidFireDriver(configuration=self.configuration,
@ -3200,15 +3203,28 @@ class SolidFireVolumeTestCase(test.TestCase):
secondary_id='not_fake_id', groups=None) secondary_id='not_fake_id', groups=None)
mock_map_sf_volumes.assert_not_called() mock_map_sf_volumes.assert_not_called()
mock_create_cluster_reference.return_value = self.cluster_pairs[0]
reset_mocks() reset_mocks()
drv_args = {'active_backend_id': None} drv_args = {'active_backend_id': 'secondary'}
sfv = solidfire.SolidFireDriver(configuration=self.configuration, sfv = solidfire.SolidFireDriver(configuration=self.configuration,
**drv_args) **drv_args)
sfv.cluster_pairs = [None] sfv.cluster_pairs = self.cluster_pairs
self.assertRaises(exception.UnableToFailOver, sfv.cluster_pairs[0]['backend_id'] = 'fake'
sfv.failover_host, ctx, cinder_vols, sfv.replication_enabled = True
secondary_id='fake', groups=None) cluster_id, updates, _ = sfv.failover_host(
mock_map_sf_volumes.assert_not_called() ctx, cinder_vols, secondary_id='default', groups=None)
self.assertEqual(5, len(updates))
for update in updates:
self.assertEqual(fields.ReplicationStatus.ENABLED,
update['updates']['replication_status'])
self.assertEqual('', cluster_id)
mock_get_create_account.assert_called()
mock_failover_volume.assert_called()
mock_map_sf_volumes.assert_called()
mock_update_cluster_status.assert_called()
mock_set_cluster_pairs.assert_called()
mock_create_cluster_reference.assert_called()
reset_mocks() reset_mocks()
drv_args = {'active_backend_id': None} drv_args = {'active_backend_id': None}
@ -3228,11 +3244,9 @@ class SolidFireVolumeTestCase(test.TestCase):
mock_get_create_account.assert_called() mock_get_create_account.assert_called()
mock_failover_volume.assert_called() mock_failover_volume.assert_called()
mock_map_sf_volumes.assert_called() mock_map_sf_volumes.assert_called()
mock_get_cluster_info.assert_not_called()
mock_update_cluster_status.assert_called() mock_update_cluster_status.assert_called()
mock_set_cluster_pairs.assert_called() mock_set_cluster_pairs.assert_called()
mock_create_cluster_reference.assert_called() mock_create_cluster_reference.assert_called()
mock_issue_api_request.assert_not_called()
@mock.patch.object(solidfire.SolidFireDriver, '_issue_api_request') @mock.patch.object(solidfire.SolidFireDriver, '_issue_api_request')
@mock.patch.object(solidfire.SolidFireDriver, '_create_cluster_reference') @mock.patch.object(solidfire.SolidFireDriver, '_create_cluster_reference')

View File

@ -223,9 +223,11 @@ class SolidFireDriver(san.SanISCSIDriver):
2.0.15 - Fix bug #1834013 NetApp SolidFire replication errors 2.0.15 - Fix bug #1834013 NetApp SolidFire replication errors
2.0.16 - Add options for replication mode (Async, Sync and 2.0.16 - Add options for replication mode (Async, Sync and
SnapshotsOnly) SnapshotsOnly)
2.0.17 - Fix bug #1859653 SolidFire fails to failback when volume
service is restarted
""" """
VERSION = '2.0.16' VERSION = '2.0.17'
# ThirdPartySystems wiki page # ThirdPartySystems wiki page
CI_WIKI_NAME = "NetApp_SolidFire_CI" CI_WIKI_NAME = "NetApp_SolidFire_CI"
@ -300,15 +302,13 @@ class SolidFireDriver(san.SanISCSIDriver):
self.active_cluster = self._create_cluster_reference( self.active_cluster = self._create_cluster_reference(
remote_endpoint) remote_endpoint)
# When in failed-over state, we have only endpoint info from the
# primary cluster.
self.primary_cluster = {"endpoint": self._build_endpoint_info()}
self.failed_over = True self.failed_over = True
self.replication_enabled = True
else: else:
self.primary_cluster = self._create_cluster_reference() self.active_cluster = self._create_cluster_reference()
self.active_cluster = self.primary_cluster
if self.configuration.replication_device: if self.configuration.replication_device:
self._set_cluster_pairs() self._set_cluster_pairs()
self.replication_enabled = True
LOG.debug("Active cluster: %s", self.active_cluster) LOG.debug("Active cluster: %s", self.active_cluster)
@ -441,9 +441,11 @@ class SolidFireDriver(san.SanISCSIDriver):
# clusterPairID in remote_info for us # clusterPairID in remote_info for us
self._create_remote_pairing(remote_info) self._create_remote_pairing(remote_info)
if self.cluster_pairs:
self.cluster_pairs.clear()
self.cluster_pairs.append(remote_info) self.cluster_pairs.append(remote_info)
LOG.debug("Available cluster pairs: %s", self.cluster_pairs) LOG.debug("Available cluster pairs: %s", self.cluster_pairs)
self.replication_enabled = True
def _create_cluster_reference(self, endpoint=None): def _create_cluster_reference(self, endpoint=None):
cluster_ref = {} cluster_ref = {}
@ -2356,8 +2358,13 @@ class SolidFireDriver(san.SanISCSIDriver):
failback = False failback = False
volume_updates = [] volume_updates = []
LOG.info("Failing over. Secondary ID is: %s", if not self.replication_enabled:
secondary_id) LOG.error("SolidFire driver received failover_host "
"request, however replication is NOT "
"enabled.")
raise exception.UnableToFailOver(reason=_("Failover requested "
"on non replicated "
"backend."))
# NOTE(erlon): For now we only support one replication target device. # NOTE(erlon): For now we only support one replication target device.
# So, there are two cases we have to deal with here: # So, there are two cases we have to deal with here:
@ -2375,8 +2382,10 @@ class SolidFireDriver(san.SanISCSIDriver):
"state.") "state.")
raise exception.InvalidReplicationTarget(msg) raise exception.InvalidReplicationTarget(msg)
elif secondary_id == "default" and self.failed_over: elif secondary_id == "default" and self.failed_over:
remote = self.primary_cluster LOG.info("Failing back to primary cluster.")
remote = self._create_cluster_reference()
failback = True failback = True
else: else:
repl_configs = self.configuration.replication_device[0] repl_configs = self.configuration.replication_device[0]
if secondary_id and repl_configs['backend_id'] != secondary_id: if secondary_id and repl_configs['backend_id'] != secondary_id:
@ -2384,25 +2393,24 @@ class SolidFireDriver(san.SanISCSIDriver):
"one in cinder.conf.") % secondary_id "one in cinder.conf.") % secondary_id
raise exception.InvalidReplicationTarget(msg) raise exception.InvalidReplicationTarget(msg)
LOG.info("Failing over to secondary cluster %s.", secondary_id)
remote = self.cluster_pairs[0] remote = self.cluster_pairs[0]
if not remote or not self.replication_enabled: LOG.debug("Target cluster to failover: %s.",
LOG.error("SolidFire driver received failover_host " {'name': remote['name'],
"request, however replication is NOT " 'mvip': remote['mvip'],
"enabled, or there are no available " 'clusterAPIVersion': remote['clusterAPIVersion']})
"targets to fail-over to.")
raise exception.UnableToFailOver(reason=_("Failover requested "
"on non replicated "
"backend."))
target_vols = self._map_sf_volumes(volumes, target_vols = self._map_sf_volumes(volumes,
endpoint=remote['endpoint']) endpoint=remote['endpoint'])
LOG.debug("Mapped target_vols: %s", target_vols) LOG.debug("Total Cinder volumes found in target: %d",
len(target_vols))
primary_vols = None primary_vols = None
try: try:
primary_vols = self._map_sf_volumes(volumes) primary_vols = self._map_sf_volumes(volumes)
LOG.debug("Mapped Primary_vols: %s", target_vols) LOG.debug("Total Cinder volumes found in primary cluster: %d",
len(primary_vols))
except SolidFireAPIException: except SolidFireAPIException:
# API Request failed on source. Failover/failback will skip next # API Request failed on source. Failover/failback will skip next
# calls to it. # calls to it.
@ -2437,14 +2445,26 @@ class SolidFireDriver(san.SanISCSIDriver):
else: else:
primary_vol = None primary_vol = None
LOG.debug('Failing-over volume %s, target vol %s, ' LOG.info('Failing-over volume %s.', v.id)
'primary vol %s', v, target_vol, primary_vol) LOG.debug('Target vol: %s',
{'access': target_vol['access'],
'accountID': target_vol['accountID'],
'name': target_vol['name'],
'status': target_vol['status'],
'volumeID': target_vol['volumeID']})
LOG.debug('Primary vol: %s',
{'access': primary_vol['access'],
'accountID': primary_vol['accountID'],
'name': primary_vol['name'],
'status': primary_vol['status'],
'volumeID': primary_vol['volumeID']})
try: try:
self._failover_volume(target_vol, remote, primary_vol) self._failover_volume(target_vol, remote, primary_vol)
sf_account = self._get_create_account( sf_account = self._get_create_account(
v.project_id, endpoint=remote['endpoint']) v.project_id, endpoint=remote['endpoint'])
LOG.debug("Target account: %s", sf_account['accountID'])
conn_info = self._build_connection_info( conn_info = self._build_connection_info(
sf_account, target_vol, endpoint=remote['endpoint']) sf_account, target_vol, endpoint=remote['endpoint'])
@ -2472,12 +2492,7 @@ class SolidFireDriver(san.SanISCSIDriver):
except Exception as e: except Exception as e:
volume_updates.append({'volume_id': v['id'], volume_updates.append({'volume_id': v['id'],
'updates': {'status': 'error', }}) 'updates': {'status': 'error', }})
if failback:
LOG.error("Error trying to failback volume %s", v.id)
else:
LOG.error("Error trying to failover volume %s", v.id) LOG.error("Error trying to failover volume %s", v.id)
msg = e.message if hasattr(e, 'message') else e msg = e.message if hasattr(e, 'message') else e
LOG.exception(msg) LOG.exception(msg)
@ -2485,20 +2500,17 @@ class SolidFireDriver(san.SanISCSIDriver):
volume_updates.append({'volume_id': v['id'], volume_updates.append({'volume_id': v['id'],
'updates': {'status': 'error', }}) 'updates': {'status': 'error', }})
# FIXME(jdg): This introduces a problem for us, up until now our driver self.active_cluster = remote
# has been pretty much stateless and has allowed customers to run
# active/active HA c-vol services with SolidFire. The introduction of
# the active_cluster and failed_over attributes is going to break that
# but for now that's going to be the trade off of using replication
if failback: if failback:
active_cluster_id = None active_cluster_id = ''
self.failed_over = False self.failed_over = False
# Recreating cluster pairs after a successful failback
self._set_cluster_pairs()
else: else:
active_cluster_id = remote['backend_id'] active_cluster_id = remote['backend_id']
self.failed_over = True self.failed_over = True
self.active_cluster = remote
return active_cluster_id, volume_updates, [] return active_cluster_id, volume_updates, []
def freeze_backend(self, context): def freeze_backend(self, context):

View File

@ -0,0 +1,6 @@
---
fixes:
- |
NetApp SolidFire driver: Fixed an issue that caused failback
to fail after a volume service restart. This change fixes
bug `1859653 <https://bugs.launchpad.net/cinder/+bug/1859653>`_.