diff --git a/cinder/tests/unit/volume/drivers/solidfire/test_solidfire.py b/cinder/tests/unit/volume/drivers/solidfire/test_solidfire.py
index bd9bfd30313..8b9cb07766b 100644
--- a/cinder/tests/unit/volume/drivers/solidfire/test_solidfire.py
+++ b/cinder/tests/unit/volume/drivers/solidfire/test_solidfire.py
@@ -159,6 +159,7 @@ class SolidFireVolumeTestCase(test.TestCase):
                            'login': 'admin'},
              'name': 'AutoTest2-6AjG-FOR-TEST-ONLY',
              'clusterPairID': 33,
+             'clusterAPIVersion': '9.4',
              'uuid': '9c499d4b-8fff-48b4-b875-27601d5d9889',
              'svip': '10.10.23.2',
              'mvipNodeID': 1,
@@ -3166,7 +3167,17 @@ class SolidFireVolumeTestCase(test.TestCase):
             cinder_vols.append(vol)
 
         mock_map_sf_volumes.return_value = sf_vols
-        mock_create_cluster_reference.return_value = self.cluster_pairs[0]
+
+        self.configuration.replication_device = []
+
+        reset_mocks()
+        drv_args = {'active_backend_id': None}
+        sfv = solidfire.SolidFireDriver(configuration=self.configuration,
+                                        **drv_args)
+
+        self.assertRaises(exception.UnableToFailOver,
+                          sfv.failover_host, ctx, cinder_vols, 'fake', None)
+        mock_map_sf_volumes.assert_not_called()
 
         fake_replication_device = {'backend_id': 'fake',
                                    'mvip': '0.0.0.0',
@@ -3183,14 +3194,6 @@ class SolidFireVolumeTestCase(test.TestCase):
                           sfv.failover_host, ctx, cinder_vols, 'default', None)
         mock_map_sf_volumes.assert_not_called()
 
-        reset_mocks()
-        drv_args = {'active_backend_id': 'default'}
-        sfv = solidfire.SolidFireDriver(configuration=self.configuration,
-                                        **drv_args)
-        self.assertRaises(exception.UnableToFailOver,
-                          sfv.failover_host, ctx, cinder_vols, 'default', None)
-        mock_map_sf_volumes.assert_not_called()
-
         reset_mocks()
         drv_args = {'active_backend_id': None}
         sfv = solidfire.SolidFireDriver(configuration=self.configuration,
@@ -3200,15 +3203,28 @@ class SolidFireVolumeTestCase(test.TestCase):
                           secondary_id='not_fake_id', groups=None)
         mock_map_sf_volumes.assert_not_called()
 
+        mock_create_cluster_reference.return_value = self.cluster_pairs[0]
+
         reset_mocks()
-        drv_args = {'active_backend_id': None}
+        drv_args = {'active_backend_id': 'secondary'}
         sfv = solidfire.SolidFireDriver(configuration=self.configuration,
                                         **drv_args)
-        sfv.cluster_pairs = [None]
-        self.assertRaises(exception.UnableToFailOver,
-                          sfv.failover_host, ctx, cinder_vols,
-                          secondary_id='fake', groups=None)
-        mock_map_sf_volumes.assert_not_called()
+        sfv.cluster_pairs = self.cluster_pairs
+        sfv.cluster_pairs[0]['backend_id'] = 'fake'
+        sfv.replication_enabled = True
+        cluster_id, updates, _ = sfv.failover_host(
+            ctx, cinder_vols, secondary_id='default', groups=None)
+        self.assertEqual(5, len(updates))
+        for update in updates:
+            self.assertEqual(fields.ReplicationStatus.ENABLED,
+                             update['updates']['replication_status'])
+        self.assertEqual('', cluster_id)
+        mock_get_create_account.assert_called()
+        mock_failover_volume.assert_called()
+        mock_map_sf_volumes.assert_called()
+        mock_update_cluster_status.assert_called()
+        mock_set_cluster_pairs.assert_called()
+        mock_create_cluster_reference.assert_called()
 
         reset_mocks()
         drv_args = {'active_backend_id': None}
@@ -3228,11 +3244,9 @@ class SolidFireVolumeTestCase(test.TestCase):
         mock_get_create_account.assert_called()
         mock_failover_volume.assert_called()
         mock_map_sf_volumes.assert_called()
-        mock_get_cluster_info.assert_not_called()
         mock_update_cluster_status.assert_called()
         mock_set_cluster_pairs.assert_called()
         mock_create_cluster_reference.assert_called()
-        mock_issue_api_request.assert_not_called()
 
     @mock.patch.object(solidfire.SolidFireDriver, '_issue_api_request')
     @mock.patch.object(solidfire.SolidFireDriver, '_create_cluster_reference')
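The reworked test above now drives a complete failback instead of expecting an
error. A condensed sketch of the scenario it asserts, reusing the fixtures
from the test body (ctx, cinder_vols, self.cluster_pairs and the mocked driver
internals); no new API is introduced here:

    # Condensed from the test above; assumes its mocks and fixtures.
    drv_args = {'active_backend_id': 'secondary'}  # service starts failed over
    sfv = solidfire.SolidFireDriver(configuration=self.configuration,
                                    **drv_args)
    sfv.cluster_pairs = self.cluster_pairs
    sfv.replication_enabled = True

    # secondary_id='default' means "fail back to the primary cluster".
    cluster_id, updates, _ = sfv.failover_host(
        ctx, cinder_vols, secondary_id='default', groups=None)

    # An empty backend id marks the host as back on its primary, and every
    # per-volume update re-enables replication.
    assert cluster_id == ''
    assert all(u['updates']['replication_status'] ==
               fields.ReplicationStatus.ENABLED for u in updates)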
diff --git a/cinder/volume/drivers/solidfire.py b/cinder/volume/drivers/solidfire.py
index 8ab9fa04a9d..ba5f8a5e713 100644
--- a/cinder/volume/drivers/solidfire.py
+++ b/cinder/volume/drivers/solidfire.py
@@ -223,9 +223,11 @@ class SolidFireDriver(san.SanISCSIDriver):
         2.0.15 - Fix bug #1834013 NetApp SolidFire replication errors
         2.0.16 - Add options for replication mode (Async, Sync and
                  SnapshotsOnly)
+        2.0.17 - Fix bug #1859653 SolidFire fails to failback when volume
+                 service is restarted
 
     """
 
-    VERSION = '2.0.16'
+    VERSION = '2.0.17'
 
     # ThirdPartySystems wiki page
     CI_WIKI_NAME = "NetApp_SolidFire_CI"
@@ -300,15 +302,13 @@ class SolidFireDriver(san.SanISCSIDriver):
             self.active_cluster = self._create_cluster_reference(
                 remote_endpoint)
 
-            # When in failed-over state, we have only endpoint info from the
-            # primary cluster.
-            self.primary_cluster = {"endpoint": self._build_endpoint_info()}
             self.failed_over = True
+            self.replication_enabled = True
         else:
-            self.primary_cluster = self._create_cluster_reference()
-            self.active_cluster = self.primary_cluster
+            self.active_cluster = self._create_cluster_reference()
             if self.configuration.replication_device:
                 self._set_cluster_pairs()
+                self.replication_enabled = True
 
         LOG.debug("Active cluster: %s", self.active_cluster)
@@ -441,9 +441,11 @@ class SolidFireDriver(san.SanISCSIDriver):
             # clusterPairID in remote_info for us
             self._create_remote_pairing(remote_info)
 
+        if self.cluster_pairs:
+            self.cluster_pairs.clear()
+
         self.cluster_pairs.append(remote_info)
         LOG.debug("Available cluster pairs: %s", self.cluster_pairs)
-        self.replication_enabled = True
 
     def _create_cluster_reference(self, endpoint=None):
         cluster_ref = {}
@@ -2356,8 +2358,13 @@ class SolidFireDriver(san.SanISCSIDriver):
         failback = False
         volume_updates = []
 
-        LOG.info("Failing over. Secondary ID is: %s",
-                 secondary_id)
+        if not self.replication_enabled:
+            LOG.error("SolidFire driver received failover_host "
+                      "request, however replication is NOT "
+                      "enabled.")
+            raise exception.UnableToFailOver(reason=_("Failover requested "
+                                                      "on non replicated "
+                                                      "backend."))
 
         # NOTE(erlon): For now we only support one replication target device.
         # So, there are two cases we have to deal with here:
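The driver hunks above carry the core of the fix: the cached primary_cluster
attribute is gone, and replication_enabled is set on both startup paths
instead of as a side effect of _set_cluster_pairs(). A rough sketch of the
resulting __init__ logic; the condition guarding the first branch lies
outside the hunk, so it is paraphrased here rather than quoted:

    # Approximate shape of __init__ after this change (condition paraphrased).
    if started_with_active_backend_id:  # i.e. restarted while failed over
        self.active_cluster = self._create_cluster_reference(remote_endpoint)
        self.failed_over = True
        self.replication_enabled = True   # failback remains possible
    else:
        self.active_cluster = self._create_cluster_reference()
        if self.configuration.replication_device:
            self._set_cluster_pairs()     # clears, then appends the one pair
            self.replication_enabled = True

Clearing cluster_pairs before appending keeps the list at exactly one entry,
matching the single-target limitation in the NOTE just above, even though
_set_cluster_pairs() is now re-run after every failback.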
@@ -2375,8 +2382,10 @@
                        "state.")
                 raise exception.InvalidReplicationTarget(msg)
             elif secondary_id == "default" and self.failed_over:
-                remote = self.primary_cluster
+                LOG.info("Failing back to primary cluster.")
+                remote = self._create_cluster_reference()
                 failback = True
+
             else:
                 repl_configs = self.configuration.replication_device[0]
                 if secondary_id and repl_configs['backend_id'] != secondary_id:
@@ -2384,25 +2393,24 @@
                           "one in cinder.conf.") % secondary_id
                     raise exception.InvalidReplicationTarget(msg)
 
+                LOG.info("Failing over to secondary cluster %s.", secondary_id)
                 remote = self.cluster_pairs[0]
 
-        if not remote or not self.replication_enabled:
-            LOG.error("SolidFire driver received failover_host "
-                      "request, however replication is NOT "
-                      "enabled, or there are no available "
-                      "targets to fail-over to.")
-            raise exception.UnableToFailOver(reason=_("Failover requested "
-                                                      "on non replicated "
-                                                      "backend."))
+        LOG.debug("Target cluster to failover: %s.",
+                  {'name': remote['name'],
+                   'mvip': remote['mvip'],
+                   'clusterAPIVersion': remote['clusterAPIVersion']})
 
         target_vols = self._map_sf_volumes(volumes,
                                            endpoint=remote['endpoint'])
-        LOG.debug("Mapped target_vols: %s", target_vols)
+        LOG.debug("Total Cinder volumes found in target: %d",
+                  len(target_vols))
 
         primary_vols = None
         try:
             primary_vols = self._map_sf_volumes(volumes)
-            LOG.debug("Mapped Primary_vols: %s", target_vols)
+            LOG.debug("Total Cinder volumes found in primary cluster: %d",
+                      len(primary_vols))
         except SolidFireAPIException:
             # API Request failed on source. Failover/failback will skip next
             # calls to it.
@@ -2437,14 +2445,26 @@
             else:
                 primary_vol = None
 
-            LOG.debug('Failing-over volume %s, target vol %s, '
-                      'primary vol %s', v, target_vol, primary_vol)
+            LOG.info('Failing-over volume %s.', v.id)
+            LOG.debug('Target vol: %s',
+                      {'access': target_vol['access'],
+                       'accountID': target_vol['accountID'],
+                       'name': target_vol['name'],
+                       'status': target_vol['status'],
+                       'volumeID': target_vol['volumeID']})
+            LOG.debug('Primary vol: %s',
+                      {'access': primary_vol['access'],
+                       'accountID': primary_vol['accountID'],
+                       'name': primary_vol['name'],
+                       'status': primary_vol['status'],
+                       'volumeID': primary_vol['volumeID']})
 
             try:
                 self._failover_volume(target_vol, remote, primary_vol)
 
                 sf_account = self._get_create_account(
                     v.project_id, endpoint=remote['endpoint'])
+                LOG.debug("Target account: %s", sf_account['accountID'])
 
                 conn_info = self._build_connection_info(
                     sf_account, target_vol, endpoint=remote['endpoint'])
@@ -2472,12 +2492,7 @@
             except Exception as e:
                 volume_updates.append({'volume_id': v['id'],
                                        'updates': {'status': 'error', }})
-
-                if failback:
-                    LOG.error("Error trying to failback volume %s", v.id)
-                else:
-                    LOG.error("Error trying to failover volume %s", v.id)
-
+                LOG.error("Error trying to failover volume %s", v.id)
                 msg = e.message if hasattr(e, 'message') else e
                 LOG.exception(msg)
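Across the failover loop above, every Cinder volume ends up with exactly one
entry in volume_updates. The failure shape is visible in the hunks; the
success shape is not shown in this patch but is implied by the test's
assertions:

    # Failure path (taken verbatim from the hunks above):
    volume_updates.append({'volume_id': v['id'],
                           'updates': {'status': 'error', }})

    # Success path (shape inferred from the test, which checks that
    # updates['replication_status'] == fields.ReplicationStatus.ENABLED):
    volume_updates.append({'volume_id': v['id'],
                           'updates': {'replication_status':
                                       fields.ReplicationStatus.ENABLED}})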
@@ -2485,20 +2500,17 @@
                 volume_updates.append({'volume_id': v['id'],
                                        'updates': {'status': 'error', }})
 
-        # FIXME(jdg): This introduces a problem for us, up until now our driver
-        # has been pretty much stateless and has allowed customers to run
-        # active/active HA c-vol services with SolidFire.  The introduction of
-        # the active_cluster and failed_over attributes is going to break that
-        # but for now that's going to be the trade off of using replication
+        self.active_cluster = remote
+
         if failback:
-            active_cluster_id = None
+            active_cluster_id = ''
             self.failed_over = False
+            # Recreating cluster pairs after a successful failback
+            self._set_cluster_pairs()
         else:
             active_cluster_id = remote['backend_id']
             self.failed_over = True
-            self.active_cluster = remote
-
         return active_cluster_id, volume_updates, []
 
     def freeze_backend(self, context):
diff --git a/releasenotes/notes/bug-1859653-solidfire-fix-failover-after-service-restart-77e5e4da45c9c1aa.yaml b/releasenotes/notes/bug-1859653-solidfire-fix-failover-after-service-restart-77e5e4da45c9c1aa.yaml
new file mode 100644
index 00000000000..e1364ce7ec4
--- /dev/null
+++ b/releasenotes/notes/bug-1859653-solidfire-fix-failover-after-service-restart-77e5e4da45c9c1aa.yaml
@@ -0,0 +1,6 @@
+---
+fixes:
+  - |
+    NetApp SolidFire driver: Fixed an issue that caused failback
+    to fail after a volume service restart. This change fixes
+    bug `1859653 <https://bugs.launchpad.net/cinder/+bug/1859653>`_.
\ No newline at end of file
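Taken together, the final driver hunk makes the end-of-failover bookkeeping
symmetric: active_cluster always switches to the target cluster, and failback
additionally clears the failed-over flag and rebuilds the pairing. A condensed
view with the reasoning as comments:

    # End-of-failover bookkeeping, condensed from the hunk above.
    self.active_cluster = remote
    if failback:
        # '' rather than None: the empty id is what the test expects and
        # signals "running on the default backend again" after a restart.
        active_cluster_id = ''
        self.failed_over = False
        # Re-pair with the secondary so a later failover works without
        # another service restart.
        self._set_cluster_pairs()
    else:
        active_cluster_id = remote['backend_id']
        self.failed_over = True
    return active_cluster_id, volume_updates, []

Operationally, failback is requested the usual way (for example,
`cinder failover-host <host> --backend_id default`); with this change it also
succeeds when the volume service was restarted while in failed-over state.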