Merge "NetApp SolidFire: Fix failback failing after service restart"
commit e9304a91c3
@@ -159,6 +159,7 @@ class SolidFireVolumeTestCase(test.TestCase):
                       'login': 'admin'},
          'name': 'AutoTest2-6AjG-FOR-TEST-ONLY',
          'clusterPairID': 33,
+         'clusterAPIVersion': '9.4',
          'uuid': '9c499d4b-8fff-48b4-b875-27601d5d9889',
          'svip': '10.10.23.2',
          'mvipNodeID': 1,
@@ -3166,7 +3167,17 @@ class SolidFireVolumeTestCase(test.TestCase):
             cinder_vols.append(vol)
 
         mock_map_sf_volumes.return_value = sf_vols
-        mock_create_cluster_reference.return_value = self.cluster_pairs[0]
+
+        self.configuration.replication_device = []
+
+        reset_mocks()
+        drv_args = {'active_backend_id': None}
+        sfv = solidfire.SolidFireDriver(configuration=self.configuration,
+                                        **drv_args)
+
+        self.assertRaises(exception.UnableToFailOver,
+                          sfv.failover_host, ctx, cinder_vols, 'fake', None)
+        mock_map_sf_volumes.assert_not_called()
 
         fake_replication_device = {'backend_id': 'fake',
                                    'mvip': '0.0.0.0',
@@ -3183,14 +3194,6 @@ class SolidFireVolumeTestCase(test.TestCase):
                           sfv.failover_host, ctx, cinder_vols, 'default', None)
         mock_map_sf_volumes.assert_not_called()
 
-        reset_mocks()
-        drv_args = {'active_backend_id': 'default'}
-        sfv = solidfire.SolidFireDriver(configuration=self.configuration,
-                                        **drv_args)
-        self.assertRaises(exception.UnableToFailOver,
-                          sfv.failover_host, ctx, cinder_vols, 'default', None)
-        mock_map_sf_volumes.assert_not_called()
-
         reset_mocks()
         drv_args = {'active_backend_id': None}
         sfv = solidfire.SolidFireDriver(configuration=self.configuration,
@@ -3200,15 +3203,28 @@ class SolidFireVolumeTestCase(test.TestCase):
                           secondary_id='not_fake_id', groups=None)
         mock_map_sf_volumes.assert_not_called()
 
+        mock_create_cluster_reference.return_value = self.cluster_pairs[0]
+
         reset_mocks()
-        drv_args = {'active_backend_id': None}
+        drv_args = {'active_backend_id': 'secondary'}
         sfv = solidfire.SolidFireDriver(configuration=self.configuration,
                                         **drv_args)
+        sfv.cluster_pairs = [None]
+        self.assertRaises(exception.UnableToFailOver,
+                          sfv.failover_host, ctx, cinder_vols,
+                          secondary_id='fake', groups=None)
+        mock_map_sf_volumes.assert_not_called()
+        sfv.cluster_pairs = self.cluster_pairs
+        sfv.cluster_pairs[0]['backend_id'] = 'fake'
+        sfv.replication_enabled = True
+        cluster_id, updates, _ = sfv.failover_host(
+            ctx, cinder_vols, secondary_id='default', groups=None)
+        self.assertEqual(5, len(updates))
+        for update in updates:
+            self.assertEqual(fields.ReplicationStatus.ENABLED,
+                             update['updates']['replication_status'])
+        self.assertEqual('', cluster_id)
+        mock_get_create_account.assert_called()
+        mock_failover_volume.assert_called()
+        mock_map_sf_volumes.assert_called()
+        mock_update_cluster_status.assert_called()
+        mock_set_cluster_pairs.assert_called()
+        mock_create_cluster_reference.assert_called()
 
         reset_mocks()
         drv_args = {'active_backend_id': None}
@@ -3228,11 +3244,9 @@ class SolidFireVolumeTestCase(test.TestCase):
         mock_get_create_account.assert_called()
         mock_failover_volume.assert_called()
         mock_map_sf_volumes.assert_called()
-        mock_get_cluster_info.assert_not_called()
         mock_update_cluster_status.assert_called()
         mock_set_cluster_pairs.assert_called()
         mock_create_cluster_reference.assert_called()
-        mock_issue_api_request.assert_not_called()
 
     @mock.patch.object(solidfire.SolidFireDriver, '_issue_api_request')
     @mock.patch.object(solidfire.SolidFireDriver, '_create_cluster_reference')
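Taken together, the test hunks above boil down to one new scenario: a driver constructed with active_backend_id already set, simulating a c-vol service that restarted while failed over, must still be able to fail back. A rough sketch of that flow, using the mock and fixture names from the hunks (not the verbatim test; the 'secondary' backend id is illustrative):

    # Sketch only: a driver built with active_backend_id set behaves as if
    # the volume service restarted while failed over to the replica.
    drv_args = {'active_backend_id': 'secondary'}
    sfv = solidfire.SolidFireDriver(configuration=self.configuration,
                                    **drv_args)
    sfv.cluster_pairs = self.cluster_pairs
    sfv.cluster_pairs[0]['backend_id'] = 'fake'
    sfv.replication_enabled = True

    # secondary_id='default' requests a failback to the primary cluster;
    # before this fix a restarted driver had no complete primary reference
    # and this call failed.
    cluster_id, updates, _ = sfv.failover_host(
        ctx, cinder_vols, secondary_id='default', groups=None)
    assert cluster_id == ''  # empty id reports the default backend as active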
@@ -223,9 +223,11 @@ class SolidFireDriver(san.SanISCSIDriver):
         2.0.15 - Fix bug #1834013 NetApp SolidFire replication errors
         2.0.16 - Add options for replication mode (Async, Sync and
                  SnapshotsOnly)
+        2.0.17 - Fix bug #1859653 SolidFire fails to failback when volume
+                 service is restarted
     """
 
-    VERSION = '2.0.16'
+    VERSION = '2.0.17'
 
     # ThirdPartySystems wiki page
     CI_WIKI_NAME = "NetApp_SolidFire_CI"
@@ -300,15 +302,13 @@ class SolidFireDriver(san.SanISCSIDriver):
             self.active_cluster = self._create_cluster_reference(
                 remote_endpoint)
 
-            # When in failed-over state, we have only endpoint info from the
-            # primary cluster.
-            self.primary_cluster = {"endpoint": self._build_endpoint_info()}
             self.failed_over = True
             self.replication_enabled = True
         else:
-            self.primary_cluster = self._create_cluster_reference()
-            self.active_cluster = self.primary_cluster
+            self.active_cluster = self._create_cluster_reference()
             if self.configuration.replication_device:
                 self._set_cluster_pairs()
                 self.replication_enabled = True
 
+        LOG.debug("Active cluster: %s", self.active_cluster)
+
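This init hunk is the core of the fix. Previously, a driver that started up in failed-over state cached self.primary_cluster as a bare endpoint dict, so a later failback read cluster fields that were never populated. A minimal illustration of the failure mode (dict shapes assumed for the sketch, not driver code):

    # What the old code cached after a restart in failed-over state:
    primary_cluster = {"endpoint": {"mvip": "10.10.23.2", "login": "admin"}}

    # Failback needed full cluster info, which was never there:
    missing = [k for k in ("name", "mvip", "clusterAPIVersion")
               if k not in primary_cluster]
    print(missing)  # ['name', 'mvip', 'clusterAPIVersion']

    # The fix drops the cached reference entirely; failback now calls
    # _create_cluster_reference(), which queries the primary cluster and
    # returns a complete dict on demand.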
@@ -441,9 +441,11 @@ class SolidFireDriver(san.SanISCSIDriver):
             # clusterPairID in remote_info for us
             self._create_remote_pairing(remote_info)
 
+        if self.cluster_pairs:
+            self.cluster_pairs.clear()
+
         self.cluster_pairs.append(remote_info)
         LOG.debug("Available cluster pairs: %s", self.cluster_pairs)
         self.replication_enabled = True
 
     def _create_cluster_reference(self, endpoint=None):
         cluster_ref = {}
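Clearing the list before appending keeps _set_cluster_pairs() idempotent, which matters now that it also runs again after a successful failback. A small sketch of the list handling (plain variables, not the driver class):

    cluster_pairs = []

    def set_cluster_pairs(remote_info):
        # Repeated pairing calls must not accumulate entries, because
        # failover_host() always picks cluster_pairs[0].
        if cluster_pairs:
            cluster_pairs.clear()
        cluster_pairs.append(remote_info)

    set_cluster_pairs({'backend_id': 'fake', 'mvip': '0.0.0.0'})
    set_cluster_pairs({'backend_id': 'fake', 'mvip': '10.10.23.2'})
    print(cluster_pairs)  # exactly one, current, pair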
@@ -2356,8 +2358,13 @@ class SolidFireDriver(san.SanISCSIDriver):
         failback = False
         volume_updates = []
 
-        LOG.info("Failing over. Secondary ID is: %s",
-                 secondary_id)
+        if not self.replication_enabled:
+            LOG.error("SolidFire driver received failover_host "
+                      "request, however replication is NOT "
+                      "enabled.")
+            raise exception.UnableToFailOver(reason=_("Failover requested "
+                                                      "on non replicated "
+                                                      "backend."))
 
         # NOTE(erlon): For now we only support one replication target device.
         # So, there are two cases we have to deal with here:
@@ -2375,8 +2382,10 @@ class SolidFireDriver(san.SanISCSIDriver):
                     "state.")
             raise exception.InvalidReplicationTarget(msg)
         elif secondary_id == "default" and self.failed_over:
-            remote = self.primary_cluster
+            LOG.info("Failing back to primary cluster.")
+            remote = self._create_cluster_reference()
             failback = True
+
         else:
             repl_configs = self.configuration.replication_device[0]
             if secondary_id and repl_configs['backend_id'] != secondary_id:
@@ -2384,25 +2393,24 @@ class SolidFireDriver(san.SanISCSIDriver):
                        "one in cinder.conf.") % secondary_id
                 raise exception.InvalidReplicationTarget(msg)
 
+            LOG.info("Failing over to secondary cluster %s.", secondary_id)
             remote = self.cluster_pairs[0]
 
-        if not remote or not self.replication_enabled:
-            LOG.error("SolidFire driver received failover_host "
-                      "request, however replication is NOT "
-                      "enabled, or there are no available "
-                      "targets to fail-over to.")
-            raise exception.UnableToFailOver(reason=_("Failover requested "
-                                                      "on non replicated "
-                                                      "backend."))
+        LOG.debug("Target cluster to failover: %s.",
+                  {'name': remote['name'],
+                   'mvip': remote['mvip'],
+                   'clusterAPIVersion': remote['clusterAPIVersion']})
 
         target_vols = self._map_sf_volumes(volumes,
                                            endpoint=remote['endpoint'])
-        LOG.debug("Mapped target_vols: %s", target_vols)
+        LOG.debug("Total Cinder volumes found in target: %d",
+                  len(target_vols))
 
         primary_vols = None
         try:
             primary_vols = self._map_sf_volumes(volumes)
-            LOG.debug("Mapped Primary_vols: %s", target_vols)
+            LOG.debug("Total Cinder volumes found in primary cluster: %d",
+                      len(primary_vols))
         except SolidFireAPIException:
             # API Request failed on source. Failover/failback will skip next
             # calls to it.
@@ -2437,14 +2445,26 @@ class SolidFireDriver(san.SanISCSIDriver):
             else:
                 primary_vol = None
 
-            LOG.debug('Failing-over volume %s, target vol %s, '
-                      'primary vol %s', v, target_vol, primary_vol)
+            LOG.info('Failing-over volume %s.', v.id)
+            LOG.debug('Target vol: %s',
+                      {'access': target_vol['access'],
+                       'accountID': target_vol['accountID'],
+                       'name': target_vol['name'],
+                       'status': target_vol['status'],
+                       'volumeID': target_vol['volumeID']})
+            LOG.debug('Primary vol: %s',
+                      {'access': primary_vol['access'],
+                       'accountID': primary_vol['accountID'],
+                       'name': primary_vol['name'],
+                       'status': primary_vol['status'],
+                       'volumeID': primary_vol['volumeID']})
 
             try:
                 self._failover_volume(target_vol, remote, primary_vol)
 
                 sf_account = self._get_create_account(
                     v.project_id, endpoint=remote['endpoint'])
+                LOG.debug("Target account: %s", sf_account['accountID'])
 
                 conn_info = self._build_connection_info(
                     sf_account, target_vol, endpoint=remote['endpoint'])
@@ -2472,12 +2492,7 @@ class SolidFireDriver(san.SanISCSIDriver):
             except Exception as e:
                 volume_updates.append({'volume_id': v['id'],
                                        'updates': {'status': 'error', }})
 
-                if failback:
-                    LOG.error("Error trying to failback volume %s", v.id)
-                else:
-                    LOG.error("Error trying to failover volume %s", v.id)
-
                 LOG.error("Error trying to failover volume %s", v.id)
                 msg = e.message if hasattr(e, 'message') else e
                 LOG.exception(msg)
@@ -2485,20 +2500,17 @@ class SolidFireDriver(san.SanISCSIDriver):
                 volume_updates.append({'volume_id': v['id'],
                                        'updates': {'status': 'error', }})
 
-        # FIXME(jdg): This introduces a problem for us, up until now our driver
-        # has been pretty much stateless and has allowed customers to run
-        # active/active HA c-vol services with SolidFire. The introduction of
-        # the active_cluster and failed_over attributes is going to break that
-        # but for now that's going to be the trade off of using replication
-        self.active_cluster = remote
-
         if failback:
-            active_cluster_id = None
+            active_cluster_id = ''
             self.failed_over = False
+            # Recreating cluster pairs after a successful failback
+            self._set_cluster_pairs()
         else:
             active_cluster_id = remote['backend_id']
             self.failed_over = True
 
+        self.active_cluster = remote
+
         return active_cluster_id, volume_updates, []
 
     def freeze_backend(self, context):
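The closing bookkeeping now differs by direction: on failback the driver reports the empty string as the active cluster id (which is what the updated test asserts) and recreates its cluster pairs, while on failover it records the remote backend id. In outline (a sketch with plain variables, not the driver attributes):

    def finish(failback, remote, set_cluster_pairs):
        if failback:
            active_cluster_id = ''  # back on the default backend
            failed_over = False
            set_cluster_pairs()     # re-pair for a future failover
        else:
            active_cluster_id = remote['backend_id']
            failed_over = True
        # either way, 'remote' is now the active cluster
        return active_cluster_id, failed_over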
@@ -0,0 +1,6 @@
+---
+fixes:
+  - |
+    NetApp SolidFire driver: Fixed an issue that causes failback
+    to fail after a volume service restart. This change fixes
+    bug `1859653 <https://bugs.launchpad.net/cinder/+bug/1859653>`_.