From 31aa1e279ed21bf24817bfac1fc1645eb2c3dbac Mon Sep 17 00:00:00 2001
From: Ciara Stacke
Date: Fri, 9 Mar 2018 11:14:33 +0000
Subject: [PATCH] VMAX driver - Replication failover performance improvement

Currently, failover of synchronous volumes on a VMAX backend can take a
long time, as each volume is failed over sequentially. This patch
rectifies the issue by creating an internal group to facilitate the use
of group APIs on the backend instead of volume APIs, meaning the volumes
can be failed over concurrently.

Change-Id: Ie805c680662c160b77afcef47a3e0443bf1b15ef
Closes-bug: 1747894
---
 .../volume/drivers/dell_emc/vmax/test_vmax.py | 147 ++-------------
 cinder/volume/drivers/dell_emc/vmax/common.py | 176 +++++++-----------
 .../volume/drivers/dell_emc/vmax/provision.py |  29 ---
 cinder/volume/drivers/dell_emc/vmax/utils.py  |  13 ++
 4 files changed, 91 insertions(+), 274 deletions(-)

diff --git a/cinder/tests/unit/volume/drivers/dell_emc/vmax/test_vmax.py b/cinder/tests/unit/volume/drivers/dell_emc/vmax/test_vmax.py
index 760fd708a89..542f1920224 100644
--- a/cinder/tests/unit/volume/drivers/dell_emc/vmax/test_vmax.py
+++ b/cinder/tests/unit/volume/drivers/dell_emc/vmax/test_vmax.py
@@ -3240,25 +3240,6 @@ class VMAXProvisionTest(test.TestCase):
             self.data.rdf_group_no, self.data.device_id2, extra_specs)
         mock_del_rdf.assert_called_once()
 
-    def test_failover_volume(self):
-        array = self.data.array
-        device_id = self.data.device_id
-        rdf_group_name = self.data.rdf_group_name
-        extra_specs = self.data.extra_specs
-        with mock.patch.object(
-                self.provision.rest, 'modify_rdf_device_pair') as mod_rdf:
-            self.provision.failover_volume(
-                array, device_id, rdf_group_name,
-                extra_specs, '', True)
-            mod_rdf.assert_called_once_with(
-                array, device_id, rdf_group_name, extra_specs)
-            mod_rdf.reset_mock()
-            self.provision.failover_volume(
-                array, device_id, rdf_group_name,
-                extra_specs, '', False)
-            mod_rdf.assert_called_once_with(
-                array, device_id, rdf_group_name, extra_specs)
-
     @mock.patch.object(rest.VMAXRest, 'get_storage_group',
                        return_value=None)
     def test_create_volume_group_success(self, mock_get_sg):
@@ -7030,127 +7011,19 @@ class VMAXCommonReplicationTest(test.TestCase):
 
     def test_failover_host(self):
         volumes = [self.data.test_volume, self.data.test_clone_volume]
-        with mock.patch.object(self.common, '_failover_volume',
-                               return_value={}) as mock_fo:
+        with mock.patch.object(self.common, '_failover_replication',
+                               return_value=(None, {})) as mock_fo:
             self.common.failover_host(volumes)
-            self.assertEqual(2, mock_fo.call_count)
-
-    def test_failover_host_exception(self):
-        volumes = [self.data.test_volume, self.data.test_clone_volume]
-        self.assertRaises(exception.VolumeBackendAPIException,
-                          self.common.failover_host,
-                          volumes, secondary_id="default")
+            mock_fo.assert_called_once()
 
     @mock.patch.object(common.VMAXCommon, 'failover_replication',
                        return_value=({}, {}))
-    @mock.patch.object(common.VMAXCommon, '_failover_volume',
-                       return_value={})
-    def test_failover_host_groups(self, mock_fv, mock_fg):
+    def test_failover_host_groups(self, mock_fg):
        volumes = [self.data.test_volume_group_member]
         group1 = self.data.test_group
         self.common.failover_host(volumes, None, [group1])
-        mock_fv.assert_not_called()
         mock_fg.assert_called_once()
 
-    def test_failover_volume(self):
-        ref_model_update = {
-            'volume_id': self.data.test_volume.id,
-            'updates':
-                {'replication_status': fields.ReplicationStatus.FAILED_OVER,
-                 'replication_driver_data': self.data.provider_location,
-                 'provider_location': self.data.provider_location3}}
-        model_update = self.common._failover_volume(
-            self.data.test_volume, True, self.extra_specs)
-
-        # Decode string representations of dicts into dicts, because
-        # the string representations are randomly ordered and therefore
-        # hard to compare.
-        model_update['updates']['replication_driver_data'] = ast.literal_eval(
-            model_update['updates']['replication_driver_data'])
-
-        model_update['updates']['provider_location'] = ast.literal_eval(
-            model_update['updates']['provider_location'])
-
-        self.assertEqual(ref_model_update, model_update)
-
-        ref_model_update2 = {
-            'volume_id': self.data.test_volume.id,
-            'updates':
-                {'replication_status': fields.ReplicationStatus.ENABLED,
-                 'replication_driver_data': self.data.provider_location,
-                 'provider_location': self.data.provider_location3}}
-        model_update2 = self.common._failover_volume(
-            self.data.test_volume, False, self.extra_specs)
-
-        # Decode string representations of dicts into dicts, because
-        # the string representations are randomly ordered and therefore
-        # hard to compare.
-        model_update2['updates']['replication_driver_data'] = ast.literal_eval(
-            model_update2['updates']['replication_driver_data'])
-
-        model_update2['updates']['provider_location'] = ast.literal_eval(
-            model_update2['updates']['provider_location'])
-
-        self.assertEqual(ref_model_update2, model_update2)
-
-    def test_failover_legacy_volume(self):
-        ref_model_update = {
-            'volume_id': self.data.test_volume.id,
-            'updates':
-                {'replication_status': fields.ReplicationStatus.FAILED_OVER,
-                 'replication_driver_data': self.data.legacy_provider_location,
-                 'provider_location': self.data.legacy_provider_location2}}
-        model_update = self.common._failover_volume(
-            self.data.test_legacy_vol, True, self.extra_specs)
-
-        # Decode string representations of dicts into dicts, because
-        # the string representations are randomly ordered and therefore
-        # hard to compare.
-        model_update['updates']['replication_driver_data'] = ast.literal_eval(
-            model_update['updates']['replication_driver_data'])
-
-        model_update['updates']['provider_location'] = ast.literal_eval(
-            model_update['updates']['provider_location'])
-
-        self.assertEqual(ref_model_update, model_update)
-
-    def test_failover_volume_exception(self):
-        with mock.patch.object(
-                self.provision, 'failover_volume',
-                side_effect=exception.VolumeBackendAPIException):
-            ref_model_update = {
-                'volume_id': self.data.test_volume.id,
-                'updates': {'replication_status':
-                            fields.ReplicationStatus.FAILOVER_ERROR,
-                            'replication_driver_data': six.text_type(
-                                self.data.provider_location3),
-                            'provider_location': six.text_type(
-                                self.data.provider_location)}}
-            model_update = self.common._failover_volume(
-                self.data.test_volume, True, self.extra_specs)
-            self.assertEqual(ref_model_update, model_update)
-
-    @mock.patch.object(
-        common.VMAXCommon, '_find_device_on_array',
-        side_effect=[None, VMAXCommonData.device_id,
-                     VMAXCommonData.device_id, VMAXCommonData.device_id])
-    @mock.patch.object(
-        common.VMAXCommon, '_get_masking_views_from_volume',
-        side_effect=['OS-host-MV', None, exception.VolumeBackendAPIException])
-    def test_recover_volumes_on_failback(self, mock_mv, mock_dev):
-        recovery1 = self.common.recover_volumes_on_failback(
-            self.data.test_volume, self.extra_specs)
-        self.assertEqual('error', recovery1['updates']['status'])
-        recovery2 = self.common.recover_volumes_on_failback(
-            self.data.test_volume, self.extra_specs)
-        self.assertEqual('in-use', recovery2['updates']['status'])
-        recovery3 = self.common.recover_volumes_on_failback(
-            self.data.test_volume, self.extra_specs)
-        self.assertEqual('available', recovery3['updates']['status'])
-        recovery4 = self.common.recover_volumes_on_failback(
-            self.data.test_volume, self.extra_specs)
-        self.assertEqual('available', recovery4['updates']['status'])
-
     def test_get_remote_target_device(self):
         target_device1, _, _, _, _ = (
             self.common.get_remote_target_device(
@@ -7405,6 +7278,13 @@ class VMAXCommonReplicationTest(test.TestCase):
         self.assertEqual(fields.ReplicationStatus.ERROR,
                          model_update['replication_status'])
 
+    @mock.patch.object(provision.VMAXProvision, 'failover_group')
+    def test_failover_replication_metro(self, mock_fo):
+        volumes = [self.data.test_volume]
+        _, vol_model_updates = self.common._failover_replication(
+            volumes, None, None, host=True, is_metro=True)
+        mock_fo.assert_not_called()
+
     @mock.patch.object(utils.VMAXUtils, 'get_volume_group_utils',
                        return_value=(VMAXCommonData.array, {}))
     @mock.patch.object(common.VMAXCommon, '_cleanup_group_replication')
@@ -7485,16 +7365,13 @@ class VMAXCommonReplicationTest(test.TestCase):
 
     @mock.patch.object(common.VMAXCommon, '_failover_replication',
                        return_value=({}, {}))
-    @mock.patch.object(common.VMAXCommon, '_failover_volume',
-                       return_value={})
-    def test_failover_host_async(self, mock_fv, mock_fg):
+    def test_failover_host_async(self, mock_fg):
         volumes = [self.data.test_volume]
         extra_specs = deepcopy(self.extra_specs)
         extra_specs['rep_mode'] = utils.REP_ASYNC
         with mock.patch.object(common.VMAXCommon, '_initial_setup',
                                return_value=extra_specs):
             self.async_driver.common.failover_host(volumes, None, [])
-            mock_fv.assert_not_called()
             mock_fg.assert_called_once()
 
     @mock.patch.object(common.VMAXCommon, '_retype_volume', return_value=True)
diff --git a/cinder/volume/drivers/dell_emc/vmax/common.py b/cinder/volume/drivers/dell_emc/vmax/common.py
index b63334c3add..b74f9c9d612 100644
--- a/cinder/volume/drivers/dell_emc/vmax/common.py
+++ b/cinder/volume/drivers/dell_emc/vmax/common.py
@@ -2843,8 +2843,7 @@ class VMAXCommon(object):
                     % {'backend': self.configuration.safe_get(
                         'volume_backend_name')})
                 LOG.error(exception_message)
-                raise exception.VolumeBackendAPIException(
-                    data=exception_message)
+                return
             else:
                 if self.failover:
                     self.failover = False
@@ -2858,8 +2857,7 @@ class VMAXCommon(object):
                     % {'backend': self.configuration.safe_get(
                         'volume_backend_name')})
                 LOG.error(exception_message)
-                raise exception.VolumeBackendAPIException(
-                    data=exception_message)
+                return
 
         if groups:
             for group in groups:
@@ -2876,118 +2874,74 @@ class VMAXCommon(object):
                 volume_update_list += vol_updates
 
         rep_mode = self.rep_config['mode']
-        if rep_mode == utils.REP_ASYNC:
+
+        sync_vol_list, non_rep_vol_list, async_vol_list, metro_list = (
+            [], [], [], [])
+        for volume in volumes:
+            array = ast.literal_eval(volume.provider_location)['array']
+            extra_specs = self._initial_setup(volume)
+            extra_specs[utils.ARRAY] = array
+            if self.utils.is_replication_enabled(extra_specs):
+                device_id = self._find_device_on_array(
+                    volume, extra_specs)
+                self._sync_check(
+                    array, device_id, volume.name, extra_specs)
+                if rep_mode == utils.REP_SYNC:
+                    sync_vol_list.append(volume)
+                elif rep_mode == utils.REP_ASYNC:
+                    async_vol_list.append(volume)
+                else:
+                    metro_list.append(volume)
+            else:
+                non_rep_vol_list.append(volume)
+
+        if len(async_vol_list) > 0:
             vol_grp_name = self.utils.get_async_rdf_managed_grp_name(
                 self.rep_config)
-            __, volume_update_list = (
+            __, vol_updates = (
                 self._failover_replication(
-                    volumes, None, vol_grp_name,
+                    async_vol_list, None, vol_grp_name,
                     secondary_backend_id=group_fo, host=True))
+            volume_update_list += vol_updates
 
-        for volume in volumes:
-            extra_specs = self._initial_setup(volume)
-            if self.utils.is_replication_enabled(extra_specs):
-                if rep_mode == utils.REP_SYNC:
-                    model_update = self._failover_volume(
-                        volume, self.failover, extra_specs)
-                    volume_update_list.append(model_update)
-            else:
-                if self.failover:
-                    # Since the array has been failed-over,
-                    # volumes without replication should be in error.
+        if len(sync_vol_list) > 0:
+            extra_specs = self._initial_setup(sync_vol_list[0])
+            array = ast.literal_eval(
+                sync_vol_list[0].provider_location)['array']
+            extra_specs[utils.ARRAY] = array
+            temp_grp_name = self.utils.get_temp_failover_grp_name(
+                self.rep_config)
+            self.provision.create_volume_group(
+                array, temp_grp_name, extra_specs)
+            device_ids = self._get_volume_device_ids(sync_vol_list, array)
+            self.masking.add_volumes_to_storage_group(
+                array, device_ids, temp_grp_name, extra_specs)
+            __, vol_updates = (
+                self._failover_replication(
+                    sync_vol_list, None, temp_grp_name,
+                    secondary_backend_id=group_fo, host=True))
+            volume_update_list += vol_updates
+            self.rest.delete_storage_group(array, temp_grp_name)
+
+        if len(metro_list) > 0:
+            __, vol_updates = (
+                self._failover_replication(
+                    metro_list, None, None, secondary_backend_id=group_fo,
+                    host=True, is_metro=True))
+            volume_update_list += vol_updates
+
+        if len(non_rep_vol_list) > 0:
+            if self.failover:
+                # Since the array has been failed-over,
+                # volumes without replication should be in error.
+                for vol in non_rep_vol_list:
                     volume_update_list.append({
-                            'volume_id': volume.id,
+                            'volume_id': vol.id,
                             'updates': {'status': 'error'}})
-                else:
-                    # This is a failback, so we will attempt
-                    # to recover non-failed over volumes
-                    recovery = self.recover_volumes_on_failback(
-                        volume, extra_specs)
-                    volume_update_list.append(recovery)
 
         LOG.info("Failover host complete.")
         return secondary_id, volume_update_list, group_update_list
 
-    def _failover_volume(self, vol, failover, extra_specs):
-        """Failover a volume.
-
-        :param vol: the volume object
-        :param failover: flag to indicate failover or failback -- bool
-        :param extra_specs: the extra specifications
-        :returns: model_update -- dict
-        """
-        loc = vol.provider_location
-        rep_data = vol.replication_driver_data
-        try:
-            name = ast.literal_eval(loc)
-            replication_keybindings = ast.literal_eval(rep_data)
-            try:
-                array = name['array']
-            except KeyError:
-                array = (name['keybindings']
-                         ['SystemName'].split('+')[1].strip('-'))
-            device_id = self._find_device_on_array(vol, {utils.ARRAY: array})
-
-            (target_device, remote_array, rdf_group,
-             local_vol_state, pair_state) = (
-                self.get_remote_target_device(array, vol, device_id))
-
-            self._sync_check(array, device_id, vol.name, extra_specs)
-            self.provision.failover_volume(
-                array, device_id, rdf_group, extra_specs,
-                local_vol_state, failover)
-
-            if failover:
-                new_status = REPLICATION_FAILOVER
-            else:
-                new_status = REPLICATION_ENABLED
-
-            # Transfer ownership to secondary_backend_id and
-            # update provider_location field
-            loc = six.text_type(replication_keybindings)
-            rep_data = six.text_type(name)
-
-        except Exception as ex:
-            msg = ('Failed to failover volume %(volume_id)s. '
-                   'Error: %(error)s.')
-            LOG.error(msg, {'volume_id': vol.id,
-                            'error': ex}, )
-            new_status = FAILOVER_ERROR
-
-        model_update = {'volume_id': vol.id,
-                        'updates':
-                            {'replication_status': new_status,
-                             'replication_driver_data': rep_data,
-                             'provider_location': loc}}
-        return model_update
-
-    def recover_volumes_on_failback(self, volume, extra_specs):
-        """Recover volumes on failback.
-
-        On failback, attempt to recover non RE(replication enabled)
-        volumes from primary array.
-        :param volume: the volume object
-        :param extra_specs: the extra specifications
-        :returns: volume_update
-        """
-        # Check if volume still exists on the primary
-        volume_update = {'volume_id': volume.id}
-        device_id = self._find_device_on_array(volume, extra_specs)
-        if not device_id:
-            volume_update['updates'] = {'status': 'error'}
-        else:
-            try:
-                maskingview = self._get_masking_views_from_volume(
-                    extra_specs[utils.ARRAY], device_id, None)
-            except Exception:
-                maskingview = None
-                LOG.debug("Unable to determine if volume is in masking view.")
-            if not maskingview:
-                volume_update['updates'] = {'status': 'available'}
-            else:
-                volume_update['updates'] = {'status': 'in-use'}
-        return volume_update
-
     def get_remote_target_device(self, array, volume, device_id):
         """Get the remote target for a given volume.
 
@@ -4121,7 +4075,7 @@ class VMAXCommon(object):
 
     def _failover_replication(
             self, volumes, group, vol_grp_name,
-            secondary_backend_id=None, host=False):
+            secondary_backend_id=None, host=False, is_metro=False):
         """Failover replication for a group.
 
         :param volumes: the list of volumes
@@ -4139,7 +4093,8 @@ class VMAXCommon(object):
         try:
             extra_specs = self._initial_setup(volumes[0])
-            array = extra_specs[utils.ARRAY]
+            array = ast.literal_eval(volumes[0].provider_location)['array']
+            extra_specs[utils.ARRAY] = array
             if group:
                 volume_group = self._find_volume_group(array, group)
                 if volume_group:
@@ -4148,12 +4103,13 @@ class VMAXCommon(object):
             if vol_grp_name is None:
                 raise exception.GroupNotFound(group_id=group.id)
 
-            rdf_group_no, _ = self.get_rdf_details(array)
             # As we only support a single replication target, ignore
             # any secondary_backend_id which is not 'default'
             failover = False if secondary_backend_id == 'default' else True
-            self.provision.failover_group(
-                array, vol_grp_name, rdf_group_no, extra_specs, failover)
+            if not is_metro:
+                rdf_group_no, _ = self.get_rdf_details(array)
+                self.provision.failover_group(
+                    array, vol_grp_name, rdf_group_no, extra_specs, failover)
             if failover:
                 model_update.update({
                     'replication_status':
diff --git a/cinder/volume/drivers/dell_emc/vmax/provision.py b/cinder/volume/drivers/dell_emc/vmax/provision.py
index 9123e5a10be..b356bbd4ca2 100644
--- a/cinder/volume/drivers/dell_emc/vmax/provision.py
+++ b/cinder/volume/drivers/dell_emc/vmax/provision.py
@@ -593,35 +593,6 @@ class VMAXProvision(object):
         rc = timer.start(interval=UNLINK_INTERVAL).wait()
         return rc
 
-    def failover_volume(self, array, device_id, rdf_group,
-                        extra_specs, local_vol_state, failover):
-        """Failover or back a volume pair.
-
-        :param array: the array serial number
-        :param device_id: the source device id
-        :param rdf_group: the rdf group number
-        :param extra_specs: extra specs
-        :param local_vol_state: the local volume state
-        :param failover: flag to indicate failover or failback -- bool
-        """
-        if local_vol_state == WRITE_DISABLED:
-            LOG.info("Volume %(dev)s is already failed over.",
-                     {'dev': device_id})
-            return
-        if failover:
-            action = "Failing over"
-        else:
-            action = "Failing back"
-        LOG.info("%(action)s rdf pair: source device: %(src)s ",
-                 {'action': action, 'src': device_id})
-
-        @coordination.synchronized('emc-rg-{rdfg_no}')
-        def _failover_volume(rdfg_no):
-            self.rest.modify_rdf_device_pair(
-                array, device_id, rdfg_no, extra_specs)
-
-        _failover_volume(rdf_group)
-
     def get_or_create_volume_group(self, array, group, extra_specs):
         """Get or create a volume group.
 
diff --git a/cinder/volume/drivers/dell_emc/vmax/utils.py b/cinder/volume/drivers/dell_emc/vmax/utils.py
index ebfae2444b3..003fb9fa530 100644
--- a/cinder/volume/drivers/dell_emc/vmax/utils.py
+++ b/cinder/volume/drivers/dell_emc/vmax/utils.py
@@ -811,3 +811,16 @@ class VMAXUtils(object):
                 [REP_ASYNC, REP_METRO]):
             return True
         return False
+
+    @staticmethod
+    def get_temp_failover_grp_name(rep_config):
+        """Get the temporary group name used for failover.
+
+        :param rep_config: the replication config
+        :return: temp_grp_name
+        """
+        temp_grp_name = ("OS-%(rdf)s-temp-rdf-sg"
+                         % {'rdf': rep_config['rdf_group_label']})
+        LOG.debug("The temp rdf managed group name is %(name)s",
+                  {'name': temp_grp_name})
+        return temp_grp_name
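
The gist of the change, for reviewers: instead of issuing one modify-RDF-pair
call per synchronous volume, failover_host now gathers the REP_SYNC volumes
into a short-lived storage group and fails the whole group over in a single
operation, so the array processes the volumes concurrently. A minimal sketch
of that flow follows; it is not part of the patch, and the backend helpers it
uses (create_group, add_to_group, failover_rdf_group, delete_group) are
hypothetical placeholders, not the driver's real provisioning or REST API.

# Sketch only: 'backend' and its helpers are hypothetical stand-ins used to
# illustrate the group-based failover flow introduced by this patch.
def failover_sync_volumes(backend, array, volumes, rdf_group_no):
    """Fail over all synchronously replicated volumes in one group call."""
    if not volumes:
        return []

    # Build a short-lived storage group purely as a vehicle for the call,
    # analogous to get_temp_failover_grp_name() above.
    temp_grp = 'OS-temp-rdf-sg'
    backend.create_group(array, temp_grp)
    backend.add_to_group(array, temp_grp, [v.device_id for v in volumes])

    # One group-level RDF failover replaces len(volumes) per-pair calls,
    # so the volumes are failed over concurrently on the array.
    backend.failover_rdf_group(array, temp_grp, rdf_group_no)

    updates = [{'volume_id': v.id,
                'updates': {'replication_status': 'failed-over'}}
               for v in volumes]

    # The group is internal only; remove it once the failover completes.
    backend.delete_group(array, temp_grp)
    return updates

This mirrors what the patched failover_host does for the sync volume list:
create_volume_group, add_volumes_to_storage_group, _failover_replication on
the temporary group, then delete_storage_group once the failover is done.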