Merge "VMAX driver - Replication failover performance improvement"
Commit b89e73a2ac
@@ -3240,25 +3240,6 @@ class VMAXProvisionTest(test.TestCase):
                self.data.rdf_group_no, self.data.device_id2, extra_specs)
            mock_del_rdf.assert_called_once()

    def test_failover_volume(self):
        array = self.data.array
        device_id = self.data.device_id
        rdf_group_name = self.data.rdf_group_name
        extra_specs = self.data.extra_specs
        with mock.patch.object(
                self.provision.rest, 'modify_rdf_device_pair') as mod_rdf:
            self.provision.failover_volume(
                array, device_id, rdf_group_name,
                extra_specs, '', True)
            mod_rdf.assert_called_once_with(
                array, device_id, rdf_group_name, extra_specs)
            mod_rdf.reset_mock()
            self.provision.failover_volume(
                array, device_id, rdf_group_name,
                extra_specs, '', False)
            mod_rdf.assert_called_once_with(
                array, device_id, rdf_group_name, extra_specs)

    @mock.patch.object(rest.VMAXRest, 'get_storage_group',
                       return_value=None)
    def test_create_volume_group_success(self, mock_get_sg):
@@ -7030,127 +7011,19 @@ class VMAXCommonReplicationTest(test.TestCase):

    def test_failover_host(self):
        volumes = [self.data.test_volume, self.data.test_clone_volume]
        with mock.patch.object(self.common, '_failover_volume',
                               return_value={}) as mock_fo:
        with mock.patch.object(self.common, '_failover_replication',
                               return_value=(None, {})) as mock_fo:
            self.common.failover_host(volumes)
            self.assertEqual(2, mock_fo.call_count)

    def test_failover_host_exception(self):
        volumes = [self.data.test_volume, self.data.test_clone_volume]
        self.assertRaises(exception.VolumeBackendAPIException,
                          self.common.failover_host,
                          volumes, secondary_id="default")
        mock_fo.assert_called_once()

    @mock.patch.object(common.VMAXCommon, 'failover_replication',
                       return_value=({}, {}))
    @mock.patch.object(common.VMAXCommon, '_failover_volume',
                       return_value={})
    def test_failover_host_groups(self, mock_fv, mock_fg):
    def test_failover_host_groups(self, mock_fg):
        volumes = [self.data.test_volume_group_member]
        group1 = self.data.test_group
        self.common.failover_host(volumes, None, [group1])
        mock_fv.assert_not_called()
        mock_fg.assert_called_once()

    def test_failover_volume(self):
        ref_model_update = {
            'volume_id': self.data.test_volume.id,
            'updates':
                {'replication_status': fields.ReplicationStatus.FAILED_OVER,
                 'replication_driver_data': self.data.provider_location,
                 'provider_location': self.data.provider_location3}}
        model_update = self.common._failover_volume(
            self.data.test_volume, True, self.extra_specs)

        # Decode string representations of dicts into dicts, because
        # the string representations are randomly ordered and therefore
        # hard to compare.
        model_update['updates']['replication_driver_data'] = ast.literal_eval(
            model_update['updates']['replication_driver_data'])

        model_update['updates']['provider_location'] = ast.literal_eval(
            model_update['updates']['provider_location'])

        self.assertEqual(ref_model_update, model_update)

        ref_model_update2 = {
            'volume_id': self.data.test_volume.id,
            'updates':
                {'replication_status': fields.ReplicationStatus.ENABLED,
                 'replication_driver_data': self.data.provider_location,
                 'provider_location': self.data.provider_location3}}
        model_update2 = self.common._failover_volume(
            self.data.test_volume, False, self.extra_specs)

        # Decode string representations of dicts into dicts, because
        # the string representations are randomly ordered and therefore
        # hard to compare.
        model_update2['updates']['replication_driver_data'] = ast.literal_eval(
            model_update2['updates']['replication_driver_data'])

        model_update2['updates']['provider_location'] = ast.literal_eval(
            model_update2['updates']['provider_location'])

        self.assertEqual(ref_model_update2, model_update2)

    def test_failover_legacy_volume(self):
        ref_model_update = {
            'volume_id': self.data.test_volume.id,
            'updates':
                {'replication_status': fields.ReplicationStatus.FAILED_OVER,
                 'replication_driver_data': self.data.legacy_provider_location,
                 'provider_location': self.data.legacy_provider_location2}}
        model_update = self.common._failover_volume(
            self.data.test_legacy_vol, True, self.extra_specs)

        # Decode string representations of dicts into dicts, because
        # the string representations are randomly ordered and therefore
        # hard to compare.
        model_update['updates']['replication_driver_data'] = ast.literal_eval(
            model_update['updates']['replication_driver_data'])

        model_update['updates']['provider_location'] = ast.literal_eval(
            model_update['updates']['provider_location'])

        self.assertEqual(ref_model_update, model_update)

    def test_failover_volume_exception(self):
        with mock.patch.object(
                self.provision, 'failover_volume',
                side_effect=exception.VolumeBackendAPIException):
            ref_model_update = {
                'volume_id': self.data.test_volume.id,
                'updates': {'replication_status':
                            fields.ReplicationStatus.FAILOVER_ERROR,
                            'replication_driver_data': six.text_type(
                                self.data.provider_location3),
                            'provider_location': six.text_type(
                                self.data.provider_location)}}
            model_update = self.common._failover_volume(
                self.data.test_volume, True, self.extra_specs)
            self.assertEqual(ref_model_update, model_update)

    @mock.patch.object(
        common.VMAXCommon, '_find_device_on_array',
        side_effect=[None, VMAXCommonData.device_id,
                     VMAXCommonData.device_id, VMAXCommonData.device_id])
    @mock.patch.object(
        common.VMAXCommon, '_get_masking_views_from_volume',
        side_effect=['OS-host-MV', None, exception.VolumeBackendAPIException])
    def test_recover_volumes_on_failback(self, mock_mv, mock_dev):
        recovery1 = self.common.recover_volumes_on_failback(
            self.data.test_volume, self.extra_specs)
        self.assertEqual('error', recovery1['updates']['status'])
        recovery2 = self.common.recover_volumes_on_failback(
            self.data.test_volume, self.extra_specs)
        self.assertEqual('in-use', recovery2['updates']['status'])
        recovery3 = self.common.recover_volumes_on_failback(
            self.data.test_volume, self.extra_specs)
        self.assertEqual('available', recovery3['updates']['status'])
        recovery4 = self.common.recover_volumes_on_failback(
            self.data.test_volume, self.extra_specs)
        self.assertEqual('available', recovery4['updates']['status'])

    def test_get_remote_target_device(self):
        target_device1, _, _, _, _ = (
            self.common.get_remote_target_device(
@@ -7405,6 +7278,13 @@ class VMAXCommonReplicationTest(test.TestCase):
        self.assertEqual(fields.ReplicationStatus.ERROR,
                         model_update['replication_status'])

    @mock.patch.object(provision.VMAXProvision, 'failover_group')
    def test_failover_replication_metro(self, mock_fo):
        volumes = [self.data.test_volume]
        _, vol_model_updates = self.common._failover_replication(
            volumes, group, None, host=True, is_metro=True)
        mock_fo.assert_not_called()

    @mock.patch.object(utils.VMAXUtils, 'get_volume_group_utils',
                       return_value=(VMAXCommonData.array, {}))
    @mock.patch.object(common.VMAXCommon, '_cleanup_group_replication')
@@ -7485,16 +7365,13 @@ class VMAXCommonReplicationTest(test.TestCase):

    @mock.patch.object(common.VMAXCommon, '_failover_replication',
                       return_value=({}, {}))
    @mock.patch.object(common.VMAXCommon, '_failover_volume',
                       return_value={})
    def test_failover_host_async(self, mock_fv, mock_fg):
    def test_failover_host_async(self, mock_fg):
        volumes = [self.data.test_volume]
        extra_specs = deepcopy(self.extra_specs)
        extra_specs['rep_mode'] = utils.REP_ASYNC
        with mock.patch.object(common.VMAXCommon, '_initial_setup',
                               return_value=extra_specs):
            self.async_driver.common.failover_host(volumes, None, [])
        mock_fv.assert_not_called()
        mock_fg.assert_called_once()

    @mock.patch.object(common.VMAXCommon, '_retype_volume', return_value=True)
@@ -2843,8 +2843,7 @@ class VMAXCommon(object):
                    % {'backend': self.configuration.safe_get(
                        'volume_backend_name')})
                LOG.error(exception_message)
                raise exception.VolumeBackendAPIException(
                    data=exception_message)
                return
        else:
            if self.failover:
                self.failover = False
@@ -2858,8 +2857,7 @@ class VMAXCommon(object):
                    % {'backend': self.configuration.safe_get(
                        'volume_backend_name')})
                LOG.error(exception_message)
                raise exception.VolumeBackendAPIException(
                    data=exception_message)
                return

        if groups:
            for group in groups:
@@ -2876,118 +2874,74 @@ class VMAXCommon(object):
                volume_update_list += vol_updates

        rep_mode = self.rep_config['mode']
        if rep_mode == utils.REP_ASYNC:

        sync_vol_list, non_rep_vol_list, async_vol_list, metro_list = (
            [], [], [], [])
        for volume in volumes:
            array = ast.literal_eval(volume.provider_location)['array']
            extra_specs = self._initial_setup(volume)
            extra_specs[utils.ARRAY] = array
            if self.utils.is_replication_enabled(extra_specs):
                device_id = self._find_device_on_array(
                    volume, extra_specs)
                self._sync_check(
                    array, device_id, volume.name, extra_specs)
                if rep_mode == utils.REP_SYNC:
                    sync_vol_list.append(volume)
                elif rep_mode == utils.REP_ASYNC:
                    async_vol_list.append(volume)
                else:
                    metro_list.append(volume)
            else:
                non_rep_vol_list.append(volume)

        if len(async_vol_list) > 0:
            vol_grp_name = self.utils.get_async_rdf_managed_grp_name(
                self.rep_config)
            __, volume_update_list = (
            __, vol_updates = (
                self._failover_replication(
                    volumes, None, vol_grp_name,
                    async_vol_list, None, vol_grp_name,
                    secondary_backend_id=group_fo, host=True))
            volume_update_list += vol_updates

        for volume in volumes:
            extra_specs = self._initial_setup(volume)
            if self.utils.is_replication_enabled(extra_specs):
                if rep_mode == utils.REP_SYNC:
                    model_update = self._failover_volume(
                        volume, self.failover, extra_specs)
                    volume_update_list.append(model_update)
            else:
                if self.failover:
                    # Since the array has been failed-over,
                    # volumes without replication should be in error.
        if len(sync_vol_list) > 0:
            extra_specs = self._initial_setup(sync_vol_list[0])
            array = ast.literal_eval(
                sync_vol_list[0].provider_location)['array']
            extra_specs[utils.ARRAY] = array
            temp_grp_name = self.utils.get_temp_failover_grp_name(
                self.rep_config)
            self.provision.create_volume_group(
                array, temp_grp_name, extra_specs)
            device_ids = self._get_volume_device_ids(sync_vol_list, array)
            self.masking.add_volumes_to_storage_group(
                array, device_ids, temp_grp_name, extra_specs)
            __, vol_updates = (
                self._failover_replication(
                    sync_vol_list, None, temp_grp_name,
                    secondary_backend_id=group_fo, host=True))
            volume_update_list += vol_updates
            self.rest.delete_storage_group(array, temp_grp_name)

        if len(metro_list) > 0:
            __, vol_updates = (
                self._failover_replication(
                    sync_vol_list, None, None, secondary_backend_id=group_fo,
                    host=True, is_metro=True))
            volume_update_list += vol_updates

        if len(non_rep_vol_list) > 0:
            if self.failover:
                # Since the array has been failed-over,
                # volumes without replication should be in error.
                for vol in non_rep_vol_list:
                    volume_update_list.append({
                        'volume_id': volume.id,
                        'volume_id': vol.id,
                        'updates': {'status': 'error'}})
            else:
                # This is a failback, so we will attempt
                # to recover non-failed over volumes
                recovery = self.recover_volumes_on_failback(
                    volume, extra_specs)
                volume_update_list.append(recovery)

        LOG.info("Failover host complete.")
        return secondary_id, volume_update_list, group_update_list

    def _failover_volume(self, vol, failover, extra_specs):
        """Failover a volume.

        :param vol: the volume object
        :param failover: flag to indicate failover or failback -- bool
        :param extra_specs: the extra specifications
        :returns: model_update -- dict
        """
        loc = vol.provider_location
        rep_data = vol.replication_driver_data
        try:
            name = ast.literal_eval(loc)
            replication_keybindings = ast.literal_eval(rep_data)
            try:
                array = name['array']
            except KeyError:
                array = (name['keybindings']
                         ['SystemName'].split('+')[1].strip('-'))
            device_id = self._find_device_on_array(vol, {utils.ARRAY: array})

            (target_device, remote_array, rdf_group,
             local_vol_state, pair_state) = (
                self.get_remote_target_device(array, vol, device_id))

            self._sync_check(array, device_id, vol.name, extra_specs)
            self.provision.failover_volume(
                array, device_id, rdf_group, extra_specs,
                local_vol_state, failover)

            if failover:
                new_status = REPLICATION_FAILOVER
            else:
                new_status = REPLICATION_ENABLED

            # Transfer ownership to secondary_backend_id and
            # update provider_location field
            loc = six.text_type(replication_keybindings)
            rep_data = six.text_type(name)

        except Exception as ex:
            msg = ('Failed to failover volume %(volume_id)s. '
                   'Error: %(error)s.')
            LOG.error(msg, {'volume_id': vol.id,
                            'error': ex}, )
            new_status = FAILOVER_ERROR

        model_update = {'volume_id': vol.id,
                        'updates':
                            {'replication_status': new_status,
                             'replication_driver_data': rep_data,
                             'provider_location': loc}}
        return model_update

    def recover_volumes_on_failback(self, volume, extra_specs):
        """Recover volumes on failback.

        On failback, attempt to recover non RE(replication enabled)
        volumes from primary array.
        :param volume: the volume object
        :param extra_specs: the extra specifications
        :returns: volume_update
        """
        # Check if volume still exists on the primary
        volume_update = {'volume_id': volume.id}
        device_id = self._find_device_on_array(volume, extra_specs)
        if not device_id:
            volume_update['updates'] = {'status': 'error'}
        else:
            try:
                maskingview = self._get_masking_views_from_volume(
                    extra_specs[utils.ARRAY], device_id, None)
            except Exception:
                maskingview = None
                LOG.debug("Unable to determine if volume is in masking view.")
            if not maskingview:
                volume_update['updates'] = {'status': 'available'}
            else:
                volume_update['updates'] = {'status': 'in-use'}
        return volume_update

    def get_remote_target_device(self, array, volume, device_id):
        """Get the remote target for a given volume.

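For reference, below is a minimal standalone sketch (not part of this commit) of the grouping step that the reworked failover_host performs above: volumes are split into sync, async, metro and non-replicated buckets so each set can then be failed over with a single group call instead of one call per volume. The helper name sort_volumes_by_rep_mode and the constant values are illustrative assumptions only.

# Illustrative sketch only -- mirrors the sync/async/metro/non-replicated
# buckets built in failover_host above; not code from the commit.
REP_SYNC, REP_ASYNC, REP_METRO = 'Synchronous', 'Asynchronous', 'Metro'


def sort_volumes_by_rep_mode(volumes, get_rep_mode):
    """Split volumes into sync, async, metro and non-replicated lists.

    :param volumes: iterable of volume objects
    :param get_rep_mode: callable returning a volume's rep mode, or None
    :returns: (sync_list, async_list, metro_list, non_rep_list)
    """
    sync_list, async_list, metro_list, non_rep_list = [], [], [], []
    for volume in volumes:
        mode = get_rep_mode(volume)
        if mode == REP_SYNC:
            sync_list.append(volume)
        elif mode == REP_ASYNC:
            async_list.append(volume)
        elif mode == REP_METRO:
            metro_list.append(volume)
        else:
            non_rep_list.append(volume)
    return sync_list, async_list, metro_list, non_rep_list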
@@ -4121,7 +4075,7 @@ class VMAXCommon(object):

    def _failover_replication(
            self, volumes, group, vol_grp_name,
            secondary_backend_id=None, host=False):
            secondary_backend_id=None, host=False, is_metro=False):
        """Failover replication for a group.

        :param volumes: the list of volumes
@@ -4139,7 +4093,8 @@ class VMAXCommon(object):

        try:
            extra_specs = self._initial_setup(volumes[0])
            array = extra_specs[utils.ARRAY]
            array = ast.literal_eval(volumes[0].provider_location)['array']
            extra_specs[utils.ARRAY] = array
            if group:
                volume_group = self._find_volume_group(array, group)
                if volume_group:
@@ -4148,12 +4103,13 @@ class VMAXCommon(object):
                if vol_grp_name is None:
                    raise exception.GroupNotFound(group_id=group.id)

            rdf_group_no, _ = self.get_rdf_details(array)
            # As we only support a single replication target, ignore
            # any secondary_backend_id which is not 'default'
            failover = False if secondary_backend_id == 'default' else True
            self.provision.failover_group(
                array, vol_grp_name, rdf_group_no, extra_specs, failover)
            if not is_metro:
                rdf_group_no, _ = self.get_rdf_details(array)
                self.provision.failover_group(
                    array, vol_grp_name, rdf_group_no, extra_specs, failover)
            if failover:
                model_update.update({
                    'replication_status':
@@ -593,35 +593,6 @@ class VMAXProvision(object):
            rc = timer.start(interval=UNLINK_INTERVAL).wait()
        return rc

    def failover_volume(self, array, device_id, rdf_group,
                        extra_specs, local_vol_state, failover):
        """Failover or back a volume pair.

        :param array: the array serial number
        :param device_id: the source device id
        :param rdf_group: the rdf group number
        :param extra_specs: extra specs
        :param local_vol_state: the local volume state
        :param failover: flag to indicate failover or failback -- bool
        """
        if local_vol_state == WRITE_DISABLED:
            LOG.info("Volume %(dev)s is already failed over.",
                     {'dev': device_id})
            return
        if failover:
            action = "Failing over"
        else:
            action = "Failing back"
        LOG.info("%(action)s rdf pair: source device: %(src)s ",
                 {'action': action, 'src': device_id})

        @coordination.synchronized('emc-rg-{rdfg_no}')
        def _failover_volume(rdfg_no):
            self.rest.modify_rdf_device_pair(
                array, device_id, rdfg_no, extra_specs)

        _failover_volume(rdf_group)

    def get_or_create_volume_group(self, array, group, extra_specs):
        """Get or create a volume group.

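For context, the removed provision.failover_volume above serialized every modify_rdf_device_pair call behind a per-RDF-group lock via the @coordination.synchronized('emc-rg-{rdfg_no}') decorator, so volumes in the same RDF group were failed over one pair at a time. The sketch below is a rough, hypothetical approximation of that serialization pattern using a plain threading lock registry rather than cinder's coordination module; the function and parameter names are illustrative only.

# Hypothetical illustration only -- approximates the per-RDF-group
# serialization of the removed failover_volume() with plain threading
# locks instead of cinder's coordination.synchronized decorator.
import threading
from collections import defaultdict

_rdf_group_locks = defaultdict(threading.Lock)


def failover_pair(rest_client, array, device_id, rdfg_no, extra_specs):
    """Fail over one RDF pair, serialized per RDF group."""
    with _rdf_group_locks[rdfg_no]:
        rest_client.modify_rdf_device_pair(
            array, device_id, rdfg_no, extra_specs)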
@@ -811,3 +811,16 @@ class VMAXUtils(object):
                [REP_ASYNC, REP_METRO]):
            return True
        return False

    @staticmethod
    def get_temp_failover_grp_name(rep_config):
        """Get the temporary group name used for failover.

        :param rep_config: the replication config
        :return: temp_grp_name
        """
        temp_grp_name = ("OS-%(rdf)s-temp-rdf-sg"
                         % {'rdf': rep_config['rdf_group_label']})
        LOG.debug("The temp rdf managed group name is %(name)s",
                  {'name': temp_grp_name})
        return temp_grp_name
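A small usage sketch (assumed sample data, not from the commit) showing the temporary failover group name this new helper produces:

# Illustrative only: example rep_config and the resulting temp group name,
# following the format string used by get_temp_failover_grp_name above.
rep_config = {'rdf_group_label': 'os-rdf-1'}
temp_grp_name = ("OS-%(rdf)s-temp-rdf-sg"
                 % {'rdf': rep_config['rdf_group_label']})
print(temp_grp_name)  # -> OS-os-rdf-1-temp-rdf-sg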