Merge "VMAX driver - Replication failover performance improvement"

This commit is contained in:
Zuul 2018-03-26 18:11:39 +00:00 committed by Gerrit Code Review
commit b89e73a2ac
4 changed files with 91 additions and 274 deletions

View File

@ -3240,25 +3240,6 @@ class VMAXProvisionTest(test.TestCase):
self.data.rdf_group_no, self.data.device_id2, extra_specs) self.data.rdf_group_no, self.data.device_id2, extra_specs)
mock_del_rdf.assert_called_once() mock_del_rdf.assert_called_once()
def test_failover_volume(self):
    """Check failover and failback each modify the RDF pair once.

    provision.failover_volume is invoked with failover=True and then
    failover=False; in both cases rest.modify_rdf_device_pair is
    expected to be called exactly once with the same arguments.
    """
    data = self.data
    pair_args = (data.array, data.device_id,
                 data.rdf_group_name, data.extra_specs)
    with mock.patch.object(
            self.provision.rest, 'modify_rdf_device_pair') as mod_rdf:
        for is_failover in (True, False):
            # '' is supplied as the local volume state.
            self.provision.failover_volume(
                *(pair_args + ('', is_failover)))
            mod_rdf.assert_called_once_with(*pair_args)
            # Clear the recorded call so the next pass starts clean.
            mod_rdf.reset_mock()
@mock.patch.object(rest.VMAXRest, 'get_storage_group', @mock.patch.object(rest.VMAXRest, 'get_storage_group',
return_value=None) return_value=None)
def test_create_volume_group_success(self, mock_get_sg): def test_create_volume_group_success(self, mock_get_sg):
@ -7030,127 +7011,19 @@ class VMAXCommonReplicationTest(test.TestCase):
def test_failover_host(self): def test_failover_host(self):
volumes = [self.data.test_volume, self.data.test_clone_volume] volumes = [self.data.test_volume, self.data.test_clone_volume]
with mock.patch.object(self.common, '_failover_volume', with mock.patch.object(self.common, '_failover_replication',
return_value={}) as mock_fo: return_value=(None, {})) as mock_fo:
self.common.failover_host(volumes) self.common.failover_host(volumes)
self.assertEqual(2, mock_fo.call_count) mock_fo.assert_called_once()
def test_failover_host_exception(self):
    """failover_host with secondary_id "default" is expected to raise."""
    volumes = [self.data.test_volume, self.data.test_clone_volume]
    with self.assertRaises(exception.VolumeBackendAPIException):
        self.common.failover_host(volumes, secondary_id="default")
@mock.patch.object(common.VMAXCommon, 'failover_replication', @mock.patch.object(common.VMAXCommon, 'failover_replication',
return_value=({}, {})) return_value=({}, {}))
@mock.patch.object(common.VMAXCommon, '_failover_volume', def test_failover_host_groups(self, mock_fg):
return_value={})
def test_failover_host_groups(self, mock_fv, mock_fg):
volumes = [self.data.test_volume_group_member] volumes = [self.data.test_volume_group_member]
group1 = self.data.test_group group1 = self.data.test_group
self.common.failover_host(volumes, None, [group1]) self.common.failover_host(volumes, None, [group1])
mock_fv.assert_not_called()
mock_fg.assert_called_once() mock_fg.assert_called_once()
def test_failover_volume(self):
    """Check the model update returned by _failover_volume.

    Failover (True) is expected to report FAILED_OVER and failback
    (False) to report ENABLED; in both cases replication_driver_data
    should decode to data.provider_location and provider_location to
    data.provider_location3.
    """
    def _decode(update):
        # Decode string representations of dicts into dicts, because
        # the string representations are randomly ordered and therefore
        # hard to compare.
        inner = update['updates']
        for key in ('replication_driver_data', 'provider_location'):
            inner[key] = ast.literal_eval(inner[key])
        return update

    cases = (
        (True, fields.ReplicationStatus.FAILED_OVER),
        (False, fields.ReplicationStatus.ENABLED),
    )
    for is_failover, expected_status in cases:
        expected = {
            'volume_id': self.data.test_volume.id,
            'updates': {
                'replication_status': expected_status,
                'replication_driver_data': self.data.provider_location,
                'provider_location': self.data.provider_location3}}
        actual = self.common._failover_volume(
            self.data.test_volume, is_failover, self.extra_specs)
        self.assertEqual(expected, _decode(actual))
def test_failover_legacy_volume(self):
    """Check the failover model update for data.test_legacy_vol.

    The update is expected to report FAILED_OVER, with
    replication_driver_data decoding to data.legacy_provider_location
    and provider_location to data.legacy_provider_location2.
    """
    expected = {
        'volume_id': self.data.test_volume.id,
        'updates': {
            'replication_status': fields.ReplicationStatus.FAILED_OVER,
            'replication_driver_data': self.data.legacy_provider_location,
            'provider_location': self.data.legacy_provider_location2}}
    actual = self.common._failover_volume(
        self.data.test_legacy_vol, True, self.extra_specs)
    # Decode string representations of dicts into dicts, because
    # the string representations are randomly ordered and therefore
    # hard to compare.
    decoded = actual['updates']
    for key in ('replication_driver_data', 'provider_location'):
        decoded[key] = ast.literal_eval(decoded[key])
    self.assertEqual(expected, actual)
def test_failover_volume_exception(self):
    """Check the model update when provision.failover_volume raises.

    With the provision call patched to raise
    VolumeBackendAPIException, the update is expected to carry
    FAILOVER_ERROR, with replication_driver_data equal to the text of
    data.provider_location3 and provider_location equal to the text
    of data.provider_location.
    """
    expected_updates = {
        'replication_status': fields.ReplicationStatus.FAILOVER_ERROR,
        'replication_driver_data': six.text_type(
            self.data.provider_location3),
        'provider_location': six.text_type(
            self.data.provider_location)}
    expected = {'volume_id': self.data.test_volume.id,
                'updates': expected_updates}
    with mock.patch.object(
            self.provision, 'failover_volume',
            side_effect=exception.VolumeBackendAPIException):
        actual = self.common._failover_volume(
            self.data.test_volume, True, self.extra_specs)
        self.assertEqual(expected, actual)
@mock.patch.object(
    common.VMAXCommon, '_find_device_on_array',
    side_effect=[None] + [VMAXCommonData.device_id] * 3)
@mock.patch.object(
    common.VMAXCommon, '_get_masking_views_from_volume',
    side_effect=['OS-host-MV', None, exception.VolumeBackendAPIException])
def test_recover_volumes_on_failback(self, mock_mv, mock_dev):
    """Check the status reported by recover_volumes_on_failback.

    The patched side effects drive four consecutive calls:
    1. no device is found                     -> 'error'
    2. device found, masking view returned    -> 'in-use'
    3. device found, no masking view          -> 'available'
    4. device found, masking view lookup raises -> 'available'
    """
    expected_statuses = ('error', 'in-use', 'available', 'available')
    for expected_status in expected_statuses:
        recovery = self.common.recover_volumes_on_failback(
            self.data.test_volume, self.extra_specs)
        self.assertEqual(expected_status, recovery['updates']['status'])
def test_get_remote_target_device(self): def test_get_remote_target_device(self):
target_device1, _, _, _, _ = ( target_device1, _, _, _, _ = (
self.common.get_remote_target_device( self.common.get_remote_target_device(
@ -7405,6 +7278,13 @@ class VMAXCommonReplicationTest(test.TestCase):
self.assertEqual(fields.ReplicationStatus.ERROR, self.assertEqual(fields.ReplicationStatus.ERROR,
model_update['replication_status']) model_update['replication_status'])
@mock.patch.object(provision.VMAXProvision, 'failover_group')
def test_failover_replication_metro(self, mock_fo):
    """With is_metro=True, provision.failover_group must not be called."""
    metro_volumes = [self.data.test_volume]
    # NOTE(review): `group` is not bound inside this test; it presumably
    # resolves to a module-level name in the test file -- confirm that
    # this is the intended group argument.
    fo_kwargs = {'host': True, 'is_metro': True}
    # The result is unpacked only to check it is a two-item return.
    __, __ = self.common._failover_replication(
        metro_volumes, group, None, **fo_kwargs)
    mock_fo.assert_not_called()
@mock.patch.object(utils.VMAXUtils, 'get_volume_group_utils', @mock.patch.object(utils.VMAXUtils, 'get_volume_group_utils',
return_value=(VMAXCommonData.array, {})) return_value=(VMAXCommonData.array, {}))
@mock.patch.object(common.VMAXCommon, '_cleanup_group_replication') @mock.patch.object(common.VMAXCommon, '_cleanup_group_replication')
@ -7485,16 +7365,13 @@ class VMAXCommonReplicationTest(test.TestCase):
@mock.patch.object(common.VMAXCommon, '_failover_replication', @mock.patch.object(common.VMAXCommon, '_failover_replication',
return_value=({}, {})) return_value=({}, {}))
@mock.patch.object(common.VMAXCommon, '_failover_volume', def test_failover_host_async(self, mock_fg):
return_value={})
def test_failover_host_async(self, mock_fv, mock_fg):
volumes = [self.data.test_volume] volumes = [self.data.test_volume]
extra_specs = deepcopy(self.extra_specs) extra_specs = deepcopy(self.extra_specs)
extra_specs['rep_mode'] = utils.REP_ASYNC extra_specs['rep_mode'] = utils.REP_ASYNC
with mock.patch.object(common.VMAXCommon, '_initial_setup', with mock.patch.object(common.VMAXCommon, '_initial_setup',
return_value=extra_specs): return_value=extra_specs):
self.async_driver.common.failover_host(volumes, None, []) self.async_driver.common.failover_host(volumes, None, [])
mock_fv.assert_not_called()
mock_fg.assert_called_once() mock_fg.assert_called_once()
@mock.patch.object(common.VMAXCommon, '_retype_volume', return_value=True) @mock.patch.object(common.VMAXCommon, '_retype_volume', return_value=True)

View File

@ -2843,8 +2843,7 @@ class VMAXCommon(object):
% {'backend': self.configuration.safe_get( % {'backend': self.configuration.safe_get(
'volume_backend_name')}) 'volume_backend_name')})
LOG.error(exception_message) LOG.error(exception_message)
raise exception.VolumeBackendAPIException( return
data=exception_message)
else: else:
if self.failover: if self.failover:
self.failover = False self.failover = False
@ -2858,8 +2857,7 @@ class VMAXCommon(object):
% {'backend': self.configuration.safe_get( % {'backend': self.configuration.safe_get(
'volume_backend_name')}) 'volume_backend_name')})
LOG.error(exception_message) LOG.error(exception_message)
raise exception.VolumeBackendAPIException( return
data=exception_message)
if groups: if groups:
for group in groups: for group in groups:
@ -2876,118 +2874,74 @@ class VMAXCommon(object):
volume_update_list += vol_updates volume_update_list += vol_updates
rep_mode = self.rep_config['mode'] rep_mode = self.rep_config['mode']
if rep_mode == utils.REP_ASYNC:
sync_vol_list, non_rep_vol_list, async_vol_list, metro_list = (
[], [], [], [])
for volume in volumes:
array = ast.literal_eval(volume.provider_location)['array']
extra_specs = self._initial_setup(volume)
extra_specs[utils.ARRAY] = array
if self.utils.is_replication_enabled(extra_specs):
device_id = self._find_device_on_array(
volume, extra_specs)
self._sync_check(
array, device_id, volume.name, extra_specs)
if rep_mode == utils.REP_SYNC:
sync_vol_list.append(volume)
elif rep_mode == utils.REP_ASYNC:
async_vol_list.append(volume)
else:
metro_list.append(volume)
else:
non_rep_vol_list.append(volume)
if len(async_vol_list) > 0:
vol_grp_name = self.utils.get_async_rdf_managed_grp_name( vol_grp_name = self.utils.get_async_rdf_managed_grp_name(
self.rep_config) self.rep_config)
__, volume_update_list = ( __, vol_updates = (
self._failover_replication( self._failover_replication(
volumes, None, vol_grp_name, async_vol_list, None, vol_grp_name,
secondary_backend_id=group_fo, host=True)) secondary_backend_id=group_fo, host=True))
volume_update_list += vol_updates
for volume in volumes: if len(sync_vol_list) > 0:
extra_specs = self._initial_setup(volume) extra_specs = self._initial_setup(sync_vol_list[0])
if self.utils.is_replication_enabled(extra_specs): array = ast.literal_eval(
if rep_mode == utils.REP_SYNC: sync_vol_list[0].provider_location)['array']
model_update = self._failover_volume( extra_specs[utils.ARRAY] = array
volume, self.failover, extra_specs) temp_grp_name = self.utils.get_temp_failover_grp_name(
volume_update_list.append(model_update) self.rep_config)
else: self.provision.create_volume_group(
if self.failover: array, temp_grp_name, extra_specs)
# Since the array has been failed-over, device_ids = self._get_volume_device_ids(sync_vol_list, array)
# volumes without replication should be in error. self.masking.add_volumes_to_storage_group(
array, device_ids, temp_grp_name, extra_specs)
__, vol_updates = (
self._failover_replication(
sync_vol_list, None, temp_grp_name,
secondary_backend_id=group_fo, host=True))
volume_update_list += vol_updates
self.rest.delete_storage_group(array, temp_grp_name)
if len(metro_list) > 0:
__, vol_updates = (
self._failover_replication(
sync_vol_list, None, None, secondary_backend_id=group_fo,
host=True, is_metro=True))
volume_update_list += vol_updates
if len(non_rep_vol_list) > 0:
if self.failover:
# Since the array has been failed-over,
# volumes without replication should be in error.
for vol in non_rep_vol_list:
volume_update_list.append({ volume_update_list.append({
'volume_id': volume.id, 'volume_id': vol.id,
'updates': {'status': 'error'}}) 'updates': {'status': 'error'}})
else:
# This is a failback, so we will attempt
# to recover non-failed over volumes
recovery = self.recover_volumes_on_failback(
volume, extra_specs)
volume_update_list.append(recovery)
LOG.info("Failover host complete.") LOG.info("Failover host complete.")
return secondary_id, volume_update_list, group_update_list return secondary_id, volume_update_list, group_update_list
def _failover_volume(self, vol, failover, extra_specs):
    """Failover a volume.

    On success the provider_location and replication_driver_data
    values of the volume are exchanged in the returned update. If any
    step raises, the error is logged, both values are returned
    unchanged and the status is set to FAILOVER_ERROR.

    :param vol: the volume object
    :param failover: flag to indicate failover or failback -- bool
    :param extra_specs: the extra specifications
    :returns: model_update -- dict
    """
    loc = vol.provider_location
    rep_data = vol.replication_driver_data
    try:
        current_location = ast.literal_eval(loc)
        replication_location = ast.literal_eval(rep_data)
        try:
            array = current_location['array']
        except KeyError:
            # No 'array' key: derive the serial number from the
            # keybindings SystemName instead (presumably the legacy
            # provider_location layout -- confirm).
            system_name = current_location['keybindings']['SystemName']
            array = system_name.split('+')[1].strip('-')
        device_id = self._find_device_on_array(vol, {utils.ARRAY: array})
        remote_details = self.get_remote_target_device(
            array, vol, device_id)
        # Only the rdf group and the local volume state are needed here.
        __, __, rdf_group, local_vol_state, __ = remote_details
        self._sync_check(array, device_id, vol.name, extra_specs)
        self.provision.failover_volume(
            array, device_id, rdf_group, extra_specs,
            local_vol_state, failover)
        new_status = (
            REPLICATION_FAILOVER if failover else REPLICATION_ENABLED)
        # Transfer ownership to secondary_backend_id and
        # update provider_location field
        loc = six.text_type(replication_location)
        rep_data = six.text_type(current_location)
    except Exception as ex:
        LOG.error('Failed to failover volume %(volume_id)s. '
                  'Error: %(error)s.',
                  {'volume_id': vol.id, 'error': ex})
        new_status = FAILOVER_ERROR

    return {'volume_id': vol.id,
            'updates': {'replication_status': new_status,
                        'replication_driver_data': rep_data,
                        'provider_location': loc}}
def recover_volumes_on_failback(self, volume, extra_specs):
    """Recover volumes on failback.

    On failback, attempt to recover non RE(replication enabled)
    volumes from primary array. The reported status is 'error' when
    the device cannot be found, 'in-use' when a masking view is
    returned for it, and 'available' otherwise (including when the
    masking view lookup itself fails).

    :param volume: the volume object
    :param extra_specs: the extra specifications
    :returns: volume_update
    """
    volume_id = volume.id
    # Check if volume still exists on the primary
    device_id = self._find_device_on_array(volume, extra_specs)
    if not device_id:
        return {'volume_id': volume_id, 'updates': {'status': 'error'}}

    try:
        maskingview = self._get_masking_views_from_volume(
            extra_specs[utils.ARRAY], device_id, None)
    except Exception:
        # Best effort: a failed lookup is treated as "no masking view".
        maskingview = None
        LOG.debug("Unable to determine if volume is in masking view.")

    status = 'in-use' if maskingview else 'available'
    return {'volume_id': volume_id, 'updates': {'status': status}}
def get_remote_target_device(self, array, volume, device_id): def get_remote_target_device(self, array, volume, device_id):
"""Get the remote target for a given volume. """Get the remote target for a given volume.
@ -4121,7 +4075,7 @@ class VMAXCommon(object):
def _failover_replication( def _failover_replication(
self, volumes, group, vol_grp_name, self, volumes, group, vol_grp_name,
secondary_backend_id=None, host=False): secondary_backend_id=None, host=False, is_metro=False):
"""Failover replication for a group. """Failover replication for a group.
:param volumes: the list of volumes :param volumes: the list of volumes
@ -4139,7 +4093,8 @@ class VMAXCommon(object):
try: try:
extra_specs = self._initial_setup(volumes[0]) extra_specs = self._initial_setup(volumes[0])
array = extra_specs[utils.ARRAY] array = ast.literal_eval(volumes[0].provider_location)['array']
extra_specs[utils.ARRAY] = array
if group: if group:
volume_group = self._find_volume_group(array, group) volume_group = self._find_volume_group(array, group)
if volume_group: if volume_group:
@ -4148,12 +4103,13 @@ class VMAXCommon(object):
if vol_grp_name is None: if vol_grp_name is None:
raise exception.GroupNotFound(group_id=group.id) raise exception.GroupNotFound(group_id=group.id)
rdf_group_no, _ = self.get_rdf_details(array)
# As we only support a single replication target, ignore # As we only support a single replication target, ignore
# any secondary_backend_id which is not 'default' # any secondary_backend_id which is not 'default'
failover = False if secondary_backend_id == 'default' else True failover = False if secondary_backend_id == 'default' else True
self.provision.failover_group( if not is_metro:
array, vol_grp_name, rdf_group_no, extra_specs, failover) rdf_group_no, _ = self.get_rdf_details(array)
self.provision.failover_group(
array, vol_grp_name, rdf_group_no, extra_specs, failover)
if failover: if failover:
model_update.update({ model_update.update({
'replication_status': 'replication_status':

View File

@ -593,35 +593,6 @@ class VMAXProvision(object):
rc = timer.start(interval=UNLINK_INTERVAL).wait() rc = timer.start(interval=UNLINK_INTERVAL).wait()
return rc return rc
def failover_volume(self, array, device_id, rdf_group,
                    extra_specs, local_vol_state, failover):
    """Failover or back a volume pair.

    Nothing is done when the local volume state is WRITE_DISABLED.
    The failover flag only selects the wording of the log message;
    the same modify_rdf_device_pair call is made in both directions.

    :param array: the array serial number
    :param device_id: the source device id
    :param rdf_group: the rdf group number
    :param extra_specs: extra specs
    :param local_vol_state: the local volume state
    :param failover: flag to indicate failover or failback -- bool
    """
    if local_vol_state == WRITE_DISABLED:
        LOG.info("Volume %(dev)s is already failed over.",
                 {'dev': device_id})
        return

    action = "Failing over" if failover else "Failing back"
    LOG.info("%(action)s rdf pair: source device: %(src)s ",
             {'src': device_id, 'action': action})

    # The lock name is templated on the rdfg_no argument, so the
    # parameter name of the inner function must stay as it is.
    @coordination.synchronized('emc-rg-{rdfg_no}')
    def _failover_volume(rdfg_no):
        self.rest.modify_rdf_device_pair(
            array, device_id, rdfg_no, extra_specs)

    _failover_volume(rdf_group)
def get_or_create_volume_group(self, array, group, extra_specs): def get_or_create_volume_group(self, array, group, extra_specs):
"""Get or create a volume group. """Get or create a volume group.

View File

@ -811,3 +811,16 @@ class VMAXUtils(object):
[REP_ASYNC, REP_METRO]): [REP_ASYNC, REP_METRO]):
return True return True
return False return False
@staticmethod
def get_temp_failover_grp_name(rep_config):
    """Get the temporary group name used for failover.

    The name is built from the 'rdf_group_label' entry of the
    replication config.

    :param rep_config: the replication config
    :returns: temp_grp_name
    """
    rdf_label = rep_config['rdf_group_label']
    temp_grp_name = "OS-%(rdf)s-temp-rdf-sg" % {'rdf': rdf_label}
    LOG.debug("The temp rdf managed group name is %(name)s",
              {'name': temp_grp_name})
    return temp_grp_name