From 0ee414082318d22b3ed19acad6a479cb105c30e5 Mon Sep 17 00:00:00 2001 From: danielarthurt Date: Mon, 6 Apr 2020 13:26:13 +0000 Subject: [PATCH] [NetApp] Fix falsely report migration cancelation success NetApp ONTAP share delete operation can fail sometimes when is triggered immediately after migration cancelation on a overloaded NetApp backend. Migration cancelation invokes "abort_volume_move" which is an asynchronous API. If share delete operation is requested immediately after call the former API, it fails because the "abort_volume_move" is still in progress. Now NetApp cDOT driver checks, for a period of time, if the ``volume-move-abort`` operation has ended before report migration cancelation success. Change-Id: I76e11fef27c9723f019cfdfdc6ea86878db78776 Closes-Bug: #1688620 --- .../netapp/dataontap/cluster_mode/lib_base.py | 29 +++++++++ manila/share/drivers/netapp/options.py | 8 ++- .../dataontap/cluster_mode/test_lib_base.py | 60 +++++++++++++++---- ...igration-cancelation-fb913131eb8eb82a.yaml | 8 +++ 4 files changed, 91 insertions(+), 14 deletions(-) create mode 100644 releasenotes/notes/bug-1688620-netapp-migration-cancelation-fb913131eb8eb82a.yaml diff --git a/manila/share/drivers/netapp/dataontap/cluster_mode/lib_base.py b/manila/share/drivers/netapp/dataontap/cluster_mode/lib_base.py index 935d1cffd5..b49cc898db 100644 --- a/manila/share/drivers/netapp/dataontap/cluster_mode/lib_base.py +++ b/manila/share/drivers/netapp/dataontap/cluster_mode/lib_base.py @@ -2512,6 +2512,8 @@ class NetAppCmodeFileStorageLibrary(object): """Abort an ongoing migration.""" vserver, vserver_client = self._get_vserver(share_server=share_server) share_volume = self._get_backend_share_name(source_share['id']) + retries = (self.configuration.netapp_migration_cancel_timeout / 5 or + 1) try: self._get_volume_move_status(source_share, share_server) @@ -2521,6 +2523,33 @@ class NetAppCmodeFileStorageLibrary(object): self._client.abort_volume_move(share_volume, vserver) + @manila_utils.retry(exception.InUse, interval=5, + retries=retries, backoff_rate=1) + def wait_for_migration_cancel_complete(): + move_status = self._get_volume_move_status(source_share, + share_server) + if move_status['state'] == 'failed': + return + else: + msg = "Migration cancelation isn't finished yet." + raise exception.InUse(message=msg) + + try: + wait_for_migration_cancel_complete() + except exception.InUse: + move_status = self._get_volume_move_status(source_share, + share_server) + msg_args = { + 'share_move_state': move_status['state'] + } + msg = _("Migration cancelation was not successful. The share " + "migration state failed while transitioning from " + "%(share_move_state)s state to 'failed'. Retries " + "exhausted.") % msg_args + raise exception.NetAppException(message=msg) + except exception.NetAppException: + LOG.exception("Could not get volume move status.") + msg = ("Share volume move operation for share %(shr)s from host " "%(src)s to %(dest)s was successfully aborted.") msg_args = { diff --git a/manila/share/drivers/netapp/options.py b/manila/share/drivers/netapp/options.py index c5af307c41..d3e3d2d540 100644 --- a/manila/share/drivers/netapp/options.py +++ b/manila/share/drivers/netapp/options.py @@ -156,7 +156,13 @@ netapp_data_motion_opts = [ default=3600, # One Hour, help='The maximum time in seconds to wait for the completion ' 'of a volume clone split operation in order to start a ' - 'volume move.'), ] + 'volume move.'), + cfg.IntOpt('netapp_migration_cancel_timeout', + min=0, + default=3600, # One Hour, + help='The maximum time in seconds that migration cancel ' + 'waits for all migration operations be completely ' + 'aborted.'), ] CONF = cfg.CONF CONF.register_opts(netapp_proxy_opts) diff --git a/manila/tests/share/drivers/netapp/dataontap/cluster_mode/test_lib_base.py b/manila/tests/share/drivers/netapp/dataontap/cluster_mode/test_lib_base.py index e5ea2e464e..1777f26e7e 100644 --- a/manila/tests/share/drivers/netapp/dataontap/cluster_mode/test_lib_base.py +++ b/manila/tests/share/drivers/netapp/dataontap/cluster_mode/test_lib_base.py @@ -5210,35 +5210,69 @@ class NetAppFileStorageLibraryTestCase(test.TestCase): self.assertDictMatch(expected_progress, migration_progress) mock_info_log.assert_called_once() - @ddt.data(utils.annotated('already_canceled', (True, )), - utils.annotated('not_canceled_yet', (False, ))) - def test_migration_cancel(self, already_canceled): + @ddt.data({'state': 'failed'}, + {'state': 'healthy'}) + @ddt.unpack + def test_migration_cancel(self, state): source_snapshots = mock.Mock() snapshot_mappings = mock.Mock() - already_canceled = already_canceled[0] - mock_exception_log = self.mock_object(lib_base.LOG, 'exception') + self.library.configuration.netapp_migration_cancel_timeout = 15 mock_info_log = self.mock_object(lib_base.LOG, 'info') - vol_move_side_effect = (exception.NetAppException - if already_canceled else None) + self.mock_object(self.library, '_get_vserver', mock.Mock(return_value=(fake.VSERVER1, mock.Mock()))) self.mock_object(self.library, '_get_backend_share_name', mock.Mock(return_value=fake.SHARE_NAME)) self.mock_object(self.client, 'abort_volume_move') self.mock_object(self.client, 'get_volume_move_status', - mock.Mock(side_effect=vol_move_side_effect)) + mock.Mock(return_value={'state': state})) - retval = self.library.migration_cancel( + if state == 'failed': + retval = self.library.migration_cancel( + self.context, fake_share.fake_share_instance(), + fake_share.fake_share_instance(), source_snapshots, + snapshot_mappings, share_server=fake.SHARE_SERVER, + destination_share_server='dst_srv') + + self.assertIsNone(retval) + mock_info_log.assert_called_once() + else: + self.assertRaises( + (exception.NetAppException), + self.library.migration_cancel, self.context, + fake_share.fake_share_instance(), + fake_share.fake_share_instance, source_snapshots, + snapshot_mappings, share_server=fake.SHARE_SERVER, + destination_share_server='dst_srv') + + @ddt.data({'already_canceled': True, 'effect': exception.NetAppException}, + {'already_canceled': False, 'effect': + (None, exception.NetAppException)}) + @ddt.unpack + def test_migration_cancel_exception_volume_status(self, already_canceled, + effect): + source_snapshots = mock.Mock() + snapshot_mappings = mock.Mock() + self.library.configuration.netapp_migration_cancel_timeout = 1 + mock_exception_log = self.mock_object(lib_base.LOG, 'exception') + mock_info_log = self.mock_object(lib_base.LOG, 'info') + self.mock_object(self.library, '_get_vserver', + mock.Mock(return_value=(fake.VSERVER1, mock.Mock()))) + self.mock_object(self.library, '_get_backend_share_name', + mock.Mock(return_value=fake.SHARE_NAME)) + self.mock_object(self.client, 'abort_volume_move') + self.mock_object(self.client, 'get_volume_move_status', + mock.Mock(side_effect=effect)) + self.library.migration_cancel( self.context, fake_share.fake_share_instance(), fake_share.fake_share_instance(), source_snapshots, snapshot_mappings, share_server=fake.SHARE_SERVER, destination_share_server='dst_srv') - self.assertIsNone(retval) - if already_canceled: - mock_exception_log.assert_called_once() - else: + mock_exception_log.assert_called_once() + if not already_canceled: mock_info_log.assert_called_once() + self.assertEqual(not already_canceled, self.client.abort_volume_move.called) diff --git a/releasenotes/notes/bug-1688620-netapp-migration-cancelation-fb913131eb8eb82a.yaml b/releasenotes/notes/bug-1688620-netapp-migration-cancelation-fb913131eb8eb82a.yaml new file mode 100644 index 0000000000..35cdf7c319 --- /dev/null +++ b/releasenotes/notes/bug-1688620-netapp-migration-cancelation-fb913131eb8eb82a.yaml @@ -0,0 +1,8 @@ +--- +fixes: + - | + NetApp cDOT driver with a very busy backend may report that the + ``migration cancelation`` operation was successful even if wasn't. Now + during the ``migration cancelation`` operation the current state of the + migration task will be tracked and the operation will only report success + when the operation ends correctly.