RBD: Flattening of child volumes during deletion

This patch allows delete_volume and delete_snapshot calls
to fail less often when using RBD volume clones and snapshots.

RBD clone v2 support allows remove() to pass in situations
where it would previously fail, but it still fails with an
ImageBusy error in some situations.  For example:

 volume1
   -> snapshot s1 of volume 1
     -> volume2 cloned from snapshot 1
Deleting snapshot s1 would fail with ImageBusy.

This is fixed by using RBD flatten operations to break
dependencies between volumes/snapshots and other RBD volumes
or snapshots.

Delete now works as follows:
  1. Attempt RBD remove()
     This is the "fast path" for removing a simple volume
     that involves no extra overhead.
  2. If busy and the volume has child dependencies,
     flatten those dependencies with RBD flatten()
  3. Attempt RBD remove() again
     This will succeed in more cases than (1) would have.
  4. If remove() failed, use trash_move() instead to move
     the image to the trash instead.
     This allows Cinder deletion of a volume (volume1) to proceed
     in the scenario where volume2 was cloned from snapshot s1 of
     volume1, and snapshot s1 has been trashed and not fully
     deleted from the RBD backend.  (Snapshots in the trash
     namespace are no longer visible but are still in the
     dependency chain.)

This allows Cinder deletions to succeed in most scenarios where
they would previously fail.

In cases where a .clone_snap snapshot is present, we still do a
rename to .deleted instead of deleting/trashing the volume.  This
should be worked on further in a follow-up as it is likely not
necessary most of the time.

A new configuration option, rbd_concurrent_flatten_operations, was
introduced to limit how many flatten calls can be made at the same time.
This is to prevent overloading the backend. The default value is 3.

Co-Authored-By: Eric Harney <eharney@redhat.com>
Co-Authored-By: Sofia Enriquez <lsofia.enriquez@gmail.com>

Closes-Bug: #1969643
Change-Id: I009d0748fdc829ca0b4f99bc9b70dadd19717d04
This commit is contained in:
Eric Harney 2022-04-20 12:08:31 -04:00
parent 0634096238
commit 1a675c9aa1
3 changed files with 404 additions and 231 deletions

View File

@ -193,6 +193,7 @@ class RBDTestCase(test.TestCase):
cfg.rados_connection_interval = 5 cfg.rados_connection_interval = 5
cfg.backup_use_temp_snapshot = False cfg.backup_use_temp_snapshot = False
cfg.enable_deferred_deletion = False cfg.enable_deferred_deletion = False
cfg.rbd_concurrent_flatten_operations = 3
# Because the mocked conf doesn't actually have an underlying oslo conf # Because the mocked conf doesn't actually have an underlying oslo conf
# it doesn't have the set_default method, so we use a fake one. # it doesn't have the set_default method, so we use a fake one.
@ -792,15 +793,15 @@ class RBDTestCase(test.TestCase):
self.driver.delete_volume(self.volume_a) self.driver.delete_volume(self.volume_a)
mock_get_clone_info.assert_called_once_with( mock_get_clone_info.assert_called_once_with(
self.mock_rbd.Image.return_value, self.mock_proxy.return_value.__enter__.return_value,
self.volume_a.name, self.volume_a.name,
None) None)
(self.driver.rbd.Image.return_value self.mock_proxy.return_value.__enter__.return_value.\
.list_snaps.assert_called_once_with()) list_snaps.assert_called_once_with()
client.__enter__.assert_called_once_with() client.__enter__.assert_called_once_with()
client.__exit__.assert_called_once_with(None, None, None) client.__exit__.assert_called_once_with(None, None, None)
mock_delete_backup_snaps.assert_called_once_with( mock_delete_backup_snaps.assert_called_once_with(
self.mock_rbd.Image.return_value) self.mock_proxy.return_value.__enter__.return_value)
self.assertFalse( self.assertFalse(
self.driver.rbd.Image.return_value.unprotect_snap.called) self.driver.rbd.Image.return_value.unprotect_snap.called)
self.assertEqual( self.assertEqual(
@ -827,16 +828,16 @@ class RBDTestCase(test.TestCase):
self.driver.delete_volume(self.volume_a) self.driver.delete_volume(self.volume_a)
mock_get_clone_info.assert_called_once_with( mock_get_clone_info.assert_called_once_with(
self.mock_rbd.Image.return_value, self.mock_proxy.return_value.__enter__.return_value,
self.volume_a.name, self.volume_a.name,
None) None)
m_del_clone_parent_refs.assert_called_once() m_del_clone_parent_refs.assert_not_called()
(self.driver.rbd.Image.return_value self.mock_proxy.return_value.__enter__.return_value.list_snaps.\
.list_snaps.assert_called_once_with()) assert_called_once_with()
client.__enter__.assert_called_once_with() client.__enter__.assert_called_once_with()
client.__exit__.assert_called_once_with(None, None, None) client.__exit__.assert_called_once_with(None, None, None)
m_del_back_snaps.assert_called_once_with( m_del_back_snaps.assert_called_once_with(
self.mock_rbd.Image.return_value) self.mock_proxy.return_value.__enter__.return_value)
self.assertFalse( self.assertFalse(
self.driver.rbd.Image.return_value.unprotect_snap.called) self.driver.rbd.Image.return_value.unprotect_snap.called)
self.assertEqual( self.assertEqual(
@ -859,18 +860,17 @@ class RBDTestCase(test.TestCase):
drv.delete_volume(self.volume_a) drv.delete_volume(self.volume_a)
mock_get_clone_info.assert_called_once_with( mock_get_clone_info.assert_called_once_with(
drv.rbd.Image.return_value, self.mock_proxy.return_value.__enter__.return_value,
self.volume_a.name, self.volume_a.name,
None) None)
drv.rbd.Image.return_value.list_snaps.assert_called_once_with()
client.__enter__.assert_called_once_with() client.__enter__.assert_called_once_with()
client.__exit__.assert_called_once_with(None, None, None) client.__exit__.assert_called_once_with(None, None, None)
mock_delete_backup_snaps.assert_called_once_with( mock_delete_backup_snaps.assert_called_once_with(
drv.rbd.Image.return_value) self.mock_proxy.return_value.__enter__.return_value)
self.assertFalse( self.assertFalse(
drv.rbd.Image.return_value.unprotect_snap.called) drv.rbd.Image.return_value.unprotect_snap.called)
self.assertEqual( self.assertEqual(
1, drv.rbd.RBD.return_value.trash_move.call_count) 0, drv.rbd.RBD.return_value.trash_move.call_count)
self.driver.rbd.RBD.return_value.remove.assert_not_called() self.driver.rbd.RBD.return_value.remove.assert_not_called()
@common_mocks @common_mocks
@ -926,7 +926,7 @@ class RBDTestCase(test.TestCase):
drv.delete_volume(self.volume_a) drv.delete_volume(self.volume_a)
self.assertEqual( self.assertEqual(
1, drv.rbd.RBD.return_value.trash_move.call_count) 0, drv.rbd.RBD.return_value.trash_move.call_count)
@common_mocks @common_mocks
def test_deferred_deletion_w_deleted_parent(self): def test_deferred_deletion_w_deleted_parent(self):
@ -940,16 +940,18 @@ class RBDTestCase(test.TestCase):
drv.delete_volume(self.volume_a) drv.delete_volume(self.volume_a)
self.assertEqual( self.assertEqual(
2, drv.rbd.RBD.return_value.trash_move.call_count) 0, drv.rbd.RBD.return_value.trash_move.call_count)
@common_mocks @common_mocks
def test_delete_volume_not_found_at_open(self): def test_delete_volume_not_found_at_open(self):
self.mock_rbd.Image.side_effect = self.mock_rbd.ImageNotFound self.mock_rbd.Image.side_effect = self.mock_rbd.ImageNotFound
self.mock_proxy.side_effect = self.mock_rbd.ImageNotFound
self.assertIsNone(self.driver.delete_volume(self.volume_a)) self.assertIsNone(self.driver.delete_volume(self.volume_a))
with mock.patch.object(driver, 'RADOSClient') as client: with mock.patch.object(driver, 'RADOSClient') as client:
client = self.mock_client.return_value.__enter__.return_value client = self.mock_client.return_value.__enter__.return_value
self.mock_rbd.Image.assert_called_once_with(client.ioctx, self.mock_proxy.assert_called_once_with(self.driver,
self.volume_a.name) self.volume_a.name,
ioctx=client.ioctx)
# Make sure the exception was raised # Make sure the exception was raised
self.assertEqual([self.mock_rbd.ImageNotFound], RAISED_EXCEPTIONS) self.assertEqual([self.mock_rbd.ImageNotFound], RAISED_EXCEPTIONS)
@ -958,45 +960,58 @@ class RBDTestCase(test.TestCase):
self.mock_rbd.Image.return_value.list_snaps.return_value = [] self.mock_rbd.Image.return_value.list_snaps.return_value = []
self.mock_rbd.RBD.return_value.remove.side_effect = ( self.mock_rbd.RBD.return_value.remove.side_effect = (
self.mock_rbd.ImageBusy) self.mock_rbd.ImageBusy,
None)
mock_get_clone_info = self.mock_object(self.driver, '_get_clone_info')
mock_get_clone_info.return_value = (None, None, None)
mock_delete_backup_snaps = self.mock_object(self.driver, mock_delete_backup_snaps = self.mock_object(self.driver,
'_delete_backup_snaps') '_delete_backup_snaps')
mock_rados_client = self.mock_object(driver, 'RADOSClient') mock_rados_client = self.mock_object(driver, 'RADOSClient')
mock_flatten = self.mock_object(self.driver, '_flatten')
self.driver.delete_volume(self.volume_a) with mock.patch.object(self.driver, '_get_clone_info') as \
mock_get_clone_info:
mock_get_clone_info.return_value = (None, None, None)
self.driver.rbd.Image.return_value.list_children.\
return_value = [('pool1', 'child1'),
('pool1', 'child2')]
self.mock_proxy.return_value.__enter__.return_value.list_children.\
return_value = [('pool1', 'child1'), ('pool1', 'child2')]
self.driver.delete_volume(self.volume_a)
mock_get_clone_info.assert_called_once_with( mock_flatten.assert_has_calls(
self.mock_rbd.Image.return_value, [mock.call('pool1', 'child1'),
self.volume_a.name, mock.call('pool1', 'child2')])
None)
self.mock_rbd.Image.return_value.list_snaps.assert_called_once_with()
mock_rados_client.assert_called_once_with(self.driver)
mock_delete_backup_snaps.assert_called_once_with(
self.mock_rbd.Image.return_value)
self.assertFalse(
self.mock_rbd.Image.return_value.unprotect_snap.called)
self.assertEqual(
1, self.mock_rbd.RBD.return_value.remove.call_count)
self.assertEqual(1, len(RAISED_EXCEPTIONS))
# Make sure the exception was raised
self.assertIn(self.mock_rbd.ImageBusy, RAISED_EXCEPTIONS)
self.mock_rbd.RBD.return_value.trash_move.\ mock_get_clone_info.assert_called_once_with(
assert_called_once_with( self.mock_proxy.return_value.__enter__.return_value,
mock.ANY,
self.volume_a.name, self.volume_a.name,
0) None)
self.mock_proxy.return_value.__enter__.return_value.list_snaps.\
assert_called_once_with()
mock_rados_client.assert_called_once_with(self.driver)
mock_delete_backup_snaps.assert_called_once_with(
self.mock_proxy.return_value.__enter__.return_value)
self.assertFalse(
self.mock_rbd.Image.return_value.unprotect_snap.
called)
self.assertEqual(
2,
self.mock_rbd.RBD.return_value.remove.call_count)
self.assertEqual(1, len(RAISED_EXCEPTIONS))
# Make sure the exception was raised
self.assertIn(self.mock_rbd.ImageBusy,
RAISED_EXCEPTIONS)
self.mock_rbd.RBD.return_value.trash_move.assert_not_called()
@common_mocks @common_mocks
def test_delete_volume_has_snapshots(self): def test_delete_volume_has_snapshots(self):
self.mock_rbd.Image.return_value.list_snaps.return_value = [] self.mock_rbd.Image.return_value.list_snaps.return_value = []
self.mock_rbd.RBD.return_value.remove.side_effect = ( self.mock_rbd.RBD.return_value.remove.side_effect = (
self.mock_rbd.ImageHasSnapshots) self.mock_rbd.ImageHasSnapshots, # initial vol remove attempt
None # removal of child image
)
mock_get_clone_info = self.mock_object(self.driver, mock_get_clone_info = self.mock_object(self.driver,
'_get_clone_info', '_get_clone_info',
return_value=(None, return_value=(None,
@ -1005,18 +1020,70 @@ class RBDTestCase(test.TestCase):
m_del_backup_snaps = self.mock_object(self.driver, m_del_backup_snaps = self.mock_object(self.driver,
'_delete_backup_snaps') '_delete_backup_snaps')
mock_try_remove_volume = self.mock_object(self.driver,
'_try_remove_volume',
return_value=True)
mock_rados_client = self.mock_object(driver, 'RADOSClient')
self.driver.delete_volume(self.volume_a)
mock_get_clone_info.assert_called_once_with(
self.mock_proxy.return_value.__enter__.return_value,
self.volume_a.name,
None)
mock_rados_client.assert_called_once_with(self.driver)
m_del_backup_snaps.assert_called_once_with(
self.mock_proxy.return_value.__enter__.return_value)
self.assertFalse(
self.mock_rbd.Image.return_value.unprotect_snap.called)
self.assertEqual(
1, self.mock_rbd.RBD.return_value.remove.call_count)
self.assertEqual(1, len(RAISED_EXCEPTIONS))
# Make sure the exception was raised
self.assertIn(self.mock_rbd.ImageHasSnapshots,
RAISED_EXCEPTIONS)
self.mock_rbd.RBD.return_value.trash_move.assert_not_called()
mock_try_remove_volume.assert_called_once_with(mock.ANY,
self.volume_a.name)
@common_mocks
def test_delete_volume_has_snapshots_trash(self):
self.mock_rbd.Image.return_value.list_snaps.return_value = []
self.mock_rbd.RBD.return_value.remove.side_effect = (
self.mock_rbd.ImageHasSnapshots, # initial vol remove attempt
None # removal of child image
)
mock_get_clone_info = self.mock_object(self.driver,
'_get_clone_info',
return_value=(None,
None,
None))
m_del_backup_snaps = self.mock_object(self.driver,
'_delete_backup_snaps')
mock_try_remove_volume = self.mock_object(self.driver,
'_try_remove_volume',
return_value=False)
mock_trash_volume = self.mock_object(self.driver,
'_move_volume_to_trash')
with mock.patch.object(driver, 'RADOSClient') as mock_rados_client: with mock.patch.object(driver, 'RADOSClient') as mock_rados_client:
self.driver.delete_volume(self.volume_a) self.driver.delete_volume(self.volume_a)
mock_get_clone_info.assert_called_once_with( mock_get_clone_info.assert_called_once_with(
self.mock_rbd.Image.return_value, self.mock_proxy.return_value.__enter__.return_value,
self.volume_a.name, self.volume_a.name,
None) None)
(self.mock_rbd.Image.return_value.list_snaps self.mock_proxy.return_value.__enter__.return_value.list_snaps.\
.assert_called_once_with()) assert_called_once_with()
mock_rados_client.assert_called_once_with(self.driver) mock_rados_client.assert_called_once_with(self.driver)
m_del_backup_snaps.assert_called_once_with( m_del_backup_snaps.assert_called_once_with(
self.mock_rbd.Image.return_value) self.mock_proxy.return_value.__enter__.return_value)
self.assertFalse( self.assertFalse(
self.mock_rbd.Image.return_value.unprotect_snap.called) self.mock_rbd.Image.return_value.unprotect_snap.called)
self.assertEqual( self.assertEqual(
@ -1027,10 +1094,14 @@ class RBDTestCase(test.TestCase):
RAISED_EXCEPTIONS) RAISED_EXCEPTIONS)
self.mock_rbd.RBD.return_value.trash_move.\ self.mock_rbd.RBD.return_value.trash_move.\
assert_called_once_with( assert_not_called()
mock.ANY,
self.volume_a.name, mock_trash_volume.assert_called_once_with(mock.ANY,
0) self.volume_a.name,
0)
mock_try_remove_volume.assert_called_once_with(mock.ANY,
self.volume_a.name)
@common_mocks @common_mocks
def test_delete_volume_not_found(self): def test_delete_volume_not_found(self):
@ -1046,15 +1117,20 @@ class RBDTestCase(test.TestCase):
mock_get_clone_info = self.mock_object(self.driver, '_get_clone_info') mock_get_clone_info = self.mock_object(self.driver, '_get_clone_info')
mock_get_clone_info.return_value = (None, None, None) mock_get_clone_info.return_value = (None, None, None)
mock_find_clone_snap = self.mock_object(self.driver,
'_find_clone_snap',
return_value=None)
self.assertIsNone(self.driver.delete_volume(self.volume_a)) self.assertIsNone(self.driver.delete_volume(self.volume_a))
image = self.mock_proxy.return_value.__enter__.return_value
mock_get_clone_info.assert_called_once_with( mock_get_clone_info.assert_called_once_with(
self.mock_rbd.Image.return_value, image,
self.volume_a.name, self.volume_a.name,
None) None)
self.mock_rbd.Image.return_value.list_snaps.assert_called_once_with() mock_find_clone_snap.assert_called_once_with(image)
mock_rados_client.assert_called_once_with(self.driver) mock_rados_client.assert_called_once_with(self.driver)
mock_delete_backup_snaps.assert_called_once_with( mock_delete_backup_snaps.assert_called_once_with(image)
self.mock_rbd.Image.return_value)
self.assertFalse( self.assertFalse(
self.mock_rbd.Image.return_value.unprotect_snap.called) self.mock_rbd.Image.return_value.unprotect_snap.called)
self.assertEqual( self.assertEqual(
@ -1083,21 +1159,22 @@ class RBDTestCase(test.TestCase):
return_value=(None, return_value=(None,
None, None,
None)) None))
self.mock_object(self.driver, '_find_clone_snap',
return_value=snapshots[2]['name'])
with mock.patch.object(self.driver, '_delete_backup_snaps') as \ with mock.patch.object(self.driver, '_delete_backup_snaps') as \
mock_delete_backup_snaps: mock_delete_backup_snaps:
self.driver.delete_volume(self.volume_a) self.driver.delete_volume(self.volume_a)
mock_get_clone_info.assert_called_once_with( mock_get_clone_info.assert_called_once_with(
self.mock_rbd.Image.return_value, self.mock_proxy.return_value.__enter__.return_value,
self.volume_a.name, self.volume_a.name,
snapshots[2]['name']) snapshots[2]['name'])
(self.driver.rbd.Image.return_value
.list_snaps.assert_called_once_with())
client.__enter__.assert_called_once_with() client.__enter__.assert_called_once_with()
client.__exit__.assert_called_once_with(None, None, None) client.__exit__.assert_called_once_with(None, None, None)
mock_delete_backup_snaps.assert_called_once_with( mock_delete_backup_snaps.assert_called_once_with(
self.mock_rbd.Image.return_value) self.mock_proxy.return_value.__enter__.return_value)
self.assertFalse( self.assertFalse(
self.driver.rbd.Image.return_value.unprotect_snap.called) self.driver.rbd.Image.return_value.unprotect_snap.called)
self.assertEqual( self.assertEqual(
@ -1281,26 +1358,40 @@ class RBDTestCase(test.TestCase):
proxy.__enter__.return_value = proxy proxy.__enter__.return_value = proxy
proxy.unprotect_snap.side_effect = ( proxy.unprotect_snap.side_effect = (
self.mock_rbd.ImageBusy,
None)
with mock.patch.object(self.driver, '_flatten_children') as \
mock_flatten_children:
self.driver.delete_snapshot(self.snapshot)
mock_flatten_children.assert_called_once_with(mock.ANY,
self.volume_a.name,
self.snapshot.name)
self.assertTrue(proxy.unprotect_snap.called)
self.assertTrue(proxy.remove_snap.called)
@common_mocks
@mock.patch.object(driver.RBDDriver, '_flatten')
@mock.patch('cinder.objects.Volume.get_by_id')
def test_delete_busy_snapshot_fail(self, volume_get_by_id, flatten_mock):
volume_get_by_id.return_value = self.volume_a
proxy = self.mock_proxy.return_value
proxy.__enter__.return_value = proxy
proxy.unprotect_snap.side_effect = (
self.mock_rbd.ImageBusy,
self.mock_rbd.ImageBusy,
self.mock_rbd.ImageBusy) self.mock_rbd.ImageBusy)
flatten_mock.side_effect = exception.SnapshotIsBusy(self.snapshot.name)
with mock.patch.object(self.driver, '_get_children_info') as \ self.assertRaises(exception.SnapshotIsBusy,
mock_get_children_info: self.driver.delete_snapshot,
mock_get_children_info.return_value = [('pool', 'volume2')] self.snapshot)
with mock.patch.object(driver, 'LOG') as \ self.assertTrue(proxy.unprotect_snap.called)
mock_log: self.assertFalse(proxy.remove_snap.called)
self.assertRaises(exception.SnapshotIsBusy,
self.driver.delete_snapshot,
self.snapshot)
mock_get_children_info.assert_called_once_with(
proxy,
self.snapshot.name)
self.assertTrue(mock_log.info.called)
self.assertTrue(proxy.unprotect_snap.called)
self.assertFalse(proxy.remove_snap.called)
@common_mocks @common_mocks
@mock.patch('cinder.objects.Volume.get_by_id') @mock.patch('cinder.objects.Volume.get_by_id')
@ -1325,19 +1416,6 @@ class RBDTestCase(test.TestCase):
self.snapshot) self.snapshot)
image.rollback_to_snap.assert_called_once_with(self.snapshot.name) image.rollback_to_snap.assert_called_once_with(self.snapshot.name)
@common_mocks
def test_get_children_info(self):
volume = self.mock_proxy
volume.set_snap = mock.Mock()
volume.list_children = mock.Mock()
list_children = [('pool', 'volume2')]
volume.list_children.return_value = list_children
info = self.driver._get_children_info(volume,
self.snapshot['name'])
self.assertEqual(list_children, info)
@common_mocks @common_mocks
def test_get_clone_info(self): def test_get_clone_info(self):
volume = self.mock_rbd.Image() volume = self.mock_rbd.Image()

View File

@ -1,4 +1,5 @@
# Copyright 2013 OpenStack Foundation # Copyright 2013 OpenStack Foundation
# Copyright 2022 Red Hat, Inc.
# #
# Licensed under the Apache License, Version 2.0 (the "License"); you may # Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain # not use this file except in compliance with the License. You may obtain
@ -133,6 +134,9 @@ RBD_OPTS = [
cfg.IntOpt('deferred_deletion_purge_interval', default=60, cfg.IntOpt('deferred_deletion_purge_interval', default=60,
help='Number of seconds between runs of the periodic task ' help='Number of seconds between runs of the periodic task '
'to purge volumes tagged for deletion.'), 'to purge volumes tagged for deletion.'),
cfg.IntOpt('rbd_concurrent_flatten_operations', default=3, min=0,
help='Number of flatten operations that will run '
'concurrently on this volume service.')
] ]
CONF = cfg.CONF CONF = cfg.CONF
@ -356,6 +360,10 @@ class RBDDriver(driver.CloneableImageVD, driver.MigrateVD,
self.keyring_data: Optional[str] = None self.keyring_data: Optional[str] = None
self._set_keyring_attributes() self._set_keyring_attributes()
self._semaphore = utils.semaphore_factory(
limit=self.configuration.rbd_concurrent_flatten_operations,
concurrent_processes=1)
def _set_keyring_attributes(self) -> None: def _set_keyring_attributes(self) -> None:
# The rbd_keyring_conf option is not available for OpenStack usage # The rbd_keyring_conf option is not available for OpenStack usage
# for security reasons (OSSN-0085) and in OpenStack we use # for security reasons (OSSN-0085) and in OpenStack we use
@ -830,6 +838,25 @@ class RBDDriver(driver.CloneableImageVD, driver.MigrateVD,
'dst_size': volume.size}) 'dst_size': volume.size})
self._resize(volume) self._resize(volume)
def _flatten_volume(
self,
image_name: str,
client: RADOSClient) -> None:
# Flatten destination volume
try:
with RBDVolumeProxy(self, image_name, client=client,
ioctx=client.ioctx) as dest_volume:
LOG.debug("flattening volume %s", image_name)
dest_volume.flatten()
except Exception as e:
msg = (_("Failed to flatten volume %(volume)s with "
"error: %(error)s.") %
{'volume': image_name,
'error': e})
LOG.exception(msg)
raise exception.VolumeBackendAPIException(data=msg)
def create_cloned_volume( def create_cloned_volume(
self, self,
volume: Volume, volume: Volume,
@ -890,24 +917,12 @@ class RBDDriver(driver.CloneableImageVD, driver.MigrateVD,
# volumes are always flattened. # volumes are always flattened.
if (volume.use_quota and if (volume.use_quota and
depth >= self.configuration.rbd_max_clone_depth): depth >= self.configuration.rbd_max_clone_depth):
LOG.info("maximum clone depth (%d) has been reached - " LOG.info("maximum clone depth (%d) has been reached - "
"flattening dest volume", "flattening dest volume",
self.configuration.rbd_max_clone_depth) self.configuration.rbd_max_clone_depth)
# Flatten destination volume self._flatten_volume(dest_name, client)
try:
with RBDVolumeProxy(self, dest_name, client=client,
ioctx=client.ioctx) as dest_volume:
LOG.debug("flattening dest volume %s", dest_name)
dest_volume.flatten()
except Exception as e:
msg = (_("Failed to flatten volume %(volume)s with "
"error: %(error)s.") %
{'volume': dest_name,
'error': e})
LOG.exception(msg)
src_volume.close()
raise exception.VolumeBackendAPIException(data=msg)
try: try:
# remove temporary snap # remove temporary snap
@ -1168,11 +1183,22 @@ class RBDDriver(driver.CloneableImageVD, driver.MigrateVD,
return volume_update return volume_update
@utils.limit_operations
def _do_flatten(self, volume_name: str, pool: str) -> None:
LOG.debug('flattening %s/%s', pool, volume_name)
try:
with RBDVolumeProxy(self, volume_name, pool=pool) as vol:
vol.flatten()
LOG.debug('flattening of %s/%s has completed',
pool, volume_name)
except self.rbd.ImageNotFound:
LOG.debug('image %s not found during flatten', volume_name)
# do nothing
def _flatten(self, pool: str, volume_name: str) -> None: def _flatten(self, pool: str, volume_name: str) -> None:
LOG.debug('flattening %(pool)s/%(img)s', image = pool + '/' + volume_name
dict(pool=pool, img=volume_name)) LOG.debug('Queueing %s for flattening', image)
with RBDVolumeProxy(self, volume_name, pool) as vol: self._do_flatten(volume_name, pool)
vol.flatten()
def _get_stripe_unit(self, ioctx: 'rados.Ioctx', volume_name: str) -> int: def _get_stripe_unit(self, ioctx: 'rados.Ioctx', volume_name: str) -> int:
"""Return the correct stripe unit for a cloned volume. """Return the correct stripe unit for a cloned volume.
@ -1310,23 +1336,6 @@ class RBDDriver(driver.CloneableImageVD, driver.MigrateVD,
return (None, None, None) return (None, None, None)
def _get_children_info(self,
volume: 'rbd.Image',
snap: Optional[str]) -> list[tuple]:
"""List children for the given snapshot of a volume(image).
Returns a list of (pool, image).
"""
children_list = []
if snap:
volume.set_snap(snap)
children_list = volume.list_children()
volume.set_snap(None)
return children_list
def _delete_clone_parent_refs(self, def _delete_clone_parent_refs(self,
client: RADOSClient, client: RADOSClient,
parent_name: str, parent_name: str,
@ -1369,96 +1378,161 @@ class RBDDriver(driver.CloneableImageVD, driver.MigrateVD,
g_parent_snap = typing.cast(str, g_parent_snap) g_parent_snap = typing.cast(str, g_parent_snap)
self._delete_clone_parent_refs(client, g_parent, g_parent_snap) self._delete_clone_parent_refs(client, g_parent, g_parent_snap)
def delete_volume(self, volume: Volume) -> None: def _flatten_children(self,
"""Deletes a logical volume.""" client_ioctx: 'rados.Ioctx',
volume_name = volume.name volume_name: str,
with RADOSClient(self) as client: snap_name: Optional[str] = None) -> None:
with RBDVolumeProxy(self,
volume_name,
ioctx=client_ioctx) as rbd_image:
if snap_name is not None:
rbd_image.set_snap(snap_name)
children_list = rbd_image.list_children()
for (pool, child_name) in children_list:
LOG.info('Image %(pool)s/%(image)s%(snap)s is dependent '
'on the image %(volume_name)s.',
{'pool': pool,
'image': child_name,
'volume_name': volume_name,
'snap': '@' + snap_name if snap_name else ''})
try: try:
rbd_image = self.rbd.Image(client.ioctx, volume_name) self._flatten(pool, child_name)
except self.rbd.ImageNotFound: except Exception as e:
LOG.info("volume %s no longer exists in backend", LOG.error(e)
volume_name) raise
return
clone_snap = None def _move_volume_to_trash(self,
parent = None client_ioctx: 'rados.Ioctx',
volume_name: str,
delay: int) -> None:
# trash_move() will succeed in some situations when a regular
# remove() call will fail due to image dependencies
LOG.debug("moving volume %s to trash", volume_name)
try:
self.RBDProxy().trash_move(client_ioctx,
volume_name,
delay)
except self.rbd.ImageBusy:
msg = _('ImageBusy error raised while trashing RBD '
'volume.')
LOG.warning(msg)
raise exception.VolumeIsBusy(msg, volume_name=volume_name)
# Ensure any backup snapshots are deleted def _try_remove_volume(self, client: 'rados', volume_name: str) -> bool:
self._delete_backup_snaps(rbd_image) # Try a couple of times to delete the volume, rather than
# stopping on the first error.
# If the volume has non-clone snapshots this delete is expected to # In the event of simultaneous Cinder delete operations,
# raise VolumeIsBusy so do so straight away. # this gives a window for other deletes of snapshots and images
# to complete, freeing dependencies which allow this remove to
# succeed.
@utils.retry((self.rbd.ImageBusy, self.rbd.ImageHasSnapshots),
self.configuration.rados_connection_interval,
self.configuration.rados_connection_retries)
def _do_try_remove_volume(self, client, volume_name: str) -> bool:
try: try:
snaps = rbd_image.list_snaps() LOG.debug('Trying to remove image %s', volume_name)
for snap in snaps: self.RBDProxy().remove(client.ioctx, volume_name)
if snap['name'].endswith('.clone_snap'): return True
LOG.debug("volume has clone snapshot(s)") except (self.rbd.ImageHasSnapshots, self.rbd.ImageBusy):
# We grab one of these and use it when fetching parent with excutils.save_and_reraise_exception():
# info in case the volume has been flattened. msg = _('deletion failed')
clone_snap = snap['name'] LOG.info(msg)
break
return False
return _do_try_remove_volume(self, client, volume_name)
@staticmethod
def _find_clone_snap(rbd_image: RBDVolumeProxy) -> Optional[str]:
snaps = rbd_image.list_snaps()
for snap in snaps:
if snap['name'].endswith('.clone_snap'):
LOG.debug("volume has clone snapshot(s)")
# We grab one of these and use it when fetching parent
# info in case the volume has been flattened.
clone_snap = snap['name']
return clone_snap
return None
def _delete_volume(self, volume: Volume, client: RADOSClient) -> None:
clone_snap = None
parent = None
parent_snap = None
try:
with RBDVolumeProxy(self,
volume.name,
ioctx=client.ioctx) as rbd_image:
# Ensure any backup snapshots are deleted
self._delete_backup_snaps(rbd_image)
clone_snap = self._find_clone_snap(rbd_image)
# Determine if this volume is itself a clone # Determine if this volume is itself a clone
_pool, parent, parent_snap = self._get_clone_info(rbd_image, _pool, parent, parent_snap = self._get_clone_info(rbd_image,
volume_name, volume.name,
clone_snap) clone_snap)
finally: except self.rbd.ImageNotFound:
rbd_image.close() LOG.info("volume %s no longer exists in backend", volume.name)
return
@utils.retry(self.rbd.ImageBusy, if clone_snap is not None:
self.configuration.rados_connection_interval, # If the volume has copy-on-write clones, keep it as a silent
self.configuration.rados_connection_retries) # volume which will be deleted when its snapshots and clones
def _try_remove_volume(client: Any, volume_name: str) -> None: # are deleted.
if self.configuration.enable_deferred_deletion: # TODO: only do this if it actually can't be deleted?
delay = self.configuration.deferred_deletion_delay new_name = "%s.deleted" % (volume.name)
else: self.RBDProxy().rename(client.ioctx, volume.name, new_name)
try: return
self.RBDProxy().remove(client.ioctx, volume_name)
return
except (self.rbd.ImageHasSnapshots, self.rbd.ImageBusy):
delay = 0
LOG.debug("moving volume %s to trash", volume_name)
# When using the RBD v2 clone api, deleting a volume
# that has a snapshot in the trash space raises a
# busy exception.
# In order to solve this, call the trash operation
# which should succeed when the volume has
# dependencies.
self.RBDProxy().trash_move(client.ioctx,
volume_name,
delay)
if clone_snap is None: LOG.debug("deleting RBD volume %s", volume.name)
LOG.debug("deleting rbd volume %s", volume_name)
try:
_try_remove_volume(client, volume_name)
except self.rbd.ImageBusy:
msg = (_("ImageBusy error raised while deleting rbd "
"volume. This may have been caused by a "
"connection from a client that has crashed and, "
"if so, may be resolved by retrying the delete "
"after 30 seconds has elapsed."))
LOG.warning(msg)
# Now raise this so that the volume stays available and the
# deletion can be retried.
raise exception.VolumeIsBusy(msg, volume_name=volume_name)
except self.rbd.ImageNotFound:
LOG.info("RBD volume %s not found, allowing delete "
"operation to proceed.", volume_name)
return
# If it is a clone, walk back up the parent chain deleting try:
# references. self.RBDProxy().remove(client.ioctx, volume.name)
if parent: return # the fast path was successful
LOG.debug("volume is a clone so cleaning references") except (self.rbd.ImageHasSnapshots, self.rbd.ImageBusy):
parent_snap = typing.cast(str, parent_snap) self._flatten_children(client.ioctx, volume.name)
self._delete_clone_parent_refs(client, parent, parent_snap) except self.rbd.ImageNotFound:
else: LOG.info("RBD volume %s not found, allowing delete "
# If the volume has copy-on-write clones we will not be able to "operation to proceed.", volume.name)
# delete it. Instead we will keep it as a silent volume which return
# will be deleted when it's snapshot and clones are deleted.
new_name = "%s.deleted" % (volume_name) try:
self.RBDProxy().rename(client.ioctx, volume_name, new_name) if self._try_remove_volume(client, volume.name):
return
except self.rbd.ImageHasSnapshots:
# perform trash instead, which can succeed when snapshots exist
pass
except self.rbd.ImageBusy:
msg = _('ImageBusy error raised while deleting RBD volume')
raise exception.VolumeIsBusy(msg, volume_name=volume.name)
delay = 0
if self.configuration.enable_deferred_deletion:
delay = self.configuration.deferred_deletion_delay
# Since it failed to remove above, trash the volume here instead.
# This covers the scenario of an image unable to be deleted because
# a child snapshot of it has been trashed but not yet removed.
# That snapshot is not visible but is still in the dependency
# chain of RBD images.
self._move_volume_to_trash(client.ioctx, volume.name, delay)
# If it is a clone, walk back up the parent chain deleting
# references.
if parent:
LOG.debug("volume is a clone so cleaning references")
parent_snap = typing.cast(str, parent_snap)
self._delete_clone_parent_refs(client, parent, parent_snap)
def delete_volume(self, volume: Volume) -> None:
"""Deletes an RBD volume."""
with RADOSClient(self) as client:
self._delete_volume(volume, client)
def create_snapshot(self, snapshot: Snapshot) -> None: def create_snapshot(self, snapshot: Snapshot) -> None:
"""Creates an rbd snapshot.""" """Creates an rbd snapshot."""
@ -1472,38 +1546,42 @@ class RBDDriver(driver.CloneableImageVD, driver.MigrateVD,
volume_name = snapshot.volume_name volume_name = snapshot.volume_name
snap_name = snapshot.name snap_name = snapshot.name
@utils.retry(self.rbd.ImageBusy,
self.configuration.rados_connection_interval,
self.configuration.rados_connection_retries)
def do_unprotect_snap(self, volume_name, snap_name):
try:
with RBDVolumeProxy(self, volume_name) as volume:
volume.unprotect_snap(snap_name)
except self.rbd.InvalidArgument:
LOG.info(
"InvalidArgument: Unable to unprotect snapshot %s.",
snap_name)
except self.rbd.ImageNotFound as e:
LOG.info("Snapshot %s does not exist in backend.",
snap_name)
raise e
except self.rbd.ImageBusy as e:
# flatten and then retry the operation
with RADOSClient(self) as client:
self._flatten_children(client.ioctx,
volume_name,
snap_name)
raise e
try:
do_unprotect_snap(self, volume_name, snap_name)
except self.rbd.ImageBusy:
raise exception.SnapshotIsBusy(snapshot_name=snap_name)
except self.rbd.ImageNotFound:
return
try: try:
with RBDVolumeProxy(self, volume_name) as volume: with RBDVolumeProxy(self, volume_name) as volume:
try: volume.remove_snap(snap_name)
volume.unprotect_snap(snap_name)
except self.rbd.InvalidArgument:
LOG.info(
"InvalidArgument: Unable to unprotect snapshot %s.",
snap_name)
except self.rbd.ImageNotFound:
LOG.info("Snapshot %s does not exist in backend.",
snap_name)
return
except self.rbd.ImageBusy:
children_list = self._get_children_info(volume, snap_name)
if children_list:
for (pool, image) in children_list:
LOG.info('Image %(pool)s/%(image)s is dependent '
'on the snapshot %(snap)s.',
{'pool': pool,
'image': image,
'snap': snap_name})
raise exception.SnapshotIsBusy(snapshot_name=snap_name)
try:
volume.remove_snap(snap_name)
except self.rbd.ImageNotFound:
LOG.info("Snapshot %s does not exist in backend.",
snap_name)
except self.rbd.ImageNotFound: except self.rbd.ImageNotFound:
LOG.warning("Volume %s does not exist in backend.", volume_name) LOG.info("Snapshot %s does not exist in backend.",
snap_name)
def snapshot_revert_use_temp_snapshot(self) -> bool: def snapshot_revert_use_temp_snapshot(self) -> bool:
"""Disable the use of a temporary snapshot on revert.""" """Disable the use of a temporary snapshot on revert."""

View File

@ -0,0 +1,17 @@
---
upgrade:
- |
Cinder now uses the RBD trash functionality to handle some volume deletions.
Therefore, deployments must either a) enable scheduled RBD trash purging on
the RBD backend or b) enable the Cinder RBD driver's enable_deferred_deletion
option to have Cinder purge the RBD trash.
This adds the new configuration option 'rbd_concurrent_flatten_operations',
which limits how many RBD flattens the driver will run simultaneously.
This can be used to prevent flatten operations from consuming too much I/O
capacity on the Ceph cluster. It defaults to 3.
fixes:
- |
`Bug #1969643 <https://bugs.launchpad.net/cinder/+bug/1969643>`_:
The RBD driver can now delete volumes with other volumes cloned from it
(or its snapshots) in cases where deletion would previously fail. This
uses the RBD trash functionality.