Merge "Deadlock prevention support in synchronize"

This commit is contained in:
Zuul 2023-07-03 19:09:39 +00:00 committed by Gerrit Code Review
commit 3c0aea5feb
3 changed files with 160 additions and 25 deletions

View File

@ -135,18 +135,43 @@ def synchronized_remove(glob_name, coordinator=COORDINATOR):
coordinator.remove_lock(glob_name)
def synchronized(lock_name: str,
def __acquire(lock, blocking, f_name):
"""Acquire a lock and return the time when it was acquired."""
t1 = timeutils.now()
name = utils.convert_str(lock.name)
LOG.debug('Acquiring lock "%s" by "%s"', name, f_name)
lock.acquire(blocking)
t2 = timeutils.now()
LOG.debug('Lock "%s" acquired by "%s" :: waited %0.3fs',
name, f_name, t2 - t1)
return t2
def __release(lock, acquired_time, f_name):
"""Release a lock ignoring exceptions."""
name = utils.convert_str(lock.name)
try:
lock.release()
held = timeutils.now() - acquired_time
LOG.debug('Lock "%s" released by "%s" :: held %0.3fs',
name, f_name, held)
except Exception as e:
LOG.error('Failed to release lock "%s": %s', name, e)
def synchronized(*lock_names: str,
blocking: bool = True,
coordinator: Coordinator = COORDINATOR) -> Callable:
"""Synchronization decorator.
:param str lock_name: Lock name.
:param str lock_names: Arbitrary number of Lock names.
:param blocking: If True, blocks until the lock is acquired.
If False, raises exception when not acquired. Otherwise,
the value is used as a timeout value and if lock is not acquired
after this number of seconds exception is raised.
after this number of seconds exception is raised. This is a keyword
only argument.
:param coordinator: Coordinator class to use when creating lock.
Defaults to the global coordinator.
Defaults to the global coordinator. This is a keyword only argument.
:raises tooz.coordination.LockAcquireFailed: if lock is not acquired
Decorating a method like so::
@ -175,37 +200,44 @@ def synchronized(lock_name: str,
def foo(self, vol, snap):
...
Multiple locks can be requested simultaneously and the decorator will
reorder the names by rendered lock names to prevent potential deadlocks.
@synchronized('{f_name}-{vol.id}-{snap[name]}',
'{f_name}-{vol.id}.delete')
def foo(self, vol, snap):
...
Available field names are: decorated function parameters and
`f_name` as a decorated function name.
"""
@decorator.decorator
def _synchronized(f, *a, **k) -> Callable:
call_args = inspect.getcallargs(f, *a, **k)
call_args['f_name'] = f.__name__
lock = coordinator.get_lock(lock_name.format(**call_args))
name = utils.convert_str(lock.name)
# Prevent deadlocks not duplicating and sorting them by name to always
# acquire them in the same order.
names = sorted(set([name.format(**call_args) for name in lock_names]))
locks = [coordinator.get_lock(name) for name in names]
acquired_times = []
f_name = f.__name__
t1 = timeutils.now()
t2 = None
try:
LOG.debug('Acquiring lock "%(name)s" by "%(f_name)s"',
{'name': name, 'f_name': f_name})
with lock(blocking):
t2 = timeutils.now()
LOG.debug('Lock "%(name)s" acquired by "%(f_name)s" :: '
'waited %(wait)s',
{'name': name, 'f_name': f_name,
'wait': "%0.3fs" % (t2 - t1)})
return f(*a, **k)
if len(locks) > 1: # Don't pollute logs for single locks
LOG.debug('Acquiring %s locks by %s', len(locks), f_name)
for lock in locks:
acquired_times.append(__acquire(lock, blocking, f_name))
if len(locks) > 1:
t = timeutils.now() - t1
LOG.debug('Acquired %s locks by %s in %0.3fs',
len(locks), f_name, t)
return f(*a, **k)
finally:
t3 = timeutils.now()
if t2 is None:
held_secs = "N/A"
else:
held_secs = "%0.3fs" % (t3 - t2)
LOG.debug(
'Lock "%(name)s" released by "%(f_name)s" :: held %(held)s',
{'name': name, 'f_name': f_name, 'held': held_secs})
for lock, acquired_time in zip(locks, acquired_times):
__release(lock, acquired_time, f_name)
return _synchronized

View File

@ -209,3 +209,71 @@ class CoordinationTestCase(test.TestCase):
coordination.synchronized_remove(mock.sentinel.glob_name, coordinator)
coordinator.remove_lock.assert_called_once_with(
mock.sentinel.glob_name)
@mock.patch.object(coordination.COORDINATOR, 'get_lock')
def test_synchronized_multiple_templates(self, get_lock):
"""Test locks requested in the right order and duplicates removed."""
locks = ['lock-{f_name}-%s-{foo.val}-{bar[val]}' % i for i in range(3)]
expect = [f'lock-func-{i}-7-8' for i in range(3)]
@coordination.synchronized(locks[1], locks[0], locks[1], locks[2])
def func(foo, bar):
pass
foo = mock.Mock(val=7)
bar = mock.MagicMock()
bar.__getitem__.return_value = 8
func(foo, bar)
self.assertEqual(len(expect), get_lock.call_count)
get_lock.assert_has_calls([mock.call(lock) for lock in expect])
self.assertEqual(len(expect), get_lock.return_value.acquire.call_count)
get_lock.return_value.acquire.assert_has_calls(
[mock.call(True)] * len(expect))
self.assertEqual(['foo', 'bar'], inspect.getfullargspec(func)[0])
@mock.patch('oslo_utils.timeutils.now', side_effect=[1, 2])
def test___acquire(self, mock_now):
lock = mock.Mock()
# Using getattr to avoid AttributeError: module 'cinder.coordination'
# has no attribute '_CoordinationTestCase__acquire'
res = getattr(coordination, '__acquire')(lock, mock.sentinel.blocking,
mock.sentinel.f_name)
self.assertEqual(2, res)
self.assertEqual(2, mock_now.call_count)
mock_now.assert_has_calls([mock.call(), mock.call()])
lock.acquire.assert_called_once_with(mock.sentinel.blocking)
@mock.patch('oslo_utils.timeutils.now')
def test___acquire_propagates_exception(self, mock_now):
lock = mock.Mock()
lock.acquire.side_effect = ValueError
# Using getattr to avoid AttributeError: module 'cinder.coordination'
# has no attribute '_CoordinationTestCase__acquire'
self.assertRaises(ValueError,
getattr(coordination, '__acquire'),
lock, mock.sentinel.blocking, mock.sentinel.f_name)
mock_now.assert_called_once_with()
lock.acquire.assert_called_once_with(mock.sentinel.blocking)
@mock.patch('oslo_utils.timeutils.now', return_value=2)
def test___release(self, mock_now):
lock = mock.Mock()
# Using getattr to avoid AttributeError: module 'cinder.coordination'
# has no attribute '_CoordinationTestCase__release'
getattr(coordination, '__release')(lock, 1, mock.sentinel.f_name)
mock_now.assert_called_once_with()
lock.release.assert_called_once_with()
@mock.patch('oslo_utils.timeutils.now')
def test___release_ignores_exception(self, mock_now):
lock = mock.Mock()
lock.release.side_effect = ValueError
# Using getattr to avoid AttributeError: module 'cinder.coordination'
# has no attribute '_CoordinationTestCase__release'
getattr(coordination, '__release')(lock, 1, mock.sentinel.f_name)
mock_now.assert_not_called()
lock.release.assert_called_once_with()

View File

@ -642,6 +642,10 @@ race conditions.
Global locking functionality is provided by the `synchronized` decorator from
`cinder.coordination`.
.. attention:: Optional `blocking` and `coordinator` arguments to the
`synchronized` decorator are **keyword** arguments only and cannot be passed
as positional arguments.
This method is more advanced than the one used for the `Process locks`_ and the
`Node locks`_, as the name supports templates. For the template we have all
the method parameters as well as `f_name` that represents that name of the
@ -673,6 +677,37 @@ attribute from `self`, and a recursive reference in the `snapshot` parameter.
@coordination.synchronized('{self.driver_prefix}-{snapshot.volume.id}')
def create_snapshot(self, snapshot):
Some drivers may require multiple locks for a critical section, which could
potentially create deadlocks. Like in the following example, where `PowerMax`
method `move_volume_between_storage_groups` creates 2 locks:
.. code-block:: python
@coordination.synchronized(
"emc-sg-{source_storagegroup_name}-{serial_number}")
@coordination.synchronized(
"emc-sg-{target_storagegroup_name}-{serial_number}")
def move_volume_between_storage_groups(
self, serial_number, device_id, source_storagegroup_name,
target_storagegroup_name, extra_specs, force=False,
parent_sg=None):
That code can result in a deadlock if 2 opposite requests come in concurrently
and their first lock acquisition interleaves.
The solution is calling the `synchronized` decorator with both lock names and
let it resolve the acquire ordering issue for us. The right code would be:
.. code-block:: python
@coordination.synchronized(
"emc-sg-{source_storagegroup_name}-{serial_number}",
"emc-sg-{target_storagegroup_name}-{serial_number}")
def move_volume_between_storage_groups(
self, serial_number, device_id, source_storagegroup_name,
target_storagegroup_name, extra_specs, force=False,
parent_sg=None):
Internally Cinder uses the `Tooz library`_ to provide the distributed locking.
By default, this library is configured for Active-Passive deployments, where it
uses file locks equivalent to those used for `Node locks`_.