From 7753694d29330262e3c56be9457d315855f9e4b2 Mon Sep 17 00:00:00 2001
From: Chris Dent
Date: Wed, 28 Aug 2019 18:29:19 +0100
Subject: [PATCH] Add allocation_conflict_retry_count conf setting

A [placement]/allocation_conflict_retry_count config setting has been
added to replace the RP_CONFLICT_RETRY_COUNT constant previously used
to set the number of retries attempted when a resource provider
generation conflict occurs while writing allocations.

Change-Id: Iaa2c9007d8656dd02cda9313459a233efd38c841
Story: 2006467
Task: 36398
---
 placement/conf/placement.py                        |  8 +++
 placement/objects/allocation.py                    |  8 +--
 .../tests/functional/db/test_allocation.py         | 54 ++++++++++---------
 .../tests/functional/fixtures/gabbits.py           |  5 ++
 ...conflict_retry_count-329daae86059f5ec.yaml      | 18 +++++++
 5 files changed, 61 insertions(+), 32 deletions(-)
 create mode 100644 releasenotes/notes/allocation_conflict_retry_count-329daae86059f5ec.yaml

diff --git a/placement/conf/placement.py b/placement/conf/placement.py
index 44580ce8c..35807400b 100644
--- a/placement/conf/placement.py
+++ b/placement/conf/placement.py
@@ -61,6 +61,14 @@
 a project or user identifier for the consumer. In cleaning up the data
 modeling, we no longer allow missing project and user information. If
 an older client makes an allocation, we'll use this in place of the
 information it doesn't provide.
+"""),
+    cfg.IntOpt(
+        'allocation_conflict_retry_count',
+        default=10,
+        help="""
+The number of times to retry, server-side, writing allocations when there is
+a resource provider generation conflict. Raising this value may be useful
+when many concurrent allocations to the same resource provider are expected.
 """),
 ]
diff --git a/placement/objects/allocation.py b/placement/objects/allocation.py
index b370087ed..524606fd3 100644
--- a/placement/objects/allocation.py
+++ b/placement/objects/allocation.py
@@ -35,10 +35,6 @@
 _USER_TBL = models.User.__table__
 
 LOG = logging.getLogger(__name__)
 
-# The number of times to retry set_allocations if there has
-# been a resource provider (not consumer) generation coflict.
-RP_CONFLICT_RETRY_COUNT = 10
-
 
 class Allocation(object):
@@ -499,7 +495,7 @@ def replace_all(context, alloc_list):
         # and try again. For sake of simplicity (and because we don't have
         # easy access to the information) we reload all the resource
         # providers that may be present.
-        retries = RP_CONFLICT_RETRY_COUNT
+        retries = context.config.placement.allocation_conflict_retry_count
         while retries:
             retries -= 1
             try:
@@ -526,7 +522,7 @@ def replace_all(context, alloc_list):
         # information from the allocations is not coherent as this
         # could be multiple consumers and providers.
         LOG.warning('Exceeded retry limit of %d on allocations write',
-                    RP_CONFLICT_RETRY_COUNT)
+                    context.config.placement.allocation_conflict_retry_count)
     raise exception.ResourceProviderConcurrentUpdateDetected()
diff --git a/placement/tests/functional/db/test_allocation.py b/placement/tests/functional/db/test_allocation.py
index 912130391..7f80b158f 100644
--- a/placement/tests/functional/db/test_allocation.py
+++ b/placement/tests/functional/db/test_allocation.py
@@ -624,40 +624,42 @@ class TestAllocationListCreateDelete(tb.PlacementDbBaseTestCase):
         ]
 
         # Make sure the right exception happens when the retry loop expires.
-        with mock.patch.object(alloc_obj, 'RP_CONFLICT_RETRY_COUNT', 0):
-            self.assertRaises(
-                exception.ResourceProviderConcurrentUpdateDetected,
-                alloc_obj.replace_all, self.ctx, alloc_list)
-            mock_log.warning.assert_called_with(
-                'Exceeded retry limit of %d on allocations write', 0)
+        self.conf_fixture.config(allocation_conflict_retry_count=0,
+                                 group='placement')
+        self.assertRaises(
+            exception.ResourceProviderConcurrentUpdateDetected,
+            alloc_obj.replace_all, self.ctx, alloc_list)
+        mock_log.warning.assert_called_with(
+            'Exceeded retry limit of %d on allocations write', 0)
 
         # Make sure the right thing happens after a small number of failures.
         # There's a bit of mock magic going on here to ensure that we can
         # both do some side effects on _set_allocations as well as have the
         # real behavior. Two generation conflicts and then a success.
         mock_log.reset_mock()
-        with mock.patch.object(alloc_obj, 'RP_CONFLICT_RETRY_COUNT', 3):
-            unmocked_set = alloc_obj._set_allocations
-            with mock.patch('placement.objects.allocation.'
-                            '_set_allocations') as mock_set:
-                exceptions = iter([
-                    exception.ResourceProviderConcurrentUpdateDetected(),
-                    exception.ResourceProviderConcurrentUpdateDetected(),
-                ])
+        self.conf_fixture.config(allocation_conflict_retry_count=3,
+                                 group='placement')
+        unmocked_set = alloc_obj._set_allocations
+        with mock.patch('placement.objects.allocation.'
+                        '_set_allocations') as mock_set:
+            exceptions = iter([
+                exception.ResourceProviderConcurrentUpdateDetected(),
+                exception.ResourceProviderConcurrentUpdateDetected(),
+            ])
 
-                def side_effect(*args, **kwargs):
-                    try:
-                        raise next(exceptions)
-                    except StopIteration:
-                        return unmocked_set(*args, **kwargs)
+            def side_effect(*args, **kwargs):
+                try:
+                    raise next(exceptions)
+                except StopIteration:
+                    return unmocked_set(*args, **kwargs)
 
-                mock_set.side_effect = side_effect
-                alloc_obj.replace_all(self.ctx, alloc_list)
-                self.assertEqual(2, mock_log.debug.call_count)
-                mock_log.debug.called_with(
-                    'Retrying allocations write on resource provider '
-                    'generation conflict')
-                self.assertEqual(3, mock_set.call_count)
+            mock_set.side_effect = side_effect
+            alloc_obj.replace_all(self.ctx, alloc_list)
+            self.assertEqual(2, mock_log.debug.call_count)
+            mock_log.debug.assert_called_with(
+                'Retrying allocations write on resource provider '
+                'generation conflict')
+            self.assertEqual(3, mock_set.call_count)
 
         # Confirm we're using a different rp object after the change
         # and that it has a higher generation.
diff --git a/placement/tests/functional/fixtures/gabbits.py b/placement/tests/functional/fixtures/gabbits.py
index e88c463b7..2a212e43a 100644
--- a/placement/tests/functional/fixtures/gabbits.py
+++ b/placement/tests/functional/fixtures/gabbits.py
@@ -79,6 +79,11 @@ class APIFixture(fixture.GabbiFixture):
         self.placement_db_fixture.setUp()
 
         self.context = context.RequestContext()
+        # Some database interaction methods require access to the oslo config
+        # via the context. Within the WSGI application this is taken care of,
+        # but here in the fixtures we use some of those methods to create
+        # entities.
+        self.context.config = self.conf_fixture.conf
 
         # Set default policy opts, otherwise the deploy module can
         # NoSuchOptError.
diff --git a/releasenotes/notes/allocation_conflict_retry_count-329daae86059f5ec.yaml b/releasenotes/notes/allocation_conflict_retry_count-329daae86059f5ec.yaml
new file mode 100644
index 000000000..339c31355
--- /dev/null
+++ b/releasenotes/notes/allocation_conflict_retry_count-329daae86059f5ec.yaml
@@ -0,0 +1,18 @@
+---
+fixes:
+  - |
+    When a single resource provider receives many concurrent allocation writes,
+    retries may be performed server-side when there is a resource provider
+    generation conflict. When those retries are all consumed, the client
+    receives an HTTP 409 response and may choose to try the request again.
+
+    In an environment where high levels of concurrent allocation writes are
+    common, such as a busy clustered hypervisor, the default retry count may be
+    too low. See story 2006467_.
+
+    A new configuration setting,
+    ``[placement]/allocation_conflict_retry_count``, has been added to address
+    this situation. It defines the number of times to retry, server-side,
+    writing allocations when there is a resource provider generation conflict.
+
+    .. _2006467: https://storyboard.openstack.org/#!/story/2006467
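
For readers who want the mechanism in isolation: the patch keeps the existing
decrement-and-retry loop in replace_all() and changes only where the count
comes from. Below is a minimal, self-contained sketch of that pattern;
GenerationConflict, fetch_providers, and write_allocations are hypothetical
stand-ins for placement internals, not names from this patch:

class GenerationConflict(Exception):
    """Stand-in for ResourceProviderConcurrentUpdateDetected."""


def replace_all_with_retries(retry_count, fetch_providers, write_allocations):
    # Mirror of the loop in replace_all(): attempt the write, and on a
    # generation conflict reload provider state and try again until the
    # configured number of attempts is used up.
    retries = retry_count
    while retries:
        retries -= 1
        try:
            providers = fetch_providers()
            return write_allocations(providers)
        except GenerationConflict:
            # Another writer bumped a provider's generation; retry.
            continue
    # Retries exhausted; in placement this becomes an HTTP 409 response.
    raise GenerationConflict(
        'Exceeded retry limit of %d on allocations write' % retry_count)

Reloading the providers before each attempt is what gives a retry a chance to
succeed: the next write is made against the freshly observed generations.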
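
The test relies on a mock side_effect that raises a fixed number of conflicts
before delegating to the unmocked function. The same trick in isolation,
using only the standard library (real_write is an illustrative stand-in):

from unittest import mock


def real_write():
    return 'written'


failures = iter([ValueError('conflict'), ValueError('conflict')])


def side_effect(*args, **kwargs):
    try:
        raise next(failures)
    except StopIteration:
        # The exception iterator is exhausted: fall through to the
        # real behavior, just as the test falls through to
        # the unmocked _set_allocations.
        return real_write(*args, **kwargs)


mock_write = mock.Mock(side_effect=side_effect)

# The first two calls raise; the third succeeds with the real result.
for _ in range(2):
    try:
        mock_write()
    except ValueError:
        pass
assert mock_write() == 'written'
assert mock_write.call_count == 3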
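
On the client side, the release note's advice amounts to resubmitting the
request when an HTTP 409 comes back. A rough sketch with the requests
library; the endpoint, token handling, microversion, and backoff policy here
are assumptions for illustration, not part of the patch:

import time

import requests

PLACEMENT_URL = 'http://placement.example.com'  # assumed endpoint
HEADERS = {
    'X-Auth-Token': 'REDACTED',                 # assumed auth handling
    'OpenStack-API-Version': 'placement 1.36',  # assumed microversion
}


def put_allocations(consumer_uuid, payload, attempts=5):
    # On 409 the server has already retried internally, up to
    # allocation_conflict_retry_count times, so back off briefly
    # before each client-side resubmission.
    resp = None
    for attempt in range(attempts):
        resp = requests.put(
            '%s/allocations/%s' % (PLACEMENT_URL, consumer_uuid),
            json=payload, headers=HEADERS)
        if resp.status_code != 409:
            return resp
        time.sleep(0.1 * (attempt + 1))
    return resp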