Do not reset quota cache timestamp when invalid
The quota cache may not be a valid dictionary when invalidateQuotaCache()
is called (e.g. when 'ignore-provider-quota' is used in OpenStack). In that
case, don't treat the None as a dictionary, since that raises a TypeError.

This bug was preventing quota errors from OpenStack from causing nodepool
to retry the node request when ignore-provider-quota is True: the OpenStack
handler calls invalidateQuotaCache() before raising the QuotaException, and
because invalidateQuotaCache() raised TypeError, the QuotaException was
never raised and the node allocation failed outright.

A test has been added to verify that nodepool and OpenStack will now retry
node allocations as intended. This fixes the bug, but it also changes the
behavior of the OpenStack driver when ignore-provider-quota is True and the
provider returns a quota error.

Change-Id: I1916c56c4f07c6a5d53ce82f4c1bb32bddbd7d63
Signed-off-by: Joshua Watt <JPEWhacker@gmail.com>
parent 46c9e01ab8
commit 2c632af426
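The failure mode described above is easy to reproduce in isolation. A minimal
sketch follows; OldQuotaSupport and the handler logic are illustrative
stand-ins for nodepool's QuotaSupport mixin and OpenStack handler, not the
actual code:

    class QuotaException(Exception):
        pass


    class OldQuotaSupport:
        def __init__(self):
            # Before this change the cache started out as None and only
            # became a dict after the first successful quota fetch.
            self._current_nodepool_quota = None

        def invalidateQuotaCache(self):
            # TypeError: 'NoneType' object does not support item assignment
            self._current_nodepool_quota['timestamp'] = 0


    provider = OldQuotaSupport()
    try:
        # What the OpenStack handler effectively does on a provider quota
        # error: invalidate the cache, then raise QuotaException.
        provider.invalidateQuotaCache()
        raise QuotaException("quota exceeded")
    except QuotaException:
        print("retry the node request")          # intended path, never reached
    except TypeError as e:
        print("request fails outright: %s" % e)  # what actually happened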
@@ -105,8 +105,7 @@ class FakeOpenStackCloud(object):
         ]
         self._azs = ['az1', 'az2']
         self._server_list = []
-        self.max_cores, self.max_instances, self.max_ram = FakeOpenStackCloud.\
-            _get_quota()
+        self._update_quota()
         self._down_ports = [
             Dummy(Dummy.PORT, id=uuid.uuid4().hex, status='DOWN',
                   device_owner="compute:nova"),
@@ -114,6 +113,10 @@ class FakeOpenStackCloud(object):
                   device_owner=None),
         ]
 
+    def _update_quota(self):
+        self.max_cores, self.max_instances, self.max_ram = FakeOpenStackCloud.\
+            _get_quota()
+
     def _get(self, name_or_id, instance_list):
         self.log.debug("Get %s in %s" % (name_or_id, repr(instance_list)))
         for instance in instance_list:
@@ -162,6 +165,7 @@ class FakeOpenStackCloud(object):
         private_v4 = 'fake'
         host_id = 'fake'
         interface_ip = 'fake'
+        self._update_quota()
         over_quota = False
         if (instance_type == Dummy.INSTANCE and
                 self.max_instances > -1 and
@@ -292,6 +296,7 @@ class FakeOpenStackCloud(object):
         return self._azs.copy()
 
     def get_compute_limits(self):
+        self._update_quota()
         return Dummy(
             'limits',
             max_total_cores=self.max_cores,
@@ -259,7 +259,8 @@ class QuotaSupport:
 
     def __init__(self, *args, **kw):
         super().__init__(*args, **kw)
-        self._current_nodepool_quota = None
+        self._current_nodepool_quota_timestamp = 0
+        self._current_nodepool_quota = {}
 
     @abc.abstractmethod
     def quotaNeededByLabel(self, label, pool):
@@ -291,7 +292,7 @@ class QuotaSupport:
         pass
 
     def invalidateQuotaCache(self):
-        self._current_nodepool_quota['timestamp'] = 0
+        self._current_nodepool_quota_timestamp = 0
 
     def estimatedNodepoolQuota(self):
         '''
@@ -307,8 +308,8 @@ class QuotaSupport:
 
         if self._current_nodepool_quota:
             now = time.time()
-            if now < self._current_nodepool_quota['timestamp'] + MAX_QUOTA_AGE:
-                return copy.deepcopy(self._current_nodepool_quota['quota'])
+            if now < self._current_nodepool_quota_timestamp + MAX_QUOTA_AGE:
+                return copy.deepcopy(self._current_nodepool_quota)
 
         # This is initialized with the full tenant quota and later becomes
         # the quota available for nodepool.
@@ -318,7 +319,7 @@ class QuotaSupport:
             if self._current_nodepool_quota:
                 self.log.exception("Unable to get provider quota, "
                                    "using cached value")
-                return copy.deepcopy(self._current_nodepool_quota['quota'])
+                return copy.deepcopy(self._current_nodepool_quota)
             raise
 
         self.log.debug("Provider quota for %s: %s",
@@ -328,10 +329,8 @@ class QuotaSupport:
         # to get the quota available for us.
         nodepool_quota.subtract(self.unmanagedQuotaUsed())
 
-        self._current_nodepool_quota = {
-            'quota': nodepool_quota,
-            'timestamp': time.time()
-        }
+        self._current_nodepool_quota = nodepool_quota
+        self._current_nodepool_quota_timestamp = time.time()
 
         self.log.debug("Available quota for %s: %s",
                        self.provider.name, nodepool_quota)
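The fix decouples the timestamp from the cached quota object: since
_current_nodepool_quota_timestamp is a plain attribute initialized to 0,
invalidateQuotaCache() can no longer raise, even before the first quota
fetch populates the cache. A condensed sketch of the new shape; the
MAX_QUOTA_AGE value is assumed for illustration, and cachedQuota() is a
simplification of estimatedNodepoolQuota() above:

    import copy
    import time

    MAX_QUOTA_AGE = 5 * 60  # seconds; assumed value for illustration


    class NewQuotaSupport:
        def __init__(self):
            self._current_nodepool_quota_timestamp = 0
            self._current_nodepool_quota = {}

        def invalidateQuotaCache(self):
            # Resetting a plain attribute cannot raise, regardless of
            # whether a quota has been cached yet.
            self._current_nodepool_quota_timestamp = 0

        def cachedQuota(self):
            # The cache is honored only while the separately stored
            # timestamp is fresh.
            if self._current_nodepool_quota:
                now = time.time()
                if now < self._current_nodepool_quota_timestamp + MAX_QUOTA_AGE:
                    return copy.deepcopy(self._current_nodepool_quota)
            return None


    provider = NewQuotaSupport()
    provider.invalidateQuotaCache()  # safe even before any quota fetch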
@@ -583,6 +583,15 @@ class DBTestCase(BaseTestCase):
         self.wait_for_threads()
         return ready_nodes[label]
 
+    def waitForAnyNodeInState(self, state):
+        # Wait for any node to reach the given state
+        for _ in iterate_timeout(ONE_MINUTE, Exception,
+                                 f"Timeout waiting for node in {state} state",
+                                 interval=1):
+            for node in self.zk.nodeIterator(False):
+                if node.state == state:
+                    return node
+
     def waitForNodeRequest(self, req, states=None):
         '''
         Wait for a node request to transition to a final state.
@@ -403,6 +403,7 @@ class TestLauncher(tests.DBTestCase):
 
         # patch the cloud with requested quota
         def fake_get_quota():
+            nonlocal max_cores, max_instances, max_ram
            return (max_cores, max_instances, max_ram)
         self.useFixture(fixtures.MockPatchObject(
             fakeprovider.FakeProvider.fake_cloud, '_get_quota',
@@ -435,7 +436,7 @@ class TestLauncher(tests.DBTestCase):
 
         # Now, reduce the quota so the next node unexpectedly
         # (according to nodepool's quota estimate) fails.
-        client.max_instances = 1
+        max_instances = 1
 
         # Request a second node; this request should pause the handler.
         req2 = zk.NodeRequest()
@@ -2280,6 +2281,53 @@ class TestLauncher(tests.DBTestCase):
         req3 = self.waitForNodeRequest(req3)
         self.assertEqual(req3.state, zk.FAILED)
 
+    def test_ignore_provider_quota_true_with_provider_error(self):
+        '''
+        Tests that quota errors returned from the provider when allocating a
+        node are correctly handled and retried. The node request should be
+        retried until there is sufficient quota to satisfy it.
+        '''
+        max_instances = 0
+
+        def fake_get_quota():
+            nonlocal max_instances
+            return (100, max_instances, 1000000)
+
+        self.useFixture(fixtures.MockPatchObject(
+            fakeprovider.FakeProvider.fake_cloud, '_get_quota',
+            fake_get_quota
+        ))
+
+        configfile = self.setup_config('ignore_provider_quota_true.yaml')
+        self.useBuilder(configfile)
+        self.waitForImage('fake-provider', 'fake-image')
+
+        pool = self.useNodepool(configfile, watermark_sleep=1)
+        pool.start()
+
+        # Create a request with ignore-provider-quota set to true that should
+        # pass regardless of the lack of cloud/provider quota.
+        self.replace_config(configfile, 'ignore_provider_quota_true.yaml')
+
+        self.log.debug(
+            "Submitting an initial request with ignore-provider-quota True")
+        req = zk.NodeRequest()
+        req.state = zk.REQUESTED
+        req.node_types.append('fake-label')
+        self.zk.storeNodeRequest(req)
+
+        self.waitForAnyNodeInState(zk.ABORTED)
+
+        self.assertReportedStat(
+            'nodepool.launch.provider.fake-provider.error.quota')
+        self.assertReportedStat(
+            'nodepool.launch.error.quota')
+
+        # Bump up the quota to allow the provider to allocate a node
+        max_instances = 1
+        req = self.waitForNodeRequest(req)
+        self.assertEqual(req.state, zk.FULFILLED)
+
     def test_request_order(self):
         """Test that requests are handled in sorted order"""
         configfile = self.setup_config('node_no_min_ready.yaml')
@@ -0,0 +1,8 @@
+---
+fixes:
+  - |
+    Fixes an exception that was raised by the OpenStack driver when attempting
+    to reset the quota cache timestamp while `ignore-provider-quota` is
+    `true`. This exception prevented nodepool from seeing quota errors from
+    OpenStack, causing it to fail the node request instead of retrying as it
+    does for other providers.
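For reference, ignore-provider-quota is a per-pool option in the OpenStack
provider section of the nodepool configuration. A sketch of where it lives;
the provider, cloud, label, and image names below are placeholders:

    providers:
      - name: my-openstack
        driver: openstack
        cloud: my-cloud
        pools:
          - name: main
            # Launch without pre-checking the provider's reported quota;
            # with this fix, a quota error at launch time causes a retry
            # instead of failing the node request.
            ignore-provider-quota: true
            labels:
              - name: small-node
                flavor-name: m1.small
                diskimage: my-image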