Create multiple snapshots from same volume

Under certain conditions, the Lightbits api-service will return various status codes (including HTTP status codes 400, 500, and 503) that could indicate a transient error, e.g., because a "create snapshot" operation is already in progress on the source volume. Fix the Cinder Lightbits driver to retry when this happens, since there's a good chance a subsequent "create snapshot" call will succeed. Change-Id: I4f3463ef71997fe707800a9df6a239273a0d5c4a
2024-01-02 13:15:23 -08:00 · 2024-01-02 13:15:23 -08:00 · 665e01c483
commit 665e01c483
parent 1185caa9a8
3 changed files with 84 additions and 19 deletions
--- a/cinder/tests/unit/volume/drivers/lightos/test_lightos_storage.py
+++ b/cinder/tests/unit/volume/drivers/lightos/test_lightos_storage.py
@ -19,6 +19,7 @@ import functools
 import hashlib
 import http.client as httpstatus
 import json
+import time
 from typing import Dict
 from typing import List
 from typing import Tuple
@ -296,6 +297,7 @@ class LightOSStorageVolumeDriverTest(test.TestCase):
            "test_lightos_storage.InitiatorConnectorFactoryMocker")
        configuration.volume_backend_name = VOLUME_BACKEND_NAME
        configuration.reserved_percentage = RESERVED_PERCENTAGE
+        configuration.lightos_api_service_snapshots_max_calls = 5

        def mocked_safe_get(config, variable_name):
            if hasattr(config, variable_name):
@ -645,6 +647,29 @@ class LightOSStorageVolumeDriverTest(test.TestCase):
        self.driver.delete_volume(volume)
        db.volume_destroy(self.ctxt, volume.id)

+    @mock.patch.object(time, "sleep", return_value=None)
+    def test_create_snapshot_fail_bad_request(self, mock_sleep):
+        def send_cmd_mock(cmd, **kwargs):
+            if cmd == "create_snapshot":
+                return (httpstatus.BAD_REQUEST, {})
+            else:
+                return cluster_send_cmd(cmd, **kwargs)
+        self.driver.do_setup(None)
+        cluster_send_cmd = deepcopy(self.driver.cluster.send_cmd)
+        self.driver.cluster.send_cmd = send_cmd_mock
+
+        vol_type = test_utils.create_volume_type(self.ctxt, self,
+                                                 name='my_vol_type')
+        volume = test_utils.create_volume(self.ctxt, size=4,
+                                          volume_type_id=vol_type.id)
+        snapshot = test_utils.create_snapshot(self.ctxt, volume_id=volume.id)
+
+        self.driver.create_volume(volume)
+        self.assertRaises(exception.VolumeBackendAPIException,
+                          self.driver.create_snapshot, snapshot)
+        self.driver.delete_volume(volume)
+        db.volume_destroy(self.ctxt, volume.id)
+
    def test_delete_snapshot(self):
        self.driver.do_setup(None)

--- a/cinder/volume/drivers/lightos.py
+++ b/cinder/volume/drivers/lightos.py
@ -88,6 +88,11 @@ lightos_opts = [
                ' the host`s IP addresses to a volume IPACL. If set to'
                ' False, any IP address may access the volume. The default'
                ' is True.'),
+    cfg.IntOpt(
+        'lightos_api_service_snapshots_max_calls',
+        default=5,
+        help='The maximum number of calls to the LightOS'
+        ' when creating snapshots. The default is 5 calls.')
 ]

 CONF = cfg.CONF
@ -409,6 +414,8 @@ class LightOSVolumeDriver(driver.VolumeDriver):

        self.logical_op_timeout = \
            self.configuration.lightos_api_service_timeout * 3 + 10
+        self.snapshots_retries = \
+            self.configuration.lightos_api_service_snapshots_max_calls

    @classmethod
    def get_driver_options(cls):
@ -1388,28 +1395,42 @@ class LightOSVolumeDriver(driver.VolumeDriver):

    @coordination.synchronized('lightos-create_snapshot-{src_volume_name}')
    def _create_snapshot(self, project_name, snapshot_name, src_volume_name):
-        (status_code_get, response) = self._get_lightos_snapshot(
-            project_name, self.logical_op_timeout,
-            snapshot_name=snapshot_name)
-        if status_code_get != httpstatus.OK:
-            end = time.time() + self.logical_op_timeout
-            while (time.time() < end):
-                (status_code_create, response) = self.cluster.send_cmd(
-                    cmd='create_snapshot',
-                    project_name=project_name,
-                    timeout=self.logical_op_timeout,
-                    name=snapshot_name,
-                    src_volume_name=src_volume_name,
-                )
+        found_or_created_snapshot = False
+        last_status_code = 999
+        last_response = "No response"

-                if status_code_create == httpstatus.INTERNAL_SERVER_ERROR:
-                    pass
-                else:
-                    break
+        for i in range(self.snapshots_retries):
+            if i != 0:
+                sleeptime = 2 ** i  # 2, 4, 8, 16 (default is 30 seconds)
+                time.sleep(sleeptime)
+            (status_code_get, response) = self._get_lightos_snapshot(
+                project_name, self.logical_op_timeout,
+                snapshot_name=snapshot_name)
+            if status_code_get == httpstatus.OK:
+                found_or_created_snapshot = True
+                break

-                time.sleep(1)
+            (status_code_create, response) = self.cluster.send_cmd(
+                cmd='create_snapshot',
+                project_name=project_name,
+                timeout=self.logical_op_timeout,
+                name=snapshot_name,
+                src_volume_name=src_volume_name,
+            )
+            if status_code_create == httpstatus.OK:
+                found_or_created_snapshot = True
+                break

-            if status_code_create != httpstatus.OK:
+            if status_code_create in (httpstatus.BAD_REQUEST,
+                                      httpstatus.INTERNAL_SERVER_ERROR,
+                                      httpstatus.SERVICE_UNAVAILABLE):
+
+                LOG.debug('Creating new snapshot %s under project %s'
+                          ' failed, received error with http-status %s',
+                          snapshot_name, project_name, status_code_create)
+                last_status_code = status_code_create
+                last_response = response
+            else:
                msg = ('Did not succeed creating LightOS snapshot %s'
                       ' project %s'
                       ' status code %s response %s' %
@ -1417,6 +1438,14 @@ class LightOSVolumeDriver(driver.VolumeDriver):
                        response))
                raise exception.VolumeBackendAPIException(message=_(msg))

+        if not found_or_created_snapshot:
+            msg = ('Did not succeed creating LightOS snapshot %s'
+                   ' project %s'
+                   ' status code %s response %s' %
+                   (snapshot_name, project_name, last_status_code,
+                    last_response))
+            raise exception.VolumeBackendAPIException(message=_(msg))
+
        state = self._wait_for_snapshot_available(project_name,
                                                  timeout=
                                                  self.logical_op_timeout,
--- a/releasenotes/notes/lightbits-snapshot-timeout-6b25dbd15a650d52.yaml
+++ b/releasenotes/notes/lightbits-snapshot-timeout-6b25dbd15a650d52.yaml
@ -0,0 +1,11 @@
+---
+features:
+  - |
+    Lightbits driver: Added support to create multiple snapshots
+    from the same volume simultaneously when using the Lightbits
+    cinder driver. Under certain conditions, older releases of the
+    Lightbits api-service will return various status codes (including
+    HTTP status codes 400, 500, and 503) that could indicate transient
+    failures. Added retry logic on such errors because there's a good
+    chance that the error is transient and subsequent calls will
+    succeed.