diff --git a/cinder/tests/unit/volume/drivers/lightos/test_lightos_storage.py b/cinder/tests/unit/volume/drivers/lightos/test_lightos_storage.py index 0d0ab8644c4..728ca73e0ec 100644 --- a/cinder/tests/unit/volume/drivers/lightos/test_lightos_storage.py +++ b/cinder/tests/unit/volume/drivers/lightos/test_lightos_storage.py @@ -19,6 +19,7 @@ import functools import hashlib import http.client as httpstatus import json +import time from typing import Dict from typing import List from typing import Tuple @@ -296,6 +297,7 @@ class LightOSStorageVolumeDriverTest(test.TestCase): "test_lightos_storage.InitiatorConnectorFactoryMocker") configuration.volume_backend_name = VOLUME_BACKEND_NAME configuration.reserved_percentage = RESERVED_PERCENTAGE + configuration.lightos_api_service_snapshots_max_calls = 5 def mocked_safe_get(config, variable_name): if hasattr(config, variable_name): @@ -645,6 +647,29 @@ class LightOSStorageVolumeDriverTest(test.TestCase): self.driver.delete_volume(volume) db.volume_destroy(self.ctxt, volume.id) + @mock.patch.object(time, "sleep", return_value=None) + def test_create_snapshot_fail_bad_request(self, mock_sleep): + def send_cmd_mock(cmd, **kwargs): + if cmd == "create_snapshot": + return (httpstatus.BAD_REQUEST, {}) + else: + return cluster_send_cmd(cmd, **kwargs) + self.driver.do_setup(None) + cluster_send_cmd = deepcopy(self.driver.cluster.send_cmd) + self.driver.cluster.send_cmd = send_cmd_mock + + vol_type = test_utils.create_volume_type(self.ctxt, self, + name='my_vol_type') + volume = test_utils.create_volume(self.ctxt, size=4, + volume_type_id=vol_type.id) + snapshot = test_utils.create_snapshot(self.ctxt, volume_id=volume.id) + + self.driver.create_volume(volume) + self.assertRaises(exception.VolumeBackendAPIException, + self.driver.create_snapshot, snapshot) + self.driver.delete_volume(volume) + db.volume_destroy(self.ctxt, volume.id) + def test_delete_snapshot(self): self.driver.do_setup(None) diff --git 
a/cinder/volume/drivers/lightos.py b/cinder/volume/drivers/lightos.py index 03e7a69b901..23c5026bf9a 100644 --- a/cinder/volume/drivers/lightos.py +++ b/cinder/volume/drivers/lightos.py @@ -88,6 +88,11 @@ lightos_opts = [ ' the host`s IP addresses to a volume IPACL. If set to' ' False, any IP address may access the volume. The default' ' is True.'), + cfg.IntOpt( + 'lightos_api_service_snapshots_max_calls', + default=5, + help='The maximum number of calls to the LightOS' + ' when creating snapshots. The default is 5 calls.') ] CONF = cfg.CONF @@ -409,6 +414,8 @@ class LightOSVolumeDriver(driver.VolumeDriver): self.logical_op_timeout = \ self.configuration.lightos_api_service_timeout * 3 + 10 + self.snapshots_retries = \ + self.configuration.lightos_api_service_snapshots_max_calls @classmethod def get_driver_options(cls): @@ -1388,28 +1395,42 @@ class LightOSVolumeDriver(driver.VolumeDriver): @coordination.synchronized('lightos-create_snapshot-{src_volume_name}') def _create_snapshot(self, project_name, snapshot_name, src_volume_name): - (status_code_get, response) = self._get_lightos_snapshot( - project_name, self.logical_op_timeout, - snapshot_name=snapshot_name) - if status_code_get != httpstatus.OK: - end = time.time() + self.logical_op_timeout - while (time.time() < end): - (status_code_create, response) = self.cluster.send_cmd( - cmd='create_snapshot', - project_name=project_name, - timeout=self.logical_op_timeout, - name=snapshot_name, - src_volume_name=src_volume_name, - ) + found_or_created_snapshot = False + last_status_code = 999 + last_response = "No response" - if status_code_create == httpstatus.INTERNAL_SERVER_ERROR: - pass - else: - break + for i in range(self.snapshots_retries): + if i != 0: + sleeptime = 2 ** i # 2, 4, 8, 16 (default is 30 seconds) + time.sleep(sleeptime) + (status_code_get, response) = self._get_lightos_snapshot( + project_name, self.logical_op_timeout, + snapshot_name=snapshot_name) + if status_code_get == httpstatus.OK: + 
found_or_created_snapshot = True + break - time.sleep(1) + (status_code_create, response) = self.cluster.send_cmd( + cmd='create_snapshot', + project_name=project_name, + timeout=self.logical_op_timeout, + name=snapshot_name, + src_volume_name=src_volume_name, + ) + if status_code_create == httpstatus.OK: + found_or_created_snapshot = True + break - if status_code_create != httpstatus.OK: + if status_code_create in (httpstatus.BAD_REQUEST, + httpstatus.INTERNAL_SERVER_ERROR, + httpstatus.SERVICE_UNAVAILABLE): + + LOG.debug('Creating new snapshot %s under project %s' + ' failed, received error with http-status %s', + snapshot_name, project_name, status_code_create) + last_status_code = status_code_create + last_response = response + else: msg = ('Did not succeed creating LightOS snapshot %s' ' project %s' ' status code %s response %s' % @@ -1417,6 +1438,14 @@ class LightOSVolumeDriver(driver.VolumeDriver): response)) raise exception.VolumeBackendAPIException(message=_(msg)) + if not found_or_created_snapshot: + msg = ('Did not succeed creating LightOS snapshot %s' + ' project %s' + ' status code %s response %s' % + (snapshot_name, project_name, last_status_code, + last_response)) + raise exception.VolumeBackendAPIException(message=_(msg)) + state = self._wait_for_snapshot_available(project_name, timeout= self.logical_op_timeout, diff --git a/releasenotes/notes/lightbits-snapshot-timeout-6b25dbd15a650d52.yaml b/releasenotes/notes/lightbits-snapshot-timeout-6b25dbd15a650d52.yaml new file mode 100644 index 00000000000..5541851e6a1 --- /dev/null +++ b/releasenotes/notes/lightbits-snapshot-timeout-6b25dbd15a650d52.yaml @@ -0,0 +1,11 @@ +--- +features: + - | + Lightbits driver: Added support to create multiple snapshots + from the same volume simultaneously when using the Lightbits + cinder driver. 
Under certain conditions, older releases of the + Lightbits api-service will return various status codes (including + HTTP status codes 500 and 503) that could indicate transient + failures. Added retry logic on such errors because there's a good + chance that the error is transient and subsequent calls will + succeed.