Create multiple snapshots from same volume

Under certain conditions, the Lightbits api-service returns status codes
(including HTTP status codes 400, 500, and 503) that may indicate a
transient error, e.g., because a "create snapshot" operation is already
in progress on the source volume.
Fix the Cinder Lightbits driver to retry when this happens, since there
is a good chance that a subsequent "create snapshot" call will succeed.

Change-Id: I4f3463ef71997fe707800a9df6a239273a0d5c4a
This commit is contained in:
Rahman Muhammad 2024-01-02 13:15:23 -08:00 committed by yuval
parent 1185caa9a8
commit 665e01c483
3 changed files with 84 additions and 19 deletions

View File

@ -19,6 +19,7 @@ import functools
import hashlib
import http.client as httpstatus
import json
import time
from typing import Dict
from typing import List
from typing import Tuple
@ -296,6 +297,7 @@ class LightOSStorageVolumeDriverTest(test.TestCase):
"test_lightos_storage.InitiatorConnectorFactoryMocker")
configuration.volume_backend_name = VOLUME_BACKEND_NAME
configuration.reserved_percentage = RESERVED_PERCENTAGE
configuration.lightos_api_service_snapshots_max_calls = 5
def mocked_safe_get(config, variable_name):
if hasattr(config, variable_name):
@ -645,6 +647,29 @@ class LightOSStorageVolumeDriverTest(test.TestCase):
self.driver.delete_volume(volume)
db.volume_destroy(self.ctxt, volume.id)
@mock.patch.object(time, "sleep", return_value=None)
def test_create_snapshot_fail_bad_request(self, mock_sleep):
    """Check that create_snapshot raises when the cluster answers 400.

    The cluster's ``send_cmd`` is replaced by a stub that answers every
    "create_snapshot" command with HTTP 400 (Bad Request) and forwards
    all other commands to the original implementation. ``time.sleep``
    is patched to return immediately so that any sleeping done while
    the snapshot is being created does not slow the test down.
    """
    self.driver.do_setup(None)
    real_send_cmd = deepcopy(self.driver.cluster.send_cmd)

    def reject_create_snapshot(cmd, **kwargs):
        # Only "create_snapshot" is forced to fail; everything else
        # goes through the untouched cluster command path.
        if cmd != "create_snapshot":
            return real_send_cmd(cmd, **kwargs)
        return (httpstatus.BAD_REQUEST, {})

    self.driver.cluster.send_cmd = reject_create_snapshot

    volume_type = test_utils.create_volume_type(self.ctxt, self,
                                                name='my_vol_type')
    volume = test_utils.create_volume(self.ctxt, size=4,
                                      volume_type_id=volume_type.id)
    snapshot = test_utils.create_snapshot(self.ctxt, volume_id=volume.id)

    self.driver.create_volume(volume)
    with self.assertRaises(exception.VolumeBackendAPIException):
        self.driver.create_snapshot(snapshot)

    # Remove the volume from the backend and from the database.
    self.driver.delete_volume(volume)
    db.volume_destroy(self.ctxt, volume.id)
def test_delete_snapshot(self):
self.driver.do_setup(None)

View File

@ -88,6 +88,11 @@ lightos_opts = [
' the host`s IP addresses to a volume IPACL. If set to'
' False, any IP address may access the volume. The default'
' is True.'),
cfg.IntOpt(
'lightos_api_service_snapshots_max_calls',
default=5,
help='The maximum number of calls to the LightOS'
' when creating snapshots. The default is 5 calls.')
]
CONF = cfg.CONF
@ -409,6 +414,8 @@ class LightOSVolumeDriver(driver.VolumeDriver):
self.logical_op_timeout = \
self.configuration.lightos_api_service_timeout * 3 + 10
self.snapshots_retries = \
self.configuration.lightos_api_service_snapshots_max_calls
@classmethod
def get_driver_options(cls):
@ -1388,28 +1395,42 @@ class LightOSVolumeDriver(driver.VolumeDriver):
@coordination.synchronized('lightos-create_snapshot-{src_volume_name}')
def _create_snapshot(self, project_name, snapshot_name, src_volume_name):
(status_code_get, response) = self._get_lightos_snapshot(
project_name, self.logical_op_timeout,
snapshot_name=snapshot_name)
if status_code_get != httpstatus.OK:
end = time.time() + self.logical_op_timeout
while (time.time() < end):
(status_code_create, response) = self.cluster.send_cmd(
cmd='create_snapshot',
project_name=project_name,
timeout=self.logical_op_timeout,
name=snapshot_name,
src_volume_name=src_volume_name,
)
found_or_created_snapshot = False
last_status_code = 999
last_response = "No response"
if status_code_create == httpstatus.INTERNAL_SERVER_ERROR:
pass
else:
break
for i in range(self.snapshots_retries):
if i != 0:
sleeptime = 2 ** i # 2, 4, 8, 16 (default is 30 seconds)
time.sleep(sleeptime)
(status_code_get, response) = self._get_lightos_snapshot(
project_name, self.logical_op_timeout,
snapshot_name=snapshot_name)
if status_code_get == httpstatus.OK:
found_or_created_snapshot = True
break
time.sleep(1)
(status_code_create, response) = self.cluster.send_cmd(
cmd='create_snapshot',
project_name=project_name,
timeout=self.logical_op_timeout,
name=snapshot_name,
src_volume_name=src_volume_name,
)
if status_code_create == httpstatus.OK:
found_or_created_snapshot = True
break
if status_code_create != httpstatus.OK:
if status_code_create in (httpstatus.BAD_REQUEST,
httpstatus.INTERNAL_SERVER_ERROR,
httpstatus.SERVICE_UNAVAILABLE):
LOG.debug('Creating new snapshot %s under project %s'
' failed, received error with http-status %s',
snapshot_name, project_name, status_code_create)
last_status_code = status_code_create
last_response = response
else:
msg = ('Did not succeed creating LightOS snapshot %s'
' project %s'
' status code %s response %s' %
@ -1417,6 +1438,14 @@ class LightOSVolumeDriver(driver.VolumeDriver):
response))
raise exception.VolumeBackendAPIException(message=_(msg))
if not found_or_created_snapshot:
msg = ('Did not succeed creating LightOS snapshot %s'
' project %s'
' status code %s response %s' %
(snapshot_name, project_name, last_status_code,
last_response))
raise exception.VolumeBackendAPIException(message=_(msg))
state = self._wait_for_snapshot_available(project_name,
timeout=
self.logical_op_timeout,

View File

@ -0,0 +1,11 @@
---
features:
- |
Lightbits driver: Added support for creating multiple snapshots
from the same volume simultaneously when using the Lightbits
cinder driver. Under certain conditions, older releases of the
Lightbits api-service will return various status codes (including
HTTP status codes 400, 500, and 503) that could indicate transient
failures. Added retry logic on such errors because there's a good
chance that the error is transient and subsequent calls will
succeed.