Create multiple snapshots from same volume

Under certain conditions, the Lightbits api-service will return various
status codes (including HTTP status codes 400, 500, and 503) that could
indicate a transient error, e.g., because a "create snapshot" operation
is already in progress on the source volume.
Fix the Cinder Lightbits driver to retry when this happens, since there's
a good chance a subsequent "create snapshot" call will succeed.

Change-Id: I4f3463ef71997fe707800a9df6a239273a0d5c4a
This commit is contained in:
Rahman Muhammad 2024-01-02 13:15:23 -08:00 committed by yuval
parent 1185caa9a8
commit 665e01c483
3 changed files with 84 additions and 19 deletions

View File

@ -19,6 +19,7 @@ import functools
import hashlib import hashlib
import http.client as httpstatus import http.client as httpstatus
import json import json
import time
from typing import Dict from typing import Dict
from typing import List from typing import List
from typing import Tuple from typing import Tuple
@ -296,6 +297,7 @@ class LightOSStorageVolumeDriverTest(test.TestCase):
"test_lightos_storage.InitiatorConnectorFactoryMocker") "test_lightos_storage.InitiatorConnectorFactoryMocker")
configuration.volume_backend_name = VOLUME_BACKEND_NAME configuration.volume_backend_name = VOLUME_BACKEND_NAME
configuration.reserved_percentage = RESERVED_PERCENTAGE configuration.reserved_percentage = RESERVED_PERCENTAGE
configuration.lightos_api_service_snapshots_max_calls = 5
def mocked_safe_get(config, variable_name): def mocked_safe_get(config, variable_name):
if hasattr(config, variable_name): if hasattr(config, variable_name):
@ -645,6 +647,29 @@ class LightOSStorageVolumeDriverTest(test.TestCase):
self.driver.delete_volume(volume) self.driver.delete_volume(volume)
db.volume_destroy(self.ctxt, volume.id) db.volume_destroy(self.ctxt, volume.id)
@mock.patch.object(time, "sleep", return_value=None)
def test_create_snapshot_fail_bad_request(self, mock_sleep):
    # Verifies that create_snapshot raises VolumeBackendAPIException when
    # the cluster answers every "create_snapshot" command with
    # BAD_REQUEST. time.sleep is patched out so that any sleeping done
    # between attempts does not slow the test down.
    def fake_send_cmd(cmd, **kwargs):
        # Only "create_snapshot" is forced to fail; every other command
        # is forwarded to the copy of the cluster's original send_cmd.
        if cmd != "create_snapshot":
            return original_send_cmd(cmd, **kwargs)
        return (httpstatus.BAD_REQUEST, {})

    self.driver.do_setup(None)
    original_send_cmd = deepcopy(self.driver.cluster.send_cmd)
    self.driver.cluster.send_cmd = fake_send_cmd

    volume_type = test_utils.create_volume_type(
        self.ctxt, self, name='my_vol_type')
    volume = test_utils.create_volume(
        self.ctxt, size=4, volume_type_id=volume_type.id)
    snapshot = test_utils.create_snapshot(self.ctxt, volume_id=volume.id)

    self.driver.create_volume(volume)
    self.assertRaises(exception.VolumeBackendAPIException,
                      self.driver.create_snapshot, snapshot)

    # Clean up the backing volume and its database record.
    self.driver.delete_volume(volume)
    db.volume_destroy(self.ctxt, volume.id)
def test_delete_snapshot(self): def test_delete_snapshot(self):
self.driver.do_setup(None) self.driver.do_setup(None)

View File

@ -88,6 +88,11 @@ lightos_opts = [
' the host`s IP addresses to a volume IPACL. If set to' ' the host`s IP addresses to a volume IPACL. If set to'
' False, any IP address may access the volume. The default' ' False, any IP address may access the volume. The default'
' is True.'), ' is True.'),
cfg.IntOpt(
'lightos_api_service_snapshots_max_calls',
default=5,
help='The maximum number of calls to the LightOS'
' when creating snapshots. The default is 5 calls.')
] ]
CONF = cfg.CONF CONF = cfg.CONF
@ -409,6 +414,8 @@ class LightOSVolumeDriver(driver.VolumeDriver):
self.logical_op_timeout = \ self.logical_op_timeout = \
self.configuration.lightos_api_service_timeout * 3 + 10 self.configuration.lightos_api_service_timeout * 3 + 10
self.snapshots_retries = \
self.configuration.lightos_api_service_snapshots_max_calls
@classmethod @classmethod
def get_driver_options(cls): def get_driver_options(cls):
@ -1388,28 +1395,42 @@ class LightOSVolumeDriver(driver.VolumeDriver):
@coordination.synchronized('lightos-create_snapshot-{src_volume_name}') @coordination.synchronized('lightos-create_snapshot-{src_volume_name}')
def _create_snapshot(self, project_name, snapshot_name, src_volume_name): def _create_snapshot(self, project_name, snapshot_name, src_volume_name):
(status_code_get, response) = self._get_lightos_snapshot( found_or_created_snapshot = False
project_name, self.logical_op_timeout, last_status_code = 999
snapshot_name=snapshot_name) last_response = "No response"
if status_code_get != httpstatus.OK:
end = time.time() + self.logical_op_timeout
while (time.time() < end):
(status_code_create, response) = self.cluster.send_cmd(
cmd='create_snapshot',
project_name=project_name,
timeout=self.logical_op_timeout,
name=snapshot_name,
src_volume_name=src_volume_name,
)
if status_code_create == httpstatus.INTERNAL_SERVER_ERROR: for i in range(self.snapshots_retries):
pass if i != 0:
else: sleeptime = 2 ** i # 2, 4, 8, 16 (default is 30 seconds)
break time.sleep(sleeptime)
(status_code_get, response) = self._get_lightos_snapshot(
project_name, self.logical_op_timeout,
snapshot_name=snapshot_name)
if status_code_get == httpstatus.OK:
found_or_created_snapshot = True
break
time.sleep(1) (status_code_create, response) = self.cluster.send_cmd(
cmd='create_snapshot',
project_name=project_name,
timeout=self.logical_op_timeout,
name=snapshot_name,
src_volume_name=src_volume_name,
)
if status_code_create == httpstatus.OK:
found_or_created_snapshot = True
break
if status_code_create != httpstatus.OK: if status_code_create in (httpstatus.BAD_REQUEST,
httpstatus.INTERNAL_SERVER_ERROR,
httpstatus.SERVICE_UNAVAILABLE):
LOG.debug('Creating new snapshot %s under project %s'
' failed, received error with http-status %s',
snapshot_name, project_name, status_code_create)
last_status_code = status_code_create
last_response = response
else:
msg = ('Did not succeed creating LightOS snapshot %s' msg = ('Did not succeed creating LightOS snapshot %s'
' project %s' ' project %s'
' status code %s response %s' % ' status code %s response %s' %
@ -1417,6 +1438,14 @@ class LightOSVolumeDriver(driver.VolumeDriver):
response)) response))
raise exception.VolumeBackendAPIException(message=_(msg)) raise exception.VolumeBackendAPIException(message=_(msg))
if not found_or_created_snapshot:
msg = ('Did not succeed creating LightOS snapshot %s'
' project %s'
' status code %s response %s' %
(snapshot_name, project_name, last_status_code,
last_response))
raise exception.VolumeBackendAPIException(message=_(msg))
state = self._wait_for_snapshot_available(project_name, state = self._wait_for_snapshot_available(project_name,
timeout= timeout=
self.logical_op_timeout, self.logical_op_timeout,

View File

@ -0,0 +1,11 @@
---
features:
- |
Lightbits driver: Added support for creating multiple snapshots
from the same volume simultaneously when using the Lightbits
cinder driver. Under certain conditions, older releases of the
Lightbits api-service will return various status codes (including
HTTP status codes 400, 500, and 503) that could indicate transient
failures. Added retry logic on such errors because there's a good
chance that the error is transient and subsequent calls will
succeed.