Create multiple snapshots from same volume
Under certain conditions, the Lightbits api-service will return various status codes (including HTTP status codes 400, 500, and 503) that could indicate a transient error, e.g., because a "create snapshot" operation is already in progress on the source volume. Fix the Cinder Lightbits driver to retry when this happens, since there's a good chance a subsequent "create snapshot" call will succeed. Change-Id: I4f3463ef71997fe707800a9df6a239273a0d5c4a
This commit is contained in:
parent
1185caa9a8
commit
665e01c483
@ -19,6 +19,7 @@ import functools
|
||||
import hashlib
|
||||
import http.client as httpstatus
|
||||
import json
|
||||
import time
|
||||
from typing import Dict
|
||||
from typing import List
|
||||
from typing import Tuple
|
||||
@ -296,6 +297,7 @@ class LightOSStorageVolumeDriverTest(test.TestCase):
|
||||
"test_lightos_storage.InitiatorConnectorFactoryMocker")
|
||||
configuration.volume_backend_name = VOLUME_BACKEND_NAME
|
||||
configuration.reserved_percentage = RESERVED_PERCENTAGE
|
||||
configuration.lightos_api_service_snapshots_max_calls = 5
|
||||
|
||||
def mocked_safe_get(config, variable_name):
|
||||
if hasattr(config, variable_name):
|
||||
@ -645,6 +647,29 @@ class LightOSStorageVolumeDriverTest(test.TestCase):
|
||||
self.driver.delete_volume(volume)
|
||||
db.volume_destroy(self.ctxt, volume.id)
|
||||
|
||||
@mock.patch.object(time, "sleep", return_value=None)
|
||||
def test_create_snapshot_fail_bad_request(self, mock_sleep):
|
||||
def send_cmd_mock(cmd, **kwargs):
|
||||
if cmd == "create_snapshot":
|
||||
return (httpstatus.BAD_REQUEST, {})
|
||||
else:
|
||||
return cluster_send_cmd(cmd, **kwargs)
|
||||
self.driver.do_setup(None)
|
||||
cluster_send_cmd = deepcopy(self.driver.cluster.send_cmd)
|
||||
self.driver.cluster.send_cmd = send_cmd_mock
|
||||
|
||||
vol_type = test_utils.create_volume_type(self.ctxt, self,
|
||||
name='my_vol_type')
|
||||
volume = test_utils.create_volume(self.ctxt, size=4,
|
||||
volume_type_id=vol_type.id)
|
||||
snapshot = test_utils.create_snapshot(self.ctxt, volume_id=volume.id)
|
||||
|
||||
self.driver.create_volume(volume)
|
||||
self.assertRaises(exception.VolumeBackendAPIException,
|
||||
self.driver.create_snapshot, snapshot)
|
||||
self.driver.delete_volume(volume)
|
||||
db.volume_destroy(self.ctxt, volume.id)
|
||||
|
||||
def test_delete_snapshot(self):
|
||||
self.driver.do_setup(None)
|
||||
|
||||
|
@ -88,6 +88,11 @@ lightos_opts = [
|
||||
' the host`s IP addresses to a volume IPACL. If set to'
|
||||
' False, any IP address may access the volume. The default'
|
||||
' is True.'),
|
||||
cfg.IntOpt(
|
||||
'lightos_api_service_snapshots_max_calls',
|
||||
default=5,
|
||||
help='The maximum number of calls to the LightOS'
|
||||
' when creating snapshots. The default is 5 calls.')
|
||||
]
|
||||
|
||||
CONF = cfg.CONF
|
||||
@ -409,6 +414,8 @@ class LightOSVolumeDriver(driver.VolumeDriver):
|
||||
|
||||
self.logical_op_timeout = \
|
||||
self.configuration.lightos_api_service_timeout * 3 + 10
|
||||
self.snapshots_retries = \
|
||||
self.configuration.lightos_api_service_snapshots_max_calls
|
||||
|
||||
@classmethod
|
||||
def get_driver_options(cls):
|
||||
@ -1388,12 +1395,21 @@ class LightOSVolumeDriver(driver.VolumeDriver):
|
||||
|
||||
@coordination.synchronized('lightos-create_snapshot-{src_volume_name}')
|
||||
def _create_snapshot(self, project_name, snapshot_name, src_volume_name):
|
||||
found_or_created_snapshot = False
|
||||
last_status_code = 999
|
||||
last_response = "No response"
|
||||
|
||||
for i in range(self.snapshots_retries):
|
||||
if i != 0:
|
||||
sleeptime = 2 ** i # 2, 4, 8, 16 (default is 30 seconds)
|
||||
time.sleep(sleeptime)
|
||||
(status_code_get, response) = self._get_lightos_snapshot(
|
||||
project_name, self.logical_op_timeout,
|
||||
snapshot_name=snapshot_name)
|
||||
if status_code_get != httpstatus.OK:
|
||||
end = time.time() + self.logical_op_timeout
|
||||
while (time.time() < end):
|
||||
if status_code_get == httpstatus.OK:
|
||||
found_or_created_snapshot = True
|
||||
break
|
||||
|
||||
(status_code_create, response) = self.cluster.send_cmd(
|
||||
cmd='create_snapshot',
|
||||
project_name=project_name,
|
||||
@ -1401,15 +1417,20 @@ class LightOSVolumeDriver(driver.VolumeDriver):
|
||||
name=snapshot_name,
|
||||
src_volume_name=src_volume_name,
|
||||
)
|
||||
|
||||
if status_code_create == httpstatus.INTERNAL_SERVER_ERROR:
|
||||
pass
|
||||
else:
|
||||
if status_code_create == httpstatus.OK:
|
||||
found_or_created_snapshot = True
|
||||
break
|
||||
|
||||
time.sleep(1)
|
||||
if status_code_create in (httpstatus.BAD_REQUEST,
|
||||
httpstatus.INTERNAL_SERVER_ERROR,
|
||||
httpstatus.SERVICE_UNAVAILABLE):
|
||||
|
||||
if status_code_create != httpstatus.OK:
|
||||
LOG.debug('Creating new snapshot %s under project %s'
|
||||
' failed, received error with http-status %s',
|
||||
snapshot_name, project_name, status_code_create)
|
||||
last_status_code = status_code_create
|
||||
last_response = response
|
||||
else:
|
||||
msg = ('Did not succeed creating LightOS snapshot %s'
|
||||
' project %s'
|
||||
' status code %s response %s' %
|
||||
@ -1417,6 +1438,14 @@ class LightOSVolumeDriver(driver.VolumeDriver):
|
||||
response))
|
||||
raise exception.VolumeBackendAPIException(message=_(msg))
|
||||
|
||||
if not found_or_created_snapshot:
|
||||
msg = ('Did not succeed creating LightOS snapshot %s'
|
||||
' project %s'
|
||||
' status code %s response %s' %
|
||||
(snapshot_name, project_name, last_status_code,
|
||||
last_response))
|
||||
raise exception.VolumeBackendAPIException(message=_(msg))
|
||||
|
||||
state = self._wait_for_snapshot_available(project_name,
|
||||
timeout=
|
||||
self.logical_op_timeout,
|
||||
|
@ -0,0 +1,11 @@
|
||||
---
|
||||
features:
|
||||
- |
|
||||
Lightbits driver: Added support to create multiple snapshots
|
||||
from the same volume simultaneously when using the Lightbits
|
||||
cinder driver. Under certain conditions, older releases of the
|
||||
Lightbits api-service will return various status codes (including
|
||||
HTTP status codes 400, 500, and 503) that could indicate transient
|
||||
failures. Added retry logic on such errors because there's a good
|
||||
chance that the error is transient and subsequent calls will
|
||||
succeed.
|
Loading…
Reference in New Issue
Block a user