Create multiple snapshots from same volume
Under certain conditions, the Lightbits api-service will return various status codes (including HTTP status codes 400, 500, and 503) that could indicate a transient error, e.g., because a "create snapshot" operation is already in progress on the source volume. Fix the Cinder Lightbits driver to retry when this happens, since there's a good chance a subsequent "create snapshot" call will succeed. Change-Id: I4f3463ef71997fe707800a9df6a239273a0d5c4a
This commit is contained in:
parent
1185caa9a8
commit
665e01c483
@ -19,6 +19,7 @@ import functools
|
|||||||
import hashlib
|
import hashlib
|
||||||
import http.client as httpstatus
|
import http.client as httpstatus
|
||||||
import json
|
import json
|
||||||
|
import time
|
||||||
from typing import Dict
|
from typing import Dict
|
||||||
from typing import List
|
from typing import List
|
||||||
from typing import Tuple
|
from typing import Tuple
|
||||||
@ -296,6 +297,7 @@ class LightOSStorageVolumeDriverTest(test.TestCase):
|
|||||||
"test_lightos_storage.InitiatorConnectorFactoryMocker")
|
"test_lightos_storage.InitiatorConnectorFactoryMocker")
|
||||||
configuration.volume_backend_name = VOLUME_BACKEND_NAME
|
configuration.volume_backend_name = VOLUME_BACKEND_NAME
|
||||||
configuration.reserved_percentage = RESERVED_PERCENTAGE
|
configuration.reserved_percentage = RESERVED_PERCENTAGE
|
||||||
|
configuration.lightos_api_service_snapshots_max_calls = 5
|
||||||
|
|
||||||
def mocked_safe_get(config, variable_name):
|
def mocked_safe_get(config, variable_name):
|
||||||
if hasattr(config, variable_name):
|
if hasattr(config, variable_name):
|
||||||
@ -645,6 +647,29 @@ class LightOSStorageVolumeDriverTest(test.TestCase):
|
|||||||
self.driver.delete_volume(volume)
|
self.driver.delete_volume(volume)
|
||||||
db.volume_destroy(self.ctxt, volume.id)
|
db.volume_destroy(self.ctxt, volume.id)
|
||||||
|
|
||||||
|
@mock.patch.object(time, "sleep", return_value=None)
|
||||||
|
def test_create_snapshot_fail_bad_request(self, mock_sleep):
|
||||||
|
def send_cmd_mock(cmd, **kwargs):
|
||||||
|
if cmd == "create_snapshot":
|
||||||
|
return (httpstatus.BAD_REQUEST, {})
|
||||||
|
else:
|
||||||
|
return cluster_send_cmd(cmd, **kwargs)
|
||||||
|
self.driver.do_setup(None)
|
||||||
|
cluster_send_cmd = deepcopy(self.driver.cluster.send_cmd)
|
||||||
|
self.driver.cluster.send_cmd = send_cmd_mock
|
||||||
|
|
||||||
|
vol_type = test_utils.create_volume_type(self.ctxt, self,
|
||||||
|
name='my_vol_type')
|
||||||
|
volume = test_utils.create_volume(self.ctxt, size=4,
|
||||||
|
volume_type_id=vol_type.id)
|
||||||
|
snapshot = test_utils.create_snapshot(self.ctxt, volume_id=volume.id)
|
||||||
|
|
||||||
|
self.driver.create_volume(volume)
|
||||||
|
self.assertRaises(exception.VolumeBackendAPIException,
|
||||||
|
self.driver.create_snapshot, snapshot)
|
||||||
|
self.driver.delete_volume(volume)
|
||||||
|
db.volume_destroy(self.ctxt, volume.id)
|
||||||
|
|
||||||
def test_delete_snapshot(self):
|
def test_delete_snapshot(self):
|
||||||
self.driver.do_setup(None)
|
self.driver.do_setup(None)
|
||||||
|
|
||||||
|
@ -88,6 +88,11 @@ lightos_opts = [
|
|||||||
' the host`s IP addresses to a volume IPACL. If set to'
|
' the host`s IP addresses to a volume IPACL. If set to'
|
||||||
' False, any IP address may access the volume. The default'
|
' False, any IP address may access the volume. The default'
|
||||||
' is True.'),
|
' is True.'),
|
||||||
|
cfg.IntOpt(
|
||||||
|
'lightos_api_service_snapshots_max_calls',
|
||||||
|
default=5,
|
||||||
|
help='The maximum number of calls to the LightOS'
|
||||||
|
' when creating snapshots. The default is 5 calls.')
|
||||||
]
|
]
|
||||||
|
|
||||||
CONF = cfg.CONF
|
CONF = cfg.CONF
|
||||||
@ -409,6 +414,8 @@ class LightOSVolumeDriver(driver.VolumeDriver):
|
|||||||
|
|
||||||
self.logical_op_timeout = \
|
self.logical_op_timeout = \
|
||||||
self.configuration.lightos_api_service_timeout * 3 + 10
|
self.configuration.lightos_api_service_timeout * 3 + 10
|
||||||
|
self.snapshots_retries = \
|
||||||
|
self.configuration.lightos_api_service_snapshots_max_calls
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_driver_options(cls):
|
def get_driver_options(cls):
|
||||||
@ -1388,28 +1395,42 @@ class LightOSVolumeDriver(driver.VolumeDriver):
|
|||||||
|
|
||||||
@coordination.synchronized('lightos-create_snapshot-{src_volume_name}')
|
@coordination.synchronized('lightos-create_snapshot-{src_volume_name}')
|
||||||
def _create_snapshot(self, project_name, snapshot_name, src_volume_name):
|
def _create_snapshot(self, project_name, snapshot_name, src_volume_name):
|
||||||
(status_code_get, response) = self._get_lightos_snapshot(
|
found_or_created_snapshot = False
|
||||||
project_name, self.logical_op_timeout,
|
last_status_code = 999
|
||||||
snapshot_name=snapshot_name)
|
last_response = "No response"
|
||||||
if status_code_get != httpstatus.OK:
|
|
||||||
end = time.time() + self.logical_op_timeout
|
|
||||||
while (time.time() < end):
|
|
||||||
(status_code_create, response) = self.cluster.send_cmd(
|
|
||||||
cmd='create_snapshot',
|
|
||||||
project_name=project_name,
|
|
||||||
timeout=self.logical_op_timeout,
|
|
||||||
name=snapshot_name,
|
|
||||||
src_volume_name=src_volume_name,
|
|
||||||
)
|
|
||||||
|
|
||||||
if status_code_create == httpstatus.INTERNAL_SERVER_ERROR:
|
for i in range(self.snapshots_retries):
|
||||||
pass
|
if i != 0:
|
||||||
else:
|
sleeptime = 2 ** i # 2, 4, 8, 16 (default is 30 seconds)
|
||||||
break
|
time.sleep(sleeptime)
|
||||||
|
(status_code_get, response) = self._get_lightos_snapshot(
|
||||||
|
project_name, self.logical_op_timeout,
|
||||||
|
snapshot_name=snapshot_name)
|
||||||
|
if status_code_get == httpstatus.OK:
|
||||||
|
found_or_created_snapshot = True
|
||||||
|
break
|
||||||
|
|
||||||
time.sleep(1)
|
(status_code_create, response) = self.cluster.send_cmd(
|
||||||
|
cmd='create_snapshot',
|
||||||
|
project_name=project_name,
|
||||||
|
timeout=self.logical_op_timeout,
|
||||||
|
name=snapshot_name,
|
||||||
|
src_volume_name=src_volume_name,
|
||||||
|
)
|
||||||
|
if status_code_create == httpstatus.OK:
|
||||||
|
found_or_created_snapshot = True
|
||||||
|
break
|
||||||
|
|
||||||
if status_code_create != httpstatus.OK:
|
if status_code_create in (httpstatus.BAD_REQUEST,
|
||||||
|
httpstatus.INTERNAL_SERVER_ERROR,
|
||||||
|
httpstatus.SERVICE_UNAVAILABLE):
|
||||||
|
|
||||||
|
LOG.debug('Creating new snapshot %s under project %s'
|
||||||
|
' failed, received error with http-status %s',
|
||||||
|
snapshot_name, project_name, status_code_create)
|
||||||
|
last_status_code = status_code_create
|
||||||
|
last_response = response
|
||||||
|
else:
|
||||||
msg = ('Did not succeed creating LightOS snapshot %s'
|
msg = ('Did not succeed creating LightOS snapshot %s'
|
||||||
' project %s'
|
' project %s'
|
||||||
' status code %s response %s' %
|
' status code %s response %s' %
|
||||||
@ -1417,6 +1438,14 @@ class LightOSVolumeDriver(driver.VolumeDriver):
|
|||||||
response))
|
response))
|
||||||
raise exception.VolumeBackendAPIException(message=_(msg))
|
raise exception.VolumeBackendAPIException(message=_(msg))
|
||||||
|
|
||||||
|
if not found_or_created_snapshot:
|
||||||
|
msg = ('Did not succeed creating LightOS snapshot %s'
|
||||||
|
' project %s'
|
||||||
|
' status code %s response %s' %
|
||||||
|
(snapshot_name, project_name, last_status_code,
|
||||||
|
last_response))
|
||||||
|
raise exception.VolumeBackendAPIException(message=_(msg))
|
||||||
|
|
||||||
state = self._wait_for_snapshot_available(project_name,
|
state = self._wait_for_snapshot_available(project_name,
|
||||||
timeout=
|
timeout=
|
||||||
self.logical_op_timeout,
|
self.logical_op_timeout,
|
||||||
|
@ -0,0 +1,11 @@
|
|||||||
|
---
|
||||||
|
features:
|
||||||
|
- |
|
||||||
|
Lightbits driver: Added support to create multiple snapshots
|
||||||
|
from the same volume simultaneously when using the Lightbits
|
||||||
|
cinder driver. Under certain conditions, older releases of the
|
||||||
|
Lightbits api-service will return various status codes (including
|
||||||
|
HTTP status codes 400, 500, and 503) that could indicate transient
|
||||||
|
failures. Added retry logic on such errors because there's a good
|
||||||
|
chance that the error is transient and subsequent calls will
|
||||||
|
succeed.
|
Loading…
Reference in New Issue
Block a user