Retry SQLite DB write failures due to locks

Adds a database retry decorator to capture and retry exceptions
caused by SQLite locking. These locking errors arise because,
essentially, SQLite permits only one distinct writer at a time,
and that writer holds its lock for the duration of a transaction.

Unfortunately, with our green threads and API surface, we run into
cases where background operations (mainly periodic tasks) and API
surface transactions need to operate against the DB at the same
time. Because we can't realistically grant any one task exclusive
control and access, we run into database locking errors.

So when we encounter a lock error, we retry.

Adds two additional configuration parameters to the database
configuration section to allow this capability to be further
tuned, as file IO performance is *surely* a contributing factor
to our locking issues: we mostly see them on a loaded CI system,
where other issues begin to crop up as well.

The new parameters are as follows (a configuration sketch follows
the list):
* sqlite_retries, a boolean value allowing the retry logic
  to be disabled. This can largely be ignored, but is available
  as it was logical to include.
* sqlite_max_wait_for_retry, an integer value defaulting to 30
  seconds, which bounds how long to keep retrying SQLite database
  operations that fail with a "database is locked" error.
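
As a minimal sketch of tuning these options programmatically, using
the same oslo.config override call the test fixtures in this change
use (in a real deployment these values would instead be set in the
[database] section of ironic.conf; the values here are hypothetical):

    from oslo_config import cfg

    CONF = cfg.CONF
    # Hypothetical override values; the defaults are True and 30.
    CONF.set_override('sqlite_retries', True, group='database')
    CONF.set_override('sqlite_max_wait_for_retry', 60, group='database')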

The retry logic uses the tenacity library and performs an
exponential backoff. Setting the maximum wait to a very large
number is not advisable, so the default of 30 seconds was
deemed reasonable.
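
For illustration, a minimal self-contained sketch of this retry
pattern (not the Ironic code itself: it substitutes the stdlib
sqlite3 exception type for SQLAlchemy's, and flaky_write is a fake
stand-in for a real DB write):

    import logging
    import sqlite3

    import tenacity

    LOG = logging.getLogger(__name__)
    attempts = {'count': 0}

    def flaky_write():
        # Simulate SQLite's single-writer lock rejecting two tries.
        attempts['count'] += 1
        if attempts['count'] < 3:
            raise sqlite3.OperationalError('database is locked')
        return 'written'

    for attempt in tenacity.Retrying(
            retry=(tenacity.retry_if_exception_type(
                       sqlite3.OperationalError)
                   & tenacity.retry_if_exception(
                       lambda e: 'database is locked' in str(e))),
            # Full jitter: sleep a random amount in [0, 0.25 * 2**n],
            # capped at 30 seconds.
            wait=tenacity.wait_random_exponential(multiplier=0.25, max=30),
            before_sleep=tenacity.before_sleep_log(LOG, logging.DEBUG),
            reraise=True):
        with attempt:
            result = flaky_write()  # succeeds on the third call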

Change-Id: Ifeb92e9f23a94f2d96bb495fe63a71df9865fef3

@ -20,7 +20,15 @@ from ironic.common.i18n import _
opts = [
    cfg.StrOpt('mysql_engine',
               default='InnoDB',
               help=_('MySQL engine to use.'))
               help=_('MySQL engine to use.')),
    cfg.BoolOpt('sqlite_retries',
                default=True,
                help=_('If SQLite database operation retry logic is enabled '
                       'or not. Enabled by default.')),
    cfg.IntOpt('sqlite_max_wait_for_retry',
               default=30,
               help=_('Maximum number of seconds to retry SQLite database '
                      'locks.')),
]


@ -16,7 +16,9 @@
import collections
import datetime
import functools
import json
import logging
import threading
from oslo_concurrency import lockutils
@ -37,6 +39,7 @@ from sqlalchemy.exc import NoResultFound, MultipleResultsFound
from sqlalchemy.orm import Load
from sqlalchemy.orm import selectinload
from sqlalchemy import sql
import tenacity
from ironic.common import exception
from ironic.common.i18n import _
@ -63,6 +66,33 @@ synchronized = lockutils.synchronized_with_prefix('ironic-')
MAX_TRAITS_PER_NODE = 50
def wrap_sqlite_retry(f):
    @functools.wraps(f)
    def wrapper(*args, **kwargs):
        # Retrying only applies when SQLite is actually in use and
        # the operator has not disabled it; otherwise call through.
        if (not CONF.database.sqlite_retries
                or not utils.is_ironic_using_sqlite()):
            return f(*args, **kwargs)
        else:
            for attempt in tenacity.Retrying(
                    retry=(
                        tenacity.retry_if_exception_type(
                            sa.exc.OperationalError)
                        & tenacity.retry_if_exception(
                            lambda e: 'database is locked' in str(e))
                    ),
                    # Full-jitter exponential backoff, capped at the
                    # configured maximum wait.
                    wait=tenacity.wait_random_exponential(
                        multiplier=0.25,
                        max=CONF.database.sqlite_max_wait_for_retry),
                    before_sleep=(
                        tenacity.before_sleep_log(LOG, logging.DEBUG)
                    ),
                    reraise=True):
                with attempt:
                    return f(*args, **kwargs)
    return wrapper
def get_backend():
"""The backend is this module itself."""
return Connection()
@ -653,6 +683,7 @@ class Connection(api.Connection):
return mapping
@synchronized(RESERVATION_SEMAPHORE, fair=True)
@wrap_sqlite_retry
def _reserve_node_place_lock(self, tag, node_id, node):
try:
# NOTE(TheJulia): We explicitly do *not* synch the session
@ -705,6 +736,7 @@ class Connection(api.Connection):
# Return a node object as that is the contract for this method.
return self.get_node_by_id(node.id)
@wrap_sqlite_retry
@oslo_db_api.retry_on_deadlock
def release_node(self, tag, node_id):
with _session_for_read() as session:
@ -734,6 +766,7 @@ class Connection(api.Connection):
except NoResultFound:
raise exception.NodeNotFound(node=node_id)
@wrap_sqlite_retry
@oslo_db_api.retry_on_deadlock
def create_node(self, values):
# ensure defaults are present for new nodes
@ -839,6 +872,7 @@ class Connection(api.Connection):
raise exception.InstanceNotFound(instance_uuid=instance)
return res
@wrap_sqlite_retry
@oslo_db_api.retry_on_deadlock
def destroy_node(self, node_id):
with _session_for_write() as session:
@ -914,6 +948,7 @@ class Connection(api.Connection):
query.delete()
@wrap_sqlite_retry
def update_node(self, node_id, values):
# NOTE(dtantsur): this can lead to very strange errors
if 'uuid' in values:
@ -1059,6 +1094,7 @@ class Connection(api.Connection):
return _paginate_query(models.Port, limit, marker,
sort_key, sort_dir, query)
@wrap_sqlite_retry
@oslo_db_api.retry_on_deadlock
def create_port(self, values):
if not values.get('uuid'):
@ -1076,6 +1112,7 @@ class Connection(api.Connection):
raise exception.PortAlreadyExists(uuid=values['uuid'])
return port
@wrap_sqlite_retry
@oslo_db_api.retry_on_deadlock
def update_port(self, port_id, values):
# NOTE(dtantsur): this can lead to very strange errors
@ -1098,6 +1135,7 @@ class Connection(api.Connection):
raise exception.MACAlreadyExists(mac=values['address'])
return ref
@wrap_sqlite_retry
@oslo_db_api.retry_on_deadlock
def destroy_port(self, port_id):
with _session_for_write() as session:
@ -1498,6 +1536,7 @@ class Connection(api.Connection):
.filter_by(conductor_id=conductor_id))
query.delete()
@wrap_sqlite_retry
@oslo_db_api.retry_on_deadlock
def touch_node_provisioning(self, node_id):
with _session_for_write() as session:
@ -2049,6 +2088,7 @@ class Connection(api.Connection):
models.NodeTrait).filter_by(node_id=node_id, trait=trait)
return session.query(q.exists()).scalar()
@wrap_sqlite_retry
@oslo_db_api.retry_on_deadlock
def create_bios_setting_list(self, node_id, settings, version):
bios_settings = []
@ -2078,6 +2118,7 @@ class Connection(api.Connection):
node=node_id, name=setting['name'])
return bios_settings
@wrap_sqlite_retry
@oslo_db_api.retry_on_deadlock
def update_bios_setting_list(self, node_id, settings, version):
bios_settings = []
@ -2109,6 +2150,7 @@ class Connection(api.Connection):
node=node_id, name=setting['name'])
return bios_settings
@wrap_sqlite_retry
@oslo_db_api.retry_on_deadlock
def delete_bios_setting_list(self, node_id, names):
missing_bios_settings = []
@ -2635,6 +2677,7 @@ class Connection(api.Connection):
# ordered ascending originally.
return final_set
@wrap_sqlite_retry
def bulk_delete_node_history_records(self, entries):
with _session_for_write() as session:
# Uses input entry list, selects entries matching those ids
@ -2672,6 +2715,7 @@ class Connection(api.Connection):
)
return res
@wrap_sqlite_retry
@oslo_db_api.retry_on_deadlock
def create_node_inventory(self, values):
inventory = models.NodeInventory()
@ -2686,6 +2730,7 @@ class Connection(api.Connection):
session.flush()
return inventory
@wrap_sqlite_retry
@oslo_db_api.retry_on_deadlock
def destroy_node_inventory_by_node_id(self, node_id):
with _session_for_write() as session:
@ -2748,6 +2793,7 @@ class Connection(api.Connection):
return shard_list
@wrap_sqlite_retry
def create_firmware_component(self, values):
"""Create a FirmwareComponent record for a given node.
@ -2779,6 +2825,7 @@ class Connection(api.Connection):
name=values['component'], node=values['node_id'])
return fw_component
@wrap_sqlite_retry
def update_firmware_component(self, node_id, component, values):
"""Update a FirmwareComponent record.


@ -43,6 +43,8 @@ class Database(fixtures.Fixture):
        self.post_migrations()
        self._DB = "".join(line for line in conn.connection.iterdump())
        self.engine.dispose()
        # Disable retry logic for unit testing, to minimize overhead.
        CONF.set_override('sqlite_retries', False, group='database')

    def setup_sqlite(self, db_migrate):
        if db_migrate.version():


@ -18,17 +18,27 @@
import datetime
from unittest import mock
from oslo_config import cfg
from oslo_utils import timeutils
from oslo_utils import uuidutils
from sqlalchemy.orm import exc as sa_exc
from sqlalchemy import exc as sa_exc
# NOTE(TheJulia): At some point, we should drop the line below,
# as I understand the ORM exceptions are going to move completely
# after SQLAlchemy 2.0 to just the base exception class.
from sqlalchemy.orm import exc as sa_orm_exc
from ironic.common import exception
from ironic.common import states
from ironic.db.sqlalchemy import api as dbapi
from ironic.db.sqlalchemy.api import Connection as db_conn
from ironic.db.sqlalchemy.models import NodeInventory
from ironic.tests.unit.db import base
from ironic.tests.unit.db import utils
CONF = cfg.CONF
class DbNodeTestCase(base.DbTestCase):
def test_create_node(self):
@ -535,7 +545,7 @@ class DbNodeTestCase(base.DbTestCase):
'deploy_interface', 'boot_interface',
'driver', 'extra']:
try:
self.assertRaises(sa_exc.DetachedInstanceError,
self.assertRaises(sa_orm_exc.DetachedInstanceError,
_attempt_field_access, r, field)
except AttributeError:
pass
@ -592,7 +602,7 @@ class DbNodeTestCase(base.DbTestCase):
'driver', 'extra', 'power_state',
'traits']:
try:
self.assertRaises(sa_exc.DetachedInstanceError,
self.assertRaises(sa_orm_exc.DetachedInstanceError,
_attempt_field_access, r, field)
except AttributeError:
# We expect an AttributeError, in addition to
@ -847,6 +857,28 @@ class DbNodeTestCase(base.DbTestCase):
        self.assertRaises(exception.NodeNotFound, self.dbapi.update_node,
                          node_uuid, {'extra': new_extra})

    @mock.patch.object(dbapi, 'LOG', autospec=True)
    def test_update_node_retries(self, log_mock):
        """Test retry logic to ensure it works."""
        node = utils.create_test_node()
        CONF.set_override('sqlite_retries', True, group='database')
        # NOTE(TheJulia): Update is an ideal place to test retries
        # as the underlying work is done by _do_update_node.
        with mock.patch.object(db_conn, '_do_update_node',
                               autospec=True) as mock_update:
            sa_err = sa_exc.OperationalError(
                statement=None,
                params=None,
                orig=Exception('database is locked'))
            mock_update.side_effect = [
                sa_err,
                sa_err,
                node
            ]
            self.dbapi.update_node(node.id, {'extra': {'foo': 'bar'}})
            self.assertEqual(3, mock_update.call_count)
            self.assertEqual(2, log_mock.log.call_count)

    def test_update_node_uuid(self):
        node = utils.create_test_node()
        self.assertRaises(exception.InvalidParameterValue,


@ -15,6 +15,7 @@ from unittest import mock
from ironic.common import context
from ironic.db import api as dbapi
from ironic.db.sqlalchemy.api import Connection as db_conn
from ironic import objects
from ironic.tests.unit.db import base as db_base
from ironic.tests.unit.db import utils as db_utils
@ -71,7 +72,7 @@ class TestBIOSSettingObject(db_base.DbTestCase, obj_utils.SchemasTestMixIn):
self.assertEqual(bios_setting2['name'], bios_obj_list[1].name)
self.assertEqual(bios_setting2['value'], bios_obj_list[1].value)
@mock.patch.object(dbapi.IMPL, 'create_bios_setting_list', autospec=True)
@mock.patch.object(db_conn, 'create_bios_setting_list', autospec=True)
def test_create(self, mock_create_list):
fake_call_args = {'node_id': self.bios_setting['node_id'],
'name': self.bios_setting['name'],
@ -97,7 +98,8 @@ class TestBIOSSettingObject(db_base.DbTestCase, obj_utils.SchemasTestMixIn):
mock_create_list.return_value = [self.bios_setting]
mock_create_list.call_args
bios_obj.create()
mock_create_list.assert_called_once_with(self.bios_setting['node_id'],
mock_create_list.assert_called_once_with(mock.ANY,
self.bios_setting['node_id'],
setting,
self.bios_setting['version'])
self.assertEqual(self.bios_setting['node_id'], bios_obj.node_id)
@ -113,7 +115,7 @@ class TestBIOSSettingObject(db_base.DbTestCase, obj_utils.SchemasTestMixIn):
bios_obj.reset_required)
self.assertEqual(self.bios_setting['unique'], bios_obj.unique)
@mock.patch.object(dbapi.IMPL, 'update_bios_setting_list', autospec=True)
@mock.patch.object(db_conn, 'update_bios_setting_list', autospec=True)
def test_save(self, mock_update_list):
fake_call_args = {'node_id': self.bios_setting['node_id'],
'name': self.bios_setting['name'],
@ -131,14 +133,15 @@ class TestBIOSSettingObject(db_base.DbTestCase, obj_utils.SchemasTestMixIn):
mock_update_list.return_value = [self.bios_setting]
mock_update_list.call_args
bios_obj.save()
mock_update_list.assert_called_once_with(self.bios_setting['node_id'],
mock_update_list.assert_called_once_with(mock.ANY,
self.bios_setting['node_id'],
setting,
self.bios_setting['version'])
self.assertEqual(self.bios_setting['node_id'], bios_obj.node_id)
self.assertEqual(self.bios_setting['name'], bios_obj.name)
self.assertEqual(self.bios_setting['value'], bios_obj.value)
@mock.patch.object(dbapi.IMPL, 'create_bios_setting_list', autospec=True)
@mock.patch.object(db_conn, 'create_bios_setting_list', autospec=True)
def test_list_create(self, mock_create_list):
bios_setting2 = db_utils.get_test_bios_setting(name='hyperthread',
value='enabled')
@ -147,7 +150,8 @@ class TestBIOSSettingObject(db_base.DbTestCase, obj_utils.SchemasTestMixIn):
bios_obj_list = objects.BIOSSettingList.create(
self.context, self.node_id, settings)
mock_create_list.assert_called_once_with(self.node_id, settings, '1.1')
mock_create_list.assert_called_once_with(mock.ANY, self.node_id,
settings, '1.1')
self.assertEqual(self.context, bios_obj_list._context)
self.assertEqual(2, len(bios_obj_list))
self.assertEqual(self.bios_setting['node_id'],
@ -157,7 +161,7 @@ class TestBIOSSettingObject(db_base.DbTestCase, obj_utils.SchemasTestMixIn):
self.assertEqual(bios_setting2['node_id'], bios_obj_list[1].node_id)
self.assertEqual(bios_setting2['name'], bios_obj_list[1].name)
@mock.patch.object(dbapi.IMPL, 'update_bios_setting_list', autospec=True)
@mock.patch.object(db_conn, 'update_bios_setting_list', autospec=True)
def test_list_save(self, mock_update_list):
bios_setting2 = db_utils.get_test_bios_setting(name='hyperthread',
value='enabled')
@ -166,7 +170,8 @@ class TestBIOSSettingObject(db_base.DbTestCase, obj_utils.SchemasTestMixIn):
bios_obj_list = objects.BIOSSettingList.save(
self.context, self.node_id, settings)
mock_update_list.assert_called_once_with(self.node_id, settings, '1.1')
mock_update_list.assert_called_once_with(mock.ANY, self.node_id,
settings, '1.1')
self.assertEqual(self.context, bios_obj_list._context)
self.assertEqual(2, len(bios_obj_list))
self.assertEqual(self.bios_setting['node_id'],
@ -177,19 +182,20 @@ class TestBIOSSettingObject(db_base.DbTestCase, obj_utils.SchemasTestMixIn):
self.assertEqual(bios_setting2['name'], bios_obj_list[1].name)
self.assertEqual(bios_setting2['value'], bios_obj_list[1].value)
@mock.patch.object(dbapi.IMPL, 'delete_bios_setting_list', autospec=True)
@mock.patch.object(db_conn, 'delete_bios_setting_list', autospec=True)
def test_delete(self, mock_delete):
objects.BIOSSetting.delete(self.context, self.node_id,
self.bios_setting['name'])
mock_delete.assert_called_once_with(self.node_id,
mock_delete.assert_called_once_with(mock.ANY,
self.node_id,
[self.bios_setting['name']])
@mock.patch.object(dbapi.IMPL, 'delete_bios_setting_list', autospec=True)
@mock.patch.object(db_conn, 'delete_bios_setting_list', autospec=True)
def test_list_delete(self, mock_delete):
bios_setting2 = db_utils.get_test_bios_setting(name='hyperthread')
name_list = [self.bios_setting['name'], bios_setting2['name']]
objects.BIOSSettingList.delete(self.context, self.node_id, name_list)
mock_delete.assert_called_once_with(self.node_id, name_list)
mock_delete.assert_called_once_with(mock.ANY, self.node_id, name_list)
@mock.patch('ironic.objects.bios.BIOSSettingList.get_by_node_id',
spec_set=types.FunctionType)


@ -22,6 +22,7 @@ from testtools import matchers
from ironic.common import context
from ironic.common import exception
from ironic.db.sqlalchemy.api import Connection as db_conn
from ironic import objects
from ironic.objects import node as node_objects
from ironic.tests.unit.db import base as db_base
@ -479,15 +480,15 @@ class TestNodeObject(db_base.DbTestCase, obj_utils.SchemasTestMixIn):
node_id)
def test_release(self):
with mock.patch.object(self.dbapi, 'release_node',
with mock.patch.object(db_conn, 'release_node',
autospec=True) as mock_release:
node_id = self.fake_node['id']
fake_tag = 'fake-tag'
objects.Node.release(self.context, fake_tag, node_id)
mock_release.assert_called_once_with(fake_tag, node_id)
mock_release.assert_called_once_with(mock.ANY, fake_tag, node_id)
def test_release_node_not_found(self):
with mock.patch.object(self.dbapi, 'release_node',
with mock.patch.object(db_conn, 'release_node',
autospec=True) as mock_release:
node_id = 'non-existent'
mock_release.side_effect = exception.NodeNotFound(node=node_id)
@ -496,25 +497,25 @@ class TestNodeObject(db_base.DbTestCase, obj_utils.SchemasTestMixIn):
'fake-tag', node_id)
def test_touch_provisioning(self):
with mock.patch.object(self.dbapi, 'get_node_by_uuid',
with mock.patch.object(db_conn, 'get_node_by_uuid',
autospec=True) as mock_get_node:
mock_get_node.return_value = self.fake_node
with mock.patch.object(self.dbapi, 'touch_node_provisioning',
with mock.patch.object(db_conn, 'touch_node_provisioning',
autospec=True) as mock_touch:
node = objects.Node.get(self.context, self.fake_node['uuid'])
node.touch_provisioning()
mock_touch.assert_called_once_with(node.id)
mock_touch.assert_called_once_with(mock.ANY, node.id)
def test_create(self):
node = obj_utils.get_test_node(self.ctxt, **self.fake_node)
with mock.patch.object(self.dbapi, 'create_node',
with mock.patch.object(db_conn, 'create_node',
autospec=True) as mock_create_node:
mock_create_node.return_value = db_utils.get_test_node()
node.create()
args, _kwargs = mock_create_node.call_args
self.assertEqual(objects.Node.VERSION, args[0]['version'])
self.assertEqual(objects.Node.VERSION, args[1]['version'])
self.assertEqual(1, mock_create_node.call_count)
def test_create_with_invalid_properties(self):


@ -12,6 +12,7 @@
from unittest import mock
from ironic.db.sqlalchemy.api import Connection as db_conn
from ironic import objects
from ironic.tests.unit.db import base as db_base
from ironic.tests.unit.db import utils as db_utils
@ -25,25 +26,26 @@ class TestNodeInventoryObject(db_base.DbTestCase, obj_utils.SchemasTestMixIn):
self.fake_inventory = db_utils.get_test_inventory()
def test_create(self):
with mock.patch.object(self.dbapi, 'create_node_inventory',
with mock.patch.object(db_conn, 'create_node_inventory',
autospec=True) as mock_db_create:
mock_db_create.return_value = self.fake_inventory
new_inventory = objects.NodeInventory(
self.context, **self.fake_inventory)
new_inventory.create()
mock_db_create.assert_called_once_with(self.fake_inventory)
mock_db_create.assert_called_once_with(mock.ANY,
self.fake_inventory)
def test_destroy(self):
node_id = self.fake_inventory['node_id']
with mock.patch.object(self.dbapi, 'get_node_inventory_by_node_id',
autospec=True) as mock_get:
mock_get.return_value = self.fake_inventory
with mock.patch.object(self.dbapi,
with mock.patch.object(db_conn,
'destroy_node_inventory_by_node_id',
autospec=True) as mock_db_destroy:
inventory = objects.NodeInventory.get_by_node_id(self.context,
node_id)
inventory.destroy()
mock_db_destroy.assert_called_once_with(node_id)
mock_db_destroy.assert_called_once_with(mock.ANY, node_id)


@ -21,6 +21,7 @@ from oslo_config import cfg
from testtools import matchers
from ironic.common import exception
from ironic.db.sqlalchemy.api import Connection as db_conn
from ironic import objects
from ironic.objects import base as obj_base
from ironic.tests.unit.db import base as db_base
@ -86,9 +87,9 @@ class TestPortObject(db_base.DbTestCase, obj_utils.SchemasTestMixIn):
def test_create(self):
port = objects.Port(self.context, **self.fake_port)
with mock.patch.object(self.dbapi, 'create_port',
with mock.patch.object(db_conn, 'create_port',
autospec=True) as mock_create_port:
with mock.patch.object(self.dbapi, 'get_port_by_id',
with mock.patch.object(db_conn, 'get_port_by_id',
autospec=True) as mock_get_port:
test_port = db_utils.get_test_port()
mock_create_port.return_value = test_port
@ -97,7 +98,7 @@ class TestPortObject(db_base.DbTestCase, obj_utils.SchemasTestMixIn):
port.create()
args, _kwargs = mock_create_port.call_args
self.assertEqual(objects.Port.VERSION, args[0]['version'])
self.assertEqual(objects.Port.VERSION, args[1]['version'])
def test_save(self):
uuid = self.fake_port['uuid']
@ -106,7 +107,7 @@ class TestPortObject(db_base.DbTestCase, obj_utils.SchemasTestMixIn):
with mock.patch.object(self.dbapi, 'get_port_by_uuid',
autospec=True) as mock_get_port:
mock_get_port.return_value = self.fake_port
with mock.patch.object(self.dbapi, 'update_port',
with mock.patch.object(db_conn, 'update_port',
autospec=True) as mock_update_port:
mock_update_port.return_value = (
db_utils.get_test_port(address=address,
@ -117,7 +118,9 @@ class TestPortObject(db_base.DbTestCase, obj_utils.SchemasTestMixIn):
mock_get_port.assert_called_once_with(uuid)
mock_update_port.assert_called_once_with(
uuid, {'version': objects.Port.VERSION,
mock.ANY,
uuid,
{'version': objects.Port.VERSION,
'address': "b2:54:00:cf:2d:40"})
self.assertEqual(self.context, p._context)
res_updated_at = (p.updated_at).replace(tzinfo=None)


@ -0,0 +1,25 @@
---
fixes:
  - |
    Adds a database write retry decorator for SQLite failures reporting
    "database is locked". By default, through the new configuration
    parameter ``[database]sqlite_max_wait_for_retry``, retries will be
    performed on failing write operations for up to *30* seconds.
    This value can be tuned, but be warned that it is an exponential
    backoff retry model, and HTTP requests can give up if no response
    is received in a reasonable time, thus *30* seconds was deemed a
    reasonable default.
    The retry logic can be disabled using the
    ``[database]sqlite_retries`` option, which defaults to *True*.
    Users of other, multi-threaded/concurrent-write database platforms
    are not impacted by this change, as the retry logic recognizes if
    another database platform is in use and bypasses the retry logic
    in that case. A similar retry concept already exists for other
    databases in the form of a "Database Deadlock" retry, where two
    writers conflict on the same row or table; the database
    abstraction layer already handles such deadlock conditions.
    The SQLite locking issue is unfortunately more common because
    SQLite locks the entire file, in other words the entire database,
    in order to perform a write operation.
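
For a rough sense of the backoff growth, a back-of-envelope sketch
(assuming tenacity's full-jitter formula, where retry *n* sleeps a
uniformly random time between zero and ``min(max, multiplier * 2**n)``
seconds):

    # Upper bound of the sleep before each retry, with multiplier=0.25
    # and the default 30 second cap from sqlite_max_wait_for_retry.
    multiplier, max_wait = 0.25, 30
    bounds = [min(max_wait, multiplier * 2 ** n) for n in range(1, 9)]
    print(bounds)  # [0.5, 1.0, 2.0, 4.0, 8.0, 16.0, 30, 30]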