Don't actually heartbeat with sqlite
Disables internal heartbeat mechanism when ironic has been configured to utilize a SQLite database backend. This is done to lessen the possibility of a "database is locked" error, which can occur when two distinct threads attempt to write to the database at the same time with open writers. The process keepalive heartbeat process was identified as a major source of these write operations as it was writing every ten seconds by default, which would also collide with periodic tasks. Change-Id: I7b6d7a78ba2910f22673ad8e72e255f321d3fdff
This commit is contained in:
parent
54da324900
commit
0099d1812d
ironic
cmd
common
conductor
db/sqlalchemy
tests/unit
releasenotes/notes
@ -30,6 +30,17 @@ CONF = cfg.CONF
|
||||
LOG = log.getLogger(__name__)
|
||||
|
||||
|
||||
def _error_about_sqlite_usage():
|
||||
if 'sqlite' in CONF.database.connection.lower():
|
||||
# TODO(TheJulia): Make this a hard error in C*
|
||||
LOG.error('We have detected the API is being launched with a SQLite '
|
||||
'database backend. This is unsupported, and will be a hard '
|
||||
'error in the future. This is becaues multi-process use of '
|
||||
'a single SQLite database is problematic in terms of '
|
||||
'locking. A single process ironic model exists for use with '
|
||||
'SQLite.')
|
||||
|
||||
|
||||
def main():
|
||||
# Parse config file and command line options, then start logging
|
||||
ironic_service.prepare_service('ironic_api', sys.argv)
|
||||
@ -43,4 +54,5 @@ def main():
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
_error_about_sqlite_usage()
|
||||
sys.exit(main())
|
||||
|
@ -27,6 +27,7 @@ from oslo_service import service
|
||||
|
||||
from ironic.common import rpc_service
|
||||
from ironic.common import service as ironic_service
|
||||
from ironic.common import utils
|
||||
|
||||
CONF = cfg.CONF
|
||||
|
||||
@ -43,8 +44,24 @@ def warn_about_unsafe_shred_parameters(conf):
|
||||
'Secure Erase. This is a possible SECURITY ISSUE!')
|
||||
|
||||
|
||||
def warn_about_sqlite():
|
||||
# We are intentionally calling the helper here to ensure it caches
|
||||
# for all future calls.
|
||||
if utils.is_ironic_using_sqlite():
|
||||
LOG.warning('Ironic has been configured to utilize SQLite. '
|
||||
'This has some restrictions and impacts. You must run '
|
||||
'as as a single combined ironic process, and some '
|
||||
'internal mechanisms do not execute such as the hash '
|
||||
'ring will remain static and the conductor\'s '
|
||||
'``last_updated`` field will also not update. This is '
|
||||
'in order to minimize database locking issues present '
|
||||
'as a result of SQLAlchemy 2.0 and the removal of '
|
||||
'autocommit support.')
|
||||
|
||||
|
||||
def issue_startup_warnings(conf):
|
||||
warn_about_unsafe_shred_parameters(conf)
|
||||
warn_about_sqlite()
|
||||
|
||||
|
||||
def main():
|
||||
|
@ -21,6 +21,7 @@ from tooz import hashring
|
||||
|
||||
from ironic.common import exception
|
||||
from ironic.common.i18n import _
|
||||
from ironic.common import utils
|
||||
from ironic.conf import CONF
|
||||
from ironic.db import api as dbapi
|
||||
|
||||
@ -48,7 +49,11 @@ class HashRingManager(object):
|
||||
# Hot path, no lock. Using a local variable to avoid races with code
|
||||
# changing the class variable.
|
||||
hash_rings, updated_at = self.__class__._hash_rings
|
||||
if hash_rings is not None and updated_at >= limit:
|
||||
if (hash_rings is not None
|
||||
and (updated_at >= limit
|
||||
or utils.is_ironic_using_sqlite())):
|
||||
# Returning the hash ring for us, if it is still valid,
|
||||
# or if we're using sqlite.
|
||||
return hash_rings
|
||||
|
||||
with self._lock:
|
||||
|
@ -57,6 +57,8 @@ TZ_RE = r'((?P<tz_sign>[+-])(?P<tz_hour>\d{2}):(?P<tz_min>\d{2}))' + \
|
||||
DATETIME_RE = re.compile(
|
||||
'%sT%s(%s)?' % (DATE_RE, TIME_RE, TZ_RE))
|
||||
|
||||
USING_SQLITE = None
|
||||
|
||||
|
||||
def _get_root_helper():
|
||||
# NOTE(jlvillal): This function has been moved to ironic-lib. And is
|
||||
@ -724,3 +726,15 @@ def parse_kernel_params(params):
|
||||
else:
|
||||
result[key] = value
|
||||
return result
|
||||
|
||||
|
||||
def is_ironic_using_sqlite():
|
||||
"""Return True if Ironic is configured to use SQLite"""
|
||||
global USING_SQLITE
|
||||
if USING_SQLITE is not None:
|
||||
return USING_SQLITE
|
||||
|
||||
# We're being called for the first time, lets cache and
|
||||
# return the value.
|
||||
USING_SQLITE = 'sqlite' in CONF.database.connection.lower()
|
||||
return USING_SQLITE
|
||||
|
@ -34,6 +34,7 @@ from ironic.common.i18n import _
|
||||
from ironic.common import release_mappings as versions
|
||||
from ironic.common import rpc
|
||||
from ironic.common import states
|
||||
from ironic.common import utils as common_utils
|
||||
from ironic.conductor import allocations
|
||||
from ironic.conductor import notification_utils as notify_utils
|
||||
from ironic.conductor import task_manager
|
||||
@ -440,6 +441,10 @@ class BaseConductorManager(object):
|
||||
raise exception.NoFreeConductorWorker()
|
||||
|
||||
def _conductor_service_record_keepalive(self):
|
||||
if common_utils.is_ironic_using_sqlite():
|
||||
# Exit this keepalive heartbeats are disabled and not
|
||||
# considered.
|
||||
return
|
||||
while not self._keepalive_evt.is_set():
|
||||
try:
|
||||
self.conductor.touch()
|
||||
|
@ -355,10 +355,14 @@ def _paginate_query(model, limit=None, marker=None, sort_key=None,
|
||||
def _filter_active_conductors(query, interval=None):
|
||||
if interval is None:
|
||||
interval = CONF.conductor.heartbeat_timeout
|
||||
if not utils.is_ironic_using_sqlite() and interval > 0:
|
||||
# Check for greater than zero becaues if the value is zero,
|
||||
# then the logic makes no sense.
|
||||
limit = timeutils.utcnow() - datetime.timedelta(seconds=interval)
|
||||
|
||||
query = (query.filter(models.Conductor.online.is_(True))
|
||||
.filter(models.Conductor.updated_at >= limit))
|
||||
else:
|
||||
query = query.filter(models.Conductor.online.is_(True))
|
||||
return query
|
||||
|
||||
|
||||
@ -1433,10 +1437,16 @@ class Connection(api.Connection):
|
||||
def get_offline_conductors(self, field='hostname'):
|
||||
with _session_for_read() as session:
|
||||
field = getattr(models.Conductor, field)
|
||||
if not utils.is_ironic_using_sqlite():
|
||||
interval = CONF.conductor.heartbeat_timeout
|
||||
limit = timeutils.utcnow() - datetime.timedelta(seconds=interval)
|
||||
limit = (timeutils.utcnow()
|
||||
- datetime.timedelta(seconds=interval))
|
||||
result = (session.query(field)
|
||||
.filter(models.Conductor.updated_at < limit))
|
||||
else:
|
||||
result = session.query(
|
||||
field
|
||||
).filter(models.Conductor.online.is_(False))
|
||||
return [row[0] for row in result]
|
||||
|
||||
def get_online_conductors(self):
|
||||
|
@ -14,11 +14,13 @@
|
||||
# under the License.
|
||||
|
||||
import time
|
||||
from unittest import mock
|
||||
|
||||
from oslo_config import cfg
|
||||
|
||||
from ironic.common import exception
|
||||
from ironic.common import hash_ring
|
||||
from ironic.common import utils
|
||||
from ironic.tests.unit.db import base as db_base
|
||||
|
||||
CONF = cfg.CONF
|
||||
@ -92,7 +94,9 @@ class HashRingManagerTestCase(db_base.DbTestCase):
|
||||
self.register_conductors()
|
||||
self.ring_manager.get_ring('hardware-type', '')
|
||||
|
||||
def test_hash_ring_manager_reset_interval(self):
|
||||
@mock.patch.object(utils, 'is_ironic_using_sqlite', autospec=True)
|
||||
def test_hash_ring_manager_reset_interval(self, is_sqlite_mock):
|
||||
is_sqlite_mock.return_value = False
|
||||
CONF.set_override('hash_ring_reset_interval', 30)
|
||||
# Just to simplify calculations
|
||||
CONF.set_override('hash_partition_exponent', 0)
|
||||
@ -133,6 +137,53 @@ class HashRingManagerTestCase(db_base.DbTestCase):
|
||||
)
|
||||
ring = self.ring_manager.get_ring('hardware-type', '')
|
||||
self.assertEqual(2, len(ring))
|
||||
self.assertEqual(3, is_sqlite_mock.call_count)
|
||||
|
||||
@mock.patch.object(utils, 'is_ironic_using_sqlite', autospec=True)
|
||||
def test_hash_ring_manager_reset_interval_not_happen_sqlite(
|
||||
self, is_sqlite_mock):
|
||||
is_sqlite_mock.return_value = True
|
||||
CONF.set_override('hash_ring_reset_interval', 30)
|
||||
# Just to simplify calculations
|
||||
CONF.set_override('hash_partition_exponent', 0)
|
||||
c1 = self.dbapi.register_conductor({
|
||||
'hostname': 'host1',
|
||||
'drivers': ['driver1', 'driver2'],
|
||||
})
|
||||
c2 = self.dbapi.register_conductor({
|
||||
'hostname': 'host2',
|
||||
'drivers': ['driver1'],
|
||||
})
|
||||
self.dbapi.register_conductor_hardware_interfaces(
|
||||
c1.id,
|
||||
[{'hardware_type': 'hardware-type', 'interface_type': 'deploy',
|
||||
'interface_name': 'ansible', 'default': True},
|
||||
{'hardware_type': 'hardware-type', 'interface_type': 'deploy',
|
||||
'interface_name': 'direct', 'default': False}]
|
||||
)
|
||||
|
||||
ring = self.ring_manager.get_ring('hardware-type', '')
|
||||
self.assertEqual(1, len(ring))
|
||||
|
||||
self.dbapi.register_conductor_hardware_interfaces(
|
||||
c2.id,
|
||||
[{'hardware_type': 'hardware-type', 'interface_type': 'deploy',
|
||||
'interface_name': 'ansible', 'default': True},
|
||||
{'hardware_type': 'hardware-type', 'interface_type': 'deploy',
|
||||
'interface_name': 'direct', 'default': False}]
|
||||
)
|
||||
ring = self.ring_manager.get_ring('hardware-type', '')
|
||||
# The new conductor is not known yet. Automatic retry does not kick in,
|
||||
# since there is an active conductor for the requested hardware type.
|
||||
self.assertEqual(1, len(ring))
|
||||
|
||||
self.ring_manager.__class__._hash_rings = (
|
||||
self.ring_manager.__class__._hash_rings[0],
|
||||
time.monotonic() - 31
|
||||
)
|
||||
ring = self.ring_manager.get_ring('hardware-type', '')
|
||||
self.assertEqual(1, len(ring))
|
||||
self.assertEqual(2, is_sqlite_mock.call_count)
|
||||
|
||||
def test_hash_ring_manager_uncached(self):
|
||||
ring_mgr = hash_ring.HashRingManager(cache=False,
|
||||
|
@ -27,6 +27,7 @@ from oslo_utils import uuidutils
|
||||
from ironic.common import driver_factory
|
||||
from ironic.common import exception
|
||||
from ironic.common import states
|
||||
from ironic.common import utils as common_utils
|
||||
from ironic.conductor import base_manager
|
||||
from ironic.conductor import manager
|
||||
from ironic.conductor import notification_utils
|
||||
@ -322,12 +323,24 @@ class KeepAliveTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
|
||||
'is_set', autospec=True) as mock_is_set:
|
||||
mock_is_set.side_effect = [False, True]
|
||||
self.service._conductor_service_record_keepalive()
|
||||
mock_touch.assert_not_called()
|
||||
with mock.patch.object(self.service._keepalive_evt,
|
||||
'is_set', autospec=True) as mock_is_set:
|
||||
mock_is_set.side_effect = [False, True]
|
||||
with mock.patch.object(common_utils, 'is_ironic_using_sqlite',
|
||||
autospec=True) as mock_is_sqlite:
|
||||
mock_is_sqlite.return_value = False
|
||||
self.service._conductor_service_record_keepalive()
|
||||
self.assertEqual(1, mock_is_sqlite.call_count)
|
||||
mock_touch.assert_called_once_with(self.hostname)
|
||||
|
||||
def test__conductor_service_record_keepalive_failed_db_conn(self):
|
||||
@mock.patch.object(common_utils, 'is_ironic_using_sqlite', autospec=True)
|
||||
def test__conductor_service_record_keepalive_failed_db_conn(
|
||||
self, is_sqlite_mock):
|
||||
self._start_service()
|
||||
# avoid wasting time at the event.wait()
|
||||
CONF.set_override('heartbeat_interval', 0, 'conductor')
|
||||
is_sqlite_mock.return_value = False
|
||||
with mock.patch.object(self.dbapi, 'touch_conductor',
|
||||
autospec=True) as mock_touch:
|
||||
mock_touch.side_effect = [None, db_exception.DBConnectionError(),
|
||||
@ -337,11 +350,15 @@ class KeepAliveTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
|
||||
mock_is_set.side_effect = [False, False, False, True]
|
||||
self.service._conductor_service_record_keepalive()
|
||||
self.assertEqual(3, mock_touch.call_count)
|
||||
self.assertEqual(1, is_sqlite_mock.call_count)
|
||||
|
||||
def test__conductor_service_record_keepalive_failed_error(self):
|
||||
@mock.patch.object(common_utils, 'is_ironic_using_sqlite', autospec=True)
|
||||
def test__conductor_service_record_keepalive_failed_error(self,
|
||||
is_sqlite_mock):
|
||||
self._start_service()
|
||||
# avoid wasting time at the event.wait()
|
||||
# minimal time at the event.wait()
|
||||
CONF.set_override('heartbeat_interval', 0, 'conductor')
|
||||
is_sqlite_mock.return_value = False
|
||||
with mock.patch.object(self.dbapi, 'touch_conductor',
|
||||
autospec=True) as mock_touch:
|
||||
mock_touch.side_effect = [None, Exception(),
|
||||
@ -351,6 +368,7 @@ class KeepAliveTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
|
||||
mock_is_set.side_effect = [False, False, False, True]
|
||||
self.service._conductor_service_record_keepalive()
|
||||
self.assertEqual(3, mock_touch.call_count)
|
||||
self.assertEqual(1, is_sqlite_mock.call_count)
|
||||
|
||||
|
||||
class ManagerSpawnWorkerTestCase(tests_base.TestCase):
|
||||
|
@ -21,6 +21,7 @@ from unittest import mock
|
||||
from oslo_utils import timeutils
|
||||
|
||||
from ironic.common import exception
|
||||
from ironic.common import utils as common_utils
|
||||
from ironic.tests.unit.db import base
|
||||
from ironic.tests.unit.db import utils
|
||||
|
||||
@ -295,9 +296,11 @@ class DbConductorTestCase(base.DbTestCase):
|
||||
result = self.dbapi.get_active_hardware_type_dict()
|
||||
self.assertEqual(expected, result)
|
||||
|
||||
@mock.patch.object(common_utils, 'is_ironic_using_sqlite', autospec=True)
|
||||
@mock.patch.object(timeutils, 'utcnow', autospec=True)
|
||||
def test_get_active_hardware_type_dict_with_old_conductor(self,
|
||||
mock_utcnow):
|
||||
def test_get_active_hardware_type_dict_with_old_conductor(
|
||||
self, mock_utcnow, mock_is_sqlite):
|
||||
mock_is_sqlite.return_value = False
|
||||
past = datetime.datetime(2000, 1, 1, 0, 0)
|
||||
present = past + datetime.timedelta(minutes=2)
|
||||
|
||||
@ -326,9 +329,12 @@ class DbConductorTestCase(base.DbTestCase):
|
||||
expected = {ht: {h1, h2}, ht1: {h1}, ht2: {h2}}
|
||||
result = self.dbapi.get_active_hardware_type_dict()
|
||||
self.assertEqual(expected, result)
|
||||
self.assertEqual(2, mock_is_sqlite.call_count)
|
||||
|
||||
@mock.patch.object(common_utils, 'is_ironic_using_sqlite', autospec=True)
|
||||
@mock.patch.object(timeutils, 'utcnow', autospec=True)
|
||||
def test_get_offline_conductors(self, mock_utcnow):
|
||||
def test_get_offline_conductors(self, mock_utcnow, mock_is_sqlite):
|
||||
mock_is_sqlite.return_value = False
|
||||
self.config(heartbeat_timeout=60, group='conductor')
|
||||
time_ = datetime.datetime(2000, 1, 1, 0, 0)
|
||||
|
||||
@ -344,9 +350,34 @@ class DbConductorTestCase(base.DbTestCase):
|
||||
mock_utcnow.return_value = time_ + datetime.timedelta(seconds=61)
|
||||
self.assertEqual([c.hostname], self.dbapi.get_offline_conductors())
|
||||
self.assertEqual([c.id], self.dbapi.get_offline_conductors(field='id'))
|
||||
self.assertEqual(3, mock_is_sqlite.call_count)
|
||||
|
||||
@mock.patch.object(common_utils, 'is_ironic_using_sqlite', autospec=True)
|
||||
@mock.patch.object(timeutils, 'utcnow', autospec=True)
|
||||
def test_get_online_conductors(self, mock_utcnow):
|
||||
def test_get_offline_conductors_with_sqlite(self, mock_utcnow,
|
||||
mock_is_sqlite):
|
||||
mock_is_sqlite.return_value = True
|
||||
self.config(heartbeat_timeout=60, group='conductor')
|
||||
time_ = datetime.datetime(2000, 1, 1, 0, 0)
|
||||
|
||||
mock_utcnow.return_value = time_
|
||||
self._create_test_cdr()
|
||||
|
||||
# Only 30 seconds passed since last heartbeat, it's still
|
||||
# considered alive
|
||||
mock_utcnow.return_value = time_ + datetime.timedelta(seconds=30)
|
||||
self.assertEqual([], self.dbapi.get_offline_conductors())
|
||||
|
||||
# 61 seconds passed since last heartbeat, it's dead
|
||||
mock_utcnow.return_value = time_ + datetime.timedelta(seconds=61)
|
||||
self.assertEqual([], self.dbapi.get_offline_conductors())
|
||||
self.assertEqual([], self.dbapi.get_offline_conductors(field='id'))
|
||||
self.assertEqual(3, mock_is_sqlite.call_count)
|
||||
|
||||
@mock.patch.object(common_utils, 'is_ironic_using_sqlite', autospec=True)
|
||||
@mock.patch.object(timeutils, 'utcnow', autospec=True)
|
||||
def test_get_online_conductors(self, mock_utcnow, mock_is_sqlite):
|
||||
mock_is_sqlite.return_value = False
|
||||
self.config(heartbeat_timeout=60, group='conductor')
|
||||
time_ = datetime.datetime(2000, 1, 1, 0, 0)
|
||||
|
||||
@ -361,6 +392,26 @@ class DbConductorTestCase(base.DbTestCase):
|
||||
# 61 seconds passed since last heartbeat, it's dead
|
||||
mock_utcnow.return_value = time_ + datetime.timedelta(seconds=61)
|
||||
self.assertEqual([], self.dbapi.get_online_conductors())
|
||||
self.assertEqual(2, mock_is_sqlite.call_count)
|
||||
|
||||
@mock.patch.object(timeutils, 'utcnow', autospec=True)
|
||||
def test_get_online_conductors_with_sqlite(self, mock_utcnow):
|
||||
# NOTE(TheJulia): Explicitly skipping the mock so the underlying
|
||||
# code on the test is 'tested'.
|
||||
self.config(heartbeat_timeout=60, group='conductor')
|
||||
time_ = datetime.datetime(2000, 1, 1, 0, 0)
|
||||
|
||||
mock_utcnow.return_value = time_
|
||||
c = self._create_test_cdr()
|
||||
|
||||
# Only 30 seconds passed since last heartbeat, it's still
|
||||
# considered alive
|
||||
mock_utcnow.return_value = time_ + datetime.timedelta(seconds=30)
|
||||
self.assertEqual([c.hostname], self.dbapi.get_online_conductors())
|
||||
|
||||
# 61 seconds passed since last heartbeat, it's dead
|
||||
mock_utcnow.return_value = time_ + datetime.timedelta(seconds=61)
|
||||
self.assertEqual([c.hostname], self.dbapi.get_online_conductors())
|
||||
|
||||
@mock.patch.object(timeutils, 'utcnow', autospec=True)
|
||||
def test_list_hardware_type_interfaces(self, mock_utcnow):
|
||||
@ -415,5 +466,14 @@ class DbConductorTestCase(base.DbTestCase):
|
||||
|
||||
# 61 seconds passed since last heartbeat, it's dead
|
||||
mock_utcnow.return_value = time_ + datetime.timedelta(seconds=61)
|
||||
with mock.patch.object(common_utils, 'is_ironic_using_sqlite',
|
||||
autospec=True) as mock_is_sqlite:
|
||||
mock_is_sqlite.return_value = False
|
||||
result = self.dbapi.list_hardware_type_interfaces([ht1, ht2])
|
||||
self.assertEqual(1, mock_is_sqlite.call_count)
|
||||
self.assertEqual([], result)
|
||||
|
||||
# Validate we still have four entries with SQLite, which is the
|
||||
# default for all unit tests running.
|
||||
result = self.dbapi.list_hardware_type_interfaces([ht1, ht2])
|
||||
self.assertEqual(4, len(result))
|
||||
|
@ -0,0 +1,13 @@
|
||||
---
|
||||
deprecations:
|
||||
- |
|
||||
The use of a SQLite database with mutli-process (i.e. ``ironic-api`` and
|
||||
``ironic-conductor`` services) is not supported, and the ability to launch
|
||||
a dedicated ``ironic-api`` process with a SQLite database backend will
|
||||
be an error in the future. In this case, the single process combined
|
||||
API and Conductor service should be utilized.
|
||||
fixes:
|
||||
- |
|
||||
Database locks with a ``sqlite`` database backend should now be lessened
|
||||
as the conductor will no longer perform a keepalive heartbeat operation
|
||||
when the use of SQLite has been detected.
|
Loading…
x
Reference in New Issue
Block a user