From 6135a00c3b817f9bb548420bbcc34cd3b2eb3e9d Mon Sep 17 00:00:00 2001 From: Afonne-CID Date: Mon, 4 Aug 2025 17:33:51 +0100 Subject: [PATCH] Add periodic cleanup of stale conductors A new periodic task to automatically remove conductor records that have been offline for longer than a configured timeout period. This addresses the issue where deleted or decommissioned conductors would remain in the database indefinitely. Closes-Bug: #2069771 Assisted-by: Claude Sonnet 4.0 Change-Id: I90eb159abad94d8369b8792fa17c20d80201569a Signed-off-by: Afonne-CID --- ironic/conductor/manager.py | 112 ++++++++++++++++++ ironic/conf/conductor.py | 26 ++++ ironic/db/api.py | 8 ++ ironic/db/sqlalchemy/api.py | 35 ++++++ ironic/objects/conductor.py | 11 ++ ironic/tests/unit/conductor/test_manager.py | 89 ++++++++++++++ ironic/tests/unit/db/test_conductor.py | 57 +++++++++ ...nup-stale-conductors-654c2bcc4ffb4c43.yaml | 32 +++++ 8 files changed, 370 insertions(+) create mode 100644 releasenotes/notes/cleanup-stale-conductors-654c2bcc4ffb4c43.yaml diff --git a/ironic/conductor/manager.py b/ironic/conductor/manager.py index 53f4fa277d..e434c1a3a8 100644 --- a/ironic/conductor/manager.py +++ b/ironic/conductor/manager.py @@ -41,6 +41,7 @@ notifying Neutron of a change, etc. """ import collections +import datetime import queue import time @@ -62,6 +63,7 @@ from ironic.common import network from ironic.common import nova from ironic.common import rpc from ironic.common import states +from ironic.common import utils as common_utils from ironic.conductor import allocations from ironic.conductor import base_manager from ironic.conductor import cleaning @@ -3780,6 +3782,116 @@ class ConductorManager(base_manager.BaseConductorManager): LOG.error('Encountered error while cleaning node ' 'history records: %s', e) + @METRICS.timer('ConductorManager.cleanup_stale_conductors') + @periodics.periodic( + spacing=CONF.conductor.conductor_cleanup_interval, + enabled=CONF.conductor.conductor_cleanup_interval > 0 + ) + def cleanup_stale_conductors(self, context): + """Periodically clean up stale conductors from the database. + + This task removes conductors that have been offline for longer than + the configured timeout period. This helps prevent accumulation of + stale conductor records in the database. + """ + try: + cleanup_timeout = CONF.conductor.conductor_cleanup_timeout + heartbeat_timeout = CONF.conductor.heartbeat_timeout + + if heartbeat_timeout <= 0: + LOG.warning( + 'Skipping stale conductor cleanup due to invalid ' + 'configuration: heartbeat_timeout is invalid (%s).', + heartbeat_timeout + ) + return + + # We require conductor_cleanup_timeout to be at least 3x + # heartbeat_timeout to provide a significant safety margin. + # This ensures that active conductors won't be mistakenly + # removed from the database. + min_required = heartbeat_timeout * 3 + + if cleanup_timeout < min_required: + error_msg = _( + 'Skipping stale conductor cleanup due to invalid ' + 'configuration: [conductor]conductor_cleanup_timeout ' + '(%(cleanup_timeout)s) must be at least 3x ' + '[conductor]heartbeat_timeout (%(heartbeat_timeout)s) ' + '(recommended minimum: %(min_required)s). This is ' + 'required to prevent active conductors from being ' + 'mistakenly removed.') % { + 'cleanup_timeout': cleanup_timeout, + 'heartbeat_timeout': heartbeat_timeout, + 'min_required': min_required} + LOG.warning(error_msg) + return + + self._cleanup_stale_conductors(context) + except Exception as e: + LOG.error( + 'Encountered error while cleaning up stale conductors: %s', e) + + def _cleanup_stale_conductors(self, context): + """Clean up stale conductors from the database. + + :param context: request context. + """ + timeout = CONF.conductor.conductor_cleanup_timeout + batch_size = CONF.conductor.conductor_cleanup_batch_size + + # Get conductors that have been offline for longer than the timeout + if not common_utils.is_ironic_using_sqlite(): + + # For non-SQLite databases, we need to check the updated_at + # timestamp because the database may be shared by multiple + # conductors and we want to avoid deleting records for conductors + # that may still be alive but have not updated their heartbeat + # recently enough. + limit = (timeutils.utcnow() - datetime.timedelta(seconds=timeout)) + stale_conductors = self.dbapi.get_offline_conductors() + + # Filter by timestamp + conductors_to_delete = [] + for hostname in stale_conductors: + try: + conductor = objects.Conductor.get_by_hostname( + context, hostname, online=None) + if conductor.updated_at < limit: + conductors_to_delete.append(hostname) + if len(conductors_to_delete) >= batch_size: + break + except exception.ConductorNotFound: + # Conductor was already deleted, skip + continue + else: + # For SQLite, just get offline conductors + stale_conductors = self.dbapi.get_offline_conductors() + conductors_to_delete = stale_conductors[:batch_size] + + if not conductors_to_delete: + return + + LOG.info('Cleaning up %(count)d stale conductors: %(conductors)s', + {'count': len(conductors_to_delete), + 'conductors': conductors_to_delete}) + + deleted_count = 0 + for hostname in conductors_to_delete: + try: + self.dbapi.delete_conductor(hostname) + deleted_count += 1 + except exception.ConductorNotFound: + # Conductor was already deleted by another process + continue + except Exception as e: + LOG.error('Failed to delete conductor %(hostname)s: %(error)s', + {'hostname': hostname, 'error': e}) + + if deleted_count > 0: + LOG.info('Successfully cleaned up %(count)d stale conductors', + {'count': deleted_count}) + def _manage_node_history(self, context): """Periodic task to keep the node history tidy.""" max_batch = CONF.conductor.node_history_cleanup_batch_count diff --git a/ironic/conf/conductor.py b/ironic/conf/conductor.py index c74a7fe66a..9ea5c83763 100644 --- a/ironic/conf/conductor.py +++ b/ironic/conf/conductor.py @@ -404,6 +404,32 @@ opts = [ 'node_history_max_entries setting as users of ' 'this setting are anticipated to need to retain ' 'history by policy.')), + cfg.IntOpt('conductor_cleanup_interval', + min=0, + default=86400, + mutable=False, + help=_('Interval in seconds at which stale conductor entries ' + 'can be cleaned up from the database. Setting to 0 ' + 'disables the periodic task. Defaults to 86400 (1 day).' + )), + cfg.IntOpt('conductor_cleanup_timeout', + min=60, + default=1209600, + mutable=True, + help=_('Timeout in seconds after which offline conductors ' + 'are considered stale and can be cleaned up from the ' + 'database. It defaults to two weeks (1209600 seconds) ' + 'and is always required to be at least 3x larger than ' + '[conductor]heartbeat_timeout since if otherwise, ' + 'active conductors might be mistakenly removed from ' + 'the database.')), + cfg.IntOpt('conductor_cleanup_batch_size', + min=1, + default=50, + mutable=True, + help=_('The maximum number of stale conductor records to clean ' + 'up from the database in a single cleanup operation. ' + 'Defaults to 50.')), cfg.MultiOpt('verify_step_priority_override', item_type=types.Dict(), default={}, diff --git a/ironic/db/api.py b/ironic/db/api.py index 10d2dcd1d4..24bd70940c 100644 --- a/ironic/db/api.py +++ b/ironic/db/api.py @@ -594,6 +594,14 @@ class Connection(object, metaclass=abc.ABCMeta): :raises: ConductorNotFound """ + @abc.abstractmethod + def delete_conductor(self, hostname): + """Delete a conductor from the database. + + :param hostname: The hostname of this conductor service. + :raises: ConductorNotFound if the conductor doesn't exist. + """ + @abc.abstractmethod def touch_conductor(self, hostname, online=True): """Mark a conductor as active by updating its 'updated_at' property. diff --git a/ironic/db/sqlalchemy/api.py b/ironic/db/sqlalchemy/api.py index 51e3fcabb5..94067aff84 100644 --- a/ironic/db/sqlalchemy/api.py +++ b/ironic/db/sqlalchemy/api.py @@ -1457,6 +1457,41 @@ class Connection(api.Connection): if count == 0: raise exception.ConductorNotFound(conductor=hostname) + @oslo_db_api.retry_on_deadlock + def delete_conductor(self, hostname): + with _session_for_write() as session: + + # Before deleting the conductor, we clear any node + # reservations to prevent nodes from being left in a reserved + # state by a non-existent conductor which could block other + # conductors from managing those nodes. + self.clear_node_reservations_for_conductor(hostname) + + # Reset state to prevent transitional or inconsistent states and + # orphaned power state requests after deletion. + self.clear_node_target_power_state(hostname) + + # Delete conductor hardware interfaces + query = sa.delete(models.ConductorHardwareInterfaces).where( + models.ConductorHardwareInterfaces.conductor_id == ( + session.query(models.Conductor.id).where( + models.Conductor.hostname == hostname + ).scalar_subquery() + ) + ) + session.execute(query) + + query = sa.delete(models.Conductor).where( + models.Conductor.hostname == hostname + ) + result = session.execute(query) + count = result.rowcount + if count == 0: + raise exception.ConductorNotFound(conductor=hostname) + + LOG.info('Deleted conductor with hostname %(hostname)s', + {'hostname': hostname}) + @oslo_db_api.retry_on_deadlock def touch_conductor(self, hostname, online=True): with _session_for_write() as session: diff --git a/ironic/objects/conductor.py b/ironic/objects/conductor.py index a11e19df8b..eefe2ea93b 100644 --- a/ironic/objects/conductor.py +++ b/ironic/objects/conductor.py @@ -158,6 +158,17 @@ class Conductor(base.IronicObject, object_base.VersionedObjectDictCompat): self.unregister_all_hardware_interfaces() self.dbapi.unregister_conductor(self.hostname) + @classmethod + def delete(cls, context, hostname): + """Delete a conductor from the database. + + :param cls: the :class:`Conductor` + :param context: Security context + :param hostname: the hostname of the conductor to delete + :raises: ConductorNotFound if the conductor doesn't exist + """ + cls.dbapi.delete_conductor(hostname) + def register_hardware_interfaces(self, interfaces): """Register hardware interfaces with the conductor. diff --git a/ironic/tests/unit/conductor/test_manager.py b/ironic/tests/unit/conductor/test_manager.py index e358853f71..6969845341 100644 --- a/ironic/tests/unit/conductor/test_manager.py +++ b/ironic/tests/unit/conductor/test_manager.py @@ -9229,3 +9229,92 @@ class VirtualMediaTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase): self.node.refresh() self.assertIn("Could not attach device cdrom", self.node.last_error) self.assertIn("disabled or not implemented", self.node.last_error) + + +class ConductorCleanupTestCase(mgr_utils.ServiceSetUpMixin, + db_base.DbTestCase): + """Test stale conductors cleanup.""" + def test_cleanup_stale_conductors(self): + """Test the periodic cleanup of stale conductors.""" + self._start_service() + + self.conductor1 = self.dbapi.register_conductor({ + 'hostname': 'conductor-1', + 'drivers': ['fake'], + 'conductor_group': 'group1' + }) + self.conductor2 = self.dbapi.register_conductor({ + 'hostname': 'conductor-2', + 'drivers': ['fake'], + 'conductor_group': 'group2' + }) + + self.dbapi.touch_conductor('conductor-1', online=False) + + with mock.patch.object(self.dbapi, 'get_offline_conductors', + return_value=['conductor-1'], autospec=True): + self.service._cleanup_stale_conductors(self.context) + + self.assertRaises(exception.ConductorNotFound, + self.dbapi.get_conductor, 'conductor-1') + + conductor2 = self.dbapi.get_conductor('conductor-2') + self.assertEqual('conductor-2', conductor2.hostname) + + def test_cleanup_stale_conductors_disabled(self): + """Test that cleanup is disabled when configured to do so.""" + self._start_service() + + conductors = [] + for i in range(5): + self.dbapi.register_conductor({ + 'hostname': 'conductor-%d' % i, + 'drivers': ['fake'], + 'conductor_group': 'group%d' % i + }) + self.dbapi.touch_conductor('conductor-%d' % i, online=False) + conductors.append('conductor-%d' % i) + + self.config(conductor_cleanup_batch_size=2, group='conductor') + + with mock.patch.object(self.dbapi, 'get_offline_conductors', + return_value=conductors, autospec=True): + with mock.patch.object(self.dbapi, 'delete_conductor', + autospec=True) as mock_delete: + self.service._cleanup_stale_conductors(self.context) + + self.assertEqual(2, mock_delete.call_count) + + def _run_cleanup_stale_conductors_and_check_warning( + self, heartbeat_timeout, conductor_cleanup_timeout, + expected_warning_substring=None): + self._start_service() + self.config(heartbeat_timeout=heartbeat_timeout, group='conductor') + self.config(conductor_cleanup_timeout=conductor_cleanup_timeout, + group='conductor') + self.config(conductor_cleanup_batch_size=2, group='conductor') + with mock.patch.object(self.dbapi, 'get_offline_conductors', + return_value=[], autospec=True): + with mock.patch('ironic.conductor.manager.LOG', + autospec=True) as mock_log: + self.service.cleanup_stale_conductors(self.context) + if expected_warning_substring is None: + mock_log.warning.assert_not_called() + else: + mock_log.warning.assert_called_once() + self.assertIn(expected_warning_substring, + mock_log.warning.call_args[0][0]) + + def test_cleanup_stale_conductors_configs_validation(self): + test_cases = [ + (60, 180, None), # valid: exactly 3x + (60, 200, None), # valid: greater than 3x + (0, 180, 'heartbeat_timeout is invalid'), + (60, 120, 'conductor_cleanup_timeout (120) must be at least 3x'), + ] + for heartbeat_timeout, cleanup_timeout, expected_warning in test_cases: + self._run_cleanup_stale_conductors_and_check_warning( + heartbeat_timeout=heartbeat_timeout, + conductor_cleanup_timeout=cleanup_timeout, + expected_warning_substring=expected_warning + ) diff --git a/ironic/tests/unit/db/test_conductor.py b/ironic/tests/unit/db/test_conductor.py index 34bab3fe20..4e97291d95 100644 --- a/ironic/tests/unit/db/test_conductor.py +++ b/ironic/tests/unit/db/test_conductor.py @@ -498,3 +498,60 @@ class DbConductorTestCase(base.DbTestCase): # default for all unit tests running. result = self.dbapi.list_hardware_type_interfaces([ht1, ht2]) self.assertEqual(4, len(result)) + + def test_delete_conductor(self): + self.dbapi.register_conductor( + {'hostname': 'test-conductor', + 'drivers': ['fake'], + 'conductor_group': 'test-group'}) + + self.dbapi.get_conductor('test-conductor') + self.dbapi.delete_conductor('test-conductor') + + self.assertRaises(exception.ConductorNotFound, + self.dbapi.get_conductor, 'test-conductor') + + def test_delete_conductor_not_found(self): + self.assertRaises(exception.ConductorNotFound, + self.dbapi.delete_conductor, + 'fake-hostname') + + def test_delete_conductor_clears_reservations(self): + self.dbapi.register_conductor( + {'hostname': 'test-conductor', + 'drivers': ['fake'], + 'conductor_group': 'test-group'}) + + node = self.dbapi.create_node({ + 'uuid': 'test-node-uuid', + 'driver': 'fake', + 'reservation': 'test-conductor' + }) + + self.assertEqual('test-conductor', node.reservation) + self.dbapi.delete_conductor('test-conductor') + + node = self.dbapi.get_node_by_uuid('test-node-uuid') + self.assertIsNone(node.reservation) + + def test_delete_conductor_clears_hardware_interfaces(self): + conductor = self.dbapi.register_conductor( + {'hostname': 'test-conductor', + 'drivers': ['fake'], + 'conductor_group': 'test-group'}) + + self.dbapi.register_conductor_hardware_interfaces( + conductor['id'], + [{'hardware_type': 'fake', + 'interface_type': 'deploy', + 'interface_name': 'direct', + 'default': True}]) + + interfaces = self.dbapi.list_conductor_hardware_interfaces( + conductor['id']) + self.assertEqual(1, len(interfaces)) + + self.dbapi.delete_conductor('test-conductor') + interfaces = self.dbapi.list_conductor_hardware_interfaces( + conductor['id']) + self.assertEqual([], interfaces) diff --git a/releasenotes/notes/cleanup-stale-conductors-654c2bcc4ffb4c43.yaml b/releasenotes/notes/cleanup-stale-conductors-654c2bcc4ffb4c43.yaml new file mode 100644 index 0000000000..af64fccf83 --- /dev/null +++ b/releasenotes/notes/cleanup-stale-conductors-654c2bcc4ffb4c43.yaml @@ -0,0 +1,32 @@ +--- +features: + - | + Adds periodic cleanup of stale conductor entries from the database. + A new periodic task automatically removes conductor records that have + been offline for longer than the configured timeout period, helping + prevent accumulation of stale conductor entries. + + Three new configuration options have been added to the ``[conductor]`` + section: + + * ``conductor_cleanup_interval`` - The interval in seconds of how often + to run the cleanup task (default: 86400 seconds - 1 day). + For example: 86400 seconds = 1 day, 604800 seconds = 1 week, + 2592000 seconds = 30 days (approx. 1 month). + * ``conductor_cleanup_timeout`` - How long a conductor must be offline + before it's considered stale and eligible for cleanup + (default: 1209600 seconds - 2 weeks). This value is always required to be + at least 3x larger than ``[conductor]heartbeat_timeout`` since if + otherwise, active conductors might be mistakenly removed from the + database. The cleanup task will skip execution and log a warning if + this requirement is not met. + * ``conductor_cleanup_batch_size`` - Maximum number of stale conductors + to clean up in a single operation (default: 50). + +fixes: + - | + [`Bug 2069771 `_] + Fixes an issue where deleted or decommissioned conductors would remain + in ``openstack baremetal conductor list`` indefinitely with + ``Alive = False`` status. The new periodic cleanup task automatically + removes these stale entries after the configured timeout period.