From 0125c26aba2e1ee3292a8cb85cc3ed2a07b6494a Mon Sep 17 00:00:00 2001 From: Afonne-CID Date: Wed, 11 Jun 2025 20:20:34 +0100 Subject: [PATCH] Log executed steps during cleaning/servicing/deploy Log steps performed during step-based flows in Node History at the beginning and at completion (or abort). Closes-Bug: #2106758 Change-Id: Ieffacf174180036d6a2418a8faf72a94eea74fb8 Signed-off-by: Afonne-CID --- devstack/lib/ironic | 3 + ironic/conductor/cleaning.py | 29 +++++++++- ironic/conductor/deployments.py | 24 ++++++++ ironic/conductor/servicing.py | 18 ++++++ ironic/conductor/steps.py | 24 ++++---- ironic/conductor/utils.py | 58 +++++++++++++++++++ ironic/conf/conductor.py | 17 +++++- ironic/tests/unit/conductor/test_utils.py | 39 +++++++++++++ ...mic-step-based-flows-f0ef2f066f5d8127.yaml | 12 ++++ zuul.d/ironic-jobs.yaml | 1 + 10 files changed, 212 insertions(+), 13 deletions(-) create mode 100644 releasenotes/notes/log-steps-during-dynamic-step-based-flows-f0ef2f066f5d8127.yaml diff --git a/devstack/lib/ironic b/devstack/lib/ironic index 72891bf96b..f914af763f 100644 --- a/devstack/lib/ironic +++ b/devstack/lib/ironic @@ -182,6 +182,7 @@ IRONIC_EXTRA_PXE_PARAMS=${IRONIC_EXTRA_PXE_PARAMS:-} IRONIC_TTY_DEV=${IRONIC_TTY_DEV:-ttyS0,115200} IRONIC_USE_PORT_GROUP=$(trueorfalse False IRONIC_USE_PORT_GROUP) IRONIC_PORT_GROUP_MODE=${IRONIC_PORT_GROUP_MODE:-"balance-rr"} +IRONIC_LOG_STEPS_TO_SYSLOG=${IRONIC_LOG_STEPS_TO_SYSLOG:-False} IRONIC_TEMPEST_BUILD_TIMEOUT=${IRONIC_TEMPEST_BUILD_TIMEOUT:-${BUILD_TIMEOUT:-}} if [[ -n "$BUILD_TIMEOUT" ]]; then @@ -1904,6 +1905,8 @@ function configure_ironic_conductor { iniset $IRONIC_CONF_FILE conductor automated_clean $IRONIC_AUTOMATED_CLEAN_ENABLED + iniset $IRONIC_CONF_FILE conductor log_step_flows_to_syslog $IRONIC_LOG_STEPS_TO_SYSLOG + # configure enabled and default interfaces local iface local iface_var diff --git a/ironic/conductor/cleaning.py b/ironic/conductor/cleaning.py index cb17a6c62c..597eb41dd6 100644 --- a/ironic/conductor/cleaning.py +++ b/ironic/conductor/cleaning.py @@ -28,6 +28,11 @@ from ironic import objects LOG = log.getLogger(__name__) +def get_cleaning_flow(manual_clean=False): + return (utils.StepFlow.CLEANING_MANUAL if manual_clean + else utils.StepFlow.CLEANING_AUTO) + + @task_manager.require_exclusive_lock def do_node_clean(task, clean_steps=None, disable_ramdisk=False, automated_with_steps=False): @@ -44,7 +49,9 @@ def do_node_clean(task, clean_steps=None, disable_ramdisk=False, """ node = task.node manual_clean = clean_steps is not None and automated_with_steps is False - clean_type = 'manual' if manual_clean else 'automated' + + clean_type = get_cleaning_flow(manual_clean) + LOG.debug('Starting %(type)s cleaning for node %(node)s', {'type': clean_type, 'node': node.uuid}) @@ -135,6 +142,10 @@ def do_node_clean(task, clean_steps=None, disable_ramdisk=False, return utils.cleaning_error_handler(task, msg) steps = node.driver_internal_info.get('clean_steps', []) + + utils.log_step_flow_history(node=node, flow=clean_type, + steps=steps, status="start") + step_index = 0 if steps else None do_next_clean_step(task, step_index, disable_ramdisk=disable_ramdisk) @@ -157,6 +168,9 @@ def do_next_clean_step(task, step_index, disable_ramdisk=None): # For manual cleaning, the target provision state is MANAGEABLE, # whereas for automated cleaning, it is AVAILABLE. manual_clean = node.target_provision_state == states.MANAGEABLE + + clean_type = get_cleaning_flow(manual_clean) + if step_index is None: steps = [] else: @@ -234,6 +248,12 @@ def do_next_clean_step(task, step_index, disable_ramdisk=None): task.process_event('wait', target_state=target_state) return + utils.log_step_flow_history( + node=node, flow=clean_type, + steps=node.driver_internal_info.get("clean_steps", []), + status="end", aborted_step=step.get("step"), + ) + msg = (_('Node %(node)s failed step %(step)s: ' '%(exc)s') % {'node': node.uuid, 'exc': e, @@ -285,6 +305,13 @@ def do_next_clean_step(task, step_index, disable_ramdisk=None): tear_down_cleaning=False) utils.node_update_cache(task) LOG.info('Node %s cleaning complete', node.uuid) + + utils.log_step_flow_history( + node=node, flow=clean_type, + steps=node.driver_internal_info.get("clean_steps", []), + status="end" + ) + event = 'manage' if manual_clean or node.retired else 'done' # NOTE(rloo): No need to specify target prov. state; we're done task.process_event(event) diff --git a/ironic/conductor/deployments.py b/ironic/conductor/deployments.py index 1f856a9ef4..84a950daba 100644 --- a/ironic/conductor/deployments.py +++ b/ironic/conductor/deployments.py @@ -303,6 +303,12 @@ def do_node_deploy(task, conductor_id=None, configdrive=None, node.conductor_affinity = conductor_id node.save() + utils.log_step_flow_history( + node=node, flow=utils.StepFlow.DEPLOYMENT, + steps=node.driver_internal_info.get("deploy_steps", []), + status="start", + ) + do_next_deploy_step(task, 0) @@ -384,6 +390,12 @@ def do_next_deploy_step(task, step_index): task.process_event('wait') return except exception.IronicException as e: + utils.log_step_flow_history( + node=node, flow=utils.StepFlow.DEPLOYMENT, + steps=node.driver_internal_info.get("deploy_steps", []), + status="end", aborted_step=step.get("step"), + ) + if isinstance(e, exception.AgentConnectionFailed): if task.node.driver_internal_info.get( async_steps.DEPLOYMENT_REBOOT): @@ -408,6 +420,12 @@ def do_next_deploy_step(task, step_index): % {'step': conductor_steps.step_id(step), 'err': e}) return except Exception as e: + utils.log_step_flow_history( + node=node, flow=utils.StepFlow.DEPLOYMENT, + steps=node.driver_internal_info.get("deploy_steps", []), + status="end", aborted_step=step.get("step"), + ) + log_msg = ('Node %(node)s failed deploy step %(step)s with ' 'unexpected error: %(err)s' % {'node': node.uuid, 'step': node.deploy_step, 'err': e}) @@ -453,6 +471,12 @@ def do_next_deploy_step(task, step_index): LOG.info('Node %(node)s finished deploy step %(step)s', {'node': node.uuid, 'step': step}) + utils.log_step_flow_history( + node=node, flow=utils.StepFlow.DEPLOYMENT, + steps=node.driver_internal_info.get("deploy_steps", []), + status="end", + ) + # Finished executing the steps. Clear deploy_step. node.deploy_step = None utils.wipe_deploy_internal_info(task) diff --git a/ironic/conductor/servicing.py b/ironic/conductor/servicing.py index 6aaa4a58df..ad2d2b326c 100644 --- a/ironic/conductor/servicing.py +++ b/ironic/conductor/servicing.py @@ -76,6 +76,11 @@ def do_node_service(task, service_steps=None, disable_ramdisk=False): return utils.servicing_error_handler(task, msg) steps = node.driver_internal_info.get('service_steps', []) + + utils.log_step_flow_history( + node=node, flow=utils.StepFlow.SERVICING, + steps=steps, status="start") + step_index = 0 if steps else None for step in steps: step_requires_ramdisk = step.get('requires_ramdisk') @@ -194,6 +199,12 @@ def do_next_service_step(task, step_index, disable_ramdisk=None): task.process_event('wait') return + utils.log_step_flow_history( + node=node, flow=utils.StepFlow.SERVICING, + steps=node.driver_internal_info.get("service_steps", []), + status="end", aborted_step=step.get("step"), + ) + msg = (_('Node %(node)s failed step %(step)s: ' '%(exc)s') % {'node': node.uuid, 'exc': e, @@ -236,6 +247,13 @@ def _tear_down_node_service(task, disable_ramdisk): task.node.service_step = None utils.wipe_service_internal_info(task) task.node.save() + + utils.log_step_flow_history( + node=task.node, flow=utils.StepFlow.SERVICING, + steps=task.node.driver_internal_info.get("service_steps", []), + status="end" + ) + if not disable_ramdisk: try: task.driver.deploy.tear_down_service(task) diff --git a/ironic/conductor/steps.py b/ironic/conductor/steps.py index 9112e6dad2..373eda68c2 100644 --- a/ironic/conductor/steps.py +++ b/ironic/conductor/steps.py @@ -333,10 +333,11 @@ def set_node_cleaning_steps(task, disable_ramdisk=False, # For manual cleaning, the target provision state is MANAGEABLE, whereas # for automated cleaning, it is AVAILABLE. manual_clean = node.target_provision_state == states.MANAGEABLE - LOG.debug('List of the steps for %(type)s cleaning of node %(node)s: ' - '%(steps)s', {'type': 'manual' if manual_clean else 'automated', - 'node': node.uuid, - 'steps': steps}) + if not CONF.conductor.log_step_flows_to_syslog: + LOG.debug( + 'List of the steps for %(type)s cleaning of node %(node)s: ' + '%(steps)s', {'type': 'manual' if manual_clean else 'automated', + 'node': node.uuid, 'steps': steps}) node.clean_step = {} node.set_driver_internal_info('clean_steps', steps) @@ -475,10 +476,11 @@ def set_node_deployment_steps(task, reset_current=True, skip_missing=False): node.set_driver_internal_info('deploy_steps', _get_all_deployment_steps( task, skip_missing=skip_missing)) - LOG.debug('List of the deploy steps for node %(node)s: %(steps)s', { - 'node': node.uuid, - 'steps': node.driver_internal_info['deploy_steps'] - }) + if not CONF.conductor.log_step_flows_to_syslog: + LOG.debug('List of the deploy steps for node %(node)s: %(steps)s', { + 'node': node.uuid, + 'steps': node.driver_internal_info['deploy_steps'] + }) if reset_current: node.deploy_step = {} node.set_driver_internal_info('deploy_step_index', None) @@ -505,9 +507,9 @@ def set_node_service_steps(task, disable_ramdisk=False): steps = _validate_user_service_steps( task, node.driver_internal_info.get('service_steps', []), disable_ramdisk=disable_ramdisk) - LOG.debug('List of the steps for service of node %(node)s: ' - '%(steps)s', {'node': node.uuid, - 'steps': steps}) + if not CONF.conductor.log_step_flows_to_syslog: + LOG.debug('List of the steps for service of node %(node)s: ' + '%(steps)s', {'node': node.uuid, 'steps': steps}) node.service_step = {} node.set_driver_internal_info('service_steps', steps) diff --git a/ironic/conductor/utils.py b/ironic/conductor/utils.py index 33a724d1dd..f8177f3a60 100644 --- a/ironic/conductor/utils.py +++ b/ironic/conductor/utils.py @@ -14,6 +14,7 @@ import contextlib import datetime +from enum import Enum import functools import os import secrets @@ -54,6 +55,13 @@ PASSWORD_HASH_FORMAT = { } +class StepFlow(Enum): + CLEANING_MANUAL = 'manual_cleaning' + CLEANING_AUTO = 'automated_cleaning' + DEPLOYMENT = 'deployment' + SERVICING = 'servicing' + + @task_manager.require_exclusive_lock def node_set_boot_device(task, device, persistent=False): """Set the boot device for a node. @@ -2008,3 +2016,53 @@ def node_update_cache(task): node_cache_boot_mode(task) node_cache_bios_settings(task) node_cache_firmware_components(task) + + +def log_step_flow_history(node, flow, steps, status, aborted_step=None): + """Persist start/end milestones for cleaning / servicing / deploy flows. + + :param node: A Node object + :param flow: A StepFlow enum member, e.g. StepFlow.DEPLOYMENT + :param steps: full list of step dicts (may be empty/None) + :param status: ``'start'`` or ``'done'`` + :param aborted_step: name of failed step (optional) + """ + if not (CONF.conductor.record_step_flows_in_history + or CONF.conductor.log_step_flows_to_syslog): + return + + sanitized = [ + ( + dict(st, args=strutils.mask_dict_password(st["args"])) + if isinstance(st, dict) and isinstance(st.get("args"), dict) + else st + ) + for st in (steps or []) + ] + + error = False + if status == "start": + event_type = "%s_start" % flow.value + event = "Starting %s with steps=%s" % ( + flow.value.replace("_", " "), sanitized) + elif status == "end": + event_type = "%s_end" % flow.value + if aborted_step: + error = True + event = "%s aborted at step '%s' with steps=%s" % ( + flow.value.replace("_", " ").title(), + aborted_step, sanitized) + else: + event = "%s completed successfully with steps=%s" % ( + flow.value.replace("_", " ").title(), sanitized) + else: + raise ValueError("Unknown status '%s'" % status) + + if CONF.conductor.log_step_flows_to_syslog: + log_fn = LOG.warning if error else LOG.info + log_fn("Node %(node)s: %(event)s", + {'node': node.uuid, 'event': event}) + + if CONF.conductor.record_step_flows_in_history: + node_history_record(node, event=event, event_type=event_type, + error=error) diff --git a/ironic/conf/conductor.py b/ironic/conf/conductor.py index 1e48326d80..c74a7fe66a 100644 --- a/ironic/conf/conductor.py +++ b/ironic/conf/conductor.py @@ -652,7 +652,22 @@ opts = [ 'and fail the provisioning action that required a ' 'ramdisk and kernel. When set to False, Ironic will ' 'fallback to the next valid, consistent configured ' - 'ramdisk and kernel for the node.')) + 'ramdisk and kernel for the node.')), + cfg.BoolOpt( + "record_step_flows_in_history", + default=True, + help=( + "When True, the conductor writes a Node History entry at the " + "start and end of every cleaning/servicing/deploy-steps flow. " + "Disable this in very high-churn environments to reduce DB load." + )), + cfg.BoolOpt( + 'log_step_flows_to_syslog', + default=False, + help=( + "Log steps at the start/end of cleaning/servicing/deployment " + "to the conductor service log (WARNING for aborted/failure, " + "INFO otherwise.")), ] diff --git a/ironic/tests/unit/conductor/test_utils.py b/ironic/tests/unit/conductor/test_utils.py index 4a243cbc44..344a2acff7 100644 --- a/ironic/tests/unit/conductor/test_utils.py +++ b/ironic/tests/unit/conductor/test_utils.py @@ -3094,6 +3094,12 @@ class NodeHistoryRecordTestCase(db_base.DbTestCase): self.node = obj_utils.create_test_node( self.context, uuid=uuidutils.generate_uuid()) + self.steps = [ + { + 'step': 'erase_devices', + 'interface': 'deploy', + 'args': {'token': 'secret'}, + 'priority': 99}] def test_record_node_history(self): conductor_utils.node_history_record(self.node, event='meow') @@ -3143,6 +3149,39 @@ class NodeHistoryRecordTestCase(db_base.DbTestCase): mock_history.assert_not_called() mock_create.assert_not_called() + @mock.patch('ironic.conductor.utils.LOG', autospec=True) + @mock.patch('ironic.conductor.utils.node_history_record', autospec=True) + def test_logs_to_history_and_syslog(self, mock_history, mock_log): + self.config(record_step_flows_in_history=True, group='conductor') + self.config(log_step_flows_to_syslog=True, group='conductor') + + conductor_utils.log_step_flow_history( + self.node, flow=conductor_utils.StepFlow.DEPLOYMENT, + steps=self.steps, status='start') + + mock_history.assert_called_once() + mock_log.info.assert_called_once() + assert 'deployment_start' in mock_history.call_args.kwargs[ + 'event_type'] + assert self.node.uuid in mock_log.info.call_args.args[1]['node'] + + @mock.patch('ironic.conductor.utils.node_history_record', autospec=True) + def test_sanitizes_passwords_in_args(self, mock_history): + steps = [ + {'interface': 'clean', 'step': 'noop', + 'args': {'password': 'meow'}, 'priority': 10} + ] + + conductor_utils.log_step_flow_history( + self.node, flow=conductor_utils.StepFlow.CLEANING_AUTO, + steps=steps, status='start') + + mock_history.assert_called_once() + event_string = mock_history.call_args.kwargs['event'] + assert 'password' in event_string + assert '***' in event_string + assert 'meow' not in event_string + class GetTokenProjectFromRequestTestCase(db_base.DbTestCase): diff --git a/releasenotes/notes/log-steps-during-dynamic-step-based-flows-f0ef2f066f5d8127.yaml b/releasenotes/notes/log-steps-during-dynamic-step-based-flows-f0ef2f066f5d8127.yaml new file mode 100644 index 0000000000..845d9a2573 --- /dev/null +++ b/releasenotes/notes/log-steps-during-dynamic-step-based-flows-f0ef2f066f5d8127.yaml @@ -0,0 +1,12 @@ +--- +features: + - | + Start and end of dynamic step-based flows (automated cleaning, + manual cleaning, servicing, and deploy steps) are now logged to + Node History if ``[conductor]record_step_flows_in_history`` is enabled + (default: ``True``). This makes it easier to trace what steps were + attempted when troubleshooting deployment failures. + + A second knob which is disabled by default, + ``[conductor]log_step_flows_to_syslog``, allows emitting the same flow + events, but to the service log. diff --git a/zuul.d/ironic-jobs.yaml b/zuul.d/ironic-jobs.yaml index 8f4804af98..31cd8f017b 100644 --- a/zuul.d/ironic-jobs.yaml +++ b/zuul.d/ironic-jobs.yaml @@ -97,6 +97,7 @@ IRONIC_DEPLOY_DRIVER: ipmi IRONIC_INSPECTOR_BUILD_RAMDISK: False IRONIC_INSPECTOR_TEMPEST_INTROSPECTION_TIMEOUT: 1200 + IRONIC_LOG_STEPS_TO_SYSLOG: True IRONIC_TEMPEST_BUILD_TIMEOUT: 2000 IRONIC_TEMPEST_WHOLE_DISK_IMAGE: False IRONIC_VM_COUNT: 2