Log executed steps during cleaning/servicing/deploy

Log steps performed during step-based flows in Node History at the beginning and at completion (or abort). Closes-Bug: #2106758 Change-Id: Ieffacf174180036d6a2418a8faf72a94eea74fb8 Signed-off-by: Afonne-CID <afonnepaulc@gmail.com>
2025-06-11 20:20:34 +01:00
parent 51dd388d12
commit 0125c26aba
10 changed files with 212 additions and 13 deletions
--- a/devstack/lib/ironic
+++ b/devstack/lib/ironic
@@ -182,6 +182,7 @@ IRONIC_EXTRA_PXE_PARAMS=${IRONIC_EXTRA_PXE_PARAMS:-}
 IRONIC_TTY_DEV=${IRONIC_TTY_DEV:-ttyS0,115200}
 IRONIC_USE_PORT_GROUP=$(trueorfalse False IRONIC_USE_PORT_GROUP)
 IRONIC_PORT_GROUP_MODE=${IRONIC_PORT_GROUP_MODE:-"balance-rr"}
+IRONIC_LOG_STEPS_TO_SYSLOG=${IRONIC_LOG_STEPS_TO_SYSLOG:-False}

 IRONIC_TEMPEST_BUILD_TIMEOUT=${IRONIC_TEMPEST_BUILD_TIMEOUT:-${BUILD_TIMEOUT:-}}
 if [[ -n "$BUILD_TIMEOUT" ]]; then
@@ -1904,6 +1905,8 @@ function configure_ironic_conductor {

    iniset $IRONIC_CONF_FILE conductor automated_clean $IRONIC_AUTOMATED_CLEAN_ENABLED

+    iniset $IRONIC_CONF_FILE conductor log_step_flows_to_syslog $IRONIC_LOG_STEPS_TO_SYSLOG
+
    # configure enabled and default interfaces
    local iface
    local iface_var
--- a/ironic/conductor/cleaning.py
+++ b/ironic/conductor/cleaning.py
@@ -28,6 +28,11 @@ from ironic import objects
 LOG = log.getLogger(__name__)


+def get_cleaning_flow(manual_clean=False):
+    return (utils.StepFlow.CLEANING_MANUAL if manual_clean
+            else utils.StepFlow.CLEANING_AUTO)
+
+
@task_manager.require_exclusive_lock
 def do_node_clean(task, clean_steps=None, disable_ramdisk=False,
                  automated_with_steps=False):
@@ -44,7 +49,9 @@ def do_node_clean(task, clean_steps=None, disable_ramdisk=False,
    """
    node = task.node
    manual_clean = clean_steps is not None and automated_with_steps is False
-    clean_type = 'manual' if manual_clean else 'automated'
+
+    clean_type = get_cleaning_flow(manual_clean)
+
    LOG.debug('Starting %(type)s cleaning for node %(node)s',
              {'type': clean_type, 'node': node.uuid})

@@ -135,6 +142,10 @@ def do_node_clean(task, clean_steps=None, disable_ramdisk=False,
        return utils.cleaning_error_handler(task, msg)

    steps = node.driver_internal_info.get('clean_steps', [])
+
+    utils.log_step_flow_history(node=node, flow=clean_type,
+                                steps=steps, status="start")
+
    step_index = 0 if steps else None
    do_next_clean_step(task, step_index, disable_ramdisk=disable_ramdisk)

@@ -157,6 +168,9 @@ def do_next_clean_step(task, step_index, disable_ramdisk=None):
    # For manual cleaning, the target provision state is MANAGEABLE,
    # whereas for automated cleaning, it is AVAILABLE.
    manual_clean = node.target_provision_state == states.MANAGEABLE
+
+    clean_type = get_cleaning_flow(manual_clean)
+
    if step_index is None:
        steps = []
    else:
@@ -234,6 +248,12 @@ def do_next_clean_step(task, step_index, disable_ramdisk=None):
                task.process_event('wait', target_state=target_state)
                return

+            utils.log_step_flow_history(
+                node=node, flow=clean_type,
+                steps=node.driver_internal_info.get("clean_steps", []),
+                status="end", aborted_step=step.get("step"),
+            )
+
            msg = (_('Node %(node)s failed step %(step)s: '
                     '%(exc)s') %
                   {'node': node.uuid, 'exc': e,
@@ -285,6 +305,13 @@ def do_next_clean_step(task, step_index, disable_ramdisk=None):
                                                tear_down_cleaning=False)
    utils.node_update_cache(task)
    LOG.info('Node %s cleaning complete', node.uuid)
+
+    utils.log_step_flow_history(
+        node=node, flow=clean_type,
+        steps=node.driver_internal_info.get("clean_steps", []),
+        status="end"
+    )
+
    event = 'manage' if manual_clean or node.retired else 'done'
    # NOTE(rloo): No need to specify target prov. state; we're done
    task.process_event(event)
--- a/ironic/conductor/deployments.py
+++ b/ironic/conductor/deployments.py
@@ -303,6 +303,12 @@ def do_node_deploy(task, conductor_id=None, configdrive=None,
        node.conductor_affinity = conductor_id
        node.save()

+    utils.log_step_flow_history(
+        node=node, flow=utils.StepFlow.DEPLOYMENT,
+        steps=node.driver_internal_info.get("deploy_steps", []),
+        status="start",
+    )
+
    do_next_deploy_step(task, 0)


@@ -384,6 +390,12 @@ def do_next_deploy_step(task, step_index):
            task.process_event('wait')
            return
        except exception.IronicException as e:
+            utils.log_step_flow_history(
+                node=node, flow=utils.StepFlow.DEPLOYMENT,
+                steps=node.driver_internal_info.get("deploy_steps", []),
+                status="end", aborted_step=step.get("step"),
+            )
+
            if isinstance(e, exception.AgentConnectionFailed):
                if task.node.driver_internal_info.get(
                        async_steps.DEPLOYMENT_REBOOT):
@@ -408,6 +420,12 @@ def do_next_deploy_step(task, step_index):
                    % {'step': conductor_steps.step_id(step), 'err': e})
            return
        except Exception as e:
+            utils.log_step_flow_history(
+                node=node, flow=utils.StepFlow.DEPLOYMENT,
+                steps=node.driver_internal_info.get("deploy_steps", []),
+                status="end", aborted_step=step.get("step"),
+            )
+
            log_msg = ('Node %(node)s failed deploy step %(step)s with '
                       'unexpected error: %(err)s' %
                       {'node': node.uuid, 'step': node.deploy_step, 'err': e})
@@ -453,6 +471,12 @@ def do_next_deploy_step(task, step_index):
        LOG.info('Node %(node)s finished deploy step %(step)s',
                 {'node': node.uuid, 'step': step})

+    utils.log_step_flow_history(
+        node=node, flow=utils.StepFlow.DEPLOYMENT,
+        steps=node.driver_internal_info.get("deploy_steps", []),
+        status="end",
+    )
+
    # Finished executing the steps. Clear deploy_step.
    node.deploy_step = None
    utils.wipe_deploy_internal_info(task)
--- a/ironic/conductor/servicing.py
+++ b/ironic/conductor/servicing.py
@@ -76,6 +76,11 @@ def do_node_service(task, service_steps=None, disable_ramdisk=False):
        return utils.servicing_error_handler(task, msg)

    steps = node.driver_internal_info.get('service_steps', [])
+
+    utils.log_step_flow_history(
+        node=node, flow=utils.StepFlow.SERVICING,
+        steps=steps, status="start")
+
    step_index = 0 if steps else None
    for step in steps:
        step_requires_ramdisk = step.get('requires_ramdisk')
@@ -194,6 +199,12 @@ def do_next_service_step(task, step_index, disable_ramdisk=None):
                task.process_event('wait')
                return

+            utils.log_step_flow_history(
+                node=node, flow=utils.StepFlow.SERVICING,
+                steps=node.driver_internal_info.get("service_steps", []),
+                status="end", aborted_step=step.get("step"),
+            )
+
            msg = (_('Node %(node)s failed step %(step)s: '
                     '%(exc)s') %
                   {'node': node.uuid, 'exc': e,
@@ -236,6 +247,13 @@ def _tear_down_node_service(task, disable_ramdisk):
    task.node.service_step = None
    utils.wipe_service_internal_info(task)
    task.node.save()
+
+    utils.log_step_flow_history(
+        node=task.node, flow=utils.StepFlow.SERVICING,
+        steps=task.node.driver_internal_info.get("service_steps", []),
+        status="end"
+    )
+
    if not disable_ramdisk:
        try:
            task.driver.deploy.tear_down_service(task)
--- a/ironic/conductor/steps.py
+++ b/ironic/conductor/steps.py
@@ -333,10 +333,11 @@ def set_node_cleaning_steps(task, disable_ramdisk=False,
    # For manual cleaning, the target provision state is MANAGEABLE, whereas
    # for automated cleaning, it is AVAILABLE.
    manual_clean = node.target_provision_state == states.MANAGEABLE
-    LOG.debug('List of the steps for %(type)s cleaning of node %(node)s: '
-              '%(steps)s', {'type': 'manual' if manual_clean else 'automated',
-                            'node': node.uuid,
-                            'steps': steps})
+    if not CONF.conductor.log_step_flows_to_syslog:
+        LOG.debug(
+            'List of the steps for %(type)s cleaning of node %(node)s: '
+            '%(steps)s', {'type': 'manual' if manual_clean else 'automated',
+                          'node': node.uuid, 'steps': steps})

    node.clean_step = {}
    node.set_driver_internal_info('clean_steps', steps)
@@ -475,10 +476,11 @@ def set_node_deployment_steps(task, reset_current=True, skip_missing=False):
    node.set_driver_internal_info('deploy_steps', _get_all_deployment_steps(
        task, skip_missing=skip_missing))

-    LOG.debug('List of the deploy steps for node %(node)s: %(steps)s', {
-        'node': node.uuid,
-        'steps': node.driver_internal_info['deploy_steps']
-    })
+    if not CONF.conductor.log_step_flows_to_syslog:
+        LOG.debug('List of the deploy steps for node %(node)s: %(steps)s', {
+            'node': node.uuid,
+            'steps': node.driver_internal_info['deploy_steps']
+        })
    if reset_current:
        node.deploy_step = {}
        node.set_driver_internal_info('deploy_step_index', None)
@@ -505,9 +507,9 @@ def set_node_service_steps(task, disable_ramdisk=False):
    steps = _validate_user_service_steps(
        task, node.driver_internal_info.get('service_steps', []),
        disable_ramdisk=disable_ramdisk)
-    LOG.debug('List of the steps for service of node %(node)s: '
-              '%(steps)s', {'node': node.uuid,
-                            'steps': steps})
+    if not CONF.conductor.log_step_flows_to_syslog:
+        LOG.debug('List of the steps for service of node %(node)s: '
+                  '%(steps)s', {'node': node.uuid, 'steps': steps})

    node.service_step = {}
    node.set_driver_internal_info('service_steps', steps)
--- a/ironic/conductor/utils.py
+++ b/ironic/conductor/utils.py
@@ -14,6 +14,7 @@

 import contextlib
 import datetime
+from enum import Enum
 import functools
 import os
 import secrets
@@ -54,6 +55,13 @@ PASSWORD_HASH_FORMAT = {
 }


+class StepFlow(Enum):
+    CLEANING_MANUAL = 'manual_cleaning'
+    CLEANING_AUTO = 'automated_cleaning'
+    DEPLOYMENT = 'deployment'
+    SERVICING = 'servicing'
+
+
@task_manager.require_exclusive_lock
 def node_set_boot_device(task, device, persistent=False):
    """Set the boot device for a node.
@@ -2008,3 +2016,53 @@ def node_update_cache(task):
    node_cache_boot_mode(task)
    node_cache_bios_settings(task)
    node_cache_firmware_components(task)
+
+
+def log_step_flow_history(node, flow, steps, status, aborted_step=None):
+    """Persist start/end milestones for cleaning / servicing / deploy flows.
+
+    :param node: A Node object
+    :param flow: A StepFlow enum member, e.g. StepFlow.DEPLOYMENT
+    :param steps: full list of step dicts (may be empty/None)
+    :param status: ``'start'`` or ``'done'``
+    :param aborted_step:  name of failed step (optional)
+    """
+    if not (CONF.conductor.record_step_flows_in_history
+            or CONF.conductor.log_step_flows_to_syslog):
+        return
+
+    sanitized = [
+        (
+            dict(st, args=strutils.mask_dict_password(st["args"]))
+            if isinstance(st, dict) and isinstance(st.get("args"), dict)
+            else st
+        )
+        for st in (steps or [])
+    ]
+
+    error = False
+    if status == "start":
+        event_type = "%s_start" % flow.value
+        event = "Starting %s with steps=%s" % (
+            flow.value.replace("_", " "), sanitized)
+    elif status == "end":
+        event_type = "%s_end" % flow.value
+        if aborted_step:
+            error = True
+            event = "%s aborted at step '%s' with steps=%s" % (
+                flow.value.replace("_", " ").title(),
+                aborted_step, sanitized)
+        else:
+            event = "%s completed successfully with steps=%s" % (
+                flow.value.replace("_", " ").title(), sanitized)
+    else:
+        raise ValueError("Unknown status '%s'" % status)
+
+    if CONF.conductor.log_step_flows_to_syslog:
+        log_fn = LOG.warning if error else LOG.info
+        log_fn("Node %(node)s: %(event)s",
+               {'node': node.uuid, 'event': event})
+
+    if CONF.conductor.record_step_flows_in_history:
+        node_history_record(node, event=event, event_type=event_type,
+                            error=error)
--- a/ironic/conf/conductor.py
+++ b/ironic/conf/conductor.py
@@ -652,7 +652,22 @@ opts = [
                       'and fail the provisioning action that required a '
                       'ramdisk and kernel. When set to False, Ironic will '
                       'fallback to the next valid, consistent configured '
-                       'ramdisk and kernel for the node.'))
+                       'ramdisk and kernel for the node.')),
+    cfg.BoolOpt(
+        "record_step_flows_in_history",
+        default=True,
+        help=(
+            "When True, the conductor writes a Node History entry at the "
+            "start and end of every cleaning/servicing/deploy-steps flow. "
+            "Disable this in very high-churn environments to reduce DB load."
+        )),
+    cfg.BoolOpt(
+        'log_step_flows_to_syslog',
+        default=False,
+        help=(
+            "Log steps at the start/end of cleaning/servicing/deployment "
+            "to the conductor service log (WARNING for aborted/failure, "
+            "INFO otherwise.")),
 ]


--- a/ironic/tests/unit/conductor/test_utils.py
+++ b/ironic/tests/unit/conductor/test_utils.py
@@ -3094,6 +3094,12 @@ class NodeHistoryRecordTestCase(db_base.DbTestCase):
        self.node = obj_utils.create_test_node(
            self.context,
            uuid=uuidutils.generate_uuid())
+        self.steps = [
+            {
+                'step': 'erase_devices',
+                'interface': 'deploy',
+                'args': {'token': 'secret'},
+                'priority': 99}]

    def test_record_node_history(self):
        conductor_utils.node_history_record(self.node, event='meow')
@@ -3143,6 +3149,39 @@ class NodeHistoryRecordTestCase(db_base.DbTestCase):
        mock_history.assert_not_called()
        mock_create.assert_not_called()

+    @mock.patch('ironic.conductor.utils.LOG', autospec=True)
+    @mock.patch('ironic.conductor.utils.node_history_record', autospec=True)
+    def test_logs_to_history_and_syslog(self, mock_history, mock_log):
+        self.config(record_step_flows_in_history=True, group='conductor')
+        self.config(log_step_flows_to_syslog=True, group='conductor')
+
+        conductor_utils.log_step_flow_history(
+            self.node, flow=conductor_utils.StepFlow.DEPLOYMENT,
+            steps=self.steps, status='start')
+
+        mock_history.assert_called_once()
+        mock_log.info.assert_called_once()
+        assert 'deployment_start' in mock_history.call_args.kwargs[
+            'event_type']
+        assert self.node.uuid in mock_log.info.call_args.args[1]['node']
+
+    @mock.patch('ironic.conductor.utils.node_history_record', autospec=True)
+    def test_sanitizes_passwords_in_args(self, mock_history):
+        steps = [
+            {'interface': 'clean', 'step': 'noop',
+             'args': {'password': 'meow'}, 'priority': 10}
+        ]
+
+        conductor_utils.log_step_flow_history(
+            self.node, flow=conductor_utils.StepFlow.CLEANING_AUTO,
+            steps=steps, status='start')
+
+        mock_history.assert_called_once()
+        event_string = mock_history.call_args.kwargs['event']
+        assert 'password' in event_string
+        assert '***' in event_string
+        assert 'meow' not in event_string
+

 class GetTokenProjectFromRequestTestCase(db_base.DbTestCase):

--- a/releasenotes/notes/log-steps-during-dynamic-step-based-flows-f0ef2f066f5d8127.yaml
+++ b/releasenotes/notes/log-steps-during-dynamic-step-based-flows-f0ef2f066f5d8127.yaml
@@ -0,0 +1,12 @@
+---
+features:
+  - |
+    Start and end of dynamic step-based flows (automated cleaning,
+    manual cleaning, servicing, and deploy steps) are now logged to
+    Node History if ``[conductor]record_step_flows_in_history`` is enabled
+    (default: ``True``). This makes it easier to trace what steps were
+    attempted when troubleshooting deployment failures.
+
+    A second knob which is disabled by default,
+    ``[conductor]log_step_flows_to_syslog``, allows emitting the same flow
+    events, but to the service log.
--- a/zuul.d/ironic-jobs.yaml
+++ b/zuul.d/ironic-jobs.yaml
@@ -97,6 +97,7 @@
        IRONIC_DEPLOY_DRIVER: ipmi
        IRONIC_INSPECTOR_BUILD_RAMDISK: False
        IRONIC_INSPECTOR_TEMPEST_INTROSPECTION_TIMEOUT: 1200
+        IRONIC_LOG_STEPS_TO_SYSLOG: True
        IRONIC_TEMPEST_BUILD_TIMEOUT: 2000
        IRONIC_TEMPEST_WHOLE_DISK_IMAGE: False
        IRONIC_VM_COUNT: 2