Log executed steps during cleaning/servicing/deploy

Log steps performed during step-based flows in Node History
at the beginning and at completion (or abort).

Closes-Bug: #2106758
Change-Id: Ieffacf174180036d6a2418a8faf72a94eea74fb8
Signed-off-by: Afonne-CID <afonnepaulc@gmail.com>
This commit is contained in:
Afonne-CID
2025-06-11 20:20:34 +01:00
parent 51dd388d12
commit 0125c26aba
10 changed files with 212 additions and 13 deletions

View File

@@ -182,6 +182,7 @@ IRONIC_EXTRA_PXE_PARAMS=${IRONIC_EXTRA_PXE_PARAMS:-}
IRONIC_TTY_DEV=${IRONIC_TTY_DEV:-ttyS0,115200}
IRONIC_USE_PORT_GROUP=$(trueorfalse False IRONIC_USE_PORT_GROUP)
IRONIC_PORT_GROUP_MODE=${IRONIC_PORT_GROUP_MODE:-"balance-rr"}
IRONIC_LOG_STEPS_TO_SYSLOG=${IRONIC_LOG_STEPS_TO_SYSLOG:-False}
IRONIC_TEMPEST_BUILD_TIMEOUT=${IRONIC_TEMPEST_BUILD_TIMEOUT:-${BUILD_TIMEOUT:-}}
if [[ -n "$BUILD_TIMEOUT" ]]; then
@@ -1904,6 +1905,8 @@ function configure_ironic_conductor {
iniset $IRONIC_CONF_FILE conductor automated_clean $IRONIC_AUTOMATED_CLEAN_ENABLED
iniset $IRONIC_CONF_FILE conductor log_step_flows_to_syslog $IRONIC_LOG_STEPS_TO_SYSLOG
# configure enabled and default interfaces
local iface
local iface_var

View File

@@ -28,6 +28,11 @@ from ironic import objects
LOG = log.getLogger(__name__)
def get_cleaning_flow(manual_clean=False):
return (utils.StepFlow.CLEANING_MANUAL if manual_clean
else utils.StepFlow.CLEANING_AUTO)
@task_manager.require_exclusive_lock
def do_node_clean(task, clean_steps=None, disable_ramdisk=False,
automated_with_steps=False):
@@ -44,7 +49,9 @@ def do_node_clean(task, clean_steps=None, disable_ramdisk=False,
"""
node = task.node
manual_clean = clean_steps is not None and automated_with_steps is False
clean_type = 'manual' if manual_clean else 'automated'
clean_type = get_cleaning_flow(manual_clean)
LOG.debug('Starting %(type)s cleaning for node %(node)s',
{'type': clean_type, 'node': node.uuid})
@@ -135,6 +142,10 @@ def do_node_clean(task, clean_steps=None, disable_ramdisk=False,
return utils.cleaning_error_handler(task, msg)
steps = node.driver_internal_info.get('clean_steps', [])
utils.log_step_flow_history(node=node, flow=clean_type,
steps=steps, status="start")
step_index = 0 if steps else None
do_next_clean_step(task, step_index, disable_ramdisk=disable_ramdisk)
@@ -157,6 +168,9 @@ def do_next_clean_step(task, step_index, disable_ramdisk=None):
# For manual cleaning, the target provision state is MANAGEABLE,
# whereas for automated cleaning, it is AVAILABLE.
manual_clean = node.target_provision_state == states.MANAGEABLE
clean_type = get_cleaning_flow(manual_clean)
if step_index is None:
steps = []
else:
@@ -234,6 +248,12 @@ def do_next_clean_step(task, step_index, disable_ramdisk=None):
task.process_event('wait', target_state=target_state)
return
utils.log_step_flow_history(
node=node, flow=clean_type,
steps=node.driver_internal_info.get("clean_steps", []),
status="end", aborted_step=step.get("step"),
)
msg = (_('Node %(node)s failed step %(step)s: '
'%(exc)s') %
{'node': node.uuid, 'exc': e,
@@ -285,6 +305,13 @@ def do_next_clean_step(task, step_index, disable_ramdisk=None):
tear_down_cleaning=False)
utils.node_update_cache(task)
LOG.info('Node %s cleaning complete', node.uuid)
utils.log_step_flow_history(
node=node, flow=clean_type,
steps=node.driver_internal_info.get("clean_steps", []),
status="end"
)
event = 'manage' if manual_clean or node.retired else 'done'
# NOTE(rloo): No need to specify target prov. state; we're done
task.process_event(event)

View File

@@ -303,6 +303,12 @@ def do_node_deploy(task, conductor_id=None, configdrive=None,
node.conductor_affinity = conductor_id
node.save()
utils.log_step_flow_history(
node=node, flow=utils.StepFlow.DEPLOYMENT,
steps=node.driver_internal_info.get("deploy_steps", []),
status="start",
)
do_next_deploy_step(task, 0)
@@ -384,6 +390,12 @@ def do_next_deploy_step(task, step_index):
task.process_event('wait')
return
except exception.IronicException as e:
utils.log_step_flow_history(
node=node, flow=utils.StepFlow.DEPLOYMENT,
steps=node.driver_internal_info.get("deploy_steps", []),
status="end", aborted_step=step.get("step"),
)
if isinstance(e, exception.AgentConnectionFailed):
if task.node.driver_internal_info.get(
async_steps.DEPLOYMENT_REBOOT):
@@ -408,6 +420,12 @@ def do_next_deploy_step(task, step_index):
% {'step': conductor_steps.step_id(step), 'err': e})
return
except Exception as e:
utils.log_step_flow_history(
node=node, flow=utils.StepFlow.DEPLOYMENT,
steps=node.driver_internal_info.get("deploy_steps", []),
status="end", aborted_step=step.get("step"),
)
log_msg = ('Node %(node)s failed deploy step %(step)s with '
'unexpected error: %(err)s' %
{'node': node.uuid, 'step': node.deploy_step, 'err': e})
@@ -453,6 +471,12 @@ def do_next_deploy_step(task, step_index):
LOG.info('Node %(node)s finished deploy step %(step)s',
{'node': node.uuid, 'step': step})
utils.log_step_flow_history(
node=node, flow=utils.StepFlow.DEPLOYMENT,
steps=node.driver_internal_info.get("deploy_steps", []),
status="end",
)
# Finished executing the steps. Clear deploy_step.
node.deploy_step = None
utils.wipe_deploy_internal_info(task)

View File

@@ -76,6 +76,11 @@ def do_node_service(task, service_steps=None, disable_ramdisk=False):
return utils.servicing_error_handler(task, msg)
steps = node.driver_internal_info.get('service_steps', [])
utils.log_step_flow_history(
node=node, flow=utils.StepFlow.SERVICING,
steps=steps, status="start")
step_index = 0 if steps else None
for step in steps:
step_requires_ramdisk = step.get('requires_ramdisk')
@@ -194,6 +199,12 @@ def do_next_service_step(task, step_index, disable_ramdisk=None):
task.process_event('wait')
return
utils.log_step_flow_history(
node=node, flow=utils.StepFlow.SERVICING,
steps=node.driver_internal_info.get("service_steps", []),
status="end", aborted_step=step.get("step"),
)
msg = (_('Node %(node)s failed step %(step)s: '
'%(exc)s') %
{'node': node.uuid, 'exc': e,
@@ -236,6 +247,13 @@ def _tear_down_node_service(task, disable_ramdisk):
task.node.service_step = None
utils.wipe_service_internal_info(task)
task.node.save()
utils.log_step_flow_history(
node=task.node, flow=utils.StepFlow.SERVICING,
steps=task.node.driver_internal_info.get("service_steps", []),
status="end"
)
if not disable_ramdisk:
try:
task.driver.deploy.tear_down_service(task)

View File

@@ -333,10 +333,11 @@ def set_node_cleaning_steps(task, disable_ramdisk=False,
# For manual cleaning, the target provision state is MANAGEABLE, whereas
# for automated cleaning, it is AVAILABLE.
manual_clean = node.target_provision_state == states.MANAGEABLE
LOG.debug('List of the steps for %(type)s cleaning of node %(node)s: '
'%(steps)s', {'type': 'manual' if manual_clean else 'automated',
'node': node.uuid,
'steps': steps})
if not CONF.conductor.log_step_flows_to_syslog:
LOG.debug(
'List of the steps for %(type)s cleaning of node %(node)s: '
'%(steps)s', {'type': 'manual' if manual_clean else 'automated',
'node': node.uuid, 'steps': steps})
node.clean_step = {}
node.set_driver_internal_info('clean_steps', steps)
@@ -475,10 +476,11 @@ def set_node_deployment_steps(task, reset_current=True, skip_missing=False):
node.set_driver_internal_info('deploy_steps', _get_all_deployment_steps(
task, skip_missing=skip_missing))
LOG.debug('List of the deploy steps for node %(node)s: %(steps)s', {
'node': node.uuid,
'steps': node.driver_internal_info['deploy_steps']
})
if not CONF.conductor.log_step_flows_to_syslog:
LOG.debug('List of the deploy steps for node %(node)s: %(steps)s', {
'node': node.uuid,
'steps': node.driver_internal_info['deploy_steps']
})
if reset_current:
node.deploy_step = {}
node.set_driver_internal_info('deploy_step_index', None)
@@ -505,9 +507,9 @@ def set_node_service_steps(task, disable_ramdisk=False):
steps = _validate_user_service_steps(
task, node.driver_internal_info.get('service_steps', []),
disable_ramdisk=disable_ramdisk)
LOG.debug('List of the steps for service of node %(node)s: '
'%(steps)s', {'node': node.uuid,
'steps': steps})
if not CONF.conductor.log_step_flows_to_syslog:
LOG.debug('List of the steps for service of node %(node)s: '
'%(steps)s', {'node': node.uuid, 'steps': steps})
node.service_step = {}
node.set_driver_internal_info('service_steps', steps)

View File

@@ -14,6 +14,7 @@
import contextlib
import datetime
from enum import Enum
import functools
import os
import secrets
@@ -54,6 +55,13 @@ PASSWORD_HASH_FORMAT = {
}
class StepFlow(Enum):
CLEANING_MANUAL = 'manual_cleaning'
CLEANING_AUTO = 'automated_cleaning'
DEPLOYMENT = 'deployment'
SERVICING = 'servicing'
@task_manager.require_exclusive_lock
def node_set_boot_device(task, device, persistent=False):
"""Set the boot device for a node.
@@ -2008,3 +2016,53 @@ def node_update_cache(task):
node_cache_boot_mode(task)
node_cache_bios_settings(task)
node_cache_firmware_components(task)
def log_step_flow_history(node, flow, steps, status, aborted_step=None):
"""Persist start/end milestones for cleaning / servicing / deploy flows.
:param node: A Node object
:param flow: A StepFlow enum member, e.g. StepFlow.DEPLOYMENT
:param steps: full list of step dicts (may be empty/None)
:param status: ``'start'`` or ``'done'``
:param aborted_step: name of failed step (optional)
"""
if not (CONF.conductor.record_step_flows_in_history
or CONF.conductor.log_step_flows_to_syslog):
return
sanitized = [
(
dict(st, args=strutils.mask_dict_password(st["args"]))
if isinstance(st, dict) and isinstance(st.get("args"), dict)
else st
)
for st in (steps or [])
]
error = False
if status == "start":
event_type = "%s_start" % flow.value
event = "Starting %s with steps=%s" % (
flow.value.replace("_", " "), sanitized)
elif status == "end":
event_type = "%s_end" % flow.value
if aborted_step:
error = True
event = "%s aborted at step '%s' with steps=%s" % (
flow.value.replace("_", " ").title(),
aborted_step, sanitized)
else:
event = "%s completed successfully with steps=%s" % (
flow.value.replace("_", " ").title(), sanitized)
else:
raise ValueError("Unknown status '%s'" % status)
if CONF.conductor.log_step_flows_to_syslog:
log_fn = LOG.warning if error else LOG.info
log_fn("Node %(node)s: %(event)s",
{'node': node.uuid, 'event': event})
if CONF.conductor.record_step_flows_in_history:
node_history_record(node, event=event, event_type=event_type,
error=error)

View File

@@ -652,7 +652,22 @@ opts = [
'and fail the provisioning action that required a '
'ramdisk and kernel. When set to False, Ironic will '
'fallback to the next valid, consistent configured '
'ramdisk and kernel for the node.'))
'ramdisk and kernel for the node.')),
cfg.BoolOpt(
"record_step_flows_in_history",
default=True,
help=(
"When True, the conductor writes a Node History entry at the "
"start and end of every cleaning/servicing/deploy-steps flow. "
"Disable this in very high-churn environments to reduce DB load."
)),
cfg.BoolOpt(
'log_step_flows_to_syslog',
default=False,
help=(
"Log steps at the start/end of cleaning/servicing/deployment "
"to the conductor service log (WARNING for aborted/failure, "
"INFO otherwise.")),
]

View File

@@ -3094,6 +3094,12 @@ class NodeHistoryRecordTestCase(db_base.DbTestCase):
self.node = obj_utils.create_test_node(
self.context,
uuid=uuidutils.generate_uuid())
self.steps = [
{
'step': 'erase_devices',
'interface': 'deploy',
'args': {'token': 'secret'},
'priority': 99}]
def test_record_node_history(self):
conductor_utils.node_history_record(self.node, event='meow')
@@ -3143,6 +3149,39 @@ class NodeHistoryRecordTestCase(db_base.DbTestCase):
mock_history.assert_not_called()
mock_create.assert_not_called()
@mock.patch('ironic.conductor.utils.LOG', autospec=True)
@mock.patch('ironic.conductor.utils.node_history_record', autospec=True)
def test_logs_to_history_and_syslog(self, mock_history, mock_log):
self.config(record_step_flows_in_history=True, group='conductor')
self.config(log_step_flows_to_syslog=True, group='conductor')
conductor_utils.log_step_flow_history(
self.node, flow=conductor_utils.StepFlow.DEPLOYMENT,
steps=self.steps, status='start')
mock_history.assert_called_once()
mock_log.info.assert_called_once()
assert 'deployment_start' in mock_history.call_args.kwargs[
'event_type']
assert self.node.uuid in mock_log.info.call_args.args[1]['node']
@mock.patch('ironic.conductor.utils.node_history_record', autospec=True)
def test_sanitizes_passwords_in_args(self, mock_history):
steps = [
{'interface': 'clean', 'step': 'noop',
'args': {'password': 'meow'}, 'priority': 10}
]
conductor_utils.log_step_flow_history(
self.node, flow=conductor_utils.StepFlow.CLEANING_AUTO,
steps=steps, status='start')
mock_history.assert_called_once()
event_string = mock_history.call_args.kwargs['event']
assert 'password' in event_string
assert '***' in event_string
assert 'meow' not in event_string
class GetTokenProjectFromRequestTestCase(db_base.DbTestCase):

View File

@@ -0,0 +1,12 @@
---
features:
- |
Start and end of dynamic step-based flows (automated cleaning,
manual cleaning, servicing, and deploy steps) are now logged to
Node History if ``[conductor]record_step_flows_in_history`` is enabled
(default: ``True``). This makes it easier to trace what steps were
attempted when troubleshooting deployment failures.
A second knob which is disabled by default,
``[conductor]log_step_flows_to_syslog``, allows emitting the same flow
events, but to the service log.

View File

@@ -97,6 +97,7 @@
IRONIC_DEPLOY_DRIVER: ipmi
IRONIC_INSPECTOR_BUILD_RAMDISK: False
IRONIC_INSPECTOR_TEMPEST_INTROSPECTION_TIMEOUT: 1200
IRONIC_LOG_STEPS_TO_SYSLOG: True
IRONIC_TEMPEST_BUILD_TIMEOUT: 2000
IRONIC_TEMPEST_WHOLE_DISK_IMAGE: False
IRONIC_VM_COUNT: 2