Log executed steps during cleaning/servicing/deploy
Log steps performed during step-based flows in Node History at the beginning and at completion (or abort). Closes-Bug: #2106758 Change-Id: Ieffacf174180036d6a2418a8faf72a94eea74fb8 Signed-off-by: Afonne-CID <afonnepaulc@gmail.com>
This commit is contained in:
@@ -182,6 +182,7 @@ IRONIC_EXTRA_PXE_PARAMS=${IRONIC_EXTRA_PXE_PARAMS:-}
|
||||
IRONIC_TTY_DEV=${IRONIC_TTY_DEV:-ttyS0,115200}
|
||||
IRONIC_USE_PORT_GROUP=$(trueorfalse False IRONIC_USE_PORT_GROUP)
|
||||
IRONIC_PORT_GROUP_MODE=${IRONIC_PORT_GROUP_MODE:-"balance-rr"}
|
||||
IRONIC_LOG_STEPS_TO_SYSLOG=${IRONIC_LOG_STEPS_TO_SYSLOG:-False}
|
||||
|
||||
IRONIC_TEMPEST_BUILD_TIMEOUT=${IRONIC_TEMPEST_BUILD_TIMEOUT:-${BUILD_TIMEOUT:-}}
|
||||
if [[ -n "$BUILD_TIMEOUT" ]]; then
|
||||
@@ -1904,6 +1905,8 @@ function configure_ironic_conductor {
|
||||
|
||||
iniset $IRONIC_CONF_FILE conductor automated_clean $IRONIC_AUTOMATED_CLEAN_ENABLED
|
||||
|
||||
iniset $IRONIC_CONF_FILE conductor log_step_flows_to_syslog $IRONIC_LOG_STEPS_TO_SYSLOG
|
||||
|
||||
# configure enabled and default interfaces
|
||||
local iface
|
||||
local iface_var
|
||||
|
@@ -28,6 +28,11 @@ from ironic import objects
|
||||
LOG = log.getLogger(__name__)
|
||||
|
||||
|
||||
def get_cleaning_flow(manual_clean=False):
|
||||
return (utils.StepFlow.CLEANING_MANUAL if manual_clean
|
||||
else utils.StepFlow.CLEANING_AUTO)
|
||||
|
||||
|
||||
@task_manager.require_exclusive_lock
|
||||
def do_node_clean(task, clean_steps=None, disable_ramdisk=False,
|
||||
automated_with_steps=False):
|
||||
@@ -44,7 +49,9 @@ def do_node_clean(task, clean_steps=None, disable_ramdisk=False,
|
||||
"""
|
||||
node = task.node
|
||||
manual_clean = clean_steps is not None and automated_with_steps is False
|
||||
clean_type = 'manual' if manual_clean else 'automated'
|
||||
|
||||
clean_type = get_cleaning_flow(manual_clean)
|
||||
|
||||
LOG.debug('Starting %(type)s cleaning for node %(node)s',
|
||||
{'type': clean_type, 'node': node.uuid})
|
||||
|
||||
@@ -135,6 +142,10 @@ def do_node_clean(task, clean_steps=None, disable_ramdisk=False,
|
||||
return utils.cleaning_error_handler(task, msg)
|
||||
|
||||
steps = node.driver_internal_info.get('clean_steps', [])
|
||||
|
||||
utils.log_step_flow_history(node=node, flow=clean_type,
|
||||
steps=steps, status="start")
|
||||
|
||||
step_index = 0 if steps else None
|
||||
do_next_clean_step(task, step_index, disable_ramdisk=disable_ramdisk)
|
||||
|
||||
@@ -157,6 +168,9 @@ def do_next_clean_step(task, step_index, disable_ramdisk=None):
|
||||
# For manual cleaning, the target provision state is MANAGEABLE,
|
||||
# whereas for automated cleaning, it is AVAILABLE.
|
||||
manual_clean = node.target_provision_state == states.MANAGEABLE
|
||||
|
||||
clean_type = get_cleaning_flow(manual_clean)
|
||||
|
||||
if step_index is None:
|
||||
steps = []
|
||||
else:
|
||||
@@ -234,6 +248,12 @@ def do_next_clean_step(task, step_index, disable_ramdisk=None):
|
||||
task.process_event('wait', target_state=target_state)
|
||||
return
|
||||
|
||||
utils.log_step_flow_history(
|
||||
node=node, flow=clean_type,
|
||||
steps=node.driver_internal_info.get("clean_steps", []),
|
||||
status="end", aborted_step=step.get("step"),
|
||||
)
|
||||
|
||||
msg = (_('Node %(node)s failed step %(step)s: '
|
||||
'%(exc)s') %
|
||||
{'node': node.uuid, 'exc': e,
|
||||
@@ -285,6 +305,13 @@ def do_next_clean_step(task, step_index, disable_ramdisk=None):
|
||||
tear_down_cleaning=False)
|
||||
utils.node_update_cache(task)
|
||||
LOG.info('Node %s cleaning complete', node.uuid)
|
||||
|
||||
utils.log_step_flow_history(
|
||||
node=node, flow=clean_type,
|
||||
steps=node.driver_internal_info.get("clean_steps", []),
|
||||
status="end"
|
||||
)
|
||||
|
||||
event = 'manage' if manual_clean or node.retired else 'done'
|
||||
# NOTE(rloo): No need to specify target prov. state; we're done
|
||||
task.process_event(event)
|
||||
|
@@ -303,6 +303,12 @@ def do_node_deploy(task, conductor_id=None, configdrive=None,
|
||||
node.conductor_affinity = conductor_id
|
||||
node.save()
|
||||
|
||||
utils.log_step_flow_history(
|
||||
node=node, flow=utils.StepFlow.DEPLOYMENT,
|
||||
steps=node.driver_internal_info.get("deploy_steps", []),
|
||||
status="start",
|
||||
)
|
||||
|
||||
do_next_deploy_step(task, 0)
|
||||
|
||||
|
||||
@@ -384,6 +390,12 @@ def do_next_deploy_step(task, step_index):
|
||||
task.process_event('wait')
|
||||
return
|
||||
except exception.IronicException as e:
|
||||
utils.log_step_flow_history(
|
||||
node=node, flow=utils.StepFlow.DEPLOYMENT,
|
||||
steps=node.driver_internal_info.get("deploy_steps", []),
|
||||
status="end", aborted_step=step.get("step"),
|
||||
)
|
||||
|
||||
if isinstance(e, exception.AgentConnectionFailed):
|
||||
if task.node.driver_internal_info.get(
|
||||
async_steps.DEPLOYMENT_REBOOT):
|
||||
@@ -408,6 +420,12 @@ def do_next_deploy_step(task, step_index):
|
||||
% {'step': conductor_steps.step_id(step), 'err': e})
|
||||
return
|
||||
except Exception as e:
|
||||
utils.log_step_flow_history(
|
||||
node=node, flow=utils.StepFlow.DEPLOYMENT,
|
||||
steps=node.driver_internal_info.get("deploy_steps", []),
|
||||
status="end", aborted_step=step.get("step"),
|
||||
)
|
||||
|
||||
log_msg = ('Node %(node)s failed deploy step %(step)s with '
|
||||
'unexpected error: %(err)s' %
|
||||
{'node': node.uuid, 'step': node.deploy_step, 'err': e})
|
||||
@@ -453,6 +471,12 @@ def do_next_deploy_step(task, step_index):
|
||||
LOG.info('Node %(node)s finished deploy step %(step)s',
|
||||
{'node': node.uuid, 'step': step})
|
||||
|
||||
utils.log_step_flow_history(
|
||||
node=node, flow=utils.StepFlow.DEPLOYMENT,
|
||||
steps=node.driver_internal_info.get("deploy_steps", []),
|
||||
status="end",
|
||||
)
|
||||
|
||||
# Finished executing the steps. Clear deploy_step.
|
||||
node.deploy_step = None
|
||||
utils.wipe_deploy_internal_info(task)
|
||||
|
@@ -76,6 +76,11 @@ def do_node_service(task, service_steps=None, disable_ramdisk=False):
|
||||
return utils.servicing_error_handler(task, msg)
|
||||
|
||||
steps = node.driver_internal_info.get('service_steps', [])
|
||||
|
||||
utils.log_step_flow_history(
|
||||
node=node, flow=utils.StepFlow.SERVICING,
|
||||
steps=steps, status="start")
|
||||
|
||||
step_index = 0 if steps else None
|
||||
for step in steps:
|
||||
step_requires_ramdisk = step.get('requires_ramdisk')
|
||||
@@ -194,6 +199,12 @@ def do_next_service_step(task, step_index, disable_ramdisk=None):
|
||||
task.process_event('wait')
|
||||
return
|
||||
|
||||
utils.log_step_flow_history(
|
||||
node=node, flow=utils.StepFlow.SERVICING,
|
||||
steps=node.driver_internal_info.get("service_steps", []),
|
||||
status="end", aborted_step=step.get("step"),
|
||||
)
|
||||
|
||||
msg = (_('Node %(node)s failed step %(step)s: '
|
||||
'%(exc)s') %
|
||||
{'node': node.uuid, 'exc': e,
|
||||
@@ -236,6 +247,13 @@ def _tear_down_node_service(task, disable_ramdisk):
|
||||
task.node.service_step = None
|
||||
utils.wipe_service_internal_info(task)
|
||||
task.node.save()
|
||||
|
||||
utils.log_step_flow_history(
|
||||
node=task.node, flow=utils.StepFlow.SERVICING,
|
||||
steps=task.node.driver_internal_info.get("service_steps", []),
|
||||
status="end"
|
||||
)
|
||||
|
||||
if not disable_ramdisk:
|
||||
try:
|
||||
task.driver.deploy.tear_down_service(task)
|
||||
|
@@ -333,10 +333,11 @@ def set_node_cleaning_steps(task, disable_ramdisk=False,
|
||||
# For manual cleaning, the target provision state is MANAGEABLE, whereas
|
||||
# for automated cleaning, it is AVAILABLE.
|
||||
manual_clean = node.target_provision_state == states.MANAGEABLE
|
||||
LOG.debug('List of the steps for %(type)s cleaning of node %(node)s: '
|
||||
'%(steps)s', {'type': 'manual' if manual_clean else 'automated',
|
||||
'node': node.uuid,
|
||||
'steps': steps})
|
||||
if not CONF.conductor.log_step_flows_to_syslog:
|
||||
LOG.debug(
|
||||
'List of the steps for %(type)s cleaning of node %(node)s: '
|
||||
'%(steps)s', {'type': 'manual' if manual_clean else 'automated',
|
||||
'node': node.uuid, 'steps': steps})
|
||||
|
||||
node.clean_step = {}
|
||||
node.set_driver_internal_info('clean_steps', steps)
|
||||
@@ -475,10 +476,11 @@ def set_node_deployment_steps(task, reset_current=True, skip_missing=False):
|
||||
node.set_driver_internal_info('deploy_steps', _get_all_deployment_steps(
|
||||
task, skip_missing=skip_missing))
|
||||
|
||||
LOG.debug('List of the deploy steps for node %(node)s: %(steps)s', {
|
||||
'node': node.uuid,
|
||||
'steps': node.driver_internal_info['deploy_steps']
|
||||
})
|
||||
if not CONF.conductor.log_step_flows_to_syslog:
|
||||
LOG.debug('List of the deploy steps for node %(node)s: %(steps)s', {
|
||||
'node': node.uuid,
|
||||
'steps': node.driver_internal_info['deploy_steps']
|
||||
})
|
||||
if reset_current:
|
||||
node.deploy_step = {}
|
||||
node.set_driver_internal_info('deploy_step_index', None)
|
||||
@@ -505,9 +507,9 @@ def set_node_service_steps(task, disable_ramdisk=False):
|
||||
steps = _validate_user_service_steps(
|
||||
task, node.driver_internal_info.get('service_steps', []),
|
||||
disable_ramdisk=disable_ramdisk)
|
||||
LOG.debug('List of the steps for service of node %(node)s: '
|
||||
'%(steps)s', {'node': node.uuid,
|
||||
'steps': steps})
|
||||
if not CONF.conductor.log_step_flows_to_syslog:
|
||||
LOG.debug('List of the steps for service of node %(node)s: '
|
||||
'%(steps)s', {'node': node.uuid, 'steps': steps})
|
||||
|
||||
node.service_step = {}
|
||||
node.set_driver_internal_info('service_steps', steps)
|
||||
|
@@ -14,6 +14,7 @@
|
||||
|
||||
import contextlib
|
||||
import datetime
|
||||
from enum import Enum
|
||||
import functools
|
||||
import os
|
||||
import secrets
|
||||
@@ -54,6 +55,13 @@ PASSWORD_HASH_FORMAT = {
|
||||
}
|
||||
|
||||
|
||||
class StepFlow(Enum):
|
||||
CLEANING_MANUAL = 'manual_cleaning'
|
||||
CLEANING_AUTO = 'automated_cleaning'
|
||||
DEPLOYMENT = 'deployment'
|
||||
SERVICING = 'servicing'
|
||||
|
||||
|
||||
@task_manager.require_exclusive_lock
|
||||
def node_set_boot_device(task, device, persistent=False):
|
||||
"""Set the boot device for a node.
|
||||
@@ -2008,3 +2016,53 @@ def node_update_cache(task):
|
||||
node_cache_boot_mode(task)
|
||||
node_cache_bios_settings(task)
|
||||
node_cache_firmware_components(task)
|
||||
|
||||
|
||||
def log_step_flow_history(node, flow, steps, status, aborted_step=None):
|
||||
"""Persist start/end milestones for cleaning / servicing / deploy flows.
|
||||
|
||||
:param node: A Node object
|
||||
:param flow: A StepFlow enum member, e.g. StepFlow.DEPLOYMENT
|
||||
:param steps: full list of step dicts (may be empty/None)
|
||||
:param status: ``'start'`` or ``'done'``
|
||||
:param aborted_step: name of failed step (optional)
|
||||
"""
|
||||
if not (CONF.conductor.record_step_flows_in_history
|
||||
or CONF.conductor.log_step_flows_to_syslog):
|
||||
return
|
||||
|
||||
sanitized = [
|
||||
(
|
||||
dict(st, args=strutils.mask_dict_password(st["args"]))
|
||||
if isinstance(st, dict) and isinstance(st.get("args"), dict)
|
||||
else st
|
||||
)
|
||||
for st in (steps or [])
|
||||
]
|
||||
|
||||
error = False
|
||||
if status == "start":
|
||||
event_type = "%s_start" % flow.value
|
||||
event = "Starting %s with steps=%s" % (
|
||||
flow.value.replace("_", " "), sanitized)
|
||||
elif status == "end":
|
||||
event_type = "%s_end" % flow.value
|
||||
if aborted_step:
|
||||
error = True
|
||||
event = "%s aborted at step '%s' with steps=%s" % (
|
||||
flow.value.replace("_", " ").title(),
|
||||
aborted_step, sanitized)
|
||||
else:
|
||||
event = "%s completed successfully with steps=%s" % (
|
||||
flow.value.replace("_", " ").title(), sanitized)
|
||||
else:
|
||||
raise ValueError("Unknown status '%s'" % status)
|
||||
|
||||
if CONF.conductor.log_step_flows_to_syslog:
|
||||
log_fn = LOG.warning if error else LOG.info
|
||||
log_fn("Node %(node)s: %(event)s",
|
||||
{'node': node.uuid, 'event': event})
|
||||
|
||||
if CONF.conductor.record_step_flows_in_history:
|
||||
node_history_record(node, event=event, event_type=event_type,
|
||||
error=error)
|
||||
|
@@ -652,7 +652,22 @@ opts = [
|
||||
'and fail the provisioning action that required a '
|
||||
'ramdisk and kernel. When set to False, Ironic will '
|
||||
'fallback to the next valid, consistent configured '
|
||||
'ramdisk and kernel for the node.'))
|
||||
'ramdisk and kernel for the node.')),
|
||||
cfg.BoolOpt(
|
||||
"record_step_flows_in_history",
|
||||
default=True,
|
||||
help=(
|
||||
"When True, the conductor writes a Node History entry at the "
|
||||
"start and end of every cleaning/servicing/deploy-steps flow. "
|
||||
"Disable this in very high-churn environments to reduce DB load."
|
||||
)),
|
||||
cfg.BoolOpt(
|
||||
'log_step_flows_to_syslog',
|
||||
default=False,
|
||||
help=(
|
||||
"Log steps at the start/end of cleaning/servicing/deployment "
|
||||
"to the conductor service log (WARNING for aborted/failure, "
|
||||
"INFO otherwise.")),
|
||||
]
|
||||
|
||||
|
||||
|
@@ -3094,6 +3094,12 @@ class NodeHistoryRecordTestCase(db_base.DbTestCase):
|
||||
self.node = obj_utils.create_test_node(
|
||||
self.context,
|
||||
uuid=uuidutils.generate_uuid())
|
||||
self.steps = [
|
||||
{
|
||||
'step': 'erase_devices',
|
||||
'interface': 'deploy',
|
||||
'args': {'token': 'secret'},
|
||||
'priority': 99}]
|
||||
|
||||
def test_record_node_history(self):
|
||||
conductor_utils.node_history_record(self.node, event='meow')
|
||||
@@ -3143,6 +3149,39 @@ class NodeHistoryRecordTestCase(db_base.DbTestCase):
|
||||
mock_history.assert_not_called()
|
||||
mock_create.assert_not_called()
|
||||
|
||||
@mock.patch('ironic.conductor.utils.LOG', autospec=True)
|
||||
@mock.patch('ironic.conductor.utils.node_history_record', autospec=True)
|
||||
def test_logs_to_history_and_syslog(self, mock_history, mock_log):
|
||||
self.config(record_step_flows_in_history=True, group='conductor')
|
||||
self.config(log_step_flows_to_syslog=True, group='conductor')
|
||||
|
||||
conductor_utils.log_step_flow_history(
|
||||
self.node, flow=conductor_utils.StepFlow.DEPLOYMENT,
|
||||
steps=self.steps, status='start')
|
||||
|
||||
mock_history.assert_called_once()
|
||||
mock_log.info.assert_called_once()
|
||||
assert 'deployment_start' in mock_history.call_args.kwargs[
|
||||
'event_type']
|
||||
assert self.node.uuid in mock_log.info.call_args.args[1]['node']
|
||||
|
||||
@mock.patch('ironic.conductor.utils.node_history_record', autospec=True)
|
||||
def test_sanitizes_passwords_in_args(self, mock_history):
|
||||
steps = [
|
||||
{'interface': 'clean', 'step': 'noop',
|
||||
'args': {'password': 'meow'}, 'priority': 10}
|
||||
]
|
||||
|
||||
conductor_utils.log_step_flow_history(
|
||||
self.node, flow=conductor_utils.StepFlow.CLEANING_AUTO,
|
||||
steps=steps, status='start')
|
||||
|
||||
mock_history.assert_called_once()
|
||||
event_string = mock_history.call_args.kwargs['event']
|
||||
assert 'password' in event_string
|
||||
assert '***' in event_string
|
||||
assert 'meow' not in event_string
|
||||
|
||||
|
||||
class GetTokenProjectFromRequestTestCase(db_base.DbTestCase):
|
||||
|
||||
|
@@ -0,0 +1,12 @@
|
||||
---
|
||||
features:
|
||||
- |
|
||||
Start and end of dynamic step-based flows (automated cleaning,
|
||||
manual cleaning, servicing, and deploy steps) are now logged to
|
||||
Node History if ``[conductor]record_step_flows_in_history`` is enabled
|
||||
(default: ``True``). This makes it easier to trace what steps were
|
||||
attempted when troubleshooting deployment failures.
|
||||
|
||||
A second knob which is disabled by default,
|
||||
``[conductor]log_step_flows_to_syslog``, allows emitting the same flow
|
||||
events, but to the service log.
|
@@ -97,6 +97,7 @@
|
||||
IRONIC_DEPLOY_DRIVER: ipmi
|
||||
IRONIC_INSPECTOR_BUILD_RAMDISK: False
|
||||
IRONIC_INSPECTOR_TEMPEST_INTROSPECTION_TIMEOUT: 1200
|
||||
IRONIC_LOG_STEPS_TO_SYSLOG: True
|
||||
IRONIC_TEMPEST_BUILD_TIMEOUT: 2000
|
||||
IRONIC_TEMPEST_WHOLE_DISK_IMAGE: False
|
||||
IRONIC_VM_COUNT: 2
|
||||
|
Reference in New Issue
Block a user