Merge "Record node history and manage events in db"

commit 3f8be1471a
@@ -297,6 +297,25 @@ ALLOCATING = 'allocating'

# States ERROR and ACTIVE are reused.

###########################
# History Event State Types
###########################

PROVISIONING = "provisioning"
CLEANING = "cleaning"
DEPLOYING = "deploying"
TAKEOVER = "takeover"
INTROSPECTION = "introspection"
RESCUE = "rescue"
CONDUCTOR = "conductor"
TRANSITION = "transition"
STARTFAIL = "startup failure"
UNPROVISION = "unprovision"
ADOPTION = "adoption"
CONSOLE = "console"
MONITORING = "monitoring"
VERIFY = "verify"


#####################
# State machine model
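The constants above are consumed as the ``event_type`` argument of the new ``node_history_record()`` helper added later in this change. The snippet below is a hypothetical illustration only; ``record_deploy_failure`` is not part of the commit, while the helper call and the task object mirror the hunks that follow.

from ironic.common import states
from ironic.conductor import utils


def record_deploy_failure(task, error_message):
    # error=True mirrors the event text into node.last_error and stores
    # the history row with ERROR severity.
    utils.node_history_record(task.node,
                              event=error_message,
                              event_type=states.DEPLOYING,
                              error=True,
                              user=task.context.user_id)
    task.node.save()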
@@ -37,6 +37,7 @@ from ironic.common import states
from ironic.conductor import allocations
from ironic.conductor import notification_utils as notify_utils
from ironic.conductor import task_manager
from ironic.conductor import utils
from ironic.conf import CONF
from ironic.db import api as dbapi
from ironic.drivers import base as driver_base
@@ -536,7 +537,10 @@ class BaseConductorManager(object):
err_handler=err_handler,
target_state=target_state)
else:
task.node.last_error = last_error
utils.node_history_record(
task.node, event=last_error,
error=True,
event_type=states.TRANSITION)
task.process_event('fail', target_state=target_state)
except exception.NoFreeConductorWorker:
break
@@ -580,7 +584,9 @@ class BaseConductorManager(object):
LOG.error(msg)
# If starting console failed, set node console_enabled
# back to False and set node's last error.
task.node.last_error = msg
utils.node_history_record(task.node, event=msg,
error=True,
event_type=states.STARTFAIL)
task.node.console_enabled = False
task.node.save()
notify_utils.emit_console_notification(
@@ -712,7 +712,11 @@ class ConductorManager(base_manager.BaseConductorManager):

def handle_failure(e, errmsg, log_func=LOG.error):
utils.remove_node_rescue_password(node, save=False)
node.last_error = errmsg % e
error = errmsg % e
utils.node_history_record(task.node, event=error,
event_type=states.RESCUE,
error=True,
user=task.context.user_id)
task.process_event('fail')
log_func('Error while performing rescue operation for node '
'%(node)s with instance %(instance)s: %(err)s',
@@ -801,7 +805,11 @@ class ConductorManager(base_manager.BaseConductorManager):
node = task.node

def handle_failure(e, errmsg, log_func=LOG.error):
node.last_error = errmsg % e
error = errmsg % e
utils.node_history_record(task.node, event=error,
event_type=states.RESCUE,
error=True,
user=task.context.user_id)
task.process_event('fail')
log_func('Error while performing unrescue operation for node '
'%(node)s with instance %(instance)s: %(err)s',
@@ -845,7 +853,9 @@ class ConductorManager(base_manager.BaseConductorManager):
error_msg = _('Failed to clean up rescue after aborting '
'the operation')
node.refresh()
node.last_error = error_msg
utils.node_history_record(node, event=error_msg,
event_type=states.RESCUE, error=True,
user=task.context.user_id)
node.maintenance = True
node.maintenance_reason = error_msg
node.fault = faults.RESCUE_ABORT_FAILURE
@@ -853,10 +863,15 @@ class ConductorManager(base_manager.BaseConductorManager):
return

info_message = _('Rescue operation aborted for node %s.') % node.uuid
last_error = _('By request, the rescue operation was aborted.')
# NOTE(TheJulia): This "error" is not an actual error, the operation
# has been aborted and the node returned to normal operation.
error = _('By request, the rescue operation was aborted.')
utils.node_history_record(task.node, event=error,
event_type=states.RESCUE,
error=False,
user=task.context.user_id)
node.refresh()
utils.remove_agent_url(node)
node.last_error = last_error
node.save()
LOG.info(info_message)
@@ -1053,7 +1068,11 @@ class ConductorManager(base_manager.BaseConductorManager):
with excutils.save_and_reraise_exception():
LOG.exception('Error in tear_down of node %(node)s: %(err)s',
{'node': node.uuid, 'err': e})
node.last_error = _("Failed to tear down. Error: %s") % e
error = _("Failed to tear down. Error: %s") % e
utils.node_history_record(task.node, event=error,
event_type=states.UNPROVISION,
error=True,
user=task.context.user_id)
task.process_event('fail')
else:
# NOTE(tenbrae): When tear_down finishes, the deletion is done,
@@ -1339,10 +1358,18 @@ class ConductorManager(base_manager.BaseConductorManager):
with excutils.save_and_reraise_exception():
LOG.exception('Error in aborting the inspection of '
'node %(node)s', {'node': node.uuid})
node.last_error = _('Failed to abort inspection. '
'Error: %s') % e
error = _('Failed to abort inspection. '
'Error: %s') % e
utils.node_history_record(task.node, event=error,
event_type=states.INTROSPECTION,
error=True,
user=task.context.user_id)
node.save()
node.last_error = _('Inspection was aborted by request.')
error = _('Inspection was aborted by request.')
utils.node_history_record(task.node, event=error,
event_type=states.INTROSPECTION,
error=True,
user=task.context.user_id)
utils.wipe_token_and_url(task)
task.process_event('abort')
LOG.info('Successfully aborted inspection of node %(node)s',
@@ -1659,9 +1686,14 @@ class ConductorManager(base_manager.BaseConductorManager):
if not task.node.maintenance and task.node.target_power_state:
old_state = task.node.target_power_state
task.node.target_power_state = None
task.node.last_error = _('Pending power operation was '
'aborted due to conductor take '
'over')
error = _('Pending power operation was '
'aborted due to conductor take '
'over')
utils.node_history_record(task.node, event=error,
event_type=states.TAKEOVER,
error=True,
user=task.context.user_id)

task.node.save()
LOG.warning('Aborted pending power operation %(op)s '
'on node %(node)s due to conductor take over',
@@ -1725,7 +1757,10 @@ class ConductorManager(base_manager.BaseConductorManager):
LOG.error(msg)
# Wipe power state from being preserved as it is likely invalid.
node.power_state = states.NOSTATE
node.last_error = msg
utils.node_history_record(task.node, event=msg,
event_type=states.ADOPTION,
error=True,
user=task.context.user_id)
task.process_event('fail')

@METRICS.timer('ConductorManager._do_takeover')
@@ -1764,7 +1799,10 @@ class ConductorManager(base_manager.BaseConductorManager):
LOG.error(msg)
# If taking over console failed, set node's console_enabled
# back to False and set node's last error.
task.node.last_error = msg
utils.node_history_record(task.node, event=msg,
event_type=states.TAKEOVER,
error=True,
user=task.context.user_id)
task.node.console_enabled = False
console_error = True
else:
@@ -2231,7 +2269,10 @@ class ConductorManager(base_manager.BaseConductorManager):
'Reason: %(error)s') % {'op': op,
'node': node.uuid,
'error': e})
node.last_error = msg
utils.node_history_record(task.node, event=msg,
event_type=states.CONSOLE,
error=True,
user=task.context.user_id)
LOG.error(msg)
node.save()
notify_utils.emit_console_notification(
@@ -3495,6 +3536,55 @@ class ConductorManager(base_manager.BaseConductorManager):
task.node.save()
return task.node

@METRICS.timer('ConductorManager.manage_node_history')
@periodics.periodic(
spacing=CONF.conductor.node_history_cleanup_interval,
enabled=(
CONF.conductor.node_history_cleanup_batch_count > 0
and CONF.conductor.node_history_max_entries != 0
)
)
def manage_node_history(self, context):
try:
self._manage_node_history(context)
except Exception as e:
LOG.error('Encountered error while cleaning node '
'history records: %s', e)

def _manage_node_history(self, context):
"""Periodic task to keep the node history tidy."""
max_batch = CONF.conductor.node_history_cleanup_batch_count
# NOTE(TheJulia): Asks the db for the list. Presently just gets
# the node id and the count. If we incorporate by date constraint
# or handling, then it will need to be something like the method
# needs to identify the explicit ID values to delete, and then
# the deletion process needs to erase in logical chunks.
entries_to_clean = self.dbapi.query_node_history_records_for_purge(
conductor_id=self.conductor.id)
count = 0
for node_id in entries_to_clean:
if count < max_batch:
# If we have not hit our total limit, proceed
if entries_to_clean[node_id]:
# if we have work to do on this node, proceed.
self.dbapi.bulk_delete_node_history_records(
entries_to_clean[node_id])
else:
LOG.warning('While cleaning up node history records, '
'we reached the maximum number of records '
'permitted in a single batch. If this error '
'is repeated, consider tuning node history '
'configuration options to be more aggressive '
'by increasing frequency and lowering the '
'number of entries to be deleted to not '
'negatively impact performance.')
break
count = count + len(entries_to_clean[node_id])
# Yield to other threads, since we also don't want to be
# looping tightly deleting rows as that will negatively
# impact DB access if done in excess.
eventlet.sleep(0)


@METRICS.timer('get_vendor_passthru_metadata')
def get_vendor_passthru_metadata(route_dict):
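For clarity, ``entries_to_clean`` above is a dictionary keyed by node database id, with each value being the list of history record ids selected for deletion, and ``max_batch`` caps how many records a single periodic run will remove. A hypothetical, self-contained sketch of that batch accounting follows; the function name and sample data are illustrative and not part of the commit.

def plan_deletions(entries_to_clean, max_batch):
    """Select the per-node record ids that fit within one cleanup batch."""
    planned = []
    count = 0
    for node_id, record_ids in entries_to_clean.items():
        if count >= max_batch:
            # Mirrors the LOG.warning/break above: remaining nodes are
            # left for the next periodic run.
            break
        if record_ids:
            planned.append((node_id, record_ids))
        count += len(record_ids)
    return planned


# With a batch count of 2, only the first node's records are deleted now.
print(plan_deletions({10: [1, 2], 11: [3]}, max_batch=2))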
@@ -3534,7 +3624,9 @@ def handle_sync_power_state_max_retries_exceeded(task, actual_power_state,

old_power_state = node.power_state
node.power_state = actual_power_state
node.last_error = msg
utils.node_history_record(task.node, event=msg,
event_type=states.MONITORING,
error=True)
node.maintenance = True
node.maintenance_reason = msg
node.fault = faults.POWER_FAILURE
@@ -3688,7 +3780,9 @@ def _do_inspect_hardware(task):
node = task.node

def handle_failure(e, log_func=LOG.error):
node.last_error = e
utils.node_history_record(task.node, event=e,
event_type=states.INTROSPECTION,
error=True, user=task.context.user_id)
task.process_event('fail')
log_func("Failed to inspect node %(node)s: %(err)s",
{'node': node.uuid, 'err': e})
@@ -39,6 +39,7 @@ from ironic.common import states
from ironic.conductor import notification_utils as notify_utils
from ironic.conductor import task_manager
from ironic.objects import fields
from ironic.objects import node_history

LOG = log.getLogger(__name__)
CONF = cfg.CONF
@@ -245,9 +246,10 @@ def _can_skip_state_change(task, new_state):
curr_state = task.driver.power.get_power_state(task)
except Exception as e:
with excutils.save_and_reraise_exception():
node['last_error'] = _(
error = _(
"Failed to change power state to '%(target)s'. "
"Error: %(error)s") % {'target': new_state, 'error': e}
node_history_record(node, event=error, error=True)
node['target_power_state'] = states.NOSTATE
node.save()
notify_utils.emit_power_set_notification(
@@ -329,12 +331,13 @@ def node_power_action(task, new_state, timeout=None):
except Exception as e:
with excutils.save_and_reraise_exception():
node['target_power_state'] = states.NOSTATE
node['last_error'] = _(
error = _(
"Failed to change power state to '%(target_state)s' "
"by '%(new_state)s'. Error: %(error)s") % {
'target_state': target_state,
'new_state': new_state,
'error': e}
node_history_record(node, event=error, error=True)
node.save()
notify_utils.emit_power_set_notification(
task, fields.NotificationLevel.ERROR,
@@ -399,7 +402,9 @@ def provisioning_error_handler(e, node, provision_state,
# because it isn't updated on a failed deploy
node.provision_state = provision_state
node.target_provision_state = target_provision_state
node.last_error = (_("No free conductor workers available"))
error = (_("No free conductor workers available"))
node_history_record(node, event=error, event_type=states.PROVISIONING,
error=True)
node.save()
LOG.warning("No free conductor workers available to perform "
"an action on node %(node)s, setting node's "
@@ -483,7 +488,8 @@ def cleaning_error_handler(task, logmsg, errmsg=None, traceback=False,
# For manual cleaning, the target provision state is MANAGEABLE, whereas
# for automated cleaning, it is AVAILABLE.
manual_clean = node.target_provision_state == states.MANAGEABLE
node.last_error = errmsg
node_history_record(node, event=errmsg, event_type=states.CLEANING,
error=True)
# NOTE(dtantsur): avoid overwriting existing maintenance_reason
if not node.maintenance_reason and set_maintenance:
node.maintenance_reason = errmsg
@@ -570,7 +576,8 @@ def deploying_error_handler(task, logmsg, errmsg=None, traceback=False,
errmsg = errmsg or logmsg
node = task.node
LOG.error(logmsg, exc_info=traceback)
node.last_error = errmsg
node_history_record(node, event=errmsg, event_type=states.DEPLOYING,
error=True)
node.save()

cleanup_err = None
@@ -600,7 +607,9 @@ def deploying_error_handler(task, logmsg, errmsg=None, traceback=False,
wipe_deploy_internal_info(task)

if cleanup_err:
node.last_error = cleanup_err
node_history_record(node, event=cleanup_err,
event_type=states.DEPLOYING,
error=True)
node.save()

# NOTE(tenbrae): there is no need to clear conductor_affinity
@@ -636,7 +645,8 @@ def abort_on_conductor_take_over(task):
else:
# For aborted deployment (and potentially other operations), just set
# the last_error accordingly.
task.node.last_error = msg
node_history_record(task.node, event=msg, event_type=states.TAKEOVER,
error=True)
task.node.save()

LOG.warning('Aborted the current operation on node %s due to '
@@ -657,17 +667,22 @@ def rescuing_error_handler(task, msg, set_fail_state=True):
node_power_action(task, states.POWER_OFF)
task.driver.rescue.clean_up(task)
remove_agent_url(node)
node.last_error = msg
node_history_record(task.node, event=msg, event_type=states.RESCUE,
error=True)
except exception.IronicException as e:
node.last_error = (_('Rescue operation was unsuccessful, clean up '
'failed for node: %(error)s') % {'error': e})
error = (_('Rescue operation was unsuccessful, clean up '
'failed for node: %(error)s') % {'error': e})
node_history_record(task.node, event=error, event_type=states.RESCUE,
error=True)
LOG.error(('Rescue operation was unsuccessful, clean up failed for '
'node %(node)s: %(error)s'),
{'node': node.uuid, 'error': e})
except Exception as e:
node.last_error = (_('Rescue failed, but an unhandled exception was '
'encountered while aborting: %(error)s') %
{'error': e})
error = (_('Rescue failed, but an unhandled exception was '
'encountered while aborting: %(error)s') %
{'error': e})
node_history_record(task.node, event=error, event_type=states.RESCUE,
error=True)
LOG.exception('Rescue failed for node %(node)s, an exception was '
'encountered while aborting.', {'node': node.uuid})
finally:
@@ -708,7 +723,9 @@ def _spawn_error_handler(e, node, operation):
:param operation: the operation being performed on the node.
"""
if isinstance(e, exception.NoFreeConductorWorker):
node.last_error = (_("No free conductor workers available"))
error = (_("No free conductor workers available"))
node_history_record(node, event=error, event_type=states.CONDUCTOR,
error=True)
node.save()
LOG.warning("No free conductor workers available to perform "
"%(operation)s on node %(node)s",
@@ -749,7 +766,9 @@ def power_state_error_handler(e, node, power_state):
if isinstance(e, exception.NoFreeConductorWorker):
node.power_state = power_state
node.target_power_state = states.NOSTATE
node.last_error = (_("No free conductor workers available"))
error = (_("No free conductor workers available"))
node_history_record(node, event=error, event_type=states.CONDUCTOR,
error=True)
node.save()
LOG.warning("No free conductor workers available to perform "
"an action on node %(node)s, setting node's "
@@ -1573,3 +1592,58 @@ def node_change_secure_boot(task, secure_boot_target):
{'state': secure_boot_target, 'node': task.node.uuid})
task.node.secure_boot = secure_boot_target
task.node.save()


def node_history_record(node, conductor=None, event=None,
event_type=None, user=None,
error=False):
"""Records a node history record

Adds an entry to the node history table with the appropriate fields
populated to ensure consistent experience by also updating the
node ``last_error`` field. Please note the event is only recorded
if the ``[conductor]node_history_max_size`` parameter is set to a
value greater than ``0``.

:param node: A node object from a task object. Required.
:param conductor: The hostname of the conductor. If not specified
this value is populated with the conductor FQDN.
:param event: The text to record to the node history table.
If no value is supplied, the method silently returns
to the caller.
:param event_type: The type activity where the event was encountered,
either "provisioning", "monitoring", "cleaning",
or whatever text the a driver author wishes to supply
based upon the activity. The purpose is to help guide
an API consumer/operator to have a better contextual
understanding of what was going on *when* the "event"
occured.
:param user: The user_id value which triggered the request,
if available.
:param error: Boolean value, default false, to signify if the event
is an error which should be recorded in the node
``last_error`` field.
:returns: None. No value is returned by this method.
"""
if not event:
# No error has occured, apparently.
return
if error:
# When the task exits out or is saved, the event
# or error is saved, but that is outside of ceating an
# entry in the history table.
node.last_error = event
if not conductor:
conductor = CONF.host
if CONF.conductor.node_history:
# If the maximum number of entries is not set to zero,
# then we should record the entry.
# NOTE(TheJulia): DB API automatically adds in a uuid.
# TODO(TheJulia): At some point, we should allow custom severity.
node_history.NodeHistory(
node_id=node.id,
conductor=CONF.host,
user=user,
severity=error and "ERROR" or "INFO",
event=event,
event_type=event_type or "UNKNOWN").create()
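A hypothetical usage sketch of the helper defined above (not part of the commit): with ``error=False`` the node's ``last_error`` is left untouched and the row is stored with INFO severity, the same pattern the rescue-abort path uses earlier in this change.

from ironic.common import states
from ironic.conductor import utils


def record_rescue_abort(task):
    utils.node_history_record(task.node,
                              event='By request, the rescue operation '
                                    'was aborted.',
                              event_type=states.RESCUE,
                              error=False,
                              user=task.context.user_id)
    # Persisting the node object remains the caller's responsibility.
    task.node.save()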
@@ -16,6 +16,7 @@ from oslo_log import log

from ironic.common import exception
from ironic.common.i18n import _
from ironic.common import states
from ironic.conductor import notification_utils as notify_utils
from ironic.conductor import task_manager
from ironic.conductor import utils
@@ -67,5 +68,8 @@ def do_node_verify(task):
else:
task.process_event('done')
else:
node.last_error = error
utils.node_history_record(task.node, event=error,
event_type=states.VERIFY,
error=True,
user=task.context.user_id)
task.process_event('fail')
@@ -283,6 +283,57 @@ opts = [
'will not run during cleaning. If unset for an '
'inband clean step, will use the priority set in the '
'ramdisk.')),
cfg.BoolOpt('node_history',
default=True,
mutable=True,
help=_('Boolean value, default True, if node event history '
'is to be recorded. Errors and other noteworthy '
'events in relation to a node are journaled to a '
'database table which incurs some additional load. '
'A periodic task does periodically remove entries '
'from the database. Please note, if this is disabled, '
'the conductor will continue to purge entries as '
'long as [conductor]node_history_cleanup_batch_count '
'is not 0.')),
cfg.IntOpt('node_history_max_entries',
default=300,
min=0,
mutable=True,
help=_('Maximum number of history entries which will be stored '
'in the database per node. Default is 300. This setting '
'excludes the minimum number of days retained using the '
'[conductor]node_history_minimum_days setting.')),
cfg.IntOpt('node_history_cleanup_interval',
min=0,
default=86400,
mutable=False,
help=_('Interval in seconds at which node history entries '
'can be cleaned up in the database. Setting to 0 '
'disables the periodic task. Defaults to once a day, '
'or 86400 seconds.')),
cfg.IntOpt('node_history_cleanup_batch_count',
min=0,
default=1000,
mutable=False,
help=_('The target number of node history records to purge '
'from the database when performing clean-up. '
'Deletes are performed by node, and a node with excess '
'records for a node will still be deleted. '
'Defaults to 1000. Operators who find node history '
'building up may wish to '
'lower this threshold and decrease the time between '
'cleanup operations using the '
'``node_history_cleanup_interval`` setting.')),
cfg.IntOpt('node_history_minimum_days',
min=0,
default=0,
mutable=True,
help=_('The minimum number of days to explicitly keep on '
'hand in the database history entries for nodes. '
'This is exclusive from the [conductor]'
'node_history_max_entries setting as users of '
'this setting are anticipated to need to retain '
'history by policy.')),
]
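A hypothetical sketch (not part of the commit) of how these options interact, using the same ``CONF.set_override`` style as the unit tests later in this change; as the periodic decorator in the conductor manager hunk above shows, cleanup only runs while the batch count is greater than zero and ``node_history_max_entries`` is non-zero.

from ironic.conf import CONF

# Keep recording enabled, retain up to 300 entries per node, and purge
# once a day in batches of at most 1000 records.
CONF.set_override('node_history', True, group='conductor')
CONF.set_override('node_history_max_entries', 300, group='conductor')
CONF.set_override('node_history_cleanup_interval', 86400, group='conductor')
CONF.set_override('node_history_cleanup_batch_count', 1000,
                  group='conductor')

cleanup_enabled = (CONF.conductor.node_history_cleanup_batch_count > 0
                   and CONF.conductor.node_history_max_entries != 0)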
@@ -1380,3 +1380,21 @@ class Connection(object, metaclass=abc.ABCMeta):
(asc, desc)
:returns: A list of histories.
"""

@abc.abstractmethod
def query_node_history_records_for_purge(self, conductor_id):
"""Utility method to identify nodes to clean history records for.

:param conductor_id: Id value for the conductor to perform this
query on behalf of.
:returns: A dictionary with key values of node database ID values
and a list of values associated with the node.
"""

@abc.abstractmethod
def bulk_delete_node_history_records(self, node_id, limit):
"""Utility method to bulk delete node history entries.

:param entires: A list of node history entriy id's to be
queried for deletion.
"""
@@ -2329,3 +2329,74 @@ class Connection(api.Connection):
query = query.filter_by(node_id=node_id)
return _paginate_query(models.NodeHistory, limit, marker,
sort_key, sort_dir, query)

def query_node_history_records_for_purge(self, conductor_id):
min_days = CONF.conductor.node_history_minimum_days
max_num = CONF.conductor.node_history_max_entries

with _session_for_read() as session:
# First, figure out our nodes.
nodes = session.query(
models.Node.id,
).filter(
models.Node.conductor_affinity == conductor_id
)

# Build our query to get the node_id and record id.
query = session.query(
models.NodeHistory.node_id,
models.NodeHistory.id,
)

# Filter by the nodes
query = query.filter(
models.NodeHistory.node_id.in_(nodes)
).order_by(
# Order in an ascending order as older is always first.
models.NodeHistory.created_at.asc()
)

# Filter by minimum days
if min_days > 0:
before = datetime.datetime.now() - datetime.timedelta(
days=min_days)
query = query.filter(
models.NodeHistory.created_at < before
)

# Build our result set
result_set = {}
for entry in query.all():
if entry[0] not in result_set:
result_set[entry[0]] = []
result_set[entry[0]].append(entry[1])

final_set = {}
# Generate our final set of entries which should be removed
# by accounting for the number of permitted entries.
for entry in result_set:
final_set[entry] = []
set_len = len(result_set[entry])
# Any count <= the maximum number is okay
if set_len > max_num:
# figure out how many entries need to be removed
num_to_remove = set_len - max_num
for i in range(0, num_to_remove):
final_set[entry].append(result_set[entry][i])
# remove the entries at the end of the list
# which will be the more recent items as we
# ordered ascending originally.
print('returning final set')
print(final_set)
return final_set

def bulk_delete_node_history_records(self, entries):
with _session_for_write() as session:
# Uses input entry list, selects entries matching those ids
# then deletes them and does not synchronize the session so
# sqlalchemy doesn't do extra un-necessary work.
session.query(
models.NodeHistory
).filter(
models.NodeHistory.id.in_(entries)
).delete(synchronize_session=False)
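The trimming logic in ``query_node_history_records_for_purge()`` above orders each node's history oldest-first and, when a node holds more than ``max_num`` rows, marks only the leading (oldest) surplus for deletion. A hypothetical, pure-Python illustration of that arithmetic (not part of the commit):

def ids_to_purge(ordered_ids, max_num):
    """Given ids ordered oldest-first, return the surplus oldest ids."""
    set_len = len(ordered_ids)
    if set_len <= max_num:
        return []
    # Keep the newest max_num entries; purge everything older than them.
    return ordered_ids[:set_len - max_num]


# Five records with a two-entry limit: the three oldest ids are purged.
print(ids_to_purge([101, 102, 103, 104, 105], max_num=2))  # [101, 102, 103]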
@@ -542,6 +542,12 @@ class StartConsolesTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
fields.NotificationStatus.START),
mock.call(mock.ANY, 'console_restore',
fields.NotificationStatus.ERROR)])
history = objects.NodeHistory.list_by_node_id(self.context,
test_node.id)
entry = history[0]
self.assertEqual('startup failure', entry['event_type'])
self.assertEqual('ERROR', entry['severity'])
self.assertIsNotNone(entry['event'])

@mock.patch.object(base_manager, 'LOG', autospec=True)
def test__start_consoles_node_locked(self, log_mock, mock_notify,
@@ -599,3 +605,9 @@ class MiscTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
self.service._fail_transient_state(states.DEPLOYING, 'unknown err')
node.refresh()
self.assertEqual(states.DEPLOYFAIL, node.provision_state)
history = objects.NodeHistory.list_by_node_id(self.context,
node.id)
entry = history[0]
self.assertEqual('transition', entry['event_type'])
self.assertEqual('ERROR', entry['severity'])
self.assertEqual('unknown err', entry['event'])
@@ -3361,7 +3361,7 @@ class DoNodeRescueTestCase(mgr_utils.CommonMixIn, mgr_utils.ServiceSetUpMixin,
with task_manager.acquire(self.context, node.uuid) as task:
self.service._do_node_rescue_abort(task)
clean_up_mock.assert_called_once_with(task.driver.rescue, task)
self.assertIsNotNone(task.node.last_error)
self.assertIsNone(task.node.last_error)
self.assertFalse(task.node.maintenance)
self.assertNotIn('agent_url', task.node.driver_internal_info)
@@ -8153,3 +8153,166 @@ class DoNodeInspectAbortTestCase(mgr_utils.CommonMixIn,
self.assertIn('Inspection was aborted', node.last_error)
self.assertNotIn('agent_url', node.driver_internal_info)
self.assertNotIn('agent_secret_token', node.driver_internal_info)


class NodeHistoryRecordCleanupTestCase(mgr_utils.ServiceSetUpMixin,
db_base.DbTestCase):

def setUp(self):
super(NodeHistoryRecordCleanupTestCase, self).setUp()
self.node1 = obj_utils.get_test_node(self.context,
driver='fake-hardware',
id=10,
uuid=uuidutils.generate_uuid(),
conductor_affinity=1)
self.node2 = obj_utils.get_test_node(self.context,
driver='fake-hardware',
id=11,
uuid=uuidutils.generate_uuid(),
conductor_affinity=1)
self.node3 = obj_utils.get_test_node(self.context,
driver='fake-hardware',
id=12,
uuid=uuidutils.generate_uuid(),
conductor_affinity=1)
self.nodes = [self.node1, self.node2, self.node3]
# Create the nodes, as the tasks need to operate across tables.
self.node1.create()
self.node2.create()
self.node3.create()
CONF.set_override('node_history_max_entries', 2, group='conductor')
CONF.set_override('node_history_cleanup_batch_count', 2,
group='conductor')
self._start_service()

def test_history_is_pruned_to_config(self):
for node in self.nodes:
for event in ['one', 'two', 'three']:
conductor_utils.node_history_record(node, event=event)
conductor_utils.node_history_record(self.node1, event="final")
self.service._manage_node_history(self.context)
events = objects.NodeHistory.list(self.context)
self.assertEqual(8, len(events))
events = objects.NodeHistory.list_by_node_id(self.context, 10)
self.assertEqual('three', events[0].event)
self.assertEqual('final', events[1].event)
self.assertEqual(2, len(events))
events = objects.NodeHistory.list_by_node_id(self.context, 11)
self.assertEqual(3, len(events))
self.assertEqual('one', events[0].event)
self.assertEqual('two', events[1].event)
self.assertEqual('three', events[2].event)
events = objects.NodeHistory.list_by_node_id(self.context, 12)
self.assertEqual(3, len(events))
self.assertEqual('one', events[0].event)
self.assertEqual('two', events[1].event)
self.assertEqual('three', events[2].event)

def test_history_is_pruned_to_config_two_pass(self):
for node in self.nodes:
for event in ['one', 'two', 'three']:
conductor_utils.node_history_record(node, event=event)
conductor_utils.node_history_record(self.node1, event="final")
with mock.patch.object(manager.LOG, 'warning',
autospec=True) as mock_log:
self.service._manage_node_history(self.context)
# assert we did call a warning entry
self.assertEqual(1, mock_log.call_count)
events = objects.NodeHistory.list(self.context)
self.assertEqual(8, len(events))
self.service._manage_node_history(self.context)
events = objects.NodeHistory.list(self.context)
print(events)
self.assertEqual(6, len(events))
events = objects.NodeHistory.list_by_node_id(self.context, 10)
self.assertEqual(2, len(events))
self.assertEqual('three', events[0].event)
self.assertEqual('final', events[1].event)
events = objects.NodeHistory.list_by_node_id(self.context, 11)
self.assertEqual(2, len(events))
self.assertEqual('two', events[0].event)
self.assertEqual('three', events[1].event)
events = objects.NodeHistory.list_by_node_id(self.context, 12)
self.assertEqual(2, len(events))
self.assertEqual('two', events[0].event)
self.assertEqual('three', events[1].event)

def test_history_is_pruned_from_all_nodes_one_pass(self):
CONF.set_override('node_history_cleanup_batch_count', 15,
group='conductor')
for node in self.nodes:
for event in ['one', 'two', 'three']:
conductor_utils.node_history_record(node, event=event)
self.service._manage_node_history(self.context)
events = objects.NodeHistory.list(self.context)
self.assertEqual(6, len(events))

def test_history_pruning_no_work(self):
conductor_utils.node_history_record(self.node1, event='meow')
with mock.patch.object(self.dbapi,
'bulk_delete_node_history_records',
autospec=True) as mock_delete:
self.service._manage_node_history(self.context)
mock_delete.assert_not_called()
events = objects.NodeHistory.list(self.context)
self.assertEqual(1, len(events))

def test_history_pruning_not_other_conductor(self):
node = obj_utils.get_test_node(self.context,
driver='fake-hardware',
id=33,
uuid=uuidutils.generate_uuid(),
conductor_affinity=2)
# create node so it can be queried
node.create()
for i in range(0, 3):
conductor_utils.node_history_record(node, event='meow%s' % i)
with mock.patch.object(self.dbapi,
'bulk_delete_node_history_records',
autospec=True) as mock_delete:
self.service._manage_node_history(self.context)
mock_delete.assert_not_called()
events = objects.NodeHistory.list(self.context)
self.assertEqual(3, len(events))
self.assertEqual('meow0', events[0].event)

def test_history_is_pruned_to_config_with_days(self):
CONF.set_override('node_history_cleanup_batch_count', 15,
group='conductor')
CONF.set_override('node_history_minimum_days', 1,
group='conductor')
CONF.set_override('node_history_max_entries', 1, group='conductor')

old_date = datetime.datetime.now() - datetime.timedelta(days=7)
# Creates 18 entries
for node in self.nodes:
for event in ['oldone', 'oldtwo', 'oldthree']:
objects.NodeHistory(created_at=old_date,
event=event,
node_id=node.id).create()
for event in ['one', 'two', 'three']:
conductor_utils.node_history_record(node, event=event)

# 9 retained due to days, 3 to config
self.service._manage_node_history(self.context)
events = objects.NodeHistory.list(self.context)
print(events)
self.assertEqual(12, len(events))
events = objects.NodeHistory.list_by_node_id(self.context, 10)
self.assertEqual(4, len(events))
self.assertEqual('oldthree', events[0].event)
self.assertEqual('one', events[1].event)
self.assertEqual('two', events[2].event)
self.assertEqual('three', events[3].event)
events = objects.NodeHistory.list_by_node_id(self.context, 11)
self.assertEqual(4, len(events))
self.assertEqual('oldthree', events[0].event)
self.assertEqual('one', events[1].event)
self.assertEqual('two', events[2].event)
self.assertEqual('three', events[3].event)
events = objects.NodeHistory.list_by_node_id(self.context, 12)
self.assertEqual(4, len(events))
self.assertEqual('oldthree', events[0].event)
self.assertEqual('one', events[1].event)
self.assertEqual('two', events[2].event)
self.assertEqual('three', events[3].event)
@@ -959,7 +959,7 @@ class NodeSoftPowerActionTestCase(db_base.DbTestCase):
self.assertIsNone(node['last_error'])


class DeployingErrorHandlerTestCase(tests_base.TestCase):
class DeployingErrorHandlerTestCase(db_base.DbTestCase):
def setUp(self):
super(DeployingErrorHandlerTestCase, self).setUp()
self.task = mock.Mock(spec=task_manager.TaskManager)
@@ -972,6 +972,8 @@ class DeployingErrorHandlerTestCase(tests_base.TestCase):
self.node.last_error = None
self.node.deploy_step = None
self.node.driver_internal_info = {}
self.node.id = obj_utils.create_test_node(self.context,
driver='fake-hardware').id
self.logmsg = "log message"
self.errmsg = "err message"
|
||||
self.assertEqual({}, self.node.deploy_step)
|
||||
self.assertNotIn('deploy_step_index', self.node.driver_internal_info)
|
||||
self.task.process_event.assert_called_once_with('fail')
|
||||
self.assertIsNotNone(self.node.last_error)
|
||||
|
||||
def test_deploying_error_handler_cleanup_ironic_exception(self):
|
||||
self._test_deploying_error_handler_cleanup(
|
||||
@@ -1058,7 +1061,7 @@ class DeployingErrorHandlerTestCase(tests_base.TestCase):
self.task.process_event.assert_called_once_with('fail')


class ErrorHandlersTestCase(tests_base.TestCase):
class ErrorHandlersTestCase(db_base.DbTestCase):
def setUp(self):
super(ErrorHandlersTestCase, self).setUp()
self.task = mock.Mock(spec=task_manager.TaskManager)
@@ -1070,9 +1073,13 @@ class ErrorHandlersTestCase(tests_base.TestCase):
# strict typing of the node power state fields and would fail if passed
# a Mock object in constructors. A task context is also required for
# notifications.
fake_node = obj_utils.create_test_node(self.context,
driver='fake-hardware')

self.node.configure_mock(power_state=states.POWER_OFF,
target_power_state=states.POWER_ON,
maintenance=False, maintenance_reason=None)
maintenance=False, maintenance_reason=None,
id=fake_node.id)
self.task.context = self.context

@mock.patch.object(conductor_utils, 'LOG', autospec=True)
@@ -1401,6 +1408,7 @@ class ErrorHandlersTestCase(tests_base.TestCase):
'%(node)s, an exception was '
'encountered while aborting.',
{'node': self.node.uuid})
self.assertIsNotNone(self.node.last_error)
self.node.save.assert_called_once_with()

@mock.patch.object(conductor_utils.LOG, 'error', autospec=True)
@@ -2562,3 +2570,60 @@ class GetConfigDriveImageTestCase(db_base.DbTestCase):
network_data=None,
user_data=b'{"user": "data"}',
vendor_data=None)


class NodeHistoryRecordTestCase(db_base.DbTestCase):

def setUp(self):
super(NodeHistoryRecordTestCase, self).setUp()
self.node = obj_utils.create_test_node(
self.context,
uuid=uuidutils.generate_uuid())

def test_record_node_history(self):
conductor_utils.node_history_record(self.node, event='meow')
entries = objects.NodeHistory.list_by_node_id(self.context,
self.node.id)
entry = entries[0]
self.assertEqual('meow', entry['event'])
self.assertEqual(CONF.host, entry['conductor'])
self.assertEqual('INFO', entry['severity'])
self.assertIsNone(entry['user'])

def test_record_node_history_with_user(self):
conductor_utils.node_history_record(self.node, event='meow',
user='peachesthecat')
entries = objects.NodeHistory.list_by_node_id(self.context,
self.node.id)
entry = entries[0]
self.assertEqual('meow', entry['event'])
self.assertEqual(CONF.host, entry['conductor'])
self.assertEqual('peachesthecat', entry['user'])

def test_record_node_history_with_error_severity(self):
conductor_utils.node_history_record(self.node, event='meowmeow',
error=True,
event_type='catwantfood')
entries = objects.NodeHistory.list_by_node_id(self.context,
self.node.id)
entry = entries[0]
self.assertEqual('meowmeow', entry['event'])
self.assertEqual(CONF.host, entry['conductor'])
self.assertEqual('ERROR', entry['severity'])
self.assertEqual('catwantfood', entry['event_type'])

@mock.patch.object(objects, 'NodeHistory', autospec=True)
def test_record_node_history_noop(self, mock_history):
CONF.set_override('node_history', False, group='conductor')
self.assertIsNone(conductor_utils.node_history_record(self.node))
mock_history.assert_not_called()

@mock.patch.object(objects, 'NodeHistory', autospec=True)
def test_record_node_history_disaled(self, mock_history):
mock_create = mock.Mock()
conductor_utils.node_history_record(self.node, event='meow',
error=True)
self.assertEqual('meow', self.node.last_error)
mock_history.create = mock_create
mock_history.assert_not_called()
mock_create.assert_not_called()
@@ -27,6 +27,7 @@ from ironic.common import states
from ironic.conductor import task_manager
from ironic.conductor import utils as conductor_utils
from ironic.conductor import verify
from ironic import objects
from ironic.tests.unit.conductor import mgr_utils
from ironic.tests.unit.db import base as db_base
from ironic.tests.unit.objects import utils as obj_utils
@@ -138,6 +139,11 @@ class DoNodeVerifyTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
self.assertEqual(states.ENROLL, node.provision_state)
self.assertIsNone(node.target_provision_state)
self.assertTrue(node.last_error)
history = objects.NodeHistory.list_by_node_id(self.context,
node.id)
entry = history[0]
self.assertEqual('verify', entry['event_type'])
self.assertEqual('ERROR', entry['severity'])

@mock.patch.object(conductor_utils, 'LOG', autospec=True)
@mock.patch('ironic.drivers.modules.fake.FakePower.validate',