Support extraction of ironic internal metrics

Adds support for parsing messages from the ironic code base which
include metrics related to method call counts/timers, and ultimately
items like "how many nodes were in this state".

This is in order to provide operators with greater insight into the
inner workings and performance of the ironic services.

Depends-On: https://review.opendev.org/c/openstack/ironic/+/865447
Change-Id: Ie4f3b2d3c7c7cbab1a0e03d8c7f0961d38a7d2c0
This commit is contained in:
Julia Kreger 2023-01-06 12:40:56 -08:00
parent 563c0f1d39
commit 55cd446b3c
8 changed files with 706 additions and 15 deletions

View File

@ -23,9 +23,10 @@ function install_ironic_prometheus_exporter {
function configure_ironic_prometheus_exporter { function configure_ironic_prometheus_exporter {
# Update ironic configuration file to use the exporter # Update ironic configuration file to use the exporter
iniset $IRONIC_CONF_FILE conductor send_sensor_data true iniset $IRONIC_CONF_FILE sensor_data send_sensor_data true
iniset $IRONIC_CONF_FILE conductor send_sensor_data_for_undeployed_nodes $COLLECT_DATA_UNDEPLOYED_NODES iniset $IRONIC_CONF_FILE sensor_data enable_for_undeployed_nodes $COLLECT_DATA_UNDEPLOYED_NODES
iniset $IRONIC_CONF_FILE conductor send_sensor_data_interval 90 iniset $IRONIC_CONF_FILE sensor_data interval 90
iniset $IRONIC_CONF_FILE metrics backend collector
iniset $IRONIC_CONF_FILE oslo_messaging_notifications driver prometheus_exporter iniset $IRONIC_CONF_FILE oslo_messaging_notifications driver prometheus_exporter
iniset $IRONIC_CONF_FILE oslo_messaging_notifications transport_url fake:// iniset $IRONIC_CONF_FILE oslo_messaging_notifications transport_url fake://
iniset $IRONIC_CONF_FILE oslo_messaging_notifications location $IRONIC_PROMETHEUS_EXPORTER_LOCATION iniset $IRONIC_CONF_FILE oslo_messaging_notifications location $IRONIC_PROMETHEUS_EXPORTER_LOCATION
@ -66,7 +67,7 @@ function cleanup_ironic_prometheus_exporter {
} }
function wait_for_data { function wait_for_data {
# Sleep for more than the [conductor]send_sensor_data_interval value # Sleep for more than the [sensor_data]interval value
# to verify if we can get data from the baremetal # to verify if we can get data from the baremetal
# FIXME(iurygregory): Add some logic to verify if the data already exists # FIXME(iurygregory): Add some logic to verify if the data already exists
sleep 240 sleep 240
@ -84,6 +85,13 @@ function check_data {
else else
die $LINENO "Couldn't find $node_file in $IRONIC_PROMETHEUS_EXPORTER_LOCATION" die $LINENO "Couldn't find $node_file in $IRONIC_PROMETHEUS_EXPORTER_LOCATION"
fi fi
local stats_file="$(hostname)-ironic.metrics"
if [ -f "$IRONIC_PROMETHEUS_EXPORTER_LOCATION/$stats_file" ]; then
echo "#### Metrics data ####"
curl "http://$HOST_IP:$IRONIC_PROMETHEUS_EXPORTER_PORT/metrics"
else
die $LINENO "Could not find $stats_file in $IRONIC_PROMETHEUS_EXPORTER_LOCATION"
fi
} }
echo_summary "ironic-prometheus-exporter devstack plugin.sh called: $1/$2" echo_summary "ironic-prometheus-exporter devstack plugin.sh called: $1/$2"

View File

@ -20,6 +20,7 @@ from prometheus_client import write_to_textfile
from ironic_prometheus_exporter.parsers import header from ironic_prometheus_exporter.parsers import header
from ironic_prometheus_exporter.parsers import ipmi from ironic_prometheus_exporter.parsers import ipmi
from ironic_prometheus_exporter.parsers import ironic as ironic_parser
from ironic_prometheus_exporter.parsers import redfish from ironic_prometheus_exporter.parsers import redfish
@ -48,25 +49,39 @@ class PrometheusFileDriver(notifier.Driver):
def notify(self, ctxt, message, priority, retry): def notify(self, ctxt, message, priority, retry):
try: try:
registry = CollectorRegistry() registry = CollectorRegistry()
event_type = message['event_type'] event_type = message['event_type']
node_message = message['payload'] payload = message['payload']
header.timestamp_registry(node_message, registry) if event_type == 'ironic.metrics':
# We know this message payload is from a conductor itself
# and not for node drivers.
header.timestamp_conductor_registry(payload, registry)
ironic_parser.category_registry(payload, registry)
if event_type == 'hardware.ipmi.metrics': else:
ipmi.category_registry(node_message, registry) header.timestamp_registry(payload, registry)
if event_type == 'hardware.ipmi.metrics':
ipmi.category_registry(payload, registry)
elif event_type == 'hardware.redfish.metrics': elif event_type == 'hardware.redfish.metrics':
redfish.category_registry(node_message, registry) redfish.category_registry(payload, registry)
field = (node_message.get('node_name') or # Order of preference is for a node Name, UUID, or
node_message.get('node_uuid')) # payload hostname field to be used (i.e. for conductor
nodeFile = os.path.join( # message payloads).
field = (
payload.get('node_name') or
payload.get('node_uuid') or
payload.get('hostname')
)
statFile = os.path.join(
self.location, field + '-' + event_type) self.location, field + '-' + event_type)
write_to_textfile(nodeFile, registry)
# Writes to file for server pickup
write_to_textfile(statFile, registry)
except Exception as e: except Exception as e:
LOG.error(e) LOG.error(e)
raise
class SimpleFileDriver(notifier.Driver): class SimpleFileDriver(notifier.Driver):

View File

@ -19,6 +19,7 @@ from ironic_prometheus_exporter import utils as ipe_utils
def timestamp_registry(node_information, metric_registry): def timestamp_registry(node_information, metric_registry):
"""Injects a last updated timestamp for a node."""
metric = 'baremetal_last_payload_timestamp_seconds' metric = 'baremetal_last_payload_timestamp_seconds'
labels = {'node_uuid': node_information['node_uuid'], labels = {'node_uuid': node_information['node_uuid'],
'instance_uuid': node_information['instance_uuid']} 'instance_uuid': node_information['instance_uuid']}
@ -37,3 +38,21 @@ def timestamp_registry(node_information, metric_registry):
valid_labels = ipe_utils.update_instance_uuid(labels) valid_labels = ipe_utils.update_instance_uuid(labels)
g.labels(**valid_labels).set(value) g.labels(**valid_labels).set(value)
def timestamp_conductor_registry(payload, metric_registry):
    """Inject a last-updated timestamp gauge for a conductor.

    :param payload: Conductor notification payload. It must provide
                    ``hostname`` and a ``timestamp`` string formatted as
                    ``%Y-%m-%dT%H:%M:%S.%f``.
    :param metric_registry: Prometheus registry the gauge is added to.
    :raises KeyError: if ``hostname`` or ``timestamp`` is missing.
    :raises ValueError: if ``timestamp`` does not match the format above.
    """
    metric = 'conductor_service_last_payload_timestamp_seconds'
    labels = {'hostname': payload['hostname']}

    # Whole seconds elapsed between 1970-01-01 and the payload timestamp.
    dt_1970 = datetime(1970, 1, 1, 0, 0, 0)
    dt_timestamp = datetime.strptime(payload['timestamp'],
                                     '%Y-%m-%dT%H:%M:%S.%f')
    value = int((dt_timestamp - dt_1970).total_seconds())

    desc = descriptions.get_metric_description('header', metric)

    g = Gauge(
        metric, desc, labelnames=list(labels),
        registry=metric_registry)
    # Label values must be passed by keyword. Passing the dict positionally
    # would use the dict's string form as the value of the hostname label.
    g.labels(**labels).set(value)

View File

@ -0,0 +1,177 @@
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import logging
from prometheus_client import Gauge
LOG = logging.getLogger(__name__)
# Driver names searched for in a metric key to fill the ``driver`` label.
# NOTE(TheJulia): WRT drac, technically this should be idrac, since Dell's
# driver name doesn't match the code classpath.
_DRIVER_LABELS = ('ipmi', 'redfish', 'agent', 'pxe', 'ilo', 'drac', 'irmc',
                  'inspector', 'ansible', 'ibmc', 'xclarity')

# Structural folder names in the driver tree, which vary from driver to
# driver and add nothing to the metric name.
_DRIVER_DIRS = ('redfish', 'ipmi', 'network', 'storage', 'drac', 'ilo',
                'irmc', 'intel_ipmi', 'ansible', 'ibmc', 'xclarity')

# File (module) names housing the driver classes. This is extraneous ironic
# internal structure, not the actual method or class names.
_DRIVER_FILENAMES = ('boot', 'raid', 'power', 'bios', 'inspect',
                     'management', 'agent_base', 'agent_client', 'agent',
                     'deploy_utils', 'deploy', 'ipmitool', 'pxe_base', 'pxe',
                     'ramdisk', 'vendor_passthru', 'vendor')

# Conductor module names removed from conductor metric names.
_CONDUCTOR_FILENAMES = ('manager', 'deployments', 'allocations')


def _strip_first_match(name, candidates):
    """Remove the ``.<candidate>.`` segment of the first candidate found.

    Candidates are matched as plain substrings, in order, and the search
    stops at the first one present in ``name``. If that candidate is not a
    complete dotted segment, nothing is removed.
    """
    for candidate in candidates:
        if candidate in name:
            return name.replace(f'.{candidate}.', '.')
    return name


def _set_gauge(name, documentation, labels, value, metrics_registry):
    """Register a gauge called ``name`` and set its single labelled value."""
    metric = Gauge(name, documentation,
                   labelnames=list(labels),
                   registry=metrics_registry)
    metric.labels(**labels).set(value)


def category_registry(message, metrics_registry):
    """Parse ironic metrics and submit them to Prometheus

    :param message: Payload of the ironic metrics notification, holding the
                    conductor ``hostname`` and a nested ``payload`` dict
                    mapping metric keys to their recorded data.
    :param metrics_registry: Prometheus registry
    :raises KeyError: if a metric entry lacks ``type`` or the value fields
                      required for its type.
    """
    hostname = message.get('hostname')
    payload = message.get('payload') or {}
    service = 'ironic'

    for key, value in payload.items():
        metric_type = value['type']
        labels = {'hostname': hostname,
                  'service': service}

        if key.startswith('ironic.api'):
            # This is only *really* to be expected in a combined single
            # process mode, or if someone is using the exporter coupled
            # with the API service itself.
            formatted_key = key.replace(
                'ironic.api.controllers.',
                'ironic_rest_api_')
            labels['component'] = 'api'
        elif key.startswith('ironic.drivers.modules'):
            # Deconstruct driver entries/counters to be more sane and attach
            # labeling to them.
            # TODO(TheJulia): Once the minimum python version is 3.9, change
            # to str.removeprefix.
            formatted_key = key.replace(
                'ironic.drivers.modules.',
                'ironic.')
            # No early exit here: when several names occur in the key, the
            # last one listed is used as the label.
            driver = None
            for driver_label in _DRIVER_LABELS:
                if driver_label in key:
                    driver = driver_label
            formatted_key = _strip_first_match(formatted_key, _DRIVER_DIRS)
            formatted_key = _strip_first_match(formatted_key,
                                               _DRIVER_FILENAMES)
            labels['component'] = 'driver'
            labels['driver'] = driver
        elif key.startswith('ironic.conductor'):
            # Catches entries from:
            #  - ironic.conductor.manager
            #  - ironic.conductor.deployments
            labels['component'] = 'conductor'
            formatted_key = key
            for filename in _CONDUCTOR_FILENAMES:
                if filename in key:
                    formatted_key = key.replace(f'conductor.{filename}', '')
                    break
        else:
            # Keys outside the namespaces above, for example
            # "ConductorManager.FailingPowerSyncCount", are named after
            # themselves under the ironic prefix. Without this branch the
            # name would be undefined, or left over from the previous key.
            if key.startswith('ironic.'):
                formatted_key = key
            else:
                formatted_key = f'ironic.{key}'

        # Prometheus does not use dot delimited data structures
        # so we need to rename it to be underscore delimited.
        # Private methods carrying metrics decorators introduce double
        # underscores, which are collapsed.
        # i.e.
        # In: ironic.conductor.manager.ConductorManager._sync_power_states
        # Out: ironic_conductormanager_sync_power_states
        formatted_key = formatted_key.replace('.', '_')
        formatted_key = formatted_key.replace('__', '_')
        formatted_key = formatted_key.lower()
        LOG.debug('Creating metric %s using %s.', key, formatted_key)

        # Always process timer first. The bulk of our Metrics in Ironic
        # are timer counters.
        if metric_type == 'timer':
            # NOTE(TheJulia): So this doesn't use the prometheus_client
            # histogram format as it requires the existence of sample buckets
            # inside of its data structure or the entry of individual
            # instances into the running history.
            # Instead, we return a sum gauge and a count gauge. The reason
            # it is not two counter values is that each counter value in
            # prometheus_client gets a _created child sample, which creates
            # a lot of confusion.
            _set_gauge(formatted_key + '_time', 'Total time (ms) spent.',
                       labels, value['sum'], metrics_registry)
            _set_gauge(formatted_key + '_call_count',
                       'Sum of calls recorded.',
                       labels, value['count'], metrics_registry)
            LOG.debug('Details of the metric %s with labels %s, '
                      'sum: %s, count: %s', formatted_key, labels,
                      value['sum'], value['count'])
        elif metric_type == 'gauge':
            _set_gauge(formatted_key, 'Point in time count of data point.',
                       labels, value['value'], metrics_registry)
            LOG.debug('Details of the metric %s with labels %s, value: %s',
                      formatted_key, labels, value['value'])
        elif metric_type == 'counter':
            # NOTE(TheJulia): We use a gauge instead of a counter because
            # the prometheus client library automatically renames our value
            # by adding _total to it, and adds a _created child sample value
            # which is just the time. Unfortunately the latter is just noise.
            # Prometheus_client also doesn't expose a method to set a
            # counter value directly.
            _set_gauge(formatted_key,
                       'Counter representing the method or data point.',
                       labels, value['count'], metrics_registry)
            LOG.debug('Details of the metric %s with labels %s, value: %s',
                      formatted_key, labels, value['count'])
        else:
            LOG.debug('Skipping metric %s with unsupported type %s.',
                      key, metric_type)

View File

@ -0,0 +1,261 @@
[
{
"name": "ironic_post_clean_step_hook_time",
"labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "driver", "driver": "agent"},
"value": 0.0050067901611328125,
"docs": "Total time (ms) spent.",
"type": "gauge"
},
{
"name": "ironic_post_clean_step_hook_call_count",
"labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "driver", "driver": "agent"},
"value": 3.0,
"docs": "Sum of calls recorded.",
"type": "gauge"
},
{
"name": "ironic_post_deploy_step_hook_time",
"labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "driver", "driver": "agent"},
"value": 0.010013580322265625,
"docs": "Total time (ms) spent.",
"type": "gauge"
},
{
"name": "ironic_post_deploy_step_hook_call_count",
"labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "driver", "driver": "agent"},
"value": 2.0,
"docs": "Sum of calls recorded.",
"type": "gauge"
},
{
"name": "ironic_conductormanager_check_inspect_wait_timeouts_time",
"labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "conductor"},
"value": 2007.3809623718262,
"docs": "Total time (ms) spent.",
"type": "gauge"
},
{
"name": "ironic_conductormanager_check_inspect_wait_timeouts_call_count",
"labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "conductor"},
"value": 214.0,
"docs": "Sum of calls recorded.",
"type": "gauge"
},
{
"name": "ironic_conductormanager_check_deploy_timeouts_time",
"labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "conductor"},
"value": 2200.153350830078,
"docs": "Total time (ms) spent.",
"type": "gauge"
},
{
"name": "ironic_conductormanager_check_deploy_timeouts_call_count",
"labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "conductor"},
"value": 214.0,
"docs": "Sum of calls recorded.",
"type": "gauge"
},
{
"name": "ironic_conductormanager_check_cleanwait_timeouts_time",
"labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "conductor"},
"value": 2424.9556064605713,
"docs": "Total time (ms) spent.",
"type": "gauge"
},
{
"name": "ironic_conductormanager_check_cleanwait_timeouts_call_count",
"labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "conductor"},
"value": 214.0,
"docs": "Sum of calls recorded.",
"type": "gauge"
},
{
"name": "ironic_conductormanager_check_rescuewait_timeouts_time",
"labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "conductor"},
"value": 1639.7652626037598,
"docs": "Total time (ms) spent.",
"type": "gauge"
},
{
"name": "ironic_conductormanager_check_rescuewait_timeouts_call_count",
"labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "conductor"},
"value": 214.0,
"docs": "Sum of calls recorded.",
"type": "gauge"
},
{
"name": "ironic_conductormanager_check_orphan_nodes_time",
"labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "conductor"},
"value": 1608.001708984375,
"docs": "Total time (ms) spent.",
"type": "gauge"
},
{
"name": "ironic_conductormanager_check_orphan_nodes_call_count",
"labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "conductor"},
"value": 214.0,
"docs": "Sum of calls recorded.",
"type": "gauge"
},
{
"name": "ironic_conductormanager_check_orphan_allocations_time",
"labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "conductor"},
"value": 1797.1274852752686,
"docs": "Total time (ms) spent.",
"type": "gauge"
},
{
"name": "ironic_conductormanager_check_orphan_allocations_call_count",
"labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "conductor"},
"value": 214.0,
"docs": "Sum of calls recorded.",
"type": "gauge"
},
{
"name": "ironic_redfishmanagement_query_firmware_update_status_time",
"labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "driver", "driver": "redfish"},
"value": 1598.8798141479492,
"docs": "Total time (ms) spent.",
"type": "gauge"
},
{
"name": "ironic_redfishmanagement_query_firmware_update_status_call_count",
"labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "driver", "driver": "redfish"},
"value": 214.0,
"docs": "Sum of calls recorded.",
"type": "gauge"
},
{
"name": "ironic_redfishmanagement_query_firmware_update_failed_time",
"labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "driver", "driver": "redfish"},
"value": 1955.6934833526611,
"docs": "Total time (ms) spent.",
"type": "gauge"
},
{
"name": "ironic_redfishmanagement_query_firmware_update_failed_call_count",
"labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "driver", "driver": "redfish"},
"value": 214.0,
"docs": "Sum of calls recorded.",
"type": "gauge"
},
{
"name": "ironic_do_sync_power_state_time",
"labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "conductor"},
"value": 16819.172620773315,
"docs": "Total time (ms) spent.",
"type": "gauge"
},
{
"name": "ironic_do_sync_power_state_call_count",
"labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "conductor"},
"value": 214.0,
"docs": "Sum of calls recorded.",
"type": "gauge"
},
{
"name": "ironic_do_sync_power_state",
"labels": {"hostname": "a-test-conductor", "service": "ironic"},
"value": 0.0,
"docs": "Point in time count of data point.",
"type": "gauge"
},
{
"name": "ironic_conductormanager_sync_power_states_time",
"labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "conductor"},
"value": 23187.243700027466,
"docs": "Total time (ms) spent.",
"type": "gauge"
},
{
"name": "ironic_conductormanager_sync_power_states_call_count",
"labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "conductor"},
"value": 214.0,
"docs": "Sum of calls recorded.",
"type": "gauge"
},
{
"name": "ironic_pxe_base_pxebasemixin_check_boot_timeouts_time",
"labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "driver", "driver": "pxe"},
"value": 1078.2177448272705,
"docs": "Total time (ms) spent.",
"type": "gauge"
},
{
"name": "ironic_pxe_base_pxebasemixin_check_boot_timeouts_call_count",
"labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "driver", "driver": "pxe"},
"value": 142.0,
"docs": "Sum of calls recorded.",
"type": "gauge"
},
{
"name": "ironic_conductormanager_sensors_nodes_task_time",
"labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "conductor"},
"value": 10648.590803146362,
"docs": "Total time (ms) spent.",
"type": "gauge"
},
{
"name": "ironic_conductormanager_sensors_nodes_task_call_count",
"labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "conductor"},
"value": 142.0,
"docs": "Sum of calls recorded.",
"type": "gauge"
},
{
"name": "ironic_conductormanager_send_sensor_data_time",
"labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "conductor"},
"value": 15062.613010406494,
"docs": "Total time (ms) spent.",
"type": "gauge"
},
{
"name": "ironic_conductormanager_send_sensor_data_call_count",
"labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "conductor"},
"value": 142.0,
"docs": "Sum of calls recorded.",
"type": "gauge"
},
{
"name": "ironic_conductormanager_sync_local_state_time",
"labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "conductor"},
"value": 276.4444351196289,
"docs": "Total time (ms) spent.",
"type": "gauge"
},
{
"name": "ironic_conductormanager_sync_local_state_call_count",
"labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "conductor"},
"value": 71.0,
"docs": "Sum of calls recorded.",
"type": "gauge"
},
{
"name": "ironic_conductormanager_power_failure_recovery_time",
"labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "conductor"},
"value": 166.60714149475098,
"docs": "Total time (ms) spent.",
"type": "gauge"
},
{
"name": "ironic_conductormanager_power_failure_recovery_call_count",
"labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "conductor"},
"value": 42.0,
"docs": "Sum of calls recorded.",
"type": "gauge"
},
{
"name": "ironic_conductormanager_clean_up_caches_time",
"labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "conductor"},
"value": 42.63949394226074,
"docs": "Total time (ms) spent.",
"type": "gauge"
},
{
"name": "ironic_conductormanager_clean_up_caches_call_count",
"labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "conductor"},
"value": 3.0,
"docs": "Sum of calls recorded.",
"type": "gauge"
}
]

View File

@ -0,0 +1,111 @@
{
"priority": "INFO",
"event_type": "ironic.metrics.update",
"timestamp": "2019-03-29 20:12:26.885347",
"publisher_id": "None.localhost.localdomain",
"payload": {
"instance_uuid": "ac2aa2fd-6e1a-41c8-a114-2084c8705228",
"node_uuid": "ac2aa2fd-6e1a-41c8-a114-2084c8705228",
"event_type": "hardware.redfish.metrics.update",
"timestamp": "2019-03-29T20:12:22.989020",
"message_id": "85d6b2c8-fe57-432d-868a-330e0e28cf34",
"hostname": "a-test-conductor",
"payload": {
"ironic.drivers.modules.agent_base.post_clean_step_hook": {
"count": 3,
"sum": 0.0050067901611328125,
"type": "timer"
},
"ironic.drivers.modules.agent_base.post_deploy_step_hook": {
"count": 2,
"sum": 0.010013580322265625,
"type": "timer"
},
"ironic.conductor.manager.ConductorManager._check_inspect_wait_timeouts": {
"count": 214,
"sum": 2007.3809623718262,
"type": "timer"
},
"ironic.conductor.manager.ConductorManager._check_deploy_timeouts": {
"count": 214,
"sum": 2200.153350830078,
"type": "timer"
},
"ironic.conductor.manager.ConductorManager._check_cleanwait_timeouts": {
"count": 214,
"sum": 2424.9556064605713,
"type": "timer"
},
"ironic.conductor.manager.ConductorManager._check_rescuewait_timeouts": {
"count": 214,
"sum": 1639.7652626037598,
"type": "timer"
},
"ironic.conductor.manager.ConductorManager._check_orphan_nodes": {
"count": 214,
"sum": 1608.001708984375,
"type": "timer"
},
"ironic.conductor.manager.ConductorManager._check_orphan_allocations": {
"count": 214,
"sum": 1797.1274852752686,
"type": "timer"
},
"ironic.drivers.modules.redfish.management.RedfishManagement._query_firmware_update_status": {
"count": 214,
"sum": 1598.8798141479492,
"type": "timer"
},
"ironic.drivers.modules.redfish.management.RedfishManagement._query_firmware_update_failed": {
"count": 214,
"sum": 1955.6934833526611,
"type": "timer"
},
"ironic.conductor.manager.do_sync_power_state": {
"count": 214,
"sum": 16819.172620773315,
"type": "timer"
},
"ConductorManager.FailingPowerSyncCount": {
"value": 0,
"type": "gauge"
},
"ironic.conductor.manager.ConductorManager._sync_power_states": {
"count": 214,
"sum": 23187.243700027466,
"type": "timer"
},
"ironic.drivers.modules.pxe_base.PXEBaseMixin._check_boot_timeouts": {
"count": 142,
"sum": 1078.2177448272705,
"type": "timer"
},
"ironic.conductor.manager.ConductorManager._sensors_nodes_task": {
"count": 142,
"sum": 10648.590803146362,
"type": "timer"
},
"ironic.conductor.manager.ConductorManager._send_sensor_data": {
"count": 142,
"sum": 15062.613010406494,
"type": "timer"
},
"ironic.conductor.manager.ConductorManager._sync_local_state": {
"count": 71,
"sum": 276.4444351196289,
"type": "timer"
},
"ironic.conductor.manager.ConductorManager._power_failure_recovery": {
"count": 42,
"sum": 166.60714149475098,
"type": "timer"
},
"ironic.conductor.manager.ConductorManager._clean_up_caches": {
"count": 3,
"sum": 42.63949394226074,
"type": "timer"
}
}
},
"message_id": "2c0da1e8-1958-484f-9bdd-9117d717f7fa"
}

View File

@ -0,0 +1,92 @@
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import json
import os
import unittest
from prometheus_client import CollectorRegistry
import ironic_prometheus_exporter
from ironic_prometheus_exporter.parsers import ironic
# Directory holding the recorded notification and the expected parser output.
_SAMPLES_DIR = os.path.join(
    os.path.dirname(ironic_prometheus_exporter.__file__),
    'tests', 'json_samples')

sample_file = os.path.join(_SAMPLES_DIR, 'notification-ironic.json')

expected_file = os.path.join(
    _SAMPLES_DIR, 'expected_ironic_parser_entries.json')

# Context managers close the fixture files once they have been parsed.
with open(sample_file) as _sample_fp:
    DATA = json.load(_sample_fp)

# Helper to dump the output upon major changes, since
# it is a lot of JSON.
DUMP_JSON = False

if not DUMP_JSON:
    with open(expected_file) as _expected_fp:
        EXPECTED = json.load(_expected_fp)
else:
    EXPECTED = None
class TestIronicPayloadParser(unittest.TestCase):
    """Compare the ironic metrics parser output with the recorded sample."""

    def setUp(self):
        # The parser is handed the notification payload, not the envelope.
        self.message = DATA['payload']

    def test_category_registry(self):
        """Each collected metric must match its expected JSON entry."""
        registry = CollectorRegistry()
        ironic.category_registry(self.message, registry)

        entry_count = 0
        for entry in registry.collect():
            # NOTE(TheJulia): We don't get the results back in any order
            # which makes sense.
            sample = entry.samples[0]
            entry_count += 1

            if DUMP_JSON:
                # NOTE(TheJulia): The lines below are just to help
                # regenerate the known data set.
                print(' {')
                print(f' \"name\": \"{sample.name}\",')
                print(' \"labels\": %s,' % json.dumps(sample.labels))
                print(f' \"value\": {sample.value},')
                print(f' \"docs\": \"{entry.documentation}\",')
                print(f' \"type\": \"{entry.type}\"')
                print(' },')
            else:
                # Look the entry up by name since access order is
                # unreliable, and fail with a readable message when the
                # metric is not part of the expected data set.
                expected_entry = next(
                    (item for item in EXPECTED
                     if item['name'] == sample.name),
                    None)
                self.assertIsNotNone(
                    expected_entry,
                    f'Metric {sample.name} is missing from {expected_file}')
                self.assertEqual(sample.name, expected_entry['name'])
                self.assertDictEqual(sample.labels,
                                     expected_entry['labels'])
                self.assertEqual(sample.value, expected_entry['value'])
                self.assertEqual(entry.documentation,
                                 expected_entry['docs'])
                self.assertEqual(entry.type, expected_entry['type'])

            # Metric names are expected to be lower case only.
            self.assertFalse(
                any(char.isupper() for char in sample.name),
                f'Metric name {sample.name} contains upper case characters')

        if not DUMP_JSON:
            self.assertEqual(len(EXPECTED), entry_count)

View File

@ -0,0 +1,8 @@
---
features:
- |
Adds the capability for the ``ironic-prometheus-exporter`` to parse
metrics data from the ``ironic`` and ``ironic-conductor`` processes
to enable greater operator insight into the inner workings and performance
of these services. This feature requires the ironic.conf file
``[metrics]backend`` value to be set to ``collector``.