Support extraction of ironic internal metrics
Adds support for parsing messages from the ironic code base which include metrics related to method call counts/timers, and ultimately items like "how many nodes were in this state". This is in order to provide operators with greater insight into the internal behavior of their ironic deployment.
Depends-On: https://review.opendev.org/c/openstack/ironic/+/865447
Change-Id: Ie4f3b2d3c7c7cbab1a0e03d8c7f0961d38a7d2c0
This commit is contained in:
parent
563c0f1d39
commit
55cd446b3c
@ -23,9 +23,10 @@ function install_ironic_prometheus_exporter {
|
|||||||
|
|
||||||
function configure_ironic_prometheus_exporter {
|
function configure_ironic_prometheus_exporter {
|
||||||
# Update ironic configuration file to use the exporter
|
# Update ironic configuration file to use the exporter
|
||||||
iniset $IRONIC_CONF_FILE conductor send_sensor_data true
|
iniset $IRONIC_CONF_FILE sensor_data send_sensor_data true
|
||||||
iniset $IRONIC_CONF_FILE conductor send_sensor_data_for_undeployed_nodes $COLLECT_DATA_UNDEPLOYED_NODES
|
iniset $IRONIC_CONF_FILE sensor_data enable_for_undeployed_nodes $COLLECT_DATA_UNDEPLOYED_NODES
|
||||||
iniset $IRONIC_CONF_FILE conductor send_sensor_data_interval 90
|
iniset $IRONIC_CONF_FILE sensor_data interval 90
|
||||||
|
iniset $IRONIC_CONF_FILE metrics backend collector
|
||||||
iniset $IRONIC_CONF_FILE oslo_messaging_notifications driver prometheus_exporter
|
iniset $IRONIC_CONF_FILE oslo_messaging_notifications driver prometheus_exporter
|
||||||
iniset $IRONIC_CONF_FILE oslo_messaging_notifications transport_url fake://
|
iniset $IRONIC_CONF_FILE oslo_messaging_notifications transport_url fake://
|
||||||
iniset $IRONIC_CONF_FILE oslo_messaging_notifications location $IRONIC_PROMETHEUS_EXPORTER_LOCATION
|
iniset $IRONIC_CONF_FILE oslo_messaging_notifications location $IRONIC_PROMETHEUS_EXPORTER_LOCATION
|
||||||
@ -66,7 +67,7 @@ function cleanup_ironic_prometheus_exporter {
|
|||||||
}
|
}
|
||||||
|
|
||||||
function wait_for_data {
|
function wait_for_data {
|
||||||
# Sleep for more than the [conductor]send_sensor_data_interval value
|
# Sleep for more than the [sensor_data]interval value
|
||||||
# to verify if we can get data from the baremetal
|
# to verify if we can get data from the baremetal
|
||||||
# FIXME(iurygregory): Add some logic to verify if the data already exists
|
# FIXME(iurygregory): Add some logic to verify if the data already exists
|
||||||
sleep 240
|
sleep 240
|
||||||
@ -84,6 +85,13 @@ function check_data {
|
|||||||
else
|
else
|
||||||
die $LINENO "Couldn't find $node_file in $IRONIC_PROMETHEUS_EXPORTER_LOCATION"
|
die $LINENO "Couldn't find $node_file in $IRONIC_PROMETHEUS_EXPORTER_LOCATION"
|
||||||
fi
|
fi
|
||||||
|
local stats_file="$(hostname)-ironic.metrics"
|
||||||
|
if [ -f "$IRONIC_PROMETHEUS_EXPORTER_LOCATION/$stats_file" ]; then
|
||||||
|
echo "#### Metrics data ####"
|
||||||
|
curl "http://$HOST_IP:$IRONIC_PROMETHEUS_EXPORTER_PORT/metrics"
|
||||||
|
else
|
||||||
|
die $LINENO "Could not find $stats_file in $IRONIC_PROMETHEUS_EXPORTER_LOCATION"
|
||||||
|
fi
|
||||||
}
|
}
|
||||||
|
|
||||||
echo_summary "ironic-prometheus-exporter devstack plugin.sh called: $1/$2"
|
echo_summary "ironic-prometheus-exporter devstack plugin.sh called: $1/$2"
|
||||||
|
@ -20,6 +20,7 @@ from prometheus_client import write_to_textfile
|
|||||||
|
|
||||||
from ironic_prometheus_exporter.parsers import header
|
from ironic_prometheus_exporter.parsers import header
|
||||||
from ironic_prometheus_exporter.parsers import ipmi
|
from ironic_prometheus_exporter.parsers import ipmi
|
||||||
|
from ironic_prometheus_exporter.parsers import ironic as ironic_parser
|
||||||
from ironic_prometheus_exporter.parsers import redfish
|
from ironic_prometheus_exporter.parsers import redfish
|
||||||
|
|
||||||
|
|
||||||
@ -48,25 +49,39 @@ class PrometheusFileDriver(notifier.Driver):
|
|||||||
def notify(self, ctxt, message, priority, retry):
|
def notify(self, ctxt, message, priority, retry):
|
||||||
try:
|
try:
|
||||||
registry = CollectorRegistry()
|
registry = CollectorRegistry()
|
||||||
|
|
||||||
event_type = message['event_type']
|
event_type = message['event_type']
|
||||||
node_message = message['payload']
|
payload = message['payload']
|
||||||
header.timestamp_registry(node_message, registry)
|
if event_type == 'ironic.metrics':
|
||||||
|
# We know this message payload is from a conductor itself
|
||||||
|
# and not for node drivers.
|
||||||
|
header.timestamp_conductor_registry(payload, registry)
|
||||||
|
ironic_parser.category_registry(payload, registry)
|
||||||
|
|
||||||
if event_type == 'hardware.ipmi.metrics':
|
else:
|
||||||
ipmi.category_registry(node_message, registry)
|
header.timestamp_registry(payload, registry)
|
||||||
|
if event_type == 'hardware.ipmi.metrics':
|
||||||
|
ipmi.category_registry(payload, registry)
|
||||||
|
|
||||||
elif event_type == 'hardware.redfish.metrics':
|
elif event_type == 'hardware.redfish.metrics':
|
||||||
redfish.category_registry(node_message, registry)
|
redfish.category_registry(payload, registry)
|
||||||
|
|
||||||
field = (node_message.get('node_name') or
|
# Order of preference is for a node Name, UUID, or
|
||||||
node_message.get('node_uuid'))
|
# payload hostname field to be used (i.e. for conductor
|
||||||
nodeFile = os.path.join(
|
# message payloads).
|
||||||
|
field = (
|
||||||
|
payload.get('node_name') or
|
||||||
|
payload.get('node_uuid') or
|
||||||
|
payload.get('hostname')
|
||||||
|
)
|
||||||
|
statFile = os.path.join(
|
||||||
self.location, field + '-' + event_type)
|
self.location, field + '-' + event_type)
|
||||||
write_to_textfile(nodeFile, registry)
|
|
||||||
|
# Writes to file for server pickup
|
||||||
|
write_to_textfile(statFile, registry)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
LOG.error(e)
|
LOG.error(e)
|
||||||
|
raise
|
||||||
|
|
||||||
|
|
||||||
class SimpleFileDriver(notifier.Driver):
|
class SimpleFileDriver(notifier.Driver):
|
||||||
|
@ -19,6 +19,7 @@ from ironic_prometheus_exporter import utils as ipe_utils
|
|||||||
|
|
||||||
|
|
||||||
def timestamp_registry(node_information, metric_registry):
|
def timestamp_registry(node_information, metric_registry):
|
||||||
|
"""Injects a last updated timestamp for a node."""
|
||||||
metric = 'baremetal_last_payload_timestamp_seconds'
|
metric = 'baremetal_last_payload_timestamp_seconds'
|
||||||
labels = {'node_uuid': node_information['node_uuid'],
|
labels = {'node_uuid': node_information['node_uuid'],
|
||||||
'instance_uuid': node_information['instance_uuid']}
|
'instance_uuid': node_information['instance_uuid']}
|
||||||
@ -37,3 +38,21 @@ def timestamp_registry(node_information, metric_registry):
|
|||||||
|
|
||||||
valid_labels = ipe_utils.update_instance_uuid(labels)
|
valid_labels = ipe_utils.update_instance_uuid(labels)
|
||||||
g.labels(**valid_labels).set(value)
|
g.labels(**valid_labels).set(value)
|
||||||
|
|
||||||
|
|
||||||
|
def timestamp_conductor_registry(payload, metric_registry):
    """Inject a last-updated timestamp gauge for a conductor.

    :param payload: Notification payload; its ``hostname`` and
        ``timestamp`` (formatted as ``%Y-%m-%dT%H:%M:%S.%f``) entries
        are read.
    :param metric_registry: Prometheus registry the gauge is registered in.
    :raises KeyError: if ``hostname`` or ``timestamp`` is missing from
        the payload.
    :raises ValueError: if ``timestamp`` does not match the expected format.
    """
    metric = 'conductor_service_last_payload_timestamp_seconds'
    labels = {'hostname': payload['hostname']}
    dt_1970 = datetime(1970, 1, 1, 0, 0, 0)
    dt_timestamp = datetime.strptime(payload['timestamp'],
                                     '%Y-%m-%dT%H:%M:%S.%f')
    # Whole seconds elapsed between 1970-01-01 and the payload timestamp.
    value = int((dt_timestamp - dt_1970).total_seconds())

    desc = descriptions.get_metric_description('header', metric)

    g = Gauge(
        metric, desc, labelnames=list(labels),
        registry=metric_registry)

    # Label values have to be supplied as keyword arguments. Handing the
    # dict over positionally would make it the value of the first label
    # instead of mapping 'hostname' to the conductor hostname.
    g.labels(**labels).set(value)
|
||||||
|
177
ironic_prometheus_exporter/parsers/ironic.py
Normal file
177
ironic_prometheus_exporter/parsers/ironic.py
Normal file
@ -0,0 +1,177 @@
|
|||||||
|
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||||
|
# not use this file except in compliance with the License. You may obtain
|
||||||
|
# a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||||
|
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||||
|
# License for the specific language governing permissions and limitations
|
||||||
|
# under the License.
|
||||||
|
|
||||||
|
import logging
|
||||||
|
|
||||||
|
from prometheus_client import Gauge
|
||||||
|
|
||||||
|
|
||||||
|
LOG = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def category_registry(message, metrics_registry):
    """Parse ironic metrics and submit them to Prometheus

    Each entry of the nested ``payload`` mapping is converted into gauges
    registered in ``metrics_registry``: a ``timer`` entry becomes a
    ``<name>_time`` and a ``<name>_call_count`` gauge, while ``gauge`` and
    ``counter`` entries become a single ``<name>`` gauge. Entries with any
    other ``type`` are ignored.

    :param message: Oslo notification message; its ``hostname`` and nested
        ``payload`` entries are read.
    :param metrics_registry: Prometheus registry
    """

    hostname = message.get('hostname')
    payload = message.get('payload')
    service = 'ironic'
    for key, value in payload.items():
        metric_type = value['type']
        driver = None
        labels = {'hostname': hostname,
                  'service': service}

        # NOTE(review): formatted_key is only assigned for keys starting
        # with 'ironic.api', 'ironic.drivers.modules' or 'ironic.conductor'.
        # Any other key reuses the value left over from the previous
        # iteration (or raises UnboundLocalError when it is the first key).
        # This is kept as-is because existing expected metric names depend
        # on it; confirm the intended naming before changing it.

        if key.startswith('ironic.api'):
            # This is only *really* to be expected in a combined single
            # process mode, or if someone is using the exporter coupled
            # with the API service itself.
            formatted_key = key.replace(
                'ironic.api.controllers.',
                'ironic_rest_api_')
            labels['component'] = 'api'

        if key.startswith('ironic.drivers.modules'):
            # Deconstruct driver entries/counters to be more sane and attach
            # labeling to them.

            # TODO(TheJulia): Once the minimum python version is 3.9, change
            # to str.removeprefix.
            formatted_key = key.replace(
                'ironic.drivers.modules.',
                'ironic.')

            # No break here on purpose: the last matching label wins.
            for driver_label in ['ipmi', 'redfish', 'agent', 'pxe',
                                 'ilo', 'drac', 'irmc', 'inspector',
                                 'ansible', 'ibmc', 'xclarity']:
                if driver_label in key:
                    # NOTE(TheJulia): WRT drac, technically this should be
                    # idrac, since Dell's driver name doesn't match the
                    # code classpath (drac).
                    driver = driver_label

            # To have the names of the metrics make sense, we need to handle
            # structural folder names in the file/driver structure, which
            # varies from driver to driver.
            for driver_dir in ['redfish', 'ipmi', 'network', 'storage',
                               'drac', 'ilo', 'irmc', 'intel_ipmi',
                               'ansible', 'ibmc', 'xclarity']:
                if driver_dir in formatted_key:
                    formatted_key = formatted_key.replace(
                        f'.{driver_dir}.', '.')
                    # Everything here should be one and done...
                    # Famous. Last. Words.
                    break

            # Now remove the filenames. This is extraneous ironic internal
            # structural information where the classes are housed, not the
            # actual methods or class names.
            for filename in ['boot', 'raid', 'power', 'bios', 'inspect',
                             'management', 'agent_base', 'agent_client',
                             'agent', 'deploy_utils', 'deploy', 'ipmitool',
                             'pxe_base', 'pxe', 'ramdisk', 'vendor_passthru',
                             'vendor']:
                if filename in formatted_key:
                    formatted_key = formatted_key.replace(
                        f'.{filename}.', '.')
                    break

            labels['component'] = 'driver'
            labels['driver'] = driver

        if key.startswith('ironic.conductor'):
            # Catches entries from:
            # - ironic.conductor.manager
            # - ironic.conductor.deployments
            # TODO(TheJulia): Once the minimum python version is 3.9, change
            # to str.removeprefix.
            labels['component'] = 'conductor'

            formatted_key = key.replace('ironic.conductor.manager.', 'ironic_')
            for filename in ['manager', 'deployments', 'allocations']:
                if filename in key:
                    formatted_key = key.replace(f'conductor.{filename}', '')
                    break

        # Prometheus does not use dot delimited data structures
        # so we need to rename it to be underscore delimited.
        # Downside of this is we end up with things like double
        # underscores from method names, but it should be still clear
        # where something is coming from.
        # i.e.
        # In: ironic.conductor.manager.ConductorManager.do_sync_power_state
        # Out: ironic_conductormanager_do_sync_power_state

        formatted_key = formatted_key.replace('.', '_')
        if '__' in formatted_key:
            # Remove entries introduced via private methods with metrics
            # decorators defined on them.
            formatted_key = formatted_key.replace('__', '_')
        # The class name (e.g. ConductorManager) is kept in the metric
        # name, only lowercased.
        formatted_key = formatted_key.lower()

        LOG.debug('Creating metric %s using %s.', key, formatted_key)

        # Always process timer first. The bulk of our Metrics in Ironic
        # are timer counters.
        if metric_type == 'timer':
            # NOTE(TheJulia): So this doesn't use the prometheus_client
            # histogram format as it requires the existence of sample buckets
            # inside of its data structure or the entry of individual
            # instances into the running history.
            # Instead, we will return two gauges, a sum and a count.
            # Hopefully this will be useful. The reason it is not just
            # two counter values, is each counter value in prometheus_client
            # gets a _created child sample, which creates a lot of confusion.
            metric = Gauge(formatted_key + '_time', 'Total time (ms) spent.',
                           labelnames=list(labels),
                           registry=metrics_registry)
            metric.labels(**labels).set(value['sum'])
            metric = Gauge(formatted_key + '_call_count',
                           'Sum of calls recorded.',
                           labelnames=list(labels),
                           registry=metrics_registry)
            metric.labels(**labels).set(value['count'])
            LOG.debug('Details of the metric %s with labels %s, '
                      'sum: %s, count: %s', formatted_key, labels,
                      value['sum'], value['count'])

        elif metric_type == 'gauge':
            metric = Gauge(formatted_key, 'Point in time count of data point.',
                           labelnames=list(labels),
                           registry=metrics_registry)
            metric.labels(**labels).set(value['value'])
            LOG.debug('Details of the metric %s with labels %s, value: %s',
                      formatted_key, labels, value['value'])

        elif metric_type == 'counter':
            # NOTE(TheJulia): We use a gauge instead of a counter because
            # the prometheus client library automatically renames our value
            # by adding _total to it, and adds a _created child sample value
            # which is just the time. Unfortunately the latter is just noise.
            metric = Gauge(formatted_key,
                           'Counter representing the method or data point.',
                           labelnames=list(labels),
                           registry=metrics_registry)
            # Prometheus_client doesn't directly expose a counter method
            # to set a counter value directly.
            metric.labels(**labels).set(value['count'])
            LOG.debug('Details of the metric %s with labels %s, value: %s',
                      formatted_key, labels, value['count'])
|
@ -0,0 +1,261 @@
|
|||||||
|
[
|
||||||
|
{
|
||||||
|
"name": "ironic_post_clean_step_hook_time",
|
||||||
|
"labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "driver", "driver": "agent"},
|
||||||
|
"value": 0.0050067901611328125,
|
||||||
|
"docs": "Total time (ms) spent.",
|
||||||
|
"type": "gauge"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "ironic_post_clean_step_hook_call_count",
|
||||||
|
"labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "driver", "driver": "agent"},
|
||||||
|
"value": 3.0,
|
||||||
|
"docs": "Sum of calls recorded.",
|
||||||
|
"type": "gauge"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "ironic_post_deploy_step_hook_time",
|
||||||
|
"labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "driver", "driver": "agent"},
|
||||||
|
"value": 0.010013580322265625,
|
||||||
|
"docs": "Total time (ms) spent.",
|
||||||
|
"type": "gauge"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "ironic_post_deploy_step_hook_call_count",
|
||||||
|
"labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "driver", "driver": "agent"},
|
||||||
|
"value": 2.0,
|
||||||
|
"docs": "Sum of calls recorded.",
|
||||||
|
"type": "gauge"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "ironic_conductormanager_check_inspect_wait_timeouts_time",
|
||||||
|
"labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "conductor"},
|
||||||
|
"value": 2007.3809623718262,
|
||||||
|
"docs": "Total time (ms) spent.",
|
||||||
|
"type": "gauge"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "ironic_conductormanager_check_inspect_wait_timeouts_call_count",
|
||||||
|
"labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "conductor"},
|
||||||
|
"value": 214.0,
|
||||||
|
"docs": "Sum of calls recorded.",
|
||||||
|
"type": "gauge"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "ironic_conductormanager_check_deploy_timeouts_time",
|
||||||
|
"labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "conductor"},
|
||||||
|
"value": 2200.153350830078,
|
||||||
|
"docs": "Total time (ms) spent.",
|
||||||
|
"type": "gauge"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "ironic_conductormanager_check_deploy_timeouts_call_count",
|
||||||
|
"labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "conductor"},
|
||||||
|
"value": 214.0,
|
||||||
|
"docs": "Sum of calls recorded.",
|
||||||
|
"type": "gauge"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "ironic_conductormanager_check_cleanwait_timeouts_time",
|
||||||
|
"labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "conductor"},
|
||||||
|
"value": 2424.9556064605713,
|
||||||
|
"docs": "Total time (ms) spent.",
|
||||||
|
"type": "gauge"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "ironic_conductormanager_check_cleanwait_timeouts_call_count",
|
||||||
|
"labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "conductor"},
|
||||||
|
"value": 214.0,
|
||||||
|
"docs": "Sum of calls recorded.",
|
||||||
|
"type": "gauge"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "ironic_conductormanager_check_rescuewait_timeouts_time",
|
||||||
|
"labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "conductor"},
|
||||||
|
"value": 1639.7652626037598,
|
||||||
|
"docs": "Total time (ms) spent.",
|
||||||
|
"type": "gauge"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "ironic_conductormanager_check_rescuewait_timeouts_call_count",
|
||||||
|
"labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "conductor"},
|
||||||
|
"value": 214.0,
|
||||||
|
"docs": "Sum of calls recorded.",
|
||||||
|
"type": "gauge"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "ironic_conductormanager_check_orphan_nodes_time",
|
||||||
|
"labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "conductor"},
|
||||||
|
"value": 1608.001708984375,
|
||||||
|
"docs": "Total time (ms) spent.",
|
||||||
|
"type": "gauge"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "ironic_conductormanager_check_orphan_nodes_call_count",
|
||||||
|
"labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "conductor"},
|
||||||
|
"value": 214.0,
|
||||||
|
"docs": "Sum of calls recorded.",
|
||||||
|
"type": "gauge"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "ironic_conductormanager_check_orphan_allocations_time",
|
||||||
|
"labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "conductor"},
|
||||||
|
"value": 1797.1274852752686,
|
||||||
|
"docs": "Total time (ms) spent.",
|
||||||
|
"type": "gauge"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "ironic_conductormanager_check_orphan_allocations_call_count",
|
||||||
|
"labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "conductor"},
|
||||||
|
"value": 214.0,
|
||||||
|
"docs": "Sum of calls recorded.",
|
||||||
|
"type": "gauge"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "ironic_redfishmanagement_query_firmware_update_status_time",
|
||||||
|
"labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "driver", "driver": "redfish"},
|
||||||
|
"value": 1598.8798141479492,
|
||||||
|
"docs": "Total time (ms) spent.",
|
||||||
|
"type": "gauge"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "ironic_redfishmanagement_query_firmware_update_status_call_count",
|
||||||
|
"labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "driver", "driver": "redfish"},
|
||||||
|
"value": 214.0,
|
||||||
|
"docs": "Sum of calls recorded.",
|
||||||
|
"type": "gauge"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "ironic_redfishmanagement_query_firmware_update_failed_time",
|
||||||
|
"labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "driver", "driver": "redfish"},
|
||||||
|
"value": 1955.6934833526611,
|
||||||
|
"docs": "Total time (ms) spent.",
|
||||||
|
"type": "gauge"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "ironic_redfishmanagement_query_firmware_update_failed_call_count",
|
||||||
|
"labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "driver", "driver": "redfish"},
|
||||||
|
"value": 214.0,
|
||||||
|
"docs": "Sum of calls recorded.",
|
||||||
|
"type": "gauge"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "ironic_do_sync_power_state_time",
|
||||||
|
"labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "conductor"},
|
||||||
|
"value": 16819.172620773315,
|
||||||
|
"docs": "Total time (ms) spent.",
|
||||||
|
"type": "gauge"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "ironic_do_sync_power_state_call_count",
|
||||||
|
"labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "conductor"},
|
||||||
|
"value": 214.0,
|
||||||
|
"docs": "Sum of calls recorded.",
|
||||||
|
"type": "gauge"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "ironic_do_sync_power_state",
|
||||||
|
"labels": {"hostname": "a-test-conductor", "service": "ironic"},
|
||||||
|
"value": 0.0,
|
||||||
|
"docs": "Point in time count of data point.",
|
||||||
|
"type": "gauge"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "ironic_conductormanager_sync_power_states_time",
|
||||||
|
"labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "conductor"},
|
||||||
|
"value": 23187.243700027466,
|
||||||
|
"docs": "Total time (ms) spent.",
|
||||||
|
"type": "gauge"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "ironic_conductormanager_sync_power_states_call_count",
|
||||||
|
"labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "conductor"},
|
||||||
|
"value": 214.0,
|
||||||
|
"docs": "Sum of calls recorded.",
|
||||||
|
"type": "gauge"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "ironic_pxe_base_pxebasemixin_check_boot_timeouts_time",
|
||||||
|
"labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "driver", "driver": "pxe"},
|
||||||
|
"value": 1078.2177448272705,
|
||||||
|
"docs": "Total time (ms) spent.",
|
||||||
|
"type": "gauge"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "ironic_pxe_base_pxebasemixin_check_boot_timeouts_call_count",
|
||||||
|
"labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "driver", "driver": "pxe"},
|
||||||
|
"value": 142.0,
|
||||||
|
"docs": "Sum of calls recorded.",
|
||||||
|
"type": "gauge"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "ironic_conductormanager_sensors_nodes_task_time",
|
||||||
|
"labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "conductor"},
|
||||||
|
"value": 10648.590803146362,
|
||||||
|
"docs": "Total time (ms) spent.",
|
||||||
|
"type": "gauge"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "ironic_conductormanager_sensors_nodes_task_call_count",
|
||||||
|
"labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "conductor"},
|
||||||
|
"value": 142.0,
|
||||||
|
"docs": "Sum of calls recorded.",
|
||||||
|
"type": "gauge"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "ironic_conductormanager_send_sensor_data_time",
|
||||||
|
"labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "conductor"},
|
||||||
|
"value": 15062.613010406494,
|
||||||
|
"docs": "Total time (ms) spent.",
|
||||||
|
"type": "gauge"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "ironic_conductormanager_send_sensor_data_call_count",
|
||||||
|
"labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "conductor"},
|
||||||
|
"value": 142.0,
|
||||||
|
"docs": "Sum of calls recorded.",
|
||||||
|
"type": "gauge"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "ironic_conductormanager_sync_local_state_time",
|
||||||
|
"labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "conductor"},
|
||||||
|
"value": 276.4444351196289,
|
||||||
|
"docs": "Total time (ms) spent.",
|
||||||
|
"type": "gauge"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "ironic_conductormanager_sync_local_state_call_count",
|
||||||
|
"labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "conductor"},
|
||||||
|
"value": 71.0,
|
||||||
|
"docs": "Sum of calls recorded.",
|
||||||
|
"type": "gauge"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "ironic_conductormanager_power_failure_recovery_time",
|
||||||
|
"labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "conductor"},
|
||||||
|
"value": 166.60714149475098,
|
||||||
|
"docs": "Total time (ms) spent.",
|
||||||
|
"type": "gauge"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "ironic_conductormanager_power_failure_recovery_call_count",
|
||||||
|
"labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "conductor"},
|
||||||
|
"value": 42.0,
|
||||||
|
"docs": "Sum of calls recorded.",
|
||||||
|
"type": "gauge"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "ironic_conductormanager_clean_up_caches_time",
|
||||||
|
"labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "conductor"},
|
||||||
|
"value": 42.63949394226074,
|
||||||
|
"docs": "Total time (ms) spent.",
|
||||||
|
"type": "gauge"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "ironic_conductormanager_clean_up_caches_call_count",
|
||||||
|
"labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "conductor"},
|
||||||
|
"value": 3.0,
|
||||||
|
"docs": "Sum of calls recorded.",
|
||||||
|
"type": "gauge"
|
||||||
|
}
|
||||||
|
]
|
@ -0,0 +1,111 @@
|
|||||||
|
{
|
||||||
|
"priority": "INFO",
|
||||||
|
"event_type": "ironic.metrics.update",
|
||||||
|
"timestamp": "2019-03-29 20:12:26.885347",
|
||||||
|
"publisher_id": "None.localhost.localdomain",
|
||||||
|
"payload": {
|
||||||
|
"instance_uuid": "ac2aa2fd-6e1a-41c8-a114-2084c8705228",
|
||||||
|
"node_uuid": "ac2aa2fd-6e1a-41c8-a114-2084c8705228",
|
||||||
|
"event_type": "hardware.redfish.metrics.update",
|
||||||
|
"timestamp": "2019-03-29T20:12:22.989020",
|
||||||
|
"message_id": "85d6b2c8-fe57-432d-868a-330e0e28cf34",
|
||||||
|
"hostname": "a-test-conductor",
|
||||||
|
"payload": {
|
||||||
|
"ironic.drivers.modules.agent_base.post_clean_step_hook": {
|
||||||
|
"count": 3,
|
||||||
|
"sum": 0.0050067901611328125,
|
||||||
|
"type": "timer"
|
||||||
|
},
|
||||||
|
"ironic.drivers.modules.agent_base.post_deploy_step_hook": {
|
||||||
|
"count": 2,
|
||||||
|
"sum": 0.010013580322265625,
|
||||||
|
"type": "timer"
|
||||||
|
},
|
||||||
|
"ironic.conductor.manager.ConductorManager._check_inspect_wait_timeouts": {
|
||||||
|
"count": 214,
|
||||||
|
"sum": 2007.3809623718262,
|
||||||
|
"type": "timer"
|
||||||
|
},
|
||||||
|
"ironic.conductor.manager.ConductorManager._check_deploy_timeouts": {
|
||||||
|
"count": 214,
|
||||||
|
"sum": 2200.153350830078,
|
||||||
|
"type": "timer"
|
||||||
|
},
|
||||||
|
"ironic.conductor.manager.ConductorManager._check_cleanwait_timeouts": {
|
||||||
|
"count": 214,
|
||||||
|
"sum": 2424.9556064605713,
|
||||||
|
"type": "timer"
|
||||||
|
},
|
||||||
|
"ironic.conductor.manager.ConductorManager._check_rescuewait_timeouts": {
|
||||||
|
"count": 214,
|
||||||
|
"sum": 1639.7652626037598,
|
||||||
|
"type": "timer"
|
||||||
|
},
|
||||||
|
"ironic.conductor.manager.ConductorManager._check_orphan_nodes": {
|
||||||
|
"count": 214,
|
||||||
|
"sum": 1608.001708984375,
|
||||||
|
"type": "timer"
|
||||||
|
},
|
||||||
|
"ironic.conductor.manager.ConductorManager._check_orphan_allocations": {
|
||||||
|
"count": 214,
|
||||||
|
"sum": 1797.1274852752686,
|
||||||
|
"type": "timer"
|
||||||
|
},
|
||||||
|
"ironic.drivers.modules.redfish.management.RedfishManagement._query_firmware_update_status": {
|
||||||
|
"count": 214,
|
||||||
|
"sum": 1598.8798141479492,
|
||||||
|
"type": "timer"
|
||||||
|
},
|
||||||
|
"ironic.drivers.modules.redfish.management.RedfishManagement._query_firmware_update_failed": {
|
||||||
|
"count": 214,
|
||||||
|
"sum": 1955.6934833526611,
|
||||||
|
"type": "timer"
|
||||||
|
},
|
||||||
|
"ironic.conductor.manager.do_sync_power_state": {
|
||||||
|
"count": 214,
|
||||||
|
"sum": 16819.172620773315,
|
||||||
|
"type": "timer"
|
||||||
|
},
|
||||||
|
"ConductorManager.FailingPowerSyncCount": {
|
||||||
|
"value": 0,
|
||||||
|
"type": "gauge"
|
||||||
|
},
|
||||||
|
"ironic.conductor.manager.ConductorManager._sync_power_states": {
|
||||||
|
"count": 214,
|
||||||
|
"sum": 23187.243700027466,
|
||||||
|
"type": "timer"
|
||||||
|
},
|
||||||
|
"ironic.drivers.modules.pxe_base.PXEBaseMixin._check_boot_timeouts": {
|
||||||
|
"count": 142,
|
||||||
|
"sum": 1078.2177448272705,
|
||||||
|
"type": "timer"
|
||||||
|
},
|
||||||
|
"ironic.conductor.manager.ConductorManager._sensors_nodes_task": {
|
||||||
|
"count": 142,
|
||||||
|
"sum": 10648.590803146362,
|
||||||
|
"type": "timer"
|
||||||
|
},
|
||||||
|
"ironic.conductor.manager.ConductorManager._send_sensor_data": {
|
||||||
|
"count": 142,
|
||||||
|
"sum": 15062.613010406494,
|
||||||
|
"type": "timer"
|
||||||
|
},
|
||||||
|
"ironic.conductor.manager.ConductorManager._sync_local_state": {
|
||||||
|
"count": 71,
|
||||||
|
"sum": 276.4444351196289,
|
||||||
|
"type": "timer"
|
||||||
|
},
|
||||||
|
"ironic.conductor.manager.ConductorManager._power_failure_recovery": {
|
||||||
|
"count": 42,
|
||||||
|
"sum": 166.60714149475098,
|
||||||
|
"type": "timer"
|
||||||
|
},
|
||||||
|
"ironic.conductor.manager.ConductorManager._clean_up_caches": {
|
||||||
|
"count": 3,
|
||||||
|
"sum": 42.63949394226074,
|
||||||
|
"type": "timer"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"message_id": "2c0da1e8-1958-484f-9bdd-9117d717f7fa"
|
||||||
|
}
|
92
ironic_prometheus_exporter/tests/test_ironic_parser.py
Normal file
92
ironic_prometheus_exporter/tests/test_ironic_parser.py
Normal file
@ -0,0 +1,92 @@
|
|||||||
|
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||||
|
# not use this file except in compliance with the License. You may obtain
|
||||||
|
# a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||||
|
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||||
|
# License for the specific language governing permissions and limitations
|
||||||
|
# under the License.
|
||||||
|
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import unittest
|
||||||
|
|
||||||
|
from prometheus_client import CollectorRegistry
|
||||||
|
|
||||||
|
import ironic_prometheus_exporter
|
||||||
|
from ironic_prometheus_exporter.parsers import ironic
|
||||||
|
|
||||||
|
|
||||||
|
sample_file = os.path.join(
    os.path.dirname(ironic_prometheus_exporter.__file__),
    'tests', 'json_samples', 'notification-ironic.json')
expected_file = os.path.join(
    os.path.dirname(ironic_prometheus_exporter.__file__),
    'tests', 'json_samples',
    './expected_ironic_parser_entries.json')


def _load_json(path):
    """Return the parsed JSON document stored at ``path``.

    The file is opened in a context manager so the handle is closed as
    soon as parsing finishes, rather than being left for the garbage
    collector (which triggers ResourceWarning under the test runner).
    """
    with open(path, encoding='utf-8') as handle:
        return json.load(handle)


# Sample notification message, loaded once at import time.
DATA = _load_json(sample_file)

# Helper to dump the output upon major changes, since
# it is a lot of JSON.
DUMP_JSON = False

# The expected entries are only needed when verifying; in dump mode the
# test prints the collected metrics instead of comparing them.
if not DUMP_JSON:
    EXPECTED = _load_json(expected_file)
else:
    EXPECTED = None
|
||||||
|
|
||||||
|
|
||||||
|
class TestIronicPayloadParser(unittest.TestCase):
    """Check the ironic metrics parser against a recorded notification."""

    def setUp(self):
        # Only the payload of the sample notification is handed to the parser.
        self.message = DATA['payload']

    def test_category_registry(self):
        """Compare each collected metric with its expected fixture entry.

        When ``DUMP_JSON`` is enabled nothing is compared; the collected
        metrics are printed instead so the expected-entries fixture can be
        regenerated.
        """
        registry = CollectorRegistry()

        ironic.category_registry(self.message, registry)
        entry_count = 0
        for entry in registry.collect():
            # NOTE(TheJulia): The registry does not hand results back in
            # any particular order, so entries are matched by name below.
            sample = entry.samples[0]
            name = sample.name
            labels = sample.labels
            value = sample.value
            documentation = entry.documentation
            entry_type = entry.type

            # NOTE(TheJulia): The lines below are just to help regenerate
            # the known data set, but we don't get a reliable access order
            # from the prometheus client registry collection object.
            if DUMP_JSON:
                print(' {')
                print(f' \"name\": \"{sample.name}\",')
                print(' \"labels\": %s,' % json.dumps(sample.labels))
                print(f' \"value\": {sample.value},')
                print(f' \"docs\": \"{entry.documentation}\",')
                print(f' \"type\": \"{entry.type}\"')
                print(' },')
            else:
                # Find the first expected entry with the same name; access
                # order is unreliable, and comparing the whole data set
                # back and forth is otherwise not really feasible.
                expected_entry = next(
                    (candidate for candidate in EXPECTED
                     if candidate['name'] == name),
                    None)
                # Fail with a clear message rather than comparing against
                # an unrelated entry when the metric is not in the fixture.
                self.assertIsNotNone(
                    expected_entry,
                    'No expected entry found for metric %s' % name)
                self.assertEqual(name, expected_entry['name'])
                self.assertDictEqual(labels, expected_entry['labels'])
                self.assertEqual(value, expected_entry['value'])
                self.assertEqual(documentation, expected_entry['docs'])
                self.assertEqual(entry_type, expected_entry['type'])
                # A unittest assertion is used instead of a bare ``assert``
                # so the check still runs under ``python -O``.
                self.assertFalse(
                    any(char.isupper() for char in name),
                    'Metric name %s contains upper case characters' % name)
            entry_count = entry_count + 1
        if not DUMP_JSON:
            # Every expected entry must have been produced exactly once.
            self.assertEqual(len(EXPECTED), entry_count)
|
@ -0,0 +1,8 @@
|
|||||||
|
---
|
||||||
|
features:
|
||||||
|
- |
|
||||||
|
Adds the capability for the ``ironic-prometheus-exporter`` to parse
|
||||||
|
metrics data from the ``ironic`` and ``ironic-conductor`` processes
|
||||||
|
to enable greater operator insight into the inner workings and performance
|
||||||
|
of these services. This feature requires the ``[metrics]backend``
|
||||||
|
option in ``ironic.conf`` to be set to ``collector``.
|
Loading…
Reference in New Issue
Block a user