diff --git a/devstack/plugin.sh b/devstack/plugin.sh index cba1606..bcd3deb 100644 --- a/devstack/plugin.sh +++ b/devstack/plugin.sh @@ -23,9 +23,10 @@ function install_ironic_prometheus_exporter { function configure_ironic_prometheus_exporter { # Update ironic configuration file to use the exporter - iniset $IRONIC_CONF_FILE conductor send_sensor_data true - iniset $IRONIC_CONF_FILE conductor send_sensor_data_for_undeployed_nodes $COLLECT_DATA_UNDEPLOYED_NODES - iniset $IRONIC_CONF_FILE conductor send_sensor_data_interval 90 + iniset $IRONIC_CONF_FILE sensor_data send_sensor_data true + iniset $IRONIC_CONF_FILE sensor_data enable_for_undeployed_nodes $COLLECT_DATA_UNDEPLOYED_NODES + iniset $IRONIC_CONF_FILE sensor_data interval 90 + iniset $IRONIC_CONF_FILE metrics backend collector iniset $IRONIC_CONF_FILE oslo_messaging_notifications driver prometheus_exporter iniset $IRONIC_CONF_FILE oslo_messaging_notifications transport_url fake:// iniset $IRONIC_CONF_FILE oslo_messaging_notifications location $IRONIC_PROMETHEUS_EXPORTER_LOCATION @@ -66,7 +67,7 @@ function cleanup_ironic_prometheus_exporter { } function wait_for_data { - # Sleep for more than the [conductor]send_sensor_data_interval value + # Sleep for more than the [sensor_data]interval value # to verify if we can get data from the baremetal # FIXME(iurygregory): Add some logic to verify if the data already exists sleep 240 @@ -84,6 +85,13 @@ function check_data { else die $LINENO "Couldn't find $node_file in $IRONIC_PROMETHEUS_EXPORTER_LOCATION" fi + local stats_file="$(hostname)-ironic.metrics" + if [ -f "$IRONIC_PROMETHEUS_EXPORTER_LOCATION/$stats_file" ]; then + echo "#### Metrics data ####" + curl "http://$HOST_IP:$IRONIC_PROMETHEUS_EXPORTER_PORT/metrics" + else + die $LINENO "Could not find $stats_file in $IRONIC_PROMETHEUS_EXPORTER_LOCATION" + fi } echo_summary "ironic-prometheus-exporter devstack plugin.sh called: $1/$2" diff --git a/ironic_prometheus_exporter/messaging.py 
b/ironic_prometheus_exporter/messaging.py index bd1f59f..07c30f1 100644 --- a/ironic_prometheus_exporter/messaging.py +++ b/ironic_prometheus_exporter/messaging.py @@ -20,6 +20,7 @@ from prometheus_client import write_to_textfile from ironic_prometheus_exporter.parsers import header from ironic_prometheus_exporter.parsers import ipmi +from ironic_prometheus_exporter.parsers import ironic as ironic_parser from ironic_prometheus_exporter.parsers import redfish @@ -48,25 +49,39 @@ class PrometheusFileDriver(notifier.Driver): def notify(self, ctxt, message, priority, retry): try: registry = CollectorRegistry() - event_type = message['event_type'] - node_message = message['payload'] - header.timestamp_registry(node_message, registry) + payload = message['payload'] + if event_type == 'ironic.metrics': + # We know this message payload is from a conductor itself + # and not for node drivers. + header.timestamp_conductor_registry(payload, registry) + ironic_parser.category_registry(payload, registry) - if event_type == 'hardware.ipmi.metrics': - ipmi.category_registry(node_message, registry) + else: + header.timestamp_registry(payload, registry) + if event_type == 'hardware.ipmi.metrics': + ipmi.category_registry(payload, registry) - elif event_type == 'hardware.redfish.metrics': - redfish.category_registry(node_message, registry) + elif event_type == 'hardware.redfish.metrics': + redfish.category_registry(payload, registry) - field = (node_message.get('node_name') or - node_message.get('node_uuid')) - nodeFile = os.path.join( + # Order of preference is for a node Name, UUID, or + # payload hostname field to be used (i.e. for conductor + # message payloads). 
def timestamp_conductor_registry(payload, metric_registry):
    """Injects a last updated at timestamp for a conductor.

    :param payload: Notification payload. Its ``hostname`` key becomes the
        gauge label and its ``timestamp`` key, formatted as
        ``%Y-%m-%dT%H:%M:%S.%f``, supplies the value.
    :param metric_registry: Prometheus registry the gauge is registered in.
    :raises KeyError: if ``hostname`` or ``timestamp`` is missing.
    :raises ValueError: if ``timestamp`` does not match the expected format.
    """
    metric = 'conductor_service_last_payload_timestamp_seconds'
    labels = {'hostname': payload['hostname']}
    dt_1970 = datetime(1970, 1, 1, 0, 0, 0)
    dt_timestamp = datetime.strptime(payload['timestamp'],
                                     '%Y-%m-%dT%H:%M:%S.%f')
    # Whole seconds between 1970-01-01 and the payload timestamp.
    value = int((dt_timestamp - dt_1970).total_seconds())

    desc = descriptions.get_metric_description('header', metric)

    g = Gauge(
        metric, desc, labelnames=list(labels),
        registry=metric_registry)

    # The label dict has to be expanded into keyword arguments. Passing the
    # dict itself positionally makes it the *value* of the single label, so
    # the exported hostname would be the string form of the whole dict.
    g.labels(**labels).set(value)
def _publish_gauge(name, documentation, labels, value, metrics_registry):
    """Register gauge *name* in *metrics_registry* and set it to *value*."""
    gauge = Gauge(name, documentation,
                  labelnames=list(labels.keys()),
                  registry=metrics_registry)
    gauge.labels(**labels).set(value)


def category_registry(message, metrics_registry):
    """Parse ironic metrics and submit them to Prometheus

    :param message: Oslo notification message. Its ``hostname`` and
        ``payload`` keys are read; ``payload`` maps a dotted metric name to
        a dict holding ``type`` plus ``sum``/``count`` (timer), ``value``
        (gauge) or ``count`` (counter).
    :param metrics_registry: Prometheus registry
    """
    # Lookup tables are constant, so build them once instead of per metric.
    driver_labels = ('ipmi', 'redfish', 'agent', 'pxe',
                     'ilo', 'drac', 'irmc', 'inspector', 'ansible',
                     'ibmc', 'xclarity')
    driver_dirs = ('redfish', 'ipmi', 'network', 'storage', 'drac',
                   'ilo', 'irmc', 'intel_ipmi', 'ansible', 'ibmc',
                   'xclarity')
    driver_filenames = ('boot', 'raid', 'power', 'bios', 'inspect',
                        'management', 'agent_base', 'agent_client',
                        'agent', 'deploy_utils', 'deploy', 'ipmitool',
                        'pxe_base', 'pxe', 'ramdisk', 'vendor_passthru',
                        'vendor')
    conductor_filenames = ('manager', 'deployments', 'allocations')

    hostname = message.get('hostname')
    payload = message.get('payload')
    service = 'ironic'
    for key, value in payload.items():
        metric_type = value['type']
        driver = None
        labels = {'hostname': hostname,
                  'service': service}

        if key.startswith('ironic.api'):
            # This is only *really* to be expected in a combined single
            # process mode, or if someone is using the exporter coupled
            # with the API service itself.
            formatted_key = key.replace(
                'ironic.api.controllers.',
                'ironic_rest_api_')
            labels['component'] = 'api'

        if key.startswith('ironic.drivers.modules'):
            # Deconstruct driver entries/counters to be more sane and attach
            # labeling to them.

            # TODO(TheJulia): Once the minimum python version is 3.9, change
            # to str.removeprefix.
            formatted_key = key.replace(
                'ironic.drivers.modules.',
                'ironic.')

            # No break on purpose: the last matching label in the list wins.
            for driver_label in driver_labels:
                if driver_label in key:
                    # NOTE(TheJulia): Dell's driver name (idrac) doesn't
                    # match the code classpath (drac), so technically this
                    # should be idrac.
                    driver = driver_label

            # To have the names of the metrics make sense, we need to handle
            # structural folder names in the file/driver structure, which
            # varies from driver to driver.
            for driver_dir in driver_dirs:
                if driver_dir in formatted_key:
                    formatted_key = formatted_key.replace(
                        f'.{driver_dir}.', '.')
                    # Everything here should be one and done...
                    # Famous. Last. Words.
                    break

            # Now remove the filenames. This is extraneous ironic internal
            # structural information where the classes are housed, not the
            # actual methods or class names.
            for filename in driver_filenames:
                if filename in formatted_key:
                    formatted_key = formatted_key.replace(f'.{filename}.', '.')
                    break

            labels['component'] = 'driver'
            labels['driver'] = driver

        if key.startswith('ironic.conductor'):
            # Catches entries from:
            # - ironic.conductor.manager
            # - ironic.conductor.deployments
            # TODO(TheJulia): Once the minimum python version is 3.9, change
            # to str.removeprefix.
            labels['component'] = 'conductor'

            formatted_key = key.replace('ironic.conductor.manager.', 'ironic_')
            for filename in conductor_filenames:
                if filename in key:
                    formatted_key = key.replace(f'conductor.{filename}', '')
                    break

        # NOTE(review): a key matching none of the prefixes above leaves
        # formatted_key as it was after the previous iteration (or unbound
        # on the very first one), so that metric is exported under the
        # previous key's name. The bundled expected-output fixture currently
        # encodes this, so the behaviour is left untouched here - confirm
        # whether such keys should get a name of their own.

        # Prometheus does not use dot delimited data structures
        # so we need to rename it to be underscore delimited.
        # Downside of this is we end up with things like double
        # underscores from method names, but it should be still clear
        # where something is coming from.
        # i.e.
        # In: ironic.conductor.manager.ConductorManager.do_sync_power_state
        # Out: ironic_conductormanager_do_sync_power_state

        formatted_key = formatted_key.replace('.', '_')
        if '__' in formatted_key:
            # Remove entries introduced via private methods with metrics
            # decorators defined on them.
            formatted_key = formatted_key.replace('__', '_')
        formatted_key = formatted_key.lower()

        LOG.debug('Creating metric %s using %s.', key, formatted_key)

        # Always process timer first. The bulk of our Metrics in Ironic
        # are timer counters.
        if metric_type == 'timer':
            # NOTE(TheJulia): So this doesn't use the prometheus_client
            # histogram format as it requires the existence of sample buckets
            # inside of its data structure or the entry of individual
            # instances into the running history.
            # Instead, we will return two gauges, a sum and a count.
            # The reason it is not just two counter values, is each counter
            # value in prometheus_client gets a _created child sample, which
            # creates a lot of confusion.
            _publish_gauge(formatted_key + '_time', 'Total time (ms) spent.',
                           labels, value['sum'], metrics_registry)
            _publish_gauge(formatted_key + '_call_count',
                           'Sum of calls recorded.',
                           labels, value['count'], metrics_registry)
            LOG.debug('Details of the metric %s with labels %s, '
                      'sum: %s, count: %s', formatted_key, labels,
                      value['sum'], value['count'])

        elif metric_type == 'gauge':
            _publish_gauge(formatted_key,
                           'Point in time count of data point.',
                           labels, value['value'], metrics_registry)
            LOG.debug('Details of the metric %s with labels %s, value: %s',
                      formatted_key, labels, value['value'])

        elif metric_type == 'counter':
            # NOTE(TheJulia): We use a gauge instead of a counter because
            # the prometheus client library automatically renames our value
            # by adding _total to it, and adds a _created child sample value
            # which is just the time. Unfortunately the latter is just noise.
            # prometheus_client also doesn't expose a way to set a counter
            # value directly.
            _publish_gauge(formatted_key,
                           'Counter representing the method or data point.',
                           labels, value['count'], metrics_registry)
            LOG.debug('Details of the metric %s with labels %s, value: %s',
                      formatted_key, labels, value['count'])
"name": "ironic_conductormanager_check_deploy_timeouts_time", + "labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "conductor"}, + "value": 2200.153350830078, + "docs": "Total time (ms) spent.", + "type": "gauge" + }, + { + "name": "ironic_conductormanager_check_deploy_timeouts_call_count", + "labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "conductor"}, + "value": 214.0, + "docs": "Sum of calls recorded.", + "type": "gauge" + }, + { + "name": "ironic_conductormanager_check_cleanwait_timeouts_time", + "labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "conductor"}, + "value": 2424.9556064605713, + "docs": "Total time (ms) spent.", + "type": "gauge" + }, + { + "name": "ironic_conductormanager_check_cleanwait_timeouts_call_count", + "labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "conductor"}, + "value": 214.0, + "docs": "Sum of calls recorded.", + "type": "gauge" + }, + { + "name": "ironic_conductormanager_check_rescuewait_timeouts_time", + "labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "conductor"}, + "value": 1639.7652626037598, + "docs": "Total time (ms) spent.", + "type": "gauge" + }, + { + "name": "ironic_conductormanager_check_rescuewait_timeouts_call_count", + "labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "conductor"}, + "value": 214.0, + "docs": "Sum of calls recorded.", + "type": "gauge" + }, + { + "name": "ironic_conductormanager_check_orphan_nodes_time", + "labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "conductor"}, + "value": 1608.001708984375, + "docs": "Total time (ms) spent.", + "type": "gauge" + }, + { + "name": "ironic_conductormanager_check_orphan_nodes_call_count", + "labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "conductor"}, + "value": 214.0, + "docs": "Sum of calls recorded.", + "type": "gauge" + }, + { + "name": 
"ironic_conductormanager_check_orphan_allocations_time", + "labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "conductor"}, + "value": 1797.1274852752686, + "docs": "Total time (ms) spent.", + "type": "gauge" + }, + { + "name": "ironic_conductormanager_check_orphan_allocations_call_count", + "labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "conductor"}, + "value": 214.0, + "docs": "Sum of calls recorded.", + "type": "gauge" + }, + { + "name": "ironic_redfishmanagement_query_firmware_update_status_time", + "labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "driver", "driver": "redfish"}, + "value": 1598.8798141479492, + "docs": "Total time (ms) spent.", + "type": "gauge" + }, + { + "name": "ironic_redfishmanagement_query_firmware_update_status_call_count", + "labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "driver", "driver": "redfish"}, + "value": 214.0, + "docs": "Sum of calls recorded.", + "type": "gauge" + }, + { + "name": "ironic_redfishmanagement_query_firmware_update_failed_time", + "labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "driver", "driver": "redfish"}, + "value": 1955.6934833526611, + "docs": "Total time (ms) spent.", + "type": "gauge" + }, + { + "name": "ironic_redfishmanagement_query_firmware_update_failed_call_count", + "labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "driver", "driver": "redfish"}, + "value": 214.0, + "docs": "Sum of calls recorded.", + "type": "gauge" + }, + { + "name": "ironic_do_sync_power_state_time", + "labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "conductor"}, + "value": 16819.172620773315, + "docs": "Total time (ms) spent.", + "type": "gauge" + }, + { + "name": "ironic_do_sync_power_state_call_count", + "labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "conductor"}, + "value": 214.0, + "docs": "Sum of 
calls recorded.", + "type": "gauge" + }, + { + "name": "ironic_do_sync_power_state", + "labels": {"hostname": "a-test-conductor", "service": "ironic"}, + "value": 0.0, + "docs": "Point in time count of data point.", + "type": "gauge" + }, + { + "name": "ironic_conductormanager_sync_power_states_time", + "labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "conductor"}, + "value": 23187.243700027466, + "docs": "Total time (ms) spent.", + "type": "gauge" + }, + { + "name": "ironic_conductormanager_sync_power_states_call_count", + "labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "conductor"}, + "value": 214.0, + "docs": "Sum of calls recorded.", + "type": "gauge" + }, + { + "name": "ironic_pxe_base_pxebasemixin_check_boot_timeouts_time", + "labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "driver", "driver": "pxe"}, + "value": 1078.2177448272705, + "docs": "Total time (ms) spent.", + "type": "gauge" + }, + { + "name": "ironic_pxe_base_pxebasemixin_check_boot_timeouts_call_count", + "labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "driver", "driver": "pxe"}, + "value": 142.0, + "docs": "Sum of calls recorded.", + "type": "gauge" + }, + { + "name": "ironic_conductormanager_sensors_nodes_task_time", + "labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "conductor"}, + "value": 10648.590803146362, + "docs": "Total time (ms) spent.", + "type": "gauge" + }, + { + "name": "ironic_conductormanager_sensors_nodes_task_call_count", + "labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "conductor"}, + "value": 142.0, + "docs": "Sum of calls recorded.", + "type": "gauge" + }, + { + "name": "ironic_conductormanager_send_sensor_data_time", + "labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "conductor"}, + "value": 15062.613010406494, + "docs": "Total time (ms) spent.", + "type": "gauge" + }, + { + 
"name": "ironic_conductormanager_send_sensor_data_call_count", + "labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "conductor"}, + "value": 142.0, + "docs": "Sum of calls recorded.", + "type": "gauge" + }, + { + "name": "ironic_conductormanager_sync_local_state_time", + "labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "conductor"}, + "value": 276.4444351196289, + "docs": "Total time (ms) spent.", + "type": "gauge" + }, + { + "name": "ironic_conductormanager_sync_local_state_call_count", + "labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "conductor"}, + "value": 71.0, + "docs": "Sum of calls recorded.", + "type": "gauge" + }, + { + "name": "ironic_conductormanager_power_failure_recovery_time", + "labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "conductor"}, + "value": 166.60714149475098, + "docs": "Total time (ms) spent.", + "type": "gauge" + }, + { + "name": "ironic_conductormanager_power_failure_recovery_call_count", + "labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "conductor"}, + "value": 42.0, + "docs": "Sum of calls recorded.", + "type": "gauge" + }, + { + "name": "ironic_conductormanager_clean_up_caches_time", + "labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "conductor"}, + "value": 42.63949394226074, + "docs": "Total time (ms) spent.", + "type": "gauge" + }, + { + "name": "ironic_conductormanager_clean_up_caches_call_count", + "labels": {"hostname": "a-test-conductor", "service": "ironic", "component": "conductor"}, + "value": 3.0, + "docs": "Sum of calls recorded.", + "type": "gauge" + } +] diff --git a/ironic_prometheus_exporter/tests/json_samples/notification-ironic.json b/ironic_prometheus_exporter/tests/json_samples/notification-ironic.json new file mode 100644 index 0000000..5c2d9d6 --- /dev/null +++ b/ironic_prometheus_exporter/tests/json_samples/notification-ironic.json @@ -0,0 
+1,111 @@ +{ + "priority": "INFO", + "event_type": "ironic.metrics.update", + "timestamp": "2019-03-29 20:12:26.885347", + "publisher_id": "None.localhost.localdomain", + "payload": { + "instance_uuid": "ac2aa2fd-6e1a-41c8-a114-2084c8705228", + "node_uuid": "ac2aa2fd-6e1a-41c8-a114-2084c8705228", + "event_type": "hardware.redfish.metrics.update", + "timestamp": "2019-03-29T20:12:22.989020", + "message_id": "85d6b2c8-fe57-432d-868a-330e0e28cf34", + "hostname": "a-test-conductor", + "payload": { + "ironic.drivers.modules.agent_base.post_clean_step_hook": { + "count": 3, + "sum": 0.0050067901611328125, + "type": "timer" + }, + "ironic.drivers.modules.agent_base.post_deploy_step_hook": { + "count": 2, + "sum": 0.010013580322265625, + "type": "timer" + }, + "ironic.conductor.manager.ConductorManager._check_inspect_wait_timeouts": { + "count": 214, + "sum": 2007.3809623718262, + "type": "timer" + }, + "ironic.conductor.manager.ConductorManager._check_deploy_timeouts": { + "count": 214, + "sum": 2200.153350830078, + "type": "timer" + }, + "ironic.conductor.manager.ConductorManager._check_cleanwait_timeouts": { + "count": 214, + "sum": 2424.9556064605713, + "type": "timer" + }, + "ironic.conductor.manager.ConductorManager._check_rescuewait_timeouts": { + "count": 214, + "sum": 1639.7652626037598, + "type": "timer" + }, + "ironic.conductor.manager.ConductorManager._check_orphan_nodes": { + "count": 214, + "sum": 1608.001708984375, + "type": "timer" + }, + "ironic.conductor.manager.ConductorManager._check_orphan_allocations": { + "count": 214, + "sum": 1797.1274852752686, + "type": "timer" + }, + "ironic.drivers.modules.redfish.management.RedfishManagement._query_firmware_update_status": { + "count": 214, + "sum": 1598.8798141479492, + "type": "timer" + }, + "ironic.drivers.modules.redfish.management.RedfishManagement._query_firmware_update_failed": { + "count": 214, + "sum": 1955.6934833526611, + "type": "timer" + }, + "ironic.conductor.manager.do_sync_power_state": { + 
"count": 214, + "sum": 16819.172620773315, + "type": "timer" + }, + "ConductorManager.FailingPowerSyncCount": { + "value": 0, + "type": "gauge" + }, + "ironic.conductor.manager.ConductorManager._sync_power_states": { + "count": 214, + "sum": 23187.243700027466, + "type": "timer" + }, + "ironic.drivers.modules.pxe_base.PXEBaseMixin._check_boot_timeouts": { + "count": 142, + "sum": 1078.2177448272705, + "type": "timer" + }, + "ironic.conductor.manager.ConductorManager._sensors_nodes_task": { + "count": 142, + "sum": 10648.590803146362, + "type": "timer" + }, + "ironic.conductor.manager.ConductorManager._send_sensor_data": { + "count": 142, + "sum": 15062.613010406494, + "type": "timer" + }, + "ironic.conductor.manager.ConductorManager._sync_local_state": { + "count": 71, + "sum": 276.4444351196289, + "type": "timer" + }, + "ironic.conductor.manager.ConductorManager._power_failure_recovery": { + "count": 42, + "sum": 166.60714149475098, + "type": "timer" + }, + "ironic.conductor.manager.ConductorManager._clean_up_caches": { + "count": 3, + "sum": 42.63949394226074, + "type": "timer" + } + } + }, + "message_id": "2c0da1e8-1958-484f-9bdd-9117d717f7fa" +} diff --git a/ironic_prometheus_exporter/tests/test_ironic_parser.py b/ironic_prometheus_exporter/tests/test_ironic_parser.py new file mode 100644 index 0000000..cdceb1f --- /dev/null +++ b/ironic_prometheus_exporter/tests/test_ironic_parser.py @@ -0,0 +1,92 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
class TestIronicPayloadParser(unittest.TestCase):
    """Compares ``ironic.category_registry`` output with recorded samples."""

    def setUp(self):
        # The parser is handed the notification's ``payload`` member.
        self.message = DATA['payload']

    def test_category_registry(self):
        """Every collected metric must match its entry in ``EXPECTED``.

        With ``DUMP_JSON`` enabled nothing is asserted; the collected
        metrics are printed instead so the expected data set can be
        regenerated.
        """
        registry = CollectorRegistry()

        ironic.category_registry(self.message, registry)
        entry_count = 0
        for entry in registry.collect():
            # NOTE(TheJulia): We don't get the results back in any order
            # which makes sense.
            sample = entry.samples[0]
            entry_count += 1

            if DUMP_JSON:
                # Helper output to regenerate the known data set.
                print(' {')
                print(f' \"name\": \"{sample.name}\",')
                print(' \"labels\": %s,' % json.dumps(sample.labels))
                print(f' \"value\": {sample.value},')
                print(f' \"docs\": \"{entry.documentation}\",')
                print(f' \"type\": \"{entry.type}\"')
                print(' },')
                continue

            # Find the entry by name, since access order is unreliable.
            expected_entry = next(
                (item for item in EXPECTED if item['name'] == sample.name),
                None)
            # Fail with a readable message rather than a TypeError on None.
            self.assertIsNotNone(
                expected_entry,
                'Unexpected metric produced: %s' % sample.name)

            self.assertEqual(sample.name, expected_entry['name'])
            self.assertDictEqual(sample.labels, expected_entry['labels'])
            self.assertEqual(sample.value, expected_entry['value'])
            self.assertEqual(entry.documentation, expected_entry['docs'])
            self.assertEqual(entry.type, expected_entry['type'])
            # A unittest assertion is used because a bare ``assert`` is
            # stripped when Python runs with -O.
            self.assertFalse(any(char.isupper() for char in sample.name))

        if not DUMP_JSON:
            self.assertEqual(len(EXPECTED), entry_count)