a8416f968e
This is the action item to implement the spec: doc/source/specs/2025.1/chart_versioning.rst Depends-On: I327103c18fc0e10e989a17f69b3bff9995c45eb4 Depends-On: I7bfdef3ea2128bbb4e26e3a00161fe30ce29b8e7 Change-Id: I4974785c904cf7c8730279854e3ad9b6b7c35498
317 lines
12 KiB
YAML
317 lines
12 KiB
YAML
---
|
|
conf:
|
|
check_scripts:
|
|
nvidia_vgpu: |
|
|
# Copyright (c) 2018 StackHPC Ltd.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
|
# not use this file except in compliance with the License. You may obtain
|
|
# a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
|
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
|
# License for the specific language governing permissions and limitations
|
|
# under the License.
|
|
|
|
import logging
|
|
|
|
import monasca_agent.collector.checks as checks
|
|
from py3nvml import py3nvml as pynvml
|
|
|
|
|
|
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
|
|
|
|
# Prefix joined (with a dot) in front of every measurement name emitted by the check.
_METRIC_NAME_PREFIX = "nvidia"
|
|
|
|
|
|
class Nvidia(checks.AgentCheck):
    """Agent check that publishes per-GPU metrics collected through NVML.

    Each call to :meth:`check` initialises NVML, walks every device index
    reported by ``nvmlDeviceGetCount`` and emits one gauge per measurement,
    named ``<_METRIC_NAME_PREFIX>.<measurement>``.
    """

    def __init__(self, name, init_config, agent_config):
        super(Nvidia, self).__init__(name, init_config, agent_config)

    def handle_not_supported(f):
        """Decorate an NVML query so an unsupported feature yields ``{}``.

        The wrapped callable is expected to return a dict. When it raises an
        ``NVMLError`` equal to ``NVML_ERROR_NOT_SUPPORTED`` the failure is
        logged at info level and an empty dict is returned, so callers can
        unconditionally ``dict.update()`` with the result. Any other
        ``NVMLError`` is re-raised unchanged.
        """
        def wrapper(*args, **kw):
            try:
                return f(*args, **kw)
            except pynvml.NVMLError as err:
                if err == pynvml.NVMLError(pynvml.NVML_ERROR_NOT_SUPPORTED):
                    log.info('Not supported: %s', f.__name__)
                    return {}
                raise
        return wrapper

    # NOTE: ``@staticmethod`` is applied outermost on every getter below so
    # that ``handle_not_supported`` wraps the plain function first.

    @staticmethod
    @handle_not_supported
    def _get_driver_version():
        """Return ``{'driver_version': ...}`` from the NVML system query."""
        return {'driver_version': pynvml.nvmlSystemGetDriverVersion()}

    @staticmethod
    @handle_not_supported
    def _get_fan_speed_percent(gpu):
        """Return ``{'fan_speed_percent': ...}`` for the device handle."""
        return {'fan_speed_percent': pynvml.nvmlDeviceGetFanSpeed(gpu)}

    @staticmethod
    @handle_not_supported
    def _get_device_name(gpu):
        """Return ``{'name': ...}`` for the device handle."""
        return {'name': pynvml.nvmlDeviceGetName(gpu)}

    @staticmethod
    @handle_not_supported
    def _get_device_serial(gpu):
        """Return ``{'serial': ...}`` for the device handle."""
        return {'serial': pynvml.nvmlDeviceGetSerial(gpu)}

    @staticmethod
    @handle_not_supported
    def _get_device_uuid(gpu):
        """Return ``{'uuid': ...}`` for the device handle."""
        return {'uuid': pynvml.nvmlDeviceGetUUID(gpu)}

    @staticmethod
    @handle_not_supported
    def _get_device_vbios_version(gpu):
        """Return ``{'vbios_version': ...}`` for the device handle."""
        return {'vbios_version': pynvml.nvmlDeviceGetVbiosVersion(gpu)}

    @staticmethod
    @handle_not_supported
    def _get_info_rom_image_version(gpu):
        """Return ``{'info_rom_image_version': ...}`` for the device handle."""
        return {'info_rom_image_version':
                pynvml.nvmlDeviceGetInforomImageVersion(gpu)}

    @staticmethod
    @handle_not_supported
    def _get_device_power_state(gpu):
        """Return ``{'power_state': 'P<n>'}`` built from the NVML power state."""
        power_state = "P{}".format(pynvml.nvmlDeviceGetPowerState(gpu))
        return {'power_state': power_state}

    @staticmethod
    @handle_not_supported
    def _get_framebuffer_memory_stats(gpu):
        """Return total/used/free framebuffer memory for the device handle.

        The free figure is derived as ``total - used`` rather than read from
        the NVML structure.
        """
        mem_info = pynvml.nvmlDeviceGetMemoryInfo(gpu)
        return {
            'memory_fb_total_bytes': mem_info.total,
            'memory_fb_used_bytes': mem_info.used,
            'memory_fb_free_bytes': (mem_info.total - mem_info.used)
        }

    @staticmethod
    @handle_not_supported
    def _get_bar1_memory_stats(gpu):
        """Return total/used/free BAR1 memory for the device handle.

        The free figure is derived as ``bar1Total - bar1Used``.
        """
        mem_info = pynvml.nvmlDeviceGetBAR1MemoryInfo(gpu)
        return {
            'memory_bar1_total_bytes': mem_info.bar1Total,
            'memory_bar1_used_bytes': mem_info.bar1Used,
            'memory_bar1_free_bytes': (mem_info.bar1Total - mem_info.bar1Used)
        }

    @staticmethod
    @handle_not_supported
    def _get_utilisation_stats(gpu):
        """Return GPU and memory utilisation rates for the device handle."""
        util = pynvml.nvmlDeviceGetUtilizationRates(gpu)
        return {
            'utilisation_gpu_percent': util.gpu,
            'utilisation_memory_percent': util.memory
        }

    @staticmethod
    @handle_not_supported
    def _get_device_temperature(gpu):
        """Return ``{'temperature_deg_c': ...}`` from the GPU temperature sensor."""
        return {'temperature_deg_c':
                pynvml.nvmlDeviceGetTemperature(
                    gpu, pynvml.NVML_TEMPERATURE_GPU)}

    @staticmethod
    @handle_not_supported
    def _get_device_shutdown_temp(gpu):
        """Return the shutdown temperature threshold for the device handle."""
        return {'temperature_shutdown_deg_c':
                pynvml.nvmlDeviceGetTemperatureThreshold(
                    gpu, pynvml.NVML_TEMPERATURE_THRESHOLD_SHUTDOWN)}

    @staticmethod
    @handle_not_supported
    def _get_device_slowdown_temp(gpu):
        """Return the slowdown temperature threshold for the device handle."""
        return {'temperature_slowdown_deg_c':
                pynvml.nvmlDeviceGetTemperatureThreshold(
                    gpu, pynvml.NVML_TEMPERATURE_THRESHOLD_SLOWDOWN)}

    @staticmethod
    @handle_not_supported
    def _get_power_usage_watts(gpu):
        """Return ``{'power_watts': ...}``: the NVML power usage divided by 1000.

        NOTE(review): the division presumes NVML reports milliwatts; confirm
        against the NVML API reference.
        """
        return {'power_watts': (pynvml.nvmlDeviceGetPowerUsage(gpu) / 1000.0)}

    @staticmethod
    @handle_not_supported
    def _get_power_limit_watts(gpu):
        """Return ``{'power_limit_watts': ...}``: the NVML limit divided by 1000."""
        return {'power_limit_watts': (
            pynvml.nvmlDeviceGetPowerManagementLimit(gpu) / 1000.0)}

    @staticmethod
    @handle_not_supported
    def _get_clock_info(gpu):
        """Return current graphics/SM/memory/video clock readings."""
        return {
            'clock_freq_gpu_mhz':
                pynvml.nvmlDeviceGetClockInfo(gpu, pynvml.NVML_CLOCK_GRAPHICS),
            'clock_freq_sm_mhz':
                pynvml.nvmlDeviceGetClockInfo(gpu, pynvml.NVML_CLOCK_SM),
            'clock_freq_memory_mhz':
                pynvml.nvmlDeviceGetClockInfo(gpu, pynvml.NVML_CLOCK_MEM),
            'clock_freq_video_mhz':
                pynvml.nvmlDeviceGetClockInfo(gpu, pynvml.NVML_CLOCK_VIDEO)
        }

    @staticmethod
    @handle_not_supported
    def _get_clock_max_info(gpu):
        """Return maximum graphics/SM/memory/video clock readings."""
        return {
            'clock_max_freq_gpu_mhz':
                pynvml.nvmlDeviceGetMaxClockInfo(
                    gpu, pynvml.NVML_CLOCK_GRAPHICS),
            'clock_max_freq_sm_mhz':
                pynvml.nvmlDeviceGetMaxClockInfo(gpu, pynvml.NVML_CLOCK_SM),
            'clock_max_freq_memory_mhz':
                pynvml.nvmlDeviceGetMaxClockInfo(gpu, pynvml.NVML_CLOCK_MEM),
            'clock_max_freq_video_mhz':
                pynvml.nvmlDeviceGetMaxClockInfo(gpu, pynvml.NVML_CLOCK_VIDEO)
        }

    @staticmethod
    def _get_gpu_info():
        """Collect dimensions and measurements for every NVML device.

        Returns a list with one dict per device index, each holding:

        * ``name``: ``"<device name>_<serial>"`` (a part that is not
          supported is rendered as ``None``),
        * ``dimensions``: driver version, UUID, InfoROM image version,
          power state and VBIOS version,
        * ``measurements``: the numeric readings gathered by the getters.

        NVML is initialised on entry and always shut down on exit, including
        when a query raises, so a failed collection cycle does not leave the
        library initialised.
        """
        pynvml.nvmlInit()
        try:
            all_info = []
            for index in range(pynvml.nvmlDeviceGetCount()):
                gpu = pynvml.nvmlDeviceGetHandleByIndex(index)

                dimensions = {}
                dimensions.update(Nvidia._get_driver_version())
                dimensions.update(Nvidia._get_device_uuid(gpu))
                dimensions.update(Nvidia._get_info_rom_image_version(gpu))
                dimensions.update(Nvidia._get_device_power_state(gpu))
                dimensions.update(Nvidia._get_device_vbios_version(gpu))

                measurements = {}
                measurements.update(Nvidia._get_fan_speed_percent(gpu))
                measurements.update(Nvidia._get_framebuffer_memory_stats(gpu))
                measurements.update(Nvidia._get_bar1_memory_stats(gpu))
                measurements.update(Nvidia._get_utilisation_stats(gpu))
                measurements.update(Nvidia._get_device_temperature(gpu))
                measurements.update(Nvidia._get_device_shutdown_temp(gpu))
                measurements.update(Nvidia._get_device_slowdown_temp(gpu))
                measurements.update(Nvidia._get_power_usage_watts(gpu))
                measurements.update(Nvidia._get_power_limit_watts(gpu))
                measurements.update(Nvidia._get_clock_info(gpu))
                measurements.update(Nvidia._get_clock_max_info(gpu))

                gpu_name = "{}_{}".format(
                    Nvidia._get_device_name(gpu).get('name'),
                    Nvidia._get_device_serial(gpu).get('serial'))
                all_info.append({
                    'name': gpu_name,
                    'dimensions': dimensions,
                    'measurements': measurements
                })
            return all_info
        finally:
            # Release NVML even if a device query raised part-way through.
            pynvml.nvmlShutdown()

    def check(self, instance):
        """Emit one gauge per measurement for every GPU found.

        ``instance`` is accepted as part of the check signature but is not
        read by this method.
        """
        for gpu_metrics in Nvidia._get_gpu_info():
            for measurement, value in gpu_metrics['measurements'].items():
                metric_name = '{0}.{1}'.format(
                    _METRIC_NAME_PREFIX, measurement)
                self.gauge(metric_name,
                           value,
                           device_name=gpu_metrics.get('name'),
                           dimensions=gpu_metrics.get('dimensions'),
                           value_meta=None)
            log.debug('Collected info for GPU %s', gpu_metrics.get('name'))
|
|
|
|
detection_scripts:
|
|
nvidia_vgpu: |
|
|
# Copyright (c) 2018 StackHPC Ltd.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
|
# not use this file except in compliance with the License. You may obtain
|
|
# a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
|
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
|
# License for the specific language governing permissions and limitations
|
|
# under the License.
|
|
|
|
import logging
|
|
import subprocess
|
|
|
|
import monasca_setup.agent_config
|
|
import monasca_setup.detection
|
|
|
|
# Module-level logger, named after this module.
LOG = logging.getLogger(__name__)
|
|
|
|
|
|
class NvidiaDetect(monasca_setup.detection.Plugin):
    """Detects and configures nVidia plugin."""

    def _detect(self):
        """Set ``self.available`` when ``lshw`` lists an nVidia display device.

        Runs ``lshw -C display`` and looks for ``nvidia`` (case-insensitive)
        in its output. If ``lshw`` cannot be executed or exits with a
        non-zero status, the plugin is left unavailable and the failure is
        logged instead of propagating out of detection.
        """
        self.available = False
        try:
            display_info = subprocess.check_output(["lshw", "-C", "display"])
        except (OSError, subprocess.CalledProcessError) as err:
            # OSError: lshw is missing or not executable.
            # CalledProcessError: lshw ran but returned a non-zero status.
            LOG.info('Unable to query display hardware with lshw: %s', err)
            return
        if b'nvidia' not in display_info.lower():
            LOG.info('No nVidia hardware detected.')
            return
        self.available = True

    def build_config(self):
        """Return a plugin config with a single ``nvidia`` check instance."""
        config = monasca_setup.agent_config.Plugins()
        config['nvidia'] = {
            'init_config': None,
            'instances': [{'name': 'nvidia_stats'}]}
        return config
|
|
|
|
agent_plugins:
|
|
nvidia_vgpu:
|
|
auto_detect: true
|
|
config:
|
|
cache_dir: /dev/shm
|
|
nova_refresh: "14400"
|
|
pod:
|
|
security_context:
|
|
agent:
|
|
container:
|
|
monasca_collector:
|
|
runAsUser: 0
|
|
privileged: true
|
|
allowPrivilegeEscalation: true
|
|
|
|
mounts:
|
|
monasca_agent:
|
|
monasca_collector:
|
|
volumeMounts:
|
|
- name: varliblibvirt
|
|
mountPath: /var/lib/libvirt
|
|
readOnly: true
|
|
- mountPath: /lib/modules
|
|
name: libmodules
|
|
readOnly: true
|
|
- name: varlibnova
|
|
mountPath: /var/lib/nova
|
|
- name: hostproc
|
|
mountPath: /proc
|
|
volumes:
|
|
- name: libmodules
|
|
hostPath:
|
|
path: /lib/modules
|
|
- name: varliblibvirt
|
|
hostPath:
|
|
path: /var/lib/libvirt
|
|
- name: varlibnova
|
|
hostPath:
|
|
path: /var/lib/nova
|
|
- name: hostproc
|
|
hostPath:
|
|
path: /proc
|
|
...
|