Skip to content

Commit

Permalink
collect some counters as gauges (#19459)
Browse files Browse the repository at this point in the history
* collect some counters as gauges

* changelog

* lint and comment

* lint again
  • Loading branch information
steveny91 authored Jan 22, 2025
1 parent 22c0eeb commit fb7123d
Show file tree
Hide file tree
Showing 4 changed files with 51 additions and 14 deletions.
1 change: 1 addition & 0 deletions dcgm/changelog.d/19459.added
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Add the gauge/total version of some monotonic counter metrics
35 changes: 28 additions & 7 deletions dcgm/datadog_checks/dcgm/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,26 +10,47 @@
'DCGM_FI_DEV_FB_USED': 'frame_buffer.used',
'DCGM_FI_DEV_GPU_TEMP': 'temperature',
'DCGM_FI_DEV_GPU_UTIL': 'gpu_utilization',
'DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL': 'nvlink_bandwidth',
'DCGM_FI_DEV_PCIE_REPLAY_COUNTER': 'pcie_replay', # becomes pcie_replay.count
'DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL': {
'name': 'nvlink_bandwidth',
'type': 'counter_gauge',
}, # becomes nvlink_bandwidth.total and nvlink_bandwidth.count
'DCGM_FI_DEV_PCIE_REPLAY_COUNTER': {
'name': 'pcie_replay',
'type': 'counter_gauge',
}, # becomes pcie_replay.total and pcie_replay.count
'DCGM_FI_DEV_POWER_USAGE': 'power_usage',
'DCGM_FI_DEV_SM_CLOCK': 'sm_clock',
'DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION': 'total_energy_consumption',
'DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION': {
'name': 'total_energy_consumption',
'type': 'counter_gauge',
}, # becomes total_energy_consumption.total and total_energy_consumption.count
'DCGM_FI_DEV_VGPU_LICENSE_STATUS': 'vgpu_license_status',
'DCGM_FI_DEV_XID_ERRORS': 'xid_errors',
# Metrics related to memory get grouped together because there are more of them available.
'DCGM_FI_DEV_MEM_CLOCK': 'mem.clock',
'DCGM_FI_DEV_MEM_COPY_UTIL': 'mem.copy_utilization',
'DCGM_FI_DEV_MEMORY_TEMP': 'mem.temperature',
# NVML Specific Missing Metrics (5)
'DCGM_FI_DEV_COUNT': 'device', # becomes device.count
'DCGM_FI_DEV_COUNT': {
'name': 'device',
'type': 'counter_gauge',
}, # becomes device.total and device.count
'DCGM_FI_DEV_FAN_SPEED': 'fan_speed',
'DCGM_FI_PROF_PCIE_RX_BYTES': 'pcie_rx_throughput',
'DCGM_FI_PROF_PCIE_TX_BYTES': 'pcie_tx_throughput',
'DCGM_FI_PROF_PCIE_RX_BYTES': {
'name': 'pcie_rx_throughput',
'type': 'counter_gauge',
}, # becomes pcie_rx_throughput.total and pcie_rx_throughput.count
'DCGM_FI_PROF_PCIE_TX_BYTES': {
'name': 'pcie_tx_throughput',
'type': 'counter_gauge',
}, # becomes pcie_tx_throughput.total and pcie_tx_throughput.count
# Others from default-counters.csv
'DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS': 'correctable_remapped_rows',
'DCGM_FI_DEV_ROW_REMAP_FAILURE': 'row_remap_failure',
'DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS': 'uncorrectable_remapped_rows',
'DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS': {
'name': 'uncorrectable_remapped_rows',
'type': 'counter_gauge',
}, # becomes uncorrectable_remapped_rows.total and uncorrectable_remapped_rows.count
# More recommended metrics
'DCGM_FI_DEV_CLOCK_THROTTLE_REASONS': 'clock_throttle_reasons',
'DCGM_FI_DEV_FB_RESERVED': 'frame_buffer.reserved',
Expand Down
21 changes: 14 additions & 7 deletions dcgm/metadata.csv
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@ metric_name,metric_type,interval,unit_name,per_unit_name,description,orientation
dcgm.clock_throttle_reasons,gauge,,,,Current clock throttle reasons (bitmask of DCGM_CLOCKS_THROTTLE_REASON_*),0,dcgm,,
dcgm.correctable_remapped_rows.count,count,,row,,Number of remapped rows for correctable errors.,0,dcgm,,
dcgm.dec_utilization,gauge,,percent,,Decoder utilization (in %).,0,dcgm,,
dcgm.device.count,count,,device,,Number of Devices on the node.,0,dcgm,,
dcgm.device.count,count,,device,,Change in the number of devices on the node.,0,dcgm,,
dcgm.device.total,gauge,,,,Number of Devices on the node.,0,dcgm,,
dcgm.dram.active,gauge,,fraction,,Ratio of cycles the device memory interface is active sending or receiving data (in %).,0,dcgm,,
dcgm.enc_utilization,gauge,,percent,,Encoder utilization (in %).,0,dcgm,,
dcgm.fan_speed,gauge,,percent,,Fan speed for the device in percent 0-100.,0,dcgm,,
Expand All @@ -16,10 +17,14 @@ dcgm.gr_engine_active,gauge,,fraction,,Ratio of time the graphics engine is acti
dcgm.mem.clock,gauge,,megahertz,,Memory clock frequency (in MHz).,0,dcgm,,
dcgm.mem.copy_utilization,gauge,,percent,,Memory utilization (in %).,0,dcgm,,
dcgm.mem.temperature,gauge,,degree celsius,,Memory temperature (in C).,0,dcgm,,
dcgm.nvlink_bandwidth.count,count,,,,Total number of NVLink bandwidth counters for all lanes,0,dcgm,,
dcgm.pcie_replay.count,count,,,,Total number of PCIe retries.,0,dcgm,,
dcgm.pcie_rx_throughput.count,count,,,,PCIe Rx utilization information.,0,dcgm,,
dcgm.pcie_tx_throughput.count,count,,,,PCIe Tx utilization information.,0,dcgm,,
dcgm.nvlink_bandwidth.count,count,,,,Change in number of NVLink bandwidth counters for all lanes,0,dcgm,,
dcgm.nvlink_bandwidth.total,gauge,,,,Total number of NVLink bandwidth counters for all lanes,0,dcgm,,
dcgm.pcie_replay.count,count,,,,Change in number of PCIe retries.,0,dcgm,,
dcgm.pcie_replay.total,gauge,,,,Total number of PCIe retries.,0,dcgm,,
dcgm.pcie_rx_throughput.count,count,,,,Change in PCIe Rx utilization information.,0,dcgm,,
dcgm.pcie_rx_throughput.total,gauge,,,,PCIe Rx utilization information.,0,dcgm,,
dcgm.pcie_tx_throughput.count,count,,,,Change in PCIe Tx utilization information.,0,dcgm,,
dcgm.pcie_tx_throughput.total,gauge,,,,PCIe Tx utilization information.,0,dcgm,,
dcgm.pipe.fp16_active,gauge,,fraction,,Ratio of cycles the fp16 pipes are active (in %).,0,dcgm,,
dcgm.pipe.fp32_active,gauge,,fraction,,Ratio of cycles the fp32 pipes are active (in %).,0,dcgm,,
dcgm.pipe.fp64_active,gauge,,fraction,,Ratio of cycles the fp64 pipes are active (in %).,0,dcgm,,
Expand All @@ -33,7 +38,9 @@ dcgm.sm_active,gauge,,fraction,,The ratio of cycles an SM has at least 1 warp as
dcgm.sm_clock,gauge,,megahertz,,SM clock frequency (in MHz).,0,dcgm,,
dcgm.sm_occupancy,gauge,,fraction,,The ratio of number of warps resident on an SM (in %).,0,dcgm,,
dcgm.temperature,gauge,,degree celsius,,GPU temperature (in C).,0,dcgm,,
dcgm.total_energy_consumption.count,count,,millijoule,,Total energy consumption since boot (in mJ).,0,dcgm,,
dcgm.uncorrectable_remapped_rows.count,count,,row,,Number of remapped rows for uncorrectable errors.,0,dcgm,,
dcgm.total_energy_consumption.count,count,,millijoule,,Change in energy consumption (in mJ).,0,dcgm,,
dcgm.total_energy_consumption.total,gauge,,,,Total energy consumption since boot (in mJ).,0,dcgm,,
dcgm.uncorrectable_remapped_rows.count,count,,row,,Change in number of remapped rows for uncorrectable errors.,0,dcgm,,
dcgm.uncorrectable_remapped_rows.total,gauge,,,,Total number of remapped rows for uncorrectable errors.,0,dcgm,,
dcgm.vgpu_license_status,gauge,,,,vGPU License status,0,dcgm,,
dcgm.xid_errors,gauge,,,,Value of the last XID error encountered.,0,dcgm,,
8 changes: 8 additions & 0 deletions dcgm/tests/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
'correctable_remapped_rows.count',
'dec_utilization',
'device.count',
'device.total',
'dram.active',
'enc_utilization',
'fan_speed',
Expand All @@ -33,9 +34,13 @@
'mem.copy_utilization',
'mem.temperature',
'nvlink_bandwidth.count',
'nvlink_bandwidth.total',
'pcie_replay.count',
'pcie_replay.total',
'pcie_rx_throughput.count',
'pcie_rx_throughput.total',
'pcie_tx_throughput.count',
'pcie_tx_throughput.total',
'pipe.fp16_active',
'pipe.fp32_active',
'pipe.fp64_active',
Expand All @@ -50,9 +55,12 @@
'sm_occupancy',
'temperature',
'total_energy_consumption.count',
'total_energy_consumption.total',
'uncorrectable_remapped_rows.count',
'uncorrectable_remapped_rows.total',
'vgpu_license_status',
'xid_errors',
]

# Qualify every bare metric name with the 'dcgm.' prefix.
EXPECTED_METRICS = [f'dcgm.{metric_name}' for metric_name in EXPECTED_METRICS]
# Guard against unordered additions: the list must already equal its sorted form.
assert EXPECTED_METRICS == sorted(EXPECTED_METRICS), 'Please keep this list in alphabetic order!'

0 comments on commit fb7123d

Please sign in to comment.