-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathnvsmi.py
208 lines (169 loc) · 7.07 KB
/
nvsmi.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import print_function, division
from subprocess import Popen, PIPE
import collectd
import sys
import os
import re
# Runtime configuration shared by the collectd callbacks. `cb_config` fills it
# in from the plugin's configuration block.
_CONFIG = dict(
    # Path (or bare command name) of the `nvidia-smi` executable.
    bin='nvidia-smi',
    # Queries handed to `--query-gpu`, in the order they were configured.
    query_list=[],
    # Name reported for each query once the configured replacements are applied.
    new_names_list=[],
    # Converter function for each query that needs one, keyed by query.
    converters_dict={},
    # collectd type used for each query, parallel to `query_list`.
    type_list=[],
)

# Plugin name, used as the log prefix and as the `plugin` of dispatched values.
_PLUGIN_NAME = 'nvsmi'
# From https://developer.download.nvidia.com/compute/DCGM/docs/nvidia-smi-367.38.pdf:
# "It is recommended
# that users desiring consistency use either UUID or PCI bus ID, since
# device enumeration ordering is not guaranteed to be consistent between
# reboots and board serial number might be shared between multiple GPUs
# on the same board."
#
# Even though that passage is about the `--id` option, I don't think I should
# rely on the ordering of the output either. Using `pci.bus` should be more reliable.
# Converters turn a raw `nvidia-smi` field into a numeric string.
# Every converter takes a string and must return a string.


def _hex_to_dec(x):
    """Return the decimal representation of the hexadecimal string `x`."""
    return str(int(x, 16))


def _pstate(x):
    """Return the digits following the leading 'P' of `x` (e.g. "P8" -> "8").

    Raises `AttributeError` when `x` does not start with 'P' plus digits,
    because `re.match` then returns `None`.
    """
    return re.match(r'P(\d+)', x).group(1)


def _enabled(x):
    """Return '1' when `x` is "Enabled" (any letter case), otherwise '0'."""
    return '1' if x.lower() == 'enabled' else '0'


def _active(x):
    """Return '1' when `x` is "Active" (any letter case), otherwise '0'."""
    return '1' if x.lower() == 'active' else '0'


def _identity(x):
    """Return `x` unchanged."""
    # This is a little ugly. Not sure if there is any legit use for this.
    return x


CONVERTERS = {
    'hex_to_dec': _hex_to_dec,
    'pstate': _pstate,
    # "Enabled" or "Disabled"
    'enabled': _enabled,
    # "Active" or "Not Active"
    'active': _active,
    'identity': _identity,
}
# Queries whose raw value is passed through the 'hex_to_dec' converter.
_HEX_QUERIES = (
    'pci.bus',
    'pci.device',
    'pci.device_id',
    'pci.domain',
    'pci.sub_device_id',
    'clocks_throttle_reasons.supported',
    'clocks_throttle_reasons.active',
)

# Queries whose raw value is passed through the 'active' converter.
_ACTIVE_QUERIES = (
    'clocks_throttle_reasons.gpu_idle',
    'clocks_throttle_reasons.applications_clocks_setting',
    'clocks_throttle_reasons.sw_power_cap',
    'clocks_throttle_reasons.hw_slowdown',
    'clocks_throttle_reasons.hw_thermal_slowdown',
    'clocks_throttle_reasons.hw_power_brake_slowdown',
    'clocks_throttle_reasons.sw_thermal_slowdown',
    'clocks_throttle_reasons.sync_boost',
)

# Queries whose raw value is passed through the 'enabled' converter.
_ENABLED_QUERIES = (
    'accounting.mode',
    'display_active',
    'display_mode',
    'persistence_mode',
    'power.management',
)

# Converter to apply to each query that needs one, keyed by query name.
# Queries that are not listed are used exactly as `nvidia-smi` printed them.
QUERY_CONVERTERS = {}
QUERY_CONVERTERS.update(dict.fromkeys(_HEX_QUERIES, CONVERTERS['hex_to_dec']))
QUERY_CONVERTERS.update(dict.fromkeys(_ACTIVE_QUERIES, CONVERTERS['active']))
QUERY_CONVERTERS.update(dict.fromkeys(_ENABLED_QUERIES, CONVERTERS['enabled']))
QUERY_CONVERTERS['pstate'] = CONVERTERS['pstate']
# collectd type for the queries that should not use the default, 'gauge'.
# Only types that ship in the stock '/usr/share/collectd/types.db' are used,
# to keep things simple.
QUERY_TYPES = dict.fromkeys(
    ('fan.speed', 'utilization.gpu', 'utilization.memory'),
    'percent',
)
QUERY_TYPES.update(dict.fromkeys(
    ('temperature.gpu', 'temperature.memory'),
    'temperature',
))
def info(s):
    """Log `s` through `collectd.info`, prefixed with the plugin name."""
    message = '{}: {}'.format(_PLUGIN_NAME, s)
    collectd.info(message)
def error(s):
    """Log `s` through `collectd.error`, prefixed with the plugin name."""
    message = '{}: {}'.format(_PLUGIN_NAME, s)
    collectd.error(message)
def error_exit(message, exit_code=1):
    """Log `message` as an error, then raise `SystemExit`.

    Arguments:
        message: Text to log through `error`.
        exit_code: Exit status carried by the `SystemExit` exception.
    """
    error(message)
    # Use `sys.exit` rather than the bare `exit` helper: the latter is added
    # by the `site` module for interactive sessions and is not guaranteed to
    # exist in every interpreter. Both raise `SystemExit`.
    sys.exit(exit_code)
def make_replacements(replacements, s):
    """Apply every `(old, new)` pair of `replacements` to `s`, in order.

    Each pair is applied to the result of the previous one, so later pairs
    also see text introduced by earlier ones. Returns the resulting string.
    """
    renamed = s
    for pair in replacements:
        old, new = pair
        renamed = renamed.replace(old, new)
    return renamed
def cb_config(config):
    """Configuration callback registered with collectd.

    Reads the plugin's configuration block into `_CONFIG`, derives the
    per-query type, converter and reported-name tables, logs the resulting
    settings and registers `cb_read` as the read callback.

    Recognised keys (matched case-insensitively):
        Bin: Path to `nvidia-smi`; must be an existing file.
        QueryGPU: One or more queries; may be given several times.
        Interval: Interval passed to `collectd.register_read`. Optional.
        ReplaceDotWith: Replacement for every '.' in a query name.
        ReplaceUnderlineWith: Replacement for every '_' in a query name.
        Replace: Two values, the text to replace and its replacement.

    Unknown keys are logged and ignored. An invalid `Bin` path or a `Replace`
    with fewer than two values ends in `error_exit`.
    """
    replacements = []
    for node in config.children:
        key = node.key.lower()
        if key == 'bin':
            _CONFIG['bin'] = node.values[0]
            if not os.path.isfile(_CONFIG['bin']):
                error_exit('The path ({}) provided for "{}" does not exist. Exiting.'.format(_CONFIG['bin'], node.key))
        elif key == 'querygpu':
            _CONFIG['query_list'] += node.values
        elif key == 'interval':
            _CONFIG['interval'] = node.values[0]
        elif key == 'replacedotwith':
            replacements.append((r'.', node.values[0]))
        elif key == 'replaceunderlinewith':
            replacements.append(('_', node.values[0]))
        elif key == 'replace':
            # Fail with a readable message instead of a bare IndexError.
            if len(node.values) < 2:
                error_exit('"{}" needs two values (text to replace, replacement). Exiting.'.format(node.key))
            replacements.append((node.values[0], node.values[1]))
        else:
            info('Unknown config key "{}". Ignoring.'.format(node.key))

    # Previously, a list of converters was used. With one converter for each
    # query, necessarily. Since most values don't need a converter, I think a
    # dictionary is better.
    query_list = _CONFIG['query_list']
    _CONFIG['type_list'] = [QUERY_TYPES.get(q, 'gauge') for q in query_list]
    _CONFIG['converters_dict'] = {q: QUERY_CONVERTERS[q] for q in query_list if q in QUERY_CONVERTERS}
    _CONFIG['new_names_list'] = [make_replacements(replacements, q) for q in query_list]

    info('bin: {}'.format(_CONFIG['bin']))
    info('query_list: {}'.format(','.join(_CONFIG['query_list'])))
    info('type_list: {}'.format(','.join(_CONFIG['type_list'])))
    if replacements:
        info('new names for queries: {}'.format(','.join(_CONFIG['new_names_list'])))
    info('queries that need conversion: {}'.format(','.join(_CONFIG['converters_dict'])))

    # Call `register_read` here, so that we can set the interval.
    # `_CONFIG` has no 'interval' entry unless the key was configured, so only
    # pass the interval when there is one and otherwise leave it to collectd.
    interval = _CONFIG.get('interval')
    if interval is None:
        collectd.register_read(cb_read)
    else:
        collectd.register_read(cb_read, interval)
def nvidia_smi_query_gpu(bin_path, query_list, converters_dict, id_query='pci.bus', id_converter='hex_to_dec'):
    """Use `nvidia-smi --query-gpu` to query devices.

    Arguments:
        bin_path: Path to `nvidia-smi`.
        query_list: List of queries.
        converters_dict: Dictionary mapping a query to the function that
            converts its raw value. Queries without an entry are left as is.
        id_query: Query that will identify which GPU it is.
        id_converter: Key in `CONVERTERS` of the function used to convert the
            result of `id_query`. May be `None` to keep the raw id.

    Returns:
        Dictionary mapping each (converted) GPU id to `{'values': [...]}`,
        where the list holds one string per entry of `query_list`, in order.

    Calls `error_exit` when the command exits with a non-zero status or when
    its output is empty or does not have one field per requested query.
    """
    # The id query always goes first so it can be popped off every row.
    fields = [id_query] + list(query_list)
    cmd_list = [bin_path, '--query-gpu=' + ','.join(fields), '--format=csv,noheader,nounits']
    process = Popen(cmd_list, stdout=PIPE)
    output, _ = process.communicate()
    if process.returncode != 0:
        error_exit('{} exited with error code "{}".'.format(bin_path, process.returncode))

    # Decode before looking at the content: the pipe yields bytes on
    # Python 3, and bytes cannot be compared against str prefixes.
    text = output.decode('utf-8').strip()
    if not text:
        error_exit('The output from {} does not seem right.'.format(bin_path))

    result = {}
    for line in text.splitlines():
        values = re.split(r'\s*,\s*', line.strip())
        # The return code doesn't seem to be enough to check if it worked, so
        # check the output too: every row must carry exactly one field per
        # requested query (id included). This works for any `id_query`.
        if len(values) != len(fields):
            error_exit('The output from {} does not seem right.'.format(bin_path))

        # Grab GPU ID.
        gpu_id = values.pop(0)
        if id_converter is not None:
            gpu_id = CONVERTERS[id_converter](gpu_id)

        # Convert whatever needs to be converted. Walking the query list by
        # position also converts a query that was requested more than once.
        for idx, query in enumerate(query_list):
            converter_func = converters_dict.get(query)
            if converter_func is not None:
                values[idx] = converter_func(values[idx])

        result[gpu_id] = {
            'values': values,
        }
    return result
def cb_read(data=None):
    """Read callback: query the GPUs and dispatch one value per GPU and query.

    Arguments:
        data: Not used by this function.

    Each value is dispatched with the plugin name as `plugin`, the GPU id as
    `plugin_instance`, the query's entry in `_CONFIG['type_list']` as `type`
    and its entry in `_CONFIG['new_names_list']` as `type_instance`.
    """
    if not _CONFIG['query_list']:
        # Report through collectd's log like the rest of the plugin, rather
        # than printing to stderr.
        error('Nothing to query with.')
        return

    readings = nvidia_smi_query_gpu(_CONFIG['bin'], _CONFIG['query_list'], _CONFIG['converters_dict'])
    vl = collectd.Values()
    for gpu_id, reading in readings.items():
        for query_name, value, type_ in zip(_CONFIG['new_names_list'], reading['values'], _CONFIG['type_list']):
            # Validation only: a reading that does not parse as a number
            # (for instance a placeholder printed for an unsupported field)
            # cannot be dispatched as a numeric value. Skip just that reading
            # so the remaining ones are still reported.
            # NOTE(review): assumes collectd rejects non-numeric strings here - confirm.
            try:
                float(value)
            except ValueError:
                info('Skipping "{}" for GPU {}: value "{}" is not numeric.'.format(query_name, gpu_id, value))
                continue
            vl.dispatch(
                plugin=_PLUGIN_NAME,
                plugin_instance=gpu_id,
                type=type_,
                type_instance=query_name,
                values=[value],
            )
# Register the configuration callback with collectd. The read callback is
# registered from inside `cb_config`, so that the interval can be set there.
collectd.register_config(cb_config)