Skip to content

Commit

Permalink
Removing collection of FP64 metrics on GPUs that do not support it (
Browse files Browse the repository at this point in the history
…#19)

* Removing collection of FP64 metrics on GPUs that do not support it

For issue #18

* Adding support for NVIDIA RTX A6000

Co-authored-by: Bruno Travouillon <[email protected]>

---------

Co-authored-by: Bruno Travouillon <[email protected]>
  • Loading branch information
guilbaults and btravouillon authored Nov 23, 2023
1 parent 104cfb7 commit 1814b0c
Showing 1 changed file with 10 additions and 0 deletions.
10 changes: 10 additions & 0 deletions slurm-job-exporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,16 @@ def __init__(self, dcgm_update_interval=10):
dcgm_fields.DCGM_FI_PROF_NVLINK_RX_BYTES: 'nvlink_rx_bytes',
}

for gpu_id in pydcgm.DcgmSystemDiscovery(self.handle).GetAllSupportedGpuIds():
device = pydcgm.dcgm_agent.dcgmGetDeviceAttributes(self.handle.handle, gpu_id)
name = device.identifiers.deviceName
print('Detected gpu {} with ID {}'.format(name, gpu_id))
if name in ['NVIDIA RTX A6000']:
# This GPU does not support fp64; we don't support a mix of fp64 and non-fp64 GPUs in the same node
print('Removing fp64 metrics since {} does not support fp64'.format(name))
del self.fieldIds_dict[dcgm_fields.DCGM_FI_PROF_PIPE_FP64_ACTIVE]
break

self.field_group = pydcgm.DcgmFieldGroup(self.handle, name="slurm-job-exporter-fg", fieldIds=list(self.fieldIds_dict.keys()))
self.group.samples.WatchFields(self.field_group, dcgm_update_interval * 1000 * 1000, dcgm_update_interval * 2.0, 0)
self.handle.GetSystem().UpdateAllFields(True)
Expand Down

0 comments on commit 1814b0c

Please sign in to comment.