forked from al42and/cuda-smi
-
Notifications
You must be signed in to change notification settings - Fork 0
/
nvml.h
3677 lines (3419 loc) · 178 KB
/
nvml.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/*
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO USER:
*
* This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software.
*
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
* IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
* OR PERFORMANCE OF THIS SOURCE CODE.
*
* U.S. Government End Users. This source code is a "commercial item" as
* that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
* "commercial computer software" and "commercial computer software
* documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
* and is provided to the U.S. Government only as a commercial end item.
* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
* source code with only those rights set forth herein.
*
* Any use of this source code in individual and commercial software must
* include, in the user documentation and internal comments to the code,
* the above Disclaimer and U.S. Government End Users Notice.
*/
/*
NVML API Reference
The NVIDIA Management Library (NVML) is a C-based programmatic interface for monitoring and
managing various states within NVIDIA Tesla &tm; GPUs. It is intended to be a platform for building
3rd party applications, and is also the underlying library for the NVIDIA-supported nvidia-smi
tool. NVML is thread-safe so it is safe to make simultaneous NVML calls from multiple threads.
API Documentation
Supported platforms:
- Windows: Windows Server 2008 R2 64bit, Windows Server 2012 R2 64bit, Windows 7 64bit, Windows 8 64bit
- Linux: 32-bit and 64-bit
- Hypervisors: Windows Server 2008R2/2012 Hyper-V 64bit, Citrix XenServer 6.2 SP1+, VMware ESX 5.1/5.5
Supported products:
- Full Support
- All Tesla products, starting with the Fermi architecture
- All Quadro products, starting with the Fermi architecture
- All GRID products, starting with the Kepler architecture
- Limited Support
- All Geforce products, starting with the Fermi architecture
The NVML library can be found at \%ProgramW6432\%\\"NVIDIA Corporation"\\NVSMI\\ on Windows. It is
not added to the system path by default. To dynamically link to NVML, add this path to the PATH
environment variable. To dynamically load NVML, call LoadLibrary with this path.
On Linux the NVML library will be found on the standard library path. For 64 bit Linux, both the 32 bit
and 64 bit NVML libraries will be installed.
Online documentation for this library is available at http://docs.nvidia.com/deploy/nvml-api/index.html
*/
#ifndef __nvml_nvml_h__
#define __nvml_nvml_h__
#ifdef __cplusplus
extern "C" {
#endif
/*
 * DECLDIR: linkage decoration placed on NVML declarations.
 *
 *   _WINDOWS defined, NVML_STATIC_IMPORT not defined:
 *       __declspec(dllexport) when NVML_LIB_EXPORT is defined,
 *       __declspec(dllimport) otherwise.
 *   _WINDOWS defined, NVML_STATIC_IMPORT defined (nvml_loader library):
 *       expands to nothing.
 *   _WINDOWS not defined:
 *       expands to nothing.
 */
#if defined(_WINDOWS) && !defined(NVML_STATIC_IMPORT)
#  if defined(NVML_LIB_EXPORT)
#    define DECLDIR __declspec(dllexport)
#  else
#    define DECLDIR __declspec(dllimport)
#  endif
#else
#  define DECLDIR
#endif
/**
 * NVML API versioning support.
 *
 * NVML_API_VERSION and NVML_API_VERSION_STR carry the API version as an
 * integer constant and as a string literal respectively. The remaining
 * macros redirect the unversioned entry point names to their _v2 symbols.
 */
#define NVML_API_VERSION                 6
#define NVML_API_VERSION_STR             "6"

#define nvmlInit                         nvmlInit_v2
#define nvmlDeviceGetPciInfo             nvmlDeviceGetPciInfo_v2
#define nvmlDeviceGetCount               nvmlDeviceGetCount_v2
#define nvmlDeviceGetHandleByIndex       nvmlDeviceGetHandleByIndex_v2
#define nvmlDeviceGetHandleByPciBusId    nvmlDeviceGetHandleByPciBusId_v2
/***************************************************************************************************/
/** @defgroup nvmlDeviceStructs Device Structs
* @{
*/
/***************************************************************************************************/
/**
 * Sentinel stored in an individual field when that field's value is not available,
 * i.e. when only part of a struct could not be filled in.
 *
 * The documentation of each structure says explicitly which fields must be checked
 * against this value.
 */
#define NVML_VALUE_NOT_AVAILABLE    (-1)
/** Device handle: a pointer to struct nvmlDevice_st, whose members are not defined in this part of the header. */
typedef struct nvmlDevice_st *nvmlDevice_t;
/**
 * Size, in chars, of a buffer that is guaranteed to hold a PCI bus id string.
 */
#define NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE    16
/**
 * PCI information describing a GPU device.
 */
typedef struct nvmlPciInfo_st
{
    char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; //!< PCI identifier as the tuple domain:bus:device.function (plus NULL terminator)
    unsigned int domain;                            //!< PCI domain on which the device's bus resides, 0 to 0xffff
    unsigned int bus;                               //!< Bus on which the device resides, 0 to 0xff
    unsigned int device;                            //!< Id of the device on the bus, 0 to 31
    unsigned int pciDeviceId;                       //!< Combined 16-bit device id and 16-bit vendor id

    // Added in NVML 2.285 API
    unsigned int pciSubSystemId;                    //!< 32-bit Sub System Device ID

    // NVIDIA reserved for internal use only
    unsigned int reserved0;
    unsigned int reserved1;
    unsigned int reserved2;
    unsigned int reserved3;
} nvmlPciInfo_t;
/**
 * Per-location breakdown of ECC error counts for a device.
 *
 * @deprecated Memory error counters can differ between GPU families.
 *             See \ref nvmlDeviceGetMemoryErrorCounter
 */
typedef struct nvmlEccErrorCounts_st
{
    unsigned long long l1Cache;      //!< Errors in the L1 cache
    unsigned long long l2Cache;      //!< Errors in the L2 cache
    unsigned long long deviceMemory; //!< Errors in device memory
    unsigned long long registerFile; //!< Errors in the register file
} nvmlEccErrorCounts_t;
/**
 * Utilization figures for a device.
 *
 * Depending on the product being queried, a sample period lasts between 1/6 second and 1 second.
 */
typedef struct nvmlUtilization_st
{
    unsigned int gpu;    //!< Percent of time over the past sample period during which one or more kernels was executing on the GPU
    unsigned int memory; //!< Percent of time over the past sample period during which global (device) memory was being read or written
} nvmlUtilization_t;
/**
 * Frame buffer (FB) memory allocation figures for a device.
 */
typedef struct nvmlMemory_st
{
    unsigned long long total; //!< Total installed FB memory (in bytes)
    unsigned long long free;  //!< FB memory that is not allocated (in bytes)
    unsigned long long used;  //!< FB memory that is allocated (in bytes). The driver/GPU always sets aside a small amount of memory for bookkeeping
} nvmlMemory_t;
/**
 * BAR1 memory allocation figures for a device.
 */
typedef struct nvmlBAR1Memory_st
{
    unsigned long long bar1Total; //!< Total BAR1 memory (in bytes)
    unsigned long long bar1Free;  //!< BAR1 memory that is not allocated (in bytes)
    unsigned long long bar1Used;  //!< Allocated (used) memory (in bytes)
} nvmlBAR1Memory_t;
/**
 * Describes one compute process running on the GPU.
 */
typedef struct nvmlProcessInfo_st
{
    unsigned int pid;                 //!< Process ID
    unsigned long long usedGpuMemory; //!< GPU memory used by the process, in bytes.
                                      //! Under WDDM this is always \ref NVML_VALUE_NOT_AVAILABLE,
                                      //! because the Windows KMD, not the NVIDIA driver, manages all the memory
} nvmlProcessInfo_t;
/**
 * Identifies the type of a bridge chip.
 */
typedef enum nvmlBridgeChipType_enum
{
    NVML_BRIDGE_CHIP_PLX  = 0,
    NVML_BRIDGE_CHIP_BRO4 = 1
} nvmlBridgeChipType_t;
/**
 * Upper bound on the number of physical bridges per board.
 */
#define NVML_MAX_PHYSICAL_BRIDGE    (128)
/**
 * Firmware information for a single bridge chip.
 */
typedef struct nvmlBridgeChipInfo_st
{
    nvmlBridgeChipType_t type; //!< Type of the bridge chip
    unsigned int fwVersion;    //!< Firmware version; 0 means the version is unavailable
} nvmlBridgeChipInfo_t;
/**
 * Complete hierarchy of the bridge chips within a board.
 *
 * Index 0 holds the immediate bridge, index 1 the parent of the immediate bridge,
 * and so forth.
 */
typedef struct nvmlBridgeChipHierarchy_st
{
    unsigned char bridgeCount;                                     //!< Number of bridge chips on the board
    nvmlBridgeChipInfo_t bridgeChipInfo[NVML_MAX_PHYSICAL_BRIDGE]; //!< Hierarchy of bridge chips on the board
} nvmlBridgeChipHierarchy_t;
/**
 * Kinds of sampling event.
 */
typedef enum nvmlSamplingType_enum
{
    NVML_TOTAL_POWER_SAMPLES        = 0, //!< Total power drawn by the GPU
    NVML_GPU_UTILIZATION_SAMPLES    = 1, //!< Percent of time during which one or more kernels was executing on the GPU
    NVML_MEMORY_UTILIZATION_SAMPLES = 2, //!< Percent of time during which global (device) memory was being read or written
    NVML_ENC_UTILIZATION_SAMPLES    = 3, //!< Percent of time during which NVENC remains busy
    NVML_DEC_UTILIZATION_SAMPLES    = 4, //!< Percent of time during which NVDEC remains busy
    NVML_PROCESSOR_CLK_SAMPLES      = 5, //!< Processor clock samples
    NVML_MEMORY_CLK_SAMPLES         = 6, //!< Memory clock samples

    // Keep this last
    NVML_SAMPLINGTYPE_COUNT
} nvmlSamplingType_t;
/**
 * Identifies the C type of a returned sample value.
 */
typedef enum nvmlValueType_enum
{
    NVML_VALUE_TYPE_DOUBLE             = 0,
    NVML_VALUE_TYPE_UNSIGNED_INT       = 1,
    NVML_VALUE_TYPE_UNSIGNED_LONG      = 2,
    NVML_VALUE_TYPE_UNSIGNED_LONG_LONG = 3,

    // Keep this last
    NVML_VALUE_TYPE_COUNT
} nvmlValueType_t;
/**
 * Union holding a value in one of several representations.
 */
typedef union nvmlValue_st
{
    double dVal;               //!< Used when the value is a double
    unsigned int uiVal;        //!< Used when the value is an unsigned int
    unsigned long ulVal;       //!< Used when the value is an unsigned long
    unsigned long long ullVal; //!< Used when the value is an unsigned long long
} nvmlValue_t;
/**
 * A single sample: a timestamp paired with the sampled value.
 */
typedef struct nvmlSample_st
{
    unsigned long long timeStamp; //!< CPU timestamp in microseconds
    nvmlValue_t sampleValue;      //!< The sampled value
} nvmlSample_t;
/**
 * Perf policy types for which violation times can be queried.
 */
typedef enum nvmlPerfPolicyType_enum
{
    NVML_PERF_POLICY_POWER   = 0,
    NVML_PERF_POLICY_THERMAL = 1,

    // Keep this last
    NVML_PERF_POLICY_COUNT
} nvmlPerfPolicyType_t;
/**
 * Holds perf policy violation status data.
 */
typedef struct nvmlViolationTime_st
{
    unsigned long long referenceTime; //!< CPU timestamp in microseconds
    unsigned long long violationTime; //!< Violation time in nanoseconds
} nvmlViolationTime_t;
/** @} */
/***************************************************************************************************/
/** @defgroup nvmlDeviceEnumvs Device Enums
* @{
*/
/***************************************************************************************************/
/**
 * General-purpose enabled/disabled state.
 */
typedef enum nvmlEnableState_enum
{
    NVML_FEATURE_DISABLED = 0, //!< The feature is disabled
    NVML_FEATURE_ENABLED  = 1  //!< The feature is enabled
} nvmlEnableState_t;
//! Generic flag selecting the default behavior of some functions. The description of each function gives the details.
#define nvmlFlagDefault    0x00
//! Generic flag forcing some behavior. The description of each function gives the details.
#define nvmlFlagForce      0x01
/**
 * The brand of the GPU.
 */
typedef enum nvmlBrandType_enum
{
    NVML_BRAND_UNKNOWN = 0, //!< Either Geforce or something else; NVML does not detect Geforce
    NVML_BRAND_QUADRO  = 1,
    NVML_BRAND_TESLA   = 2,
    NVML_BRAND_NVS     = 3,
    NVML_BRAND_GRID    = 4,

    // Keep this last
    NVML_BRAND_COUNT
} nvmlBrandType_t;
/**
 * Temperature thresholds.
 */
typedef enum nvmlTemperatureThresholds_enum
{
    NVML_TEMPERATURE_THRESHOLD_SHUTDOWN = 0, //!< Temperature at which the GPU will shut down for HW protection
    NVML_TEMPERATURE_THRESHOLD_SLOWDOWN = 1, //!< Temperature at which the GPU will begin slowdown

    // Keep this last
    NVML_TEMPERATURE_THRESHOLD_COUNT
} nvmlTemperatureThresholds_t;
/**
 * Temperature sensors.
 */
typedef enum nvmlTemperatureSensors_enum
{
    NVML_TEMPERATURE_GPU = 0, //!< Sensor measuring the temperature of the GPU die

    // Keep this last
    NVML_TEMPERATURE_COUNT
} nvmlTemperatureSensors_t;
/**
 * Compute mode.
 *
 * NVML_COMPUTEMODE_EXCLUSIVE_PROCESS was introduced with CUDA 4.0.
 * CUDA versions before that supported a single exclusive mode, which corresponds to
 * NVML_COMPUTEMODE_EXCLUSIVE_THREAD in CUDA 4.0 and beyond.
 */
typedef enum nvmlComputeMode_enum
{
    NVML_COMPUTEMODE_DEFAULT           = 0, //!< Default compute mode -- multiple contexts per device
    NVML_COMPUTEMODE_EXCLUSIVE_THREAD  = 1, //!< Compute-exclusive-thread mode -- only one context per device, usable from one thread at a time
    NVML_COMPUTEMODE_PROHIBITED        = 2, //!< Compute-prohibited mode -- no contexts per device
    NVML_COMPUTEMODE_EXCLUSIVE_PROCESS = 3, //!< Compute-exclusive-process mode -- only one context per device, usable from multiple threads at a time

    // Keep this last
    NVML_COMPUTEMODE_COUNT
} nvmlComputeMode_t;
/**
 * ECC bit types.
 *
 * @deprecated Alias of \ref nvmlMemoryErrorType_t, which is the more flexible type
 */
#define nvmlEccBitType_t       nvmlMemoryErrorType_t

/**
 * Single bit ECC errors.
 *
 * @deprecated Alias of \ref NVML_MEMORY_ERROR_TYPE_CORRECTED
 */
#define NVML_SINGLE_BIT_ECC    NVML_MEMORY_ERROR_TYPE_CORRECTED

/**
 * Double bit ECC errors.
 *
 * @deprecated Alias of \ref NVML_MEMORY_ERROR_TYPE_UNCORRECTED
 */
#define NVML_DOUBLE_BIT_ECC    NVML_MEMORY_ERROR_TYPE_UNCORRECTED
/**
 * Memory error types.
 */
typedef enum nvmlMemoryErrorType_enum
{
    /**
     * A memory error that was corrected.
     *
     * For ECC errors these are single bit errors.
     * For texture memory these are errors fixed by a resend.
     */
    NVML_MEMORY_ERROR_TYPE_CORRECTED = 0,

    /**
     * A memory error that was not corrected.
     *
     * For ECC errors these are double bit errors.
     * For texture memory these are errors where the resend fails.
     */
    NVML_MEMORY_ERROR_TYPE_UNCORRECTED = 1,

    // Keep this last
    NVML_MEMORY_ERROR_TYPE_COUNT //!< Number of memory error types
} nvmlMemoryErrorType_t;
/**
 * ECC counter types.
 *
 * Note: Volatile counts are reset every time the driver loads. On Windows that happens once per boot.
 *       On Linux it can happen more often, because the driver unloads when no active clients exist.
 *       With persistence mode enabled, or with a driver client that is always active (e.g. X11),
 *       Linux shows per-boot behavior as well. Otherwise volatile counts are reset each time a
 *       compute app is run.
 */
typedef enum nvmlEccCounterType_enum
{
    NVML_VOLATILE_ECC  = 0, //!< Volatile counts are reset each time the driver loads.
    NVML_AGGREGATE_ECC = 1, //!< Aggregate counts persist across reboots (i.e. for the lifetime of the device)

    // Keep this last
    NVML_ECC_COUNTER_TYPE_COUNT //!< Number of memory counter types
} nvmlEccCounterType_t;
/**
 * Clock types.
 *
 * All speeds are in MHz.
 */
typedef enum nvmlClockType_enum
{
    NVML_CLOCK_GRAPHICS = 0, //!< Graphics clock domain
    NVML_CLOCK_SM       = 1, //!< SM clock domain
    NVML_CLOCK_MEM      = 2, //!< Memory clock domain

    // Keep this last
    NVML_CLOCK_COUNT //!< Number of clock types
} nvmlClockType_t;
/**
 * Driver models.
 *
 * Applies to Windows only.
 */
typedef enum nvmlDriverModel_enum
{
    NVML_DRIVER_WDDM = 0, //!< WDDM driver model -- GPU treated as a display device
    NVML_DRIVER_WDM  = 1  //!< WDM (TCC) model (recommended) -- GPU treated as a generic device
} nvmlDriverModel_t;
/**
 * Allowed performance states (PStates).
 *
 * State 0 is maximum performance and state 15 is minimum performance.
 */
typedef enum nvmlPStates_enum
{
    NVML_PSTATE_0       = 0,  //!< Performance state 0 -- Maximum Performance
    NVML_PSTATE_1       = 1,  //!< Performance state 1
    NVML_PSTATE_2       = 2,  //!< Performance state 2
    NVML_PSTATE_3       = 3,  //!< Performance state 3
    NVML_PSTATE_4       = 4,  //!< Performance state 4
    NVML_PSTATE_5       = 5,  //!< Performance state 5
    NVML_PSTATE_6       = 6,  //!< Performance state 6
    NVML_PSTATE_7       = 7,  //!< Performance state 7
    NVML_PSTATE_8       = 8,  //!< Performance state 8
    NVML_PSTATE_9       = 9,  //!< Performance state 9
    NVML_PSTATE_10      = 10, //!< Performance state 10
    NVML_PSTATE_11      = 11, //!< Performance state 11
    NVML_PSTATE_12      = 12, //!< Performance state 12
    NVML_PSTATE_13      = 13, //!< Performance state 13
    NVML_PSTATE_14      = 14, //!< Performance state 14
    NVML_PSTATE_15      = 15, //!< Performance state 15 -- Minimum Performance
    NVML_PSTATE_UNKNOWN = 32  //!< Unknown performance state
} nvmlPstates_t;
/**
 * GPU Operation Mode (GOM).
 *
 * By disabling GPU features, GOM makes it possible to reduce power usage and optimize GPU throughput.
 *
 * Each GOM is designed to meet specific user needs.
 */
typedef enum nvmlGom_enum
{
    NVML_GOM_ALL_ON  = 0, //!< Everything is enabled and running at full speed

    NVML_GOM_COMPUTE = 1, //!< Designed for running only compute tasks. Graphics operations
                          //!< are not allowed

    NVML_GOM_LOW_DP  = 2  //!< Designed for running graphics applications that don't require
                          //!< high bandwidth double precision
} nvmlGpuOperationMode_t;
/**
 * Available infoROM objects.
 */
typedef enum nvmlInforomObject_enum
{
    NVML_INFOROM_OEM   = 0, //!< An object defined by OEM
    NVML_INFOROM_ECC   = 1, //!< The ECC object determining the level of ECC support
    NVML_INFOROM_POWER = 2, //!< The power management object

    // Keep this last
    NVML_INFOROM_COUNT //!< Number of infoROM objects the driver knows about
} nvmlInforomObject_t;
/**
 * Status codes returned by NVML API calls.
 */
typedef enum nvmlReturn_enum
{
    NVML_SUCCESS                   = 0,   //!< The operation was successful
    NVML_ERROR_UNINITIALIZED       = 1,   //!< NVML was not first initialized with nvmlInit()
    NVML_ERROR_INVALID_ARGUMENT    = 2,   //!< A supplied argument is invalid
    NVML_ERROR_NOT_SUPPORTED       = 3,   //!< The requested operation is not available on target device
    NVML_ERROR_NO_PERMISSION       = 4,   //!< The current user does not have permission for operation
    NVML_ERROR_ALREADY_INITIALIZED = 5,   //!< Deprecated: Multiple initializations are now allowed through ref counting
    NVML_ERROR_NOT_FOUND           = 6,   //!< A query to find an object was unsuccessful
    NVML_ERROR_INSUFFICIENT_SIZE   = 7,   //!< An input argument is not large enough
    NVML_ERROR_INSUFFICIENT_POWER  = 8,   //!< A device's external power cables are not properly attached
    NVML_ERROR_DRIVER_NOT_LOADED   = 9,   //!< NVIDIA driver is not loaded
    NVML_ERROR_TIMEOUT             = 10,  //!< User provided timeout passed
    NVML_ERROR_IRQ_ISSUE           = 11,  //!< NVIDIA Kernel detected an interrupt issue with a GPU
    NVML_ERROR_LIBRARY_NOT_FOUND   = 12,  //!< NVML Shared Library couldn't be found or loaded
    NVML_ERROR_FUNCTION_NOT_FOUND  = 13,  //!< Local version of NVML doesn't implement this function
    NVML_ERROR_CORRUPTED_INFOROM   = 14,  //!< infoROM is corrupted
    NVML_ERROR_GPU_IS_LOST         = 15,  //!< The GPU has fallen off the bus or has otherwise become inaccessible
    NVML_ERROR_RESET_REQUIRED      = 16,  //!< The GPU requires a reset before it can be used again
    NVML_ERROR_UNKNOWN             = 999  //!< An internal driver error occurred
} nvmlReturn_t;
/**
 * Memory locations.
 *
 * See \ref nvmlDeviceGetMemoryErrorCounter
 */
typedef enum nvmlMemoryLocation_enum
{
    NVML_MEMORY_LOCATION_L1_CACHE       = 0, //!< GPU L1 Cache
    NVML_MEMORY_LOCATION_L2_CACHE       = 1, //!< GPU L2 Cache
    NVML_MEMORY_LOCATION_DEVICE_MEMORY  = 2, //!< GPU Device Memory
    NVML_MEMORY_LOCATION_REGISTER_FILE  = 3, //!< GPU Register File
    NVML_MEMORY_LOCATION_TEXTURE_MEMORY = 4, //!< GPU Texture Memory

    // Keep this last
    NVML_MEMORY_LOCATION_COUNT //!< Number of memory locations the driver knows about
} nvmlMemoryLocation_t;
/**
 * Reasons for which a page can be retired.
 */
typedef enum nvmlPageRetirementCause_enum
{
    NVML_PAGE_RETIREMENT_CAUSE_MULTIPLE_SINGLE_BIT_ECC_ERRORS = 0, //!< Page was retired due to multiple single bit ECC errors
    NVML_PAGE_RETIREMENT_CAUSE_DOUBLE_BIT_ECC_ERROR           = 1, //!< Page was retired due to a double bit ECC error

    // Keep this last
    NVML_PAGE_RETIREMENT_CAUSE_COUNT
} nvmlPageRetirementCause_t;
/**
 * API types whose default permission restrictions can be changed.
 */
typedef enum nvmlRestrictedAPI_enum
{
    NVML_RESTRICTED_API_SET_APPLICATION_CLOCKS  = 0, //!< APIs that change application clocks, see nvmlDeviceSetApplicationsClocks
                                                     //!< and see nvmlDeviceResetApplicationsClocks

    NVML_RESTRICTED_API_SET_AUTO_BOOSTED_CLOCKS = 1, //!< APIs that enable/disable auto boosted clocks
                                                     //!< see nvmlDeviceSetAutoBoostedClocksEnabled

    // Keep this last
    NVML_RESTRICTED_API_COUNT
} nvmlRestrictedAPI_t;
/** @} */
/***************************************************************************************************/
/** @defgroup nvmlUnitStructs Unit Structs
* @{
*/
/***************************************************************************************************/
/** Unit handle: a pointer to struct nvmlUnit_st, whose members are not defined in this part of the header. */
typedef struct nvmlUnit_st *nvmlUnit_t;
/**
 * Describes one HWBC entry.
 */
typedef struct nvmlHwbcEntry_st
{
    unsigned int hwbcId;      //!< Id of the HWBC entry
    char firmwareVersion[32]; //!< Firmware version, stored in a fixed 32-char buffer
} nvmlHwbcEntry_t;
/**
 * State of a fan.
 */
typedef enum nvmlFanState_enum
{
    NVML_FAN_NORMAL = 0, //!< The fan is working properly
    NVML_FAN_FAILED = 1  //!< The fan has failed
} nvmlFanState_t;
/**
 * Color of an LED.
 */
typedef enum nvmlLedColor_enum
{
    NVML_LED_COLOR_GREEN = 0, //!< GREEN, indicates good health
    NVML_LED_COLOR_AMBER = 1  //!< AMBER, indicates problem
} nvmlLedColor_t;
/**
 * LED state of an S-class unit.
 */
typedef struct nvmlLedState_st
{
    char cause[256];      //!< When the color is amber, a text description of the cause
    nvmlLedColor_t color; //!< GREEN or AMBER
} nvmlLedState_t;
/**
 * Static information about an S-class unit.
 */
typedef struct nvmlUnitInfo_st
{
    char name[96];            //!< Product name
    char id[96];              //!< Product identifier
    char serial[96];          //!< Product serial number
    char firmwareVersion[96]; //!< Firmware version
} nvmlUnitInfo_t;
/**
 * Power usage information for an S-class unit.
 *
 * The power supply state is a human readable string. It either equals "Normal" or
 * contains "Abnormal" combined with one or more of the following:
 *
 *    - High voltage
 *    - Fan failure
 *    - Heatsink temperature
 *    - Current limit
 *    - Voltage below UV alarm threshold
 *    - Low-voltage
 *    - SI2C remote off command
 *    - MOD_DISABLE input
 *    - Short pin transition
 */
typedef struct nvmlPSUInfo_st
{
    char state[256];      //!< The power supply state
    unsigned int current; //!< PSU current (A)
    unsigned int voltage; //!< PSU voltage (V)
    unsigned int power;   //!< PSU power draw (W)
} nvmlPSUInfo_t;
/**
 * Speed reading of one fan in an S-class unit.
 */
typedef struct nvmlUnitFanInfo_st
{
    unsigned int speed;   //!< Fan speed (RPM)
    nvmlFanState_t state; //!< Indicates whether the fan is working properly
} nvmlUnitFanInfo_t;
/**
 * Fan speed readings covering a whole S-class unit.
 */
typedef struct nvmlUnitFanSpeeds_st
{
    nvmlUnitFanInfo_t fans[24]; //!< Fan speed data for each fan
    unsigned int count;         //!< Number of fans in the unit
} nvmlUnitFanSpeeds_t;
/** @} */
/***************************************************************************************************/
/** @addtogroup nvmlEvents
* @{
*/
/***************************************************************************************************/
/**
 * Handle to an event set: a pointer to struct nvmlEventSet_st, whose members are
 * not defined in this part of the header.
 */
typedef struct nvmlEventSet_st *nvmlEventSet_t;
/** @defgroup nvmlEventType Event Types
* @{
* Event Types which user can be notified about.
* See description of particular functions for details.
*
* See \ref nvmlDeviceRegisterEvents and \ref nvmlDeviceGetSupportedEventTypes to check which devices
* support each event.
*
* Types can be combined with bitwise or operator '|' when passed to \ref nvmlDeviceRegisterEvents
*/
//! Event about single bit ECC errors
/**
 * \note Corrected texture memory errors are not ECC errors, so they do not generate a single bit event
 */
#define nvmlEventTypeSingleBitEccError    0x0000000000000001LL

//! Event about double bit ECC errors
/**
 * \note Uncorrected texture memory errors are not ECC errors, so they do not generate a double bit event
 */
#define nvmlEventTypeDoubleBitEccError    0x0000000000000002LL

//! Event about PState changes
/**
 * \note On the Fermi architecture a PState change also indicates that the GPU is throttling down
 *       because no work is being executed on it, or because of power capping or thermal capping.
 *       In a typical situation a Fermi-based GPU should stay in P0 for the whole execution of
 *       the compute process.
 */
#define nvmlEventTypePState               0x0000000000000004LL

//! Event that Xid critical error occurred
#define nvmlEventTypeXidCriticalError     0x0000000000000008LL

//! Event about clock changes
/**
 * Kepler only
 */
#define nvmlEventTypeClock                0x0000000000000010LL

//! Mask with no events
#define nvmlEventTypeNone                 0x0000000000000000LL

//! Mask of all events (bitwise OR of every event type above)
#define nvmlEventTypeAll (nvmlEventTypeNone                 \
                          | nvmlEventTypeSingleBitEccError  \
                          | nvmlEventTypeDoubleBitEccError  \
                          | nvmlEventTypePState             \
                          | nvmlEventTypeXidCriticalError   \
                          | nvmlEventTypeClock              \
                         )
/** @} */
/**
 * Describes an event that occurred.
 */
typedef struct nvmlEventData_st
{
    nvmlDevice_t device;          //!< Device on which the event occurred
    unsigned long long eventType; //!< Which specific event occurred
    unsigned long long eventData; //!< For nvmlEventTypeXidCriticalError, the last XID error for the device.
                                  //! It is 0 for any other event and 999 for an unknown xid error.
} nvmlEventData_t;
/** @} */
/***************************************************************************************************/
/** @addtogroup nvmlClocksThrottleReasons
* @{
*/
/***************************************************************************************************/
/** Nothing is running on the GPU and the clocks are dropping to Idle state
 * \note This limiter may be removed in a later release
 */
#define nvmlClocksThrottleReasonGpuIdle                      0x0000000000000001LL

/** GPU clocks are limited by the current applications clocks setting
 *
 * @see nvmlDeviceSetApplicationsClocks
 * @see nvmlDeviceGetApplicationsClock
 */
#define nvmlClocksThrottleReasonApplicationsClocksSetting    0x0000000000000002LL

/**
 * @deprecated Renamed to \ref nvmlClocksThrottleReasonApplicationsClocksSetting
 *             because that name describes the situation more accurately.
 */
#define nvmlClocksThrottleReasonUserDefinedClocks            nvmlClocksThrottleReasonApplicationsClocksSetting

/** SW Power Scaling algorithm is reducing the clocks below requested clocks
 *
 * @see nvmlDeviceGetPowerUsage
 * @see nvmlDeviceSetPowerManagementLimit
 * @see nvmlDeviceGetPowerManagementLimit
 */
#define nvmlClocksThrottleReasonSwPowerCap                   0x0000000000000004LL

/** HW Slowdown (reducing the core clocks by a factor of 2 or more) is engaged
 *
 * This is an indicator of:
 *   - temperature being too high
 *   - External Power Brake Assertion is triggered (e.g. by the system power supply)
 *   - Power draw is too high and Fast Trigger protection is reducing the clocks
 *   - May be also reported during PState or clock change
 *      - This behavior may be removed in a later release.
 *
 * @see nvmlDeviceGetTemperature
 * @see nvmlDeviceGetTemperatureThreshold
 * @see nvmlDeviceGetPowerUsage
 */
#define nvmlClocksThrottleReasonHwSlowdown                   0x0000000000000008LL

/** Some other unspecified factor is reducing the clocks
 *
 * The value does not fit in a signed long long, so under the C rules for hexadecimal
 * constants this literal has type unsigned long long despite its LL suffix.
 */
#define nvmlClocksThrottleReasonUnknown                      0x8000000000000000LL

/** Bit mask representing no clocks throttling
 *
 * Clocks are as high as possible.
 */
#define nvmlClocksThrottleReasonNone                         0x0000000000000000LL

/** Bit mask representing all supported clocks throttling reasons
 * New reasons might be added to this list in the future
 */
#define nvmlClocksThrottleReasonAll (nvmlClocksThrottleReasonNone                        \
                                     | nvmlClocksThrottleReasonGpuIdle                   \
                                     | nvmlClocksThrottleReasonApplicationsClocksSetting \
                                     | nvmlClocksThrottleReasonSwPowerCap                \
                                     | nvmlClocksThrottleReasonHwSlowdown                \
                                     | nvmlClocksThrottleReasonUnknown                   \
                                    )
/** @} */
/***************************************************************************************************/
/** @defgroup nvmlAccountingStats Accounting Statistics
* @{
*
* Set of APIs designed to provide per process information about usage of GPU.
*
* @note All accounting statistics and accounting mode live in nvidia driver and reset
* to default (Disabled) when driver unloads.
* It is advised to run with persistence mode enabled.
*
* @note Enabling accounting mode has no negative impact on the GPU performance.
*/
/***************************************************************************************************/
/**
 * Accounting statistics collected for one process.
 */
typedef struct nvmlAccountingStats_st
{
    unsigned int gpuUtilization;       //!< Percent of time over the process's lifetime during which one or more kernels was executing on the GPU.
                                       //! Same kind of utilization figure as returned by \ref nvmlDeviceGetUtilizationRates, but covering the
                                       //! whole life time of the process (not just the last sample period).
                                       //! Set to NVML_VALUE_NOT_AVAILABLE if nvmlDeviceGetUtilizationRates is not supported

    unsigned int memoryUtilization;    //!< Percent of time over the process's lifetime during which global (device) memory was being read or written.
                                       //! Set to NVML_VALUE_NOT_AVAILABLE if nvmlDeviceGetUtilizationRates is not supported

    unsigned long long maxMemoryUsage; //!< Maximum total memory in bytes that was ever allocated by the process.
                                       //! Set to NVML_VALUE_NOT_AVAILABLE if nvmlProcessInfo_t->usedGpuMemory is not supported

    unsigned long long time;           //!< Amount of time in ms during which the compute context was active

    unsigned int reserved[8];          // Reserved; no meaning is documented in this header
} nvmlAccountingStats_t;
/** @} */
/***************************************************************************************************/
/** @defgroup nvmlInitializationAndCleanup Initialization and Cleanup
* This chapter describes the methods that handle NVML initialization and cleanup.
* It is the user's responsibility to call \ref nvmlInit() before calling any other methods, and
* nvmlShutdown() once NVML is no longer being used.
* @{
*/
/***************************************************************************************************/
/**
 * Initialize NVML, but don't initialize any GPUs yet.
 *
 * \note In NVML 5.319 new nvmlInit_v2 has replaced nvmlInit"_v1" (default in NVML 4.304 and older) that
 *       did initialize all GPU devices in the system.
 *
 * Deferring GPU initialization allows NVML to communicate with a GPU
 * when other GPUs in the system are unstable or in a bad state. When using this API, GPUs are
 * discovered and initialized in nvmlDeviceGetHandleBy* functions instead.
 *
 * \note To contrast nvmlInit_v2 with nvmlInit"_v1", NVML 4.304 nvmlInit"_v1" will fail when any detected GPU is in
 *       a bad or unstable state.
 *
 * For all products.
 *
 * This method should be called once before invoking any other methods in the library.
 * A reference count of the number of initializations is maintained. Shutdown only occurs
 * when the reference count reaches zero.
 *
 * @return
 *         - \ref NVML_SUCCESS                   if NVML has been properly initialized
 *         - \ref NVML_ERROR_DRIVER_NOT_LOADED   if NVIDIA driver is not running
 *         - \ref NVML_ERROR_NO_PERMISSION       if NVML does not have permission to talk to the driver
 *         - \ref NVML_ERROR_UNKNOWN             on any unexpected error
 */
nvmlReturn_t DECLDIR nvmlInit(void);
/**
 * Shut down NVML by releasing all GPU resources previously allocated with \ref nvmlInit().
 *
 * For all products.
 *
 * This method should be called after NVML work is done, once for each call to \ref nvmlInit().
 * A reference count of the number of initializations is maintained. Shutdown only occurs
 * when the reference count reaches zero. For backwards compatibility, no error is reported if
 * nvmlShutdown() is called more times than nvmlInit().
 *
 * @return
 *         - \ref NVML_SUCCESS                 if NVML has been properly shut down
 *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
 *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
 */
nvmlReturn_t DECLDIR nvmlShutdown(void);
/** @} */
/***************************************************************************************************/
/** @defgroup nvmlErrorReporting Error reporting
* This chapter describes helper functions for error reporting routines.
* @{
*/
/***************************************************************************************************/
/**
 * Helper method for converting NVML error codes into readable strings.
 *
 * For all products.
 *
 * @param result                               NVML error code to convert
 *
 * @return String representation of the error.
 */
const DECLDIR char* nvmlErrorString(nvmlReturn_t result);
/** @} */
/***************************************************************************************************/
/** @defgroup nvmlConstants Constants
* @{
*/
/***************************************************************************************************/
/** Buffer size guaranteed to be large enough for \ref nvmlDeviceGetInforomVersion and \ref nvmlDeviceGetInforomImageVersion */
#define NVML_DEVICE_INFOROM_VERSION_BUFFER_SIZE   16

/** Buffer size guaranteed to be large enough for \ref nvmlDeviceGetUUID */
#define NVML_DEVICE_UUID_BUFFER_SIZE              80

/** Buffer size guaranteed to be large enough for \ref nvmlSystemGetDriverVersion */
#define NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE    80

/** Buffer size guaranteed to be large enough for \ref nvmlSystemGetNVMLVersion */
#define NVML_SYSTEM_NVML_VERSION_BUFFER_SIZE      80

/** Buffer size guaranteed to be large enough for \ref nvmlDeviceGetName */
#define NVML_DEVICE_NAME_BUFFER_SIZE              64

/** Buffer size guaranteed to be large enough for \ref nvmlDeviceGetSerial */
#define NVML_DEVICE_SERIAL_BUFFER_SIZE            30

/** Buffer size guaranteed to be large enough for \ref nvmlDeviceGetVbiosVersion */
#define NVML_DEVICE_VBIOS_VERSION_BUFFER_SIZE     32
/** @} */
/***************************************************************************************************/
/** @defgroup nvmlSystemQueries System Queries