From 2f445c0e1844f4ed4655c4a061f0a2ed64aeb7d1 Mon Sep 17 00:00:00 2001 From: nobodysu Date: Sun, 30 Jun 2019 17:31:51 +0300 Subject: [PATCH 1/3] DISK hysteresis --- Template_mini-IPMI_v2.xml | 355 +++++++++++++++++++++++++++----------- mini_ipmi_smartctl.py | 89 +++++----- 2 files changed, 305 insertions(+), 139 deletions(-) diff --git a/Template_mini-IPMI_v2.xml b/Template_mini-IPMI_v2.xml index 6ed3d5b..8143813 100644 --- a/Template_mini-IPMI_v2.xml +++ b/Template_mini-IPMI_v2.xml @@ -1,7 +1,7 @@ 2.0 - 2019-03-11T16:54:27Z + 2019-06-30T12:01:16Z Templates @@ -27,9 +27,15 @@ mini-IPMI: Temperature + + mini-IPMI: Temperature thresholds + mini-IPMI: Voltage + + mini-IPMI: Voltage thresholds + @@ -630,7 +636,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. 0 - mini-IPMI: Info + mini-IPMI: Voltage thresholds @@ -673,7 +679,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. 0 - mini-IPMI: Info + mini-IPMI: Voltage thresholds @@ -1442,7 +1448,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. - 7 + 1 @@ -2441,16 +2447,16 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. cpu{#CPU}: CPU model or ID was changed 0 - 1 + 2 0 {Template mini-IPMI v2:mini.cpu.temp[cpu{#CPU},MAX].last(500)} > {Template mini-IPMI v2:mini.cpu.info[cpu{#CPU},TjMax].last()} - cpu{#CPU}: is throttling right now + cpu{#CPU}: is throttling right now: {ITEM.LASTVALUE} 0 - 4 + 5 (depending on values from last 500 seconds) 0 @@ -2460,25 +2466,25 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. cpu{#CPU}: no temperature info was found on CPU 0 - 1 + 2 Prevent multiple reporting. 0 {Template mini-IPMI v2:mini.cpu.temp[cpu{#CPU},MAX].last(500)}>60 - cpu{#CPU}: temperature is above 60C + cpu{#CPU}: temperature is too high: {ITEM.LASTVALUE} 0 - 3 + 4 (depending on values from last 500 seconds) 0 {Template mini-IPMI v2:mini.cpu.temp[cpu{#CPU},MAX].max(24h)}>60 - cpu{#CPU}: temperature was above 60C within past 24 hours + cpu{#CPU}: temperature was too high within past 24 hours 0 - 2 + 3 0 @@ -2487,7 +2493,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. cpu{#CPU}: was throttled within past 24 hours 0 - 2 + 3 0 @@ -2496,7 +2502,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. gpu{#GPUFAN}: fan speed is too low 0 - 2 + 3 Known false positives (zero speed). 0 @@ -2505,16 +2511,16 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. gpu{#GPUMEM}: free video memory is less than 32Mb 0 - 2 + 3 0 {Template mini-IPMI v2:mini.gpu.temp[gpu{#GPUTEMP}].last(500)}>70 - gpu{#GPUTEMP}: temperature is above 70C + gpu{#GPUTEMP}: temperature is too high: {ITEM.LASTVALUE} 0 - 3 + 4 (depending on values from last 500 seconds) 0 @@ -2523,7 +2529,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. gpu{#GPUTEMP}: temperature was above 70C within past 24 hours 0 - 2 + 3 0 @@ -2532,7 +2538,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. gpu{#GPU}: GPU model or ID was changed 0 - 1 + 2 0 @@ -2541,7 +2547,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. gpu{#GPU}: no fan info was found on GPU 1 - 1 + 2 0 @@ -2550,7 +2556,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. gpu{#GPU}: no temperature info was found on GPU 0 - 1 + 2 0 @@ -2559,16 +2565,16 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. {#BRDFANNAME}: motherboard fan speed is too low 0 - 2 + 3 Known false positives (zero speed). 0 {Template mini-IPMI v2:mini.brd.temp[{#BRDTEMPNUM}].last(500)}>60 - {#BRDTEMPNAME}: motherboard temperature is above 60C + {#BRDTEMPNAME}: motherboard temperature is too high: {ITEM.LASTVALUE} 0 - 3 + 4 (depending on values from last 500 seconds) 0 @@ -2577,7 +2583,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. {#BRDTEMPNAME}: motherboard temperature was above 60C within past 24 hours 0 - 2 + 3 0 @@ -2586,7 +2592,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. {#P5V}: voltage is too high 0 - 3 + 4 0 @@ -2595,7 +2601,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. {#P5V}: voltage is too low 0 - 3 + 4 0 @@ -2604,7 +2610,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. {#P5V}: voltage was too high within past 24 hours 0 - 2 + 3 0 @@ -2613,7 +2619,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. {#P5V}: voltage was too low within past 24 hours 0 - 2 + 3 0 @@ -2622,7 +2628,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. {#P12V}: voltage is too high 0 - 3 + 4 0 @@ -2631,7 +2637,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. {#P12V}: voltage is too low 0 - 3 + 4 0 @@ -2640,7 +2646,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. {#P12V}: voltage was too high within past 24 hours 0 - 2 + 3 0 @@ -2649,7 +2655,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. {#P12V}: voltage was too low within past 24 hours 0 - 2 + 3 0 @@ -2658,7 +2664,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. {#P33V}: voltage is too high 0 - 3 + 4 0 @@ -2667,7 +2673,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. {#P33V}: voltage is too low 0 - 3 + 4 0 @@ -2676,7 +2682,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. {#P33V}: voltage was too high within past 24 hours 0 - 2 + 3 0 @@ -2685,7 +2691,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. {#P33V}: voltage was too low within past 24 hours 0 - 2 + 3 0 @@ -2694,7 +2700,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. {#VBAT}: CMOS battery voltage is too low 0 - 2 + 3 0 @@ -2703,7 +2709,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. {#VCC3V}: voltage is too high 0 - 3 + 4 0 @@ -2712,7 +2718,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. {#VCC3V}: voltage is too low 0 - 3 + 4 0 @@ -2721,7 +2727,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. {#VCC3V}: voltage was too high within past 24 hours 0 - 2 + 3 0 @@ -2730,7 +2736,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. {#VCC3V}: voltage was too low within past 24 hours 0 - 2 + 3 0 @@ -2739,7 +2745,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. {#VCORE}: CPU voltage is too high 0 - 3 + 4 Each processor have individual threshold. OHMR usage must be tweaked (mini_ipmi_ohmr.py). Known incorrect measurements. 0 @@ -2748,7 +2754,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. {#VCORE}: CPU voltage was too high within past 24 hours 0 - 2 + 3 Each processor have individual threshold. OHMR usage must be tweaked (mini_ipmi_ohmr.py). Known incorrect measurements. 0 @@ -2757,7 +2763,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. {#VSB3V}: voltage is too high 0 - 3 + 4 0 @@ -2766,7 +2772,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. {#VSB3V}: voltage is too low 0 - 3 + 4 0 @@ -2775,7 +2781,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. {#VSB3V}: voltage was too high within past 24 hours 0 - 2 + 3 0 @@ -2784,7 +2790,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. {#VSB3V}: voltage was too low within past 24 hours 0 - 2 + 3 0 @@ -2793,7 +2799,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. {#VTT}: voltage is too high 0 - 3 + 4 Each processor have individual threshold. Tweak in mini_ipmi_ohmr.py before use. 0 @@ -2802,7 +2808,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. {#VTT}: voltage was too high within past 24h 0 - 2 + 3 Each processor have individual threshold. Tweak in mini_ipmi_ohmr.py before use. 0 @@ -2840,9 +2846,52 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. - 7 + 1 + + {#DISK}: Critical disk temperature threshold + 2 + + 0 + + mini.disk.tempCrit[{#DISK}] + 0 + 14 + 30 + 0 + 3 + + C + 0 + + + 0 + 0 + + 0 + + 1 + + + + 0 + 0 + + + + + + + 0 + + + mini-IPMI: Temperature thresholds + + + + + {#DISK}: Disk temperature 2 @@ -2886,6 +2935,92 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. + + {#DISK}: Maximum disk temperature threshold + 2 + + 0 + + mini.disk.tempMax[{#DISK}] + 0 + 14 + 30 + 0 + 3 + + C + 0 + + + 0 + 0 + + 0 + + 1 + + + + 0 + 0 + + + + + + + 0 + + + mini-IPMI: Temperature thresholds + + + + + + + {#DISK}: Minimum disk temperature threshold + 2 + + 0 + + mini.disk.tempMin[{#DISK}] + 0 + 14 + 30 + 0 + 3 + + C + 0 + + + 0 + 0 + + 0 + + 1 + + + + 0 + 0 + + + + + + + 0 + + + mini-IPMI: Temperature thresholds + + + + + {#DISK}: Status 2 @@ -2936,7 +3071,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. {#DISK}: Assumed to be a placeholder NVMe 1 - 1 + 2 0 @@ -2945,7 +3080,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. {#DISK}: Command line did not parse (mini-IPMI) 0 - 1 + 2 0 @@ -2954,46 +3089,58 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. {#DISK}: Device open failed (mini-IPMI) 0 - 1 + 2 Probably no administrative permissions for smartctl or incorrect device name. 0 - {Template mini-IPMI v2:mini.disk.temp[{#DISK}].last(500)}>=60 - {#DISK}: Disk temperature is critical + ({Template mini-IPMI v2:mini.disk.temp[{#DISK}].last()} > {Template mini-IPMI v2:mini.disk.tempCrit[{#DISK}].last()}) and +{Template mini-IPMI v2:mini.disk.info[{#DISK},DriveStatus].regexp(^DUPLICATE_IGNORE$|^STANDBY|^SLEEP$)}=0 + {#DISK}: Disk temperature is critical: {ITEM.LASTVALUE} 0 - 4 - (depending on values from last 500 seconds) + 5 + If last value is more than critical temperature setting and +drive status is not DUPLICATE_IGNORE, STANDBY* or SLEEP. 0 - {Template mini-IPMI v2:mini.disk.temp[{#DISK}].last(500)}>=46 - {#DISK}: Disk temperature is too high + ({Template mini-IPMI v2:mini.disk.temp[{#DISK}].last()} > {Template mini-IPMI v2:mini.disk.tempMax[{#DISK}].last()}) and +({Template mini-IPMI v2:mini.disk.temp[{#DISK}].prev()} > {Template mini-IPMI v2:mini.disk.tempMax[{#DISK}].last()}) and +{Template mini-IPMI v2:mini.disk.temp[{#DISK}].count(2h)}>=2 and +{Template mini-IPMI v2:mini.disk.info[{#DISK},DriveStatus].regexp(^DUPLICATE_IGNORE$|^STANDBY|^SLEEP$)}=0 + {#DISK}: Disk temperature is too high: {ITEM.LASTVALUE} 0 - 3 - (depending on values from last 500 seconds) + 4 + If last value is more than maximum temperature setting and +previous value is more than maximum temperature setting and +drive status is not DUPLICATE_IGNORE, STANDBY* or SLEEP. 0 - {Template mini-IPMI v2:mini.disk.temp[{#DISK}].last(#1)}<=25 and -{Template mini-IPMI v2:mini.disk.temp[{#DISK}].last(#2)}<=25 and -{Template mini-IPMI v2:mini.disk.temp[{#DISK}].last(#3)}<=25 - {#DISK}: Disk temperature is too low + ({Template mini-IPMI v2:mini.disk.temp[{#DISK}].last()} < {Template mini-IPMI v2:mini.disk.tempMin[{#DISK}].last()}) and +({Template mini-IPMI v2:mini.disk.temp[{#DISK}].prev()} < {Template mini-IPMI v2:mini.disk.tempMin[{#DISK}].last()}) and +{Template mini-IPMI v2:mini.disk.temp[{#DISK}].count(2h)}>=2 and +{Template mini-IPMI v2:mini.disk.info[{#DISK},DriveStatus].regexp(^DUPLICATE_IGNORE$|^STANDBY|^SLEEP$)}=0 + {#DISK}: Disk temperature is too low: {ITEM.LASTVALUE} 0 - 3 - Last and previous value were below 25. + 4 + If last value is less than minimum temperature setting and +previous value is less than minimum temperature setting and +drive status is not DUPLICATE_IGNORE, STANDBY* or SLEEP. 0 - {Template mini-IPMI v2:mini.disk.temp[{#DISK}].max(24h)}>=60 - {#DISK}: Disk temperature was critical in past 24 hours + ({Template mini-IPMI v2:mini.disk.temp[{#DISK}].max(24h)} > {Template mini-IPMI v2:mini.disk.tempCrit[{#DISK}].last()}) and +{Template mini-IPMI v2:mini.disk.info[{#DISK},DriveStatus].regexp(^DUPLICATE_IGNORE$|^STANDBY|^SLEEP$)}=0 + {#DISK}: Disk temperature was critical in the past 24 hours 0 - 2 - + 3 + If any value within last 24 hours exceeded critical temperature setting and +last drive status is not DUPLICATE_IGNORE, STANDBY* or SLEEP. 0 @@ -3001,7 +3148,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. {#DISK}: Have no temperature sensor 1 - 1 + 2 0 @@ -3010,7 +3157,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. {#DISK}: Is in SLEEP mode 1 - 1 + 2 0 @@ -3019,7 +3166,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. {#DISK}: Is in STANDBY mode 1 - 1 + 2 0 @@ -3028,7 +3175,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. {#DISK}: No temperature info was found on disk 0 - 1 + 2 0 @@ -3037,7 +3184,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. {#DISK}: Something went wrong (mini-IPMI) 0 - 1 + 2 0 @@ -3046,7 +3193,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. {#DISK}: Timeout exceeded while calling the disk (mini-IPMI) 0 - 2 + 3 Could indicate disk failure. Investigation is advised. 0 @@ -3055,7 +3202,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. {#DISK}: Unknown USB bridge 1 - 1 + 2 0 @@ -3075,7 +3222,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. Disk temperature is too high 1 - 3 + 4 (depending on values from last 500 seconds) This one is disabled - individual disk trigger takes precedence. 0 @@ -3086,7 +3233,7 @@ This one is disabled - individual disk trigger takes precedence. Disk temperature was too high in past 24 hours 1 - 2 + 3 (depending on values from last day) This one is disabled - individual disk trigger takes precedence. 0 @@ -3097,7 +3244,7 @@ This one is disabled - individual disk trigger takes precedence. mini-IPMI: CPU binary was not found in PATH or manually 0 - 1 + 2 0 @@ -3107,7 +3254,7 @@ This one is disabled - individual disk trigger takes precedence. mini-IPMI: no CPUs were found for temperature test 0 - 1 + 2 0 @@ -3117,7 +3264,7 @@ This one is disabled - individual disk trigger takes precedence. mini-IPMI: no DISKs were found for temperature test 0 - 1 + 2 0 @@ -3127,37 +3274,37 @@ This one is disabled - individual disk trigger takes precedence. mini-IPMI: no GPUs were found for temperature test 1 - 1 + 2 0 {Template mini-IPMI v2:mini.cpu.info[ConfigStatus].str(NOCPUTEMPS,4d)}=1 - mini-IPMI: no temperatures was found among CPUs + mini-IPMI: no temperatures were found among CPUs 0 - 1 + 2 0 {Template mini-IPMI v2:mini.disk.info[ConfigStatus].str(NODISKTEMPS)}=1 - mini-IPMI: no temperatures was found among DISKs + mini-IPMI: no temperatures were found among DISKs 0 - 1 + 2 0 {Template mini-IPMI v2:mini.cpu.info[ConfigStatus].str(NOGPUTEMPS,#3)}=1 - mini-IPMI: no temperatures was found among GPUs + mini-IPMI: no temperatures were found among GPUs 0 - 1 + 2 0 @@ -3167,7 +3314,7 @@ This one is disabled - individual disk trigger takes precedence. mini-IPMI: smartctl was not found in PATH or manually 0 - 1 + 2 0 @@ -3177,7 +3324,7 @@ This one is disabled - individual disk trigger takes precedence. mini-IPMI: something went wrong with CPU configuration 0 - 1 + 2 0 @@ -3187,7 +3334,7 @@ This one is disabled - individual disk trigger takes precedence. mini-IPMI: something went wrong with DISK configuration 0 - 1 + 2 0 @@ -3197,7 +3344,7 @@ This one is disabled - individual disk trigger takes precedence. mini-IPMI: something went wrong with DISK sending 0 - 2 + 3 0 @@ -3207,7 +3354,7 @@ This one is disabled - individual disk trigger takes precedence. mini-IPMI: template is assigned, but no CPU data is recieved 0 - 1 + 2 0 @@ -3217,7 +3364,7 @@ This one is disabled - individual disk trigger takes precedence. mini-IPMI: template is assigned, but no DISK data is recieved 0 - 1 + 2 0 @@ -3227,7 +3374,7 @@ This one is disabled - individual disk trigger takes precedence. mini-IPMI: too much data for CPU sending 0 - 2 + 3 0 @@ -3237,11 +3384,21 @@ This one is disabled - individual disk trigger takes precedence. mini-IPMI: too much data for DISK sending 0 - 2 + 3 0 + + {Template mini-IPMI v2:mini.brd.info[BIOSversion].regexp(^4001$|^0508$|^P1.90$|^0601$|^1801$|^V2.7$)}=0 + Unexpected BIOS firmware version: {ITEM.LASTVALUE} + + 0 + 2 + List of newest BIOS firmware versions on the clients. + 0 + + diff --git a/mini_ipmi_smartctl.py b/mini_ipmi_smartctl.py index 7df21cb..920d1fd 100755 --- a/mini_ipmi_smartctl.py +++ b/mini_ipmi_smartctl.py @@ -26,7 +26,12 @@ # 'True' or 'False' isCheckNVMe = False # Additional overhead. Should be disabled if smartmontools is >= 7 or NVMe is absent. -isSkipDuplicates = True +isIgnoreDuplicates = True + +# type, min, max, critical +thresholds = ( + ('hdd', 25, 45, 60), +) isHeavyDebug = False @@ -113,6 +118,21 @@ def scanDisks(mode): return error, disks +def moveCsmiToBegining(disks): + csmis = [] + others = [] + + for i in disks: + if re.search(r'\/csmi\d+\,\d+', i, re.I): + csmis.append(i) + else: + others.append(i) + + result = csmis + others + + return result + + def listDisks(): errors = [] @@ -229,22 +249,22 @@ def findSerial(p): def chooseSystemSpecificPaths(): if sys.platform.startswith('linux'): - binPath_ = binPath_LINUX - agentConf_ = agentConf_LINUX - senderPath_ = senderPath_LINUX - senderPyPath_ = senderPyPath_LINUX + binPath_ = binPath_LINUX + agentConf_ = agentConf_LINUX + senderPath_ = senderPath_LINUX + senderPyPath_ = senderPyPath_LINUX elif sys.platform == 'win32': - binPath_ = binPath_WIN - agentConf_ = agentConf_WIN - senderPath_ = senderPath_WIN - senderPyPath_ = senderPyPath_WIN + binPath_ = binPath_WIN + agentConf_ = agentConf_WIN + senderPath_ = senderPath_WIN + senderPyPath_ = senderPyPath_WIN else: - binPath_ = binPath_OTHER - agentConf_ = agentConf_OTHER - senderPath_ = senderPath_OTHER - senderPyPath_ = senderPyPath_OTHER + binPath_ = binPath_OTHER + agentConf_ = agentConf_OTHER + senderPath_ = senderPath_OTHER + senderPyPath_ = senderPyPath_OTHER if sys.argv[1] == 'getverb': print(' Path guess: %s\n' % sys.platform) @@ -276,23 +296,8 @@ def isDummyNVMe(p): return True else: return False - - -def moveCsmiToBegining(disks): - csmis = [] - others = [] - - for i in disks: - if re.search(r'\/csmi\d+\,\d+', i, re.I): - csmis.append(i) - else: - others.append(i) - result = csmis + others - return result - - if __name__ == '__main__': fail_ifNot_Py3() @@ -330,16 +335,18 @@ def moveCsmiToBegining(disks): break # other disks json are discarded isDuplicate = False - if isSkipDuplicates: - serial = findSerial(diskPout) - if serial in sessionSerials: - isDuplicate = True - elif serial: - sessionSerials.append(serial) + serial = findSerial(diskPout) + if serial in sessionSerials: + isDuplicate = True + elif serial: + sessionSerials.append(serial) temp = findDiskTemp(diskPout) if isDuplicate: - driveStatus = 'DUPLICATE' + if isIgnoreDuplicates: + driveStatus = 'DUPLICATE_IGNORE' + else: + driveStatus = 'DUPLICATE_MENTION' elif diskError: driveStatus = diskError elif isModelWithoutSensor(diskPout): @@ -352,17 +359,19 @@ def moveCsmiToBegining(disks): driveStatus = 'PROCESSED' senderData.append('"%s" mini.disk.info[%s,DriveStatus] "%s"' % (host, sanitizedD, driveStatus)) - if (temp and - not isDuplicate): - + if temp: senderData.append('"%s" mini.disk.temp[%s] "%s"' % (host, sanitizedD, temp)) allTemps.append(temp) - + + senderData.append('"%s" mini.disk.tempMin[%s] "%s"' % (host, sanitizedD, thresholds[0][1])) + senderData.append('"%s" mini.disk.tempMax[%s] "%s"' % (host, sanitizedD, thresholds[0][2])) + senderData.append('"%s" mini.disk.tempCrit[%s] "%s"' % (host, sanitizedD, thresholds[0][3])) + if isHeavyDebug: heavyOut = repr(diskPout.strip()) heavyOut = heavyOut.strip().strip('"').strip("'").strip() heavyOut = heavyOut.replace("'", r"\'").replace('"', r'\"') - + debugData = '"%s" mini.disk.HeavyDebug "%s"' % (host, heavyOut) if diskError: if 'ERR_CODE_' in diskError: From a70d93044928f3c540002fd70525d6f24f9930c3 Mon Sep 17 00:00:00 2001 From: nobodysu Date: Mon, 1 Jul 2019 16:34:14 +0300 Subject: [PATCH 2/3] DISK hysteresis template fixes --- Template_mini-IPMI_v2.xml | 156 +++++++++++++++++++------------------- 1 file changed, 78 insertions(+), 78 deletions(-) diff --git a/Template_mini-IPMI_v2.xml b/Template_mini-IPMI_v2.xml index 8143813..dd9ba00 100644 --- a/Template_mini-IPMI_v2.xml +++ b/Template_mini-IPMI_v2.xml @@ -1,7 +1,7 @@ 2.0 - 2019-06-30T12:01:16Z + 2019-07-01T13:03:30Z Templates @@ -2447,7 +2447,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. cpu{#CPU}: CPU model or ID was changed 0 - 2 + 1 0 @@ -2456,7 +2456,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. cpu{#CPU}: is throttling right now: {ITEM.LASTVALUE} 0 - 5 + 4 (depending on values from last 500 seconds) 0 @@ -2466,7 +2466,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. cpu{#CPU}: no temperature info was found on CPU 0 - 2 + 1 Prevent multiple reporting. 0 @@ -2475,7 +2475,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. cpu{#CPU}: temperature is too high: {ITEM.LASTVALUE} 0 - 4 + 3 (depending on values from last 500 seconds) 0 @@ -2484,7 +2484,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. cpu{#CPU}: temperature was too high within past 24 hours 0 - 3 + 2 0 @@ -2493,7 +2493,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. cpu{#CPU}: was throttled within past 24 hours 0 - 3 + 2 0 @@ -2502,7 +2502,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. gpu{#GPUFAN}: fan speed is too low 0 - 3 + 2 Known false positives (zero speed). 0 @@ -2511,7 +2511,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. gpu{#GPUMEM}: free video memory is less than 32Mb 0 - 3 + 2 0 @@ -2520,7 +2520,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. gpu{#GPUTEMP}: temperature is too high: {ITEM.LASTVALUE} 0 - 4 + 3 (depending on values from last 500 seconds) 0 @@ -2529,7 +2529,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. gpu{#GPUTEMP}: temperature was above 70C within past 24 hours 0 - 3 + 2 0 @@ -2538,7 +2538,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. gpu{#GPU}: GPU model or ID was changed 0 - 2 + 1 0 @@ -2547,7 +2547,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. gpu{#GPU}: no fan info was found on GPU 1 - 2 + 1 0 @@ -2556,7 +2556,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. gpu{#GPU}: no temperature info was found on GPU 0 - 2 + 1 0 @@ -2565,7 +2565,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. {#BRDFANNAME}: motherboard fan speed is too low 0 - 3 + 2 Known false positives (zero speed). 0 @@ -2574,7 +2574,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. {#BRDTEMPNAME}: motherboard temperature is too high: {ITEM.LASTVALUE} 0 - 4 + 3 (depending on values from last 500 seconds) 0 @@ -2583,7 +2583,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. {#BRDTEMPNAME}: motherboard temperature was above 60C within past 24 hours 0 - 3 + 2 0 @@ -2592,7 +2592,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. {#P5V}: voltage is too high 0 - 4 + 3 0 @@ -2601,7 +2601,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. {#P5V}: voltage is too low 0 - 4 + 3 0 @@ -2610,7 +2610,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. {#P5V}: voltage was too high within past 24 hours 0 - 3 + 2 0 @@ -2619,7 +2619,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. {#P5V}: voltage was too low within past 24 hours 0 - 3 + 2 0 @@ -2628,7 +2628,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. {#P12V}: voltage is too high 0 - 4 + 3 0 @@ -2637,7 +2637,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. {#P12V}: voltage is too low 0 - 4 + 3 0 @@ -2646,7 +2646,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. {#P12V}: voltage was too high within past 24 hours 0 - 3 + 2 0 @@ -2655,7 +2655,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. {#P12V}: voltage was too low within past 24 hours 0 - 3 + 2 0 @@ -2664,7 +2664,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. {#P33V}: voltage is too high 0 - 4 + 3 0 @@ -2673,7 +2673,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. {#P33V}: voltage is too low 0 - 4 + 3 0 @@ -2682,7 +2682,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. {#P33V}: voltage was too high within past 24 hours 0 - 3 + 2 0 @@ -2691,7 +2691,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. {#P33V}: voltage was too low within past 24 hours 0 - 3 + 2 0 @@ -2700,7 +2700,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. {#VBAT}: CMOS battery voltage is too low 0 - 3 + 2 0 @@ -2709,7 +2709,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. {#VCC3V}: voltage is too high 0 - 4 + 3 0 @@ -2718,7 +2718,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. {#VCC3V}: voltage is too low 0 - 4 + 3 0 @@ -2727,7 +2727,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. {#VCC3V}: voltage was too high within past 24 hours 0 - 3 + 2 0 @@ -2736,7 +2736,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. {#VCC3V}: voltage was too low within past 24 hours 0 - 3 + 2 0 @@ -2745,7 +2745,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. {#VCORE}: CPU voltage is too high 0 - 4 + 3 Each processor have individual threshold. OHMR usage must be tweaked (mini_ipmi_ohmr.py). Known incorrect measurements. 0 @@ -2754,7 +2754,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. {#VCORE}: CPU voltage was too high within past 24 hours 0 - 3 + 2 Each processor have individual threshold. OHMR usage must be tweaked (mini_ipmi_ohmr.py). Known incorrect measurements. 0 @@ -2763,7 +2763,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. {#VSB3V}: voltage is too high 0 - 4 + 3 0 @@ -2772,7 +2772,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. {#VSB3V}: voltage is too low 0 - 4 + 3 0 @@ -2781,7 +2781,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. {#VSB3V}: voltage was too high within past 24 hours 0 - 3 + 2 0 @@ -2790,7 +2790,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. {#VSB3V}: voltage was too low within past 24 hours 0 - 3 + 2 0 @@ -2799,7 +2799,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. {#VTT}: voltage is too high 0 - 4 + 3 Each processor have individual threshold. Tweak in mini_ipmi_ohmr.py before use. 0 @@ -2808,7 +2808,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. {#VTT}: voltage was too high within past 24h 0 - 3 + 2 Each processor have individual threshold. Tweak in mini_ipmi_ohmr.py before use. 0 @@ -3071,7 +3071,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. {#DISK}: Assumed to be a placeholder NVMe 1 - 2 + 1 0 @@ -3080,7 +3080,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. {#DISK}: Command line did not parse (mini-IPMI) 0 - 2 + 1 0 @@ -3089,7 +3089,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. {#DISK}: Device open failed (mini-IPMI) 0 - 2 + 1 Probably no administrative permissions for smartctl or incorrect device name. 0 @@ -3099,7 +3099,7 @@ Could refer to mini_ipmi_lmsensors.py, mini_ipmi_ohmr.py or mini_ipmi_bsdcpu.py. {#DISK}: Disk temperature is critical: {ITEM.LASTVALUE} 0 - 5 + 4 If last value is more than critical temperature setting and drive status is not DUPLICATE_IGNORE, STANDBY* or SLEEP. 0 @@ -3112,7 +3112,7 @@ drive status is not DUPLICATE_IGNORE, STANDBY* or SLEEP. {#DISK}: Disk temperature is too high: {ITEM.LASTVALUE} 0 - 4 + 3 If last value is more than maximum temperature setting and previous value is more than maximum temperature setting and drive status is not DUPLICATE_IGNORE, STANDBY* or SLEEP. @@ -3126,7 +3126,7 @@ drive status is not DUPLICATE_IGNORE, STANDBY* or SLEEP. {#DISK}: Disk temperature is too low: {ITEM.LASTVALUE} 0 - 4 + 3 If last value is less than minimum temperature setting and previous value is less than minimum temperature setting and drive status is not DUPLICATE_IGNORE, STANDBY* or SLEEP. @@ -3138,7 +3138,7 @@ drive status is not DUPLICATE_IGNORE, STANDBY* or SLEEP. {#DISK}: Disk temperature was critical in the past 24 hours 0 - 3 + 2 If any value within last 24 hours exceeded critical temperature setting and last drive status is not DUPLICATE_IGNORE, STANDBY* or SLEEP. 0 @@ -3148,7 +3148,7 @@ last drive status is not DUPLICATE_IGNORE, STANDBY* or SLEEP. {#DISK}: Have no temperature sensor 1 - 2 + 1 0 @@ -3157,7 +3157,7 @@ last drive status is not DUPLICATE_IGNORE, STANDBY* or SLEEP. {#DISK}: Is in SLEEP mode 1 - 2 + 1 0 @@ -3166,7 +3166,7 @@ last drive status is not DUPLICATE_IGNORE, STANDBY* or SLEEP. {#DISK}: Is in STANDBY mode 1 - 2 + 1 0 @@ -3175,7 +3175,7 @@ last drive status is not DUPLICATE_IGNORE, STANDBY* or SLEEP. {#DISK}: No temperature info was found on disk 0 - 2 + 1 0 @@ -3184,7 +3184,7 @@ last drive status is not DUPLICATE_IGNORE, STANDBY* or SLEEP. {#DISK}: Something went wrong (mini-IPMI) 0 - 2 + 1 0 @@ -3193,7 +3193,7 @@ last drive status is not DUPLICATE_IGNORE, STANDBY* or SLEEP. {#DISK}: Timeout exceeded while calling the disk (mini-IPMI) 0 - 3 + 2 Could indicate disk failure. Investigation is advised. 0 @@ -3202,7 +3202,7 @@ last drive status is not DUPLICATE_IGNORE, STANDBY* or SLEEP. {#DISK}: Unknown USB bridge 1 - 2 + 1 0 @@ -3222,7 +3222,7 @@ last drive status is not DUPLICATE_IGNORE, STANDBY* or SLEEP. Disk temperature is too high 1 - 4 + 3 (depending on values from last 500 seconds) This one is disabled - individual disk trigger takes precedence. 0 @@ -3233,7 +3233,7 @@ This one is disabled - individual disk trigger takes precedence. Disk temperature was too high in past 24 hours 1 - 3 + 2 (depending on values from last day) This one is disabled - individual disk trigger takes precedence. 0 @@ -3244,7 +3244,7 @@ This one is disabled - individual disk trigger takes precedence. mini-IPMI: CPU binary was not found in PATH or manually 0 - 2 + 1 0 @@ -3254,7 +3254,7 @@ This one is disabled - individual disk trigger takes precedence. mini-IPMI: no CPUs were found for temperature test 0 - 2 + 1 0 @@ -3264,7 +3264,7 @@ This one is disabled - individual disk trigger takes precedence. mini-IPMI: no DISKs were found for temperature test 0 - 2 + 1 0 @@ -3274,37 +3274,37 @@ This one is disabled - individual disk trigger takes precedence. mini-IPMI: no GPUs were found for temperature test 1 - 2 + 1 0 {Template mini-IPMI v2:mini.cpu.info[ConfigStatus].str(NOCPUTEMPS,4d)}=1 - mini-IPMI: no temperatures were found among CPUs + mini-IPMI: no temperatures was found among CPUs 0 - 2 + 1 0 {Template mini-IPMI v2:mini.disk.info[ConfigStatus].str(NODISKTEMPS)}=1 - mini-IPMI: no temperatures were found among DISKs + mini-IPMI: no temperatures was found among DISKs 0 - 2 + 1 0 {Template mini-IPMI v2:mini.cpu.info[ConfigStatus].str(NOGPUTEMPS,#3)}=1 - mini-IPMI: no temperatures were found among GPUs + mini-IPMI: no temperatures was found among GPUs 0 - 2 + 1 0 @@ -3314,7 +3314,7 @@ This one is disabled - individual disk trigger takes precedence. mini-IPMI: smartctl was not found in PATH or manually 0 - 2 + 1 0 @@ -3324,7 +3324,7 @@ This one is disabled - individual disk trigger takes precedence. mini-IPMI: something went wrong with CPU configuration 0 - 2 + 1 0 @@ -3334,7 +3334,7 @@ This one is disabled - individual disk trigger takes precedence. mini-IPMI: something went wrong with DISK configuration 0 - 2 + 1 0 @@ -3344,7 +3344,7 @@ This one is disabled - individual disk trigger takes precedence. mini-IPMI: something went wrong with DISK sending 0 - 3 + 2 0 @@ -3354,7 +3354,7 @@ This one is disabled - individual disk trigger takes precedence. mini-IPMI: template is assigned, but no CPU data is recieved 0 - 2 + 1 0 @@ -3364,7 +3364,7 @@ This one is disabled - individual disk trigger takes precedence. mini-IPMI: template is assigned, but no DISK data is recieved 0 - 2 + 1 0 @@ -3374,7 +3374,7 @@ This one is disabled - individual disk trigger takes precedence. mini-IPMI: too much data for CPU sending 0 - 3 + 2 0 @@ -3384,7 +3384,7 @@ This one is disabled - individual disk trigger takes precedence. mini-IPMI: too much data for DISK sending 0 - 3 + 2 0 @@ -3393,8 +3393,8 @@ This one is disabled - individual disk trigger takes precedence. {Template mini-IPMI v2:mini.brd.info[BIOSversion].regexp(^4001$|^0508$|^P1.90$|^0601$|^1801$|^V2.7$)}=0 Unexpected BIOS firmware version: {ITEM.LASTVALUE} - 0 - 2 + 1 + 1 List of newest BIOS firmware versions on the clients. 0 From c4989147b6757b3a3d1b83bec5f7f0e0174ef048 Mon Sep 17 00:00:00 2001 From: nobodysu Date: Mon, 1 Jul 2019 13:50:31 +0000 Subject: [PATCH 3/3] Update Template_mini-IPMI_v2.xml --- Template_mini-IPMI_v2.xml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Template_mini-IPMI_v2.xml b/Template_mini-IPMI_v2.xml index dd9ba00..a3a367c 100644 --- a/Template_mini-IPMI_v2.xml +++ b/Template_mini-IPMI_v2.xml @@ -3281,7 +3281,7 @@ This one is disabled - individual disk trigger takes precedence. {Template mini-IPMI v2:mini.cpu.info[ConfigStatus].str(NOCPUTEMPS,4d)}=1 - mini-IPMI: no temperatures was found among CPUs + mini-IPMI: no temperatures were found among CPUs 0 1 @@ -3291,7 +3291,7 @@ This one is disabled - individual disk trigger takes precedence. {Template mini-IPMI v2:mini.disk.info[ConfigStatus].str(NODISKTEMPS)}=1 - mini-IPMI: no temperatures was found among DISKs + mini-IPMI: no temperatures were found among DISKs 0 1 @@ -3301,7 +3301,7 @@ This one is disabled - individual disk trigger takes precedence. {Template mini-IPMI v2:mini.cpu.info[ConfigStatus].str(NOGPUTEMPS,#3)}=1 - mini-IPMI: no temperatures was found among GPUs + mini-IPMI: no temperatures were found among GPUs 0 1