-
Notifications
You must be signed in to change notification settings - Fork 14
/
qmonitor
executable file
·109 lines (84 loc) · 2.81 KB
/
qmonitor
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#!/g/data/hh5/public/apps/nci_scripts/python-analysis3
# Copyright 2020 Scott Wales
# author: Scott Wales <[email protected]>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Monitor a PBS queue job
"""
import argparse
import tqdm
import subprocess
import json
import time
import pandas
import importlib
from qtools import qstat, decode_bytes
def metrics(stat):
    """
    Compute usage figures for a single job record returned by ``qstat``.

    Args:
        stat: mapping describing one job. ``job_state`` is always read; for
            a job in state ``'R'`` the function also reads
            ``Resource_List`` (``mem``, ``ncpus``) and ``resources_used``
            (``mem``, ``cput``, ``walltime``).

    Returns:
        Tuple ``(cpu, mem, wall)``:

        * ``cpu`` - CPU time as a percentage of ``walltime * ncpus``
        * ``mem`` - memory used as a percentage of the memory request
        * ``wall`` - elapsed walltime in whole seconds

        All three are 0 when the job is not in state ``'R'`` or when the
        record has no ``resources_used`` section.
    """
    used = stat.get('resources_used') if stat['job_state'] == 'R' else None
    if not used:
        # Not running, or no usage section in the record: report zeros
        # instead of raising KeyError on the lookups below.
        return 0, 0, 0

    requested = stat['Resource_List']

    mem_request = decode_bytes(requested['mem'])
    mem_used = decode_bytes(used['mem'])
    mem = mem_used / mem_request * 100

    # total_seconds() covers the whole duration; the .seconds attribute is
    # only the sub-day component and wraps for jobs longer than 24 hours.
    cpu_seconds = pandas.to_timedelta(used['cput']).total_seconds()
    wall_seconds = pandas.to_timedelta(used['walltime']).total_seconds()

    # int() tolerates a CPU count delivered as a string.
    ncpus = int(requested['ncpus'])

    # Explicit guard: no elapsed time (or no CPUs) means 0% utilisation.
    if wall_seconds > 0 and ncpus > 0:
        cpu = cpu_seconds / wall_seconds / ncpus * 100
    else:
        cpu = 0

    return cpu, mem, int(wall_seconds)
def _fetch_stat(jobid):
    """Return the ``qstat`` record for ``jobid``, including finished jobs."""
    records = qstat([jobid], show_finished=True)
    return next(iter(records.values()))


def monitor(jobid):
    """
    Display live CPU, memory and walltime progress bars for a PBS job.

    The job record is polled every 60 seconds and the three bars are redrawn
    from ``metrics()``. Monitoring stops once the job's ``job_state`` is no
    longer ``"Q"`` or ``"R"``.

    Args:
        jobid: job identifier, passed to ``qstat`` unchanged.
    """
    # Keep retrying until a first record is obtained. Only Exception is
    # caught, so Ctrl-C and SystemExit still stop the program.
    stat = None
    while stat is None:
        try:
            stat = _fetch_stat(jobid)
        except Exception:
            time.sleep(30)

    # total_seconds() rather than .seconds: the latter is only the sub-day
    # component, so a 48:00:00 request would give a bar total of 0.
    wall_request = int(
        pandas.to_timedelta(stat['Resource_List']['walltime']).total_seconds()
    )

    bar_format = '{l_bar}{bar}'
    cpu = tqdm.tqdm(unit="%", desc="CPU", total=100.0, position=0, bar_format=bar_format, leave=True)
    mem = tqdm.tqdm(unit="%", desc="MEM", total=100.0, position=1, bar_format=bar_format, leave=True)
    wall = tqdm.tqdm(unit='s', desc="TIME", total=wall_request, position=2, bar_format=bar_format, leave=True)
    bars = (cpu, mem, wall)

    try:
        while stat["job_state"] in ("Q", "R"):
            # metrics() returns (cpu, mem, wall), matching the bar order.
            for bar, value in zip(bars, metrics(stat)):
                # Bars show an absolute level, not an increment: reset to
                # zero, then advance to the current value.
                bar.reset()
                bar.update(value)
                bar.refresh()

            time.sleep(60)

            try:
                stat = _fetch_stat(jobid)
            except Exception:
                # Handle PBS server outages: keep the last good record and
                # try again on the next poll.
                pass
    finally:
        # Close the bars even if monitoring is interrupted.
        for bar in bars:
            bar.close()
def main():
    """Read the job id from the command line and monitor that job."""
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument("jobid")
    options = cli.parse_args()
    monitor(options.jobid)
# Start monitoring only when run as a script, not when imported.
if __name__ == "__main__":
    main()