crabLaw.py
import copy
import datetime
import json
import law
import luigi
import os
import select
import shutil
import sys
import tempfile
import termios
import threading
from .law_customizations import HTCondorWorkflow
from .crabTask import Task as CrabTask
from .crabTaskStatus import Status
from .run_tools import ps_call, print_ts
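
# crabLaw.py glues the CRAB task bookkeeping (crabTask, crabTaskStatus) to law/luigi:
# LawTaskManager maintains the JSON list of law branches for the grid jobs of each CRAB
# task work area, and ProdTask is the HTCondor/local law workflow that processes them.
# The condition variable below is used by a background thread that periodically renews
# the kerberos ticket while a branch is running; ProdTask.run() notifies it on exit.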
cond = threading.Condition()

def update_kinit(verbose=0):
  if shutil.which('kinit'):
    ps_call(['kinit', '-R'], expected_return_codes=None, verbose=verbose)

def update_kinit_thread():
  timeout = 60.0 * 60  # 1 hour
  cond.acquire()
  while not cond.wait(timeout):
    update_kinit(verbose=1)
  cond.release()
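
# LawTaskManager maintains a JSON task list (law_tasks.json inside the ProdTask work
# area): one entry per (task work area, grid job id) pair, recording the branch id,
# the flag files that mark the job as done or failed, and whether it is ready to run.
# get_cfg() also derives the dependencies between the special (negative) job ids and
# the regular grid jobs of the same task.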
class LawTaskManager:
  def __init__(self, cfg_path, law_task_dir=None):
    self.cfg_path = cfg_path
    self.law_task_dir = law_task_dir
    if os.path.exists(cfg_path):
      with open(cfg_path, 'r') as f:
        self.cfg = json.load(f)
      self.has_updates = False
    else:
      self.cfg = []
      self.has_updates = True

  def add(self, task_work_area, task_grid_job_id, done_flag, failed_flag=None, ready_to_run=True):
    task_work_area = os.path.abspath(task_work_area)
    done_flag = os.path.abspath(done_flag)
    failed_flag = os.path.abspath(failed_flag) if failed_flag is not None else None
    existing_entry = self.find(task_work_area, task_grid_job_id)
    if existing_entry is not None:
      if existing_entry.get('done_flag') != done_flag or existing_entry.get('failed_flag') != failed_flag \
          or existing_entry.get('ready_to_run', True) != ready_to_run:
        existing_entry['done_flag'] = done_flag
        existing_entry['failed_flag'] = failed_flag
        existing_entry['ready_to_run'] = ready_to_run
        self.has_updates = True
      return
    branch_id = len(self.cfg)
    self.cfg.append({ 'branch_id': branch_id, 'task_work_area': task_work_area, 'task_grid_job_id': task_grid_job_id,
                      'done_flag': done_flag, 'failed_flag': failed_flag, 'ready_to_run': ready_to_run })
    self.has_updates = True

  def find(self, task_work_area, task_grid_job_id):
    task_work_area = os.path.abspath(task_work_area)
    task_grid_job_id = int(task_grid_job_id)
    for entry in self.cfg:
      if entry['task_work_area'] == task_work_area and entry['task_grid_job_id'] == task_grid_job_id:
        return entry
    return None

  def get_cfg(self):
    cfg_ext = []
    entry_jobs = {}
    for entry in self.cfg:
      entry_ext = copy.deepcopy(entry)
      entry_ext['dependencies'] = []
      task_work_area = entry_ext['task_work_area']
      task_grid_job_id = entry_ext['task_grid_job_id']
      if task_work_area not in entry_jobs:
        entry_jobs[task_work_area] = {}
      entry_jobs[task_work_area][task_grid_job_id] = entry_ext
      cfg_ext.append(entry_ext)
    for task_work_area, jobs in entry_jobs.items():
      for job_id, job_entry in jobs.items():
        if job_id < 0:
          if job_id == -1:
            # post-processing (-1) depends on the done flags of all regular grid jobs of the task
            for task_grid_job_id, entry in jobs.items():
              if task_grid_job_id >= 0:
                job_entry['dependencies'].append(entry['done_flag'])
          elif job_id + 1 in jobs:
            # other special jobs depend on the next one up (e.g. cleanup -2 on post-processing -1)
            job_entry['dependencies'].append(jobs[job_id + 1]['done_flag'])
    return cfg_ext

  def select_branches(self, task_work_areas):
    selected_branches = []
    for task_work_area in task_work_areas:
      task_work_area = os.path.abspath(task_work_area)
      for entry in self.cfg:
        if entry['task_work_area'] == task_work_area:
          selected_branches.append(entry['branch_id'])
    return selected_branches

  def clean_branches(self, task_work_areas, dry_run=False):
    task_work_areas = [ os.path.abspath(task_work_area) for task_work_area in task_work_areas ]
    new_cfg = []
    to_remove = []
    has_updates = False
    for entry in self.cfg:
      if entry['task_work_area'] in task_work_areas:
        new_cfg.append(entry)
      else:
        has_updates = True
        to_remove.append(entry)
    if not dry_run and has_updates:
      self.cfg = new_cfg
      self.has_updates = True
    return to_remove

  def _save_safe(self, file, json_content):
    # write to a temporary file first, then move it into place, so that an interrupted
    # write cannot leave a truncated configuration behind
    tmp_path = file + '.tmp'
    with open(tmp_path, 'w') as f:
      json.dump(json_content, f, indent=2)
    shutil.move(tmp_path, file)

  def save(self):
    if self.has_updates:
      self._save_safe(self.cfg_path, self.cfg)
      self.has_updates = False

  def update_grid_jobs(self, grid_jobs_file):
    if not os.path.exists(grid_jobs_file):
      return
    with open(grid_jobs_file, 'r') as f:
      grid_jobs = json.load(f)
    has_updates = False
    valid_jobs = set()
    for entry in self.cfg:
      branch_id = entry['branch_id']
      job_id = str(branch_id + 1)
      valid_jobs.add(job_id)
      if job_id not in grid_jobs["jobs"] and job_id not in grid_jobs["unsubmitted_jobs"]:
        grid_jobs["unsubmitted_jobs"][job_id] = [ branch_id ]
        has_updates = True
    for col in [ "jobs", "unsubmitted_jobs" ]:
      jobs_to_remove = []
      for job_id in grid_jobs[col]:
        if job_id not in valid_jobs:
          jobs_to_remove.append(job_id)
      for job_id in jobs_to_remove:
        grid_jobs[col].pop(job_id)
        has_updates = True
    if has_updates:
      self._save_safe(grid_jobs_file, grid_jobs)
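
# ProdTask: a law workflow (HTCondor or local) with one branch per LawTaskManager entry.
# Depending on the grid job id of a branch, run() executes a single CRAB job locally
# (grid_job_id >= 0), post-processes the task outputs (grid_job_id == -1), or removes
# the CRAB outputs after a successful post-processing (grid_job_id == -2).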
class ProdTask(HTCondorWorkflow, law.LocalWorkflow):
  work_area = luigi.Parameter()
  stop_date = luigi.parameter.DateSecondParameter(default=datetime.datetime.max)

  def local_path(self, *path):
    return os.path.join(self.htcondor_output_directory().path, *path)

  def workflow_requires(self):
    return {}

  def requires(self):
    return {}

  def law_job_home(self):
    if 'LAW_JOB_HOME' in os.environ:
      return os.environ['LAW_JOB_HOME'], False
    os.makedirs(self.local_path(), exist_ok=True)
    return tempfile.mkdtemp(dir=self.local_path()), True
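
  # branch_data layout, as filled by create_branch_map below:
  # (task_work_area, task_grid_job_id, done_flag, dependencies, failed_flag, ready_to_run)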
  def create_branch_map(self):
    task_list_path = os.path.join(self.work_area, 'law_tasks.json')
    task_manager = LawTaskManager(task_list_path)
    branches = {}
    for entry in task_manager.get_cfg():
      branches[entry['branch_id']] = (entry['task_work_area'], entry['task_grid_job_id'], entry['done_flag'],
                                      entry['dependencies'], entry.get('failed_flag'),
                                      entry.get('ready_to_run', True))
    return branches

  def output(self):
    work_area, grid_job_id, done_flag, dependencies, failed_flag, ready_to_run = self.branch_data
    if failed_flag is not None:
      failed_flag_target = law.LocalFileTarget(failed_flag)
      if failed_flag_target.exists():
        return failed_flag_target
    done_flag_target = law.LocalFileTarget(done_flag)
    wait_flag_target = law.LocalFileTarget(done_flag + '.wait')
    all_dependencies_exist = True
    for dependency in dependencies:
      if not os.path.exists(dependency):
        all_dependencies_exist = False
        break
    if not ready_to_run or not all_dependencies_exist:
      # not ready yet: report a touched '.wait' flag instead of the done flag
      wait_flag_target.touch()
      return wait_flag_target
    return done_flag_target

  def run(self):
    # keep the kerberos ticket alive while the branch is running
    thread = threading.Thread(target=update_kinit_thread)
    thread.start()
    job_home, remove_job_home = self.law_job_home()
    try:
      work_area, grid_job_id, done_flag, dependencies, failed_flag, ready_to_run = self.branch_data
      task = CrabTask.Load(workArea=work_area)
      if grid_job_id == -2:
        # cleanup branch: remove CRAB outputs once post-processing has finished
        if task.taskStatus.status == Status.PostProcessingFinished:
          task.removeCrabOutputs()
        self.output().touch()
      elif grid_job_id == -1:
        # post-processing branch
        done = False
        if task.taskStatus.status in [ Status.CrabFinished, Status.PostProcessingFinished ]:
          try:
            if task.taskStatus.status == Status.CrabFinished:
              print(f'{task.name}: post-processing ...')
              task.postProcessOutputs(job_home)
            self.output().touch()
            done = True
          except Exception as e:
            print(f'{task.name}: error while post-processing: {e}')
        else:
          print(f"task {task.name} is not ready for post-processing")
        if not done:
          failed_flag = failed_flag if failed_flag is not None else task.getPostProcessingFaliedFlagFile()
          failed_flag_target = law.LocalFileTarget(failed_flag)
          failed_flag_target.touch()
      else:
        # regular branch: run the corresponding CRAB job locally
        if grid_job_id in task.getGridJobs():
          print(f'Running {task.name} job_id = {grid_job_id}')
          result = task.runJobLocally(grid_job_id, job_home)
          print(f'Finished running {task.name} job_id = {grid_job_id}. result = {result}')
        else:
          print(f'job_id = {grid_job_id} is not found in {task.name}; considering it finished')
          result = True
        state_str = 'finished' if result else 'failed'
        with self.output().open('w') as output:
          output.write(state_str)
    finally:
      if remove_job_home:
        shutil.rmtree(job_home)
      # stop the kerberos renewal thread
      cond.acquire()
      cond.notify_all()
      cond.release()
      thread.join()

  def htcondor_poll_callback(self, poll_data):
    return self.poll_callback(poll_data)

  def poll_callback(self, poll_data):
    update_kinit(verbose=0)
    rlist, wlist, xlist = select.select([sys.stdin], [], [], 0.1)
    if rlist:
      termios.tcflush(sys.stdin, termios.TCIOFLUSH)
      timeout = 10  # seconds
      print_ts('Input from terminal is detected. Press return to stop polling, otherwise polling will'
               f' continue in {timeout} seconds...')
      rlist, wlist, xlist = select.select([sys.stdin], [], [], timeout)
      if rlist:
        termios.tcflush(sys.stdin, termios.TCIOFLUSH)
        return False
      print_ts('Polling resumed')
    return datetime.datetime.now() < self.stop_date

  def control_output_postfix(self):
    return ""