main.py

import datetime
import logging

import submitit

from trainer import Trainer

# import os
# os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

LOG = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# Basic parameters
SAVE_DIR = "./weights"
FILES_DIR = "./waymo110/preprocessed_data"
CHECKPOINT_PATH = None
TB_DIR = "./tensorboard/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
# Training parameters
LR = 1e-4
BATCH_SIZE = 8  # per-GPU batch size => 8 * 4 nodes * 2 GPUs = 64 effective batch size
EPOCHS = 10
LOCAL = False
# SLURM parameters
LOG_DIR = "./logs"
N_NODES = 4
GPUS_PER_NODE = 2
CPUS_PER_NODE = 40
MEM_PER_NODE = 150  # GB, passed to mem_gb below
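# Requested allocation, derived from the constants above: 4 nodes x 2 GPUs
# = 8 GPU tasks in total, each task getting 20 CPUs (40 // 2).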


def main():
    trainer = Trainer(
        lr=LR,
        batch_size=BATCH_SIZE,
        epochs=EPOCHS,
        save_dir=SAVE_DIR,
        files_dir=FILES_DIR,
        tb_dir=TB_DIR,
        checkpoint_path=CHECKPOINT_PATH,
        local=LOCAL,
    )

    if LOCAL:
        LOG.info("Running locally")
        trainer()
        return 0
    # Otherwise, submit the training callable to SLURM through submitit.
    executor = submitit.AutoExecutor(folder=LOG_DIR)
    executor.update_parameters(
        name="train_strajnet_torch",
        nodes=N_NODES,
        mem_gb=MEM_PER_NODE,
        gpus_per_node=GPUS_PER_NODE,
        tasks_per_node=GPUS_PER_NODE,  # one task per GPU
        cpus_per_task=CPUS_PER_NODE // GPUS_PER_NODE,  # 40 CPUs / 2 GPUs = 20 per task
        timeout_min=60 * 72,  # 72 hours
        slurm_partition="gpu",
        slurm_qos="gpu",
        slurm_gres=f"gpu:{GPUS_PER_NODE}",
        slurm_additional_parameters={
            "requeue": True,  # let SLURM requeue the job after preemption or node failure
        },
    )
    job = executor.submit(trainer)
    LOG.info(f"Submitted job_id: {job.job_id}")
    return job


if __name__ == "__main__":
    main()
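

# ---------------------------------------------------------------------------
# Hedged sketch (an editorial addition, not part of this repository's code):
# trainer.py is not shown on this page. The class below only illustrates the
# interface main.py assumes from it: the constructor keywords used above and
# a no-argument __call__. Deriving from submitit.helpers.Checkpointable is an
# assumption that pairs with the "requeue": True setting above; submitit's
# default checkpoint() resubmits the same callable when SLURM preempts the
# job.
import submitit


class Trainer(submitit.helpers.Checkpointable):
    """Callable training job: submitit pickles it and runs __call__ on SLURM."""

    def __init__(self, lr, batch_size, epochs, save_dir, files_dir,
                 tb_dir, checkpoint_path, local):
        self.lr = lr
        self.batch_size = batch_size
        self.epochs = epochs
        self.save_dir = save_dir
        self.files_dir = files_dir
        self.tb_dir = tb_dir
        self.checkpoint_path = checkpoint_path
        self.local = local

    def __call__(self):
        if not self.local:
            # Under SLURM, submitit exposes the task layout; a DDP setup would
            # derive process ranks and the CUDA device from it.
            env = submitit.JobEnvironment()
            rank, world_size = env.global_rank, env.num_tasks
        # ... build the model and dataloaders, run the training loop, and
        # save weights under self.save_dir ...


# Usage note (also an assumption about workflow, not repository code): the
# submitit Job handle returned by main() can be inspected from any session
# with access to LOG_DIR:
#   job = main()
#   job.state             # e.g. "PENDING" or "RUNNING"
#   print(job.stdout())   # captured stdout under ./logs
#   job.result()          # blocks until completion; re-raises remote exceptions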