-
Notifications
You must be signed in to change notification settings - Fork 0
/
run_hypertune.sh
executable file
·139 lines (113 loc) · 3.36 KB
/
run_hypertune.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
#!/bin/bash
#######################################
# Prompt for a filesystem path and print its canonical absolute form.
# Arguments: $1 - human-readable description inserted into the prompt
# Inputs:    one line read from stdin (with readline editing on a terminal)
# Outputs:   the path as resolved by `readlink -f`, on stdout; an empty line
#            when readlink cannot resolve what was typed
# Returns:   0
#######################################
read_path() {
  local the_path resolved
  # shellcheck disable=SC2162
  # -r is deliberately not used: backslash-escaped characters in the typed
  # line (e.g. "my\ dir") keep being unescaped exactly as before.
  read -ep "Path to $1: " the_path
  # Quote the expansion so a path containing spaces or glob characters is
  # handed to readlink as ONE operand instead of being split into several.
  resolved=$(readlink -f -- "$the_path")
  printf '%s\n' "$resolved"
}
#######################################
# Abort the script unless the given path is an existing directory.
# Arguments: $1 - path to check
# Outputs:   an error message on stderr when the check fails
# Returns:   0 when the directory exists; otherwise exits the shell with 1
#######################################
check_dir_exists() {
  # A quoted [[ ]] test is required here: with an unquoted `[ ! -d $1 ]` an
  # empty path collapses to `[ ! -d ]` (false) and a path with spaces makes
  # `[` error out, so in both cases the check would silently pass.
  if [[ ! -d "${1-}" ]]; then
    printf 'No directory exists at %s, exiting...\n' "${1-}" >&2
    exit 1
  fi
}
#######################################
# Abort the script unless the given path is an existing regular file.
# Arguments: $1 - path to check
# Outputs:   an error message on stderr when the check fails
# Returns:   0 when the file exists; otherwise exits the shell with 1
#######################################
check_file_exists() {
  # A quoted [[ ]] test is required here: with an unquoted `[ ! -f $1 ]` an
  # empty path collapses to `[ ! -f ]` (false) and a path with spaces makes
  # `[` error out, so in both cases the check would silently pass.
  if [[ ! -f "${1-}" ]]; then
    printf 'No file exists at %s, exiting...\n' "${1-}" >&2
    exit 1
  fi
}
# ---------------------------------------------------------------------------
# Step 1/4: collect the run parameters interactively.
# Sets: username, password, dnn_model, dnn_strategy, dataset_task, venv_path,
#       train_path, hyp_cfg_path, data_path (imagenet only), cur_date, log_path
# ---------------------------------------------------------------------------
printf '%s\n' \
  'Welcome to HyperTune Runner!' \
  '' \
  '=============================' \
  '[1/4] Specify parameters' \
  '=============================' \
  ''

# SSH credentials that are later handed to the controller.
printf '%s' 'Username for SSH to remote machines: '
read -r username
printf '%s' 'Password for SSH to remote machines: '
# -s keeps the password off the screen. IFS= and -r store it verbatim:
# without them backslashes are consumed and surrounding whitespace is trimmed.
IFS= read -rs password
printf '\n' # -s swallowed the user's newline; finish the prompt line

# Free-form answers; the accepted values are the ones named in each prompt.
printf '%s' 'DNN model (resnet or alexnet): '
read -r dnn_model
printf '%s' 'DNN parallelization strategy (dp, mp, or gpipe): '
read -r dnn_strategy
printf '%s' 'DNN dataset / task (imagenet or mnist): '
read -r dataset_task

# Each path is resolved by read_path and validated right away; the check
# helpers exit the script when the path is missing. Arguments are quoted so
# paths containing spaces reach the helpers as a single word.
venv_path=$(read_path "virtual environment")
check_dir_exists "$venv_path"
train_path=$(read_path "DNN training file (.py)")
check_file_exists "$train_path"
hyp_cfg_path=$(read_path "DNN hyperparameter space config (.json)")
check_file_exists "$hyp_cfg_path"

# The dataset location is only asked for when the task is imagenet.
if [[ "$dataset_task" == "imagenet" ]]; then
  data_path=$(read_path "ImageNet data (CLS-LOC folder)")
  check_dir_exists "$data_path"
fi

# Timestamped log file name, e.g. run_mnist_resnet_dp_2024-01-31T235959.log
cur_date=$(date '+%FT%H%M%S')
log_path="./logs/run_${dataset_task}_${dnn_model}_${dnn_strategy}_${cur_date}.log"
# ---------------------------------------------------------------------------
# Step 2/4: print the collected settings back to the user and require an
# explicit confirmation before anything is launched.
# ---------------------------------------------------------------------------
printf '%s\n' \
  "" \
  "=============================" \
  "[2/4] Controller Summary" \
  "=============================" \
  "Virtual environment path: $venv_path" \
  "DNN dataset / task: $dataset_task" \
  "DNN model / architecture: $dnn_model" \
  "DNN parallel training strategy: $dnn_strategy" \
  "DNN training file: $train_path" \
  "DNN hyperparameter space config file: $hyp_cfg_path" \
  "Number of DNN training epochs: 1"

# The data folder line is shown for imagenet runs only.
case "$dataset_task" in
  imagenet)
    printf '%s\n' "ImageNet data folder: $data_path"
    ;;
esac

printf '%s\n' "stdout will be logged to: $log_path" ""

# Read exactly one keystroke into REPLY (no Enter required).
read -r -n 1 -p "Please confirm these parameters by typing Y: "
printf '\n' # terminate the prompt line

# Anything other than a single Y or y cancels the run.
case "$REPLY" in
  [Yy]) ;;
  *)
    printf '%s\n' "The job has been cancelled, please run again. Exiting..."
    exit 1
    ;;
esac
# ---------------------------------------------------------------------------
# Step 3/4: launch controller/controller.py in the background with nohup,
# sending its stdout and stderr to $log_path.
# ---------------------------------------------------------------------------
echo
echo "[3/4] Starting Controller..."

# The log directory must exist before the redirection below is set up;
# otherwise the redirect fails and the controller is never started.
log_dir=$(dirname -- "$log_path")
if ! mkdir -p -- "$log_dir"; then
  printf 'Could not create log directory %s, exiting...\n' "$log_dir" >&2
  exit 1
fi

# Activate venv (the one at ./venv relative to the current directory).
# A failed activation is reported but does not abort the run, as before.
if ! source venv/bin/activate; then
  printf '%s\n' \
    'Warning: could not activate venv/bin/activate; using python from PATH.' >&2
fi

# Build the argument list once, as an array, so every user-supplied value is
# passed as exactly one argument even when it contains spaces or glob
# characters. Argument order matches the controller invocation used so far.
controller_args=(
  --venv "$venv_path"
  --dnn "$train_path"
  --dnn_hyperparameter_space "$hyp_cfg_path"
)
if [[ "$dataset_task" == "imagenet" ]]; then
  # ImageNet task must pass path to ImageNet dataset
  controller_args+=(--data "$data_path")
fi
# NOTE(review): the password is passed on the command line and is therefore
# visible in the process list; changing that needs controller.py to accept it
# another way (environment or stdin) - confirm against the controller.
controller_args+=(
  --arch "$dnn_model"
  --parallelism "$dnn_strategy"
  --epochs 1
  --dnn_metric_key accuracy
  --dnn_metric_objective max
  --username "$username"
  --password "$password"
  --debug
  --machines gpu1 gpu2 gpu3
)

# Run controller detached from this shell; -u keeps python's output unbuffered
# so the log file fills as the job progresses.
nohup python -u controller/controller.py "${controller_args[@]}" \
  > "$log_path" 2>&1 &

# Deactivate venv, but only if activation actually defined `deactivate`.
if declare -F deactivate > /dev/null; then
  deactivate
fi

# ---------------------------------------------------------------------------
# Step 4/4: tell the user where to find the output.
# ---------------------------------------------------------------------------
echo "[4/4] Your training job is now running in a separate process!"
echo Use this command to quickly print your log:
# %q shell-quotes the path so the printed command can be pasted as-is.
printf ' cat %q\n' "$log_path"
echo Exiting...