-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrun_train.bash
113 lines (95 loc) · 3.13 KB
/
run_train.bash
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
#!/bin/bash
# Abort the script as soon as any command exits with a non-zero status.
set -e

# Escape sequences for coloured terminal output. They are stored as literal
# backslash sequences and only turn into control codes when interpreted by
# `echo -e` or `printf`.
NC='\033[0m'        # reset all attributes ("No Color")
RED='\033[1;31m'    # bold red
YELLOW='\033[1;33m' # bold yellow
IWhite='\033[0;97m' # high-intensity white
IGreen='\033[0;92m' # high-intensity green
# Print the usage text and terminate the script.
# Arguments: none ($0 is used as the script name in the text)
# Outputs:   the usage text, written to stderr because this function is only
#            called when the script was invoked incorrectly
# Returns:   never returns; exits the shell with status 1
usage() {
  # An unquoted heredoc expands $0 but, unlike `echo -e`, does not reinterpret
  # backslashes that might occur in the script path.
  cat >&2 << EOF
Usage: $0 -e <experiment_name> -c <config_name> [-f]

Options:
-e name of experiment file in '/experiments' folder
-c name of configuration file in '/configs' folder
-f don't continue from the most recent checkpoint, but start a new version from scratch
EOF
  exit 1
}
# Default: continue from the most recent checkpoint unless -f is given.
from_scratch=0

# Parse command-line arguments.
# The leading ':' in the optstring selects getopts' silent mode, in which the
# script reports option errors itself: a missing option argument sets opt to
# ':' and an unknown option sets it to '?'. In both cases OPTARG holds the
# offending option letter.
while getopts ":e:c:f" opt; do
  case "$opt" in
    e) experiment_name="$OPTARG" ;;
    c) config_name="$OPTARG" ;;
    f) from_scratch=1 ;;
    :)
      printf 'Error: option -%s requires an argument.\n' "$OPTARG" >&2
      usage
      ;;
    \?)
      printf 'Error: unknown option -%s.\n' "$OPTARG" >&2
      usage
      ;;
  esac
done
# Both -e and -c are mandatory: show the usage text (which exits the script)
# if either value is missing or empty.
[[ -n "$experiment_name" && -n "$config_name" ]] || usage
# Choose between resuming the most recent version and starting a new one.
# The outcome is stored in `continue` (a plain variable here, not the shell
# builtin), which is later spliced into the training command line:
# "--new_version" when -f was given, otherwise the empty string.
if [[ "$from_scratch" == 1 ]]; then
  # %b interprets the colour escape sequences; the text is passed as an
  # argument so it can never be mistaken for printf format directives.
  printf '%b\n' "${RED}Warning:${IWhite} Training from scratch, even if the script could continue!${NC}"
  continue="--new_version"
else
  printf '%b\n' "${YELLOW}Continuing training of most recent version if possible${NC}"
  continue=""
fi
# Announce what is about to be submitted.
echo -e "Submitting with experiment ${IWhite}'$experiment_name'${NC} and config ${IWhite}'$config_name'${NC}"

# Both inputs are looked up relative to the current working directory.
config_file="configs/$config_name"
experiment_file="experiments/$experiment_name"

# Stop unless both paths refer to existing regular files.
[[ -f "$experiment_file" && -f "$config_file" ]] || {
  echo "Experiment or Config file does not exist."
  exit 1
}
# Verify that the last byte of a file is a newline; if it is not, print the
# given message and terminate the script with status 1.
#   $1 - path of the file to inspect
#   $2 - message to print when the check fails
require_trailing_newline() {
  # `tail -c1` yields the final byte; `wc -l` prints 1 only if it is a newline.
  if [ "$(tail -c1 "$1" | wc -l)" -ne 1 ]; then
    echo "$2"
    exit 1
  fi
}

# The config file is later copied into the job script and more lines are
# appended after it, so a missing final newline would glue the first appended
# line onto the config's last line.
require_trailing_newline "$experiment_file" "The experiment file does not end with a newline."
require_trailing_newline "$config_file" "The config file does not end with a newline."
# Translate an experiment file into command-line flags for src/train.py.
# Each line is split at its first '=' into a key and a value:
#   key=value  ->  " --key=value"
#   key        ->  " --key"
# Lines with an empty key (blank lines, or lines starting with '=') and lines
# whose first character is '#' are skipped.
# Arguments: $1 - path of the experiment file
# Globals:   args (written) - the flags, each preceded by a single space
# Returns:   non-zero if the file cannot be opened
build_train_args() {
  local key value
  args=""
  # `|| [[ -n "$key" ]]` keeps a final line that lacks a trailing newline:
  # read returns non-zero at end of input but has still filled in the fields.
  while IFS='=' read -r key value || [[ -n "$key" ]]; do
    # Skip blank lines and comments.
    if [[ -z "$key" || "$key" == \#* ]]; then
      continue
    fi
    if [[ -z "$value" ]]; then
      args+=" --$key"
    else
      args+=" --$key=$value"
    fi
  done < "$1"
}

# Build the argument string from the selected experiment file.
build_train_args "$experiment_file"
# Create the job script: a copy of the config file followed by the
# $INSTALL_CMD line and the training command.
current_datetime=$(date "+%Y%m%d_%H%M%S")
mkdir -p jobs # create the jobs folder if it doesn't exist
job_script="jobs/${experiment_name}_${config_name}_${current_datetime}.job"
# The redirection creates (or truncates) the file itself, so no separate
# `touch` is needed.
cat "$config_file" > "$job_script"

# Flags shared by every experiment. The escaped \$NAME references are written
# literally and are therefore expanded only when the job script runs
# (presumably the copied config file defines them; verify against the
# configs). $continue and $experiment_name are expanded right here.
configuration_args="\$COMPILE_ARGS $continue --num_workers=\$NUM_WORKERS --num_devices=\$NUM_DEVICES --experiment=$experiment_name"

# Append the commands. The heredoc delimiter is unquoted, so $args and
# $configuration_args are expanded now, while \$INSTALL_CMD and \$RUN_ENV stay
# literal for the job script to expand.
cat << EOF >> "$job_script"
\$INSTALL_CMD
\$RUN_ENV src/train.py $args $configuration_args
EOF
# Pick the submission method from the config file's path (not its contents):
# the first of "local", "hendrix", "gbar" that occurs in the path wins.
case "$config_file" in
  *local*)
    # Run the job script directly with bash instead of submitting it.
    bash "$job_script"
    ;;
  *hendrix*)
    sbatch "$job_script"
    echo -e "${IGreen}Success!${NC} Experiment submitted with ${IWhite}sbatch${NC}."
    ;;
  *gbar*)
    # bsub reads the job script from standard input.
    bsub < "$job_script"
    echo -e "${IGreen}Success!${NC} Experiment submitted with ${IWhite}bsub${NC}"
    ;;
  *)
    echo "Unknown scheduler configuration."
    exit 1
    ;;
esac