-
Notifications
You must be signed in to change notification settings - Fork 93
/
train_langdetect.sh
executable file
·110 lines (100 loc) · 2.6 KB
/
train_langdetect.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#!/bin/bash
#
# Train the language-detection model on a prepared dataset directory.
#
# Usage: train_langdetect.sh data_dir
#   data_dir must contain train.txt and valid.txt produced by lang_dataset.sh.
#
# Environment:
#   DATASET - optional leaf directory name under data_dir/models for the
#     trained/exported model. Defaults to "langdetect". (BUG FIX: the original
#     script referenced ${DATASET} without ever setting it, so models were
#     silently written to "$data_dir/models/" with an empty leaf name.)
#
# Exit codes: 1 usage / preprocessing failure, 2 missing input files.

set -euo pipefail

if [[ ! -d "${1:-}" ]]; then
  echo "Usage: train_langdetect.sh data_dir" >&2
  exit 1
fi
if [[ ! -f "$1/train.txt" ]] || [[ ! -f "$1/valid.txt" ]]; then
  echo "data_dir must contain train.txt and valid.txt from lang_dataset.sh" >&2
  exit 2
fi

DATADIR=$1
DATASET=${DATASET:-langdetect}
OUTPUT=$DATADIR/models/${DATASET}
EXPORT_DIR=$DATADIR/models/${DATASET}
INPUT_TRAIN_FILE=$DATADIR/train.txt
INPUT_TEST_FILE=$DATADIR/valid.txt
TRAIN_FILE=$DATADIR/train.txt.tfrecords-1-of-1
TEST_FILE=$DATADIR/valid.txt.tfrecords-1-of-1

#######################################
# Ensure a tfrecords file exists, generating it from the raw text input
# via process_input.py when missing. Exits the script on failure.
# Arguments:
#   $1 - expected tfrecords output path
#   $2 - raw input text file to convert
#   $3 - human-readable label for log messages (e.g. "training")
#######################################
ensure_tfrecords() {
  local tfrecords=$1 input=$2 label=$3
  echo "Looking for $tfrecords"
  if [[ -f "$tfrecords" ]]; then
    echo "Found"
    return 0
  fi
  echo "Not Found $tfrecords"
  echo "Processing $label dataset file"
  python process_input.py --facebook_input="$input" --ngrams=2,3,4
  if [[ -f "$tfrecords" ]]; then
    echo "$tfrecords created"
  else
    echo "Failed to create $tfrecords" >&2
    exit 1
  fi
}

ensure_tfrecords "$TRAIN_FILE" "$INPUT_TRAIN_FILE" "training"
ensure_tfrecords "$TEST_FILE" "$INPUT_TEST_FILE" "test"

LABELS=$DATADIR/train.txt.labels
VOCAB=$DATADIR/train.txt.vocab
# Vocabulary size = line count of the vocab file, with all whitespace stripped
# so it is safe to pass as a flag value.
VOCAB_SIZE=$(wc -l < "$VOCAB" | tr -d '[:space:]')
echo "$VOCAB"
echo "$VOCAB_SIZE"
echo "$LABELS"

# Single-process (non-Horovod) invocation, kept for debugging:
# python classifier.py \
#   --train_records=$TRAIN_FILE \
#   --eval_records=$TEST_FILE \
#   --label_file=$LABELS \
#   --vocab_file=$VOCAB \
#   --vocab_size=$VOCAB_SIZE \
#   --model_dir=$OUTPUT \
#   --export_dir=$EXPORT_DIR \
#   --embedding_dimension=16 \
#   --num_ngram_buckets=100000 \
#   --ngram_embedding_dimension=16 \
#   --learning_rate=0.01 \
#   --batch_size=128 \
#   --train_steps=20000 \
#   --eval_steps=1000 \
#   --num_epochs=1 \
#   --num_threads=1 \
#   --use_ngrams \
#   --nolog_device_placement \
#   --fast \
#   --debug

# Distributed training: 2 MPI ranks with Horovod enabled in classifier.py.
mpirun -np 2 python classifier.py \
  --train_records="$TRAIN_FILE" \
  --eval_records="$TEST_FILE" \
  --label_file="$LABELS" \
  --vocab_file="$VOCAB" \
  --vocab_size="$VOCAB_SIZE" \
  --model_dir="$OUTPUT" \
  --export_dir="$EXPORT_DIR" \
  --embedding_dimension=16 \
  --num_ngram_buckets=100000 \
  --ngram_embedding_dimension=16 \
  --learning_rate=0.01 \
  --batch_size=128 \
  --train_steps=20000 \
  --eval_steps=1000 \
  --num_epochs=1 \
  --num_threads=1 \
  --use_ngrams \
  --nolog_device_placement \
  --fast \
  --horovod \
  --debug