This repository was archived by the owner on Oct 26, 2022. It is now read-only.
Commit b08530a (0 parents): 71 changed files with 10,229 additions and 0 deletions.
.gitignore
@@ -0,0 +1,3 @@
*~
build
test/tst2012.en
CMakeLists.txt
@@ -0,0 +1,35 @@
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree. An additional grant
# of patent rights can be found in the PATENTS file in the same directory.

CMAKE_MINIMUM_REQUIRED(VERSION 2.6 FATAL_ERROR)
CMAKE_POLICY(VERSION 2.6)

FIND_PACKAGE(Torch REQUIRED)
FIND_PACKAGE(OpenMP)

SET(CMAKE_CXX_FLAGS "-std=c++11 -Ofast")
IF(OpenMP_FOUND)
  SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
  SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
ENDIF()

# C++ library
IF(APPLE)
  SET(CMAKE_SHARED_LIBRARY_SUFFIX ".so")
ENDIF(APPLE)
FILE(GLOB CPPSRC fairseq/clib/*.cpp)
ADD_LIBRARY(fairseq_clib SHARED ${CPPSRC})
INSTALL(TARGETS fairseq_clib DESTINATION "${ROCKS_LIBDIR}")

# Lua library
INSTALL(DIRECTORY "fairseq" DESTINATION "${ROCKS_LUADIR}" FILES_MATCHING PATTERN "*.lua")

# Scripts and main executable
FOREACH(SCRIPT preprocess train tofloat generate generate-lines score optimize-fconv help)
  INSTALL(FILES "${SCRIPT}.lua" DESTINATION "${ROCKS_LUADIR}/fairseq/scripts")
ENDFOREACH(SCRIPT)
INSTALL(FILES "run.lua" DESTINATION "${ROCKS_BINDIR}" RENAME "fairseq")
LICENSE
@@ -0,0 +1,30 @@
BSD License

For fairseq software

Copyright (c) 2017-present, Facebook, Inc. All rights reserved.

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.

* Neither the name Facebook nor the names of its contributors may be used to
endorse or promote products derived from this software without specific
prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PATENTS
@@ -0,0 +1,33 @@
Additional Grant of Patent Rights Version 2

"Software" means the fairseq software distributed by Facebook, Inc.

Facebook, Inc. ("Facebook") hereby grants to each recipient of the Software
("you") a perpetual, worldwide, royalty-free, non-exclusive, irrevocable
(subject to the termination provision below) license under any Necessary
Claims, to make, have made, use, sell, offer to sell, import, and otherwise
transfer the Software. For avoidance of doubt, no license is granted under
Facebook’s rights in any patent claims that are infringed by (i) modifications
to the Software made by you or any third party or (ii) the Software in
combination with any software or other technology.

The license granted hereunder will terminate, automatically and without notice,
if you (or any of your subsidiaries, corporate affiliates or agents) initiate
directly or indirectly, or take a direct financial interest in, any Patent
Assertion: (i) against Facebook or any of its subsidiaries or corporate
affiliates, (ii) against any party if such Patent Assertion arises in whole or
in part from any software, technology, product or service of Facebook or any of
its subsidiaries or corporate affiliates, or (iii) against any party relating
to the Software. Notwithstanding the foregoing, if Facebook or any of its
subsidiaries or corporate affiliates files a lawsuit alleging patent
infringement against you in the first instance, and you respond by filing a
patent infringement counterclaim in that lawsuit against that party that is
unrelated to the Software, the license granted hereunder will not terminate
under section (i) of this paragraph due to such counterclaim.

A "Necessary Claim" is a claim of a patent owned by Facebook that is
necessarily infringed by the Software standing alone.

A "Patent Assertion" is any lawsuit or other action alleging direct, indirect,
or contributory infringement or inducement to infringe any patent, including a
cross-claim or counterclaim.
README.md
@@ -0,0 +1,194 @@
# Introduction
This is fairseq, a sequence-to-sequence learning toolkit for [Torch](http://torch.ch/) from Facebook AI Research tailored to Neural Machine Translation (NMT).
It implements the convolutional NMT models proposed in [Convolutional Sequence to Sequence Learning](https://fb.me/convolutional-s2s.pdf) and [A Convolutional Encoder Model for Neural Machine Translation](https://arxiv.org/abs/1611.02344) as well as a standard LSTM-based model.
It features multi-GPU training on a single machine as well as fast beam search generation on both CPU and GPU.
We provide pre-trained models for English to French, English to German and English to Romanian translation.

# Requirements and Installation
* A computer running macOS or Linux
* For training new models, you'll also need an NVIDIA GPU and [NCCL](https://github.com/NVIDIA/nccl)
* A [Torch installation](http://torch.ch/docs/getting-started.html). For maximum speed, we recommend using LuaJIT and [Intel MKL](https://software.intel.com/en-us/intel-mkl).
* A recent version of [nn](https://github.com/torch/nn). The minimum required version is from May 5th, 2017. A simple `luarocks install nn` is sufficient to update your locally installed version.

Install fairseq by cloning the GitHub repository and running
```
luarocks make rocks/fairseq-scm-1.rockspec
```
LuaRocks will fetch and build any additional dependencies that may be missing.
To install the CPU-only version (which is only useful for translating new data with an existing model), run
```
luarocks make rocks/fairseq-cpu-scm-1.rockspec
```

The LuaRocks installation provides a command-line tool that includes the following functionality:
* `fairseq preprocess`: Data pre-processing: build vocabularies and binarize training data
* `fairseq train`: Train a new model on one or multiple GPUs
* `fairseq generate`: Translate pre-processed data with a trained model
* `fairseq generate-lines`: Translate raw text with a trained model
* `fairseq score`: BLEU scoring of generated translations against reference translations
* `fairseq tofloat`: Convert a trained model to a CPU model
* `fairseq optimize-fconv`: Optimize a fully convolutional model for generation. This can also be achieved by passing the `-fconvfast` flag to the generation scripts.

# Quick Start

## Evaluating Pre-trained Models
First, download a pre-trained model along with its vocabularies:
```
$ curl https://s3.amazonaws.com/fairseq/models/wmt14.en-fr.fconv-cuda.tar.bz2 | tar xvjf -
```

This will unpack vocabulary files and a serialized model for English to French translation to `wmt14.en-fr.fconv-cuda/`.

Alternatively, use a CPU-based model:
```
$ curl https://s3.amazonaws.com/fairseq/models/wmt14.en-fr.fconv-float.tar.bz2 | tar xvjf -
```

Let's use `fairseq generate-lines` to translate some text.
This model uses a [Byte Pair Encoding (BPE) vocabulary](https://arxiv.org/abs/1508.07909), so we'll have to apply the encoding to the source text first.
This can be done with [apply_bpe.py](https://github.com/rsennrich/subword-nmt/blob/master/apply_bpe.py) using the `bpecodes` file within `wmt14.en-fr.fconv-cuda/`.
`@@` is used as a continuation marker, and the original text can be easily recovered with e.g. `sed s/@@ //g`.
Prior to BPE, the input text needs to be tokenized using `tokenizer.perl` from [mosesdecoder](https://github.com/moses-smt/mosesdecoder).
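For example, a minimal preparation pipeline might look like this (a sketch; it assumes mosesdecoder and [subword-nmt](https://github.com/rsennrich/subword-nmt) are checked out in the current directory, so adjust the paths to your setup):
```
# Tokenize a raw English sentence, then apply the model's BPE codes
$ echo "Why is it rare to discover new marine mammal species?" \
    | perl mosesdecoder/scripts/tokenizer/tokenizer.perl -l en \
    | python subword-nmt/apply_bpe.py -c wmt14.en-fr.fconv-cuda/bpecodes
Why is it rare to discover new marine mam@@ mal species ?
```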
Here, we use a beam size of 5:
```
$ fairseq generate-lines -path wmt14.en-fr.fconv-cuda/model.th7 -sourcedict wmt14.en-fr.fconv-cuda/dict.en.th7 \
  -targetdict wmt14.en-fr.fconv-cuda/dict.fr.th7 -beam 5
| [target] Dictionary: 44666 types
| [source] Dictionary: 44409 types
> Why is it rare to discover new marine mam@@ mal species ?
S Why is it rare to discover new marine mam@@ mal species ?
O Why is it rare to discover new marine mam@@ mal species ?
H -0.068684287369251 Pourquoi est-il rare de découvrir de nouvelles espèces de mammifères marins ?
A 1 1 4 4 6 6 7 11 9 9 9 12 13
```

This generation script produces four types of output: a line prefixed with *S* shows the supplied source sentence after applying the vocabulary; *O* is a copy of the original source sentence; *H* is the hypothesis together with its average log-likelihood; and *A* gives, for each word in the hypothesis (including the end-of-sentence marker, which is omitted from the text), the source position receiving maximal attention. In the example above, the tenth entry of *A* maps *mammifères* to source position 9, the BPE fragment *mam@@*.

Check [below](#pre-trained-models) for a full list of available pre-trained models.

## Training a New Model

### Data Pre-processing
The fairseq source distribution contains an example pre-processing script for the IWSLT14 German-English corpus.
Pre-process and binarize the data as follows:
```
$ cd data/
$ bash prepare-iwslt14.sh
$ cd ..
$ TEXT=data/iwslt14.tokenized.de-en
$ fairseq preprocess -sourcelang de -targetlang en \
  -trainpref $TEXT/train -validpref $TEXT/valid -testpref $TEXT/test \
  -thresholdsrc 3 -thresholdtgt 3 -destdir data-bin/iwslt14.tokenized.de-en
```
This will write binarized data that can be used for model training to `data-bin/iwslt14.tokenized.de-en`.
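As a quick sanity check, the destination directory should now contain the two dictionaries that the generation examples below reference (the exact names of the other binarized files are not spelled out here):
```
$ ls data-bin/iwslt14.tokenized.de-en
# expect dict.de.th7 and dict.en.th7 alongside the binarized train/valid/test data
```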

### Training
Use `fairseq train` to train a new model.
Here are a few example settings that work well for the IWSLT14 dataset:
```
# Standard bi-directional LSTM model
$ mkdir -p trainings/blstm
$ fairseq train -sourcelang de -targetlang en -datadir data-bin/iwslt14.tokenized.de-en \
  -model blstm -nhid 512 -dropout 0.2 -dropout_hid 0 -optim adam -lr 0.0003125 -savedir trainings/blstm

# Fully convolutional sequence-to-sequence model
$ mkdir -p trainings/fconv
$ fairseq train -sourcelang de -targetlang en -datadir data-bin/iwslt14.tokenized.de-en \
  -model fconv -nenclayer 4 -nlayer 3 -dropout 0.2 -optim nag -lr 0.25 -clip 0.1 \
  -momentum 0.99 -timeavg -bptt 0 -savedir trainings/fconv

# Convolutional encoder, LSTM decoder
$ mkdir -p trainings/convenc
$ fairseq train -sourcelang de -targetlang en -datadir data-bin/iwslt14.tokenized.de-en \
  -model conv -nenclayer 6 -dropout 0.2 -dropout_hid 0 -savedir trainings/convenc
```

By default, `fairseq train` will use all available GPUs on your machine.
Use the [CUDA_VISIBLE_DEVICES](http://acceleware.com/blog/cudavisibledevices-masking-gpus) environment variable to select specific GPUs or `-ngpus` to change the number of GPU devices that will be used.

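For example, a short sketch reusing the fconv settings above (the device indices are illustrative):
```
# Restrict training to the first two GPUs
$ CUDA_VISIBLE_DEVICES=0,1 fairseq train -sourcelang de -targetlang en \
  -datadir data-bin/iwslt14.tokenized.de-en -model fconv -savedir trainings/fconv
# Alternatively, cap the number of devices fairseq will use
$ fairseq train -ngpus 2 -sourcelang de -targetlang en \
  -datadir data-bin/iwslt14.tokenized.de-en -model fconv -savedir trainings/fconv
```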
### Generation
Once your model is trained, you can translate with it using `fairseq generate` (for binarized data) or `fairseq generate-lines` (for text).
Here, we'll do it for a fully convolutional model:
```
# Optional: optimize for generation speed
$ fairseq optimize-fconv -input_model trainings/fconv/model_best.th7 -output_model trainings/fconv/model_best_opt.th7

# Translate some text
$ DATA=data-bin/iwslt14.tokenized.de-en
$ fairseq generate-lines -sourcedict $DATA/dict.de.th7 -targetdict $DATA/dict.en.th7 \
  -path trainings/fconv/model_best_opt.th7 -beam 10 -nbest 2
| [target] Dictionary: 24738 types
| [source] Dictionary: 35474 types
> eine sprache ist ausdruck des menschlichen geistes .
S eine sprache ist ausdruck des menschlichen geistes .
O eine sprache ist ausdruck des menschlichen geistes .
H -0.23804219067097 a language is expression of human mind .
A 2 2 3 4 5 6 7 8 9
H -0.23861141502857 a language is expression of the human mind .
A 2 2 3 4 5 7 6 7 9 9
```

### CPU Generation
Use `fairseq tofloat` to convert a trained model to use CPU-only operations (this has to be done on a GPU machine):
```
# Optional: optimize for generation speed
$ fairseq optimize-fconv -input_model trainings/fconv/model_best.th7 -output_model trainings/fconv/model_best_opt.th7

# Convert to float
$ fairseq tofloat -input_model trainings/fconv/model_best_opt.th7 \
  -output_model trainings/fconv/model_best_opt-float.th7

# Translate some text
$ fairseq generate-lines -sourcedict $DATA/dict.de.th7 -targetdict $DATA/dict.en.th7 \
  -path trainings/fconv/model_best_opt-float.th7 -beam 10 -nbest 2
> eine sprache ist ausdruck des menschlichen geistes .
S eine sprache ist ausdruck des menschlichen geistes .
O eine sprache ist ausdruck des menschlichen geistes .
H -0.2380430996418 a language is expression of human mind .
A 2 2 3 4 5 6 7 8 9
H -0.23861189186573 a language is expression of the human mind .
A 2 2 3 4 5 7 6 7 9 9
```

# Pre-trained Models

We provide the following pre-trained fully convolutional sequence-to-sequence models:

* [wmt14.en-fr.fconv-cuda.tar.bz2](https://s3.amazonaws.com/fairseq/models/wmt14.en-fr.fconv-cuda.tar.bz2): Pre-trained model for [WMT14 English-French](http://statmt.org/wmt14/translation-task.html#Download) including vocabularies
* [wmt14.en-fr.fconv-float.tar.bz2](https://s3.amazonaws.com/fairseq/models/wmt14.en-fr.fconv-float.tar.bz2): CPU version of the above
* [wmt14.en-de.fconv-cuda.tar.bz2](https://s3.amazonaws.com/fairseq/models/wmt14.en-de.fconv-cuda.tar.bz2): Pre-trained model for [WMT14 English-German](https://nlp.stanford.edu/projects/nmt) including vocabularies
* [wmt14.en-de.fconv-float.tar.bz2](https://s3.amazonaws.com/fairseq/models/wmt14.en-de.fconv-float.tar.bz2): CPU version of the above
* [wmt16.en-ro.fconv-cuda.tar.bz2](https://s3.amazonaws.com/fairseq/models/wmt16.en-ro.fconv-cuda.tar.bz2): Pre-trained model for WMT16 English-Romanian including vocabularies. This model was trained on the [original WMT bitext](http://statmt.org/wmt16/translation-task.html#Download) as well as [back-translated data](http://data.statmt.org/rsennrich/wmt16_backtranslations/en-ro) provided by Rico Sennrich.
* [wmt16.en-ro.fconv-float.tar.bz2](https://s3.amazonaws.com/fairseq/models/wmt16.en-ro.fconv-float.tar.bz2): CPU version of the above

In addition, we provide pre-processed and binarized test sets for the models above:

* [wmt14.en-fr.newstest2014.tar.bz2](https://s3.amazonaws.com/fairseq/data/wmt14.en-fr.newstest2014.tar.bz2): newstest2014 test set for WMT14 English-French
* [wmt14.en-fr.ntst1213.tar.bz2](https://s3.amazonaws.com/fairseq/data/wmt14.en-fr.ntst1213.tar.bz2): newstest2012 and newstest2013 test sets for WMT14 English-French
* [wmt14.en-de.newstest2014.tar.bz2](https://s3.amazonaws.com/fairseq/data/wmt14.en-de.newstest2014.tar.bz2): newstest2014 test set for WMT14 English-German
* [wmt16.en-ro.newstest2016.tar.bz2](https://s3.amazonaws.com/fairseq/data/wmt16.en-ro.newstest2016.tar.bz2): newstest2016 test set for WMT16 English-Romanian

Generation with the binarized test sets can be run in batch mode as follows, e.g. for English-French on a GTX-1080ti:
```
$ curl https://s3.amazonaws.com/fairseq/data/wmt14.en-fr.newstest2014.tar.bz2 | tar xvjf -
$ fairseq generate -sourcelang en -targetlang fr -datadir data-bin/wmt14.en-fr -dataset newstest2014 \
  -path wmt14.en-fr.fconv-cuda/model.th7 -beam 5 -batchsize 128 | tee /tmp/gen.out
...
| Translated 3003 sentences (95451 tokens) in 136.3s (700.49 tokens/s)
| Timings: setup 0.1s (0.1%), encoder 1.9s (1.4%), decoder 108.9s (79.9%), search_results 0.0s (0.0%), search_prune 12.5s (9.2%)
| BLEU4 = 43.43, 68.2/49.2/37.4/28.8 (BP=0.996, ratio=1.004, sys_len=92087, ref_len=92448)

# The score above is computed on BPE tokens; for word-level BLEU, strip the continuation markers first:
$ grep ^H /tmp/gen.out | cut -f3- | sed 's/@@ //g' > /tmp/gen.out.sys
$ grep ^T /tmp/gen.out | cut -f2- | sed 's/@@ //g' > /tmp/gen.out.ref
$ fairseq score -sys /tmp/gen.out.sys -ref /tmp/gen.out.ref
BLEU4 = 40.55, 67.6/46.5/34.0/25.3 (BP=1.000, ratio=0.998, sys_len=81369, ref_len=81194)
```

# License
fairseq is BSD-licensed.
The license applies to the pre-trained models as well.
We also provide an additional patent grant.
data/prepare-iwslt14.sh
@@ -0,0 +1,93 @@
#!/usr/bin/env bash
#
# Adapted from https://github.com/facebookresearch/MIXER/blob/master/prepareData.sh

echo 'Cloning Moses github repository (for tokenization scripts)...'
git clone https://github.com/moses-smt/mosesdecoder.git

SCRIPTS=mosesdecoder/scripts
TOKENIZER=$SCRIPTS/tokenizer/tokenizer.perl
LC=$SCRIPTS/tokenizer/lowercase.perl
CLEAN=$SCRIPTS/training/clean-corpus-n.perl

URL="https://wit3.fbk.eu/archive/2014-01/texts/de/en/de-en.tgz"
GZ=de-en.tgz

if [ ! -d "$SCRIPTS" ]; then
    echo "Please set SCRIPTS variable correctly to point to Moses scripts."
    exit 1
fi

src=de
tgt=en
lang=de-en
prep=iwslt14.tokenized.de-en
tmp=$prep/tmp
orig=orig

mkdir -p $orig $tmp $prep

echo "Downloading data from ${URL}..."
cd $orig
wget "$URL"

if [ -f $GZ ]; then
    echo "Data successfully downloaded."
else
    echo "Data not successfully downloaded."
    exit 1
fi

tar zxvf $GZ
cd ..

echo "pre-processing train data..."
for l in $src $tgt; do
    f=train.tags.$lang.$l
    tok=train.tags.$lang.tok.$l

    cat $orig/$lang/$f | \
        grep -v '<url>' | \
        grep -v '<talkid>' | \
        grep -v '<keywords>' | \
        sed -e 's/<title>//g' | \
        sed -e 's/<\/title>//g' | \
        sed -e 's/<description>//g' | \
        sed -e 's/<\/description>//g' | \
        perl $TOKENIZER -threads 8 -l $l > $tmp/$tok
    echo ""
done
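# clean-corpus-n.perl drops sentence pairs with a source/target length
# ratio above 1.5, or with fewer than 1 or more than 175 tokens per side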
perl $CLEAN -ratio 1.5 $tmp/train.tags.$lang.tok $src $tgt $tmp/train.tags.$lang.clean 1 175
for l in $src $tgt; do
    perl $LC < $tmp/train.tags.$lang.clean.$l > $tmp/train.tags.$lang.$l
done

echo "pre-processing valid/test data..."
for l in $src $tgt; do
    for o in `ls $orig/$lang/IWSLT14.TED*.$l.xml`; do
        fname=${o##*/}
        f=$tmp/${fname%.*}
        echo $o $f
        grep '<seg id' $o | \
            sed -e 's/<seg id="[0-9]*">\s*//g' | \
            sed -e 's/\s*<\/seg>\s*//g' | \
            sed -e "s/\’/\'/g" | \
            perl $TOKENIZER -threads 8 -l $l | \
            perl $LC > $f
        echo ""
    done
done

echo "creating train, valid, test..."
for l in $src $tgt; do
    awk '{if (NR%23 == 0) print $0; }' $tmp/train.tags.de-en.$l > $prep/valid.$l
    awk '{if (NR%23 != 0) print $0; }' $tmp/train.tags.de-en.$l > $prep/train.$l

    cat $tmp/IWSLT14.TED.dev2010.de-en.$l \
        $tmp/IWSLT14.TEDX.dev2012.de-en.$l \
        $tmp/IWSLT14.TED.tst2010.de-en.$l \
        $tmp/IWSLT14.TED.tst2011.de-en.$l \
        $tmp/IWSLT14.TED.tst2012.de-en.$l \
        > $prep/test.$l
done