From ca32c4e2bd77544c4ad4aa0bba25f0b812ea1a61 Mon Sep 17 00:00:00 2001
From: GoVivace
Date: Mon, 31 Dec 2018 18:22:34 -0500
Subject: [PATCH] [egs] Add more modern DNN recipe for fisher_callhome_spanish (#2951)

---
 .../s5/conf/mfcc_hires.conf                   |  10 +
 .../s5/conf/online_cmvn.conf                  |   1 +
 .../s5/local/chain/run_tdnn_1g.sh             | 288 ++++++++++++++++++
 .../s5/local/nnet3/run_ivector_common.sh      | 187 ++++++++++++
 .../s5/local/rnnlm/train_rnnlm.sh             | 101 ++++++
 egs/fisher_callhome_spanish/s5/path.sh        |   1 +
 egs/fisher_callhome_spanish/s5/rnnlm          |   1 +
 egs/fisher_callhome_spanish/s5/run.sh         | 154 +++++-----
 8 files changed, 662 insertions(+), 81 deletions(-)
 create mode 100644 egs/fisher_callhome_spanish/s5/conf/mfcc_hires.conf
 create mode 100644 egs/fisher_callhome_spanish/s5/conf/online_cmvn.conf
 create mode 100755 egs/fisher_callhome_spanish/s5/local/chain/run_tdnn_1g.sh
 create mode 100755 egs/fisher_callhome_spanish/s5/local/nnet3/run_ivector_common.sh
 create mode 100755 egs/fisher_callhome_spanish/s5/local/rnnlm/train_rnnlm.sh
 create mode 120000 egs/fisher_callhome_spanish/s5/rnnlm

diff --git a/egs/fisher_callhome_spanish/s5/conf/mfcc_hires.conf b/egs/fisher_callhome_spanish/s5/conf/mfcc_hires.conf
new file mode 100644
index 00000000000..d870ab04c38
--- /dev/null
+++ b/egs/fisher_callhome_spanish/s5/conf/mfcc_hires.conf
@@ -0,0 +1,10 @@
+# config for high-resolution MFCC features, intended for neural network training.
+# Note: we keep all cepstra, so it has the same info as filterbank features,
+# but MFCC is more easily compressible (because less correlated), which is why
+# we prefer this method.
+--use-energy=false       # use average of log energy, not energy.
+--sample-frequency=8000  # Fisher/Callhome Spanish is telephone speech sampled at 8kHz
+--num-mel-bins=40        # similar to Google's setup.
+--num-ceps=40            # there is no dimensionality reduction.
+--low-freq=40            # low cutoff frequency for mel bins
+--high-freq=-200         # high cutoff frequency, relative to the Nyquist of 4000 (=3800)

diff --git a/egs/fisher_callhome_spanish/s5/conf/online_cmvn.conf b/egs/fisher_callhome_spanish/s5/conf/online_cmvn.conf
new file mode 100644
index 00000000000..7748a4a4dd3
--- /dev/null
+++ b/egs/fisher_callhome_spanish/s5/conf/online_cmvn.conf
@@ -0,0 +1 @@
+# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh

diff --git a/egs/fisher_callhome_spanish/s5/local/chain/run_tdnn_1g.sh b/egs/fisher_callhome_spanish/s5/local/chain/run_tdnn_1g.sh
new file mode 100755
index 00000000000..c487f1bd222
--- /dev/null
+++ b/egs/fisher_callhome_spanish/s5/local/chain/run_tdnn_1g.sh
@@ -0,0 +1,288 @@
+#!/bin/bash
+
+# 1g is like 1f, but upgraded to a "resnet-style TDNN-F model", i.e.
+# with bypass (resnet-style) connections, and re-tuned.
+# compute-wer --text --mode=present ark:exp/chain/multipsplice_tdnn/decode_fsp_train_test/scoring_kaldi/test_filt.txt ark,p:-
+# %WER 22.21 [ 8847 / 39831, 1965 ins, 2127 del, 4755 sub ]
+# %SER 56.98 [ 3577 / 6278 ]
+# Scored 6278 sentences, 0 not present in hyp.
+
+# steps/info/chain_dir_info.pl exp/chain/multipsplice_tdnn
+# exp/chain/multipsplice_tdnn: num-iters=296 nj=1..2 num-params=8.2M dim=40+100->2489 combine=-0.170->-0.165 (over 8) xent:train/valid[196,295,final]=(-2.30,-1.93,-1.83/-2.24,-1.96,-1.86) logprob:train/valid[196,295,final]=(-0.208,-0.169,-0.164/-0.189,-0.161,-0.158)
+
+set -e -o pipefail
+
+# First the options that are passed through to run_ivector_common.sh
+# (some of which are also used in this script directly).
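+#
+# For reference, a minimal sketch of how these pass-through options are
+# typically forwarded (illustrative only; the actual invocation appears
+# further down in this script and is authoritative, and the exact flag
+# names accepted by run_ivector_common.sh should be checked there):
+#
+#   local/nnet3/run_ivector_common.sh --stage $stage --nj $nj \
+#     --train-set $train_set --gmm $gmm \
+#     --num-threads-ubm $num_threads_ubm \
+#     --nnet3-affix "$nnet3_affix"
+#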
+stage=0 +nj=30 +train_set=train +test_sets="test dev" +gmm=tri5a # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. +num_threads_ubm=32 +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. + +# Options which are not passed through to run_ivector_common.sh +affix=1g #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +common_egs_dir= +reporting_email= + +# LSTM/chain options +train_stage=-10 +xent_regularize=0.1 +dropout_schedule='0,0@0.20,0.3@0.50,0' + +# training chunk-options +chunk_width=140,100,160 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 + +# training options +srand=0 +remove_egs=true + +#decode options +test_online_decoding=false # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 17 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 18 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 19 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + tdnn_opts="l2-regularize=0.01 dropout-proportion=0.0 dropout-per-dim-continuous=true" + tdnnf_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.66" + linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0" + prefinal_opts="l2-regularize=0.01" + output_opts="l2-regularize=0.005" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $tdnn_opts dim=1024 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=0 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer 
name=tdnnf7 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + linear-component name=prefinal-l dim=192 $linear_opts + + + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1024 small-dim=192 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=1024 small-dim=192 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 20 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/wsj-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.srand $srand \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.frames-per-iter 5000000 \ + --trainer.optimization.num-jobs-initial 1 \ + --trainer.optimization.num-jobs-final=2 \ + --trainer.optimization.initial-effective-lrate 0.0005 \ + --trainer.optimization.final-effective-lrate 0.00005 \ + --trainer.num-chunk-per-minibatch 128,64 \ + --trainer.optimization.momentum 0.0 \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context 0 \ + --egs.chunk-right-context 0 \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --cleanup.remove-egs $remove_egs \ + --use-gpu true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir exp/tri5a_lats_nodup_sp \ + --dir $dir || exit 1; +fi + +if [ $stage -le 21 ]; then + # The reason we are using data/lang_test here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + #LM was trained only on Fisher Spanish train subset. + + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test \ + $tree_dir $tree_dir/graph_fsp_train || exit 1; + +fi + +rnnlmdir=exp/rnnlm_lstm_tdnn_1b +if [ $stage -le 22 ]; then + local/rnnlm/train_rnnlm.sh --dir $rnnlmdir || exit 1; +fi + +if [ $stage -le 23 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l 7." 
+ exit 1 +fi + + +if [ $stage -le 8 ]; then + echo "$0: preparing directory for speed-perturbed data" + utils/data/perturb_data_dir_speed_3way.sh data/${train_set} data/${train_set}_sp +fi + +if [ $stage -le 9 ]; then + echo "$0: creating high-resolution MFCC features" + + # this shows how you can split across multiple file-systems. we'll split the + # MFCC dir across multiple locations. You might want to be careful here, if you + # have multiple copies of Kaldi checked out and run the same recipe, not to let + # them overwrite each other. + mfccdir=data/${train_set}_sp_hires/data + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then + utils/create_split_dir.pl /export/b0{5,6,7,8}/$USER/kaldi-data/mfcc/wsj-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage + fi + + for datadir in ${train_set}_sp ${test_sets}; do + utils/copy_data_dir.sh data/$datadir data/${datadir}_hires + done + + # do volume-perturbation on the training data prior to extracting hires + # features; this helps make trained nnets more invariant to test data volume. + utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires + + for datadir in ${train_set}_sp ${test_sets}; do + steps/make_mfcc.sh --nj $nj --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/${datadir}_hires + steps/compute_cmvn_stats.sh data/${datadir}_hires + utils/fix_data_dir.sh data/${datadir}_hires + done +fi + +if [ $stage -le 10 ]; then + echo "$0: computing a subset of data to train the diagonal UBM." + + mkdir -p exp/nnet3${nnet3_affix}/diag_ubm + temp_data_root=exp/nnet3${nnet3_affix}/diag_ubm + + # train a diagonal UBM using a subset of about a quarter of the data + num_utts_total=$(wc -l $text_dir/ami.txt + cat $dev | cut -d ' ' -f2- > $text_dir/dev.txt +fi + +if [ $stage -le 1 ]; then + cp $wordlist $dir/config/ + n=`cat $dir/config/words.txt | wc -l` + echo " $n" >> $dir/config/words.txt + + # words that are not present in words.txt but are in the training or dev data, will be + # mapped to during training. + echo "" >$dir/config/oov.txt + + cat > $dir/config/data_weights.txt <$dir/config/unigram_probs.txt + + # choose features + rnnlm/choose_features.py --unigram-probs=$dir/config/unigram_probs.txt \ + --use-constant-feature=true \ + --top-word-features 10000 \ + --min-frequency 1.0e-03 \ + --special-words=',,,,[noise],[laughter]' \ + $dir/config/words.txt > $dir/config/features.txt + +lstm_opts="l2-regularize=$comp_l2" +tdnn_opts="l2-regularize=$comp_l2" +output_opts="l2-regularize=$output_l2" + + cat >$dir/config/xconfig <&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 . $KALDI_ROOT/tools/config/common_path.sh export LC_ALL=C +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/dpovey/libs diff --git a/egs/fisher_callhome_spanish/s5/rnnlm b/egs/fisher_callhome_spanish/s5/rnnlm new file mode 120000 index 00000000000..fb754622d5e --- /dev/null +++ b/egs/fisher_callhome_spanish/s5/rnnlm @@ -0,0 +1 @@ +../../wsj/s5/rnnlm \ No newline at end of file diff --git a/egs/fisher_callhome_spanish/s5/run.sh b/egs/fisher_callhome_spanish/s5/run.sh index 57902a98fed..6e2752a7b68 100755 --- a/egs/fisher_callhome_spanish/s5/run.sh +++ b/egs/fisher_callhome_spanish/s5/run.sh @@ -1,20 +1,22 @@ #!/bin/bash # +# Copyright 2018 Nagendra Goel, Saikiran Valluri Apache 2.0 # Copyright 2014 Gaurav Kumar. 
Apache 2.0 # Recipe for Fisher/Callhome-Spanish -# Made to integrate KALDI with JOSHUA for end-to-end ASR and SMT stage=0 +train_stage=-20 +train_sgmm2=false # call the next line with the directory where the Spanish Fisher data is # (the values below are just an example). -sfisher_speech=/veu4/jadrian/data/LDC/LDC2010S01 -sfisher_transcripts=/veu4/jadrian/data/LDC/LDC2010T04 -spanish_lexicon=/veu4/jadrian/data/LDC/LDC96L16 +sfisher_speech=/export/corpora/LDC/LDC2010S01 +sfisher_transcripts=/export/corpora/LDC/LDC2010T04 +spanish_lexicon=/export/corpora/LDC/LDC96L16 split=local/splits/split_fisher -callhome_speech=/veu4/jadrian/data/LDC/LDC96S35 -callhome_transcripts=/veu4/jadrian/data/LDC/LDC96T17 +callhome_speech=/export/corpora/LDC/LDC96S35 +callhome_transcripts=/export/corpora/LDC/LDC96T17 split_callhome=local/splits/split_callhome mfccdir=`pwd`/mfcc @@ -25,7 +27,7 @@ if [ -f path.sh ]; then . ./path.sh; fi set -e -if [ $stage -lt 1 ]; then +if [ $stage -le 1 ]; then local/fsp_data_prep.sh $sfisher_speech $sfisher_transcripts local/callhome_data_prep.sh $callhome_speech $callhome_transcripts @@ -95,7 +97,7 @@ if [ $stage -lt 1 ]; then local/callhome_create_splits.sh $split_callhome fi -if [ $stage -lt 2 ]; then +if [ $stage -le 2 ]; then # Now compute CMVN stats for the train, dev and test subsets steps/compute_cmvn_stats.sh data/dev exp/make_mfcc/dev $mfccdir steps/compute_cmvn_stats.sh data/test exp/make_mfcc/test $mfccdir @@ -124,90 +126,95 @@ if [ $stage -lt 2 ]; then utils/subset_data_dir.sh --speakers data/train 90000 data/train_100k fi +if [ $stage -le 3 ]; then + steps/train_mono.sh --nj 10 --cmd "$train_cmd" \ + data/train_10k_nodup data/lang exp/mono0a -steps/train_mono.sh --nj 10 --cmd "$train_cmd" \ - data/train_10k_nodup data/lang exp/mono0a + steps/align_si.sh --nj 30 --cmd "$train_cmd" \ + data/train_30k data/lang exp/mono0a exp/mono0a_ali || exit 1; -steps/align_si.sh --nj 30 --cmd "$train_cmd" \ - data/train_30k data/lang exp/mono0a exp/mono0a_ali || exit 1; - -steps/train_deltas.sh --cmd "$train_cmd" \ + steps/train_deltas.sh --cmd "$train_cmd" \ 2500 20000 data/train_30k data/lang exp/mono0a_ali exp/tri1 || exit 1; -(utils/mkgraph.sh data/lang_test exp/tri1 exp/tri1/graph - steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ - exp/tri1/graph data/dev exp/tri1/decode_dev)& + (utils/mkgraph.sh data/lang_test exp/tri1 exp/tri1/graph + steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + exp/tri1/graph data/dev exp/tri1/decode_dev)& -steps/align_si.sh --nj 30 --cmd "$train_cmd" \ - data/train_30k data/lang exp/tri1 exp/tri1_ali || exit 1; + steps/align_si.sh --nj 30 --cmd "$train_cmd" \ + data/train_30k data/lang exp/tri1 exp/tri1_ali || exit 1; -steps/train_deltas.sh --cmd "$train_cmd" \ + steps/train_deltas.sh --cmd "$train_cmd" \ 2500 20000 data/train_30k data/lang exp/tri1_ali exp/tri2 || exit 1; -( - utils/mkgraph.sh data/lang_test exp/tri2 exp/tri2/graph || exit 1; - steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ - exp/tri2/graph data/dev exp/tri2/decode_dev || exit 1; -)& - + ( + utils/mkgraph.sh data/lang_test exp/tri2 exp/tri2/graph || exit 1; + steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + exp/tri2/graph data/dev exp/tri2/decode_dev || exit 1; + )& +fi -steps/align_si.sh --nj 30 --cmd "$train_cmd" \ - data/train_100k data/lang exp/tri2 exp/tri2_ali || exit 1; +if [ $stage -le 4 ]; then + steps/align_si.sh --nj 30 --cmd "$train_cmd" \ + data/train_100k data/lang 
exp/tri2 exp/tri2_ali || exit 1; # Train tri3a, which is LDA+MLLT, on 100k data. -steps/train_lda_mllt.sh --cmd "$train_cmd" \ + steps/train_lda_mllt.sh --cmd "$train_cmd" \ --splice-opts "--left-context=3 --right-context=3" \ 3000 40000 data/train_100k data/lang exp/tri2_ali exp/tri3a || exit 1; -( - utils/mkgraph.sh data/lang_test exp/tri3a exp/tri3a/graph || exit 1; - steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ - exp/tri3a/graph data/dev exp/tri3a/decode_dev || exit 1; -)& - + ( + utils/mkgraph.sh data/lang_test exp/tri3a exp/tri3a/graph || exit 1; + steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + exp/tri3a/graph data/dev exp/tri3a/decode_dev || exit 1; + )& +fi +if [ $stage -le 5 ]; then # Next we'll use fMLLR and train with SAT (i.e. on # fMLLR features) -steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \ - data/train_100k data/lang exp/tri3a exp/tri3a_ali || exit 1; + steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \ + data/train_100k data/lang exp/tri3a exp/tri3a_ali || exit 1; -steps/train_sat.sh --cmd "$train_cmd" \ - 4000 60000 data/train_100k data/lang exp/tri3a_ali exp/tri4a || exit 1; + steps/train_sat.sh --cmd "$train_cmd" \ + 4000 60000 data/train_100k data/lang exp/tri3a_ali exp/tri4a || exit 1; -( - utils/mkgraph.sh data/lang_test exp/tri4a exp/tri4a/graph - steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ - exp/tri4a/graph data/dev exp/tri4a/decode_dev + ( + utils/mkgraph.sh data/lang_test exp/tri4a exp/tri4a/graph + steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + exp/tri4a/graph data/dev exp/tri4a/decode_dev )& -steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \ - data/train data/lang exp/tri4a exp/tri4a_ali || exit 1; + steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \ + data/train data/lang exp/tri4a exp/tri4a_ali || exit 1; # Reduce the number of gaussians -steps/train_sat.sh --cmd "$train_cmd" \ - 5000 120000 data/train data/lang exp/tri4a_ali exp/tri5a || exit 1; + steps/train_sat.sh --cmd "$train_cmd" \ + 5000 120000 data/train data/lang exp/tri4a_ali exp/tri5a || exit 1; -( - utils/mkgraph.sh data/lang_test exp/tri5a exp/tri5a/graph - steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ - exp/tri5a/graph data/dev exp/tri5a/decode_dev - steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ - exp/tri5a/graph data/test exp/tri5a/decode_test + ( + utils/mkgraph.sh data/lang_test exp/tri5a exp/tri5a/graph + steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + exp/tri5a/graph data/dev exp/tri5a/decode_dev + steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + exp/tri5a/graph data/test exp/tri5a/decode_test # Decode CALLHOME - steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ - exp/tri5a/graph data/callhome_test exp/tri5a/decode_callhome_test - steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ - exp/tri5a/graph data/callhome_dev exp/tri5a/decode_callhome_dev - steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ - exp/tri5a/graph data/callhome_train exp/tri5a/decode_callhome_train -) & - + steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + exp/tri5a/graph data/callhome_test exp/tri5a/decode_callhome_test + steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + exp/tri5a/graph data/callhome_dev 
exp/tri5a/decode_callhome_dev + steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + exp/tri5a/graph data/callhome_train exp/tri5a/decode_callhome_train + ) & + + + steps/align_fmllr.sh \ + --boost-silence 0.5 --nj 32 --cmd "$train_cmd" \ + data/train data/lang exp/tri5a exp/tri5a_ali +fi -steps/align_fmllr.sh \ - --boost-silence 0.5 --nj 32 --cmd "$train_cmd" \ - data/train data/lang exp/tri5a exp/tri5a_ali +if $train_sgmm2; then steps/train_ubm.sh \ --cmd "$train_cmd" 750 \ @@ -258,22 +265,7 @@ for iter in 1 2 3 4; do done ) & -dnn_cpu_parallel_opts=(--minibatch-size 128 --max-change 10 --num-jobs-nnet 8 --num-threads 16 \ - --parallel-opts "--num-threads 16") -dnn_gpu_parallel_opts=(--minibatch-size 512 --max-change 40 --num-jobs-nnet 4 --num-threads 1 \ - --parallel-opts "--gpu 1") - -steps/nnet2/train_pnorm_ensemble.sh \ - --mix-up 5000 --initial-learning-rate 0.008 --final-learning-rate 0.0008\ - --num-hidden-layers 4 --pnorm-input-dim 2000 --pnorm-output-dim 200\ - --cmd "$train_cmd" \ - "${dnn_gpu_parallel_opts[@]}" \ - --ensemble-size 4 --initial-beta 0.1 --final-beta 5 \ - data/train data/lang exp/tri5a_ali exp/tri6a_dnn +fi -( - steps/nnet2/decode.sh --nj 13 --cmd "$decode_cmd" --num-threads 4 \ - --scoring-opts "--min-lmwt 8 --max-lmwt 16" --transform-dir exp/tri5a/decode_dev exp/tri5a/graph data/dev exp/tri6a_dnn/decode_dev -) & -wait +local/chain/run_tdnn_1g.sh --stage $stage --train-stage $train_stage || exit 1; exit 0;
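
For anyone trying out the new recipe, the lines below illustrate how the chain stage added by this patch can be driven on its own. The option names are the ones defined at the top of local/chain/run_tdnn_1g.sh (which parses them via utils/parse_options.sh); the decode-directory path is only an example taken from the results comment at the top of that script and may differ depending on how the experiment directory is named in your run.

    # resume the chain TDNN-F recipe at the neural-net training stage (stage 20),
    # restarting training itself from its default start point
    local/chain/run_tdnn_1g.sh --stage 20 --train-stage -10

    # after decoding, summarize the best WER of a finished decode directory, e.g.
    grep WER exp/chain/multipsplice_tdnn/decode_fsp_train_test/wer_* | utils/best_wer.sh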