From c30c889f29b9e10b98c3aaafa0a18833f25d8694 Mon Sep 17 00:00:00 2001 From: Marek Wawrzos Date: Fri, 31 Jan 2020 23:08:43 +0100 Subject: [PATCH] Adding RNN-Transducer - RNN speech recognition benchmark (#329) * RNN-Transducer from https://github.com/ryanleary/mlperf-rnnt-ref * fixes after moving eval function * Fix spelling of Speech Benchmark directory * Fix inference script for greedy decode * use 80 input features * dropout on each layer and no batch normalization * fix inference script after preprocessing rewrite * further fixes to inference.py after preprocessing rewrite --- rnn_speech_recognition/pytorch/Dockerfile | 46 ++ rnn_speech_recognition/pytorch/LICENSE | 204 ++++++++ rnn_speech_recognition/pytorch/NOTICE | 5 + rnn_speech_recognition/pytorch/README.md | 44 ++ .../pytorch/configs/rnnt.toml | 77 +++ .../pytorch/configs/rnnt_bn.toml | 78 +++ .../pytorch/configs/rnnt_ln.toml | 78 +++ rnn_speech_recognition/pytorch/dataset.py | 266 ++++++++++ rnn_speech_recognition/pytorch/decoders.py | 136 +++++ rnn_speech_recognition/pytorch/helpers.py | 212 ++++++++ rnn_speech_recognition/pytorch/inference.py | 247 +++++++++ .../pytorch/inference_benchmark.py | 246 +++++++++ rnn_speech_recognition/pytorch/loss.py | 104 ++++ rnn_speech_recognition/pytorch/metrics.py | 67 +++ rnn_speech_recognition/pytorch/model.py | 452 +++++++++++++++++ rnn_speech_recognition/pytorch/model_rnnt.py | 289 +++++++++++ rnn_speech_recognition/pytorch/multiproc.py | 190 +++++++ rnn_speech_recognition/pytorch/optimizers.py | 223 ++++++++ .../pytorch/parts/features.py | 349 +++++++++++++ .../pytorch/parts/manifest.py | 170 +++++++ .../pytorch/parts/perturb.py | 111 ++++ .../pytorch/parts/segment.py | 170 +++++++ .../pytorch/parts/text/LICENSE | 19 + .../pytorch/parts/text/__init__.py | 12 + .../pytorch/parts/text/cleaners.py | 107 ++++ .../pytorch/parts/text/numbers.py | 99 ++++ .../pytorch/parts/text/symbols.py | 19 + .../pytorch/preprocessing.py | 123 +++++ .../pytorch/requirements.txt | 10 + rnn_speech_recognition/pytorch/rnn.py | 402 +++++++++++++++ .../pytorch/scripts/docker/build.sh | 3 + .../pytorch/scripts/docker/launch.sh | 32 ++ .../pytorch/scripts/download_librispeech.sh | 28 + .../pytorch/scripts/evaluation.sh | 92 ++++ .../pytorch/scripts/inference.sh | 104 ++++ .../pytorch/scripts/inference_benchmark.sh | 84 +++ .../pytorch/scripts/preprocess_librispeech.sh | 51 ++ .../pytorch/scripts/train.sh | 113 +++++ .../pytorch/scripts/train_benchmark.sh | 130 +++++ rnn_speech_recognition/pytorch/tb_logger.py | 52 ++ rnn_speech_recognition/pytorch/train.py | 477 ++++++++++++++++++ .../pytorch/utils/__init__.py | 0 .../pytorch/utils/convert_librispeech.py | 81 +++ .../pytorch/utils/download_librispeech.py | 72 +++ .../pytorch/utils/download_utils.py | 68 +++ .../pytorch/utils/inference_librispeech.csv | 5 + .../pytorch/utils/librispeech.csv | 8 + .../pytorch/utils/preprocessing_utils.py | 76 +++ 48 files changed, 6031 insertions(+) create mode 100755 rnn_speech_recognition/pytorch/Dockerfile create mode 100644 rnn_speech_recognition/pytorch/LICENSE create mode 100644 rnn_speech_recognition/pytorch/NOTICE create mode 100644 rnn_speech_recognition/pytorch/README.md create mode 100644 rnn_speech_recognition/pytorch/configs/rnnt.toml create mode 100644 rnn_speech_recognition/pytorch/configs/rnnt_bn.toml create mode 100644 rnn_speech_recognition/pytorch/configs/rnnt_ln.toml create mode 100644 rnn_speech_recognition/pytorch/dataset.py create mode 100644 rnn_speech_recognition/pytorch/decoders.py create mode 100644 
rnn_speech_recognition/pytorch/helpers.py create mode 100644 rnn_speech_recognition/pytorch/inference.py create mode 100644 rnn_speech_recognition/pytorch/inference_benchmark.py create mode 100644 rnn_speech_recognition/pytorch/loss.py create mode 100644 rnn_speech_recognition/pytorch/metrics.py create mode 100644 rnn_speech_recognition/pytorch/model.py create mode 100644 rnn_speech_recognition/pytorch/model_rnnt.py create mode 100644 rnn_speech_recognition/pytorch/multiproc.py create mode 100644 rnn_speech_recognition/pytorch/optimizers.py create mode 100644 rnn_speech_recognition/pytorch/parts/features.py create mode 100644 rnn_speech_recognition/pytorch/parts/manifest.py create mode 100644 rnn_speech_recognition/pytorch/parts/perturb.py create mode 100644 rnn_speech_recognition/pytorch/parts/segment.py create mode 100644 rnn_speech_recognition/pytorch/parts/text/LICENSE create mode 100644 rnn_speech_recognition/pytorch/parts/text/__init__.py create mode 100644 rnn_speech_recognition/pytorch/parts/text/cleaners.py create mode 100644 rnn_speech_recognition/pytorch/parts/text/numbers.py create mode 100644 rnn_speech_recognition/pytorch/parts/text/symbols.py create mode 100644 rnn_speech_recognition/pytorch/preprocessing.py create mode 100755 rnn_speech_recognition/pytorch/requirements.txt create mode 100644 rnn_speech_recognition/pytorch/rnn.py create mode 100755 rnn_speech_recognition/pytorch/scripts/docker/build.sh create mode 100755 rnn_speech_recognition/pytorch/scripts/docker/launch.sh create mode 100755 rnn_speech_recognition/pytorch/scripts/download_librispeech.sh create mode 100755 rnn_speech_recognition/pytorch/scripts/evaluation.sh create mode 100755 rnn_speech_recognition/pytorch/scripts/inference.sh create mode 100755 rnn_speech_recognition/pytorch/scripts/inference_benchmark.sh create mode 100755 rnn_speech_recognition/pytorch/scripts/preprocess_librispeech.sh create mode 100755 rnn_speech_recognition/pytorch/scripts/train.sh create mode 100755 rnn_speech_recognition/pytorch/scripts/train_benchmark.sh create mode 100644 rnn_speech_recognition/pytorch/tb_logger.py create mode 100644 rnn_speech_recognition/pytorch/train.py create mode 100644 rnn_speech_recognition/pytorch/utils/__init__.py create mode 100644 rnn_speech_recognition/pytorch/utils/convert_librispeech.py create mode 100644 rnn_speech_recognition/pytorch/utils/download_librispeech.py create mode 100644 rnn_speech_recognition/pytorch/utils/download_utils.py create mode 100644 rnn_speech_recognition/pytorch/utils/inference_librispeech.csv create mode 100644 rnn_speech_recognition/pytorch/utils/librispeech.csv create mode 100644 rnn_speech_recognition/pytorch/utils/preprocessing_utils.py diff --git a/rnn_speech_recognition/pytorch/Dockerfile b/rnn_speech_recognition/pytorch/Dockerfile new file mode 100755 index 000000000..1cb52bf62 --- /dev/null +++ b/rnn_speech_recognition/pytorch/Dockerfile @@ -0,0 +1,46 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:19.09-py3 +FROM ${FROM_IMAGE_NAME} + + +RUN apt-get update && apt-get install -y libsndfile1 && apt-get install -y sox && rm -rf /var/lib/apt/lists/* + +RUN COMMIT_SHA=c6d12f9e1562833c2b4e7ad84cb22aa4ba31d18c && \ + git clone https://github.com/HawkAaron/warp-transducer deps/warp-transducer && \ + cd deps/warp-transducer && \ + git checkout $COMMIT_SHA && \ + mkdir build && \ + cd build && \ + cmake .. && \ + make VERBOSE=1 && \ + export CUDA_HOME="/usr/local/cuda" && \ + export WARP_RNNT_PATH=`pwd` && \ + export CUDA_TOOLKIT_ROOT_DIR=$CUDA_HOME && \ + export LD_LIBRARY_PATH="$CUDA_HOME/extras/CUPTI/lib64:$LD_LIBRARY_PATH" && \ + export LIBRARY_PATH=$CUDA_HOME/lib64:$LIBRARY_PATH && \ + export LD_LIBRARY_PATH=$CUDA_HOME/lib64:$LD_LIBRARY_PATH && \ + export CFLAGS="-I$CUDA_HOME/include $CFLAGS" && \ + cd ../pytorch_binding && \ + python3 setup.py install --user && \ + rm -rf ../tests test ../tensorflow_binding && \ + cd ../../.. + +WORKDIR /workspace/jasper + +COPY requirements.txt . +RUN pip install --disable-pip-version-check -U -r requirements.txt + +COPY . . diff --git a/rnn_speech_recognition/pytorch/LICENSE b/rnn_speech_recognition/pytorch/LICENSE new file mode 100644 index 000000000..75ee157cd --- /dev/null +++ b/rnn_speech_recognition/pytorch/LICENSE @@ -0,0 +1,204 @@ + Except where otherwise noted, the following license applies to all files in this repo. + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. 
For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!) The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright 2019 NVIDIA Corporation
+   Copyright 2019 Myrtle Software Limited, www.myrtle.ai
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/rnn_speech_recognition/pytorch/NOTICE b/rnn_speech_recognition/pytorch/NOTICE
new file mode 100644
index 000000000..7916839bc
--- /dev/null
+++ b/rnn_speech_recognition/pytorch/NOTICE
@@ -0,0 +1,5 @@
+Jasper in PyTorch
+
+This repository includes source code (in "parts/") from:
+* https://github.com/keithito/tacotron and https://github.com/ryanleary/patter licensed under MIT license.
+
diff --git a/rnn_speech_recognition/pytorch/README.md b/rnn_speech_recognition/pytorch/README.md
new file mode 100644
index 000000000..f62118c06
--- /dev/null
+++ b/rnn_speech_recognition/pytorch/README.md
@@ -0,0 +1,44 @@
+# DISCLAIMER
+This codebase is a work in progress. There are known and unknown bugs in the implementation, and it has not been optimized in any way.
+
+MLPerf has neither finalized a decision to add a speech recognition benchmark, nor selected this implementation/architecture as a reference implementation.
+
+# 1. Problem
+Speech recognition accepts raw audio samples and produces a corresponding text transcription.
+
+# 2. Directions
+See https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/SpeechRecognition/Jasper/README.md. This implementation shares significant code with that repository.
+
+# 3. Dataset/Environment
+### Publication/Attribution
+["OpenSLR LibriSpeech Corpus"](http://www.openslr.org/12/) provides over 1000 hours of speech data in the form of raw audio.
+### Data preprocessing
+What preprocessing is done to the dataset?
+### Training and test data separation
+How is the test set extracted?
+### Training data order
+In what order is the training data traversed?
+### Test data order
+In what order is the test data traversed?
+### Simulation environment (RL models only)
+Describe simulation environment briefly, if applicable.
+# 4. Model
+### Publication/Attribution
+Cite paper describing model plus any additional attribution requested by code authors
+### List of layers
+Brief summary of structure of model
+### Weight and bias initialization
+How are weights and biases initialized?
+### Loss function
+Transducer Loss
+### Optimizer
+TBD, currently Adam
+# 5. Quality
+### Quality metric
+Word Error Rate (WER) across all words in the output text of all samples in the validation set.
+### Quality target
+What is the numeric quality target?
+### Evaluation frequency
+TBD
+### Evaluation thoroughness
+TBD
\ No newline at end of file
diff --git a/rnn_speech_recognition/pytorch/configs/rnnt.toml b/rnn_speech_recognition/pytorch/configs/rnnt.toml
new file mode 100644
index 000000000..11ed8b91a
--- /dev/null
+++ b/rnn_speech_recognition/pytorch/configs/rnnt.toml
@@ -0,0 +1,77 @@
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2019, Myrtle Software Limited. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +model = "RNNT" + +[input] +normalize = "per_feature" +sample_rate = 16000 +window_size = 0.02 +window_stride = 0.01 +window = "hann" +features = 80 +n_fft = 512 +frame_splicing = 3 +dither = 0.00001 +feat_type = "logfbank" +normalize_transcripts = true +trim_silence = true +pad_to = 0 # TODO +max_duration = 16.7 +speed_perturbation = true + + +cutout_rect_regions = 0 +cutout_rect_time = 60 +cutout_rect_freq = 25 + + +cutout_x_regions = 2 +cutout_y_regions = 2 +cutout_x_width = 6 +cutout_y_width = 6 + + +[input_eval] +normalize = "per_feature" +sample_rate = 16000 +window_size = 0.02 +window_stride = 0.01 +window = "hann" +features = 80 +n_fft = 512 +frame_splicing = 3 +dither = 0.00001 +feat_type = "logfbank" +normalize_transcripts = true +trim_silence = true +pad_to = 0 + + +[rnnt] +rnn_type = "lstm" +encoder_n_hidden = 1024 +encoder_pre_rnn_layers = 2 +encoder_stack_time_factor = 2 +encoder_post_rnn_layers = 3 +pred_n_hidden = 512 +pred_rnn_layers = 2 +forget_gate_bias = 1.0 +joint_n_hidden = 512 +dropout=0.32 + + +[labels] +labels = [" ", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "'"] diff --git a/rnn_speech_recognition/pytorch/configs/rnnt_bn.toml b/rnn_speech_recognition/pytorch/configs/rnnt_bn.toml new file mode 100644 index 000000000..c1908128f --- /dev/null +++ b/rnn_speech_recognition/pytorch/configs/rnnt_bn.toml @@ -0,0 +1,78 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2019, Myrtle Software Limited. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +model = "RNNT" + +[input] +normalize = "per_feature" +sample_rate = 16000 +window_size = 0.02 +window_stride = 0.01 +window = "hann" +features = 64 +n_fft = 512 +frame_splicing = 3 +dither = 0.00001 +feat_type = "logfbank" +normalize_transcripts = true +trim_silence = true +pad_to = 0 # TODO +max_duration = 16.7 +speed_perturbation = true + + +cutout_rect_regions = 0 +cutout_rect_time = 60 +cutout_rect_freq = 25 + + +cutout_x_regions = 2 +cutout_y_regions = 2 +cutout_x_width = 6 +cutout_y_width = 6 + + +[input_eval] +normalize = "per_feature" +sample_rate = 16000 +window_size = 0.02 +window_stride = 0.01 +window = "hann" +features = 64 +n_fft = 512 +frame_splicing = 3 +dither = 0.00001 +feat_type = "logfbank" +normalize_transcripts = true +trim_silence = true +pad_to = 0 + + +[rnnt] +rnn_type = "lstm" +norm = "batch_norm" +encoder_n_hidden = 1024 +encoder_pre_rnn_layers = 2 +encoder_stack_time_factor = 2 +encoder_post_rnn_layers = 3 +pred_n_hidden = 1024 +pred_rnn_layers = 2 +forget_gate_bias = 1.0 +joint_n_hidden = 640 +dropout=0.0 + + +[labels] +labels = [" ", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "'"] diff --git a/rnn_speech_recognition/pytorch/configs/rnnt_ln.toml b/rnn_speech_recognition/pytorch/configs/rnnt_ln.toml new file mode 100644 index 000000000..fd43b5595 --- /dev/null +++ b/rnn_speech_recognition/pytorch/configs/rnnt_ln.toml @@ -0,0 +1,78 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2019, Myrtle Software Limited. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +model = "RNNT" + +[input] +normalize = "per_feature" +sample_rate = 16000 +window_size = 0.02 +window_stride = 0.01 +window = "hann" +features = 64 +n_fft = 512 +frame_splicing = 3 +dither = 0.00001 +feat_type = "logfbank" +normalize_transcripts = true +trim_silence = true +pad_to = 0 # TODO +max_duration = 16.7 +speed_perturbation = true + + +cutout_rect_regions = 0 +cutout_rect_time = 60 +cutout_rect_freq = 25 + + +cutout_x_regions = 2 +cutout_y_regions = 2 +cutout_x_width = 6 +cutout_y_width = 6 + + +[input_eval] +normalize = "per_feature" +sample_rate = 16000 +window_size = 0.02 +window_stride = 0.01 +window = "hann" +features = 64 +n_fft = 512 +frame_splicing = 3 +dither = 0.00001 +feat_type = "logfbank" +normalize_transcripts = true +trim_silence = true +pad_to = 0 + + +[rnnt] +rnn_type = "lstm" +norm = "layer_norm" +encoder_n_hidden = 1024 +encoder_pre_rnn_layers = 2 +encoder_stack_time_factor = 2 +encoder_post_rnn_layers = 3 +pred_n_hidden = 1024 +pred_rnn_layers = 2 +forget_gate_bias = 1.0 +joint_n_hidden = 640 +dropout=0.0 + + +[labels] +labels = [" ", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "'"] diff --git a/rnn_speech_recognition/pytorch/dataset.py b/rnn_speech_recognition/pytorch/dataset.py new file mode 100644 index 000000000..ad88d2f01 --- /dev/null +++ b/rnn_speech_recognition/pytorch/dataset.py @@ -0,0 +1,266 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This file contains classes and functions related to data loading +""" +import torch +import numpy as np +import math +from torch.utils.data import Dataset, Sampler +import torch.distributed as dist +from parts.manifest import Manifest +from parts.features import WaveformFeaturizer + +class DistributedBucketBatchSampler(Sampler): + def __init__(self, dataset, batch_size, num_replicas=None, rank=None): + """Distributed sampler that buckets samples with similar length to minimize padding, + similar concept as pytorch BucketBatchSampler https://pytorchnlp.readthedocs.io/en/latest/source/torchnlp.samplers.html#torchnlp.samplers.BucketBatchSampler + + Args: + dataset: Dataset used for sampling. + batch_size: data batch size + num_replicas (optional): Number of processes participating in + distributed training. + rank (optional): Rank of the current process within num_replicas. 
+ """ + if num_replicas is None: + if not dist.is_available(): + raise RuntimeError("Requires distributed package to be available") + num_replicas = dist.get_world_size() + if rank is None: + if not dist.is_available(): + raise RuntimeError("Requires distributed package to be available") + rank = dist.get_rank() + self.dataset = dataset + self.dataset_size = len(dataset) + self.num_replicas = num_replicas + self.rank = rank + self.epoch = 0 + self.batch_size = batch_size + self.tile_size = batch_size * self.num_replicas + self.num_buckets = 6 + self.bucket_size = self.round_up_to(math.ceil(self.dataset_size / self.num_buckets), self.tile_size) + self.index_count = self.round_up_to(self.dataset_size, self.tile_size) + self.num_samples = self.index_count // self.num_replicas + + def round_up_to(self, x, mod): + return (x + mod - 1) // mod * mod + + def __iter__(self): + g = torch.Generator() + g.manual_seed(self.epoch) + indices = np.arange(self.index_count) % self.dataset_size + for bucket in range(self.num_buckets): + bucket_start = self.bucket_size * bucket + bucket_end = min(bucket_start + self.bucket_size, self.index_count) + indices[bucket_start:bucket_end] = indices[bucket_start:bucket_end][torch.randperm(bucket_end - bucket_start, generator=g)] + + tile_indices = torch.randperm(self.index_count // self.tile_size, generator=g) + for tile_index in tile_indices: + start_index = self.tile_size * tile_index + self.batch_size * self.rank + end_index = start_index + self.batch_size + yield indices[start_index:end_index] + + def __len__(self): + return self.num_samples + + def set_epoch(self, epoch): + self.epoch = epoch + +class data_prefetcher(): + def __init__(self, loader): + self.loader = iter(loader) + self.stream = torch.cuda.Stream() + self.preload() + + def preload(self): + try: + self.next_input = next(self.loader) + except StopIteration: + self.next_input = None + return + with torch.cuda.stream(self.stream): + self.next_input = [ x.cuda(non_blocking=True) for x in self.next_input] + + def __next__(self): + torch.cuda.current_stream().wait_stream(self.stream) + input = self.next_input + self.preload() + return input + def next(self): + return self.__next__() + def __iter__(self): + return self + +def seq_collate_fn(batch): + """batches samples and returns as tensors + Args: + batch : list of samples + Returns + batches of tensors + """ + batch_size = len(batch) + def _find_max_len(lst, ind): + max_len = -1 + for item in lst: + if item[ind].size(0) > max_len: + max_len = item[ind].size(0) + return max_len + max_audio_len = _find_max_len(batch, 0) + max_transcript_len = _find_max_len(batch, 2) + + batched_audio_signal = torch.zeros(batch_size, max_audio_len) + batched_transcript = torch.zeros(batch_size, max_transcript_len) + audio_lengths = [] + transcript_lengths = [] + for ind, sample in enumerate(batch): + batched_audio_signal[ind].narrow(0, 0, sample[0].size(0)).copy_(sample[0]) + audio_lengths.append(sample[1]) + batched_transcript[ind].narrow(0, 0, sample[2].size(0)).copy_(sample[2]) + transcript_lengths.append(sample[3]) + return batched_audio_signal, torch.stack(audio_lengths), batched_transcript, \ + torch.stack(transcript_lengths) + +class AudioToTextDataLayer: + """Data layer with data loader + """ + def __init__(self, **kwargs): + self._device = torch.device("cuda") + + featurizer_config = kwargs['featurizer_config'] + pad_to_max = kwargs.get('pad_to_max', False) + perturb_config = kwargs.get('perturb_config', None) + manifest_filepath = kwargs['manifest_filepath'] + 
dataset_dir = kwargs['dataset_dir'] + labels = kwargs['labels'] + batch_size = kwargs['batch_size'] + drop_last = kwargs.get('drop_last', False) + shuffle = kwargs.get('shuffle', True) + min_duration = featurizer_config.get('min_duration', 0.1) + max_duration = featurizer_config.get('max_duration', None) + normalize_transcripts = kwargs.get('normalize_transcripts', True) + trim_silence = kwargs.get('trim_silence', False) + multi_gpu = kwargs.get('multi_gpu', False) + sampler_type = kwargs.get('sampler', 'default') + speed_perturbation = featurizer_config.get('speed_perturbation', False) + sort_by_duration=sampler_type == 'bucket' + self._featurizer = WaveformFeaturizer.from_config(featurizer_config, perturbation_configs=perturb_config) + self._dataset = AudioDataset( + dataset_dir=dataset_dir, + manifest_filepath=manifest_filepath, + labels=labels, blank_index=len(labels), + sort_by_duration=sort_by_duration, + pad_to_max=pad_to_max, + featurizer=self._featurizer, max_duration=max_duration, + min_duration=min_duration, normalize=normalize_transcripts, + trim=trim_silence, speed_perturbation=speed_perturbation) + + print('sort_by_duration', sort_by_duration) + + if not multi_gpu: + self.sampler = None + self._dataloader = torch.utils.data.DataLoader( + dataset=self._dataset, + batch_size=batch_size, + collate_fn=lambda b: seq_collate_fn(b), + drop_last=drop_last, + shuffle=shuffle if self.sampler is None else False, + num_workers=4, + pin_memory=True, + sampler=self.sampler + ) + elif sampler_type == 'bucket': + self.sampler = DistributedBucketBatchSampler(self._dataset, batch_size=batch_size) + print("DDBucketSampler") + self._dataloader = torch.utils.data.DataLoader( + dataset=self._dataset, + collate_fn=lambda b: seq_collate_fn(b), + num_workers=4, + pin_memory=True, + batch_sampler=self.sampler + ) + elif sampler_type == 'default': + self.sampler = torch.utils.data.distributed.DistributedSampler(self._dataset) + print("DDSampler") + self._dataloader = torch.utils.data.DataLoader( + dataset=self._dataset, + batch_size=batch_size, + collate_fn=lambda b: seq_collate_fn(b), + drop_last=drop_last, + shuffle=shuffle if self.sampler is None else False, + num_workers=4, + pin_memory=True, + sampler=self.sampler + ) + else: + raise RuntimeError("Sampler {} not supported".format(sampler_type)) + + def __len__(self): + return len(self._dataset) + + @property + def data_iterator(self): + return self._dataloader + +class AudioDataset(Dataset): + def __init__(self, dataset_dir, manifest_filepath, labels, featurizer, max_duration=None, pad_to_max=False, + min_duration=None, blank_index=0, max_utts=0, normalize=True, sort_by_duration=False, + trim=False, speed_perturbation=False): + """Dataset that loads tensors via a json file containing paths to audio files, transcripts, and durations + (in seconds). Each entry is a different audio sample. + Args: + dataset_dir: absolute path to dataset folder + manifest_filepath: relative path from dataset folder to manifest json as described above. Can be coma-separated paths. 
+ labels: String containing all the possible characters to map to + featurizer: Initialized featurizer class that converts paths of audio to feature tensors + max_duration: If audio exceeds this length, do not include in dataset + min_duration: If audio is less than this length, do not include in dataset + pad_to_max: if specified input sequences into dnn model will be padded to max_duration + blank_index: blank index for ctc loss / decoder + max_utts: Limit number of utterances + normalize: whether to normalize transcript text + sort_by_duration: whether or not to sort sequences by increasing duration + trim: if specified trims leading and trailing silence from an audio signal. + speed_perturbation: specify if using data contains speed perburbation + """ + m_paths = manifest_filepath.split(',') + self.manifest = Manifest(dataset_dir, m_paths, labels, blank_index, pad_to_max=pad_to_max, + max_duration=max_duration, + sort_by_duration=sort_by_duration, + min_duration=min_duration, max_utts=max_utts, + normalize=normalize, speed_perturbation=speed_perturbation) + self.featurizer = featurizer + self.blank_index = blank_index + self.trim = trim + print( + "Dataset loaded with {0:.2f} hours. Filtered {1:.2f} hours.".format( + self.manifest.duration / 3600, + self.manifest.filtered_duration / 3600)) + + def __getitem__(self, index): + sample = self.manifest[index] + rn_indx = np.random.randint(len(sample['audio_filepath'])) + duration = sample['audio_duration'][rn_indx] if 'audio_duration' in sample else 0 + offset = sample['offset'] if 'offset' in sample else 0 + features = self.featurizer.process(sample['audio_filepath'][rn_indx], + offset=offset, duration=duration, + trim=self.trim) + + return features, torch.tensor(features.shape[0]).int(), \ + torch.tensor(sample["transcript"]), torch.tensor( + len(sample["transcript"])).int() + + def __len__(self): + return len(self.manifest) diff --git a/rnn_speech_recognition/pytorch/decoders.py b/rnn_speech_recognition/pytorch/decoders.py new file mode 100644 index 000000000..882dee2e2 --- /dev/null +++ b/rnn_speech_recognition/pytorch/decoders.py @@ -0,0 +1,136 @@ +# Copyright (c) 2019, Myrtle Software Limited. All rights reserved. +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch + +import torch.nn.functional as F +from model_rnnt import label_collate + +class TransducerDecoder: + """Decoder base class. + + Args: + alphabet: An Alphabet object. + blank_symbol: The symbol in `alphabet` to use as the blank during CTC + decoding. + model: Model to use for prediction. 
+ """ + + def __init__(self, blank_index, model): + self._model = model + self._SOS = -1 # start of sequence + self._blank_id = blank_index + + def _pred_step(self, label, hidden, device): + if label == self._SOS: + return self._model.predict(None, hidden, add_sos=False) + if label > self._blank_id: + label -= 1 + label = label_collate([[label]]).to(device) + return self._model.predict(label, hidden, add_sos=False) + + def _joint_step(self, enc, pred, log_normalize=False): + logits = self._model.joint(enc, pred)[:, 0, 0, :] + if not log_normalize: + return logits + + probs = F.log_softmax(logits, dim=len(logits.shape) - 1) + + return probs + + def _get_last_symb(self, labels): + return self._SOS if labels == [] else labels[-1] + + +class RNNTGreedyDecoder(TransducerDecoder): + """A greedy transducer decoder. + + Args: + blank_symbol: See `Decoder`. + model: Model to use for prediction. + max_symbols_per_step: The maximum number of symbols that can be added + to a sequence in a single time step; if set to None then there is + no limit. + cutoff_prob: Skip to next step in search if current highest character + probability is less than this. + """ + def __init__(self, blank_index, model, max_symbols_per_step=30): + super().__init__(blank_index, model) + assert max_symbols_per_step is None or max_symbols_per_step > 0 + self.max_symbols = max_symbols_per_step + + def decode(self, x, out_lens): + """Returns a list of sentences given an input batch. + + Args: + x: A tensor of size (batch, channels, features, seq_len) + TODO was (seq_len, batch, in_features). + out_lens: list of int representing the length of each sequence + output sequence. + + Returns: + list containing batch number of sentences (strings). + """ + with torch.no_grad(): + # Apply optional preprocessing + + logits, out_lens = self._model.encode((x, out_lens)) + + output = [] + for batch_idx in range(logits.size(0)): + inseq = logits[batch_idx, :, :].unsqueeze(1) + logitlen = out_lens[batch_idx] + sentence = self._greedy_decode(inseq, logitlen) + output.append(sentence) + + return output + + def _greedy_decode(self, x, out_len): + training_state = self._model.training + self._model.eval() + + device = x.device + + hidden = None + label = [] + for time_idx in range(out_len): + f = x[time_idx, :, :].unsqueeze(0) + + not_blank = True + symbols_added = 0 + + while not_blank and ( + self.max_symbols is None or + symbols_added < self.max_symbols): + g, hidden_prime = self._pred_step( + self._get_last_symb(label), + hidden, + device + ) + logp = self._joint_step(f, g, log_normalize=False)[0, :] + + # get index k, of max prob + v, k = logp.max(0) + k = k.item() + + if k == self._blank_id: + not_blank = False + else: + label.append(k) + hidden = hidden_prime + symbols_added += 1 + + self._model.train(training_state) + return label diff --git a/rnn_speech_recognition/pytorch/helpers.py b/rnn_speech_recognition/pytorch/helpers.py new file mode 100644 index 000000000..e844b4c75 --- /dev/null +++ b/rnn_speech_recognition/pytorch/helpers.py @@ -0,0 +1,212 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2019, Myrtle Software Limited. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import torch.distributed as dist +from apex.parallel import DistributedDataParallel as DDP +from enum import Enum +from metrics import word_error_rate + + +class Optimization(Enum): + """Various levels of Optimization. + WARNING: This might have effect on model accuracy.""" + nothing = 0 + mxprO0 = 1 + mxprO1 = 2 + mxprO2 = 3 + mxprO3 = 4 + + +AmpOptimizations = {Optimization.mxprO0: "O0", + Optimization.mxprO1: "O1", + Optimization.mxprO2: "O2", + Optimization.mxprO3: "O3"} + +def print_once(msg): + if (not torch.distributed.is_initialized() or (torch.distributed.is_initialized() and torch.distributed.get_rank() == 0)): + print(msg) + +def add_blank_label(labels): + if not isinstance(labels, list): + raise ValueError("labels must be a list of symbols") + labels.append("") + return labels + +def __rnnt_decoder_predictions_tensor(tensor, labels): + """ + Takes output of greedy rnnt decoder and converts to strings. + Args: + tensor: model output tensor + label: A list of labels + Returns: + prediction + """ + hypotheses = [] + labels_map = dict([(i, labels[i]) for i in range(len(labels))]) + # iterate over batch + for ind in range(len(tensor)): + hypothesis = ''.join([labels_map[c] for c in tensor[ind]]) + hypotheses.append(hypothesis) + return hypotheses + + +def monitor_asr_train_progress(tensors: list, labels: list): + """ + Takes output of greedy ctc decoder and performs ctc decoding algorithm to + remove duplicates and special symbol. 
Prints wer and prediction examples to screen + Args: + tensors: A list of 3 tensors (predictions, targets, target_lengths) + labels: A list of labels + + Returns: + word error rate + """ + references = [] + + labels_map = dict([(i, labels[i]) for i in range(len(labels))]) + with torch.no_grad(): + targets_cpu_tensor = tensors[1].long().cpu() + tgt_lenths_cpu_tensor = tensors[2].long().cpu() + + # iterate over batch + for ind in range(targets_cpu_tensor.shape[0]): + tgt_len = tgt_lenths_cpu_tensor[ind].item() + target = targets_cpu_tensor[ind][:tgt_len].numpy().tolist() + reference = ''.join([labels_map[c] for c in target]) + references.append(reference) + hypotheses = __rnnt_decoder_predictions_tensor(tensors[0], labels=labels) + tag = "training_batch_WER" + wer, _, _ = word_error_rate(hypotheses, references) + print_once('{0}: {1}'.format(tag, wer)) + print_once('Prediction: {0}'.format(hypotheses[0])) + print_once('Reference: {0}'.format(references[0])) + return wer + + +def __gather_losses(losses_list: list) -> list: + return [torch.mean(torch.stack(losses_list))] + + +def __gather_predictions(predictions_list: list, labels: list) -> list: + results = [] + for prediction in predictions_list: + results += __rnnt_decoder_predictions_tensor(prediction, labels=labels) + return results + + +def __gather_transcripts(transcript_list: list, transcript_len_list: list, + labels: list) -> list: + results = [] + labels_map = dict([(i, labels[i]) for i in range(len(labels))]) + # iterate over workers + for t, ln in zip(transcript_list, transcript_len_list): + # iterate over batch + t_lc = t.long().cpu() + ln_lc = ln.long().cpu() + for ind in range(t.shape[0]): + tgt_len = ln_lc[ind].item() + target = t_lc[ind][:tgt_len].numpy().tolist() + reference = ''.join([labels_map[c] for c in target]) + results.append(reference) + return results + + +def process_evaluation_batch(tensors: dict, global_vars: dict, labels: list): + """ + Processes results of an iteration and saves it in global_vars + Args: + tensors: dictionary with results of an evaluation iteration, e.g. 
loss, predictions, transcript, and output + global_vars: dictionary where processes results of iteration are saved + labels: A list of labels + """ + for kv, v in tensors.items(): + if kv.startswith('loss'): + global_vars['EvalLoss'] += __gather_losses(v) + elif kv.startswith('predictions'): + global_vars['predictions'] += __gather_predictions(v, labels=labels) + elif kv.startswith('transcript_length'): + transcript_len_list = v + elif kv.startswith('transcript'): + + transcript_list = v + elif kv.startswith('output'): + global_vars['logits'] += v + + global_vars['transcripts'] += __gather_transcripts(transcript_list, + transcript_len_list, + labels=labels) + + +def process_evaluation_epoch(global_vars: dict, tag=None): + """ + Processes results from each worker at the end of evaluation and combine to final result + Args: + global_vars: dictionary containing information of entire evaluation + Return: + wer: final word error rate + loss: final loss + """ + if 'EvalLoss' in global_vars: + eloss = torch.mean(torch.stack(global_vars['EvalLoss'])).item() + else: + eloss = None + hypotheses = global_vars['predictions'] + references = global_vars['transcripts'] + + wer, scores, num_words = word_error_rate(hypotheses=hypotheses, references=references) + multi_gpu = torch.distributed.is_initialized() + if multi_gpu: + if eloss is not None: + eloss /= torch.distributed.get_world_size() + eloss_tensor = torch.tensor(eloss).cuda() + dist.all_reduce(eloss_tensor) + eloss = eloss_tensor.item() + del eloss_tensor + + scores_tensor = torch.tensor(scores).cuda() + dist.all_reduce(scores_tensor) + scores = scores_tensor.item() + del scores_tensor + num_words_tensor = torch.tensor(num_words).cuda() + dist.all_reduce(num_words_tensor) + num_words = num_words_tensor.item() + del num_words_tensor + wer = scores *1.0/num_words + return wer, eloss + + + +def norm(x): + if not isinstance(x, list): + if not isinstance(x, tuple): + return x + return x[0] + + +def print_dict(d): + maxLen = max([len(ii) for ii in d.keys()]) + fmtString = '\t%' + str(maxLen) + 's : %s' + print('Arguments:') + for keyPair in sorted(d.items()): + print(fmtString % keyPair) + + + +def model_multi_gpu(model, multi_gpu=False): + if multi_gpu: + model = DDP(model) + print('DDP(model)') + return model diff --git a/rnn_speech_recognition/pytorch/inference.py b/rnn_speech_recognition/pytorch/inference.py new file mode 100644 index 000000000..1d512a570 --- /dev/null +++ b/rnn_speech_recognition/pytorch/inference.py @@ -0,0 +1,247 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import itertools +from typing import List +from tqdm import tqdm +import math +import toml +from dataset import AudioToTextDataLayer +from helpers import process_evaluation_batch, process_evaluation_epoch, Optimization, add_blank_label, AmpOptimizations, print_dict, model_multi_gpu +from decoders import RNNTGreedyDecoder +from model_rnnt import RNNT +from preprocessing import AudioPreprocessing +from parts.features import audio_from_file +import torch +import apex +from apex import amp +import random +import numpy as np +import pickle +import time + +import torchvision + +def parse_args(): + parser = argparse.ArgumentParser(description='Jasper') + parser.add_argument("--local_rank", default=None, type=int) + parser.add_argument("--batch_size", default=16, type=int, help='data batch size') + parser.add_argument("--steps", default=None, help='if not specified do evaluation on full dataset. otherwise only evaluates the specified number of iterations for each worker', type=int) + parser.add_argument("--model_toml", type=str, help='relative model configuration path given dataset folder') + parser.add_argument("--dataset_dir", type=str, help='absolute path to dataset folder') + parser.add_argument("--val_manifest", type=str, help='relative path to evaluation dataset manifest file') + parser.add_argument("--ckpt", default=None, type=str, required=True, help='path to model checkpoint') + parser.add_argument("--max_duration", default=None, type=float, help='maximum duration of sequences. if None uses attribute from model configuration file') + parser.add_argument("--pad_to", default=None, type=int, help="default is pad to value as specified in model configurations. if -1 pad to maximum duration. If > 0 pad batch to next multiple of value") + parser.add_argument("--fp16", action='store_true', help='use half precision') + parser.add_argument("--cudnn_benchmark", action='store_true', help="enable cudnn benchmark") + parser.add_argument("--save_prediction", type=str, default=None, help="if specified saves predictions in text form at this location") + parser.add_argument("--logits_save_to", default=None, type=str, help="if specified will save logits to path") + parser.add_argument("--seed", default=42, type=int, help='seed') + parser.add_argument("--wav", type=str, help='absolute path to .wav file (16KHz)') + return parser.parse_args() + +def eval( + data_layer, + audio_processor, + encoderdecoder, + greedy_decoder, + labels, + multi_gpu, + args): + """performs inference / evaluation + Args: + data_layer: data layer object that holds data loader + audio_processor: data processing module + encoderdecoder: acoustic model + greedy_decoder: greedy decoder + labels: list of labels as output vocabulary + multi_gpu: true if using multiple gpus + args: script input arguments + """ + logits_save_to=args.logits_save_to + encoderdecoder.eval() + with torch.no_grad(): + _global_var_dict = { + 'predictions': [], + 'transcripts': [], + 'logits' : [], + } + + + + if args.wav: + features, p_length_e = audio_processor(audio_from_file(args.wav)) + torch.cuda.synchronize() + t0 = time.perf_counter() + t_log_probs_e = encoderdecoder(features) + torch.cuda.synchronize() + t1 = time.perf_counter() + t_predictions_e = greedy_decoder(log_probs=t_log_probs_e) + hypotheses = __ctc_decoder_predictions_tensor(t_predictions_e, labels=labels) + print("INFERENCE TIME\t\t: {} ms".format((t1-t0)*1000.0)) + print("TRANSCRIPT\t\t:", hypotheses[0]) + return + + for it, data in enumerate(tqdm(data_layer.data_iterator)): 
+ t_audio_signal_e, t_a_sig_length_e, t_transcript_e, t_transcript_len_e = audio_processor(data) + + t_log_probs_e, (x_len, y_len) = encoderdecoder( + ((t_audio_signal_e, t_transcript_e), (t_a_sig_length_e, t_transcript_len_e)), + ) + t_predictions_e = greedy_decoder.decode(t_audio_signal_e, t_a_sig_length_e) + + values_dict = dict( + predictions=[t_predictions_e], + transcript=[t_transcript_e], + transcript_length=[t_transcript_len_e], + output=[t_log_probs_e] + ) + process_evaluation_batch(values_dict, _global_var_dict, labels=labels) + + if args.steps is not None and it + 1 >= args.steps: + break + wer, _ = process_evaluation_epoch(_global_var_dict) + if (not multi_gpu or (multi_gpu and torch.distributed.get_rank() == 0)): + print("==========>>>>>>Evaluation WER: {0}\n".format(wer)) + if args.save_prediction is not None: + with open(args.save_prediction, 'w') as fp: + fp.write('\n'.join(_global_var_dict['predictions'])) + if logits_save_to is not None: + logits = [] + for batch in _global_var_dict["logits"]: + for i in range(batch.shape[0]): + logits.append(batch[i].cpu().numpy()) + with open(logits_save_to, 'wb') as f: + pickle.dump(logits, f, protocol=pickle.HIGHEST_PROTOCOL) + +def main(args): + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + torch.backends.cudnn.benchmark = args.cudnn_benchmark + print("CUDNN BENCHMARK ", args.cudnn_benchmark) + assert(torch.cuda.is_available()) + + if args.local_rank is not None: + torch.cuda.set_device(args.local_rank) + torch.distributed.init_process_group(backend='nccl', init_method='env://') + multi_gpu = args.local_rank is not None + if multi_gpu: + print("DISTRIBUTED with ", torch.distributed.get_world_size()) + + if args.fp16: + optim_level = Optimization.mxprO3 + else: + optim_level = Optimization.mxprO0 + + model_definition = toml.load(args.model_toml) + dataset_vocab = model_definition['labels']['labels'] + ctc_vocab = add_blank_label(dataset_vocab) + + val_manifest = args.val_manifest + featurizer_config = model_definition['input_eval'] + featurizer_config["optimization_level"] = optim_level + + if args.max_duration is not None: + featurizer_config['max_duration'] = args.max_duration + if args.pad_to is not None: + featurizer_config['pad_to'] = args.pad_to if args.pad_to >= 0 else "max" + + print('model_config') + print_dict(model_definition) + print('feature_config') + print_dict(featurizer_config) + data_layer = None + + if args.wav is None: + data_layer = AudioToTextDataLayer( + dataset_dir=args.dataset_dir, + featurizer_config=featurizer_config, + manifest_filepath=val_manifest, + labels=dataset_vocab, + batch_size=args.batch_size, + pad_to_max=featurizer_config['pad_to'] == "max", + shuffle=False, + multi_gpu=multi_gpu) + audio_preprocessor = AudioPreprocessing(**featurizer_config) + + #encoderdecoder = JasperEncoderDecoder(jasper_model_definition=jasper_model_definition, feat_in=1024, num_classes=len(ctc_vocab)) + model = RNNT( + feature_config=featurizer_config, + rnnt=model_definition['rnnt'], + num_classes=len(ctc_vocab) + ) + + if args.ckpt is not None: + print("loading model from ", args.ckpt) + checkpoint = torch.load(args.ckpt, map_location="cpu") + model.load_state_dict(checkpoint['state_dict'], strict=False) + + #greedy_decoder = GreedyCTCDecoder() + + # print("Number of parameters in encoder: {0}".format(model.jasper_encoder.num_weights())) + if args.wav is None: + N = len(data_layer) + step_per_epoch = math.ceil(N / (args.batch_size * (1 if not torch.distributed.is_initialized() else 
torch.distributed.get_world_size()))) + + if args.steps is not None: + print('-----------------') + print('Have {0} examples to eval on.'.format(args.steps * args.batch_size * (1 if not torch.distributed.is_initialized() else torch.distributed.get_world_size()))) + print('Have {0} steps / (gpu * epoch).'.format(args.steps)) + print('-----------------') + else: + print('-----------------') + print('Have {0} examples to eval on.'.format(N)) + print('Have {0} steps / (gpu * epoch).'.format(step_per_epoch)) + print('-----------------') + else: + audio_preprocessor.featurizer.normalize = "per_feature" + + print ("audio_preprocessor.normalize: ", audio_preprocessor.featurizer.normalize) + audio_preprocessor.cuda() + audio_preprocessor.eval() + + eval_transforms = torchvision.transforms.Compose([ + lambda xs: [x.cuda() for x in xs], + lambda xs: [*audio_preprocessor(xs[0:2]), *xs[2:]], + lambda xs: [xs[0].permute(2, 0, 1), *xs[1:]], + ]) + + model.cuda() + if args.fp16: + model = amp.initialize( + models=model, + opt_level=AmpOptimizations[optim_level]) + + model = model_multi_gpu(model, multi_gpu) + + greedy_decoder = RNNTGreedyDecoder(len(ctc_vocab) - 1, model.module if multi_gpu else model) + + eval( + data_layer=data_layer, + audio_processor=eval_transforms, + encoderdecoder=model, + greedy_decoder=greedy_decoder, + labels=ctc_vocab, + args=args, + multi_gpu=multi_gpu) + +if __name__=="__main__": + args = parse_args() + + print_dict(vars(args)) + + main(args) diff --git a/rnn_speech_recognition/pytorch/inference_benchmark.py b/rnn_speech_recognition/pytorch/inference_benchmark.py new file mode 100644 index 000000000..fcc927ecb --- /dev/null +++ b/rnn_speech_recognition/pytorch/inference_benchmark.py @@ -0,0 +1,246 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import itertools +import os +import sys +import time +import random +import numpy as np +from heapq import nlargest +import math +from tqdm import tqdm +import toml +import torch +from apex import amp +from dataset import AudioToTextDataLayer +from helpers import process_evaluation_batch, process_evaluation_epoch, Optimization, add_ctc_labels, AmpOptimizations, print_dict +from model import AudioPreprocessing, GreedyCTCDecoder, JasperEncoderDecoder + +def parse_args(): + parser = argparse.ArgumentParser(description='Jasper') + parser.add_argument("--steps", default=None, help='if not specified do evaluation on full dataset. otherwise only evaluates the specified number of iterations for each worker', type=int) + parser.add_argument("--batch_size", default=16, type=int, help='data batch size') + parser.add_argument("--max_duration", default=None, type=float, help='maximum duration of sequences. if None uses attribute from model configuration file') + parser.add_argument("--pad_to", default=None, type=int, help="default is pad to value as specified in model configurations. if -1 pad to maximum duration. 
If > 0 pad batch to next multiple of value") + parser.add_argument("--model_toml", type=str, help='relative model configuration path given dataset folder') + parser.add_argument("--dataset_dir", type=str, help='absolute path to dataset folder') + parser.add_argument("--val_manifest", type=str, help='relative path to evaluation dataset manifest file') + parser.add_argument("--cudnn_benchmark", action='store_true', help="enable cudnn benchmark") + parser.add_argument("--ckpt", default=None, type=str, required=True, help='path to model checkpoint') + parser.add_argument("--fp16", action='store_true', help='use half precision') + parser.add_argument("--seed", default=42, type=int, help='seed') + return parser.parse_args() + +def eval( + data_layer, + audio_processor, + encoderdecoder, + greedy_decoder, + labels, + args): + """performs evaluation and prints performance statistics + Args: + data_layer: data layer object that holds data loader + audio_processor: data processing module + encoderdecoder: acoustic model + greedy_decoder: greedy decoder + labels: list of labels as output vocabulary + args: script input arguments + """ + batch_size=args.batch_size + steps=args.steps + audio_processor.eval() + encoderdecoder.eval() + with torch.no_grad(): + _global_var_dict = { + 'predictions': [], + 'transcripts': [], + } + + it = 0 + ep = 0 + + if steps is None: + steps = math.ceil(len(data_layer) / batch_size) + durations_dnn = [] + durations_dnn_and_prep = [] + seq_lens = [] + while True: + ep += 1 + for data in tqdm(data_layer.data_iterator): + it += 1 + if it > steps: + break + tensors = [] + dl_device = torch.device("cuda") + for d in data: + tensors.append(d.to(dl_device)) + + + t_audio_signal_e, t_a_sig_length_e, t_transcript_e, t_transcript_len_e = tensors + + inp=(t_audio_signal_e, t_a_sig_length_e) + torch.cuda.synchronize() + t0 = time.perf_counter() + t_processed_signal, p_length_e = audio_processor(x=inp) + torch.cuda.synchronize() + t1 = time.perf_counter() + + if args.use_conv_mask: + t_log_probs_e, t_encoded_len_e = encoderdecoder((t_processed_signal, p_length_e)) + else: + t_log_probs_e = encoderdecoder(t_processed_signal) + torch.cuda.synchronize() + stop_time = time.perf_counter() + + time_prep_and_dnn = stop_time - t0 + time_dnn = stop_time - t1 + t_predictions_e = greedy_decoder(log_probs=t_log_probs_e) + + values_dict = dict( + predictions=[t_predictions_e], + transcript=[t_transcript_e], + transcript_length=[t_transcript_len_e], + ) + process_evaluation_batch(values_dict, _global_var_dict, labels=labels) + durations_dnn.append(time_dnn) + durations_dnn_and_prep.append(time_prep_and_dnn) + seq_lens.append(t_processed_signal.shape[-1]) + + if it >= steps: + + wer, _ = process_evaluation_epoch(_global_var_dict) + print("==========>>>>>>Evaluation of all iterations WER: {0}\n".format(wer)) + break + + ratios = [0.9, 0.95,0.99, 1.] 
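+        # Latency reporting (computed just below): per-iteration timings are converted to
+        # milliseconds and the first 5 iterations are dropped as warm-up, then for each
+        # ratio r in `ratios` roughly the r-th percentile of the remaining latencies is
+        # reported (1.0 corresponds to the worst observed latency). Note that the value
+        # printed under "0.5" is the mean latency, not the median.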
+ latencies_dnn = take_durations_and_output_percentile(durations_dnn, ratios) + latencies_dnn_and_prep = take_durations_and_output_percentile(durations_dnn_and_prep, ratios) + print("\n using batch size {} and {} frames ".format(batch_size, seq_lens[-1])) + print("\n".join(["dnn latency {} : {} ".format(k, v) for k, v in latencies_dnn.items()])) + print("\n".join(["prep + dnn latency {} : {} ".format(k, v) for k, v in latencies_dnn_and_prep.items()])) + +def take_durations_and_output_percentile(durations, ratios): + durations = np.asarray(durations) * 1000 # in ms + latency = durations + + latency = latency[5:] + mean_latency = np.mean(latency) + + latency_worst = nlargest(math.ceil( (1 - min(ratios))* len(latency)), latency) + latency_ranges=get_percentile(ratios, latency_worst, len(latency)) + latency_ranges["0.5"] = mean_latency + return latency_ranges + +def get_percentile(ratios, arr, nsamples): + res = {} + for a in ratios: + idx = max(int(nsamples * (1 - a)), 0) + res[a] = arr[idx] + return res + +def main(args): + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + torch.backends.cudnn.benchmark = args.cudnn_benchmark + assert(args.steps is None or args.steps > 5) + print("CUDNN BENCHMARK ", args.cudnn_benchmark) + assert(torch.cuda.is_available()) + + if args.fp16: + optim_level = Optimization.mxprO3 + else: + optim_level = Optimization.mxprO0 + batch_size = args.batch_size + + jasper_model_definition = toml.load(args.model_toml) + dataset_vocab = jasper_model_definition['labels']['labels'] + ctc_vocab = add_ctc_labels(dataset_vocab) + + val_manifest = args.val_manifest + featurizer_config = jasper_model_definition['input_eval'] + featurizer_config["optimization_level"] = optim_level + args.use_conv_mask = jasper_model_definition['encoder'].get('convmask', True) + if args.max_duration is not None: + featurizer_config['max_duration'] = args.max_duration + if args.pad_to is not None: + featurizer_config['pad_to'] = args.pad_to if args.pad_to >= 0 else "max" + + print('model_config') + print_dict(jasper_model_definition) + print('feature_config') + print_dict(featurizer_config) + + data_layer = AudioToTextDataLayer( + dataset_dir=args.dataset_dir, + featurizer_config=featurizer_config, + manifest_filepath=val_manifest, + labels=dataset_vocab, + batch_size=batch_size, + pad_to_max=featurizer_config['pad_to'] == "max", + shuffle=False, + multi_gpu=False) + + audio_preprocessor = AudioPreprocessing(**featurizer_config) + + encoderdecoder = JasperEncoderDecoder(jasper_model_definition=jasper_model_definition, feat_in=1024, num_classes=len(ctc_vocab)) + + if args.ckpt is not None: + print("loading model from ", args.ckpt) + checkpoint = torch.load(args.ckpt, map_location="cpu") + for k in audio_preprocessor.state_dict().keys(): + checkpoint['state_dict'][k] = checkpoint['state_dict'].pop("audio_preprocessor." 
+ k) + audio_preprocessor.load_state_dict(checkpoint['state_dict'], strict=False) + encoderdecoder.load_state_dict(checkpoint['state_dict'], strict=False) + + greedy_decoder = GreedyCTCDecoder() + + # print("Number of parameters in encoder: {0}".format(model.jasper_encoder.num_weights())) + + N = len(data_layer) + step_per_epoch = math.ceil(N / args.batch_size) + + print('-----------------') + if args.steps is None: + print('Have {0} examples to eval on.'.format(N)) + print('Have {0} steps / (gpu * epoch).'.format(step_per_epoch)) + else: + print('Have {0} examples to eval on.'.format(args.steps * args.batch_size)) + print('Have {0} steps / (gpu * epoch).'.format(args.steps)) + print('-----------------') + + audio_preprocessor.cuda() + encoderdecoder.cuda() + if args.fp16: + encoderdecoder = amp.initialize( + models=encoderdecoder, + opt_level=AmpOptimizations[optim_level]) + + eval( + data_layer=data_layer, + audio_processor=audio_preprocessor, + encoderdecoder=encoderdecoder, + greedy_decoder=greedy_decoder, + labels=ctc_vocab, + args=args) + +if __name__=="__main__": + args = parse_args() + + print_dict(vars(args)) + + main(args) diff --git a/rnn_speech_recognition/pytorch/loss.py b/rnn_speech_recognition/pytorch/loss.py new file mode 100644 index 000000000..fa2bde88c --- /dev/null +++ b/rnn_speech_recognition/pytorch/loss.py @@ -0,0 +1,104 @@ +# Copyright (c) 2019, Myrtle Software Limited. All rights reserved. +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import warnings +from typing import Tuple + +import torch +from warprnnt_pytorch import RNNTLoss as WarpRNNTLoss + + +class RNNTLoss(torch.nn.Module): + """Wrapped :py:class:`warprnnt_pytorch.RNNTLoss`. + Args: + blank: Index of the blank label. + reduction: (string) Specifies the reduction to apply to the output: + none: + No reduction will be applied. + mean: + The output losses will be divided by the target lengths and + then the mean over the batch is taken. + sum: + Sum all losses in a batch. + Attributes: + rnnt_loss: A :py:class:`warprnnt_pytorch.RNNTLoss` instance. + """ + + def __init__(self, blank, reduction="mean"): + super().__init__() + self.rnnt_loss = WarpRNNTLoss(blank=blank) + self.use_cuda = torch.cuda.is_available() + + def forward( + self, + inputs: Tuple[torch.Tensor, torch.Tensor], + targets: Tuple[torch.Tensor, torch.Tensor], + ) -> torch.Tensor: + """Computes RNNT loss. + All inputs are moved to the GPU with :py:meth:`torch.nn.Module.cuda` if + :py:func:`torch.cuda.is_available` was :py:data:`True` on + initialisation. + Args: + inputs: A tuple where the first element is the unnormalized network + :py:class:`torch.Tensor` outputs of size ``[batch, max_seq_len, + max_output_seq_len + 1, vocab_size + 1)``. The second element + is a Tuple of two :py:class:`torch.Tensor`s both of + size ``[batch]`` that contain the lengths of a) the audio features + logits and b) the target sequence logits. 
+ targets: A tuple where the first element is a + :py:class:`torch.Tensor` such that each entry in the target + sequence is a class index. Target indices cannot be the blank + index. It must have size ``[batch, max_seq_len]``. In the former + form each target sequence is padded to the length of the longest + sequence and stacked. + The second element is a :py:class:`torch.Tensor` that gives + the lengths of the targets. Lengths are specified for each + sequence to achieve masking under the assumption that sequences + are padded to equal lengths. + """ + + logits, logit_lens = inputs + y, y_lens = targets + + # cast to required types + if logits.dtype != torch.float: + logits_orig = logits + logits = logits.float() + del logits_orig # save memory *before* computing the loss + + if y.dtype != torch.int32: + y = y.int() + + if logit_lens.dtype != torch.int32: + logit_lens = logit_lens.int() + + if y_lens.dtype != torch.int32: + y_lens = y_lens.int() + + # send to gpu + if self.use_cuda: + logits = logits.cuda() + logit_lens = logit_lens.cuda() + y = y.cuda() + y_lens = y_lens.cuda() + + loss = self.rnnt_loss( + acts=logits, labels=y, act_lens=logit_lens, label_lens=y_lens + ) + + # del new variables that may have been created due to float/int/cuda() + del logits, y, logit_lens, y_lens, inputs, targets + + return loss diff --git a/rnn_speech_recognition/pytorch/metrics.py b/rnn_speech_recognition/pytorch/metrics.py new file mode 100644 index 000000000..fdf287846 --- /dev/null +++ b/rnn_speech_recognition/pytorch/metrics.py @@ -0,0 +1,67 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List + + +def __levenshtein(a: List, b: List) -> int: + """Calculates the Levenshtein distance between a and b. + """ + n, m = len(a), len(b) + if n > m: + # Make sure n <= m, to use O(min(n,m)) space + a, b = b, a + n, m = m, n + + current = list(range(n + 1)) + for i in range(1, m + 1): + previous, current = current, [i] + [0] * n + for j in range(1, n + 1): + add, delete = previous[j] + 1, current[j - 1] + 1 + change = previous[j - 1] + if a[j - 1] != b[i - 1]: + change = change + 1 + current[j] = min(add, delete, change) + + return current[n] + + +def word_error_rate(hypotheses: List[str], references: List[str]) -> float: + """ + Computes Average Word Error rate between two texts represented as + corresponding lists of string. Hypotheses and references must have same length. + + Args: + hypotheses: list of hypotheses + references: list of references + + Returns: + (float) average word error rate + """ + scores = 0 + words = 0 + if len(hypotheses) != len(references): + raise ValueError("In word error rate calculation, hypotheses and reference" + " lists must have the same number of elements. 
But I got:" + "{0} and {1} correspondingly".format(len(hypotheses), len(references))) + for h, r in zip(hypotheses, references): + h_list = h.split() + r_list = r.split() + words += len(r_list) + scores += __levenshtein(h_list, r_list) + if words!=0: + wer = 1.0*scores/words + else: + wer = float('inf') + return wer, scores, words diff --git a/rnn_speech_recognition/pytorch/model.py b/rnn_speech_recognition/pytorch/model.py new file mode 100644 index 000000000..d61d68f22 --- /dev/null +++ b/rnn_speech_recognition/pytorch/model.py @@ -0,0 +1,452 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from apex import amp +import torch +import torch.nn as nn +from parts.features import FeatureFactory +from helpers import Optimization +import random + + +jasper_activations = { + "hardtanh": nn.Hardtanh, + "relu": nn.ReLU, + "selu": nn.SELU, +} + +def init_weights(m, mode='xavier_uniform'): + if type(m) == nn.Conv1d or type(m) == MaskedConv1d: + if mode == 'xavier_uniform': + nn.init.xavier_uniform_(m.weight, gain=1.0) + elif mode == 'xavier_normal': + nn.init.xavier_normal_(m.weight, gain=1.0) + elif mode == 'kaiming_uniform': + nn.init.kaiming_uniform_(m.weight, nonlinearity="relu") + elif mode == 'kaiming_normal': + nn.init.kaiming_normal_(m.weight, nonlinearity="relu") + else: + raise ValueError("Unknown Initialization mode: {0}".format(mode)) + elif type(m) == nn.BatchNorm1d: + if m.track_running_stats: + m.running_mean.zero_() + m.running_var.fill_(1) + m.num_batches_tracked.zero_() + if m.affine: + nn.init.ones_(m.weight) + nn.init.zeros_(m.bias) + +def get_same_padding(kernel_size, stride, dilation): + if stride > 1 and dilation > 1: + raise ValueError("Only stride OR dilation may be greater than 1") + return (kernel_size // 2) * dilation + +class AudioPreprocessing(nn.Module): + """GPU accelerated audio preprocessing + """ + def __init__(self, **kwargs): + nn.Module.__init__(self) # For PyTorch API + self.optim_level = kwargs.get('optimization_level', Optimization.nothing) + self.featurizer = FeatureFactory.from_config(kwargs) + + def forward(self, x): + input_signal, length = x + length.requires_grad_(False) + if self.optim_level not in [Optimization.nothing, Optimization.mxprO0, Optimization.mxprO3]: + with amp.disable_casts(): + processed_signal = self.featurizer(x) + processed_length = self.featurizer.get_seq_len(length) + else: + processed_signal = self.featurizer(x) + processed_length = self.featurizer.get_seq_len(length) + return processed_signal, processed_length + +class SpectrogramAugmentation(nn.Module): + """Spectrogram augmentation + """ + def __init__(self, **kwargs): + nn.Module.__init__(self) + self.spec_cutout_regions = SpecCutoutRegions(kwargs) + self.spec_augment = SpecAugment(kwargs) + + @torch.no_grad() + def forward(self, input_spec): + augmented_spec = self.spec_cutout_regions(input_spec) + augmented_spec = self.spec_augment(augmented_spec) + return augmented_spec + +class SpecAugment(nn.Module): + 
"""Spec augment. refer to https://arxiv.org/abs/1904.08779 + """ + def __init__(self, cfg): + super(SpecAugment, self).__init__() + self.cutout_x_regions = cfg.get('cutout_x_regions', 0) + self.cutout_y_regions = cfg.get('cutout_y_regions', 0) + + self.cutout_x_width = cfg.get('cutout_x_width', 10) + self.cutout_y_width = cfg.get('cutout_y_width', 10) + + @torch.no_grad() + def forward(self, x): + sh = x.shape + + mask = torch.zeros(x.shape).byte() + for idx in range(sh[0]): + for _ in range(self.cutout_x_regions): + cutout_x_left = int(random.uniform(0, sh[1] - self.cutout_x_width)) + + mask[idx, cutout_x_left:cutout_x_left + self.cutout_x_width, :] = 1 + + for _ in range(self.cutout_y_regions): + cutout_y_left = int(random.uniform(0, sh[2] - self.cutout_y_width)) + + mask[idx, :, cutout_y_left:cutout_y_left + self.cutout_y_width] = 1 + + x = x.masked_fill(mask.to(device=x.device), 0) + + return x + +class SpecCutoutRegions(nn.Module): + """Cutout. refer to https://arxiv.org/pdf/1708.04552.pdf + """ + def __init__(self, cfg): + super(SpecCutoutRegions, self).__init__() + + self.cutout_rect_regions = cfg.get('cutout_rect_regions', 0) + self.cutout_rect_time = cfg.get('cutout_rect_time', 5) + self.cutout_rect_freq = cfg.get('cutout_rect_freq', 20) + + @torch.no_grad() + def forward(self, x): + sh = x.shape + + mask = torch.zeros(x.shape).byte() + + for idx in range(sh[0]): + for i in range(self.cutout_rect_regions): + cutout_rect_x = int(random.uniform( + 0, sh[1] - self.cutout_rect_freq)) + cutout_rect_y = int(random.uniform( + 0, sh[2] - self.cutout_rect_time)) + + mask[idx, cutout_rect_x:cutout_rect_x + self.cutout_rect_freq, + cutout_rect_y:cutout_rect_y + self.cutout_rect_time] = 1 + + x = x.masked_fill(mask.to(device=x.device), 0) + + return x + +class JasperEncoder(nn.Module): + + """Jasper encoder + """ + def __init__(self, **kwargs): + cfg = {} + for key, value in kwargs.items(): + cfg[key] = value + + nn.Module.__init__(self) + self._cfg = cfg + + activation = jasper_activations[cfg['encoder']['activation']]() + self.use_conv_mask = cfg['encoder'].get('convmask', False) + feat_in = cfg['input']['features'] * cfg['input'].get('frame_splicing', 1) + init_mode = cfg.get('init_mode', 'xavier_uniform') + + residual_panes = [] + encoder_layers = [] + self.dense_residual = False + for lcfg in cfg['jasper']: + dense_res = [] + if lcfg.get('residual_dense', False): + residual_panes.append(feat_in) + dense_res = residual_panes + self.dense_residual = True + encoder_layers.append( + JasperBlock(feat_in, lcfg['filters'], repeat=lcfg['repeat'], + kernel_size=lcfg['kernel'], stride=lcfg['stride'], + dilation=lcfg['dilation'], dropout=lcfg['dropout'], + residual=lcfg['residual'], activation=activation, + residual_panes=dense_res, use_conv_mask=self.use_conv_mask)) + feat_in = lcfg['filters'] + + self.encoder = nn.Sequential(*encoder_layers) + self.apply(lambda x: init_weights(x, mode=init_mode)) + + def num_weights(self): + return sum(p.numel() for p in self.parameters() if p.requires_grad) + + def forward(self, x): + if self.use_conv_mask: + audio_signal, length = x + return self.encoder(([audio_signal], length)) + else: + return self.encoder([x]) + +class JasperDecoderForCTC(nn.Module): + """Jasper decoder + """ + def __init__(self, **kwargs): + nn.Module.__init__(self) + self._feat_in = kwargs.get("feat_in") + self._num_classes = kwargs.get("num_classes") + init_mode = kwargs.get('init_mode', 'xavier_uniform') + + self.decoder_layers = nn.Sequential( + nn.Conv1d(self._feat_in, 
self._num_classes, kernel_size=1, bias=True),) + self.apply(lambda x: init_weights(x, mode=init_mode)) + + def num_weights(self): + return sum(p.numel() for p in self.parameters() if p.requires_grad) + + def forward(self, encoder_output): + out = self.decoder_layers(encoder_output[-1]).transpose(1, 2) + return nn.functional.log_softmax(out, dim=2) + +class Jasper(nn.Module): + """Contains data preprocessing, spectrogram augmentation, jasper encoder and decoder + """ + def __init__(self, **kwargs): + nn.Module.__init__(self) + if kwargs.get("no_featurizer", False): + self.audio_preprocessor = None + else: + self.audio_preprocessor = AudioPreprocessing(**kwargs.get("feature_config")) + + self.data_spectr_augmentation = SpectrogramAugmentation(**kwargs.get("feature_config")) + self.jasper_encoder = JasperEncoder(**kwargs.get("jasper_model_definition")) + self.jasper_decoder = JasperDecoderForCTC(feat_in=kwargs.get("feat_in"), + num_classes=kwargs.get("num_classes")) + self.acoustic_model = JasperAcousticModel(self.jasper_encoder, self.jasper_decoder) + + def num_weights(self): + return sum(p.numel() for p in self.parameters() if p.requires_grad) + + def forward(self, x): + + # Apply optional preprocessing + if self.audio_preprocessor is not None: + t_processed_signal, p_length_t = self.audio_preprocessor(x) + # Apply optional spectral augmentation + if self.training: + t_processed_signal = self.data_spectr_augmentation(input_spec=t_processed_signal) + + if (self.jasper_encoder.use_conv_mask): + a_inp = (t_processed_signal, p_length_t) + else: + a_inp = t_processed_signal + # Forward Pass through Encoder-Decoder + return self.acoustic_model.forward(a_inp) + + +class JasperAcousticModel(nn.Module): + def __init__(self, enc, dec, transpose_in=False): + nn.Module.__init__(self) + self.jasper_encoder = enc + self.jasper_decoder = dec + self.transpose_in = transpose_in + def forward(self, x): + if self.jasper_encoder.use_conv_mask: + t_encoded_t, t_encoded_len_t = self.jasper_encoder(x) + else: + if self.transpose_in: + x = x.transpose(1, 2) + t_encoded_t = self.jasper_encoder(x) + + out = self.jasper_decoder(encoder_output=t_encoded_t) + if self.jasper_encoder.use_conv_mask: + return out, t_encoded_len_t + else: + return out + +class JasperEncoderDecoder(nn.Module): + """Contains jasper encoder and decoder + """ + def __init__(self, **kwargs): + nn.Module.__init__(self) + self.jasper_encoder = JasperEncoder(**kwargs.get("jasper_model_definition")) + self.jasper_decoder = JasperDecoderForCTC(feat_in=kwargs.get("feat_in"), + num_classes=kwargs.get("num_classes")) + self.acoustic_model = JasperAcousticModel(self.jasper_encoder, + self.jasper_decoder, + kwargs.get("transpose_in", False)) + + def num_weights(self): + return sum(p.numel() for p in self.parameters() if p.requires_grad) + + def forward(self, x): + return self.acoustic_model.forward(x) + +class MaskedConv1d(nn.Conv1d): + """1D convolution with sequence masking + """ + def __init__(self, in_channels, out_channels, kernel_size, stride=1, + padding=0, dilation=1, groups=1, bias=False, use_conv_mask=True): + super(MaskedConv1d, self).__init__(in_channels, out_channels, kernel_size, + stride=stride, + padding=padding, dilation=dilation, + groups=groups, bias=bias) + self.use_conv_mask = use_conv_mask + + def get_seq_len(self, lens): + return ((lens + 2 * self.padding[0] - self.dilation[0] * ( + self.kernel_size[0] - 1) - 1) / self.stride[0] + 1) + + def forward(self, inp): + if self.use_conv_mask: + x, lens = inp + max_len = x.size(2) + idxs = 
torch.arange(max_len).to(lens.dtype).to(lens.device).expand(len(lens), max_len) + mask = idxs >= lens.unsqueeze(1) + x = x.masked_fill(mask.unsqueeze(1).to(device=x.device), 0) + del mask + del idxs + lens = self.get_seq_len(lens) + else: + x = inp + out = super(MaskedConv1d, self).forward(x) + + if self.use_conv_mask: + return out, lens + else: + return out + +class JasperBlock(nn.Module): + """Jasper Block. See https://arxiv.org/pdf/1904.03288.pdf + """ + def __init__(self, inplanes, planes, repeat=3, kernel_size=11, stride=1, + dilation=1, padding='same', dropout=0.2, activation=None, + residual=True, residual_panes=[], use_conv_mask=False): + super(JasperBlock, self).__init__() + + if padding != "same": + raise ValueError("currently only 'same' padding is supported") + + + padding_val = get_same_padding(kernel_size[0], stride[0], dilation[0]) + self.use_conv_mask = use_conv_mask + self.conv = nn.ModuleList() + inplanes_loop = inplanes + for _ in range(repeat - 1): + self.conv.extend( + self._get_conv_bn_layer(inplanes_loop, planes, kernel_size=kernel_size, + stride=stride, dilation=dilation, + padding=padding_val)) + self.conv.extend( + self._get_act_dropout_layer(drop_prob=dropout, activation=activation)) + inplanes_loop = planes + self.conv.extend( + self._get_conv_bn_layer(inplanes_loop, planes, kernel_size=kernel_size, + stride=stride, dilation=dilation, + padding=padding_val)) + + self.res = nn.ModuleList() if residual else None + res_panes = residual_panes.copy() + self.dense_residual = residual + if residual: + if len(residual_panes) == 0: + res_panes = [inplanes] + self.dense_residual = False + for ip in res_panes: + self.res.append(nn.ModuleList( + modules=self._get_conv_bn_layer(ip, planes, kernel_size=1))) + self.out = nn.Sequential( + *self._get_act_dropout_layer(drop_prob=dropout, activation=activation)) + + def _get_conv_bn_layer(self, in_channels, out_channels, kernel_size=11, + stride=1, dilation=1, padding=0, bias=False): + layers = [ + MaskedConv1d(in_channels, out_channels, kernel_size, stride=stride, + dilation=dilation, padding=padding, bias=bias, + use_conv_mask=self.use_conv_mask), + nn.BatchNorm1d(out_channels, eps=1e-3, momentum=0.1) + ] + return layers + + def _get_act_dropout_layer(self, drop_prob=0.2, activation=None): + if activation is None: + activation = nn.Hardtanh(min_val=0.0, max_val=20.0) + layers = [ + activation, + nn.Dropout(p=drop_prob) + ] + return layers + + def num_weights(self): + return sum(p.numel() for p in self.parameters() if p.requires_grad) + + def forward(self, input_): + if self.use_conv_mask: + xs, lens_orig = input_ + else: + xs = input_ + lens_orig = 0 + # compute forward convolutions + out = xs[-1] + lens = lens_orig + for i, l in enumerate(self.conv): + if self.use_conv_mask and isinstance(l, MaskedConv1d): + out, lens = l((out, lens)) + else: + out = l(out) + # compute the residuals + if self.res is not None: + for i, layer in enumerate(self.res): + res_out = xs[i] + for j, res_layer in enumerate(layer): + if j == 0 and self.use_conv_mask: + res_out, _ = res_layer((res_out, lens_orig)) + else: + res_out = res_layer(res_out) + out += res_out + + # compute the output + out = self.out(out) + if self.res is not None and self.dense_residual: + out = xs + [out] + else: + out = [out] + + if self.use_conv_mask: + return out, lens + else: + return out + +class GreedyCTCDecoder(nn.Module): + """ Greedy CTC Decoder + """ + def __init__(self, **kwargs): + nn.Module.__init__(self) # For PyTorch API + + def forward(self, log_probs): + with 
torch.no_grad(): + argmx = log_probs.argmax(dim=-1, keepdim=False).int() + return argmx + +class CTCLossNM: + """ CTC loss + """ + def __init__(self, **kwargs): + self._blank = kwargs['num_classes'] - 1 + self._criterion = nn.CTCLoss(blank=self._blank, reduction='none') + + def __call__(self, log_probs, targets, input_length, target_length): + input_length = input_length.long() + target_length = target_length.long() + targets = targets.long() + loss = self._criterion(log_probs.transpose(1, 0), targets, input_length, + target_length) + # note that this is different from reduction = 'mean' + # because we are not dividing by target lengths + return torch.mean(loss) diff --git a/rnn_speech_recognition/pytorch/model_rnnt.py b/rnn_speech_recognition/pytorch/model_rnnt.py new file mode 100644 index 000000000..242e96424 --- /dev/null +++ b/rnn_speech_recognition/pytorch/model_rnnt.py @@ -0,0 +1,289 @@ +# Copyright (c) 2019, Myrtle Software Limited. All rights reserved. +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import torch +import torch.nn as nn + +from rnn import rnn +from rnn import StackTime + +class BnReLUDropout(torch.nn.Module): + def __init__(self, input_size, dropout): + super(BnReLUDropout, self).__init__() + self.bn = torch.nn.BatchNorm1d(input_size) + self.relu = torch.nn.ReLU() + self.dropout = nn.Dropout(p=dropout) + + def forward(self, x): + x = self.bn(x) + x = self.relu(x) + x = self.dropout(x) + return x + +class RNNT(torch.nn.Module): + """A Recurrent Neural Network Transducer (RNN-T). + + Args: + in_features: Number of input features per step per batch. + vocab_size: Number of output symbols (inc blank). + forget_gate_bias: Total initialized value of the bias used in the + forget gate. Set to None to use PyTorch's default initialisation. + (See: http://proceedings.mlr.press/v37/jozefowicz15.pdf) + batch_norm: Use batch normalization in encoder and prediction network + if true. + encoder_n_hidden: Internal hidden unit size of the encoder. + encoder_rnn_layers: Encoder number of layers. + pred_n_hidden: Internal hidden unit size of the prediction network. + pred_rnn_layers: Prediction network number of layers. + joint_n_hidden: Internal hidden unit size of the joint network. + rnn_type: string. Type of rnn in SUPPORTED_RNNS. 
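+
+    The network has three parts: an encoder (pre_rnn -> StackTime -> post_rnn)
+    that subsamples the acoustic features in time by `encoder_stack_time_factor`,
+    a prediction network (embedding + RNN) over the previously emitted labels,
+    and a joint network that concatenates the two hidden states and projects
+    them to `vocab_size` logits. `forward` returns logits of shape
+    (batch, time, label_len + 1, vocab_size) together with the time-subsampled
+    feature lengths and the label lengths.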
+ """ + def __init__(self, rnnt=None, num_classes=1, **kwargs): + super().__init__() + if kwargs.get("no_featurizer", False): + in_features = kwargs.get("in_features") + else: + feat_config = kwargs.get("feature_config") + in_features = feat_config['features'] * feat_config.get("frame_splicing", 1) + + self._pred_n_hidden = rnnt['pred_n_hidden'] + + self.encoder_n_hidden = rnnt["encoder_n_hidden"] + self.encoder_pre_rnn_layers = rnnt["encoder_pre_rnn_layers"] + self.encoder_post_rnn_layers = rnnt["encoder_post_rnn_layers"] + + self.pred_n_hidden = rnnt["pred_n_hidden"] + self.pred_rnn_layers = rnnt["pred_rnn_layers"] + + self.encoder = self._encoder( + in_features, + rnnt["encoder_n_hidden"], + rnnt["encoder_pre_rnn_layers"], + rnnt["encoder_post_rnn_layers"], + rnnt["forget_gate_bias"], + None if "norm" not in rnnt else rnnt["norm"], + rnnt["rnn_type"], + rnnt["encoder_stack_time_factor"], + rnnt["dropout"], + ) + + self.prediction = self._predict( + num_classes, + rnnt["pred_n_hidden"], + rnnt["pred_rnn_layers"], + rnnt["forget_gate_bias"], + None if "norm" not in "rnnt" else rnnt["norm"], + rnnt["rnn_type"], + rnnt["dropout"], + ) + + self.joint_net = self._joint_net( + num_classes, + rnnt["pred_n_hidden"], + rnnt["encoder_n_hidden"], + rnnt["joint_n_hidden"], + rnnt["dropout"], + ) + + def _encoder(self, in_features, encoder_n_hidden, + encoder_pre_rnn_layers, encoder_post_rnn_layers, + forget_gate_bias, norm, rnn_type, encoder_stack_time_factor, + dropout): + layers = torch.nn.ModuleDict({ + "pre_rnn": rnn( + rnn=rnn_type, + input_size=in_features, + hidden_size=encoder_n_hidden, + num_layers=encoder_pre_rnn_layers, + norm=norm, + forget_gate_bias=forget_gate_bias, + dropout=dropout, + ), + "stack_time": StackTime(factor=encoder_stack_time_factor), + "post_rnn": rnn( + rnn=rnn_type, + input_size=encoder_stack_time_factor*encoder_n_hidden, + hidden_size=encoder_n_hidden, + num_layers=encoder_post_rnn_layers, + norm=norm, + forget_gate_bias=forget_gate_bias, + norm_first_rnn=True, + dropout=dropout, + ), + }) + return layers + + def _predict(self, vocab_size, pred_n_hidden, pred_rnn_layers, + forget_gate_bias, norm, rnn_type, dropout): + layers = torch.nn.ModuleDict({ + "embed": torch.nn.Embedding(vocab_size - 1, pred_n_hidden), + "dec_rnn": rnn( + rnn=rnn_type, + input_size=pred_n_hidden, + hidden_size=pred_n_hidden, + num_layers=pred_rnn_layers, + norm=norm, + forget_gate_bias=forget_gate_bias, + dropout=dropout, + ), + }) + return layers + + def _joint_net(self, vocab_size, pred_n_hidden, enc_n_hidden, + joint_n_hidden, dropout): + layers = [ + torch.nn.Linear(pred_n_hidden + enc_n_hidden, joint_n_hidden), + torch.nn.ReLU(), + ] + ([ torch.nn.Dropout(p=dropout), ] if dropout else [ ]) + [ + torch.nn.Linear(joint_n_hidden, vocab_size) + ] + return torch.nn.Sequential( + *layers + ) + + def forward(self, batch, state=None): + # batch: ((x, y), (x_lens, y_lens)) + + # x: (B, channels, features, seq_len) + (x, y), (x_lens, y_lens) = batch + y = label_collate(y) + + f, x_lens = self.encode((x, x_lens)) + + g, _ = self.predict(y, state) + out = self.joint(f, g) + + return out, (x_lens, y_lens) + + def encode(self, x): + """ + Args: + x: tuple of ``(input, input_lens)``. ``input`` has shape (T, B, I), + ``input_lens`` has shape ``(B,)``. + + Returns: + f: tuple of ``(output, output_lens)``. 
``output`` has shape + (B, T, H), ``output_lens`` + """ + x, x_lens = x + x, _ = self.encoder["pre_rnn"](x, None) + x, x_lens = self.encoder["stack_time"]((x, x_lens)) + x, _ = self.encoder["post_rnn"](x, None) + + return x.transpose(0, 1), x_lens + + def predict(self, y, state=None, add_sos=True): + """ + B - batch size + U - label length + H - Hidden dimension size + L - Number of decoder layers = 2 + + Args: + y: (B, U) + + Returns: + Tuple (g, hid) where: + g: (B, U + 1, H) + hid: (h, c) where h is the final sequence hidden state and c is + the final cell state: + h (tensor), shape (L, B, H) + c (tensor), shape (L, B, H) + """ + if y is not None: + # (B, U) -> (B, U, H) + y = self.prediction["embed"](y) + else: + B = 1 if state is None else state[0].size(1) + y = torch.zeros((B, 1, self.pred_n_hidden)).to( + device=self.joint_net[0].weight.device, + dtype=self.joint_net[0].weight.dtype + ) + + # preprend blank "start of sequence" symbol + if add_sos: + B, U, H = y.shape + start = torch.zeros((B, 1, H)).to(device=y.device, dtype=y.dtype) + y = torch.cat([start, y], dim=1).contiguous() # (B, U + 1, H) + else: + start = None # makes del call later easier + + #if state is None: + # batch = y.size(0) + # state = [ + # (torch.zeros(batch, self.pred_n_hidden, dtype=y.dtype, device=y.device), + # torch.zeros(batch, self.pred_n_hidden, dtype=y.dtype, device=y.device)) + # for _ in range(self.pred_rnn_layers) + # ] + + y = y.transpose(0, 1)#.contiguous() # (U + 1, B, H) + g, hid = self.prediction["dec_rnn"](y, state) + g = g.transpose(0, 1)#.contiguous() # (B, U + 1, H) + del y, start, state + return g, hid + + def joint(self, f, g): + """ + f should be shape (B, T, H) + g should be shape (B, U + 1, H) + + returns: + logits of shape (B, T, U, K + 1) + """ + # Combine the input states and the output states + B, T, H = f.shape + B, U_, H2 = g.shape + + f = f.unsqueeze(dim=2) # (B, T, 1, H) + f = f.expand((B, T, U_, H)) + + g = g.unsqueeze(dim=1) # (B, 1, U + 1, H) + g = g.expand((B, T, U_, H2)) + + inp = torch.cat([f, g], dim=3) # (B, T, U, 2H) + res = self.joint_net(inp) + del f, g, inp + return res + + +def label_collate(labels): + """Collates the label inputs for the rnn-t prediction network. + + If `labels` is already in torch.Tensor form this is a no-op. + + Args: + labels: A torch.Tensor List of label indexes or a torch.Tensor. + + Returns: + A padded torch.Tensor of shape (batch, max_seq_len). + """ + + if isinstance(labels, torch.Tensor): + return labels.type(torch.int64) + if not isinstance(labels, (list, tuple)): + raise ValueError( + f"`labels` should be a list or tensor not {type(labels)}" + ) + + batch_size = len(labels) + max_len = max(len(l) for l in labels) + + cat_labels = np.full((batch_size, max_len), fill_value=0.0, dtype=np.int32) + for e, l in enumerate(labels): + cat_labels[e, :len(l)] = l + labels = torch.LongTensor(cat_labels) + + return labels diff --git a/rnn_speech_recognition/pytorch/multiproc.py b/rnn_speech_recognition/pytorch/multiproc.py new file mode 100644 index 000000000..eecba31a8 --- /dev/null +++ b/rnn_speech_recognition/pytorch/multiproc.py @@ -0,0 +1,190 @@ +# From PyTorch: +# +# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. 
+# Copyright (c) 2016- Facebook, Inc (Adam Paszke) +# Copyright (c) 2014- Facebook, Inc (Soumith Chintala) +# Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert) +# Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu) +# Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu) +# Copyright (c) 2011-2013 NYU (Clement Farabet) +# Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston) +# Copyright (c) 2006 Idiap Research Institute (Samy Bengio) +# Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz) +# +# From Caffe2: +# +# Copyright (c) 2016-present, Facebook Inc. All rights reserved. +# +# All contributions by Facebook: +# Copyright (c) 2016 Facebook Inc. +# +# All contributions by Google: +# Copyright (c) 2015 Google Inc. +# All rights reserved. +# +# All contributions by Yangqing Jia: +# Copyright (c) 2015 Yangqing Jia +# All rights reserved. +# +# All contributions from Caffe: +# Copyright(c) 2013, 2014, 2015, the respective contributors +# All rights reserved. +# +# All other contributions: +# Copyright(c) 2015, 2016 the respective contributors +# All rights reserved. +# +# Caffe2 uses a copyright model similar to Caffe: each contributor holds +# copyright over their contributions to Caffe2. The project versioning records +# all such contribution and copyright details. If a contributor wants to further +# mark their specific copyright on a particular contribution, they should +# indicate their copyright solely in the commit message of the change when it is +# committed. +# +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# 3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America +# and IDIAP Research Institute nor the names of its contributors may be +# used to endorse or promote products derived from this software without +# specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
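+
+# This launcher spawns `--nproc_per_node` copies of the given training script (one per
+# GPU) and hands MASTER_ADDR, MASTER_PORT, WORLD_SIZE, RANK and LOCAL_RANK to each copy
+# through its environment, so the script can call
+# torch.distributed.init_process_group(backend='nccl', init_method='env://').
+# Stdout of every rank other than local rank 0 is redirected to GPU_<local_rank>.log.
+# Illustrative invocation (the arguments after train.py are placeholders, not a
+# prescribed command line):
+#
+#   python multiproc.py --nnodes=1 --node_rank=0 --nproc_per_node=8 \
+#       train.py --model_toml=configs/rnnt.toml --dataset_dir=/datasets/LibriSpeech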
+import sys +import subprocess +import os +import socket +import time +from argparse import ArgumentParser, REMAINDER + +import torch + +def parse_args(): + """ + Helper function parsing the command line options + @retval ArgumentParser + """ + parser = ArgumentParser(description="PyTorch distributed training launch " + "helper utilty that will spawn up " + "multiple distributed processes") + + # Optional arguments for the launch helper + parser.add_argument("--nnodes", type=int, default=1, + help="The number of nodes to use for distributed " + "training") + parser.add_argument("--node_rank", type=int, default=0, + help="The rank of the node for multi-node distributed " + "training") + parser.add_argument("--nproc_per_node", type=int, default=1, + help="The number of processes to launch on each node, " + "for GPU training, this is recommended to be set " + "to the number of GPUs in your system so that " + "each process can be bound to a single GPU.") + parser.add_argument("--master_addr", default="127.0.0.1", type=str, + help="Master node (rank 0)'s address, should be either " + "the IP address or the hostname of node 0, for " + "single node multi-proc training, the " + "--master_addr can simply be 127.0.0.1") + parser.add_argument("--master_port", default=29500, type=int, + help="Master node (rank 0)'s free port that needs to " + "be used for communciation during distributed " + "training") + + # positional + parser.add_argument("training_script", type=str, + help="The full path to the single GPU training " + "program/script to be launched in parallel, " + "followed by all the arguments for the " + "training script") + + # rest from the training program + parser.add_argument('training_script_args', nargs=REMAINDER) + return parser.parse_args() + + +def main(): + args = parse_args() + + # world size in terms of number of processes + dist_world_size = args.nproc_per_node * args.nnodes + + # set PyTorch distributed related environmental variables + current_env = os.environ.copy() + current_env["MASTER_ADDR"] = args.master_addr + current_env["MASTER_PORT"] = str(args.master_port) + current_env["WORLD_SIZE"] = str(dist_world_size) + + processes = [] + + for local_rank in range(0, args.nproc_per_node): + # each process's rank + dist_rank = args.nproc_per_node * args.node_rank + local_rank + current_env["RANK"] = str(dist_rank) + current_env["LOCAL_RANK"] = str(local_rank) + + # spawn the processes + cmd = [sys.executable, + "-u", + args.training_script] + args.training_script_args + + print(cmd) + + stdout = None if local_rank == 0 else open("GPU_"+str(local_rank)+".log", "w") + + process = subprocess.Popen(cmd, env=current_env, stdout=stdout) + processes.append(process) + + try: + up = True + error = False + while up and not error: + up = False + for p in processes: + ret = p.poll() + if ret is None: + up = True + elif ret != 0: + error = True + time.sleep(1) + + if error: + for p in processes: + if p.poll() is None: + p.terminate() + exit(1) + + except KeyboardInterrupt: + for p in processes: + p.terminate() + raise + except SystemExit: + for p in processes: + p.terminate() + raise + except: + for p in processes: + p.terminate() + raise + + +if __name__ == "__main__": + main() diff --git a/rnn_speech_recognition/pytorch/optimizers.py b/rnn_speech_recognition/pytorch/optimizers.py new file mode 100644 index 000000000..da17030dd --- /dev/null +++ b/rnn_speech_recognition/pytorch/optimizers.py @@ -0,0 +1,223 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +from torch.optim import Optimizer +import math + +class AdamW(Optimizer): + """Implements AdamW algorithm. + + It has been proposed in `Adam: A Method for Stochastic Optimization`_. + + Arguments: + params (iterable): iterable of parameters to optimize or dicts defining + parameter groups + lr (float, optional): learning rate (default: 1e-3) + betas (Tuple[float, float], optional): coefficients used for computing + running averages of gradient and its square (default: (0.9, 0.999)) + eps (float, optional): term added to the denominator to improve + numerical stability (default: 1e-8) + weight_decay (float, optional): weight decay (L2 penalty) (default: 0) + amsgrad (boolean, optional): whether to use the AMSGrad variant of this + algorithm from the paper `On the Convergence of Adam and Beyond`_ + + Adam: A Method for Stochastic Optimization: + https://arxiv.org/abs/1412.6980 + On the Convergence of Adam and Beyond: + https://openreview.net/forum?id=ryQu7f-RZ + """ + + def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, + weight_decay=0, amsgrad=False): + if not 0.0 <= lr: + raise ValueError("Invalid learning rate: {}".format(lr)) + if not 0.0 <= eps: + raise ValueError("Invalid epsilon value: {}".format(eps)) + if not 0.0 <= betas[0] < 1.0: + raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) + if not 0.0 <= betas[1] < 1.0: + raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) + defaults = dict(lr=lr, betas=betas, eps=eps, + weight_decay=weight_decay, amsgrad=amsgrad) + super(AdamW, self).__init__(params, defaults) + + def __setstate__(self, state): + super(AdamW, self).__setstate__(state) + for group in self.param_groups: + group.setdefault('amsgrad', False) + + def step(self, closure=None): + """Performs a single optimization step. + + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. + """ + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + for p in group['params']: + if p.grad is None: + continue + grad = p.grad.data + if grad.is_sparse: + raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead') + amsgrad = group['amsgrad'] + + state = self.state[p] + + # State initialization + if len(state) == 0: + state['step'] = 0 + # Exponential moving average of gradient values + state['exp_avg'] = torch.zeros_like(p.data) + # Exponential moving average of squared gradient values + state['exp_avg_sq'] = torch.zeros_like(p.data) + if amsgrad: + # Maintains max of all exp. moving avg. of sq. grad. 
values + state['max_exp_avg_sq'] = torch.zeros_like(p.data) + + exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] + if amsgrad: + max_exp_avg_sq = state['max_exp_avg_sq'] + beta1, beta2 = group['betas'] + + state['step'] += 1 + # Decay the first and second moment running average coefficient + exp_avg.mul_(beta1).add_(1 - beta1, grad) + exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) + if amsgrad: + # Maintains the maximum of all 2nd moment running avg. till now + torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq) + # Use the max. for normalizing running avg. of gradient + denom = max_exp_avg_sq.sqrt().add_(group['eps']) + else: + denom = exp_avg_sq.sqrt().add_(group['eps']) + + bias_correction1 = 1 - beta1 ** state['step'] + bias_correction2 = 1 - beta2 ** state['step'] + step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1 + p.data.add_(-step_size, torch.mul(p.data, group['weight_decay']).addcdiv_(1, exp_avg, denom) ) + + return loss + +class Novograd(Optimizer): + """ + Implements Novograd algorithm. + + Args: + params (iterable): iterable of parameters to optimize or dicts defining + parameter groups + lr (float, optional): learning rate (default: 1e-3) + betas (Tuple[float, float], optional): coefficients used for computing + running averages of gradient and its square (default: (0.95, 0)) + eps (float, optional): term added to the denominator to improve + numerical stability (default: 1e-8) + weight_decay (float, optional): weight decay (L2 penalty) (default: 0) + grad_averaging: gradient averaging + amsgrad (boolean, optional): whether to use the AMSGrad variant of this + algorithm from the paper `On the Convergence of Adam and Beyond`_ + (default: False) + """ + + def __init__(self, params, lr=1e-3, betas=(0.95, 0), eps=1e-8, + weight_decay=0, grad_averaging=False, amsgrad=False): + if not 0.0 <= lr: + raise ValueError("Invalid learning rate: {}".format(lr)) + if not 0.0 <= eps: + raise ValueError("Invalid epsilon value: {}".format(eps)) + if not 0.0 <= betas[0] < 1.0: + raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) + if not 0.0 <= betas[1] < 1.0: + raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) + defaults = dict(lr=lr, betas=betas, eps=eps, + weight_decay=weight_decay, + grad_averaging=grad_averaging, + amsgrad=amsgrad) + + super(Novograd, self).__init__(params, defaults) + + def __setstate__(self, state): + super(Novograd, self).__setstate__(state) + for group in self.param_groups: + group.setdefault('amsgrad', False) + + def step(self, closure=None): + """Performs a single optimization step. + + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. + """ + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + for p in group['params']: + if p.grad is None: + continue + grad = p.grad.data + if grad.is_sparse: + raise RuntimeError('Sparse gradients are not supported.') + amsgrad = group['amsgrad'] + + state = self.state[p] + + # State initialization + if len(state) == 0: + state['step'] = 0 + # Exponential moving average of gradient values + state['exp_avg'] = torch.zeros_like(p.data) + # Exponential moving average of squared gradient values + state['exp_avg_sq'] = torch.zeros([]).to(state['exp_avg'].device) + if amsgrad: + # Maintains max of all exp. moving avg. of sq. grad. 
values + state['max_exp_avg_sq'] = torch.zeros([]).to(state['exp_avg'].device) + + exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] + if amsgrad: + max_exp_avg_sq = state['max_exp_avg_sq'] + beta1, beta2 = group['betas'] + + state['step'] += 1 + + norm = torch.sum(torch.pow(grad, 2)) + + if exp_avg_sq == 0: + exp_avg_sq.copy_(norm) + else: + exp_avg_sq.mul_(beta2).add_(1 - beta2, norm) + + if amsgrad: + # Maintains the maximum of all 2nd moment running avg. till now + torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq) + # Use the max. for normalizing running avg. of gradient + denom = max_exp_avg_sq.sqrt().add_(group['eps']) + else: + denom = exp_avg_sq.sqrt().add_(group['eps']) + + grad.div_(denom) + if group['weight_decay'] != 0: + grad.add_(group['weight_decay'], p.data) + if group['grad_averaging']: + grad.mul_(1 - beta1) + exp_avg.mul_(beta1).add_(grad) + + p.data.add_(-group['lr'], exp_avg) + + return loss diff --git a/rnn_speech_recognition/pytorch/parts/features.py b/rnn_speech_recognition/pytorch/parts/features.py new file mode 100644 index 000000000..2c80a9370 --- /dev/null +++ b/rnn_speech_recognition/pytorch/parts/features.py @@ -0,0 +1,349 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2019, Myrtle Software Limited. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
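+
+# This module implements GPU-side feature extraction for the acoustic model. Both
+# featurizers below follow the same rough pipeline: add dither noise, apply optional
+# pre-emphasis, take an STFT, build a (mel-filterbank or magnitude) spectrogram, take
+# the log, optionally splice neighbouring frames, normalize per feature or per
+# utterance, and finally truncate to the longest valid length in the batch (any
+# non-zero pad_to currently raises NotImplementedError). FeatureFactory selects the
+# featurizer class based on cfg['feat_type'].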
+ +import torch +import torch.nn as nn +import math +import librosa +from .perturb import AudioAugmentor +from .segment import AudioSegment +from apex import amp + + +def audio_from_file(file_path, offset=0, duration=0, trim=False, target_sr=16000): + audio = AudioSegment.from_file(file_path, + target_sr=target_sr, + int_values=False, + offset=offset, duration=duration, trim=trim) + samples=torch.tensor(audio.samples, dtype=torch.float).cuda() + num_samples = torch.tensor(samples.shape[0]).int().cuda() + return (samples.unsqueeze(0), num_samples.unsqueeze(0)) + +class WaveformFeaturizer(object): + def __init__(self, input_cfg, augmentor=None): + self.augmentor = augmentor if augmentor is not None else AudioAugmentor() + self.cfg = input_cfg + + def max_augmentation_length(self, length): + return self.augmentor.max_augmentation_length(length) + + def process(self, file_path, offset=0, duration=0, trim=False): + audio = AudioSegment.from_file(file_path, + target_sr=self.cfg['sample_rate'], + int_values=self.cfg.get('int_values', False), + offset=offset, duration=duration, trim=trim) + return self.process_segment(audio) + + def process_segment(self, audio_segment): + self.augmentor.perturb(audio_segment) + return torch.tensor(audio_segment.samples, dtype=torch.float) + + @classmethod + def from_config(cls, input_config, perturbation_configs=None): + if perturbation_configs is not None: + aa = AudioAugmentor.from_config(perturbation_configs) + else: + aa = None + + return cls(input_config, augmentor=aa) + +constant = 1e-5 +def normalize_batch(x, seq_len, normalize_type): + if normalize_type == "per_feature": + x_mean = torch.zeros((seq_len.shape[0], x.shape[1]), dtype=x.dtype, + device=x.device) + x_std = torch.zeros((seq_len.shape[0], x.shape[1]), dtype=x.dtype, + device=x.device) + for i in range(x.shape[0]): + x_mean[i, :] = x[i, :, :seq_len[i]].mean(dim=1) + x_std[i, :] = x[i, :, :seq_len[i]].std(dim=1) + # make sure x_std is not zero + x_std += constant + return (x - x_mean.unsqueeze(2)) / x_std.unsqueeze(2) + elif normalize_type == "all_features": + x_mean = torch.zeros(seq_len.shape, dtype=x.dtype, device=x.device) + x_std = torch.zeros(seq_len.shape, dtype=x.dtype, device=x.device) + for i in range(x.shape[0]): + x_mean[i] = x[i, :, :seq_len[i].item()].mean() + x_std[i] = x[i, :, :seq_len[i].item()].std() + # make sure x_std is not zero + x_std += constant + return (x - x_mean.view(-1, 1, 1)) / x_std.view(-1, 1, 1) + else: + return x + +def splice_frames(x, frame_splicing): + """ Stacks frames together across feature dim + + input is batch_size, feature_dim, num_frames + output is batch_size, feature_dim*frame_splicing, num_frames + + """ + seq = [x] + for n in range(1, frame_splicing): + tmp = torch.zeros_like(x) + tmp[:, :, :-n] = x[:, :, n:] + seq.append(tmp) + return torch.cat(seq, dim=1)[:, :, ::frame_splicing] + +class SpectrogramFeatures(nn.Module): + def __init__(self, sample_rate=8000, window_size=0.02, window_stride=0.01, + n_fft=None, + window="hamming", normalize="per_feature", log=True, center=True, + dither=constant, pad_to=8, max_duration=16.7, + frame_splicing=1): + super(SpectrogramFeatures, self).__init__() + torch_windows = { + 'hann': torch.hann_window, + 'hamming': torch.hamming_window, + 'blackman': torch.blackman_window, + 'bartlett': torch.bartlett_window, + 'none': None, + } + self.win_length = int(sample_rate * window_size) + self.hop_length = int(sample_rate * window_stride) + self.n_fft = n_fft or 2 ** math.ceil(math.log2(self.win_length)) + + window_fn = 
torch_windows.get(window, None) + window_tensor = window_fn(self.win_length, + periodic=False) if window_fn else None + self.window = window_tensor + + self.normalize = normalize + self.log = log + self.center = center + self.dither = dither + self.pad_to = pad_to + self.frame_splicing = frame_splicing + + max_length = 1 + math.ceil( + (max_duration * sample_rate - self.win_length) / self.hop_length + ) + max_pad = 16 - (max_length % 16) + self.max_length = max_length + max_pad + + def get_seq_len(self, seq_len): + x = torch.ceil(seq_len.to(dtype=torch.float) / self.hop_length).to( + dtype=torch.int) + if self.frame_splicing > 1: + x = torch.ceil(x.float() / self.frame_splicing).to(dtype=torch.int) + return x + + @torch.no_grad() + def forward(self, inp): + x, seq_len = inp + dtype = x.dtype + + seq_len = self.get_seq_len(seq_len) + + # dither + if self.dither > 0: + x += self.dither * torch.randn_like(x) + + # do preemphasis + if hasattr(self,'preemph') and self.preemph is not None: + x = torch.cat((x[:, 0].unsqueeze(1), x[:, 1:] - self.preemph * x[:, :-1]), + dim=1) + + # get spectrogram + x = torch.stft(x, n_fft=self.n_fft, hop_length=self.hop_length, + win_length=self.win_length, center=self.center, + window=self.window.to(torch.float)) + x = torch.sqrt(x.pow(2).sum(-1)) + + # log features if required + if self.log: + x = torch.log(x + 1e-20) + + # frame splicing if required + if self.frame_splicing > 1: + x = splice_frames(x, self.frame_splicing) + + # normalize if required + if self.normalize: + x = normalize_batch(x, seq_len, normalize_type=self.normalize) + + # mask to zero any values beyond seq_len in batch, pad to multiple of `pad_to` (for efficiency) + #max_len = x.size(-1) + #mask = torch.arange(max_len).to(seq_len.dtype).to(seq_len.device).expand(x.size(0), max_len) >= seq_len.unsqueeze(1) + #x = x.masked_fill(mask.unsqueeze(1).to(device=x.device), 0) + #del mask + x = x[:, :, :seq_len.max()] # rnnt loss requires lengths to match + pad_to = self.pad_to + if pad_to != 0: + raise NotImplementedError() + #if pad_to == "max": + # x = nn.functional.pad(x, (0, self.max_length - x.size(-1))) + #elif pad_to > 0: + # pad_amt = x.size(-1) % pad_to + # if pad_amt != 0: + # x = nn.functional.pad(x, (0, pad_to - pad_amt)) + + return x.to(dtype) + + @classmethod + def from_config(cls, cfg, log=False): + return cls(sample_rate=cfg['sample_rate'], window_size=cfg['window_size'], + window_stride=cfg['window_stride'], + n_fft=cfg['n_fft'], window=cfg['window'], + normalize=cfg['normalize'], + max_duration=cfg.get('max_duration', 16.7), + dither=cfg.get('dither', 1e-5), pad_to=cfg.get("pad_to", 0), + frame_splicing=cfg.get("frame_splicing", 1), log=log) + +class FilterbankFeatures(nn.Module): + def __init__(self, sample_rate=8000, window_size=0.02, window_stride=0.01, + window="hamming", normalize="per_feature", n_fft=None, + preemph=0.97, + nfilt=64, lowfreq=0, highfreq=None, log=True, dither=constant, + pad_to=8, + max_duration=16.7, + frame_splicing=1): + super(FilterbankFeatures, self).__init__() +# print("PADDING: {}".format(pad_to)) + + torch_windows = { + 'hann': torch.hann_window, + 'hamming': torch.hamming_window, + 'blackman': torch.blackman_window, + 'bartlett': torch.bartlett_window, + 'none': None, + } + + self.win_length = int(sample_rate * window_size) # frame size + self.hop_length = int(sample_rate * window_stride) + self.n_fft = n_fft or 2 ** math.ceil(math.log2(self.win_length)) + + self.normalize = normalize + self.log = log + self.dither = dither + self.frame_splicing = 
frame_splicing + self.nfilt = nfilt + self.preemph = preemph + self.pad_to = pad_to + highfreq = highfreq or sample_rate / 2 + window_fn = torch_windows.get(window, None) + window_tensor = window_fn(self.win_length, + periodic=False) if window_fn else None + filterbanks = torch.tensor( + librosa.filters.mel(sample_rate, self.n_fft, n_mels=nfilt, fmin=lowfreq, + fmax=highfreq), dtype=torch.float).unsqueeze(0) + # self.fb = filterbanks + # self.window = window_tensor + self.register_buffer("fb", filterbanks) + self.register_buffer("window", window_tensor) + # Calculate maximum sequence length (# frames) + max_length = 1 + math.ceil( + (max_duration * sample_rate - self.win_length) / self.hop_length + ) + max_pad = 16 - (max_length % 16) + self.max_length = max_length + max_pad + + + def get_seq_len(self, seq_len): + x = torch.ceil(seq_len.to(dtype=torch.float) / self.hop_length).to( + dtype=torch.int) + # dtype=torch.long) + if self.frame_splicing > 1: + x = torch.ceil(x.float() / self.frame_splicing).to(dtype=torch.int) + return x + + @torch.no_grad() + def forward(self, inp): + x, seq_len = inp + + dtype = x.dtype + + seq_len = self.get_seq_len(seq_len) + + # dither + if self.dither > 0: + x += self.dither * torch.randn_like(x) + + # do preemphasis + if self.preemph is not None: + x = torch.cat((x[:, 0].unsqueeze(1), x[:, 1:] - self.preemph * x[:, :-1]), + dim=1) + + # do stft + x = torch.stft(x, n_fft=self.n_fft, hop_length=self.hop_length, + win_length=self.win_length, + center=True, window=self.window.to(dtype=torch.float)) + + # get power spectrum + x = x.pow(2).sum(-1) + + # dot with filterbank energies + x = torch.matmul(self.fb.to(x.dtype), x) + + # log features if required + if self.log: + x = torch.log(x + 1e-20) + + # frame splicing if required + if self.frame_splicing > 1: + x = splice_frames(x, self.frame_splicing) + + # normalize if required + if self.normalize: + x = normalize_batch(x, seq_len, normalize_type=self.normalize) + + # mask to zero any values beyond seq_len in batch, pad to multiple of `pad_to` (for efficiency) + #max_len = x.size(-1) + x = x[:, :, :seq_len.max()] # rnnt loss requires lengths to match + #mask = torch.arange(max_len).to(seq_len.dtype).to(x.device).expand(x.size(0), + # max_len) >= seq_len.unsqueeze(1) + + #x = x.masked_fill(mask.unsqueeze(1).to(device=x.device), 0) + pad_to = self.pad_to + if pad_to != 0: + raise NotImplementedError() + #if pad_to == "max": + # x = nn.functional.pad(x, (0, self.max_length - x.size(-1))) + #elif pad_to > 0: + # pad_amt = x.size(-1) % pad_to + # if pad_amt != 0: + # x = nn.functional.pad(x, (0, pad_to - pad_amt)) + + return x.to(dtype) + + @classmethod + def from_config(cls, cfg, log=False): + return cls(sample_rate=cfg['sample_rate'], window_size=cfg['window_size'], + window_stride=cfg['window_stride'], n_fft=cfg['n_fft'], + nfilt=cfg['features'], window=cfg['window'], + normalize=cfg['normalize'], + max_duration=cfg.get('max_duration', 16.7), + dither=cfg['dither'], pad_to=cfg.get("pad_to", 0), + frame_splicing=cfg.get("frame_splicing", 1), log=log) + +class FeatureFactory(object): + featurizers = { + "logfbank": FilterbankFeatures, + "fbank": FilterbankFeatures, + "stft": SpectrogramFeatures, + "logspect": SpectrogramFeatures, + "logstft": SpectrogramFeatures + } + + def __init__(self): + pass + + @classmethod + def from_config(cls, cfg): + feat_type = cfg.get('feat_type', "logspect") + featurizer = cls.featurizers[feat_type] + #return featurizer.from_config(cfg, log="log" in cfg['feat_type']) + return 
featurizer.from_config(cfg, log="log" in feat_type) diff --git a/rnn_speech_recognition/pytorch/parts/manifest.py b/rnn_speech_recognition/pytorch/parts/manifest.py new file mode 100644 index 000000000..08cd7b564 --- /dev/null +++ b/rnn_speech_recognition/pytorch/parts/manifest.py @@ -0,0 +1,170 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import re +import string +import numpy as np +import os + +from .text import _clean_text + + +def normalize_string(s, labels, table, **unused_kwargs): + """ + Normalizes string. For example: + 'call me at 8:00 pm!' -> 'call me at eight zero pm' + + Args: + s: string to normalize + labels: labels used during model training. + + Returns: + Normalized string + """ + + def good_token(token, labels): + s = set(labels) + for t in token: + if not t in s: + return False + return True + + try: + text = _clean_text(s, ["english_cleaners"], table).strip() + return ''.join([t for t in text if good_token(t, labels=labels)]) + except: + print("WARNING: Normalizing {} failed".format(s)) + return None + +class Manifest(object): + def __init__(self, data_dir, manifest_paths, labels, blank_index, max_duration=None, pad_to_max=False, + min_duration=None, sort_by_duration=False, max_utts=0, + normalize=True, speed_perturbation=False, filter_speed=1.0): + self.labels_map = dict([(labels[i], i) for i in range(len(labels))]) + self.blank_index = blank_index + self.max_duration= max_duration + ids = [] + duration = 0.0 + filtered_duration = 0.0 + + # If removing punctuation, make a list of punctuation to remove + table = None + if normalize: + # Punctuation to remove + punctuation = string.punctuation + punctuation = punctuation.replace("+", "") + punctuation = punctuation.replace("&", "") + ### We might also want to consider: + ### @ -> at + ### # -> number, pound, hashtag + ### ~ -> tilde + ### _ -> underscore + ### % -> percent + # If a punctuation symbol is inside our vocab, we do not remove from text + for l in labels: + punctuation = punctuation.replace(l, "") + # Turn all punctuation to whitespace + table = str.maketrans(punctuation, " " * len(punctuation)) + for manifest_path in manifest_paths: + with open(manifest_path, "r", encoding="utf-8") as fh: + a=json.load(fh) + for data in a: + files_and_speeds = data['files'] + + if pad_to_max: + if not speed_perturbation: + min_speed = filter_speed + else: + min_speed = min(x['speed'] for x in files_and_speeds) + max_duration = self.max_duration * min_speed + + data['duration'] = data['original_duration'] + if min_duration is not None and data['duration'] < min_duration: + filtered_duration += data['duration'] + continue + if max_duration is not None and data['duration'] > max_duration: + filtered_duration += data['duration'] + continue + + # Prune and normalize according to transcript + transcript_text = data[ + 'transcript'] if "transcript" in data else self.load_transcript( + data['text_filepath']) + if normalize: + transcript_text = 
normalize_string(transcript_text, labels=labels, + table=table) + if not isinstance(transcript_text, str): + print( + "WARNING: Got transcript: {}. It is not a string. Dropping data point".format( + transcript_text)) + filtered_duration += data['duration'] + continue + data["transcript"] = self.parse_transcript(transcript_text) # convert to vocab indices + + if speed_perturbation: + audio_paths = [x['fname'] for x in files_and_speeds] + data['audio_duration'] = [x['duration'] for x in files_and_speeds] + else: + audio_paths = [x['fname'] for x in files_and_speeds if x['speed'] == filter_speed] + data['audio_duration'] = [x['duration'] for x in files_and_speeds if x['speed'] == filter_speed] + data['audio_filepath'] = [os.path.join(data_dir, x) for x in audio_paths] + data.pop('files') + data.pop('original_duration') + + ids.append(data) + duration += data['duration'] + + if max_utts > 0 and len(ids) >= max_utts: + print( + 'Stopping parsing %s as max_utts=%d' % (manifest_path, max_utts)) + break + + if sort_by_duration: + ids = sorted(ids, key=lambda x: x['duration']) + self._data = ids + self._size = len(ids) + self._duration = duration + self._filtered_duration = filtered_duration + + def load_transcript(self, transcript_path): + with open(transcript_path, 'r', encoding="utf-8") as transcript_file: + transcript = transcript_file.read().replace('\n', '') + return transcript + + def parse_transcript(self, transcript): + chars = [self.labels_map.get(x, self.blank_index) for x in list(transcript)] + transcript = list(filter(lambda x: x != self.blank_index, chars)) + return transcript + + def __getitem__(self, item): + return self._data[item] + + def __len__(self): + return self._size + + def __iter__(self): + return iter(self._data) + + @property + def duration(self): + return self._duration + + @property + def filtered_duration(self): + return self._filtered_duration + + @property + def data(self): + return list(self._data) diff --git a/rnn_speech_recognition/pytorch/parts/perturb.py b/rnn_speech_recognition/pytorch/parts/perturb.py new file mode 100644 index 000000000..b8ff0f50a --- /dev/null +++ b/rnn_speech_recognition/pytorch/parts/perturb.py @@ -0,0 +1,111 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
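The perturbation pipeline defined in this file is driven by a list of specs, each naming a perturbation type, an application probability, and keyword arguments for the corresponding class (see AudioAugmentor.from_config below). A hypothetical config, shown only to illustrate the expected shape; the rates and probabilities here are made up:

perturbation_config = [
    {"aug_type": "speed", "prob": 0.5,
     "cfg": {"min_speed_rate": 0.9, "max_speed_rate": 1.1}},
    {"aug_type": "gain", "prob": 0.3,
     "cfg": {"min_gain_dbfs": -6, "max_gain_dbfs": 6}},
]
# augmentor = AudioAugmentor.from_config(perturbation_config)
# augmentor.perturb(segment)  # applies at most the first perturbation whose prob fires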
+ +import random +import librosa +from .manifest import Manifest +from .segment import AudioSegment + + +class Perturbation(object): + def max_augmentation_length(self, length): + return length + + def perturb(self, data): + raise NotImplementedError + + +class SpeedPerturbation(Perturbation): + def __init__(self, min_speed_rate=0.85, max_speed_rate=1.15, rng=None): + self._min_rate = min_speed_rate + self._max_rate = max_speed_rate + self._rng = random.Random() if rng is None else rng + + def max_augmentation_length(self, length): + return length * self._max_rate + + def perturb(self, data): + speed_rate = self._rng.uniform(self._min_rate, self._max_rate) + if speed_rate <= 0: + raise ValueError("speed_rate should be greater than zero.") + data._samples = librosa.effects.time_stretch(data._samples, speed_rate) + + +class GainPerturbation(Perturbation): + def __init__(self, min_gain_dbfs=-10, max_gain_dbfs=10, rng=None): + self._min_gain_dbfs = min_gain_dbfs + self._max_gain_dbfs = max_gain_dbfs + self._rng = random.Random() if rng is None else rng + + def perturb(self, data): + gain = self._rng.uniform(self._min_gain_dbfs, self._max_gain_dbfs) + data._samples = data._samples * (10. ** (gain / 20.)) + + + +class ShiftPerturbation(Perturbation): + def __init__(self, min_shift_ms=-5.0, max_shift_ms=5.0, rng=None): + self._min_shift_ms = min_shift_ms + self._max_shift_ms = max_shift_ms + self._rng = random.Random() if rng is None else rng + + def perturb(self, data): + shift_ms = self._rng.uniform(self._min_shift_ms, self._max_shift_ms) + if abs(shift_ms) / 1000 > data.duration: + # TODO: do something smarter than just ignore this condition + return + shift_samples = int(shift_ms * data.sample_rate // 1000) + # print("DEBUG: shift:", shift_samples) + if shift_samples < 0: + data._samples[-shift_samples:] = data._samples[:shift_samples] + data._samples[:-shift_samples] = 0 + elif shift_samples > 0: + data._samples[:-shift_samples] = data._samples[shift_samples:] + data._samples[-shift_samples:] = 0 + + +perturbation_types = { + "speed": SpeedPerturbation, + "gain": GainPerturbation, + "shift": ShiftPerturbation, +} + + +class AudioAugmentor(object): + def __init__(self, perturbations=None, rng=None): + self._rng = random.Random() if rng is None else rng + self._pipeline = perturbations if perturbations is not None else [] + + def perturb(self, segment): + for (prob, p) in self._pipeline: + if self._rng.random() < prob: + p.perturb(segment) + return + + def max_augmentation_length(self, length): + newlen = length + for (prob, p) in self._pipeline: + newlen = p.max_augmentation_length(newlen) + return newlen + + @classmethod + def from_config(cls, config): + ptbs = [] + for p in config: + if p['aug_type'] not in perturbation_types: + print(p['aug_type'], "perturbation not known. Skipping.") + continue + perturbation = perturbation_types[p['aug_type']] + ptbs.append((p['prob'], perturbation(**p['cfg']))) + return cls(perturbations=ptbs) diff --git a/rnn_speech_recognition/pytorch/parts/segment.py b/rnn_speech_recognition/pytorch/parts/segment.py new file mode 100644 index 000000000..b06983941 --- /dev/null +++ b/rnn_speech_recognition/pytorch/parts/segment.py @@ -0,0 +1,170 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import librosa +import soundfile as sf + + +class AudioSegment(object): + """Monaural audio segment abstraction. + :param samples: Audio samples [num_samples x num_channels]. + :type samples: ndarray.float32 + :param sample_rate: Audio sample rate. + :type sample_rate: int + :raises TypeError: If the sample data type is not float or int. + """ + + def __init__(self, samples, sample_rate, target_sr=None, trim=False, + trim_db=60): + """Create audio segment from samples. + Samples are convert float32 internally, with int scaled to [-1, 1]. + """ + samples = self._convert_samples_to_float32(samples) + if target_sr is not None and target_sr != sample_rate: + samples = librosa.core.resample(samples, sample_rate, target_sr) + sample_rate = target_sr + if trim: + samples, _ = librosa.effects.trim(samples, trim_db) + self._samples = samples + self._sample_rate = sample_rate + if self._samples.ndim >= 2: + self._samples = np.mean(self._samples, 1) + + def __eq__(self, other): + """Return whether two objects are equal.""" + if type(other) is not type(self): + return False + if self._sample_rate != other._sample_rate: + return False + if self._samples.shape != other._samples.shape: + return False + if np.any(self.samples != other._samples): + return False + return True + + def __ne__(self, other): + """Return whether two objects are unequal.""" + return not self.__eq__(other) + + def __str__(self): + """Return human-readable representation of segment.""" + return ("%s: num_samples=%d, sample_rate=%d, duration=%.2fsec, " + "rms=%.2fdB" % (type(self), self.num_samples, self.sample_rate, + self.duration, self.rms_db)) + + @staticmethod + def _convert_samples_to_float32(samples): + """Convert sample type to float32. + Audio sample type is usually integer or float-point. + Integers will be scaled to [-1, 1] in float32. + """ + float32_samples = samples.astype('float32') + if samples.dtype in np.sctypes['int']: + bits = np.iinfo(samples.dtype).bits + float32_samples *= (1. / 2 ** (bits - 1)) + elif samples.dtype in np.sctypes['float']: + pass + else: + raise TypeError("Unsupported sample type: %s." % samples.dtype) + return float32_samples + + @classmethod + def from_file(cls, filename, target_sr=None, int_values=False, offset=0, + duration=0, trim=False): + """ + Load a file supported by librosa and return as an AudioSegment. 
+ :param filename: path of file to load + :param target_sr: the desired sample rate + :param int_values: if true, load samples as 32-bit integers + :param offset: offset in seconds when loading audio + :param duration: duration in seconds when loading audio + :return: numpy array of samples + """ + with sf.SoundFile(filename, 'r') as f: + dtype = 'int32' if int_values else 'float32' + sample_rate = f.samplerate + if offset > 0: + f.seek(int(offset * sample_rate)) + if duration > 0: + samples = f.read(int(duration * sample_rate), dtype=dtype) + else: + samples = f.read(dtype=dtype) + samples = samples.transpose() + return cls(samples, sample_rate, target_sr=target_sr, trim=trim) + + @property + def samples(self): + return self._samples.copy() + + @property + def sample_rate(self): + return self._sample_rate + + @property + def num_samples(self): + return self._samples.shape[0] + + @property + def duration(self): + return self._samples.shape[0] / float(self._sample_rate) + + @property + def rms_db(self): + mean_square = np.mean(self._samples ** 2) + return 10 * np.log10(mean_square) + + def gain_db(self, gain): + self._samples *= 10. ** (gain / 20.) + + def pad(self, pad_size, symmetric=False): + """Add zero padding to the sample. The pad size is given in number of samples. + If symmetric=True, `pad_size` will be added to both sides. If false, `pad_size` + zeros will be added only to the end. + """ + self._samples = np.pad(self._samples, + (pad_size if symmetric else 0, pad_size), + mode='constant') + + def subsegment(self, start_time=None, end_time=None): + """Cut the AudioSegment between given boundaries. + Note that this is an in-place transformation. + :param start_time: Beginning of subsegment in seconds. + :type start_time: float + :param end_time: End of subsegment in seconds. + :type end_time: float + :raise ValueError: If start_time or end_time is incorrectly set, e.g. out + of bounds in time. + """ + start_time = 0.0 if start_time is None else start_time + end_time = self.duration if end_time is None else end_time + if start_time < 0.0: + start_time = self.duration + start_time + if end_time < 0.0: + end_time = self.duration + end_time + if start_time < 0.0: + raise ValueError("The slice start position (%f s) is out of " + "bounds." % start_time) + if end_time < 0.0: + raise ValueError("The slice end position (%f s) is out of bounds." % + end_time) + if start_time > end_time: + raise ValueError("The slice start position (%f s) is later than " + "the end position (%f s)." 
% (start_time, end_time)) + if end_time > self.duration: + raise ValueError("The slice end position (%f s) is out of bounds " + "(> %f s)" % (end_time, self.duration)) + start_sample = int(round(start_time * self._sample_rate)) + end_sample = int(round(end_time * self._sample_rate)) + self._samples = self._samples[start_sample:end_sample] diff --git a/rnn_speech_recognition/pytorch/parts/text/LICENSE b/rnn_speech_recognition/pytorch/parts/text/LICENSE new file mode 100644 index 000000000..4ad4ed1d5 --- /dev/null +++ b/rnn_speech_recognition/pytorch/parts/text/LICENSE @@ -0,0 +1,19 @@ +Copyright (c) 2017 Keith Ito + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/rnn_speech_recognition/pytorch/parts/text/__init__.py b/rnn_speech_recognition/pytorch/parts/text/__init__.py new file mode 100644 index 000000000..da9e021cd --- /dev/null +++ b/rnn_speech_recognition/pytorch/parts/text/__init__.py @@ -0,0 +1,12 @@ +# Copyright (c) 2017 Keith Ito +""" from https://github.com/keithito/tacotron """ +import re +from . import cleaners + +def _clean_text(text, cleaner_names, *args): + for name in cleaner_names: + cleaner = getattr(cleaners, name) + if not cleaner: + raise Exception('Unknown cleaner: %s' % name) + text = cleaner(text, *args) + return text diff --git a/rnn_speech_recognition/pytorch/parts/text/cleaners.py b/rnn_speech_recognition/pytorch/parts/text/cleaners.py new file mode 100644 index 000000000..a99db1a62 --- /dev/null +++ b/rnn_speech_recognition/pytorch/parts/text/cleaners.py @@ -0,0 +1,107 @@ +# Copyright (c) 2017 Keith Ito +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" from https://github.com/keithito/tacotron +Modified to add puncturation removal +""" + +''' +Cleaners are transformations that run over the input text at both training and eval time. + +Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" +hyperparameter. Some cleaners are English-specific. You'll typically want to use: + 1. 
"english_cleaners" for English text + 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using + the Unidecode library (https://pypi.python.org/pypi/Unidecode) + 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update + the symbols in symbols.py to match your data). + +''' + +import re +from unidecode import unidecode +from .numbers import normalize_numbers + +# Regular expression matching whitespace: +_whitespace_re = re.compile(r'\s+') + +# List of (regular expression, replacement) pairs for abbreviations: +_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [ + ('mrs', 'misess'), + ('mr', 'mister'), + ('dr', 'doctor'), + ('st', 'saint'), + ('co', 'company'), + ('jr', 'junior'), + ('maj', 'major'), + ('gen', 'general'), + ('drs', 'doctors'), + ('rev', 'reverend'), + ('lt', 'lieutenant'), + ('hon', 'honorable'), + ('sgt', 'sergeant'), + ('capt', 'captain'), + ('esq', 'esquire'), + ('ltd', 'limited'), + ('col', 'colonel'), + ('ft', 'fort'), +]] + +def expand_abbreviations(text): + for regex, replacement in _abbreviations: + text = re.sub(regex, replacement, text) + return text + +def expand_numbers(text): + return normalize_numbers(text) + +def lowercase(text): + return text.lower() + +def collapse_whitespace(text): + return re.sub(_whitespace_re, ' ', text) + +def convert_to_ascii(text): + return unidecode(text) + +def remove_punctuation(text, table): + text = text.translate(table) + text = re.sub(r'&', " and ", text) + text = re.sub(r'\+', " plus ", text) + return text + +def basic_cleaners(text): + '''Basic pipeline that lowercases and collapses whitespace without transliteration.''' + text = lowercase(text) + text = collapse_whitespace(text) + return text + +def transliteration_cleaners(text): + '''Pipeline for non-English text that transliterates to ASCII.''' + text = convert_to_ascii(text) + text = lowercase(text) + text = collapse_whitespace(text) + return text + +def english_cleaners(text, table=None): + '''Pipeline for English text, including number and abbreviation expansion.''' + text = convert_to_ascii(text) + text = lowercase(text) + text = expand_numbers(text) + text = expand_abbreviations(text) + if table is not None: + text = remove_punctuation(text, table) + text = collapse_whitespace(text) + return text diff --git a/rnn_speech_recognition/pytorch/parts/text/numbers.py b/rnn_speech_recognition/pytorch/parts/text/numbers.py new file mode 100644 index 000000000..46ce11067 --- /dev/null +++ b/rnn_speech_recognition/pytorch/parts/text/numbers.py @@ -0,0 +1,99 @@ +# Copyright (c) 2017 Keith Ito +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" from https://github.com/keithito/tacotron +Modifed to add support for time and slight tweaks to _expand_number +""" + +import inflect +import re + + +_inflect = inflect.engine() +_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') +_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') +_pounds_re = re.compile(r'£([0-9\,]*[0-9]+)') +_dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)') +_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') +_number_re = re.compile(r'[0-9]+') +_time_re = re.compile(r'([0-9]{1,2}):([0-9]{2})') + + +def _remove_commas(m): + return m.group(1).replace(',', '') + + +def _expand_decimal_point(m): + return m.group(1).replace('.', ' point ') + + +def _expand_dollars(m): + match = m.group(1) + parts = match.split('.') + if len(parts) > 2: + return match + ' dollars' # Unexpected format + dollars = int(parts[0]) if parts[0] else 0 + cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 + if dollars and cents: + dollar_unit = 'dollar' if dollars == 1 else 'dollars' + cent_unit = 'cent' if cents == 1 else 'cents' + return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) + elif dollars: + dollar_unit = 'dollar' if dollars == 1 else 'dollars' + return '%s %s' % (dollars, dollar_unit) + elif cents: + cent_unit = 'cent' if cents == 1 else 'cents' + return '%s %s' % (cents, cent_unit) + else: + return 'zero dollars' + + +def _expand_ordinal(m): + return _inflect.number_to_words(m.group(0)) + + +def _expand_number(m): + if int(m.group(0)[0]) == 0: + return _inflect.number_to_words(m.group(0), andword='', group=1) + num = int(m.group(0)) + if num > 1000 and num < 3000: + if num == 2000: + return 'two thousand' + elif num > 2000 and num < 2010: + return 'two thousand ' + _inflect.number_to_words(num % 100) + elif num % 100 == 0: + return _inflect.number_to_words(num // 100) + ' hundred' + else: + return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ') + # Add check for number phones and other large numbers + elif num > 1000000000 and num % 10000 != 0: + return _inflect.number_to_words(num, andword='', group=1) + else: + return _inflect.number_to_words(num, andword='') + +def _expand_time(m): + mins = int(m.group(2)) + if mins == 0: + return _inflect.number_to_words(m.group(1)) + return " ".join([_inflect.number_to_words(m.group(1)), _inflect.number_to_words(m.group(2))]) + +def normalize_numbers(text): + text = re.sub(_comma_number_re, _remove_commas, text) + text = re.sub(_pounds_re, r'\1 pounds', text) + text = re.sub(_dollars_re, _expand_dollars, text) + text = re.sub(_decimal_number_re, _expand_decimal_point, text) + text = re.sub(_ordinal_re, _expand_ordinal, text) + text = re.sub(_number_re, _expand_number, text) + text = re.sub(_time_re, _expand_time, text) + return text diff --git a/rnn_speech_recognition/pytorch/parts/text/symbols.py b/rnn_speech_recognition/pytorch/parts/text/symbols.py new file mode 100644 index 000000000..24efedf8d --- /dev/null +++ b/rnn_speech_recognition/pytorch/parts/text/symbols.py @@ -0,0 +1,19 @@ +# Copyright (c) 2017 Keith Ito +""" from https://github.com/keithito/tacotron """ + +''' +Defines the set of symbols used in text input to the model. + +The default is a set of ASCII characters that works well for English or text that has been run through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. ''' +from . import cmudict + +_pad = '_' +_punctuation = '!\'(),.:;? 
' +_special = '-' +_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz' + +# Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters): +_arpabet = ['@' + s for s in cmudict.valid_symbols] + +# Export all symbols: +symbols = [_pad] + list(_special) + list(_punctuation) + list(_letters) + _arpabet diff --git a/rnn_speech_recognition/pytorch/preprocessing.py b/rnn_speech_recognition/pytorch/preprocessing.py new file mode 100644 index 000000000..eb2e5b2f3 --- /dev/null +++ b/rnn_speech_recognition/pytorch/preprocessing.py @@ -0,0 +1,123 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import random + +import torch +import torch.nn as nn +from apex import amp + +from helpers import Optimization +from parts.features import FeatureFactory + + +class SpecCutoutRegions(nn.Module): + """Cutout. refer to https://arxiv.org/pdf/1708.04552.pdf + """ + def __init__(self, cfg): + super(SpecCutoutRegions, self).__init__() + + self.cutout_rect_regions = cfg.get('cutout_rect_regions', 0) + self.cutout_rect_time = cfg.get('cutout_rect_time', 5) + self.cutout_rect_freq = cfg.get('cutout_rect_freq', 20) + + @torch.no_grad() + def forward(self, x): + sh = x.shape + + mask = torch.zeros(x.shape).bool() + + for idx in range(sh[0]): + for i in range(self.cutout_rect_regions): + cutout_rect_x = int(random.uniform( + 0, sh[1] - self.cutout_rect_freq)) + cutout_rect_y = int(random.uniform( + 0, sh[2] - self.cutout_rect_time)) + + mask[idx, cutout_rect_x:cutout_rect_x + self.cutout_rect_freq, + cutout_rect_y:cutout_rect_y + self.cutout_rect_time] = 1 + + x = x.masked_fill(mask.to(device=x.device), 0) + + return x + + +class SpecAugment(nn.Module): + """Spec augment. 
refer to https://arxiv.org/abs/1904.08779 + """ + def __init__(self, cfg): + super(SpecAugment, self).__init__() + self.cutout_x_regions = cfg.get('cutout_x_regions', 0) + self.cutout_y_regions = cfg.get('cutout_y_regions', 0) + + self.cutout_x_width = cfg.get('cutout_x_width', 10) + self.cutout_y_width = cfg.get('cutout_y_width', 10) + + @torch.no_grad() + def forward(self, x): + sh = x.shape + + mask = torch.zeros(x.shape).bool() + for idx in range(sh[0]): + for _ in range(self.cutout_x_regions): + cutout_x_left = int(random.uniform(0, sh[1] - self.cutout_x_width)) + + mask[idx, cutout_x_left:cutout_x_left + self.cutout_x_width, :] = 1 + + for _ in range(self.cutout_y_regions): + cutout_y_left = int(random.uniform(0, sh[2] - self.cutout_y_width)) + + mask[idx, :, cutout_y_left:cutout_y_left + self.cutout_y_width] = 1 + + x = x.masked_fill(mask.to(device=x.device), 0) + + return x + + +class SpectrogramAugmentation(nn.Module): + """Spectrogram augmentation + """ + def __init__(self, **kwargs): + nn.Module.__init__(self) + self.spec_cutout_regions = SpecCutoutRegions(kwargs) + self.spec_augment = SpecAugment(kwargs) + + @torch.no_grad() + def forward(self, input_spec): + augmented_spec = self.spec_cutout_regions(input_spec) + augmented_spec = self.spec_augment(augmented_spec) + return augmented_spec + + +class AudioPreprocessing(nn.Module): + """GPU accelerated audio preprocessing + """ + def __init__(self, **kwargs): + nn.Module.__init__(self) # For PyTorch API + self.optim_level = kwargs.get('optimization_level', Optimization.nothing) + self.featurizer = FeatureFactory.from_config(kwargs) + + def forward(self, x): + input_signal, length = x + length.requires_grad_(False) + if self.optim_level not in [Optimization.nothing, Optimization.mxprO0, Optimization.mxprO3]: + with amp.disable_casts(): + processed_signal = self.featurizer(x) + processed_length = self.featurizer.get_seq_len(length) + else: + processed_signal = self.featurizer(x) + processed_length = self.featurizer.get_seq_len(length) + return processed_signal, processed_length + + diff --git a/rnn_speech_recognition/pytorch/requirements.txt b/rnn_speech_recognition/pytorch/requirements.txt new file mode 100755 index 000000000..cc675c8d1 --- /dev/null +++ b/rnn_speech_recognition/pytorch/requirements.txt @@ -0,0 +1,10 @@ +pandas==0.24.2 +tqdm==4.31.1 +ascii-graph==1.5.1 +wrapt==1.10.11 +librosa +toml +soundfile +ipdb +sox +tensorboard==2.0.0 diff --git a/rnn_speech_recognition/pytorch/rnn.py b/rnn_speech_recognition/pytorch/rnn.py new file mode 100644 index 000000000..c6234d61f --- /dev/null +++ b/rnn_speech_recognition/pytorch/rnn.py @@ -0,0 +1,402 @@ +# Copyright (c) 2019, Myrtle Software Limited. All rights reserved. +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
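Several modules in this file seed the LSTM forget gate by writing into the slice hidden_size:2*hidden_size of a fused bias vector. That relies on PyTorch's fused gate ordering (input, forget, cell, output) for torch.nn.LSTM parameters; a minimal sketch of the same initialisation on a plain LSTM, with made-up sizes:

import torch

hidden_size, forget_gate_bias = 4, 1.0
lstm = torch.nn.LSTM(input_size=8, hidden_size=hidden_size, num_layers=1)

# Fused bias layout is [i | f | g | o], each chunk of length hidden_size,
# so the forget-gate bias lives in bias[hidden_size:2*hidden_size].
for name, param in lstm.named_parameters():
    if "bias_ih" in name:
        param.data[hidden_size:2 * hidden_size].fill_(forget_gate_bias)
    elif "bias_hh" in name:
        param.data[hidden_size:2 * hidden_size].fill_(0.0)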
+ +import math + +from typing import List +from typing import Optional +from typing import Tuple + +import torch +from torch.nn import Parameter + + +def rnn(rnn, input_size, hidden_size, num_layers, norm=None, + forget_gate_bias=1.0, dropout=0.0, **kwargs): + """TODO""" + if rnn != "lstm": + raise ValueError(f"Unknown rnn={rnn}") + if norm not in [None, "batch_norm", "layer_norm"]: + raise ValueError(f"unknown norm={norm}") + + if rnn == "lstm": + if norm is None: + return LstmDrop( + input_size=input_size, + hidden_size=hidden_size, + num_layers=num_layers, + dropout=dropout, + forget_gate_bias=forget_gate_bias, + **kwargs + ) + + if norm == "batch_norm": + return BNRNNSum( + input_size=input_size, + hidden_size=hidden_size, + rnn_layers=num_layers, + batch_norm=True, + dropout=dropout, + forget_gate_bias=forget_gate_bias, + **kwargs + ) + + if norm == "layer_norm": + return torch.jit.script(lnlstm( + input_size=input_size, + hidden_size=hidden_size, + num_layers=num_layers, + dropout=dropout, + forget_gate_bias=forget_gate_bias, + **kwargs + )) + + +class OverLastDim(torch.nn.Module): + """Collapses a tensor to 2D, applies a module, and (re-)expands the tensor. + + An n-dimensional tensor of shape (s_1, s_2, ..., s_n) is first collapsed to + a tensor with shape (s_1*s_2*...*s_n-1, s_n). The module is called with + this as input producing (s_1*s_2*...*s_n-1, s_n') --- note that the final + dimension can change. This is expanded to (s_1, s_2, ..., s_n-1, s_n') and + returned. + + Args: + module (torch.nn.Module): Module to apply. Must accept a 2D tensor as + input and produce a 2D tensor as output, optionally changing the + size of the last dimension. + """ + + def __init__(self, module): + super().__init__() + self.module = module + + def forward(self, x): + *dims, input_size = x.size() + + reduced_dims = 1 + for dim in dims: + reduced_dims *= dim + + x = x.view(reduced_dims, -1) + x = self.module(x) + x = x.view(*dims, -1) + return x + + +class LstmDrop(torch.nn.Module): + + def __init__(self, input_size, hidden_size, num_layers, dropout, forget_gate_bias, + **kwargs): + """Returns an LSTM with forget gate bias init to `forget_gate_bias`. + + Args: + input_size: See `torch.nn.LSTM`. + hidden_size: See `torch.nn.LSTM`. + num_layers: See `torch.nn.LSTM`. + dropout: See `torch.nn.LSTM`. + forget_gate_bias: For each layer and each direction, the total value of + to initialise the forget gate bias to. + + Returns: + A `torch.nn.LSTM`. 
+ """ + super(LstmDrop, self).__init__() + + self.lstm = torch.nn.LSTM( + input_size=input_size, + hidden_size=hidden_size, + num_layers=num_layers, + dropout=dropout, + ) + if forget_gate_bias is not None: + for name, v in self.lstm.named_parameters(): + if "bias_ih" in name: + bias = getattr(self.lstm, name) + bias.data[hidden_size:2*hidden_size].fill_(forget_gate_bias) + if "bias_hh" in name: + bias = getattr(self.lstm, name) + bias.data[hidden_size:2*hidden_size].fill_(0) + + self.dropout = torch.nn.Dropout(dropout) if dropout else None + + def forward(self, x, h=None): + + x, h = self.lstm(x, h) + + if self.dropout: + x = self.dropout(x) + + return x, h + + + +class RNNLayer(torch.nn.Module): + """A single RNNLayer with optional batch norm.""" + def __init__(self, input_size, hidden_size, rnn_type=torch.nn.LSTM, + batch_norm=True, forget_gate_bias=1.0): + super().__init__() + + if batch_norm: + self.bn = OverLastDim(torch.nn.BatchNorm1d(input_size)) + + if isinstance(rnn_type, torch.nn.LSTM) and not batch_norm: + # batch_norm will apply bias, no need to add a second to LSTM + self.rnn = lstm(input_size=input_size, + hidden_size=hidden_size, + forget_gate_bias=forget_gate_bias) + else: + self.rnn = rnn_type(input_size=input_size, + hidden_size=hidden_size, + bias=not batch_norm) + + def forward(self, x, hx=None): + if hasattr(self, 'bn'): + x = x.contiguous() + x = self.bn(x) + x, h = self.rnn(x, hx=hx) + return x, h + + def _flatten_parameters(self): + self.rnn.flatten_parameters() + + +class BNRNNSum(torch.nn.Module): + """RNN wrapper with optional batch norm. + + Instantiates an RNN. If it is an LSTM it initialises the forget gate + bias =`lstm_gate_bias`. Optionally applies a batch normalisation layer to + the input with the statistics computed over all time steps. If dropout > 0 + then it is applied to all layer outputs except the last. + """ + def __init__(self, input_size, hidden_size, rnn_type=torch.nn.LSTM, + rnn_layers=1, batch_norm=True, dropout=0.0, + forget_gate_bias=1.0, norm_first_rnn=False, **kwargs): + super().__init__() + self.rnn_layers = rnn_layers + + self.layers = torch.nn.ModuleList() + for i in range(rnn_layers): + final_layer = (rnn_layers - 1) == i + + self.layers.append( + RNNLayer( + input_size, + hidden_size, + rnn_type=rnn_type, + batch_norm=batch_norm and (norm_first_rnn or i > 0), + forget_gate_bias=forget_gate_bias, + ) + ) + + if dropout > 0.0 and not final_layer: + self.layers.append(torch.nn.Dropout(dropout)) + + input_size = hidden_size + + def forward(self, x, hx=None): + hx = self._parse_hidden_state(hx) + + hs = [] + cs = [] + rnn_idx = 0 + for layer in self.layers: + if isinstance(layer, torch.nn.Dropout): + x = layer(x) + else: + x, h_out = layer(x, hx=hx[rnn_idx]) + hs.append(h_out[0]) + cs.append(h_out[1]) + rnn_idx += 1 + del h_out + + h_0 = torch.stack(hs, dim=0) + c_0 = torch.stack(cs, dim=0) + return x, (h_0, c_0) + + def _parse_hidden_state(self, hx): + """ + Dealing w. 
hidden state: + Typically in pytorch: (h_0, c_0) + h_0 = ``[num_layers * num_directions, batch, hidden_size]`` + c_0 = ``[num_layers * num_directions, batch, hidden_size]`` + """ + if hx is None: + return [None] * self.rnn_layers + else: + h_0, c_0 = hx + assert h_0.shape[0] == self.rnn_layers + return [(h_0[i], c_0[i]) for i in range(h_0.shape[0])] + + def _flatten_parameters(self): + for layer in self.layers: + if isinstance(layer, (torch.nn.LSTM, torch.nn.GRU, torch.nn.RNN)): + layer._flatten_parameters() + + +class StackTime(torch.nn.Module): + def __init__(self, factor): + super().__init__() + self.factor = int(factor) + + def forward(self, x): + # T, B, U + x, x_lens = x + seq = [x] + for i in range(1, self.factor): + tmp = torch.zeros_like(x) + tmp[:-i, :, :] = x[i:, :, :] + seq.append(tmp) + x_lens = torch.ceil(x_lens.float() / self.factor).int() + return torch.cat(seq, dim=2)[::self.factor, :, :], x_lens + + +def lnlstm(input_size, hidden_size, num_layers, dropout, forget_gate_bias, + **kwargs): + """Returns a ScriptModule that mimics a PyTorch native LSTM.""" + # The following are not implemented. + assert dropout == 0.0 + + return StackedLSTM( + num_layers, + LSTMLayer, + first_layer_args=[ + LayerNormLSTMCell, + input_size, + hidden_size, + forget_gate_bias, + ], + other_layer_args=[ + LayerNormLSTMCell, + hidden_size, + hidden_size, + forget_gate_bias, + ] + ) + + +class LSTMLayer(torch.nn.Module): + def __init__(self, cell, *cell_args): + super(LSTMLayer, self).__init__() + self.cell = cell(*cell_args) + + def forward( + self, + input: torch.Tensor, + state: Tuple[torch.Tensor, torch.Tensor] + ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + inputs = input.unbind(0) + outputs = [] + for i in range(len(inputs)): + out, state = self.cell(inputs[i], state) + outputs += [out] + return torch.stack(outputs), state + + +class LayerNormLSTMCell(torch.nn.Module): + def __init__(self, input_size, hidden_size, forget_gate_bias): + super().__init__() + self.input_size = input_size + self.hidden_size = hidden_size + self.weight_ih = Parameter(torch.randn(4 * hidden_size, input_size)) + self.weight_hh = Parameter(torch.randn(4 * hidden_size, hidden_size)) + + # layernorms provide learnable biases + self.layernorm_i = torch.nn.LayerNorm(4 * hidden_size) + self.layernorm_h = torch.nn.LayerNorm(4 * hidden_size) + self.layernorm_c = torch.nn.LayerNorm(hidden_size) + + self.reset_parameters() + + self.layernorm_i.bias.data[hidden_size:2*hidden_size].fill_(0.0) + self.layernorm_h.bias.data[hidden_size:2*hidden_size].fill_( + forget_gate_bias + ) + + def reset_parameters(self): + stdv = 1.0 / math.sqrt(self.hidden_size) + for weight in self.parameters(): + torch.nn.init.uniform_(weight, -stdv, stdv) + + def forward( + self, + input: torch.Tensor, + state: Tuple[torch.Tensor, torch.Tensor] + ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + hx, cx = state + igates = self.layernorm_i(torch.mm(input, self.weight_ih.t())) + hgates = self.layernorm_h(torch.mm(hx, self.weight_hh.t())) + gates = igates + hgates + ingate, forgetgate, cellgate, outgate = gates.chunk(4, 1) + + ingate = torch.sigmoid(ingate) + forgetgate = torch.sigmoid(forgetgate) + cellgate = torch.tanh(cellgate) + outgate = torch.sigmoid(outgate) + + cy = self.layernorm_c((forgetgate * cx) + (ingate * cellgate)) + hy = outgate * torch.tanh(cy) + + return hy, (hy, cy) + + +def init_stacked_lstm(num_layers, layer, first_layer_args, other_layer_args): + layers = [layer(*first_layer_args)] + 
[layer(*other_layer_args) + for _ in range(num_layers - 1)] + return torch.nn.ModuleList(layers) + + +class StackedLSTM(torch.nn.Module): + def __init__(self, num_layers, layer, first_layer_args, other_layer_args): + super(StackedLSTM, self).__init__() + self.layers: Final[torch.nn.ModuleList] = init_stacked_lstm( + num_layers, layer, first_layer_args, other_layer_args + ) + + def forward( + self, + input: torch.Tensor, + states: Optional[List[Tuple[torch.Tensor, torch.Tensor]]] + ) -> Tuple[torch.Tensor, List[Tuple[torch.Tensor, torch.Tensor]]]: + if states is None: + states: List[Tuple[torch.Tensor, torch.Tensor]] = [] + batch = input.size(1) + for layer in self.layers: + states.append( + (torch.zeros( + batch, + layer.cell.hidden_size, + dtype=input.dtype, + device=input.device + ), + torch.zeros( + batch, + layer.cell.hidden_size, + dtype=input.dtype, + device=input.device + ) + ) + ) + + output_states: List[Tuple[Tensor, Tensor]] = [] + output = input + # XXX: enumerate https://github.com/pytorch/pytorch/issues/14471 + i = 0 + for rnn_layer in self.layers: + state = states[i] + output, out_state = rnn_layer(output, state) + output_states += [out_state] + i += 1 + return output, output_states diff --git a/rnn_speech_recognition/pytorch/scripts/docker/build.sh b/rnn_speech_recognition/pytorch/scripts/docker/build.sh new file mode 100755 index 000000000..cfdc97c01 --- /dev/null +++ b/rnn_speech_recognition/pytorch/scripts/docker/build.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +docker build . --rm -t jasper \ No newline at end of file diff --git a/rnn_speech_recognition/pytorch/scripts/docker/launch.sh b/rnn_speech_recognition/pytorch/scripts/docker/launch.sh new file mode 100755 index 000000000..5c9c6a3f3 --- /dev/null +++ b/rnn_speech_recognition/pytorch/scripts/docker/launch.sh @@ -0,0 +1,32 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +#!/bin/bash + +DATA_DIR=$1 +CHECKPOINT_DIR=$2 +RESULT_DIR=$3 + +docker run -it --rm \ + --gpus='"device=1"' \ + --shm-size=4g \ + --ulimit memlock=-1 \ + --ulimit stack=67108864 \ + -v "$DATA_DIR":/datasets \ + -v "$CHECKPOINT_DIR":/checkpoints/ \ + -v "$RESULT_DIR":/results/ \ + -v $PWD:/code \ + -v $PWD:/workspace/jasper \ + mlperf-rnnt-ref bash diff --git a/rnn_speech_recognition/pytorch/scripts/download_librispeech.sh b/rnn_speech_recognition/pytorch/scripts/download_librispeech.sh new file mode 100755 index 000000000..ee322fe30 --- /dev/null +++ b/rnn_speech_recognition/pytorch/scripts/download_librispeech.sh @@ -0,0 +1,28 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +#!/usr/bin/env bash + +DATA_SET="LibriSpeech" +DATA_ROOT_DIR="/datasets" +DATA_DIR="${DATA_ROOT_DIR}/${DATA_SET}" +if [ ! -d "$DATA_DIR" ] +then + mkdir $DATA_DIR + chmod go+rx $DATA_DIR + python utils/download_librispeech.py utils/librispeech.csv $DATA_DIR -e ${DATA_ROOT_DIR}/ +else + echo "Directory $DATA_DIR already exists." +fi diff --git a/rnn_speech_recognition/pytorch/scripts/evaluation.sh b/rnn_speech_recognition/pytorch/scripts/evaluation.sh new file mode 100755 index 000000000..fcd472fd9 --- /dev/null +++ b/rnn_speech_recognition/pytorch/scripts/evaluation.sh @@ -0,0 +1,92 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +#!/bin/bash +echo "Container nvidia build = " $NVIDIA_BUILD_ID + +DATA_DIR=${1:-"/datasets/LibriSpeech"} +DATASET=${2:-"dev-clean"} +MODEL_CONFIG=${3:-"configs/jasper10x5dr_sp_offline_specaugment.toml"} +RESULT_DIR=${4:-"/results"} +CHECKPOINT=$5 +CREATE_LOGFILE=${6:-"true"} +CUDNN_BENCHMARK=${7:-"false"} +NUM_GPUS=${8:-1} +PRECISION=${9:-"fp32"} +NUM_STEPS=${10:-"-1"} +SEED=${11:-0} +BATCH_SIZE=${12:-64} + + +if [ "$CREATE_LOGFILE" = "true" ] ; then + export GBS=$(expr $BATCH_SIZE \* $NUM_GPUS) + printf -v TAG "jasper_evaluation_${DATASET}_%s_gbs%d" "$PRECISION" $GBS + DATESTAMP=`date +'%y%m%d%H%M%S'` + LOGFILE="${RESULT_DIR}/${TAG}.${DATESTAMP}.log" + printf "Logs written to %s\n" "$LOGFILE" +fi + + + +PREC="" +if [ "$PRECISION" = "fp16" ] ; then + PREC="--fp16" +elif [ "$PRECISION" = "fp32" ] ; then + PREC="" +else + echo "Unknown argument" + exit -2 +fi + +STEPS="" +if [ "$NUM_STEPS" -gt 0 ] ; then + STEPS=" --steps $NUM_STEPS" +fi + +if [ "$CUDNN_BENCHMARK" = "true" ] ; then + CUDNN_BENCHMARK=" --cudnn_benchmark" +else + CUDNN_BENCHMARK="" +fi + + +CMD=" inference.py " +CMD+=" --batch_size $BATCH_SIZE " +CMD+=" --dataset_dir $DATA_DIR " +CMD+=" --val_manifest $DATA_DIR/librispeech-${DATASET}-wav.json " +CMD+=" --model_toml $MODEL_CONFIG " +CMD+=" --seed $SEED " +CMD+=" --ckpt $CHECKPOINT " +CMD+=" $CUDNN_BENCHMARK" +CMD+=" $PREC " +CMD+=" $STEPS " + + +if [ "$NUM_GPUS" -gt 1 ] ; then + CMD="python3 -m torch.distributed.launch --nproc_per_node=$NUM_GPUS $CMD" +else + CMD="python3 $CMD" +fi + + +set -x +if [ -z "$LOGFILE" ] ; then + $CMD +else + ( + $CMD + ) |& tee "$LOGFILE" +fi +set +x diff --git a/rnn_speech_recognition/pytorch/scripts/inference.sh b/rnn_speech_recognition/pytorch/scripts/inference.sh new file mode 100755 index 000000000..2d4474ce2 --- /dev/null +++ b/rnn_speech_recognition/pytorch/scripts/inference.sh @@ -0,0 +1,104 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. 
All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +#!/bin/bash +echo "Container nvidia build = " $NVIDIA_BUILD_ID + + +DATA_DIR=${1-"/datasets/LibriSpeech"} +DATASET=${2:-"dev-clean"} +MODEL_CONFIG=${3:-"configs/jasper10x5dr_sp_offline_specaugment.toml"} +RESULT_DIR=${4:-"/results"} +CHECKPOINT=$5 +CREATE_LOGFILE=${6:-"true"} +CUDNN_BENCHMARK=${7:-"false"} +PRECISION=${8:-"fp32"} +NUM_STEPS=${9:-"-1"} +SEED=${10:-0} +BATCH_SIZE=${11:-64} +MODELOUTPUT_FILE=${12:-"none"} +PREDICTION_FILE=${13:-"$RESULT_DIR/${DATASET}.predictions"} + +if [ "$CREATE_LOGFILE" = "true" ] ; then + export GBS=$(expr $BATCH_SIZE) + printf -v TAG "jasper_inference_${DATASET}_%s_gbs%d" "$PRECISION" $GBS + DATESTAMP=`date +'%y%m%d%H%M%S'` + LOGFILE="${RESULT_DIR}/${TAG}.${DATESTAMP}.log" + printf "Logs written to %s\n" "$LOGFILE" +fi + + + +PREC="" +if [ "$PRECISION" = "fp16" ] ; then + PREC="--fp16" +elif [ "$PRECISION" = "fp32" ] ; then + PREC="" +else + echo "Unknown argument" + exit -2 +fi + +PRED="" +if [ "$PREDICTION_FILE" = "none" ] ; then + PRED="" +else + PRED=" --save_prediction $PREDICTION_FILE" +fi + +OUTPUT="" +if [ "$MODELOUTPUT_FILE" = "none" ] ; then + OUTPUT=" " +else + OUTPUT=" --logits_save_to $MODELOUTPUT_FILE" +fi + + +if [ "$CUDNN_BENCHMARK" = "true" ]; then + CUDNN_BENCHMARK=" --cudnn_benchmark" +else + CUDNN_BENCHMARK="" +fi + +STEPS="" +if [ "$NUM_STEPS" -gt 0 ] ; then + STEPS=" --steps $NUM_STEPS" +fi + +CMD=" python inference.py " +CMD+=" --batch_size $BATCH_SIZE " +CMD+=" --dataset_dir $DATA_DIR " +CMD+=" --val_manifest $DATA_DIR/librispeech-${DATASET}-wav.json " +CMD+=" --model_toml $MODEL_CONFIG " +CMD+=" --seed $SEED " +CMD+=" --ckpt $CHECKPOINT " +CMD+=" $CUDNN_BENCHMARK" +CMD+=" $PRED " +CMD+=" $OUTPUT " +CMD+=" $PREC " +CMD+=" $STEPS " + + +set -x +if [ -z "$LOGFILE" ] ; then + $CMD +else + ( + $CMD + ) |& tee "$LOGFILE" +fi +set +x +echo "MODELOUTPUT_FILE: ${MODELOUTPUT_FILE}" +echo "PREDICTION_FILE: ${PREDICTION_FILE}" diff --git a/rnn_speech_recognition/pytorch/scripts/inference_benchmark.sh b/rnn_speech_recognition/pytorch/scripts/inference_benchmark.sh new file mode 100755 index 000000000..7aeea84c1 --- /dev/null +++ b/rnn_speech_recognition/pytorch/scripts/inference_benchmark.sh @@ -0,0 +1,84 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +#!/bin/bash + +echo "Container nvidia build = " $NVIDIA_BUILD_ID + + +DATA_DIR=${1:-"/datasets/LibriSpeech"} +DATASET=${2:-"dev-clean"} +MODEL_CONFIG=${3:-"configs/jasper10x5dr_sp_offline_specaugment.toml"} +RESULT_DIR=${4:-"/results"} +CHECKPOINT=$5 +CREATE_LOGFILE=${6:-"true"} +CUDNN_BENCHMARK=${7:-"true"} +PRECISION=${8:-"fp32"} +NUM_STEPS=${9:-"-1"} +MAX_DURATION=${10:-"36"} +SEED=${11:-0} +BATCH_SIZE=${12:-64} + +PREC="" +if [ "$PRECISION" = "fp16" ] ; then + PREC="--fp16" +elif [ "$PRECISION" = "fp32" ] ; then + PREC="" +else + echo "Unknown argument" + exit -2 +fi +STEPS="" +if [ "$NUM_STEPS" -gt 0 ] ; then + STEPS=" --steps $NUM_STEPS" +fi +if [ "$CUDNN_BENCHMARK" = "true" ] ; then + CUDNN_BENCHMARK=" --cudnn_benchmark" +else + CUDNN_BENCHMARK="" +fi + +CMD=" python inference_benchmark.py" +CMD+=" --batch_size=$BATCH_SIZE" +CMD+=" --model_toml=$MODEL_CONFIG" +CMD+=" --seed=$SEED" +CMD+=" --dataset_dir=$DATA_DIR" +CMD+=" --val_manifest $DATA_DIR/librispeech-${DATASET}-wav.json " +CMD+=" --ckpt=$CHECKPOINT" +CMD+=" --max_duration=$MAX_DURATION" +CMD+=" --pad_to=-1" +CMD+=" $CUDNN_BENCHMARK" +CMD+=" $PREC" +CMD+=" $STEPS" + + +if [ "$CREATE_LOGFILE" = "true" ] ; then + export GBS=$(expr $BATCH_SIZE ) + printf -v TAG "jasper_inference_benchmark_%s_gbs%d" "$PRECISION" $GBS + DATESTAMP=`date +'%y%m%d%H%M%S'` + LOGFILE="${RESULT_DIR}/${TAG}.${DATESTAMP}.log" + printf "Logs written to %s\n" "$LOGFILE" +fi + +set -x +if [ -z "$LOGFILE" ] ; then + $CMD +else + ( + $CMD + ) |& tee "$LOGFILE" + grep 'latency' "$LOGFILE" +fi +set +x diff --git a/rnn_speech_recognition/pytorch/scripts/preprocess_librispeech.sh b/rnn_speech_recognition/pytorch/scripts/preprocess_librispeech.sh new file mode 100755 index 000000000..7cfe5cc6a --- /dev/null +++ b/rnn_speech_recognition/pytorch/scripts/preprocess_librispeech.sh @@ -0,0 +1,51 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
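The preprocessing script below drives utils/convert_librispeech.py over every LibriSpeech split: the three training sets are converted to WAV with additional 0.9x and 1.1x speed-perturbed copies, the dev and test sets are converted as-is, and each split gets a librispeech-<split>-wav.json manifest in the layout that the training, evaluation and inference scripts expect under DATA_DIR. Assuming /datasets is writable (for example, mounted into the benchmark container), the usual sequence is simply:

  bash scripts/download_librispeech.sh      # fetch and extract the archives listed in utils/librispeech.csv
  bash scripts/preprocess_librispeech.sh    # convert to WAV and write the JSON manifests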
+ +#!/usr/bin/env bash + +python ./utils/convert_librispeech.py \ + --input_dir /datasets/LibriSpeech/train-clean-100 \ + --dest_dir /datasets/LibriSpeech/train-clean-100-wav \ + --output_json /datasets/LibriSpeech/librispeech-train-clean-100-wav.json \ + --speed 0.9 1.1 +python ./utils/convert_librispeech.py \ + --input_dir /datasets/LibriSpeech/train-clean-360 \ + --dest_dir /datasets/LibriSpeech/train-clean-360-wav \ + --output_json /datasets/LibriSpeech/librispeech-train-clean-360-wav.json \ + --speed 0.9 1.1 +python ./utils/convert_librispeech.py \ + --input_dir /datasets/LibriSpeech/train-other-500 \ + --dest_dir /datasets/LibriSpeech/train-other-500-wav \ + --output_json /datasets/LibriSpeech/librispeech-train-other-500-wav.json \ + --speed 0.9 1.1 + + +python ./utils/convert_librispeech.py \ + --input_dir /datasets/LibriSpeech/dev-clean \ + --dest_dir /datasets/LibriSpeech/dev-clean-wav \ + --output_json /datasets/LibriSpeech/librispeech-dev-clean-wav.json +python ./utils/convert_librispeech.py \ + --input_dir /datasets/LibriSpeech/dev-other \ + --dest_dir /datasets/LibriSpeech/dev-other-wav \ + --output_json /datasets/LibriSpeech/librispeech-dev-other-wav.json + + +python ./utils/convert_librispeech.py \ + --input_dir /datasets/LibriSpeech/test-clean \ + --dest_dir /datasets/LibriSpeech/test-clean-wav \ + --output_json /datasets/LibriSpeech/librispeech-test-clean-wav.json +python ./utils/convert_librispeech.py \ + --input_dir /datasets/LibriSpeech/test-other \ + --dest_dir /datasets/LibriSpeech/test-other-wav \ + --output_json /datasets/LibriSpeech/librispeech-test-other-wav.json diff --git a/rnn_speech_recognition/pytorch/scripts/train.sh b/rnn_speech_recognition/pytorch/scripts/train.sh new file mode 100755 index 000000000..d59ce8ebe --- /dev/null +++ b/rnn_speech_recognition/pytorch/scripts/train.sh @@ -0,0 +1,113 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2019, Myrtle Software Limited. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
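The training script that follows is a thin wrapper around train.py: it maps its positional arguments onto train.py flags, enables cuDNN benchmarking only together with fp16, and switches to the bundled multiproc launcher when more than one GPU is requested. A sketch of a full run on an assumed 8-GPU node, starting from scratch (CHECKPOINT=none) and leaving seed, batch sizes, learning-rate schedule and gradient accumulation at their defaults:

  bash scripts/train.sh /datasets/LibriSpeech configs/rnnt.toml /results \
      none true true 8 fp16 100    # no checkpoint, write log, cudnn benchmark, 8 GPUs, fp16, 100 epochs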
+ + +#!/bin/bash +echo "Container nvidia build = " $NVIDIA_BUILD_ID + +DATA_DIR=${1:-"/datasets/LibriSpeech"} +MODEL_CONFIG=${2:-"configs/rnnt.toml"} +RESULT_DIR=${3:-"/results"} +CHECKPOINT=${4:-"none"} +CREATE_LOGFILE=${5:-"true"} +CUDNN_BENCHMARK=${6:-"true"} +NUM_GPUS=${7:-8} +PRECISION=${8:-"fp16"} +EPOCHS=${9:-100} +SEED=${10:-6} +BATCH_SIZE=${11:-8} +EVAL_BATCH_SIZE=${12:-2} +LEARNING_RATE=${13:-"0.001"} +LEARNING_RATE_WARMUP=${14:-"8000"} +GRADIENT_ACCUMULATION_STEPS=${15:-1} +LAUNCH_OPT=${LAUNCH_OPT:-"none"} + + +PREC="" +if [ "$PRECISION" = "fp16" ] ; then + PREC="--fp16" +elif [ "$PRECISION" = "fp32" ] ; then + PREC="" +else + echo "Unknown argument" + exit -2 +fi + +CUDNN="" +if [ "$CUDNN_BENCHMARK" = "true" ] && [ "$PRECISION" = "fp16" ]; then + CUDNN=" --cudnn" +else + CUDNN="" +fi + + + +if [ "$CHECKPOINT" = "none" ] ; then + CHECKPOINT="" +else + CHECKPOINT=" --ckpt=${CHECKPOINT}" +fi + + +CMD=" train.py" +CMD+=" --batch_size=$BATCH_SIZE" +CMD+=" --eval_batch_size=$EVAL_BATCH_SIZE" +CMD+=" --num_epochs=$EPOCHS" +CMD+=" --output_dir=$RESULT_DIR" +CMD+=" --model_toml=$MODEL_CONFIG" +CMD+=" --lr=$LEARNING_RATE" +CMD+=" --lr_warmup=$LEARNING_RATE_WARMUP" +CMD+=" --seed=$SEED" +CMD+=" --optimizer=adam" +CMD+=" --dataset_dir=$DATA_DIR" +CMD+=" --val_manifest=$DATA_DIR/librispeech-dev-clean-wav.json" +CMD+=" --train_manifest=$DATA_DIR/librispeech-train-clean-100-wav.json,$DATA_DIR/librispeech-train-clean-360-wav.json,$DATA_DIR/librispeech-train-other-500-wav.json" +CMD+=" --weight_decay=1e-3" +CMD+=" --save_freq=100" +CMD+=" --eval_freq=1" +CMD+=" --train_freq=250" +CMD+=" --lr_decay" +CMD+=" --gradient_accumulation_steps=$GRADIENT_ACCUMULATION_STEPS " +CMD+=" $CHECKPOINT" +CMD+=" $PREC" +CMD+=" $CUDNN" + + +if [ "${LAUNCH_OPT}" != "none" ]; then + CMD="python -m $LAUNCH_OPT $CMD" +elif [ "$NUM_GPUS" -gt 1 ] ; then + CMD="python3 -m multiproc --nproc_per_node=$NUM_GPUS $CMD" +else + CMD="python3 $CMD" +fi + + +if [ "$CREATE_LOGFILE" = "true" ] ; then + export GBS=$(expr $BATCH_SIZE \* $NUM_GPUS) + printf -v TAG "rnnt_train_%s_gbs%d" "$PRECISION" $GBS + DATESTAMP=`date +'%y%m%d%H%M%S'` + LOGFILE=$RESULT_DIR/$TAG.$DATESTAMP.log + printf "Logs written to %s\n" "$LOGFILE" +fi + +set -x +if [ -z "$LOGFILE" ] ; then + $CMD +else + ( + $CMD + ) |& tee $LOGFILE +fi +set +x diff --git a/rnn_speech_recognition/pytorch/scripts/train_benchmark.sh b/rnn_speech_recognition/pytorch/scripts/train_benchmark.sh new file mode 100755 index 000000000..7b5a33705 --- /dev/null +++ b/rnn_speech_recognition/pytorch/scripts/train_benchmark.sh @@ -0,0 +1,130 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
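The benchmark variant below runs a short training job (bounded by NUM_STEPS and MAX_DURATION, with --pad_to_max for stable input shapes) and then post-processes its own log: it averages the printed step times into a mean latency, derives throughput as BATCH_SIZE * NUM_GPUS / latency, and pulls out the last reported training/eval WER and loss. As with the other scripts, the RNN-T config should be passed explicitly since the default MODEL_CONFIG still names a Jasper configuration. For example, a 500-step fp16 run on an assumed 8-GPU node:

  bash scripts/train_benchmark.sh /datasets/LibriSpeech configs/rnnt.toml /results \
      true true 8 fp16 500    # write log, cudnn benchmark, 8 GPUs, fp16, stop after 500 steps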
+ +#!/bin/bash + +echo "Container nvidia build = " $NVIDIA_BUILD_ID + +DATA_DIR=${1:-"/datasets/LibriSpeech"} +MODEL_CONFIG=${2:-"configs/jasper10x5dr_sp_offline_specaugment.toml"} +RESULT_DIR=${3:-"/results"} +CREATE_LOGFILE=${4:-"true"} +CUDNN_BENCHMARK=${5:-"true"} +NUM_GPUS=${6:-8} +PRECISION=${7:-"fp16"} +NUM_STEPS=${8:-"-1"} +MAX_DURATION=${9:-16.7} +SEED=${10:-0} +BATCH_SIZE=${11:-64} +LEARNING_RATE=${12:-"0.015"} +GRADIENT_ACCUMULATION_STEPS=${13:-1} +PRINT_FREQUENCY=${14:-1} + + +PREC="" +if [ "$PRECISION" = "fp16" ] ; then + PREC=" --fp16" +elif [ "$PRECISION" = "fp32" ] ; then + PREC="" +else + echo "Unknown argument" + exit -2 +fi + +STEPS="" +if [ "$NUM_STEPS" -ne "-1" ] ; then + STEPS=" --num_steps=$NUM_STEPS" +elif [ "$NUM_STEPS" = "-1" ] ; then + STEPS="" +else + echo "Unknown argument" + exit -2 +fi + +CUDNN="" +if [ "$CUDNN_BENCHMARK" = "true" ] ; then + CUDNN=" --cudnn" +else + CUDNN="" +fi + + +CMD=" train.py" +CMD+=" --batch_size=$BATCH_SIZE" +CMD+=" --num_epochs=400" +CMD+=" --output_dir=$RESULT_DIR" +CMD+=" --model_toml=$MODEL_CONFIG" +CMD+=" --lr=$LEARNING_RATE" +CMD+=" --seed=$SEED" +CMD+=" --optimizer=novograd" +CMD+=" --gradient_accumulation_steps=$GRADIENT_ACCUMULATION_STEPS" +CMD+=" --dataset_dir=$DATA_DIR" +CMD+=" --val_manifest=$DATA_DIR/librispeech-dev-clean-wav.json" +CMD+=" --train_manifest=$DATA_DIR/librispeech-train-clean-100-wav.json,$DATA_DIR/librispeech-train-clean-360-wav.json,$DATA_DIR/librispeech-train-other-500-wav.json" +CMD+=" --weight_decay=1e-3" +CMD+=" --save_freq=100000" +CMD+=" --eval_freq=100000" +CMD+=" --max_duration=$MAX_DURATION" +CMD+=" --pad_to_max" +CMD+=" --train_freq=$PRINT_FREQUENCY" +CMD+=" --lr_decay" +CMD+=" $CUDNN" +CMD+=" $PREC" +CMD+=" $STEPS" + +if [ "$NUM_GPUS" -gt 1 ] ; then + CMD="python3 -m torch.distributed.launch --nproc_per_node=$NUM_GPUS $CMD" +else + CMD="python3 $CMD" +fi + + +if [ "$CREATE_LOGFILE" = "true" ] ; then + export GBS=$(expr $BATCH_SIZE \* $NUM_GPUS) + printf -v TAG "jasper_train_benchmark_%s_gbs%d" "$PRECISION" $GBS + DATESTAMP=`date +'%y%m%d%H%M%S'` + LOGFILE="${RESULT_DIR}/${TAG}.${DATESTAMP}.log" + printf "Logs written to %s\n" "$LOGFILE" + +fi + +if [ -z "$LOGFILE" ] ; then + + set -x + $CMD + set +x +else + + set -x + ( + $CMD + ) |& tee "$LOGFILE" + + set +x + + mean_latency=`cat "$LOGFILE" | grep 'Step time' | awk '{print $3}' | tail -n +2 | egrep -o '[0-9.]+'| awk 'BEGIN {total=0} {total+=$1} END {printf("%.2f\n",total/NR)}'` + mean_throughput=`python -c "print($BATCH_SIZE*$NUM_GPUS/${mean_latency})"` + training_wer_per_pgu=`cat "$LOGFILE" | grep 'training_batch_WER'| awk '{print $2}' | tail -n 1 | egrep -o '[0-9.]+'` + training_loss_per_pgu=`cat "$LOGFILE" | grep 'Loss@Step'| awk '{print $4}' | tail -n 1 | egrep -o '[0-9.]+'` + final_eval_wer=`cat "$LOGFILE" | grep 'Evaluation WER'| tail -n 1 | egrep -o '[0-9.]+'` + final_eval_loss=`cat "$LOGFILE" | grep 'Evaluation Loss'| tail -n 1 | egrep -o '[0-9.]+'` + + echo "max duration: $MAX_DURATION s" | tee -a "$LOGFILE" + echo "mean_latency: $mean_latency s" | tee -a "$LOGFILE" + echo "mean_throughput: $mean_throughput sequences/s" | tee -a "$LOGFILE" + echo "training_wer_per_pgu: $training_wer_per_pgu" | tee -a "$LOGFILE" + echo "training_loss_per_pgu: $training_loss_per_pgu" | tee -a "$LOGFILE" + echo "final_eval_loss: $final_eval_loss" | tee -a "$LOGFILE" + echo "final_eval_wer: $final_eval_wer" | tee -a "$LOGFILE" +fi diff --git a/rnn_speech_recognition/pytorch/tb_logger.py b/rnn_speech_recognition/pytorch/tb_logger.py new file mode 100644 
index 000000000..cbc2f215e --- /dev/null +++ b/rnn_speech_recognition/pytorch/tb_logger.py @@ -0,0 +1,52 @@ +import torch.utils.tensorboard as tb + +class DummyLogger: + def log_scalar(*args, **kwargs): + pass + + def log_params(*args, **kwargs): + pass + + def log_grad(*args, **kwargs): + pass + + def train_end(*args, **kwargs): + pass + + +class TensorBoardLogger(DummyLogger): + def __init__(self, path, model, histogram=False): + self.writer = tb.SummaryWriter(log_dir=str(path)) + self.model = model + self.histogram = histogram + + def log_scalar(self, name, value, step, stage='train'): + self.writer.add_scalar( + f'{stage}/{name}', + value, + global_step=step + ) + + def log_grad(self, step): + if not self.histogram: + return + for name, param in self.model.named_parameters(): + if param.grad is not None: + self.writer.add_histogram( + name.replace('.', '/'), + param.grad, + global_step=step + ) + + def log_params(self, step): + if not self.histogram: + return + for name, param in self.model.named_parameters(): + self.writer.add_histogram( + name.replace('.', '/'), + param, + global_step=step + ) + + def train_end(self): + self.writer.close() diff --git a/rnn_speech_recognition/pytorch/train.py b/rnn_speech_recognition/pytorch/train.py new file mode 100644 index 000000000..3261c4ec9 --- /dev/null +++ b/rnn_speech_recognition/pytorch/train.py @@ -0,0 +1,477 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2019, Myrtle Software Limited. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
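The TensorBoard logger above is only instantiated by train.py (which follows) when --tb_path is given and the process reports local rank 0; histogram logging is additionally gated behind --histogram. A single-GPU sketch that goes through the PyTorch distributed launcher so that --local_rank gets populated (the output and TensorBoard paths are placeholders; a CUDA device and the NCCL backend are assumed):

  python3 -m torch.distributed.launch --nproc_per_node=1 train.py \
      --model_toml=configs/rnnt.toml \
      --dataset_dir=/datasets/LibriSpeech \
      --train_manifest=/datasets/LibriSpeech/librispeech-train-clean-100-wav.json \
      --val_manifest=/datasets/LibriSpeech/librispeech-dev-clean-wav.json \
      --output_dir=/results \
      --tb_path=/results/tensorboard --histogram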
+ +import argparse +import itertools +import os +import time +import toml +import torch +import apex +from apex import amp +import random +import numpy as np +import math +from dataset import AudioToTextDataLayer +from helpers import monitor_asr_train_progress, process_evaluation_batch, process_evaluation_epoch, Optimization, add_blank_label, AmpOptimizations, model_multi_gpu, print_dict, print_once +from model_rnnt import RNNT +from decoders import RNNTGreedyDecoder +from loss import RNNTLoss +from optimizers import Novograd, AdamW + +import torchvision + +from tb_logger import DummyLogger, TensorBoardLogger +import preprocessing + + +def lr_decay(N, step, learning_rate): + """ + learning rate decay + Args: + learning_rate: base learning rate + step: current iteration number + N: total number of iterations over which learning rate is decayed + """ + min_lr = 0.00001 + res = learning_rate * ((N - step) / N) ** 2 + return max(res, min_lr) + +def lr_warmup(warmup_steps, step, learning_rate): + return min(1, (step / warmup_steps)) * learning_rate + +def save(model, optimizer, epoch, output_dir): + """ + Saves model checkpoint + Args: + model: model + optimizer: optimizer + epoch: epoch of model training + output_dir: path to save model checkpoint + """ + class_name = model.__class__.__name__ + unix_time = time.time() + file_name = "{0}_{1}-epoch-{2}.pt".format(class_name, unix_time, epoch) + print_once("Saving module {0} in {1}".format(class_name, os.path.join(output_dir, file_name))) + if (not torch.distributed.is_initialized() or (torch.distributed.is_initialized() and torch.distributed.get_rank() == 0)): + model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self + save_checkpoint={ + 'epoch': epoch, + 'state_dict': model_to_save.state_dict(), + 'optimizer': optimizer.state_dict() + } + + torch.save(save_checkpoint, os.path.join(output_dir, file_name)) + print_once('Saved.') + + +def evaluator(model, data_transforms, loss_fn, greedy_decoder, labels, eval_datasets, logger): + """Evaluates model on evaluation dataset + """ + + def evalutaion(epoch=0): + model.eval() + + for dataset, frequency, name in eval_datasets: + if epoch % frequency != 0: + continue + + print_once(f"Doing {name} ....................... ...... ... .. . 
.") + + with torch.no_grad(): + _global_var_dict = { + 'EvalLoss': [], + 'predictions': [], + 'transcripts': [], + } + dataloader = dataset.data_iterator + for data in dataloader: + + t_audio_signal_e, t_a_sig_length_e, t_transcript_e, t_transcript_len_e = data_transforms(data) + + t_log_probs_e, (x_len, y_len) = model( + ((t_audio_signal_e, t_transcript_e), (t_a_sig_length_e, t_transcript_len_e)), + ) + t_loss_e = loss_fn( + (t_log_probs_e, x_len), (t_transcript_e, y_len) + ) + del t_log_probs_e + + t_predictions_e = greedy_decoder.decode(t_audio_signal_e, t_a_sig_length_e) + + values_dict = dict( + loss=[t_loss_e], + predictions=[t_predictions_e], + transcript=[t_transcript_e], + transcript_length=[t_transcript_len_e] + ) + process_evaluation_batch(values_dict, _global_var_dict, labels=labels) + + # final aggregation across all workers and minibatches) and logging of results + wer, eloss = process_evaluation_epoch(_global_var_dict) + logger.log_scalar('loss', eloss, epoch, name) + logger.log_scalar('wer', wer, epoch, name) + + print_once(f"==========>>>>>>{name} Loss: {eloss}\n") + print_once(f"==========>>>>>>{name} WER: {wer}\n") + + return evalutaion + + +def train( + data_layer, + model, + loss_fn, + greedy_decoder, + optimizer, + optim_level, + labels, + multi_gpu, + data_transforms, + args, + evalutaion, + logger, + fn_lr_policy): + """Trains model + Args: + data_layer: training data layer + model: model ( encapsulates data processing, encoder, decoder) + loss_fn: loss function + greedy_decoder: greedy ctc decoder + optimizer: optimizer + optim_level: AMP optimization level + labels: list of output labels + multi_gpu: true if multi gpu training + args: script input argument list + fn_lr_policy: function returning lr in given step + """ + print_once("Starting .....") + start_time = time.time() + + train_dataloader = data_layer.data_iterator + epoch = args.start_epoch + step = epoch * args.step_per_epoch + + while True: + if multi_gpu: + data_layer.sampler.set_epoch(epoch) + print_once("Starting epoch {0}, step {1}".format(epoch, step)) + last_epoch_start = time.time() + batch_counter = 0 + average_loss = 0 + for data in train_dataloader: + + if batch_counter == 0: + + adjusted_lr = fn_lr_policy(step) + for param_group in optimizer.param_groups: + param_group['lr'] = adjusted_lr + optimizer.zero_grad() + last_iter_start = time.time() + + t_audio_signal_t, t_a_sig_length_t, t_transcript_t, t_transcript_len_t = data_transforms(data) + model.train() + + t_log_probs_t, (x_len, y_len) = model( + ((t_audio_signal_t, t_transcript_t), (t_a_sig_length_t, t_transcript_len_t)), + ) + + t_loss_t = loss_fn( + (t_log_probs_t, x_len), (t_transcript_t, y_len) + ) + logger.log_scalar('loss', t_loss_t.item(), step) + del t_log_probs_t + if args.gradient_accumulation_steps > 1: + t_loss_t = t_loss_t / args.gradient_accumulation_steps + + if optim_level in AmpOptimizations: + with amp.scale_loss(t_loss_t, optimizer) as scaled_loss: + scaled_loss.backward() + else: + t_loss_t.backward() + batch_counter += 1 + average_loss += t_loss_t.item() + + if batch_counter % args.gradient_accumulation_steps == 0: + optimizer.step() + + if (step + 1) % args.train_frequency == 0: + t_predictions_t = greedy_decoder.decode(t_audio_signal_t, t_a_sig_length_t) + + e_tensors = [t_predictions_t, t_transcript_t, t_transcript_len_t] + train_wer = monitor_asr_train_progress(e_tensors, labels=labels) + print_once("Loss@Step: {0} ::::::: {1}".format(step, str(average_loss))) + print_once("Step time: {0} 
seconds".format(time.time() - last_iter_start)) + logger.log_scalar('wer', train_wer, step) + + step += 1 + batch_counter = 0 + average_loss = 0 + if args.num_steps is not None and step >= args.num_steps: + break + + evalutaion(epoch) + + if args.num_steps is not None and step >= args.num_steps: + break + print_once("Finished epoch {0} in {1}".format(epoch, time.time() - last_epoch_start)) + epoch += 1 + if epoch % args.save_frequency == 0 and epoch > 0: + save(model, optimizer, epoch, output_dir=args.output_dir) + if args.num_steps is None and epoch >= args.num_epochs: + break + print_once("Done in {0}".format(time.time() - start_time)) + print_once("Final Evaluation ....................... ...... ... .. . .") + evalutaion() + save(model, optimizer, epoch, output_dir=args.output_dir) + +def main(args): + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + assert(torch.cuda.is_available()) + torch.backends.cudnn.benchmark = args.cudnn + + args.local_rank = os.environ.get('LOCAL_RANK', args.local_rank) + # set up distributed training + if args.local_rank is not None: + args.local_rank = int(args.local_rank) + torch.cuda.set_device(args.local_rank) + torch.distributed.init_process_group(backend='nccl', init_method='env://') + + multi_gpu = torch.distributed.is_initialized() + if multi_gpu: + print_once("DISTRIBUTED TRAINING with {} gpus".format(torch.distributed.get_world_size())) + + # define amp optimiation level + if args.fp16: + optim_level = Optimization.mxprO1 + else: + optim_level = Optimization.mxprO0 + + model_definition = toml.load(args.model_toml) + dataset_vocab = model_definition['labels']['labels'] + ctc_vocab = add_blank_label(dataset_vocab) + + train_manifest = args.train_manifest + val_manifest = args.val_manifest + tst_manifest = args.tst_manifest + featurizer_config = model_definition['input'] + featurizer_config_eval = model_definition['input_eval'] + featurizer_config["optimization_level"] = optim_level + featurizer_config_eval["optimization_level"] = optim_level + + sampler_type = featurizer_config.get("sampler", 'default') + perturb_config = model_definition.get('perturb', None) + if args.pad_to_max: + assert(args.max_duration > 0) + featurizer_config['max_duration'] = args.max_duration + featurizer_config_eval['max_duration'] = args.max_duration + featurizer_config['pad_to'] = "max" + featurizer_config_eval['pad_to'] = "max" + print_once('model_config') + print_dict(model_definition) + + if args.gradient_accumulation_steps < 1: + raise ValueError('Invalid gradient accumulation steps parameter {}'.format(args.gradient_accumulation_steps)) + if args.batch_size % args.gradient_accumulation_steps != 0: + raise ValueError('gradient accumulation step {} is not divisible by batch size {}'.format(args.gradient_accumulation_steps, args.batch_size)) + + + preprocessor = preprocessing.AudioPreprocessing(**featurizer_config) + preprocessor.cuda() + + augmentations = preprocessing.SpectrogramAugmentation(**featurizer_config) + augmentations.cuda() + + train_transforms = torchvision.transforms.Compose([ + lambda xs: [x.cuda() for x in xs], + lambda xs: [*preprocessor(xs[0:2]), *xs[2:]], + lambda xs: [augmentations(xs[0]), *xs[1:]], + lambda xs: [xs[0].permute(2, 0, 1), *xs[1:]], + ]) + + eval_transforms = torchvision.transforms.Compose([ + lambda xs: [x.cuda() for x in xs], + lambda xs: [*preprocessor(xs[0:2]), *xs[2:]], + lambda xs: [xs[0].permute(2, 0, 1), *xs[1:]], + ]) + + data_layer = AudioToTextDataLayer( + dataset_dir=args.dataset_dir, + 
featurizer_config=featurizer_config, + perturb_config=perturb_config, + manifest_filepath=train_manifest, + labels=dataset_vocab, + batch_size=args.batch_size // args.gradient_accumulation_steps, + multi_gpu=multi_gpu, + pad_to_max=args.pad_to_max, + sampler=sampler_type) + + eval_datasets = [( + AudioToTextDataLayer( + dataset_dir=args.dataset_dir, + featurizer_config=featurizer_config_eval, + manifest_filepath=val_manifest, + labels=dataset_vocab, + batch_size=args.eval_batch_size, + multi_gpu=multi_gpu, + pad_to_max=args.pad_to_max + ), + args.eval_frequency, + 'Eval clean', + )] + + if tst_manifest: + eval_datasets.append(( + AudioToTextDataLayer( + dataset_dir=args.dataset_dir, + featurizer_config=featurizer_config_eval, + manifest_filepath=tst_manifest, + labels=dataset_vocab, + batch_size=args.eval_batch_size, + multi_gpu=multi_gpu, + pad_to_max=args.pad_to_max + ), + args.test_frequency, + 'Test other', + )) + + model = RNNT( + feature_config=featurizer_config, + rnnt=model_definition['rnnt'], + num_classes=len(ctc_vocab) + ) + + if args.ckpt is not None: + print_once("loading model from {}".format(args.ckpt)) + checkpoint = torch.load(args.ckpt, map_location="cpu") + model.load_state_dict(checkpoint['state_dict'], strict=True) + args.start_epoch = checkpoint['epoch'] + else: + args.start_epoch = 0 + + loss_fn = RNNTLoss(blank=len(ctc_vocab) - 1) + + N = len(data_layer) + if sampler_type == 'default': + args.step_per_epoch = math.ceil(N / (args.batch_size * (1 if not torch.distributed.is_initialized() else torch.distributed.get_world_size()))) + elif sampler_type == 'bucket': + args.step_per_epoch = int(len(data_layer.sampler) / args.batch_size ) + + print_once('-----------------') + print_once('Have {0} examples to train on.'.format(N)) + print_once('Have {0} steps / (gpu * epoch).'.format(args.step_per_epoch)) + print_once('-----------------') + + constant_lr_policy = lambda _: args.lr + fn_lr_policy = constant_lr_policy + if args.lr_decay: + pre_decay_policy = fn_lr_policy + fn_lr_policy = lambda s: lr_decay(args.num_epochs * args.step_per_epoch, s, pre_decay_policy(s)) + if args.lr_warmup: + pre_warmup_policy = fn_lr_policy + fn_lr_policy = lambda s: lr_warmup(args.lr_warmup, s, pre_warmup_policy(s) ) + + + model.cuda() + + + if args.optimizer_kind == "novograd": + optimizer = Novograd(model.parameters(), + lr=args.lr, + weight_decay=args.weight_decay) + elif args.optimizer_kind == "adam": + optimizer = AdamW(model.parameters(), + lr=args.lr, + weight_decay=args.weight_decay) + else: + raise ValueError("invalid optimizer choice: {}".format(args.optimizer_kind)) + + if optim_level in AmpOptimizations: + model, optimizer = amp.initialize( + min_loss_scale=0.125, + models=model, + optimizers=optimizer, + opt_level=AmpOptimizations[optim_level] + ) + + if args.ckpt is not None: + optimizer.load_state_dict(checkpoint['optimizer']) + + model = model_multi_gpu(model, multi_gpu) + print_once(model) + print_once("# parameters: {}".format(sum(p.numel() for p in model.parameters()))) + greedy_decoder = RNNTGreedyDecoder(len(ctc_vocab) - 1, model.module if multi_gpu else model) + + if args.tb_path and args.local_rank == 0: + logger = TensorBoardLogger(args.tb_path, model.module if multi_gpu else model, args.histogram) + else: + logger = DummyLogger() + + train( + data_layer=data_layer, + model=model, + loss_fn=loss_fn, + greedy_decoder=greedy_decoder, + optimizer=optimizer, + data_transforms=train_transforms, + labels=ctc_vocab, + optim_level=optim_level, + multi_gpu=multi_gpu, + 
fn_lr_policy=fn_lr_policy, + evalutaion=evaluator(model, eval_transforms, loss_fn, greedy_decoder, ctc_vocab, eval_datasets, logger), + logger=logger, + args=args) + +def parse_args(): + parser = argparse.ArgumentParser(description='RNNT Training Reference') + parser.add_argument("--local_rank", default=None, type=int) + parser.add_argument("--batch_size", default=16, type=int, help='data batch size') + parser.add_argument("--eval_batch_size", default=1, type=int, help='eval data batch size') + parser.add_argument("--num_epochs", default=10, type=int, help='number of training epochs. if number of steps if specified will overwrite this') + parser.add_argument("--num_steps", default=None, type=int, help='if specified overwrites num_epochs and will only train for this number of iterations') + parser.add_argument("--save_freq", dest="save_frequency", default=300, type=int, help='number of epochs until saving checkpoint. will save at the end of training too.') + parser.add_argument("--eval_freq", dest="eval_frequency", default=1, type=int, help='number of epochs until doing evaluation on full dataset') + parser.add_argument("--test_freq", dest="test_frequency", default=2, type=int, help='number of epochs until doing test on full dataset') + parser.add_argument("--train_freq", dest="train_frequency", default=25, type=int, help='number of iterations until printing training statistics on the past iteration') + parser.add_argument("--lr", default=1e-3, type=float, help='learning rate') + parser.add_argument("--weight_decay", default=1e-3, type=float, help='weight decay rate') + parser.add_argument("--train_manifest", type=str, required=True, help='relative path given dataset folder of training manifest file') + parser.add_argument("--model_toml", type=str, required=True, help='relative path given dataset folder of model configuration file') + parser.add_argument("--val_manifest", type=str, required=True, help='relative path given dataset folder of evaluation manifest file') + parser.add_argument("--tst_manifest", type=str, required=False, help='relative path given dataset folder of test manifest file') + parser.add_argument("--max_duration", type=float, help='maximum duration of audio samples for training and evaluation') + parser.add_argument("--pad_to_max", action="store_true", default=False, help="pad sequence to max_duration") + parser.add_argument("--gradient_accumulation_steps", default=1, type=int, help='number of accumulation steps') + parser.add_argument("--optimizer", dest="optimizer_kind", default="novograd", type=str, help='optimizer') + parser.add_argument("--dataset_dir", dest="dataset_dir", required=True, type=str, help='root dir of dataset') + parser.add_argument("--lr_decay", action="store_true", default=False, help='use learning rate decay') + parser.add_argument("--lr_warmup", type=int, default=None, help='if provided, the learning rate will linearly scale for given number of iterations from zero') + parser.add_argument("--cudnn", action="store_true", default=False, help="enable cudnn benchmark") + parser.add_argument("--fp16", action="store_true", default=False, help="use mixed precision training") + parser.add_argument("--output_dir", type=str, required=True, help='saves results in this directory') + parser.add_argument("--ckpt", default=None, type=str, help="if specified continues training from given checkpoint. 
Otherwise starts from beginning") + parser.add_argument("--seed", default=42, type=int, help='seed') + parser.add_argument("--tb_path", default=None, type=str, help='where to store tensorboard data') + parser.add_argument("--histogram", default=False, action='store_true', help='whether to log param and grad histograms') + args=parser.parse_args() + return args + + +if __name__=="__main__": + args = parse_args() + print_dict(vars(args)) + main(args) diff --git a/rnn_speech_recognition/pytorch/utils/__init__.py b/rnn_speech_recognition/pytorch/utils/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/rnn_speech_recognition/pytorch/utils/convert_librispeech.py b/rnn_speech_recognition/pytorch/utils/convert_librispeech.py new file mode 100644 index 000000000..914997516 --- /dev/null +++ b/rnn_speech_recognition/pytorch/utils/convert_librispeech.py @@ -0,0 +1,81 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +#!/usr/bin/env python +import argparse +import os +import glob +import multiprocessing +import json + +import pandas as pd + +from preprocessing_utils import parallel_preprocess + +parser = argparse.ArgumentParser(description='Preprocess LibriSpeech.') +parser.add_argument('--input_dir', type=str, required=True, + help='LibriSpeech collection input dir') +parser.add_argument('--dest_dir', type=str, required=True, + help='Output dir') +parser.add_argument('--output_json', type=str, default='./', + help='name of the output json file.') +parser.add_argument('-s','--speed', type=float, nargs='*', + help='Speed perturbation ratio') +parser.add_argument('--target_sr', type=int, default=None, + help='Target sample rate. ' + 'defaults to the input sample rate') +parser.add_argument('--overwrite', action='store_true', + help='Overwrite file if exists') +parser.add_argument('--parallel', type=int, default=multiprocessing.cpu_count(), + help='Number of threads to use when processing audio files') +args = parser.parse_args() + +args.input_dir = args.input_dir.rstrip('/') +args.dest_dir = args.dest_dir.rstrip('/') + +def build_input_arr(input_dir): + txt_files = glob.glob(os.path.join(input_dir, '**', '*.trans.txt'), + recursive=True) + input_data = [] + for txt_file in txt_files: + rel_path = os.path.relpath(txt_file, input_dir) + with open(txt_file) as fp: + for line in fp: + fname, _, transcript = line.partition(' ') + input_data.append(dict(input_relpath=os.path.dirname(rel_path), + input_fname=fname+'.flac', + transcript=transcript)) + return input_data + + +print("[%s] Scaning input dir..." % args.output_json) +dataset = build_input_arr(input_dir=args.input_dir) + +print("[%s] Converting audio files..." % args.output_json) +dataset = parallel_preprocess(dataset=dataset, + input_dir=args.input_dir, + dest_dir=args.dest_dir, + target_sr=args.target_sr, + speed=args.speed, + overwrite=args.overwrite, + parallel=args.parallel) + +print("[%s] Generating json..." 
% args.output_json) +df = pd.DataFrame(dataset, dtype=object) + +# Save json with python. df.to_json() produces back slashed in file paths +dataset = df.to_dict(orient='records') +with open(args.output_json, 'w') as fp: + json.dump(dataset, fp, indent=2) diff --git a/rnn_speech_recognition/pytorch/utils/download_librispeech.py b/rnn_speech_recognition/pytorch/utils/download_librispeech.py new file mode 100644 index 000000000..ad36ad4e4 --- /dev/null +++ b/rnn_speech_recognition/pytorch/utils/download_librispeech.py @@ -0,0 +1,72 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#!/usr/bin/env python + +import os +import argparse +import pandas as pd + +from download_utils import download_file, md5_checksum, extract + +parser = argparse.ArgumentParser(description='Download, verify and extract dataset files') +parser.add_argument('csv', type=str, + help='CSV file with urls and checksums to download.') +parser.add_argument('dest', type=str, + help='Download destnation folder.') +parser.add_argument('-e', type=str, default=None, + help='Extraction destnation folder. Defaults to download folder if not provided') +parser.add_argument('--skip_download', action='store_true', + help='Skip downloading the files') +parser.add_argument('--skip_checksum', action='store_true', + help='Skip checksum') +parser.add_argument('--skip_extract', action='store_true', + help='Skip extracting files') +args = parser.parse_args() +args.e = args.e or args.dest + + +df = pd.read_csv(args.csv, delimiter=',') + + +if not args.skip_download: + for url in df.url: + fname = url.split('/')[-1] + print("Downloading %s:" % fname) + download_file(url=url, dest_folder=args.dest, fname=fname) +else: + print("Skipping file download") + + +if not args.skip_checksum: + for index, row in df.iterrows(): + url = row['url'] + md5 = row['md5'] + fname = url.split('/')[-1] + fpath = os.path.join(args.dest, fname) + print("Verifing %s: " % fname, end='') + ret = md5_checksum(fpath=fpath, target_hash=md5) + print("Passed" if ret else "Failed") +else: + print("Skipping checksum") + + +if not args.skip_extract: + for url in df.url: + fname = url.split('/')[-1] + fpath = os.path.join(args.dest, fname) + print("Decompressing %s:" % fpath) + extract(fpath=fpath, dest_folder=args.e) +else: + print("Skipping file extraction") diff --git a/rnn_speech_recognition/pytorch/utils/download_utils.py b/rnn_speech_recognition/pytorch/utils/download_utils.py new file mode 100644 index 000000000..e881388a6 --- /dev/null +++ b/rnn_speech_recognition/pytorch/utils/download_utils.py @@ -0,0 +1,68 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#!/usr/bin/env python + +import hashlib +import requests +import os +import tarfile +import tqdm + +def download_file(url, dest_folder, fname, overwrite=False): + fpath = os.path.join(dest_folder, fname) + if os.path.isfile(fpath): + if overwrite: + print("Overwriting existing file") + else: + print("File exists, skipping download.") + return + + tmp_fpath = fpath + '.tmp' + + r = requests.get(url, stream=True) + file_size = int(r.headers['Content-Length']) + chunk_size = 1024 * 1024 # 1MB + total_chunks = int(file_size / chunk_size) + + with open(tmp_fpath, 'wb') as fp: + content_iterator = r.iter_content(chunk_size=chunk_size) + chunks = tqdm.tqdm(content_iterator, total=total_chunks, + unit='MB', desc=fpath, leave=True) + for chunk in chunks: + fp.write(chunk) + + os.rename(tmp_fpath, fpath) + + +def md5_checksum(fpath, target_hash): + file_hash = hashlib.md5() + with open(fpath, "rb") as fp: + for chunk in iter(lambda: fp.read(1024*1024), b""): + file_hash.update(chunk) + return file_hash.hexdigest() == target_hash + + +def extract(fpath, dest_folder): + if fpath.endswith('.tar.gz'): + mode = 'r:gz' + elif fpath.endswith('.tar'): + mode = 'r:' + else: + raise IOError('fpath has unknown extention: %s' % fpath) + + with tarfile.open(fpath, mode) as tar: + members = tar.getmembers() + for member in tqdm.tqdm(iterable=members, total=len(members), leave=True): + tar.extract(path=dest_folder, member=member) diff --git a/rnn_speech_recognition/pytorch/utils/inference_librispeech.csv b/rnn_speech_recognition/pytorch/utils/inference_librispeech.csv new file mode 100644 index 000000000..40dac4e0e --- /dev/null +++ b/rnn_speech_recognition/pytorch/utils/inference_librispeech.csv @@ -0,0 +1,5 @@ +url,md5 +http://www.openslr.org/resources/12/dev-clean.tar.gz,42e2234ba48799c1f50f24a7926300a1 +http://www.openslr.org/resources/12/dev-other.tar.gz,c8d0bcc9cca99d4f8b62fcc847357931 +http://www.openslr.org/resources/12/test-clean.tar.gz,32fa31d27d2e1cad72775fee3f4849a9 +http://www.openslr.org/resources/12/test-other.tar.gz,fb5a50374b501bb3bac4815ee91d3135 diff --git a/rnn_speech_recognition/pytorch/utils/librispeech.csv b/rnn_speech_recognition/pytorch/utils/librispeech.csv new file mode 100644 index 000000000..d48a9f8db --- /dev/null +++ b/rnn_speech_recognition/pytorch/utils/librispeech.csv @@ -0,0 +1,8 @@ +url,md5 +http://www.openslr.org/resources/12/dev-clean.tar.gz,42e2234ba48799c1f50f24a7926300a1 +http://www.openslr.org/resources/12/dev-other.tar.gz,c8d0bcc9cca99d4f8b62fcc847357931 +http://www.openslr.org/resources/12/test-clean.tar.gz,32fa31d27d2e1cad72775fee3f4849a9 +http://www.openslr.org/resources/12/test-other.tar.gz,fb5a50374b501bb3bac4815ee91d3135 +http://www.openslr.org/resources/12/train-clean-100.tar.gz,2a93770f6d5c6c964bc36631d331a522 +http://www.openslr.org/resources/12/train-clean-360.tar.gz,c0e676e450a7ff2f54aeade5171606fa +http://www.openslr.org/resources/12/train-other-500.tar.gz,d1a0fd59409feb2c614ce4d30c387708 diff --git a/rnn_speech_recognition/pytorch/utils/preprocessing_utils.py b/rnn_speech_recognition/pytorch/utils/preprocessing_utils.py new file mode 100644 index 
000000000..15605cea2 --- /dev/null +++ b/rnn_speech_recognition/pytorch/utils/preprocessing_utils.py @@ -0,0 +1,76 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#!/usr/bin/env python +import os +import multiprocessing +import librosa +import functools + +import sox + + +from tqdm import tqdm + +def preprocess(data, input_dir, dest_dir, target_sr=None, speed=None, + overwrite=True): + speed = speed or [] + speed.append(1) + speed = list(set(speed)) # Make uniqe + + input_fname = os.path.join(input_dir, + data['input_relpath'], + data['input_fname']) + input_sr = sox.file_info.sample_rate(input_fname) + target_sr = target_sr or input_sr + + os.makedirs(os.path.join(dest_dir, data['input_relpath']), exist_ok=True) + + output_dict = {} + output_dict['transcript'] = data['transcript'].lower().strip() + output_dict['files'] = [] + + fname = os.path.splitext(data['input_fname'])[0] + for s in speed: + output_fname = fname + '{}.wav'.format('' if s==1 else '-{}'.format(s)) + output_fpath = os.path.join(dest_dir, + data['input_relpath'], + output_fname) + + if not os.path.exists(output_fpath) or overwrite: + cbn = sox.Transformer().speed(factor=s).convert(target_sr) + cbn.build(input_fname, output_fpath) + + file_info = sox.file_info.info(output_fpath) + file_info['fname'] = os.path.join(os.path.basename(dest_dir), + data['input_relpath'], + output_fname) + file_info['speed'] = s + output_dict['files'].append(file_info) + + if s == 1: + file_info = sox.file_info.info(output_fpath) + output_dict['original_duration'] = file_info['duration'] + output_dict['original_num_samples'] = file_info['num_samples'] + + return output_dict + + +def parallel_preprocess(dataset, input_dir, dest_dir, target_sr, speed, overwrite, parallel): + with multiprocessing.Pool(parallel) as p: + func = functools.partial(preprocess, + input_dir=input_dir, dest_dir=dest_dir, + target_sr=target_sr, speed=speed, overwrite=overwrite) + dataset = list(tqdm(p.imap(func, dataset), total=len(dataset))) + return dataset
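For completeness, the two CSV manifests above serve different entry points: utils/librispeech.csv covers the full corpus and is what scripts/download_librispeech.sh feeds to the downloader, while utils/inference_librispeech.csv lists only the dev and test archives. The utilities can also be run directly, for example to fetch and prepare just the inference subsets; this assumes the sox command-line tool plus the sox, pandas and tqdm Python packages are installed, as in the benchmark container:

  mkdir -p /datasets/LibriSpeech
  python utils/download_librispeech.py utils/inference_librispeech.csv /datasets/LibriSpeech -e /datasets/
  python utils/convert_librispeech.py \
      --input_dir /datasets/LibriSpeech/dev-clean \
      --dest_dir /datasets/LibriSpeech/dev-clean-wav \
      --output_json /datasets/LibriSpeech/librispeech-dev-clean-wav.json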