From c30c889f29b9e10b98c3aaafa0a18833f25d8694 Mon Sep 17 00:00:00 2001 From: Marek Wawrzos Date: Fri, 31 Jan 2020 23:08:43 +0100 Subject: [PATCH] Adding RNN-Transducer - RNN speech recognition benchmark (#329) * RNN-Transducer from https://github.com/ryanleary/mlperf-rnnt-ref * fixes after moving eval function * Fix spelling of Speech Benchmark directory * Fix inference script for greedy decode * use 80 input features * dropout on each layer and no batch normalization * fix inference script after preprocessing rewrite * further fixes to inference.py after preprocessing rewrite --- rnn_speech_recognition/pytorch/Dockerfile | 46 ++ rnn_speech_recognition/pytorch/LICENSE | 204 ++++++++ rnn_speech_recognition/pytorch/NOTICE | 5 + rnn_speech_recognition/pytorch/README.md | 44 ++ .../pytorch/configs/rnnt.toml | 77 +++ .../pytorch/configs/rnnt_bn.toml | 78 +++ .../pytorch/configs/rnnt_ln.toml | 78 +++ rnn_speech_recognition/pytorch/dataset.py | 266 ++++++++++ rnn_speech_recognition/pytorch/decoders.py | 136 +++++ rnn_speech_recognition/pytorch/helpers.py | 212 ++++++++ rnn_speech_recognition/pytorch/inference.py | 247 +++++++++ .../pytorch/inference_benchmark.py | 246 +++++++++ rnn_speech_recognition/pytorch/loss.py | 104 ++++ rnn_speech_recognition/pytorch/metrics.py | 67 +++ rnn_speech_recognition/pytorch/model.py | 452 +++++++++++++++++ rnn_speech_recognition/pytorch/model_rnnt.py | 289 +++++++++++ rnn_speech_recognition/pytorch/multiproc.py | 190 +++++++ rnn_speech_recognition/pytorch/optimizers.py | 223 ++++++++ .../pytorch/parts/features.py | 349 +++++++++++++ .../pytorch/parts/manifest.py | 170 +++++++ .../pytorch/parts/perturb.py | 111 ++++ .../pytorch/parts/segment.py | 170 +++++++ .../pytorch/parts/text/LICENSE | 19 + .../pytorch/parts/text/__init__.py | 12 + .../pytorch/parts/text/cleaners.py | 107 ++++ .../pytorch/parts/text/numbers.py | 99 ++++ .../pytorch/parts/text/symbols.py | 19 + .../pytorch/preprocessing.py | 123 +++++ .../pytorch/requirements.txt | 10 + rnn_speech_recognition/pytorch/rnn.py | 402 +++++++++++++++ .../pytorch/scripts/docker/build.sh | 3 + .../pytorch/scripts/docker/launch.sh | 32 ++ .../pytorch/scripts/download_librispeech.sh | 28 + .../pytorch/scripts/evaluation.sh | 92 ++++ .../pytorch/scripts/inference.sh | 104 ++++ .../pytorch/scripts/inference_benchmark.sh | 84 +++ .../pytorch/scripts/preprocess_librispeech.sh | 51 ++ .../pytorch/scripts/train.sh | 113 +++++ .../pytorch/scripts/train_benchmark.sh | 130 +++++ rnn_speech_recognition/pytorch/tb_logger.py | 52 ++ rnn_speech_recognition/pytorch/train.py | 477 ++++++++++++++++++ .../pytorch/utils/__init__.py | 0 .../pytorch/utils/convert_librispeech.py | 81 +++ .../pytorch/utils/download_librispeech.py | 72 +++ .../pytorch/utils/download_utils.py | 68 +++ .../pytorch/utils/inference_librispeech.csv | 5 + .../pytorch/utils/librispeech.csv | 8 + .../pytorch/utils/preprocessing_utils.py | 76 +++ 48 files changed, 6031 insertions(+) create mode 100755 rnn_speech_recognition/pytorch/Dockerfile create mode 100644 rnn_speech_recognition/pytorch/LICENSE create mode 100644 rnn_speech_recognition/pytorch/NOTICE create mode 100644 rnn_speech_recognition/pytorch/README.md create mode 100644 rnn_speech_recognition/pytorch/configs/rnnt.toml create mode 100644 rnn_speech_recognition/pytorch/configs/rnnt_bn.toml create mode 100644 rnn_speech_recognition/pytorch/configs/rnnt_ln.toml create mode 100644 rnn_speech_recognition/pytorch/dataset.py create mode 100644 rnn_speech_recognition/pytorch/decoders.py create mode 100644 
rnn_speech_recognition/pytorch/helpers.py create mode 100644 rnn_speech_recognition/pytorch/inference.py create mode 100644 rnn_speech_recognition/pytorch/inference_benchmark.py create mode 100644 rnn_speech_recognition/pytorch/loss.py create mode 100644 rnn_speech_recognition/pytorch/metrics.py create mode 100644 rnn_speech_recognition/pytorch/model.py create mode 100644 rnn_speech_recognition/pytorch/model_rnnt.py create mode 100644 rnn_speech_recognition/pytorch/multiproc.py create mode 100644 rnn_speech_recognition/pytorch/optimizers.py create mode 100644 rnn_speech_recognition/pytorch/parts/features.py create mode 100644 rnn_speech_recognition/pytorch/parts/manifest.py create mode 100644 rnn_speech_recognition/pytorch/parts/perturb.py create mode 100644 rnn_speech_recognition/pytorch/parts/segment.py create mode 100644 rnn_speech_recognition/pytorch/parts/text/LICENSE create mode 100644 rnn_speech_recognition/pytorch/parts/text/__init__.py create mode 100644 rnn_speech_recognition/pytorch/parts/text/cleaners.py create mode 100644 rnn_speech_recognition/pytorch/parts/text/numbers.py create mode 100644 rnn_speech_recognition/pytorch/parts/text/symbols.py create mode 100644 rnn_speech_recognition/pytorch/preprocessing.py create mode 100755 rnn_speech_recognition/pytorch/requirements.txt create mode 100644 rnn_speech_recognition/pytorch/rnn.py create mode 100755 rnn_speech_recognition/pytorch/scripts/docker/build.sh create mode 100755 rnn_speech_recognition/pytorch/scripts/docker/launch.sh create mode 100755 rnn_speech_recognition/pytorch/scripts/download_librispeech.sh create mode 100755 rnn_speech_recognition/pytorch/scripts/evaluation.sh create mode 100755 rnn_speech_recognition/pytorch/scripts/inference.sh create mode 100755 rnn_speech_recognition/pytorch/scripts/inference_benchmark.sh create mode 100755 rnn_speech_recognition/pytorch/scripts/preprocess_librispeech.sh create mode 100755 rnn_speech_recognition/pytorch/scripts/train.sh create mode 100755 rnn_speech_recognition/pytorch/scripts/train_benchmark.sh create mode 100644 rnn_speech_recognition/pytorch/tb_logger.py create mode 100644 rnn_speech_recognition/pytorch/train.py create mode 100644 rnn_speech_recognition/pytorch/utils/__init__.py create mode 100644 rnn_speech_recognition/pytorch/utils/convert_librispeech.py create mode 100644 rnn_speech_recognition/pytorch/utils/download_librispeech.py create mode 100644 rnn_speech_recognition/pytorch/utils/download_utils.py create mode 100644 rnn_speech_recognition/pytorch/utils/inference_librispeech.csv create mode 100644 rnn_speech_recognition/pytorch/utils/librispeech.csv create mode 100644 rnn_speech_recognition/pytorch/utils/preprocessing_utils.py diff --git a/rnn_speech_recognition/pytorch/Dockerfile b/rnn_speech_recognition/pytorch/Dockerfile new file mode 100755 index 000000000..1cb52bf62 --- /dev/null +++ b/rnn_speech_recognition/pytorch/Dockerfile @@ -0,0 +1,46 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:19.09-py3 +FROM ${FROM_IMAGE_NAME} + + +RUN apt-get update && apt-get install -y libsndfile1 && apt-get install -y sox && rm -rf /var/lib/apt/lists/* + +RUN COMMIT_SHA=c6d12f9e1562833c2b4e7ad84cb22aa4ba31d18c && \ + git clone https://github.com/HawkAaron/warp-transducer deps/warp-transducer && \ + cd deps/warp-transducer && \ + git checkout $COMMIT_SHA && \ + mkdir build && \ + cd build && \ + cmake .. && \ + make VERBOSE=1 && \ + export CUDA_HOME="/usr/local/cuda" && \ + export WARP_RNNT_PATH=`pwd` && \ + export CUDA_TOOLKIT_ROOT_DIR=$CUDA_HOME && \ + export LD_LIBRARY_PATH="$CUDA_HOME/extras/CUPTI/lib64:$LD_LIBRARY_PATH" && \ + export LIBRARY_PATH=$CUDA_HOME/lib64:$LIBRARY_PATH && \ + export LD_LIBRARY_PATH=$CUDA_HOME/lib64:$LD_LIBRARY_PATH && \ + export CFLAGS="-I$CUDA_HOME/include $CFLAGS" && \ + cd ../pytorch_binding && \ + python3 setup.py install --user && \ + rm -rf ../tests test ../tensorflow_binding && \ + cd ../../.. + +WORKDIR /workspace/jasper + +COPY requirements.txt . +RUN pip install --disable-pip-version-check -U -r requirements.txt + +COPY . . diff --git a/rnn_speech_recognition/pytorch/LICENSE b/rnn_speech_recognition/pytorch/LICENSE new file mode 100644 index 000000000..75ee157cd --- /dev/null +++ b/rnn_speech_recognition/pytorch/LICENSE @@ -0,0 +1,204 @@ + Except where otherwise noted, the following license applies to all files in this repo. + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. 
For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!) The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright 2019 NVIDIA Corporation
+   Copyright 2019 Myrtle Software Limited, www.myrtle.ai
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/rnn_speech_recognition/pytorch/NOTICE b/rnn_speech_recognition/pytorch/NOTICE
new file mode 100644
index 000000000..7916839bc
--- /dev/null
+++ b/rnn_speech_recognition/pytorch/NOTICE
@@ -0,0 +1,5 @@
+Jasper in PyTorch
+
+This repository includes source code (in "parts/") from:
+* https://github.com/keithito/tacotron and https://github.com/ryanleary/patter licensed under MIT license.
+
diff --git a/rnn_speech_recognition/pytorch/README.md b/rnn_speech_recognition/pytorch/README.md
new file mode 100644
index 000000000..f62118c06
--- /dev/null
+++ b/rnn_speech_recognition/pytorch/README.md
@@ -0,0 +1,44 @@
+# DISCLAIMER
+This codebase is a work in progress. There are known and unknown bugs in the implementation, and it has not been optimized in any way.
+
+MLPerf has neither finalized a decision to add a speech recognition benchmark, nor selected this implementation/architecture as a reference implementation.
+
+# 1. Problem
+Speech recognition accepts raw audio samples and produces a corresponding text transcription.
+
+# 2. Directions
+See https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/SpeechRecognition/Jasper/README.md. This implementation shares significant code with that repository.
+
+# 3. Dataset/Environment
+### Publication/Attribution
+["OpenSLR LibriSpeech Corpus"](http://www.openslr.org/12/) provides over 1000 hours of speech data in the form of raw audio.
+### Data preprocessing
+What preprocessing is done to the dataset?
+### Training and test data separation
+How is the test set extracted?
+### Training data order
+In what order is the training data traversed?
+### Test data order
+In what order is the test data traversed?
+### Simulation environment (RL models only)
+Describe simulation environment briefly, if applicable.
+# 4. Model
+### Publication/Attribution
+Cite paper describing model plus any additional attribution requested by code authors
+### List of layers
+Brief summary of structure of model
+### Weight and bias initialization
+How are weights and biases initialized?
+### Loss function
+Transducer Loss
+### Optimizer
+TBD, currently Adam
+# 5. Quality
+### Quality metric
+Word Error Rate (WER) across all words in the output text of all samples in the validation set.
+### Quality target
+What is the numeric quality target?
+### Evaluation frequency
+TBD
+### Evaluation thoroughness
+TBD
\ No newline at end of file
diff --git a/rnn_speech_recognition/pytorch/configs/rnnt.toml b/rnn_speech_recognition/pytorch/configs/rnnt.toml
new file mode 100644
index 000000000..11ed8b91a
--- /dev/null
+++ b/rnn_speech_recognition/pytorch/configs/rnnt.toml
@@ -0,0 +1,77 @@
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2019, Myrtle Software Limited. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +model = "RNNT" + +[input] +normalize = "per_feature" +sample_rate = 16000 +window_size = 0.02 +window_stride = 0.01 +window = "hann" +features = 80 +n_fft = 512 +frame_splicing = 3 +dither = 0.00001 +feat_type = "logfbank" +normalize_transcripts = true +trim_silence = true +pad_to = 0 # TODO +max_duration = 16.7 +speed_perturbation = true + + +cutout_rect_regions = 0 +cutout_rect_time = 60 +cutout_rect_freq = 25 + + +cutout_x_regions = 2 +cutout_y_regions = 2 +cutout_x_width = 6 +cutout_y_width = 6 + + +[input_eval] +normalize = "per_feature" +sample_rate = 16000 +window_size = 0.02 +window_stride = 0.01 +window = "hann" +features = 80 +n_fft = 512 +frame_splicing = 3 +dither = 0.00001 +feat_type = "logfbank" +normalize_transcripts = true +trim_silence = true +pad_to = 0 + + +[rnnt] +rnn_type = "lstm" +encoder_n_hidden = 1024 +encoder_pre_rnn_layers = 2 +encoder_stack_time_factor = 2 +encoder_post_rnn_layers = 3 +pred_n_hidden = 512 +pred_rnn_layers = 2 +forget_gate_bias = 1.0 +joint_n_hidden = 512 +dropout=0.32 + + +[labels] +labels = [" ", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "'"] diff --git a/rnn_speech_recognition/pytorch/configs/rnnt_bn.toml b/rnn_speech_recognition/pytorch/configs/rnnt_bn.toml new file mode 100644 index 000000000..c1908128f --- /dev/null +++ b/rnn_speech_recognition/pytorch/configs/rnnt_bn.toml @@ -0,0 +1,78 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2019, Myrtle Software Limited. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +model = "RNNT" + +[input] +normalize = "per_feature" +sample_rate = 16000 +window_size = 0.02 +window_stride = 0.01 +window = "hann" +features = 64 +n_fft = 512 +frame_splicing = 3 +dither = 0.00001 +feat_type = "logfbank" +normalize_transcripts = true +trim_silence = true +pad_to = 0 # TODO +max_duration = 16.7 +speed_perturbation = true + + +cutout_rect_regions = 0 +cutout_rect_time = 60 +cutout_rect_freq = 25 + + +cutout_x_regions = 2 +cutout_y_regions = 2 +cutout_x_width = 6 +cutout_y_width = 6 + + +[input_eval] +normalize = "per_feature" +sample_rate = 16000 +window_size = 0.02 +window_stride = 0.01 +window = "hann" +features = 64 +n_fft = 512 +frame_splicing = 3 +dither = 0.00001 +feat_type = "logfbank" +normalize_transcripts = true +trim_silence = true +pad_to = 0 + + +[rnnt] +rnn_type = "lstm" +norm = "batch_norm" +encoder_n_hidden = 1024 +encoder_pre_rnn_layers = 2 +encoder_stack_time_factor = 2 +encoder_post_rnn_layers = 3 +pred_n_hidden = 1024 +pred_rnn_layers = 2 +forget_gate_bias = 1.0 +joint_n_hidden = 640 +dropout=0.0 + + +[labels] +labels = [" ", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "'"] diff --git a/rnn_speech_recognition/pytorch/configs/rnnt_ln.toml b/rnn_speech_recognition/pytorch/configs/rnnt_ln.toml new file mode 100644 index 000000000..fd43b5595 --- /dev/null +++ b/rnn_speech_recognition/pytorch/configs/rnnt_ln.toml @@ -0,0 +1,78 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2019, Myrtle Software Limited. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +model = "RNNT" + +[input] +normalize = "per_feature" +sample_rate = 16000 +window_size = 0.02 +window_stride = 0.01 +window = "hann" +features = 64 +n_fft = 512 +frame_splicing = 3 +dither = 0.00001 +feat_type = "logfbank" +normalize_transcripts = true +trim_silence = true +pad_to = 0 # TODO +max_duration = 16.7 +speed_perturbation = true + + +cutout_rect_regions = 0 +cutout_rect_time = 60 +cutout_rect_freq = 25 + + +cutout_x_regions = 2 +cutout_y_regions = 2 +cutout_x_width = 6 +cutout_y_width = 6 + + +[input_eval] +normalize = "per_feature" +sample_rate = 16000 +window_size = 0.02 +window_stride = 0.01 +window = "hann" +features = 64 +n_fft = 512 +frame_splicing = 3 +dither = 0.00001 +feat_type = "logfbank" +normalize_transcripts = true +trim_silence = true +pad_to = 0 + + +[rnnt] +rnn_type = "lstm" +norm = "layer_norm" +encoder_n_hidden = 1024 +encoder_pre_rnn_layers = 2 +encoder_stack_time_factor = 2 +encoder_post_rnn_layers = 3 +pred_n_hidden = 1024 +pred_rnn_layers = 2 +forget_gate_bias = 1.0 +joint_n_hidden = 640 +dropout=0.0 + + +[labels] +labels = [" ", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "'"] diff --git a/rnn_speech_recognition/pytorch/dataset.py b/rnn_speech_recognition/pytorch/dataset.py new file mode 100644 index 000000000..ad88d2f01 --- /dev/null +++ b/rnn_speech_recognition/pytorch/dataset.py @@ -0,0 +1,266 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This file contains classes and functions related to data loading +""" +import torch +import numpy as np +import math +from torch.utils.data import Dataset, Sampler +import torch.distributed as dist +from parts.manifest import Manifest +from parts.features import WaveformFeaturizer + +class DistributedBucketBatchSampler(Sampler): + def __init__(self, dataset, batch_size, num_replicas=None, rank=None): + """Distributed sampler that buckets samples with similar length to minimize padding, + similar concept as pytorch BucketBatchSampler https://pytorchnlp.readthedocs.io/en/latest/source/torchnlp.samplers.html#torchnlp.samplers.BucketBatchSampler + + Args: + dataset: Dataset used for sampling. + batch_size: data batch size + num_replicas (optional): Number of processes participating in + distributed training. + rank (optional): Rank of the current process within num_replicas. 
+ """ + if num_replicas is None: + if not dist.is_available(): + raise RuntimeError("Requires distributed package to be available") + num_replicas = dist.get_world_size() + if rank is None: + if not dist.is_available(): + raise RuntimeError("Requires distributed package to be available") + rank = dist.get_rank() + self.dataset = dataset + self.dataset_size = len(dataset) + self.num_replicas = num_replicas + self.rank = rank + self.epoch = 0 + self.batch_size = batch_size + self.tile_size = batch_size * self.num_replicas + self.num_buckets = 6 + self.bucket_size = self.round_up_to(math.ceil(self.dataset_size / self.num_buckets), self.tile_size) + self.index_count = self.round_up_to(self.dataset_size, self.tile_size) + self.num_samples = self.index_count // self.num_replicas + + def round_up_to(self, x, mod): + return (x + mod - 1) // mod * mod + + def __iter__(self): + g = torch.Generator() + g.manual_seed(self.epoch) + indices = np.arange(self.index_count) % self.dataset_size + for bucket in range(self.num_buckets): + bucket_start = self.bucket_size * bucket + bucket_end = min(bucket_start + self.bucket_size, self.index_count) + indices[bucket_start:bucket_end] = indices[bucket_start:bucket_end][torch.randperm(bucket_end - bucket_start, generator=g)] + + tile_indices = torch.randperm(self.index_count // self.tile_size, generator=g) + for tile_index in tile_indices: + start_index = self.tile_size * tile_index + self.batch_size * self.rank + end_index = start_index + self.batch_size + yield indices[start_index:end_index] + + def __len__(self): + return self.num_samples + + def set_epoch(self, epoch): + self.epoch = epoch + +class data_prefetcher(): + def __init__(self, loader): + self.loader = iter(loader) + self.stream = torch.cuda.Stream() + self.preload() + + def preload(self): + try: + self.next_input = next(self.loader) + except StopIteration: + self.next_input = None + return + with torch.cuda.stream(self.stream): + self.next_input = [ x.cuda(non_blocking=True) for x in self.next_input] + + def __next__(self): + torch.cuda.current_stream().wait_stream(self.stream) + input = self.next_input + self.preload() + return input + def next(self): + return self.__next__() + def __iter__(self): + return self + +def seq_collate_fn(batch): + """batches samples and returns as tensors + Args: + batch : list of samples + Returns + batches of tensors + """ + batch_size = len(batch) + def _find_max_len(lst, ind): + max_len = -1 + for item in lst: + if item[ind].size(0) > max_len: + max_len = item[ind].size(0) + return max_len + max_audio_len = _find_max_len(batch, 0) + max_transcript_len = _find_max_len(batch, 2) + + batched_audio_signal = torch.zeros(batch_size, max_audio_len) + batched_transcript = torch.zeros(batch_size, max_transcript_len) + audio_lengths = [] + transcript_lengths = [] + for ind, sample in enumerate(batch): + batched_audio_signal[ind].narrow(0, 0, sample[0].size(0)).copy_(sample[0]) + audio_lengths.append(sample[1]) + batched_transcript[ind].narrow(0, 0, sample[2].size(0)).copy_(sample[2]) + transcript_lengths.append(sample[3]) + return batched_audio_signal, torch.stack(audio_lengths), batched_transcript, \ + torch.stack(transcript_lengths) + +class AudioToTextDataLayer: + """Data layer with data loader + """ + def __init__(self, **kwargs): + self._device = torch.device("cuda") + + featurizer_config = kwargs['featurizer_config'] + pad_to_max = kwargs.get('pad_to_max', False) + perturb_config = kwargs.get('perturb_config', None) + manifest_filepath = kwargs['manifest_filepath'] + 
dataset_dir = kwargs['dataset_dir'] + labels = kwargs['labels'] + batch_size = kwargs['batch_size'] + drop_last = kwargs.get('drop_last', False) + shuffle = kwargs.get('shuffle', True) + min_duration = featurizer_config.get('min_duration', 0.1) + max_duration = featurizer_config.get('max_duration', None) + normalize_transcripts = kwargs.get('normalize_transcripts', True) + trim_silence = kwargs.get('trim_silence', False) + multi_gpu = kwargs.get('multi_gpu', False) + sampler_type = kwargs.get('sampler', 'default') + speed_perturbation = featurizer_config.get('speed_perturbation', False) + sort_by_duration=sampler_type == 'bucket' + self._featurizer = WaveformFeaturizer.from_config(featurizer_config, perturbation_configs=perturb_config) + self._dataset = AudioDataset( + dataset_dir=dataset_dir, + manifest_filepath=manifest_filepath, + labels=labels, blank_index=len(labels), + sort_by_duration=sort_by_duration, + pad_to_max=pad_to_max, + featurizer=self._featurizer, max_duration=max_duration, + min_duration=min_duration, normalize=normalize_transcripts, + trim=trim_silence, speed_perturbation=speed_perturbation) + + print('sort_by_duration', sort_by_duration) + + if not multi_gpu: + self.sampler = None + self._dataloader = torch.utils.data.DataLoader( + dataset=self._dataset, + batch_size=batch_size, + collate_fn=lambda b: seq_collate_fn(b), + drop_last=drop_last, + shuffle=shuffle if self.sampler is None else False, + num_workers=4, + pin_memory=True, + sampler=self.sampler + ) + elif sampler_type == 'bucket': + self.sampler = DistributedBucketBatchSampler(self._dataset, batch_size=batch_size) + print("DDBucketSampler") + self._dataloader = torch.utils.data.DataLoader( + dataset=self._dataset, + collate_fn=lambda b: seq_collate_fn(b), + num_workers=4, + pin_memory=True, + batch_sampler=self.sampler + ) + elif sampler_type == 'default': + self.sampler = torch.utils.data.distributed.DistributedSampler(self._dataset) + print("DDSampler") + self._dataloader = torch.utils.data.DataLoader( + dataset=self._dataset, + batch_size=batch_size, + collate_fn=lambda b: seq_collate_fn(b), + drop_last=drop_last, + shuffle=shuffle if self.sampler is None else False, + num_workers=4, + pin_memory=True, + sampler=self.sampler + ) + else: + raise RuntimeError("Sampler {} not supported".format(sampler_type)) + + def __len__(self): + return len(self._dataset) + + @property + def data_iterator(self): + return self._dataloader + +class AudioDataset(Dataset): + def __init__(self, dataset_dir, manifest_filepath, labels, featurizer, max_duration=None, pad_to_max=False, + min_duration=None, blank_index=0, max_utts=0, normalize=True, sort_by_duration=False, + trim=False, speed_perturbation=False): + """Dataset that loads tensors via a json file containing paths to audio files, transcripts, and durations + (in seconds). Each entry is a different audio sample. + Args: + dataset_dir: absolute path to dataset folder + manifest_filepath: relative path from dataset folder to manifest json as described above. Can be coma-separated paths. 
+ labels: String containing all the possible characters to map to + featurizer: Initialized featurizer class that converts paths of audio to feature tensors + max_duration: If audio exceeds this length, do not include in dataset + min_duration: If audio is less than this length, do not include in dataset + pad_to_max: if specified input sequences into dnn model will be padded to max_duration + blank_index: blank index for ctc loss / decoder + max_utts: Limit number of utterances + normalize: whether to normalize transcript text + sort_by_duration: whether or not to sort sequences by increasing duration + trim: if specified trims leading and trailing silence from an audio signal. + speed_perturbation: specify if using data contains speed perburbation + """ + m_paths = manifest_filepath.split(',') + self.manifest = Manifest(dataset_dir, m_paths, labels, blank_index, pad_to_max=pad_to_max, + max_duration=max_duration, + sort_by_duration=sort_by_duration, + min_duration=min_duration, max_utts=max_utts, + normalize=normalize, speed_perturbation=speed_perturbation) + self.featurizer = featurizer + self.blank_index = blank_index + self.trim = trim + print( + "Dataset loaded with {0:.2f} hours. Filtered {1:.2f} hours.".format( + self.manifest.duration / 3600, + self.manifest.filtered_duration / 3600)) + + def __getitem__(self, index): + sample = self.manifest[index] + rn_indx = np.random.randint(len(sample['audio_filepath'])) + duration = sample['audio_duration'][rn_indx] if 'audio_duration' in sample else 0 + offset = sample['offset'] if 'offset' in sample else 0 + features = self.featurizer.process(sample['audio_filepath'][rn_indx], + offset=offset, duration=duration, + trim=self.trim) + + return features, torch.tensor(features.shape[0]).int(), \ + torch.tensor(sample["transcript"]), torch.tensor( + len(sample["transcript"])).int() + + def __len__(self): + return len(self.manifest) diff --git a/rnn_speech_recognition/pytorch/decoders.py b/rnn_speech_recognition/pytorch/decoders.py new file mode 100644 index 000000000..882dee2e2 --- /dev/null +++ b/rnn_speech_recognition/pytorch/decoders.py @@ -0,0 +1,136 @@ +# Copyright (c) 2019, Myrtle Software Limited. All rights reserved. +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch + +import torch.nn.functional as F +from model_rnnt import label_collate + +class TransducerDecoder: + """Decoder base class. + + Args: + alphabet: An Alphabet object. + blank_symbol: The symbol in `alphabet` to use as the blank during CTC + decoding. + model: Model to use for prediction. 
+ """ + + def __init__(self, blank_index, model): + self._model = model + self._SOS = -1 # start of sequence + self._blank_id = blank_index + + def _pred_step(self, label, hidden, device): + if label == self._SOS: + return self._model.predict(None, hidden, add_sos=False) + if label > self._blank_id: + label -= 1 + label = label_collate([[label]]).to(device) + return self._model.predict(label, hidden, add_sos=False) + + def _joint_step(self, enc, pred, log_normalize=False): + logits = self._model.joint(enc, pred)[:, 0, 0, :] + if not log_normalize: + return logits + + probs = F.log_softmax(logits, dim=len(logits.shape) - 1) + + return probs + + def _get_last_symb(self, labels): + return self._SOS if labels == [] else labels[-1] + + +class RNNTGreedyDecoder(TransducerDecoder): + """A greedy transducer decoder. + + Args: + blank_symbol: See `Decoder`. + model: Model to use for prediction. + max_symbols_per_step: The maximum number of symbols that can be added + to a sequence in a single time step; if set to None then there is + no limit. + cutoff_prob: Skip to next step in search if current highest character + probability is less than this. + """ + def __init__(self, blank_index, model, max_symbols_per_step=30): + super().__init__(blank_index, model) + assert max_symbols_per_step is None or max_symbols_per_step > 0 + self.max_symbols = max_symbols_per_step + + def decode(self, x, out_lens): + """Returns a list of sentences given an input batch. + + Args: + x: A tensor of size (batch, channels, features, seq_len) + TODO was (seq_len, batch, in_features). + out_lens: list of int representing the length of each sequence + output sequence. + + Returns: + list containing batch number of sentences (strings). + """ + with torch.no_grad(): + # Apply optional preprocessing + + logits, out_lens = self._model.encode((x, out_lens)) + + output = [] + for batch_idx in range(logits.size(0)): + inseq = logits[batch_idx, :, :].unsqueeze(1) + logitlen = out_lens[batch_idx] + sentence = self._greedy_decode(inseq, logitlen) + output.append(sentence) + + return output + + def _greedy_decode(self, x, out_len): + training_state = self._model.training + self._model.eval() + + device = x.device + + hidden = None + label = [] + for time_idx in range(out_len): + f = x[time_idx, :, :].unsqueeze(0) + + not_blank = True + symbols_added = 0 + + while not_blank and ( + self.max_symbols is None or + symbols_added < self.max_symbols): + g, hidden_prime = self._pred_step( + self._get_last_symb(label), + hidden, + device + ) + logp = self._joint_step(f, g, log_normalize=False)[0, :] + + # get index k, of max prob + v, k = logp.max(0) + k = k.item() + + if k == self._blank_id: + not_blank = False + else: + label.append(k) + hidden = hidden_prime + symbols_added += 1 + + self._model.train(training_state) + return label diff --git a/rnn_speech_recognition/pytorch/helpers.py b/rnn_speech_recognition/pytorch/helpers.py new file mode 100644 index 000000000..e844b4c75 --- /dev/null +++ b/rnn_speech_recognition/pytorch/helpers.py @@ -0,0 +1,212 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2019, Myrtle Software Limited. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import torch.distributed as dist +from apex.parallel import DistributedDataParallel as DDP +from enum import Enum +from metrics import word_error_rate + + +class Optimization(Enum): + """Various levels of Optimization. + WARNING: This might have effect on model accuracy.""" + nothing = 0 + mxprO0 = 1 + mxprO1 = 2 + mxprO2 = 3 + mxprO3 = 4 + + +AmpOptimizations = {Optimization.mxprO0: "O0", + Optimization.mxprO1: "O1", + Optimization.mxprO2: "O2", + Optimization.mxprO3: "O3"} + +def print_once(msg): + if (not torch.distributed.is_initialized() or (torch.distributed.is_initialized() and torch.distributed.get_rank() == 0)): + print(msg) + +def add_blank_label(labels): + if not isinstance(labels, list): + raise ValueError("labels must be a list of symbols") + labels.append("") + return labels + +def __rnnt_decoder_predictions_tensor(tensor, labels): + """ + Takes output of greedy rnnt decoder and converts to strings. + Args: + tensor: model output tensor + label: A list of labels + Returns: + prediction + """ + hypotheses = [] + labels_map = dict([(i, labels[i]) for i in range(len(labels))]) + # iterate over batch + for ind in range(len(tensor)): + hypothesis = ''.join([labels_map[c] for c in tensor[ind]]) + hypotheses.append(hypothesis) + return hypotheses + + +def monitor_asr_train_progress(tensors: list, labels: list): + """ + Takes output of greedy ctc decoder and performs ctc decoding algorithm to + remove duplicates and special symbol. 
Prints wer and prediction examples to screen + Args: + tensors: A list of 3 tensors (predictions, targets, target_lengths) + labels: A list of labels + + Returns: + word error rate + """ + references = [] + + labels_map = dict([(i, labels[i]) for i in range(len(labels))]) + with torch.no_grad(): + targets_cpu_tensor = tensors[1].long().cpu() + tgt_lenths_cpu_tensor = tensors[2].long().cpu() + + # iterate over batch + for ind in range(targets_cpu_tensor.shape[0]): + tgt_len = tgt_lenths_cpu_tensor[ind].item() + target = targets_cpu_tensor[ind][:tgt_len].numpy().tolist() + reference = ''.join([labels_map[c] for c in target]) + references.append(reference) + hypotheses = __rnnt_decoder_predictions_tensor(tensors[0], labels=labels) + tag = "training_batch_WER" + wer, _, _ = word_error_rate(hypotheses, references) + print_once('{0}: {1}'.format(tag, wer)) + print_once('Prediction: {0}'.format(hypotheses[0])) + print_once('Reference: {0}'.format(references[0])) + return wer + + +def __gather_losses(losses_list: list) -> list: + return [torch.mean(torch.stack(losses_list))] + + +def __gather_predictions(predictions_list: list, labels: list) -> list: + results = [] + for prediction in predictions_list: + results += __rnnt_decoder_predictions_tensor(prediction, labels=labels) + return results + + +def __gather_transcripts(transcript_list: list, transcript_len_list: list, + labels: list) -> list: + results = [] + labels_map = dict([(i, labels[i]) for i in range(len(labels))]) + # iterate over workers + for t, ln in zip(transcript_list, transcript_len_list): + # iterate over batch + t_lc = t.long().cpu() + ln_lc = ln.long().cpu() + for ind in range(t.shape[0]): + tgt_len = ln_lc[ind].item() + target = t_lc[ind][:tgt_len].numpy().tolist() + reference = ''.join([labels_map[c] for c in target]) + results.append(reference) + return results + + +def process_evaluation_batch(tensors: dict, global_vars: dict, labels: list): + """ + Processes results of an iteration and saves it in global_vars + Args: + tensors: dictionary with results of an evaluation iteration, e.g. 
loss, predictions, transcript, and output + global_vars: dictionary where processes results of iteration are saved + labels: A list of labels + """ + for kv, v in tensors.items(): + if kv.startswith('loss'): + global_vars['EvalLoss'] += __gather_losses(v) + elif kv.startswith('predictions'): + global_vars['predictions'] += __gather_predictions(v, labels=labels) + elif kv.startswith('transcript_length'): + transcript_len_list = v + elif kv.startswith('transcript'): + + transcript_list = v + elif kv.startswith('output'): + global_vars['logits'] += v + + global_vars['transcripts'] += __gather_transcripts(transcript_list, + transcript_len_list, + labels=labels) + + +def process_evaluation_epoch(global_vars: dict, tag=None): + """ + Processes results from each worker at the end of evaluation and combine to final result + Args: + global_vars: dictionary containing information of entire evaluation + Return: + wer: final word error rate + loss: final loss + """ + if 'EvalLoss' in global_vars: + eloss = torch.mean(torch.stack(global_vars['EvalLoss'])).item() + else: + eloss = None + hypotheses = global_vars['predictions'] + references = global_vars['transcripts'] + + wer, scores, num_words = word_error_rate(hypotheses=hypotheses, references=references) + multi_gpu = torch.distributed.is_initialized() + if multi_gpu: + if eloss is not None: + eloss /= torch.distributed.get_world_size() + eloss_tensor = torch.tensor(eloss).cuda() + dist.all_reduce(eloss_tensor) + eloss = eloss_tensor.item() + del eloss_tensor + + scores_tensor = torch.tensor(scores).cuda() + dist.all_reduce(scores_tensor) + scores = scores_tensor.item() + del scores_tensor + num_words_tensor = torch.tensor(num_words).cuda() + dist.all_reduce(num_words_tensor) + num_words = num_words_tensor.item() + del num_words_tensor + wer = scores *1.0/num_words + return wer, eloss + + + +def norm(x): + if not isinstance(x, list): + if not isinstance(x, tuple): + return x + return x[0] + + +def print_dict(d): + maxLen = max([len(ii) for ii in d.keys()]) + fmtString = '\t%' + str(maxLen) + 's : %s' + print('Arguments:') + for keyPair in sorted(d.items()): + print(fmtString % keyPair) + + + +def model_multi_gpu(model, multi_gpu=False): + if multi_gpu: + model = DDP(model) + print('DDP(model)') + return model diff --git a/rnn_speech_recognition/pytorch/inference.py b/rnn_speech_recognition/pytorch/inference.py new file mode 100644 index 000000000..1d512a570 --- /dev/null +++ b/rnn_speech_recognition/pytorch/inference.py @@ -0,0 +1,247 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import itertools +from typing import List +from tqdm import tqdm +import math +import toml +from dataset import AudioToTextDataLayer +from helpers import process_evaluation_batch, process_evaluation_epoch, Optimization, add_blank_label, AmpOptimizations, print_dict, model_multi_gpu +from decoders import RNNTGreedyDecoder +from model_rnnt import RNNT +from preprocessing import AudioPreprocessing +from parts.features import audio_from_file +import torch +import apex +from apex import amp +import random +import numpy as np +import pickle +import time + +import torchvision + +def parse_args(): + parser = argparse.ArgumentParser(description='Jasper') + parser.add_argument("--local_rank", default=None, type=int) + parser.add_argument("--batch_size", default=16, type=int, help='data batch size') + parser.add_argument("--steps", default=None, help='if not specified do evaluation on full dataset. otherwise only evaluates the specified number of iterations for each worker', type=int) + parser.add_argument("--model_toml", type=str, help='relative model configuration path given dataset folder') + parser.add_argument("--dataset_dir", type=str, help='absolute path to dataset folder') + parser.add_argument("--val_manifest", type=str, help='relative path to evaluation dataset manifest file') + parser.add_argument("--ckpt", default=None, type=str, required=True, help='path to model checkpoint') + parser.add_argument("--max_duration", default=None, type=float, help='maximum duration of sequences. if None uses attribute from model configuration file') + parser.add_argument("--pad_to", default=None, type=int, help="default is pad to value as specified in model configurations. if -1 pad to maximum duration. If > 0 pad batch to next multiple of value") + parser.add_argument("--fp16", action='store_true', help='use half precision') + parser.add_argument("--cudnn_benchmark", action='store_true', help="enable cudnn benchmark") + parser.add_argument("--save_prediction", type=str, default=None, help="if specified saves predictions in text form at this location") + parser.add_argument("--logits_save_to", default=None, type=str, help="if specified will save logits to path") + parser.add_argument("--seed", default=42, type=int, help='seed') + parser.add_argument("--wav", type=str, help='absolute path to .wav file (16KHz)') + return parser.parse_args() + +def eval( + data_layer, + audio_processor, + encoderdecoder, + greedy_decoder, + labels, + multi_gpu, + args): + """performs inference / evaluation + Args: + data_layer: data layer object that holds data loader + audio_processor: data processing module + encoderdecoder: acoustic model + greedy_decoder: greedy decoder + labels: list of labels as output vocabulary + multi_gpu: true if using multiple gpus + args: script input arguments + """ + logits_save_to=args.logits_save_to + encoderdecoder.eval() + with torch.no_grad(): + _global_var_dict = { + 'predictions': [], + 'transcripts': [], + 'logits' : [], + } + + + + if args.wav: + features, p_length_e = audio_processor(audio_from_file(args.wav)) + torch.cuda.synchronize() + t0 = time.perf_counter() + t_log_probs_e = encoderdecoder(features) + torch.cuda.synchronize() + t1 = time.perf_counter() + t_predictions_e = greedy_decoder(log_probs=t_log_probs_e) + hypotheses = __ctc_decoder_predictions_tensor(t_predictions_e, labels=labels) + print("INFERENCE TIME\t\t: {} ms".format((t1-t0)*1000.0)) + print("TRANSCRIPT\t\t:", hypotheses[0]) + return + + for it, data in enumerate(tqdm(data_layer.data_iterator)): 
+ t_audio_signal_e, t_a_sig_length_e, t_transcript_e, t_transcript_len_e = audio_processor(data) + + t_log_probs_e, (x_len, y_len) = encoderdecoder( + ((t_audio_signal_e, t_transcript_e), (t_a_sig_length_e, t_transcript_len_e)), + ) + t_predictions_e = greedy_decoder.decode(t_audio_signal_e, t_a_sig_length_e) + + values_dict = dict( + predictions=[t_predictions_e], + transcript=[t_transcript_e], + transcript_length=[t_transcript_len_e], + output=[t_log_probs_e] + ) + process_evaluation_batch(values_dict, _global_var_dict, labels=labels) + + if args.steps is not None and it + 1 >= args.steps: + break + wer, _ = process_evaluation_epoch(_global_var_dict) + if (not multi_gpu or (multi_gpu and torch.distributed.get_rank() == 0)): + print("==========>>>>>>Evaluation WER: {0}\n".format(wer)) + if args.save_prediction is not None: + with open(args.save_prediction, 'w') as fp: + fp.write('\n'.join(_global_var_dict['predictions'])) + if logits_save_to is not None: + logits = [] + for batch in _global_var_dict["logits"]: + for i in range(batch.shape[0]): + logits.append(batch[i].cpu().numpy()) + with open(logits_save_to, 'wb') as f: + pickle.dump(logits, f, protocol=pickle.HIGHEST_PROTOCOL) + +def main(args): + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + torch.backends.cudnn.benchmark = args.cudnn_benchmark + print("CUDNN BENCHMARK ", args.cudnn_benchmark) + assert(torch.cuda.is_available()) + + if args.local_rank is not None: + torch.cuda.set_device(args.local_rank) + torch.distributed.init_process_group(backend='nccl', init_method='env://') + multi_gpu = args.local_rank is not None + if multi_gpu: + print("DISTRIBUTED with ", torch.distributed.get_world_size()) + + if args.fp16: + optim_level = Optimization.mxprO3 + else: + optim_level = Optimization.mxprO0 + + model_definition = toml.load(args.model_toml) + dataset_vocab = model_definition['labels']['labels'] + ctc_vocab = add_blank_label(dataset_vocab) + + val_manifest = args.val_manifest + featurizer_config = model_definition['input_eval'] + featurizer_config["optimization_level"] = optim_level + + if args.max_duration is not None: + featurizer_config['max_duration'] = args.max_duration + if args.pad_to is not None: + featurizer_config['pad_to'] = args.pad_to if args.pad_to >= 0 else "max" + + print('model_config') + print_dict(model_definition) + print('feature_config') + print_dict(featurizer_config) + data_layer = None + + if args.wav is None: + data_layer = AudioToTextDataLayer( + dataset_dir=args.dataset_dir, + featurizer_config=featurizer_config, + manifest_filepath=val_manifest, + labels=dataset_vocab, + batch_size=args.batch_size, + pad_to_max=featurizer_config['pad_to'] == "max", + shuffle=False, + multi_gpu=multi_gpu) + audio_preprocessor = AudioPreprocessing(**featurizer_config) + + #encoderdecoder = JasperEncoderDecoder(jasper_model_definition=jasper_model_definition, feat_in=1024, num_classes=len(ctc_vocab)) + model = RNNT( + feature_config=featurizer_config, + rnnt=model_definition['rnnt'], + num_classes=len(ctc_vocab) + ) + + if args.ckpt is not None: + print("loading model from ", args.ckpt) + checkpoint = torch.load(args.ckpt, map_location="cpu") + model.load_state_dict(checkpoint['state_dict'], strict=False) + + #greedy_decoder = GreedyCTCDecoder() + + # print("Number of parameters in encoder: {0}".format(model.jasper_encoder.num_weights())) + if args.wav is None: + N = len(data_layer) + step_per_epoch = math.ceil(N / (args.batch_size * (1 if not torch.distributed.is_initialized() else 
torch.distributed.get_world_size()))) + + if args.steps is not None: + print('-----------------') + print('Have {0} examples to eval on.'.format(args.steps * args.batch_size * (1 if not torch.distributed.is_initialized() else torch.distributed.get_world_size()))) + print('Have {0} steps / (gpu * epoch).'.format(args.steps)) + print('-----------------') + else: + print('-----------------') + print('Have {0} examples to eval on.'.format(N)) + print('Have {0} steps / (gpu * epoch).'.format(step_per_epoch)) + print('-----------------') + else: + audio_preprocessor.featurizer.normalize = "per_feature" + + print ("audio_preprocessor.normalize: ", audio_preprocessor.featurizer.normalize) + audio_preprocessor.cuda() + audio_preprocessor.eval() + + eval_transforms = torchvision.transforms.Compose([ + lambda xs: [x.cuda() for x in xs], + lambda xs: [*audio_preprocessor(xs[0:2]), *xs[2:]], + lambda xs: [xs[0].permute(2, 0, 1), *xs[1:]], + ]) + + model.cuda() + if args.fp16: + model = amp.initialize( + models=model, + opt_level=AmpOptimizations[optim_level]) + + model = model_multi_gpu(model, multi_gpu) + + greedy_decoder = RNNTGreedyDecoder(len(ctc_vocab) - 1, model.module if multi_gpu else model) + + eval( + data_layer=data_layer, + audio_processor=eval_transforms, + encoderdecoder=model, + greedy_decoder=greedy_decoder, + labels=ctc_vocab, + args=args, + multi_gpu=multi_gpu) + +if __name__=="__main__": + args = parse_args() + + print_dict(vars(args)) + + main(args) diff --git a/rnn_speech_recognition/pytorch/inference_benchmark.py b/rnn_speech_recognition/pytorch/inference_benchmark.py new file mode 100644 index 000000000..fcc927ecb --- /dev/null +++ b/rnn_speech_recognition/pytorch/inference_benchmark.py @@ -0,0 +1,246 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import itertools +import os +import sys +import time +import random +import numpy as np +from heapq import nlargest +import math +from tqdm import tqdm +import toml +import torch +from apex import amp +from dataset import AudioToTextDataLayer +from helpers import process_evaluation_batch, process_evaluation_epoch, Optimization, add_ctc_labels, AmpOptimizations, print_dict +from model import AudioPreprocessing, GreedyCTCDecoder, JasperEncoderDecoder + +def parse_args(): + parser = argparse.ArgumentParser(description='Jasper') + parser.add_argument("--steps", default=None, help='if not specified do evaluation on full dataset. otherwise only evaluates the specified number of iterations for each worker', type=int) + parser.add_argument("--batch_size", default=16, type=int, help='data batch size') + parser.add_argument("--max_duration", default=None, type=float, help='maximum duration of sequences. if None uses attribute from model configuration file') + parser.add_argument("--pad_to", default=None, type=int, help="default is pad to value as specified in model configurations. if -1 pad to maximum duration. 
If > 0 pad batch to next multiple of value") + parser.add_argument("--model_toml", type=str, help='relative model configuration path given dataset folder') + parser.add_argument("--dataset_dir", type=str, help='absolute path to dataset folder') + parser.add_argument("--val_manifest", type=str, help='relative path to evaluation dataset manifest file') + parser.add_argument("--cudnn_benchmark", action='store_true', help="enable cudnn benchmark") + parser.add_argument("--ckpt", default=None, type=str, required=True, help='path to model checkpoint') + parser.add_argument("--fp16", action='store_true', help='use half precision') + parser.add_argument("--seed", default=42, type=int, help='seed') + return parser.parse_args() + +def eval( + data_layer, + audio_processor, + encoderdecoder, + greedy_decoder, + labels, + args): + """performs evaluation and prints performance statistics + Args: + data_layer: data layer object that holds data loader + audio_processor: data processing module + encoderdecoder: acoustic model + greedy_decoder: greedy decoder + labels: list of labels as output vocabulary + args: script input arguments + """ + batch_size=args.batch_size + steps=args.steps + audio_processor.eval() + encoderdecoder.eval() + with torch.no_grad(): + _global_var_dict = { + 'predictions': [], + 'transcripts': [], + } + + it = 0 + ep = 0 + + if steps is None: + steps = math.ceil(len(data_layer) / batch_size) + durations_dnn = [] + durations_dnn_and_prep = [] + seq_lens = [] + while True: + ep += 1 + for data in tqdm(data_layer.data_iterator): + it += 1 + if it > steps: + break + tensors = [] + dl_device = torch.device("cuda") + for d in data: + tensors.append(d.to(dl_device)) + + + t_audio_signal_e, t_a_sig_length_e, t_transcript_e, t_transcript_len_e = tensors + + inp=(t_audio_signal_e, t_a_sig_length_e) + torch.cuda.synchronize() + t0 = time.perf_counter() + t_processed_signal, p_length_e = audio_processor(x=inp) + torch.cuda.synchronize() + t1 = time.perf_counter() + + if args.use_conv_mask: + t_log_probs_e, t_encoded_len_e = encoderdecoder((t_processed_signal, p_length_e)) + else: + t_log_probs_e = encoderdecoder(t_processed_signal) + torch.cuda.synchronize() + stop_time = time.perf_counter() + + time_prep_and_dnn = stop_time - t0 + time_dnn = stop_time - t1 + t_predictions_e = greedy_decoder(log_probs=t_log_probs_e) + + values_dict = dict( + predictions=[t_predictions_e], + transcript=[t_transcript_e], + transcript_length=[t_transcript_len_e], + ) + process_evaluation_batch(values_dict, _global_var_dict, labels=labels) + durations_dnn.append(time_dnn) + durations_dnn_and_prep.append(time_prep_and_dnn) + seq_lens.append(t_processed_signal.shape[-1]) + + if it >= steps: + + wer, _ = process_evaluation_epoch(_global_var_dict) + print("==========>>>>>>Evaluation of all iterations WER: {0}\n".format(wer)) + break + + ratios = [0.9, 0.95,0.99, 1.] 
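+        # Latency reporting (computed just below): per-iteration timings are converted to
+        # milliseconds and the first 5 iterations are dropped as warm-up, then for each
+        # ratio r in `ratios` roughly the r-th percentile of the remaining latencies is
+        # reported (1.0 corresponds to the worst observed latency). Note that the value
+        # printed under "0.5" is the mean latency, not the median.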
+ latencies_dnn = take_durations_and_output_percentile(durations_dnn, ratios) + latencies_dnn_and_prep = take_durations_and_output_percentile(durations_dnn_and_prep, ratios) + print("\n using batch size {} and {} frames ".format(batch_size, seq_lens[-1])) + print("\n".join(["dnn latency {} : {} ".format(k, v) for k, v in latencies_dnn.items()])) + print("\n".join(["prep + dnn latency {} : {} ".format(k, v) for k, v in latencies_dnn_and_prep.items()])) + +def take_durations_and_output_percentile(durations, ratios): + durations = np.asarray(durations) * 1000 # in ms + latency = durations + + latency = latency[5:] + mean_latency = np.mean(latency) + + latency_worst = nlargest(math.ceil( (1 - min(ratios))* len(latency)), latency) + latency_ranges=get_percentile(ratios, latency_worst, len(latency)) + latency_ranges["0.5"] = mean_latency + return latency_ranges + +def get_percentile(ratios, arr, nsamples): + res = {} + for a in ratios: + idx = max(int(nsamples * (1 - a)), 0) + res[a] = arr[idx] + return res + +def main(args): + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + torch.backends.cudnn.benchmark = args.cudnn_benchmark + assert(args.steps is None or args.steps > 5) + print("CUDNN BENCHMARK ", args.cudnn_benchmark) + assert(torch.cuda.is_available()) + + if args.fp16: + optim_level = Optimization.mxprO3 + else: + optim_level = Optimization.mxprO0 + batch_size = args.batch_size + + jasper_model_definition = toml.load(args.model_toml) + dataset_vocab = jasper_model_definition['labels']['labels'] + ctc_vocab = add_ctc_labels(dataset_vocab) + + val_manifest = args.val_manifest + featurizer_config = jasper_model_definition['input_eval'] + featurizer_config["optimization_level"] = optim_level + args.use_conv_mask = jasper_model_definition['encoder'].get('convmask', True) + if args.max_duration is not None: + featurizer_config['max_duration'] = args.max_duration + if args.pad_to is not None: + featurizer_config['pad_to'] = args.pad_to if args.pad_to >= 0 else "max" + + print('model_config') + print_dict(jasper_model_definition) + print('feature_config') + print_dict(featurizer_config) + + data_layer = AudioToTextDataLayer( + dataset_dir=args.dataset_dir, + featurizer_config=featurizer_config, + manifest_filepath=val_manifest, + labels=dataset_vocab, + batch_size=batch_size, + pad_to_max=featurizer_config['pad_to'] == "max", + shuffle=False, + multi_gpu=False) + + audio_preprocessor = AudioPreprocessing(**featurizer_config) + + encoderdecoder = JasperEncoderDecoder(jasper_model_definition=jasper_model_definition, feat_in=1024, num_classes=len(ctc_vocab)) + + if args.ckpt is not None: + print("loading model from ", args.ckpt) + checkpoint = torch.load(args.ckpt, map_location="cpu") + for k in audio_preprocessor.state_dict().keys(): + checkpoint['state_dict'][k] = checkpoint['state_dict'].pop("audio_preprocessor." 
+ k) + audio_preprocessor.load_state_dict(checkpoint['state_dict'], strict=False) + encoderdecoder.load_state_dict(checkpoint['state_dict'], strict=False) + + greedy_decoder = GreedyCTCDecoder() + + # print("Number of parameters in encoder: {0}".format(model.jasper_encoder.num_weights())) + + N = len(data_layer) + step_per_epoch = math.ceil(N / args.batch_size) + + print('-----------------') + if args.steps is None: + print('Have {0} examples to eval on.'.format(N)) + print('Have {0} steps / (gpu * epoch).'.format(step_per_epoch)) + else: + print('Have {0} examples to eval on.'.format(args.steps * args.batch_size)) + print('Have {0} steps / (gpu * epoch).'.format(args.steps)) + print('-----------------') + + audio_preprocessor.cuda() + encoderdecoder.cuda() + if args.fp16: + encoderdecoder = amp.initialize( + models=encoderdecoder, + opt_level=AmpOptimizations[optim_level]) + + eval( + data_layer=data_layer, + audio_processor=audio_preprocessor, + encoderdecoder=encoderdecoder, + greedy_decoder=greedy_decoder, + labels=ctc_vocab, + args=args) + +if __name__=="__main__": + args = parse_args() + + print_dict(vars(args)) + + main(args) diff --git a/rnn_speech_recognition/pytorch/loss.py b/rnn_speech_recognition/pytorch/loss.py new file mode 100644 index 000000000..fa2bde88c --- /dev/null +++ b/rnn_speech_recognition/pytorch/loss.py @@ -0,0 +1,104 @@ +# Copyright (c) 2019, Myrtle Software Limited. All rights reserved. +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import warnings +from typing import Tuple + +import torch +from warprnnt_pytorch import RNNTLoss as WarpRNNTLoss + + +class RNNTLoss(torch.nn.Module): + """Wrapped :py:class:`warprnnt_pytorch.RNNTLoss`. + Args: + blank: Index of the blank label. + reduction: (string) Specifies the reduction to apply to the output: + none: + No reduction will be applied. + mean: + The output losses will be divided by the target lengths and + then the mean over the batch is taken. + sum: + Sum all losses in a batch. + Attributes: + rnnt_loss: A :py:class:`warprnnt_pytorch.RNNTLoss` instance. + """ + + def __init__(self, blank, reduction="mean"): + super().__init__() + self.rnnt_loss = WarpRNNTLoss(blank=blank) + self.use_cuda = torch.cuda.is_available() + + def forward( + self, + inputs: Tuple[torch.Tensor, torch.Tensor], + targets: Tuple[torch.Tensor, torch.Tensor], + ) -> torch.Tensor: + """Computes RNNT loss. + All inputs are moved to the GPU with :py:meth:`torch.nn.Module.cuda` if + :py:func:`torch.cuda.is_available` was :py:data:`True` on + initialisation. + Args: + inputs: A tuple where the first element is the unnormalized network + :py:class:`torch.Tensor` outputs of size ``[batch, max_seq_len, + max_output_seq_len + 1, vocab_size + 1)``. The second element + is a Tuple of two :py:class:`torch.Tensor`s both of + size ``[batch]`` that contain the lengths of a) the audio features + logits and b) the target sequence logits. 
+ targets: A tuple where the first element is a + :py:class:`torch.Tensor` such that each entry in the target + sequence is a class index. Target indices cannot be the blank + index. It must have size ``[batch, max_seq_len]``. In the former + form each target sequence is padded to the length of the longest + sequence and stacked. + The second element is a :py:class:`torch.Tensor` that gives + the lengths of the targets. Lengths are specified for each + sequence to achieve masking under the assumption that sequences + are padded to equal lengths. + """ + + logits, logit_lens = inputs + y, y_lens = targets + + # cast to required types + if logits.dtype != torch.float: + logits_orig = logits + logits = logits.float() + del logits_orig # save memory *before* computing the loss + + if y.dtype != torch.int32: + y = y.int() + + if logit_lens.dtype != torch.int32: + logit_lens = logit_lens.int() + + if y_lens.dtype != torch.int32: + y_lens = y_lens.int() + + # send to gpu + if self.use_cuda: + logits = logits.cuda() + logit_lens = logit_lens.cuda() + y = y.cuda() + y_lens = y_lens.cuda() + + loss = self.rnnt_loss( + acts=logits, labels=y, act_lens=logit_lens, label_lens=y_lens + ) + + # del new variables that may have been created due to float/int/cuda() + del logits, y, logit_lens, y_lens, inputs, targets + + return loss diff --git a/rnn_speech_recognition/pytorch/metrics.py b/rnn_speech_recognition/pytorch/metrics.py new file mode 100644 index 000000000..fdf287846 --- /dev/null +++ b/rnn_speech_recognition/pytorch/metrics.py @@ -0,0 +1,67 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List + + +def __levenshtein(a: List, b: List) -> int: + """Calculates the Levenshtein distance between a and b. + """ + n, m = len(a), len(b) + if n > m: + # Make sure n <= m, to use O(min(n,m)) space + a, b = b, a + n, m = m, n + + current = list(range(n + 1)) + for i in range(1, m + 1): + previous, current = current, [i] + [0] * n + for j in range(1, n + 1): + add, delete = previous[j] + 1, current[j - 1] + 1 + change = previous[j - 1] + if a[j - 1] != b[i - 1]: + change = change + 1 + current[j] = min(add, delete, change) + + return current[n] + + +def word_error_rate(hypotheses: List[str], references: List[str]) -> float: + """ + Computes Average Word Error rate between two texts represented as + corresponding lists of string. Hypotheses and references must have same length. + + Args: + hypotheses: list of hypotheses + references: list of references + + Returns: + (float) average word error rate + """ + scores = 0 + words = 0 + if len(hypotheses) != len(references): + raise ValueError("In word error rate calculation, hypotheses and reference" + " lists must have the same number of elements. 
But I got:" + "{0} and {1} correspondingly".format(len(hypotheses), len(references))) + for h, r in zip(hypotheses, references): + h_list = h.split() + r_list = r.split() + words += len(r_list) + scores += __levenshtein(h_list, r_list) + if words!=0: + wer = 1.0*scores/words + else: + wer = float('inf') + return wer, scores, words diff --git a/rnn_speech_recognition/pytorch/model.py b/rnn_speech_recognition/pytorch/model.py new file mode 100644 index 000000000..d61d68f22 --- /dev/null +++ b/rnn_speech_recognition/pytorch/model.py @@ -0,0 +1,452 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from apex import amp +import torch +import torch.nn as nn +from parts.features import FeatureFactory +from helpers import Optimization +import random + + +jasper_activations = { + "hardtanh": nn.Hardtanh, + "relu": nn.ReLU, + "selu": nn.SELU, +} + +def init_weights(m, mode='xavier_uniform'): + if type(m) == nn.Conv1d or type(m) == MaskedConv1d: + if mode == 'xavier_uniform': + nn.init.xavier_uniform_(m.weight, gain=1.0) + elif mode == 'xavier_normal': + nn.init.xavier_normal_(m.weight, gain=1.0) + elif mode == 'kaiming_uniform': + nn.init.kaiming_uniform_(m.weight, nonlinearity="relu") + elif mode == 'kaiming_normal': + nn.init.kaiming_normal_(m.weight, nonlinearity="relu") + else: + raise ValueError("Unknown Initialization mode: {0}".format(mode)) + elif type(m) == nn.BatchNorm1d: + if m.track_running_stats: + m.running_mean.zero_() + m.running_var.fill_(1) + m.num_batches_tracked.zero_() + if m.affine: + nn.init.ones_(m.weight) + nn.init.zeros_(m.bias) + +def get_same_padding(kernel_size, stride, dilation): + if stride > 1 and dilation > 1: + raise ValueError("Only stride OR dilation may be greater than 1") + return (kernel_size // 2) * dilation + +class AudioPreprocessing(nn.Module): + """GPU accelerated audio preprocessing + """ + def __init__(self, **kwargs): + nn.Module.__init__(self) # For PyTorch API + self.optim_level = kwargs.get('optimization_level', Optimization.nothing) + self.featurizer = FeatureFactory.from_config(kwargs) + + def forward(self, x): + input_signal, length = x + length.requires_grad_(False) + if self.optim_level not in [Optimization.nothing, Optimization.mxprO0, Optimization.mxprO3]: + with amp.disable_casts(): + processed_signal = self.featurizer(x) + processed_length = self.featurizer.get_seq_len(length) + else: + processed_signal = self.featurizer(x) + processed_length = self.featurizer.get_seq_len(length) + return processed_signal, processed_length + +class SpectrogramAugmentation(nn.Module): + """Spectrogram augmentation + """ + def __init__(self, **kwargs): + nn.Module.__init__(self) + self.spec_cutout_regions = SpecCutoutRegions(kwargs) + self.spec_augment = SpecAugment(kwargs) + + @torch.no_grad() + def forward(self, input_spec): + augmented_spec = self.spec_cutout_regions(input_spec) + augmented_spec = self.spec_augment(augmented_spec) + return augmented_spec + +class SpecAugment(nn.Module): + 
"""Spec augment. refer to https://arxiv.org/abs/1904.08779 + """ + def __init__(self, cfg): + super(SpecAugment, self).__init__() + self.cutout_x_regions = cfg.get('cutout_x_regions', 0) + self.cutout_y_regions = cfg.get('cutout_y_regions', 0) + + self.cutout_x_width = cfg.get('cutout_x_width', 10) + self.cutout_y_width = cfg.get('cutout_y_width', 10) + + @torch.no_grad() + def forward(self, x): + sh = x.shape + + mask = torch.zeros(x.shape).byte() + for idx in range(sh[0]): + for _ in range(self.cutout_x_regions): + cutout_x_left = int(random.uniform(0, sh[1] - self.cutout_x_width)) + + mask[idx, cutout_x_left:cutout_x_left + self.cutout_x_width, :] = 1 + + for _ in range(self.cutout_y_regions): + cutout_y_left = int(random.uniform(0, sh[2] - self.cutout_y_width)) + + mask[idx, :, cutout_y_left:cutout_y_left + self.cutout_y_width] = 1 + + x = x.masked_fill(mask.to(device=x.device), 0) + + return x + +class SpecCutoutRegions(nn.Module): + """Cutout. refer to https://arxiv.org/pdf/1708.04552.pdf + """ + def __init__(self, cfg): + super(SpecCutoutRegions, self).__init__() + + self.cutout_rect_regions = cfg.get('cutout_rect_regions', 0) + self.cutout_rect_time = cfg.get('cutout_rect_time', 5) + self.cutout_rect_freq = cfg.get('cutout_rect_freq', 20) + + @torch.no_grad() + def forward(self, x): + sh = x.shape + + mask = torch.zeros(x.shape).byte() + + for idx in range(sh[0]): + for i in range(self.cutout_rect_regions): + cutout_rect_x = int(random.uniform( + 0, sh[1] - self.cutout_rect_freq)) + cutout_rect_y = int(random.uniform( + 0, sh[2] - self.cutout_rect_time)) + + mask[idx, cutout_rect_x:cutout_rect_x + self.cutout_rect_freq, + cutout_rect_y:cutout_rect_y + self.cutout_rect_time] = 1 + + x = x.masked_fill(mask.to(device=x.device), 0) + + return x + +class JasperEncoder(nn.Module): + + """Jasper encoder + """ + def __init__(self, **kwargs): + cfg = {} + for key, value in kwargs.items(): + cfg[key] = value + + nn.Module.__init__(self) + self._cfg = cfg + + activation = jasper_activations[cfg['encoder']['activation']]() + self.use_conv_mask = cfg['encoder'].get('convmask', False) + feat_in = cfg['input']['features'] * cfg['input'].get('frame_splicing', 1) + init_mode = cfg.get('init_mode', 'xavier_uniform') + + residual_panes = [] + encoder_layers = [] + self.dense_residual = False + for lcfg in cfg['jasper']: + dense_res = [] + if lcfg.get('residual_dense', False): + residual_panes.append(feat_in) + dense_res = residual_panes + self.dense_residual = True + encoder_layers.append( + JasperBlock(feat_in, lcfg['filters'], repeat=lcfg['repeat'], + kernel_size=lcfg['kernel'], stride=lcfg['stride'], + dilation=lcfg['dilation'], dropout=lcfg['dropout'], + residual=lcfg['residual'], activation=activation, + residual_panes=dense_res, use_conv_mask=self.use_conv_mask)) + feat_in = lcfg['filters'] + + self.encoder = nn.Sequential(*encoder_layers) + self.apply(lambda x: init_weights(x, mode=init_mode)) + + def num_weights(self): + return sum(p.numel() for p in self.parameters() if p.requires_grad) + + def forward(self, x): + if self.use_conv_mask: + audio_signal, length = x + return self.encoder(([audio_signal], length)) + else: + return self.encoder([x]) + +class JasperDecoderForCTC(nn.Module): + """Jasper decoder + """ + def __init__(self, **kwargs): + nn.Module.__init__(self) + self._feat_in = kwargs.get("feat_in") + self._num_classes = kwargs.get("num_classes") + init_mode = kwargs.get('init_mode', 'xavier_uniform') + + self.decoder_layers = nn.Sequential( + nn.Conv1d(self._feat_in, 
self._num_classes, kernel_size=1, bias=True),) + self.apply(lambda x: init_weights(x, mode=init_mode)) + + def num_weights(self): + return sum(p.numel() for p in self.parameters() if p.requires_grad) + + def forward(self, encoder_output): + out = self.decoder_layers(encoder_output[-1]).transpose(1, 2) + return nn.functional.log_softmax(out, dim=2) + +class Jasper(nn.Module): + """Contains data preprocessing, spectrogram augmentation, jasper encoder and decoder + """ + def __init__(self, **kwargs): + nn.Module.__init__(self) + if kwargs.get("no_featurizer", False): + self.audio_preprocessor = None + else: + self.audio_preprocessor = AudioPreprocessing(**kwargs.get("feature_config")) + + self.data_spectr_augmentation = SpectrogramAugmentation(**kwargs.get("feature_config")) + self.jasper_encoder = JasperEncoder(**kwargs.get("jasper_model_definition")) + self.jasper_decoder = JasperDecoderForCTC(feat_in=kwargs.get("feat_in"), + num_classes=kwargs.get("num_classes")) + self.acoustic_model = JasperAcousticModel(self.jasper_encoder, self.jasper_decoder) + + def num_weights(self): + return sum(p.numel() for p in self.parameters() if p.requires_grad) + + def forward(self, x): + + # Apply optional preprocessing + if self.audio_preprocessor is not None: + t_processed_signal, p_length_t = self.audio_preprocessor(x) + # Apply optional spectral augmentation + if self.training: + t_processed_signal = self.data_spectr_augmentation(input_spec=t_processed_signal) + + if (self.jasper_encoder.use_conv_mask): + a_inp = (t_processed_signal, p_length_t) + else: + a_inp = t_processed_signal + # Forward Pass through Encoder-Decoder + return self.acoustic_model.forward(a_inp) + + +class JasperAcousticModel(nn.Module): + def __init__(self, enc, dec, transpose_in=False): + nn.Module.__init__(self) + self.jasper_encoder = enc + self.jasper_decoder = dec + self.transpose_in = transpose_in + def forward(self, x): + if self.jasper_encoder.use_conv_mask: + t_encoded_t, t_encoded_len_t = self.jasper_encoder(x) + else: + if self.transpose_in: + x = x.transpose(1, 2) + t_encoded_t = self.jasper_encoder(x) + + out = self.jasper_decoder(encoder_output=t_encoded_t) + if self.jasper_encoder.use_conv_mask: + return out, t_encoded_len_t + else: + return out + +class JasperEncoderDecoder(nn.Module): + """Contains jasper encoder and decoder + """ + def __init__(self, **kwargs): + nn.Module.__init__(self) + self.jasper_encoder = JasperEncoder(**kwargs.get("jasper_model_definition")) + self.jasper_decoder = JasperDecoderForCTC(feat_in=kwargs.get("feat_in"), + num_classes=kwargs.get("num_classes")) + self.acoustic_model = JasperAcousticModel(self.jasper_encoder, + self.jasper_decoder, + kwargs.get("transpose_in", False)) + + def num_weights(self): + return sum(p.numel() for p in self.parameters() if p.requires_grad) + + def forward(self, x): + return self.acoustic_model.forward(x) + +class MaskedConv1d(nn.Conv1d): + """1D convolution with sequence masking + """ + def __init__(self, in_channels, out_channels, kernel_size, stride=1, + padding=0, dilation=1, groups=1, bias=False, use_conv_mask=True): + super(MaskedConv1d, self).__init__(in_channels, out_channels, kernel_size, + stride=stride, + padding=padding, dilation=dilation, + groups=groups, bias=bias) + self.use_conv_mask = use_conv_mask + + def get_seq_len(self, lens): + return ((lens + 2 * self.padding[0] - self.dilation[0] * ( + self.kernel_size[0] - 1) - 1) / self.stride[0] + 1) + + def forward(self, inp): + if self.use_conv_mask: + x, lens = inp + max_len = x.size(2) + idxs = 
torch.arange(max_len).to(lens.dtype).to(lens.device).expand(len(lens), max_len) + mask = idxs >= lens.unsqueeze(1) + x = x.masked_fill(mask.unsqueeze(1).to(device=x.device), 0) + del mask + del idxs + lens = self.get_seq_len(lens) + else: + x = inp + out = super(MaskedConv1d, self).forward(x) + + if self.use_conv_mask: + return out, lens + else: + return out + +class JasperBlock(nn.Module): + """Jasper Block. See https://arxiv.org/pdf/1904.03288.pdf + """ + def __init__(self, inplanes, planes, repeat=3, kernel_size=11, stride=1, + dilation=1, padding='same', dropout=0.2, activation=None, + residual=True, residual_panes=[], use_conv_mask=False): + super(JasperBlock, self).__init__() + + if padding != "same": + raise ValueError("currently only 'same' padding is supported") + + + padding_val = get_same_padding(kernel_size[0], stride[0], dilation[0]) + self.use_conv_mask = use_conv_mask + self.conv = nn.ModuleList() + inplanes_loop = inplanes + for _ in range(repeat - 1): + self.conv.extend( + self._get_conv_bn_layer(inplanes_loop, planes, kernel_size=kernel_size, + stride=stride, dilation=dilation, + padding=padding_val)) + self.conv.extend( + self._get_act_dropout_layer(drop_prob=dropout, activation=activation)) + inplanes_loop = planes + self.conv.extend( + self._get_conv_bn_layer(inplanes_loop, planes, kernel_size=kernel_size, + stride=stride, dilation=dilation, + padding=padding_val)) + + self.res = nn.ModuleList() if residual else None + res_panes = residual_panes.copy() + self.dense_residual = residual + if residual: + if len(residual_panes) == 0: + res_panes = [inplanes] + self.dense_residual = False + for ip in res_panes: + self.res.append(nn.ModuleList( + modules=self._get_conv_bn_layer(ip, planes, kernel_size=1))) + self.out = nn.Sequential( + *self._get_act_dropout_layer(drop_prob=dropout, activation=activation)) + + def _get_conv_bn_layer(self, in_channels, out_channels, kernel_size=11, + stride=1, dilation=1, padding=0, bias=False): + layers = [ + MaskedConv1d(in_channels, out_channels, kernel_size, stride=stride, + dilation=dilation, padding=padding, bias=bias, + use_conv_mask=self.use_conv_mask), + nn.BatchNorm1d(out_channels, eps=1e-3, momentum=0.1) + ] + return layers + + def _get_act_dropout_layer(self, drop_prob=0.2, activation=None): + if activation is None: + activation = nn.Hardtanh(min_val=0.0, max_val=20.0) + layers = [ + activation, + nn.Dropout(p=drop_prob) + ] + return layers + + def num_weights(self): + return sum(p.numel() for p in self.parameters() if p.requires_grad) + + def forward(self, input_): + if self.use_conv_mask: + xs, lens_orig = input_ + else: + xs = input_ + lens_orig = 0 + # compute forward convolutions + out = xs[-1] + lens = lens_orig + for i, l in enumerate(self.conv): + if self.use_conv_mask and isinstance(l, MaskedConv1d): + out, lens = l((out, lens)) + else: + out = l(out) + # compute the residuals + if self.res is not None: + for i, layer in enumerate(self.res): + res_out = xs[i] + for j, res_layer in enumerate(layer): + if j == 0 and self.use_conv_mask: + res_out, _ = res_layer((res_out, lens_orig)) + else: + res_out = res_layer(res_out) + out += res_out + + # compute the output + out = self.out(out) + if self.res is not None and self.dense_residual: + out = xs + [out] + else: + out = [out] + + if self.use_conv_mask: + return out, lens + else: + return out + +class GreedyCTCDecoder(nn.Module): + """ Greedy CTC Decoder + """ + def __init__(self, **kwargs): + nn.Module.__init__(self) # For PyTorch API + + def forward(self, log_probs): + with 
torch.no_grad(): + argmx = log_probs.argmax(dim=-1, keepdim=False).int() + return argmx + +class CTCLossNM: + """ CTC loss + """ + def __init__(self, **kwargs): + self._blank = kwargs['num_classes'] - 1 + self._criterion = nn.CTCLoss(blank=self._blank, reduction='none') + + def __call__(self, log_probs, targets, input_length, target_length): + input_length = input_length.long() + target_length = target_length.long() + targets = targets.long() + loss = self._criterion(log_probs.transpose(1, 0), targets, input_length, + target_length) + # note that this is different from reduction = 'mean' + # because we are not dividing by target lengths + return torch.mean(loss) diff --git a/rnn_speech_recognition/pytorch/model_rnnt.py b/rnn_speech_recognition/pytorch/model_rnnt.py new file mode 100644 index 000000000..242e96424 --- /dev/null +++ b/rnn_speech_recognition/pytorch/model_rnnt.py @@ -0,0 +1,289 @@ +# Copyright (c) 2019, Myrtle Software Limited. All rights reserved. +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import torch +import torch.nn as nn + +from rnn import rnn +from rnn import StackTime + +class BnReLUDropout(torch.nn.Module): + def __init__(self, input_size, dropout): + super(BnReLUDropout, self).__init__() + self.bn = torch.nn.BatchNorm1d(input_size) + self.relu = torch.nn.ReLU() + self.dropout = nn.Dropout(p=dropout) + + def forward(self, x): + x = self.bn(x) + x = self.relu(x) + x = self.dropout(x) + return x + +class RNNT(torch.nn.Module): + """A Recurrent Neural Network Transducer (RNN-T). + + Args: + in_features: Number of input features per step per batch. + vocab_size: Number of output symbols (inc blank). + forget_gate_bias: Total initialized value of the bias used in the + forget gate. Set to None to use PyTorch's default initialisation. + (See: http://proceedings.mlr.press/v37/jozefowicz15.pdf) + batch_norm: Use batch normalization in encoder and prediction network + if true. + encoder_n_hidden: Internal hidden unit size of the encoder. + encoder_rnn_layers: Encoder number of layers. + pred_n_hidden: Internal hidden unit size of the prediction network. + pred_rnn_layers: Prediction network number of layers. + joint_n_hidden: Internal hidden unit size of the joint network. + rnn_type: string. Type of rnn in SUPPORTED_RNNS. 
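+
+    The network has three parts: an encoder (pre_rnn -> StackTime -> post_rnn)
+    that subsamples the acoustic features in time by `encoder_stack_time_factor`,
+    a prediction network (embedding + RNN) over the previously emitted labels,
+    and a joint network that concatenates the two hidden states and projects
+    them to `vocab_size` logits. `forward` returns logits of shape
+    (batch, time, label_len + 1, vocab_size) together with the time-subsampled
+    feature lengths and the label lengths.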
+ """ + def __init__(self, rnnt=None, num_classes=1, **kwargs): + super().__init__() + if kwargs.get("no_featurizer", False): + in_features = kwargs.get("in_features") + else: + feat_config = kwargs.get("feature_config") + in_features = feat_config['features'] * feat_config.get("frame_splicing", 1) + + self._pred_n_hidden = rnnt['pred_n_hidden'] + + self.encoder_n_hidden = rnnt["encoder_n_hidden"] + self.encoder_pre_rnn_layers = rnnt["encoder_pre_rnn_layers"] + self.encoder_post_rnn_layers = rnnt["encoder_post_rnn_layers"] + + self.pred_n_hidden = rnnt["pred_n_hidden"] + self.pred_rnn_layers = rnnt["pred_rnn_layers"] + + self.encoder = self._encoder( + in_features, + rnnt["encoder_n_hidden"], + rnnt["encoder_pre_rnn_layers"], + rnnt["encoder_post_rnn_layers"], + rnnt["forget_gate_bias"], + None if "norm" not in rnnt else rnnt["norm"], + rnnt["rnn_type"], + rnnt["encoder_stack_time_factor"], + rnnt["dropout"], + ) + + self.prediction = self._predict( + num_classes, + rnnt["pred_n_hidden"], + rnnt["pred_rnn_layers"], + rnnt["forget_gate_bias"], + None if "norm" not in "rnnt" else rnnt["norm"], + rnnt["rnn_type"], + rnnt["dropout"], + ) + + self.joint_net = self._joint_net( + num_classes, + rnnt["pred_n_hidden"], + rnnt["encoder_n_hidden"], + rnnt["joint_n_hidden"], + rnnt["dropout"], + ) + + def _encoder(self, in_features, encoder_n_hidden, + encoder_pre_rnn_layers, encoder_post_rnn_layers, + forget_gate_bias, norm, rnn_type, encoder_stack_time_factor, + dropout): + layers = torch.nn.ModuleDict({ + "pre_rnn": rnn( + rnn=rnn_type, + input_size=in_features, + hidden_size=encoder_n_hidden, + num_layers=encoder_pre_rnn_layers, + norm=norm, + forget_gate_bias=forget_gate_bias, + dropout=dropout, + ), + "stack_time": StackTime(factor=encoder_stack_time_factor), + "post_rnn": rnn( + rnn=rnn_type, + input_size=encoder_stack_time_factor*encoder_n_hidden, + hidden_size=encoder_n_hidden, + num_layers=encoder_post_rnn_layers, + norm=norm, + forget_gate_bias=forget_gate_bias, + norm_first_rnn=True, + dropout=dropout, + ), + }) + return layers + + def _predict(self, vocab_size, pred_n_hidden, pred_rnn_layers, + forget_gate_bias, norm, rnn_type, dropout): + layers = torch.nn.ModuleDict({ + "embed": torch.nn.Embedding(vocab_size - 1, pred_n_hidden), + "dec_rnn": rnn( + rnn=rnn_type, + input_size=pred_n_hidden, + hidden_size=pred_n_hidden, + num_layers=pred_rnn_layers, + norm=norm, + forget_gate_bias=forget_gate_bias, + dropout=dropout, + ), + }) + return layers + + def _joint_net(self, vocab_size, pred_n_hidden, enc_n_hidden, + joint_n_hidden, dropout): + layers = [ + torch.nn.Linear(pred_n_hidden + enc_n_hidden, joint_n_hidden), + torch.nn.ReLU(), + ] + ([ torch.nn.Dropout(p=dropout), ] if dropout else [ ]) + [ + torch.nn.Linear(joint_n_hidden, vocab_size) + ] + return torch.nn.Sequential( + *layers + ) + + def forward(self, batch, state=None): + # batch: ((x, y), (x_lens, y_lens)) + + # x: (B, channels, features, seq_len) + (x, y), (x_lens, y_lens) = batch + y = label_collate(y) + + f, x_lens = self.encode((x, x_lens)) + + g, _ = self.predict(y, state) + out = self.joint(f, g) + + return out, (x_lens, y_lens) + + def encode(self, x): + """ + Args: + x: tuple of ``(input, input_lens)``. ``input`` has shape (T, B, I), + ``input_lens`` has shape ``(B,)``. + + Returns: + f: tuple of ``(output, output_lens)``. 
``output`` has shape + (B, T, H), ``output_lens`` + """ + x, x_lens = x + x, _ = self.encoder["pre_rnn"](x, None) + x, x_lens = self.encoder["stack_time"]((x, x_lens)) + x, _ = self.encoder["post_rnn"](x, None) + + return x.transpose(0, 1), x_lens + + def predict(self, y, state=None, add_sos=True): + """ + B - batch size + U - label length + H - Hidden dimension size + L - Number of decoder layers = 2 + + Args: + y: (B, U) + + Returns: + Tuple (g, hid) where: + g: (B, U + 1, H) + hid: (h, c) where h is the final sequence hidden state and c is + the final cell state: + h (tensor), shape (L, B, H) + c (tensor), shape (L, B, H) + """ + if y is not None: + # (B, U) -> (B, U, H) + y = self.prediction["embed"](y) + else: + B = 1 if state is None else state[0].size(1) + y = torch.zeros((B, 1, self.pred_n_hidden)).to( + device=self.joint_net[0].weight.device, + dtype=self.joint_net[0].weight.dtype + ) + + # preprend blank "start of sequence" symbol + if add_sos: + B, U, H = y.shape + start = torch.zeros((B, 1, H)).to(device=y.device, dtype=y.dtype) + y = torch.cat([start, y], dim=1).contiguous() # (B, U + 1, H) + else: + start = None # makes del call later easier + + #if state is None: + # batch = y.size(0) + # state = [ + # (torch.zeros(batch, self.pred_n_hidden, dtype=y.dtype, device=y.device), + # torch.zeros(batch, self.pred_n_hidden, dtype=y.dtype, device=y.device)) + # for _ in range(self.pred_rnn_layers) + # ] + + y = y.transpose(0, 1)#.contiguous() # (U + 1, B, H) + g, hid = self.prediction["dec_rnn"](y, state) + g = g.transpose(0, 1)#.contiguous() # (B, U + 1, H) + del y, start, state + return g, hid + + def joint(self, f, g): + """ + f should be shape (B, T, H) + g should be shape (B, U + 1, H) + + returns: + logits of shape (B, T, U, K + 1) + """ + # Combine the input states and the output states + B, T, H = f.shape + B, U_, H2 = g.shape + + f = f.unsqueeze(dim=2) # (B, T, 1, H) + f = f.expand((B, T, U_, H)) + + g = g.unsqueeze(dim=1) # (B, 1, U + 1, H) + g = g.expand((B, T, U_, H2)) + + inp = torch.cat([f, g], dim=3) # (B, T, U, 2H) + res = self.joint_net(inp) + del f, g, inp + return res + + +def label_collate(labels): + """Collates the label inputs for the rnn-t prediction network. + + If `labels` is already in torch.Tensor form this is a no-op. + + Args: + labels: A torch.Tensor List of label indexes or a torch.Tensor. + + Returns: + A padded torch.Tensor of shape (batch, max_seq_len). + """ + + if isinstance(labels, torch.Tensor): + return labels.type(torch.int64) + if not isinstance(labels, (list, tuple)): + raise ValueError( + f"`labels` should be a list or tensor not {type(labels)}" + ) + + batch_size = len(labels) + max_len = max(len(l) for l in labels) + + cat_labels = np.full((batch_size, max_len), fill_value=0.0, dtype=np.int32) + for e, l in enumerate(labels): + cat_labels[e, :len(l)] = l + labels = torch.LongTensor(cat_labels) + + return labels diff --git a/rnn_speech_recognition/pytorch/multiproc.py b/rnn_speech_recognition/pytorch/multiproc.py new file mode 100644 index 000000000..eecba31a8 --- /dev/null +++ b/rnn_speech_recognition/pytorch/multiproc.py @@ -0,0 +1,190 @@ +# From PyTorch: +# +# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. 
+# Copyright (c) 2016- Facebook, Inc (Adam Paszke) +# Copyright (c) 2014- Facebook, Inc (Soumith Chintala) +# Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert) +# Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu) +# Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu) +# Copyright (c) 2011-2013 NYU (Clement Farabet) +# Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston) +# Copyright (c) 2006 Idiap Research Institute (Samy Bengio) +# Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz) +# +# From Caffe2: +# +# Copyright (c) 2016-present, Facebook Inc. All rights reserved. +# +# All contributions by Facebook: +# Copyright (c) 2016 Facebook Inc. +# +# All contributions by Google: +# Copyright (c) 2015 Google Inc. +# All rights reserved. +# +# All contributions by Yangqing Jia: +# Copyright (c) 2015 Yangqing Jia +# All rights reserved. +# +# All contributions from Caffe: +# Copyright(c) 2013, 2014, 2015, the respective contributors +# All rights reserved. +# +# All other contributions: +# Copyright(c) 2015, 2016 the respective contributors +# All rights reserved. +# +# Caffe2 uses a copyright model similar to Caffe: each contributor holds +# copyright over their contributions to Caffe2. The project versioning records +# all such contribution and copyright details. If a contributor wants to further +# mark their specific copyright on a particular contribution, they should +# indicate their copyright solely in the commit message of the change when it is +# committed. +# +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# 3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America +# and IDIAP Research Institute nor the names of its contributors may be +# used to endorse or promote products derived from this software without +# specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
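+
+# This launcher spawns `--nproc_per_node` copies of the given training script (one per
+# GPU) and hands MASTER_ADDR, MASTER_PORT, WORLD_SIZE, RANK and LOCAL_RANK to each copy
+# through its environment, so the script can call
+# torch.distributed.init_process_group(backend='nccl', init_method='env://').
+# Stdout of every rank other than local rank 0 is redirected to GPU_<local_rank>.log.
+# Illustrative invocation (the arguments after train.py are placeholders, not a
+# prescribed command line):
+#
+#   python multiproc.py --nnodes=1 --node_rank=0 --nproc_per_node=8 \
+#       train.py --model_toml=configs/rnnt.toml --dataset_dir=/datasets/LibriSpeech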
+import sys +import subprocess +import os +import socket +import time +from argparse import ArgumentParser, REMAINDER + +import torch + +def parse_args(): + """ + Helper function parsing the command line options + @retval ArgumentParser + """ + parser = ArgumentParser(description="PyTorch distributed training launch " + "helper utilty that will spawn up " + "multiple distributed processes") + + # Optional arguments for the launch helper + parser.add_argument("--nnodes", type=int, default=1, + help="The number of nodes to use for distributed " + "training") + parser.add_argument("--node_rank", type=int, default=0, + help="The rank of the node for multi-node distributed " + "training") + parser.add_argument("--nproc_per_node", type=int, default=1, + help="The number of processes to launch on each node, " + "for GPU training, this is recommended to be set " + "to the number of GPUs in your system so that " + "each process can be bound to a single GPU.") + parser.add_argument("--master_addr", default="127.0.0.1", type=str, + help="Master node (rank 0)'s address, should be either " + "the IP address or the hostname of node 0, for " + "single node multi-proc training, the " + "--master_addr can simply be 127.0.0.1") + parser.add_argument("--master_port", default=29500, type=int, + help="Master node (rank 0)'s free port that needs to " + "be used for communciation during distributed " + "training") + + # positional + parser.add_argument("training_script", type=str, + help="The full path to the single GPU training " + "program/script to be launched in parallel, " + "followed by all the arguments for the " + "training script") + + # rest from the training program + parser.add_argument('training_script_args', nargs=REMAINDER) + return parser.parse_args() + + +def main(): + args = parse_args() + + # world size in terms of number of processes + dist_world_size = args.nproc_per_node * args.nnodes + + # set PyTorch distributed related environmental variables + current_env = os.environ.copy() + current_env["MASTER_ADDR"] = args.master_addr + current_env["MASTER_PORT"] = str(args.master_port) + current_env["WORLD_SIZE"] = str(dist_world_size) + + processes = [] + + for local_rank in range(0, args.nproc_per_node): + # each process's rank + dist_rank = args.nproc_per_node * args.node_rank + local_rank + current_env["RANK"] = str(dist_rank) + current_env["LOCAL_RANK"] = str(local_rank) + + # spawn the processes + cmd = [sys.executable, + "-u", + args.training_script] + args.training_script_args + + print(cmd) + + stdout = None if local_rank == 0 else open("GPU_"+str(local_rank)+".log", "w") + + process = subprocess.Popen(cmd, env=current_env, stdout=stdout) + processes.append(process) + + try: + up = True + error = False + while up and not error: + up = False + for p in processes: + ret = p.poll() + if ret is None: + up = True + elif ret != 0: + error = True + time.sleep(1) + + if error: + for p in processes: + if p.poll() is None: + p.terminate() + exit(1) + + except KeyboardInterrupt: + for p in processes: + p.terminate() + raise + except SystemExit: + for p in processes: + p.terminate() + raise + except: + for p in processes: + p.terminate() + raise + + +if __name__ == "__main__": + main() diff --git a/rnn_speech_recognition/pytorch/optimizers.py b/rnn_speech_recognition/pytorch/optimizers.py new file mode 100644 index 000000000..da17030dd --- /dev/null +++ b/rnn_speech_recognition/pytorch/optimizers.py @@ -0,0 +1,223 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +from torch.optim import Optimizer +import math + +class AdamW(Optimizer): + """Implements AdamW algorithm. + + It has been proposed in `Adam: A Method for Stochastic Optimization`_. + + Arguments: + params (iterable): iterable of parameters to optimize or dicts defining + parameter groups + lr (float, optional): learning rate (default: 1e-3) + betas (Tuple[float, float], optional): coefficients used for computing + running averages of gradient and its square (default: (0.9, 0.999)) + eps (float, optional): term added to the denominator to improve + numerical stability (default: 1e-8) + weight_decay (float, optional): weight decay (L2 penalty) (default: 0) + amsgrad (boolean, optional): whether to use the AMSGrad variant of this + algorithm from the paper `On the Convergence of Adam and Beyond`_ + + Adam: A Method for Stochastic Optimization: + https://arxiv.org/abs/1412.6980 + On the Convergence of Adam and Beyond: + https://openreview.net/forum?id=ryQu7f-RZ + """ + + def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, + weight_decay=0, amsgrad=False): + if not 0.0 <= lr: + raise ValueError("Invalid learning rate: {}".format(lr)) + if not 0.0 <= eps: + raise ValueError("Invalid epsilon value: {}".format(eps)) + if not 0.0 <= betas[0] < 1.0: + raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) + if not 0.0 <= betas[1] < 1.0: + raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) + defaults = dict(lr=lr, betas=betas, eps=eps, + weight_decay=weight_decay, amsgrad=amsgrad) + super(AdamW, self).__init__(params, defaults) + + def __setstate__(self, state): + super(AdamW, self).__setstate__(state) + for group in self.param_groups: + group.setdefault('amsgrad', False) + + def step(self, closure=None): + """Performs a single optimization step. + + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. + """ + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + for p in group['params']: + if p.grad is None: + continue + grad = p.grad.data + if grad.is_sparse: + raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead') + amsgrad = group['amsgrad'] + + state = self.state[p] + + # State initialization + if len(state) == 0: + state['step'] = 0 + # Exponential moving average of gradient values + state['exp_avg'] = torch.zeros_like(p.data) + # Exponential moving average of squared gradient values + state['exp_avg_sq'] = torch.zeros_like(p.data) + if amsgrad: + # Maintains max of all exp. moving avg. of sq. grad. 
values + state['max_exp_avg_sq'] = torch.zeros_like(p.data) + + exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] + if amsgrad: + max_exp_avg_sq = state['max_exp_avg_sq'] + beta1, beta2 = group['betas'] + + state['step'] += 1 + # Decay the first and second moment running average coefficient + exp_avg.mul_(beta1).add_(1 - beta1, grad) + exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) + if amsgrad: + # Maintains the maximum of all 2nd moment running avg. till now + torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq) + # Use the max. for normalizing running avg. of gradient + denom = max_exp_avg_sq.sqrt().add_(group['eps']) + else: + denom = exp_avg_sq.sqrt().add_(group['eps']) + + bias_correction1 = 1 - beta1 ** state['step'] + bias_correction2 = 1 - beta2 ** state['step'] + step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1 + p.data.add_(-step_size, torch.mul(p.data, group['weight_decay']).addcdiv_(1, exp_avg, denom) ) + + return loss + +class Novograd(Optimizer): + """ + Implements Novograd algorithm. + + Args: + params (iterable): iterable of parameters to optimize or dicts defining + parameter groups + lr (float, optional): learning rate (default: 1e-3) + betas (Tuple[float, float], optional): coefficients used for computing + running averages of gradient and its square (default: (0.95, 0)) + eps (float, optional): term added to the denominator to improve + numerical stability (default: 1e-8) + weight_decay (float, optional): weight decay (L2 penalty) (default: 0) + grad_averaging: gradient averaging + amsgrad (boolean, optional): whether to use the AMSGrad variant of this + algorithm from the paper `On the Convergence of Adam and Beyond`_ + (default: False) + """ + + def __init__(self, params, lr=1e-3, betas=(0.95, 0), eps=1e-8, + weight_decay=0, grad_averaging=False, amsgrad=False): + if not 0.0 <= lr: + raise ValueError("Invalid learning rate: {}".format(lr)) + if not 0.0 <= eps: + raise ValueError("Invalid epsilon value: {}".format(eps)) + if not 0.0 <= betas[0] < 1.0: + raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) + if not 0.0 <= betas[1] < 1.0: + raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) + defaults = dict(lr=lr, betas=betas, eps=eps, + weight_decay=weight_decay, + grad_averaging=grad_averaging, + amsgrad=amsgrad) + + super(Novograd, self).__init__(params, defaults) + + def __setstate__(self, state): + super(Novograd, self).__setstate__(state) + for group in self.param_groups: + group.setdefault('amsgrad', False) + + def step(self, closure=None): + """Performs a single optimization step. + + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. + """ + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + for p in group['params']: + if p.grad is None: + continue + grad = p.grad.data + if grad.is_sparse: + raise RuntimeError('Sparse gradients are not supported.') + amsgrad = group['amsgrad'] + + state = self.state[p] + + # State initialization + if len(state) == 0: + state['step'] = 0 + # Exponential moving average of gradient values + state['exp_avg'] = torch.zeros_like(p.data) + # Exponential moving average of squared gradient values + state['exp_avg_sq'] = torch.zeros([]).to(state['exp_avg'].device) + if amsgrad: + # Maintains max of all exp. moving avg. of sq. grad. 
values + state['max_exp_avg_sq'] = torch.zeros([]).to(state['exp_avg'].device) + + exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] + if amsgrad: + max_exp_avg_sq = state['max_exp_avg_sq'] + beta1, beta2 = group['betas'] + + state['step'] += 1 + + norm = torch.sum(torch.pow(grad, 2)) + + if exp_avg_sq == 0: + exp_avg_sq.copy_(norm) + else: + exp_avg_sq.mul_(beta2).add_(1 - beta2, norm) + + if amsgrad: + # Maintains the maximum of all 2nd moment running avg. till now + torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq) + # Use the max. for normalizing running avg. of gradient + denom = max_exp_avg_sq.sqrt().add_(group['eps']) + else: + denom = exp_avg_sq.sqrt().add_(group['eps']) + + grad.div_(denom) + if group['weight_decay'] != 0: + grad.add_(group['weight_decay'], p.data) + if group['grad_averaging']: + grad.mul_(1 - beta1) + exp_avg.mul_(beta1).add_(grad) + + p.data.add_(-group['lr'], exp_avg) + + return loss diff --git a/rnn_speech_recognition/pytorch/parts/features.py b/rnn_speech_recognition/pytorch/parts/features.py new file mode 100644 index 000000000..2c80a9370 --- /dev/null +++ b/rnn_speech_recognition/pytorch/parts/features.py @@ -0,0 +1,349 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2019, Myrtle Software Limited. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
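+
+# This module implements GPU-side feature extraction for the acoustic model. Both
+# featurizers below follow the same rough pipeline: add dither noise, apply optional
+# pre-emphasis, take an STFT, build a (mel-filterbank or magnitude) spectrogram, take
+# the log, optionally splice neighbouring frames, normalize per feature or per
+# utterance, and finally truncate to the longest valid length in the batch (any
+# non-zero pad_to currently raises NotImplementedError). FeatureFactory selects the
+# featurizer class based on cfg['feat_type'].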
+ +import torch +import torch.nn as nn +import math +import librosa +from .perturb import AudioAugmentor +from .segment import AudioSegment +from apex import amp + + +def audio_from_file(file_path, offset=0, duration=0, trim=False, target_sr=16000): + audio = AudioSegment.from_file(file_path, + target_sr=target_sr, + int_values=False, + offset=offset, duration=duration, trim=trim) + samples=torch.tensor(audio.samples, dtype=torch.float).cuda() + num_samples = torch.tensor(samples.shape[0]).int().cuda() + return (samples.unsqueeze(0), num_samples.unsqueeze(0)) + +class WaveformFeaturizer(object): + def __init__(self, input_cfg, augmentor=None): + self.augmentor = augmentor if augmentor is not None else AudioAugmentor() + self.cfg = input_cfg + + def max_augmentation_length(self, length): + return self.augmentor.max_augmentation_length(length) + + def process(self, file_path, offset=0, duration=0, trim=False): + audio = AudioSegment.from_file(file_path, + target_sr=self.cfg['sample_rate'], + int_values=self.cfg.get('int_values', False), + offset=offset, duration=duration, trim=trim) + return self.process_segment(audio) + + def process_segment(self, audio_segment): + self.augmentor.perturb(audio_segment) + return torch.tensor(audio_segment.samples, dtype=torch.float) + + @classmethod + def from_config(cls, input_config, perturbation_configs=None): + if perturbation_configs is not None: + aa = AudioAugmentor.from_config(perturbation_configs) + else: + aa = None + + return cls(input_config, augmentor=aa) + +constant = 1e-5 +def normalize_batch(x, seq_len, normalize_type): + if normalize_type == "per_feature": + x_mean = torch.zeros((seq_len.shape[0], x.shape[1]), dtype=x.dtype, + device=x.device) + x_std = torch.zeros((seq_len.shape[0], x.shape[1]), dtype=x.dtype, + device=x.device) + for i in range(x.shape[0]): + x_mean[i, :] = x[i, :, :seq_len[i]].mean(dim=1) + x_std[i, :] = x[i, :, :seq_len[i]].std(dim=1) + # make sure x_std is not zero + x_std += constant + return (x - x_mean.unsqueeze(2)) / x_std.unsqueeze(2) + elif normalize_type == "all_features": + x_mean = torch.zeros(seq_len.shape, dtype=x.dtype, device=x.device) + x_std = torch.zeros(seq_len.shape, dtype=x.dtype, device=x.device) + for i in range(x.shape[0]): + x_mean[i] = x[i, :, :seq_len[i].item()].mean() + x_std[i] = x[i, :, :seq_len[i].item()].std() + # make sure x_std is not zero + x_std += constant + return (x - x_mean.view(-1, 1, 1)) / x_std.view(-1, 1, 1) + else: + return x + +def splice_frames(x, frame_splicing): + """ Stacks frames together across feature dim + + input is batch_size, feature_dim, num_frames + output is batch_size, feature_dim*frame_splicing, num_frames + + """ + seq = [x] + for n in range(1, frame_splicing): + tmp = torch.zeros_like(x) + tmp[:, :, :-n] = x[:, :, n:] + seq.append(tmp) + return torch.cat(seq, dim=1)[:, :, ::frame_splicing] + +class SpectrogramFeatures(nn.Module): + def __init__(self, sample_rate=8000, window_size=0.02, window_stride=0.01, + n_fft=None, + window="hamming", normalize="per_feature", log=True, center=True, + dither=constant, pad_to=8, max_duration=16.7, + frame_splicing=1): + super(SpectrogramFeatures, self).__init__() + torch_windows = { + 'hann': torch.hann_window, + 'hamming': torch.hamming_window, + 'blackman': torch.blackman_window, + 'bartlett': torch.bartlett_window, + 'none': None, + } + self.win_length = int(sample_rate * window_size) + self.hop_length = int(sample_rate * window_stride) + self.n_fft = n_fft or 2 ** math.ceil(math.log2(self.win_length)) + + window_fn = 
torch_windows.get(window, None) + window_tensor = window_fn(self.win_length, + periodic=False) if window_fn else None + self.window = window_tensor + + self.normalize = normalize + self.log = log + self.center = center + self.dither = dither + self.pad_to = pad_to + self.frame_splicing = frame_splicing + + max_length = 1 + math.ceil( + (max_duration * sample_rate - self.win_length) / self.hop_length + ) + max_pad = 16 - (max_length % 16) + self.max_length = max_length + max_pad + + def get_seq_len(self, seq_len): + x = torch.ceil(seq_len.to(dtype=torch.float) / self.hop_length).to( + dtype=torch.int) + if self.frame_splicing > 1: + x = torch.ceil(x.float() / self.frame_splicing).to(dtype=torch.int) + return x + + @torch.no_grad() + def forward(self, inp): + x, seq_len = inp + dtype = x.dtype + + seq_len = self.get_seq_len(seq_len) + + # dither + if self.dither > 0: + x += self.dither * torch.randn_like(x) + + # do preemphasis + if hasattr(self,'preemph') and self.preemph is not None: + x = torch.cat((x[:, 0].unsqueeze(1), x[:, 1:] - self.preemph * x[:, :-1]), + dim=1) + + # get spectrogram + x = torch.stft(x, n_fft=self.n_fft, hop_length=self.hop_length, + win_length=self.win_length, center=self.center, + window=self.window.to(torch.float)) + x = torch.sqrt(x.pow(2).sum(-1)) + + # log features if required + if self.log: + x = torch.log(x + 1e-20) + + # frame splicing if required + if self.frame_splicing > 1: + x = splice_frames(x, self.frame_splicing) + + # normalize if required + if self.normalize: + x = normalize_batch(x, seq_len, normalize_type=self.normalize) + + # mask to zero any values beyond seq_len in batch, pad to multiple of `pad_to` (for efficiency) + #max_len = x.size(-1) + #mask = torch.arange(max_len).to(seq_len.dtype).to(seq_len.device).expand(x.size(0), max_len) >= seq_len.unsqueeze(1) + #x = x.masked_fill(mask.unsqueeze(1).to(device=x.device), 0) + #del mask + x = x[:, :, :seq_len.max()] # rnnt loss requires lengths to match + pad_to = self.pad_to + if pad_to != 0: + raise NotImplementedError() + #if pad_to == "max": + # x = nn.functional.pad(x, (0, self.max_length - x.size(-1))) + #elif pad_to > 0: + # pad_amt = x.size(-1) % pad_to + # if pad_amt != 0: + # x = nn.functional.pad(x, (0, pad_to - pad_amt)) + + return x.to(dtype) + + @classmethod + def from_config(cls, cfg, log=False): + return cls(sample_rate=cfg['sample_rate'], window_size=cfg['window_size'], + window_stride=cfg['window_stride'], + n_fft=cfg['n_fft'], window=cfg['window'], + normalize=cfg['normalize'], + max_duration=cfg.get('max_duration', 16.7), + dither=cfg.get('dither', 1e-5), pad_to=cfg.get("pad_to", 0), + frame_splicing=cfg.get("frame_splicing", 1), log=log) + +class FilterbankFeatures(nn.Module): + def __init__(self, sample_rate=8000, window_size=0.02, window_stride=0.01, + window="hamming", normalize="per_feature", n_fft=None, + preemph=0.97, + nfilt=64, lowfreq=0, highfreq=None, log=True, dither=constant, + pad_to=8, + max_duration=16.7, + frame_splicing=1): + super(FilterbankFeatures, self).__init__() +# print("PADDING: {}".format(pad_to)) + + torch_windows = { + 'hann': torch.hann_window, + 'hamming': torch.hamming_window, + 'blackman': torch.blackman_window, + 'bartlett': torch.bartlett_window, + 'none': None, + } + + self.win_length = int(sample_rate * window_size) # frame size + self.hop_length = int(sample_rate * window_stride) + self.n_fft = n_fft or 2 ** math.ceil(math.log2(self.win_length)) + + self.normalize = normalize + self.log = log + self.dither = dither + self.frame_splicing = 
frame_splicing + self.nfilt = nfilt + self.preemph = preemph + self.pad_to = pad_to + highfreq = highfreq or sample_rate / 2 + window_fn = torch_windows.get(window, None) + window_tensor = window_fn(self.win_length, + periodic=False) if window_fn else None + filterbanks = torch.tensor( + librosa.filters.mel(sample_rate, self.n_fft, n_mels=nfilt, fmin=lowfreq, + fmax=highfreq), dtype=torch.float).unsqueeze(0) + # self.fb = filterbanks + # self.window = window_tensor + self.register_buffer("fb", filterbanks) + self.register_buffer("window", window_tensor) + # Calculate maximum sequence length (# frames) + max_length = 1 + math.ceil( + (max_duration * sample_rate - self.win_length) / self.hop_length + ) + max_pad = 16 - (max_length % 16) + self.max_length = max_length + max_pad + + + def get_seq_len(self, seq_len): + x = torch.ceil(seq_len.to(dtype=torch.float) / self.hop_length).to( + dtype=torch.int) + # dtype=torch.long) + if self.frame_splicing > 1: + x = torch.ceil(x.float() / self.frame_splicing).to(dtype=torch.int) + return x + + @torch.no_grad() + def forward(self, inp): + x, seq_len = inp + + dtype = x.dtype + + seq_len = self.get_seq_len(seq_len) + + # dither + if self.dither > 0: + x += self.dither * torch.randn_like(x) + + # do preemphasis + if self.preemph is not None: + x = torch.cat((x[:, 0].unsqueeze(1), x[:, 1:] - self.preemph * x[:, :-1]), + dim=1) + + # do stft + x = torch.stft(x, n_fft=self.n_fft, hop_length=self.hop_length, + win_length=self.win_length, + center=True, window=self.window.to(dtype=torch.float)) + + # get power spectrum + x = x.pow(2).sum(-1) + + # dot with filterbank energies + x = torch.matmul(self.fb.to(x.dtype), x) + + # log features if required + if self.log: + x = torch.log(x + 1e-20) + + # frame splicing if required + if self.frame_splicing > 1: + x = splice_frames(x, self.frame_splicing) + + # normalize if required + if self.normalize: + x = normalize_batch(x, seq_len, normalize_type=self.normalize) + + # mask to zero any values beyond seq_len in batch, pad to multiple of `pad_to` (for efficiency) + #max_len = x.size(-1) + x = x[:, :, :seq_len.max()] # rnnt loss requires lengths to match + #mask = torch.arange(max_len).to(seq_len.dtype).to(x.device).expand(x.size(0), + # max_len) >= seq_len.unsqueeze(1) + + #x = x.masked_fill(mask.unsqueeze(1).to(device=x.device), 0) + pad_to = self.pad_to + if pad_to != 0: + raise NotImplementedError() + #if pad_to == "max": + # x = nn.functional.pad(x, (0, self.max_length - x.size(-1))) + #elif pad_to > 0: + # pad_amt = x.size(-1) % pad_to + # if pad_amt != 0: + # x = nn.functional.pad(x, (0, pad_to - pad_amt)) + + return x.to(dtype) + + @classmethod + def from_config(cls, cfg, log=False): + return cls(sample_rate=cfg['sample_rate'], window_size=cfg['window_size'], + window_stride=cfg['window_stride'], n_fft=cfg['n_fft'], + nfilt=cfg['features'], window=cfg['window'], + normalize=cfg['normalize'], + max_duration=cfg.get('max_duration', 16.7), + dither=cfg['dither'], pad_to=cfg.get("pad_to", 0), + frame_splicing=cfg.get("frame_splicing", 1), log=log) + +class FeatureFactory(object): + featurizers = { + "logfbank": FilterbankFeatures, + "fbank": FilterbankFeatures, + "stft": SpectrogramFeatures, + "logspect": SpectrogramFeatures, + "logstft": SpectrogramFeatures + } + + def __init__(self): + pass + + @classmethod + def from_config(cls, cfg): + feat_type = cfg.get('feat_type', "logspect") + featurizer = cls.featurizers[feat_type] + #return featurizer.from_config(cfg, log="log" in cfg['feat_type']) + return 
featurizer.from_config(cfg, log="log" in feat_type) diff --git a/rnn_speech_recognition/pytorch/parts/manifest.py b/rnn_speech_recognition/pytorch/parts/manifest.py new file mode 100644 index 000000000..08cd7b564 --- /dev/null +++ b/rnn_speech_recognition/pytorch/parts/manifest.py @@ -0,0 +1,170 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import re +import string +import numpy as np +import os + +from .text import _clean_text + + +def normalize_string(s, labels, table, **unused_kwargs): + """ + Normalizes string. For example: + 'call me at 8:00 pm!' -> 'call me at eight zero pm' + + Args: + s: string to normalize + labels: labels used during model training. + + Returns: + Normalized string + """ + + def good_token(token, labels): + s = set(labels) + for t in token: + if not t in s: + return False + return True + + try: + text = _clean_text(s, ["english_cleaners"], table).strip() + return ''.join([t for t in text if good_token(t, labels=labels)]) + except: + print("WARNING: Normalizing {} failed".format(s)) + return None + +class Manifest(object): + def __init__(self, data_dir, manifest_paths, labels, blank_index, max_duration=None, pad_to_max=False, + min_duration=None, sort_by_duration=False, max_utts=0, + normalize=True, speed_perturbation=False, filter_speed=1.0): + self.labels_map = dict([(labels[i], i) for i in range(len(labels))]) + self.blank_index = blank_index + self.max_duration= max_duration + ids = [] + duration = 0.0 + filtered_duration = 0.0 + + # If removing punctuation, make a list of punctuation to remove + table = None + if normalize: + # Punctuation to remove + punctuation = string.punctuation + punctuation = punctuation.replace("+", "") + punctuation = punctuation.replace("&", "") + ### We might also want to consider: + ### @ -> at + ### # -> number, pound, hashtag + ### ~ -> tilde + ### _ -> underscore + ### % -> percent + # If a punctuation symbol is inside our vocab, we do not remove from text + for l in labels: + punctuation = punctuation.replace(l, "") + # Turn all punctuation to whitespace + table = str.maketrans(punctuation, " " * len(punctuation)) + for manifest_path in manifest_paths: + with open(manifest_path, "r", encoding="utf-8") as fh: + a=json.load(fh) + for data in a: + files_and_speeds = data['files'] + + if pad_to_max: + if not speed_perturbation: + min_speed = filter_speed + else: + min_speed = min(x['speed'] for x in files_and_speeds) + max_duration = self.max_duration * min_speed + + data['duration'] = data['original_duration'] + if min_duration is not None and data['duration'] < min_duration: + filtered_duration += data['duration'] + continue + if max_duration is not None and data['duration'] > max_duration: + filtered_duration += data['duration'] + continue + + # Prune and normalize according to transcript + transcript_text = data[ + 'transcript'] if "transcript" in data else self.load_transcript( + data['text_filepath']) + if normalize: + transcript_text = 
normalize_string(transcript_text, labels=labels, + table=table) + if not isinstance(transcript_text, str): + print( + "WARNING: Got transcript: {}. It is not a string. Dropping data point".format( + transcript_text)) + filtered_duration += data['duration'] + continue + data["transcript"] = self.parse_transcript(transcript_text) # convert to vocab indices + + if speed_perturbation: + audio_paths = [x['fname'] for x in files_and_speeds] + data['audio_duration'] = [x['duration'] for x in files_and_speeds] + else: + audio_paths = [x['fname'] for x in files_and_speeds if x['speed'] == filter_speed] + data['audio_duration'] = [x['duration'] for x in files_and_speeds if x['speed'] == filter_speed] + data['audio_filepath'] = [os.path.join(data_dir, x) for x in audio_paths] + data.pop('files') + data.pop('original_duration') + + ids.append(data) + duration += data['duration'] + + if max_utts > 0 and len(ids) >= max_utts: + print( + 'Stopping parsing %s as max_utts=%d' % (manifest_path, max_utts)) + break + + if sort_by_duration: + ids = sorted(ids, key=lambda x: x['duration']) + self._data = ids + self._size = len(ids) + self._duration = duration + self._filtered_duration = filtered_duration + + def load_transcript(self, transcript_path): + with open(transcript_path, 'r', encoding="utf-8") as transcript_file: + transcript = transcript_file.read().replace('\n', '') + return transcript + + def parse_transcript(self, transcript): + chars = [self.labels_map.get(x, self.blank_index) for x in list(transcript)] + transcript = list(filter(lambda x: x != self.blank_index, chars)) + return transcript + + def __getitem__(self, item): + return self._data[item] + + def __len__(self): + return self._size + + def __iter__(self): + return iter(self._data) + + @property + def duration(self): + return self._duration + + @property + def filtered_duration(self): + return self._filtered_duration + + @property + def data(self): + return list(self._data) diff --git a/rnn_speech_recognition/pytorch/parts/perturb.py b/rnn_speech_recognition/pytorch/parts/perturb.py new file mode 100644 index 000000000..b8ff0f50a --- /dev/null +++ b/rnn_speech_recognition/pytorch/parts/perturb.py @@ -0,0 +1,111 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
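The perturbation pipeline defined in this file is driven by a list of specs, each naming a perturbation type, an application probability, and keyword arguments for the corresponding class (see AudioAugmentor.from_config below). A hypothetical config, shown only to illustrate the expected shape; the rates and probabilities here are made up:

perturbation_config = [
    {"aug_type": "speed", "prob": 0.5,
     "cfg": {"min_speed_rate": 0.9, "max_speed_rate": 1.1}},
    {"aug_type": "gain", "prob": 0.3,
     "cfg": {"min_gain_dbfs": -6, "max_gain_dbfs": 6}},
]
# augmentor = AudioAugmentor.from_config(perturbation_config)
# augmentor.perturb(segment)  # applies at most the first perturbation whose prob fires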
+ +import random +import librosa +from .manifest import Manifest +from .segment import AudioSegment + + +class Perturbation(object): + def max_augmentation_length(self, length): + return length + + def perturb(self, data): + raise NotImplementedError + + +class SpeedPerturbation(Perturbation): + def __init__(self, min_speed_rate=0.85, max_speed_rate=1.15, rng=None): + self._min_rate = min_speed_rate + self._max_rate = max_speed_rate + self._rng = random.Random() if rng is None else rng + + def max_augmentation_length(self, length): + return length * self._max_rate + + def perturb(self, data): + speed_rate = self._rng.uniform(self._min_rate, self._max_rate) + if speed_rate <= 0: + raise ValueError("speed_rate should be greater than zero.") + data._samples = librosa.effects.time_stretch(data._samples, speed_rate) + + +class GainPerturbation(Perturbation): + def __init__(self, min_gain_dbfs=-10, max_gain_dbfs=10, rng=None): + self._min_gain_dbfs = min_gain_dbfs + self._max_gain_dbfs = max_gain_dbfs + self._rng = random.Random() if rng is None else rng + + def perturb(self, data): + gain = self._rng.uniform(self._min_gain_dbfs, self._max_gain_dbfs) + data._samples = data._samples * (10. ** (gain / 20.)) + + + +class ShiftPerturbation(Perturbation): + def __init__(self, min_shift_ms=-5.0, max_shift_ms=5.0, rng=None): + self._min_shift_ms = min_shift_ms + self._max_shift_ms = max_shift_ms + self._rng = random.Random() if rng is None else rng + + def perturb(self, data): + shift_ms = self._rng.uniform(self._min_shift_ms, self._max_shift_ms) + if abs(shift_ms) / 1000 > data.duration: + # TODO: do something smarter than just ignore this condition + return + shift_samples = int(shift_ms * data.sample_rate // 1000) + # print("DEBUG: shift:", shift_samples) + if shift_samples < 0: + data._samples[-shift_samples:] = data._samples[:shift_samples] + data._samples[:-shift_samples] = 0 + elif shift_samples > 0: + data._samples[:-shift_samples] = data._samples[shift_samples:] + data._samples[-shift_samples:] = 0 + + +perturbation_types = { + "speed": SpeedPerturbation, + "gain": GainPerturbation, + "shift": ShiftPerturbation, +} + + +class AudioAugmentor(object): + def __init__(self, perturbations=None, rng=None): + self._rng = random.Random() if rng is None else rng + self._pipeline = perturbations if perturbations is not None else [] + + def perturb(self, segment): + for (prob, p) in self._pipeline: + if self._rng.random() < prob: + p.perturb(segment) + return + + def max_augmentation_length(self, length): + newlen = length + for (prob, p) in self._pipeline: + newlen = p.max_augmentation_length(newlen) + return newlen + + @classmethod + def from_config(cls, config): + ptbs = [] + for p in config: + if p['aug_type'] not in perturbation_types: + print(p['aug_type'], "perturbation not known. Skipping.") + continue + perturbation = perturbation_types[p['aug_type']] + ptbs.append((p['prob'], perturbation(**p['cfg']))) + return cls(perturbations=ptbs) diff --git a/rnn_speech_recognition/pytorch/parts/segment.py b/rnn_speech_recognition/pytorch/parts/segment.py new file mode 100644 index 000000000..b06983941 --- /dev/null +++ b/rnn_speech_recognition/pytorch/parts/segment.py @@ -0,0 +1,170 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import librosa +import soundfile as sf + + +class AudioSegment(object): + """Monaural audio segment abstraction. + :param samples: Audio samples [num_samples x num_channels]. + :type samples: ndarray.float32 + :param sample_rate: Audio sample rate. + :type sample_rate: int + :raises TypeError: If the sample data type is not float or int. + """ + + def __init__(self, samples, sample_rate, target_sr=None, trim=False, + trim_db=60): + """Create audio segment from samples. + Samples are convert float32 internally, with int scaled to [-1, 1]. + """ + samples = self._convert_samples_to_float32(samples) + if target_sr is not None and target_sr != sample_rate: + samples = librosa.core.resample(samples, sample_rate, target_sr) + sample_rate = target_sr + if trim: + samples, _ = librosa.effects.trim(samples, trim_db) + self._samples = samples + self._sample_rate = sample_rate + if self._samples.ndim >= 2: + self._samples = np.mean(self._samples, 1) + + def __eq__(self, other): + """Return whether two objects are equal.""" + if type(other) is not type(self): + return False + if self._sample_rate != other._sample_rate: + return False + if self._samples.shape != other._samples.shape: + return False + if np.any(self.samples != other._samples): + return False + return True + + def __ne__(self, other): + """Return whether two objects are unequal.""" + return not self.__eq__(other) + + def __str__(self): + """Return human-readable representation of segment.""" + return ("%s: num_samples=%d, sample_rate=%d, duration=%.2fsec, " + "rms=%.2fdB" % (type(self), self.num_samples, self.sample_rate, + self.duration, self.rms_db)) + + @staticmethod + def _convert_samples_to_float32(samples): + """Convert sample type to float32. + Audio sample type is usually integer or float-point. + Integers will be scaled to [-1, 1] in float32. + """ + float32_samples = samples.astype('float32') + if samples.dtype in np.sctypes['int']: + bits = np.iinfo(samples.dtype).bits + float32_samples *= (1. / 2 ** (bits - 1)) + elif samples.dtype in np.sctypes['float']: + pass + else: + raise TypeError("Unsupported sample type: %s." % samples.dtype) + return float32_samples + + @classmethod + def from_file(cls, filename, target_sr=None, int_values=False, offset=0, + duration=0, trim=False): + """ + Load a file supported by librosa and return as an AudioSegment. 
+ :param filename: path of file to load + :param target_sr: the desired sample rate + :param int_values: if true, load samples as 32-bit integers + :param offset: offset in seconds when loading audio + :param duration: duration in seconds when loading audio + :return: numpy array of samples + """ + with sf.SoundFile(filename, 'r') as f: + dtype = 'int32' if int_values else 'float32' + sample_rate = f.samplerate + if offset > 0: + f.seek(int(offset * sample_rate)) + if duration > 0: + samples = f.read(int(duration * sample_rate), dtype=dtype) + else: + samples = f.read(dtype=dtype) + samples = samples.transpose() + return cls(samples, sample_rate, target_sr=target_sr, trim=trim) + + @property + def samples(self): + return self._samples.copy() + + @property + def sample_rate(self): + return self._sample_rate + + @property + def num_samples(self): + return self._samples.shape[0] + + @property + def duration(self): + return self._samples.shape[0] / float(self._sample_rate) + + @property + def rms_db(self): + mean_square = np.mean(self._samples ** 2) + return 10 * np.log10(mean_square) + + def gain_db(self, gain): + self._samples *= 10. ** (gain / 20.) + + def pad(self, pad_size, symmetric=False): + """Add zero padding to the sample. The pad size is given in number of samples. + If symmetric=True, `pad_size` will be added to both sides. If false, `pad_size` + zeros will be added only to the end. + """ + self._samples = np.pad(self._samples, + (pad_size if symmetric else 0, pad_size), + mode='constant') + + def subsegment(self, start_time=None, end_time=None): + """Cut the AudioSegment between given boundaries. + Note that this is an in-place transformation. + :param start_time: Beginning of subsegment in seconds. + :type start_time: float + :param end_time: End of subsegment in seconds. + :type end_time: float + :raise ValueError: If start_time or end_time is incorrectly set, e.g. out + of bounds in time. + """ + start_time = 0.0 if start_time is None else start_time + end_time = self.duration if end_time is None else end_time + if start_time < 0.0: + start_time = self.duration + start_time + if end_time < 0.0: + end_time = self.duration + end_time + if start_time < 0.0: + raise ValueError("The slice start position (%f s) is out of " + "bounds." % start_time) + if end_time < 0.0: + raise ValueError("The slice end position (%f s) is out of bounds." % + end_time) + if start_time > end_time: + raise ValueError("The slice start position (%f s) is later than " + "the end position (%f s)." 
% (start_time, end_time)) + if end_time > self.duration: + raise ValueError("The slice end position (%f s) is out of bounds " + "(> %f s)" % (end_time, self.duration)) + start_sample = int(round(start_time * self._sample_rate)) + end_sample = int(round(end_time * self._sample_rate)) + self._samples = self._samples[start_sample:end_sample] diff --git a/rnn_speech_recognition/pytorch/parts/text/LICENSE b/rnn_speech_recognition/pytorch/parts/text/LICENSE new file mode 100644 index 000000000..4ad4ed1d5 --- /dev/null +++ b/rnn_speech_recognition/pytorch/parts/text/LICENSE @@ -0,0 +1,19 @@ +Copyright (c) 2017 Keith Ito + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/rnn_speech_recognition/pytorch/parts/text/__init__.py b/rnn_speech_recognition/pytorch/parts/text/__init__.py new file mode 100644 index 000000000..da9e021cd --- /dev/null +++ b/rnn_speech_recognition/pytorch/parts/text/__init__.py @@ -0,0 +1,12 @@ +# Copyright (c) 2017 Keith Ito +""" from https://github.com/keithito/tacotron """ +import re +from . import cleaners + +def _clean_text(text, cleaner_names, *args): + for name in cleaner_names: + cleaner = getattr(cleaners, name) + if not cleaner: + raise Exception('Unknown cleaner: %s' % name) + text = cleaner(text, *args) + return text diff --git a/rnn_speech_recognition/pytorch/parts/text/cleaners.py b/rnn_speech_recognition/pytorch/parts/text/cleaners.py new file mode 100644 index 000000000..a99db1a62 --- /dev/null +++ b/rnn_speech_recognition/pytorch/parts/text/cleaners.py @@ -0,0 +1,107 @@ +# Copyright (c) 2017 Keith Ito +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" from https://github.com/keithito/tacotron +Modified to add puncturation removal +""" + +''' +Cleaners are transformations that run over the input text at both training and eval time. + +Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" +hyperparameter. Some cleaners are English-specific. You'll typically want to use: + 1. 
"english_cleaners" for English text + 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using + the Unidecode library (https://pypi.python.org/pypi/Unidecode) + 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update + the symbols in symbols.py to match your data). + +''' + +import re +from unidecode import unidecode +from .numbers import normalize_numbers + +# Regular expression matching whitespace: +_whitespace_re = re.compile(r'\s+') + +# List of (regular expression, replacement) pairs for abbreviations: +_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [ + ('mrs', 'misess'), + ('mr', 'mister'), + ('dr', 'doctor'), + ('st', 'saint'), + ('co', 'company'), + ('jr', 'junior'), + ('maj', 'major'), + ('gen', 'general'), + ('drs', 'doctors'), + ('rev', 'reverend'), + ('lt', 'lieutenant'), + ('hon', 'honorable'), + ('sgt', 'sergeant'), + ('capt', 'captain'), + ('esq', 'esquire'), + ('ltd', 'limited'), + ('col', 'colonel'), + ('ft', 'fort'), +]] + +def expand_abbreviations(text): + for regex, replacement in _abbreviations: + text = re.sub(regex, replacement, text) + return text + +def expand_numbers(text): + return normalize_numbers(text) + +def lowercase(text): + return text.lower() + +def collapse_whitespace(text): + return re.sub(_whitespace_re, ' ', text) + +def convert_to_ascii(text): + return unidecode(text) + +def remove_punctuation(text, table): + text = text.translate(table) + text = re.sub(r'&', " and ", text) + text = re.sub(r'\+', " plus ", text) + return text + +def basic_cleaners(text): + '''Basic pipeline that lowercases and collapses whitespace without transliteration.''' + text = lowercase(text) + text = collapse_whitespace(text) + return text + +def transliteration_cleaners(text): + '''Pipeline for non-English text that transliterates to ASCII.''' + text = convert_to_ascii(text) + text = lowercase(text) + text = collapse_whitespace(text) + return text + +def english_cleaners(text, table=None): + '''Pipeline for English text, including number and abbreviation expansion.''' + text = convert_to_ascii(text) + text = lowercase(text) + text = expand_numbers(text) + text = expand_abbreviations(text) + if table is not None: + text = remove_punctuation(text, table) + text = collapse_whitespace(text) + return text diff --git a/rnn_speech_recognition/pytorch/parts/text/numbers.py b/rnn_speech_recognition/pytorch/parts/text/numbers.py new file mode 100644 index 000000000..46ce11067 --- /dev/null +++ b/rnn_speech_recognition/pytorch/parts/text/numbers.py @@ -0,0 +1,99 @@ +# Copyright (c) 2017 Keith Ito +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" from https://github.com/keithito/tacotron +Modifed to add support for time and slight tweaks to _expand_number +""" + +import inflect +import re + + +_inflect = inflect.engine() +_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') +_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') +_pounds_re = re.compile(r'£([0-9\,]*[0-9]+)') +_dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)') +_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') +_number_re = re.compile(r'[0-9]+') +_time_re = re.compile(r'([0-9]{1,2}):([0-9]{2})') + + +def _remove_commas(m): + return m.group(1).replace(',', '') + + +def _expand_decimal_point(m): + return m.group(1).replace('.', ' point ') + + +def _expand_dollars(m): + match = m.group(1) + parts = match.split('.') + if len(parts) > 2: + return match + ' dollars' # Unexpected format + dollars = int(parts[0]) if parts[0] else 0 + cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 + if dollars and cents: + dollar_unit = 'dollar' if dollars == 1 else 'dollars' + cent_unit = 'cent' if cents == 1 else 'cents' + return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) + elif dollars: + dollar_unit = 'dollar' if dollars == 1 else 'dollars' + return '%s %s' % (dollars, dollar_unit) + elif cents: + cent_unit = 'cent' if cents == 1 else 'cents' + return '%s %s' % (cents, cent_unit) + else: + return 'zero dollars' + + +def _expand_ordinal(m): + return _inflect.number_to_words(m.group(0)) + + +def _expand_number(m): + if int(m.group(0)[0]) == 0: + return _inflect.number_to_words(m.group(0), andword='', group=1) + num = int(m.group(0)) + if num > 1000 and num < 3000: + if num == 2000: + return 'two thousand' + elif num > 2000 and num < 2010: + return 'two thousand ' + _inflect.number_to_words(num % 100) + elif num % 100 == 0: + return _inflect.number_to_words(num // 100) + ' hundred' + else: + return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ') + # Add check for number phones and other large numbers + elif num > 1000000000 and num % 10000 != 0: + return _inflect.number_to_words(num, andword='', group=1) + else: + return _inflect.number_to_words(num, andword='') + +def _expand_time(m): + mins = int(m.group(2)) + if mins == 0: + return _inflect.number_to_words(m.group(1)) + return " ".join([_inflect.number_to_words(m.group(1)), _inflect.number_to_words(m.group(2))]) + +def normalize_numbers(text): + text = re.sub(_comma_number_re, _remove_commas, text) + text = re.sub(_pounds_re, r'\1 pounds', text) + text = re.sub(_dollars_re, _expand_dollars, text) + text = re.sub(_decimal_number_re, _expand_decimal_point, text) + text = re.sub(_ordinal_re, _expand_ordinal, text) + text = re.sub(_number_re, _expand_number, text) + text = re.sub(_time_re, _expand_time, text) + return text diff --git a/rnn_speech_recognition/pytorch/parts/text/symbols.py b/rnn_speech_recognition/pytorch/parts/text/symbols.py new file mode 100644 index 000000000..24efedf8d --- /dev/null +++ b/rnn_speech_recognition/pytorch/parts/text/symbols.py @@ -0,0 +1,19 @@ +# Copyright (c) 2017 Keith Ito +""" from https://github.com/keithito/tacotron """ + +''' +Defines the set of symbols used in text input to the model. + +The default is a set of ASCII characters that works well for English or text that has been run through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. ''' +from . import cmudict + +_pad = '_' +_punctuation = '!\'(),.:;? 
' +_special = '-' +_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz' + +# Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters): +_arpabet = ['@' + s for s in cmudict.valid_symbols] + +# Export all symbols: +symbols = [_pad] + list(_special) + list(_punctuation) + list(_letters) + _arpabet diff --git a/rnn_speech_recognition/pytorch/preprocessing.py b/rnn_speech_recognition/pytorch/preprocessing.py new file mode 100644 index 000000000..eb2e5b2f3 --- /dev/null +++ b/rnn_speech_recognition/pytorch/preprocessing.py @@ -0,0 +1,123 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import random + +import torch +import torch.nn as nn +from apex import amp + +from helpers import Optimization +from parts.features import FeatureFactory + + +class SpecCutoutRegions(nn.Module): + """Cutout. refer to https://arxiv.org/pdf/1708.04552.pdf + """ + def __init__(self, cfg): + super(SpecCutoutRegions, self).__init__() + + self.cutout_rect_regions = cfg.get('cutout_rect_regions', 0) + self.cutout_rect_time = cfg.get('cutout_rect_time', 5) + self.cutout_rect_freq = cfg.get('cutout_rect_freq', 20) + + @torch.no_grad() + def forward(self, x): + sh = x.shape + + mask = torch.zeros(x.shape).bool() + + for idx in range(sh[0]): + for i in range(self.cutout_rect_regions): + cutout_rect_x = int(random.uniform( + 0, sh[1] - self.cutout_rect_freq)) + cutout_rect_y = int(random.uniform( + 0, sh[2] - self.cutout_rect_time)) + + mask[idx, cutout_rect_x:cutout_rect_x + self.cutout_rect_freq, + cutout_rect_y:cutout_rect_y + self.cutout_rect_time] = 1 + + x = x.masked_fill(mask.to(device=x.device), 0) + + return x + + +class SpecAugment(nn.Module): + """Spec augment. 
refer to https://arxiv.org/abs/1904.08779 + """ + def __init__(self, cfg): + super(SpecAugment, self).__init__() + self.cutout_x_regions = cfg.get('cutout_x_regions', 0) + self.cutout_y_regions = cfg.get('cutout_y_regions', 0) + + self.cutout_x_width = cfg.get('cutout_x_width', 10) + self.cutout_y_width = cfg.get('cutout_y_width', 10) + + @torch.no_grad() + def forward(self, x): + sh = x.shape + + mask = torch.zeros(x.shape).bool() + for idx in range(sh[0]): + for _ in range(self.cutout_x_regions): + cutout_x_left = int(random.uniform(0, sh[1] - self.cutout_x_width)) + + mask[idx, cutout_x_left:cutout_x_left + self.cutout_x_width, :] = 1 + + for _ in range(self.cutout_y_regions): + cutout_y_left = int(random.uniform(0, sh[2] - self.cutout_y_width)) + + mask[idx, :, cutout_y_left:cutout_y_left + self.cutout_y_width] = 1 + + x = x.masked_fill(mask.to(device=x.device), 0) + + return x + + +class SpectrogramAugmentation(nn.Module): + """Spectrogram augmentation + """ + def __init__(self, **kwargs): + nn.Module.__init__(self) + self.spec_cutout_regions = SpecCutoutRegions(kwargs) + self.spec_augment = SpecAugment(kwargs) + + @torch.no_grad() + def forward(self, input_spec): + augmented_spec = self.spec_cutout_regions(input_spec) + augmented_spec = self.spec_augment(augmented_spec) + return augmented_spec + + +class AudioPreprocessing(nn.Module): + """GPU accelerated audio preprocessing + """ + def __init__(self, **kwargs): + nn.Module.__init__(self) # For PyTorch API + self.optim_level = kwargs.get('optimization_level', Optimization.nothing) + self.featurizer = FeatureFactory.from_config(kwargs) + + def forward(self, x): + input_signal, length = x + length.requires_grad_(False) + if self.optim_level not in [Optimization.nothing, Optimization.mxprO0, Optimization.mxprO3]: + with amp.disable_casts(): + processed_signal = self.featurizer(x) + processed_length = self.featurizer.get_seq_len(length) + else: + processed_signal = self.featurizer(x) + processed_length = self.featurizer.get_seq_len(length) + return processed_signal, processed_length + + diff --git a/rnn_speech_recognition/pytorch/requirements.txt b/rnn_speech_recognition/pytorch/requirements.txt new file mode 100755 index 000000000..cc675c8d1 --- /dev/null +++ b/rnn_speech_recognition/pytorch/requirements.txt @@ -0,0 +1,10 @@ +pandas==0.24.2 +tqdm==4.31.1 +ascii-graph==1.5.1 +wrapt==1.10.11 +librosa +toml +soundfile +ipdb +sox +tensorboard==2.0.0 diff --git a/rnn_speech_recognition/pytorch/rnn.py b/rnn_speech_recognition/pytorch/rnn.py new file mode 100644 index 000000000..c6234d61f --- /dev/null +++ b/rnn_speech_recognition/pytorch/rnn.py @@ -0,0 +1,402 @@ +# Copyright (c) 2019, Myrtle Software Limited. All rights reserved. +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
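Several modules in this file seed the LSTM forget gate by writing into the slice hidden_size:2*hidden_size of a fused bias vector. That relies on PyTorch's fused gate ordering (input, forget, cell, output) for torch.nn.LSTM parameters; a minimal sketch of the same initialisation on a plain LSTM, with made-up sizes:

import torch

hidden_size, forget_gate_bias = 4, 1.0
lstm = torch.nn.LSTM(input_size=8, hidden_size=hidden_size, num_layers=1)

# Fused bias layout is [i | f | g | o], each chunk of length hidden_size,
# so the forget-gate bias lives in bias[hidden_size:2*hidden_size].
for name, param in lstm.named_parameters():
    if "bias_ih" in name:
        param.data[hidden_size:2 * hidden_size].fill_(forget_gate_bias)
    elif "bias_hh" in name:
        param.data[hidden_size:2 * hidden_size].fill_(0.0)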
+ +import math + +from typing import List +from typing import Optional +from typing import Tuple + +import torch +from torch.nn import Parameter + + +def rnn(rnn, input_size, hidden_size, num_layers, norm=None, + forget_gate_bias=1.0, dropout=0.0, **kwargs): + """TODO""" + if rnn != "lstm": + raise ValueError(f"Unknown rnn={rnn}") + if norm not in [None, "batch_norm", "layer_norm"]: + raise ValueError(f"unknown norm={norm}") + + if rnn == "lstm": + if norm is None: + return LstmDrop( + input_size=input_size, + hidden_size=hidden_size, + num_layers=num_layers, + dropout=dropout, + forget_gate_bias=forget_gate_bias, + **kwargs + ) + + if norm == "batch_norm": + return BNRNNSum( + input_size=input_size, + hidden_size=hidden_size, + rnn_layers=num_layers, + batch_norm=True, + dropout=dropout, + forget_gate_bias=forget_gate_bias, + **kwargs + ) + + if norm == "layer_norm": + return torch.jit.script(lnlstm( + input_size=input_size, + hidden_size=hidden_size, + num_layers=num_layers, + dropout=dropout, + forget_gate_bias=forget_gate_bias, + **kwargs + )) + + +class OverLastDim(torch.nn.Module): + """Collapses a tensor to 2D, applies a module, and (re-)expands the tensor. + + An n-dimensional tensor of shape (s_1, s_2, ..., s_n) is first collapsed to + a tensor with shape (s_1*s_2*...*s_n-1, s_n). The module is called with + this as input producing (s_1*s_2*...*s_n-1, s_n') --- note that the final + dimension can change. This is expanded to (s_1, s_2, ..., s_n-1, s_n') and + returned. + + Args: + module (torch.nn.Module): Module to apply. Must accept a 2D tensor as + input and produce a 2D tensor as output, optionally changing the + size of the last dimension. + """ + + def __init__(self, module): + super().__init__() + self.module = module + + def forward(self, x): + *dims, input_size = x.size() + + reduced_dims = 1 + for dim in dims: + reduced_dims *= dim + + x = x.view(reduced_dims, -1) + x = self.module(x) + x = x.view(*dims, -1) + return x + + +class LstmDrop(torch.nn.Module): + + def __init__(self, input_size, hidden_size, num_layers, dropout, forget_gate_bias, + **kwargs): + """Returns an LSTM with forget gate bias init to `forget_gate_bias`. + + Args: + input_size: See `torch.nn.LSTM`. + hidden_size: See `torch.nn.LSTM`. + num_layers: See `torch.nn.LSTM`. + dropout: See `torch.nn.LSTM`. + forget_gate_bias: For each layer and each direction, the total value of + to initialise the forget gate bias to. + + Returns: + A `torch.nn.LSTM`. 
+ """ + super(LstmDrop, self).__init__() + + self.lstm = torch.nn.LSTM( + input_size=input_size, + hidden_size=hidden_size, + num_layers=num_layers, + dropout=dropout, + ) + if forget_gate_bias is not None: + for name, v in self.lstm.named_parameters(): + if "bias_ih" in name: + bias = getattr(self.lstm, name) + bias.data[hidden_size:2*hidden_size].fill_(forget_gate_bias) + if "bias_hh" in name: + bias = getattr(self.lstm, name) + bias.data[hidden_size:2*hidden_size].fill_(0) + + self.dropout = torch.nn.Dropout(dropout) if dropout else None + + def forward(self, x, h=None): + + x, h = self.lstm(x, h) + + if self.dropout: + x = self.dropout(x) + + return x, h + + + +class RNNLayer(torch.nn.Module): + """A single RNNLayer with optional batch norm.""" + def __init__(self, input_size, hidden_size, rnn_type=torch.nn.LSTM, + batch_norm=True, forget_gate_bias=1.0): + super().__init__() + + if batch_norm: + self.bn = OverLastDim(torch.nn.BatchNorm1d(input_size)) + + if isinstance(rnn_type, torch.nn.LSTM) and not batch_norm: + # batch_norm will apply bias, no need to add a second to LSTM + self.rnn = lstm(input_size=input_size, + hidden_size=hidden_size, + forget_gate_bias=forget_gate_bias) + else: + self.rnn = rnn_type(input_size=input_size, + hidden_size=hidden_size, + bias=not batch_norm) + + def forward(self, x, hx=None): + if hasattr(self, 'bn'): + x = x.contiguous() + x = self.bn(x) + x, h = self.rnn(x, hx=hx) + return x, h + + def _flatten_parameters(self): + self.rnn.flatten_parameters() + + +class BNRNNSum(torch.nn.Module): + """RNN wrapper with optional batch norm. + + Instantiates an RNN. If it is an LSTM it initialises the forget gate + bias =`lstm_gate_bias`. Optionally applies a batch normalisation layer to + the input with the statistics computed over all time steps. If dropout > 0 + then it is applied to all layer outputs except the last. + """ + def __init__(self, input_size, hidden_size, rnn_type=torch.nn.LSTM, + rnn_layers=1, batch_norm=True, dropout=0.0, + forget_gate_bias=1.0, norm_first_rnn=False, **kwargs): + super().__init__() + self.rnn_layers = rnn_layers + + self.layers = torch.nn.ModuleList() + for i in range(rnn_layers): + final_layer = (rnn_layers - 1) == i + + self.layers.append( + RNNLayer( + input_size, + hidden_size, + rnn_type=rnn_type, + batch_norm=batch_norm and (norm_first_rnn or i > 0), + forget_gate_bias=forget_gate_bias, + ) + ) + + if dropout > 0.0 and not final_layer: + self.layers.append(torch.nn.Dropout(dropout)) + + input_size = hidden_size + + def forward(self, x, hx=None): + hx = self._parse_hidden_state(hx) + + hs = [] + cs = [] + rnn_idx = 0 + for layer in self.layers: + if isinstance(layer, torch.nn.Dropout): + x = layer(x) + else: + x, h_out = layer(x, hx=hx[rnn_idx]) + hs.append(h_out[0]) + cs.append(h_out[1]) + rnn_idx += 1 + del h_out + + h_0 = torch.stack(hs, dim=0) + c_0 = torch.stack(cs, dim=0) + return x, (h_0, c_0) + + def _parse_hidden_state(self, hx): + """ + Dealing w. 
hidden state: + Typically in pytorch: (h_0, c_0) + h_0 = ``[num_layers * num_directions, batch, hidden_size]`` + c_0 = ``[num_layers * num_directions, batch, hidden_size]`` + """ + if hx is None: + return [None] * self.rnn_layers + else: + h_0, c_0 = hx + assert h_0.shape[0] == self.rnn_layers + return [(h_0[i], c_0[i]) for i in range(h_0.shape[0])] + + def _flatten_parameters(self): + for layer in self.layers: + if isinstance(layer, (torch.nn.LSTM, torch.nn.GRU, torch.nn.RNN)): + layer._flatten_parameters() + + +class StackTime(torch.nn.Module): + def __init__(self, factor): + super().__init__() + self.factor = int(factor) + + def forward(self, x): + # T, B, U + x, x_lens = x + seq = [x] + for i in range(1, self.factor): + tmp = torch.zeros_like(x) + tmp[:-i, :, :] = x[i:, :, :] + seq.append(tmp) + x_lens = torch.ceil(x_lens.float() / self.factor).int() + return torch.cat(seq, dim=2)[::self.factor, :, :], x_lens + + +def lnlstm(input_size, hidden_size, num_layers, dropout, forget_gate_bias, + **kwargs): + """Returns a ScriptModule that mimics a PyTorch native LSTM.""" + # The following are not implemented. + assert dropout == 0.0 + + return StackedLSTM( + num_layers, + LSTMLayer, + first_layer_args=[ + LayerNormLSTMCell, + input_size, + hidden_size, + forget_gate_bias, + ], + other_layer_args=[ + LayerNormLSTMCell, + hidden_size, + hidden_size, + forget_gate_bias, + ] + ) + + +class LSTMLayer(torch.nn.Module): + def __init__(self, cell, *cell_args): + super(LSTMLayer, self).__init__() + self.cell = cell(*cell_args) + + def forward( + self, + input: torch.Tensor, + state: Tuple[torch.Tensor, torch.Tensor] + ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + inputs = input.unbind(0) + outputs = [] + for i in range(len(inputs)): + out, state = self.cell(inputs[i], state) + outputs += [out] + return torch.stack(outputs), state + + +class LayerNormLSTMCell(torch.nn.Module): + def __init__(self, input_size, hidden_size, forget_gate_bias): + super().__init__() + self.input_size = input_size + self.hidden_size = hidden_size + self.weight_ih = Parameter(torch.randn(4 * hidden_size, input_size)) + self.weight_hh = Parameter(torch.randn(4 * hidden_size, hidden_size)) + + # layernorms provide learnable biases + self.layernorm_i = torch.nn.LayerNorm(4 * hidden_size) + self.layernorm_h = torch.nn.LayerNorm(4 * hidden_size) + self.layernorm_c = torch.nn.LayerNorm(hidden_size) + + self.reset_parameters() + + self.layernorm_i.bias.data[hidden_size:2*hidden_size].fill_(0.0) + self.layernorm_h.bias.data[hidden_size:2*hidden_size].fill_( + forget_gate_bias + ) + + def reset_parameters(self): + stdv = 1.0 / math.sqrt(self.hidden_size) + for weight in self.parameters(): + torch.nn.init.uniform_(weight, -stdv, stdv) + + def forward( + self, + input: torch.Tensor, + state: Tuple[torch.Tensor, torch.Tensor] + ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + hx, cx = state + igates = self.layernorm_i(torch.mm(input, self.weight_ih.t())) + hgates = self.layernorm_h(torch.mm(hx, self.weight_hh.t())) + gates = igates + hgates + ingate, forgetgate, cellgate, outgate = gates.chunk(4, 1) + + ingate = torch.sigmoid(ingate) + forgetgate = torch.sigmoid(forgetgate) + cellgate = torch.tanh(cellgate) + outgate = torch.sigmoid(outgate) + + cy = self.layernorm_c((forgetgate * cx) + (ingate * cellgate)) + hy = outgate * torch.tanh(cy) + + return hy, (hy, cy) + + +def init_stacked_lstm(num_layers, layer, first_layer_args, other_layer_args): + layers = [layer(*first_layer_args)] + 
[layer(*other_layer_args) + for _ in range(num_layers - 1)] + return torch.nn.ModuleList(layers) + + +class StackedLSTM(torch.nn.Module): + def __init__(self, num_layers, layer, first_layer_args, other_layer_args): + super(StackedLSTM, self).__init__() + self.layers: Final[torch.nn.ModuleList] = init_stacked_lstm( + num_layers, layer, first_layer_args, other_layer_args + ) + + def forward( + self, + input: torch.Tensor, + states: Optional[List[Tuple[torch.Tensor, torch.Tensor]]] + ) -> Tuple[torch.Tensor, List[Tuple[torch.Tensor, torch.Tensor]]]: + if states is None: + states: List[Tuple[torch.Tensor, torch.Tensor]] = [] + batch = input.size(1) + for layer in self.layers: + states.append( + (torch.zeros( + batch, + layer.cell.hidden_size, + dtype=input.dtype, + device=input.device + ), + torch.zeros( + batch, + layer.cell.hidden_size, + dtype=input.dtype, + device=input.device + ) + ) + ) + + output_states: List[Tuple[Tensor, Tensor]] = [] + output = input + # XXX: enumerate https://github.com/pytorch/pytorch/issues/14471 + i = 0 + for rnn_layer in self.layers: + state = states[i] + output, out_state = rnn_layer(output, state) + output_states += [out_state] + i += 1 + return output, output_states diff --git a/rnn_speech_recognition/pytorch/scripts/docker/build.sh b/rnn_speech_recognition/pytorch/scripts/docker/build.sh new file mode 100755 index 000000000..cfdc97c01 --- /dev/null +++ b/rnn_speech_recognition/pytorch/scripts/docker/build.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +docker build . --rm -t jasper \ No newline at end of file diff --git a/rnn_speech_recognition/pytorch/scripts/docker/launch.sh b/rnn_speech_recognition/pytorch/scripts/docker/launch.sh new file mode 100755 index 000000000..5c9c6a3f3 --- /dev/null +++ b/rnn_speech_recognition/pytorch/scripts/docker/launch.sh @@ -0,0 +1,32 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +#!/bin/bash + +DATA_DIR=$1 +CHECKPOINT_DIR=$2 +RESULT_DIR=$3 + +docker run -it --rm \ + --gpus='"device=1"' \ + --shm-size=4g \ + --ulimit memlock=-1 \ + --ulimit stack=67108864 \ + -v "$DATA_DIR":/datasets \ + -v "$CHECKPOINT_DIR":/checkpoints/ \ + -v "$RESULT_DIR":/results/ \ + -v $PWD:/code \ + -v $PWD:/workspace/jasper \ + mlperf-rnnt-ref bash diff --git a/rnn_speech_recognition/pytorch/scripts/download_librispeech.sh b/rnn_speech_recognition/pytorch/scripts/download_librispeech.sh new file mode 100755 index 000000000..ee322fe30 --- /dev/null +++ b/rnn_speech_recognition/pytorch/scripts/download_librispeech.sh @@ -0,0 +1,28 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +#!/usr/bin/env bash + +DATA_SET="LibriSpeech" +DATA_ROOT_DIR="/datasets" +DATA_DIR="${DATA_ROOT_DIR}/${DATA_SET}" +if [ ! -d "$DATA_DIR" ] +then + mkdir $DATA_DIR + chmod go+rx $DATA_DIR + python utils/download_librispeech.py utils/librispeech.csv $DATA_DIR -e ${DATA_ROOT_DIR}/ +else + echo "Directory $DATA_DIR already exists." +fi diff --git a/rnn_speech_recognition/pytorch/scripts/evaluation.sh b/rnn_speech_recognition/pytorch/scripts/evaluation.sh new file mode 100755 index 000000000..fcd472fd9 --- /dev/null +++ b/rnn_speech_recognition/pytorch/scripts/evaluation.sh @@ -0,0 +1,92 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +#!/bin/bash +echo "Container nvidia build = " $NVIDIA_BUILD_ID + +DATA_DIR=${1:-"/datasets/LibriSpeech"} +DATASET=${2:-"dev-clean"} +MODEL_CONFIG=${3:-"configs/jasper10x5dr_sp_offline_specaugment.toml"} +RESULT_DIR=${4:-"/results"} +CHECKPOINT=$5 +CREATE_LOGFILE=${6:-"true"} +CUDNN_BENCHMARK=${7:-"false"} +NUM_GPUS=${8:-1} +PRECISION=${9:-"fp32"} +NUM_STEPS=${10:-"-1"} +SEED=${11:-0} +BATCH_SIZE=${12:-64} + + +if [ "$CREATE_LOGFILE" = "true" ] ; then + export GBS=$(expr $BATCH_SIZE \* $NUM_GPUS) + printf -v TAG "jasper_evaluation_${DATASET}_%s_gbs%d" "$PRECISION" $GBS + DATESTAMP=`date +'%y%m%d%H%M%S'` + LOGFILE="${RESULT_DIR}/${TAG}.${DATESTAMP}.log" + printf "Logs written to %s\n" "$LOGFILE" +fi + + + +PREC="" +if [ "$PRECISION" = "fp16" ] ; then + PREC="--fp16" +elif [ "$PRECISION" = "fp32" ] ; then + PREC="" +else + echo "Unknown argument" + exit -2 +fi + +STEPS="" +if [ "$NUM_STEPS" -gt 0 ] ; then + STEPS=" --steps $NUM_STEPS" +fi + +if [ "$CUDNN_BENCHMARK" = "true" ] ; then + CUDNN_BENCHMARK=" --cudnn_benchmark" +else + CUDNN_BENCHMARK="" +fi + + +CMD=" inference.py " +CMD+=" --batch_size $BATCH_SIZE " +CMD+=" --dataset_dir $DATA_DIR " +CMD+=" --val_manifest $DATA_DIR/librispeech-${DATASET}-wav.json " +CMD+=" --model_toml $MODEL_CONFIG " +CMD+=" --seed $SEED " +CMD+=" --ckpt $CHECKPOINT " +CMD+=" $CUDNN_BENCHMARK" +CMD+=" $PREC " +CMD+=" $STEPS " + + +if [ "$NUM_GPUS" -gt 1 ] ; then + CMD="python3 -m torch.distributed.launch --nproc_per_node=$NUM_GPUS $CMD" +else + CMD="python3 $CMD" +fi + + +set -x +if [ -z "$LOGFILE" ] ; then + $CMD +else + ( + $CMD + ) |& tee "$LOGFILE" +fi +set +x diff --git a/rnn_speech_recognition/pytorch/scripts/inference.sh b/rnn_speech_recognition/pytorch/scripts/inference.sh new file mode 100755 index 000000000..2d4474ce2 --- /dev/null +++ b/rnn_speech_recognition/pytorch/scripts/inference.sh @@ -0,0 +1,104 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. 
All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +#!/bin/bash +echo "Container nvidia build = " $NVIDIA_BUILD_ID + + +DATA_DIR=${1-"/datasets/LibriSpeech"} +DATASET=${2:-"dev-clean"} +MODEL_CONFIG=${3:-"configs/jasper10x5dr_sp_offline_specaugment.toml"} +RESULT_DIR=${4:-"/results"} +CHECKPOINT=$5 +CREATE_LOGFILE=${6:-"true"} +CUDNN_BENCHMARK=${7:-"false"} +PRECISION=${8:-"fp32"} +NUM_STEPS=${9:-"-1"} +SEED=${10:-0} +BATCH_SIZE=${11:-64} +MODELOUTPUT_FILE=${12:-"none"} +PREDICTION_FILE=${13:-"$RESULT_DIR/${DATASET}.predictions"} + +if [ "$CREATE_LOGFILE" = "true" ] ; then + export GBS=$(expr $BATCH_SIZE) + printf -v TAG "jasper_inference_${DATASET}_%s_gbs%d" "$PRECISION" $GBS + DATESTAMP=`date +'%y%m%d%H%M%S'` + LOGFILE="${RESULT_DIR}/${TAG}.${DATESTAMP}.log" + printf "Logs written to %s\n" "$LOGFILE" +fi + + + +PREC="" +if [ "$PRECISION" = "fp16" ] ; then + PREC="--fp16" +elif [ "$PRECISION" = "fp32" ] ; then + PREC="" +else + echo "Unknown argument" + exit -2 +fi + +PRED="" +if [ "$PREDICTION_FILE" = "none" ] ; then + PRED="" +else + PRED=" --save_prediction $PREDICTION_FILE" +fi + +OUTPUT="" +if [ "$MODELOUTPUT_FILE" = "none" ] ; then + OUTPUT=" " +else + OUTPUT=" --logits_save_to $MODELOUTPUT_FILE" +fi + + +if [ "$CUDNN_BENCHMARK" = "true" ]; then + CUDNN_BENCHMARK=" --cudnn_benchmark" +else + CUDNN_BENCHMARK="" +fi + +STEPS="" +if [ "$NUM_STEPS" -gt 0 ] ; then + STEPS=" --steps $NUM_STEPS" +fi + +CMD=" python inference.py " +CMD+=" --batch_size $BATCH_SIZE " +CMD+=" --dataset_dir $DATA_DIR " +CMD+=" --val_manifest $DATA_DIR/librispeech-${DATASET}-wav.json " +CMD+=" --model_toml $MODEL_CONFIG " +CMD+=" --seed $SEED " +CMD+=" --ckpt $CHECKPOINT " +CMD+=" $CUDNN_BENCHMARK" +CMD+=" $PRED " +CMD+=" $OUTPUT " +CMD+=" $PREC " +CMD+=" $STEPS " + + +set -x +if [ -z "$LOGFILE" ] ; then + $CMD +else + ( + $CMD + ) |& tee "$LOGFILE" +fi +set +x +echo "MODELOUTPUT_FILE: ${MODELOUTPUT_FILE}" +echo "PREDICTION_FILE: ${PREDICTION_FILE}" diff --git a/rnn_speech_recognition/pytorch/scripts/inference_benchmark.sh b/rnn_speech_recognition/pytorch/scripts/inference_benchmark.sh new file mode 100755 index 000000000..7aeea84c1 --- /dev/null +++ b/rnn_speech_recognition/pytorch/scripts/inference_benchmark.sh @@ -0,0 +1,84 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +#!/bin/bash + +echo "Container nvidia build = " $NVIDIA_BUILD_ID + + +DATA_DIR=${1:-"/datasets/LibriSpeech"} +DATASET=${2:-"dev-clean"} +MODEL_CONFIG=${3:-"configs/jasper10x5dr_sp_offline_specaugment.toml"} +RESULT_DIR=${4:-"/results"} +CHECKPOINT=$5 +CREATE_LOGFILE=${6:-"true"} +CUDNN_BENCHMARK=${7:-"true"} +PRECISION=${8:-"fp32"} +NUM_STEPS=${9:-"-1"} +MAX_DURATION=${10:-"36"} +SEED=${11:-0} +BATCH_SIZE=${12:-64} + +PREC="" +if [ "$PRECISION" = "fp16" ] ; then + PREC="--fp16" +elif [ "$PRECISION" = "fp32" ] ; then + PREC="" +else + echo "Unknown argument" + exit -2 +fi +STEPS="" +if [ "$NUM_STEPS" -gt 0 ] ; then + STEPS=" --steps $NUM_STEPS" +fi +if [ "$CUDNN_BENCHMARK" = "true" ] ; then + CUDNN_BENCHMARK=" --cudnn_benchmark" +else + CUDNN_BENCHMARK="" +fi + +CMD=" python inference_benchmark.py" +CMD+=" --batch_size=$BATCH_SIZE" +CMD+=" --model_toml=$MODEL_CONFIG" +CMD+=" --seed=$SEED" +CMD+=" --dataset_dir=$DATA_DIR" +CMD+=" --val_manifest $DATA_DIR/librispeech-${DATASET}-wav.json " +CMD+=" --ckpt=$CHECKPOINT" +CMD+=" --max_duration=$MAX_DURATION" +CMD+=" --pad_to=-1" +CMD+=" $CUDNN_BENCHMARK" +CMD+=" $PREC" +CMD+=" $STEPS" + + +if [ "$CREATE_LOGFILE" = "true" ] ; then + export GBS=$(expr $BATCH_SIZE ) + printf -v TAG "jasper_inference_benchmark_%s_gbs%d" "$PRECISION" $GBS + DATESTAMP=`date +'%y%m%d%H%M%S'` + LOGFILE="${RESULT_DIR}/${TAG}.${DATESTAMP}.log" + printf "Logs written to %s\n" "$LOGFILE" +fi + +set -x +if [ -z "$LOGFILE" ] ; then + $CMD +else + ( + $CMD + ) |& tee "$LOGFILE" + grep 'latency' "$LOGFILE" +fi +set +x diff --git a/rnn_speech_recognition/pytorch/scripts/preprocess_librispeech.sh b/rnn_speech_recognition/pytorch/scripts/preprocess_librispeech.sh new file mode 100755 index 000000000..7cfe5cc6a --- /dev/null +++ b/rnn_speech_recognition/pytorch/scripts/preprocess_librispeech.sh @@ -0,0 +1,51 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
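The preprocessing script below drives utils/convert_librispeech.py over every LibriSpeech split: the three training sets are converted to WAV with additional 0.9x and 1.1x speed-perturbed copies, the dev and test sets are converted as-is, and each split gets a librispeech-<split>-wav.json manifest in the layout that the training, evaluation and inference scripts expect under DATA_DIR. Assuming /datasets is writable (for example, mounted into the benchmark container), the usual sequence is simply:

  bash scripts/download_librispeech.sh      # fetch and extract the archives listed in utils/librispeech.csv
  bash scripts/preprocess_librispeech.sh    # convert to WAV and write the JSON manifests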
+ +#!/usr/bin/env bash + +python ./utils/convert_librispeech.py \ + --input_dir /datasets/LibriSpeech/train-clean-100 \ + --dest_dir /datasets/LibriSpeech/train-clean-100-wav \ + --output_json /datasets/LibriSpeech/librispeech-train-clean-100-wav.json \ + --speed 0.9 1.1 +python ./utils/convert_librispeech.py \ + --input_dir /datasets/LibriSpeech/train-clean-360 \ + --dest_dir /datasets/LibriSpeech/train-clean-360-wav \ + --output_json /datasets/LibriSpeech/librispeech-train-clean-360-wav.json \ + --speed 0.9 1.1 +python ./utils/convert_librispeech.py \ + --input_dir /datasets/LibriSpeech/train-other-500 \ + --dest_dir /datasets/LibriSpeech/train-other-500-wav \ + --output_json /datasets/LibriSpeech/librispeech-train-other-500-wav.json \ + --speed 0.9 1.1 + + +python ./utils/convert_librispeech.py \ + --input_dir /datasets/LibriSpeech/dev-clean \ + --dest_dir /datasets/LibriSpeech/dev-clean-wav \ + --output_json /datasets/LibriSpeech/librispeech-dev-clean-wav.json +python ./utils/convert_librispeech.py \ + --input_dir /datasets/LibriSpeech/dev-other \ + --dest_dir /datasets/LibriSpeech/dev-other-wav \ + --output_json /datasets/LibriSpeech/librispeech-dev-other-wav.json + + +python ./utils/convert_librispeech.py \ + --input_dir /datasets/LibriSpeech/test-clean \ + --dest_dir /datasets/LibriSpeech/test-clean-wav \ + --output_json /datasets/LibriSpeech/librispeech-test-clean-wav.json +python ./utils/convert_librispeech.py \ + --input_dir /datasets/LibriSpeech/test-other \ + --dest_dir /datasets/LibriSpeech/test-other-wav \ + --output_json /datasets/LibriSpeech/librispeech-test-other-wav.json diff --git a/rnn_speech_recognition/pytorch/scripts/train.sh b/rnn_speech_recognition/pytorch/scripts/train.sh new file mode 100755 index 000000000..d59ce8ebe --- /dev/null +++ b/rnn_speech_recognition/pytorch/scripts/train.sh @@ -0,0 +1,113 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2019, Myrtle Software Limited. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
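The training script that follows is a thin wrapper around train.py: it maps its positional arguments onto train.py flags, enables cuDNN benchmarking only together with fp16, and switches to the bundled multiproc launcher when more than one GPU is requested. A sketch of a full run on an assumed 8-GPU node, starting from scratch (CHECKPOINT=none) and leaving seed, batch sizes, learning-rate schedule and gradient accumulation at their defaults:

  bash scripts/train.sh /datasets/LibriSpeech configs/rnnt.toml /results \
      none true true 8 fp16 100    # no checkpoint, write log, cudnn benchmark, 8 GPUs, fp16, 100 epochs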
+ + +#!/bin/bash +echo "Container nvidia build = " $NVIDIA_BUILD_ID + +DATA_DIR=${1:-"/datasets/LibriSpeech"} +MODEL_CONFIG=${2:-"configs/rnnt.toml"} +RESULT_DIR=${3:-"/results"} +CHECKPOINT=${4:-"none"} +CREATE_LOGFILE=${5:-"true"} +CUDNN_BENCHMARK=${6:-"true"} +NUM_GPUS=${7:-8} +PRECISION=${8:-"fp16"} +EPOCHS=${9:-100} +SEED=${10:-6} +BATCH_SIZE=${11:-8} +EVAL_BATCH_SIZE=${12:-2} +LEARNING_RATE=${13:-"0.001"} +LEARNING_RATE_WARMUP=${14:-"8000"} +GRADIENT_ACCUMULATION_STEPS=${15:-1} +LAUNCH_OPT=${LAUNCH_OPT:-"none"} + + +PREC="" +if [ "$PRECISION" = "fp16" ] ; then + PREC="--fp16" +elif [ "$PRECISION" = "fp32" ] ; then + PREC="" +else + echo "Unknown argument" + exit -2 +fi + +CUDNN="" +if [ "$CUDNN_BENCHMARK" = "true" ] && [ "$PRECISION" = "fp16" ]; then + CUDNN=" --cudnn" +else + CUDNN="" +fi + + + +if [ "$CHECKPOINT" = "none" ] ; then + CHECKPOINT="" +else + CHECKPOINT=" --ckpt=${CHECKPOINT}" +fi + + +CMD=" train.py" +CMD+=" --batch_size=$BATCH_SIZE" +CMD+=" --eval_batch_size=$EVAL_BATCH_SIZE" +CMD+=" --num_epochs=$EPOCHS" +CMD+=" --output_dir=$RESULT_DIR" +CMD+=" --model_toml=$MODEL_CONFIG" +CMD+=" --lr=$LEARNING_RATE" +CMD+=" --lr_warmup=$LEARNING_RATE_WARMUP" +CMD+=" --seed=$SEED" +CMD+=" --optimizer=adam" +CMD+=" --dataset_dir=$DATA_DIR" +CMD+=" --val_manifest=$DATA_DIR/librispeech-dev-clean-wav.json" +CMD+=" --train_manifest=$DATA_DIR/librispeech-train-clean-100-wav.json,$DATA_DIR/librispeech-train-clean-360-wav.json,$DATA_DIR/librispeech-train-other-500-wav.json" +CMD+=" --weight_decay=1e-3" +CMD+=" --save_freq=100" +CMD+=" --eval_freq=1" +CMD+=" --train_freq=250" +CMD+=" --lr_decay" +CMD+=" --gradient_accumulation_steps=$GRADIENT_ACCUMULATION_STEPS " +CMD+=" $CHECKPOINT" +CMD+=" $PREC" +CMD+=" $CUDNN" + + +if [ "${LAUNCH_OPT}" != "none" ]; then + CMD="python -m $LAUNCH_OPT $CMD" +elif [ "$NUM_GPUS" -gt 1 ] ; then + CMD="python3 -m multiproc --nproc_per_node=$NUM_GPUS $CMD" +else + CMD="python3 $CMD" +fi + + +if [ "$CREATE_LOGFILE" = "true" ] ; then + export GBS=$(expr $BATCH_SIZE \* $NUM_GPUS) + printf -v TAG "rnnt_train_%s_gbs%d" "$PRECISION" $GBS + DATESTAMP=`date +'%y%m%d%H%M%S'` + LOGFILE=$RESULT_DIR/$TAG.$DATESTAMP.log + printf "Logs written to %s\n" "$LOGFILE" +fi + +set -x +if [ -z "$LOGFILE" ] ; then + $CMD +else + ( + $CMD + ) |& tee $LOGFILE +fi +set +x diff --git a/rnn_speech_recognition/pytorch/scripts/train_benchmark.sh b/rnn_speech_recognition/pytorch/scripts/train_benchmark.sh new file mode 100755 index 000000000..7b5a33705 --- /dev/null +++ b/rnn_speech_recognition/pytorch/scripts/train_benchmark.sh @@ -0,0 +1,130 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
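The benchmark variant below runs a short training job (bounded by NUM_STEPS and MAX_DURATION, with --pad_to_max for stable input shapes) and then post-processes its own log: it averages the printed step times into a mean latency, derives throughput as BATCH_SIZE * NUM_GPUS / latency, and pulls out the last reported training/eval WER and loss. As with the other scripts, the RNN-T config should be passed explicitly since the default MODEL_CONFIG still names a Jasper configuration. For example, a 500-step fp16 run on an assumed 8-GPU node:

  bash scripts/train_benchmark.sh /datasets/LibriSpeech configs/rnnt.toml /results \
      true true 8 fp16 500    # write log, cudnn benchmark, 8 GPUs, fp16, stop after 500 steps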
+ +#!/bin/bash + +echo "Container nvidia build = " $NVIDIA_BUILD_ID + +DATA_DIR=${1:-"/datasets/LibriSpeech"} +MODEL_CONFIG=${2:-"configs/jasper10x5dr_sp_offline_specaugment.toml"} +RESULT_DIR=${3:-"/results"} +CREATE_LOGFILE=${4:-"true"} +CUDNN_BENCHMARK=${5:-"true"} +NUM_GPUS=${6:-8} +PRECISION=${7:-"fp16"} +NUM_STEPS=${8:-"-1"} +MAX_DURATION=${9:-16.7} +SEED=${10:-0} +BATCH_SIZE=${11:-64} +LEARNING_RATE=${12:-"0.015"} +GRADIENT_ACCUMULATION_STEPS=${13:-1} +PRINT_FREQUENCY=${14:-1} + + +PREC="" +if [ "$PRECISION" = "fp16" ] ; then + PREC=" --fp16" +elif [ "$PRECISION" = "fp32" ] ; then + PREC="" +else + echo "Unknown argument" + exit -2 +fi + +STEPS="" +if [ "$NUM_STEPS" -ne "-1" ] ; then + STEPS=" --num_steps=$NUM_STEPS" +elif [ "$NUM_STEPS" = "-1" ] ; then + STEPS="" +else + echo "Unknown argument" + exit -2 +fi + +CUDNN="" +if [ "$CUDNN_BENCHMARK" = "true" ] ; then + CUDNN=" --cudnn" +else + CUDNN="" +fi + + +CMD=" train.py" +CMD+=" --batch_size=$BATCH_SIZE" +CMD+=" --num_epochs=400" +CMD+=" --output_dir=$RESULT_DIR" +CMD+=" --model_toml=$MODEL_CONFIG" +CMD+=" --lr=$LEARNING_RATE" +CMD+=" --seed=$SEED" +CMD+=" --optimizer=novograd" +CMD+=" --gradient_accumulation_steps=$GRADIENT_ACCUMULATION_STEPS" +CMD+=" --dataset_dir=$DATA_DIR" +CMD+=" --val_manifest=$DATA_DIR/librispeech-dev-clean-wav.json" +CMD+=" --train_manifest=$DATA_DIR/librispeech-train-clean-100-wav.json,$DATA_DIR/librispeech-train-clean-360-wav.json,$DATA_DIR/librispeech-train-other-500-wav.json" +CMD+=" --weight_decay=1e-3" +CMD+=" --save_freq=100000" +CMD+=" --eval_freq=100000" +CMD+=" --max_duration=$MAX_DURATION" +CMD+=" --pad_to_max" +CMD+=" --train_freq=$PRINT_FREQUENCY" +CMD+=" --lr_decay" +CMD+=" $CUDNN" +CMD+=" $PREC" +CMD+=" $STEPS" + +if [ "$NUM_GPUS" -gt 1 ] ; then + CMD="python3 -m torch.distributed.launch --nproc_per_node=$NUM_GPUS $CMD" +else + CMD="python3 $CMD" +fi + + +if [ "$CREATE_LOGFILE" = "true" ] ; then + export GBS=$(expr $BATCH_SIZE \* $NUM_GPUS) + printf -v TAG "jasper_train_benchmark_%s_gbs%d" "$PRECISION" $GBS + DATESTAMP=`date +'%y%m%d%H%M%S'` + LOGFILE="${RESULT_DIR}/${TAG}.${DATESTAMP}.log" + printf "Logs written to %s\n" "$LOGFILE" + +fi + +if [ -z "$LOGFILE" ] ; then + + set -x + $CMD + set +x +else + + set -x + ( + $CMD + ) |& tee "$LOGFILE" + + set +x + + mean_latency=`cat "$LOGFILE" | grep 'Step time' | awk '{print $3}' | tail -n +2 | egrep -o '[0-9.]+'| awk 'BEGIN {total=0} {total+=$1} END {printf("%.2f\n",total/NR)}'` + mean_throughput=`python -c "print($BATCH_SIZE*$NUM_GPUS/${mean_latency})"` + training_wer_per_pgu=`cat "$LOGFILE" | grep 'training_batch_WER'| awk '{print $2}' | tail -n 1 | egrep -o '[0-9.]+'` + training_loss_per_pgu=`cat "$LOGFILE" | grep 'Loss@Step'| awk '{print $4}' | tail -n 1 | egrep -o '[0-9.]+'` + final_eval_wer=`cat "$LOGFILE" | grep 'Evaluation WER'| tail -n 1 | egrep -o '[0-9.]+'` + final_eval_loss=`cat "$LOGFILE" | grep 'Evaluation Loss'| tail -n 1 | egrep -o '[0-9.]+'` + + echo "max duration: $MAX_DURATION s" | tee -a "$LOGFILE" + echo "mean_latency: $mean_latency s" | tee -a "$LOGFILE" + echo "mean_throughput: $mean_throughput sequences/s" | tee -a "$LOGFILE" + echo "training_wer_per_pgu: $training_wer_per_pgu" | tee -a "$LOGFILE" + echo "training_loss_per_pgu: $training_loss_per_pgu" | tee -a "$LOGFILE" + echo "final_eval_loss: $final_eval_loss" | tee -a "$LOGFILE" + echo "final_eval_wer: $final_eval_wer" | tee -a "$LOGFILE" +fi diff --git a/rnn_speech_recognition/pytorch/tb_logger.py b/rnn_speech_recognition/pytorch/tb_logger.py new file mode 100644 
index 000000000..cbc2f215e --- /dev/null +++ b/rnn_speech_recognition/pytorch/tb_logger.py @@ -0,0 +1,52 @@ +import torch.utils.tensorboard as tb + +class DummyLogger: + def log_scalar(*args, **kwargs): + pass + + def log_params(*args, **kwargs): + pass + + def log_grad(*args, **kwargs): + pass + + def train_end(*args, **kwargs): + pass + + +class TensorBoardLogger(DummyLogger): + def __init__(self, path, model, histogram=False): + self.writer = tb.SummaryWriter(log_dir=str(path)) + self.model = model + self.histogram = histogram + + def log_scalar(self, name, value, step, stage='train'): + self.writer.add_scalar( + f'{stage}/{name}', + value, + global_step=step + ) + + def log_grad(self, step): + if not self.histogram: + return + for name, param in self.model.named_parameters(): + if param.grad is not None: + self.writer.add_histogram( + name.replace('.', '/'), + param.grad, + global_step=step + ) + + def log_params(self, step): + if not self.histogram: + return + for name, param in self.model.named_parameters(): + self.writer.add_histogram( + name.replace('.', '/'), + param, + global_step=step + ) + + def train_end(self): + self.writer.close() diff --git a/rnn_speech_recognition/pytorch/train.py b/rnn_speech_recognition/pytorch/train.py new file mode 100644 index 000000000..3261c4ec9 --- /dev/null +++ b/rnn_speech_recognition/pytorch/train.py @@ -0,0 +1,477 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2019, Myrtle Software Limited. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
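The TensorBoard logger above is only instantiated by train.py (which follows) when --tb_path is given and the process reports local rank 0; histogram logging is additionally gated behind --histogram. A single-GPU sketch that goes through the PyTorch distributed launcher so that --local_rank gets populated (the output and TensorBoard paths are placeholders; a CUDA device and the NCCL backend are assumed):

  python3 -m torch.distributed.launch --nproc_per_node=1 train.py \
      --model_toml=configs/rnnt.toml \
      --dataset_dir=/datasets/LibriSpeech \
      --train_manifest=/datasets/LibriSpeech/librispeech-train-clean-100-wav.json \
      --val_manifest=/datasets/LibriSpeech/librispeech-dev-clean-wav.json \
      --output_dir=/results \
      --tb_path=/results/tensorboard --histogram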
+ +import argparse +import itertools +import os +import time +import toml +import torch +import apex +from apex import amp +import random +import numpy as np +import math +from dataset import AudioToTextDataLayer +from helpers import monitor_asr_train_progress, process_evaluation_batch, process_evaluation_epoch, Optimization, add_blank_label, AmpOptimizations, model_multi_gpu, print_dict, print_once +from model_rnnt import RNNT +from decoders import RNNTGreedyDecoder +from loss import RNNTLoss +from optimizers import Novograd, AdamW + +import torchvision + +from tb_logger import DummyLogger, TensorBoardLogger +import preprocessing + + +def lr_decay(N, step, learning_rate): + """ + learning rate decay + Args: + learning_rate: base learning rate + step: current iteration number + N: total number of iterations over which learning rate is decayed + """ + min_lr = 0.00001 + res = learning_rate * ((N - step) / N) ** 2 + return max(res, min_lr) + +def lr_warmup(warmup_steps, step, learning_rate): + return min(1, (step / warmup_steps)) * learning_rate + +def save(model, optimizer, epoch, output_dir): + """ + Saves model checkpoint + Args: + model: model + optimizer: optimizer + epoch: epoch of model training + output_dir: path to save model checkpoint + """ + class_name = model.__class__.__name__ + unix_time = time.time() + file_name = "{0}_{1}-epoch-{2}.pt".format(class_name, unix_time, epoch) + print_once("Saving module {0} in {1}".format(class_name, os.path.join(output_dir, file_name))) + if (not torch.distributed.is_initialized() or (torch.distributed.is_initialized() and torch.distributed.get_rank() == 0)): + model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self + save_checkpoint={ + 'epoch': epoch, + 'state_dict': model_to_save.state_dict(), + 'optimizer': optimizer.state_dict() + } + + torch.save(save_checkpoint, os.path.join(output_dir, file_name)) + print_once('Saved.') + + +def evaluator(model, data_transforms, loss_fn, greedy_decoder, labels, eval_datasets, logger): + """Evaluates model on evaluation dataset + """ + + def evalutaion(epoch=0): + model.eval() + + for dataset, frequency, name in eval_datasets: + if epoch % frequency != 0: + continue + + print_once(f"Doing {name} ....................... ...... ... .. . 
.") + + with torch.no_grad(): + _global_var_dict = { + 'EvalLoss': [], + 'predictions': [], + 'transcripts': [], + } + dataloader = dataset.data_iterator + for data in dataloader: + + t_audio_signal_e, t_a_sig_length_e, t_transcript_e, t_transcript_len_e = data_transforms(data) + + t_log_probs_e, (x_len, y_len) = model( + ((t_audio_signal_e, t_transcript_e), (t_a_sig_length_e, t_transcript_len_e)), + ) + t_loss_e = loss_fn( + (t_log_probs_e, x_len), (t_transcript_e, y_len) + ) + del t_log_probs_e + + t_predictions_e = greedy_decoder.decode(t_audio_signal_e, t_a_sig_length_e) + + values_dict = dict( + loss=[t_loss_e], + predictions=[t_predictions_e], + transcript=[t_transcript_e], + transcript_length=[t_transcript_len_e] + ) + process_evaluation_batch(values_dict, _global_var_dict, labels=labels) + + # final aggregation across all workers and minibatches) and logging of results + wer, eloss = process_evaluation_epoch(_global_var_dict) + logger.log_scalar('loss', eloss, epoch, name) + logger.log_scalar('wer', wer, epoch, name) + + print_once(f"==========>>>>>>{name} Loss: {eloss}\n") + print_once(f"==========>>>>>>{name} WER: {wer}\n") + + return evalutaion + + +def train( + data_layer, + model, + loss_fn, + greedy_decoder, + optimizer, + optim_level, + labels, + multi_gpu, + data_transforms, + args, + evalutaion, + logger, + fn_lr_policy): + """Trains model + Args: + data_layer: training data layer + model: model ( encapsulates data processing, encoder, decoder) + loss_fn: loss function + greedy_decoder: greedy ctc decoder + optimizer: optimizer + optim_level: AMP optimization level + labels: list of output labels + multi_gpu: true if multi gpu training + args: script input argument list + fn_lr_policy: function returning lr in given step + """ + print_once("Starting .....") + start_time = time.time() + + train_dataloader = data_layer.data_iterator + epoch = args.start_epoch + step = epoch * args.step_per_epoch + + while True: + if multi_gpu: + data_layer.sampler.set_epoch(epoch) + print_once("Starting epoch {0}, step {1}".format(epoch, step)) + last_epoch_start = time.time() + batch_counter = 0 + average_loss = 0 + for data in train_dataloader: + + if batch_counter == 0: + + adjusted_lr = fn_lr_policy(step) + for param_group in optimizer.param_groups: + param_group['lr'] = adjusted_lr + optimizer.zero_grad() + last_iter_start = time.time() + + t_audio_signal_t, t_a_sig_length_t, t_transcript_t, t_transcript_len_t = data_transforms(data) + model.train() + + t_log_probs_t, (x_len, y_len) = model( + ((t_audio_signal_t, t_transcript_t), (t_a_sig_length_t, t_transcript_len_t)), + ) + + t_loss_t = loss_fn( + (t_log_probs_t, x_len), (t_transcript_t, y_len) + ) + logger.log_scalar('loss', t_loss_t.item(), step) + del t_log_probs_t + if args.gradient_accumulation_steps > 1: + t_loss_t = t_loss_t / args.gradient_accumulation_steps + + if optim_level in AmpOptimizations: + with amp.scale_loss(t_loss_t, optimizer) as scaled_loss: + scaled_loss.backward() + else: + t_loss_t.backward() + batch_counter += 1 + average_loss += t_loss_t.item() + + if batch_counter % args.gradient_accumulation_steps == 0: + optimizer.step() + + if (step + 1) % args.train_frequency == 0: + t_predictions_t = greedy_decoder.decode(t_audio_signal_t, t_a_sig_length_t) + + e_tensors = [t_predictions_t, t_transcript_t, t_transcript_len_t] + train_wer = monitor_asr_train_progress(e_tensors, labels=labels) + print_once("Loss@Step: {0} ::::::: {1}".format(step, str(average_loss))) + print_once("Step time: {0} 
seconds".format(time.time() - last_iter_start)) + logger.log_scalar('wer', train_wer, step) + + step += 1 + batch_counter = 0 + average_loss = 0 + if args.num_steps is not None and step >= args.num_steps: + break + + evalutaion(epoch) + + if args.num_steps is not None and step >= args.num_steps: + break + print_once("Finished epoch {0} in {1}".format(epoch, time.time() - last_epoch_start)) + epoch += 1 + if epoch % args.save_frequency == 0 and epoch > 0: + save(model, optimizer, epoch, output_dir=args.output_dir) + if args.num_steps is None and epoch >= args.num_epochs: + break + print_once("Done in {0}".format(time.time() - start_time)) + print_once("Final Evaluation ....................... ...... ... .. . .") + evalutaion() + save(model, optimizer, epoch, output_dir=args.output_dir) + +def main(args): + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + assert(torch.cuda.is_available()) + torch.backends.cudnn.benchmark = args.cudnn + + args.local_rank = os.environ.get('LOCAL_RANK', args.local_rank) + # set up distributed training + if args.local_rank is not None: + args.local_rank = int(args.local_rank) + torch.cuda.set_device(args.local_rank) + torch.distributed.init_process_group(backend='nccl', init_method='env://') + + multi_gpu = torch.distributed.is_initialized() + if multi_gpu: + print_once("DISTRIBUTED TRAINING with {} gpus".format(torch.distributed.get_world_size())) + + # define amp optimiation level + if args.fp16: + optim_level = Optimization.mxprO1 + else: + optim_level = Optimization.mxprO0 + + model_definition = toml.load(args.model_toml) + dataset_vocab = model_definition['labels']['labels'] + ctc_vocab = add_blank_label(dataset_vocab) + + train_manifest = args.train_manifest + val_manifest = args.val_manifest + tst_manifest = args.tst_manifest + featurizer_config = model_definition['input'] + featurizer_config_eval = model_definition['input_eval'] + featurizer_config["optimization_level"] = optim_level + featurizer_config_eval["optimization_level"] = optim_level + + sampler_type = featurizer_config.get("sampler", 'default') + perturb_config = model_definition.get('perturb', None) + if args.pad_to_max: + assert(args.max_duration > 0) + featurizer_config['max_duration'] = args.max_duration + featurizer_config_eval['max_duration'] = args.max_duration + featurizer_config['pad_to'] = "max" + featurizer_config_eval['pad_to'] = "max" + print_once('model_config') + print_dict(model_definition) + + if args.gradient_accumulation_steps < 1: + raise ValueError('Invalid gradient accumulation steps parameter {}'.format(args.gradient_accumulation_steps)) + if args.batch_size % args.gradient_accumulation_steps != 0: + raise ValueError('gradient accumulation step {} is not divisible by batch size {}'.format(args.gradient_accumulation_steps, args.batch_size)) + + + preprocessor = preprocessing.AudioPreprocessing(**featurizer_config) + preprocessor.cuda() + + augmentations = preprocessing.SpectrogramAugmentation(**featurizer_config) + augmentations.cuda() + + train_transforms = torchvision.transforms.Compose([ + lambda xs: [x.cuda() for x in xs], + lambda xs: [*preprocessor(xs[0:2]), *xs[2:]], + lambda xs: [augmentations(xs[0]), *xs[1:]], + lambda xs: [xs[0].permute(2, 0, 1), *xs[1:]], + ]) + + eval_transforms = torchvision.transforms.Compose([ + lambda xs: [x.cuda() for x in xs], + lambda xs: [*preprocessor(xs[0:2]), *xs[2:]], + lambda xs: [xs[0].permute(2, 0, 1), *xs[1:]], + ]) + + data_layer = AudioToTextDataLayer( + dataset_dir=args.dataset_dir, + 
featurizer_config=featurizer_config, + perturb_config=perturb_config, + manifest_filepath=train_manifest, + labels=dataset_vocab, + batch_size=args.batch_size // args.gradient_accumulation_steps, + multi_gpu=multi_gpu, + pad_to_max=args.pad_to_max, + sampler=sampler_type) + + eval_datasets = [( + AudioToTextDataLayer( + dataset_dir=args.dataset_dir, + featurizer_config=featurizer_config_eval, + manifest_filepath=val_manifest, + labels=dataset_vocab, + batch_size=args.eval_batch_size, + multi_gpu=multi_gpu, + pad_to_max=args.pad_to_max + ), + args.eval_frequency, + 'Eval clean', + )] + + if tst_manifest: + eval_datasets.append(( + AudioToTextDataLayer( + dataset_dir=args.dataset_dir, + featurizer_config=featurizer_config_eval, + manifest_filepath=tst_manifest, + labels=dataset_vocab, + batch_size=args.eval_batch_size, + multi_gpu=multi_gpu, + pad_to_max=args.pad_to_max + ), + args.test_frequency, + 'Test other', + )) + + model = RNNT( + feature_config=featurizer_config, + rnnt=model_definition['rnnt'], + num_classes=len(ctc_vocab) + ) + + if args.ckpt is not None: + print_once("loading model from {}".format(args.ckpt)) + checkpoint = torch.load(args.ckpt, map_location="cpu") + model.load_state_dict(checkpoint['state_dict'], strict=True) + args.start_epoch = checkpoint['epoch'] + else: + args.start_epoch = 0 + + loss_fn = RNNTLoss(blank=len(ctc_vocab) - 1) + + N = len(data_layer) + if sampler_type == 'default': + args.step_per_epoch = math.ceil(N / (args.batch_size * (1 if not torch.distributed.is_initialized() else torch.distributed.get_world_size()))) + elif sampler_type == 'bucket': + args.step_per_epoch = int(len(data_layer.sampler) / args.batch_size ) + + print_once('-----------------') + print_once('Have {0} examples to train on.'.format(N)) + print_once('Have {0} steps / (gpu * epoch).'.format(args.step_per_epoch)) + print_once('-----------------') + + constant_lr_policy = lambda _: args.lr + fn_lr_policy = constant_lr_policy + if args.lr_decay: + pre_decay_policy = fn_lr_policy + fn_lr_policy = lambda s: lr_decay(args.num_epochs * args.step_per_epoch, s, pre_decay_policy(s)) + if args.lr_warmup: + pre_warmup_policy = fn_lr_policy + fn_lr_policy = lambda s: lr_warmup(args.lr_warmup, s, pre_warmup_policy(s) ) + + + model.cuda() + + + if args.optimizer_kind == "novograd": + optimizer = Novograd(model.parameters(), + lr=args.lr, + weight_decay=args.weight_decay) + elif args.optimizer_kind == "adam": + optimizer = AdamW(model.parameters(), + lr=args.lr, + weight_decay=args.weight_decay) + else: + raise ValueError("invalid optimizer choice: {}".format(args.optimizer_kind)) + + if optim_level in AmpOptimizations: + model, optimizer = amp.initialize( + min_loss_scale=0.125, + models=model, + optimizers=optimizer, + opt_level=AmpOptimizations[optim_level] + ) + + if args.ckpt is not None: + optimizer.load_state_dict(checkpoint['optimizer']) + + model = model_multi_gpu(model, multi_gpu) + print_once(model) + print_once("# parameters: {}".format(sum(p.numel() for p in model.parameters()))) + greedy_decoder = RNNTGreedyDecoder(len(ctc_vocab) - 1, model.module if multi_gpu else model) + + if args.tb_path and args.local_rank == 0: + logger = TensorBoardLogger(args.tb_path, model.module if multi_gpu else model, args.histogram) + else: + logger = DummyLogger() + + train( + data_layer=data_layer, + model=model, + loss_fn=loss_fn, + greedy_decoder=greedy_decoder, + optimizer=optimizer, + data_transforms=train_transforms, + labels=ctc_vocab, + optim_level=optim_level, + multi_gpu=multi_gpu, + 
fn_lr_policy=fn_lr_policy, + evalutaion=evaluator(model, eval_transforms, loss_fn, greedy_decoder, ctc_vocab, eval_datasets, logger), + logger=logger, + args=args) + +def parse_args(): + parser = argparse.ArgumentParser(description='RNNT Training Reference') + parser.add_argument("--local_rank", default=None, type=int) + parser.add_argument("--batch_size", default=16, type=int, help='data batch size') + parser.add_argument("--eval_batch_size", default=1, type=int, help='eval data batch size') + parser.add_argument("--num_epochs", default=10, type=int, help='number of training epochs. if number of steps if specified will overwrite this') + parser.add_argument("--num_steps", default=None, type=int, help='if specified overwrites num_epochs and will only train for this number of iterations') + parser.add_argument("--save_freq", dest="save_frequency", default=300, type=int, help='number of epochs until saving checkpoint. will save at the end of training too.') + parser.add_argument("--eval_freq", dest="eval_frequency", default=1, type=int, help='number of epochs until doing evaluation on full dataset') + parser.add_argument("--test_freq", dest="test_frequency", default=2, type=int, help='number of epochs until doing test on full dataset') + parser.add_argument("--train_freq", dest="train_frequency", default=25, type=int, help='number of iterations until printing training statistics on the past iteration') + parser.add_argument("--lr", default=1e-3, type=float, help='learning rate') + parser.add_argument("--weight_decay", default=1e-3, type=float, help='weight decay rate') + parser.add_argument("--train_manifest", type=str, required=True, help='relative path given dataset folder of training manifest file') + parser.add_argument("--model_toml", type=str, required=True, help='relative path given dataset folder of model configuration file') + parser.add_argument("--val_manifest", type=str, required=True, help='relative path given dataset folder of evaluation manifest file') + parser.add_argument("--tst_manifest", type=str, required=False, help='relative path given dataset folder of test manifest file') + parser.add_argument("--max_duration", type=float, help='maximum duration of audio samples for training and evaluation') + parser.add_argument("--pad_to_max", action="store_true", default=False, help="pad sequence to max_duration") + parser.add_argument("--gradient_accumulation_steps", default=1, type=int, help='number of accumulation steps') + parser.add_argument("--optimizer", dest="optimizer_kind", default="novograd", type=str, help='optimizer') + parser.add_argument("--dataset_dir", dest="dataset_dir", required=True, type=str, help='root dir of dataset') + parser.add_argument("--lr_decay", action="store_true", default=False, help='use learning rate decay') + parser.add_argument("--lr_warmup", type=int, default=None, help='if provided, the learning rate will linearly scale for given number of iterations from zero') + parser.add_argument("--cudnn", action="store_true", default=False, help="enable cudnn benchmark") + parser.add_argument("--fp16", action="store_true", default=False, help="use mixed precision training") + parser.add_argument("--output_dir", type=str, required=True, help='saves results in this directory') + parser.add_argument("--ckpt", default=None, type=str, help="if specified continues training from given checkpoint. 
Otherwise starts from beginning") + parser.add_argument("--seed", default=42, type=int, help='seed') + parser.add_argument("--tb_path", default=None, type=str, help='where to store tensorboard data') + parser.add_argument("--histogram", default=False, action='store_true', help='whether to log param and grad histograms') + args=parser.parse_args() + return args + + +if __name__=="__main__": + args = parse_args() + print_dict(vars(args)) + main(args) diff --git a/rnn_speech_recognition/pytorch/utils/__init__.py b/rnn_speech_recognition/pytorch/utils/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/rnn_speech_recognition/pytorch/utils/convert_librispeech.py b/rnn_speech_recognition/pytorch/utils/convert_librispeech.py new file mode 100644 index 000000000..914997516 --- /dev/null +++ b/rnn_speech_recognition/pytorch/utils/convert_librispeech.py @@ -0,0 +1,81 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +#!/usr/bin/env python +import argparse +import os +import glob +import multiprocessing +import json + +import pandas as pd + +from preprocessing_utils import parallel_preprocess + +parser = argparse.ArgumentParser(description='Preprocess LibriSpeech.') +parser.add_argument('--input_dir', type=str, required=True, + help='LibriSpeech collection input dir') +parser.add_argument('--dest_dir', type=str, required=True, + help='Output dir') +parser.add_argument('--output_json', type=str, default='./', + help='name of the output json file.') +parser.add_argument('-s','--speed', type=float, nargs='*', + help='Speed perturbation ratio') +parser.add_argument('--target_sr', type=int, default=None, + help='Target sample rate. ' + 'defaults to the input sample rate') +parser.add_argument('--overwrite', action='store_true', + help='Overwrite file if exists') +parser.add_argument('--parallel', type=int, default=multiprocessing.cpu_count(), + help='Number of threads to use when processing audio files') +args = parser.parse_args() + +args.input_dir = args.input_dir.rstrip('/') +args.dest_dir = args.dest_dir.rstrip('/') + +def build_input_arr(input_dir): + txt_files = glob.glob(os.path.join(input_dir, '**', '*.trans.txt'), + recursive=True) + input_data = [] + for txt_file in txt_files: + rel_path = os.path.relpath(txt_file, input_dir) + with open(txt_file) as fp: + for line in fp: + fname, _, transcript = line.partition(' ') + input_data.append(dict(input_relpath=os.path.dirname(rel_path), + input_fname=fname+'.flac', + transcript=transcript)) + return input_data + + +print("[%s] Scaning input dir..." % args.output_json) +dataset = build_input_arr(input_dir=args.input_dir) + +print("[%s] Converting audio files..." % args.output_json) +dataset = parallel_preprocess(dataset=dataset, + input_dir=args.input_dir, + dest_dir=args.dest_dir, + target_sr=args.target_sr, + speed=args.speed, + overwrite=args.overwrite, + parallel=args.parallel) + +print("[%s] Generating json..." 
% args.output_json) +df = pd.DataFrame(dataset, dtype=object) + +# Save json with python. df.to_json() produces back slashed in file paths +dataset = df.to_dict(orient='records') +with open(args.output_json, 'w') as fp: + json.dump(dataset, fp, indent=2) diff --git a/rnn_speech_recognition/pytorch/utils/download_librispeech.py b/rnn_speech_recognition/pytorch/utils/download_librispeech.py new file mode 100644 index 000000000..ad36ad4e4 --- /dev/null +++ b/rnn_speech_recognition/pytorch/utils/download_librispeech.py @@ -0,0 +1,72 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#!/usr/bin/env python + +import os +import argparse +import pandas as pd + +from download_utils import download_file, md5_checksum, extract + +parser = argparse.ArgumentParser(description='Download, verify and extract dataset files') +parser.add_argument('csv', type=str, + help='CSV file with urls and checksums to download.') +parser.add_argument('dest', type=str, + help='Download destnation folder.') +parser.add_argument('-e', type=str, default=None, + help='Extraction destnation folder. Defaults to download folder if not provided') +parser.add_argument('--skip_download', action='store_true', + help='Skip downloading the files') +parser.add_argument('--skip_checksum', action='store_true', + help='Skip checksum') +parser.add_argument('--skip_extract', action='store_true', + help='Skip extracting files') +args = parser.parse_args() +args.e = args.e or args.dest + + +df = pd.read_csv(args.csv, delimiter=',') + + +if not args.skip_download: + for url in df.url: + fname = url.split('/')[-1] + print("Downloading %s:" % fname) + download_file(url=url, dest_folder=args.dest, fname=fname) +else: + print("Skipping file download") + + +if not args.skip_checksum: + for index, row in df.iterrows(): + url = row['url'] + md5 = row['md5'] + fname = url.split('/')[-1] + fpath = os.path.join(args.dest, fname) + print("Verifing %s: " % fname, end='') + ret = md5_checksum(fpath=fpath, target_hash=md5) + print("Passed" if ret else "Failed") +else: + print("Skipping checksum") + + +if not args.skip_extract: + for url in df.url: + fname = url.split('/')[-1] + fpath = os.path.join(args.dest, fname) + print("Decompressing %s:" % fpath) + extract(fpath=fpath, dest_folder=args.e) +else: + print("Skipping file extraction") diff --git a/rnn_speech_recognition/pytorch/utils/download_utils.py b/rnn_speech_recognition/pytorch/utils/download_utils.py new file mode 100644 index 000000000..e881388a6 --- /dev/null +++ b/rnn_speech_recognition/pytorch/utils/download_utils.py @@ -0,0 +1,68 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#!/usr/bin/env python + +import hashlib +import requests +import os +import tarfile +import tqdm + +def download_file(url, dest_folder, fname, overwrite=False): + fpath = os.path.join(dest_folder, fname) + if os.path.isfile(fpath): + if overwrite: + print("Overwriting existing file") + else: + print("File exists, skipping download.") + return + + tmp_fpath = fpath + '.tmp' + + r = requests.get(url, stream=True) + file_size = int(r.headers['Content-Length']) + chunk_size = 1024 * 1024 # 1MB + total_chunks = int(file_size / chunk_size) + + with open(tmp_fpath, 'wb') as fp: + content_iterator = r.iter_content(chunk_size=chunk_size) + chunks = tqdm.tqdm(content_iterator, total=total_chunks, + unit='MB', desc=fpath, leave=True) + for chunk in chunks: + fp.write(chunk) + + os.rename(tmp_fpath, fpath) + + +def md5_checksum(fpath, target_hash): + file_hash = hashlib.md5() + with open(fpath, "rb") as fp: + for chunk in iter(lambda: fp.read(1024*1024), b""): + file_hash.update(chunk) + return file_hash.hexdigest() == target_hash + + +def extract(fpath, dest_folder): + if fpath.endswith('.tar.gz'): + mode = 'r:gz' + elif fpath.endswith('.tar'): + mode = 'r:' + else: + raise IOError('fpath has unknown extention: %s' % fpath) + + with tarfile.open(fpath, mode) as tar: + members = tar.getmembers() + for member in tqdm.tqdm(iterable=members, total=len(members), leave=True): + tar.extract(path=dest_folder, member=member) diff --git a/rnn_speech_recognition/pytorch/utils/inference_librispeech.csv b/rnn_speech_recognition/pytorch/utils/inference_librispeech.csv new file mode 100644 index 000000000..40dac4e0e --- /dev/null +++ b/rnn_speech_recognition/pytorch/utils/inference_librispeech.csv @@ -0,0 +1,5 @@ +url,md5 +http://www.openslr.org/resources/12/dev-clean.tar.gz,42e2234ba48799c1f50f24a7926300a1 +http://www.openslr.org/resources/12/dev-other.tar.gz,c8d0bcc9cca99d4f8b62fcc847357931 +http://www.openslr.org/resources/12/test-clean.tar.gz,32fa31d27d2e1cad72775fee3f4849a9 +http://www.openslr.org/resources/12/test-other.tar.gz,fb5a50374b501bb3bac4815ee91d3135 diff --git a/rnn_speech_recognition/pytorch/utils/librispeech.csv b/rnn_speech_recognition/pytorch/utils/librispeech.csv new file mode 100644 index 000000000..d48a9f8db --- /dev/null +++ b/rnn_speech_recognition/pytorch/utils/librispeech.csv @@ -0,0 +1,8 @@ +url,md5 +http://www.openslr.org/resources/12/dev-clean.tar.gz,42e2234ba48799c1f50f24a7926300a1 +http://www.openslr.org/resources/12/dev-other.tar.gz,c8d0bcc9cca99d4f8b62fcc847357931 +http://www.openslr.org/resources/12/test-clean.tar.gz,32fa31d27d2e1cad72775fee3f4849a9 +http://www.openslr.org/resources/12/test-other.tar.gz,fb5a50374b501bb3bac4815ee91d3135 +http://www.openslr.org/resources/12/train-clean-100.tar.gz,2a93770f6d5c6c964bc36631d331a522 +http://www.openslr.org/resources/12/train-clean-360.tar.gz,c0e676e450a7ff2f54aeade5171606fa +http://www.openslr.org/resources/12/train-other-500.tar.gz,d1a0fd59409feb2c614ce4d30c387708 diff --git a/rnn_speech_recognition/pytorch/utils/preprocessing_utils.py b/rnn_speech_recognition/pytorch/utils/preprocessing_utils.py new file mode 100644 index 
000000000..15605cea2 --- /dev/null +++ b/rnn_speech_recognition/pytorch/utils/preprocessing_utils.py @@ -0,0 +1,76 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#!/usr/bin/env python +import os +import multiprocessing +import librosa +import functools + +import sox + + +from tqdm import tqdm + +def preprocess(data, input_dir, dest_dir, target_sr=None, speed=None, + overwrite=True): + speed = speed or [] + speed.append(1) + speed = list(set(speed)) # Make uniqe + + input_fname = os.path.join(input_dir, + data['input_relpath'], + data['input_fname']) + input_sr = sox.file_info.sample_rate(input_fname) + target_sr = target_sr or input_sr + + os.makedirs(os.path.join(dest_dir, data['input_relpath']), exist_ok=True) + + output_dict = {} + output_dict['transcript'] = data['transcript'].lower().strip() + output_dict['files'] = [] + + fname = os.path.splitext(data['input_fname'])[0] + for s in speed: + output_fname = fname + '{}.wav'.format('' if s==1 else '-{}'.format(s)) + output_fpath = os.path.join(dest_dir, + data['input_relpath'], + output_fname) + + if not os.path.exists(output_fpath) or overwrite: + cbn = sox.Transformer().speed(factor=s).convert(target_sr) + cbn.build(input_fname, output_fpath) + + file_info = sox.file_info.info(output_fpath) + file_info['fname'] = os.path.join(os.path.basename(dest_dir), + data['input_relpath'], + output_fname) + file_info['speed'] = s + output_dict['files'].append(file_info) + + if s == 1: + file_info = sox.file_info.info(output_fpath) + output_dict['original_duration'] = file_info['duration'] + output_dict['original_num_samples'] = file_info['num_samples'] + + return output_dict + + +def parallel_preprocess(dataset, input_dir, dest_dir, target_sr, speed, overwrite, parallel): + with multiprocessing.Pool(parallel) as p: + func = functools.partial(preprocess, + input_dir=input_dir, dest_dir=dest_dir, + target_sr=target_sr, speed=speed, overwrite=overwrite) + dataset = list(tqdm(p.imap(func, dataset), total=len(dataset))) + return dataset
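For completeness, the two CSV manifests above serve different entry points: utils/librispeech.csv covers the full corpus and is what scripts/download_librispeech.sh feeds to the downloader, while utils/inference_librispeech.csv lists only the dev and test archives. The utilities can also be run directly, for example to fetch and prepare just the inference subsets; this assumes the sox command-line tool plus the sox, pandas and tqdm Python packages are installed, as in the benchmark container:

  mkdir -p /datasets/LibriSpeech
  python utils/download_librispeech.py utils/inference_librispeech.csv /datasets/LibriSpeech -e /datasets/
  python utils/convert_librispeech.py \
      --input_dir /datasets/LibriSpeech/dev-clean \
      --dest_dir /datasets/LibriSpeech/dev-clean-wav \
      --output_json /datasets/LibriSpeech/librispeech-dev-clean-wav.json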