Implement distributed training using horovod #1865

Open · wants to merge 2 commits into main
2 changes: 2 additions & 0 deletions doc/TRAINING_ADVANCED.rst
@@ -17,3 +17,5 @@ This document contains more advanced topics with regard to training models with
9. :ref:`parallel-training-optimization`
10. :ref:`data-importers`
11. :ref:`byte-output-mode`
12. :ref:`horovod-parallel-training`

22 changes: 22 additions & 0 deletions doc/TRAINING_HOROVOD.rst
@@ -0,0 +1,22 @@
.. _horovod-parallel-training:

Distributed training using Horovod
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

If you have suitable compute infrastructure, training can be distributed across multiple machines and GPUs using `Horovod <https://github.com/horovod/horovod>`_. A fast network between the machines is recommended.
Horovod is capable of using MPI and NVIDIA's NCCL for highly optimized inter-process communication.
It also offers `Gloo <https://github.com/facebookincubator/gloo>`_ as an easy-to-setup communication backend.

For more information about setup or tuning of Horovod please visit `Horovod's documentation <https://horovod.readthedocs.io/en/stable/summary_include.html>`_.

Horovod is able to run on heterogeneous systems (e.g. a different number or model of GPUs per machine).
However, this can cause unpredictable problems and would require manual intervention in the training code.
Therefore, we only support homogeneous systems, meaning the same hardware and also the same software configuration (OS, drivers, MPI, NCCL, TensorFlow, ...) on each machine.
The only exception is a different number of GPUs per machine, since this can be controlled via ``horovodrun -H``.

Detailed documentation on how to run Horovod is provided `here <https://horovod.readthedocs.io/en/stable/running.html>`_.
In short, the command to train on 4 machines using 4 GPUs each is:

.. code-block:: bash

horovodrun -np 16 -H server1:4,server2:4,server3:4,server4:4 python3 DeepSpeech.py --train_files [...] --horovod
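
If MPI is not available on the cluster, Horovod also ships the Gloo backend mentioned above; per Horovod's own documentation this is selected by passing ``--gloo`` to ``horovodrun`` (the flag name is taken from Horovod's docs, not from this PR).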
9 changes: 9 additions & 0 deletions setup.py
@@ -39,6 +39,9 @@ def main():
'tensorflow == 1.15.4'
]

horovod_pypi_dep = [
'horovod[tensorflow] == 0.21.3'
]
if os.environ.get('DS_NODECODER', ''):
install_requires = install_requires_base
else:
@@ -49,6 +52,12 @@
else:
install_requires = install_requires + tensorflow_pypi_dep

if os.environ.get('DS_WITH_HOROVOD', ''):
install_requires = install_requires + horovod_pypi_dep

setup(
name='coqui_stt_training',
version=version,
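Note: with this change, Horovod support must be selected at install time via the environment variable checked above, presumably something like ``DS_WITH_HOROVOD=y pip install .`` from a source checkout (the exact install invocation is an assumption, not part of this diff).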
195 changes: 124 additions & 71 deletions training/coqui_stt_training/train.py

Large diffs are not rendered by default.
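Since the ``train.py`` diff is not rendered, the following is a minimal sketch of the standard Horovod TensorFlow 1.x integration pattern (as described in Horovod's documentation), not the literal contents of this PR:

```python
# Minimal sketch of the canonical Horovod TF1 training setup; illustrative
# only -- the real train.py changes in this PR are not rendered above.
import tensorflow as tf  # TF 1.15, as pinned in setup.py
import horovod.tensorflow as hvd

hvd.init()

# Pin each process to a single GPU (mirrors util/config.py below).
config = tf.ConfigProto()
config.gpu_options.visible_device_list = str(hvd.local_rank())

# Scale the learning rate by the worker count and wrap the optimizer so
# gradients are averaged across all processes via allreduce.
optimizer = tf.train.AdamOptimizer(learning_rate=0.001 * hvd.size())
optimizer = hvd.DistributedOptimizer(optimizer)

# Broadcast initial variable states from rank 0 so all workers start in
# sync, and let only the master process write checkpoints.
hooks = [hvd.BroadcastGlobalVariablesHook(0)]
checkpoint_dir = './checkpoints' if hvd.rank() == 0 else None
```

The scaled learning rate and rank-0-only checkpointing are Horovod's recommended defaults; whether this PR adopts them cannot be verified from the rendered diff.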

35 changes: 30 additions & 5 deletions training/coqui_stt_training/util/config.py
@@ -79,12 +79,37 @@ def initialize_globals():
# CPU device
c.cpu_device = '/cpu:0'

if FLAGS.horovod:
try:
import horovod.tensorflow as hvd
except ImportError as e:
print(
"Error importing Horovod. Did you install DeepSpeech with 'DS_WITH_HOROVOD=y'? "
"If you do not want to use Horovod, do not pass '--horovod'.")
raise e

hvd.init()

# Pin the GPU used by this process to its Horovod local rank (one GPU per process)
c.session_config.gpu_options.visible_device_list = str(hvd.local_rank())
c.num_devices = hvd.size()
c.is_master_process = hvd.rank() == 0
else:
# Available GPU devices
c.available_devices = get_available_gpus(c.session_config)

# If there is no GPU available, we fall back to CPU based operation
if not c.available_devices:
c.available_devices = [c.cpu_device]

c.num_devices = len(c.available_devices)

# Without Horovod there is only a single process, which is treated like the Horovod master
c.is_master_process = True


if FLAGS.bytes_output_mode:
c.alphabet = UTF8Alphabet()
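To make the pinning logic concrete, here is a hypothetical sketch of how the Horovod rank primitives map onto a 2-machine, 2-GPUs-each job (illustrative hosts and values, not from this PR):

```python
import horovod.tensorflow as hvd

hvd.init()
# For `horovodrun -np 4 -H a:2,b:2` (hypothetical hosts), each process sees:
#   host a, GPU 0: rank()=0, local_rank()=0, size()=4
#   host a, GPU 1: rank()=1, local_rank()=1, size()=4
#   host b, GPU 0: rank()=2, local_rank()=0, size()=4
#   host b, GPU 1: rank()=3, local_rank()=1, size()=4
# local_rank() selects the GPU within one machine (hence visible_device_list
# above), while rank() is globally unique, which is why rank 0 alone is
# treated as the master process.
print(hvd.rank(), hvd.local_rank(), hvd.size())
```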
29 changes: 21 additions & 8 deletions training/coqui_stt_training/util/feeding.py
Expand Up @@ -94,7 +94,8 @@ def create_dataset(sources,
limit=0,
exception_box=None,
process_ahead=None,
buffering=1 * MEGABYTE):
buffering=1 * MEGABYTE,
split_dataset=False):
epoch_counter = Counter() # survives restarts of the dataset and its generator

def generate_values():
@@ -135,14 +136,26 @@ def batch_fn(sample_ids, features, features_len, transcripts):

process_fn = partial(entry_to_features, train_phase=train_phase, augmentations=augmentations)


dataset = tf.data.Dataset.from_generator(remember_exception(generate_values, exception_box),
output_types=(tf.string, tf.float32, tf.int32,
(tf.int64, tf.int32, tf.int64), tf.float64))
if split_dataset:
# With Horovod, Iterator.get_next() is not aware of the different devices,
# so each process takes its own shard: A.shard(n, i) contains all elements of A whose index mod n == i.
import horovod.tensorflow as hvd
dataset = dataset.shard(hvd.size(), hvd.rank())
dataset = dataset.map(process_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)

if cache_path:
dataset = dataset.cache(cache_path)

dataset = (dataset.window(batch_size, drop_remainder=train_phase).flat_map(batch_fn))
if split_dataset:
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
else:
dataset = dataset.prefetch(Config.num_devices)

return dataset
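
As a standalone illustration of the sharding behaviour described in the comment above (not part of the PR):

```python
import tensorflow as tf

# Dataset.shard(n, i) keeps the elements whose index mod n == i, so with
# four Horovod processes the worker with rank 1 sees [1, 5] out of range(8).
dataset = tf.data.Dataset.range(8)
shard = dataset.shard(num_shards=4, index=1)  # yields elements 1 and 5
```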


@@ -178,5 +191,5 @@ def create_batch_set(bs, criteria):
ods = create_batch_set(outlier_batch_size,
lambda start, end, f, fl: end - start > int(outlier_duration_ms))
dataset = nds.concatenate(ods)
dataset = dataset.prefetch(Config.num_devices)
return dataset
2 changes: 2 additions & 0 deletions training/coqui_stt_training/util/flags.py
@@ -69,6 +69,8 @@ def create_flags():
f.DEFINE_boolean('train_cudnn', False, 'use CuDNN RNN backend for training on GPU. Note that checkpoints created with this flag can only be used with CuDNN RNN, i.e. fine tuning on a CPU device will not work')
f.DEFINE_boolean('automatic_mixed_precision', False, 'whether to allow automatic mixed precision training. USE OF THIS FLAG IS UNSUPPORTED. Checkpoints created with automatic mixed precision training will not be usable without mixed precision.')

f.DEFINE_boolean('horovod', False, 'use Horovod for distributed training across multiple GPUs and machines')

# Sample limits

f.DEFINE_integer('limit_train', 0, 'maximum number of elements to use from train set - 0 means no limit')