diff --git a/aeneas/cewsubprocess.py b/aeneas/cewsubprocess.py index 3d00be6e..60bd1d56 100644 --- a/aeneas/cewsubprocess.py +++ b/aeneas/cewsubprocess.py @@ -36,7 +36,7 @@ #. https://groups.google.com/d/msg/aeneas-forced-alignment/NLbtSRf2_vg/mMHuTQiFEgAJ #. https://sourceforge.net/p/espeak/mailman/message/34861696/ -.. warning:: This module might be removed in a future version +.. warning:: This module might be removed in a future version. .. versionadded:: 1.5.0 """ diff --git a/aeneas/runtimeconfiguration.py b/aeneas/runtimeconfiguration.py index 42ec9eef..519b72ff 100644 --- a/aeneas/runtimeconfiguration.py +++ b/aeneas/runtimeconfiguration.py @@ -57,18 +57,20 @@ class RuntimeConfiguration(Configuration): otherwise, generate an error if the user attempts to use a language not listed. - Default: ``True``. + Default: ``False``. .. versionadded:: 1.4.1 """ C_EXTENSIONS = "c_extensions" """ - If ``True`` and Python C extensions are available, use them. - Otherwise, use pure Python code. + If ``True`` and the Python C/C++ extensions + are available, use them. + Otherwise, use the pure Python code. This option is equivalent to - setting ``CDTW``, ``CEW``, and ``CMFCC`` to ``True`` at once. + setting ``CDTW``, ``CEW``, ``CFW``, + and ``CMFCC`` to ``True`` or ``False`` at once. Default: ``True``. @@ -77,8 +79,9 @@ class RuntimeConfiguration(Configuration): CDTW = "cdtw" """ - If ``True`` and Python C extension ``cdtw`` is available, use it. - Otherwise, use pure Python code. + If ``True`` and the Python C extension ``cdtw`` + is available, use it. + Otherwise, use the pure Python code. Default: ``True``. @@ -87,8 +90,9 @@ class RuntimeConfiguration(Configuration): CEW = "cew" """ - If ``True`` and Python C extension ``cew`` is available, use it. - Otherwise, use pure Python code. + If ``True`` and the Python C extension ``cew`` + is available, use it. + Otherwise, use the pure Python code. Default: ``True``. 
@@ -97,8 +101,9 @@ class RuntimeConfiguration(Configuration): CFW = "cfw" """ - If ``True`` and Python C extension ``cfw`` is available, use it. - Otherwise, use pure Python code. + If ``True`` and the Python C++ extension ``cfw`` + is available, use it. + Otherwise, use the pure Python code. Default: ``True``. @@ -107,7 +112,10 @@ class RuntimeConfiguration(Configuration): CEW_SUBPROCESS_ENABLED = "cew_subprocess_enabled" """ - If ``True``, calls to ``aeneas.cew`` will be done via ``subprocess``. + If ``True``, calls to ``aeneas.cew`` + will be done via ``subprocess``, using the + :class:`~aeneas.cewsubprocess.CEWSubprocess` + helper class. Default: ``False``. @@ -128,8 +136,9 @@ class RuntimeConfiguration(Configuration): CMFCC = "cmfcc" """ - If ``True`` and Python C extension ``cmfcc`` is available, use it. - Otherwise, use pure Python code. + If ``True`` and the Python C extension ``cmfcc`` + is available, use it. + Otherwise, use the pure Python code. Default: ``True``. @@ -371,6 +380,10 @@ class RuntimeConfiguration(Configuration): Important: this feature is experimental, use at your own risk. It is recommended not to use this TTS at word-level granularity, as it will create many requests, hence it will be expensive. + If you still want to use it, you can enable + the TTS caching mechanism by setting + :data:`~aeneas.runtimeconfiguration.RuntimeConfiguration.TTS_CACHE` + to ``True``. .. versionadded:: 1.5.0 """ @@ -384,6 +397,10 @@ class RuntimeConfiguration(Configuration): Important: this feature is experimental, use at your own risk. It is recommended not to use this TTS at word-level granularity, as it will create many requests, hence it will be expensive. + If you still want to use it, you can enable + the TTS caching mechanism by setting + :data:`~aeneas.runtimeconfiguration.RuntimeConfiguration.TTS_CACHE` + to ``True``. .. 
versionadded:: 1.5.0 """ @@ -458,22 +475,29 @@ class RuntimeConfiguration(Configuration): The default value is :data:`~aeneas.synthesizer.Synthesizer.ESPEAK` (``espeak``) which will use the built-in eSpeak TTS wrapper. + You might need to provide a ``/full/path/to/your/espeak`` value + to the + :data:`~aeneas.runtimeconfiguration.RuntimeConfiguration.TTS_PATH` + parameter if the command ``espeak`` is not available in + one of the directories listed in your ``PATH`` environment variable. Specify the value :data:`~aeneas.synthesizer.Synthesizer.ESPEAKNG` (``espeak-ng``) - to use the eSpeak-ng TTS wrapper; - you might need to provide the ``espeak-ng`` or ``/full/path/to/your/espeak-ng`` value + to use the eSpeak-ng TTS wrapper. + You might need to provide a ``/full/path/to/your/espeak-ng`` value to the :data:`~aeneas.runtimeconfiguration.RuntimeConfiguration.TTS_PATH` - parameter. + parameter if the command ``espeak-ng`` is not available in + one of the directories listed in your ``PATH`` environment variable. Specify the value :data:`~aeneas.synthesizer.Synthesizer.FESTIVAL` (``festival``) - to use the built-in Festival TTS wrapper; - you might need to provide the ``text2wave`` or ``/full/path/to/your/text2wave`` value + to use the built-in Festival TTS wrapper. + You might need to provide a ``/full/path/to/your/text2wave`` value to the :data:`~aeneas.runtimeconfiguration.RuntimeConfiguration.TTS_PATH` - parameter. + parameter if the command ``text2wave`` is not available in + one of the directories listed in your ``PATH`` environment variable. Specify the value :data:`~aeneas.synthesizer.Synthesizer.NUANCE` (``nuance``) @@ -503,7 +527,8 @@ class RuntimeConfiguration(Configuration): The cache files will be removed after the synthesis is compled. 
- This option is useful when calling TTS engines via subprocess + This option is useful when calling TTS engines, + via subprocess or remote APIs, on text files with many identical fragments, for example when aligning at word-level granularity. @@ -539,6 +564,8 @@ class RuntimeConfiguration(Configuration): If you specify this value, it will override the default voice code associated with the language of your text. + Default: ``None``. + .. versionadded:: 1.5.0 """ @@ -672,6 +699,7 @@ class RuntimeConfiguration(Configuration): # NOTE not using aliases just not to become confused # about external (user rconf) and internal (lib code) key names + # although the functionality might be useful in the future FIELDS = [ (ALLOW_UNLISTED_LANGUAGES, (False, bool, [])), @@ -796,7 +824,7 @@ def tts(self): :data:`~aeneas.runtimeconfiguration.RuntimeConfiguration.TTS` key stored in this configuration object. - :rtype: str + :rtype: string """ return self[self.TTS] @@ -807,7 +835,7 @@ def tts_path(self): :data:`~aeneas.runtimeconfiguration.RuntimeConfiguration.TTS_PATH` key stored in this configuration object. - :rtype: str + :rtype: string """ return self[self.TTS_PATH] diff --git a/aeneas/ttswrappers/festivalttswrapper.py b/aeneas/ttswrappers/festivalttswrapper.py index 70ee0d26..b9b11b80 100644 --- a/aeneas/ttswrappers/festivalttswrapper.py +++ b/aeneas/ttswrappers/festivalttswrapper.py @@ -43,9 +43,11 @@ class FESTIVALTTSWrapper(BaseTTSWrapper): A wrapper for the ``Festival`` TTS engine. This wrapper supports calling the TTS engine - via ``subprocess`` or via Python C extension. + via ``subprocess`` or via Python C++ extension. - .. note:: The latter call method is experimental and probably works only on Linux at the moment. + .. warning:: + The C++ extension call is experimental and + probably works only on Linux at the moment. 
In abstract terms, it performs one or more calls like :: diff --git a/debian/changelog b/debian/changelog index 87c49ebd..60b2e8f8 100644 --- a/debian/changelog +++ b/debian/changelog @@ -5,12 +5,13 @@ aeneas (1.6.0) stable; urgency=medium * Added eSpeak-ng TTS wrapper * Added caching audio files for faster synthesis * The TTS engine can be selected per-level (text in multilevel format) + * Added cfw Python C++ extension for Festival (disabled by default) * Python C extension compilation can be disabled in setup.py via env vars * All Python code formatted according to PEP 8 (-E501) * All source files have AGPLv3 header * Code, test, and documentation improvements - -- alberto Mon, 19 Sep 2016 14:35:01 +0200 + -- alberto Mon, 26 Sep 2016 01:02:03 +0200 aeneas (1.5.1) stable; urgency=medium diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst index 30f6249f..090fc0f7 100644 --- a/docs/source/changelog.rst +++ b/docs/source/changelog.rst @@ -20,7 +20,7 @@ v1.6.0 (2016-09-26) #. When working on multilevel sync, user can specify a different TTS for each level #. Added an optional TTS caching mechanism to reduce subprocess/API calls to the TTS engine (closes #87) #. Added wrapper for eSpeak-ng (subprocess only) -#. Added ``aeneas.cfw`` Python C++ Extension to call ``Festival`` via its C++ API, disabled by default (closes #109) +#. Added ``cfw`` Python C++ Extension to call ``Festival`` via its C++ API, disabled by default (closes #106) #. Unified unit tests for eSpeak, eSpeak-ng, and Festival #. Python C extension compilation can be disabled/forced in setup.py via env vars #. Added check on head/process/tail length which should not exceed the audio file length (closes #80) diff --git a/docs/source/clitutorial.rst b/docs/source/clitutorial.rst index a2cdc2f4..963c57f5 100644 --- a/docs/source/clitutorial.rst +++ b/docs/source/clitutorial.rst @@ -99,11 +99,13 @@ If you execute the program without arguments, it will print the following help message: .. 
literalinclude:: _static/execute_task_help.txt + :language: text If you pass the ``--help`` argument, it will print a slightly more verbose version: .. literalinclude:: _static/execute_task_help_arg.txt + :language: text Showing And Running Built-In Examples ------------------------------------- @@ -113,12 +115,14 @@ enabling the user to run live examples. To list them, pass the ``--examples`` switch: .. literalinclude:: _static/execute_task_examples.txt + :language: text Similarly, the ``--examples-all`` switch prints a list of more than twenty built-in examples, covering more specific input/output/parameter combinations. .. literalinclude:: _static/execute_task_examples_all.txt + :language: text Running a built-in example can help learning quickly all the options/parameters available in ``aeneas``. @@ -126,6 +130,7 @@ available in ``aeneas``. For example, passing the ``--example-json`` switch will produce: .. literalinclude:: _static/execute_task_example_json.txt + :language: text .. warning:: @@ -138,6 +143,7 @@ will print the command line arguments they shortcut. Therefore, the example above is essentially equivalent to: .. literalinclude:: _static/execute_task_example_json_2.txt + :language: text .. note:: @@ -154,10 +160,12 @@ In both cases, a new file ``output/sonnet.json`` is created, containing the sync map in JSON format: .. literalinclude:: _static/execute_task_example_json_output.txt + :language: json for the input file: .. literalinclude:: _static/execute_task_example_json_input.txt + :language: text Verbose Output And Logging To File ---------------------------------- @@ -165,6 +173,7 @@ Verbose Output And Logging To File If you want more verbose output, you can pass the ``-v`` or ``--verbose`` switch: .. literalinclude:: _static/execute_task_example_json_verbose.txt + :language: text There is also a ``-vv`` or ``--very-verbose`` switch to increase the verbosity of the output. 
@@ -173,13 +182,14 @@ Sometimes it is easier to dump the log to file, and then inspect it with a text editor. To do so, just specify the ``-l`` switch: .. literalinclude:: _static/execute_task_example_json_log.txt + :language: text The path of the log file will be printed. By default, the log file will be created in the temporary directory of your OS. If you want your log file to be created at a specific path, use ``--log=/path/to/your.log`` instead of ``-l``. -Finally, you can specify both ``-v``/``-vv`` and ``-l``/``--log``. +Note that you can specify both ``-v``/``-vv`` and ``-l``/``--log``. Input Text Formats ------------------ @@ -188,28 +198,34 @@ Input Text Formats :class:`~aeneas.textfile.TextFileFormat`: #. :data:`~aeneas.textfile.TextFileFormat.PLAIN`, - one fragment per line:: + one fragment per line + (example: ``--example-json``): + + .. code-block:: text Text of the first fragment Text of the second fragment Text of the third fragment - Example: run ``--example-json``. - #. :data:`~aeneas.textfile.TextFileFormat.PARSED`, - one fragment per line, starting with an explicit identifier:: + one fragment per line, starting with an explicit identifier + (example: ``--example-tsv``): + + .. code-block:: text f001|Text of the first fragment f002|Text of the second fragment f003|Text of the third fragment - Example: run ``--example-tsv``. #. :data:`~aeneas.textfile.TextFileFormat.SUBTITLES`, fragments separated by a blank line, each fragment might span multiple lines. This format is suitable - for creating subtitle sync map files:: + for creating subtitle sync map files + (example: ``--example-srt``): + .. code-block:: text + Fragment on a single row Fragment on two rows @@ -220,22 +236,23 @@ Input Text Formats Another fragment on two rows - Example: run ``--example-srt``. - #. 
:data:`~aeneas.textfile.TextFileFormat.UNPARSED`, XML file from which text fragments will be extracted - by matching ``id`` and/or ``class`` attributes: + by matching ``id`` and/or ``class`` attributes + (example: ``--example-smil``): .. literalinclude:: _static/unparsed.xhtml - - Example: run ``--example-smil``. + :language: xml #. :data:`~aeneas.textfile.TextFileFormat.MPLAIN`, the multilevel equivalent to PLAIN, with paragraphs separated by a blank line, one sentence per line, - and words separated by blank spaces:: + and words separated by blank spaces + (example: ``--example-mplain-json``): + .. code-block:: text + First sentence of Paragraph One. Second sentence of Paragraph One. @@ -245,14 +262,12 @@ Input Text Formats Second sentence of Paragraph Three. Third sentence of Paragraph Three. - Example: run ``--example-mplain-json``. - #. :data:`~aeneas.textfile.TextFileFormat.MUNPARSED`, - the multilevel equivalent to UNPARSED: + the multilevel equivalent to UNPARSED + (example: ``--example-munparsed-json``): .. literalinclude:: _static/munparsed.xhtml - - Example: run ``--example-munparsed-json``. + :language: xml If you use :data:`~aeneas.textfile.TextFileFormat.UNPARSED` files, you need to provide the following additional parameters: @@ -264,12 +279,18 @@ you need to provide the following additional parameters: to specify how extracted elements should be sorted, based on their ``id`` attributes. .. literalinclude:: _static/execute_task_example_smil.txt + :language: text .. note:: Even if you only specify the :data:`~aeneas.globalconstants.PPN_TASK_IS_TEXT_UNPARSED_CLASS_REGEX` regex, your XML elements still need to have ``id`` attributes. This is required for e.g. SMIL output to make sense. + (Although the EPUB 3 Media Overlays specification allows you + to specify an EPUB CFI instead of an ``id`` value, + it is recommended to use ``id`` values + for maximum reading system compatibility, + and hence ``aeneas`` only outputs SMIL files with ``id`` references.) 
Similarly, for :data:`~aeneas.textfile.TextFileFormat.MUNPARSED` files you need to provide the following additional parameters: @@ -279,6 +300,45 @@ you need to provide the following additional parameters: * :data:`~aeneas.globalconstants.PPN_TASK_IS_TEXT_MUNPARSED_L3_ID_REGEX`. .. literalinclude:: _static/execute_task_example_munparsed.txt + :language: text + +.. note:: + If you are interested in synchronizing at **word granularity**, + it is highly suggested to use a **multilevel text format**, + even if you are going to use only the timings for the finer granularity. + + (If you do not want the output sync map file to contain + the multilevel tree hierarchy for the timings, + you might "flatten" the output sync map file, + retaining only the word-level timings, + by using the configuration parameter + :data:`~aeneas.globalconstants.PPN_TASK_OS_FILE_LEVELS` + with value ``3``). + + There are two main reasons for this suggestion: + + 1. the computation should be faster, and + 2. likely, the timings will be more accurate. + + Starting with ``aeneas`` v1.5.1, + you can specify different MFCC parameters for each level, see: + + * :data:`~aeneas.runtimeconfiguration.RuntimeConfiguration.MFCC_WINDOW_LENGTH_L1`, + * :data:`~aeneas.runtimeconfiguration.RuntimeConfiguration.MFCC_WINDOW_SHIFT_L1`, + * :data:`~aeneas.runtimeconfiguration.RuntimeConfiguration.MFCC_WINDOW_LENGTH_L2`, + * :data:`~aeneas.runtimeconfiguration.RuntimeConfiguration.MFCC_WINDOW_SHIFT_L2`, + * :data:`~aeneas.runtimeconfiguration.RuntimeConfiguration.MFCC_WINDOW_LENGTH_L3`, + * :data:`~aeneas.runtimeconfiguration.RuntimeConfiguration.MFCC_WINDOW_SHIFT_L3`. + + Starting with ``aeneas`` v1.6.0, + you can also specify a different TTS engine for each level, see: + + * :data:`~aeneas.runtimeconfiguration.RuntimeConfiguration.TTS_L1`, + * :data:`~aeneas.runtimeconfiguration.RuntimeConfiguration.TTS_L2`, + * :data:`~aeneas.runtimeconfiguration.RuntimeConfiguration.TTS_L3`. 
+ + The ``aeneas`` mailing list contains some interesting threads + about using aeneas for word-level synchronization. Output Sync Map Formats ----------------------- @@ -299,6 +359,7 @@ requires: Example: .. literalinclude:: _static/execute_task_example_smil.txt + :language: text Listing Parameter Names And Values ---------------------------------- @@ -309,12 +370,14 @@ You can use the ``--list-parameters`` switch to print the list of parameter names that you can use in the configuration string. .. literalinclude:: _static/execute_task_list_parameters.txt + :language: text For parameters that accept a restricted set of values, you can list the allowed values with ``--list-values=PARAM``. For example: .. literalinclude:: _static/execute_task_list_values.txt + :language: text Downloading Audio From YouTube ------------------------------ @@ -324,10 +387,11 @@ Instead of the audio file path, you provide the YouTube URL, and add the ``-y`` switch at the end: .. literalinclude:: _static/execute_task_youtube.txt + :language: text .. warning:: - This feature is experimental, + The download feature is experimental, and it might be unavailable in the future, for example if YouTube disables API access to audio/video contents. @@ -357,63 +421,92 @@ providing a suitable configuration string as its value. The available paramenter names are listed in :class:`~aeneas.runtimeconfiguration.RuntimeConfiguration`. + Examples: -#. disable checks on the language codes:: +#. disable checks on the language codes: + + .. code-block:: text - python -m aeneas.tools.execute_task --example-json -r="allow_unlisted_languages=True" + python -m aeneas.tools.execute_task --example-json -r="allow_unlisted_languages=True" -#. disable the Python C/C++ extensions, running the pure Python code:: +#. disable the Python C/C++ extensions, running the pure Python code: - python -m aeneas.tools.execute_task --example-json -r="c_extensions=False" + .. code-block:: text -#. 
disable only the ``cew`` Python C/C++ extension, while ``cdtw`` and ``cmfcc`` will still run (if compiled):: + python -m aeneas.tools.execute_task --example-json -r="c_extensions=False" - python -m aeneas.tools.execute_task --example-json -r="cew=False" +#. disable only the ``cew`` Python C/C++ extension, while ``cdtw`` and ``cmfcc`` will still run (if compiled): -#. set the DTW margin to ``10.000`` seconds:: + .. code-block:: text - python -m aeneas.tools.execute_task --example-json -r="dtw_margin=10" + python -m aeneas.tools.execute_task --example-json -r="cew=False" -#. specify the path to the ``ffprobe`` and ``ffmpeg`` executables:: +#. set the DTW margin to ``10.000`` seconds: - python -m aeneas.tools.execute_task --example-json -r="ffmpeg_path=/path/to/my/ffmpeg|ffprobe_path=/path/to/my/ffprobe" + .. code-block:: text -#. set the time resolution of the aligner to ``0.050`` seconds:: + python -m aeneas.tools.execute_task --example-json -r="dtw_margin=10" - python -m aeneas.tools.execute_task --example-json -r="mfcc_window_length=0.150|mfcc_window_shift=0.050" +#. specify the path to the ``ffprobe`` and ``ffmpeg`` executables: -#. use the eSpeak-ng TTS, via the ``espeak-ng`` executable available on ``$PATH``, instead of eSpeak:: + .. code-block:: text - python -m aeneas.tools.execute_task --example-json -r="tts=espeak-ng" + python -m aeneas.tools.execute_task --example-json -r="ffmpeg_path=/path/to/my/ffmpeg|ffprobe_path=/path/to/my/ffprobe" -#. use the eSpeak-ng TTS, via the ``espeak-ng`` executable at a custom location, instead of eSpeak:: +#. set the time resolution of the aligner to ``0.050`` seconds: - python -m aeneas.tools.execute_task --example-json -r="tts=espeak-ng|tts_path=/path/to/espeak-ng" + .. code-block:: text -#. 
use the Festival TTS, via the ``text2wave`` executable available on ``$PATH``, instead of eSpeak:: + python -m aeneas.tools.execute_task --example-json -r="mfcc_window_length=0.150|mfcc_window_shift=0.050" - python -m aeneas.tools.execute_task --example-json -r="tts=festival" +#. use the eSpeak-ng TTS, via the ``espeak-ng`` executable available on ``$PATH``, instead of eSpeak: -#. use the Festival TTS, via the ``text2wave`` executable at a custom location, instead of eSpeak:: + .. code-block:: text - python -m aeneas.tools.execute_task --example-json -r="tts=festival|tts_path=/path/to/text2wave" + python -m aeneas.tools.execute_task --example-json -r="tts=espeak-ng" + +#. use the eSpeak-ng TTS, via the ``espeak-ng`` executable at a custom location, instead of eSpeak: + + .. code-block:: text + + python -m aeneas.tools.execute_task --example-json -r="tts=espeak-ng|tts_path=/path/to/espeak-ng" + +#. use the Festival TTS, via the ``text2wave`` executable available on ``$PATH``, instead of eSpeak: + + .. code-block:: text + + python -m aeneas.tools.execute_task --example-json -r="tts=festival" + +#. use the Festival TTS, via the ``text2wave`` executable at a custom location, instead of eSpeak: + + .. code-block:: text + + python -m aeneas.tools.execute_task --example-json -r="tts=festival|tts_path=/path/to/text2wave" #. use the Nuance TTS API instead of eSpeak:: - python -m aeneas.tools.execute_task --example-json -r="tts=nuance|nuance_tts_api_id=YOUR_NUANCE_API_ID|nuance_tts_api_key=YOUR_NUANCE_API_KEY" + .. code-block:: text + + python -m aeneas.tools.execute_task --example-json -r="tts=nuance|nuance_tts_api_id=YOUR_NUANCE_API_ID|nuance_tts_api_key=YOUR_NUANCE_API_KEY" + +#. use a custom TTS wrapper located at ``/path/to/your/wrapper.py`` (see the ``aeneas/extra/`` directory for examples): + + .. code-block:: text -#. 
use a custom TTS wrapper located at ``/path/to/your/wrapper.py`` (see the ``aeneas/extra/`` directory for examples):: + python -m aeneas.tools.execute_task --example-json -r="tts=custom|tts_path=/path/to/your/wrapper.py" - python -m aeneas.tools.execute_task --example-json -r="tts=custom|tts_path=/path/to/your/wrapper.py" +#. set the temporary directory: -#. set the temporary directory:: + .. code-block:: text - python -m aeneas.tools.execute_task --example-json -r="tmp_path=/path/to/tmp/" + python -m aeneas.tools.execute_task --example-json -r="tmp_path=/path/to/tmp/" -#. allow processing tasks with arbitrarily long audio:: +#. allow processing tasks with arbitrarily long audio: - python -m aeneas.tools.execute_task --example-json -r="task_max_audio_length=0" + .. code-block:: text + + python -m aeneas.tools.execute_task --example-json -r="task_max_audio_length=0" Miscellanea ----------- @@ -452,7 +545,9 @@ you might want to create a Job: containing the output sync map files. Example: ``/home/rb/job.zip``, containing the following files, - corresponding to three Tasks:: + corresponding to three Tasks: + + .. code-block:: text . ├── config.txt @@ -477,17 +572,21 @@ have the same meaning for ``aeneas.tools.execute_job`` as described above. For example, the help message reads: .. literalinclude:: _static/execute_job_help.txt + :language: text Currently ``aeneas.tools.execute_job`` does not have built-in examples shortcuts (``--example-*``), but you can run a built-in example: .. literalinclude:: _static/execute_job_example.txt + :language: text TXT Config File (``config.txt``) -------------------------------- -A ZIP container with the following files:: +A ZIP container with the following files: + +.. code-block:: text . 
├── config.txt @@ -506,7 +605,9 @@ where the ``config.txt`` config file reads: will generate three tasks (``sonnet001``, ``sonnet002`` and ``sonnet003``), output a SMIL file for each of them, -finally compress them in a ZIP file with the following structure:: +finally compress them in a ZIP file with the following structure: + +.. code-block:: text . └── OEBPS @@ -534,11 +635,13 @@ named ``config.xml``. The following ``config.xml`` is equivalent to the example above: .. literalinclude:: _static/execute_job_config_xml_1.txt + :language: xml Now note that ``config.xml`` allows you to bundle together Tasks with different languages, output formats, etc.: .. literalinclude:: _static/execute_job_config_xml_2.txt + :language: xml diff --git a/docs/source/index.rst b/docs/source/index.rst index 89300e2b..2ddd376e 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -13,7 +13,9 @@ In computer science this task is known as (automatically computing a) **forced a For example, given the verses and a ``53.240s``-long audio recording of *Sonnet I* by William Shakespeare, -**aeneas** might compute a map like the following:: +**aeneas** might compute a map like the following: + +.. code-block:: text 1 => [00:00:00.000, 00:00:02.640] From fairest creatures we desire increase, => [00:00:02.640, 00:00:05.880] @@ -111,6 +113,7 @@ enabling the user to run **live examples**. The help message for ``aeneas.tools.execute_job`` reads: .. literalinclude:: _static/execute_job_help.txt + :language: text The paths in the example might differ, depending on the installation location of **aeneas**. Usually, each command line in the ``EXAMPLES`` section @@ -119,16 +122,20 @@ can be copied-and-pasted to see the corresponding example running live. The help message for ``aeneas.tools.execute_task`` reads: .. literalinclude:: _static/execute_task_help.txt + :language: text The ``--examples`` switch prints a list of **common** built-in live examples: .. 
literalinclude:: _static/execute_task_examples.txt + :language: text Similarly, the ``--examples-all`` switch prints a list of more than twenty built-in examples, -covering **more specific input/output/parameter combinations**. +covering **more peculiar input/output/parameter combinations**. + +For example, ``--example-srt`` produces the following output: -For example, ``--example-srt`` produces the following output:: +.. code-block:: text $ python -m aeneas.tools.execute_task --example-srt [INFO] Running example task with arguments: @@ -152,7 +159,9 @@ automatically aligned with the audio file ``audio.mp3``. Example shortcuts also print the **actual parameters** which are hidden behind the ``--example-srt`` shortcut. -Thus, the above example is equivalent to:: +Thus, the above example is equivalent to: + +.. code-block:: text $ python -m aeneas.tools.execute_task aeneas/tools/res/audio.mp3 aeneas/tools/res/subtitles.txt "task_language=eng|is_text_type=subtitles|os_task_file_format=srt" output/sonnet.srt [INFO] Validating config string (specify --skip-validator to bypass)... @@ -166,7 +175,9 @@ Thus, the above example is equivalent to:: [INFO] Created file 'output/sonnet.srt' Note that a validation of the input files and parameters is performed as the first step. -If incorrect or incomplete parameters are specified, an error message is printed:: +If incorrect or incomplete parameters are specified, an error message is printed: + +.. code-block:: text $ python -m aeneas.tools.execute_task aeneas/tools/res/audio.mp3 aeneas/tools/res/subtitles.txt "task_language=eng|is_text_type=subtitles" output/sonnet.srt [INFO] Validating config string (specify --skip-validator to bypass)... diff --git a/docs/source/libtutorial.rst b/docs/source/libtutorial.rst index a2ee1f88..9d3b24dd 100644 --- a/docs/source/libtutorial.rst +++ b/docs/source/libtutorial.rst @@ -13,7 +13,9 @@ thanks to their standard I/O interface. .. 
Topic:: Example - Create a Task and process it, outputting the resulting sync map to file:: + Create a Task and process it, outputting the resulting sync map to file: + + .. code-block:: python #!/usr/bin/env python # coding=utf-8 @@ -34,7 +36,9 @@ thanks to their standard I/O interface. # output sync map to file task.output_sync_map_file() - You can also use :class:`~aeneas.tools.execute_task.ExecuteTaskCLI`:: + You can also use :class:`~aeneas.tools.execute_task.ExecuteTaskCLI`: + + .. code-block:: python #!/usr/bin/env python # coding=utf-8 @@ -54,7 +58,9 @@ Clearly, you can also manipulate objects programmatically. .. Topic:: Example Create a Task, process it, and print all fragments in the resulting sync map - whose duration is less than five seconds:: + whose duration is less than five seconds: + + .. code-block:: python #!/usr/bin/env python # coding=utf-8 @@ -82,8 +88,10 @@ using the library functions and constants. .. Topic:: Example - Create a Task, process it, and print the resulting sync map:: + Create a Task, process it, and print the resulting sync map: + .. code-block:: python + #!/usr/bin/env python # coding=utf-8 @@ -127,6 +135,7 @@ The other two dependencies (``lxml`` and ``BeautifulSoup``) are needed only if you use XML-like input or output formats. However, since they are popular Python packages, to avoid complex import testing, they are listed as requirements. +This choice might change in the future. Depending on what ``aeneas`` classes you want to use, you might need to install the following optional dependencies: @@ -160,8 +169,16 @@ can be done in few minutes. The drawback is that your environment must be able to compile Python C/C++ extensions. If you install ``aeneas`` via ``PyPI`` (e.g., ``pip install aeneas``), the compilation step is done automatically for you. -Note that, due to the Python C/C++ extension compile and setup mechanism, -you must install ``numpy`` before installing ``aeneas``. + +.. 
warning:: + + Due to the Python C/C++ extension compile and setup mechanism, + you must install ``numpy`` before installing ``aeneas``, + and there is no (sane) way for the ``aeneas`` ``setup.py`` + to install ``numpy`` before compiling the ``aeneas`` source code. + Hence, you really need to (manually) install ``numpy`` + before installing ``aeneas``. + Hopefully this inconvenience will be removed in the future. The Python C/C++ extensions included in ``aeneas`` are: @@ -201,7 +218,7 @@ The Python C/C++ extensions included in ``aeneas`` are: (e.g., install the ``festival-dev`` package on DEB-based OSes) and set the environment variable ``AENEAS_FORCE_CFW=True`` - before running ``pip`` or ``python setup.py``. + before running ``pip install aeneas`` or ``python setup.py``. .. note:: @@ -231,8 +248,10 @@ and they do not require explicitly passing an ``rconf`` object. .. Topic:: Example - Process a task with custom parameters, and log messages:: + Process a task with custom parameters, and log messages: + .. code-block:: python + # create Logger which logs and tees logger = Logger(tee=True) @@ -264,7 +283,9 @@ and then assign it to your Task. .. Topic:: Example - Create a TextFile programmatically, and assign it to Task:: + Create a TextFile programmatically, and assign it to Task: + + .. code-block:: python task = Task() textfile = TextFile() @@ -294,7 +315,7 @@ Miscellanea will save you a lot of headaches. If you read from files, be sure they are encoded using ``UTF-8``. * You can use any audio file format that is supported by ``ffprobe`` and ``ffmpeg``. - If unsure, just try them on your audio file on the console: + If unsure, just try to run them on your audio file on the console: if it works there, it should work inside ``aeneas`` too. * Enumeration classes usually have an ``ALLOWED_VALUE`` class member, which lists all the allowed values. 
For example: diff --git a/wiki/HOWITWORKS.md b/wiki/HOWITWORKS.md index 595b0c8c..0c0c394e 100644 --- a/wiki/HOWITWORKS.md +++ b/wiki/HOWITWORKS.md @@ -4,13 +4,7 @@ Math. -## One Sentence Explanation (Layman Edition) - -A good deal of math and computer science, -a handful of software engineering and -some optimization tricks. - -## One Sentence Explanation (Pro Edition) +## One Sentence Explanation Using the Sakoe-Chiba Band Dynamic Time Warping (DTW) algorithm to align the Mel-frequency cepstral coefficients (MFCCs) @@ -19,6 +13,314 @@ the audio wave obtained by synthesizing the text fragments with a TTS engine, eventually mapping the computed alignment back onto the (real) time domain. + + ## Extended Explanation -To be written. Eventually. Some day. +### The Forced Alignment Problem + +It might be useful to recall that **aeneas** is mainly a +[**forced aligner (FA)**](https://www.isip.piconepress.com/projects/speech/software/tutorials/production/fundamentals/v1.0/section_04/s04_04_p01.html), +that is, a piece of software that takes an audio file and a text file divided into fragments as its input, +and it outputs a synchronization map, +that is, it automatically associates to each text fragment the time interval (in the audio file) +when that text fragment is spoken. +The following waveform illustrates the concept: + +![Waveform with aligned labels, detail](align.png) + +``` +Sonnet I => [00:00:00.000, 00:00:02.640] +From fairest creatures we desire increase, => [00:00:02.640, 00:00:05.880] +That thereby beauty's rose might never die, => [00:00:05.880, 00:00:09.240] +But as the riper should by time decease, => [00:00:09.240, 00:00:11.920] +His tender heir might bear his memory: => [00:00:11.920, 00:00:15.280] +... +``` + +Typical applications of FA +include [**Audio-eBooks**](http://www.readbeyond.it/audioebooks.html) +where audio and text are synchronized, +or **closed captioning** for videos. 
+ +FA systems are handy because they output the synchronization map **automatically**, +without requiring any human intervention. +In fact, manually aligning audio and text is a **painfully tiring task**, +prone to fatigue errors, and it requires **trained operators** +who understand the language being spoken. +FA software can do the job **effortlessly**, while maintaining a good **quality** +of the alignment output, often indistinguishable from a manual alignment. + +### (TTS+DTW)-Based Aligner + +Most [forced alignment tools](https://github.com/pettarin/forced-alignment-tools) +are based on automated speech recognition (ASR) techniques. +ASR systems, used as aligners, +first try to recognize what the speaker says, +then they align the recognized text with the ground truth text, +producing the final text-to-audio synchronization map. + +Unlike ASRs, +**aeneas** uses a **more classic, signal-processing-based approach**, +called [**Dynamic Time Warping**](https://en.wikipedia.org/wiki/Dynamic_time_warping) (DTW) +leveraging text-to-speech (TTS) synthesis. + +To compute a synchronization map, +**aeneas** performs the steps described below. + +#### Step 1: convert real audio to mono + +The first step consists in converting +the given audio file `R` (real), +containing the narration of the text by a human being, +obtaining a mono WAVE file `C`, for example: + +![Waveform C, the real audio converted to mono WAVE](wave_c.png) + +Since the difference in timings +between `R` (real) and `C` (converted) +is assumed to be negligible, +in the rest of this explanation we will directly use +`R` (real) for clarity, +but you must bear in mind that +**aeneas** actually operates on `C`, +that is, the mono WAVE version of `R`. + +**NOTE**: +The above observation is true +for all the modern audio formats, +considered at "file level". 
+However this might not be always true +when a given audio file in a non-WAVE format +is consumed by an application, +due to the implementation details of the application. +For example, Web browsers +might have incorrect timings when seeking +MP3 VBR files through the `