From 51ecccae05198f3ddb35bd83b6710c542c7ca1d3 Mon Sep 17 00:00:00 2001
From: Ali Hamdi Ali Fadel
Date: Sun, 30 Jun 2024 14:37:31 +0300
Subject: [PATCH] Add English README.md

---
 README.en.md | 390 +++++++++++++++++++++++++++++++++++++++++++++++++++
 README.md    |  32 +++--
 2 files changed, 413 insertions(+), 9 deletions(-)
 create mode 100644 README.en.md

diff --git a/README.en.md b/README.en.md
new file mode 100644
index 0000000..ef29433
--- /dev/null
+++ b/README.en.md
@@ -0,0 +1,390 @@
[![ar](https://img.shields.io/badge/lang-ar-brightgreen.svg)](README.md)
[![en](https://img.shields.io/badge/lang-en-red.svg)](README.en.md)

# Tafrigh

Transcribing visual or audio materials into text.

You can view examples transcribed using Tafrigh here.

## Features of Tafrigh

## Requirements

## Installing Tafrigh

### Using pip

You can install Tafrigh using pip with the command: `pip install tafrigh[wit,whisper]`
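For example, quoting the package spec so shells such as zsh do not expand the square brackets:

```bash
# Install Tafrigh with both wit.ai and Whisper dependencies
pip install "tafrigh[wit,whisper]"

# Or install only the dependencies you need
pip install "tafrigh[wit]"
pip install "tafrigh[whisper]"
```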

You can specify which dependencies to install, based on the technology you want to use, by writing `wit` or `whisper` inside the square brackets, as shown in the command above.

### From the Source Code

Add `-E wit` or `-E whisper` to specify the dependencies to install.
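A minimal sketch of a source installation, assuming the project uses Poetry (implied by the `-E` extras flag) and is hosted at `github.com/ieasybooks/tafrigh` (inferred from the Docker image name used below; verify against the repository's actual instructions):

```bash
# Assumed repository URL and Poetry workflow; adjust if the project differs.
git clone https://github.com/ieasybooks/tafrigh.git
cd tafrigh
poetry install -E wit   # or: poetry install -E whisper
```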

## Using Tafrigh

### Available Options

```
➜ tafrigh --help
usage: tafrigh [-h] [--version] [--skip_if_output_exist | --no-skip_if_output_exist] [--playlist_items PLAYLIST_ITEMS]
               [--download_retries DOWNLOAD_RETRIES] [--verbose | --no-verbose] [-m MODEL_NAME_OR_PATH] [-t {transcribe,translate}]
               [-l {af,am,ar,as,az,ba,be,bg,bn,bo,br,bs,ca,cs,cy,da,de,el,en,es,et,eu,fa,fi,fo,fr,gl,gu,ha,haw,he,hi,hr,ht,hu,hy,id,is,it,ja,jw,ka,kk,km,kn,ko,la,lb,ln,lo,lt,lv,mg,mi,mk,ml,mn,mr,ms,mt,my,ne,nl,nn,no,oc,pa,pl,ps,pt,ro,ru,sa,sd,si,sk,sl,sn,so,sq,sr,su,sv,sw,ta,te,tg,th,tk,tl,tr,tt,uk,ur,uz,vi,yi,yo,zh}]
               [--use_faster_whisper | --no-use_faster_whisper] [--beam_size BEAM_SIZE]
               [--ct2_compute_type {default,int8,int8_float16,int16,float16}]
               [-w WIT_CLIENT_ACCESS_TOKENS [WIT_CLIENT_ACCESS_TOKENS ...]] [--max_cutting_duration [1-17]]
               [--min_words_per_segment MIN_WORDS_PER_SEGMENT] [--save_files_before_compact | --no-save_files_before_compact]
               [--save_yt_dlp_responses | --no-save_yt_dlp_responses] [--output_sample OUTPUT_SAMPLE]
               [-f {all,txt,srt,vtt,csv,tsv,json,none} [{all,txt,srt,vtt,csv,tsv,json,none} ...]] [-o OUTPUT_DIR]
               urls_or_paths [urls_or_paths ...]

options:
  -h, --help            show this help message and exit
  --version             show program's version number and exit

Input:
  urls_or_paths         Video/Playlist URLs or local folder/file(s) to transcribe.
  --skip_if_output_exist, --no-skip_if_output_exist
                        Whether to skip generating the output if the output file already exists.
  --playlist_items PLAYLIST_ITEMS
                        Comma separated playlist_index of the items to download. You can specify a range using "[START]:[STOP][:STEP]".
  --download_retries DOWNLOAD_RETRIES
                        Number of retries for yt-dlp downloads that fail.
  --verbose, --no-verbose
                        Whether to print out the progress and debug messages.

Whisper:
  -m MODEL_NAME_OR_PATH, --model_name_or_path MODEL_NAME_OR_PATH
                        Name or path of the Whisper model to use.
  -t {transcribe,translate}, --task {transcribe,translate}
                        Whether to perform X->X speech recognition ('transcribe') or X->English translation ('translate').
  -l {af,am,ar,as,az,ba,be,bg,bn,bo,br,bs,ca,cs,cy,da,de,el,en,es,et,eu,fa,fi,fo,fr,gl,gu,ha,haw,he,hi,hr,ht,hu,hy,id,is,it,ja,jw,ka,kk,km,kn,ko,la,lb,ln,lo,lt,lv,mg,mi,mk,ml,mn,mr,ms,mt,my,ne,nl,nn,no,oc,pa,pl,ps,pt,ro,ru,sa,sd,si,sk,sl,sn,so,sq,sr,su,sv,sw,ta,te,tg,th,tk,tl,tr,tt,uk,ur,uz,vi,yi,yo,zh}, --language {af,am,ar,as,az,ba,be,bg,bn,bo,br,bs,ca,cs,cy,da,de,el,en,es,et,eu,fa,fi,fo,fr,gl,gu,ha,haw,he,hi,hr,ht,hu,hy,id,is,it,ja,jw,ka,kk,km,kn,ko,la,lb,ln,lo,lt,lv,mg,mi,mk,ml,mn,mr,ms,mt,my,ne,nl,nn,no,oc,pa,pl,ps,pt,ro,ru,sa,sd,si,sk,sl,sn,so,sq,sr,su,sv,sw,ta,te,tg,th,tk,tl,tr,tt,uk,ur,uz,vi,yi,yo,zh}
                        Language spoken in the audio, skip to perform language detection.
  --use_faster_whisper, --no-use_faster_whisper
                        Whether to use Faster Whisper implementation.
  --beam_size BEAM_SIZE
                        Number of beams in beam search, only applicable when temperature is zero.
  --ct2_compute_type {default,int8,int8_float16,int16,float16}
                        Quantization type applied while converting the model to CTranslate2 format.

Wit:
  -w WIT_CLIENT_ACCESS_TOKENS [WIT_CLIENT_ACCESS_TOKENS ...], --wit_client_access_tokens WIT_CLIENT_ACCESS_TOKENS [WIT_CLIENT_ACCESS_TOKENS ...]
                        List of wit.ai client access tokens. If provided, wit.ai APIs will be used to do the transcription, otherwise
                        whisper will be used.
  --max_cutting_duration [1-17]
                        The maximum allowed cutting duration. It should be between 1 and 17.

Output:
  --min_words_per_segment MIN_WORDS_PER_SEGMENT
                        The minimum number of words should appear in each transcript segment. Any segment have words count less than
                        this threshold will be merged with the next one. Pass 0 to disable this behavior.
  --save_files_before_compact, --no-save_files_before_compact
                        Saves the output files before applying the compact logic that is based on --min_words_per_segment.
  --save_yt_dlp_responses, --no-save_yt_dlp_responses
                        Whether to save the yt-dlp library JSON responses or not.
  --output_sample OUTPUT_SAMPLE
                        Samples random compacted segments from the output and generates a CSV file contains the sampled data. Pass 0 to
                        disable this behavior.
  -f {all,txt,srt,vtt,csv,tsv,json,none} [{all,txt,srt,vtt,csv,tsv,json,none} ...], --output_formats {all,txt,srt,vtt,csv,tsv,json,none} [{all,txt,srt,vtt,csv,tsv,json,none} ...]
                        Format of the output file; if not specified, all available formats will be produced.
  -o OUTPUT_DIR, --output_dir OUTPUT_DIR
                        Directory to save the outputs.
```

### Transcription from the command line

#### Transcribing using Whisper models
##### Transcribing a single material

```bash
tafrigh "https://youtu.be/dDzxYcEJbgo" \
  --model_name_or_path small \
  --task transcribe \
  --language ar \
  --output_dir . \
  --output_formats txt srt
```
##### Transcribing a full playlist

```bash
tafrigh "https://youtube.com/playlist?list=PLyS-PHSxRDxsLnVsPrIwnsHMO5KgLz7T5" \
  --model_name_or_path small \
  --task transcribe \
  --language ar \
  --output_dir . \
  --output_formats txt srt
```
##### Transcribing multiple materials

```bash
tafrigh "https://youtu.be/4h5P7jXvW98" "https://youtu.be/jpfndVSROpw" \
  --model_name_or_path small \
  --task transcribe \
  --language ar \
  --output_dir . \
  --output_formats txt srt
```
##### Speeding up the transcription process

You can use the `faster_whisper` library, which provides faster transcription, by passing the `--use_faster_whisper` option as follows:

```bash
tafrigh "https://youtu.be/3K5Jh_-UYeA" \
  --model_name_or_path large \
  --task transcribe \
  --language ar \
  --use_faster_whisper \
  --output_dir . \
  --output_formats txt srt
```

#### Transcribing using wit.ai technology
##### Transcribing a single material

```bash
tafrigh "https://youtu.be/dDzxYcEJbgo" \
  --wit_client_access_tokens XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX \
  --output_dir . \
  --output_formats txt srt \
  --min_words_per_segment 10 \
  --max_cutting_duration 10
```
##### Transcribing a full playlist

```bash
tafrigh "https://youtube.com/playlist?list=PLyS-PHSxRDxsLnVsPrIwnsHMO5KgLz7T5" \
  --wit_client_access_tokens XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX \
  --output_dir . \
  --output_formats txt srt \
  --min_words_per_segment 10 \
  --max_cutting_duration 10
```
##### Transcribing multiple materials

```bash
tafrigh "https://youtu.be/4h5P7jXvW98" "https://youtu.be/jpfndVSROpw" \
  --wit_client_access_tokens XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX \
  --output_dir . \
  --output_formats txt srt \
  --min_words_per_segment 10 \
  --max_cutting_duration 10
```

### Transcribing using code

You can use Tafrigh through code as follows:

```python
from tafrigh import farrigh, Config

if __name__ == '__main__':
    config = Config(
        input=Config.Input(
            urls_or_paths=['https://youtu.be/qFsUwp5iomU'],
            skip_if_output_exist=False,
            playlist_items='',
            download_retries=3,
            verbose=False,
        ),
        whisper=Config.Whisper(
            model_name_or_path='tiny',
            task='transcribe',
            language='ar',
            use_faster_whisper=True,
            beam_size=5,
            ct2_compute_type='default',
        ),
        wit=Config.Wit(
            wit_client_access_tokens=[],
            max_cutting_duration=10,
        ),
        output=Config.Output(
            min_words_per_segment=10,
            save_files_before_compact=False,
            save_yt_dlp_responses=False,
            output_sample=0,
            output_formats=['txt', 'srt'],
            output_dir='.',
        ),
    )

    for progress in farrigh(config):
        print(progress)
```

The `farrigh` function is a generator that yields the current transcription state and the progress of the process. If you do not need to track this, you can drain the generator with `deque` as follows:

```python
from collections import deque

from tafrigh import farrigh, Config

if __name__ == '__main__':
    config = Config(...)

    deque(farrigh(config), maxlen=0)
```

### Transcribing using Docker

If you have Docker on your computer, the easiest way to use Tafrigh is through Docker. The following command pulls the Tafrigh Docker image and transcribes a YouTube video using wit.ai, writing the results to the current folder:

```bash
docker run -it --rm -v "$PWD:/tafrigh" ghcr.io/ieasybooks/tafrigh \
  "https://www.youtube.com/watch?v=qFsUwp5iomU" \
  --wit_client_access_tokens XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX \
  -f txt srt
```

You can pass any of the Tafrigh options mentioned above.
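For example, a sketch that combines the command above with the segment-merging option shown earlier (the token is a placeholder, as before):

```bash
docker run -it --rm -v "$PWD:/tafrigh" ghcr.io/ieasybooks/tafrigh \
  "https://www.youtube.com/watch?v=qFsUwp5iomU" \
  --wit_client_access_tokens XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX \
  --min_words_per_segment 10 \
  -f txt srt
```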

There are multiple Docker images you can use for Tafrigh, depending on the dependencies you want to include.

One drawback is that Whisper models cannot use your computer's GPU when run through Docker, which is something we are working on resolving in the future.

A significant part of this project is based on the yt-whisper repository, which made it possible to build Tafrigh faster.

diff --git a/README.md b/README.md
index 0c9d0c0..07f49be 100644
--- a/README.md
+++ b/README.md
@@ -13,6 +13,13 @@
+[![ar](https://img.shields.io/badge/lang-ar-brightgreen.svg)](README.md)
+[![en](https://img.shields.io/badge/lang-en-red.svg)](README.en.md)
+
 Tafrigh

 Transcribing visual or audio materials into text.
@@ -86,6 +93,7 @@
   • medium
   • large-v1
   • large-v2
+  • large-v3
   • large (highest accuracy)
   • The name of a Whisper model available on the HuggingFace Hub
   • The path of a previously downloaded Whisper model
@@ -150,12 +158,14 @@

 ```
 ➜ tafrigh --help
-usage: tafrigh [-h] [--version] [--skip_if_output_exist | --no-skip_if_output_exist] [--playlist_items PLAYLIST_ITEMS] [--verbose | --no-verbose] [-m MODEL_NAME_OR_PATH]
-               [-t {transcribe,translate}]
+usage: tafrigh [-h] [--version] [--skip_if_output_exist | --no-skip_if_output_exist] [--playlist_items PLAYLIST_ITEMS]
+               [--download_retries DOWNLOAD_RETRIES] [--verbose | --no-verbose] [-m MODEL_NAME_OR_PATH] [-t {transcribe,translate}]
                [-l {af,am,ar,as,az,ba,be,bg,bn,bo,br,bs,ca,cs,cy,da,de,el,en,es,et,eu,fa,fi,fo,fr,gl,gu,ha,haw,he,hi,hr,ht,hu,hy,id,is,it,ja,jw,ka,kk,km,kn,ko,la,lb,ln,lo,lt,lv,mg,mi,mk,ml,mn,mr,ms,mt,my,ne,nl,nn,no,oc,pa,pl,ps,pt,ro,ru,sa,sd,si,sk,sl,sn,so,sq,sr,su,sv,sw,ta,te,tg,th,tk,tl,tr,tt,uk,ur,uz,vi,yi,yo,zh}]
-               [--use_faster_whisper | --no-use_faster_whisper] [--beam_size BEAM_SIZE] [--ct2_compute_type {default,int8,int8_float16,int16,float16}]
-               [-w WIT_CLIENT_ACCESS_TOKENS [WIT_CLIENT_ACCESS_TOKENS ...]] [--max_cutting_duration [1-17]] [--min_words_per_segment MIN_WORDS_PER_SEGMENT]
-               [--save_files_before_compact | --no-save_files_before_compact] [--save_yt_dlp_responses | --no-save_yt_dlp_responses] [--output_sample OUTPUT_SAMPLE]
+               [--use_faster_whisper | --no-use_faster_whisper] [--beam_size BEAM_SIZE]
+               [--ct2_compute_type {default,int8,int8_float16,int16,float16}]
+               [-w WIT_CLIENT_ACCESS_TOKENS [WIT_CLIENT_ACCESS_TOKENS ...]] [--max_cutting_duration [1-17]]
+               [--min_words_per_segment MIN_WORDS_PER_SEGMENT] [--save_files_before_compact | --no-save_files_before_compact]
+               [--save_yt_dlp_responses | --no-save_yt_dlp_responses] [--output_sample OUTPUT_SAMPLE]
                [-f {all,txt,srt,vtt,csv,tsv,json,none} [{all,txt,srt,vtt,csv,tsv,json,none} ...]] [-o OUTPUT_DIR]
                urls_or_paths [urls_or_paths ...]
@@ -169,6 +179,8 @@ Input:
                         Whether to skip generating the output if the output file already exists.
   --playlist_items PLAYLIST_ITEMS
                         Comma separated playlist_index of the items to download. You can specify a range using "[START]:[STOP][:STEP]".
+  --download_retries DOWNLOAD_RETRIES
+                        Number of retries for yt-dlp downloads that fail.
   --verbose, --no-verbose
                         Whether to print out the progress and debug messages.
@@ -188,20 +200,22 @@ Whisper:
 Wit:
   -w WIT_CLIENT_ACCESS_TOKENS [WIT_CLIENT_ACCESS_TOKENS ...], --wit_client_access_tokens WIT_CLIENT_ACCESS_TOKENS [WIT_CLIENT_ACCESS_TOKENS ...]
-                        List of wit.ai client access tokens. If provided, wit.ai APIs will be used to do the transcription, otherwise whisper will be used.
+                        List of wit.ai client access tokens. If provided, wit.ai APIs will be used to do the transcription, otherwise
+                        whisper will be used.
   --max_cutting_duration [1-17]
                         The maximum allowed cutting duration. It should be between 1 and 17.

 Output:
   --min_words_per_segment MIN_WORDS_PER_SEGMENT
-                        The minimum number of words should appear in each transcript segment. Any segment have words count less than this threshold will be merged with the next one.
-                        Pass 0 to disable this behavior.
+                        The minimum number of words should appear in each transcript segment. Any segment have words count less than
+                        this threshold will be merged with the next one. Pass 0 to disable this behavior.
   --save_files_before_compact, --no-save_files_before_compact
                         Saves the output files before applying the compact logic that is based on --min_words_per_segment.
   --save_yt_dlp_responses, --no-save_yt_dlp_responses
                         Whether to save the yt-dlp library JSON responses or not.
   --output_sample OUTPUT_SAMPLE
-                        Samples random compacted segments from the output and generates a CSV file contains the sampled data. Pass 0 to disable this behavior.
+                        Samples random compacted segments from the output and generates a CSV file contains the sampled data. Pass 0 to
+                        disable this behavior.
   -f {all,txt,srt,vtt,csv,tsv,json,none} [{all,txt,srt,vtt,csv,tsv,json,none} ...], --output_formats {all,txt,srt,vtt,csv,tsv,json,none} [{all,txt,srt,vtt,csv,tsv,json,none} ...]
                         Format of the output file; if not specified, all available formats will be produced.
   -o OUTPUT_DIR, --output_dir OUTPUT_DIR