From 1758b8493f6ad006e215daac2ae3deef5f2b6f0c Mon Sep 17 00:00:00 2001 From: Ali Hamdi Ali Fadel Date: Fri, 25 Aug 2023 15:38:37 +0300 Subject: [PATCH 1/3] Add CSV and TSV file formats --- README.md | 11 +++++++---- tafrigh/types/transcript_type.py | 2 ++ tafrigh/writer.py | 20 +++++++++++++++++++- 3 files changed, 28 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 0e1a83d..e328bf1 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@
  • تفريغ المواد المرئي والمسموع إلى نصوص باستخدام أحدث تقنيات الذكاء الاصطناعي المقدمة من شركة OpenAI
  • إمكانية تفريغ المواد باستخدام تقنيات wit.ai المقدمة من شركة Facebook
  • تحميل المحتوى المرئي بشكل مباشر من منصة YouTube سواءً كان المستهدف مادة واحدة أو قائمة تشغيل كاملة
  • -
  • توفير صيَغ مخرجات مختلفة كـ txt و srt و vtt و json
  • +
  • توفير صيَغ مخرجات مختلفة كـ txt و srt و vtt و csv و tsv و json
  • متطلبات الاستخدام

    @@ -134,6 +134,8 @@
  • txt
  • srt
  • vtt
  • +
  • csv
  • +
  • tsv
  • json
  • all (الاختيار الإفتراضي)
  • none (لن يتم إنشاء ملف في حال تمرير هذه الصيغة)
  • @@ -146,15 +148,16 @@ ``` ➜ tafrigh --help -usage: tafrigh [-h] [--skip_if_output_exist | --no-skip_if_output_exist] [--playlist_items PLAYLIST_ITEMS] [--verbose | --no-verbose] [-m MODEL_NAME_OR_PATH] [-t {transcribe,translate}] +usage: tafrigh [-h] [--version] [--skip_if_output_exist | --no-skip_if_output_exist] [--playlist_items PLAYLIST_ITEMS] [--verbose | --no-verbose] [-m MODEL_NAME_OR_PATH] [-t {transcribe,translate}] [-l {af,am,ar,as,az,ba,be,bg,bn,bo,br,bs,ca,cs,cy,da,de,el,en,es,et,eu,fa,fi,fo,fr,gl,gu,ha,haw,he,hi,hr,ht,hu,hy,id,is,it,ja,jw,ka,kk,km,kn,ko,la,lb,ln,lo,lt,lv,mg,mi,mk,ml,mn,mr,ms,mt,my,ne,nl,nn,no,oc,pa,pl,ps,pt,ro,ru,sa,sd,si,sk,sl,sn,so,sq,sr,su,sv,sw,ta,te,tg,th,tk,tl,tr,tt,uk,ur,uz,vi,yi,yo,zh}] [--use_faster_whisper | --no-use_faster_whisper] [--beam_size BEAM_SIZE] [--ct2_compute_type {default,int8,int8_float16,int16,float16}] [-w WIT_CLIENT_ACCESS_TOKENS [WIT_CLIENT_ACCESS_TOKENS ...]] [--max_cutting_duration [1-17]] [--min_words_per_segment MIN_WORDS_PER_SEGMENT] [--save_files_before_compact | --no-save_files_before_compact] [--save_yt_dlp_responses | --no-save_yt_dlp_responses] - [--output_sample OUTPUT_SAMPLE] [-f {all,txt,srt,vtt,json,none} [{all,txt,srt,vtt,json,none} ...]] [-o OUTPUT_DIR] + [--output_sample OUTPUT_SAMPLE] [-f {all,txt,srt,vtt,csv,tsv,json,none} [{all,txt,srt,vtt,csv,tsv,json,none} ...]] [-o OUTPUT_DIR] urls_or_paths [urls_or_paths ...] options: -h, --help show this help message and exit + --version show program's version number and exit Input: urls_or_paths Video/Playlist URLs or local folder/file(s) to transcribe. @@ -194,7 +197,7 @@ Output: Whether to save the yt-dlp library JSON responses or not. (default: False) --output_sample OUTPUT_SAMPLE Samples random compacted segments from the output and generates a CSV file contains the sampled data. Pass 0 to disable this behavior. - -f {all,txt,srt,vtt,json,none} [{all,txt,srt,vtt,json,none} ...], --output_formats {all,txt,srt,vtt,json,none} [{all,txt,srt,vtt,json,none} ...] + -f {all,txt,srt,vtt,csv,tsv,json,none} [{all,txt,srt,vtt,csv,tsv,json,none} ...], --output_formats {all,txt,srt,vtt,csv,tsv,json,none} [{all,txt,srt,vtt,csv,tsv,json,none} ...] Format of the output file; if not specified, all available formats will be produced. -o OUTPUT_DIR, --output_dir OUTPUT_DIR Directory to save the outputs. diff --git a/tafrigh/types/transcript_type.py b/tafrigh/types/transcript_type.py index e05c514..3cfa105 100644 --- a/tafrigh/types/transcript_type.py +++ b/tafrigh/types/transcript_type.py @@ -6,6 +6,8 @@ class TranscriptType(Enum): TXT = 'txt' SRT = 'srt' VTT = 'vtt' + CSV = 'csv' + TSV = 'tsv' JSON = 'json' NONE = 'none' diff --git a/tafrigh/writer.py b/tafrigh/writer.py index 71fe43f..ab5d56d 100644 --- a/tafrigh/writer.py +++ b/tafrigh/writer.py @@ -1,3 +1,4 @@ +import csv import json import os @@ -46,6 +47,10 @@ def write( self.write_srt(file_path, segments) elif format == TranscriptType.VTT: self.write_vtt(file_path, segments) + elif format == TranscriptType.CSV: + self.write_csv(file_path, segments) + elif format == TranscriptType.TSV: + self.write_csv(file_path, segments, '\t') elif format == TranscriptType.JSON: self.write_json(file_path, segments) @@ -70,12 +75,25 @@ def write_vtt( ) -> None: self._write_to_file(file_path, self.generate_vtt(segments)) + def write_csv( + self, + file_path: str, + segments: list[dict[str, Union[str, float]]], + delimiter=',', + ) -> None: + with open(file_path, 'w', encoding='utf-8') as fp: + writer = csv.writer(fp, delimiter=delimiter) + writer.writerow(['text', 'start', 'end']) + + for segment in segments: + writer.writerow([segment['text'], segment['start'], segment['end']]) + def write_json( self, file_path: str, segments: list[dict[str, Union[str, float]]], ) -> None: - with open(file_path, 'w') as fp: + with open(file_path, 'w', encoding='utf-8') as fp: json.dump(segments, fp, ensure_ascii=False, indent=2) def generate_txt(self, segments: list[dict[str, Union[str, float]]]) -> str: From a98c8aa50c2a3f1e1db3eefbeb698aae5c8eb94b Mon Sep 17 00:00:00 2001 From: Ali Hamdi Ali Fadel Date: Fri, 25 Aug 2023 15:45:42 +0300 Subject: [PATCH 2/3] Increase the package version --- colab_notebook.ipynb | 3 +-- pyproject.toml | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/colab_notebook.ipynb b/colab_notebook.ipynb index 50b05e1..bd9b9b2 100644 --- a/colab_notebook.ipynb +++ b/colab_notebook.ipynb @@ -69,8 +69,7 @@ "print('جارٍ تجهيز بيئة العمل.')\n", "\n", "# Setup Tafrigh.\n", - "%pip uninstall -y tafrigh\n", - "%pip install -U tafrigh[wit,whisper]==1.0.1 -qqq\n", + "%pip install -U tafrigh[wit,whisper]==1.1.0 > install_logs.txt\n", "\n", "# Get inputs.\n", "\n", diff --git a/pyproject.toml b/pyproject.toml index 28a192c..26fb339 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,8 +4,8 @@ build-backend = "setuptools.build_meta" [project] name = "tafrigh" -version = "1.0.1" -description = "تفريغ النصوص وإنشاء ملفات SRT و VTT باستخدام نماذج Whisper وتقنية OpenAI." +version = "1.1.0" +description = "تفريغ النصوص وإنشاء ملفات SRT و VTT باستخدام نماذج Whisper وتقنية wit.ai." readme = "README.md" license = { file = "LICENSE" } requires-python = ">=3.9" From 90aa3925f3cf506296431ec4b25c7fa593267dd3 Mon Sep 17 00:00:00 2001 From: Ali Hamdi Ali Fadel Date: Fri, 25 Aug 2023 16:27:30 +0300 Subject: [PATCH 3/3] Use DictWriter instead of plain loop --- tafrigh/writer.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/tafrigh/writer.py b/tafrigh/writer.py index ab5d56d..47ed889 100644 --- a/tafrigh/writer.py +++ b/tafrigh/writer.py @@ -82,11 +82,9 @@ def write_csv( delimiter=',', ) -> None: with open(file_path, 'w', encoding='utf-8') as fp: - writer = csv.writer(fp, delimiter=delimiter) - writer.writerow(['text', 'start', 'end']) - - for segment in segments: - writer.writerow([segment['text'], segment['start'], segment['end']]) + writer = csv.DictWriter(fp, fieldnames=['text', 'start', 'end'], delimiter=delimiter) + writer.writeheader() + writer.writerows(segments) def write_json( self,