From 1758b8493f6ad006e215daac2ae3deef5f2b6f0c Mon Sep 17 00:00:00 2001 From: Ali Hamdi Ali Fadel Date: Fri, 25 Aug 2023 15:38:37 +0300 Subject: [PATCH] Add CSV and TSV file formats --- README.md | 11 +++++++---- tafrigh/types/transcript_type.py | 2 ++ tafrigh/writer.py | 20 +++++++++++++++++++- 3 files changed, 28 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 0e1a83d..e328bf1 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@
  • تفريغ المواد المرئية والمسموعة إلى نصوص باستخدام أحدث تقنيات الذكاء الاصطناعي المقدمة من شركة OpenAI
  • إمكانية تفريغ المواد باستخدام تقنيات wit.ai المقدمة من شركة Facebook
  • تحميل المحتوى المرئي بشكل مباشر من منصة YouTube سواءً كان المستهدف مادة واحدة أو قائمة تشغيل كاملة
  • -
  • توفير صيَغ مخرجات مختلفة كـ txt و srt و vtt و json
  • +
  • توفير صيَغ مخرجات مختلفة كـ txt و srt و vtt و csv و tsv و json
  • متطلبات الاستخدام

    @@ -134,6 +134,8 @@
  • txt
  • srt
  • vtt
  • +
  • csv
  • +
  • tsv
  • json
  • all (الاختيار الافتراضي)
  • none (لن يتم إنشاء ملف في حال تمرير هذه الصيغة)
  • @@ -146,15 +148,16 @@ ``` ➜ tafrigh --help -usage: tafrigh [-h] [--skip_if_output_exist | --no-skip_if_output_exist] [--playlist_items PLAYLIST_ITEMS] [--verbose | --no-verbose] [-m MODEL_NAME_OR_PATH] [-t {transcribe,translate}] +usage: tafrigh [-h] [--version] [--skip_if_output_exist | --no-skip_if_output_exist] [--playlist_items PLAYLIST_ITEMS] [--verbose | --no-verbose] [-m MODEL_NAME_OR_PATH] [-t {transcribe,translate}] [-l {af,am,ar,as,az,ba,be,bg,bn,bo,br,bs,ca,cs,cy,da,de,el,en,es,et,eu,fa,fi,fo,fr,gl,gu,ha,haw,he,hi,hr,ht,hu,hy,id,is,it,ja,jw,ka,kk,km,kn,ko,la,lb,ln,lo,lt,lv,mg,mi,mk,ml,mn,mr,ms,mt,my,ne,nl,nn,no,oc,pa,pl,ps,pt,ro,ru,sa,sd,si,sk,sl,sn,so,sq,sr,su,sv,sw,ta,te,tg,th,tk,tl,tr,tt,uk,ur,uz,vi,yi,yo,zh}] [--use_faster_whisper | --no-use_faster_whisper] [--beam_size BEAM_SIZE] [--ct2_compute_type {default,int8,int8_float16,int16,float16}] [-w WIT_CLIENT_ACCESS_TOKENS [WIT_CLIENT_ACCESS_TOKENS ...]] [--max_cutting_duration [1-17]] [--min_words_per_segment MIN_WORDS_PER_SEGMENT] [--save_files_before_compact | --no-save_files_before_compact] [--save_yt_dlp_responses | --no-save_yt_dlp_responses] - [--output_sample OUTPUT_SAMPLE] [-f {all,txt,srt,vtt,json,none} [{all,txt,srt,vtt,json,none} ...]] [-o OUTPUT_DIR] + [--output_sample OUTPUT_SAMPLE] [-f {all,txt,srt,vtt,csv,tsv,json,none} [{all,txt,srt,vtt,csv,tsv,json,none} ...]] [-o OUTPUT_DIR] urls_or_paths [urls_or_paths ...] options: -h, --help show this help message and exit + --version show program's version number and exit Input: urls_or_paths Video/Playlist URLs or local folder/file(s) to transcribe. @@ -194,7 +197,7 @@ Output: Whether to save the yt-dlp library JSON responses or not. (default: False) --output_sample OUTPUT_SAMPLE Samples random compacted segments from the output and generates a CSV file contains the sampled data. Pass 0 to disable this behavior. 
- -f {all,txt,srt,vtt,json,none} [{all,txt,srt,vtt,json,none} ...], --output_formats {all,txt,srt,vtt,json,none} [{all,txt,srt,vtt,json,none} ...] + -f {all,txt,srt,vtt,csv,tsv,json,none} [{all,txt,srt,vtt,csv,tsv,json,none} ...], --output_formats {all,txt,srt,vtt,csv,tsv,json,none} [{all,txt,srt,vtt,csv,tsv,json,none} ...] Format of the output file; if not specified, all available formats will be produced. -o OUTPUT_DIR, --output_dir OUTPUT_DIR Directory to save the outputs. diff --git a/tafrigh/types/transcript_type.py b/tafrigh/types/transcript_type.py index e05c514..3cfa105 100644 --- a/tafrigh/types/transcript_type.py +++ b/tafrigh/types/transcript_type.py @@ -6,6 +6,8 @@ class TranscriptType(Enum): TXT = 'txt' SRT = 'srt' VTT = 'vtt' + CSV = 'csv' + TSV = 'tsv' JSON = 'json' NONE = 'none' diff --git a/tafrigh/writer.py b/tafrigh/writer.py index 71fe43f..ab5d56d 100644 --- a/tafrigh/writer.py +++ b/tafrigh/writer.py @@ -1,3 +1,4 @@ +import csv import json import os @@ -46,6 +47,10 @@ def write( self.write_srt(file_path, segments) elif format == TranscriptType.VTT: self.write_vtt(file_path, segments) + elif format == TranscriptType.CSV: + self.write_csv(file_path, segments) + elif format == TranscriptType.TSV: + self.write_csv(file_path, segments, '\t') elif format == TranscriptType.JSON: self.write_json(file_path, segments) @@ -70,12 +75,25 @@ def write_vtt( ) -> None: self._write_to_file(file_path, self.generate_vtt(segments)) + def write_csv( + self, + file_path: str, + segments: list[dict[str, Union[str, float]]], + delimiter=',', + ) -> None: + with open(file_path, 'w', encoding='utf-8') as fp: + writer = csv.writer(fp, delimiter=delimiter) + writer.writerow(['text', 'start', 'end']) + + for segment in segments: + writer.writerow([segment['text'], segment['start'], segment['end']]) + def write_json( self, file_path: str, segments: list[dict[str, Union[str, float]]], ) -> None: - with open(file_path, 'w') as fp: + with open(file_path, 'w', encoding='utf-8') 
as fp: json.dump(segments, fp, ensure_ascii=False, indent=2) def generate_txt(self, segments: list[dict[str, Union[str, float]]]) -> str: