src/scripts/get_subtitles/get_subtitles.py

import os
import re
import sys
import copy
import json
import shutil
import ffmpeg
import logging
import warnings
import argparse
from yt_dlp import YoutubeDL
from write_subtitles_json import write_json
from argparse import ArgumentDefaultsHelpFormatter
from pyannote.audio.utils.reproducibility import ReproducibilityWarning

# Raise these third-party loggers to WARNING before whisperx is imported below.
# NOTE(review): presumably this hides INFO-level messages emitted while whisperx
# and its dependencies are loading - confirm before moving these lines.
logging.getLogger('speechbrain.utils.quirks').setLevel(logging.WARNING)
logging.getLogger('pytorch_lightning.utilities.migration.utils').setLevel(logging.WARNING)

import whisperx
from whisperx.utils import get_writer

def get_subtitles(urls: list, cookies_path: str, subs_path: str, prefer_autogenerated: bool, generate_missing: bool, whisper_model: str, whisper_compute_type: str, whisper_batch_size: int, whisper_device: str):
    """Download subtitles or audio for ``urls`` with yt-dlp, then transcribe any audio with WhisperX.

    When ``prefer_autogenerated`` is set, subtitles are requested from yt-dlp
    (converted to ``.lrc``); otherwise the audio is downloaded. If
    ``prefer_autogenerated`` and ``generate_missing`` are both set, the URLs
    logged to ``videos_with_no_subs.txt`` are downloaded as audio in a second
    pass. Every audio file found in ``subs_path`` afterwards is transcribed,
    aligned, written as ``<name>.en.lrc`` and moved to ``<subs_path>/processed``.

    Args:
        urls: URLs handed to ``YoutubeDL.download``.
        cookies_path: Passed to yt-dlp as ``cookiefile``.
        subs_path: Output directory (created if missing).
        prefer_autogenerated: Request subtitles from yt-dlp instead of audio.
        generate_missing: Also fetch audio for videos logged as having no subtitles.
        whisper_model: Model name passed to ``whisperx.load_model``.
        whisper_compute_type: Compute type passed to ``whisperx.load_model``.
        whisper_batch_size: Starting batch size for every file; lowered by one
            after each CUDA out-of-memory error.
        whisper_device: Device passed to WhisperX for transcription and alignment.
    """
    # Attempts per file that could not be remedied by lowering the batch size.
    max_failed_attempts = 3

    class UnavailableSubsLogger:
        """yt-dlp logger that echoes messages and records videos without subtitles or that are unavailable."""

        def __init__(self):
            self.log_files = ['videos_with_no_subs.txt', 'unavailable_videos.txt']
            self.no_subs_log_file = open('videos_with_no_subs.txt', 'a', encoding='utf-8')
            self.unavailable_log_file = open('unavailable_videos.txt', 'a', encoding='utf-8')
            self.no_subs_pattern = re.compile(r'^\[info\] There are no subtitles for the requested languages$')
            self.current_video_id = ''

        def debug(self, msg):
            if '[download]' in msg:
                # Progress lines are redrawn in place instead of scrolling.
                print(f'\r{msg}', end='', flush=True)
            else:
                print(msg, flush=True)
            # Remember which video is being processed so later messages can be attributed to it.
            if '[youtube] Extracting URL: https://www.youtube.com/watch?v=' in msg:
                self.current_video_id = msg.split('[youtube] Extracting URL: https://www.youtube.com/watch?v=')[1]
            if self.no_subs_pattern.search(msg):
                self.no_subs_log_file.write(f'https://www.youtube.com/watch?v={self.current_video_id}\n')

        def warning(self, msg):
            print(msg)

        def error(self, msg):
            print(msg, file=sys.stderr)
            # account for: [youtube] VQ4B5d_upTE: Requested format is not available. Use --list-formats for a list of available formats
            if 'Video unavailable.' in msg:
                self.unavailable_log_file.write(f'https://www.youtube.com/watch?v={self.current_video_id}\n')
                # 'downloaded.txt' is also the yt-dlp download archive configured below.
                with open('downloaded.txt', 'a', encoding='utf-8') as f:
                    f.write(f'youtube {self.current_video_id}\n')

        def close(self):
            self.no_subs_log_file.close()
            self.unavailable_log_file.close()
            # Do not leave empty log files behind.
            for log_file in self.log_files:
                if os.path.getsize(log_file) == 0:
                    os.remove(log_file)

    def run_download(ydl_args: dict, targets: list):
        """Run one yt-dlp pass with its own logger, closing the log files even on failure."""
        # close() closes the log files, so a logger instance must never be
        # reused by a later pass: writing to it would raise ValueError.
        pass_logger = UnavailableSubsLogger()
        ydl_args['logger'] = pass_logger
        try:
            with YoutubeDL(ydl_args) as ydl:
                ydl.download(targets)
        finally:
            pass_logger.close()

    common_args = {
        # 'playlist_items': '1-3',
        # 'cookiesfrombrowser': {'firefox'},
        'cookiefile': cookies_path,
        'ignoreerrors': True,
        'clean_infojson': True,
        'download_archive': 'downloaded.txt',
        'force_write_download_archive': True,
        'forcejson': True,
        'outtmpl': '%(title)s.%(ext)s',
        'paths': {'home': subs_path},
        'restrictfilenames': True,
        'simulate': False,
        'writeinfojson': True
    }

    audio_only_args = copy.deepcopy(common_args)
    audio_only_args.update({
        'format': 'bestaudio/best',
        # 'postprocessors': [{'key': 'FFmpegExtractAudio', 'preferredcodec': 'opus', 'preferredquality': '0'}]
    })

    subtitle_only_args = copy.deepcopy(common_args)
    subtitle_only_args.update({
        'postprocessors': [{'format': 'lrc', 'key': 'FFmpegSubtitlesConvertor', 'when': 'before_dl'}],
        'skip_download': True,
        'writeautomaticsub': True
    })

    os.makedirs(subs_path, exist_ok=True)

    run_download(subtitle_only_args if prefer_autogenerated else audio_only_args, urls)

    if prefer_autogenerated and generate_missing and os.path.exists('videos_with_no_subs.txt') and os.path.getsize('videos_with_no_subs.txt') > 0:
        with open('videos_with_no_subs.txt', 'r', encoding='utf-8') as f:
            videos_with_no_subs_urls = f.read().splitlines()
        run_download(audio_only_args, videos_with_no_subs_urls)

    audio_files = [filename for filename in os.listdir(subs_path) if filename.endswith(('.m4a', '.mp4', '.opus', '.webm'))]
    if not audio_files:
        return

    warnings.simplefilter('ignore', category=UserWarning)
    warnings.simplefilter('ignore', category=FutureWarning)
    warnings.simplefilter('ignore', category=ReproducibilityWarning)
    model = whisperx.load_model(whisper_model, device=whisper_device, compute_type=whisper_compute_type, language='en')
    model_a, metadata = whisperx.load_align_model(language_code='en', device=whisper_device)
    subtitle_writer = get_writer('srt', '.')
    processed_dir = os.path.join(subs_path, 'processed')
    os.makedirs(processed_dir, exist_ok=True)

    for filename in audio_files:
        audio_path = os.path.join(subs_path, filename)
        audio = whisperx.load_audio(audio_path)
        # Every file starts from the requested batch size again.
        batch_size = whisper_batch_size
        result = None
        failed_attempts = 0
        while result is None and failed_attempts < max_failed_attempts:
            try:
                print(f'Generating subtitles for {filename}')
                result = model.transcribe(audio, batch_size=batch_size, print_progress=True, combined_progress=False, verbose=False)
            except RuntimeError as e:
                print(f'Runtime Error: {e}')
                if 'CUDA failed with error out of memory' in str(e) and batch_size > 1:
                    batch_size -= 1
                    print(f'Lowering batch size to: {batch_size}')
                else:
                    # Not recoverable by shrinking the batch; count it so the loop terminates.
                    failed_attempts += 1
            except Exception as e:
                print(f'Error: {e}')
                failed_attempts += 1
                if failed_attempts < max_failed_attempts:
                    print('Retrying...')
        if result is None:
            # Leave the audio file in place so a later run can try again.
            print(f'Giving up on {filename} after {failed_attempts} failed attempts', file=sys.stderr)
            continue
        print(f'Aligning subtitles for {filename}')
        aligned_result = whisperx.align(result['segments'], model_a, metadata, audio, whisper_device, return_char_alignments=False)
        aligned_result['language'] = result['language'] # type: ignore
        # The writer produces ./output.srt, which ffmpeg then converts to .lrc.
        subtitle_writer(aligned_result, 'output.srt', {'max_line_width': None, 'max_line_count': None, 'highlight_words': False})
        ffmpeg.input('output.srt').output(os.path.join(subs_path, f'{os.path.splitext(filename)[0]}.en.lrc')).overwrite_output().run()
        shutil.move(audio_path, os.path.join(processed_dir, filename))

    # The intermediate file only exists if at least one file was transcribed.
    if os.path.exists('output.srt'):
        os.remove('output.srt')

def rename_files(subs_path: str):
    """Normalize the names of all entries in ``subs_path``.

    Strips channel/stream prefixes and ``_with_Chat`` / ``_vN`` suffixes,
    collapses runs of ``-``, ``+`` and ``_`` into one underscore, and rejoins
    possessives/contractions that were split by an underscore (``Mario_s`` ->
    ``Marios``, ``You_re`` -> ``Youre``). An existing entry with the target
    name is overwritten.

    Args:
        subs_path: Directory whose entries are renamed in place.
    """
    prefix_pattern = re.compile(r'^(?:Old_)?(?:Jerma(?:985)?(?:_s)?|Livestream)_(?!and|tries)(?:Full_Stream|Stream(?:s)?)?(?:_with_Chat)?[_-]*', flags=re.IGNORECASE)
    separator_pattern = re.compile(r'[-+_]+')
    suffix_pattern = re.compile(r'(?:_with_Chat)?(?:_v\d+)?(?=\.)', flags=re.IGNORECASE)
    # NOTE(review): the lookahead is satisfied by the '0' itself, so this matches
    # every '0_s'; presumably a look-behind for a digit was intended - confirm.
    year_pattern = re.compile(r'(?=\d)0_s')
    # '_' is a word character, so r'\b' never matches between a name and the
    # underscores that separate words in these file names. The look-arounds
    # below treat '_' as a boundary too ([^\W_] is "word character except '_'").
    _s_fix_pattern = re.compile(r'(?<![^\W_])(Assassin|Asura|Barker|Bugleberry|Cabela|Children|Clancy|Dante|Demon|Devil|Director|Dragon|Farmer|Freddy|Hawk|It|Jerma|Let|Ludlum|Mario|Marvel|Papa|Raven|Sanity|Shop|Spooky|STAR|Unknown|Who|Youtuber)_s(?![^\W_])', flags=re.IGNORECASE)

    for filename in os.listdir(subs_path):
        new_filename = prefix_pattern.sub('', filename)
        new_filename = suffix_pattern.sub('', new_filename)
        new_filename = separator_pattern.sub('_', new_filename)
        new_filename = year_pattern.sub('0s', new_filename)
        new_filename = _s_fix_pattern.sub(r'\1s', new_filename)
        new_filename = new_filename.replace('You_re', 'Youre')
        # Nothing to do, or the patterns consumed the whole name.
        if not new_filename or new_filename == filename:
            continue
        # os.replace overwrites an existing destination on every platform.
        os.replace(os.path.join(subs_path, filename), os.path.join(subs_path, new_filename))
        print(f'Renamed {filename} -> {new_filename}')

def fix_subtitles(subs_path: str):
    """Clean up every ``.lrc`` file in ``subs_path`` in place.

    Applies a fixed table of text replacements, removes timestamped lines that
    carry no text, strips whitespace between a timestamp and its text, and
    rewrites the file with ``\\n`` line endings.

    Args:
        subs_path: Directory containing the ``.lrc`` files to rewrite.
    """
    timestamp_pattern = re.compile(r'^(\[\d+:\d+(?:\.\d+)?\])\s*(.*)$')
    replacements = {
        '[Music]': '🎶',
        '[Applause]': '👏',
        '[\\h__\\h]': '🤬',
        'Jermak': 'Jerma',
        'Jermat': 'Jerma',
        'Jermaw': 'Jerma',
        'German985': 'Jerma985',
        'German 085': 'Jerma985',
        'Glooman': 'Glue Man',
        'Zerkoth': 'zerk off',
        'i': 'I',
        "i'm": "I'm",
        "i'll": "I'll",
        '—': '-',
    }

    # Compile once for all files. A word boundary is only required on a side
    # where the key itself starts/ends with a word character: r'\b' next to
    # punctuation such as '[' or '—' demands an adjacent word character, which
    # would stop keys like '[Music]' from matching ordinary subtitle text.
    compiled_replacements = []
    for key, value in replacements.items():
        left = r'\b' if re.match(r'\w', key) else ''
        right = r'\b' if re.search(r'\w\Z', key) else ''
        compiled_replacements.append((re.compile(f'{left}{re.escape(key)}{right}'), value))

    for filename in os.listdir(subs_path):
        if not filename.endswith('.lrc'):
            continue
        file_path = os.path.join(subs_path, filename)
        with open(file_path, 'r', encoding='utf-8') as f:
            file_content = f.read()
        # Replacements run in table order over the whole file content.
        for pattern, value in compiled_replacements:
            file_content = pattern.sub(value, file_content)

        new_lines = []
        for line in file_content.splitlines():
            match = timestamp_pattern.match(line.strip())
            if not match:
                # Lines without a leading timestamp are kept as they are.
                new_lines.append(line.rstrip('\r\n') + '\n')
                continue
            text_after_ts = match.group(2).strip()
            if text_after_ts:
                new_lines.append(f'{match.group(1)}{text_after_ts}\n')
            # Timestamped lines without any text are dropped.
        with open(file_path, 'w', encoding='utf-8', newline='\n') as f:
            f.writelines(new_lines)

# Command-line entry point: download/generate subtitles, then rename the files,
# clean up the .lrc contents and write the JSON file.
if __name__ == '__main__':
    parser = argparse.ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter, description='Downloads/generates subtitles for a set of URLs in .lrc format using yt-dlp, WhisperX and ffmpeg. Requires ffmpeg to be installed and available on path for subtitle format conversion. Requires cuBLAS and cuDNN for WhisperX GPU execution (see: https://github.com/m-bain/whisperX#setup-%EF%B8%8F)')
    parser.add_argument('--urls', type=str, nargs='+', default=[
        # 'https://youtube.com/channel/UCK3kaNXbB57CLcyhtccV_yw',                       # Jerma985 channel
        'https://youtube.com/playlist?list=PLd4kmFVnghOiWHL8EStIzMXwySWm-7K1f',         # Jerma Streams in Chronological Order
        'https://youtube.com/playlist?list=PLLZVyAYonIO5Wlrf5-i8iXKmWYl6h0yx9',         # More Jerma content, by me (create an issue if you'd like me to add videos)
        'https://youtube.com/playlist?list=PLiwPsPm4ewgv5dPbkSRBzjJ4SjAOXXxZH',         # Jerma animations by Corax
        'https://youtube.com/playlist?list=PL9wV0Fmo7N_g5X64TzZUY-eQyoZ4xvAf2',         # JermAnimated playlist on 2ndJerma
        # 'https://archive.org/download/jerma-streams-public-domain-movie-night/PUBLIC%20DOMAIN%20MOVIE%20NIGHT%20PART%201.mp4'
    ], help='List of URLs to obtain subtitles for.')
    parser.add_argument('--cookies-path', type=str, default='cookies.txt', help='Netscape formatted file to read cookies from, can be obtained with: yt-dlp --cookies-from-browser firefox --cookies cookies.txt')
    parser.add_argument('--generate-missing', action='store_true', default=True, help='Generate subtitles for videos with no autogenerated subtitles. Only works if --prefer-autogenerated is set.')
    # --generate-missing defaults to True, so a store_true flag alone can never
    # switch it off; this companion flag writes False to the same destination.
    # default=SUPPRESS keeps the default owned by --generate-missing and out of this help line.
    parser.add_argument('--no-generate-missing', dest='generate_missing', action='store_false', default=argparse.SUPPRESS, help='Do not generate subtitles for videos with no autogenerated subtitles.')
    parser.add_argument('--json-path', type=str, default='../../assets/Subtitles.json', help='Path to save JSON file to.')
    parser.add_argument('--prefer-autogenerated', action='store_true', default=False, help='Prefer autogenerated subtitles over doing transcription.')
    parser.add_argument('--subs-path', type=str, default='../../assets/subtitles', help='Path to save files to.')
    parser.add_argument('--whisper-batch-size', type=int, default=8, help='Batch size to use for WhisperX transcription. Lower this if you run out of VRAM.')
    parser.add_argument('--whisper-compute-type', type=str, default='float16', help='Compute type to use for WhisperX transcription (int8/float16/float32).')
    parser.add_argument('--whisper-device', type=str, default='cuda', help='Whether to perform transcription on GPU (CUDA) or CPU.')
    parser.add_argument('--whisper-model', type=str, default='large-v3', help='Whisper model to use for transcription (see: https://github.com/openai/whisper#available-models-and-languages)')
    args = parser.parse_args()

    get_subtitles(args.urls, args.cookies_path, args.subs_path, args.prefer_autogenerated, args.generate_missing, args.whisper_model, args.whisper_compute_type, args.whisper_batch_size, args.whisper_device)
    print('\nRenaming files...')
    rename_files(args.subs_path)
    print('\nFixing subtitles...')
    fix_subtitles(args.subs_path)
    print('\nWriting JSON...')
    write_json(args.subs_path, args.json_path)