diff --git a/README.md b/README.md index 7dcb567..dad1b10 100644 --- a/README.md +++ b/README.md @@ -18,11 +18,16 @@ Import stuff ```python import os + +# SRT File from srtranslator import SrtFile +# ASS File +from srtranslator import AssFile + from srtranslator.translators.deepl_api import DeeplApi from srtranslator.translators.deepl_scrap import DeeplTranslator from srtranslator.translators.translatepy import TranslatePy -from srtranslator.translators.pydeeplx import DeepLX +from srtranslator.translators.pydeeplx import PyDeepLX ``` Initialize translator. It can be any translator, even your own, check the docs, there are instructions per translator and how to create your own. @@ -35,13 +40,19 @@ Load, translate and save. For multiple recursive files in folder, check `example ```python filepath = "./filepath/to/srt" -srt = SrtFile(filepath) -srt.translate(translator, "en", "es") + +# SRT File +sub = SrtFile(filepath) +# ASS File +sub = AssFile(filepath) + +# Translate +sub.translate(translator, "en", "es") # Making the result subtitles prettier -srt.wrap_lines() +sub.wrap_lines() -srt.save(f"{os.path.splitext(filepath)[0]}_translated.srt") +sub.save(f"{os.path.splitext(filepath)[0]}_translated.srt") ``` Quit translator @@ -57,15 +68,19 @@ translator.quit() ## Usage command line ```bash +# SRT file python -m srtranslator ./filepath/to/srt -i SRC_LANG -o DEST_LANG + +# ASS file +python -m srtranslator ./filepath/to/ass -i SRC_LANG -o DEST_LANG ``` ## Advanced usage ``` -usage: __main__.py [-h] [-i SRC_LANG] [-o DEST_LANG] [-v] [-vv] [-s] [-w WRAP_LIMIT] [-t {deepl-scrap,translatepy,deepl-api}] [--auth AUTH] path +usage: __main__.py [-h] [-i SRC_LANG] [-o DEST_LANG] [-v] [-vv] [-s] [-w WRAP_LIMIT] [-t {deepl-scrap,translatepy,deepl-api,pydeeplx}] [--auth AUTH] [--proxies] path -Translate an .STR file +Translate an .STR and .ASS file positional arguments: path File to translate @@ -81,7 +96,8 @@ options: -s, --show-browser Show browser window -w WRAP_LIMIT, --wrap-limit 
WRAP_LIMIT Number of characters -including spaces- to wrap a line of text. Default: 50 - -t {deepl-scrap,translatepy,deepl-api}, --translator {deepl-scrap,translatepy,deepl-api} + -t {deepl-scrap,translatepy,deepl-api,pydeeplx}, --translator {deepl-scrap,translatepy,deepl-api,pydeeplx} Built-in translator to use --auth AUTH Api key if needed on translator + --proxies Use proxy by default for pydeeplx ``` diff --git a/dev-requirements.txt b/dev-requirements.txt index 3f4c863..a2a0c4b 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -21,7 +21,9 @@ mypy-extensions==0.4.3 outcome==1.2.0 pathspec==0.10.3 platformdirs==2.6.2 +pyass==0.1.4 pycparser==2.21 +PyDeepLX==1.0.4 PySocks==1.7.1 python-editor==1.0.4 pyuseragents==1.0.5 diff --git a/requirements.txt b/requirements.txt index 1ef7abe..5939a30 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,6 +16,7 @@ inquirer==3.1.1 jinxed==1.2.0 lxml==4.9.3 outcome==1.2.0 +pyass==0.1.4 pycparser==2.21 PyDeepLX==1.0.4 PySocks==1.7.1 diff --git a/srtranslator/__main__.py b/srtranslator/__main__.py index 931b00e..3ac2fca 100644 --- a/srtranslator/__main__.py +++ b/srtranslator/__main__.py @@ -4,12 +4,13 @@ import traceback from .srt_file import SrtFile +from .ass_file import AssFile from .translators.deepl_api import DeeplApi from .translators.deepl_scrap import DeeplTranslator from .translators.translatepy import TranslatePy -from .translators.pydeeplx import DeepLX +from .translators.pydeeplx import PyDeepLX -parser = argparse.ArgumentParser(description="Translate an .STR file") +parser = argparse.ArgumentParser(description="Translate an .STR and .ASS file") parser.add_argument( "filepath", @@ -72,7 +73,7 @@ "-t", "--translator", type=str, - choices=["deepl-scrap", "translatepy", "deepl-api", "deeplx"], + choices=["deepl-scrap", "translatepy", "deepl-api", "pydeeplx"], help="Built-in translator to use", default="deepl-scrap", ) @@ -83,11 +84,17 @@ help="Api key if needed on translator", ) +parser.add_argument( + 
"--proxies", + action="store_true", + help="Use proxy by default for pydeeplx", +) + builtin_translators = { "deepl-scrap": DeeplTranslator, "deepl-api": DeeplApi, "translatepy": TranslatePy, - "deeplx": DeepLX, + "pydeeplx": PyDeepLX, } args = parser.parse_args() @@ -104,17 +111,23 @@ translator_args = {} if args.auth: translator_args["api_key"] = args.auth +if args.proxies: + translator_args["proxies"] = args.proxies translator = builtin_translators[args.translator](**translator_args) -srt = SrtFile(args.filepath) +try: + sub = AssFile(args.filepath) +except AttributeError: + print("... Exception while loading as ASS try as SRT") + sub = SrtFile(args.filepath) try: - srt.translate(translator, args.src_lang, args.dest_lang) - srt.wrap_lines(args.wrap_limit) - srt.save(f"{os.path.splitext(args.filepath)[0]}_{args.dest_lang}.srt") + sub.translate(translator, args.src_lang, args.dest_lang) + sub.wrap_lines(args.wrap_limit) + sub.save(f"{os.path.splitext(args.filepath)[0]}_{args.dest_lang}{os.path.splitext(args.filepath)[1]}") except: - srt.save_backup() + sub.save_backup() traceback.print_exc() translator.quit() diff --git a/srtranslator/ass_file.py b/srtranslator/ass_file.py new file mode 100644 index 0000000..c32dece --- /dev/null +++ b/srtranslator/ass_file.py @@ -0,0 +1,207 @@ +import os +import re +import pyass + +from typing import List, Generator + +from .translators.base import Translator + + +class AssFile: + """ASS file class abstraction + + Args: + filepath (str): file path of ass + """ + + def __init__(self, filepath: str) -> None: + self.filepath = filepath + self.backup_file = f"{self.filepath}.tmp" + self.subtitles = [] + self.start_from = 0 + self.current_subtitle = 0 + self.text_styles = [] + + print(f"Loading {filepath} as ASS") + with open(filepath, "r", encoding="utf-8", errors="ignore") as input_file: + self.subtitles = self.load_from_file(input_file) + + self._load_backup() + + def _load_backup(self): + if not os.path.exists(self.backup_file): + 
return + + print(f"Backup file found = {self.backup_file}") + with open( + self.backup_file, "r", encoding="utf-8", errors="ignore" + ) as input_file: + subtitles = self.load_from_file(input_file) + + self.start_from = len(subtitles.events) + self.current_subtitle = self.start_from + print(f"Starting from subtitle {self.start_from}") + self.subtitles.events = [ + *subtitles.events, + *self.subtitles.events[self.start_from :], + ] + + def load_from_file(self, input_file): + ass_file = pyass.load(input_file) + ass_file.events = sorted(ass_file.events, key=lambda e: (e.start)) + return self._clean_subs_content(ass_file) + + def _get_next_chunk(self, chunk_size: int = 4500) -> Generator: + """Get a portion of the subtitles at the time based on the chunk size + + Args: + chunk_size (int, optional): Maximum number of letter in text chunk. Defaults to 4500. + + Yields: + Generator: Each chunk at the time + """ + portion = [] + + for subtitle in self.subtitles.events[self.start_from :]: + # Manage ASS styles for subtitle before add it to the portion + # Extract a list of styles + # Replace the styles by | + + # Each style starts with { and end with } + # If we have an "}" then we can split and keep the part on the left and keep it in our list + for i in subtitle.text.split("{"): + if "}" in i: + self.text_styles.append("{" + i.split("}")[0] + "}") + + subtitle.text = re.sub(r"{.*?}", r"|", subtitle.text) + + # Calculate new chunk size if subtitle content is added to actual chunk + n_char = ( + sum(len(sub.text) for sub in portion) # All subtitles in chunk + + len(subtitle.text) # New subtitle + + len(portion) # Break lines in chunk + + 1 # New breakline + ) + + # If chunk goes beyond the limit, yield it + if n_char >= chunk_size and len(portion) != 0: + yield portion + portion = [] + + # Put subtitle content in chunk + portion.append(subtitle) + + # Yield last chunk + yield portion + + def _clean_subs_content(self, subtitles): + """Cleans subtitles content and delete line 
breaks + + Args: + subtitles List of subtitles + + Returns: + Same list of subtitles, but cleaned + """ + cleanr = re.compile("<.*?>") + + for sub in subtitles.events: + sub.text = cleanr.sub("", sub.text) + # No real equivalent in ASS + #sub.text = srt.make_legal_content(sub.content) + sub.text = sub.text.strip() + + if sub.text == "": + sub.text = "..." + + if all(sentence.startswith("-") for sentence in sub.text.split("\n")): + sub.text = sub.text.replace("\n", "////") + continue + + # It looks like \N is removed by the translation so we replace them by \\\\ + sub.text = sub.text.replace(r"\N", r"\\\\") + + # The \\\\ must be separated from the words to avoid weird conversions + sub.text = re.sub(r"[aA0-zZ9]\\\\", r" \\\\", sub.text) + sub.text = re.sub(r"\\\\[aA0-zZ9]", r"\\\\ ", sub.text) + + sub.text = sub.text.replace("\n", " ") + + return subtitles + + def wrap_lines(self, line_wrap_limit: int = 50) -> None: + """ + + Args: + line_wrap_limit (int): Number of maximum characters in a line before wrap. Defaults to 50. (not used) + """ + for sub in self.subtitles.events: + sub.text = sub.text.replace("////", "\n") + sub.text = sub.text.replace(r" \\\\ ", r"\N") + + def translate( + self, + translator: Translator, + source_language: str, + destination_language: str, + ) -> None: + """Translate ASS file using a translator of your choose + + Args: + translator (Translator): Translator object of choose + destination_language (str): Destination language (must be coherent with your translator) + source_language (str): Source language (must be coherent with your translator) + """ + + # For each chunk of the file (based on the translator capabilities) + for subs_slice in self._get_next_chunk(translator.max_char): + progress = int(100 * self.current_subtitle / len(self.subtitles.events)) + print(f"... 
Translating {progress} %") + + # Put chunk in a single text with break lines + text = [sub.text for sub in subs_slice] + text = "\n".join(text) + + # Translate + translation = translator.translate( + text, source_language, destination_language + ) + + # Manage ASS commands + # Insert the styles back in the text instead of | + self.text_styles.reverse() + translation_with_styles = "" + for i in translation.split(r"|"): + try: + # We set i at the left part because the style must "replace" the "|" + translation_with_styles += i + self.text_styles.pop() + except IndexError: + translation_with_styles += i + + # Break each line back into subtitle content + translation = translation_with_styles.splitlines() + for i in range(len(subs_slice)): + subs_slice[i].text = translation[i] + self.current_subtitle += 1 + + print(f"... Translation done") + + def save_backup(self): + self.subtitles.events = self.subtitles.events[: self.current_subtitle] + self.save(self.backup_file) + + def _delete_backup(self): + if os.path.exists(self.backup_file): + os.remove(self.backup_file) + + def save(self, filepath: str) -> None: + """Saves ASS to file + + Args: + filepath (str): Path of the new file + """ + self._delete_backup() + + print(f"Saving {filepath}") + with open(filepath, "w", encoding="utf-8") as file_out: + pyass.dump(self.subtitles, file_out) diff --git a/srtranslator/srt_file.py b/srtranslator/srt_file.py index 9db958e..f4fabcf 100644 --- a/srtranslator/srt_file.py +++ b/srtranslator/srt_file.py @@ -22,7 +22,7 @@ def __init__(self, filepath: str) -> None: self.start_from = 0 self.current_subtitle = 0 - print(f"Loading {filepath}") + print(f"Loading {filepath} as SRT") with open(filepath, "r", encoding="utf-8", errors="ignore") as input_file: self.subtitles = self.load_from_file(input_file) diff --git a/srtranslator/translators/pydeeplx.py b/srtranslator/translators/pydeeplx.py index cf3bcba..eb82ecf 100644 --- a/srtranslator/translators/pydeeplx.py +++ 
b/srtranslator/translators/pydeeplx.py @@ -1,33 +1,36 @@ -from PyDeepLX import PyDeepLX +from PyDeepLX import PyDeepLX as PDLX from random import randint from time import sleep from .base import Translator as BaseTranslator from fp.fp import FreeProxy -from .selenium_utils import ( - create_proxy, -) -class DeepLX(BaseTranslator): +class PyDeepLX(BaseTranslator): max_char = 1500 - def __init__(self): - self.proxies = None + def __init__(self, proxies=None): + self.proxies = proxies + + # Use proxy by default if self.proxies is True + if self.proxies: + print("...... Use proxy") + self.proxies = FreeProxy(rand=True, timeout=1).get() def translate(self, text, source_language, destination_language): - # Sleep a random number of seconds (between 3 and 5) + # Sleep a random number of seconds (between 5 and 10) # https://www.shellhacks.com/python-sleep-random-time-web-scraping/ - RANDOM_WAIT = randint(3,5) + RANDOM_WAIT = randint(5, 10) print(f"...... Wait randomly {RANDOM_WAIT}s") sleep(RANDOM_WAIT) - # Max retry 3 + # Max retry 10 RETRY_COUNTER = 10 result = None + while RETRY_COUNTER > 0 : try: - result = PyDeepLX.translate( + result = PDLX.translate( text, source_language, destination_language, @@ -49,4 +52,10 @@ def translate(self, text, source_language, destination_language): # Decrease RETRY_COUNTER RETRY_COUNTER -= 1 + + # Raise error if RETRY_COUNTER is 0 + if RETRY_COUNTER == 0: + print("...... Exception RETRY_COUNTER reached 0") + raise + return result