From c7230c743cf60aa819713293de34e3a9d08952f7 Mon Sep 17 00:00:00 2001 From: machinewrapped Date: Sat, 18 May 2024 16:49:45 +0200 Subject: [PATCH] Removed simple batcher --- GUI/Commands/BatchSubtitlesCommand.py | 4 +- GUI/NewProjectSettings.py | 16 +----- GUI/SettingsDialog.py | 2 - PySubtitle/Options.py | 2 - PySubtitle/SubtitleBatcher.py | 46 +---------------- PySubtitle/SubtitleTranslator.py | 4 +- Tests/batcher_test.py | 72 +++++++-------------------- readme.md | 13 ++--- scripts/gui-subtrans.py | 1 - scripts/subtrans_common.py | 1 - 10 files changed, 28 insertions(+), 133 deletions(-) diff --git a/GUI/Commands/BatchSubtitlesCommand.py b/GUI/Commands/BatchSubtitlesCommand.py index 5fdc57b0..c09e0981 100644 --- a/GUI/Commands/BatchSubtitlesCommand.py +++ b/GUI/Commands/BatchSubtitlesCommand.py @@ -1,7 +1,7 @@ from GUI.Command import Command from GUI.ProjectDataModel import ProjectDataModel from PySubtitle.Options import Options -from PySubtitle.SubtitleBatcher import CreateSubtitleBatcher, SubtitleBatcher +from PySubtitle.SubtitleBatcher import SubtitleBatcher from PySubtitle.SubtitleProcessor import SubtitleProcessor from PySubtitle.SubtitleProject import SubtitleProject @@ -29,7 +29,7 @@ def execute(self): preprocessor = SubtitleProcessor(self.options) project.subtitles.PreProcess(preprocessor) - batcher : SubtitleBatcher = CreateSubtitleBatcher(self.options) + batcher : SubtitleBatcher = SubtitleBatcher(self.options) project.subtitles.AutoBatch(batcher) project.WriteProjectFile() diff --git a/GUI/NewProjectSettings.py b/GUI/NewProjectSettings.py index 68dbc9bf..68352f54 100644 --- a/GUI/NewProjectSettings.py +++ b/GUI/NewProjectSettings.py @@ -9,7 +9,7 @@ from GUI.Widgets.OptionsWidgets import CreateOptionWidget, DropdownOptionWidget from PySubtitle.Instructions import GetInstructionFiles, LoadInstructionsResource -from PySubtitle.SubtitleBatcher import CreateSubtitleBatcher, BaseSubtitleBatcher +from PySubtitle.SubtitleBatcher import SubtitleBatcher from PySubtitle.SubtitleLine import SubtitleLine from PySubtitle.SubtitleProcessor import SubtitleProcessor from PySubtitle.SubtitleProject import SubtitleProject @@ -30,8 +30,6 @@ class NewProjectSettings(QDialog): 'min_batch_size': (int, "Fewest lines to send in separate batch"), 'max_batch_size': (int, "Most lines to send in each batch"), 'preprocess_subtitles': (bool, "Preprocess subtitles before batching"), - 'use_simple_batcher': (bool, "Use old batcher instead of batching dynamically based on gap size"), - 'batch_threshold': (float, "Number of seconds gap to consider starting a new batch (simple batcher)"), 'instruction_file': (str, "Detailed instructions for the translator"), 'prompt': (str, "High-level instructions for the translator") } @@ -148,15 +146,6 @@ def _update_settings(self): field = layout.itemAt(row, QFormLayout.FieldRole).widget() self.settings[field.key] = field.GetValue() - def _update_inputs(self): - layout : QFormLayout = self.form_layout.layout() - - for row in range(layout.rowCount()): - field = layout.itemAt(row, QFormLayout.ItemRole.FieldRole).widget() - if field.key == 'batch_threshold': - use_simple_batcher = self.settings.get('use_simple_batcher') - field.setEnabled(use_simple_batcher) - def _update_instruction_file(self): """ Update the prompt when the instruction file is changed """ instruction_file = self.fields['instruction_file'].GetValue() @@ -172,7 +161,6 @@ def _update_instruction_file(self): def _preview_batches(self): try: self._update_settings() - self._update_inputs() with QMutexLocker(self.preview_mutex): self.preview_count += 1 @@ -229,7 +217,7 @@ def run(self): else: lines = self.subtitles - batcher : BaseSubtitleBatcher = CreateSubtitleBatcher(self.settings) + batcher : SubtitleBatcher = SubtitleBatcher(self.settings) if batcher.max_batch_size < batcher.min_batch_size: self.update_preview.emit(self.count, "Max batch size is less than min batch size") return diff --git a/GUI/SettingsDialog.py b/GUI/SettingsDialog.py index 23e1fe9a..3738abf4 100644 --- a/GUI/SettingsDialog.py +++ b/GUI/SettingsDialog.py @@ -62,8 +62,6 @@ class SettingsDialog(QDialog): 'min_batch_size': (int, "Avoid creating a new batch smaller than this"), 'max_batch_size': (int, "Divide any batches larger than this into multiple batches"), 'scene_threshold': (float, "Consider a new scene to have started after this many seconds without subtitles"), - 'batch_threshold': (float, "Consider starting a new batch after a gap of this many seconds (simple batcher only)"), - 'use_simple_batcher': (bool, "Use old batcher instead of batching dynamically based on gap size"), 'substitution_mode': (Substitutions.Mode, "Whether to substitute whole words or partial matches, or choose automatically based on input language"), 'max_context_summaries': (int, "Limits the number of scene/batch summaries to include as context with each translation batch"), 'max_summary_length': (int, "Maximum length of the context summary to include with each translation batch"), diff --git a/PySubtitle/Options.py b/PySubtitle/Options.py index 21600d31..3feac10c 100644 --- a/PySubtitle/Options.py +++ b/PySubtitle/Options.py @@ -30,9 +30,7 @@ def env_bool(key, default=False): 'instruction_file': os.getenv('INSTRUCTION_FILE', "instructions.txt"), 'target_language': os.getenv('TARGET_LANGUAGE', 'English'), 'include_original': env_bool('INCLUDE_ORIGINAL', False), - 'use_simple_batcher': env_bool('USE_SIMPLE_BATCHER', False), 'scene_threshold': float(os.getenv('SCENE_THRESHOLD', 30.0)), - 'batch_threshold': float(os.getenv('BATCH_THRESHOLD', 7.0)), 'min_batch_size': int(os.getenv('MIN_BATCH_SIZE', 10)), 'max_batch_size': int(os.getenv('MAX_BATCH_SIZE', 30)), 'max_context_summaries': int(os.getenv('MAX_CONTEXT_SUMMARIES', 10)), diff --git a/PySubtitle/SubtitleBatcher.py b/PySubtitle/SubtitleBatcher.py index 4222beb0..960894c8 100644 --- a/PySubtitle/SubtitleBatcher.py +++ b/PySubtitle/SubtitleBatcher.py @@ -3,7 +3,7 @@ from PySubtitle.SubtitleScene import SubtitleScene from PySubtitle.SubtitleLine import SubtitleLine -class BaseSubtitleBatcher: +class SubtitleBatcher: def __init__(self, settings): self.min_batch_size = settings.get('min_batch_size', 0) self.max_batch_size = settings.get('max_batch_size', 99) @@ -11,42 +11,6 @@ def __init__(self, settings): scene_threshold_seconds = settings.get('scene_threshold', 30.0) self.scene_threshold = timedelta(seconds=scene_threshold_seconds) - def BatchSubtitles(self, lines : list[SubtitleLine]): - raise NotImplementedError - -class OldSubtitleBatcher(BaseSubtitleBatcher): - def __init__(self, settings): - super().__init__(settings) - self.batch_threshold_seconds = settings.get('batch_threshold', 2.0) - - def BatchSubtitles(self, lines : list[SubtitleLine]): - batch_threshold = timedelta(seconds=self.batch_threshold_seconds) - - scenes = [] - last_endtime = None - - for line in lines: - gap = line.start - last_endtime if last_endtime else None - - if gap is None or gap > self.scene_threshold: - scene = SubtitleScene() - scenes.append(scene) - scene.number = len(scenes) - batch = None - - if batch is None or (batch.size >= self.max_batch_size) or (batch.size >= self.min_batch_size and gap > batch_threshold): - batch = scene.AddNewBatch() - - batch.AddLine(line) - - last_endtime = line.end - - return scenes - -class SubtitleBatcher(BaseSubtitleBatcher): - def __init__(self, settings): - super().__init__(settings) - def BatchSubtitles(self, lines : list[SubtitleLine]): if self.min_batch_size > self.max_batch_size: raise ValueError("min_batch_size must be less than max_batch_size.") @@ -119,11 +83,3 @@ def _split_lines(self, lines : list[SubtitleLine]): # Recursively split the batches and concatenate the lists return self._split_lines(left) + self._split_lines(right) -def CreateSubtitleBatcher(settings : dict) -> BaseSubtitleBatcher: - """ - Helper to create an appropriate batcher for the settings - """ - if settings.get('use_simple_batcher'): - return OldSubtitleBatcher(settings) - - return SubtitleBatcher(settings) \ No newline at end of file diff --git a/PySubtitle/SubtitleTranslator.py b/PySubtitle/SubtitleTranslator.py index 274a6277..a7fbe263 100644 --- a/PySubtitle/SubtitleTranslator.py +++ b/PySubtitle/SubtitleTranslator.py @@ -6,7 +6,7 @@ from PySubtitle.Helpers.Text import Linearise, SanitiseSummary from PySubtitle.Instructions import Instructions from PySubtitle.Substitutions import Substitutions -from PySubtitle.SubtitleBatcher import CreateSubtitleBatcher +from PySubtitle.SubtitleBatcher import SubtitleBatcher from PySubtitle.SubtitleProcessor import SubtitleProcessor from PySubtitle.Translation import Translation from PySubtitle.TranslationClient import TranslationClient @@ -69,7 +69,7 @@ def __init__(self, options: Options, translation_provider: TranslationProvider): if not self.client: raise ProviderError("Unable to create translation client") - self.batcher = CreateSubtitleBatcher(options) + self.batcher = SubtitleBatcher(options) self.postprocessor = SubtitleProcessor(options) if options.get('postprocess_translation') else None diff --git a/Tests/batcher_test.py b/Tests/batcher_test.py index fea41588..be426250 100644 --- a/Tests/batcher_test.py +++ b/Tests/batcher_test.py @@ -1,5 +1,5 @@ import os -from PySubtitle.SubtitleBatcher import OldSubtitleBatcher, SubtitleBatcher +from PySubtitle.SubtitleBatcher import SubtitleBatcher from PySubtitle.SubtitleFile import SubtitleFile from PySubtitle.Helpers.Tests import RunTestOnAllSrtFiles, separator @@ -27,71 +27,33 @@ def analyze_scenes(scenes): def batcher_test(subtitles: SubtitleFile, logger, options): try: - old_batcher = OldSubtitleBatcher(options) - old_scenes = old_batcher.BatchSubtitles(subtitles.originals) + batcher = SubtitleBatcher(options) + scenes = batcher.BatchSubtitles(subtitles.originals) except Exception as e: - raise Exception(f"Error in old_batcher.BatchSubtitles: {e}") - - try: - new_batcher = SubtitleBatcher(options) - new_scenes = new_batcher.BatchSubtitles(subtitles.originals) - except Exception as e: - raise Exception(f"Error in new_batcher.BatchSubtitles: {e}") - - if len(old_scenes) != len(new_scenes): - raise Exception(f"Scene count mismatch (Old: {len(old_scenes)}, New: {len(new_scenes)})") + raise Exception(f"Error in batcher.BatchSubtitles: {e}") # Analyze scenes - old_num_scenes, old_num_batches_list, old_largest_batch_list, old_smallest_batch_list, old_avg_batch_list = analyze_scenes(old_scenes) - new_num_scenes, new_num_batches_list, new_largest_batch_list, new_smallest_batch_list, new_avg_batch_list = analyze_scenes(new_scenes) - - logger.info(f"{f'':<25}{'Old':<10}{'New':<10}{'Delta':<10}") - - total_old_batches = sum(old_num_batches_list) - total_new_batches = sum(new_num_batches_list) - total_delta_batches = total_new_batches - total_old_batches + num_scenes, num_batches_list, largest_batch_list, smallest_batch_list, avg_batch_list = analyze_scenes(scenes) - total_old_largest = max(old_largest_batch_list) - total_new_largest = max(new_largest_batch_list) - total_delta_largest = total_new_largest - total_old_largest - - total_old_smallest = min(old_smallest_batch_list) - total_new_smallest = min(new_smallest_batch_list) - total_delta_smallest = total_new_smallest - total_old_smallest - - total_old_avg = sum(old_avg_batch_list) / old_num_scenes - total_new_avg = sum(new_avg_batch_list) / new_num_scenes - total_delta_avg = total_new_avg - total_old_avg + total_batches = sum(num_batches_list) + total_largest = max(largest_batch_list) + total_smallest = min(smallest_batch_list) + total_avg = sum(avg_batch_list) / num_scenes logger.info(separator) - logger.info(f"Total (min {options['min_batch_size']}, max {options['max_batch_size']}, scene {options['scene_threshold']}, batch {options['batch_threshold']})") + logger.info(f"Total (min {options['min_batch_size']}, max {options['max_batch_size']}, scene {options['scene_threshold']})") logger.info(separator) - logger.info(f"{'Total Batches':<25}{total_old_batches:<10}{total_new_batches:<10}{' ' if total_delta_batches == 0 else total_delta_batches:<10}") - logger.info(f"{'Total Largest Batch':<25}{total_old_largest:<10}{total_new_largest:<10}{' ' if total_delta_largest == 0 else total_delta_largest:<10}") - logger.info(f"{'Total Smallest Batch':<25}{total_old_smallest:<10}{total_new_smallest:<10}{' ' if total_delta_smallest == 0 else total_delta_smallest:<10}") - logger.info(f"{'Average Batch Size':<25}{total_old_avg:<10.2f}{total_new_avg:<10.2f}{'' if abs(total_delta_avg) < 1.0 else f'{total_delta_avg:.2f}':<10}") + logger.info(f"{'Total Batches':<25}{total_batches:<10}") + logger.info(f"{'Total Largest Batch':<25}{total_largest:<10}") + logger.info(f"{'Total Smallest Batch':<25}{total_smallest:<10}") + logger.info(f"{'Average Batch Size':<25}{total_avg:<10.2f}") logger.info(separator) - for i in range(old_num_scenes): - scene_num = i + 1 - - delta_num_batches = new_num_batches_list[i] - old_num_batches_list[i] - delta_largest_batch = new_largest_batch_list[i] - old_largest_batch_list[i] - delta_smallest_batch = new_smallest_batch_list[i] - old_smallest_batch_list[i] - delta_avg_batch = new_avg_batch_list[i] - old_avg_batch_list[i] - - logger.info(f"{f'-- Scene {scene_num} --':<25}") - logger.info(f"{'Num Batches':<25}{old_num_batches_list[i]:<10}{new_num_batches_list[i]:<10}{' ' if delta_num_batches == 0 else delta_num_batches:<10}") - logger.info(f"{'Largest Batch':<25}{old_largest_batch_list[i]:<10}{new_largest_batch_list[i]:<10}{' ' if delta_largest_batch == 0 else delta_largest_batch:<10}") - logger.info(f"{'Smallest Batch':<25}{old_smallest_batch_list[i]:<10}{new_smallest_batch_list[i]:<10}{' ' if delta_smallest_batch == 0 else delta_smallest_batch:<10}") - logger.info(f"{'Average Batch Size':<25}{old_avg_batch_list[i]:<10.2f}{new_avg_batch_list[i]:<10.2f}{'' if abs(delta_avg_batch) < 1.0 else f'{delta_avg_batch:.2f}':<10}") - logger.info("") - def run_tests(directory_path, results_path): test_options = [ - { 'min_batch_size': 10, 'max_batch_size': 100, 'scene_threshold': 60, 'batch_threshold': 20 }, - { 'min_batch_size': 8, 'max_batch_size': 40, 'scene_threshold': 30, 'batch_threshold': 5 }, - { 'min_batch_size': 16, 'max_batch_size': 80, 'scene_threshold': 40, 'batch_threshold': 8 }, + { 'min_batch_size': 10, 'max_batch_size': 100, 'scene_threshold': 60 }, + { 'min_batch_size': 8, 'max_batch_size': 40, 'scene_threshold': 30 }, + { 'min_batch_size': 16, 'max_batch_size': 80, 'scene_threshold': 40 }, ] RunTestOnAllSrtFiles(batcher_test, test_options, directory_path, results_path) diff --git a/readme.md b/readme.md index 9fa30c94..e2510274 100644 --- a/readme.md +++ b/readme.md @@ -47,16 +47,16 @@ Prebuilt Linux packages are not provided so you will need to install from source ### Installing from source For other platforms, or if you want to modify the program, you will need to have Python 3.10+ and pip installed on your system, then follow these steps. -#### step1 +#### step1 1. Clone the GPT-Subtrans repository onto your local machine using the following command: ``` git clone https://github.com/machinewrapped/gpt-subtrans.git ``` -The easiest setup method for most users is to run an installation script, e.g. `install-openai.bat` or `install-gemini.bat` at this point and enter your API key when prompted. This will create a virtual environment and install all the required packages for the provider, and generate command scripts to launch the specified provider. You can then skip the remaining steps. +The easiest setup method for most users is to run an installation script, e.g. `install-openai.bat` or `install-gemini.bat` at this point and enter your API key when prompted. This will create a virtual environment and install all the required packages for the provider, and generate command scripts to launch the specified provider. You can then skip the remaining steps. -MacOS and Linux users should run `install.sh` instead (this should work on any unix-like system). +MacOS and Linux users should run `install.sh` instead (this should work on any unix-like system). During the installing process, input the apikey you have, and the .env file will be created automatically. Thus, you can ignore step2, but you are recommended to read it. @@ -147,7 +147,7 @@ python3 gpt-subtrans.py --target_language - ``` Remember to change the local port to yours and turn on your proxy tools such as v2ray, naiveproxy and clash. -### batch process +### batch process you can process files with the following struct: @@ -211,11 +211,6 @@ gpt-subtrans path/to/my/subtitles.srt --moviename "My Awesome Movie" --ratelimit - `--scenethreshold`: Number of seconds between lines to consider it a new scene. -- `--batchthreshold`: - Number of seconds between lines to consider starting a new batch of subtitles to translate. - Smaller batches take longer and cost more, but introduce more sync points and reduce the scope for the AI to drift. - This setting is ignored with the new subtitle batcher, as it batches dynamically based on the gaps between lines. - - `--minbatchsize`: Minimum number of lines to consider starting a new batch to send to the translator. Higher values typically result in faster and cheaper translations but increase the risk of desyncs. diff --git a/scripts/gui-subtrans.py b/scripts/gui-subtrans.py index d6e16d22..706d59cf 100644 --- a/scripts/gui-subtrans.py +++ b/scripts/gui-subtrans.py @@ -46,7 +46,6 @@ def parse_arguments(): logger_options = InitLogger("gui-subtrans", args.debug) arguments = { - 'batch_threshold': args.batchthreshold, 'firstrun': args.firstrun, 'include_original': args.includeoriginal, 'max_batch_size': args.maxbatchsize, diff --git a/scripts/subtrans_common.py b/scripts/subtrans_common.py index e60cee46..4109a112 100644 --- a/scripts/subtrans_common.py +++ b/scripts/subtrans_common.py @@ -85,7 +85,6 @@ def CreateOptions(args: Namespace, provider: str, **kwargs) -> Options: """ Create options with additional arguments """ options = { 'api_key': args.apikey, - 'batch_threshold': args.batchthreshold, 'description': args.description, 'include_original': args.includeoriginal, 'instruction_args': args.instruction,