Skip to content

Commit

Permalink
Removed simple batcher
Browse files Browse the repository at this point in the history
  • Loading branch information
machinewrapped committed May 18, 2024
1 parent 22eb2ba commit c7230c7
Show file tree
Hide file tree
Showing 10 changed files with 28 additions and 133 deletions.
4 changes: 2 additions & 2 deletions GUI/Commands/BatchSubtitlesCommand.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from GUI.Command import Command
from GUI.ProjectDataModel import ProjectDataModel
from PySubtitle.Options import Options
from PySubtitle.SubtitleBatcher import CreateSubtitleBatcher, SubtitleBatcher
from PySubtitle.SubtitleBatcher import SubtitleBatcher
from PySubtitle.SubtitleProcessor import SubtitleProcessor
from PySubtitle.SubtitleProject import SubtitleProject

Expand Down Expand Up @@ -29,7 +29,7 @@ def execute(self):
preprocessor = SubtitleProcessor(self.options)
project.subtitles.PreProcess(preprocessor)

batcher : SubtitleBatcher = CreateSubtitleBatcher(self.options)
batcher : SubtitleBatcher = SubtitleBatcher(self.options)
project.subtitles.AutoBatch(batcher)

project.WriteProjectFile()
Expand Down
16 changes: 2 additions & 14 deletions GUI/NewProjectSettings.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from GUI.Widgets.OptionsWidgets import CreateOptionWidget, DropdownOptionWidget

from PySubtitle.Instructions import GetInstructionFiles, LoadInstructionsResource
from PySubtitle.SubtitleBatcher import CreateSubtitleBatcher, BaseSubtitleBatcher
from PySubtitle.SubtitleBatcher import SubtitleBatcher
from PySubtitle.SubtitleLine import SubtitleLine
from PySubtitle.SubtitleProcessor import SubtitleProcessor
from PySubtitle.SubtitleProject import SubtitleProject
Expand All @@ -30,8 +30,6 @@ class NewProjectSettings(QDialog):
'min_batch_size': (int, "Fewest lines to send in separate batch"),
'max_batch_size': (int, "Most lines to send in each batch"),
'preprocess_subtitles': (bool, "Preprocess subtitles before batching"),
'use_simple_batcher': (bool, "Use old batcher instead of batching dynamically based on gap size"),
'batch_threshold': (float, "Number of seconds gap to consider starting a new batch (simple batcher)"),
'instruction_file': (str, "Detailed instructions for the translator"),
'prompt': (str, "High-level instructions for the translator")
}
Expand Down Expand Up @@ -148,15 +146,6 @@ def _update_settings(self):
field = layout.itemAt(row, QFormLayout.FieldRole).widget()
self.settings[field.key] = field.GetValue()

def _update_inputs(self):
layout : QFormLayout = self.form_layout.layout()

for row in range(layout.rowCount()):
field = layout.itemAt(row, QFormLayout.ItemRole.FieldRole).widget()
if field.key == 'batch_threshold':
use_simple_batcher = self.settings.get('use_simple_batcher')
field.setEnabled(use_simple_batcher)

def _update_instruction_file(self):
""" Update the prompt when the instruction file is changed """
instruction_file = self.fields['instruction_file'].GetValue()
Expand All @@ -172,7 +161,6 @@ def _update_instruction_file(self):
def _preview_batches(self):
try:
self._update_settings()
self._update_inputs()

with QMutexLocker(self.preview_mutex):
self.preview_count += 1
Expand Down Expand Up @@ -229,7 +217,7 @@ def run(self):
else:
lines = self.subtitles

batcher : BaseSubtitleBatcher = CreateSubtitleBatcher(self.settings)
batcher : SubtitleBatcher = SubtitleBatcher(self.settings)
if batcher.max_batch_size < batcher.min_batch_size:
self.update_preview.emit(self.count, "Max batch size is less than min batch size")
return
Expand Down
2 changes: 0 additions & 2 deletions GUI/SettingsDialog.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,8 +62,6 @@ class SettingsDialog(QDialog):
'min_batch_size': (int, "Avoid creating a new batch smaller than this"),
'max_batch_size': (int, "Divide any batches larger than this into multiple batches"),
'scene_threshold': (float, "Consider a new scene to have started after this many seconds without subtitles"),
'batch_threshold': (float, "Consider starting a new batch after a gap of this many seconds (simple batcher only)"),
'use_simple_batcher': (bool, "Use old batcher instead of batching dynamically based on gap size"),
'substitution_mode': (Substitutions.Mode, "Whether to substitute whole words or partial matches, or choose automatically based on input language"),
'max_context_summaries': (int, "Limits the number of scene/batch summaries to include as context with each translation batch"),
'max_summary_length': (int, "Maximum length of the context summary to include with each translation batch"),
Expand Down
2 changes: 0 additions & 2 deletions PySubtitle/Options.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,7 @@ def env_bool(key, default=False):
'instruction_file': os.getenv('INSTRUCTION_FILE', "instructions.txt"),
'target_language': os.getenv('TARGET_LANGUAGE', 'English'),
'include_original': env_bool('INCLUDE_ORIGINAL', False),
'use_simple_batcher': env_bool('USE_SIMPLE_BATCHER', False),
'scene_threshold': float(os.getenv('SCENE_THRESHOLD', 30.0)),
'batch_threshold': float(os.getenv('BATCH_THRESHOLD', 7.0)),
'min_batch_size': int(os.getenv('MIN_BATCH_SIZE', 10)),
'max_batch_size': int(os.getenv('MAX_BATCH_SIZE', 30)),
'max_context_summaries': int(os.getenv('MAX_CONTEXT_SUMMARIES', 10)),
Expand Down
46 changes: 1 addition & 45 deletions PySubtitle/SubtitleBatcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,50 +3,14 @@
from PySubtitle.SubtitleScene import SubtitleScene
from PySubtitle.SubtitleLine import SubtitleLine

class BaseSubtitleBatcher:
class SubtitleBatcher:
def __init__(self, settings):
self.min_batch_size = settings.get('min_batch_size', 0)
self.max_batch_size = settings.get('max_batch_size', 99)

scene_threshold_seconds = settings.get('scene_threshold', 30.0)
self.scene_threshold = timedelta(seconds=scene_threshold_seconds)

def BatchSubtitles(self, lines : list[SubtitleLine]):
raise NotImplementedError

class OldSubtitleBatcher(BaseSubtitleBatcher):
def __init__(self, settings):
super().__init__(settings)
self.batch_threshold_seconds = settings.get('batch_threshold', 2.0)

def BatchSubtitles(self, lines : list[SubtitleLine]):
batch_threshold = timedelta(seconds=self.batch_threshold_seconds)

scenes = []
last_endtime = None

for line in lines:
gap = line.start - last_endtime if last_endtime else None

if gap is None or gap > self.scene_threshold:
scene = SubtitleScene()
scenes.append(scene)
scene.number = len(scenes)
batch = None

if batch is None or (batch.size >= self.max_batch_size) or (batch.size >= self.min_batch_size and gap > batch_threshold):
batch = scene.AddNewBatch()

batch.AddLine(line)

last_endtime = line.end

return scenes

class SubtitleBatcher(BaseSubtitleBatcher):
def __init__(self, settings):
super().__init__(settings)

def BatchSubtitles(self, lines : list[SubtitleLine]):
if self.min_batch_size > self.max_batch_size:
raise ValueError("min_batch_size must be less than max_batch_size.")
Expand Down Expand Up @@ -119,11 +83,3 @@ def _split_lines(self, lines : list[SubtitleLine]):
# Recursively split the batches and concatenate the lists
return self._split_lines(left) + self._split_lines(right)

def CreateSubtitleBatcher(settings : dict) -> BaseSubtitleBatcher:
"""
Helper to create an appropriate batcher for the settings
"""
if settings.get('use_simple_batcher'):
return OldSubtitleBatcher(settings)

return SubtitleBatcher(settings)
4 changes: 2 additions & 2 deletions PySubtitle/SubtitleTranslator.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from PySubtitle.Helpers.Text import Linearise, SanitiseSummary
from PySubtitle.Instructions import Instructions
from PySubtitle.Substitutions import Substitutions
from PySubtitle.SubtitleBatcher import CreateSubtitleBatcher
from PySubtitle.SubtitleBatcher import SubtitleBatcher
from PySubtitle.SubtitleProcessor import SubtitleProcessor
from PySubtitle.Translation import Translation
from PySubtitle.TranslationClient import TranslationClient
Expand Down Expand Up @@ -69,7 +69,7 @@ def __init__(self, options: Options, translation_provider: TranslationProvider):
if not self.client:
raise ProviderError("Unable to create translation client")

self.batcher = CreateSubtitleBatcher(options)
self.batcher = SubtitleBatcher(options)

self.postprocessor = SubtitleProcessor(options) if options.get('postprocess_translation') else None

Expand Down
72 changes: 17 additions & 55 deletions Tests/batcher_test.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import os
from PySubtitle.SubtitleBatcher import OldSubtitleBatcher, SubtitleBatcher
from PySubtitle.SubtitleBatcher import SubtitleBatcher
from PySubtitle.SubtitleFile import SubtitleFile
from PySubtitle.Helpers.Tests import RunTestOnAllSrtFiles, separator

Expand Down Expand Up @@ -27,71 +27,33 @@ def analyze_scenes(scenes):

def batcher_test(subtitles: SubtitleFile, logger, options):
try:
old_batcher = OldSubtitleBatcher(options)
old_scenes = old_batcher.BatchSubtitles(subtitles.originals)
batcher = SubtitleBatcher(options)
scenes = batcher.BatchSubtitles(subtitles.originals)
except Exception as e:
raise Exception(f"Error in old_batcher.BatchSubtitles: {e}")

try:
new_batcher = SubtitleBatcher(options)
new_scenes = new_batcher.BatchSubtitles(subtitles.originals)
except Exception as e:
raise Exception(f"Error in new_batcher.BatchSubtitles: {e}")

if len(old_scenes) != len(new_scenes):
raise Exception(f"Scene count mismatch (Old: {len(old_scenes)}, New: {len(new_scenes)})")
raise Exception(f"Error in batcher.BatchSubtitles: {e}")

# Analyze scenes
old_num_scenes, old_num_batches_list, old_largest_batch_list, old_smallest_batch_list, old_avg_batch_list = analyze_scenes(old_scenes)
new_num_scenes, new_num_batches_list, new_largest_batch_list, new_smallest_batch_list, new_avg_batch_list = analyze_scenes(new_scenes)

logger.info(f"{f'':<25}{'Old':<10}{'New':<10}{'Delta':<10}")

total_old_batches = sum(old_num_batches_list)
total_new_batches = sum(new_num_batches_list)
total_delta_batches = total_new_batches - total_old_batches
num_scenes, num_batches_list, largest_batch_list, smallest_batch_list, avg_batch_list = analyze_scenes(scenes)

total_old_largest = max(old_largest_batch_list)
total_new_largest = max(new_largest_batch_list)
total_delta_largest = total_new_largest - total_old_largest

total_old_smallest = min(old_smallest_batch_list)
total_new_smallest = min(new_smallest_batch_list)
total_delta_smallest = total_new_smallest - total_old_smallest

total_old_avg = sum(old_avg_batch_list) / old_num_scenes
total_new_avg = sum(new_avg_batch_list) / new_num_scenes
total_delta_avg = total_new_avg - total_old_avg
total_batches = sum(num_batches_list)
total_largest = max(largest_batch_list)
total_smallest = min(smallest_batch_list)
total_avg = sum(avg_batch_list) / num_scenes

logger.info(separator)
logger.info(f"Total (min {options['min_batch_size']}, max {options['max_batch_size']}, scene {options['scene_threshold']}, batch {options['batch_threshold']})")
logger.info(f"Total (min {options['min_batch_size']}, max {options['max_batch_size']}, scene {options['scene_threshold']})")
logger.info(separator)
logger.info(f"{'Total Batches':<25}{total_old_batches:<10}{total_new_batches:<10}{' ' if total_delta_batches == 0 else total_delta_batches:<10}")
logger.info(f"{'Total Largest Batch':<25}{total_old_largest:<10}{total_new_largest:<10}{' ' if total_delta_largest == 0 else total_delta_largest:<10}")
logger.info(f"{'Total Smallest Batch':<25}{total_old_smallest:<10}{total_new_smallest:<10}{' ' if total_delta_smallest == 0 else total_delta_smallest:<10}")
logger.info(f"{'Average Batch Size':<25}{total_old_avg:<10.2f}{total_new_avg:<10.2f}{'' if abs(total_delta_avg) < 1.0 else f'{total_delta_avg:.2f}':<10}")
logger.info(f"{'Total Batches':<25}{total_batches:<10}")
logger.info(f"{'Total Largest Batch':<25}{total_largest:<10}")
logger.info(f"{'Total Smallest Batch':<25}{total_smallest:<10}")
logger.info(f"{'Average Batch Size':<25}{total_avg:<10.2f}")
logger.info(separator)

for i in range(old_num_scenes):
scene_num = i + 1

delta_num_batches = new_num_batches_list[i] - old_num_batches_list[i]
delta_largest_batch = new_largest_batch_list[i] - old_largest_batch_list[i]
delta_smallest_batch = new_smallest_batch_list[i] - old_smallest_batch_list[i]
delta_avg_batch = new_avg_batch_list[i] - old_avg_batch_list[i]

logger.info(f"{f'-- Scene {scene_num} --':<25}")
logger.info(f"{'Num Batches':<25}{old_num_batches_list[i]:<10}{new_num_batches_list[i]:<10}{' ' if delta_num_batches == 0 else delta_num_batches:<10}")
logger.info(f"{'Largest Batch':<25}{old_largest_batch_list[i]:<10}{new_largest_batch_list[i]:<10}{' ' if delta_largest_batch == 0 else delta_largest_batch:<10}")
logger.info(f"{'Smallest Batch':<25}{old_smallest_batch_list[i]:<10}{new_smallest_batch_list[i]:<10}{' ' if delta_smallest_batch == 0 else delta_smallest_batch:<10}")
logger.info(f"{'Average Batch Size':<25}{old_avg_batch_list[i]:<10.2f}{new_avg_batch_list[i]:<10.2f}{'' if abs(delta_avg_batch) < 1.0 else f'{delta_avg_batch:.2f}':<10}")
logger.info("")

def run_tests(directory_path, results_path):
test_options = [
{ 'min_batch_size': 10, 'max_batch_size': 100, 'scene_threshold': 60, 'batch_threshold': 20 },
{ 'min_batch_size': 8, 'max_batch_size': 40, 'scene_threshold': 30, 'batch_threshold': 5 },
{ 'min_batch_size': 16, 'max_batch_size': 80, 'scene_threshold': 40, 'batch_threshold': 8 },
{ 'min_batch_size': 10, 'max_batch_size': 100, 'scene_threshold': 60 },
{ 'min_batch_size': 8, 'max_batch_size': 40, 'scene_threshold': 30 },
{ 'min_batch_size': 16, 'max_batch_size': 80, 'scene_threshold': 40 },
]

RunTestOnAllSrtFiles(batcher_test, test_options, directory_path, results_path)
Expand Down
13 changes: 4 additions & 9 deletions readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,16 +47,16 @@ Prebuilt Linux packages are not provided so you will need to install from source
### Installing from source
For other platforms, or if you want to modify the program, you will need to have Python 3.10+ and pip installed on your system, then follow these steps.

#### step1
#### step1

1. Clone the GPT-Subtrans repository onto your local machine using the following command:
```
git clone https://github.com/machinewrapped/gpt-subtrans.git
```

The easiest setup method for most users is to run an installation script, e.g. `install-openai.bat` or `install-gemini.bat` at this point and enter your API key when prompted. This will create a virtual environment and install all the required packages for the provider, and generate command scripts to launch the specified provider. You can then skip the remaining steps.
The easiest setup method for most users is to run an installation script, e.g. `install-openai.bat` or `install-gemini.bat` at this point and enter your API key when prompted. This will create a virtual environment and install all the required packages for the provider, and generate command scripts to launch the specified provider. You can then skip the remaining steps.

MacOS and Linux users should run `install.sh` instead (this should work on any unix-like system).
MacOS and Linux users should run `install.sh` instead (this should work on any unix-like system).

During the installation process, enter your API key when prompted and the .env file will be created automatically. You can therefore skip step2, although reading it is still recommended.

Expand Down Expand Up @@ -147,7 +147,7 @@ python3 gpt-subtrans.py <path_to_srt_file> --target_language <target_language> -
```
Remember to change the local port to yours and turn on your proxy tools such as v2ray, naiveproxy and clash.

### batch process
### batch process

You can process files with the following structure:

Expand Down Expand Up @@ -211,11 +211,6 @@ gpt-subtrans path/to/my/subtitles.srt --moviename "My Awesome Movie" --ratelimit
- `--scenethreshold`:
Number of seconds between lines to consider it a new scene.

- `--batchthreshold`:
Number of seconds between lines to consider starting a new batch of subtitles to translate.
Smaller batches take longer and cost more, but introduce more sync points and reduce the scope for the AI to drift.
This setting is ignored with the new subtitle batcher, as it batches dynamically based on the gaps between lines.

- `--minbatchsize`:
Minimum number of lines to consider starting a new batch to send to the translator.
Higher values typically result in faster and cheaper translations but increase the risk of desyncs.
Expand Down
1 change: 0 additions & 1 deletion scripts/gui-subtrans.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,6 @@ def parse_arguments():
logger_options = InitLogger("gui-subtrans", args.debug)

arguments = {
'batch_threshold': args.batchthreshold,
'firstrun': args.firstrun,
'include_original': args.includeoriginal,
'max_batch_size': args.maxbatchsize,
Expand Down
1 change: 0 additions & 1 deletion scripts/subtrans_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,6 @@ def CreateOptions(args: Namespace, provider: str, **kwargs) -> Options:
""" Create options with additional arguments """
options = {
'api_key': args.apikey,
'batch_threshold': args.batchthreshold,
'description': args.description,
'include_original': args.includeoriginal,
'instruction_args': args.instruction,
Expand Down

0 comments on commit c7230c7

Please sign in to comment.