Skip to content

Commit

Permalink
Convert blocks of space to newline
Browse files Browse the repository at this point in the history
This conversion is optional (and off by default) because I don't know how general the convention is, but it's common for Chinese subtitles to use a run of multiple spaces instead of a newline to separate speakers.
  • Loading branch information
machinewrapped committed May 26, 2023
1 parent d820d09 commit a287abc
Show file tree
Hide file tree
Showing 4 changed files with 15 additions and 0 deletions.
1 change: 1 addition & 0 deletions GUI/SettingsDialog.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ class SettingsDialog(QDialog):
'max_batch_size': int,
'scene_threshold': float,
'batch_threshold': float,
'whitespaces_to_newline': bool,
'max_context_summaries': int,
'max_characters': int,
'max_newlines': int,
Expand Down
1 change: 1 addition & 0 deletions PySubtitleGPT/Options.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ def env_bool(key, default=False):
'max_context_summaries': int(os.getenv('MAX_CONTEXT_SUMMARIES', 10)),
'max_characters': int(os.getenv('MAX_CHARACTERS', 120)),
'max_newlines': int(os.getenv('MAX_NEWLINES', 3)),
'whitespaces_to_newline' : env_bool('WHITESPACES_TO_NEWLINE', False),
'max_lines': int(os.getenv('MAX_LINES')) if os.getenv('MAX_LINES') else None,
'rate_limit': float(os.getenv('RATE_LIMIT')) if os.getenv('RATE_LIMIT') else None,
'max_threads': int(os.getenv('MAX_THREADS', 4)),
Expand Down
9 changes: 9 additions & 0 deletions PySubtitleGPT/SubtitleBatch.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from datetime import timedelta
import re
from PySubtitleGPT.SubtitleValidator import SubtitleValidator
from PySubtitleGPT.SubtitleError import SubtitleError, TranslationError
from PySubtitleGPT.Helpers import PerformSubstitutions
Expand Down Expand Up @@ -157,6 +158,14 @@ def PerformOutputSubstitutions(self, substitutions):
item.text = replacements.get(item.text) or item.text

return replacements

def ConvertWhitespaceBlocksToNewlines(self, min_spaces=3):
    """
    Convert blocks of consecutive spaces in each original line to a newline.

    Every run of `min_spaces` or more space characters in an item's text is
    replaced with a single newline. Items whose text already contains a newline
    are left untouched, as are items whose text is empty or None.

    Only the plain space character (U+0020) is matched; tabs and other
    whitespace characters are not treated as part of a block.

    :param min_spaces: minimum length of a run of spaces that is converted
        (defaults to 3, the original hard-coded threshold).
    :raises ValueError: if min_spaces is less than 1.
    """
    if min_spaces < 1:
        # A zero-length minimum would match the empty string between every
        # pair of characters and insert newlines everywhere.
        raise ValueError("min_spaces must be at least 1")

    # Build the pattern once rather than on every iteration of the loop
    space_block = re.compile(r' {%d,}' % min_spaces)

    for item in self.originals:
        text = item.text

        # Nothing to split in empty/missing text, and text that already has
        # line breaks is assumed to be formatted deliberately
        if not text or '\n' in text:
            continue

        item.text = space_block.sub('\n', text)

def MergeLines(self, original_lines : list[int], translated_lines : list[int]):
first_line = next((line for line in self.originals if line.number == original_lines[0]), None)
Expand Down
4 changes: 4 additions & 0 deletions PySubtitleGPT/SubtitleTranslator.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,10 @@ def TranslateBatches(self, batches : list[SubtitleBatch], context : dict, remain
# Apply any substitutions to the input
replacements = batch.PerformInputSubstitutions(substitutions)

# Split single lines with blocks of whitespace
if options.get('whitespaces_to_newline'):
batch.ConvertWhitespaceBlocksToNewlines()

# Filter out empty lines
originals = [line for line in batch.originals if line.text.strip()]

Expand Down

0 comments on commit a287abc

Please sign in to comment.