Skip to content

Commit

Permalink
Optionally merge very short lines in pre-process
Browse files Browse the repository at this point in the history
... this may prove to be a bad idea.
  • Loading branch information
machinewrapped committed Jun 2, 2024
1 parent b7e4627 commit 1f37307
Show file tree
Hide file tree
Showing 4 changed files with 38 additions and 0 deletions.
1 change: 1 addition & 0 deletions GUI/SettingsDialog.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ class SettingsDialog(QDialog):
'save_preprocessed_subtitles': (bool, "Save preprocessed subtitles to a separate file"),
'max_line_duration': (float, "Maximum duration of a single line of subtitles"),
'min_line_duration': (float, "Minimum duration of a single line of subtitles"),
'merge_line_duration': (float, "Merge lines with a duration less than this with the previous line"),
'min_split_chars': (int, "Minimum number of characters to split a line at"),
'break_dialog_on_one_line': (bool, "Add line breaks to text with dialog markers"),
'normalise_dialog_tags': (bool, "Ensure dialog markers match in multi-line subtitles"),
Expand Down
1 change: 1 addition & 0 deletions PySubtitle/Helpers/Text.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import regex

# Regex character class matching common punctuation marks.
common_punctuation = r"[.,!?;:…¡¿]"
# Regex character class matching punctuation that can end a sentence.
# NOTE: this is a pattern string, not a plain set of characters - the square
# brackets are part of the value, so match it with a regex rather than `in`.
sentence_end_punctuation = r"[.!?…?!。﹑]"

# Prefix placed in front of a line of dialog (presumably one per speaker - confirm against callers).
dialog_marker = "- "
# The em dash character.
emdash = "—"
Expand Down
1 change: 1 addition & 0 deletions PySubtitle/Options.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ def env_bool(key, default=False):
'break_dialog_on_one_line': env_bool('break_dialog_on_one_line', True),
'max_line_duration': float(os.getenv('MAX_LINE_DURATION', 4.0)),
'min_line_duration': float(os.getenv('MIN_LINE_DURATION', 0.8)),
'merge_line_duration': float(os.getenv('MERGE_LINE_DURATION', 0.0)),
'min_split_chars': int(os.getenv('MIN_SPLIT_CHARS', 3)),
'normalise_dialog_tags': env_bool('NORMALISE_DIALOG_TAGS', True),
'remove_filler_words': env_bool('REMOVE_FILLER_WORDS', True),
Expand Down
35 changes: 35 additions & 0 deletions PySubtitle/SubtitleProcessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
dialog_marker,
split_sequences,
break_sequences,
sentence_end_punctuation,
BreakLongLine,
BreakDialogOnOneLine,
CompileDialogSplitPattern,
Expand Down Expand Up @@ -42,6 +43,7 @@ def __init__(self, settings : Options | dict):

self.max_line_duration = timedelta(seconds = settings.get('max_line_duration', 0.0))
self.min_line_duration = timedelta(seconds = settings.get('min_line_duration', 0.0))
self.merge_line_duration = timedelta(seconds = settings.get('merge_line_duration', 0.0))
self.min_gap = timedelta(seconds=settings.get('min_gap', 0.05))
self.min_split_chars = settings.get('min_split_chars', 4)

Expand Down Expand Up @@ -72,6 +74,9 @@ def PreprocessSubtitles(self, lines : list[SubtitleLine]):
processed = []
line_number = lines[0].number

if self.merge_line_duration.total_seconds() > 0.0:
lines = self._merge_short_lines(lines, self.merge_line_duration)

for line in lines:
line.number = line_number
self._preprocess_line(line)
Expand Down Expand Up @@ -236,6 +241,36 @@ def _split_line_by_duration(self, line: SubtitleLine) -> list[SubtitleLine]:

return result

def _merge_short_lines(self, lines : list[SubtitleLine], short_duration : timedelta) -> list[SubtitleLine]:
    """
    Merge lines with very short durations into the previous line.

    Walks the lines in order, keeping a "current" line. Each following line whose
    duration is less than short_duration has its text appended to the current line
    (separated by a newline) and the current line's end time is extended to cover it.
    Any other line closes the current line and becomes the new current line.

    If the current line ends with sentence-ending punctuation the two parts are
    assumed to be different speakers, and each is prefixed with the dialog marker.

    Notes:
    - The first line is never merged into anything, however short it is.
    - A current line with no visible text is dropped rather than merged into.
    - The SubtitleLine objects are modified in place (text and end).

    :param lines: the subtitle lines to process, in display order
    :param short_duration: lines shorter than this are merged into the previous line
    :return: the list of lines that remain after merging
    """
    if not lines:
        return []

    merged_lines = []
    current_line = lines[0]

    for line in lines[1:]:
        if not current_line.text.strip():
            # Nothing to merge into, so discard the blank line and continue from this one
            current_line = line
            continue

        if line.duration < short_duration:
            # sentence_end_punctuation is a regex character class, so it must be matched as a
            # pattern: a plain `in` test would also accept the literal "[" and "]" characters.
            # Trailing whitespace is ignored (the guard above ensures the stripped text is non-empty).
            last_char = current_line.text.rstrip()[-1]

            if regex.fullmatch(sentence_end_punctuation, last_char):
                # If the line ends with a sentence-ending punctuation mark, assume different speakers (questionable logic).
                # Only add the marker to the current line once, so merging several short lines doesn't stack markers.
                prefix = "" if current_line.text.startswith(dialog_marker) else dialog_marker
                current_line.text = f"{prefix}{current_line.text}\n{dialog_marker}{line.text}"
            else:
                current_line.text = f"{current_line.text}\n{line.text}"

            # The merged line now covers the short line's time span as well
            current_line.end = line.end
        else:
            merged_lines.append(current_line)
            current_line = line

    merged_lines.append(current_line)
    return merged_lines

def _compile_split_sequences(self):
self._compiled_split_sequences = [regex.compile(seq) for seq in self.split_sequences]

Expand Down

0 comments on commit 1f37307

Please sign in to comment.