From 1f37307fb2a2357f93cb95c4ebb89beb1060301c Mon Sep 17 00:00:00 2001 From: machinewrapped Date: Sun, 2 Jun 2024 14:06:34 +0200 Subject: [PATCH] Optionally merge very short lines in pre-process ... this may prove to be a bad idea. --- GUI/SettingsDialog.py | 1 + PySubtitle/Helpers/Text.py | 1 + PySubtitle/Options.py | 1 + PySubtitle/SubtitleProcessor.py | 35 +++++++++++++++++++++++++++++++++ 4 files changed, 38 insertions(+) diff --git a/GUI/SettingsDialog.py b/GUI/SettingsDialog.py index cd2f124..c49c007 100644 --- a/GUI/SettingsDialog.py +++ b/GUI/SettingsDialog.py @@ -47,6 +47,7 @@ class SettingsDialog(QDialog): 'save_preprocessed_subtitles': (bool, "Save preprocessed subtitles to a separate file"), 'max_line_duration': (float, "Maximum duration of a single line of subtitles"), 'min_line_duration': (float, "Minimum duration of a single line of subtitles"), + 'merge_line_duration': (float, "Merge lines with a duration less than this with the previous line"), 'min_split_chars': (int, "Minimum number of characters to split a line at"), 'break_dialog_on_one_line': (bool, "Add line breaks to text with dialog markers"), 'normalise_dialog_tags': (bool, "Ensure dialog markers match in multi-line subtitles"), diff --git a/PySubtitle/Helpers/Text.py b/PySubtitle/Helpers/Text.py index 74dd93a..7cc175f 100644 --- a/PySubtitle/Helpers/Text.py +++ b/PySubtitle/Helpers/Text.py @@ -2,6 +2,7 @@ import regex common_punctuation = r"[.,!?;:…¡¿]" +sentence_end_punctuation = r"[.!?…?!。﹑]" dialog_marker = "- " emdash = "—" diff --git a/PySubtitle/Options.py b/PySubtitle/Options.py index 24e62ec..073025c 100644 --- a/PySubtitle/Options.py +++ b/PySubtitle/Options.py @@ -45,6 +45,7 @@ def env_bool(key, default=False): 'break_dialog_on_one_line': env_bool('break_dialog_on_one_line', True), 'max_line_duration': float(os.getenv('MAX_LINE_DURATION', 4.0)), 'min_line_duration': float(os.getenv('MIN_LINE_DURATION', 0.8)), + 'merge_line_duration': float(os.getenv('MERGE_LINE_DURATION', 0.0)), 
def _merge_short_lines(self, lines : list[SubtitleLine], short_duration : timedelta) -> list[SubtitleLine]:
    """
    Merge lines with very short durations into the previous line.

    Any line whose duration is less than short_duration has its text appended, on a new row,
    to the line currently being accumulated, and that line's end time is moved to the end of
    the merged line. If the accumulated text ends with sentence-ending punctuation the two
    parts are marked as dialog, on the assumption that a different speaker is talking.

    The surviving SubtitleLine objects are modified in place. A line with blank text that was
    not itself merged is dropped when another line follows it.

    :param lines: the subtitle lines to process, in order
    :param short_duration: lines with a duration below this are merged into their predecessor
    :return: the remaining lines (an empty list if no lines were supplied)
    """
    if not lines:
        return []

    # sentence_end_punctuation is a regex character class rather than a plain list of characters,
    # so it must be matched as a pattern - a substring test would also accept "[" and "]".
    sentence_end = regex.compile(f"{sentence_end_punctuation}$")

    def as_dialog(text : str) -> str:
        # Do not stack markers on text that is already dialog (e.g. the result of an earlier merge)
        return text if text.startswith(dialog_marker) else f"{dialog_marker}{text}"

    merged_lines = []
    current_line = lines[0]

    for line in lines[1:]:
        if not current_line.text.strip():
            # There is no text to merge into, so the blank line is discarded and this one takes over
            current_line = line
            continue

        if line.duration < short_duration:
            # Ignore trailing whitespace when deciding whether the accumulated text ends a sentence
            if sentence_end.search(current_line.text.rstrip()):
                # If the line ends with a sentence-ending punctuation mark, assume different speakers (questionable logic)
                current_line.text = f"{as_dialog(current_line.text)}\n{as_dialog(line.text)}"
            else:
                current_line.text = f"{current_line.text}\n{line.text}"

            # The merged line now covers the short line's time span as well
            current_line.end = line.end
        else:
            merged_lines.append(current_line)
            current_line = line

    merged_lines.append(current_line)
    return merged_lines