From 1f37307fb2a2357f93cb95c4ebb89beb1060301c Mon Sep 17 00:00:00 2001
From: machinewrapped <machinewrapped@users.noreply.github.com>
Date: Sun, 2 Jun 2024 14:06:34 +0200
Subject: [PATCH] Optionally merge very short lines in pre-process

... this may prove to be a bad idea.
---
 GUI/SettingsDialog.py           |  1 +
 PySubtitle/Helpers/Text.py      |  1 +
 PySubtitle/Options.py           |  1 +
 PySubtitle/SubtitleProcessor.py | 35 +++++++++++++++++++++++++++++++++
 4 files changed, 38 insertions(+)

diff --git a/GUI/SettingsDialog.py b/GUI/SettingsDialog.py
index cd2f124..c49c007 100644
--- a/GUI/SettingsDialog.py
+++ b/GUI/SettingsDialog.py
@@ -47,6 +47,7 @@ class SettingsDialog(QDialog):
             'save_preprocessed_subtitles': (bool, "Save preprocessed subtitles to a separate file"),
             'max_line_duration': (float, "Maximum duration of a single line of subtitles"),
             'min_line_duration': (float, "Minimum duration of a single line of subtitles"),
+            'merge_line_duration': (float, "Merge lines with a duration less than this with the previous line"),
             'min_split_chars': (int, "Minimum number of characters to split a line at"),
             'break_dialog_on_one_line': (bool, "Add line breaks to text with dialog markers"),
             'normalise_dialog_tags': (bool, "Ensure dialog markers match in multi-line subtitles"),
diff --git a/PySubtitle/Helpers/Text.py b/PySubtitle/Helpers/Text.py
index 74dd93a..7cc175f 100644
--- a/PySubtitle/Helpers/Text.py
+++ b/PySubtitle/Helpers/Text.py
@@ -2,6 +2,7 @@
 import regex
 
 common_punctuation = r"[.,!?;:…¡¿]"
+sentence_end_punctuation = r"[.!?…？！。﹑]"
 
 dialog_marker = "- "
 emdash = "—"
diff --git a/PySubtitle/Options.py b/PySubtitle/Options.py
index 24e62ec..073025c 100644
--- a/PySubtitle/Options.py
+++ b/PySubtitle/Options.py
@@ -45,6 +45,7 @@ def env_bool(key, default=False):
     'break_dialog_on_one_line': env_bool('break_dialog_on_one_line', True),
     'max_line_duration': float(os.getenv('MAX_LINE_DURATION', 4.0)),
     'min_line_duration': float(os.getenv('MIN_LINE_DURATION', 0.8)),
+    'merge_line_duration': float(os.getenv('MERGE_LINE_DURATION', 0.0)),
     'min_split_chars': int(os.getenv('MIN_SPLIT_CHARS', 3)),
     'normalise_dialog_tags': env_bool('NORMALISE_DIALOG_TAGS', True),
     'remove_filler_words': env_bool('REMOVE_FILLER_WORDS', True),
diff --git a/PySubtitle/SubtitleProcessor.py b/PySubtitle/SubtitleProcessor.py
index fa0cd67..8479171 100644
--- a/PySubtitle/SubtitleProcessor.py
+++ b/PySubtitle/SubtitleProcessor.py
@@ -7,6 +7,7 @@
     dialog_marker,
     split_sequences,
     break_sequences,
+    sentence_end_punctuation,
     BreakLongLine,
     BreakDialogOnOneLine,
     CompileDialogSplitPattern,
@@ -42,6 +43,7 @@ def __init__(self, settings : Options | dict):
 
         self.max_line_duration = timedelta(seconds = settings.get('max_line_duration', 0.0))
         self.min_line_duration = timedelta(seconds = settings.get('min_line_duration', 0.0))
+        self.merge_line_duration = timedelta(seconds = settings.get('merge_line_duration', 0.0))
         self.min_gap = timedelta(seconds=settings.get('min_gap', 0.05))
         self.min_split_chars = settings.get('min_split_chars', 4)
 
@@ -72,6 +74,9 @@ def PreprocessSubtitles(self, lines : list[SubtitleLine]):
         processed = []
         line_number = lines[0].number
 
+        if self.merge_line_duration.total_seconds() > 0.0:
+            lines = self._merge_short_lines(lines, self.merge_line_duration)
+
         for line in lines:
             line.number = line_number
             self._preprocess_line(line)
@@ -236,6 +241,36 @@ def _split_line_by_duration(self, line: SubtitleLine) -> list[SubtitleLine]:
 
         return result
 
+    def _merge_short_lines(self, lines : list[SubtitleLine], short_duration : timedelta) -> list[SubtitleLine]:
+        """
+        Merge lines shorter than short_duration into the previous line, which is extended and modified in place.
+        A pending line whose text is blank is discarded when another line follows it. Returns the merged list.
+        """
+        if not lines:
+            return []
+
+        def as_dialog(text : str) -> str:
+            # Only add a marker when it is missing, so merging several lines cannot produce "- - text"
+            return text if text.startswith(dialog_marker) else f"{dialog_marker}{text}"
+
+        merged_lines = []
+        current_line = lines[0]
+        for line in lines[1:]:
+            if not current_line.text.strip():
+                current_line = line
+            elif line.duration < short_duration:
+                # Sentence end => assume a new speaker (questionable). Match the regex class: `in` also accepts "[" or "]"
+                if regex.fullmatch(sentence_end_punctuation, current_line.text.rstrip()[-1]):
+                    current_line.text = f"{as_dialog(current_line.text)}\n{as_dialog(line.text)}"
+                else:
+                    current_line.text = f"{current_line.text}\n{line.text}"
+                current_line.end = line.end
+            else:
+                merged_lines.append(current_line)
+                current_line = line
+        merged_lines.append(current_line)
+        return merged_lines
+
     def _compile_split_sequences(self):
         self._compiled_split_sequences = [regex.compile(seq) for seq in self.split_sequences]