promote quotes

unfoldingWord-dev · May 31, 2023 · b16ccda · b16ccda
1 parent 40ef5f9
commit b16ccda
Show file tree

Hide file tree

Showing 3 changed files with 162 additions and 66 deletions.
diff --git a/usfm/doublequotes.py b/usfm/doublequotes.py
@@ -0,0 +1,71 @@
+# -*- coding: utf-8 -*-
+# Used by usfm_cleanup.py.
+# Substitutions in this file convert straight double quotes to curly double quotes.
+# To be used in languages where the single quote (apostrophe) is a word-forming character.
+# These substitutions are applied after some regular expression replacements have been made.
+
+# Module-level tables read by promoteQuotes() below.
+import re
+
+# Translation tables: a straight double quote maps to an opening or to a closing curly quote.
+opentrans = str.maketrans('"', '“')
+closetrans = str.maketrans('"', '”')
+
+# Context patterns. Group 1 (and group 2 in quote0_re) captures a run of straight double quotes.
+quote0_re = re.compile(r'\s("+)[\w\']+("+)\s')   # whitespace, quotes, one word, quotes, whitespace
+quote1_re = re.compile(r' ("+)[\w\']')           # SPACE quotes word-character => open quotes
+quote2_re = re.compile(r': ("+)')                # colon SPACE quotes => open quotes
+quote3_re = re.compile(r'[,;]("+) ')             # comma/semicolon quotes SPACE => close quotes
+quote4_re = re.compile(r'[\.!\?]("+)')           # period/bang/question quotes => close quotes
+quote5_re = re.compile(r'[\w\']("+) *\n')        # word-character quotes (spaces) EOL => close quotes
+
+# subs is a list of (old, new) string pairs: a straight quote that touches a
+# curly quote is replaced by a curly quote of the same direction.
+subs = [
+    # Opening quote marks
+    ('"“', '““'),
+    ('“"', '““'),
+    # Closing quote marks
+    ('"”', '””'),
+    ('”"', '””'),
+]
+
+# Changes straight quotes to curly quotes where context suggests with very high confidence.
+def promoteQuotes(str):
+    """Return the text with straight quotes promoted to curly quotes.
+
+    Only quote runs whose neighbouring characters match one of the
+    module-level quoteN_re patterns are changed; each matched run is mapped
+    through opentrans or closetrans.  Afterwards every (old, new) pair in
+    subs is applied with str.replace().
+
+    str -- the text to convert (the name shadows the builtin inside this function)
+    """
+    text = str
+
+    # Pass 0: a single word wrapped in exactly one identical quote on each side.
+    # The scan resumes after each match, so a rejected match is never retried.
+    cursor = 0
+    while True:
+        found = quote0_re.search(text, cursor)
+        if found is None:
+            break
+        lead, trail = found.group(1), found.group(2)
+        if len(lead) == 1 and lead == trail:
+            first = found.start() + 1   # index of the opening quote
+            last = found.end() - 2      # index of the closing quote
+            text = (text[:first] + lead.translate(opentrans)
+                    + text[first + 1:last]
+                    + trail.translate(closetrans) + text[last + 1:])
+        cursor = found.end()
+
+    # Passes 1-5: (pattern, offset from match start to the first quote, table).
+    passes = (
+        (quote1_re, 1, opentrans),
+        (quote2_re, 2, opentrans),
+        (quote3_re, 1, closetrans),
+        (quote4_re, 1, closetrans),
+        (quote5_re, 1, closetrans),
+    )
+    for pattern, skip, table in passes:
+        # Each search restarts at the beginning of the text; a run that has
+        # been translated no longer contains straight quotes, so it cannot
+        # match the same pattern again.
+        found = pattern.search(text)
+        while found is not None:
+            begin = found.start() + skip
+            text = text[:begin] + found.group(1).translate(table) + text[found.end(1):]
+            found = pattern.search(text)
+
+    # Final literal replacements for straight quotes adjacent to curly quotes.
+    for old, new in subs:
+        text = text.replace(old, new)
+    return text
diff --git a/usfm/quotes.py b/usfm/quotes.py
@@ -0,0 +1,74 @@
+# -*- coding: utf-8 -*-
+# Used by usfm_cleanup.py.
+# Substitutions in this file convert straight quotes to curly quotes.
+# These substitutions are applied after some regular expression replacements have been made.
+
+# re must be imported here: the patterns below are compiled when this module is imported.
+import re
+
+# subs is a list of (old, new) tuples to be used for string substitutions.
+# A straight quote that touches a curly quote is replaced by a curly quote
+# of the same direction.
+subs = [
+    # Convert open quote marks
+    ("'“", "‘“"),
+    ("“'", "“‘"),
+    ('‘"', '‘“'),
+    ('"‘', '“‘'),
+
+    # Convert closing quote marks
+    ("'”", "’”"),
+    ("”'", "”’"),
+    ('’"', '’”'),
+    ('"’', '”’'),
+]
+
+# Context patterns. Group 1 (and group 2 in quote0_re) captures a run of straight quotes.
+quote0_re = re.compile(r'\s([\'"]+)[\w]+([\'"]+)\s')     # a single word in quotes
+quote1_re = re.compile(r' ([\'"]+)\w')     # SPACE quotes word => open quotes
+# The "+" belongs inside the group so that group 1 holds the whole run of
+# quotes; with "([\'"])+" only the last quote of the run was captured.
+quote2_re = re.compile(r': ([\'"]+)')     # colon SPACE quotes => open quotes
+quote3_re = re.compile(r'[,;]([\'"]+) ')     # comma/semicolon quotes SPACE => close quotes
+quote4_re = re.compile(r'[\.!\?]([\'"]+)')     # period/bang/question quotes => close quotes
+quote5_re = re.compile(r'\w([\'"]+) *\n')        # word quotes EOL => close quotes
+
+# Translation tables: straight single/double quotes to opening or closing curly quotes.
+opentrans = str.maketrans('\'"', "‘“")
+closetrans = str.maketrans('\'"', '’”')
+
+# Changes straight quotes to curly quotes where context suggests with very high confidence.
+def promoteQuotes(str):
+    """Return the text with straight quotes promoted to curly quotes.
+
+    Only quote runs whose neighbouring characters match one of the
+    module-level quoteN_re patterns are changed; each matched run is mapped
+    through opentrans or closetrans.  Afterwards every (old, new) pair in
+    subs is applied with str.replace().
+
+    str -- the text to convert (the name shadows the builtin inside this function)
+    """
+    text = str
+
+    # Pass 0: a single word wrapped in exactly one identical quote on each side.
+    # The scan resumes after each match, so a rejected match is never retried.
+    cursor = 0
+    while True:
+        found = quote0_re.search(text, cursor)
+        if found is None:
+            break
+        lead, trail = found.group(1), found.group(2)
+        if len(lead) == 1 and lead == trail:
+            first = found.start() + 1   # index of the opening quote
+            last = found.end() - 2      # index of the closing quote
+            text = (text[:first] + lead.translate(opentrans)
+                    + text[first + 1:last]
+                    + trail.translate(closetrans) + text[last + 1:])
+        cursor = found.end()
+
+    # Passes 1-5: (pattern, offset from match start to the first quote, table).
+    # The replaced slice runs from that offset to the end of the captured
+    # quotes, i.e. the same bounds the pattern-specific index arithmetic used.
+    passes = (
+        (quote1_re, 1, opentrans),
+        (quote2_re, 2, opentrans),
+        (quote3_re, 1, closetrans),
+        (quote4_re, 1, closetrans),
+        (quote5_re, 1, closetrans),
+    )
+    for pattern, skip, table in passes:
+        # Each search restarts at the beginning of the text; once a run has
+        # been replaced it no longer contains straight quotes, so it cannot
+        # match the same pattern again.
+        found = pattern.search(text)
+        while found is not None:
+            begin = found.start() + skip
+            text = text[:begin] + found.group(1).translate(table) + text[found.end(1):]
+            found = pattern.search(text)
+
+    # Final literal replacements for straight quotes adjacent to curly quotes.
+    for old, new in subs:
+        text = text.replace(old, new)
+    return text
diff --git a/usfm/usfm_cleanup.py b/usfm/usfm_cleanup.py
@@ -6,16 +6,10 @@
 # Moves standalone \p \m and \q markers which occur just before an \s# marker
 #    to the next line after the \s# marker.
 
-import re       # regular expression module
-import io
-import os
-import quotes
-import shutil
-import substitutions
-import sys
-
-# Globals
-source_dir = r"C:\DCS\Malagasy\plt_ulb.lv"
+# Set these globals
+source_dir = r"C:\DCS\Kubu\work\41-MAT.usfm"
+promote_all_quotes = True
+promote_double_quotes = True
 
 nChanged = 0
 max_changes = 66
@@ -27,6 +21,15 @@
 aligned_usfm = False
 remove_s5 = True
 
+import re       # regular expression module
+import io
+import os
+import shutil
+import sys
+import substitutions
+import quotes
+import doublequotes
+
 
 def shortname(longpath):
     shortname = longpath
@@ -112,60 +115,6 @@ def add_spaces(str):
             found = sub_re.search(str)
     return str
 
-quote0_re = re.compile(r'\s([\'"]+)[\w]+([\'"]+)\s')     # a single word in quotes
-quote1_re = re.compile(r' ([\'"]+)\w')     # SPACE quotes word => open quotes
-quote2_re = re.compile(r': ([\'"])+')     # colon SPACE quotes => open quotes
-quote3_re = re.compile(r'[,;]([\'"]+) ')     # comma/semicolon quotes SPACE => close quotes
-quote4_re = re.compile(r'[\.!\?]([\'"]+)')     # period/bang/question quotes => close quotes
-quote5_re = re.compile(r'\w([\'"]+) *\n')        # word quotes EOL
-opentrans = str.maketrans('\'"', "‘“")
-closetrans = str.maketrans('\'"', '’”')
-
-# Changes straight quotes to curly quotes where context suggests with very high confidence.
-def promoteQuotes(str):
-    pos = 0
-    snippet = quote0_re.search(str, pos)
-    while snippet:
-        # if len(snippet.group(1)) == 1 and len(snippet.group(1)) == 1:       # TEMPORARY!!!!!!
-        if snippet.group(1) == snippet.group(2) and len(snippet.group(1)) == 1:
-            (i,j) = (snippet.start()+1, snippet.end()-1)
-            str = str[0:i] + snippet.group(1).translate(opentrans) + str[i+1:j-1] + snippet.group(2).translate(closetrans) + str[j:]
-        pos = snippet.end()
-        snippet = quote0_re.search(str, pos)
-
-    snippet = quote1_re.search(str)
-    while snippet:
-        (i,j) = (snippet.start()+1, snippet.end()-1)
-        str = str[0:i] + snippet.group(1).translate(opentrans) + str[j:]
-        snippet = quote1_re.search(str)
-
-    snippet = quote2_re.search(str)
-    while snippet:
-        (i,j) = (snippet.start()+2, snippet.end())
-        str = str[0:i] + snippet.group(1).translate(opentrans) + str[j:]
-        snippet = quote2_re.search(str)
-
-    snippet = quote3_re.search(str)
-    while snippet:
-        (i,j) = (snippet.start()+1, snippet.end()-1)
-        str = str[0:i] + snippet.group(1).translate(closetrans) + str[j:]
-        snippet = quote3_re.search(str)
-
-    snippet = quote4_re.search(str)
-    while snippet:
-        (i,j) = (snippet.start()+1, snippet.end())
-        str = str[0:i] + snippet.group(1).translate(closetrans) + str[j:]
-        snippet = quote4_re.search(str)
-
-    snippet = quote5_re.search(str)
-    while snippet:
-        (i,j) = (snippet.start()+1, snippet.start() + 1 + len(snippet.group(1)))
-        str = str[0:i] + snippet.group(1).translate(closetrans) + str[j:]
-        snippet = quote5_re.search(str)
-
-    for pair in quotes.subs:
-        str = str.replace(pair[0], pair[1])
-    return str
 
 # Rewrites file and returns True if any changes are made.
 def convert_wholefile(path):
@@ -187,8 +136,10 @@ def convert_wholefile(path):
         alltext = fix_punctuation(alltext)
     if enable_add_spaces and not aligned_usfm:
         alltext = add_spaces(alltext)
-    if promote_quotes and not aligned_usfm:
-        alltext = promoteQuotes(alltext)
+    if promote_all_quotes and not aligned_usfm:
+        alltext = quotes.promoteQuotes(alltext)
+    elif promote_double_quotes and not aligned_usfm:
+        alltext = doublequotes.promoteQuotes(alltext)
     if alltext != origtext:
         output = io.open(path, "tw", buffering=1, encoding='utf-8', newline='\n')
         output.write(alltext)