Skip to content

Commit

Permalink
promote quotes
Browse files Browse the repository at this point in the history
  • Loading branch information
lversaw committed May 31, 2023
1 parent 40ef5f9 commit b16ccda
Show file tree
Hide file tree
Showing 3 changed files with 162 additions and 66 deletions.
71 changes: 71 additions & 0 deletions usfm/doublequotes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
# -*- coding: utf-8 -*-
# Used by usfm_cleanup.py.
# Substitutions in this file convert straight double quotes to curly double quotes.
# To be used in languages where the single quote (apostrophe) is a word-forming character.
# These substitutions are applied after some regular expression replacements have been made.

import re

# Literal (old, new) replacement pairs. promoteQuotes() applies them in order
# with str.replace() once all of the pattern-driven conversions are done.
subs = [
    # A straight quote touching an opening curly quote becomes an opening quote.
    ('"“', '““'),
    ('“"', '““'),
    # A straight quote touching a closing curly quote becomes a closing quote.
    ('"”', '””'),
    ('”"', '””'),
]

# Context patterns. Group 1 of each pattern captures a run of straight double
# quotes; quote0_re additionally captures the trailing run in group 2.
# The apostrophe is accepted wherever a word character is accepted.

# a single word in quotes, with whitespace on both sides
quote0_re = re.compile(r'\s("+)[\w\']+("+)\s')
# SPACE quotes word => open quotes
quote1_re = re.compile(r' ("+)[\w\']')
# colon SPACE quotes => open quotes
quote2_re = re.compile(r': ("+)')
# comma/semicolon quotes SPACE => close quotes
quote3_re = re.compile(r'[,;]("+) ')
# period/bang/question quotes => close quotes
quote4_re = re.compile(r'[\.!\?]("+)')
# word quotes EOL (optional spaces before the newline) => close quotes
quote5_re = re.compile(r'[\w\']("+) *\n')

# Translation tables: straight double quote -> opening / closing curly quote.
opentrans = str.maketrans('"', '“')
closetrans = str.maketrans('"', '”')

# Changes straight quotes to curly quotes where context suggests with very high confidence.
def promoteQuotes(str):
    """Return the text with straight double quotes made curly where context allows.

    The stages run in this order, each over the output of the previous one:

    1. quote0_re: a single word with exactly one straight quote on each
       side becomes an opening quote, the word, a closing quote.
    2. quote1_re, quote2_re: quote runs in an opening context are
       translated with opentrans.
    3. quote3_re, quote4_re, quote5_re: quote runs in a closing context
       are translated with closetrans.
    4. Every (old, new) pair in subs is applied with str.replace().

    Quotes that match none of these contexts are returned unchanged.

    Each stage is a single Pattern.sub() pass. The result is the same as
    searching again from the start of the text after every replacement,
    because a converted quote can never take part in another match, but the
    cost no longer grows with (number of matches x length of text).
    """
    # The parameter name shadows the builtin str; it is kept because it is
    # part of the signature. Work through a local alias instead.
    text = str

    def curl_single_word(match):
        # Only act on a lone, identical quote on each side of the word.
        opener, closer = match.group(1), match.group(2)
        whole = match.group(0)
        if opener != closer or len(opener) != 1:
            return whole
        # whole is: whitespace, quote, word, quote, whitespace
        return (whole[0] + opener.translate(opentrans) + whole[2:-2]
                + closer.translate(closetrans) + whole[-1])

    def curl_run(table):
        # The tables map only the straight double quote, and the context
        # characters of these patterns are never a double quote, so
        # translating the whole match changes nothing but the quote run.
        return lambda match: match.group(0).translate(table)

    text = quote0_re.sub(curl_single_word, text)

    stages = (
        (quote1_re, opentrans),
        (quote2_re, opentrans),
        (quote3_re, closetrans),
        (quote4_re, closetrans),
        (quote5_re, closetrans),
    )
    for pattern, table in stages:
        text = pattern.sub(curl_run(table), text)

    for old, new in subs:
        text = text.replace(old, new)
    return text
74 changes: 74 additions & 0 deletions usfm/quotes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
# -*- coding: utf-8 -*-
# Used by usfm_cleanup.py.
# Substitutions in this file convert straight quotes to curly quotes.
# These substitutions are applied after some regular expression replacements have been made.

# The patterns below need the re module; without this import the module
# fails with NameError as soon as it is imported.
import re

# subs is a list of (old, new) tuples to be used for string substitutions.
# promoteQuotes() applies them in order with str.replace() as its last step.
subs = [
    # Convert open quote marks: a straight quote touching an opening curly
    # quote becomes the matching opening quote.
    ("'“", "‘“"),
    ("“'", "“‘"),
    ('‘"', '‘“'),
    ('"‘', '“‘'),

    # Convert closing quote marks: a straight quote touching a closing curly
    # quote becomes the matching closing quote.
    ("'”", "’”"),
    ("”'", "”’"),
    ('’"', '’”'),
    ('"’', '”’'),
]

# Context patterns. Group 1 of each pattern captures the whole run of
# straight quotes; quote0_re also captures the trailing run in group 2.

# a single word in quotes, with whitespace on both sides
quote0_re = re.compile(r'\s([\'"]+)[\w]+([\'"]+)\s')
# SPACE quotes word => open quotes
quote1_re = re.compile(r' ([\'"]+)\w')
# colon SPACE quotes => open quotes
# The repetition is inside the group: with ([\'"])+ the group would hold only
# the last quote of the run, and the other quotes would be lost on replacement.
quote2_re = re.compile(r': ([\'"]+)')
# comma/semicolon quotes SPACE => close quotes
quote3_re = re.compile(r'[,;]([\'"]+) ')
# period/bang/question quotes => close quotes
quote4_re = re.compile(r'[\.!\?]([\'"]+)')
# word quotes EOL (optional spaces before the newline)
quote5_re = re.compile(r'\w([\'"]+) *\n')

# Translation tables: straight single/double quote -> opening / closing curly quote.
opentrans = str.maketrans('\'"', "‘“")
closetrans = str.maketrans('\'"', '’”')

# Changes straight quotes to curly quotes where context suggests with very high confidence.
def promoteQuotes(str):
    """Return the text with straight quotes made curly where context allows.

    The stages run in this order, each over the output of the previous one:

    1. quote0_re: a single word with exactly one identical straight quote
       on each side becomes an opening quote, the word, a closing quote.
    2. quote1_re, quote2_re: quote runs in an opening context are
       translated with opentrans.
    3. quote3_re, quote4_re, quote5_re: quote runs in a closing context
       are translated with closetrans.
    4. Every (old, new) pair in subs is applied with str.replace().

    Quotes that match none of these contexts are returned unchanged.

    Stages 2 and 3 translate the whole matched text instead of splicing
    match.group(1) over the run. A group that the pattern repeats reports
    only its last repetition, so splicing it in would delete every other
    quote of a run such as a double quote followed by a single quote.
    The context characters of these patterns are never quotes, so only the
    quote run is changed.

    Each stage is a single Pattern.sub() pass rather than a fresh search from
    the start of the text after every replacement, so the cost no longer
    grows with (number of matches x length of text).
    """
    # The parameter name shadows the builtin str; it is kept because it is
    # part of the signature. Work through a local alias instead.
    text = str

    def curl_single_word(match):
        # Only act on a lone, identical quote on each side of the word.
        opener, closer = match.group(1), match.group(2)
        whole = match.group(0)
        if opener != closer or len(opener) != 1:
            return whole
        # whole is: whitespace, quote, word, quote, whitespace
        return (whole[0] + opener.translate(opentrans) + whole[2:-2]
                + closer.translate(closetrans) + whole[-1])

    def curl_run(table):
        return lambda match: match.group(0).translate(table)

    text = quote0_re.sub(curl_single_word, text)

    stages = (
        (quote1_re, opentrans),
        (quote2_re, opentrans),
        (quote3_re, closetrans),
        (quote4_re, closetrans),
        (quote5_re, closetrans),
    )
    for pattern, table in stages:
        text = pattern.sub(curl_run(table), text)

    for old, new in subs:
        text = text.replace(old, new)
    return text
83 changes: 17 additions & 66 deletions usfm/usfm_cleanup.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,10 @@
# Moves standalone \p \m and \q markers which occur just before an \s# marker
# to the next line after the \s# marker.

import re # regular expression module
import io
import os
import quotes
import shutil
import substitutions
import sys

# Globals
source_dir = r"C:\DCS\Malagasy\plt_ulb.lv"
# Set these globals
source_dir = r"C:\DCS\Kubu\work\41-MAT.usfm"
promote_all_quotes = True
promote_double_quotes = True

nChanged = 0
max_changes = 66
Expand All @@ -27,6 +21,15 @@
aligned_usfm = False
remove_s5 = True

import re # regular expression module
import io
import os
import shutil
import sys
import substitutions
import quotes
import doublequotes


def shortname(longpath):
shortname = longpath
Expand Down Expand Up @@ -112,60 +115,6 @@ def add_spaces(str):
found = sub_re.search(str)
return str

# Context patterns. Group 1 of each pattern captures the whole run of
# straight quotes; quote0_re also captures the trailing run in group 2.

# a single word in quotes, with whitespace on both sides
quote0_re = re.compile(r'\s([\'"]+)[\w]+([\'"]+)\s')
# SPACE quotes word => open quotes
quote1_re = re.compile(r' ([\'"]+)\w')
# colon SPACE quotes => open quotes
# The repetition is inside the group: with ([\'"])+ the group would hold only
# the last quote of the run, and the other quotes would be lost on replacement.
quote2_re = re.compile(r': ([\'"]+)')
# comma/semicolon quotes SPACE => close quotes
quote3_re = re.compile(r'[,;]([\'"]+) ')
# period/bang/question quotes => close quotes
quote4_re = re.compile(r'[\.!\?]([\'"]+)')
# word quotes EOL (optional spaces before the newline)
quote5_re = re.compile(r'\w([\'"]+) *\n')

# Translation tables: straight single/double quote -> opening / closing curly quote.
opentrans = str.maketrans('\'"', "‘“")
closetrans = str.maketrans('\'"', '’”')

# Changes straight quotes to curly quotes where context suggests with very high confidence.
def promoteQuotes(str):
    """Return the text with straight quotes made curly where context allows.

    The stages run in this order, each over the output of the previous one:

    1. quote0_re: a single word with exactly one identical straight quote
       on each side becomes an opening quote, the word, a closing quote.
    2. quote1_re, quote2_re: quote runs in an opening context are
       translated with opentrans.
    3. quote3_re, quote4_re, quote5_re: quote runs in a closing context
       are translated with closetrans.
    4. Every (old, new) pair in quotes.subs is applied with str.replace().

    Quotes that match none of these contexts are returned unchanged.

    Stages 2 and 3 translate the whole matched text instead of splicing
    match.group(1) over the run. A group that the pattern repeats reports
    only its last repetition, so splicing it in would delete every other
    quote of a run. The context characters of these patterns are never
    quotes, so only the quote run is changed.

    Each stage is a single Pattern.sub() pass rather than a fresh search from
    the start of the text after every replacement.
    """
    # The parameter name shadows the builtin str; it is kept because it is
    # part of the signature. Work through a local alias instead.
    text = str

    def curl_single_word(match):
        # Only act on a lone, identical quote on each side of the word.
        opener, closer = match.group(1), match.group(2)
        whole = match.group(0)
        if opener != closer or len(opener) != 1:
            return whole
        # whole is: whitespace, quote, word, quote, whitespace
        return (whole[0] + opener.translate(opentrans) + whole[2:-2]
                + closer.translate(closetrans) + whole[-1])

    def curl_run(table):
        return lambda match: match.group(0).translate(table)

    text = quote0_re.sub(curl_single_word, text)

    stages = (
        (quote1_re, opentrans),
        (quote2_re, opentrans),
        (quote3_re, closetrans),
        (quote4_re, closetrans),
        (quote5_re, closetrans),
    )
    for pattern, table in stages:
        text = pattern.sub(curl_run(table), text)

    for old, new in quotes.subs:
        text = text.replace(old, new)
    return text

# Rewrites file and returns True if any changes are made.
def convert_wholefile(path):
Expand All @@ -187,8 +136,10 @@ def convert_wholefile(path):
alltext = fix_punctuation(alltext)
if enable_add_spaces and not aligned_usfm:
alltext = add_spaces(alltext)
if promote_quotes and not aligned_usfm:
alltext = promoteQuotes(alltext)
if promote_all_quotes and not aligned_usfm:
alltext = quotes.promoteQuotes(alltext)
elif promote_double_quotes and not aligned_usfm:
alltext = doublequotes.promoteQuotes(alltext)
if alltext != origtext:
output = io.open(path, "tw", buffering=1, encoding='utf-8', newline='\n')
output.write(alltext)
Expand Down

0 comments on commit b16ccda

Please sign in to comment.