Skip to content

Commit

Permalink
better unalign
Browse files (browse the repository at this point in the history)
  • Loading branch information
lversaw committed Jan 20, 2022
1 parent aaade76 commit 1799567
Showing 1 changed file with 15 additions and 11 deletions.
26 changes: 15 additions & 11 deletions usfm/usfm_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,19 @@
from __future__ import unicode_literals
import re

def usfm3_to_usfm2(usfm):

def unalign_usfm(aligned_usfm):
"""
Converts a USFM 3 string to a USFM 2 compatible string
:param usfm3:
:return: the USFM 2 version of the string
Converts an aligned USFM string to an unaligned USFM compatible string
:param aligned_usfm:
:return: the unaligned USFM of the string
"""
# Kind of usfm3 to usfm2
usfm = re.sub(r'\\zaln-s[^\*]*\*', r'', usfm, flags=re.UNICODE | re.MULTILINE)
# Remove all tags used for alignments and words
usfm = re.sub(r'\\ts(-s)*\s*\\\*\s*', r'', aligned_usfm, flags=re.UNICODE | re.MULTILINE)
usfm = re.sub(r'\\zaln-s[^*]*?\*', r'', usfm, flags=re.UNICODE | re.MULTILINE)
usfm = re.sub(r'\\zaln-e\\\*', r'', usfm, flags=re.UNICODE | re.MULTILINE)
usfm = re.sub(r'\\k-s.*?\\\*', r'', usfm, flags=re.UNICODE | re.MULTILINE)
usfm = re.sub(r'\\k-e\\\*', r'', usfm, flags=re.UNICODE | re.MULTILINE)
usfm = re.sub(r'\\w ([^|]+)\|.*?\\w\*', r'\1', usfm, flags=re.UNICODE | re.MULTILINE)
usfm = re.sub(r'^\n', '', usfm, flags=re.UNICODE | re.MULTILINE)
usfm = re.sub(r'^([^\\].*)\n(?=[^\\])', r'\1 ', usfm, flags=re.UNICODE | re.MULTILINE)
Expand All @@ -22,14 +26,14 @@ def usfm3_to_usfm2(usfm):
usfm = re.sub(r"\s*' s(?!\w)", "'s", usfm, flags=re.UNICODE | re.MULTILINE)
usfm = re.sub(r'\\s5', '', usfm, flags=re.UNICODE | re.MULTILINE)
usfm = re.sub(r'\\fqa([^*]+)\\fqa(?![*])', r'\\fqa\1\\fqa*', usfm, flags=re.UNICODE | re.MULTILINE)

# Pair up quotes by chapter
chapters = re.compile(r'\\c').split(usfm)
chapters = re.compile(r'\\c ').split(usfm)
usfm = chapters[0]
for chapter in chapters[1:]:
chapter = re.sub(r'\s*"\s*([^"]+)\s*"\s*', r' "\1" ', chapter, flags=re.UNICODE | re.MULTILINE | re.DOTALL)
usfm += '\c'+chapter
usfm = re.sub(r'\\(\w+\**)([^\w* \n])', r'\\\1 \2', usfm, flags=re.UNICODE | re.MULTILINE) # \\q1" => \q1 "
chapter = re.sub(r'[ \t]*"([^"]+)"[ \t]*', r' "\1" ', chapter, flags=re.UNICODE | re.MULTILINE | re.DOTALL)
usfm += '\\c {0}'.format(chapter)
usfm = re.sub(r'\\(\w+\**)([^\w* \n])', r'\\\1 \2', usfm, flags=re.UNICODE | re.MULTILINE) # \\q1" => \q1 "
usfm = re.sub(r" ' ", r" '", usfm, flags=re.UNICODE | re.MULTILINE)
usfm = re.sub(r' +([:;.?,!\]})-])', r'\1', usfm, flags=re.UNICODE | re.MULTILINE)
usfm = re.sub(r'([{(\[-]) +', r'\1', usfm, flags=re.UNICODE | re.MULTILINE)
Expand Down

0 comments on commit 1799567

Please sign in to comment.