From 1799567cec4d8a0503bc5f0d2d6e4b166c5313d5 Mon Sep 17 00:00:00 2001 From: Larry Versaw Date: Thu, 20 Jan 2022 11:58:30 -0700 Subject: [PATCH] better unalign --- usfm/usfm_utils.py | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/usfm/usfm_utils.py b/usfm/usfm_utils.py index c555fc1de..0e387b33f 100644 --- a/usfm/usfm_utils.py +++ b/usfm/usfm_utils.py @@ -3,15 +3,19 @@ from __future__ import unicode_literals import re -def usfm3_to_usfm2(usfm): + +def unalign_usfm(aligned_usfm): """ - Converts a USFM 3 string to a USFM 2 compatible string - :param usfm3: - :return: the USFM 2 version of the string + Converts an aligned USFM string to an unaligned USFM compatible string + :param aligned_usfm: + :return: the unaligned USFM of the string """ - # Kind of usfm3 to usfm2 - usfm = re.sub(r'\\zaln-s[^\*]*\*', r'', usfm, flags=re.UNICODE | re.MULTILINE) + # Remove all tags used for alignments and words + usfm = re.sub(r'\\ts(-s)*\s*\\\*\s*', r'', aligned_usfm, flags=re.UNICODE | re.MULTILINE) + usfm = re.sub(r'\\zaln-s[^*]*?\*', r'', usfm, flags=re.UNICODE | re.MULTILINE) usfm = re.sub(r'\\zaln-e\\\*', r'', usfm, flags=re.UNICODE | re.MULTILINE) + usfm = re.sub(r'\\k-s.*?\\\*', r'', usfm, flags=re.UNICODE | re.MULTILINE) + usfm = re.sub(r'\\k-e\\\*', r'', usfm, flags=re.UNICODE | re.MULTILINE) usfm = re.sub(r'\\w ([^|]+)\|.*?\\w\*', r'\1', usfm, flags=re.UNICODE | re.MULTILINE) usfm = re.sub(r'^\n', '', usfm, flags=re.UNICODE | re.MULTILINE) usfm = re.sub(r'^([^\\].*)\n(?=[^\\])', r'\1 ', usfm, flags=re.UNICODE | re.MULTILINE) @@ -22,14 +26,14 @@ def usfm3_to_usfm2(usfm): usfm = re.sub(r"\s*' s(?!\w)", "'s", usfm, flags=re.UNICODE | re.MULTILINE) usfm = re.sub(r'\\s5', '', usfm, flags=re.UNICODE | re.MULTILINE) usfm = re.sub(r'\\fqa([^*]+)\\fqa(?![*])', r'\\fqa\1\\fqa*', usfm, flags=re.UNICODE | re.MULTILINE) - + # Pair up quotes by chapter - chapters = re.compile(r'\\c').split(usfm) + chapters = re.compile(r'\\c ').split(usfm) usfm = chapters[0] for chapter in chapters[1:]: - chapter = re.sub(r'\s*"\s*([^"]+)\s*"\s*', r' "\1" ', chapter, flags=re.UNICODE | re.MULTILINE | re.DOTALL) - usfm += '\c'+chapter - usfm = re.sub(r'\\(\w+\**)([^\w* \n])', r'\\\1 \2', usfm, flags=re.UNICODE | re.MULTILINE) # \\q1" => \q1 " + chapter = re.sub(r'[ \t]*"([^"]+)"[ \t]*', r' "\1" ', chapter, flags=re.UNICODE | re.MULTILINE | re.DOTALL) + usfm += '\\c {0}'.format(chapter) + usfm = re.sub(r'\\(\w+\**)([^\w* \n])', r'\\\1 \2', usfm, flags=re.UNICODE | re.MULTILINE) # \\q1" => \q1 " usfm = re.sub(r" ' ", r" '", usfm, flags=re.UNICODE | re.MULTILINE) usfm = re.sub(r' +([:;.?,!\]})-])', r'\1', usfm, flags=re.UNICODE | re.MULTILINE) usfm = re.sub(r'([{(\[-]) +', r'\1', usfm, flags=re.UNICODE | re.MULTILINE)