Skip to content

Commit

Permalink
Usj to usfm (#224)
Browse files Browse the repository at this point in the history
* Implement USJ to USFM conversion feature

* Add tests for USJ to USFM conversion

* Fix a bug noticed in USJ generation

* Add usage examples in README and python-notebook

* resolve conflicts

* Move usj-to-usfm conversion API into the main USFMParser class itself (in the constructor)

* Update documentation as per the API change

* Update tests as per the change in usj-to-usfm API
  • Loading branch information
kavitharaju authored Oct 18, 2023
1 parent e4347d7 commit bf8b42d
Show file tree
Hide file tree
Showing 8 changed files with 363 additions and 197 deletions.
273 changes: 124 additions & 149 deletions docs/API guide for python usfm_grammar.ipynb

Large diffs are not rendered by default.

50 changes: 45 additions & 5 deletions py-usfm-parser/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -86,10 +86,44 @@ print(table_output)
```

To round trip with USJ
```
from usfm_grammar import USFMParser, Filter
my_parser = USFMParser(input_usfm_str)
usj_obj = my_parser.to_usj()
my_parser2 = USFMParser(from_usj=usj_obj)
print(my_parser2.usfm)
```
:warning: The generated USFM may differ from the original USFM in three ways: 1. spaces and line breaks may change, 2. default attributes will be written out with their names, 3. closing markers may be newly added

To remove unwanted markers from USFM
```
from usfm_grammar import USFMParser, Filter, USFMGenerator
my_parser = USFMParser(input_usfm_str)
usj_obj = my_parser.to_usj(include_markers=Filter.BCV+Filter.TEXT)
my_parser2 = USFMParser(from_usj=usj_obj)
print(my_parser2.usfm)
```
USJ to USX or Table
```
from usfm_grammar import USFMParser, Filter
my_parser = USFMParser(input_usfm_str)
usj_obj = my_parser.to_usj()
my_parser2 = USFMParser(from_usj=usj_obj)
print(my_parser2.to_usx())
# print(my_parser2.to_list())
```
### From CLI

```
usage: usfm-grammar [-h] [--format {json,table,syntax-tree,usx,markdown}]
usage: usfm-grammar [-h] [--in_format {usfm,usj}]
[--out_format {usj,table,syntax-tree,usx,markdown,usfm}]
[--include_markers {book_headers,titles,...}]
[--exclude_markers {book_headers,titles,...}]
[--csv_col_sep CSV_COL_SEP] [--csv_row_sep CSV_ROW_SEP]
Expand All @@ -100,11 +134,13 @@ Uses the tree-sitter-usfm grammar to parse and convert USFM to Syntax-tree,
JSON, CSV, USX etc.
positional arguments:
infile input usfm file
infile input usfm or usj file
options:
-h, --help show this help message and exit
--format {json,table,syntax-tree,usx,markdown}
--in_format {usfm,usj}
input file format
--out_format {usj,table,syntax-tree,usx,markdown,usfm}
output format
--include_markers {book_headers,titles,comments,paragraphs,characters,notes,study_bible,bcv,text,ide,usfm,h,toc,toca,imt,is,ip,ipi,im,imi,ipq,imq,ipr,iq,ib,ili,iot,io,iex,imte,ie,mt,mte,cl,cd,ms,mr,s,sr,r,d,sp,sd,sts,rem,lit,restore,p,m,po,pr,cls,pmo,pm,pmc,pmr,pi,mi,nb,pc,ph,q,qr,qc,qa,qm,qd,lh,li,lf,lim,litl,tr,tc,th,tcr,thr,table,b,add,bk,dc,ior,iqt,k,litl,nd,ord,pn,png,qac,qs,qt,rq,sig,sls,tl,wj,em,bd,bdit,it,no,sc,sup,rb,pro,w,wh,wa,wg,lik,liv,jmp,f,fe,ef,efe,x,ex,fr,ft,fk,fq,fqa,fl,fw,fp,fv,fdc,xo,xop,xt,xta,xk,xq,xot,xnt,xdc,esb,cat,id,c,v,text-in-excluded-parent}
the list of of contents to be included
Expand All @@ -123,11 +159,15 @@ options:
```
Example
```
>>> python3 -m usfm_grammar sample.usfm --format usx
>>> python3 -m usfm_grammar sample.usfm --out_format usx
>>> usfm-grammar sample.usfm --format usx
>>> usfm-grammar sample.usfm
>>> usfm-grammar sample.usfm --out_format usx
>>> usfm-grammar sample.usfm --include_markers bcv --include_markers text --include_markers s
>>> usfm-grammar sample-usj.json --out_format usfm
```

### Filtering on USJ
Expand Down
1 change: 1 addition & 0 deletions py-usfm-parser/src/usfm_grammar/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
'''Entry point of the package with its public values'''

from usfm_grammar import usfm_parser
from usfm_grammar import usfm_generator

Filter = usfm_parser.Filter
Format = usfm_parser.Format
Expand Down
91 changes: 58 additions & 33 deletions py-usfm-parser/src/usfm_grammar/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,60 @@
for member in Filter:
all_markers += member.value

def handle_input_file(arg_parser):
    '''Read the input file named on the command line and build a parser from it.

    The content is treated as USJ (JSON) when ``--in_format`` is ``usj`` or
    when the file name ends in ``.json``/``.usj`` (compared case-insensitively).
    Otherwise, when ``--in_format`` is ``usfm``, it is treated as USFM text.

    Args:
        arg_parser: the configured ``argparse.ArgumentParser``; its parsed
            arguments must provide ``infile`` and ``in_format``.

    Returns:
        A ``USFMParser`` initialised from the file's USFM or USJ content.

    Raises:
        Exception: if the input format is neither USFM nor USJ.
    '''
    # Parse the command line once instead of once per option.
    args = arg_parser.parse_args()
    infile = args.infile
    input_format = args.in_format
    with open(infile, 'r', encoding='utf-8') as in_file:
        file_content = in_file.read()

    # Lower-case the extension so that e.g. "sample.JSON" is recognised too.
    extension = infile.split(".")[-1].lower()
    if input_format == Format.JSON or extension in ('json', 'usj'):
        return USFMParser(from_usj=json.loads(file_content))
    if input_format == Format.USFM:
        return USFMParser(file_content)
    raise Exception("Un-recognized input_format!")

def _expand_marker_options(markers):
    '''Expand Filter group names in a marker option list into individual markers.

    Returns None when the option was not given; otherwise a new list in which
    each item naming a Filter member (case-insensitively) is replaced by that
    member's markers, and every other item is lower-cased with backslashes
    removed.
    '''
    if markers is None:
        return None
    filter_names = {member.name for member in Filter}
    expanded = []
    for itm in markers:
        if itm.upper() in filter_names:
            # Filter members are lists, so this extends with all their markers
            expanded += Filter[itm.upper()]
        else:
            expanded.append(itm.lower().replace("\\", ""))
    return expanded


def handle_include_exclude_options(arg_parser):
    '''Process the --exclude_markers and --include_markers options.

    Args:
        arg_parser: the configured ``argparse.ArgumentParser``; its parsed
            arguments must provide ``exclude_markers`` and ``include_markers``.

    Returns:
        A tuple ``(exclude_markers, include_markers)``. Each element is None
        when the option was not supplied, else a list of marker names with
        Filter group names expanded.
    '''
    # Parse the command line once instead of once per option.
    args = arg_parser.parse_args()
    return (_expand_marker_options(args.exclude_markers),
            _expand_marker_options(args.include_markers))


def main(): #pylint: disable=too-many-locals
'''handles the command line requests'''
arg_parser = argparse.ArgumentParser(
description='Uses the tree-sitter-usfm grammar to parse and convert USFM to '+\
'Syntax-tree, JSON, CSV, USX etc.')
arg_parser.add_argument('infile', type=str, help='input usfm file')
arg_parser.add_argument('--format', type=str, help='output format',
arg_parser.add_argument('infile', type=str, help='input usfm or usj file')

arg_parser.add_argument('--in_format', type=str, help='input file format',
choices=[Format.USFM.value, Format.JSON.value],
default=Format.USFM.value)
arg_parser.add_argument('--out_format', type=str, help='output format',
choices=[itm.value for itm in Format],
default=Format.JSON.value)
arg_parser.add_argument('--include_markers', type=str,
Expand All @@ -44,52 +90,29 @@ def main(): #pylint: disable=too-many-locals
'from different components, or not',
action='store_true')

infile = arg_parser.parse_args().infile
output_format = arg_parser.parse_args().format
exclude_markers = arg_parser.parse_args().exclude_markers
include_markers = arg_parser.parse_args().include_markers

with open(infile, 'r', encoding='utf-8') as usfm_file:
file_content = usfm_file.read()

my_parser = USFMParser(file_content)
my_parser = handle_input_file(arg_parser)

if my_parser.errors and not arg_parser.parse_args().ignore_errors:
err_str = "\n\t".join([":".join(err) for err in my_parser.errors])
print(f"Errors present:\n\t{err_str}")
sys.exit(1)

filter_names = [member.name for member in Filter]
if exclude_markers is None:
updated_exclude_markers = None
else:
updated_exclude_markers = []
for itm in exclude_markers:
if itm.upper() in filter_names:
updated_exclude_markers += Filter[itm.upper()]
else:
updated_exclude_markers.append(itm.lower().replace("\\", ""))
if include_markers is None:
updated_include_markers = None
else:
updated_include_markers = []
for itm in include_markers:
if itm.upper() in filter_names:
updated_include_markers += Filter[itm.upper()]
else:
updated_include_markers.append(itm.lower().replace("\\", ""))
exclude_markers, include_markers = handle_include_exclude_options(arg_parser)

output_format = arg_parser.parse_args().out_format

match output_format:
case Format.JSON:
dict_output = my_parser.to_usj(
exclude_markers=updated_exclude_markers,
include_markers=updated_include_markers,
exclude_markers=exclude_markers,
include_markers=include_markers,
ignore_errors=True)
print(json.dumps(dict_output, indent=4, ensure_ascii=False))
case Format.CSV:
table_output = my_parser.to_list(
exclude_markers=updated_exclude_markers,
include_markers=updated_include_markers,
exclude_markers=exclude_markers,
include_markers=include_markers,
ignore_errors=True)
outfile = sys.stdout
writer = csv.writer(outfile,
Expand All @@ -104,6 +127,8 @@ def main(): #pylint: disable=too-many-locals
print(my_parser.to_markdown())
case Format.ST:
print(my_parser.to_syntax_tree(ignore_errors=True))
case Format.USFM:
print(my_parser.usfm)
case _:
raise Exception(f"Un-recognized output format:{output_format}!")

Expand Down
89 changes: 89 additions & 0 deletions py-usfm-parser/src/usfm_grammar/usfm_generator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
'''Convert other formats back into USFM'''

# USJ types that do not produce a USFM marker of their own
NO_USFM_USJ_TYPES = ['USJ', 'table']
# Parent types after which no line break is added
NO_NEWLINE_USJ_TYPES = ['char', 'note', 'verse', 'table:cell']
# Types that are terminated with an explicit closing marker (\marker*)
CLOSING_USJ_TYPES = ['char', 'note', 'figure']
# USJ keys handled explicitly by the generator; any other key is written
# out as a USFM attribute
NON_ATTRIB_USJ_KEYS = ['type', 'content', 'number', 'sid',
                       'code', 'caller', 'align',
                       'version', 'altnumber', 'pubnumber', 'category']


class USFMGenerator:
    '''Combines the different methods that generate USFM from other formats in one class'''
    def __init__(self):
        # Accumulates the generated USFM text across usj_to_usfm() calls
        self.usfm_string = ''

    def is_valid_usfm(self, usfm_string: str = None) -> bool:
        '''Check the generated or passed USFM's correctness using the grammar.

        NOTE: the grammar check is not implemented in this class yet; the
        method currently always returns False.
        '''
        if usfm_string is None:
            usfm_string = self.usfm_string
        return False

    def usj_to_usfm(self, usj_obj: dict, nested=False) -> None: # pylint: disable=too-many-statements, too-many-branches
        '''Traverses through the dict/json and uses 'type' field to form USFM elements.

        The generated text is appended to ``self.usfm_string``; nothing is returned.

        Args:
            usj_obj: a USJ node (dict) with a colon separated 'type' value,
                whose last part is used as the USFM marker.
            nested: True when this node sits inside a 'char' node, in which
                case 'char' markers are written with the '+' prefix.

        Raises:
            KeyError: if ``usj_obj`` has no 'type' key.
        '''
        marker_types = usj_obj['type'].split(':')
        marker = marker_types[-1]
        is_nested_char = nested and marker_types[0] == 'char'

        # Opening marker
        if usj_obj['type'] not in NO_USFM_USJ_TYPES:
            self.usfm_string += "\\"
            if is_nested_char:
                self.usfm_string += "+"
            self.usfm_string += f"{marker} "
        if 'code' in usj_obj:
            self.usfm_string += f"{usj_obj['code']} "
        if 'number' in usj_obj:
            # str() so that a numeric value does not break the concatenation
            self.usfm_string += str(usj_obj['number'])
            if marker_types[0] == "verse":
                self.usfm_string += " "
        if 'caller' in usj_obj:
            self.usfm_string += f"{usj_obj['caller']} "
        if 'category' in usj_obj:
            self.usfm_string += f"\\cat {usj_obj['category']}\\cat*\n"

        # Contents: plain text is copied, child nodes are converted recursively
        if 'content' in usj_obj:
            for item in usj_obj['content']:
                if isinstance(item, str):
                    self.usfm_string += item
                elif marker_types[0] == 'char':
                    self.usj_to_usfm(item, nested=True)
                else:
                    self.usj_to_usfm(item)

        # Attributes: every key not handled explicitly, after a single "|"
        attributes = False
        for key in usj_obj:
            if key not in NON_ATTRIB_USJ_KEYS:
                if not attributes:
                    self.usfm_string += "|"
                    attributes = True
                if key == "file":
                    # the 'file' key is written as the 'src' attribute
                    self.usfm_string += f"src=\"{usj_obj[key]}\" "
                else:
                    self.usfm_string += f"{key}=\"{usj_obj[key]}\" "

        # Closing marker
        if marker_types[0] in CLOSING_USJ_TYPES:
            self.usfm_string = self.usfm_string.strip() + "\\"
            if is_nested_char:
                self.usfm_string += "+"
            self.usfm_string += f"{marker}* "
        if marker_types[0] == "ms":
            if "sid" in usj_obj:
                if not attributes:
                    self.usfm_string += "|"
                    attributes = True
                self.usfm_string += f"sid=\"{usj_obj['sid']}\" "
            self.usfm_string = self.usfm_string.strip() + "\\*"
        if marker_types[0] == "sidebar":
            self.usfm_string += "\\esbe"

        # Line break after the element, unless one is already there. The
        # non-empty check avoids indexing into an empty string, which used to
        # raise IndexError (e.g. for a USJ root without any content).
        if ":".join(marker_types[:-1]) not in NO_NEWLINE_USJ_TYPES and \
            self.usfm_string and not self.usfm_string.endswith("\n"):
            self.usfm_string += "\n"

        # Alternate and published numbers follow the element itself
        if "altnumber" in usj_obj:
            self.usfm_string += f"\\{marker}a {usj_obj['altnumber']}"
            self.usfm_string += f"\\{marker}a* "
        if "pubnumber" in usj_obj:
            self.usfm_string += f"\\{marker}p {usj_obj['pubnumber']}"
            if marker == "v":
                self.usfm_string += f"\\{marker}p* "
            else:
                self.usfm_string += "\n"

# def usx_to_usfm(self, usx_xml_tree) -> str: # should we call it just from_usx() instead
# '''Traverses xml tree and converts nodes to usfm elements
# based on type and style fields'''
# return self.usfm_string
20 changes: 16 additions & 4 deletions py-usfm-parser/src/usfm_grammar/usfm_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from usfm_grammar.usx_generator import USXGenerator
from usfm_grammar.usj_generator import USJGenerator
from usfm_grammar.list_generator import ListGenerator
from usfm_grammar.usfm_generator import USFMGenerator
from usfm_grammar.filters import exclude_markers_in_usj, include_markers_in_usj

class Filter(list, Enum):
Expand Down Expand Up @@ -41,12 +42,13 @@ class Filter(list, Enum):
# INNER_CONTENT = ['content-in-excluded-parent']

class Format(str, Enum):
    '''Defines the valid values for input and output formats'''
    # The member is still named JSON, but its value is "usj"
    JSON = "usj"
    CSV = "table"
    ST = "syntax-tree"
    USX = "usx"
    MD = "markdown"
    USFM = "usfm"

lang_file = resources.path('usfm_grammar','my-languages.so')
USFM_LANGUAGE = Language(str(lang_file), 'usfm3')
Expand Down Expand Up @@ -75,9 +77,19 @@ class Format(str, Enum):

class USFMParser():
"""Parser class with usfmstring, syntax_tree and methods for JSON convertions"""
def __init__(self, usfm_string):
def __init__(self, usfm_string:str=None, from_usj:dict=None):
# super(USFMParser, self).__init__()
self.usfm = usfm_string
if usfm_string is not None and from_usj is not None:
raise Exception("Found USFM and USJ inputs! Only one supported in one object.")
if usfm_string is not None:
self.usfm = usfm_string
elif from_usj is not None:
usj_converter = USFMGenerator()
usj_converter.usj_to_usfm(from_usj)
self.usfm = usj_converter.usfm_string
else:
raise Exception("Missing input! Either USFM or USJ to be provided.")

self.usfm_bytes = None
self.syntax_tree = None
self.errors = None
Expand Down
10 changes: 10 additions & 0 deletions py-usfm-parser/tests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,16 @@ def initialise_parser(input_usfm_path):
test_parser = USFMParser(usfm_string)
return test_parser

def generate_USFM_from_USJ(input_usj):
    '''Build a USFMParser from the given USJ object and return the USFM text it holds'''
    return USFMParser(from_usj=input_usj).usfm

def parse_USFM_string(usfm_string):
    '''Return a new USFMParser initialised with the given USFM text'''
    return USFMParser(usfm_string)

def is_valid_usfm(input_usfm_path):
'''Checks the metadata.xml to see is the USFM is a valid one'''
if input_usfm_path in pass_fail_override_list:
Expand Down
Loading

0 comments on commit bf8b42d

Please sign in to comment.