Usj to usfm (#224)

* Implement USJ to USFM conversion feature * Add tests for USJ to USFM conversion * Fix a bug noticed in USJ generation * Add usage examples in README and python-notebook * resolve conflicts * Move usj-to-usfm conversion API into the main USFMParser class itself(in constructor) * Update documentation as per the API change * Update tests as per the change in usj-to-usfm API
Bridgeconn · Oct 18, 2023 · bf8b42d · bf8b42d
1 parent e4347d7
commit bf8b42d
Show file tree

Hide file tree

Showing 8 changed files with 363 additions and 197 deletions.
diff --git a/docs/API guide for python usfm_grammar.ipynb b/docs/API guide for python usfm_grammar.ipynb
diff --git a/py-usfm-parser/README.md b/py-usfm-parser/README.md
@@ -86,10 +86,44 @@ print(table_output)
 
 ```
 
+To round trip with USJ
+```
+from usfm_grammar import USFMParser, Filter
+
+my_parser = USFMParser(input_usfm_str)
+usj_obj = my_parser.to_usj()
+
+my_parser2 = USFMParser(from_usj=usj_obj)
+print(my_parser2.usfm)
+```
+:warning: The regenerated USFM may differ from the original USFM in three ways: 1. spaces and line breaks, 2. default attributes are written out with their names, 3. closing markers may be newly added.
+
+To remove unwanted markers from USFM
+```
+from usfm_grammar import USFMParser, Filter
+
+my_parser = USFMParser(input_usfm_str)
+usj_obj = my_parser.to_usj(include_markers=Filter.BCV+Filter.TEXT)
+
+my_parser2 = USFMParser(from_usj=usj_obj)
+print(my_parser2.usfm)
+```
+USJ to USX or Table
+```
+from usfm_grammar import USFMParser, Filter
+
+my_parser = USFMParser(input_usfm_str)
+usj_obj = my_parser.to_usj()
+
+my_parser2 = USFMParser(from_usj=usj_obj)
+print(my_parser2.to_usx())
+# print(my_parser2.to_list())
+```
 ### From CLI
 
 ```
-usage: usfm-grammar [-h] [--format {json,table,syntax-tree,usx,markdown}]
+usage: usfm-grammar [-h] [--in_format {usfm,usj}]
+                    [--out_format {usj,table,syntax-tree,usx,markdown,usfm}]
                     [--include_markers {book_headers,titles,...}]
                     [--exclude_markers {book_headers,titles,...}]
                     [--csv_col_sep CSV_COL_SEP] [--csv_row_sep CSV_ROW_SEP]
@@ -100,11 +134,13 @@ Uses the tree-sitter-usfm grammar to parse and convert USFM to Syntax-tree,
 JSON, CSV, USX etc.
 
 positional arguments:
-  infile                input usfm file
+  infile                input usfm or usj file
 
 options:
   -h, --help            show this help message and exit
-  --format {json,table,syntax-tree,usx,markdown}
+  --in_format {usfm,usj}
+                        input file format
+  --out_format {usj,table,syntax-tree,usx,markdown,usfm}
                         output format
   --include_markers {book_headers,titles,comments,paragraphs,characters,notes,study_bible,bcv,text,ide,usfm,h,toc,toca,imt,is,ip,ipi,im,imi,ipq,imq,ipr,iq,ib,ili,iot,io,iex,imte,ie,mt,mte,cl,cd,ms,mr,s,sr,r,d,sp,sd,sts,rem,lit,restore,p,m,po,pr,cls,pmo,pm,pmc,pmr,pi,mi,nb,pc,ph,q,qr,qc,qa,qm,qd,lh,li,lf,lim,litl,tr,tc,th,tcr,thr,table,b,add,bk,dc,ior,iqt,k,litl,nd,ord,pn,png,qac,qs,qt,rq,sig,sls,tl,wj,em,bd,bdit,it,no,sc,sup,rb,pro,w,wh,wa,wg,lik,liv,jmp,f,fe,ef,efe,x,ex,fr,ft,fk,fq,fqa,fl,fw,fp,fv,fdc,xo,xop,xt,xta,xk,xq,xot,xnt,xdc,esb,cat,id,c,v,text-in-excluded-parent}
                         the list of of contents to be included
@@ -123,11 +159,15 @@ options:
 ```
 Example
 ```
->>> python3 -m usfm_grammar sample.usfm --format usx
+>>> python3 -m usfm_grammar sample.usfm --out_format usx
 
->>> usfm-grammar sample.usfm --format usx
+>>> usfm-grammar sample.usfm
+
+>>> usfm-grammar sample.usfm --out_format usx
 
 >>> usfm-grammar sample.usfm --include_markers bcv --include_markers text --include_markers s
+
+>>> usfm-grammar sample-usj.json --out_format usfm
 ```
 
 ### Filtering on USJ

diff --git a/py-usfm-parser/src/usfm_grammar/__init__.py b/py-usfm-parser/src/usfm_grammar/__init__.py
@@ -1,6 +1,7 @@
 '''Entry point of the package with its public values'''
 
 from usfm_grammar import usfm_parser
+from usfm_grammar import usfm_generator
 
 Filter = usfm_parser.Filter
 Format = usfm_parser.Format

diff --git a/py-usfm-parser/src/usfm_grammar/__main__.py b/py-usfm-parser/src/usfm_grammar/__main__.py
@@ -11,14 +11,60 @@
 for member in Filter:
     all_markers += member.value
 
+def handle_input_file(arg_parser):
+    '''Read the input file and build a USFMParser from its USFM or USJ content.
+
+    The content is loaded as USJ (JSON) when --in_format is "usj" or when the
+    file extension is "json" or "usj"; otherwise, when --in_format is "usfm",
+    it is passed to the parser as USFM text.
+
+    Returns the initialised USFMParser.
+    Raises Exception when the input format is not recognised.
+    '''
+    # Parse the command line once instead of once per option.
+    args = arg_parser.parse_args()
+    infile = args.infile
+    input_format = args.in_format
+    with open(infile, 'r', encoding='utf-8') as usfm_file:
+        file_content = usfm_file.read()
+
+    # The file extension wins over the (defaulted) --in_format value.
+    if input_format == Format.JSON or infile.split(".")[-1] in ['json', 'usj']:
+        return USFMParser(from_usj=json.loads(file_content))
+    if input_format == Format.USFM:
+        return USFMParser(file_content)
+    raise Exception("Un-recognized input_format!")
+
+def _expand_marker_options(markers):
+    '''Expand Filter group names into their markers and normalise plain markers.
+
+    Returns None when the option was not given, so that "absent" stays
+    distinguishable from an empty list.
+    '''
+    if markers is None:
+        return None
+    filter_names = {member.name for member in Filter}
+    expanded = []
+    for itm in markers:
+        if itm.upper() in filter_names:
+            # Each Filter member is itself a list of markers.
+            expanded += Filter[itm.upper()]
+        else:
+            # Accept markers written with or without the leading backslash.
+            expanded.append(itm.lower().replace("\\", ""))
+    return expanded
+
+def handle_include_exclude_options(arg_parser):
+    '''Process the --exclude_markers and --include_markers lists and ENUMs.
+
+    Returns a tuple (exclude_markers, include_markers); each element is None
+    when the corresponding option was not supplied.
+    '''
+    # Parse the command line once instead of once per option.
+    args = arg_parser.parse_args()
+    return (_expand_marker_options(args.exclude_markers),
+            _expand_marker_options(args.include_markers))
+
 
 def main(): #pylint: disable=too-many-locals
     '''handles the command line requests'''
     arg_parser = argparse.ArgumentParser(
         description='Uses the tree-sitter-usfm grammar to parse and convert USFM to '+\
         'Syntax-tree, JSON, CSV, USX etc.')
-    arg_parser.add_argument('infile', type=str, help='input usfm file')
-    arg_parser.add_argument('--format', type=str, help='output format',
+    arg_parser.add_argument('infile', type=str, help='input usfm or usj file')
+
+    arg_parser.add_argument('--in_format', type=str, help='input file format',
+                            choices=[Format.USFM.value, Format.JSON.value],
+                            default=Format.USFM.value)
+    arg_parser.add_argument('--out_format', type=str, help='output format',
                             choices=[itm.value for itm in Format],
                             default=Format.JSON.value)
     arg_parser.add_argument('--include_markers', type=str,
@@ -44,52 +90,29 @@ def main(): #pylint: disable=too-many-locals
                             'from different components, or not',
                             action='store_true')
 
-    infile = arg_parser.parse_args().infile
-    output_format = arg_parser.parse_args().format
-    exclude_markers = arg_parser.parse_args().exclude_markers
-    include_markers = arg_parser.parse_args().include_markers
-
-    with open(infile, 'r', encoding='utf-8') as usfm_file:
-        file_content = usfm_file.read()
 
-    my_parser = USFMParser(file_content)
+    my_parser = handle_input_file(arg_parser)
 
     if my_parser.errors and not arg_parser.parse_args().ignore_errors:
         err_str = "\n\t".join([":".join(err) for err in my_parser.errors])
         print(f"Errors present:\n\t{err_str}")
         sys.exit(1)
 
-    filter_names =  [member.name for member in Filter]
-    if exclude_markers is None:
-        updated_exclude_markers = None
-    else:
-        updated_exclude_markers = []
-        for itm in exclude_markers:
-            if itm.upper() in filter_names:
-                updated_exclude_markers += Filter[itm.upper()]
-            else:
-                updated_exclude_markers.append(itm.lower().replace("\\", ""))
-    if include_markers is None:
-        updated_include_markers = None
-    else:
-        updated_include_markers = []
-        for itm in include_markers:
-            if itm.upper() in filter_names:
-                updated_include_markers += Filter[itm.upper()]
-            else:
-                updated_include_markers.append(itm.lower().replace("\\", ""))
+    exclude_markers, include_markers = handle_include_exclude_options(arg_parser)
+
+    output_format = arg_parser.parse_args().out_format
 
     match output_format:
         case Format.JSON:
             dict_output = my_parser.to_usj(
-                exclude_markers=updated_exclude_markers,
-                include_markers=updated_include_markers,
+                exclude_markers=exclude_markers,
+                include_markers=include_markers,
                 ignore_errors=True)
             print(json.dumps(dict_output, indent=4, ensure_ascii=False))
         case Format.CSV:
             table_output = my_parser.to_list(
-                exclude_markers=updated_exclude_markers,
-                include_markers=updated_include_markers,
+                exclude_markers=exclude_markers,
+                include_markers=include_markers,
                 ignore_errors=True)
             outfile = sys.stdout
             writer = csv.writer(outfile,
@@ -104,6 +127,8 @@ def main(): #pylint: disable=too-many-locals
             print(my_parser.to_markdown())
         case Format.ST:
             print(my_parser.to_syntax_tree(ignore_errors=True))
+        case Format.USFM:
+            print(my_parser.usfm)
         case _:
             raise Exception(f"Un-recognized output format:{output_format}!")
 

diff --git a/py-usfm-parser/src/usfm_grammar/usfm_generator.py b/py-usfm-parser/src/usfm_grammar/usfm_generator.py
@@ -0,0 +1,89 @@
+'''Convert other formats back into USFM'''
+
+# USJ 'type' values for which usj_to_usfm writes no marker of their own.
+NO_USFM_USJ_TYPES = ['USJ', 'table']
+# Type prefixes (the 'type' value minus its last ':'-separated part) after
+# which usj_to_usfm does not add a line break.
+NO_NEWLINE_USJ_TYPES = ['char', 'note', 'verse', 'table:cell']
+# Leading type parts for which usj_to_usfm writes an explicit closing marker.
+CLOSING_USJ_TYPES = ['char', 'note', 'figure']
+# USJ keys that usj_to_usfm does not write out as |key="value" attributes.
+NON_ATTRIB_USJ_KEYS = ['type', 'content', 'number', 'sid',
+                        'code', 'caller', 'align',
+                        'version', 'altnumber', 'pubnumber', 'category']
+
+class USFMGenerator:
+    '''Combines the different methods that generate USFM from other formats in one class'''
+    def __init__(self):
+        # Accumulates the generated USFM; usj_to_usfm() appends to it.
+        self.usfm_string = ''
+
+    def is_valid_usfm(self, usfm_string: str = None) -> bool:
+        '''Check the generated or passed USFM's correctness using the grammar.
+
+        NOTE: validation is not implemented yet. This placeholder always
+        returns False, whatever the input.
+        '''
+        if usfm_string is None:
+            usfm_string = self.usfm_string
+        return False
+
+    def usj_to_usfm(self, usj_obj: dict, nested=False) -> None: # pylint: disable=too-many-statements, too-many-branches
+        '''Traverses through the dict/json and uses 'type' field to form USFM elements.
+
+        usj_obj is one USJ node: a dict with a 'type' key and, optionally,
+        'content' holding strings and child nodes. nested is True when the
+        node is a child of a 'char' node, in which case char markers are
+        written with the "+" prefix. The generated text is appended to
+        self.usfm_string; nothing is returned.
+        '''
+        marker_types = usj_obj['type'].split(':')
+        if usj_obj['type'] not in NO_USFM_USJ_TYPES:
+            self.usfm_string += "\\"
+            if nested and marker_types[0] == 'char':
+                self.usfm_string+="+"
+            self.usfm_string += f"{marker_types[-1]} "
+        if 'code' in usj_obj:
+            self.usfm_string += f"{usj_obj['code']} "
+        if 'number' in usj_obj:
+            # Format instead of concatenating, so a numeric value cannot raise TypeError.
+            self.usfm_string += f"{usj_obj['number']}"
+            if marker_types[0] == "verse":
+                self.usfm_string += " "
+        if 'caller' in usj_obj:
+            self.usfm_string += f"{usj_obj['caller']} "
+        if 'category' in usj_obj:
+            self.usfm_string += f"\\cat {usj_obj['category']}\\cat*\n"
+        if 'content' in usj_obj:
+            for item in usj_obj['content']:
+                if isinstance(item, str):
+                    self.usfm_string += item
+                else:
+                    # Children of a char node are nested character markers.
+                    self.usj_to_usfm(item, nested=marker_types[0] == 'char')
+        attributes = False
+        for key in usj_obj:
+            if key not in NON_ATTRIB_USJ_KEYS:
+                if not attributes:
+                    # The first attribute is introduced by the "|" separator.
+                    self.usfm_string += "|"
+                    attributes = True
+                if key == "file":
+                    self.usfm_string += f"src=\"{usj_obj[key]}\" "
+                else:
+                    self.usfm_string += f"{key}=\"{usj_obj[key]}\" "
+
+        if marker_types[0] in CLOSING_USJ_TYPES:
+            self.usfm_string = self.usfm_string.strip() + "\\"
+            if nested and marker_types[0] == 'char':
+                self.usfm_string+="+"
+            self.usfm_string += f"{marker_types[-1]}* "
+        if marker_types[0] == "ms":
+            if "sid" in usj_obj:
+                if not attributes:
+                    self.usfm_string += "|"
+                    attributes = True
+                self.usfm_string += f"sid=\"{usj_obj['sid']}\" "
+            self.usfm_string = self.usfm_string.strip() + "\\*"
+        if marker_types[0] == "sidebar":
+            self.usfm_string += "\\esbe"
+        # Guard against an empty buffer (e.g. a root node without content):
+        # indexing [-1] there would raise IndexError.
+        if ":".join(marker_types[:-1]) not in NO_NEWLINE_USJ_TYPES and \
+            self.usfm_string and not self.usfm_string.endswith("\n"):
+            self.usfm_string += "\n"
+        if "altnumber" in usj_obj:
+            self.usfm_string += f"\\{marker_types[-1]}a {usj_obj['altnumber']}"
+            self.usfm_string += f"\\{marker_types[-1]}a* "
+        if "pubnumber" in usj_obj:
+            self.usfm_string += f"\\{marker_types[-1]}p {usj_obj['pubnumber']}"
+            if marker_types[-1] == "v":
+                self.usfm_string += f"\\{marker_types[-1]}p* "
+            else:
+                self.usfm_string += "\n"
+
+    # def usx_to_usfm(self, usx_xml_tree) -> str: # should we call it just from_usx() instead
+    #     '''Traverses xml tree and converts nodes to usfm elements
+    #     based on type and style fields'''
+    #     return self.usfm_string
diff --git a/py-usfm-parser/src/usfm_grammar/usfm_parser.py b/py-usfm-parser/src/usfm_grammar/usfm_parser.py
@@ -10,6 +10,7 @@
 from usfm_grammar.usx_generator import USXGenerator
 from usfm_grammar.usj_generator import USJGenerator
 from usfm_grammar.list_generator import ListGenerator
+from usfm_grammar.usfm_generator import USFMGenerator
 from usfm_grammar.filters import exclude_markers_in_usj, include_markers_in_usj
 
 class Filter(list, Enum):
@@ -41,12 +42,13 @@ class Filter(list, Enum):
     # INNER_CONTENT = ['content-in-excluded-parent']
 
 class Format(str, Enum):
-    '''Defines the valid values for output formats'''
-    JSON = "json"
+    '''Defines the valid values for input and output formats'''
+    JSON = "usj"
     CSV = "table"
     ST = "syntax-tree"
     USX = "usx"
     MD = "markdown"
+    USFM = "usfm"
 
 lang_file = resources.path('usfm_grammar','my-languages.so')
 USFM_LANGUAGE = Language(str(lang_file), 'usfm3')
@@ -75,9 +77,19 @@ class Format(str, Enum):
 
 class USFMParser():
     """Parser class with usfmstring, syntax_tree and methods for JSON convertions"""
-    def __init__(self, usfm_string):
+    def __init__(self, usfm_string:str=None, from_usj:dict=None):
         # super(USFMParser, self).__init__()
-        self.usfm = usfm_string
+        if usfm_string is not None and from_usj is not None:
+            raise Exception("Found USFM and USJ inputs! Only one supported in one object.")
+        if usfm_string is not None:
+            self.usfm = usfm_string
+        elif from_usj is not None:
+            usj_converter = USFMGenerator()
+            usj_converter.usj_to_usfm(from_usj)
+            self.usfm = usj_converter.usfm_string
+        else:
+            raise Exception("Missing input! Either USFM or USJ to be provided.")
+
         self.usfm_bytes = None
         self.syntax_tree = None
         self.errors = None

diff --git a/py-usfm-parser/tests/__init__.py b/py-usfm-parser/tests/__init__.py
@@ -13,6 +13,16 @@ def initialise_parser(input_usfm_path):
     test_parser = USFMParser(usfm_string)
     return test_parser
 
+def generate_USFM_from_USJ(input_usj):
+    '''Initialise a parser from the given USJ and return the USFM it holds'''
+    return USFMParser(from_usj=input_usj).usfm
+
+def parse_USFM_string(usfm_string):
+    '''Return a parser object initialised with the given USFM string'''
+    return USFMParser(usfm_string)
+
 def is_valid_usfm(input_usfm_path):
     '''Checks the metadata.xml to see is the USFM is a valid one'''
     if input_usfm_path in pass_fail_override_list: