Skip to content

Commit

Permalink
Add tests to compare generated usj with testsuite samples
Browse files Browse the repository at this point in the history
  • Loading branch information
kavitharaju committed Sep 28, 2023
1 parent 0e0e596 commit 028d6a7
Show file tree
Hide file tree
Showing 2 changed files with 83 additions and 3 deletions.
33 changes: 31 additions & 2 deletions py-usfm-parser/tests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,9 @@ def find_all_markers(usfm_path, keep_id=False, keep_number=True):
f"{TEST_DIR}/biblica/PublishingVersesWithFormatting/origin.usfm": "fail", # \c without number

f"{TEST_DIR}/specExamples/extended/contentCatogories1/origin.usfm": "fail", # cat inside footnote

f'{TEST_DIR}/special-cases/figure_with_quotes_in_desc/origin.usfm': "fail", # quote within quote
f'{TEST_DIR}/specExamples/poetry/origin.usfm': "fail", # \b not followed by a \p or \q
########### Need to be fixed #######################
f"{TEST_DIR}/paratextTests/NoErrorsShort/origin.usfm": "pass", # \c is mandatory!
f"{TEST_DIR}/usfmjsTests/gn_headers/origin.usfm": "fail", # what is the valid position for mte and imt
Expand All @@ -222,10 +225,36 @@ def find_all_markers(usfm_path, keep_id=False, keep_number=True):
negative_tests.append(file_path)

# USX samples from the test suite that are known to disagree with the output
# of usfm-grammar, so tests should not compare against them. The reason for
# each exclusion is noted next to its sample directory.
#
# Entries left commented out (i.e. NOT excluded), kept for reference:
#   specExamples/chapter-verse (origin.usx, origin.xml)
#       ca is added as attribute to cl not chapter node
#   specExamples/milestone (origin.usx, origin.xml)
#       Znamespace not represented properly. Even no docs of it on
#       https://ubsicap.github.io/usx
#   advanced/table (origin.xml)
#       There is no verse end node at end (in last row of the table)
exclude_USX_files = [
    f'{TEST_DIR}/{sample_dir}/origin.xml'
    for sample_dir in (
        # \ef not treated as inline content of paragraph
        'specExamples/extended/contentCatogories2',
        # verse number="+"!!!
        'specExamples/extended/sectionIntroductions',
        # lit element treated as a body paragraph enclosing a verse!
        'specExamples/character',
        # last verse text given outside of paragraph.
        'usfmjsTests/esb',
        # ~ not being replaced by nbsp in usfm-grammar
        'special-cases/nbsp',
        # attributes treated as text content of marker
        'special-cases/empty-attributes',
    )
]

# Collect every USX sample that cannot serve as a reference: either the file
# sitting next to the USFM sample is missing, or its text contains
# status="invalid". These are added to the exclusion list.
invalid_usxs = []
for file_path in all_usfm_files:
    usx_path = file_path.replace("origin.usfm", "origin.xml")
    try:
        with open(usx_path, 'r', encoding='utf-8') as usx_file:
            usx_text = usx_file.read()
    except FileNotFoundError as exe:
        # No USX sample for this USFM file: report it and exclude the path.
        print(exe)
        is_invalid = True
    else:
        is_invalid = 'status="invalid"' in usx_text
    if is_invalid:
        invalid_usxs.append(usx_path)

exclude_USX_files.extend(invalid_usxs)
53 changes: 52 additions & 1 deletion py-usfm-parser/tests/test_json_conversion.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from jsonschema import validate

from tests import all_usfm_files, initialise_parser, doubtful_usfms, negative_tests,\
find_all_markers, Filter
find_all_markers, Filter, exclude_USX_files

all_valid_markers = []
for member in Filter:
Expand Down Expand Up @@ -116,3 +116,54 @@ def test_output_is_valid_usj(file_path):
assert not test_parser.errors, test_parser.errors
usj_dict = test_parser.to_usj()
validate(instance=usj_dict, schema=USJ_SCHEMA)

def remove_newlines_in_text(usj_dict):
    '''Normalise whitespace in the text of a USJ tree, in place, for comparison.

    The samples in the test suite do not preserve new lines in text, but
    usfm-grammar does. Every string found in a "content" list therefore has
    its new lines replaced by a space, and the resulting runs of spaces are
    collapsed to a single space. Nested objects are processed recursively.

    Objects whose "sid" contains "PSA 09" also get the zero padding removed
    from that reference (needed for /usfmjsTests/tstudio/origin.usfm).

    usj_dict: a USJ object (dict); it is modified in place.
    Returns None.
    '''
    if "content" not in usj_dict:
        return
    content = usj_dict["content"]
    for i, item in enumerate(content):
        if isinstance(item, str):
            text = item.replace("\n", " ")
            # Replacing a new line can leave several spaces in a row (for
            # example when the next line was indented); repeat until none
            # remain so that runs of any length become a single space.
            while "  " in text:
                text = text.replace("  ", " ")
            content[i] = text
            continue
        if not isinstance(item, dict):
            # Only objects can carry a "sid" or nested "content".
            continue
        if "sid" in item and "PSA 09" in item['sid']:  # for /usfmjsTests/tstudio/origin.usfm
            item['sid'] = item['sid'].replace("PSA 091:01", "PSA 91:1")
            item['sid'] = item['sid'].replace("PSA 091:02", "PSA 91:2")
            item['sid'] = item['sid'].replace("PSA 09", "PSA 9")
        remove_newlines_in_text(item)

def strip_default_attrib_value(usj_dict):
    '''Trim surrounding whitespace from the lemma of every "char:w" object.

    The USX samples in the test suite have spaces in lemma values when the
    lemma is given as the default attribute. The tree is walked recursively
    through its "content" lists and modified in place; string items are
    left untouched.
    '''
    if "content" not in usj_dict:
        return
    for child in usj_dict["content"]:
        if not isinstance(child, dict):
            continue
        if child['type'] == "char:w" and "lemma" in child:
            child['lemma'] = child['lemma'].strip()
        strip_default_attrib_value(child)


@pytest.mark.parametrize('file_path', test_files)
@pytest.mark.timeout(30)
def test_compare_usj_with_testsuite_samples(file_path):
    '''Compare the generated USJ with the origin-usj.json sample in the test suite.

    The USFM file must parse without errors. The comparison is skipped when
    the matching origin.xml is listed in exclude_USX_files, or when the test
    suite has no origin-usj.json next to the USFM file.

    New lines in the generated USJ are normalised before comparing. If the
    two still differ, the lemma values of the sample are stripped and the
    comparison is made once more before the test fails.
    '''
    test_parser = initialise_parser(file_path)
    assert not test_parser.errors, test_parser.errors

    usx_file_path = file_path.replace("origin.usfm", "origin.xml")
    if usx_file_path in exclude_USX_files:
        pytest.skip(f"{usx_file_path} is in exclude_USX_files")

    usj_dict = test_parser.to_usj()
    remove_newlines_in_text(usj_dict)

    usj_file_path = file_path.replace("origin.usfm", "origin-usj.json")
    try:
        with open(usj_file_path, 'r', encoding='utf-8') as usj_file:
            origin_usj = json.load(usj_file)
    except FileNotFoundError:
        # Not every sample has a USJ counterpart; report that instead of
        # silently counting the case as passed.
        pytest.skip(f"{usj_file_path} not found in test suite")

    if usj_dict != origin_usj:
        # Samples may carry spaces around default-attribute lemma values;
        # strip them and give the comparison a second chance.
        strip_default_attrib_value(origin_usj)
    assert usj_dict == origin_usj, f"generated USJ:\n{usj_dict}\n"+\
        f"USJ in testsuite:\n{origin_usj}\n syntax tree: {test_parser.to_syntax_tree()}"

0 comments on commit 028d6a7

Please sign in to comment.