Skip to content

Commit

Permalink
Mandarin parser: handle line breaks.
Browse files Browse the repository at this point in the history
  • Loading branch information
jzohrab committed May 18, 2024
1 parent 58175df commit 5fe4206
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 1 deletion.
2 changes: 1 addition & 1 deletion plugins/lute-mandarin/lute_mandarin_parser/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@
Lute Mandarin Parser
"""

__version__ = "0.0.1"
__version__ = "0.0.2"
9 changes: 9 additions & 0 deletions plugins/lute-mandarin/lute_mandarin_parser/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,13 +30,22 @@ def get_parsed_tokens(self, text: str, language) -> List[ParsedToken]:
"""
Returns ParsedToken array for given language.
"""

# Normalize Windows line endings ("\r\n") to "\n" so that paragraph
# markers are used correctly. Lute uses paragraph markers
# for rendering.
text = text.replace("\r\n", "\n")

words = list(jieba.cut(text))
tokens = []
pattern = f"[{language.word_characters}]"
for word in words:
is_word_char = re.match(pattern, word) is not None
is_end_of_sentence = word in language.regexp_split_sentences
if word == "\n":
word = "¶"
if word == "¶":
is_word_char = False
is_end_of_sentence = True
p = ParsedToken(word, is_word_char, is_end_of_sentence)
tokens.append(p)
Expand Down
16 changes: 16 additions & 0 deletions plugins/lute-mandarin/tests/test_MandarinParser.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,22 @@ def test_end_of_sentence_stored_in_parsed_tokens(mandarin_chinese):
assert_tokens_equals(s, mandarin_chinese, expected)


def test_carriage_returns_treated_as_reverse_p_character(mandarin_chinese):
    """
    Line breaks must be emitted as the pilcrow token, flagged as a
    non-word, end-of-sentence token.

    Both LF and CRLF input are checked: the parser rewrites CRLF to LF
    before tokenizing, so the two inputs must yield identical tokens.
    """
    expected = [
        ("你好", True),
        ("。", False, True),
        ("¶", False, True),
        ("现在", True),
        ("。", False, True),
    ]

    # Same sentence pair with Unix (LF) and Windows (CRLF) line endings.
    for s in ("你好。\n现在。", "你好。\r\n现在。"):
        assert_tokens_equals(s, mandarin_chinese, expected)


def test_readings():
"""
Parser returns readings if they add value.
Expand Down

0 comments on commit 5fe4206

Please sign in to comment.