Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[MRG] label segmentation on whitespace #213

Merged
merged 3 commits into from
Apr 21, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions persephone/preprocess/labels.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,34 @@
""")


# Whitespace characters, one entry per code point, each annotated with its
# Unicode character name. segment_into_chars() removes every character in
# this list from an utterance before splitting it into characters.
UNICODE_WHITESPACE_CHARACTERS = [
    # Control characters and the ordinary space
    "\u0009",  # CHARACTER TABULATION
    "\u000a",  # LINE FEED
    "\u000b",  # LINE TABULATION
    "\u000c",  # FORM FEED
    "\u000d",  # CARRIAGE RETURN
    "\u0020",  # SPACE
    "\u0085",  # NEXT LINE
    "\u00a0",  # NO-BREAK SPACE
    "\u1680",  # OGHAM SPACE MARK
    # U+2000..U+200A: fixed-width typographic spaces
    "\u2000",  # EN QUAD
    "\u2001",  # EM QUAD
    "\u2002",  # EN SPACE
    "\u2003",  # EM SPACE
    "\u2004",  # THREE-PER-EM SPACE
    "\u2005",  # FOUR-PER-EM SPACE
    "\u2006",  # SIX-PER-EM SPACE
    "\u2007",  # FIGURE SPACE
    "\u2008",  # PUNCTUATION SPACE
    "\u2009",  # THIN SPACE
    "\u200a",  # HAIR SPACE
    # Separators and remaining space characters
    "\u2028",  # LINE SEPARATOR
    "\u2029",  # PARAGRAPH SEPARATOR
    "\u202f",  # NARROW NO-BREAK SPACE
    "\u205f",  # MEDIUM MATHEMATICAL SPACE
    "\u3000",  # IDEOGRAPHIC SPACE
]

def segment_into_chars(utterance: str) -> str:
""" Segments an utterance into space delimited characters. """

Expand All @@ -33,6 +61,8 @@ def segment_into_chars(utterance: str) -> str:

utterance.strip()
utterance = utterance.replace(" ", "")
for char in UNICODE_WHITESPACE_CHARACTERS:
utterance = utterance.replace(char, "")
return " ".join(utterance)

def segment_into_tokens(utterance: str, token_inventory: Iterable[str]):
Expand Down
23 changes: 23 additions & 0 deletions persephone/tests/test_transcription_preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,3 +38,26 @@ def test_segment_into_tokens():
assert segment_into_tokens(input_1, token_inv) == output_1
assert segment_into_tokens(input_2, token_inv) == output_2
assert segment_into_tokens(input_3, token_inv) == output_3

def test_unicode_segmentation():
    """Check that segment_into_chars drops Unicode space characters.

    Each case joins "hello" and "world" with a single non-ASCII space
    character and expects the separator to be absent from the
    space-delimited output.
    """
    from persephone.preprocess.labels import segment_into_chars

    expected = "h e l l o w o r l d"

    # NO-BREAK SPACE is checked on its own, ahead of the U+2000 block.
    nbsp_utterance = "hello\u00A0world"
    assert segment_into_chars(nbsp_utterance) == expected

    # The contiguous U+2000..U+200A run of typographic spaces.
    separators = [
        "\u2000",  # EN QUAD
        "\u2001",  # EM QUAD
        "\u2002",  # EN SPACE
        "\u2003",  # EM SPACE
        "\u2004",  # THREE-PER-EM SPACE
        "\u2005",  # FOUR-PER-EM SPACE
        "\u2006",  # SIX-PER-EM SPACE
        "\u2007",  # FIGURE SPACE
        "\u2008",  # PUNCTUATION SPACE
        "\u2009",  # THIN SPACE
        "\u200A",  # HAIR SPACE
    ]

    for separator in separators:
        utterance = "hello" + separator + "world"
        assert segment_into_chars(utterance) == expected