persephone-tools · oadams · Apr 21, 2020 · Dec 27, 2018 · Dec 27, 2018 · Apr 19, 2020
diff --git a/persephone/preprocess/labels.py b/persephone/preprocess/labels.py
@@ -25,6 +25,34 @@
     """)
 
 
+UNICODE_WHITESPACE_CHARACTERS = [  # each entry is removed by segment_into_chars()
+    "\u0009",  # character tabulation
+    "\u000a",  # line feed
+    "\u000b",  # line tabulation
+    "\u000c",  # form feed
+    "\u000d",  # carriage return
+    "\u0020",  # space
+    "\u0085",  # next line
+    "\u00a0",  # no-break space
+    "\u1680",  # ogham space mark
+    "\u2000",  # en quad
+    "\u2001",  # em quad
+    "\u2002",  # en space
+    "\u2003",  # em space
+    "\u2004",  # three-per-em space
+    "\u2005",  # four-per-em space
+    "\u2006",  # six-per-em space
+    "\u2007",  # figure space
+    "\u2008",  # punctuation space
+    "\u2009",  # thin space
+    "\u200A",  # hair space
+    "\u2028",  # line separator
+    "\u2029",  # paragraph separator
+    "\u202f",  # narrow no-break space
+    "\u205f",  # medium mathematical space
+    "\u3000",  # ideographic space
+]
+
 def segment_into_chars(utterance: str) -> str:
     """ Segments an utterance into space delimited characters. """
 
@@ -33,6 +61,8 @@ def segment_into_chars(utterance: str) -> str:
 
     utterance.strip()
     utterance = utterance.replace(" ", "")
+    for char in UNICODE_WHITESPACE_CHARACTERS:
+        utterance = utterance.replace(char, "")
     return " ".join(utterance)
 
 def segment_into_tokens(utterance: str, token_inventory: Iterable[str]):

diff --git a/persephone/tests/test_transcription_preprocessing.py b/persephone/tests/test_transcription_preprocessing.py
@@ -38,3 +38,26 @@ def test_segment_into_tokens():
     assert segment_into_tokens(input_1, token_inv) == output_1
     assert segment_into_tokens(input_2, token_inv) == output_2
     assert segment_into_tokens(input_3, token_inv) == output_3
+
+def test_unicode_segmentation():
+    """Check that segment_into_chars() drops every Unicode whitespace character.
+
+    Each character in UNICODE_WHITESPACE_CHARACTERS is exercised, so the test
+    stays in sync with the implementation instead of hard-coding a partial
+    copy of the list.
+    """
+    from persephone.preprocess.labels import UNICODE_WHITESPACE_CHARACTERS
+    from persephone.preprocess.labels import segment_into_chars
+
+    expected = "h e l l o w o r l d"
+    # Original regression case: a no-break space between two words.
+    assert segment_into_chars("hello\u00A0world") == expected
+
+    for space_character in UNICODE_WHITESPACE_CHARACTERS:
+        # Name the offending code point in the failure message.
+        codepoint = "U+{:04X}".format(ord(space_character))
+        # Whitespace between the words must vanish...
+        assert segment_into_chars("hello" + space_character + "world") == expected, codepoint
+        # ...and so must leading, trailing and repeated whitespace.
+        padded = space_character + "hello" + space_character * 2 + "world" + space_character
+        assert segment_into_chars(padded) == expected, codepoint