import regex as re
from pie_extended.pipeline.tokenizers.memorizing import MemorizingTokenizer
from typing import List, Generator, Tuple
import unicodedata
from pie_extended.pipeline.tokenizers.utils.excluder import (
    ReferenceExcluder,
    DEFAULT_CHAR_REGISTRY
)

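# Characters treated as sentence-ending punctuation (used to build the sentence-boundary regex below)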
_Dots_except_apostrophe = r".?!\"“”\"«»…\[\]\(\)„“"
_SpaceNormalizer = re.compile(r"(\s+)")


class OccMemorizingTokenizer(MemorizingTokenizer):
    """ Occitan Tokenizer with memorizing capacities (for normalization steps)

    This tokenizer is based on a Perl script published by Marianne Vergez-Couret in 2019
    (https://zenodo.org/records/2533873), as well as on the description of the Python tokenizer
    in (Miletić, 2023), which was also derived from the work done in the RESTAURE project.
    It was adapted to Python by Oriane Nedey and then ported to Pie-Extended.
    """
    _sentence_boundaries = re.compile(
        r"([" + _Dots_except_apostrophe + r"]+\s*)+"
    )
    re_add_space_around_punct = re.compile(r"(\s*)([^\w\s])(\s*)")

    # Define a pattern that matches any punctuation or symbol, with exceptions
    re_in_non_amb = re.compile(r"(?!['\-,.<>])[\p{P}\p{S}]")

    # Define a pattern that matches (XML/HTML...) tags  # TODO: check that this change is ok
    re_tags = re.compile(r'(</?[^\d\s].*>)')

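    # Matches material kept as a single token: ellipses (two or more dots), unambiguous punctuation/symbols, or tags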
    re_split_match = re.compile(rf"(\.{{2,}})|({re_in_non_amb.pattern})|{re_tags.pattern}")

    def __init__(self):
        super(OccMemorizingTokenizer, self).__init__()
        self.tokens = []
        self.char_registry = DEFAULT_CHAR_REGISTRY
        self.normalizers: Tuple[ReferenceExcluder] = (
            ReferenceExcluder(char_registry=self.char_registry),
        )
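        # Matches masked reference markers of the form [REF ...] whose brackets were substituted via the char registry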
        self.re_ref = re.compile(
            rf"{self.char_registry['[']}REF[^{self.char_registry[']']}]+{self.char_registry[']']}"
        )
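        # First-pass splitter: isolates excluder matches, whitespace, unambiguous punctuation, ellipses and tags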
        self.re_split_step_one = re.compile(
            rf"(?:{self.normalizers[0].re.pattern})|({self.re_in_non_amb.pattern}|\s|\.{{2,}}|{self.re_tags.pattern})"
        )

    @staticmethod
    def _sentence_tokenizer_merge_matches(match):
        """ Best way we found to deal with repeating groups"""
        start, end = match.span()
        return match.string[start:end] + "<SPLIT>"

    def _real_sentence_tokenizer(self, string: str) -> List[str]:
        string = _SpaceNormalizer.sub(" ", string.strip())
        string = self._sentence_boundaries.sub(self._sentence_tokenizer_merge_matches, string)

        for normalizer in self.normalizers:
            string = normalizer.after_sentence_tokenizer(string)

        return string.split("<SPLIT>")

    def _real_word_tokenizer(self, text: str, lower: bool = False) -> List[str]:
        """
        Segments a string into a list of tokens by applying Occitan-specific regular expressions.

        :param text: string, ideally one single segment.
        :returns: list of segmented tokens
        """
        text = text.replace("qu'", "qu' ")  # TODO Is this not already done by the regexes afterwards?
        text = text.replace("d'", "d' ")  # TODO Is this not already done by the regexes afterwards?
        res = []
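        # Protect spaces inside numbers (e.g. "10 000") with a placeholder so they are not split; restored further down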
        text = re.sub(r'(\d)\s(\d)', r'\1<PPLesp>\2', text)
        for m in self.re_split_step_one.split(text):
            if not m:
                continue
            elif self.normalizers[0].re.match(m):
                res.append(m)
            elif self.re_split_match.match(m):
                res.append(m)
            elif not re.match(r'^\s*$', m):
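                # Occitan-specific rules: surround clitics, elisions and punctuation with tabs, then split on tabs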
                m = re.sub(r"(-[nz]-)(\P{L}*)", r"\t\1\t\2", m, flags=re.IGNORECASE)  # no space
                m = re.sub(r"(\P{L}|^)([dlmnst]\')", r"\1\t\2\t", m, flags=re.IGNORECASE)  # space before
                m = re.sub(r"(\P{L}|^)(\p{L}*[qnv][us]\')", r"\1\t\2\t", m, flags=re.IGNORECASE)  # space before
                m = re.sub(r"(\P{L}|^)(\p{L}*qu\')", r"\1\t\2\t", m, flags=re.IGNORECASE)  # space before  # TODO Duplicate of [qnv][us]'?
                m = re.sub(r"(\P{L}|^)(\p{L}*ent\')", r"\1\t\2\t", m, flags=re.IGNORECASE)  # space before
                m = re.sub(r"(\P{L}|^)(\p{L}*[çcbzu]\')", r"\1\t\2\t", m, flags=re.IGNORECASE)  # space before  # TODO Merge with [dlmnst]?
                m = re.sub(r"([\p{L}\d]+(\.[\p{L}\d]+)+)", r"\t\1\t", m)  # space before and after
                m = re.sub(r"\.($|\P{L})", r"\t.\1", m)
                m = re.sub(r"(\D|^),", r"\1\t,\t", m)
                m = re.sub(r",($|\D)", r"\t,\t\1", m)
                m = re.sub(r"-(vos|ne|[st][eu]?'?|l[aoi']s?|me|d'|en|[nv]os|u)($|\P{L})", r"\t-\1\t\2", m, flags=re.IGNORECASE)  # space after  # TODO Try to simplify?
                m = re.sub(r"\'([unv]\p{L}*)($|\P{L})", r"\t'\1\t\2", m, flags=re.IGNORECASE)  # rule for 'u 'us 'n 'v 'ns 'vs...  # space after
                m = re.sub(r"\'([dlmnsti])($|\P{L})", r"\t'\1\t\2", m, flags=re.IGNORECASE)  # rule for 'm 't 'i 's 'ac ...  # space after
                m = re.sub(r"(\p{P})(\p{P})", r"\t\1\t\2\t", m)
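                # Restore the protected spaces inside numbers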
                m = re.sub(r"<PPLesp>", ' ', m)
                m = re.sub(r"([<>])", r"\t\1\t", m)
                res.extend(m.split('\t'))

        # Remove empty tokens
        res = [item for item in res if item.strip()]
        return res

    def normalizer(self, data: str) -> str:
        for excluder in self.normalizers:
            data = excluder.before_sentence_tokenizer(data)
        return data

    def sentence_tokenizer(self, text: str, lower: bool = False) -> Generator[List[str], None, None]:
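        # Normalize the input, split it into sentences, then word-tokenize each non-empty sentence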
        sentences = list()
        data = self.normalizer(text)
        for sent in self._real_sentence_tokenizer(data):
            sent = sent.strip()
            if sent:
                sentences.append(self.word_tokenizer(sent))
        yield from sentences

    def replacer(self, inp: str):
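        # Tokens matched by an excluder are returned as-is (when replaceable); everything else is NFKC-normalized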
        for excluder in self.normalizers:
            if excluder.exclude_regexp.match(inp):
                if excluder.can_be_replaced:
                    return inp

        return unicodedata.normalize("NFKC", inp)
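

# Minimal usage sketch, assuming the parent MemorizingTokenizer exposes the word_tokenizer()
# method used above; the sample sentence is illustrative only.
if __name__ == "__main__":
    tokenizer = OccMemorizingTokenizer()
    sample = "Qu'ei un exemple. L'òme que parla dins l'ostal."
    for tokens in tokenizer.sentence_tokenizer(sample):
        print(tokens)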