
Commit fbaa970

Committed May 30, 2024
[WIP] OCC-Cont [Working Tokenizer and Ref Excluder]
1 parent: 7d37a36 · commit: fbaa970

File tree

5 files changed: +173 -1 lines changed

pie_extended/models/__init__.py (+2 -1)

@@ -4,5 +4,6 @@
     "fr",
     "freem",
     "grc",
-    "dum"
+    "dum",
+    "occ_cont"
 ]

pie_extended/models/occ_cont/__init__.py (+25)

@@ -0,0 +1,25 @@
from ...utils import Metadata, File, get_path


DESC = Metadata(
    "OccitanContemporain",
    "occ_cont",
    ["Oriane Nédey", "Juliette Janès"],
    "Model trained on ...",
    "https://github.com/DEFI-COLaF/modeles-papie"
)

VERSION = "0.0.1"
DOWNLOADS = [
    File("https://github.com/DEFI-COLaF/modeles-papie/releases/download/" + VERSION +
         "/occ-cont-lemma.tar",
         "lemma.tar"),
    File("https://github.com/DEFI-COLaF/modeles-papie/releases/download/" + VERSION +
         "/occ-cont-POS.tar",
         "pos.tar"),  # stored as "pos.tar" so that Models below resolves it; "lemma.tar" would overwrite the lemma model
]

Models = "".join([
    "<{},lemma>".format(get_path("occ_cont", "lemma.tar")),
    "<{},pos>".format(get_path("occ_cont", "pos.tar"))
])
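
Once this module is in place, the model would normally be fetched like any other pie-extended model (for instance with "pie-extended download occ_cont" on the command line). As a minimal sketch, and assuming the new module is importable as pie_extended.models.occ_cont, the metadata above can be inspected like this (illustrative code, not part of the commit):

# Illustrative only: inspect the release assets and the Models string defined above.
from pie_extended.models import occ_cont

for download in occ_cont.DOWNLOADS:
    # each File entry points at a modeles-papie release asset for VERSION 0.0.1
    print(download)

# Models concatenates one "<local_path,task>" pair per tarball (lemma and pos)
print(occ_cont.Models)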

pie_extended/models/occ_cont/get.py (+23)

@@ -0,0 +1,23 @@
from pie_extended.pipeline.iterators.proto import DataIterator, GenericExcludePatterns
from pie_extended.pipeline.postprocessor.memory import MemoryzingProcessor
from pie_extended.models.occ_cont.tokenizer import OccMemorizingTokenizer
from pie_extended.pipeline.postprocessor.proto import ProcessorPrototype


def get_iterator_and_processor(max_tokens=256):
    tokenizer = OccMemorizingTokenizer()
    processor = MemoryzingProcessor(
        tokenizer_memory=tokenizer,
        head_processor=ProcessorPrototype()
    )
    iterator = DataIterator(
        tokenizer=tokenizer,
        max_tokens=max_tokens,
        exclude_patterns=[
            excl.exclude_regexp
            for excl in tokenizer.normalizers
            if excl.exclude_regexp
        ]
    )
    return iterator, processor
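
A minimal usage sketch for this factory, assuming the file lives at pie_extended/models/occ_cont/get.py as in the other pie-extended models (the tagging call itself is omitted, since the tagger API is not part of this diff):

# Illustrative only: obtain the iterator/processor pair that a tagger would consume.
from pie_extended.models.occ_cont.get import get_iterator_and_processor

iterator, processor = get_iterator_and_processor(max_tokens=256)
# iterator wraps OccMemorizingTokenizer and skips tokens matched by the
# ReferenceExcluder's exclude pattern; processor later restores the original
# forms memorized by the tokenizer during normalization.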

pie_extended/models/occ_cont/processor.py

Whitespace-only changes.

pie_extended/models/occ_cont/tokenizer.py (+123)

@@ -0,0 +1,123 @@
import regex as re
from pie_extended.pipeline.tokenizers.memorizing import MemorizingTokenizer
from typing import List, Generator, Tuple
import unicodedata
from pie_extended.pipeline.tokenizers.utils.excluder import (
    ReferenceExcluder,
    DEFAULT_CHAR_REGISTRY
)

_Dots_except_apostrophe = r".?!\"“”\"«»…\[\]\(\)„“"
_SpaceNormalizer = re.compile(r"(\s+)")


class OccMemorizingTokenizer(MemorizingTokenizer):
    """ Occitan tokenizer with memorizing capacities (for the normalization steps)

    This tokenizer is based on a Perl script published by Marianne Verges-Couret in 2019
    (https://zenodo.org/records/2533873), as well as on the description of the Python tokenizer
    in (Miletić, 2023), which was likewise derived from the work of the RESTAURE project.
    It was adapted to Python by Oriane Nédey and then ported to Pie-Extended.
    """
    _sentence_boundaries = re.compile(
        r"([" + _Dots_except_apostrophe + r"]+\s*)+"
    )
    re_add_space_around_punct = re.compile(r"(\s*)([^\w\s])(\s*)")

    # Pattern that matches any punctuation or symbol, with a few exceptions
    re_in_non_amb = re.compile(r"(?!['\-,.<>])[\p{P}\p{S}]")

    # Pattern that matches (XML/HTML...) tags  # TODO: check that this change is OK
    re_tags = re.compile(r'(<\\?[^\d\s].*>)')

    # the doubled braces keep the {2,} quantifier literal inside the f-string
    re_split_match = re.compile(rf"(\.{{2,}})|({re_in_non_amb.pattern})|{re_tags.pattern}")

    def __init__(self):
        super(OccMemorizingTokenizer, self).__init__()
        self.tokens = []
        self.char_registry = DEFAULT_CHAR_REGISTRY
        self.normalizers: Tuple[ReferenceExcluder] = (
            ReferenceExcluder(char_registry=self.char_registry),
        )
        self.re_ref = re.compile(
            rf"{self.char_registry['[']}REF[^{self.char_registry[']']}]+{self.char_registry[']']}"
        )
        self.re_split_step_one = re.compile(
            rf"(?:{self.normalizers[0].re.pattern})|({self.re_in_non_amb.pattern}|\s|\.{{2,}}|{self.re_tags.pattern})"
        )

    @staticmethod
    def _sentence_tokenizer_merge_matches(match):
        """ Best way we found to deal with repeating groups"""
        start, end = match.span()
        return match.string[start:end] + "<SPLIT>"

    def _real_sentence_tokenizer(self, string: str) -> List[str]:
        string = _SpaceNormalizer.sub(" ", string.strip())
        string = self._sentence_boundaries.sub(self._sentence_tokenizer_merge_matches, string)

        for normalizer in self.normalizers:
            string = normalizer.after_sentence_tokenizer(string)

        return string.split("<SPLIT>")

    def _real_word_tokenizer(self, text: str, lower: bool = False) -> List[str]:
        """
        Segments a string into a list of tokens by applying Occitan-specific regular expressions.

        :param text: string, ideally one single segment.
        :returns: list of segmented tokens
        """
        text = text.replace("qu'", "qu' ")  # TODO: is this not already done by the regexes below?
        text = text.replace("d'", "d' ")  # TODO: is this not already done by the regexes below?
        res = []
        text = re.sub(r'(\d)\s(\d)', r'\1<PPLesp>\2', text)
        for m in self.re_split_step_one.split(text):
            if not m:
                continue
            elif self.normalizers[0].re.match(m):
                res.append(m)
            elif self.re_split_match.match(m):
                res.append(m)
            elif not re.match(r'^\s*$', m):
                m = re.sub(r"(-[nz]-)(\P{L}*)", r"\t\1\t\2", m, flags=re.IGNORECASE)  # no space
                m = re.sub(r"(\P{L}|^)([dlmnst]\')", r"\1\t\2\t", m, flags=re.IGNORECASE)  # space before
                m = re.sub(r"(\P{L}|^)(\p{L}*[qnv][us]\')", r"\1\t\2\t", m, flags=re.IGNORECASE)  # space before
                m = re.sub(r"(\P{L}|^)(\p{L}*qu\')", r"\1\t\2\t", m, flags=re.IGNORECASE)  # space before  # TODO: duplicate of [qnv][us]' ?
                m = re.sub(r"(\P{L}|^)(\p{L}*ent\')", r"\1\t\2\t", m, flags=re.IGNORECASE)  # space before
                m = re.sub(r"(\P{L}|^)(\p{L}*[çcbzu]\')", r"\1\t\2\t", m, flags=re.IGNORECASE)  # space before  # TODO: merge with [dlmnst] ?
                m = re.sub(r"([\p{L}\d]+(\.[\p{L}\d]+)+)", r"\t\1\t", m)  # space before and after
                m = re.sub(r"\.($|\P{L})", r"\t.\1", m)
                m = re.sub(r"(\D|^),", r"\1\t,\t", m)
                m = re.sub(r",($|\D)", r"\t,\t\1", m)
                m = re.sub(r"-(vos|ne|[st][eu]?'?|l[aoi']s?|me|d'|en|[nv]os|u)($|\P{L})", r"\t-\1\t\2", m, flags=re.IGNORECASE)  # space after  # TODO: try to simplify?
                m = re.sub(r"\'([unv]\p{L}*)($|\P{L})", r"\t'\1\t\2", m, flags=re.IGNORECASE)  # rule for 'u 'us 'n 'v 'ns 'vs...  # space after
                m = re.sub(r"\'([dlmnsti])($|\P{L})", r"\t'\1\t\2", m, flags=re.IGNORECASE)  # rule for 'm 't 'i 's 'ac ...  # space after
                m = re.sub(r"(\p{P})(\p{P})", r"\t\1\t\2\t", m)
                m = re.sub(r"<PPLesp>", ' ', m)
                m = re.sub(r"([<>])", r"\t\1\t", m)
                res.extend(m.split('\t'))

        # Remove empty tokens
        res = [item for item in res if item.strip()]
        return res

    def normalizer(self, data: str) -> str:
        for excluder in self.normalizers:
            data = excluder.before_sentence_tokenizer(data)
        return data

    def sentence_tokenizer(self, text: str, lower: bool = False) -> Generator[List[str], None, None]:
        sentences = list()
        data = self.normalizer(text)
        for sent in self._real_sentence_tokenizer(data):
            sent = sent.strip()
            if sent:
                sentences.append(self.word_tokenizer(sent))
        yield from sentences

    def replacer(self, inp: str):
        for excluder in self.normalizers:
            if excluder.exclude_regexp.match(inp):
                if excluder.can_be_replaced:
                    return inp

        return unicodedata.normalize("NFKC", inp)
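
A small illustrative check of the tokenizer on its own, assuming the word_tokenizer method inherited from MemorizingTokenizer behaves as in the other pie-extended models (the exact token output depends on the regular expressions above):

# Illustrative only: sentence-split and word-tokenize a short Occitan string.
from pie_extended.models.occ_cont.tokenizer import OccMemorizingTokenizer

tokenizer = OccMemorizingTokenizer()
for sentence in tokenizer.sentence_tokenizer("Qu'es aquò ? Es una question."):
    print(sentence)
# Expected behaviour: elided forms such as "Qu'es" are split into "Qu'" + "es",
# and sentence-final punctuation ("?" and ".") becomes its own token.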
