# -*- coding: utf8 -*-
###############################################################
# PyNLPl - Text Processors
# by Maarten van Gompel
# Centre for Language Studies
# Radboud University Nijmegen
# http://www.github.com/proycon/pynlpl
# proycon AT anaproy DOT nl
#
# Licensed under GPLv3
#
# This is a Python library containing text processors
#
###############################################################
from __future__ import print_function
from __future__ import unicode_literals
from __future__ import division
from __future__ import absolute_import
from pynlpl.common import isstring
import sys
if sys.version < '3':
from codecs import getwriter
stderr = getwriter('utf-8')(sys.stderr)
stdout = getwriter('utf-8')(sys.stdout)
else:
stderr = sys.stderr
stdout = sys.stdout
import unicodedata
import string
import io
import array
import re
from itertools import permutations
from pynlpl.statistics import FrequencyList
from pynlpl.formats import folia
from pynlpl.algorithms import bytesize
WHITESPACE = [" ", "\t", "\n", "\r","\v","\f"]
EOSMARKERS = ('.','?','!','。',';','؟','。','?','!','।','։','՞','።','᙮','។','៕')
REGEXP_URL = re.compile(r"^(?:(?:https?):(?:(?://)|(?:\\\\))|www\.)(?:[\w\d:#@%/;$()~_?\+-=\\\.&](?:#!)?)*")
REGEXP_MAIL = re.compile(r"^[A-Za-z0-9\.\+_-]+@[A-Za-z0-9\._-]+(?:\.[a-zA-Z]+)+") #email
TOKENIZERRULES = (REGEXP_URL, REGEXP_MAIL)
class Windower(object):
"""Moves a sliding window over a list of tokens, upon iteration in yields all n-grams of specified size in a tuple.
Example without markers:
>>> for ngram in Windower("This is a test .",3, None, None):
... print(" ".join(ngram))
This is a
is a test
a test .
Example with default markers:
>>> for ngram in Windower("This is a test .",3):
... print(" ".join(ngram))
<begin> <begin> This
<begin> This is
This is a
is a test
a test .
test . <end>
. <end> <end>
"""
def __init__(self, tokens, n=1, beginmarker = "<begin>", endmarker = "<end>"):
"""
Constructor for Windower
        :param tokens: The tokens to iterate over. Should be an iterable. Strings will be split on spaces automatically.
:type tokens: iterable
:param n: The size of the n-grams to extract
:type n: integer
:param beginmarker: The marker for the beginning of the sentence, defaults to "<begin>". Set to None if no markers are desired.
:type beginmarker: string or None
:param endmarker: The marker for the end of the sentence, defaults to "<end>". Set to None if no markers are desired.
:type endmarker: string or None
"""
if isinstance(tokens, str) or (sys.version < '3' and isinstance(tokens, unicode)):
self.tokens = tuple(tokens.split())
else:
self.tokens = tuple(tokens)
assert isinstance(n, int)
self.n = n
self.beginmarker = beginmarker
self.endmarker = endmarker
def __len__(self):
"""Returns the number of n-grams in the data (quick computation without iteration)
Without markers:
>>> len(Windower("This is a test .",3, None, None))
3
>>> len(Windower("This is a test .",2, None, None))
4
>>> len(Windower("This is a test .",1, None, None))
5
With default markers:
>>> len(Windower("This is a test .",3))
7
"""
c = (len(self.tokens) - self.n) + 1
if self.beginmarker: c += self.n-1
if self.endmarker: c += self.n-1
return c
def __iter__(self):
"""Yields an n-gram (tuple) at each iteration"""
l = len(self.tokens)
if self.beginmarker:
beginmarker = (self.beginmarker), #tuple
if self.endmarker:
endmarker = (self.endmarker), #tuple
for i in range(-(self.n - 1),l):
begin = i
end = i + self.n
if begin >= 0 and end <= l:
yield tuple(self.tokens[begin:end])
elif begin < 0 and end > l:
if not self.beginmarker or not self.endmarker:
continue
else:
yield tuple(((begin * -1) * beginmarker ) + self.tokens + ((end - l) * endmarker ))
elif begin < 0:
if not self.beginmarker:
continue
else:
yield tuple(((begin * -1) * beginmarker ) + self.tokens[0:end])
elif end > l:
if not self.endmarker:
continue
else:
yield tuple(self.tokens[begin:] + ((end - l) * endmarker))
class MultiWindower(object):
"Extract n-grams of various configurations from a sequence"
def __init__(self,tokens, min_n = 1, max_n = 9, beginmarker=None, endmarker=None):
if isinstance(tokens, str) or (sys.version < '3' and isinstance(tokens, unicode)):
self.tokens = tuple(tokens.split())
else:
self.tokens = tuple(tokens)
assert isinstance(min_n, int)
assert isinstance(max_n, int)
self.min_n = min_n
self.max_n = max_n
self.beginmarker = beginmarker
self.endmarker = endmarker
def __iter__(self):
for n in range(self.min_n, self.max_n + 1):
for ngram in Windower(self.tokens,n, self.beginmarker, self.endmarker):
yield ngram
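#Illustrative usage sketch (not part of the original module): MultiWindower
#yields every n-gram for each length in the configured range, smallest n first.
#   >>> list(MultiWindower("a b c", 1, 2))
#   [('a',), ('b',), ('c',), ('a', 'b'), ('b', 'c')]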
class ReflowText(object):
"""Attempts to re-flow a text that has arbitrary line endings in it. Also undoes hyphenisation"""
def __init__(self, stream, filternontext=True):
self.stream = stream
self.filternontext = filternontext
def __iter__(self):
eosmarkers = ('.',':','?','!','"',"'","„","”","’")
emptyline = 0
buffer = ""
for line in self.stream:
line = line.strip()
if line:
if emptyline:
if buffer:
yield buffer
yield ""
emptyline = 0
buffer = ""
if buffer: buffer += ' '
if (line[-1] in eosmarkers):
buffer += line
yield buffer
buffer = ""
emptyline = 0
elif len(line) > 2 and line[-1] == '-' and line[-2].isalpha():
#undo hyphenisation
buffer += line[:-1]
else:
if self.filternontext:
hastext = False
for c in line:
if c.isalpha():
hastext = True
break
else:
hastext = True
if hastext:
buffer += line
else:
emptyline += 1
#print "BUFFER=[" + buffer.encode('utf-8') + "] emptyline=" + str(emptyline)
if buffer:
yield buffer
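#Illustrative usage sketch (not part of the original module); io.StringIO
#stands in for any file/stream-like object:
#   >>> lines = io.StringIO("This is a\nsingle sentence.\nAnd another\none.\n")
#   >>> for paragraph in ReflowText(lines):
#   ...     print(paragraph)
#   This is a single sentence.
#   And another one.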
def calculate_overlap(haystack, needle, allowpartial=True):
"""Calculate the overlap between two sequences. Yields (overlap, placement) tuples (multiple because there may be multiple overlaps!). The former is the part of the sequence that overlaps, and the latter is -1 if the overlap is on the left side, 0 if it is a subset, 1 if it overlaps on the right side, 2 if its an identical match"""
needle = tuple(needle)
haystack = tuple(haystack)
solutions = []
#equality check
if needle == haystack:
return [(needle, 2)]
if allowpartial:
minl =1
else:
minl = len(needle)
for l in range(minl,min(len(needle), len(haystack))+1):
#print "LEFT-DEBUG", l,":", needle[-l:], " vs ", haystack[:l]
#print "RIGHT-DEBUG", l,":", needle[:l], " vs ", haystack[-l:]
#Search for overlap left (including partial overlap!)
if needle[-l:] == haystack[:l]:
#print "LEFT MATCH"
solutions.append( (needle[-l:], -1) )
#Search for overlap right (including partial overlap!)
if needle[:l] == haystack[-l:]:
#print "RIGHT MATCH"
solutions.append( (needle[:l], 1) )
if len(needle) <= len(haystack):
options = list(iter(Windower(haystack,len(needle),beginmarker=None,endmarker=None)))
for option in options[1:-1]:
if option == needle:
#print "SUBSET MATCH"
solutions.append( (needle, 0) )
return solutions
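#Illustrative usage sketch (not part of the original module): here the needle
#overlaps the right-hand side of the haystack, hence placement 1:
#   >>> calculate_overlap("a b c d".split(), "c d e".split())
#   [(('c', 'd'), 1)]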
class Tokenizer(object):
"""A tokenizer and sentence splitter, which acts on a file/stream-like object and when iterating over the object it yields
a lists of tokens (in case the sentence splitter is active (default)), or a token (if the sentence splitter is deactivated).
"""
def __init__(self, stream, splitsentences=True, onesentenceperline=False, regexps=TOKENIZERRULES):
"""
Constructor for Tokenizer
:param stream: An iterable or file-object containing the data to tokenize
:type stream: iterable or file-like object
:param splitsentences: Enable sentence splitter? (default=_True_)
:type splitsentences: bool
:param onesentenceperline: Assume input has one sentence per line? (default=_False_)
:type onesentenceperline: bool
:param regexps: Regular expressions to use as tokeniser rules in tokenisation (default=_pynlpl.textprocessors.TOKENIZERRULES_)
:type regexps: Tuple/list of regular expressions to use in tokenisation
"""
self.stream = stream
self.regexps = regexps
self.splitsentences=splitsentences
self.onesentenceperline = onesentenceperline
def __iter__(self):
buffer = ""
for line in self.stream:
line = line.strip()
if line:
if buffer: buffer += "\n"
buffer += line
if (self.onesentenceperline or not line) and buffer:
if self.splitsentences:
yield split_sentences(tokenize(buffer))
else:
for token in tokenize(buffer, self.regexps):
yield token
buffer = ""
if buffer:
if self.splitsentences:
yield split_sentences(tokenize(buffer))
else:
for token in tokenize(buffer, self.regexps):
yield token
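#Illustrative usage sketch (not part of the original module); io.StringIO
#stands in for a file object. With the sentence splitter active, each
#iteration yields an iterable of sentences (each a list of tokens):
#   >>> stream = io.StringIO("This is one sentence. This is another.")
#   >>> for sentences in Tokenizer(stream):
#   ...     for sentence in sentences:
#   ...         print(" ".join(sentence))
#   This is one sentence .
#   This is another .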
def tokenize(text, regexps=TOKENIZERRULES):
"""Tokenizes a string and returns a list of tokens
:param text: The text to tokenise
:type text: string
:param regexps: Regular expressions to use as tokeniser rules in tokenisation (default=_pynlpl.textprocessors.TOKENIZERRULES_)
:type regexps: Tuple/list of regular expressions to use in tokenisation
:rtype: Returns a list of tokens
Examples:
>>> for token in tokenize("This is a test."):
... print(token)
This
is
a
test
.
"""
    #compile any regexps that were passed as strings, without mutating the caller's sequence (the default is a tuple)
    regexps = [re.compile(regexp) if isstring(regexp) else regexp for regexp in regexps]
tokens = []
begin = 0
for i, c in enumerate(text):
if begin > i:
continue
elif i == begin:
m = False
for regexp in regexps:
m = regexp.findall(text[i:i+300])
if m:
tokens.append(m[0])
begin = i + len(m[0])
break
if m: continue
if c in string.punctuation or c in WHITESPACE:
prev = text[i-1] if i > 0 else ""
next = text[i+1] if i < len(text)-1 else ""
if (c == '.' or c == ',') and prev.isdigit() and next.isdigit():
#punctuation in between numbers, keep as one token
pass
elif (c == "'" or c == "`") and prev.isalpha() and next.isalpha():
#quote in between chars, keep...
pass
elif c not in WHITESPACE and next == c: #group clusters of identical punctuation together
continue
elif c == '\r' and prev == '\n':
#ignore
begin = i+1
continue
else:
token = text[begin:i]
if token: tokens.append(token)
if c not in WHITESPACE:
tokens.append(c) #anything but spaces and newlines (i.e. punctuation) counts as a token too
begin = i + 1 #set the begin cursor
if begin <= len(text) - 1:
token = text[begin:]
tokens.append(token)
return tokens
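#Illustrative usage sketch (not part of the original module): the default
#TOKENIZERRULES keep URLs (and e-mail addresses) together as single tokens:
#   >>> tokenize("Visit http://www.github.com/proycon/pynlpl today!")
#   ['Visit', 'http://www.github.com/proycon/pynlpl', 'today', '!']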
def crude_tokenizer(text):
"""Replaced by tokenize(). Alias"""
return tokenize(text) #backwards-compatibility, not so crude anymore
def tokenise(text, regexps=TOKENIZERRULES): #for the British
"""Alias for the British"""
return tokenize(text)
def is_end_of_sentence(tokens, i):
    # Is this an end-of-sentence marker? And is it either the last token,
    # or followed by a token that is NOT an end-of-sentence marker as well?
    # (to deal with ellipsis etc.)
return tokens[i] in EOSMARKERS and (i == len(tokens) - 1 or not tokens[i+1] in EOSMARKERS)
def split_sentences(tokens):
"""Split sentences (based on tokenised data), returns sentences as a list of lists of tokens, each sentence is a list of tokens"""
begin = 0
for i, token in enumerate(tokens):
if is_end_of_sentence(tokens, i):
yield tokens[begin:i+1]
begin = i+1
if begin <= len(tokens)-1:
yield tokens[begin:]
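#Illustrative usage sketch (not part of the original module): split_sentences
#expects tokenised input, e.g. the output of tokenize():
#   >>> list(split_sentences(['Hello', '.', 'Bye', '.']))
#   [['Hello', '.'], ['Bye', '.']]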
def strip_accents(s, encoding= 'utf-8'):
"""Strip characters with diacritics and return a flat ascii representation"""
if sys.version < '3':
if isinstance(s,unicode):
return unicodedata.normalize('NFKD', s).encode('ASCII', 'ignore')
else:
return unicodedata.normalize('NFKD', unicode(s,encoding)).encode('ASCII', 'ignore')
else:
if isinstance(s,bytes): s = str(s,encoding)
return str(unicodedata.normalize('NFKD', s).encode('ASCII', 'ignore'),'ascii')
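#Illustrative usage sketch (not part of the original module), showing the
#Python 3 behaviour (a plain str is returned):
#   >>> strip_accents("café crème")
#   'cafe creme'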
def swap(tokens, maxdist=2):
"""Perform a swap operation on a sequence of tokens, exhaustively swapping all tokens up to the maximum specified distance. This is a subset of all permutations."""
assert maxdist >= 2
tokens = list(tokens)
if maxdist > len(tokens):
maxdist = len(tokens)
l = len(tokens)
for i in range(0,l - 1):
for permutation in permutations(tokens[i:i+maxdist]):
if permutation != tuple(tokens[i:i+maxdist]):
newtokens = tokens[:i]
newtokens += permutation
newtokens += tokens[i+maxdist:]
yield newtokens
if maxdist == len(tokens):
break
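#Illustrative usage sketch (not part of the original module): with maxdist=2
#only adjacent tokens are swapped, a subset of all permutations:
#   >>> list(swap(['a', 'b', 'c'], 2))
#   [['b', 'a', 'c'], ['a', 'c', 'b']]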
def find_keyword_in_context(tokens, keyword, contextsize=1):
"""Find a keyword in a particular sequence of tokens, and return the local context. Contextsize is the number of words to the left and right. The keyword may have multiple word, in which case it should to passed as a tuple or list"""
if isinstance(keyword,tuple) and isinstance(keyword,list):
l = len(keyword)
else:
keyword = (keyword,)
l = 1
n = l + contextsize*2
focuspos = contextsize + 1
for ngram in Windower(tokens,n,None,None):
if ngram[focuspos:focuspos+l] == keyword:
yield ngram[:focuspos], ngram[focuspos:focuspos+l],ngram[focuspos+l+1:]
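#Illustrative usage sketch (not part of the original module): with contextsize=1,
#each hit yields a (left context, keyword, right context) triple of tuples:
#   >>> list(find_keyword_in_context("the quick brown fox".split(), "brown", 1))
#   [(('quick',), ('brown',), ('fox',))]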
if __name__ == "__main__":
import doctest
doctest.testmod()