-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathaux.py
50 lines (40 loc) · 1.21 KB
/
aux.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
# encoding: utf-8
# Regular expressions and auxiliary functions used in this script are taken from O'Connor et al.'s Tweetmotif
# see: https://github.com/brendano/tweetmotif
import re
def regex_or(*items):
    """Return a regex group that matches any one of the given sub-patterns.

    The sub-patterns are joined with ``|`` and the result is wrapped in a
    single pair of (capturing) parentheses, e.g. ``regex_or('a', 'b')``
    gives ``'(a|b)'``.
    """
    alternatives = '|'.join(items)
    return '(%s)' % alternatives
def pos_lookahead(r):
    """Wrap the pattern string *r* in a positive lookahead, ``(?=r)``."""
    return ''.join(('(?=', r, ')'))
def neg_lookahead(r):
    """Wrap the pattern string *r* in a negative lookahead, ``(?!r)``."""
    return ''.join(('(?!', r, ')'))
def optional(r):
    """Return *r* wrapped in a group that may match zero or one time, ``(r)?``."""
    wrapped = '(%s)?' % r
    return wrapped
# Build URL
# The final ``url`` pattern is assembled from the named fragments below.

# Quote marks and sentence punctuation.
PunctChars = r'[\'“".?!,:;]'
# The named HTML entities &amp; &lt; &gt; &quot;
html_entity = '&(amp|lt|gt|quot);'

# First way a URL may begin: an http(s) scheme or a leading "www.".
UrlStart1 = regex_or('https?://', r'www\.')

# Second way a URL may begin: a bare host name ending in one of these
# suffixes.
CommonTLDs = regex_or('com', r'co\.uk', 'org', 'net', 'info', 'ca')
# NOTE: inside a character class ``\b`` denotes a backspace character, not a
# word boundary, so this lookahead requires an actual following character.
UrlStart2 = ''.join((
    r'[a-z0-9\.-]+?',
    r'\.',
    CommonTLDs,
    pos_lookahead(r'[/ \W\b]'),
))

# Lazily matched run of characters other than space, tab, CR, LF, '<' and '>'.
UrlBody = r'[^ \t\r\n<>]*?'

# One or more (lazy) punctuation characters or HTML entities.
UrlExtraCrapBeforeEnd = regex_or(PunctChars, html_entity) + '+?'
# What terminates a URL: two or more dots, an angle bracket, whitespace,
# or ``$``.
UrlEnd = regex_or(r'\.\.+', r'[<>]', r'\s', '$')

# Word boundary, either kind of start, the body, and then a lookahead for
# optional trailing punctuation/entities followed by a terminator.  Because
# the tail is only a lookahead, it is not included in the matched text.
url = ''.join((
    r'\b',
    regex_or(UrlStart1, UrlStart2),
    UrlBody,
    pos_lookahead(optional(UrlExtraCrapBeforeEnd) + UrlEnd),
))
# Build emoticon
# An emoticon is matched as three consecutive groups: eyes, nose, mouth.

# Eyes.
NormalEyes = r'[:=]'
Wink = r'[;]'

# Nose: the empty first alternative lets the nose be absent.
NoseArea = r'(|o|O|-)'

# Mouths.
HappyMouths = r'[D\)\]]'
SadMouths = r'[\(\[]'
Tongue = r'[pP]'
OtherMouths = r'[doO/\\]'

emoticon = ''.join((
    '(%s|%s)' % (NormalEyes, Wink),
    NoseArea,
    '(%s)' % '|'.join((Tongue, OtherMouths, SadMouths, HappyMouths)),
))