-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcommon.py
83 lines (67 loc) · 2.68 KB
/
common.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
from re import sub, MULTILINE, IGNORECASE
from string import ascii_lowercase, digits
import numpy as np
from lxml.etree import ParserError
from lxml.html import fromstring
# Number of tokens every encoded sample ends up with (shorter ones are padded,
# longer ones are cut).
sequence_length = 1000
# Samples that yield this many tokens or fewer are discarded.
min_sequence_length = 500

# The padding character comes first in the alphabet, so it maps to index 0.
pad_char = '\0'
tokens = pad_char + ascii_lowercase + '!"#%&\'()/:@^~ *+,-.;?' + digits
# Lookup table from each character of the alphabet to its position in `tokens`.
char_to_int = dict(zip(tokens, range(len(tokens))))
def clean_text(text):
    """Lower-case *text* and strip URLs, known boilerplate and extra whitespace.

    Args:
        text: Raw text of a post.

    Returns:
        The lower-cased text with ``http...`` and ``www....`` URLs, the
        ``urllink`` placeholder and the "(Taken with Instagram)" caption
        removed, and every run of whitespace collapsed to a single space
        (leading and trailing whitespace is dropped).
    """
    text = text.lower()
    text = sub(r'http\S+', '', text)  # remove http urls
    # \S+ stops at any whitespace, so a URL followed by a tab or newline does
    # not drag the text after it into the match.
    text = sub(r'www\.\S+', '', text)  # remove www.* urls
    text = text.replace('urllink', '')
    # The text is already lower-case at this point, so the caption has to be
    # matched in lower case as well.
    text = text.replace('(taken with instagram)', '')
    # Splitting on whitespace and re-joining collapses whitespace runs.
    return ' '.join(text.split())
def encode_text(text):
    """Clean *text* and return it as a one-hot matrix over the token alphabet.

    Characters that are not part of the alphabet are dropped.

    Args:
        text: Raw text to encode.

    Returns:
        The result of ``one_hot_encode`` for the kept characters: one row per
        character, ``len(tokens)`` columns.
    """
    indices = [char_to_int[ch] for ch in clean_text(text) if ch in char_to_int]
    return one_hot_encode(indices, len(tokens))
def one_hot_encode(ints, number_classes):
    """One-hot encode a sequence of class indices.

    Args:
        ints: Class indices, each in ``range(number_classes)``.
        number_classes: Width of every one-hot row.

    Returns:
        A ``uint8`` array holding, for each index, the matching row of the
        ``number_classes`` x ``number_classes`` identity matrix.
    """
    identity = np.eye(number_classes, dtype='uint8')
    # Integer-array indexing picks one identity row per index.
    return identity[ints]
def pad(x, length):
    """Zero-pad array *x* along its first axis up to *length* entries.

    Works for arrays of any rank >= 1; all trailing dimensions are kept.

    Args:
        x: NumPy array to pad.
        length: Size of the first axis of the result.

    Returns:
        A new array of shape ``(length,) + x.shape[1:]`` and the dtype of
        *x*, holding *x* at the start and zeros after it.

    Raises:
        ValueError: If *x* already has more than *length* entries along its
            first axis.
    """
    size = x.shape[0]
    if size > length:
        raise ValueError(
            'cannot pad an array of length %d to the shorter length %d'
            % (size, length)
        )
    # Only the first axis grows; every other dimension is carried over as is.
    padded = np.zeros((length,) + x.shape[1:], dtype=x.dtype)
    padded[:size] = x
    return padded
def tokenize(text):
    """Clean *text* and map it to the integer indices of its characters.

    Characters that are not part of the alphabet are dropped.

    Args:
        text: Raw text to tokenize.

    Returns:
        A one-dimensional ``uint8`` array with one alphabet index per kept
        character.
    """
    indices = [char_to_int[ch] for ch in clean_text(text) if ch in char_to_int]
    return np.array(indices, dtype='uint8')
def read_xml(path):
    """Read the file at *path* as ASCII and parse it with ``fromstring``.

    Args:
        path: Location of the file to read.

    Returns:
        The element that ``lxml.html.fromstring`` builds from the file
        content.
    """
    # Decoding as ASCII with errors='ignore' drops every byte that is not
    # valid ASCII, so the text read here contains only code points below 128
    # and needs no additional per-character filtering.
    with open(path, 'r', encoding='ascii', errors='ignore') as file:
        blog = file.read()
    return fromstring(blog)
def interpret_html_texts(texts, is_female):
    """Turn raw HTML posts into fixed-length token sequences with labels.

    Each post is lightly normalised, reduced to its text content with lxml,
    tokenized, and then padded or cut to ``sequence_length`` tokens.  Empty
    or whitespace-only posts, posts the parser rejects, and posts that yield
    ``min_sequence_length`` tokens or fewer are skipped.

    Args:
        texts: Iterable of HTML strings, one per post.
        is_female: Truthy to label every sample 1, falsy to label it 0.

    Returns:
        A tuple ``(x, y)`` of two equally long lists: ``x`` holds the encoded
        arrays (each ``sequence_length`` long) and ``y`` the matching labels.
    """
    # The label is the same for every post, so it is computed once.
    label = 1 if is_female else 0
    x = []
    y = []
    for text in texts:
        text = text.replace('>;', '>')  # remove "extra" semicolon after named entities
        text = sub(r'<br\s*\/?>', ' ', text, flags=IGNORECASE)  # replace line breaks with a single white space
        text = sub(r'<p\s*>', ' <p>', text, flags=IGNORECASE)  # put a white space in front of paragraph starts
        # NOTE(review): the pattern contains a space ("& nbsp;") - confirm this
        # is how the entity appears in the data.
        text = sub(r'& nbsp;', ' ', text, flags=IGNORECASE)  # replace "& nbsp;" with a single white space
        # str.isspace() is False for '', so test the stripped text instead:
        # this skips empty posts as well as whitespace-only ones.
        if not text.strip():
            continue
        try:
            inner_text = fromstring(text).text_content()
        except ParserError as e:
            print('Blog post parsing error: ', e)
            continue
        encoding = tokenize(inner_text)
        if encoding.shape[0] <= min_sequence_length:
            continue
        if encoding.shape[0] < sequence_length:
            encoding = pad(encoding, sequence_length)
        # Cut anything beyond sequence_length (a no-op for padded samples).
        x.append(encoding[:sequence_length])
        y.append(label)
    return x, y