-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathenron.py
executable file
·83 lines (76 loc) · 3.01 KB
/
enron.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
#!/usr/bin/env python3
from argparse import ArgumentParser
from email import message_from_file
from glob import glob
from hashlib import md5
from os import makedirs
from os.path import isdir, join
from re import IGNORECASE, escape, sub

import numpy as np
from gender_guesser.detector import Detector
from pandas import read_csv

from common import min_sequence_length, pad, sequence_length, tokenize
# Preprocess the sent-mail folders of the Enron corpus into a labelled dataset.
# For every usable mail one encoding (produced by `tokenize` from `common`) is
# stored in `x` and a gender label in `y` (1 for 'female'/'mostly_female',
# 0 for 'male'/'mostly_male'); both are saved as .npy files under
# <data>/proc/enron.

# Substrings that mark the start of quoted history, older messages or
# signatures; everything from the earliest hit onwards is discarded.
REPLY_INDICATORS = ('\nFrom:', '\nTo:', '- Original Message -', '-----------')

# Detector results that are kept; any other result is skipped.
KNOWN_GENDERS = ('female', 'mostly_female', 'mostly_male', 'male')


def _sender_names(mail, roles):
    """Return the list of name tokens for the sender of *mail*.

    The local part of the ``From`` address is looked up in column 0 of
    *roles*; on a hit the tokens are taken from column 1 of that row,
    otherwise the ``X-From`` header is split on spaces.
    """
    sender = mail.get('From') or ''  # the header may be missing entirely
    if '@' in sender:
        mail_addr = sender.split('@')[0]
        role = roles[roles[0] == mail_addr]
        if len(role) > 0:
            # NOTE(review): split(' ')[0] keeps only the first space-separated
            # token of column 1, so a single name token is returned here -
            # confirm against the roles file whether more was intended.
            # `.item()` raises ValueError if more than one row matches.
            full_name = role[1].item().split(' ')[0]
            return full_name.split(' ')
    return mail.get('X-From').split(' ')


def _strip_quoted_history(text):
    """Cut *text* at the earliest reply indicator and drop '>' quote lines."""
    cut_points = [text.find(indicator) for indicator in REPLY_INDICATORS]
    cut_points = [index for index in cut_points if index > -1]
    if cut_points:
        # Keep a 5-character margin before the marker (reason not visible
        # here - TODO confirm). Clamp at 0: a negative end index would slice
        # from the end of the string and keep the quoted history.
        text = text[:max(min(cut_points) - 5, 0)]
    return '\n'.join(line for line in text.split('\n') if not line.startswith('>'))


def _remove_names(text, names):
    """Delete every occurrence (case-insensitive) of the sender's names.

    Tokens of two characters or fewer are left alone. Each token is escaped
    so that regex metacharacters in it are matched literally.
    """
    for name in names:
        if len(name) > 2:
            text = sub(escape(name), '', text, flags=IGNORECASE)
    return text


parser = ArgumentParser()
parser.add_argument('-d', '--data', default='data')
args = parser.parse_args()

data_path = args.data
dataset_path = join(data_path, 'raw', 'enron')
in_paths = glob(join(dataset_path, '**/*sent*/**'), recursive=True)
roles_path = join(data_path, 'raw', 'enron_roles.txt')
out_dir = join(data_path, 'proc', 'enron')

checksums = set()  # md5 digests of mails already added to the dataset
detector = Detector()
x = []
y = []
# NOTE(review): row 139 of the roles file is skipped - presumably malformed; confirm.
roles = read_csv(roles_path, sep='\t', skiprows=[139], header=None)

for path in in_paths:
    if isdir(path):
        continue
    with open(path, 'r', encoding='iso8859') as file:
        mail = message_from_file(file)
    if 'X-From' not in mail.keys():
        continue

    names = _sender_names(mail, roles)
    gender = detector.get_gender(names[0])
    if gender not in KNOWN_GENDERS:  # ignore androgynous names etc.
        continue

    checksum = md5(mail.as_bytes()).hexdigest()
    if checksum in checksums:  # do not process duplicate mails
        continue

    text = mail.get_payload()
    if not isinstance(text, str):  # multipart payloads are lists of sub-messages
        continue
    if 'Forwarded by' in text:  # discard forwarded emails for obvious authorship disambiguation
        continue

    text = _strip_quoted_history(text)  # remove older messages, signatures, quotes
    text = _remove_names(text, names)  # own name would leak the gender (e.g. in signature)
    # replace question mark, as it is also used for unknown characters and redacted text
    text = text.replace('?', '')

    encoding = tokenize(text)
    if encoding.shape[0] <= min_sequence_length:
        continue
    if encoding.shape[0] < sequence_length:
        encoding = pad(encoding, sequence_length)
    x.append(encoding[:sequence_length])
    y.append(1 if 'female' in gender else 0)
    checksums.add(checksum)

x = np.vstack(x)
y = np.array(y)
makedirs(out_dir, exist_ok=True)
np.save(join(out_dir, 'x'), x)
np.save(join(out_dir, 'y'), y)