-
Notifications
You must be signed in to change notification settings - Fork 0
/
data.py
159 lines (147 loc) · 5.8 KB
/
data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
from pymysql.err import InternalError,IntegrityError
from dateutil.parser import parse
import os
import hashlib
import yaml
import pymysql
# Parsing the emails
class Extract(object):
    """Extract routing headers from the header section of a raw e-mail.

    The parser is line oriented: it walks the message line by line, records
    the headers it knows about and stops at the first ``X-`` header.
    """

    # Headers whose value is a comma-separated list of addresses.
    _ADDRESS_HEADERS = ("To:", "Cc:", "Bcc:")
    # Headers that carry a single textual value.
    _VALUE_HEADERS = ("Message-ID:", "Date:", "From:", "Subject:")
    # Headers that are recognised only so they end any running continuation.
    _IGNORED_HEADERS = ("Mime-Version:", "Content-Type:",
                        "Content-Transfer-Encoding:")
    _KNOWN_HEADERS = _ADDRESS_HEADERS + _VALUE_HEADERS + _IGNORED_HEADERS

    def invalid(self, email):
        """Return True when *email* cannot be parsed at all.

        ``level`` calls this hook before parsing.  This default only rejects
        ``None`` and empty sized containers; subclasses may override it with
        stricter validation.
        """
        if email is None:
            return True
        try:
            return len(email) == 0
        except TypeError:
            # Unsized iterables (file objects, generators) are accepted.
            return False

    def _header_of(self, line):
        """Return the known header prefix that *line* starts with, or None."""
        for prefix in self._KNOWN_HEADERS:
            if line.startswith(prefix):
                return prefix
        return None

    def _split_addresses(self, text):
        """Split a comma-separated address list into stripped entries."""
        return [piece.strip() for piece in text.strip().split(",")
                if len(piece) > 1]

    def level(self, email):
        """Parse the header lines of *email* into a dictionary.

        Args:
            email: iterable of text lines making up the message.

        Returns:
            ``None`` (after printing ``INVALID``) when ``self.invalid(email)``
            is true, otherwise a dict with the keys ``to``, ``cc``, ``bcc``
            (lists of entries containing ``@``), ``from``, ``subject``,
            ``message_id`` (strings, ``""`` when absent), ``date`` (raw header
            text or ``None``) and ``sub_md5`` (hex MD5 of the lower-cased
            subject with every ``re:`` removed, or ``None`` without subject).
        """
        if self.invalid(email):
            # Single-argument print() behaves the same on Python 2 and 3.
            print("INVALID")
            return None

        addresses = dict((name, []) for name in self._ADDRESS_HEADERS)
        values = dict((name, None) for name in self._VALUE_HEADERS)
        prev = ""  # header that a following continuation line belongs to

        for line in email:
            line = line.replace("\r", "").replace("\n", "").replace(
                "\t", "").strip()
            # Match headers by prefix only: a substring test would stop at a
            # subject such as "X-mas party" or treat "Subject: ... Date: ..."
            # as the Date header.
            if line.startswith("X-"):
                break
            header = self._header_of(line)
            if header in addresses:
                prev = header
                addresses[header].extend(
                    self._split_addresses(line[len(header):]))
            elif header in values:
                prev = header
                values[header] = line[len(header):].strip()
            elif header is not None:
                # Recognised but unused header: it ends any continuation.
                prev = ""
            elif prev in addresses:
                # Folded address list continuing the previous header.
                addresses[prev].extend(self._split_addresses(line))
            elif prev == "Subject:":
                # Folded subject; joined without a separator, as before.
                values["Subject:"] = values["Subject:"] + line
            # NOTE(review): any other line that is not a known header is
            # silently skipped or, after To/Cc/Bcc/Subject, treated as a
            # continuation of that header.

        from_addr = values["From:"]
        subject = values["Subject:"]
        msg_id = values["Message-ID:"]

        sub_md5 = None
        if subject:
            normalized = subject.lower().replace("re:", "").strip()
            # hashlib needs bytes; only encode real text so that Python 2
            # byte strings are hashed exactly as before.
            if not isinstance(normalized, bytes):
                normalized = normalized.encode("utf-8")
            sub_md5 = hashlib.md5(normalized).hexdigest()

        store = {}
        store['to'] = [i.strip() for i in addresses["To:"] if '@' in i]
        store['cc'] = [i.strip() for i in addresses["Cc:"] if '@' in i]
        store['from'] = from_addr.strip() if from_addr else ""
        store['bcc'] = [i.strip() for i in addresses["Bcc:"] if '@' in i]
        store['subject'] = subject.strip() if subject else ""
        store['message_id'] = msg_id if msg_id else ""
        store['date'] = values["Date:"]
        store['sub_md5'] = sub_md5
        return store
# Creating Database, Inserting Parsed Data, running queries.
class Database(object):
    """MySQL helper: creates tables, inserts parsed rows and runs queries.

    Settings are read from ``ddl.yaml`` in the current working directory.
    The file is expected to provide a ``config`` mapping (``schema``,
    ``host``, ``user``, ``pass``, ``port``) and a ``ddl`` mapping of
    statement names to SQL text.
    """

    _EMAIL_INSERT = "INSERT INTO email(message_id, sender, subject, email_date, label, sub_md5) VALUES (%s,%s,%s,%s,%s, %s)"
    _RECIPIENT_INSERT = "INSERT INTO recipient(message_id, sender, recipient, is_to, is_cc, is_bcc) VALUES (%s,%s,%s,%s,%s,%s)"

    def __init__(self):
        """Load the configuration and open an autocommit connection."""
        self.config = os.path.join(os.getcwd(), "ddl.yaml")
        self.config_yaml = self._get_config()
        settings = self.config_yaml.get('config')
        self.schema = settings.get('schema')
        self.dbhost = settings.get('host')
        self.dbuser = settings.get('user')
        self.dbpass = settings.get('pass')
        self.dbport = settings.get('port')
        self.conn = pymysql.connect(host=self.dbhost,
                                    port=self.dbport,
                                    user=self.dbuser,
                                    passwd=self.dbpass,
                                    db=self.schema, charset='utf8',
                                    autocommit=True)
        self.cursor = self.conn.cursor()

    def _get_config(self):
        """Return the parsed content of the YAML file at ``self.config``."""
        # safe_load replaces the former bare yaml.load(): it needs no Loader
        # argument and refuses to construct arbitrary Python objects.  The
        # with-block closes the file handle the original left open.
        with open(self.config) as handle:
            return yaml.safe_load(handle)

    def create_tables(self):
        """Execute every statement listed under ``ddl`` in the config.

        An ``InternalError`` raised by a statement is printed and the loop
        continues with the next statement.
        """
        for name, ddl in self.config_yaml.get('ddl').items():
            if 'drop' in name:
                print("Dropping table {}: ".format(name))
            elif 'create' in name:
                print("Creating table {}: ".format(name))
            try:
                self.cursor.execute(ddl)
            except InternalError as e:
                print(e)

    def insert(self, table=None, rows=None):
        """Insert *rows* into the ``email`` or ``recipient`` table.

        Args:
            table: ``'email'`` or ``'recipient'``.
            rows: iterable of six-value parameter sequences, one per row.

        Returns:
            ``False`` when an argument is missing or the table is unknown;
            ``None`` once every row has been attempted.  A row rejected with
            ``IntegrityError`` is printed and skipped.
        """
        if rows is None or table is None:
            print("Table Missing?")
            return False
        # Compare by value: ``is`` only matched strings that happened to be
        # the very same interned object.
        if table == 'email':
            statement = self._EMAIL_INSERT
        elif table == 'recipient':
            statement = self._RECIPIENT_INSERT
        else:
            print("FAILURE")
            return False
        for row in rows:
            try:
                self.cursor.execute(statement, row)
            except IntegrityError as e:
                print(e)

    def run_query(self, query=None, n=None):
        """Execute *query* and fetch its result.

        Args:
            query: SQL text; ``None`` makes the call a no-op.
            n: ``None`` fetches all rows, ``1`` fetches a single row, any
                other value fetches up to ``n`` rows.

        Returns:
            ``None`` without a query, otherwise whatever the cursor's
            ``fetchall`` / ``fetchone`` / ``fetchmany`` call returns.
        """
        if query is None:
            return None
        self.cursor.execute(query)
        if n is None:
            return self.cursor.fetchall()
        if n == 1:
            return self.cursor.fetchone()
        # Honour the requested row count (previously hard-coded to 5).
        return self.cursor.fetchmany(n)

    def convert_date_format(self, date=None):
        """Return *date* reformatted as ``YYYY-MM-DD HH:MM:SS``.

        The text is parsed with ``dateutil.parser.parse``; a falsy *date*
        yields ``None``.
        """
        if not date:
            return None
        return parse(date).strftime("%Y-%m-%d %H:%M:%S")