-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathserializer.py
executable file
·185 lines (148 loc) · 5.88 KB
/
serializer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
#! /usr/bin/env python
# -*- coding: utf-8 -*-
#
# Interpreter version: python 2.7
#
# Imports =====================================================================
import calendar
import mailbox
import os
import os.path
import time
from email.parser import Parser
from email.utils import formatdate

from dateutil.parser import parse as date_parse
# import html2text

from grab_self_mail_list import ShelveDB
# Variables ===================================================================
# Classes =====================================================================
class Message(object):
    """
    One archived message wrapped as a parsed e-mail object.

    The constructor parses the raw e-mail text, picks the earliest plausible
    timestamp, cleans the subject and rewrites a handful of headers so that
    the result can be stored into an mbox file with :meth:`to_mbox`.

    Attributes:
        uid: Message identifier as given by the caller.
        user_id: Numeric identifier of the author.
        timestamp (int): Unix timestamp chosen by :meth:`_self_parse_time`.
        subject (str): UTF-8 encoded subject without the list tag.
        raw_email (str): UTF-8 encoded, unmodified raw e-mail text.
        author_name (str): UTF-8 encoded display name of the author.
        email_msg: Parsed e-mail object (result of ``Parser().parsestr()``).
    """
    # 1998-01-01 00:00:00 UTC; timestamps at or before this are rejected.
    _MIN_PLAUSIBLE_TIMESTAMP = 883612800

    # HTML-escaped sequences found in the raw e-mail text and their plain
    # replacements, applied in this order. The variants containing "=\n"
    # cover entities split in two by a quoted-printable soft line break.
    # http://bruno.im/2009/dec/18/decoding-emails-python/
    # NOTE(review): the entity spellings were reconstructed (the source had
    # them already decoded); confirm "&#39;" is the apostrophe form in use.
    _ENTITY_REPLACEMENTS = (
        ("&lt;", "<"),
        ("&lt=\n;", "<=\n"),
        ("&l=\nt;", "<=\n"),
        ("&=\nlt;", "<=\n"),
        ("&#39;", "'"),
        ("&gt;", ">"),
        ("&gt=\n;", ">=\n"),
        ("&g=\nt;", ">=\n"),
        ("&=\ngt;", ">=\n"),
        ("&quot;", '"'),
    )

    # Headers removed from the parsed message before the new ones are set.
    _HEADERS_TO_DROP = (
        "To",
        "From",
        "Date",
        "Subject",
        "Reply-To",
        "Return-Path",
        "X-Original-From",
    )

    def __init__(self, uid, author_name, subject, raw_email, timestamp,
                 user_id):
        """
        Parse `raw_email` and normalize it for archiving.

        Args:
            uid: Message identifier.
            author_name: Display name of the author (text, encoded to UTF-8).
            subject: Subject line (text, encoded to UTF-8).
            raw_email: Whole raw e-mail as text, possibly HTML-escaped.
            timestamp: Unix timestamp of the post; values <= 0 are ignored.
            user_id: Numeric identifier of the author.

        Raises:
            ValueError: If no plausible timestamp can be determined.
        """
        self.uid = uid
        self.user_id = user_id
        self.timestamp = timestamp

        # mail module in py2 requires utf-8
        self.subject = subject.encode("utf-8")
        self.raw_email = raw_email.encode("utf-8")
        self.author_name = author_name.encode("utf-8")

        # ugly, but it only takes about 1s for all the messages
        unescaped = raw_email
        for escaped, plain in self._ENTITY_REPLACEMENTS:
            unescaped = unescaped.replace(escaped, plain)

        self.email_msg = Parser().parsestr(unescaped.encode("utf-8"))

        self._topic_id = None
        self._prev_in_topic = None
        self._next_in_topic = None

        self._postprocess()

    def _parse_subject(self):
        """Strip the ``[self-interest]`` tag and collapse runs of whitespace."""
        si_sign = "[self-interest]"

        if si_sign in self.subject:
            self.subject = self.subject.replace(si_sign, "")

        # split() + join() removes leading/trailing and repeated whitespace
        self.subject = " ".join(self.subject.split())

    @staticmethod
    def _date_from_received(header_value):
        """
        Parse the date at the end of a ``Received`` header value.

        Such a value may look like this::

            (qmail 43149 invoked from network); 5 Sep 2011 13:53:18 -0000 (...)

        Returns:
            The parsed datetime, or None when there is no parsable date.
        """
        # keep only the part after the last ";" and before the first "("
        date_part = header_value.split(";")[-1].strip().split("(")[0].strip()
        if not date_part:
            return None

        try:
            return date_parse(date_part)
        except (ValueError, OverflowError):
            return None

    @staticmethod
    def _datetime_to_timestamp(dt):
        """
        Convert datetime `dt` to a Unix timestamp.

        Timezone-aware values are converted through UTC so the result does
        not depend on the timezone of the machine; naive values are taken as
        local time. Sometimes the year is out of range, hence the ``try``.

        Returns:
            The timestamp, or None if `dt` can't be converted.
        """
        try:
            if dt.utcoffset() is not None:
                return calendar.timegm(dt.utctimetuple())

            return time.mktime(dt.timetuple())
        except (ValueError, OverflowError):
            return None

    def _self_parse_time(self):
        """
        Set `self.timestamp` to the earliest plausible timestamp available.

        Candidates are the dates in the ``Received`` / ``X-Received`` headers
        and the current `self.timestamp` (when positive). Only candidates
        later than 1998-01-01 are considered.

        Raises:
            ValueError: If no candidate passes the plausibility filter.
        """
        candidates = []
        for name, value in self.email_msg.items():
            # header names are case-insensitive
            if name.lower() not in ("received", "x-received"):
                continue

            parsed = self._date_from_received(value)
            if parsed is not None:
                candidates.append(self._datetime_to_timestamp(parsed))

        if self.timestamp > 0:
            candidates.append(self.timestamp)

        # filter first, so that "nothing left" gives a meaningful error
        plausible = [
            ts for ts in candidates
            if ts and ts > self._MIN_PLAUSIBLE_TIMESTAMP
        ]
        if not plausible:
            raise ValueError("Couldn't find any reasonable timestamp!")

        self.timestamp = int(min(plausible))

    def _postprocess(self):
        """
        Fix the timestamp and subject, then rewrite the message headers.

        The ``From`` header is rebuilt from the author's name with the user
        id in the angle brackets, ``Date`` from the chosen timestamp and
        ``Subject`` from the cleaned subject.
        """
        self._self_parse_time()
        self._parse_subject()

        # mail objects look like dicts, but aren't - you cannot rewrite item,
        # you have to delete it first and then save it again
        for header in self._HEADERS_TO_DROP:
            del self.email_msg[header]

        self.email_msg["From"] = "%s <%d>" % (self.author_name, self.user_id)
        self.email_msg["Date"] = formatdate(self.timestamp)
        self.email_msg["Subject"] = self.subject

    @classmethod
    def from_json(cls, j_msg):
        """
        Build a message from the decoded JSON record `j_msg`.

        The record has to contain the ``ygData`` dictionary with the keys
        ``msgId``, ``authorName``, ``subject``, ``rawEmail``, ``userId``,
        ``topicId``, ``prevInTopic`` and ``nextInTopic``; ``postDate`` is
        optional and defaults to 0.

        Returns:
            Message: New instance; topic links equal to 0 are stored as None.

        Raises:
            KeyError: If a required key is missing.
            ValueError: If no plausible timestamp can be determined.
        """
        yg_data = j_msg["ygData"]

        msg = cls(
            uid=yg_data["msgId"],
            author_name=yg_data["authorName"],
            subject=yg_data["subject"],
            raw_email=yg_data["rawEmail"],
            timestamp=int(yg_data.get("postDate", 0)),
            user_id=int(yg_data["userId"]),
        )

        def zero_to_none(value):
            # 0 stands for "no such message / topic"
            return None if value == 0 else value

        msg._topic_id = zero_to_none(yg_data["topicId"])
        msg._prev_in_topic = zero_to_none(yg_data["prevInTopic"])
        msg._next_in_topic = zero_to_none(yg_data["nextInTopic"])

        return msg

    @staticmethod
    def to_mbox(messages, filename="self_archive.mbox"):
        """
        Write all `messages` into a fresh mbox file `filename`.

        Args:
            messages (dict): Mapping whose values provide `email_msg`.
            filename (str): Path of the mbox file; an existing file is
                removed first.
        """
        # mailbox merges into an existing file, we don't want that
        if os.path.exists(filename):
            os.unlink(filename)

        mbox = mailbox.mbox(filename)
        try:
            for msg in messages.values():
                mbox.add(msg.email_msg)
        finally:
            # close() also flushes pending changes to the disk
            mbox.close()

    def __repr__(self):
        return "Message(uid=%d, subject=%r)" % (self.uid, self.subject)
# Main program ================================================================
if __name__ == '__main__':
    # Load every stored record, convert it to a Message and dump the whole
    # collection into the mbox file called "self".
    archive = ShelveDB()

    converted = dict(
        (msg_id, Message.from_json(record))
        for msg_id, record in archive.msgs.iteritems()
    )

    Message.to_mbox(converted, "self")

    # Debugging aid - print the profile stored with each record:
    # for uid, msg in db.msgs.iteritems():
    #     print msg["ygData"].get("profile", None)