-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
executable file
·139 lines (120 loc) · 5.79 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
#!/usr/bin/env python3
"""Lighten a mbox file by stripping out all non-text parts and producing a new mbox with just text"""
import time
import sys
import argparse
import os
import mailbox
from email.mime.text import MIMEText
import re
import logging
from typing import Union
import mboxcl_parser
import mbox_type_detect
import email_textutils
def main() -> int:
    """Command-line entry point: copy an mbox to a new mbox keeping only text.

    Each message of the input mailbox is reduced to a single plain-text body
    (taken from its ``text/plain`` part, or converted from ``text/html`` when
    no plain part is usable) and appended to the output mailbox. Messages with
    neither kind of body are skipped and logged.

    Returns:
        0 on success; 1 when the mailbox flavor cannot be determined, is not
        one of the supported values, or the input mailbox does not exist.
    """
    parser = argparse.ArgumentParser(description='"Lighten" a mbox file by stripping out non-text parts')
    parser.add_argument('infile', help='Input mbox file to read from')
    parser.add_argument('outfile', help='Output mbox file to write to (will append if exists)')
    parser.add_argument('--type', '-t', help='Manually specify input mbox type (mbox, mboxcl)')
    parser.add_argument('--quoteblock', '-q', type=int, default=3, metavar='N',
                        help="Strip blocks of N+ quoted lines found together (default 3)")
    parser.add_argument('--debug', help='Enable debug mode (very verbose output)', action='store_true')
    args = parser.parse_args()

    if args.debug:
        logging.basicConfig(encoding='utf-8', level=logging.DEBUG)
        logging.debug('Debug output enabled')
    else:
        logging.basicConfig(encoding='utf-8', level=logging.INFO)

    infile = args.infile
    outfile = args.outfile
    logging.info(f'Attempting to "lighten" {infile} --> {outfile}')

    # An explicit --type wins; otherwise fall back to auto-detection.
    mbox_type = args.type or mbox_type_detect.detect(infile)
    if not mbox_type:
        logging.error('Could not determine mbox flavor. Try specifying manually with --type')
        return 1

    if mbox_type == 'mbox':
        try:
            # create=False: never silently create an empty *input* mailbox.
            mbox = mailbox.mbox(infile, create=False)
        except mailbox.NoSuchMailboxError:
            logging.error(f'Input mailbox {infile} does not exist')
            return 1
    elif mbox_type == 'mboxcl':
        mbox = mboxcl_parser.parse_from_filename(infile)
    else:
        # Without this branch an unknown flavor would leave `mbox` unbound.
        logging.error(f'Unsupported mbox type "{mbox_type}" (expected mbox or mboxcl)')
        return 1
    logging.info(f'Mailbox {os.path.basename(infile)} opened. Contains {len(mbox)} messages.')

    outbox = mailbox.mbox(outfile)
    logging.debug(f'{outfile} opened for writing.')
    try:
        for m in mbox:
            headers = list(m.items())  # (key, value) tuple for each header
            lightmessage = lighten_message(m, headers, args.quoteblock)
            if not lightmessage:
                logging.debug('No content in message after comment stripping.')
                lightmessage = html_to_text(m, headers, args.quoteblock)
            if not lightmessage:
                logging.info(f'Skipped message without text or HTML payload: {m["Message-ID"]}')
                continue
            outbox.add(lightmessage)
        logging.info(f'Output mailbox {os.path.basename(outfile)} now contains {len(outbox)} messages')
        outbox.flush()
    finally:
        # Release the output file even if a message fails to process.
        outbox.close()
    return 0
def lighten_message(msg: mailbox.mboxMessage, headers: list, blocksize: int) -> Union[MIMEText, bool]:
    """Build a text-only replacement for *msg* from its first usable text/plain part.

    Args:
        msg: The message to reduce.
        headers: ``(name, value)`` header tuples of the original message; the
            subset selected by ``add_headers`` is copied to the result.
        blocksize: Passed to ``email_textutils.strip_quoteblocks`` as the
            quoted-block threshold.

    Returns:
        A new ``MIMEText`` message, or ``False`` when the message has no
        usable ``text/plain`` part.
    """
    # get_content_type() falls back to a default when the header is missing,
    # whereas indexing msg["Content-Type"] yields None and breaks .split().
    logging.debug(f'Processing message {msg["Message-ID"]} {msg.get_content_type()}')
    for part in msg.walk():
        part_type = part.get_content_type().lower()
        # Only real attachments are skipped; an "inline" disposition is a
        # normal way to mark the body text and must not be discarded.
        if part.get_content_disposition() == 'attachment':
            logging.debug(f'Skipped {part_type} -- appears to be an attachment')
            continue
        if part.get_content_maintype().lower() in ('application', 'audio', 'multipart'):
            logging.debug(f'Skipped {part_type} -- appears to be a non-text part')
            continue
        if part_type != 'text/plain':
            logging.debug(f'Unhandled message part: {part_type}')
            continue

        logging.debug(f'Found {part_type}')
        payload = part.get_payload(decode=True)
        if payload is None:  # nothing decodable in this part
            continue
        # Honor the part's declared charset; fall back to UTF-8 when it is
        # absent or names a codec Python does not know.
        charset = part.get_content_charset() or 'utf-8'
        try:
            text = payload.decode(charset, errors='replace')
        except LookupError:
            text = payload.decode('utf-8', errors='replace')
        newmsg = MIMEText(email_textutils.strip_quoteblocks(text, blocksize))
        return add_headers(newmsg, headers)
    return False  # message will fall through if it lacks a 'text/plain' part
def add_headers(msg: MIMEText, headers: list) -> MIMEText:
    """Copy the whitelisted headers from *headers* onto *msg* and return it.

    Args:
        msg: Message to receive the headers; modified in place.
        headers: Iterable of ``(name, value)`` header tuples.

    Returns:
        The same *msg* object, for call chaining.
    """
    # Header field names are case-insensitive, so compare in lowercase:
    # "Message-Id" and "Message-ID" must both be kept.
    keepheaders = {'received', 'date', 'from', 'to', 'subject', 'message-id', 'user-agent'}
    for name, value in headers:
        if name.lower() in keepheaders:
            msg.add_header(name, value)
    return msg
def html_to_text(msg: mailbox.mboxMessage, headers: list, blocksize: int) -> Union[MIMEText, bool]:
    """Build a text-only replacement for *msg* from its first text/html part.

    Args:
        msg: The message to search for an HTML body.
        headers: ``(name, value)`` header tuples of the original message; the
            subset selected by ``add_headers`` is copied to the result.
        blocksize: Passed to ``email_textutils.strip_quoteblocks`` as the
            quoted-block threshold.

    Returns:
        A new ``MIMEText`` message holding the de-tagged HTML, or ``False``
        when no non-attachment ``text/html`` part exists.
    """
    # get_content_type() tolerates a missing Content-Type header, unlike
    # msg["Content-Type"].split(...), which fails on None.
    logging.debug(f'Looking for HTML in {msg["Message-ID"]} {msg.get_content_type()}')
    for part in msg.walk():
        ctype = part.get_content_type().lower()
        cdispo = str(part.get('Content-Disposition')).lower()
        if ctype != 'text/html' or 'attachment' in cdispo:
            continue

        logging.debug(f'Found {ctype}')
        payload = part.get_payload(decode=True)
        if payload is None:  # nothing decodable in this part
            continue
        # Decode with the part's declared charset; fall back to UTF-8 when it
        # is absent or names a codec Python does not know.
        charset = part.get_content_charset() or 'utf-8'
        try:
            htmlbody = payload.decode(charset, errors='replace')
        except LookupError:
            htmlbody = payload.decode('utf-8', errors='replace')
        textbody = strip_html(htmlbody)
        newmsg = MIMEText(email_textutils.strip_quoteblocks(textbody, blocksize))
        return add_headers(newmsg, headers)
    return False
def strip_html(htmlbody: str) -> str:
    """Attempt to produce readable plaintext from HTML; will mangle most complex HTML messages.

    Style blocks are removed, line-breaking tags become newlines, common
    space entities become plain spaces, all remaining tags are replaced by a
    space, and runs of spaces are collapsed.

    Args:
        htmlbody: HTML markup as text.

    Returns:
        The de-tagged text with leading and trailing whitespace removed.
    """
    # flags= must be passed by keyword: the 4th positional argument of
    # re.sub is `count`. Non-greedy so text between two style blocks survives.
    body = re.sub(r"<style\b[^>]*>.*?</style\s*>", "", htmlbody,
                  flags=re.DOTALL | re.IGNORECASE)
    # convert all these to \n for readability of text version
    newlinetags = ('<br>', '<br/>', '<br />', '</p>', '</div>')
    for t in newlinetags:
        body = body.replace(t, '\n')
    # space-like entities, in both entity and literal Unicode form
    spacetags = ('&nbsp;', '&ensp;', '&emsp;', '&thinsp;',
                 '\u00a0', '\u2002', '\u2003', '\u2009')
    for s in spacetags:
        body = body.replace(s, ' ')
    body = re.sub(r"<[^>]*>", " ", body)  # and yes, an HTML parser would be the right way to do this...
    body = re.sub(r" {2,}", " ", body)  # condense spaces
    return body.strip()
if __name__ == '__main__':
    # Time the whole run in CPU seconds, for performance measurements.
    started_at = time.process_time()
    status = main()
    script_name = os.path.basename(sys.argv[0])
    elapsed = time.process_time() - started_at
    print(f'{script_name} completed in {elapsed} seconds')
    # Propagate main()'s result as the process exit status.
    sys.exit(status)