Skip to content

Commit 016d603

Browse files
authored
File preview for msg and eml (#5)
* working .msg, prototype .eml * rework to check if file is valid for msg or eml * update readme
1 parent f1d6767 commit 016d603

11 files changed

+185
-84
lines changed

Diff for: .gitignore

+7
Original file line numberDiff line numberDiff line change
@@ -8,3 +8,10 @@ wheels/
88

99
# venv
1010
.venv
11+
12+
# tests
13+
tests/*
14+
output.jpg
15+
16+
# Mac
17+
.DS_Store

Diff for: README.md

+5-1
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,13 @@
1+
This application generates jpg previews of .msg and .eml email files.
2+
13
# Setup
24
Install rye (or any other Python package manager of your choice; in that case you'll have to install the dependencies yourself).
35

4-
`rye sync`
6+
Install Rust (`brew install rust` or https://www.rust-lang.org/tools/install).
57

8+
`rye sync`
69

10+
# Starting the program (local)
711
`python src/main.py`
812

913

Diff for: pyproject.toml

+1
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ dependencies = [
1010
"extract-msg>=0.50.0",
1111
"pillow>=10.4.0",
1212
"beautifulsoup4>=4.12.3",
13+
"fast-mail-parser>=0.2.5",
1314
]
1415
readme = "README.md"
1516
requires-python = ">= 3.8"

Diff for: requirements-dev.lock

+2
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@ ebcdic==1.1.1
3131
# via extract-msg
3232
extract-msg==0.50.0
3333
# via files-preview-python-api
34+
fast-mail-parser==0.2.5
35+
# via files-preview-python-api
3436
flask==3.0.3
3537
# via files-preview-python-api
3638
itsdangerous==2.2.0

Diff for: requirements.lock

+2
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@ ebcdic==1.1.1
3131
# via extract-msg
3232
extract-msg==0.50.0
3333
# via files-preview-python-api
34+
fast-mail-parser==0.2.5
35+
# via files-preview-python-api
3436
flask==3.0.3
3537
# via files-preview-python-api
3638
itsdangerous==2.2.0

Diff for: src/image_functions.py

+40
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
from PIL import Image, ImageDraw, ImageFont
2+
3+
def write_text_to_image(text, output_image_path, font_path="Arial.ttf",
                        font_size=20, image_size=(1920, 1080)):
    """Render the given text onto a white image and save it as a JPEG.

    Args:
        text (str): The text to draw onto the image.
        output_image_path (str): Where the resulting .jpg file is written.
        font_path (str): TrueType font file handed to ``ImageFont.truetype``.
            Defaults to "Arial.ttf", which has to be resolvable on the host.
        font_size (int): Font size passed to ``ImageFont.truetype``.
        image_size (tuple): Size of the image in pixels as (width, height).

    Raises:
        OSError: If the font file cannot be read by ``ImageFont.truetype``.
    """
    # A TrueType font is loaded explicitly because the default font
    # doesn't support all characters.
    font = ImageFont.truetype(font_path, font_size)

    # Black text on a white background
    image = create_image(image_size, 'white', text, font, 'black')

    # Save the image as .jpg
    image.save(output_image_path, 'JPEG')

    print(f"Image saved as: {output_image_path}")
18+
19+
def create_image(size, bgColor, text, font, fontColor):
    """
    Builds a new RGB image filled with a background color and draws the given text on it.

    Args:
        size (tuple): The size of the image in pixels, specified as a tuple (width, height).
        bgColor: The background color, passed on to ``Image.new``.
        text (str): The text to be drawn on the image.
        font (PIL.ImageFont): The font used to draw the text.
        fontColor: The fill color of the text, passed on to ``ImageDraw.text``.

    Returns:
        PIL.Image.Image: The created image.
    """
    width, height = size
    center = (width / 2, height / 2)

    canvas = Image.new('RGB', size, bgColor)

    # The "mm" anchor places the middle of the text on the center point of the image.
    ImageDraw.Draw(canvas).text(center, text, anchor="mm", font=font, fill=fontColor)

    return canvas

Diff for: src/main.py

+44-83
Original file line numberDiff line numberDiff line change
@@ -1,74 +1,12 @@
11
from flask import Flask, request, send_file, jsonify
2+
from fast_mail_parser import parse_email, ParseError
23
import extract_msg
3-
from PIL import Image, ImageDraw, ImageFont
4-
from bs4 import BeautifulSoup
5-
from email import policy
6-
from email.parser import BytesParser
4+
import text_functions
5+
import image_functions
76
import os
8-
import textwrap
97

108
app = Flask(__name__)
119

12-
def convert_html_to_text(html_content):
13-
"""Convert HTML content to plain text using BeautifulSoup."""
14-
soup = BeautifulSoup(html_content, 'html.parser')
15-
return soup.get_text()
16-
17-
def extract_email_body_from_msg(msg_file_path):
18-
"""Extract body content from a .msg file."""
19-
# TODO might have to handle HTML content as well.
20-
msg = extract_msg.Message(msg_file_path)
21-
22-
return msg.body
23-
24-
def extract_email_body_from_eml(eml_file_path):
25-
"""Extract body content from a .eml file."""
26-
with open(eml_file_path, 'rb') as f:
27-
msg = BytesParser(policy=policy.default).parse(f)
28-
29-
# Check for HTML or plain text part
30-
if msg.is_multipart():
31-
for part in msg.iter_parts():
32-
if part.get_content_type() == 'text/html':
33-
return convert_html_to_text(part.get_payload(decode=True).decode())
34-
elif part.get_content_type() == 'text/plain':
35-
return part.get_payload(decode=True).decode()
36-
else:
37-
# Non-multipart email, directly return text/plain or HTML content
38-
if msg.get_content_type() == 'text/html':
39-
return convert_html_to_text(msg.get_payload(decode=True).decode())
40-
else:
41-
return msg.get_payload(decode=True).decode()
42-
43-
def convert_email_to_image(body_content, output_image_path):
44-
"""Convert email body content to an image."""
45-
# Set up image
46-
## TODO probably need to adjust these values based on the content
47-
image_width = 800
48-
image_height = 600
49-
padding = 20
50-
font = ImageFont.load_default()
51-
52-
# Text wrapping for proper formatting
53-
wrapped_text = textwrap.fill(body_content, width=100)
54-
55-
# TODO: Calculate the height of the image based on the wrapped text and center text
56-
dummy_image = Image.new('RGB', (image_width, 1), color=(255, 255, 255))
57-
draw = ImageDraw.Draw(dummy_image)
58-
# _, _, _, image_height = draw.multiline_textbbox((0, 0), text=wrapped_text, font=font)
59-
print(image_width, image_height)
60-
61-
# Create the actual image
62-
image = Image.new('RGB', (image_width, image_height), color=(255, 255, 255))
63-
draw = ImageDraw.Draw(image)
64-
65-
# Draw the text on the image
66-
draw.text((padding, padding), wrapped_text, font=font, fill=(0, 0, 0))
67-
68-
# Save the image as .jpg
69-
image.save(output_image_path, 'JPEG')
70-
print(f"Image saved as: {output_image_path}")
71-
7210
@app.route('/converter', methods=['POST'])
7311
def convert_email():
7412
if 'file' not in request.files:
@@ -79,28 +17,51 @@ def convert_email():
7917
if file.filename == '':
8018
return jsonify({"error": "No selected file"}), 400
8119

82-
# TODO might have to check file by header instead of extension.
83-
if file and (file.filename.endswith('.msg') or file.filename.endswith('.eml')):
84-
filename = file.filename
85-
file_path = os.path.join('/tmp', filename)
86-
file.save(file_path)
20+
if file is None:
21+
return jsonify({"error": "No valid file provided"}), 400
8722

88-
# Extract body content based on file type
89-
if filename.endswith('.msg'):
90-
body_content = extract_email_body_from_msg(file_path)
91-
elif filename.endswith('.eml'):
92-
body_content = extract_email_body_from_eml(file_path)
93-
else:
94-
return jsonify({"error": "Unsupported file format"}), 400
23+
# Save the file to /tmp
24+
filename = file.filename
25+
file_path = os.path.join('/tmp', filename)
26+
file.save(file_path)
9527

96-
# Convert email body to image
97-
output_image_path = os.path.join('/tmp', 'output.jpg')
98-
convert_email_to_image(body_content, output_image_path)
28+
# Check file by content
29+
# Check if it's a .msg file.
30+
try:
31+
extract_msg.openMsg(file_path)
32+
is_msg = True
33+
except:
34+
is_msg = False
35+
pass
9936

100-
# Return the image file as a response
101-
return send_file(output_image_path, mimetype='image/jpeg')
10237

103-
return jsonify({"error": "Unsupported file format"}), 400
38+
# Check if it's a .eml file.
39+
try:
40+
with open(file_path, 'r') as f:
41+
message_payload = f.read()
42+
43+
_ = parse_email(message_payload)
44+
is_eml = True
45+
46+
# UnicodeDecodeError is raised when the file is not a text file (e.g. an .msg file)
47+
except (ParseError, UnicodeDecodeError):
48+
is_eml = False
49+
pass
50+
51+
# Extract text from the file
52+
if is_msg:
53+
text = text_functions.build_email_text_from_msg(file_path)
54+
elif is_eml:
55+
text = text_functions.build_email_text_from_eml(file_path)
56+
else:
57+
return jsonify({"error": "File has no supported file type: eml, msg"}), 400
58+
59+
# Write extracted text to image
60+
output_image_path = os.path.join('/tmp', 'output.jpg')
61+
image_functions.write_text_to_image(text, output_image_path)
62+
63+
# Return the image file as a response
64+
return send_file(output_image_path, mimetype='image/jpeg')
10465

10566
if __name__ == '__main__':
10667
app.run(host='0.0.0.0', port=8082) # TODO change port number via params

Diff for: src/text_functions.py

+84
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
from fast_mail_parser import parse_email
2+
from bs4 import BeautifulSoup
3+
import extract_msg
4+
import textwrap
5+
import re
6+
7+
def convert_html_to_text(html_content):
    """Returns the text content of the given HTML, parsed with BeautifulSoup's 'html.parser'."""
    return BeautifulSoup(html_content, 'html.parser').get_text()
11+
12+
def build_email_text_from_msg(msg_file_path):
    """Extracts content from a .msg file and formats the resulting text.

    The text consists of date, sender, recipients and subject lines, followed
    by the first 1000 characters of the wrapped message body and, if present,
    the list of attachment filenames.

    Args:
        msg_file_path (str): Path of the .msg file to read.

    Returns:
        str: The formatted text. Fields missing from the message are left empty.
    """
    # Use the message as a context manager so the underlying file is closed
    # again once all fields have been read.
    with extract_msg.openMsg(msg_file_path) as msg:
        date = msg.date.strftime("%d.%m.%Y") if msg.date else ''
        sender = msg.sender or ''
        recipients = msg.to or ''
        subject = msg.subject or ''
        raw_body = msg.body or ''
        attachment_names = [attachment.getFilename() for attachment in msg.attachments]

    # HTML is stripped from the body only: running the parser over the header
    # lines would drop anything in angle brackets, e.g. "<user@example.com>".
    body = wrap_text(convert_html_to_text(raw_body), 160)[:1000]

    # Build text from the content of the email
    text = (
        f'Datum: {date}\nVon: {sender}\nAn: {recipients}'
        f'\nBetreff: {subject}\n\nNachricht:\n{body}'
    )

    # Add the attachment filenames to the email text
    attachment_lines = ''.join(name + '\n' for name in attachment_names if name)
    if attachment_lines:
        text += '\n\nAnhänge:\n' + attachment_lines

    # Remove some special characters that don't get displayed correctly
    text = text.replace('\r', '').replace('\t', '')

    # Remove duplicate spaces
    return re.sub(' {2,}', ' ', text)
37+
38+
def build_email_text_from_eml(eml_file_path):
    """Extracts content from a .eml file and formats the resulting text.

    The text consists of date and subject lines, followed by the first 1000
    characters of the wrapped message body and, if present, the list of
    attachment filenames.

    Args:
        eml_file_path (str): Path of the .eml file to read.

    Returns:
        str: The formatted text.
    """
    with open(eml_file_path, 'r') as f:
        message_payload = f.read()

    msg = parse_email(message_payload)

    # Prefer the plain text part; fall back to the HTML part so that mails
    # without a plain text part don't fail with an IndexError.
    if msg.text_plain:
        raw_body = msg.text_plain[0]
    elif msg.text_html:
        raw_body = msg.text_html[0]
    else:
        raw_body = ''

    # HTML is stripped from the body only: running the parser over the header
    # lines would drop anything in angle brackets from them.
    body = wrap_text(convert_html_to_text(raw_body), 160)[:1000]

    # Build text from the content of the email
    text = f'Datum: {msg.date}\nBetreff: {msg.subject}\n\nNachricht:\n{body}'

    # Add the attachment filenames to the email text
    attachment_lines = ''.join(attachment.filename + '\n' for attachment in msg.attachments)
    if attachment_lines:
        text += '\n\nAnhänge:\n' + attachment_lines

    # Remove some special characters that don't get displayed correctly
    text = text.replace('\r', '').replace('\t', '')

    # Remove duplicate spaces
    return re.sub(' {2,}', ' ', text)
66+
67+
def wrap_text(text, width):
    """
    Wraps every line of the given text to the specified width.

    Existing line breaks are kept, blank lines are dropped, and words longer
    than the width are left unbroken.

    Args:
        text (str): The text to be wrapped.
        width (int): The maximum width of each line.

    Returns:
        str: The wrapped text.

    """
    wrap_options = {'break_long_words': False, 'replace_whitespace': False}

    wrapped_lines = []
    for raw_line in text.splitlines(keepends=True):
        # Lines consisting only of whitespace are skipped entirely
        if raw_line.strip() == '':
            continue
        pieces = textwrap.wrap(raw_line, width, **wrap_options)
        wrapped_lines.append('\n'.join(pieces))

    return '\n'.join(wrapped_lines)
Binary file not shown.

Diff for: tests/asd.msg

-185 KB
Binary file not shown.

0 commit comments

Comments
 (0)