Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix bug with eml attachments containing html content #106

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 6 additions & 5 deletions parse_emails/handle_eml.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,12 +113,13 @@ def handle_eml(file_path, b64=False, file_name=None, parse_only_headers=False, m
payload = part.get_payload()

logger.debug(f'Iterating over parts. Current part: {part.get_content_type()=}')
if (part.is_multipart() or part.get_content_type().startswith('multipart')) \
and "attachment" not in part.get("Content-Disposition", "") or \
(payload and isinstance(payload, list) and len(payload) == 1 and
payload[0].get_content_type() == 'text/html'):
parts += [part_ for part_ in part.get_payload() if isinstance(part_, email.message.Message)]

is_multipart = part.is_multipart() or part.get_content_type().startswith('multipart')
is_not_attachment = "attachment" not in part.get("Content-Disposition", "")
is_message = payload and isinstance(payload, list) and len(payload) == 1 and payload[0].get_content_type() == 'text/html'

if is_not_attachment and (is_multipart or is_message):
parts += [part_ for part_ in payload if isinstance(part_, email.message.Message)]
elif part.get_filename()\
or "attachment" in part.get("Content-Disposition", "")\
or part.get("X-Attachment-Id")\
Expand Down
23 changes: 19 additions & 4 deletions parse_emails/tests/parse_emails_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -968,10 +968,25 @@ def test_msg_contains_ascii_characters_with_null():
assert results['Subject'] == 'RE: Test email for readpst and msg-extractor utility'


def test_eml_contails_html_content_type():
test_path = 'parse_emails/tests/test_data/eml_contains_htm_content_type.eml'
def test_multipart_eml_with_eml_attachment_containing_html_body():
"""
Given:
- an eml file that has another eml file attached, where the attached message has a text/html body.

When:
- parsing the file.

Then:
- make sure the msg was correctly parsed.
"""
test_path = 'parse_emails/tests/test_data/multipart_with_eml_attachment_containing_html.eml'

email_parser = EmailParser(file_path=test_path, max_depth=2)
results = email_parser.parse()
assert len(results) == 15
assert results['HTML'] == '<html></html>'

assert isinstance(results, list)
assert len(results) == 2
assert results[0]["HTML"] == ""
assert results[0]["Attachments"] == "original_message.eml"
assert len(results[0]["AttachmentsData"]) > 0
assert results[1]["ParentFileName"] == "multipart_with_eml_attachment_containing_html.eml"
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
To: [email protected]
From: [email protected]
Subject: Your Subject
Date: 14 Jan 2025 12:00:00 +0000
Content-Type: multipart/mixed; boundary="000000000000e915c3062bcd115c"

--000000000000e915c3062bcd115c
Content-Type: text/plain; charset="UTF-8"
Content-Transfer-Encoding: base64

Email with attached another email

--000000000000e915c3062bcd115c
Content-Type: message/rfc822; name="original_message.eml"
Content-Disposition: attachment; filename="original_message.eml"
Content-Transfer-Encoding: 8bit
X-Attachment-Id: f0af9d461a78b41c_0.1

From: [email protected]
To: [email protected]
Date: 16 Jan 2025 05:31:24 +0000
Subject: =?utf-8?B?QXR0YWNoZWQgZW1haWwgc3ViamVjdA==?=
Content-Type: text/html; charset="utf-8"
Content-Transfer-Encoding: base64

PG1ldGEgaHR0cC1lcXVpdj0iQ29udGVudC1UeXBlIiBjb250ZW50PSJ0ZXh0L2h0b
Ww7IGNoYXJzZXQ9dXRmLTgiPg0KPHA+QXR0YWNoZWQgZW1haWwgSFRNTDwvcD4=
--000000000000e915c3062bcd115c--