diff --git a/parse_emails/handle_eml.py b/parse_emails/handle_eml.py index c495e86..27d222c 100644 --- a/parse_emails/handle_eml.py +++ b/parse_emails/handle_eml.py @@ -113,12 +113,13 @@ def handle_eml(file_path, b64=False, file_name=None, parse_only_headers=False, m payload = part.get_payload() logger.debug(f'Iterating over parts. Current part: {part.get_content_type()=}') - if (part.is_multipart() or part.get_content_type().startswith('multipart')) \ - and "attachment" not in part.get("Content-Disposition", "") or \ - (payload and isinstance(payload, list) and len(payload) == 1 and - payload[0].get_content_type() == 'text/html'): - parts += [part_ for part_ in part.get_payload() if isinstance(part_, email.message.Message)] + is_multipart = part.is_multipart() or part.get_content_type().startswith('multipart') + is_not_attachment = "attachment" not in part.get("Content-Disposition", "") + is_message = payload and isinstance(payload, list) and len(payload) == 1 and payload[0].get_content_type() == 'text/html' + + if is_not_attachment and (is_multipart or is_message): + parts += [part_ for part_ in payload if isinstance(part_, email.message.Message)] elif part.get_filename()\ or "attachment" in part.get("Content-Disposition", "")\ or part.get("X-Attachment-Id")\ diff --git a/parse_emails/tests/parse_emails_test.py b/parse_emails/tests/parse_emails_test.py index 328c430..d58aae9 100644 --- a/parse_emails/tests/parse_emails_test.py +++ b/parse_emails/tests/parse_emails_test.py @@ -968,10 +968,25 @@ def test_msg_contains_ascii_characters_with_null(): assert results['Subject'] == 'RE: Test email for readpst and msg-extractor utility' -def test_eml_contails_html_content_type(): - test_path = 'parse_emails/tests/test_data/eml_contains_htm_content_type.eml' +def test_multipart_eml_with_eml_attachment_containing_html_body(): + """ + Given: + - an eml file that has another eml file attached, where the attached message has a text/html body. + + When: + - parsing the file. 
+ + Then: + - make sure the message is parsed correctly: the attached eml is listed as an attachment and is also parsed as a second result. + """ + test_path = 'parse_emails/tests/test_data/multipart_with_eml_attachment_containing_html.eml' email_parser = EmailParser(file_path=test_path, max_depth=2) results = email_parser.parse() - assert len(results) == 15 - assert results['HTML'] == '' + + assert isinstance(results, list) + assert len(results) == 2 + assert results[0]["HTML"] == "" + assert results[0]["Attachments"] == "original_message.eml" + assert len(results[0]["AttachmentsData"]) > 0 + assert results[1]["ParentFileName"] == "multipart_with_eml_attachment_containing_html.eml" \ No newline at end of file diff --git a/parse_emails/tests/test_data/multipart_with_eml_attachment_containing_html.eml b/parse_emails/tests/test_data/multipart_with_eml_attachment_containing_html.eml new file mode 100644 index 0000000..ec0654d --- /dev/null +++ b/parse_emails/tests/test_data/multipart_with_eml_attachment_containing_html.eml @@ -0,0 +1,28 @@ +To: recipient@example.com +From: sender@example.com +Subject: Your Subject +Date: 14 Jan 2025 12:00:00 +0000 +Content-Type: multipart/mixed; boundary="000000000000e915c3062bcd115c" + +--000000000000e915c3062bcd115c +Content-Type: text/plain; charset="UTF-8" +Content-Transfer-Encoding: base64 + +Email with attached another email + +--000000000000e915c3062bcd115c +Content-Type: message/rfc822; name="original_message.eml" +Content-Disposition: attachment; filename="original_message.eml" +Content-Transfer-Encoding: 8bit +X-Attachment-Id: f0af9d461a78b41c_0.1 + +From: attached-email-from@example.com +To: attached-email-to@example.com +Date: 16 Jan 2025 05:31:24 +0000 +Subject: =?utf-8?B?QXR0YWNoZWQgZW1haWwgc3ViamVjdA==?= +Content-Type: text/html; charset="utf-8" +Content-Transfer-Encoding: base64 + +PG1ldGEgaHR0cC1lcXVpdj0iQ29udGVudC1UeXBlIiBjb250ZW50PSJ0ZXh0L2h0b +Ww7IGNoYXJzZXQ9dXRmLTgiPg0KPHA+QXR0YWNoZWQgZW1haWwgSFRNTDwvcD4= +--000000000000e915c3062bcd115c-- \ No newline at end of file