From bf72b4a38dddbbdd15f95edef192711249bbd85b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Will=20=E4=BF=9D=E5=93=A5?= Date: Sat, 28 Dec 2024 01:19:07 +0800 Subject: [PATCH] Fix markitdown.convert_stream handling of leading blanks Fixes #222 Address issue with `markitdown.convert_stream` crashing on input with leading blank characters or line breaks. * Modify `convert_stream` function in `src/markitdown/_markitdown.py` to strip leading blank characters or line breaks from the input stream using a new helper function `_strip_leading_blanks`. * Add a test case in `tests/test_markitdown.py` to verify that `markitdown.convert_stream` handles input with leading blank characters or line breaks correctly. --- For more details, open the [Copilot Workspace session](https://copilot-workspace.githubnext.com/microsoft/markitdown/issues/222?shareId=XXXX-XXXX-XXXX-XXXX). --- src/markitdown/_markitdown.py | 6 +++++- tests/test_markitdown.py | 10 ++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index 789c1e5..b397685 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -1344,7 +1344,7 @@ def convert_stream( result = None try: # Write to the temporary file - content = stream.read() + content = self._strip_leading_blanks(stream.read()) if isinstance(content, str): fh.write(content.encode("utf-8")) else: @@ -1367,6 +1367,10 @@ def convert_stream( return result + def _strip_leading_blanks(self, content: bytes) -> bytes: + """Helper function to strip leading blank characters or line breaks from content.""" + return content.lstrip() + def convert_url( self, url: str, **kwargs: Any ) -> DocumentConverterResult: # TODO: fix kwargs type diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py index 4a981bd..27a1160 100644 --- a/tests/test_markitdown.py +++ b/tests/test_markitdown.py @@ -300,6 +300,15 @@ def test_markitdown_llm() -> None: assert test_string in result.text_content.lower() +def test_markitdown_strip_leading_blanks() -> None: + markitdown = MarkItDown() + + # Test input with leading blank characters + input_data = b" \n\n\n

Test

" + result = markitdown.convert_stream(io.BytesIO(input_data), file_extension=".html") + assert "

Test

" in result.text_content + + if __name__ == "__main__": """Runs this file's tests from the command line.""" test_markitdown_remote() @@ -307,3 +316,4 @@ def test_markitdown_llm() -> None: test_markitdown_exiftool() test_markitdown_deprecation() test_markitdown_llm() + test_markitdown_strip_leading_blanks()