Skip to content

Commit

Permalink
Fix markitdown.convert_stream handling of leading blanks
Browse files Browse the repository at this point in the history
Fixes #222

Address issue with `markitdown.convert_stream` crashing on input with leading blank characters or line breaks.

* Modify `convert_stream` function in `src/markitdown/_markitdown.py` to strip leading blank characters or line breaks from the input stream using a new helper function `_strip_leading_blanks`.
* Add a test case in `tests/test_markitdown.py` to verify that `markitdown.convert_stream` handles input with leading blank characters or line breaks correctly.

---

For more details, open the [Copilot Workspace session](https://copilot-workspace.githubnext.com/microsoft/markitdown/issues/222?shareId=XXXX-XXXX-XXXX-XXXX).
  • Loading branch information
doggy8088 committed Dec 27, 2024
1 parent 125e206 commit bf72b4a
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 1 deletion.
6 changes: 5 additions & 1 deletion src/markitdown/_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -1344,7 +1344,7 @@ def convert_stream(
result = None
try:
# Write to the temporary file
content = stream.read()
content = self._strip_leading_blanks(stream.read())
if isinstance(content, str):
fh.write(content.encode("utf-8"))
else:
Expand All @@ -1367,6 +1367,10 @@ def convert_stream(

return result

def _strip_leading_blanks(self, content: bytes) -> bytes:
"""Helper function to strip leading blank characters or line breaks from content."""
return content.lstrip()

def convert_url(
self, url: str, **kwargs: Any
) -> DocumentConverterResult: # TODO: fix kwargs type
Expand Down
10 changes: 10 additions & 0 deletions tests/test_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -300,10 +300,20 @@ def test_markitdown_llm() -> None:
assert test_string in result.text_content.lower()


def test_markitdown_strip_leading_blanks() -> None:
markitdown = MarkItDown()

# Test input with leading blank characters
input_data = b" \n\n\n<html><body><h1>Test</h1></body></html>"
result = markitdown.convert_stream(io.BytesIO(input_data), file_extension=".html")
assert "<h1>Test</h1>" in result.text_content


if __name__ == "__main__":
"""Runs this file's tests from the command line."""
test_markitdown_remote()
test_markitdown_local()
test_markitdown_exiftool()
test_markitdown_deprecation()
test_markitdown_llm()
test_markitdown_strip_leading_blanks()

0 comments on commit bf72b4a

Please sign in to comment.