Skip to content

Commit

Permalink
Do some escaping of special Markdown characters. (#867)
Browse files: browse the repository at this point in the history
* Do some escaping of special Markdown characters.

* bugfix

* and caption too
  • Loading branch information
alexaryn authored Oct 3, 2024
1 parent 1ba50ca commit 3a554ca
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 8 deletions.
4 changes: 2 additions & 2 deletions lib/sycamore/sycamore/tests/unit/utils/test_markdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ def test_shenanigans() -> None:
TableCell(content="F", rows=[2], cols=[2], is_header=False),
TableCell(content="G", rows=[3], cols=[0], is_header=False),
TableCell(content="H", rows=[3], cols=[1], is_header=False),
TableCell(content="I", rows=[3], cols=[2], is_header=False),
TableCell(content="|", rows=[3], cols=[2], is_header=False),
]
)
te = elemFromTable(table, 1, 0.1, 0.1)
Expand All @@ -142,7 +142,7 @@ def test_shenanigans() -> None:
| A | B | C |
| D | B | E |
| D | B | F |
| G | H | I |
| G | H | \\| |
"""
assert s == answer
Expand Down
30 changes: 24 additions & 6 deletions lib/sycamore/sycamore/utils/markdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
Utilities for converting a list of Elements into Markdown-formatted text.
TODO:
- address multi-line and span headers
- maybe insert horizontal rules at page breaks
- handle numbered lists
- render textract tables
Expand All @@ -15,6 +14,24 @@

SKIP_TYPES = {"page-header", "page-footer", "image"}

# Characters that get a backslash prefix so Markdown renders them literally
# instead of treating them as syntax.
ESCAPE_CHARS = {"\\", "!", "#", ">", "[", "|"}

# Translation table for escape_str, built once at import time:
#   - every character below " " (control characters, including newline and
#     tab) maps to a single space;
#   - every character in ESCAPE_CHARS maps to itself preceded by a backslash.
# NOTE: the table is a snapshot; mutating ESCAPE_CHARS afterwards does not
# change what escape_str does.
_ESCAPE_TABLE = str.maketrans(
    {
        **{chr(code): " " for code in range(ord(" "))},
        **{ch: "\\" + ch for ch in ESCAPE_CHARS},
    }
)


def escape_str(s: str) -> str:
    """
    Return *s* made safe for embedding in Markdown text.

    Each control character (anything that sorts below a space, e.g. newline
    or tab) is replaced by one space, so the result is always a single line.
    Each character in ESCAPE_CHARS is prefixed with a backslash.  All other
    characters are passed through unchanged.

    We don't expect input with meaningful backslashes: an existing backslash
    is itself escaped, so it is rendered literally rather than acting as an
    escape for the character that follows it.
    """
    # One C-level pass over the string instead of a per-character Python loop.
    return s.translate(_ESCAPE_TABLE)


def elements_to_markdown(elems: list[Element]) -> str:
"""
Expand All @@ -40,7 +57,7 @@ def elements_to_markdown(elems: list[Element]) -> str:
text = elem_text(elem).strip()
if not text:
continue
text = text.replace("\n", " ")
text = escape_str(text)
if type == "title":
sio.write(f"\n# {text}\n\n")
elif type == "section-header":
Expand Down Expand Up @@ -83,16 +100,17 @@ def render_table(elem: TableElement, sio: StringIO) -> None:
matrix = [[""] * ncol for _ in range(nrow)]
for cell in cells:
if cell.content:
content = escape_str(cell.content).strip()
for col in cell.cols:
if cell.rows[0] <= hdr_max: # ignore rowspan in headers
s = matrix[0][col]
if s:
matrix[0][col] = f"{s} {cell.content}"
matrix[0][col] = f"{s} {content}"
else:
matrix[0][col] = cell.content
matrix[0][col] = content
else:
for row in cell.rows:
matrix[row][col] = cell.content
matrix[row][col] = content
sep = "| " + " | ".join(["-----" for _ in range(ncol)]) + " |\n"
sio.write("\n")
if hdr_max < 0:
Expand All @@ -107,7 +125,7 @@ def render_table(elem: TableElement, sio: StringIO) -> None:
sio.write("\n")
caption = table.caption
if caption:
caption = caption.replace("\n", " ").strip()
caption = escape_str(caption).strip()
if caption:
sio.write(caption)
sio.write("\n")
Expand Down

0 comments on commit 3a554ca

Please sign in to comment.