Skip to content

Commit

Permalink
Fix hyphen issue
Browse files (browse the repository at this point in the history)
  • Loading branch information
VikParuchuri committed May 1, 2024
1 parent 0937baf commit f8f595c
Show file tree
Hide file tree
Showing 4 changed files with 5 additions and 4 deletions.
4 changes: 2 additions & 2 deletions marker/convert.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -12,12 +12,12 @@
from marker.ocr.recognition import run_ocr
from marker.pdf.extract_text import get_text_blocks
from marker.cleaners.headers import filter_header_footer, filter_common_titles
from marker.equations.equations import replace_equations
from marker.equations.texify import replace_equations
from marker.pdf.filetype import find_filetype
from marker.postprocessors.editor import edit_full_text
from marker.cleaners.code import identify_code_blocks, indent_blocks
from marker.cleaners.bullets import replace_bullets
from marker.markdown import merge_spans, merge_lines, get_full_text
from marker.postprocessors.markdown import merge_spans, merge_lines, get_full_text
from typing import List, Dict, Tuple, Optional
import re
from marker.settings import settings
Expand Down
File renamed without changes.
3 changes: 2 additions & 1 deletion marker/pdf/extract_text.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -23,7 +23,8 @@ def pdftext_format_to_blocks(page, pnum: int) -> Page:
for l in block["lines"]:
spans = []
for i, s in enumerate(l["spans"]):
block_text = s["text"]
block_text = s["text"].rstrip("\n")
block_text = block_text.replace("-\n", "") # Remove hyphenated line breaks
span_obj = Span(
text=block_text.rstrip("\n"), # Remove end of line newlines, not spaces
bbox=s["bbox"],
Expand Down
2 changes: 1 addition & 1 deletion marker/markdown.py → marker/postprocessors/markdown.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -85,7 +85,7 @@ def line_separator(line1, line2, block_type, is_continuation=False):
uppercase_letters = "A-ZÀ-ÖØ-ßА-ЯŞĆĂÂĐÊÔƠƯÞÐÆØÅ"
# Remove hyphen in current line if next line and current line appear to be joined
hyphen_pattern = re.compile(rf'.*[{lowercase_letters}][-]\s?$', re.DOTALL)
if line1 and hyphen_pattern.match(line1) and re.match(rf"^[{lowercase_letters}]", line2):
if line1 and hyphen_pattern.match(line1) and re.match(rf"^\s?[{lowercase_letters}]", line2):
# Split on — or - from the right
line1 = re.split(r"[-—]\s?$", line1)[0]
return line1.rstrip() + line2.lstrip()
Expand Down

0 comments on commit f8f595c

Please sign in to comment.