Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bug fix: Work with rotated pdfs #37

Merged
merged 1 commit into from
Dec 13, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,10 @@ The above results are with marker and nougat setup so they each take ~3GB of VRA

See [below](#benchmarks) for detailed speed and accuracy benchmarks, and instructions on how to run your own benchmarks.

# Community

[Discord](https://discord.gg/KuZwXNGnfH) is where we discuss future development.

# Limitations

PDF is a tricky format, so marker will not always work perfectly. Here are some known limitations that are on the roadmap to address:
Expand Down
22 changes: 21 additions & 1 deletion marker/bbox.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import fitz as pymupdf

def should_merge_blocks(box1, box2, tol=5):
# Within tol y px, and to the right within tol px
merge = [
Expand Down Expand Up @@ -58,4 +60,22 @@ def unnormalize_box(bbox, width, height):
height * (bbox[1] / 1000),
width * (bbox[2] / 1000),
height * (bbox[3] / 1000),
]
]


def correct_rotation(bbox, page):
    """Re-express ``bbox`` for a page that carries a rotation.

    ``bbox`` is laid out as (x0, y0, x1, y1). If ``page.rotation`` is 0 the
    input object is returned as-is. Otherwise the (x0, y0) and (x1, y1)
    corners are each multiplied by ``page.rotation_matrix`` and their
    components are rearranged into a new 4-element list for rotations of
    90, 180 and 270. Any other rotation value returns the input unchanged.

    NOTE(review): the per-angle rearrangement presumably keeps the result in
    (x0, y0, x1, y1) order with x0 <= x1 and y0 <= y1 -- confirm against the
    PyMuPDF definition of ``rotation_matrix``.
    """
    angle = page.rotation
    if angle == 0:
        return bbox

    # Push both defining corners through the page's rotation matrix.
    top_left = pymupdf.Point(bbox[0], bbox[1]) * page.rotation_matrix
    bottom_right = pymupdf.Point(bbox[2], bbox[3]) * page.rotation_matrix

    # Pick the coordinate ordering that belongs to this rotation.
    if angle == 90:
        return [bottom_right[0], top_left[1], top_left[0], bottom_right[1]]
    if angle == 180:
        return [bottom_right[0], bottom_right[1], top_left[0], top_left[1]]
    if angle == 270:
        return [top_left[0], bottom_right[1], bottom_right[0], top_left[1]]

    # Unrecognised rotation value: hand back the box untouched.
    return bbox
7 changes: 5 additions & 2 deletions marker/debug/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@ def dump_nougat_debug_data(doc, images, converted_spans):
if not settings.DEBUG_DATA_FOLDER:
return

if len(images) == 0:
return

# We attempted one conversion per image
assert len(converted_spans) == len(images)

Expand All @@ -37,7 +40,7 @@ def dump_nougat_debug_data(doc, images, converted_spans):

debug_file = os.path.join(settings.DEBUG_DATA_FOLDER, f"{doc_base}_equations.json")
with open(debug_file, "w+") as f:
json.dump(data_lines, f, indent=4)
json.dump(data_lines, f)


def dump_bbox_debug_data(doc, blocks: List[Page]):
Expand Down Expand Up @@ -70,7 +73,7 @@ def dump_bbox_debug_data(doc, blocks: List[Page]):
debug_data.append(page_data)

with open(debug_file, "w+") as f:
json.dump(debug_data, f, indent=4)
json.dump(debug_data, f)



33 changes: 29 additions & 4 deletions marker/extract_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

from spellchecker import SpellChecker

from marker.bbox import correct_rotation
from marker.ocr.page import ocr_entire_page
from marker.ocr.utils import detect_bad_ocr, font_flags_decomposer
from marker.settings import settings
Expand All @@ -12,8 +13,27 @@
os.environ["TESSDATA_PREFIX"] = settings.TESSDATA_PREFIX


def sort_rotated_text(page_blocks, tolerance=1.25):
    """Re-order blocks by vertical bucket, then by horizontal position.

    Each block is bucketed by its ``bbox[1]`` value snapped to the nearest
    multiple of ``tolerance``. Buckets are emitted in ascending key order,
    and the blocks inside a bucket are ordered by ascending ``bbox[0]``.

    Returns a new flat list; ``page_blocks`` itself is not modified.
    """
    rows = {}
    for blk in page_blocks:
        row_key = round(blk.bbox[1] / tolerance) * tolerance
        rows.setdefault(row_key, []).append(blk)

    # Walk the buckets in key order, appending each one sorted by bbox[0].
    ordered = []
    for row_key in sorted(rows):
        ordered += sorted(rows[row_key], key=lambda blk: blk.bbox[0])

    return ordered


def get_single_page_blocks(doc, pnum: int, tess_lang: str, spellchecker: Optional[SpellChecker] = None, ocr=False) -> Tuple[List[Block], int]:
page = doc[pnum]
rotation = page.rotation

if ocr:
blocks = ocr_entire_page(page, tess_lang, spellchecker)
else:
Expand All @@ -30,7 +50,7 @@ def get_single_page_blocks(doc, pnum: int, tess_lang: str, spellchecker: Optiona
bbox = s["bbox"]
span_obj = Span(
text=block_text,
bbox=bbox,
bbox=correct_rotation(bbox, page),
span_id=f"{pnum}_{span_id}",
font=f"{s['font']}_{font_flags_decomposer(s['flags'])}", # Add font flags to end of font
color=s["color"],
Expand All @@ -41,19 +61,23 @@ def get_single_page_blocks(doc, pnum: int, tess_lang: str, spellchecker: Optiona
span_id += 1
line_obj = Line(
spans=spans,
bbox=l["bbox"]
bbox=correct_rotation(l["bbox"], page),
)
# Only select valid lines, with positive bboxes
if line_obj.area > 0:
block_lines.append(line_obj)
block_obj = Block(
lines=block_lines,
bbox=block["bbox"],
bbox=correct_rotation(block["bbox"], page),
pnum=pnum
)
# Only select blocks with at least one valid line
if len(block_lines) > 0:
page_blocks.append(block_obj)

# If the page was rotated, sort the text again
if rotation > 0:
page_blocks = sort_rotated_text(page_blocks)
return page_blocks


Expand All @@ -80,8 +104,9 @@ def convert_single_page(doc, pnum, tess_lang: str, spell_lang: Optional[str], no
not disable_ocr
]
if all(conditions) or settings.OCR_ALL_PAGES:
page = doc[pnum]
blocks = get_single_page_blocks(doc, pnum, tess_lang, spellchecker, ocr=True)
page_obj = Page(blocks=blocks, pnum=pnum, bbox=page_bbox)
page_obj = Page(blocks=blocks, pnum=pnum, bbox=page_bbox, rotation=page.rotation)
ocr_pages = 1
if len(blocks) == 0:
ocr_failed = 1
Expand Down
4 changes: 2 additions & 2 deletions marker/schema.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from collections import Counter
from typing import List, Optional
from typing import List, Optional, Tuple

from pydantic import BaseModel, field_validator
import ftfy
Expand All @@ -20,7 +20,6 @@ def find_span_type(span, page_blocks):
class BboxElement(BaseModel):
bbox: List[float]


@field_validator('bbox')
@classmethod
def check_4_elements(cls, v: List[float]) -> List[float]:
Expand Down Expand Up @@ -134,6 +133,7 @@ class Page(BboxElement):
blocks: List[Block]
pnum: int
column_count: Optional[int] = None
rotation: Optional[int] = None # Rotation degrees of the page

def get_nonblank_lines(self):
lines = self.get_all_lines()
Expand Down
18 changes: 16 additions & 2 deletions marker/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,22 @@ class Settings(BaseSettings):
# Nougat model
NOUGAT_MODEL_MAX: int = 512 # Max inference length for nougat
NOUGAT_TOKEN_BUFFER: int = 256 # Number of tokens to buffer above max for nougat
NOUGAT_HALLUCINATION_WORDS: List[str] = ["[MISSING_PAGE_POST]", "## References\n", "**Figure Captions**\n", "Footnote",
"\par\par\par", "## Chapter", "Fig.", "particle", "[REPEATS]", "[TRUNCATED]", "### ", "effective field strength", "\Phi_{\rm eff}"]
NOUGAT_HALLUCINATION_WORDS: List[str] = [
"[MISSING_PAGE_POST]",
"## References\n",
"**Figure Captions**\n",
"Footnote",
"\par\par\par",
"## Chapter",
"Fig.",
"particle",
"[REPEATS]",
"[TRUNCATED]",
"### ",
"effective field strength",
"\Phi_{\rm eff}",
"\mathbf{\mathbf"
]
NOUGAT_DPI: int = 96 # DPI to render images at, matches default settings for nougat
NOUGAT_MODEL_NAME: str = "0.1.0-small" # Name of the model to use
NOUGAT_BATCH_SIZE: int = 6 if TORCH_DEVICE == "cuda" else 1 # Batch size for nougat, don't batch on cpu
Expand Down