From 6198478597c4922b79b12cd494d55c0d9cd14a61 Mon Sep 17 00:00:00 2001
From: Vik Paruchuri <vik.paruchuri@gmail.com>
Date: Mon, 6 May 2024 11:09:27 -0700
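Subject: [PATCH 1/4] Fix rotation issues

Swap page width and height when building the page bbox for pages that
pdftext reports as rotated by 90 or 270 degrees.

Also drop the computed OCR_ENGINE_INTERNAL setting in favour of a plain
OCR_ENGINE setting that defaults to "surya", reverse the argument order
of box_intersection_pct in detected_line_coverage, and bump pdftext to
0.3.6.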
---
 marker/models.py           | 2 +-
 marker/ocr/heuristics.py   | 2 +-
 marker/ocr/lang.py         | 4 ++--
 marker/ocr/recognition.py  | 2 +-
 marker/pdf/extract_text.py | 8 +++++++-
 marker/settings.py         | 9 +--------
 poetry.lock                | 8 ++++----
 pyproject.toml             | 2 +-
 8 files changed, 18 insertions(+), 19 deletions(-)

diff --git a/marker/models.py b/marker/models.py
index 72cb2b62..a77fb7e1 100644
--- a/marker/models.py
+++ b/marker/models.py
@@ -50,7 +50,7 @@ def load_all_models(langs=None):
     layout = setup_layout_model()
     order = setup_order_model()
     edit = load_editing_model()
-    ocr = setup_recognition_model(langs) if settings.OCR_ENGINE_INTERNAL == "surya" else None
+    ocr = setup_recognition_model(langs) if settings.OCR_ENGINE == "surya" else None
     texify = setup_texify_model()
     model_lst = [texify, layout, order, edit, detection, ocr]
     return model_lst
diff --git a/marker/ocr/heuristics.py b/marker/ocr/heuristics.py
index 2fdb9d8e..d7bca5bb 100644
--- a/marker/ocr/heuristics.py
+++ b/marker/ocr/heuristics.py
@@ -63,7 +63,7 @@ def detected_line_coverage(page: Page, intersect_thresh=.4, detection_thresh=.3)
         total_intersection = 0
         for block in page.blocks:
             for line in block.lines:
-                intersection_pct = box_intersection_pct(line.bbox, detected_bbox)
+                intersection_pct = box_intersection_pct(detected_bbox, line.bbox)
                 total_intersection += intersection_pct
         if total_intersection > intersect_thresh:
             found_lines += 1
diff --git a/marker/ocr/lang.py b/marker/ocr/lang.py
index 8240b057..82d6cc0e 100644
--- a/marker/ocr/lang.py
+++ b/marker/ocr/lang.py
@@ -5,7 +5,7 @@
 
 
 def replace_langs_with_codes(langs):
-    if settings.OCR_ENGINE_INTERNAL == "surya":
+    if settings.OCR_ENGINE == "surya":
         for i, lang in enumerate(langs):
             if lang in LANGUAGE_TO_CODE:
                 langs[i] = LANGUAGE_TO_CODE[lang]
@@ -17,7 +17,7 @@ def replace_langs_with_codes(langs):
 
 
 def validate_langs(langs):
-    if settings.OCR_ENGINE_INTERNAL == "surya":
+    if settings.OCR_ENGINE == "surya":
         for lang in langs:
             if lang not in CODE_TO_LANGUAGE:
                 raise ValueError(f"Invalid language code {lang} for Surya OCR")
diff --git a/marker/ocr/recognition.py b/marker/ocr/recognition.py
index 6da62d8d..24dfbbc7 100644
--- a/marker/ocr/recognition.py
+++ b/marker/ocr/recognition.py
@@ -28,7 +28,7 @@ def run_ocr(doc, pages: List[Page], langs: List[str], rec_model, parallel_factor
             ocr_idxs.append(pnum)
             ocr_pages += 1
 
-    ocr_method = settings.OCR_ENGINE_INTERNAL
+    ocr_method = settings.OCR_ENGINE
     if ocr_method == "surya":
         new_pages = surya_recognition(doc, ocr_idxs, langs, rec_model, pages)
     else:
diff --git a/marker/pdf/extract_text.py b/marker/pdf/extract_text.py
index bf10e906..4d5aa317 100644
--- a/marker/pdf/extract_text.py
+++ b/marker/pdf/extract_text.py
@@ -57,12 +57,18 @@ def pdftext_format_to_blocks(page, pnum: int) -> Page:
     page_bbox = page["bbox"]
     page_width = abs(page_bbox[2] - page_bbox[0])
     page_height = abs(page_bbox[3] - page_bbox[1])
+    rotation = page["rotation"]
+
+    # Flip width and height if rotated
+    if rotation == 90 or rotation == 270:
+        page_width, page_height = page_height, page_width
+
     page_bbox = [0, 0, page_width, page_height]
     out_page = Page(
         blocks=page_blocks,
         pnum=page["page"],
         bbox=page_bbox,
-        rotation=page["rotation"],
+        rotation=rotation,
         char_blocks=page["blocks"]
     )
     return out_page
diff --git a/marker/settings.py b/marker/settings.py
index 9bdc1490..5566e5fb 100644
--- a/marker/settings.py
+++ b/marker/settings.py
@@ -44,7 +44,7 @@ def TORCH_DEVICE_MODEL(self) -> str:
 
     # OCR
     INVALID_CHARS: List[str] = [chr(0xfffd), "�"]
-    OCR_ENGINE: Optional[str] = None # Which OCR engine to use, either "surya" or "ocrmypdf".  Defaults to "ocrmypdf" on CPU, "surya" on GPU.
+    OCR_ENGINE: Optional[str] = "surya" # Which OCR engine to use, either "surya" or "ocrmypdf".  Defaults to "ocrmypdf" on CPU, "surya" on GPU.
     OCR_ALL_PAGES: bool = False # Run OCR on every page even if text can be extracted
 
     ## Surya
@@ -56,13 +56,6 @@ def TORCH_DEVICE_MODEL(self) -> str:
     TESSERACT_TIMEOUT: int = 20 # When to give up on OCR
     TESSDATA_PREFIX: str = ""
 
-    @computed_field
-    def OCR_ENGINE_INTERNAL(self) -> str:
-        if self.OCR_ENGINE is not None:
-            return self.OCR_ENGINE
-
-        return "surya"
-
     # Texify model
     TEXIFY_MODEL_MAX: int = 384 # Max inference length for texify
     TEXIFY_TOKEN_BUFFER: int = 256 # Number of tokens to buffer above max for texify
diff --git a/poetry.lock b/poetry.lock
index 925699e8..50eb44e0 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -2525,13 +2525,13 @@ image = ["Pillow"]
 
 [[package]]
 name = "pdftext"
-version = "0.3.5"
+version = "0.3.6"
 description = "Extract structured text from pdfs quickly"
 optional = false
 python-versions = "!=2.7.*,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,!=3.7.*,!=3.8.*,>=3.9"
 files = [
-    {file = "pdftext-0.3.5-py3-none-any.whl", hash = "sha256:2a1649b1f2b8ea563fd4f2a3a7227afb0693622b5e3820bca390817d92f228c7"},
-    {file = "pdftext-0.3.5.tar.gz", hash = "sha256:bd2c4c918889894488b18fa6395eff77138dcb8762fc3c44f08a402597618d41"},
+    {file = "pdftext-0.3.6-py3-none-any.whl", hash = "sha256:82c6b0c1e3e1116446c9a5e31f1e15b078cf9195e1cff608e24f9fd5826a88df"},
+    {file = "pdftext-0.3.6.tar.gz", hash = "sha256:91be26c76c2a496054d64875edf17349dbf5c17c40bb47f844dc0d9b95d4b7e2"},
 ]
 
 [package.dependencies]
@@ -4990,4 +4990,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.9,<3.13,!=3.9.7"
-content-hash = "459483572dd8347587db50c0e627b839b6b061af2af022ab8d893c70905b04cb"
+content-hash = "8759c2dc6b9d345ae966f2fe10bb8ee9a2bb93c2d6a07ec2a7d2ec4d57bd3b2c"
diff --git a/pyproject.toml b/pyproject.toml
index ff0c2ff4..219de389 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -39,7 +39,7 @@ texify = "^0.1.8"
 rapidfuzz = "^3.8.1"
 surya-ocr = "^0.4.0"
 filetype = "^1.2.0"
-pdftext = "^0.3.4"
+pdftext = "^0.3.6"
 regex = "^2024.4.28"
 
 [tool.poetry.group.dev.dependencies]

From 77a99f37b35e7124915c230c5e86f9518349f7cd Mon Sep 17 00:00:00 2001
From: Vik Paruchuri <vik.paruchuri@gmail.com>
Date: Mon, 6 May 2024 20:33:55 -0700
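Subject: [PATCH 2/4] Work on tables

Move table handling from marker/cleaners/table.py into a new
marker/tables package. table.py keeps the extraction logic, with the
entry point renamed from arrange_table_rows to format_tables; cells.py
derives row and column separators from the detected text lines; utils.py
holds the block-sorting and text-cleanup helpers.

Other changes in this patch:
* Replace get_equation_image with a generic render_bbox_image in
  marker/pdf/images.py, rendered at the new IMAGE_DPI setting.
* Add find_insert_block so that equation and image regions with no
  matching lines still get an insert point.
* Add marker/images/extract.py for Figure/Picture layout regions, plus
  the Span.image and Page.images fields it relies on.
* Move sort_block_group to marker/pdf/utils.py and let it accept dict
  blocks as well as objects with a bbox attribute.
* Raise the detected_line_coverage default thresholds to .5 and .6.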
---
 marker/convert.py                    |   5 +-
 marker/equations/equations.py        |  28 ++++---
 marker/equations/images.py           |  19 -----
 marker/images/extract.py             |  53 +++++++++++++
 marker/layout/order.py               |  20 +----
 marker/ocr/heuristics.py             |   2 +-
 marker/pdf/extract_text.py           |   5 +-
 marker/pdf/images.py                 |  19 ++++-
 marker/pdf/utils.py                  |  22 ++++++
 marker/schema/block.py               |  21 +++++
 marker/schema/page.py                |   3 +-
 marker/settings.py                   |   1 +
 marker/tables/cells.py               |  89 +++++++++++++++++++++
 marker/{cleaners => tables}/table.py | 114 ++++++---------------------
 marker/tables/utils.py               |  37 +++++++++
 15 files changed, 295 insertions(+), 143 deletions(-)
 delete mode 100644 marker/equations/images.py
 create mode 100644 marker/images/extract.py
 create mode 100644 marker/tables/cells.py
 rename marker/{cleaners => tables}/table.py (64%)
 create mode 100644 marker/tables/utils.py

diff --git a/marker/convert.py b/marker/convert.py
index f3965be6..f475d3d4 100644
--- a/marker/convert.py
+++ b/marker/convert.py
@@ -6,7 +6,7 @@
 
 import pypdfium2 as pdfium
 
-from marker.cleaners.table import arrange_table_rows
+from marker.tables.table import format_tables
 from marker.debug.data import dump_bbox_debug_data
 from marker.layout.layout import surya_layout, annotate_block_types
 from marker.layout.order import surya_order, sort_blocks_in_reading_order
@@ -25,7 +25,6 @@
 from marker.postprocessors.markdown import merge_spans, merge_lines, get_full_text
 
 from typing import List, Dict, Tuple, Optional
-import re
 from marker.settings import settings
 
 
@@ -107,7 +106,7 @@ def convert_single_pdf(
     indent_blocks(pages)
 
     # Fix table blocks
-    table_count = arrange_table_rows(pages)
+    table_count = format_tables(pages)
     out_meta["block_stats"]["table"] = table_count
 
     for page in pages:
diff --git a/marker/equations/equations.py b/marker/equations/equations.py
index da23b136..32df8d4d 100644
--- a/marker/equations/equations.py
+++ b/marker/equations/equations.py
@@ -3,11 +3,11 @@
 from typing import List
 
 from marker.debug.data import dump_equation_debug_data
-from marker.equations.images import get_equation_image
 from marker.equations.inference import get_total_texify_tokens, get_latex_batched
+from marker.pdf.images import render_bbox_image
 from marker.schema.bbox import rescale_bbox
 from marker.schema.page import Page
-from marker.schema.block import Line, Span, Block, bbox_from_lines, split_block_lines
+from marker.schema.block import Line, Span, Block, bbox_from_lines, split_block_lines, find_insert_block
 from marker.settings import settings
 
 
@@ -30,21 +30,29 @@ def find_equation_blocks(page, processor):
                     if region_idx not in insert_points:
                         insert_points[region_idx] = (block_idx, line_idx)
 
+    # Account for regions where the lines were not detected
+    for region_idx, region in enumerate(equation_regions):
+        if region_idx in insert_points:
+            continue
+
+        insert_points[region_idx] = (find_insert_block(page.blocks, region), 0)
+
     block_lines_to_remove = defaultdict(set)
     for region_idx, equation_region in enumerate(equation_regions):
         if region_idx not in equation_lines or len(equation_lines[region_idx]) == 0:
-            continue
-        equation_block = equation_lines[region_idx]
-        equation_insert = insert_points[region_idx]
-        block_text = " ".join([line.prelim_text for line in equation_block])
-        equation_bbox = bbox_from_lines(equation_block)
+            block_text = ""
+            total_tokens = 0
+        else:
+            equation_block = equation_lines[region_idx]
+            block_text = " ".join([line.prelim_text for line in equation_block])
+            total_tokens = get_total_texify_tokens(block_text, processor)
 
-        total_tokens = get_total_texify_tokens(block_text, processor)
+        equation_insert = insert_points[region_idx]
         equation_insert_line_idx = equation_insert[1]
         equation_insert_line_idx -= len(
             [x for x in lines_to_remove[region_idx] if x[0] == equation_insert[0] and x[1] < equation_insert[1]])
 
-        selected_blocks = [equation_insert[0], equation_insert_line_idx, total_tokens, block_text, equation_bbox]
+        selected_blocks = [equation_insert[0], equation_insert_line_idx, total_tokens, block_text, equation_region]
         if total_tokens < settings.TEXIFY_MODEL_MAX:
             # Account for the lines we're about to remove
             for item in lines_to_remove[region_idx]:
@@ -144,7 +152,7 @@ def replace_equations(doc, pages: List[Page], texify_model, batch_size=settings.
     for page_idx, page_equation_blocks in enumerate(equation_blocks):
         page_obj = doc[page_idx]
         for equation_idx, (insert_block_idx, insert_line_idx, token_count, block_text, equation_bbox) in enumerate(page_equation_blocks):
-            png_image = get_equation_image(page_obj, pages[page_idx], equation_bbox)
+            png_image = render_bbox_image(page_obj, pages[page_idx], equation_bbox)
 
             images.append(png_image)
             token_counts.append(token_count)
diff --git a/marker/equations/images.py b/marker/equations/images.py
deleted file mode 100644
index f6d4644f..00000000
--- a/marker/equations/images.py
+++ /dev/null
@@ -1,19 +0,0 @@
-from pypdfium2 import PdfPage
-
-from marker.pdf.images import render_image
-from marker.schema.bbox import rescale_bbox
-from marker.schema.page import Page
-from marker.settings import settings
-
-
-def get_equation_image(page_obj: PdfPage, page: Page, bbox):
-    rescaled_bboxes = []
-    png_image = render_image(page_obj, settings.TEXIFY_DPI)
-    # Rescale original pdf bbox bounds to match png image size
-    png_bbox = [0, 0, png_image.size[0], png_image.size[1]]
-    rescaled_merged = rescale_bbox(page.bbox, png_bbox, bbox)
-
-    # Crop out only the equation image
-    png_image = png_image.crop(rescaled_merged)
-    png_image = png_image.convert("RGB")
-    return png_image
diff --git a/marker/images/extract.py b/marker/images/extract.py
new file mode 100644
index 00000000..ae7d3367
--- /dev/null
+++ b/marker/images/extract.py
@@ -0,0 +1,53 @@
+from marker.pdf.images import render_bbox_image
+from marker.schema.bbox import rescale_bbox
+from marker.schema.block import find_insert_block, Span
+
+
+def find_image_blocks(page):
+    image_blocks = []
+    image_regions = [l.bbox for l in page.layout.bboxes if l.label in ["Figure", "Picture"]]
+    image_regions = [rescale_bbox(page.layout.image_bbox, page.bbox, b) for b in image_regions]
+
+    insert_points = {}
+    for region_idx, region in enumerate(image_regions):
+        for block_idx, block in enumerate(page.blocks):
+            for line_idx, line in enumerate(block.lines):
+                if line.intersection_pct(region) > .8:
+                    line.spans = [] # We will remove this line from the block
+
+                    if region_idx not in insert_points:
+                        insert_points[region_idx] = (block_idx, line_idx)
+
+    # Account for images with no detected lines
+    for region_idx, region in enumerate(image_regions):
+        if region_idx in insert_points:
+            continue
+
+        insert_points[region_idx] = (find_insert_block(page.blocks, region), 0)
+
+    for region_idx, image_region in enumerate(image_regions):
+        image_insert = insert_points[region_idx]
+        image_blocks.append([image_insert[0], image_insert[1], image_region])
+
+    return image_blocks
+
+
+def extract_images(page):
+    image_blocks = find_image_blocks(page)
+
+    for image_idx, (block_idx, line_idx, bbox) in enumerate(image_blocks):
+        block = page.blocks[block_idx]
+        image = render_bbox_image(page.page_obj, page, bbox)
+        image_filename = f"{page.pnum}_image_{image_idx}.png"
+        image_markdown = f"![{image_filename}]({image_filename})"
+        image_span = Span(
+            bbox=bbox,
+            text=image_markdown,
+            font="Image",
+            rotation=0,
+            font_weight=0,
+            font_size=0,
+            image=True
+        )
+        block.lines[line_idx].spans.append(image_span)
+        page.images.append(image)
diff --git a/marker/layout/order.py b/marker/layout/order.py
index 9833f5de..76f9fbc0 100644
--- a/marker/layout/order.py
+++ b/marker/layout/order.py
@@ -4,6 +4,7 @@
 from surya.ordering import batch_ordering
 
 from marker.pdf.images import render_image
+from marker.pdf.utils import sort_block_group
 from marker.schema.bbox import rescale_bbox
 from marker.schema.page import Page
 from marker.settings import settings
@@ -55,21 +56,4 @@ def sort_blocks_in_reading_order(pages: List[Page]):
             block_group = sort_block_group(block_groups[position])
             new_blocks.extend(block_group)
 
-        page.blocks = new_blocks
-
-
-def sort_block_group(blocks, tolerance=1.25):
-    vertical_groups = {}
-    for block in blocks:
-        group_key = round(block.bbox[1] / tolerance) * tolerance
-        if group_key not in vertical_groups:
-            vertical_groups[group_key] = []
-        vertical_groups[group_key].append(block)
-
-    # Sort each group horizontally and flatten the groups into a single list
-    sorted_blocks = []
-    for _, group in sorted(vertical_groups.items()):
-        sorted_group = sorted(group, key=lambda x: x.bbox[0])
-        sorted_blocks.extend(sorted_group)
-
-    return sorted_blocks
\ No newline at end of file
+        page.blocks = new_blocks
\ No newline at end of file
diff --git a/marker/ocr/heuristics.py b/marker/ocr/heuristics.py
index d7bca5bb..ffe6e422 100644
--- a/marker/ocr/heuristics.py
+++ b/marker/ocr/heuristics.py
@@ -52,7 +52,7 @@ def no_text_found(pages: List[Page]):
     return len(full_text.strip()) == 0
 
 
-def detected_line_coverage(page: Page, intersect_thresh=.4, detection_thresh=.3):
+def detected_line_coverage(page: Page, intersect_thresh=.5, detection_thresh=.6):
     found_lines = 0
     for detected_line in page.text_lines.bboxes:
 
diff --git a/marker/pdf/extract_text.py b/marker/pdf/extract_text.py
index 4d5aa317..ea9182e1 100644
--- a/marker/pdf/extract_text.py
+++ b/marker/pdf/extract_text.py
@@ -4,7 +4,7 @@
 import pypdfium2 as pdfium
 import pypdfium2.internal as pdfium_i
 
-from marker.pdf.utils import find_filetype, font_flags_decomposer
+from marker.pdf.utils import find_filetype, font_flags_decomposer, sort_block_group
 from marker.ocr.heuristics import detect_bad_ocr
 from marker.settings import settings
 from marker.schema.block import Span, Line, Block
@@ -63,13 +63,14 @@ def pdftext_format_to_blocks(page, pnum: int) -> Page:
     if rotation == 90 or rotation == 270:
         page_width, page_height = page_height, page_width
 
+    char_blocks = page["blocks"]
     page_bbox = [0, 0, page_width, page_height]
     out_page = Page(
         blocks=page_blocks,
         pnum=page["page"],
         bbox=page_bbox,
         rotation=rotation,
-        char_blocks=page["blocks"]
+        char_blocks=char_blocks
     )
     return out_page
 
diff --git a/marker/pdf/images.py b/marker/pdf/images.py
index 2264c28c..1bf24b56 100644
--- a/marker/pdf/images.py
+++ b/marker/pdf/images.py
@@ -1,4 +1,9 @@
 import pypdfium2 as pdfium
+from pypdfium2 import PdfPage
+
+from marker.schema.page import Page
+from marker.schema.bbox import rescale_bbox
+from marker.settings import settings
 
 
 def render_image(page: pdfium.PdfPage, dpi):
@@ -7,4 +12,16 @@ def render_image(page: pdfium.PdfPage, dpi):
         draw_annots=False
     ).to_pil()
     image = image.convert("RGB")
-    return image
\ No newline at end of file
+    return image
+
+
+def render_bbox_image(page_obj: PdfPage, page: Page, bbox):
+    png_image = render_image(page_obj, settings.IMAGE_DPI)
+    # Rescale original pdf bbox bounds to match png image size
+    png_bbox = [0, 0, png_image.size[0], png_image.size[1]]
+    rescaled_merged = rescale_bbox(page.bbox, png_bbox, bbox)
+
+    # Crop out only the equation image
+    png_image = png_image.crop(rescaled_merged)
+    png_image = png_image.convert("RGB")
+    return png_image
\ No newline at end of file
diff --git a/marker/pdf/utils.py b/marker/pdf/utils.py
index 1512c17b..e15e9f37 100644
--- a/marker/pdf/utils.py
+++ b/marker/pdf/utils.py
@@ -52,3 +52,25 @@ def font_flags_decomposer(flags: Optional[int]) -> str:
         flag_descriptions.append("use_extern_attr")
 
     return "_".join(flag_descriptions)
+
+
+def sort_block_group(blocks, tolerance=1.25):
+    vertical_groups = {}
+    for block in blocks:
+        if hasattr(block, "bbox"):
+            bbox = block.bbox
+        else:
+            bbox = block["bbox"]
+
+        group_key = round(bbox[1] / tolerance) * tolerance
+        if group_key not in vertical_groups:
+            vertical_groups[group_key] = []
+        vertical_groups[group_key].append(block)
+
+    # Sort each group horizontally and flatten the groups into a single list
+    sorted_blocks = []
+    for _, group in sorted(vertical_groups.items()):
+        sorted_group = sorted(group, key=lambda x: x.bbox[0] if hasattr(x, "bbox") else x["bbox"][0])
+        sorted_blocks.extend(sorted_group)
+
+    return sorted_blocks
diff --git a/marker/schema/block.py b/marker/schema/block.py
index 1220b698..50ae95c6 100644
--- a/marker/schema/block.py
+++ b/marker/schema/block.py
@@ -1,3 +1,4 @@
+import math
 from typing import List, Optional
 
 from pydantic import field_validator
@@ -19,6 +20,7 @@ class Span(BboxElement):
     font_size: float
     bold: Optional[bool] = None
     italic: Optional[bool] = None
+    image: Optional[bool] = None
 
 
     @field_validator('text')
@@ -98,3 +100,22 @@ def split_block_lines(block: Block, split_line_idx: int):
         new_blocks.append(Block(lines=block.lines[:split_line_idx], bbox=bbox_from_lines(block.lines[:split_line_idx]), pnum=block.pnum))
         new_blocks.append(Block(lines=block.lines[split_line_idx:], bbox=bbox_from_lines(block.lines[split_line_idx:]), pnum=block.pnum))
     return new_blocks
+
+
+def find_insert_block(blocks: List[Block], bbox):
+    nearest_match = None
+    match_dist = None
+    for idx, block in enumerate(blocks):
+        try:
+            dist = math.sqrt((block.bbox[1] - bbox[1]) ** 2 + (block.bbox[0] - bbox[0]) ** 2)
+        except Exception as e:
+            continue
+
+        if nearest_match is None or dist < match_dist:
+            nearest_match = idx
+            match_dist = dist
+    if nearest_match is None:
+        return 0
+    return nearest_match
+
+
diff --git a/marker/schema/page.py b/marker/schema/page.py
index 407939eb..c4fca410 100644
--- a/marker/schema/page.py
+++ b/marker/schema/page.py
@@ -1,5 +1,5 @@
 from collections import Counter
-from typing import List, Optional, Dict
+from typing import List, Optional, Dict, Any
 
 from marker.schema.bbox import BboxElement
 from marker.schema.block import Block, Span
@@ -15,6 +15,7 @@ class Page(BboxElement):
     order: Optional[OrderResult] = None
     ocr_method: Optional[str] = None # One of "surya" or "tesseract"
     char_blocks: Optional[List[Dict]] = None # Blocks with character-level data from pdftext
+    images: Optional[List[Any]] = None # Images to save along with the page, need Any to avoid pydantic error
 
     def get_nonblank_lines(self):
         lines = self.get_all_lines()
diff --git a/marker/settings.py b/marker/settings.py
index 5566e5fb..5eb85bed 100644
--- a/marker/settings.py
+++ b/marker/settings.py
@@ -10,6 +10,7 @@
 class Settings(BaseSettings):
     # General
     TORCH_DEVICE: Optional[str] = None
+    IMAGE_DPI: int = 96 # DPI to render images pulled from pdf at
 
     @computed_field
     @property
diff --git a/marker/tables/cells.py b/marker/tables/cells.py
new file mode 100644
index 00000000..d4524314
--- /dev/null
+++ b/marker/tables/cells.py
@@ -0,0 +1,89 @@
+from marker.schema.bbox import rescale_bbox, box_intersection_pct
+from marker.schema.page import Page
+
+
+def find_row_separators(page: Page, table_box, round_factor=4):
+    top_edges = []
+    bottom_edges = []
+
+    line_boxes = [p.bbox for p in page.text_lines.bboxes]
+    line_boxes = [rescale_bbox(page.text_lines.image_bbox, page.bbox, l) for l in line_boxes]
+    line_boxes = [l for l in line_boxes if box_intersection_pct(l, table_box) > .8]
+
+    min_count = len(line_boxes) / 3
+
+    for cell in line_boxes:
+        top_edges.append(cell[1] / round_factor * round_factor)
+        bottom_edges.append(cell[3] / round_factor * round_factor)
+
+    top_edges = [t for t in top_edges if top_edges.count(t) > min_count]
+    bottom_edges = [b for b in bottom_edges if bottom_edges.count(b) > min_count]
+
+    unique_top = sorted(list(set(top_edges)))
+    unique_bottom = sorted(list(set(bottom_edges)))
+
+    separators = min([unique_top, unique_bottom], key=len)
+
+    # Add the top and bottom of the page as separators, to grab all possible cells
+    separators.append(page.bbox[3])
+    separators.insert(0, page.bbox[1])
+    return separators
+
+
+def find_column_separators(page: Page, table_box, round_factor=4):
+    left_edges = []
+    right_edges = []
+    centers = []
+
+    line_boxes = [p.bbox for p in page.text_lines.bboxes]
+    line_boxes = [rescale_bbox(page.text_lines.image_bbox, page.bbox, l) for l in line_boxes]
+    line_boxes = [l for l in line_boxes if box_intersection_pct(l, table_box) > .8]
+
+    min_count = len(line_boxes) / 3
+    for cell in line_boxes:
+        left_edges.append(cell[0] / round_factor * round_factor)
+        right_edges.append(cell[2] / round_factor * round_factor)
+        centers.append((cell[0] + cell[2]) / 2 * round_factor / round_factor)
+
+    left_edges = [l for l in left_edges if left_edges.count(l) > min_count]
+    right_edges = [r for r in right_edges if right_edges.count(r) > min_count]
+    centers = [c for c in centers if centers.count(c) > min_count]
+
+    unique_left = sorted(list(set(left_edges)))
+    unique_right = sorted(list(set(right_edges)))
+    unique_center = sorted(list(set(centers)))
+
+    # Find list with minimum length
+    separators = min([unique_left, unique_right, unique_center], key=len)
+    separators.append(page.bbox[2])
+    separators.insert(0, page.bbox[0])
+    return separators
+
+
+def assign_cells_to_columns(page, table_box, rows, round_factor=4, tolerance=4):
+    separators = find_column_separators(page, table_box, round_factor=round_factor)
+    new_rows = []
+    additional_column_index = 0
+    for row in rows:
+        new_row = {}
+        last_col_index = -1
+        for cell in row:
+            left_edge = cell[0][0]
+            column_index = -1
+            for i, separator in enumerate(separators):
+                if left_edge - tolerance < separator and last_col_index < i:
+                    column_index = i
+                    break
+            if column_index == -1:
+                column_index = len(separators) + additional_column_index
+                additional_column_index += 1
+            new_row[column_index] = cell[1]
+            last_col_index = column_index
+        additional_column_index = 0
+
+        flat_row = []
+        for cell_idx, cell in enumerate(sorted(new_row.items())):
+            flat_row.extend([""] * (cell[0] - cell_idx) + [cell[1]])
+        new_rows.append(flat_row)
+
+    return new_rows
diff --git a/marker/cleaners/table.py b/marker/tables/table.py
similarity index 64%
rename from marker/cleaners/table.py
rename to marker/tables/table.py
index fe33e0e0..d99b758e 100644
--- a/marker/cleaners/table.py
+++ b/marker/tables/table.py
@@ -1,45 +1,13 @@
+from collections import defaultdict
+
 from marker.schema.bbox import merge_boxes, box_intersection_pct, rescale_bbox
 from marker.schema.block import Line, Span, Block
 from marker.schema.page import Page
 from tabulate import tabulate
-from typing import List, Dict
-import re
-
-
-def sort_table_blocks(blocks, tolerance=5):
-    vertical_groups = {}
-    for block in blocks:
-        if hasattr(block, "bbox"):
-            bbox = block.bbox
-        else:
-            bbox = block["bbox"]
-        group_key = round(bbox[1] / tolerance) * tolerance
-        if group_key not in vertical_groups:
-            vertical_groups[group_key] = []
-        vertical_groups[group_key].append(block)
-
-    # Sort each group horizontally and flatten the groups into a single list
-    sorted_blocks = []
-    for _, group in sorted(vertical_groups.items()):
-        sorted_group = sorted(group, key=lambda x: x.bbox[0] if hasattr(x, "bbox") else x["bbox"][0])
-        sorted_blocks.extend(sorted_group)
-
-    return sorted_blocks
+from typing import List
 
-
-def replace_dots(text):
-    dot_pattern = re.compile(r'(\s*\.\s*){4,}')
-    dot_multiline_pattern = re.compile(r'.*(\s*\.\s*){4,}.*', re.DOTALL)
-
-    if dot_multiline_pattern.match(text):
-        text = dot_pattern.sub(' ', text)
-    return text
-
-
-def replace_newlines(text):
-    # Replace all newlines
-    newline_pattern = re.compile(r'[\r\n]+')
-    return newline_pattern.sub(' ', text.strip())
+from marker.tables.cells import assign_cells_to_columns, find_row_separators, find_column_separators
+from marker.tables.utils import sort_table_blocks, replace_dots, replace_newlines
 
 
 def get_table_surya(page, table_box, space_tol=.01) -> List[List[str]]:
@@ -73,77 +41,45 @@ def get_table_surya(page, table_box, space_tol=.01) -> List[List[str]]:
     return table_rows
 
 
-def assign_cells_to_columns(rows, round_factor=4, tolerance=4):
-    left_edges = []
-    right_edges = []
-    centers = []
-
-    for row in rows:
-        for cell in row:
-            left_edges.append(cell[0][0] / round_factor * round_factor)
-            right_edges.append(cell[0][2] / round_factor * round_factor)
-            centers.append((cell[0][0] + cell[0][2]) / 2 * round_factor / round_factor)
-
-    unique_left = sorted(list(set(left_edges)))
-    unique_right = sorted(list(set(right_edges)))
-    unique_center = sorted(list(set(centers)))
-
-    # Find list with minimum length
-    separators = min([unique_left, unique_right, unique_center], key=len)
-
-    new_rows = []
-    for row in rows:
-        new_row = {}
-        last_col_index = -1
-        for cell in row:
-            left_edge = cell[0][0]
-            column_index = -1
-            for i, separator in enumerate(separators):
-                if left_edge - tolerance < separator and last_col_index < i:
-                    column_index = i
-                    break
-            if column_index == -1:
-                column_index = cell[0][0] # Assign a new column
-            new_row[column_index] = cell[1]
-            last_col_index = column_index
-
-        flat_row = [cell[1] for cell in sorted(new_row.items())]
-        min_column_index = min(new_row.keys())
-        flat_row = [""] * min_column_index + flat_row
-        new_rows.append(flat_row)
-
-    return new_rows
-
-
 def get_table_pdftext(page: Page, table_box, space_tol=.01) -> List[List[str]]:
     page_width = page.width
     table_rows = []
     table_cell = ""
     cell_bbox = None
-    prev_end = None
+    prev_char = False
     table_row = []
     sorted_char_blocks = sort_table_blocks(page.char_blocks)
+
     for block_idx, block in enumerate(sorted_char_blocks):
-        sorted_block_lines = sort_table_blocks(block["lines"])
-        for line_idx, line in enumerate(sorted_block_lines):
+        sorted_lines = sort_table_blocks(block["lines"])
+        for line_idx, line in enumerate(sorted_lines):
             line_bbox = line["bbox"]
             intersect_pct = box_intersection_pct(line_bbox, table_box)
-            if intersect_pct < .5:
+            if intersect_pct < .7:
                 continue
             for span in line["spans"]:
                 for char in span["chars"]:
                     x_start, y_start, x_end, y_end = char["bbox"]
+
                     if cell_bbox is None:
                         cell_bbox = char["bbox"]
                     else:
+                        # Find boundaries of cell bbox before merging
+                        cell_x_start, cell_y_start, cell_x_end, cell_y_end = cell_bbox
+                        cell_x_start /= page_width
+                        cell_x_end /= page_width
+
                         cell_bbox = merge_boxes(cell_bbox, char["bbox"])
 
                     x_start /= page_width
                     x_end /= page_width
+
                     cell_content = replace_dots(replace_newlines(table_cell))
-                    if prev_end is None or abs(x_start - prev_end) < space_tol: # Check if we are in the same cell
+                    if not prev_char: # First char
+                        table_cell += char["char"]
+                    elif cell_x_start - space_tol < x_start < cell_x_end + space_tol: # Check if we are in the same cell
                         table_cell += char["char"]
-                    elif x_start > prev_end - space_tol: # Check if we are on the same line
+                    elif x_start > cell_x_end - space_tol: # Same line, new cell, check against cell bbox
                         if len(table_cell) > 0:
                             table_row.append((cell_bbox, cell_content))
                         table_cell = char["char"]
@@ -156,16 +92,18 @@ def get_table_pdftext(page: Page, table_box, space_tol=.01) -> List[List[str]]:
                         if len(table_row) > 0:
                             table_rows.append(table_row)
                         table_row = []
-                    prev_end = x_end
+                    prev_char = True
+
     if len(table_cell) > 0:
         table_row.append((cell_bbox, replace_dots(replace_newlines(table_cell))))
     if len(table_row) > 0:
         table_rows.append(table_row)
-    table_rows = assign_cells_to_columns(table_rows)
+
+    table_rows = assign_cells_to_columns(page, table_box, table_rows)
     return table_rows
 
 
-def arrange_table_rows(pages: List[Page]):
+def format_tables(pages: List[Page]):
     # Formats tables nicely into github flavored markdown
     table_count = 0
     for page in pages:
diff --git a/marker/tables/utils.py b/marker/tables/utils.py
new file mode 100644
index 00000000..b7efdabb
--- /dev/null
+++ b/marker/tables/utils.py
@@ -0,0 +1,37 @@
+import re
+
+
+def sort_table_blocks(blocks, tolerance=5):
+    vertical_groups = {}
+    for block in blocks:
+        if hasattr(block, "bbox"):
+            bbox = block.bbox
+        else:
+            bbox = block["bbox"]
+        group_key = round(bbox[1] / tolerance) * tolerance
+        if group_key not in vertical_groups:
+            vertical_groups[group_key] = []
+        vertical_groups[group_key].append(block)
+
+    # Sort each group horizontally and flatten the groups into a single list
+    sorted_blocks = []
+    for _, group in sorted(vertical_groups.items()):
+        sorted_group = sorted(group, key=lambda x: x.bbox[0] if hasattr(x, "bbox") else x["bbox"][0])
+        sorted_blocks.extend(sorted_group)
+
+    return sorted_blocks
+
+
+def replace_dots(text):
+    dot_pattern = re.compile(r'(\s*\.\s*){4,}')
+    dot_multiline_pattern = re.compile(r'.*(\s*\.\s*){4,}.*', re.DOTALL)
+
+    if dot_multiline_pattern.match(text):
+        text = dot_pattern.sub(' ', text)
+    return text
+
+
+def replace_newlines(text):
+    # Replace all newlines
+    newline_pattern = re.compile(r'[\r\n]+')
+    return newline_pattern.sub(' ', text.strip())

From f7444f3a814ad6e38a16442592bd3bc9dc5463cc Mon Sep 17 00:00:00 2001
From: Vik Paruchuri <vik.paruchuri@gmail.com>
Date: Mon, 6 May 2024 22:41:17 -0700
Subject: [PATCH 3/4] Improve sorting

---
 marker/ocr/heuristics.py |  2 +-
 marker/tables/cells.py   | 18 +++++++++++++++++-
 marker/tables/table.py   | 11 ++++-------
 marker/tables/utils.py   |  2 +-
 4 files changed, 23 insertions(+), 10 deletions(-)

diff --git a/marker/ocr/heuristics.py b/marker/ocr/heuristics.py
index ffe6e422..278d8295 100644
--- a/marker/ocr/heuristics.py
+++ b/marker/ocr/heuristics.py
@@ -52,7 +52,7 @@ def no_text_found(pages: List[Page]):
     return len(full_text.strip()) == 0
 
 
-def detected_line_coverage(page: Page, intersect_thresh=.5, detection_thresh=.6):
+def detected_line_coverage(page: Page, intersect_thresh=.5, detection_thresh=.65):
     found_lines = 0
     for detected_line in page.text_lines.bboxes:
 
diff --git a/marker/tables/cells.py b/marker/tables/cells.py
index d4524314..1981bcd3 100644
--- a/marker/tables/cells.py
+++ b/marker/tables/cells.py
@@ -86,4 +86,20 @@ def assign_cells_to_columns(page, table_box, rows, round_factor=4, tolerance=4):
             flat_row.extend([""] * (cell[0] - cell_idx) + [cell[1]])
         new_rows.append(flat_row)
 
-    return new_rows
+    # Pad rows to have the same length
+    max_row_len = max([len(r) for r in new_rows], default=0)  # default=0: a table with no rows must not raise ValueError
+    for row in new_rows:
+        while len(row) < max_row_len:
+            row.append("")
+
+    cols_to_remove = set()  # indices of columns in which every cell is blank
+    for idx, col in enumerate(zip(*new_rows)):
+        col_total = sum([len(cell.strip()) > 0 for cell in col])
+        if col_total == 0:
+            cols_to_remove.add(idx)
+
+    rows = []  # padded rows with the all-blank columns dropped
+    for row in new_rows:
+        rows.append([col for idx, col in enumerate(row) if idx not in cols_to_remove])
+
+    return rows
diff --git a/marker/tables/table.py b/marker/tables/table.py
index d99b758e..ef652634 100644
--- a/marker/tables/table.py
+++ b/marker/tables/table.py
@@ -37,11 +37,11 @@ def get_table_surya(page, table_box, space_tol=.01) -> List[List[str]]:
             x_position = normed_x_end
     if len(table_row) > 0:
         table_rows.append(table_row)
-    table_rows = assign_cells_to_columns(table_rows)
+    table_rows = assign_cells_to_columns(page, table_box, table_rows)
     return table_rows
 
 
-def get_table_pdftext(page: Page, table_box, space_tol=.01) -> List[List[str]]:
+def get_table_pdftext(page: Page, table_box, space_tol=.01, round_factor=4) -> List[List[str]]:
     page_width = page.width
     table_rows = []
     table_cell = ""
@@ -90,6 +90,7 @@ def get_table_pdftext(page: Page, table_box, space_tol=.01) -> List[List[str]]:
                         table_cell = char["char"]
                         cell_bbox = char["bbox"]
                         if len(table_row) > 0:
+                            table_row = sorted(table_row, key=lambda x: round(x[0][0] / round_factor))
                             table_rows.append(table_row)
                         table_row = []
                     prev_char = True
@@ -97,6 +98,7 @@ def get_table_pdftext(page: Page, table_box, space_tol=.01) -> List[List[str]]:
     if len(table_cell) > 0:
         table_row.append((cell_bbox, replace_dots(replace_newlines(table_cell))))
     if len(table_row) > 0:
+        table_row = sorted(table_row, key=lambda x: round(x[0][0] / round_factor))
         table_rows.append(table_row)
 
     table_rows = assign_cells_to_columns(page, table_box, table_rows)
@@ -139,11 +141,6 @@ def format_tables(pages: List[Page]):
             if len(table_rows) == 0:
                 continue
 
-            max_row_len = max([len(r) for r in table_rows])
-            for row in table_rows:
-                while len(row) < max_row_len:
-                    row.append("")
-
             table_text = tabulate(table_rows, headers="firstrow", tablefmt="github")
             table_block = Block(
                 bbox=table_box,
diff --git a/marker/tables/utils.py b/marker/tables/utils.py
index b7efdabb..61b03403 100644
--- a/marker/tables/utils.py
+++ b/marker/tables/utils.py
@@ -8,7 +8,7 @@ def sort_table_blocks(blocks, tolerance=5):
             bbox = block.bbox
         else:
             bbox = block["bbox"]
-        group_key = round(bbox[1] / tolerance) * tolerance
+        group_key = round(bbox[1] / tolerance)
         if group_key not in vertical_groups:
             vertical_groups[group_key] = []
         vertical_groups[group_key].append(block)

From c8c1f06dbaaacc0bf390bb688e78b2e95b01021d Mon Sep 17 00:00:00 2001
From: Vik Paruchuri <vik.paruchuri@gmail.com>
Date: Tue, 7 May 2024 10:53:07 -0700
Subject: [PATCH 4/4] Enable extracting and saving images

---
 README.md                       | 31 ++++++++++++---------
 benchmark.py                    |  2 +-
 convert.py                      | 13 +++------
 convert_single.py               | 15 +++++------
 marker/convert.py               | 16 +++++++----
 marker/images/extract.py        | 19 +++++++++----
 marker/images/save.py           | 18 +++++++++++++
 marker/output.py                | 36 +++++++++++++++++++++++++
 marker/postprocessors/images.py |  0
 marker/settings.py              |  1 +
 marker/tables/cells.py          | 48 +++++++++++++--------------------
 marker/tables/table.py          | 36 ++++++++++++-------------
 12 files changed, 148 insertions(+), 87 deletions(-)
 create mode 100644 marker/images/save.py
 create mode 100644 marker/output.py
 delete mode 100644 marker/postprocessors/images.py

diff --git a/README.md b/README.md
index f31beba6..83b51f5d 100644
--- a/README.md
+++ b/README.md
@@ -69,13 +69,16 @@ First, clone the repo:
   - GPU only: run `pip install torch` to install other torch dependencies.
   - CPU only: Uninstall torch with `poetry remove torch`, then follow the [CPU install](https://pytorch.org/get-started/locally/) instructions.
 
-- Optional: Install system requirements, only needed if using `ocrmypdf` as the ocr backend
-  - Optional: Install tesseract 5 by following [these instructions](https://notesalexp.org/tesseract-ocr/html/) or running `scripts/install/tesseract_5_install.sh`.
-  - Install ghostscript > 9.55 by following [these instructions](https://ghostscript.readthedocs.io/en/latest/Install.html) or running `scripts/install/ghostscript_install.sh`.
-  - Install other requirements with `cat scripts/install/apt-requirements.txt | xargs sudo apt-get install -y`
-  - Set the tesseract data folder path
-    - Find the tesseract data folder `tessdata` with `find / -name tessdata`.  Make sure to use the one corresponding to the latest tesseract version if you have multiple.
-    - Create a `local.env` file in the root `marker` folder with `TESSDATA_PREFIX=/path/to/tessdata` inside it
+**Optional**
+
+Only needed if using `ocrmypdf` as the OCR backend.
+
+- Install tesseract 5 by following [these instructions](https://notesalexp.org/tesseract-ocr/html/) or running `scripts/install/tesseract_5_install.sh`.
+- Install ghostscript > 9.55 by following [these instructions](https://ghostscript.readthedocs.io/en/latest/Install.html) or running `scripts/install/ghostscript_install.sh`.
+- Install other requirements with `cat scripts/install/apt-requirements.txt | xargs sudo apt-get install -y`
+- Set the tesseract data folder path
+  - Find the tesseract data folder `tessdata` with `find / -name tessdata`.  Make sure to use the one corresponding to the latest tesseract version if you have multiple.
+  - Create a `local.env` file in the root `marker` folder with `TESSDATA_PREFIX=/path/to/tessdata` inside it
 
 ## Mac
 
@@ -83,10 +86,14 @@ First, clone the repo:
   - `poetry install`
   - `poetry shell` to activate your poetry venv
 
-- Optional: Install system requirements from `scripts/install/brew-requirements.txt`, only needed if using `ocrmypdf` for OCR
-  - Set the tesseract data folder path
-    - Find the tesseract data folder `tessdata` with `brew list tesseract`
-    - Create a `local.env` file in the root `marker` folder with `TESSDATA_PREFIX=/path/to/tessdata` inside it
+**Optional**
+
+Only needed if using `ocrmypdf` as the OCR backend.
+
+- Install system requirements from `scripts/install/brew-requirements.txt`
+- Set the tesseract data folder path
+  - Find the tesseract data folder `tessdata` with `brew list tesseract`
+  - Create a `local.env` file in the root `marker` folder with `TESSDATA_PREFIX=/path/to/tessdata` inside it
 
 # Usage
 
@@ -104,7 +111,7 @@ First, some configuration.  Note that settings can be overridden with env vars,
 Run `convert_single.py`, like this:
 
 ```
-python convert_single.py /path/to/file.pdf /path/to/output.md --parallel_factor 2 --max_pages 10 --langs English
+python convert_single.py /path/to/file.pdf /path/to/output/folder --parallel_factor 2 --max_pages 10 --langs English
 ```
 
 - `--parallel_factor` is how much to increase batch size and parallel OCR workers by.  Higher numbers will take more VRAM and CPU, but process faster.  Set to 1 by default.
diff --git a/benchmark.py b/benchmark.py
index 3214e545..c2685f57 100644
--- a/benchmark.py
+++ b/benchmark.py
@@ -68,7 +68,7 @@ def main():
         for method in methods:
             start = time.time()
             if method == "marker":
-                full_text, out_meta = convert_single_pdf(pdf_filename, model_lst, parallel_factor=args.marker_parallel_factor)
+                full_text, _, out_meta = convert_single_pdf(pdf_filename, model_lst, parallel_factor=args.marker_parallel_factor)
             elif method == "nougat":
                 full_text = nougat_prediction(pdf_filename, batch_size=args.nougat_batch_size)
             elif method == "naive":
diff --git a/convert.py b/convert.py
index c93e161a..9ddb6226 100755
--- a/convert.py
+++ b/convert.py
@@ -7,6 +7,7 @@
 import math
 
 from marker.convert import convert_single_pdf
+from marker.output import markdown_exists, save_markdown
 from marker.pdf.utils import find_filetype
 from marker.pdf.extract_text import get_length_of_text
 from marker.models import load_all_models
@@ -20,10 +21,7 @@
 
 @ray.remote(num_cpus=settings.RAY_CORES_PER_WORKER, num_gpus=.05 if settings.CUDA else 0)
 def process_single_pdf(fname: str, out_folder: str, model_refs, metadata: Optional[Dict] = None, min_length: Optional[int] = None):
-    out_filename = fname.rsplit(".", 1)[0] + ".md"
-    out_filename = os.path.join(out_folder, os.path.basename(out_filename))
-    out_meta_filename = out_filename.rsplit(".", 1)[0] + "_meta.json"
-    if os.path.exists(out_filename):
+    if markdown_exists(out_folder, fname):
         return
     try:
         # Skip trying to convert files that don't have a lot of embedded text
@@ -38,12 +36,9 @@ def process_single_pdf(fname: str, out_folder: str, model_refs, metadata: Option
             if length < min_length:
                 return
 
-        full_text, out_metadata = convert_single_pdf(fname, model_refs, metadata=metadata)
+        full_text, images, out_metadata = convert_single_pdf(fname, model_refs, metadata=metadata)
         if len(full_text.strip()) > 0:
-            with open(out_filename, "w+", encoding='utf-8') as f:
-                f.write(full_text)
-            with open(out_meta_filename, "w+") as f:
-                f.write(json.dumps(out_metadata, indent=4))
+            save_markdown(out_folder, fname, full_text, images, out_metadata)
         else:
             print(f"Empty file: {fname}.  Could not convert.")
     except Exception as e:
diff --git a/convert_single.py b/convert_single.py
index 88990a29..b6af88db 100755
--- a/convert_single.py
+++ b/convert_single.py
@@ -1,17 +1,20 @@
 import argparse
+import os
 
 from marker.convert import convert_single_pdf
 from marker.logger import configure_logging
 from marker.models import load_all_models
 import json
 
+from marker.output import save_markdown
+
 configure_logging()
 
 
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument("filename", help="PDF file to parse")
-    parser.add_argument("output", help="Output file name")
+    parser.add_argument("output", help="Output base folder path")
     parser.add_argument("--max_pages", type=int, default=None, help="Maximum number of pages to parse")
     parser.add_argument("--parallel_factor", type=int, default=1, help="How much to multiply default parallel OCR workers and model batch sizes by.")
     parser.add_argument("--langs", type=str, help="Languages to use for OCR, comma separated", default=None)
@@ -21,14 +24,10 @@ def main():
 
     fname = args.filename
     model_lst = load_all_models()
-    full_text, out_meta = convert_single_pdf(fname, model_lst, max_pages=args.max_pages, parallel_factor=args.parallel_factor, langs=langs)
-
-    with open(args.output, "w+", encoding='utf-8') as f:
-        f.write(full_text)
+    full_text, images, out_meta = convert_single_pdf(fname, model_lst, max_pages=args.max_pages, parallel_factor=args.parallel_factor, langs=langs)
 
-    out_meta_filename = args.output.rsplit(".", 1)[0] + "_meta.json"
-    with open(out_meta_filename, "w+") as f:
-        f.write(json.dumps(out_meta, indent=4))
+    fname = os.path.basename(fname)
+    save_markdown(args.output, fname, full_text, images, out_meta)
 
 
 if __name__ == "__main__":
diff --git a/marker/convert.py b/marker/convert.py
index f475d3d4..8c29917c 100644
--- a/marker/convert.py
+++ b/marker/convert.py
@@ -1,10 +1,8 @@
 import warnings
-
-from marker.cleaners.text import cleanup_text
-
 warnings.filterwarnings("ignore", category=UserWarning) # Filter torch pytree user warnings
 
 import pypdfium2 as pdfium
+from PIL import Image
 
 from marker.tables.table import format_tables
 from marker.debug.data import dump_bbox_debug_data
@@ -23,6 +21,9 @@
 from marker.cleaners.headings import split_heading_blocks
 from marker.cleaners.fontstyle import find_bold_italic
 from marker.postprocessors.markdown import merge_spans, merge_lines, get_full_text
+from marker.cleaners.text import cleanup_text
+from marker.images.extract import extract_images
+from marker.images.save import images_to_dict
 
 from typing import List, Dict, Tuple, Optional
 from marker.settings import settings
@@ -35,7 +36,7 @@ def convert_single_pdf(
         metadata: Optional[Dict]=None,
         parallel_factor: int = 1,
         langs: Optional[List[str]] = None
-) -> Tuple[str, Dict]:
+) -> Tuple[str, Dict[str, Image.Image], Dict]:
     # Set language needed for OCR
     if langs is None:
         langs = [settings.DEFAULT_LANG]
@@ -122,6 +123,10 @@ def convert_single_pdf(
     )
     out_meta["block_stats"]["equations"] = eq_stats
 
+    # Extract images and figures
+    if settings.EXTRACT_IMAGES:
+        extract_images(doc, pages)
+
     # Split out headers
     split_heading_blocks(pages)
     find_bold_italic(pages)
@@ -145,5 +150,6 @@ def convert_single_pdf(
         batch_size=settings.EDITOR_BATCH_SIZE * parallel_factor
     )
     out_meta["postprocess_stats"] = {"edit": edit_stats}
+    doc_images = images_to_dict(pages)
 
-    return full_text, out_meta
\ No newline at end of file
+    return full_text, doc_images, out_meta
\ No newline at end of file
diff --git a/marker/images/extract.py b/marker/images/extract.py
index ae7d3367..85464f2c 100644
--- a/marker/images/extract.py
+++ b/marker/images/extract.py
@@ -1,3 +1,4 @@
+from marker.images.save import get_image_filename
 from marker.pdf.images import render_bbox_image
 from marker.schema.bbox import rescale_bbox
 from marker.schema.block import find_insert_block, Span
@@ -32,14 +33,15 @@ def find_image_blocks(page):
     return image_blocks
 
 
-def extract_images(page):
+def extract_page_images(page_obj, page):
+    page.images = []
     image_blocks = find_image_blocks(page)
 
     for image_idx, (block_idx, line_idx, bbox) in enumerate(image_blocks):
         block = page.blocks[block_idx]
-        image = render_bbox_image(page.page_obj, page, bbox)
-        image_filename = f"{page.pnum}_image_{image_idx}.png"
-        image_markdown = f"![{image_filename}]({image_filename})"
+        image = render_bbox_image(page_obj, page, bbox)
+        image_filename = get_image_filename(page, image_idx)
+        image_markdown = f"\n\n![{image_filename}]({image_filename})\n\n"
         image_span = Span(
             bbox=bbox,
             text=image_markdown,
@@ -47,7 +49,14 @@ def extract_images(page):
             rotation=0,
             font_weight=0,
             font_size=0,
-            image=True
+            image=True,
+            span_id=f"image_{image_idx}"
         )
         block.lines[line_idx].spans.append(image_span)
         page.images.append(image)
+
+
+def extract_images(doc, pages):
+    for page_idx, page in enumerate(pages):
+        page_obj = doc[page_idx]
+        extract_page_images(page_obj, page)
diff --git a/marker/images/save.py b/marker/images/save.py
new file mode 100644
index 00000000..8397d5cc
--- /dev/null
+++ b/marker/images/save.py
@@ -0,0 +1,18 @@
+from typing import List
+
+from marker.schema.page import Page
+
+
+def get_image_filename(page: Page, image_idx):
+    return f"{page.pnum}_image_{image_idx}.png"
+
+
+def images_to_dict(pages: List[Page]):
+    images = {}
+    for page in pages:
+        if page.images is None:
+            continue
+        for image_idx, image in enumerate(page.images):
+            image_filename = get_image_filename(page, image_idx)
+            images[image_filename] = image
+    return images
diff --git a/marker/output.py b/marker/output.py
new file mode 100644
index 00000000..aa53c2f6
--- /dev/null
+++ b/marker/output.py
@@ -0,0 +1,36 @@
+import os
+import json
+
+
+def get_subfolder_path(out_folder, fname):
+    subfolder_name = fname.rsplit(".", 1)[0]  # strip only the final extension, same rule as get_markdown_filepath
+    subfolder_path = os.path.join(out_folder, subfolder_name)
+    os.makedirs(subfolder_path, exist_ok=True)  # side effect: the folder is created on every call
+    return subfolder_path
+
+
+def get_markdown_filepath(out_folder, fname):
+    subfolder_path = get_subfolder_path(out_folder, fname)
+    out_filename = fname.rsplit(".", 1)[0] + ".md"
+    out_filename = os.path.join(subfolder_path, out_filename)
+    return out_filename
+
+
+def markdown_exists(out_folder, fname):
+    out_filename = get_markdown_filepath(out_folder, fname)
+    return os.path.exists(out_filename)
+
+
+def save_markdown(out_folder, fname, full_text, images, out_metadata):
+    subfolder_path = get_subfolder_path(out_folder, fname)
+    markdown_filepath = get_markdown_filepath(out_folder, fname)
+    out_meta_filepath = markdown_filepath.rsplit(".", 1)[0] + "_meta.json"
+
+    with open(markdown_filepath, "w+", encoding='utf-8') as f:
+        f.write(full_text)
+    with open(out_meta_filepath, "w+") as f:
+        f.write(json.dumps(out_metadata, indent=4))
+
+    for filename, image in images.items():
+        image_filepath = os.path.join(subfolder_path, filename)
+        image.save(image_filepath, "PNG")
\ No newline at end of file
diff --git a/marker/postprocessors/images.py b/marker/postprocessors/images.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/marker/settings.py b/marker/settings.py
index 5eb85bed..35d7ab81 100644
--- a/marker/settings.py
+++ b/marker/settings.py
@@ -11,6 +11,7 @@ class Settings(BaseSettings):
     # General
     TORCH_DEVICE: Optional[str] = None
     IMAGE_DPI: int = 96 # DPI to render images pulled from pdf at
+    EXTRACT_IMAGES: bool = True # Extract images from pdfs and save them
 
     @computed_field
     @property
diff --git a/marker/tables/cells.py b/marker/tables/cells.py
index 1981bcd3..8bf8f9f1 100644
--- a/marker/tables/cells.py
+++ b/marker/tables/cells.py
@@ -1,36 +1,27 @@
 from marker.schema.bbox import rescale_bbox, box_intersection_pct
 from marker.schema.page import Page
+from sklearn.cluster import DBSCAN
+import numpy as np
 
 
-def find_row_separators(page: Page, table_box, round_factor=4):
-    top_edges = []
-    bottom_edges = []
+def cluster_coords(coords):
+    if len(coords) == 0:
+        return []
+    coords = np.array(sorted(set(coords))).reshape(-1, 1)
 
-    line_boxes = [p.bbox for p in page.text_lines.bboxes]
-    line_boxes = [rescale_bbox(page.text_lines.image_bbox, page.bbox, l) for l in line_boxes]
-    line_boxes = [l for l in line_boxes if box_intersection_pct(l, table_box) > .8]
-
-    min_count = len(line_boxes) / 3
-
-    for cell in line_boxes:
-        top_edges.append(cell[1] / round_factor * round_factor)
-        bottom_edges.append(cell[3] / round_factor * round_factor)
-
-    top_edges = [t for t in top_edges if top_edges.count(t) > min_count]
-    bottom_edges = [b for b in bottom_edges if bottom_edges.count(b) > min_count]
-
-    unique_top = sorted(list(set(top_edges)))
-    unique_bottom = sorted(list(set(bottom_edges)))
+    clustering = DBSCAN(eps=5, min_samples=1).fit(coords)
+    clusters = clustering.labels_
 
-    separators = min([unique_top, unique_bottom], key=len)
+    separators = []
+    for label in set(clusters):
+        clustered_points = coords[clusters == label]
+        separators.append(np.mean(clustered_points))
 
-    # Add the top and bottom of the page as separators, to grab all possible cells
-    separators.append(page.bbox[3])
-    separators.insert(0, page.bbox[1])
+    separators = sorted(separators)
     return separators
 
 
-def find_column_separators(page: Page, table_box, round_factor=4):
+def find_column_separators(page: Page, table_box, round_factor=4, min_count=1):
     left_edges = []
     right_edges = []
     centers = []
@@ -39,7 +30,6 @@ def find_column_separators(page: Page, table_box, round_factor=4):
     line_boxes = [rescale_bbox(page.text_lines.image_bbox, page.bbox, l) for l in line_boxes]
     line_boxes = [l for l in line_boxes if box_intersection_pct(l, table_box) > .8]
 
-    min_count = len(line_boxes) / 3
     for cell in line_boxes:
         left_edges.append(cell[0] / round_factor * round_factor)
         right_edges.append(cell[2] / round_factor * round_factor)
@@ -49,12 +39,12 @@ def find_column_separators(page: Page, table_box, round_factor=4):
     right_edges = [r for r in right_edges if right_edges.count(r) > min_count]
     centers = [c for c in centers if centers.count(c) > min_count]
 
-    unique_left = sorted(list(set(left_edges)))
-    unique_right = sorted(list(set(right_edges)))
-    unique_center = sorted(list(set(centers)))
+    sorted_left = cluster_coords(left_edges)
+    sorted_right = cluster_coords(right_edges)
+    sorted_center = cluster_coords(centers)
 
-    # Find list with minimum length
+    # Find list with maximum length
-    separators = min([unique_left, unique_right, unique_center], key=len)
+    separators = max([sorted_left, sorted_right, sorted_center], key=len)
     separators.append(page.bbox[2])
     separators.insert(0, page.bbox[0])
     return separators
@@ -83,7 +73,7 @@ def assign_cells_to_columns(page, table_box, rows, round_factor=4, tolerance=4):
 
         flat_row = []
         for cell_idx, cell in enumerate(sorted(new_row.items())):
-            flat_row.extend([""] * (cell[0] - cell_idx) + [cell[1]])
+            flat_row.append(cell[1])
         new_rows.append(flat_row)
 
     # Pad rows to have the same length
diff --git a/marker/tables/table.py b/marker/tables/table.py
index ef652634..1de64437 100644
--- a/marker/tables/table.py
+++ b/marker/tables/table.py
@@ -6,7 +6,7 @@
 from tabulate import tabulate
 from typing import List
 
-from marker.tables.cells import assign_cells_to_columns, find_row_separators, find_column_separators
+from marker.tables.cells import assign_cells_to_columns
 from marker.tables.utils import sort_table_blocks, replace_dots, replace_newlines
 
 
@@ -46,10 +46,12 @@ def get_table_pdftext(page: Page, table_box, space_tol=.01, round_factor=4) -> L
     table_rows = []
     table_cell = ""
     cell_bbox = None
-    prev_char = False
     table_row = []
     sorted_char_blocks = sort_table_blocks(page.char_blocks)
 
+    table_width = table_box[2] - table_box[0]
+    new_line_start_x = table_box[0] + table_width * .2
+
     for block_idx, block in enumerate(sorted_char_blocks):
         sorted_lines = sort_table_blocks(block["lines"])
         for line_idx, line in enumerate(sorted_lines):
@@ -60,31 +62,25 @@ def get_table_pdftext(page: Page, table_box, space_tol=.01, round_factor=4) -> L
             for span in line["spans"]:
                 for char in span["chars"]:
                     x_start, y_start, x_end, y_end = char["bbox"]
+                    x_start /= page_width
+                    x_end /= page_width
 
-                    if cell_bbox is None:
-                        cell_bbox = char["bbox"]
-                    else:
+                    if cell_bbox is not None:
                         # Find boundaries of cell bbox before merging
                         cell_x_start, cell_y_start, cell_x_end, cell_y_end = cell_bbox
                         cell_x_start /= page_width
                         cell_x_end /= page_width
 
-                        cell_bbox = merge_boxes(cell_bbox, char["bbox"])
-
-                    x_start /= page_width
-                    x_end /= page_width
-
                     cell_content = replace_dots(replace_newlines(table_cell))
-                    if not prev_char: # First char
+                    if cell_bbox is None: # First char
                         table_cell += char["char"]
+                        cell_bbox = char["bbox"]
                     elif cell_x_start - space_tol < x_start < cell_x_end + space_tol: # Check if we are in the same cell
                         table_cell += char["char"]
-                    elif x_start > cell_x_end - space_tol: # Same line, new cell, check against cell bbox
-                        if len(table_cell) > 0:
-                            table_row.append((cell_bbox, cell_content))
-                        table_cell = char["char"]
-                        cell_bbox = char["bbox"]
-                    else: # New line and cell
+                        cell_bbox = merge_boxes(cell_bbox, char["bbox"])
+                    # New line and cell
+                    # Use x_start < new_line_start_x to account for out-of-order cells in the pdf
+                    elif x_start < cell_x_end - space_tol and x_start < new_line_start_x:
                         if len(table_cell) > 0:
                             table_row.append((cell_bbox, cell_content))
                         table_cell = char["char"]
@@ -93,7 +89,11 @@ def get_table_pdftext(page: Page, table_box, space_tol=.01, round_factor=4) -> L
                             table_row = sorted(table_row, key=lambda x: round(x[0][0] / round_factor))
                             table_rows.append(table_row)
                         table_row = []
-                    prev_char = True
+                    else: # Same line, new cell, check against cell bbox
+                        if len(table_cell) > 0:
+                            table_row.append((cell_bbox, cell_content))
+                        table_cell = char["char"]
+                        cell_bbox = char["bbox"]
 
     if len(table_cell) > 0:
         table_row.append((cell_bbox, replace_dots(replace_newlines(table_cell))))