Fix bolding, update deps

VikParuchuri · May 7, 2024 · 59b3215
1 parent 01c18b8
commit 59b3215
Show file tree

Hide file tree

Showing 14 changed files with 1,037 additions and 1,644 deletions.
diff --git a/README.md b/README.md
@@ -1,11 +1,11 @@
 # Marker
 
-Marker converts PDF, EPUB, and MOBI to markdown.  It's 10x faster than nougat, more accurate on most documents, and has low hallucination risk.
+Marker converts PDF to markdown.  It's 10x faster than nougat, more accurate on most documents, and has low hallucination risk.
 
-- Support for a range of PDF documents (optimized for books and scientific papers)
+- Support for a range of documents (optimized for books and scientific papers)
 - Removes headers/footers/other artifacts
 - Converts most equations to latex
-- Formats code blocks and tables
+- Formats tables and code blocks
 - Support for all languages (although most testing is done in English).
 - Works on GPU, CPU, or MPS
 
@@ -73,9 +73,9 @@ First, clone the repo:
 
 Only needed if using `ocrmypdf` as the ocr backend.
 
-- Install tesseract 5 by following [these instructions](https://notesalexp.org/tesseract-ocr/html/) or running `scripts/install/tesseract_5_install.sh`.
+- Run `pip install ocrmypdf`
 - Install ghostscript > 9.55 by following [these instructions](https://ghostscript.readthedocs.io/en/latest/Install.html) or running `scripts/install/ghostscript_install.sh`.
-- Install other requirements with `cat scripts/install/apt-requirements.txt | xargs sudo apt-get install -y`
+- Install other requirements with `cat scripts/install/tess-apt-requirements.txt | xargs sudo apt-get install -y`
 - Set the tesseract data folder path
   - Find the tesseract data folder `tessdata` with `find / -name tessdata`.  Make sure to use the one corresponding to the latest tesseract version if you have multiple.
   - Create a `local.env` file in the root `marker` folder with `TESSDATA_PREFIX=/path/to/tessdata` inside it
@@ -90,7 +90,8 @@ Only needed if using `ocrmypdf` as the ocr backend.
 
 Only needed if using `ocrmypdf` as the ocr backend.
 
-- Install system requirements from `scripts/install/brew-requirements.txt`
+- Run `pip install ocrmypdf`
+- Install system requirements from `scripts/install/tess-brew-requirements.txt`
 - Set the tesseract data folder path
   - Find the tesseract data folder `tessdata` with `brew list tesseract`
   - Create a `local.env` file in the root `marker` folder with `TESSDATA_PREFIX=/path/to/tessdata` inside it
@@ -210,7 +211,7 @@ All models were trained from scratch, so they're okay for commercial usage.  The
 
 If you want to remove the GPL license requirements for inference or use the weights commercially over the revenue limit, please contact me at [email protected] for dual licensing.
 
-Note that the `ocrmypdf` OCR option will use ocrmypdf, which includes Ghostscript, an AGPL dependency, but calls it via CLI, so it does not trigger the license provisions.  If you want to avoid this completely, just use `surya` as the OCR option.  (ocrmypdf is faster on CPU, but less accurate)
+Note that the `ocrmypdf` OCR option will use ocrmypdf, which includes Ghostscript, an AGPL dependency, but calls it via CLI, so it does not trigger the license provisions.  Ocrmypdf is disabled by default, and will not be installed automatically.
 
 # Thanks
 

diff --git a/convert_single.py b/convert_single.py
@@ -4,7 +4,6 @@
 from marker.convert import convert_single_pdf
 from marker.logger import configure_logging
 from marker.models import load_all_models
-import json
 
 from marker.output import save_markdown
 

diff --git a/marker/cleaners/fontstyle.py b/marker/cleaners/fontstyle.py
@@ -1,11 +1,9 @@
 from typing import List
-from statistics import mean
-import numpy as np
 
 from marker.schema.page import Page
 
 
-def find_bold_italic(pages: List[Page], bold_min_weight=550):
+def find_bold_italic(pages: List[Page], bold_min_weight=600):
     font_weights = []
     for page in pages:
         for block in page.blocks:
@@ -24,17 +22,9 @@ def find_bold_italic(pages: List[Page], bold_min_weight=550):
     if len(font_weights) == 0:
         return
 
-    font_weights = np.array(font_weights)
-    bold_thresh = np.percentile(font_weights, 90)
-    bold_thresh_lower = np.percentile(font_weights, 75)
-
-    # If a lot of the text on the page is bold, don't bold it all
-    if bold_thresh == bold_thresh_lower or bold_thresh < bold_min_weight:
-        return
-
     for page in pages:
         for block in page.blocks:
             for line in block.lines:
                 for span in line.spans:
-                    if span.font_weight >= bold_thresh:
+                    if span.font_weight >= bold_min_weight:
                         span.bold = True
diff --git a/marker/cleaners/headers.py b/marker/cleaners/headers.py
@@ -1,13 +1,8 @@
 import re
-from collections import Counter, defaultdict
-from itertools import chain
+from collections import Counter
 from rapidfuzz import fuzz
 
-from sklearn.cluster import DBSCAN
-import numpy as np
-
 from marker.schema.merged import FullyMergedBlock
-from marker.schema.page import Page
 from typing import List, Tuple
 
 

diff --git a/marker/ocr/recognition.py b/marker/ocr/recognition.py
@@ -1,7 +1,6 @@
 from itertools import repeat
 from typing import List, Optional, Dict
 
-import ocrmypdf
 import pypdfium2 as pdfium
 import io
 from concurrent.futures import ThreadPoolExecutor
@@ -113,6 +112,7 @@ def generate_single_page_pdfs(doc, page_idxs) -> List[io.BytesIO]:
 
 
 def _tesseract_recognition(in_pdf, langs: List[str]) -> Optional[Page]:
+    import ocrmypdf
     out_pdf = io.BytesIO()
 
     ocrmypdf.ocr(

diff --git a/marker/pdf/extract_text.py b/marker/pdf/extract_text.py
@@ -4,8 +4,7 @@
 import pypdfium2 as pdfium
 import pypdfium2.internal as pdfium_i
 
-from marker.pdf.utils import find_filetype, font_flags_decomposer, sort_block_group
-from marker.ocr.heuristics import detect_bad_ocr
+from marker.pdf.utils import font_flags_decomposer
 from marker.settings import settings
 from marker.schema.block import Span, Line, Block
 from marker.schema.page import Page

diff --git a/marker/postprocessors/editor.py b/marker/postprocessors/editor.py
@@ -1,8 +1,7 @@
-from collections import defaultdict, Counter
+from collections import defaultdict
 from itertools import chain
 from typing import Optional
 
-from transformers import AutoTokenizer
 from marker.settings import settings
 import torch
 import torch.nn.functional as F

diff --git a/marker/postprocessors/t5.py b/marker/postprocessors/t5.py
@@ -2,7 +2,7 @@
 import torch
 from torch import nn
 from copy import deepcopy
-from typing import Optional, Tuple, Union, List
+from typing import Optional, Tuple, Union
 from itertools import chain
 
 from transformers.modeling_outputs import TokenClassifierOutput

diff --git a/marker/settings.py b/marker/settings.py
@@ -1,4 +1,3 @@
-import os
 from typing import Optional, List, Dict
 
 from dotenv import find_dotenv

diff --git a/marker/tables/table.py b/marker/tables/table.py
@@ -1,5 +1,3 @@
-from collections import defaultdict
-
 from marker.schema.bbox import merge_boxes, box_intersection_pct, rescale_bbox
 from marker.schema.block import Line, Span, Block
 from marker.schema.page import Page