Remove libmagic dependency

VikParuchuri · May 1, 2024 · d22c5a5 · d22c5a5
1 parent f8f595c
commit d22c5a5
Show file tree

Hide file tree

Showing 10 changed files with 34 additions and 43 deletions.
diff --git a/README.md b/README.md
@@ -65,13 +65,13 @@ First, clone the repo:
 
 ## Linux
 
-- Install system requirements
+- Optional: Install system requirements, only needed if using `ocrmypdf` as the OCR backend
   - Optional: Install tesseract 5 by following [these instructions](https://notesalexp.org/tesseract-ocr/html/) or running `scripts/install/tesseract_5_install.sh`.
   - Install ghostscript > 9.55 by following [these instructions](https://ghostscript.readthedocs.io/en/latest/Install.html) or running `scripts/install/ghostscript_install.sh`.
   - Install other requirements with `cat scripts/install/apt-requirements.txt | xargs sudo apt-get install -y`
-- Set the tesseract data folder path
-  - Find the tesseract data folder `tessdata` with `find / -name tessdata`.  Make sure to use the one corresponding to the latest tesseract version if you have multiple.
-  - Create a `local.env` file in the root `marker` folder with `TESSDATA_PREFIX=/path/to/tessdata` inside it
+  - Set the tesseract data folder path
+    - Find the tesseract data folder `tessdata` with `find / -name tessdata`.  Make sure to use the one corresponding to the latest tesseract version if you have multiple.
+    - Create a `local.env` file in the root `marker` folder with `TESSDATA_PREFIX=/path/to/tessdata` inside it
 - Install python requirements
   - `poetry install`
   - `poetry shell` to activate your poetry venv
@@ -81,10 +81,10 @@ First, clone the repo:
 
 ## Mac
 
-- Install system requirements from `scripts/install/brew-requirements.txt`
-- Set the tesseract data folder path
-  - Find the tesseract data folder `tessdata` with `brew list tesseract`
-  - Create a `local.env` file in the root `marker` folder with `TESSDATA_PREFIX=/path/to/tessdata` inside it
+- Optional: Install system requirements from `scripts/install/brew-requirements.txt`, only needed if using `ocrmypdf` for OCR
+  - Set the tesseract data folder path
+    - Find the tesseract data folder `tessdata` with `brew list tesseract`
+    - Create a `local.env` file in the root `marker` folder with `TESSDATA_PREFIX=/path/to/tessdata` inside it
 - Install python requirements
   - `poetry install`
   - `poetry shell` to activate your poetry venv
@@ -96,20 +96,21 @@ First, some configuration.  Note that settings can be overridden with env vars,
 - Your torch device will be automatically detected, but you can manually set it also.  For example, `TORCH_DEVICE=cuda` or `TORCH_DEVICE=mps`. `cpu` is the default.
   - If using GPU, set `INFERENCE_RAM` to your GPU VRAM (per GPU).  For example, if you have 16 GB of VRAM, set `INFERENCE_RAM=16`.
   - Depending on your document types, marker's average memory usage per task can vary slightly.  You can configure `VRAM_PER_TASK` to adjust this if you notice tasks failing with GPU out of memory errors.
+- By default, marker will use `ocrmypdf` for OCR on CPU, and `surya` on GPU.  Surya is slower on CPU, but more accurate. `ocrmypdf` also requires external dependencies (see above). You can override the default with the `OCR_ENGINE` setting.
 - Inspect the other settings in `marker/settings.py`.  You can override any settings in the `local.env` file, or by setting environment variables.
-  - By default, the final editor model is off.  Turn it on with `ENABLE_EDITOR_MODEL=true`.
-  - By default, marker will use ocrmypdf for OCR, which is slower than base tesseract, but higher quality.  You can change this with the `OCR_ENGINE` setting.
+
 
 ## Convert a single file
 
 Run `convert_single.py`, like this:
 
 ```
-python convert_single.py /path/to/file.pdf /path/to/output.md --parallel_factor 2 --max_pages 10
+python convert_single.py /path/to/file.pdf /path/to/output.md --parallel_factor 2 --max_pages 10 --langs English
 ```
 
 - `--parallel_factor` is how much to increase batch size and parallel OCR workers by.  Higher numbers will take more VRAM and CPU, but process faster.  Set to 1 by default.
 - `--max_pages` is the maximum number of pages to process.  Omit this to convert the entire document.
+- `--langs` is a comma-separated list of the languages in the document, used for OCR
 
 Make sure the `DEFAULT_LANG` setting is set appropriately for your document.
 
@@ -199,23 +200,18 @@ Omit `--nougat` to exclude nougat from the benchmark.  I don't recommend running
 
 # Commercial usage
 
-Due to the licensing of the underlying models like layoutlmv3 and nougat, this is only suitable for noncommercial usage.  
-
-I'm building a version that can be used commercially, by stripping out the dependencies below. If you would like to get early access, email me at [email protected].
-
-Here are the non-commercial/restrictive dependencies:
+All models were trained from scratch, so they're okay for commercial usage.  The weights for the models are licensed under CC BY-NC-SA 4.0, but I will waive that for any organization under $5M USD in gross revenue in the most recent 12-month period.
 
-- LayoutLMv3: CC BY-NC-SA 4.0 .  [Source](https://huggingface.co/microsoft/layoutlmv3-base)
-- PyMuPDF - GPL . [Source](https://pymupdf.readthedocs.io/en/latest/about.html#license-and-copyright)
+If you want to remove the GPL license requirements for inference or use the weights commercially over the revenue limit, please contact me at [email protected] for dual licensing.
 
-Other dependencies/datasets are openly licensed (doclaynet, byt5), or used in a way that is compatible with commercial usage (ghostscript).
+Note that the `ocrmypdf` OCR option will use ocrmypdf, which includes Ghostscript, an AGPL dependency, but calls it via CLI, so it does not trigger the license provisions.  If you want to avoid this completely, just use `surya` as the OCR option.  (ocrmypdf is faster on CPU, but less accurate.)
 
 # Thanks
 
 This work would not have been possible without amazing open source models and datasets, including (but not limited to):
 
 - Nougat from Meta
-- Layoutlmv3 from Microsoft
+- Pypdfium2/pdfium
 - DocLayNet from IBM
 - ByT5 from Google
 

diff --git a/convert.py b/convert.py
@@ -7,7 +7,7 @@
 import math
 
 from marker.convert import convert_single_pdf
-from marker.pdf.filetype import find_filetype
+from marker.pdf.utils import find_filetype
 from marker.pdf.extract_text import get_length_of_text
 from marker.models import load_all_models
 from marker.settings import settings

diff --git a/convert_single.py b/convert_single.py
@@ -14,11 +14,14 @@ def main():
     parser.add_argument("output", help="Output file name")
     parser.add_argument("--max_pages", type=int, default=None, help="Maximum number of pages to parse")
     parser.add_argument("--parallel_factor", type=int, default=1, help="How much to multiply default parallel OCR workers and model batch sizes by.")
+    parser.add_argument("--langs", type=str, help="Languages to use for OCR, comma separated", default=None)
     args = parser.parse_args()
 
+    langs = args.langs.split(",") if args.langs else None
+
     fname = args.filename
     model_lst = load_all_models()
-    full_text, out_meta = convert_single_pdf(fname, model_lst, max_pages=args.max_pages, parallel_factor=args.parallel_factor)
+    full_text, out_meta = convert_single_pdf(fname, model_lst, max_pages=args.max_pages, parallel_factor=args.parallel_factor, langs=langs)
 
     with open(args.output, "w+", encoding='utf-8') as f:
         f.write(full_text)

diff --git a/marker/convert.py b/marker/convert.py
@@ -13,7 +13,7 @@
 from marker.pdf.extract_text import get_text_blocks
 from marker.cleaners.headers import filter_header_footer, filter_common_titles
 from marker.equations.texify import replace_equations
-from marker.pdf.filetype import find_filetype
+from marker.pdf.utils import find_filetype
 from marker.postprocessors.editor import edit_full_text
 from marker.cleaners.code import identify_code_blocks, indent_blocks
 from marker.cleaners.bullets import replace_bullets
@@ -28,10 +28,13 @@ def convert_single_pdf(
         model_lst: List,
         max_pages=None,
         metadata: Optional[Dict]=None,
-        parallel_factor: int = 1
+        parallel_factor: int = 1,
+        langs: Optional[List[str]] = None
 ) -> Tuple[str, Dict]:
     # Set language needed for OCR
-    langs = [settings.DEFAULT_LANG]
+    if langs is None:
+        langs = [settings.DEFAULT_LANG]
+
     if metadata:
         langs = metadata.get("languages", langs)
 

diff --git a/marker/pdf/extract_text.py b/marker/pdf/extract_text.py
@@ -4,7 +4,7 @@
 import pypdfium2 as pdfium
 import pypdfium2.internal as pdfium_i
 
-from marker.pdf.filetype import find_filetype
+from marker.pdf.utils import find_filetype
 from marker.ocr.utils import font_flags_decomposer
 from marker.ocr.heuristics import detect_bad_ocr
 from marker.settings import settings

diff --git a/marker/pdf/filetype.py → marker/pdf/utils.py b/marker/pdf/filetype.py → marker/pdf/utils.py
@@ -1,10 +1,12 @@
-import magic
+import filetype
 
 from marker.settings import settings
 
 
 def find_filetype(fpath):
-    mimetype = magic.from_file(fpath).lower()
+    kind = filetype.guess(fpath)
+
+    mimetype = kind.mime
 
     # Get extensions from mimetype
     # The mimetype is not always consistent, so use in to check the most common formats

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -33,7 +33,6 @@ torch = "^2.1.2"
 ray = "^2.9.0"
 tqdm = "^4.66.1"
 tabulate = "^0.9.0"
-python-magic = "^0.4.27"
 ftfy = "^6.1.1"
 nltk = "^3.8.1"
 ocrmypdf = "^15.4.0"
@@ -42,6 +41,7 @@ texify = "^0.1.8"
 pdftext = "^0.3.1"
 rapidfuzz = "^3.8.1"
 surya-ocr = "^0.4.0"
+filetype = "^1.2.0"
 
 [tool.poetry.group.dev.dependencies]
 jupyter = "^1.0.0"

diff --git a/scripts/install/apt-requirements.txt b/scripts/install/apt-requirements.txt
@@ -1,6 +1,5 @@
 tesseract-ocr
 libtesseract-dev
-libmagic1
 ocrmypdf
 tesseract-ocr-eng
 tesseract-ocr-deu

diff --git a/scripts/install/brew-requirements.txt b/scripts/install/brew-requirements.txt
@@ -1,4 +1,3 @@
 ocrmypdf
-libmagic
 tesseract
 tesseract-lang