Skip to content

Commit

Permalink
Add pypi package config
Browse files Browse the repository at this point in the history
  • Loading branch information
VikParuchuri committed Dec 19, 2023
1 parent 5c47d39 commit c126b9c
Show file tree
Hide file tree
Showing 8 changed files with 84 additions and 14 deletions.
29 changes: 29 additions & 0 deletions .github/workflows/publish.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Build the package with Poetry and publish it to PyPI whenever a version tag
# of the form vX.Y.Z is pushed.
name: Python package
on:
  push:
    tags:
      - "v*.*.*"
jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python 3.11
        uses: actions/setup-python@v4
        with:
          # Quoted so YAML keeps the version a string; an unquoted value is
          # parsed as a float (e.g. 3.10 would silently become 3.1).
          python-version: "3.11"
      # Swap in the CPU-only torch wheel using pip inside the Poetry environment.
      # `poetry remove torch` is deliberately avoided: it rewrites pyproject.toml
      # (and the lock file) in the checkout, so the subsequent `poetry build`
      # would publish a package without its torch dependency.
      # NOTE(review): assumes torch is declared in pyproject.toml - confirm.
      - name: Install python dependencies
        run: |
          pip install poetry
          poetry install
          poetry run pip uninstall -y torch
          poetry run pip install torch --index-url https://download.pytorch.org/whl/cpu
      - name: Build package
        run: |
          poetry build
      - name: Publish package
        env:
          # Token is read from repository secrets and handed to Poetry below.
          PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}
        run: |
          poetry config pypi-token.pypi "$PYPI_TOKEN"
          poetry publish
10 changes: 5 additions & 5 deletions benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,17 +8,13 @@
from marker.convert import convert_single_pdf
from marker.logger import configure_logging
from marker.models import load_all_models
from marker.ordering import load_ordering_model
from marker.segmentation import load_layout_model
from marker.cleaners.equations import load_nougat_model
from marker.benchmark.scoring import score_text
from marker.extract_text import naive_get_text
import json
import os
import subprocess
import shutil
import fitz as pymupdf
from marker.settings import settings
from tabulate import tabulate

configure_logging()
Expand All @@ -34,7 +30,7 @@ def nougat_prediction(pdf_filename, batch_size=1):
return data


if __name__ == "__main__":
def main():
parser = argparse.ArgumentParser(description="Benchmark PDF to MD conversion. Needs source pdfs, and a refernece folder with the correct markdown.")
parser.add_argument("in_folder", help="Input PDF files")
parser.add_argument("reference_folder", help="Reference folder with reference markdown files")
Expand Down Expand Up @@ -126,3 +122,7 @@ def nougat_prediction(pdf_filename, batch_size=1):
print("Scores by file")
print(tabulate(score_table, headers=["Method", *score_headers]))


if __name__ == "__main__":
main()

19 changes: 19 additions & 0 deletions chunk_convert.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import argparse
import subprocess


def main():
    """Convert a folder of PDFs to markdown in chunks via ``chunk_convert.sh``.

    Reads ``in_folder`` and ``out_folder`` from the command line and forwards
    them, unchanged, as the two positional arguments of the
    ``chunk_convert.sh`` script located in the same directory as this module.

    Raises:
        subprocess.CalledProcessError: If the script exits with a non-zero
            status (``check=True``).
    """
    # Local import: the module header only brings in argparse and subprocess.
    import os

    parser = argparse.ArgumentParser(description="Convert a folder of PDFs to a folder of markdown files in chunks.")
    parser.add_argument("in_folder", help="Input folder with pdfs.")
    parser.add_argument("out_folder", help="Output folder")
    args = parser.parse_args()

    # Resolve the script next to this file instead of "./chunk_convert.sh", so
    # the command still works when it is launched from another working
    # directory (e.g. through the installed console-script entry point).
    script_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "chunk_convert.sh")

    # Pass an argument list and no shell: folder names containing spaces or
    # shell metacharacters reach the script as single, literal arguments
    # instead of being split or interpreted by /bin/sh.
    cmd = [script_path, args.in_folder, args.out_folder]

    # Execute the shell script; a non-zero exit status raises CalledProcessError.
    subprocess.run(cmd, check=True)


if __name__ == "__main__":
    main()
Empty file modified chunk_convert.sh
100644 → 100755
Empty file.
8 changes: 6 additions & 2 deletions convert.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def process_single_pdf(fname: str, out_folder: str, model_refs, metadata: Option
print(traceback.format_exc())


if __name__ == "__main__":
def main():
parser = argparse.ArgumentParser(description="Convert multiple pdfs to markdown.")
parser.add_argument("in_folder", help="Input folder with pdfs.")
parser.add_argument("out_folder", help="Output folder")
Expand Down Expand Up @@ -121,4 +121,8 @@ def process_single_pdf(fname: str, out_folder: str, model_refs, metadata: Option
progress_bar.update(1)

# Shutdown ray to free resources
ray.shutdown()
ray.shutdown()


if __name__ == "__main__":
main()
9 changes: 6 additions & 3 deletions convert_single.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,12 @@
from marker.convert import convert_single_pdf
from marker.logger import configure_logging
from marker.models import load_all_models
from marker.settings import settings
import json

configure_logging()


if __name__ == "__main__":
def main():
parser = argparse.ArgumentParser()
parser.add_argument("filename", help="PDF file to parse")
parser.add_argument("output", help="Output file name")
Expand All @@ -26,4 +25,8 @@

out_meta_filename = args.output.rsplit(".", 1)[0] + "_meta.json"
with open(out_meta_filename, "w+") as f:
f.write(json.dumps(out_meta, indent=4))
f.write(json.dumps(out_meta, indent=4))


if __name__ == "__main__":
main()
1 change: 0 additions & 1 deletion marker/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
from marker.markdown import merge_spans, merge_lines, get_full_text
from marker.schema import Page, BlockType
from typing import List, Dict, Tuple, Optional
from copy import deepcopy
import re
import magic
from marker.settings import settings
Expand Down
22 changes: 19 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,12 +1,22 @@
[tool.poetry]
name = "marker"
version = "0.1.0"
name = "marker-pdf"
version = "0.1.1"
description = "Convert PDF to markdown with high speed and accuracy."
authors = ["Vik Paruchuri <[email protected]>"]
readme = "README.md"
license = "GPL-3.0-or-later"
repository = "https://github.com/VikParuchuri/marker"
keywords = ["pdf", "markdown", "ocr", "nlp"]
packages = [
{include = "marker"}
]
include = [
"convert.py",
"convert_single.py",
"chunk_convert.sh",
"benchmark.py",
"chunk_convert.py",
]

[tool.poetry.dependencies]
python = ">=3.9,<3.13"
Expand Down Expand Up @@ -37,6 +47,12 @@ grpcio = "^1.60.0"
[tool.poetry.group.dev.dependencies]
jupyter = "^1.0.0"

[tool.poetry.scripts]
marker = "convert:main"
marker_single = "convert_single:main"
marker_benchmark = "benchmark:main"
marker_chunk_convert = "chunk_convert:main"

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
build-backend = "poetry.core.masonry.api"

0 comments on commit c126b9c

Please sign in to comment.