Skip to content

Commit

Permalink
Get chunk conversion working
Browse files Browse the repository at this point in the history
  • Loading branch information
VikParuchuri committed May 9, 2024
1 parent 2d7cb00 commit 4eb9095
Show file tree
Hide file tree
Showing 5 changed files with 11 additions and 3 deletions.
5 changes: 4 additions & 1 deletion chunk_convert.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import argparse
import subprocess
import pkg_resources


def main():
Expand All @@ -8,8 +9,10 @@ def main():
parser.add_argument("out_folder", help="Output folder")
args = parser.parse_args()

script_path = pkg_resources.resource_filename(__name__, 'chunk_convert.sh')

# Construct the command
- cmd = f"./chunk_convert.sh {args.in_folder} {args.out_folder}"
+ cmd = f"{script_path} {args.in_folder} {args.out_folder}"

# Execute the shell script
subprocess.run(cmd, shell=True, check=True)
Expand Down
1 change: 1 addition & 0 deletions convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ def process_single_pdf(filepath: str, out_folder: str, model_refs, metadata: Opt
fname = os.path.basename(filepath)
if markdown_exists(out_folder, fname):
return

try:
# Skip trying to convert files that don't have a lot of embedded text
# This can indicate that they were scanned, and not OCRed properly
Expand Down
3 changes: 2 additions & 1 deletion marker/output.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
def get_subfolder_path(out_folder, fname):
subfolder_name = fname.split(".")[0]
subfolder_path = os.path.join(out_folder, subfolder_name)
- os.makedirs(subfolder_path, exist_ok=True)
return subfolder_path


Expand All @@ -23,6 +22,8 @@ def markdown_exists(out_folder, fname):

def save_markdown(out_folder, fname, full_text, images, out_metadata):
subfolder_path = get_subfolder_path(out_folder, fname)
os.makedirs(subfolder_path, exist_ok=True)

markdown_filepath = get_markdown_filepath(out_folder, fname)
out_meta_filepath = markdown_filepath.rsplit(".", 1)[0] + "_meta.json"

Expand Down
3 changes: 3 additions & 0 deletions marker/pdf/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@

def find_filetype(fpath):
kind = filetype.guess(fpath)
if kind is None:
print(f"Could not determine filetype for {fpath}")
return "other"

mimetype = kind.mime

Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "marker-pdf"
- version = "0.2.2"
+ version = "0.2.3"
description = "Convert PDF to markdown with high speed and accuracy."
authors = ["Vik Paruchuri <[email protected]>"]
readme = "README.md"
Expand Down

0 comments on commit 4eb9095

Please sign in to comment.