Get chunk conversion working

VikParuchuri · May 9, 2024 · 4eb9095
1 parent 2d7cb00
commit 4eb9095
Show file tree

Hide file tree

Showing 5 changed files with 11 additions and 3 deletions.
diff --git a/chunk_convert.py b/chunk_convert.py
@@ -1,5 +1,6 @@
 import argparse
 import subprocess
+import pkg_resources
 
 
 def main():
@@ -8,8 +9,10 @@ def main():
     parser.add_argument("out_folder", help="Output folder")
     args = parser.parse_args()
 
+    script_path = pkg_resources.resource_filename(__name__, 'chunk_convert.sh')
+
     # Construct the command
-    cmd = f"./chunk_convert.sh {args.in_folder} {args.out_folder}"
+    cmd = f"{script_path} {args.in_folder} {args.out_folder}"
 
     # Execute the shell script
     subprocess.run(cmd, shell=True, check=True)

diff --git a/convert.py b/convert.py
@@ -24,6 +24,7 @@ def process_single_pdf(filepath: str, out_folder: str, model_refs, metadata: Opt
     fname = os.path.basename(filepath)
     if markdown_exists(out_folder, fname):
         return
+
     try:
         # Skip trying to convert files that don't have a lot of embedded text
         # This can indicate that they were scanned, and not OCRed properly

diff --git a/marker/output.py b/marker/output.py
@@ -5,7 +5,6 @@
 def get_subfolder_path(out_folder, fname):
     subfolder_name = fname.split(".")[0]
     subfolder_path = os.path.join(out_folder, subfolder_name)
-    os.makedirs(subfolder_path, exist_ok=True)
     return subfolder_path
 
 
@@ -23,6 +22,8 @@ def markdown_exists(out_folder, fname):
 
 def save_markdown(out_folder, fname, full_text, images, out_metadata):
     subfolder_path = get_subfolder_path(out_folder, fname)
+    os.makedirs(subfolder_path, exist_ok=True)
+
     markdown_filepath = get_markdown_filepath(out_folder, fname)
     out_meta_filepath = markdown_filepath.rsplit(".", 1)[0] + "_meta.json"
 

diff --git a/marker/pdf/utils.py b/marker/pdf/utils.py
@@ -7,6 +7,9 @@
 
 def find_filetype(fpath):
     kind = filetype.guess(fpath)
+    if kind is None:
+        print(f"Could not determine filetype for {fpath}")
+        return "other"
 
     mimetype = kind.mime
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "marker-pdf"
-version = "0.2.2"
+version = "0.2.3"
 description = "Convert PDF to markdown with high speed and accuracy."
 authors = ["Vik Paruchuri <[email protected]>"]
 readme = "README.md"