From 21ca9e5f7d12698c152e6eb8e251b85191661724 Mon Sep 17 00:00:00 2001 From: Stony Wang Date: Wed, 31 Jan 2024 23:12:41 +0800 Subject: [PATCH] Change how the temp directory is shared and cleaned up. Add _get_temp_path so that temporary PDF files are always accessed the same way. Update the _save_page parameters to match the change. --- camelot/handlers.py | 58 ++++++++++++++++++++++++++------------------- 1 file changed, 34 insertions(+), 24 deletions(-) diff --git a/camelot/handlers.py b/camelot/handlers.py index 66ee1697..3a2bba1a 100644 --- a/camelot/handlers.py +++ b/camelot/handlers.py @@ -1,5 +1,8 @@ import os +import shutil import sys +import tempfile +import weakref from pathlib import Path from typing import Union @@ -10,7 +13,6 @@ from .core import TableList from .parsers import Lattice from .parsers import Stream -from .utils import TemporaryDirectory from .utils import download_url from .utils import get_page_layout from .utils import get_rotation @@ -36,6 +38,8 @@ class PDFHandler: """ def __init__(self, filepath: Union[StrByteType, Path], pages="1", password=None): + self.tempdir = tempfile.mkdtemp() + self._finalizer = weakref.finalize(self, shutil.rmtree, self.tempdir) if is_url(filepath): filepath = download_url(filepath) self.filepath: Union[StrByteType, Path] = filepath @@ -56,15 +60,13 @@ def _get_pages(self, pages): Parameters ---------- - filepath : str - Filepath or URL of the PDF file. pages : str, optional (default: '1') Comma-separated page numbers. Example: '1,3,4' or '1,4-end' or 'all'. Returns ------- - P : list + result : list[int] List of int page numbers. """ @@ -95,24 +97,33 @@ def _get_pages(self, pages): result.extend(range(p["start"], p["end"] + 1)) return sorted(set(result)) - def _save_page(self, filepath: Union[StrByteType, Path], page, temp): - """Saves specified page from PDF into a temporary directory. + def _get_temp_path(self, page, rotated=False): + """Generate page path with temp directory. 
Parameters ---------- - filepath : str - Filepath or URL of the PDF file. page : int Page number. - temp : str - Tmp directory. + rotated: bool (default: False) + Switch to generate temp file name. + """ + if rotated is False: + return os.path.join(self.tempdir, f"page-{page}.pdf") + else: + return os.path.join(self.tempdir, f"p-{page}_rotated.pdf") + + def _save_page(self, page): + """Saves specified page from PDF into a temporary directory. + Parameters + ---------- + page : int + Page number. """ - infile = PdfReader(filepath, strict=False) + infile = PdfReader(self.filepath, strict=False) if infile.is_encrypted: infile.decrypt(self.password) - fpath = os.path.join(temp, f"page-{page}.pdf") - froot, fext = os.path.splitext(fpath) + fpath = self._get_temp_path(page) p = infile.pages[page - 1] outfile = PdfWriter() outfile.add_page(p) @@ -125,7 +136,7 @@ def _save_page(self, filepath: Union[StrByteType, Path], page, temp): vertical_text = get_text_objects(layout, ltype="vertical_text") rotation = get_rotation(chars, horizontal_text, vertical_text) if rotation != "": - fpath_new = "".join([froot.replace("page", "p"), "_rotated", fext]) + fpath_new = self._get_temp_path(page, rotated=True) os.rename(fpath, fpath_new) instream = open(fpath_new, "rb") infile = PdfReader(instream, strict=False) @@ -171,14 +182,13 @@ def parse( layout_kwargs = {} tables = [] - with TemporaryDirectory() as tempdir: - for p in self.pages: - self._save_page(self.filepath, p, tempdir) - pages = [os.path.join(tempdir, f"page-{p}.pdf") for p in self.pages] - parser = Lattice(**kwargs) if flavor == "lattice" else Stream(**kwargs) - for p in pages: - t = parser.extract_tables( - p, suppress_stdout=suppress_stdout, layout_kwargs=layout_kwargs - ) - tables.extend(t) + for p in self.pages: + self._save_page(p) + pages = [self._get_temp_path(p) for p in self.pages] + parser = Lattice(**kwargs) if flavor == "lattice" else Stream(**kwargs) + for p in pages: + t = parser.extract_tables( + p, 
suppress_stdout=suppress_stdout, layout_kwargs=layout_kwargs + ) + tables.extend(t) return TableList(sorted(tables))