Skip to content

Commit

Permalink
Fix java heap (#493)
Browse files (browse the repository at this point in the history)
Co-authored-by: Nasty <[email protected]>
  • Loading branch information
sunveil and NastyBoget authored Sep 12, 2024
1 parent 47486dd commit b73cf1a
Show file tree
Hide file tree
Showing 6 changed files with 19 additions and 9 deletions.
28 changes: 19 additions & 9 deletions dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -56,10 +56,18 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure
You can also see :ref:`pdf_handling_parameters` to get more information about `parameters` dictionary possible arguments.
"""
import tempfile
from dedoc.utils.parameter_utils import get_param_with_attachments
parameters = {} if parameters is None else parameters
warnings = []
lines, tables, tables_on_images, attachments, document_metadata = self.__extract(path=file_path, parameters=parameters, warnings=warnings)

with tempfile.TemporaryDirectory() as tmp_dir:
lines, tables, tables_on_images, attachments, document_metadata = self.__extract(
path=file_path,
parameters=parameters,
warnings=warnings,
tmp_dir=tmp_dir
)
lines = self.linker.link_objects(lines=lines, tables=tables_on_images, images=attachments)

if get_param_with_attachments(parameters) and self.attachment_extractor.can_extract(file_path):
Expand All @@ -71,7 +79,7 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure

return self._postprocess(result)

def __extract(self, path: str, parameters: dict, warnings: list)\
def __extract(self, path: str, parameters: dict, warnings: list, tmp_dir: str)\
-> Tuple[List[LineWithMeta], List[Table], List[ScanTable], List[PdfImageAttachment], Optional[dict]]:
import math
from dedoc.utils.pdf_utils import get_pdf_page_count
Expand Down Expand Up @@ -102,7 +110,7 @@ def __extract(self, path: str, parameters: dict, warnings: list)\
first_tabby_page = first_page + 1 if first_page is not None else 1
last_tabby_page = page_count if (last_page is None) or (last_page is not None and last_page > page_count) else last_page
self.logger.info(f"Reading PDF pages from {first_tabby_page} to {last_tabby_page}")
document = self.__process_pdf(path=path, start_page=first_tabby_page, end_page=last_tabby_page)
document = self.__process_pdf(path=path, start_page=first_tabby_page, end_page=last_tabby_page, tmp_dir=tmp_dir)

pages = document.get("pages", [])
for page in pages:
Expand Down Expand Up @@ -283,10 +291,10 @@ def __jar_path(self) -> str:
import os
return os.environ.get("TABBY_JAR", self.default_config["JAR_PATH"])

def __run(self, path: str = None, encoding: str = "utf-8", start_page: int = None, end_page: int = None) -> bytes:
def __run(self, path: str, tmp_dir: str, encoding: str = "utf-8", start_page: int = None, end_page: int = None) -> bytes:
import subprocess

args = ["java"] + ["-jar", self.__jar_path(), "-i", path]
args = ["java"] + ["-jar", self.__jar_path(), "-i", path, "-tmp", f"{tmp_dir}/"]
if start_page is not None and end_page is not None:
args += ["-sp", str(start_page), "-ep", str(end_page)]
try:
Expand All @@ -299,12 +307,14 @@ def __run(self, path: str = None, encoding: str = "utf-8", start_page: int = Non
except subprocess.CalledProcessError as e:
raise TabbyPdfError(e.stderr.decode(encoding))

def __process_pdf(self, path: str, start_page: int = None, end_page: int = None) -> dict:
def __process_pdf(self, path: str, tmp_dir: str, start_page: int = None, end_page: int = None) -> dict:
import json
import os

self.__run(path=path, start_page=start_page, end_page=end_page, tmp_dir=tmp_dir)
with open(os.path.join(tmp_dir, "data.json"), "r") as response:
document = json.load(response)

output = self.__run(path=path, start_page=start_page, end_page=end_page)
response = output.decode("UTF-8")
document = json.loads(response) if response else {}
return document

def _process_one_page(self,
Expand Down
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.

0 comments on commit b73cf1a

Please sign in to comment.