diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py index 76c67dc0..a6a6eb31 100644 --- a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py +++ b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py @@ -1,3 +1,5 @@ +import os +from os.path import expanduser from typing import List, Optional, Tuple from dedocutils.data_structures import BBox @@ -283,12 +285,14 @@ def __jar_path(self) -> str: import os return os.environ.get("TABBY_JAR", self.default_config["JAR_PATH"]) - def __run(self, path: str = None, encoding: str = "utf-8", start_page: int = None, end_page: int = None) -> bytes: + def __run(self, path: str = None, encoding: str = "utf-8", + start_page: int = None, end_page: int = None, tmp_dir: str = "") -> bytes: import subprocess args = ["java"] + ["-jar", self.__jar_path(), "-i", path] if start_page is not None and end_page is not None: args += ["-sp", str(start_page), "-ep", str(end_page)] + args += ["-tmp", tmp_dir] try: result = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.DEVNULL, check=True) if result.stderr: @@ -299,12 +303,16 @@ def __run(self, path: str = None, encoding: str = "utf-8", start_page: int = Non except subprocess.CalledProcessError as e: raise TabbyPdfError(e.stderr.decode(encoding)) - def __process_pdf(self, path: str, start_page: int = None, end_page: int = None) -> dict: + def __process_pdf(self, path: str, start_page: int = None, end_page: int = None, tmp_dir: str = "/.cache/dedoc/tabby/") -> dict: import json - output = self.__run(path=path, start_page=start_page, end_page=end_page) - response = output.decode("UTF-8") - document = json.loads(response) if response else {} + self.__run(path=path, start_page=start_page, end_page=end_page, tmp_dir=tmp_dir) + folder = expanduser("~") + tmp_dir + out_path = os.path.join(folder, 'data.json') + with open(out_path) as response: + document = json.load(response) if response else {} + response.close() + return document def _process_one_page(self, diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/tabbypdf/jars/ispras_tbl_extr.jar b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/tabbypdf/jars/ispras_tbl_extr.jar index 61612f8b..f661bc5f 100644 Binary files a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/tabbypdf/jars/ispras_tbl_extr.jar and b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/tabbypdf/jars/ispras_tbl_extr.jar differ diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/tabbypdf/jars/jackson-annotations-2.17.2.jar b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/tabbypdf/jars/jackson-annotations-2.17.2.jar new file mode 100644 index 00000000..c13bcb91 Binary files /dev/null and b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/tabbypdf/jars/jackson-annotations-2.17.2.jar differ diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/tabbypdf/jars/jackson-core-2.17.2.jar b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/tabbypdf/jars/jackson-core-2.17.2.jar new file mode 100644 index 00000000..34be9026 Binary files /dev/null and b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/tabbypdf/jars/jackson-core-2.17.2.jar differ diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/tabbypdf/jars/jackson-databind-2.17.2.jar b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/tabbypdf/jars/jackson-databind-2.17.2.jar new file mode 100644 index 00000000..3750b8c1 Binary files /dev/null and b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/tabbypdf/jars/jackson-databind-2.17.2.jar differ diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/tabbypdf/jars/plexus-utils-1.1.jar b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/tabbypdf/jars/plexus-utils-1.1.jar new file mode 100644 index 00000000..5c50e177 Binary files /dev/null and b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/tabbypdf/jars/plexus-utils-1.1.jar differ