Merge pull request #117 from funnyzak/refactor/pdf

funnyzak · Jun 8, 2023 · 18ba820 · 18ba820
2 parents 4d01566 + 0557a3c
commit 18ba820
Show file tree

Hide file tree

Showing 8 changed files with 895 additions and 461 deletions.
diff --git a/.gitignore b/.gitignore
@@ -146,3 +146,5 @@ _cache/
 **/*/*test*.jpeg
 **/*/*test*.jpg
 **/*/*test*.png
+
+**/*/pdf/*.json
diff --git a/README.md b/README.md
@@ -77,6 +77,9 @@ poetry install --extras "pdf"
 poetry install --with dev,test
 # Only install required dependencies for production
 poetry install
+
+# Specify python version
+poetry env use python3.9
 ```
 
 ## Usage
@@ -93,6 +96,9 @@ poetry run merge_pdf_demo
 # Run project => pdf_parse: parse multi pdf to multi-layer pdf
 poetry run multi_layer_pdf_demo
 
+# Extract text to json from pdf
+poetry run extract_text_from_pdf
+
 # Debug "hello" project with ipdb3
 poetry run ipdb3 ./src/hello/main.py
 

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "pyproject-starter"
-version = "0.1.6"
+version = "0.1.7"
 description = "A template for the python project. It uses poetry for dependency management and tox for testing."
 authors = ["Leon <[email protected]>"]
 license = "MIT"
@@ -27,6 +27,8 @@ packages = [
 hello = "hello.main:say_hello"
 single_pdf_parse = { callable = "pdf_parse.single_pdf_parse:main", extras = ["pdf"] }
 multi_pdf_parse = { callable = "pdf_parse.multi_pdf_parse:main", extras = ["pdf"] }
+extract_text_from_pdf = { callable = "pdf_parse.extract_text_location:test_extract_text_location", extras = ["pdf"] }
+
 
 
 add_pdf_annotation_demo = { callable = "pdf_parse.single_pdf_parse:test_add_annotation", extras = ["pdf"] }
@@ -42,10 +44,11 @@ python = "^3.8.1"
 borb = { version = "^2.1.7", optional = true }
 PyPDF2 = { version = "^2.12.1", optional = true }
 requests = {version = "^2.28.1", optional = true}
+pdfminer = {version = "^20191125", optional = true}
 
 
 [tool.poetry.extras]
-pdf = ["PyPDF2", "borb", "requests"]
+pdf = ["PyPDF2", "borb", "requests", "pdfminer"]
 
 [tool.poetry.group.dev]
 optional = true

diff --git a/src/pdf_parse/extract_text_location.py b/src/pdf_parse/extract_text_location.py
@@ -0,0 +1,188 @@
+# -*- coding: utf-8 -*-
+# created by: leon<silenceace at gmail dot com>
+# date: 2023-06-08
+# license: MIT
+# description: Extract text location from pdf file.
+# usage: poetry run python src/pdf_parse/extract_text_location.py -i public/attachments/pdf/whatispython.pdf
+# notes:
+
+import argparse
+import json
+import os
+import time
+
+from pdfminer.converter import PDFPageAggregator  # type: ignore
+from pdfminer.layout import LAParams  # type: ignore
+from pdfminer.layout import LTAnno
+from pdfminer.layout import LTChar
+from pdfminer.layout import LTText
+from pdfminer.pdfdocument import PDFDocument  # type: ignore
+from pdfminer.pdfinterp import PDFPageInterpreter  # type: ignore
+from pdfminer.pdfinterp import PDFResourceManager
+from pdfminer.pdfpage import PDFPage  # type: ignore
+from pdfminer.pdfparser import PDFParser  # type: ignore
+
+
+class ExtractTextLocation:
+    """Extract the location of text lines and characters from a pdf file.
+
+    The result is written as ``<pdf name>.json`` into the output directory.
+    """
+
+    def __init__(self, pdf_file, output_path=None) -> None:
+        """Pass in the path to the PDF file and output path.
+
+        :param pdf_file: pdf file path
+        :param output_path: output dir; defaults to the directory of ``pdf_file``
+        :raises FileNotFoundError: if ``pdf_file`` does not exist
+        :raises ValueError: if ``pdf_file`` does not end with ``.pdf``
+        """
+        self.pdf_file = pdf_file
+        self.output_path = output_path
+
+        self.check_file()
+        self.check_out_path()
+
+    def check_file(self):
+        """Check that the input file exists and has a ``.pdf`` extension.
+
+        :raises FileNotFoundError: if the path does not exist
+        :raises ValueError: if the path does not end with ``.pdf`` (any letter case)
+        """
+        if not os.path.exists(self.pdf_file):
+            raise FileNotFoundError(f"file {self.pdf_file} not found.")
+        # Compare case-insensitively so that e.g. "REPORT.PDF" is accepted too.
+        if not self.pdf_file.lower().endswith(".pdf"):
+            raise ValueError(f"file {self.pdf_file} is not pdf file")
+
+    def check_out_path(self):
+        """Make sure the output directory is set and exists, creating it if needed."""
+        if not self.output_path:
+            # dirname() returns "" for a bare file name such as "a.pdf"; fall back
+            # to the current directory so that makedirs() is never called with "".
+            self.output_path = os.path.dirname(self.pdf_file) or os.curdir
+        if not os.path.exists(self.output_path):
+            # exist_ok guards against the directory appearing between the two calls.
+            os.makedirs(self.output_path, exist_ok=True)
+
+    def set_output_path(self, output_path):
+        """Set output path and create the directory if it does not exist.
+
+        :param output_path: output dir; a falsy value selects the pdf's directory
+        """
+        self.output_path = output_path
+        self.check_out_path()
+
+    def parse_char_layout(self, layout):
+        """Parse page content line by line, recording every character.
+
+        :param layout: iterable of layout objects for one page
+        :return: list with one dict per text line, holding the stripped line text
+            (``words``), the line ``location`` and the list of ``chars``
+        """
+        # bbox:
+        # x0: the distance from the left side of the page to the left edge of the box.
+        # y0: the distance from the bottom of the page to the bottom edge of the box.
+        # x1: the distance from the left side of the page to the right edge of the box
+        # y1: distance from the bottom of the page to the top edge of the box
+        words_result = []
+        for textbox in layout:
+            if not isinstance(textbox, LTText):
+                continue
+            for line in textbox:
+                char_list = []
+                for character in line:
+                    # LTAnno items and plain spaces are not recorded as characters;
+                    # they remain part of the line text returned by line.get_text().
+                    if isinstance(character, LTAnno) or character.get_text() == " ":
+                        continue
+                    if isinstance(character, LTChar):
+                        char_list.append(
+                            {
+                                "char": character.get_text(),
+                                "size": round(character.size, 2),
+                                "font": character.fontname,
+                                "location": {
+                                    "left": round(character.bbox[0], 2),
+                                    "top": round(character.bbox[3], 2),
+                                    "width": round(character.width, 2),
+                                    "height": round(character.height, 2),
+                                },
+                            }
+                        )
+                words_result.append(
+                    {
+                        "words": line.get_text().strip(),
+                        "location": {
+                            "left": round(line.bbox[0], 2),
+                            "top": round(line.bbox[3], 2),
+                            "width": round(line.width, 2),
+                            "height": round(line.height, 2),
+                        },
+                        "chars": char_list,
+                    }
+                )
+        return words_result
+
+    def extract_text_location(self, new_pdf_file=None):
+        """Extract text location from pdf file and save it as a json file.
+
+        :param new_pdf_file: optional pdf path that replaces the current one
+        :raises FileNotFoundError: if ``new_pdf_file`` does not exist
+        :raises ValueError: if ``new_pdf_file`` does not end with ``.pdf``
+        """
+        if new_pdf_file not in [None, ""]:
+            self.pdf_file = new_pdf_file
+            self.check_file()
+            self.check_out_path()
+
+        with open(self.pdf_file, "rb") as in_file:
+            parser = PDFParser(in_file)
+            doc = PDFDocument(parser)  # Create a PDF document
+
+            rsrcmgr = PDFResourceManager()  # Create a PDF resource manager to share resources
+            # The aggregator device collects the layout of each processed page
+            device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
+            # The interpreter renders page content into the device
+            interpreter = PDFPageInterpreter(rsrcmgr, device)
+
+            start_time = time.time()
+            print("============== pdf file: ", self.pdf_file, " processing ==============")
+            # splitext() only drops the extension, so "report.v2.pdf" -> "report.v2"
+            pdf_name = os.path.splitext(os.path.basename(self.pdf_file))[0]
+            json_file = os.path.join(self.output_path, pdf_name + ".json")
+            page_list = []
+            for page_index, page in enumerate(PDFPage.create_pages(doc)):
+                print("================ page: ", page_index + 1, " ==================")
+                interpreter.process_page(page)
+                # layout holds the objects parsed from this page
+                layout = device.get_result()
+                page_list.append(
+                    {
+                        "page": page_index + 1,
+                        "width": round(layout.width, 2),
+                        "height": round(layout.height, 2),
+                        "words": self.parse_char_layout(layout),
+                    }
+                )
+            pdf_json = {"name": pdf_name, "page_count": len(page_list), "pages": page_list}
+            print("pdf_json: ", pdf_json)
+            # ensure_ascii=False emits raw non-ASCII characters, so the encoding must
+            # be explicit instead of depending on the platform default.
+            with open(json_file, "w", encoding="utf-8") as out_file:
+                out_file.write(json.dumps(pdf_json, indent=4, ensure_ascii=False))
+                print(
+                    "save json file success. file: ",
+                    json_file,
+                )
+            end_time = time.time()
+            print(
+                "============== pdf file: ",
+                self.pdf_file,
+                " process success, cost time: ",
+                "{:.2f} S".format(end_time - start_time),
+                " ==============",
+            )
+
+
+# Directory three levels above this file.
+root_dir = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
+# Directory that holds the pdf attachments.
+attachment_dir = os.path.join(root_dir, "public/attachments/pdf")
+# Pdf processed by test_extract_text_location().
+pic_pdf_path = os.path.join(attachment_dir, "whatispython.pdf")
+
+
+def test_extract_text_location():
+    """Run the extraction on the pdf at ``pic_pdf_path``."""
+    extractor = ExtractTextLocation(pic_pdf_path)
+    extractor.extract_text_location()
+
+
+if __name__ == "__main__":
+    # Command-line entry point: -i/--input names the pdf file to process.
+    cli = argparse.ArgumentParser(description="Extract text location from pdf file.")
+    cli.add_argument("-i", "--input", help="input pdf file", required=True)
+    options = cli.parse_args()
+
+    ExtractTextLocation(options.input).extract_text_location()
diff --git a/src/pdf_parse/single_pdf_parse.py b/src/pdf_parse/single_pdf_parse.py
@@ -35,6 +35,15 @@ def export_as_images(self) -> List[str]:
         # return empty
         return []
 
+    def extract_text(self) -> str:
+        """Extract text from pdf file.
+
+        :return: the text of all pages, concatenated in page order
+        """
+        reader = PdfReader(self.pdf_file)
+        # join() builds the result in one pass instead of repeated string "+=".
+        return "".join(page.extract_text() for page in reader.pages)
+
     def add_annotation(self, annotation_list: list) -> str:
         """Fill the writer with the pages you want.
 
@@ -83,6 +92,7 @@ def check_out_path(self):
 root_dir = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
 attachment_dir = os.path.join(root_dir, "public/attachments/pdf")
 pic_pdf_path = os.path.join(attachment_dir, "samplepic.pdf")
+# Pdf read by test_extract_text().
+pic_pdf_path2 = os.path.join(attachment_dir, "whatispython.pdf")
 
 
 def test_export_as_images() -> None:
@@ -91,6 +101,13 @@ def test_export_as_images() -> None:
     extract_pdf_images.export_as_images()
 
 
+def test_extract_text() -> None:
+    """Print the text extracted from the pdf at ``pic_pdf_path2``."""
+    pdf_parser = SinglePdfParse(pic_pdf_path2)
+    print(pdf_parser.extract_text())
+
+
 def test_add_annotation() -> None:
     """Add pdf annotiation demo."""
     # Create the annotation and add it

diff --git a/tests/test_pdf_parse/test_extract_text_location.py b/tests/test_pdf_parse/test_extract_text_location.py
@@ -0,0 +1,22 @@
+# -*- coding: utf-8 -*-
+import pytest
+
+from pdf_parse import extract_text_location
+
+
+def test_pdf_file_not_found():
+    """A missing input file raises FileNotFoundError with the expected message."""
+    with pytest.raises(FileNotFoundError) as exc_info:
+        extract_text_location.ExtractTextLocation("test.pdf")
+    assert exc_info.value.args[0] == "file test.pdf not found."
+
+
+def test_not_pdf_file():
+    """An existing file without a pdf extension raises ValueError."""
+    with pytest.raises(ValueError) as e:
+        extract_text_location.ExtractTextLocation(__file__)
+    exec_msg = e.value.args[0]
+    # str.index() raises ValueError rather than returning -1, so comparing its
+    # result with -1 can never fail; a membership test states the intent.
+    assert "not pdf file" in exec_msg
+
+
+def test_extract_text_location():
+    """Run the demo function defined in the extract_text_location module."""
+    extract_text_location.test_extract_text_location()
diff --git a/tests/test_pdf_parse/test_single_pdf_parse.py b/tests/test_pdf_parse/test_single_pdf_parse.py
@@ -31,6 +31,10 @@ def test_export_as_images_demo():
     single_pdf_parse.test_export_as_images()
 
 
+def test_export_as_text():
+    """Run the text extraction demo defined in the single_pdf_parse module."""
+    single_pdf_parse.test_extract_text()
+
+
 # test add annotation
 def test_add_annotation():
     output_pdf = os.path.join(tpp.test_dist_path, f"test_add_annotation_{str(int(datetime.now().timestamp() * 1000 ))}")