Skip to content

Commit

Permalink
Merge pull request #117 from funnyzak/refactor/pdf
Browse files Browse the repository at this point in the history
  • Loading branch information
funnyzak authored Jun 8, 2023
2 parents 4d01566 + 0557a3c commit 18ba820
Show file tree
Hide file tree
Showing 8 changed files with 895 additions and 461 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -146,3 +146,5 @@ _cache/
**/*/*test*.jpeg
**/*/*test*.jpg
**/*/*test*.png

**/*/pdf/*.json
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,9 @@ poetry install --extras "pdf"
poetry install --with dev,test
# Only install required dependencies for production
poetry install
# Specify python version
poetry env use python3.9
```

## Usage
Expand All @@ -93,6 +96,9 @@ poetry run merge_pdf_demo
# Run project => pdf_parse: parse multi pdf to multi-layer pdf
poetry run multi_layer_pdf_demo
# Extract text and its location from a PDF into a JSON file
poetry run extract_text_from_pdf
# Debug "hello" project with ipdb3
poetry run ipdb3 ./src/hello/main.py
Expand Down
1,110 changes: 651 additions & 459 deletions poetry.lock

Large diffs are not rendered by default.

7 changes: 5 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "pyproject-starter"
version = "0.1.6"
version = "0.1.7"
description = "A template for the python project. It uses poetry for dependency management and tox for testing."
authors = ["Leon <[email protected]>"]
license = "MIT"
Expand All @@ -27,6 +27,8 @@ packages = [
hello = "hello.main:say_hello"
single_pdf_parse = { callable = "pdf_parse.single_pdf_parse:main", extras = ["pdf"] }
multi_pdf_parse = { callable = "pdf_parse.multi_pdf_parse:main", extras = ["pdf"] }
extract_text_from_pdf = { callable = "pdf_parse.extract_text_location:test_extract_text_location", extras = ["pdf"] }



add_pdf_annotation_demo = { callable = "pdf_parse.single_pdf_parse:test_add_annotation", extras = ["pdf"] }
Expand All @@ -42,10 +44,11 @@ python = "^3.8.1"
borb = { version = "^2.1.7", optional = true }
PyPDF2 = { version = "^2.12.1", optional = true }
requests = {version = "^2.28.1", optional = true}
pdfminer = {version = "^20191125", optional = true}


[tool.poetry.extras]
pdf = ["PyPDF2", "borb", "requests"]
pdf = ["PyPDF2", "borb", "requests", "pdfminer"]

[tool.poetry.group.dev]
optional = true
Expand Down
188 changes: 188 additions & 0 deletions src/pdf_parse/extract_text_location.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
# -*- coding: utf-8 -*-
# created by: leon<silenceace at gmail dot com>
# date: 2023-06-08
# license: MIT
# description: Extract text location from pdf file.
# usage: poetry run python src/pdf_parse/extract_text_location.py -i public/attachments/pdf/whatispython.pdf
# notes:

import argparse
import json
import os
import time

from pdfminer.converter import PDFPageAggregator # type: ignore
from pdfminer.layout import LAParams # type: ignore
from pdfminer.layout import LTAnno
from pdfminer.layout import LTChar
from pdfminer.layout import LTText
from pdfminer.pdfdocument import PDFDocument # type: ignore
from pdfminer.pdfinterp import PDFPageInterpreter # type: ignore
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfpage import PDFPage # type: ignore
from pdfminer.pdfparser import PDFParser # type: ignore


class ExtractTextLocation:
    """Extract text, with line and character positions, from a PDF into JSON.

    For every page the extractor records the page size and, for each text
    line, the stripped text, its location and the location, font and size of
    every non-space character. The result is written to ``<stem>.json`` in
    the output directory, where ``<stem>`` is the PDF file name without its
    extension.
    """

    def __init__(self, pdf_file, output_path=None) -> None:
        """Validate the input file and prepare the output directory.

        :param pdf_file: pdf file path
        :param output_path: output dir; defaults to the directory of ``pdf_file``
        :raises FileNotFoundError: if ``pdf_file`` does not exist
        :raises ValueError: if ``pdf_file`` does not have a ``.pdf`` extension
        """
        self.pdf_file = pdf_file
        self.output_path = output_path

        self.check_file()
        self.check_out_path()

    def check_file(self):
        """Check that the input file exists and has a ``.pdf`` extension.

        :raises FileNotFoundError: if the file does not exist
        :raises ValueError: if the file name does not end with ``.pdf``
        """
        if not os.path.exists(self.pdf_file):
            raise FileNotFoundError(f"file {self.pdf_file} not found.")
        # Compare case-insensitively so that e.g. "REPORT.PDF" is accepted too.
        if not self.pdf_file.lower().endswith(".pdf"):
            raise ValueError(f"file {self.pdf_file} is not pdf file")

    def check_out_path(self):
        """Resolve the output directory and create it if it is missing."""
        if not self.output_path:
            # dirname() is "" for a bare file name such as "doc.pdf"; fall back
            # to the current directory because os.makedirs("") raises.
            self.output_path = os.path.dirname(self.pdf_file) or os.curdir
        os.makedirs(self.output_path, exist_ok=True)

    def set_output_path(self, output_path):
        """Set output path, creating the directory if needed."""
        self.output_path = output_path
        self.check_out_path()

    @staticmethod
    def _location(item):
        """Return the rounded position and size of a layout item.

        ``item.bbox`` is ``(x0, y0, x1, y1)``:
        x0: the distance from the left side of the page to the left edge of the box.
        y0: the distance from the bottom of the page to the bottom edge of the box.
        x1: the distance from the left side of the page to the right edge of the box.
        y1: the distance from the bottom of the page to the top edge of the box.

        "left" is x0 and "top" is y1, i.e. "top" is measured from the bottom
        of the page.
        """
        return {
            "left": round(item.bbox[0], 2),
            "top": round(item.bbox[3], 2),
            "width": round(item.width, 2),
            "height": round(item.height, 2),
        }

    def parse_char_layout(self, layout):
        """Parse one page layout line by line and character by character.

        :param layout: iterable of layout items for one page
        :return: one dict per text line with the keys "words" (stripped line
            text), "location" and "chars" (one dict per non-space character)
        """
        words_result = []
        for textbox in layout:
            if not isinstance(textbox, LTText):
                continue
            for line in textbox:
                # Only real glyphs are recorded: LTAnno items and plain
                # spaces are left out of the character list.
                char_list = [
                    {
                        "char": character.get_text(),
                        "size": round(character.size, 2),
                        "font": character.fontname,
                        "location": self._location(character),
                    }
                    for character in line
                    if isinstance(character, LTChar) and character.get_text() != " "
                ]
                words_result.append(
                    {
                        "words": line.get_text().strip(),
                        "location": self._location(line),
                        "chars": char_list,
                    }
                )
        return words_result

    def extract_text_location(self, new_pdf_file=None):
        """Extract text location from pdf file and save it as JSON.

        :param new_pdf_file: optional path of another PDF to process instead
            of the current one; it is validated like the constructor argument
        :raises FileNotFoundError: if ``new_pdf_file`` does not exist
        :raises ValueError: if ``new_pdf_file`` is not a ``.pdf`` file
        """
        if new_pdf_file not in (None, ""):
            self.pdf_file = new_pdf_file
            self.check_file()
            self.check_out_path()

        # splitext() drops only the extension, so "a.b.pdf" is saved as
        # "a.b.json" rather than being truncated at the first dot.
        pdf_name = os.path.splitext(os.path.basename(self.pdf_file))[0]
        json_path = os.path.join(self.output_path, pdf_name + ".json")

        with open(self.pdf_file, "rb") as in_file:
            doc = PDFDocument(PDFParser(in_file))
            # The resource manager is shared by the device and the interpreter.
            rsrcmgr = PDFResourceManager()
            device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
            interpreter = PDFPageInterpreter(rsrcmgr, device)

            start_time = time.time()
            print("============== pdf file: ", self.pdf_file, " processing ==============")
            page_list = []
            for page_number, page in enumerate(PDFPage.create_pages(doc), start=1):
                print("================ page: ", page_number, " ==================")
                interpreter.process_page(page)
                # The device holds the layout of the page just processed.
                layout = device.get_result()
                page_list.append(
                    {
                        "page": page_number,
                        "width": round(layout.width, 2),
                        "height": round(layout.height, 2),
                        "words": self.parse_char_layout(layout),
                    }
                )

        pdf_json = {"name": pdf_name, "page_count": len(page_list), "pages": page_list}
        print("pdf_json: ", pdf_json)
        # ensure_ascii=False emits raw non-ASCII text, so the encoding is fixed
        # explicitly instead of depending on the platform default.
        with open(json_path, "w", encoding="utf-8") as out_file:
            json.dump(pdf_json, out_file, indent=4, ensure_ascii=False)
        print("save json file success. file: ", json_path)
        end_time = time.time()
        print(
            "============== pdf file: ",
            self.pdf_file,
            " process success, cost time: ",
            f"{end_time - start_time:.2f} S",
            " ==============",
        )


# Locate the bundled demo PDF relative to this module so the demo does not
# depend on the current working directory.
root_dir = os.path.dirname(__file__)  # directory containing this module
root_dir = os.path.dirname(root_dir)  # its parent
root_dir = os.path.dirname(root_dir)  # two levels above the module directory
attachment_dir = os.path.join(root_dir, "public/attachments/pdf")
pic_pdf_path = os.path.join(attachment_dir, "whatispython.pdf")


def test_extract_text_location():
    """Run the extractor on the demo PDF at ``pic_pdf_path``."""
    extractor = ExtractTextLocation(pic_pdf_path)
    extractor.extract_text_location()


if __name__ == "__main__":
    # Command-line entry point: extract the PDF given with -i/--input.
    cli = argparse.ArgumentParser(description="Extract text location from pdf file.")
    cli.add_argument("-i", "--input", help="input pdf file", required=True)
    cli_args = cli.parse_args()

    ExtractTextLocation(cli_args.input).extract_text_location()
17 changes: 17 additions & 0 deletions src/pdf_parse/single_pdf_parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,15 @@ def export_as_images(self) -> List[str]:
# return empty
return []

def extract_text(self) -> str:
    """Return the text of every page of ``self.pdf_file``, joined in page order."""
    pages = PdfReader(self.pdf_file).pages
    # No separator is inserted between the text of consecutive pages.
    return "".join(page.extract_text() for page in pages)

def add_annotation(self, annotation_list: list) -> str:
"""Fill the writer with the pages you want.
Expand Down Expand Up @@ -83,6 +92,7 @@ def check_out_path(self):
# Locate the bundled demo PDFs relative to this module so the demos do not
# depend on the current working directory.
root_dir = os.path.dirname(__file__)  # directory containing this module
root_dir = os.path.dirname(root_dir)  # its parent
root_dir = os.path.dirname(root_dir)  # two levels above the module directory
attachment_dir = os.path.join(root_dir, "public/attachments/pdf")
pic_pdf_path = os.path.join(attachment_dir, "samplepic.pdf")
pic_pdf_path2 = os.path.join(attachment_dir, "whatispython.pdf")


def test_export_as_images() -> None:
Expand All @@ -91,6 +101,13 @@ def test_export_as_images() -> None:
extract_pdf_images.export_as_images()


def test_extract_text() -> None:
    """Print the text extracted from the demo PDF at ``pic_pdf_path2``."""
    parser = SinglePdfParse(pic_pdf_path2)
    print(parser.extract_text())


def test_add_annotation() -> None:
"""Add pdf annotiation demo."""
# Create the annotation and add it
Expand Down
22 changes: 22 additions & 0 deletions tests/test_pdf_parse/test_extract_text_location.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# -*- coding: utf-8 -*-
import pytest

from pdf_parse import extract_text_location


def test_pdf_file_not_found():
    """A path that does not exist is rejected with ``FileNotFoundError``."""
    with pytest.raises(FileNotFoundError) as exc_info:
        extract_text_location.ExtractTextLocation("test.pdf")
    # Checked after the ``with`` block: statements placed after the raising
    # call inside it would never run.
    message = exc_info.value.args[0]
    assert message == "file test.pdf not found."


def test_not_pdf_file():
    """An existing file without a ``.pdf`` extension is rejected with ``ValueError``."""
    # This test module itself is a convenient existing non-PDF file.
    with pytest.raises(ValueError) as e:
        extract_text_location.ExtractTextLocation(__file__)
    exec_msg = e.value.args[0]
    # str.index() raises ValueError instead of returning -1, so comparing its
    # result with -1 can never fail as an assertion; test membership directly.
    assert "not pdf file" in exec_msg


def test_extract_text_location():
    """Smoke test: the module's extraction demo runs without raising."""
    run_demo = extract_text_location.test_extract_text_location
    run_demo()
4 changes: 4 additions & 0 deletions tests/test_pdf_parse/test_single_pdf_parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,10 @@ def test_export_as_images_demo():
single_pdf_parse.test_export_as_images()


def test_export_as_text():
    """Smoke test: the module's text-extraction demo runs without raising."""
    run_demo = single_pdf_parse.test_extract_text
    run_demo()


# test add annotation
def test_add_annotation():
output_pdf = os.path.join(tpp.test_dist_path, f"test_add_annotation_{str(int(datetime.now().timestamp() * 1000 ))}")
Expand Down

0 comments on commit 18ba820

Please sign in to comment.