Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Pdfplumber added #39

Merged
merged 1 commit into from
Oct 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions extract_thinker/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from .document_loader.document_loader_spreadsheet import DocumentLoaderSpreadSheet
from .document_loader.document_loader_azure_document_intelligence import DocumentLoaderAzureForm
from .document_loader.document_loader_pypdf import DocumentLoaderPyPdf
from .document_loader.document_loader_pdfplumber import DocumentLoaderPdfPlumber
from .models import classification, classification_response
from .process import Process, ClassificationStrategy
from .splitter import Splitter
Expand All @@ -23,6 +24,7 @@
'DocumentLoaderSpreadSheet',
'DocumentLoaderAzureForm',
'DocumentLoaderPyPdf',
'DocumentLoaderPdfPlumber',
'classification',
'classification_response',
'Process',
Expand Down
55 changes: 55 additions & 0 deletions extract_thinker/document_loader/document_loader_pdfplumber.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import io
from typing import Any, Dict, List, Union

import pdfplumber

from extract_thinker.document_loader.cached_document_loader import CachedDocumentLoader
from extract_thinker.utils import get_file_extension

# Extensions accepted by DocumentLoaderPdfPlumber.load_content_from_file; the
# value is compared against the lowercased result of get_file_extension().
SUPPORTED_FORMATS = ['pdf']

class DocumentLoaderPdfPlumber(CachedDocumentLoader):
    """Document loader that reads PDF content through ``pdfplumber``.

    Every successful load yields a dictionary with two keys:

    * ``"text"``: the text extracted from each page, split on ``'\\n'`` and
      collected into one flat list in page order.
    * ``"tables"``: the items returned by ``page.extract_tables()`` for each
      page, collected into one flat list in page order.
    """

    def __init__(self, content: Any = None, cache_ttl: int = 300):
        """Pass ``content`` and ``cache_ttl`` unchanged to the base loader."""
        super().__init__(content, cache_ttl)

    def load_content_from_file(self, file_path: str) -> Union[str, Dict[str, Any]]:
        """Open the PDF at ``file_path`` and return its extracted data.

        Args:
            file_path: Path of the file to read. Its extension, lowercased,
                must be listed in ``SUPPORTED_FORMATS``.

        Returns:
            The dictionary produced by ``extract_data_from_pdf``.

        Raises:
            Exception: For any failure, including an unsupported extension.
                The original error is chained as ``__cause__`` and its text
                is embedded in the message.
        """
        try:
            # The extension check lives inside the ``try`` on purpose so that
            # its error is re-raised with the same wrapper as any other one.
            extension = get_file_extension(file_path).lower()
            if extension not in SUPPORTED_FORMATS:
                raise Exception(f"Unsupported file type: {file_path}")

            with pdfplumber.open(file_path) as document:
                return self.extract_data_from_pdf(document)
        except Exception as e:
            raise Exception(f"Error processing file: {e}") from e

    def load_content_from_stream(self, stream: io.BytesIO) -> Union[str, Dict[str, Any]]:
        """Open a PDF from ``stream`` and return its extracted data.

        Args:
            stream: Binary stream handed directly to ``pdfplumber.open``.

        Returns:
            The dictionary produced by ``extract_data_from_pdf``.

        Raises:
            Exception: For any failure; the original error is chained as
                ``__cause__`` and its text is embedded in the message.
        """
        try:
            with pdfplumber.open(stream) as document:
                return self.extract_data_from_pdf(document)
        except Exception as e:
            raise Exception(f"Error processing stream: {e}") from e

    def extract_data_from_pdf(self, pdf: pdfplumber.PDF) -> Dict[str, Any]:
        """Collect text lines and tables from every page of ``pdf``.

        Args:
            pdf: An open pdfplumber document whose ``pages`` are iterated.

        Returns:
            A dictionary with the keys ``"text"`` and ``"tables"``, each
            mapping to a list accumulated across all pages in page order.
        """
        text_lines = []
        all_tables = []

        for current_page in pdf.pages:
            # Pages with falsy text (None or empty) add no lines.
            extracted = current_page.extract_text()
            if extracted:
                text_lines.extend(extracted.split('\n'))

            # Tables are kept as returned, one list entry per table.
            all_tables.extend(current_page.extract_tables())

        return {
            "text": text_lines,
            "tables": all_tables,
        }

    def load_content_from_file_list(self, file_paths: List[str]) -> List[Dict[str, Any]]:
        """Load each path with ``load_content_from_file``, preserving order.

        The first failure propagates and stops the remaining loads.
        """
        loaded = []
        for file_path in file_paths:
            loaded.append(self.load_content_from_file(file_path))
        return loaded

    def load_content_from_stream_list(self, streams: List[io.BytesIO]) -> List[Dict[str, Any]]:
        """Load each stream with ``load_content_from_stream``, preserving order.

        The first failure propagates and stops the remaining loads.
        """
        loaded = []
        for stream in streams:
            loaded.append(self.load_content_from_stream(stream))
        return loaded
Loading
Loading