Skip to content

Commit

Permalink
Tests complete with extraction and filtering of repeated data
Browse files Browse the repository at this point in the history
  • Loading branch information
enoch3712 committed Jun 5, 2024
1 parent 555d4f1 commit d2e3a72
Show file tree
Hide file tree
Showing 6 changed files with 79 additions and 66 deletions.
2 changes: 2 additions & 0 deletions extract_thinker/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from .document_loader.cached_document_loader import CachedDocumentLoader
from .document_loader.document_loader_tesseract import DocumentLoaderTesseract
from .document_loader.document_loader_spreadsheet import DocumentLoaderSpreadSheet
from .document_loader.document_loader_azure_document_intelligence import DocumentLoaderAzureForm
from .document_loader.document_loader_pypdf import DocumentLoaderPyPdf
from .document_loader.document_loader_text import DocumentLoaderText
from .models import classification, classification_response
Expand All @@ -18,6 +19,7 @@
'DocumentLoader',
'CachedDocumentLoader',
'DocumentLoaderTesseract',
'DocumentLoaderAzureForm',
'DocumentLoaderPyPdf',
'DocumentLoaderText',
'classification',
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@


class DocumentLoaderAzureForm(CachedDocumentLoader):
def __init__(self, subscription_key: str, endpoint: str, is_container: bool = False, content: Any = None, cache_ttl: int = 300):
def __init__(self, subscription_key: str, endpoint: str, content: Any = None, cache_ttl: int = 300):
super().__init__(content, cache_ttl)
self.subscription_key = subscription_key
self.endpoint = endpoint
Expand Down Expand Up @@ -42,33 +42,50 @@ def process_result(self, result: AnalyzeResult) -> List[dict]:
for page in result.pages:
paragraphs = [p.content for p in page.lines]
tables = self.build_tables(result.tables)
words_with_locations = self.process_words(page)
# words_with_locations = self.process_words(page)
# Remove lines that are present in tables
paragraphs = self.remove_lines_present_in_tables(paragraphs, tables)
output = {
"type": "pdf",
"content": result.content,
#"content": result.content,
"paragraphs": paragraphs,
"words": words_with_locations,
"tables": tables
#"words": words_with_locations,
"tables": tables.get(page.page_number, [])
}
extract_results.append(output)
return extract_results
return {"pages": extract_results}

def remove_lines_present_in_tables(self, paragraphs: List[str], tables: dict[int, List[List[str]]]) -> List[str]:
    """Drop paragraph lines that duplicate table-cell text.

    For every cell occurrence across all tables, at most one matching
    entry is removed from ``paragraphs`` (first match wins). The list is
    mutated in place and also returned for convenience.
    """
    # Flatten the per-page tables into a single stream of cell strings.
    all_cells = (
        cell
        for rows in tables.values()
        for row in rows
        for cell in row
    )
    for cell in all_cells:
        if cell in paragraphs:
            paragraphs.remove(cell)
    return paragraphs

def page_to_string(self, page: DocumentPage) -> str:
page_string = ""
for word in page.words:
for point in word.polygon:
page_string += f"({point.x}, {point.y}): {word.content}\n"
return page_string

def process_words(self, page: DocumentPage) -> List[dict]:
words_with_locations = []
for line in page.lines:
for word in line.words:
word_info = {
"content": word.content,
"bounding_box": {
"points": self.build_points(word.bounding_box)
},
"page_number": page.page_number
}
words_with_locations.append(word_info)

for word in page.words:
word_info = {
"content": word.content,
"bounding_box": {
"points": word.polygon
},
"page_number": page.page_number
}
words_with_locations.append(word_info)

return words_with_locations

def build_tables(self, tables: List[DocumentTable]) -> List[List[str]]:
table_data = []
def build_tables(self, tables: List[DocumentTable]) -> dict[int, List[List[str]]]:
table_data = {}
for table in tables:
rows = []
for row_idx in range(table.row_count):
Expand All @@ -77,7 +94,8 @@ def build_tables(self, tables: List[DocumentTable]) -> List[List[str]]:
if cell.row_index == row_idx:
row.append(cell.content)
rows.append(row)
table_data.append(rows)
# Use the page number as the key for the dictionary
table_data[table.bounding_regions[0].page_number] = rows
return table_data

def build_points(self, bounding_box: List[Point]) -> List[dict]:
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "extract_thinker"
version = "0.0.3"
version = "0.0.4"
description = "Library to extract data from files and documents agnostically using LLMs"
authors = ["Júlio Almeida <[email protected]>"]
readme = "README.md"
Expand Down
51 changes: 6 additions & 45 deletions tests/document_loader_azure_document_intelligence.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,5 @@
import os

import sys
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from io import BytesIO
from dotenv import load_dotenv
import pytest
from azure.core.exceptions import AzureError

from extract_thinker.document_loader.document_loader_azure_document_intelligence import DocumentLoaderAzureForm

Expand All @@ -17,48 +10,16 @@
subscription_key = os.getenv("AZURE_SUBSCRIPTION_KEY")
endpoint = os.getenv("AZURE_ENDPOINT")
loader = DocumentLoaderAzureForm(subscription_key, endpoint)
test_file_path = os.path.join(cwd, "tests", "test_documents", "invoice.pdf")
test_file_path = os.path.join(cwd, "test_images", "invoice.png")


def test_load_content_from_file():
# Act
try:
content = loader.load_content_from_file("C:\\Users\\Lopez\\Downloads\\LNKD_INVOICE_7894414780.pdf")
except AzureError as e:
pytest.fail(f"AzureError occurred: {e}")

# Assert
assert content is not None
assert isinstance(content, list)
assert len(content) > 0


def test_load_content_from_stream():
with open(test_file_path, 'rb') as f:
test_document_stream = BytesIO(f.read())

# Act
try:
content = loader.load_content_from_stream(test_document_stream)
except AzureError as e:
pytest.fail(f"AzureError occurred: {e}")

# Assert
assert content is not None
assert isinstance(content, list)
assert len(content) > 0
content = loader.load_content_from_file(test_file_path)


def test_cache_for_file():
# Act
try:
content1 = loader.load_content_from_file(test_file_path)
content2 = loader.load_content_from_file(test_file_path)
except AzureError as e:
pytest.fail(f"AzureError occurred: {e}")
firstPage = content["pages"][0]

# Assert
assert content1 is content2


test_load_content_from_file()
assert firstPage is not None
assert firstPage["paragraphs"][0] == "Invoice 0000001"
assert len(firstPage["tables"][0]) == 4
22 changes: 22 additions & 0 deletions tests/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from extract_thinker.extractor import Extractor
from extract_thinker.document_loader.document_loader_tesseract import DocumentLoaderTesseract
from tests.models.invoice import InvoiceContract
from extract_thinker.document_loader.document_loader_azure_document_intelligence import DocumentLoaderAzureForm

load_dotenv()
cwd = os.getcwd()
Expand All @@ -28,3 +29,24 @@ def test_extract_with_tessaract_and_claude():
assert result is not None
assert result.invoice_number == "0000001"
assert result.invoice_date == "2014-05-07"


def test_extract_with_azure_di_and_claude():
    """End-to-end extraction: Azure Document Intelligence OCR + Claude LLM.

    Requires AZURE_SUBSCRIPTION_KEY / AZURE_ENDPOINT in the environment and
    the invoice fixture image; asserts the first invoice line's fields.
    """
    # Arrange: credentials from the environment, fixture from the repo.
    key = os.getenv("AZURE_SUBSCRIPTION_KEY")
    azure_endpoint = os.getenv("AZURE_ENDPOINT")
    invoice_path = os.path.join(cwd, "test_images", "invoice.png")

    sut = Extractor()
    sut.load_document_loader(DocumentLoaderAzureForm(key, azure_endpoint))
    sut.load_llm("claude-3-haiku-20240307")

    # Act
    result = sut.extract(invoice_path, InvoiceContract)

    # Assert: first line item matches the known fixture values.
    assert result is not None
    first_line = result.lines[0]
    assert first_line.description == "Website Redesign"
    assert first_line.quantity == 1
    assert first_line.unit_price == 2500
    assert first_line.amount == 2500
10 changes: 10 additions & 0 deletions tests/models/invoice.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,16 @@
from typing import List
from extract_thinker.models.contract import Contract


class LinesContract(Contract):
    """A single invoice line item extracted from a document."""

    description: str  # free-text description of the billed item/service
    quantity: int
    unit_price: int  # per-unit price; integer in the test fixtures (e.g. 2500)
    amount: int  # line total — presumably quantity * unit_price; confirm


class InvoiceContract(Contract):
    """Structured invoice data targeted by the extractor."""

    invoice_number: str
    invoice_date: str  # kept as a string, e.g. "2014-05-07" in the tests
    lines: List[LinesContract]  # individual line items on the invoice
    total_amount: int

0 comments on commit d2e3a72

Please sign in to comment.