forked from Filimoa/open-parse
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- 0.7.4-file_URL-3
- 0.7.4-file_URL-2
- 0.7.4-file_URL
- 0.7.3-markitdown_10
- 0.7.3-markitdown_9
- 0.7.3-markitdown_8
- 0.7.3-markitdown_7
- 0.7.3-markitdown_6
- 0.7.3-markitdown_5
- 0.7.3-markitdown_4
- 0.7.3-markitdown_3
- 0.7.3-markitdown_2
- 0.7.3-markitdown
- 0.7.3-markitdown_zip
- 0.7.2-cloudflare_12
- 0.7.2-cloudflare_11
- 0.7.2-cloudflare_10
- 0.7.2-cloudflare_9
- 0.7.2-cloudflare_8
- 0.7.1-text-only_3
- 0.7.1-ollama_21
Showing
19 changed files
with
1,010 additions
and
178 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
from typing import List, TypedDict, Union | ||
from dataclasses import dataclass | ||
|
||
from src.schemas import Node | ||
from src.postprocessing.steps import PostProcessingStep, default_pipeline | ||
|
||
|
||
class ProcessingArgs(TypedDict, total=False): | ||
min_tokens: int | ||
max_tokens: int | ||
processing_pipeline: List[PostProcessingStep] | ||
|
||
|
||
@dataclass | ||
class ParsedProcessingArgs: | ||
min_tokens: float = 128 | ||
max_tokens: float = 1024 | ||
processing_pipeline: List[PostProcessingStep] = default_pipeline | ||
|
||
|
||
def merge_with_defaults(user_args: Union[ProcessingArgs, None]) -> ParsedProcessingArgs: | ||
args = ParsedProcessingArgs() | ||
|
||
if user_args: | ||
for key, value in user_args.items(): | ||
if hasattr(args, key): | ||
setattr(args, key, value) | ||
|
||
return args | ||
|
||
|
||
def run_pipeline(nodes: List[Node], args: Union[ProcessingArgs, None]) -> List[Node]: | ||
parsed_args = merge_with_defaults(args) | ||
for transform in parsed_args.processing_pipeline: | ||
nodes = transform.process(nodes) | ||
return nodes |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
from typing import Optional, List, Sequence, Literal, TypedDict, Union | ||
from abc import ABC, abstractmethod | ||
from collections import defaultdict | ||
from dataclasses import dataclass | ||
|
||
from src.schemas import Node | ||
|
||
|
||
class ProcessingStep(ABC): | ||
@abstractmethod | ||
def process(self, nodes: List[Node]) -> List[Node]: | ||
""" | ||
Process a list of Nodes and return a modified list of Nodes. | ||
""" | ||
raise NotImplementedError("Subclasses must implement this method.") | ||
|
||
|
||
class CombineNodesSplitAcrossPages(ProcessingStep): | ||
def __init__(self, openai_client: Optional[] = None): | ||
self.max_area_pct = max_area_pct | ||
|
||
def process(self, nodes: List[Node]) -> List[Node]: | ||
res = [] | ||
for node in nodes: | ||
node_bbox = node.bbox[0] | ||
page_area = node_bbox.page_width * node_bbox.page_height | ||
|
||
if node.num_pages > 1: | ||
res.append(node) | ||
continue | ||
elif node_bbox.area / page_area < self.max_area_pct: | ||
res.append(node) | ||
continue | ||
elif not node.is_stub: | ||
res.append(node) | ||
continue | ||
return res | ||
|
||
|
||
default_pipeline = [ | ||
CombineNodesSplitAcrossPages(), | ||
# CombineBullets(), | ||
# CombineHeadingsWithClosestText(), | ||
] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
from ..parse import ingest, TableTransformersArgs |
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
from typing import Union, List, Optional, Literal, Tuple, Any, Sequence | ||
|
||
from pydantic import BaseModel, model_validator | ||
|
||
|
||
############### | ||
### SCHEMAS ### | ||
############### | ||
|
||
Size = Tuple[int, int] | ||
BBox = Tuple[float, float, float, float] | ||
|
||
|
||
class _TableCellModelOutput(BaseModel): | ||
label: Literal[ | ||
"table spanning cell", | ||
"table row", | ||
"table column", | ||
"table", | ||
"table column header", | ||
"table projected row header", # WHAT IS THIS | ||
] | ||
confidence: float | ||
bbox: BBox # note: image coordinates | ||
|
||
@property | ||
def is_header(self) -> bool: | ||
return self.label in ["table column header", "table projected row header"] | ||
|
||
@property | ||
def is_row(self) -> bool: | ||
return self.label in ["table row", "table spanning cell"] | ||
|
||
@property | ||
def is_column(self) -> bool: | ||
return self.label in ["table column"] | ||
|
||
|
||
class _TableModelOutput(BaseModel): | ||
label: Literal["table", "table rotated"] | ||
confidence: float | ||
bbox: BBox # note: image coordinates |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -62,4 +62,3 @@ def ingest( | |
) | ||
) | ||
return elements | ||
# return [Node(elements=[e]) for e in elements] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,130 @@ | ||
from typing import List | ||
|
||
from src.schemas import TextElement, LineElement, Bbox, TextSpan | ||
|
||
from typing import List, Any, Iterable | ||
from pdfminer.high_level import extract_pages | ||
from pdfminer.layout import LTTextContainer, LTChar, LTTextLine | ||
from pydantic import BaseModel, model_validator | ||
|
||
|
||
class CharElement(BaseModel): | ||
text: str | ||
fontname: str | ||
size: float | ||
|
||
@property | ||
def is_bold(self) -> bool: | ||
return "Bold" in self.fontname or "bold" in self.fontname | ||
|
||
@property | ||
def is_italic(self) -> bool: | ||
return "Italic" in self.fontname or "italic" in self.fontname | ||
|
||
@model_validator(mode="before") | ||
@classmethod | ||
def round_size(cls, data: Any) -> Any: | ||
data["size"] = round(data["size"], 2) | ||
return data | ||
|
||
|
||
def extract_chars(text_line: LTTextLine) -> List[CharElement]: | ||
return [ | ||
CharElement(text=char.get_text(), fontname=char.fontname, size=char.size) | ||
for char in text_line | ||
if isinstance(char, LTChar) | ||
] | ||
|
||
|
||
def group_chars_into_spans(chars: Iterable[CharElement]) -> List[TextSpan]: | ||
spans = [] | ||
current_text = "" | ||
current_style = (False, False, 0.0) | ||
|
||
for char in chars: | ||
char_style = (char.is_bold, char.is_italic, char.size) | ||
# If the current character is a space, compress multiple spaces and continue loop. | ||
if char.text.isspace(): | ||
if not current_text.endswith(" "): | ||
current_text += " " | ||
continue | ||
|
||
# If style changes and there's accumulated text, add it to spans. | ||
if char_style != current_style and current_text: | ||
# Ensure there is at most one space at the end of the text. | ||
spans.append( | ||
TextSpan( | ||
text=current_text.rstrip() | ||
+ (" " if current_text.endswith(" ") else ""), | ||
is_bold=current_style[0], | ||
is_italic=current_style[1], | ||
size=current_style[2], | ||
) | ||
) | ||
current_text = char.text | ||
else: | ||
current_text += char.text | ||
current_style = char_style | ||
|
||
# After the loop, add any remaining text as a new span. | ||
if current_text: | ||
spans.append( | ||
TextSpan( | ||
text=current_text.rstrip() | ||
+ (" " if current_text.endswith(" ") else ""), | ||
is_bold=current_style[0], | ||
is_italic=current_style[1], | ||
size=current_style[2], | ||
) | ||
) | ||
return spans | ||
|
||
|
||
def create_line_element(text_line: LTTextLine) -> LineElement: | ||
"""Create a LineElement from a text line.""" | ||
chars = extract_chars(text_line) | ||
spans = group_chars_into_spans(chars) | ||
bbox = (text_line.x0, text_line.y0, text_line.x1, text_line.y1) | ||
return LineElement(bbox=bbox, spans=spans) | ||
|
||
|
||
def get_bbox(lines: List[LineElement]) -> tuple[float, float, float, float]: | ||
"""Get the bounding box of a list of LineElements.""" | ||
x0 = min(line.bbox[0] for line in lines) | ||
y0 = min(line.bbox[1] for line in lines) | ||
x1 = max(line.bbox[2] for line in lines) | ||
y1 = max(line.bbox[3] for line in lines) | ||
return x0, y0, x1, y1 | ||
|
||
|
||
def ingest(file_path: str) -> List[TextElement]: | ||
"""Parse PDF and return a list of LineElement objects.""" | ||
elements = [] | ||
for page_num, page_layout in enumerate(extract_pages(file_path)): | ||
page_width = page_layout.width | ||
page_height = page_layout.height | ||
for element in page_layout: | ||
if isinstance(element, LTTextContainer): | ||
lines = [] | ||
for text_line in element: | ||
if isinstance(text_line, LTTextLine): | ||
lines.append(create_line_element(text_line)) | ||
|
||
bbox = get_bbox(lines) | ||
elements.append( | ||
TextElement( | ||
bbox=Bbox( | ||
x0=bbox[0], | ||
y0=bbox[1], | ||
x1=bbox[2], | ||
y1=bbox[3], | ||
page=page_num, | ||
page_width=page_width, | ||
page_height=page_height, | ||
), | ||
text="\n".join(line.text for line in lines), | ||
lines=lines, | ||
) | ||
) | ||
|
||
return elements |
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,97 @@ | ||
from typing import List | ||
import fitz | ||
|
||
from src.schemas import TextElement, LineElement, Bbox, TextSpan | ||
|
||
|
||
def flags_decomposer(flags: int) -> str: | ||
"""Make font flags human readable.""" | ||
l = [] | ||
if flags & 2**0: | ||
l.append("superscript") | ||
if flags & 2**1: | ||
l.append("italic") | ||
if flags & 2**2: | ||
l.append("serifed") | ||
else: | ||
l.append("sans") | ||
if flags & 2**3: | ||
l.append("monospaced") | ||
else: | ||
l.append("proportional") | ||
if flags & 2**4: | ||
l.append("bold") | ||
return ", ".join(l) | ||
|
||
|
||
def is_bold(flags) -> bool: | ||
return bool(flags & 2**4) | ||
|
||
|
||
def is_italic(flags) -> bool: | ||
return bool(flags & 2**1) | ||
|
||
|
||
def _lines_from_ocr_output(lines: dict, error_margin: float = 0) -> list[LineElement]: | ||
""" | ||
Creates LineElement objects from given lines, combining overlapping ones. | ||
""" | ||
combined: list[LineElement] = [] | ||
|
||
for line in lines: | ||
bbox = line["bbox"] | ||
spans = [ | ||
TextSpan( | ||
text=span["text"], | ||
is_bold=is_bold(span["flags"]), | ||
is_italic=is_italic(span["flags"]), | ||
size=span["size"], | ||
) | ||
for span in line["spans"] | ||
] | ||
|
||
line_element = LineElement(bbox=bbox, spans=spans) | ||
for i, other in enumerate(combined): | ||
overlaps = line_element.overlaps(other, error_margin=error_margin) | ||
similar_height = line_element.is_at_similar_height( | ||
other, error_margin=error_margin | ||
) | ||
|
||
if overlaps and similar_height: | ||
combined[i] = line_element.combine(other) | ||
break | ||
else: | ||
combined.append(line_element) | ||
|
||
return combined | ||
|
||
|
||
def ingest( | ||
doc: fitz.Document, | ||
) -> List[TextElement]: | ||
"""Parses text elements from a given pdf document.""" | ||
elements = [] | ||
for page_num, page in enumerate(doc): | ||
page_ocr = page.get_textpage_ocr(flags=0, full=False) | ||
for node in page.get_text("dict", textpage=page_ocr, sort=True)["blocks"]: | ||
if node["type"] != 0: | ||
continue | ||
|
||
lines = _lines_from_ocr_output(node["lines"]) | ||
|
||
elements.append( | ||
TextElement( | ||
bbox=Bbox( | ||
x0=node["bbox"][0], | ||
y0=node["bbox"][1], | ||
x1=node["bbox"][2], | ||
y1=node["bbox"][3], | ||
page=page_num, | ||
page_width=page.rect.width, | ||
page_height=page.rect.height, | ||
), | ||
text="\n".join(line.text for line in lines), | ||
lines=lines, | ||
) | ||
) | ||
return elements |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,396 @@ | ||
from src.tables.table_transformers.schemas import _TableCellModelOutput | ||
from src.tables.table_transformers import ml | ||
|
||
# from src.tables.schemas import _TableCellModelOutput | ||
|
||
|
||
# evals/data/tables/naic-numerical-list-of-companies-page-94.pdf | ||
sample_get_table_content_output = [ | ||
_TableCellModelOutput( | ||
label="table row", | ||
confidence=0.9939164519309998, | ||
bbox=( | ||
35.288272164084674, | ||
408.22346635298294, | ||
690.0443794944069, | ||
424.6131758256392, | ||
), | ||
), | ||
_TableCellModelOutput( | ||
label="table row", | ||
confidence=0.9996691942214966, | ||
bbox=( | ||
35.15542533180928, | ||
490.4657454057173, | ||
690.3566963889382, | ||
506.47007890181106, | ||
), | ||
), | ||
_TableCellModelOutput( | ||
label="table row", | ||
confidence=0.99506676197052, | ||
bbox=( | ||
35.129248879172565, | ||
160.296967939897, | ||
690.2205879905007, | ||
176.68047471479935, | ||
), | ||
), | ||
_TableCellModelOutput( | ||
label="table row", | ||
confidence=0.9961244463920593, | ||
bbox=( | ||
35.17559745094991, | ||
375.1639875932173, | ||
690.0448677756569, | ||
391.70527787642044, | ||
), | ||
), | ||
_TableCellModelOutput( | ||
label="table column", | ||
confidence=0.9988767504692078, | ||
bbox=( | ||
452.4295723655007, | ||
105.89149613813919, | ||
497.28964926979756, | ||
519.6490339799361, | ||
), | ||
), | ||
_TableCellModelOutput( | ||
label="table column", | ||
confidence=0.9969657063484192, | ||
bbox=( | ||
360.2500069358132, | ||
105.74062486128372, | ||
410.5702278830788, | ||
519.9760298295455, | ||
), | ||
), | ||
_TableCellModelOutput( | ||
label="table row", | ||
confidence=0.9950108528137207, | ||
bbox=( | ||
35.18905188820577, | ||
391.7489180131392, | ||
690.0088570334694, | ||
408.2126936479048, | ||
), | ||
), | ||
_TableCellModelOutput( | ||
label="table column", | ||
confidence=0.9994648098945618, | ||
bbox=( | ||
411.57538535378194, | ||
105.81761880354446, | ||
452.10687949440694, | ||
519.731522993608, | ||
), | ||
), | ||
_TableCellModelOutput( | ||
label="table row", | ||
confidence=0.9994288086891174, | ||
bbox=( | ||
35.216128609397174, | ||
325.67996354536575, | ||
690.3316109397194, | ||
342.2242598100142, | ||
), | ||
), | ||
_TableCellModelOutput( | ||
label="table row", | ||
confidence=0.9953063130378723, | ||
bbox=( | ||
35.20100333473897, | ||
143.6793379350142, | ||
690.1937935569069, | ||
160.27500291304153, | ||
), | ||
), | ||
_TableCellModelOutput( | ||
label="table row", | ||
confidence=0.9981030225753784, | ||
bbox=( | ||
35.22977759621358, | ||
123.52590699629349, | ||
690.1675484397194, | ||
143.69812150435013, | ||
), | ||
), | ||
_TableCellModelOutput( | ||
label="table column", | ||
confidence=0.9999344348907471, | ||
bbox=( | ||
35.117327950217486, | ||
105.96672578291458, | ||
206.06037070534444, | ||
519.4698347611861, | ||
), | ||
), | ||
_TableCellModelOutput( | ||
label="table row", | ||
confidence=0.9996514320373535, | ||
bbox=( | ||
34.91754843971944, | ||
209.7849287553267, | ||
690.2598946311257, | ||
226.38141007856888, | ||
), | ||
), | ||
_TableCellModelOutput( | ||
label="table row", | ||
confidence=0.9987955093383789, | ||
bbox=( | ||
35.13444068215108, | ||
276.3870100541548, | ||
690.0095894553444, | ||
292.8090986772017, | ||
), | ||
), | ||
_TableCellModelOutput( | ||
label="table row", | ||
confidence=0.9938255548477173, | ||
bbox=( | ||
35.265181801535846, | ||
424.74266190962356, | ||
690.0817940451882, | ||
441.19603105024856, | ||
), | ||
), | ||
_TableCellModelOutput( | ||
label="table row", | ||
confidence=0.9993770718574524, | ||
bbox=( | ||
35.005008003928424, | ||
226.48010392622513, | ||
690.1912300803444, | ||
242.9812483354048, | ||
), | ||
), | ||
_TableCellModelOutput( | ||
label="table row", | ||
confidence=0.9996106028556824, | ||
bbox=( | ||
34.99921347878194, | ||
176.78753800825638, | ||
690.1875069358132, | ||
193.39771409468216, | ||
), | ||
), | ||
_TableCellModelOutput( | ||
label="table row", | ||
confidence=0.999058187007904, | ||
bbox=( | ||
35.12911917946553, | ||
474.03270097212356, | ||
690.1881783225319, | ||
490.56614823774856, | ||
), | ||
), | ||
_TableCellModelOutput( | ||
label="table row", | ||
confidence=0.999068558216095, | ||
bbox=( | ||
35.018870613791705, | ||
243.06179948286575, | ||
690.3463204123757, | ||
259.6156630082564, | ||
), | ||
), | ||
_TableCellModelOutput( | ||
label="table row", | ||
confidence=0.9995618462562561, | ||
bbox=( | ||
35.03798987648702, | ||
193.34961076216263, | ||
690.2454903342507, | ||
209.94493241743606, | ||
), | ||
), | ||
_TableCellModelOutput( | ||
label="table row", | ||
confidence=0.9995167255401611, | ||
bbox=( | ||
35.13354041359639, | ||
292.3361525102095, | ||
690.2454292990944, | ||
308.8550428910689, | ||
), | ||
), | ||
_TableCellModelOutput( | ||
label="table row", | ||
confidence=0.9995043277740479, | ||
bbox=( | ||
35.23306205055928, | ||
105.57804055647415, | ||
690.1783516623757, | ||
123.37710710005325, | ||
), | ||
), | ||
_TableCellModelOutput( | ||
label="table column", | ||
confidence=0.9793805480003357, | ||
bbox=( | ||
496.0791084983132, | ||
105.98925919966263, | ||
521.5678169944069, | ||
519.3316511674361, | ||
), | ||
), | ||
_TableCellModelOutput( | ||
label="table row", | ||
confidence=0.9991280436515808, | ||
bbox=( | ||
35.10216834328389, | ||
341.9986738725142, | ||
690.5628731467507, | ||
358.4926008744673, | ||
), | ||
), | ||
_TableCellModelOutput( | ||
label="table column", | ||
confidence=0.9999399185180664, | ||
bbox=( | ||
521.5474922873757, | ||
105.70714898542923, | ||
690.0865547873757, | ||
519.495927290483, | ||
), | ||
), | ||
_TableCellModelOutput( | ||
label="table row", | ||
confidence=0.9976800084114075, | ||
bbox=( | ||
35.068866036154986, | ||
506.24861283735794, | ||
690.4476998069069, | ||
519.2454695268111, | ||
), | ||
), | ||
_TableCellModelOutput( | ||
label="table column header", | ||
confidence=0.9989497065544128, | ||
bbox=( | ||
35.182784340598346, | ||
105.63359590010208, | ||
690.2292549826882, | ||
123.22541184858841, | ||
), | ||
), | ||
_TableCellModelOutput( | ||
label="table row", | ||
confidence=0.9968582391738892, | ||
bbox=( | ||
35.039038918235065, | ||
358.5333418412642, | ||
690.1406319358132, | ||
375.10054154829544, | ||
), | ||
), | ||
_TableCellModelOutput( | ||
label="table column", | ||
confidence=0.9961622953414917, | ||
bbox=( | ||
206.15311362526631, | ||
105.64504189924759, | ||
361.92600180886006, | ||
519.362382368608, | ||
), | ||
), | ||
_TableCellModelOutput( | ||
label="table row", | ||
confidence=0.9984205961227417, | ||
bbox=( | ||
35.26417472145772, | ||
457.65477128462356, | ||
690.0900337912819, | ||
474.10359330610794, | ||
), | ||
), | ||
_TableCellModelOutput( | ||
label="table row", | ||
confidence=0.9983760118484497, | ||
bbox=( | ||
34.888621590354205, | ||
259.36580033735794, | ||
690.3048165061257, | ||
275.8905958695845, | ||
), | ||
), | ||
_TableCellModelOutput( | ||
label="table row", | ||
confidence=0.9978849291801453, | ||
bbox=( | ||
35.130687020041705, | ||
441.1501936479048, | ||
690.1639473655007, | ||
457.64375443892044, | ||
), | ||
), | ||
_TableCellModelOutput( | ||
label="table spanning cell", | ||
confidence=0.9597509503364563, | ||
bbox=( | ||
32.47110869667745, | ||
130.52832932905716, | ||
209.55084922096944, | ||
516.4211592240767, | ||
), | ||
), | ||
_TableCellModelOutput( | ||
label="table row", | ||
confidence=0.99912029504776, | ||
bbox=( | ||
35.06736304543233, | ||
308.5428633256392, | ||
690.4191963889382, | ||
325.1110243363814, | ||
), | ||
), | ||
_TableCellModelOutput( | ||
label="table", | ||
confidence=0.9999959468841553, | ||
bbox=( | ||
35.22020652077413, | ||
105.64571900801224, | ||
690.1153023459694, | ||
519.1645063920455, | ||
), | ||
), | ||
] | ||
|
||
|
||
# def test_table_from_model_outputs(): | ||
# image_size = (792, 612) | ||
# page_size = (792.0, 612.0) | ||
# table_bbox = ( | ||
# 56.02, | ||
# 180.17, | ||
# 702.35, | ||
# 460.68, | ||
# ) | ||
# table_cells = sample_get_table_content_output | ||
|
||
# headers = [ | ||
# cell | ||
# for cell in table_cells | ||
# if cell.is_header and cell.confidence > ml.MIN_CELL_CONFIDENCE | ||
# ] | ||
# rows = [ | ||
# cell | ||
# for cell in table_cells | ||
# if cell.is_row and cell.confidence > ml.MIN_CELL_CONFIDENCE | ||
# ] | ||
# cols = [ | ||
# cell | ||
# for cell in table_cells | ||
# if cell.is_column and cell.confidence > ml.MIN_CELL_CONFIDENCE | ||
# ] | ||
|
||
# assert len(headers) == 1 | ||
# header_objs = ml._preprocess_header_cells(headers, cols, image_size, page_size) | ||
# assert len(header_objs) == 1 | ||
|
||
# assert len(rows) == 26 # 24 rows + 1 spanning cell | ||
# row_objs = ml._process_row_cells(rows, cols, header_objs, image_size, page_size) | ||
# # row_objs = ml._drop_duplicates(row_objs, threshold=0.3) | ||
# assert len(row_objs) == 25 # 24 rows + 1 spanning cell |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,86 @@ | ||
from src.text.pdfminer.core import CharElement, extract_chars, group_chars_into_spans | ||
from src.schemas import TextSpan | ||
|
||
|
||
raw_chars = [ | ||
CharElement(text="1", fontname="bold", size=9.0), | ||
CharElement(text=".", fontname="bold", size=9.0), | ||
CharElement(text=" ", fontname="bold", size=9.0), | ||
CharElement(text="P", fontname="bold", size=9.0), | ||
CharElement(text="A", fontname="bold", size=9.0), | ||
CharElement(text="R", fontname="bold", size=9.0), | ||
CharElement(text="T", fontname="bold", size=9.0), | ||
CharElement(text="I", fontname="bold", size=9.0), | ||
CharElement(text="E", fontname="bold", size=9.0), | ||
CharElement(text="S", fontname="bold", size=9.0), | ||
CharElement(text=":", fontname="bold", size=9.0), | ||
CharElement(text=" ", fontname="", size=9.0), | ||
CharElement(text=" ", fontname="", size=9.0), | ||
] | ||
spans = [ | ||
TextSpan(text="1. PARTIES: ", is_bold=True, is_italic=False, size=9.0), | ||
] | ||
|
||
|
||
def test_group_chars_into_spans(): | ||
# Test the basic functionality with the given raw_chars and spans | ||
result = group_chars_into_spans(raw_chars) | ||
assert len(result) == len( | ||
spans | ||
), "The number of spans should match the expected count." | ||
|
||
for result_span, expected_span in zip(result, spans): | ||
assert ( | ||
result_span.text == expected_span.text | ||
), f"Expected text '{expected_span.text}', got '{result_span.text}'" | ||
assert ( | ||
result_span.is_bold == expected_span.is_bold | ||
), f"Expected is_bold {expected_span.is_bold}, got {result_span.is_bold}" | ||
assert ( | ||
result_span.is_italic == expected_span.is_italic | ||
), f"Expected is_italic {expected_span.is_italic}, got {result_span.is_italic}" | ||
assert ( | ||
result_span.size == expected_span.size | ||
), f"Expected size {expected_span.size}, got {result_span.size}" | ||
|
||
# Test with mixed styles to ensure correct grouping | ||
mixed_chars = [ | ||
CharElement(text="H", fontname="bold", size=9.0), | ||
CharElement(text="e", fontname="italic", size=9.0), | ||
CharElement(text="l", fontname="bold", size=9.0), | ||
CharElement(text="l", fontname="bold", size=9.0), | ||
CharElement(text="o", fontname="", size=9.0), | ||
CharElement(text=" ", fontname="", size=9.0), | ||
CharElement(text="W", fontname="boldItalic", size=9.0), | ||
CharElement(text="o", fontname="boldItalic", size=9.0), | ||
CharElement(text="r", fontname="boldItalic", size=9.0), | ||
CharElement(text="l", fontname="boldItalic", size=9.0), | ||
CharElement(text="d", fontname="boldItalic", size=9.0), | ||
] | ||
mixed_spans = [ | ||
TextSpan(text="H", is_bold=True, is_italic=False, size=9.0), | ||
TextSpan(text="e", is_bold=False, is_italic=True, size=9.0), | ||
TextSpan(text="ll", is_bold=True, is_italic=False, size=9.0), | ||
TextSpan(text="o ", is_bold=False, is_italic=False, size=9.0), | ||
TextSpan(text="World", is_bold=True, is_italic=True, size=9.0), | ||
] | ||
mixed_result = group_chars_into_spans(mixed_chars) | ||
assert len(mixed_result) == len( | ||
mixed_spans | ||
), "The number of spans in mixed styles should match the expected count." | ||
|
||
for result_span, expected_span in zip(mixed_result, mixed_spans): | ||
assert ( | ||
result_span.text == expected_span.text | ||
), f"Expected text '{expected_span.text}', got '{result_span.text}' in mixed styles" | ||
assert ( | ||
result_span.is_bold == expected_span.is_bold | ||
), f"Expected is_bold {expected_span.is_bold}, got {result_span.is_bold} in mixed styles" | ||
assert ( | ||
result_span.is_italic == expected_span.is_italic | ||
), f"Expected is_italic {expected_span.is_italic}, got {result_span.is_italic} in mixed styles" | ||
assert ( | ||
result_span.size == expected_span.size | ||
), f"Expected size {expected_span.size}, got {result_span.size} in mixed styles" | ||
|
||
# Add more tests here for additional scenarios like empty inputs, inputs with only spaces, etc. |