Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

new version 2.2.7 #486

Merged
merged 6 commits into from
Aug 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/test_labeling.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,4 +31,4 @@ jobs:
python-version: '3.9'
- name: Run tests for labeling
run: |
test="true" docker-compose -f labeling/docker-compose.yml up --build --exit-code-from test
test="true" docker compose -f labeling/docker-compose.yml up --build --exit-code-from test
2 changes: 1 addition & 1 deletion .github/workflows/test_on_push.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,4 @@ jobs:
flake8 .
- name: Run tests
run: |
test="true" docker-compose up --build --exit-code-from test
test="true" docker compose up --build --exit-code-from test
8 changes: 5 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,10 @@ It extracts a document’s logical structure and content: tables, text formattin
The document’s content is represented as a tree storing headings and lists of any level.
Dedoc can be integrated into a document content and structure analysis system as a separate module.

## Workflow
## Star History
[![Star History Chart](https://api.star-history.com/svg?repos=ispras/dedoc&type=Date)](https://star-history.com/#ispras/dedoc&Date)

## Workflow
![Workflow](https://github.com/ispras/dedoc/raw/master/docs/source/_static/workflow.png)

Workflow description is given [`here`](https://dedoc.readthedocs.io/en/latest/?badge=latest#workflow)
Expand Down Expand Up @@ -136,12 +138,12 @@ cd dedoc

### 3. Build the image and run the application
```shell
docker-compose up --build
docker compose up --build
```

### 4. Run container with tests
```shell
test="true" docker-compose up --build
test="true" docker compose up --build
```

If you need to change some application settings, you may update `config.py` according to your needs and re-build the image.
Expand Down
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
2.2.6
2.2.7
2 changes: 2 additions & 0 deletions dedoc/api/api_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ class QueryParameters:
# pdf handling
pdf_with_text_layer: str = Form("auto_tabby", enum=["true", "false", "auto", "auto_tabby", "tabby"],
description="Extract text from a text layer of PDF or using OCR methods for image-like documents")
fast_textual_layer_detection: str = Form("false", enum=["true", "false"],
description="Use non-ML solution to detect textual layer. Much faster but less accurate.")
language: str = Form("rus+eng", description="Recognition language ('rus+eng', 'rus', 'eng', 'fra', 'spa')")
pages: str = Form(":", description='Page numbers range for reading PDF or images, "left:right" means read pages from left to right')
is_one_column_document: str = Form("auto", enum=["auto", "true", "false"],
Expand Down
4 changes: 4 additions & 0 deletions dedoc/api/web/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,10 @@ <h4>PDF handling</h4>
</label>
</p>

<p>
<label><input name="fast_textual_layer_detection" type="checkbox" value="true"> fast_textual_layer_detection</label>
</p>

<p>
<label> language
<input name="language" list="language" size="8" placeholder="rus+eng">
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,10 @@ class TableAnnotation(Annotation):
"""
name = "table"

def __init__(self, name: str, start: int, end: int) -> None:
def __init__(self, value: str, start: int, end: int) -> None:
"""
:param name: unique identifier of the table which is referenced inside this annotation
:param value: unique identifier of the table which is referenced inside this annotation
:param start: start of the annotated text (usually zero)
:param end: end of the annotated text (usually end of the line)
"""
super().__init__(start=start, end=end, name=TableAnnotation.name, value=name, is_mergeable=False)
super().__init__(start=start, end=end, name=TableAnnotation.name, value=value, is_mergeable=False)
3 changes: 3 additions & 0 deletions dedoc/data_structures/line_with_meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,9 @@ def uid(self) -> str:
def set_line(self, line: str) -> None:
self._line = line

def set_metadata(self, metadata: LineMetadata) -> None:
self._metadata = metadata

def __repr__(self) -> str:
return (f"LineWithMeta({self.line[:65]}, "
f"tagHL={self.metadata.tag_hierarchy_level.level_1, self.metadata.tag_hierarchy_level.level_2, self.metadata.tag_hierarchy_level.line_type})")
Expand Down
6 changes: 3 additions & 3 deletions dedoc/download_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,10 @@
"""
model_hash_dict = dict(
txtlayer_classifier="9ca1de749d8d37147b00a3a228e03ee1776c695f",
scan_orientation_efficient_net_b0="9ea283f3d346ae4fdd82463a9f60b5369a3ffb58",
scan_orientation_efficient_net_b0="c60812552a1be624476c1e5b58599867b36f8d4e",
font_classifier="db4481ad60ab050cbb42079b64f97f9e431feb07",
paragraph_classifier="c26a10193499d3cbc77ffec9842bece24fa8950b",
line_type_classifiers="0568c6e1f49612c0c351f10b80a26dc05f796683",
paragraph_classifier="97c4b78bc20d87ec7d53389e09f1ca35c6ade067",
line_type_classifiers="6ad0eacbfdea065b658cb6f039d13f75245d51ae",
fintoc_classifiers="6a907b7d2437c3f61ac9c506f67175207982fae8"
)

Expand Down
2 changes: 1 addition & 1 deletion dedoc/readers/article_reader/article_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,7 @@ def __create_line_with_refs(self, content: List[Tuple[str, Tag]], bib2uid: dict,
if subpart.get("type") == "bibr" and target in bib2uid:
annotations.append(ReferenceAnnotation(value=bib2uid[target], start=start, end=start + len(sub_text)))
if subpart.get("type") == "table" and target in table2uid:
annotations.append(TableAnnotation(name=table2uid[target], start=start, end=start + len(sub_text)))
annotations.append(TableAnnotation(value=table2uid[target], start=start, end=start + len(sub_text)))
if subpart.get("type") == "figure" and target in attachment2uid:
annotations.append(AttachAnnotation(attach_uid=attachment2uid[target], start=start, end=start + len(sub_text)))
else:
Expand Down
2 changes: 1 addition & 1 deletion dedoc/readers/docx_reader/data_structures/docx_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ def __paragraphs2lines(self, image_refs: dict, table_refs: dict, diagram_refs: d

if i in table_refs:
for table_uid in table_refs[i]:
annotation = TableAnnotation(name=table_uid, start=0, end=len(line))
annotation = TableAnnotation(value=table_uid, start=0, end=len(line))
line.annotations.append(annotation)

paragraph_id += 1
Expand Down
13 changes: 8 additions & 5 deletions dedoc/readers/docx_reader/data_structures/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,11 +54,14 @@ def to_table(self) -> Table:
if cell.vMerge:
value = cell.vMerge.get("w:val", "continue")
if value == "continue":
cell_lines = cell_list[-1][cell_ind].lines
cell_row_list.append(CellWithMeta(lines=cell_lines, colspan=1, rowspan=1, invisible=True))
last_cell_rowspan = cell_list[rowspan_start_info[cell_ind]][cell_ind]
last_cell_rowspan.rowspan += 1
cell_list[rowspan_start_info[cell_ind]][cell_ind] = last_cell_rowspan
if cell_ind in rowspan_start_info:
cell_lines = cell_list[-1][cell_ind].lines
cell_row_list.append(CellWithMeta(lines=cell_lines, colspan=1, rowspan=1, invisible=True))
last_cell_rowspan = cell_list[rowspan_start_info[cell_ind]][cell_ind]
last_cell_rowspan.rowspan += 1
cell_list[rowspan_start_info[cell_ind]][cell_ind] = last_cell_rowspan
else:
cell_row_list.append(CellWithMeta(lines=cell_lines, colspan=grid_span, rowspan=1, invisible=False))
elif value == "restart":
rowspan_start_info[cell_ind] = row_index
cell_row_list.append(CellWithMeta(lines=cell_lines, colspan=grid_span, rowspan=1, invisible=False))
Expand Down
2 changes: 1 addition & 1 deletion dedoc/readers/html2pdf_reader/html2pdf_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def _add_tables(self, document: UnstructuredDocument, tables: Dict[str, Table])
line_id += 1
lines.append(line)
elif previous_line is not None:
table_annotation = TableAnnotation(name=table_uid, start=0, end=len(line.line))
table_annotation = TableAnnotation(value=table_uid, start=0, end=len(line.line))
previous_line.annotations.append(table_annotation)
tables_result.append(tables[table_uid])
return UnstructuredDocument(lines=lines, tables=tables_result, attachments=document.attachments)
Expand Down
2 changes: 1 addition & 1 deletion dedoc/readers/pdf_reader/data_classes/page_with_bboxes.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

class PageWithBBox:

def __init__(self, image: ndarray, bboxes: List[TextWithBBox], page_num: int, attachments: List[PdfImageAttachment] = None,
def __init__(self, image: Optional[ndarray], bboxes: List[TextWithBBox], page_num: int, attachments: List[PdfImageAttachment] = None,
pdf_page_width: Optional[int] = None, pdf_page_height: Optional[int] = None) -> None:
self.image = image
self.bboxes = bboxes
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure
Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters.
You can also see :ref:`pdf_handling_parameters` to get more information about `parameters` dictionary possible arguments.
"""
parameters = {} if parameters is None else parameters
warnings = []
txtlayer_parameters = self.txtlayer_detector.detect_txtlayer(path=file_path, parameters=parameters)

Expand Down
9 changes: 7 additions & 2 deletions dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,13 @@ def detect_txtlayer(self, path: str, parameters: dict) -> PdfTxtlayerParameters:
"""
try:
lines = self.__get_lines_for_predict(path=path, parameters=parameters)
is_correct = self.txtlayer_classifier.predict(lines)
first_page_correct = self.__is_first_page_correct(lines=lines, is_txt_layer_correct=is_correct)
if str(parameters.get("fast_textual_layer_detection", "false")).lower() == "true":
is_correct = any(line.line.strip() for line in lines)
first_page_lines = [line for line in lines if line.metadata.page_id == 0]
first_page_correct = bool(first_page_lines) and any(line.line.strip() for line in first_page_lines)
else:
is_correct = self.txtlayer_classifier.predict(lines)
first_page_correct = self.__is_first_page_correct(lines=lines, is_txt_layer_correct=is_correct)
return PdfTxtlayerParameters(is_correct_text_layer=is_correct, is_first_page_correct=first_page_correct)

except Exception as e:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,29 +7,19 @@

class TxtlayerFeatureExtractor:

def __init__(self) -> None:
self.eng = "".join(list(map(chr, range(ord("a"), ord("z") + 1))))
self.rus = "".join([chr(i) for i in range(ord("а"), ord("а") + 32)] + ["ё"])
self.lower_letters = self.eng + self.rus
self.upper_letters = self.lower_letters.upper()
self.letters = self.upper_letters + self.lower_letters
self.digits = "".join([str(i) for i in range(10)])
self.special_symbols = "<>~!@#$%^&*_+-/\"|?.,:;'`= "
self.brackets = "{}[]()"
self.symbols = self.letters + self.digits + self.brackets + self.special_symbols

self.prohibited_symbols = {s: i for i, s in enumerate("[]<")}

def transform(self, texts: List[str]) -> pd.DataFrame:
from dedoc.structure_extractors.feature_extractors.char_features import letters, digits, special_symbols, brackets, rus, eng, prohibited_symbols, \
lower_letters, upper_letters, symbols, count_symbols

features = defaultdict(list)

for text in texts:
num_letters = self.__count_symbols(text, self.letters)
num_digits = self.__count_symbols(text, self.digits)
num_special_symbols = self.__count_symbols(text, self.special_symbols)
num_brackets = self.__count_symbols(text, self.brackets)
num_rus = self.__count_symbols(text, self.rus + self.rus.upper())
num_eng = self.__count_symbols(text, self.eng + self.eng.upper())
num_letters = count_symbols(text, letters)
num_digits = count_symbols(text, digits)
num_special_symbols = count_symbols(text, special_symbols)
num_brackets = count_symbols(text, brackets)
num_rus = count_symbols(text, rus + rus.upper())
num_eng = count_symbols(text, eng + eng.upper())

features["letters_proportion"].append(num_letters / len(text))
features["digits_proportion"].append(num_digits / len(text))
Expand All @@ -38,24 +28,24 @@ def transform(self, texts: List[str]) -> pd.DataFrame:
features["rus_proportion"].append(num_rus / len(text))
features["eng_proportion"].append(num_eng / len(text))

for symbol in self.letters + self.digits:
for symbol in letters + digits:
n = num_letters + num_digits
# proportion of occurring english and russian letters
features[f"{symbol}_proportion"].append(text.count(symbol) / n if n != 0 else 0.0)

for symbol in self.special_symbols + self.brackets:
for symbol in special_symbols + brackets:
# number of symbols
symbol_name = symbol if symbol not in self.prohibited_symbols else f"symbol{self.prohibited_symbols[symbol]}"
symbol_name = symbol if symbol not in prohibited_symbols else f"symbol{prohibited_symbols[symbol]}"
features[f"{symbol_name}_number"].append(text.count(symbol))

# proportion of letters with symbols
features["all_proportion"].append((num_letters + num_digits + num_brackets + num_special_symbols) / len(text) if len(text) != 0 else 0)

case_changes = sum(1 for s1, s2 in zip(text[:-1], text[1:]) if (s1 in self.lower_letters) and (s2 in self.upper_letters))
case_changes = sum(1 for s1, s2 in zip(text[:-1], text[1:]) if (s1 in lower_letters) and (s2 in upper_letters))
features["case_changes"].append(case_changes / len(text))
symbol_changes = sum(1 for s1, s2 in zip(text[:-1], text[1:]) if (s1 in self.symbols) != (s2 in self.symbols))
symbol_changes = sum(1 for s1, s2 in zip(text[:-1], text[1:]) if (s1 in symbols) != (s2 in symbols))
features["symbol_changes"].append(symbol_changes / len(text))
letter_changes = sum(1 for s1, s2 in zip(text[:-1], text[1:]) if (s1 in self.letters) and (s2 not in self.symbols))
letter_changes = sum(1 for s1, s2 in zip(text[:-1], text[1:]) if (s1 in letters) and (s2 not in symbols))
features["letter_changes"].append(letter_changes / len(text))

features["mean_word_length"].append(np.mean([len(word) for word in text.split()]))
Expand All @@ -70,6 +60,3 @@ def transform(self, texts: List[str]) -> pd.DataFrame:
features["median_char_ord"].append(np.median(all_characters_ord))
features = pd.DataFrame(features)
return features[sorted(features.columns)].astype(float)

def __count_symbols(self, text: str, symbol_list: str) -> int:
return sum(1 for symbol in text if symbol in symbol_list)
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import logging
import os
import warnings
from os import path
from typing import Optional, Tuple
Expand Down Expand Up @@ -30,11 +31,9 @@ def __init__(self, on_gpu: bool, checkpoint_path: Optional[str], *, config: dict
@property
def net(self) -> ClassificationModelTorch:
if self._net is None:
net = ClassificationModelTorch(self.checkpoint_path)
if self.checkpoint_path is not None:
net = ClassificationModelTorch(path.join(self.checkpoint_path, "scan_orientation_efficient_net_b0.pth"))
self._load_weights(net)
else:
net = ClassificationModelTorch(None)
self._net = net
self._net.to(self.device)
return self._net
Expand All @@ -61,17 +60,18 @@ def _set_device(self, on_gpu: bool) -> None:
self.logger.warning(f"Classifier is set to device {self.device}")

def _load_weights(self, net: ClassificationModelTorch) -> None:
path_checkpoint = path.join(self.checkpoint_path, "scan_orientation_efficient_net_b0.pth")
if not path.isfile(path_checkpoint):
download_from_hub(out_dir=self.checkpoint_path,
if not path.isfile(self.checkpoint_path):
from dedoc.config import get_config
self.checkpoint_path = os.path.join(get_config()["resources_path"], "scan_orientation_efficient_net_b0.pth")
download_from_hub(out_dir=os.path.dirname(os.path.abspath(self.checkpoint_path)),
out_name="scan_orientation_efficient_net_b0.pth",
repo_name="scan_orientation_efficient_net_b0",
hub_name="model.pth")

with warnings.catch_warnings():
warnings.simplefilter("ignore")
net.load_state_dict(torch.load(path_checkpoint, map_location=self.location))
self.logger.info(f"Weights were loaded from {path_checkpoint}")
net.load_state_dict(torch.load(self.checkpoint_path, map_location=self.location))
self.logger.info(f"Weights were loaded from {self.checkpoint_path}")

def save_weights(self, path_checkpoint: str) -> None:
torch.save(self.net.state_dict(), path_checkpoint)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def __get_f1_homogeneous(self, x: np.ndarray, x_clusters: np.ndarray) -> float:

w1 = np.std(x) * len(x)
w2 = np.std(x_clust0) * len(x_clust0) + np.std(x_clust1) * len(x_clust1)
f1 = w2 / w1
f1 = w2 / w1 if w1 != 0. else 0.
return f1

def __get_f_criterion_homogeneous(self, n: int, p: int = 2) -> float:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,13 @@ def __init__(self, n: int = 5) -> None:
self.n = n

def binarize(self, image: np.ndarray) -> np.ndarray:
gray_img = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
threshold = self.__get_threshold(gray_img)
if image.shape[-1] == 3:
image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
threshold = self.__get_threshold(image)

gray_img[gray_img <= threshold] = 0
gray_img[gray_img > threshold] = 1
return gray_img
image[image <= threshold] = 0
image[image > threshold] = 1
return image

def __get_threshold(self, gray_img: np.ndarray) -> int:
c, x = np.histogram(gray_img, bins=255)
Expand All @@ -33,8 +34,8 @@ def __get_threshold(self, gray_img: np.ndarray) -> int:
omega_1 = omega_1 + c[t] / total
omega_2 = 1 - omega_1
mu_k = mu_k + t * (c[t] / total)
mu_1 = mu_k / omega_1
mu_2 = (sum_val - mu_k) / omega_2
mu_1 = mu_k / omega_1 if omega_1 != 0. else 0.
mu_2 = (sum_val - mu_k) / omega_2 if omega_2 != 0. else 0.
sum_of_neighbors = np.sum(c[max(1, t - self.n):min(255, t + self.n)])
denom = total
current_var = (1 - sum_of_neighbors / denom) * (omega_1 * mu_1 ** 2 + omega_2 * mu_2 ** 2)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def extract_metadata_and_set_annotations(self, page_with_lines: PageWithBBox, ca
lines = []
for bbox in page_with_lines.bboxes:
lines.append(LineMetadataExtractor.get_line_with_meta(bbox=bbox))
if page_with_lines.image.ndim == 3 and page_with_lines.image.shape[2] == 3:
if page_with_lines.image is not None and page_with_lines.image.ndim == 3 and page_with_lines.image.shape[2] == 3:
color_annotation = self.__get_color_annotation(bbox, page_with_lines.image)
bbox.annotations.append(color_annotation)
self.__add_spacing_annotations(lines)
Expand Down
Loading
Loading