Skip to content

Commit

Permalink
TLDR-465 pdf miner new params (#356)
Browse files Browse the repository at this point in the history
* set char_margin to 3

* add pdf miner test script

* fix test_pdf_miner script

* fix TestApiPdfWithText

* add caching

* rename test to benchmark

* add benchmark script again

* change name

* change name

* Try to fix documentation pipeline

* fix benchmark

---------

Co-authored-by: Nikita Shevtsov <[email protected]>
Co-authored-by: Nasty <[email protected]>
  • Loading branch information
3 people authored Oct 19, 2023
1 parent e7c1067 commit 62445da
Show file tree
Hide file tree
Showing 5 changed files with 89 additions and 3 deletions.
1 change: 1 addition & 0 deletions .github/workflows/docs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ jobs:

- name: Install dependencies
run: |
sudo apt update
sudo apt-get install -y libreoffice djvulibre-bin poppler-utils tesseract-ocr libtesseract-dev tesseract-ocr-rus tesseract-ocr-eng
python -m pip install --upgrade --no-cache-dir pip setuptools
python -m pip install --exists-action=w --no-cache-dir -r requirements.txt
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ def __get_image(path: str, page_num: int) -> np.ndarray:

def __get_interpreter(self) -> Tuple[PDFPageAggregator, PDFPageInterpreter]:
# Build the objects used to interpret PDF pages and return them as a
# (device, interpreter) pair. The device and the interpreter share one
# PDFResourceManager instance.
rsrcmgr = PDFResourceManager()
# NOTE(review): this is a diff hunk, so the next two lines are presumably the
# pre-change and post-change versions of the same statement. Read as plain code,
# the second assignment overrides the first; it differs only by the added
# char_margin=3 argument and the dropped TODO.
laparams = LAParams(line_margin=1.5, line_overlap=0.5, boxes_flow=0.5, word_margin=0.1, detect_vertical=False) # TODO find the best parameters
laparams = LAParams(line_margin=1.5, line_overlap=0.5, boxes_flow=0.5, word_margin=0.1, char_margin=3, detect_vertical=False)
# The layout parameters are handed to the aggregator device, not to the interpreter.
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)
return device, interpreter
Expand Down
72 changes: 72 additions & 0 deletions dedoc/scripts/benchmark_pdf_miner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
import json
import os
import re
import zipfile
from pathlib import Path
from tempfile import TemporaryDirectory

import wget

from dedoc.api.api_utils import json2txt
from dedoc.config import get_config
from dedoc.dedoc_manager import DedocManager


URL = "https://at.ispras.ru/owncloud/index.php/s/uImxYhliBHU8ei7/download"
URL_GT = "https://at.ispras.ru/owncloud/index.php/s/SXsOTqxGaGO9wL9/download"

def download_benchmark_data(data_dir: Path) -> None:
    """Download and unpack the benchmark PDFs and their ground-truth texts.

    Both archives (``URL`` and ``URL_GT``) are downloaded into *data_dir*,
    extracted there and then deleted.

    :param data_dir: directory to create and fill; it must not exist yet
    :raises Exception: any download/extraction error is re-raised after the
        partially filled directory has been removed
    """
    import shutil

    data_dir.mkdir(parents=True)
    try:
        for url, archive_name in ((URL, "pdfs.zip"), (URL_GT, "pdfs_gt.zip")):
            archive_path = str(data_dir / archive_name)
            wget.download(url, archive_path)
            with zipfile.ZipFile(archive_path, "r") as zip_ref:
                zip_ref.extractall(data_dir)
            os.remove(archive_path)
    except BaseException:
        # The mere existence of data_dir is what marks the cache as valid, so a
        # half-finished download must not leave the directory behind.
        shutil.rmtree(data_dir, ignore_errors=True)
        raise


def parse_accuracy_percent(lines) -> str:
    """Extract the accuracy value from the report lines of the accuracy tool.

    The first line containing "Accuracy After Correction" is preferred; if
    there is none, the first line containing "Accuracy" directly followed by a
    newline is used.

    :param lines: report lines with their trailing newlines kept
    :return: the first decimal number of the chosen line with its last digit
        dropped (e.g. "100.00" -> "100.0"); this truncation is kept on purpose
        so that results stay comparable with previously saved benchmarks
    :raises ValueError: if no accuracy line or no decimal number is found
    """
    matched = [line for line in lines if "Accuracy After Correction" in line]
    if not matched:
        matched = [line for line in lines if "Accuracy\n" in line]
    if not matched:
        raise ValueError("No accuracy line found in the accuracy tool output")

    numbers = re.findall(r"\d+\.\d+", matched[0])
    if not numbers:
        raise ValueError(f"No accuracy value found in line: {matched[0]!r}")
    return numbers[0][:-1]


def main() -> None:
    """Run the PDF text-layer benchmark and save per-file accuracy as JSON.

    Benchmark data is downloaded once and cached under the configured
    "intermediate_data_path". Every file of the dataset is parsed with
    ``pdf_with_text_layer="true"``, the extracted text is compared with its
    ground truth by the external "accuracy" executable located next to this
    script, and the results are written to
    resources/benchmarks/benchmark_pdf_miner.json.
    """
    import subprocess

    data_dir = Path(get_config()["intermediate_data_path"]) / "benchmark_pdfminer_data"

    if not data_dir.is_dir():
        download_benchmark_data(data_dir)
        print(f"Benchmark data downloaded to {data_dir}")
    else:
        print(f"Use cached benchmark data from {data_dir}")

    pdfs_path = data_dir / "PdfMiner Params"
    pdfs_gt_path = data_dir / "PdfMiner Params GT"
    accuracy_script_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "accuracy"))

    info = dict()
    with TemporaryDirectory() as tmpdir:
        tmp_ocr_path = Path(tmpdir) / "ocr.txt"
        accuracy_path = Path(tmpdir) / "accuracy.txt"
        manager = DedocManager()

        for file_name in os.listdir(pdfs_path):
            result = manager.parse(file_path=str(pdfs_path / file_name), parameters={"pdf_with_text_layer": "true"})
            txt_content = json2txt(paragraph=result.content.structure)
            # Explicit UTF-8: the documents contain non-ASCII (e.g. Cyrillic) text.
            tmp_ocr_path.write_text(txt_content, encoding="utf-8")

            gt_path = pdfs_gt_path / Path(file_name).with_suffix(".txt").name

            # An argument list (no shell) keeps paths with spaces or shell
            # metacharacters intact. Opening with "wb" truncates the report, so
            # every file starts from an empty accuracy.txt.
            with accuracy_path.open("wb") as report:
                subprocess.run([accuracy_script_path, str(gt_path), str(tmp_ocr_path)], stdout=report, check=False)

            with accuracy_path.open("r", encoding="utf-8", errors="replace") as f:
                lines = f.readlines()
            info[str(file_name)] = parse_accuracy_percent(lines)

    output_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "resources", "benchmarks"))
    os.makedirs(output_dir, exist_ok=True)
    # ensure_ascii=False writes file names verbatim, hence the explicit UTF-8 encoding.
    with (Path(output_dir) / "benchmark_pdf_miner.json").open("w", encoding="utf-8") as f:
        json.dump(info, f, ensure_ascii=False, indent=2)

    print(f"save result in {output_dir}")


if __name__ == "__main__":
    main()
13 changes: 13 additions & 0 deletions resources/benchmarks/benchmark_pdf_miner.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
{
"Образец примерного заполнения уведомления об отсутствии цифровых финансовых активов.pdf": "100.0",
"2023 Гоночная Инструкция CR 2023.pdf": "100.0",
"support_182_poisk-dokumentov.pdf": "100.0",
"6.1 Описание проекта Thalamus.pdf": "100.0",
"ECPPM2020_Instructions.pdf": "100.0",
"NOR CHR 2023.pdf": "100.0",
"2-column-state.pdf": "100.0",
"ba-2017.pdf": "100.0",
"Международное и национальное спортивное право портфолио_рус.pdf": "100.0",
"Uvedoml_ESN.pdf": "100.0",
"instruction_gibdd.pdf": "100.0"
}
4 changes: 2 additions & 2 deletions tests/api_tests/test_api_format_pdf_with_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,12 +115,12 @@ def test_pdf_with_2_columns_text(self) -> None:
self.assertIn("Keywords", self._get_by_tree_path(tree, "0.4.1.3")["text"])
self.assertIn("Anonymizing Networks, Privacy, Tor, BitTorrent", self._get_by_tree_path(tree, "0.4.1.4")["text"])

self.assertIn("INTRODUCTION\n", self._get_by_tree_path(tree, "0.5.0.0")["text"])
self.assertIn("INTRODUCTION\n", self._get_by_tree_path(tree, "0.5.0")["text"])
self.assertIn("The Tor network was designed to provide freedom\n"
"of speech by guaranteeing anonymous communications.\n"
"Whereas the cryptographic foundations of Tor, based on\n"
"onion-routing [3, 9, 22, 24], are known to be robust, identity",
self._get_by_tree_path(tree, "0.5.0.1")["text"])
self._get_by_tree_path(tree, "0.5.0.0")["text"])

def test_pdf_with_2_columns_text_2(self) -> None:
file_name = "liters_state.pdf"
Expand Down

0 comments on commit 62445da

Please sign in to comment.