# Review notes for this patch. They sit ahead of the first "diff --git" header; no patch line below is altered.
# NOTE(review): the patch text is whitespace-collapsed in this view, so hunk line counts and the
# indentation of context lines cannot be checked from here.
#
# Hunks, in order:
#   1. .github/workflows/docs.yaml
#      Adds "sudo apt update" ahead of the "sudo apt-get install -y ..." command of "Install dependencies".
#   2. dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdfminer_reader/pdfminer_extractor.py (__get_interpreter)
#      Passes char_margin=3 to LAParams and drops the "TODO find the best parameters" comment.
#   3. dedoc/scripts/benchmark_pdf_miner.py (new file)
#      - If <intermediate_data_path>/benchmark_pdfminer_data is not a directory: creates it, downloads the
#        URL and URL_GT archives with wget, extracts both into it and removes the archives.
#        Otherwise the existing directory is reused.
#      - For every entry of "PdfMiner Params": parses it with DedocManager (pdf_with_text_layer="true"),
#        converts result.content.structure with json2txt, writes the text to ocr.txt inside a
#        TemporaryDirectory, and runs the "accuracy" path located next to the script through os.system
#        against "PdfMiner Params GT/<name without its last 3 characters>txt", appending to accuracy.txt.
#      - Takes the first line containing "Accuracy After Correction" (fallback: a line containing
#        "Accuracy\n"), extracts the first "\d+\.\d+" match, drops its final character and stores the
#        result in info under the file name.
#      - Dumps info to resources/benchmarks/benchmark_pdf_miner.json, two directories above the script.
#   4. resources/benchmarks/benchmark_pdf_miner.json (new file)
#      Eleven entries, every value "100.0"; the file has no trailing newline.
#   5. tests/api_tests/test_api_format_pdf_with_text.py (test_pdf_with_2_columns_text)
#      Expected tree paths change from "0.5.0.0" to "0.5.0" (the "INTRODUCTION\n" node) and from
#      "0.5.0.1" to "0.5.0.0" (the paragraph that follows it).
#
# NOTE(review), benchmark script - points to confirm before merging:
#   - The cache check is only os.path.isdir(data_dir). The directory is created before the downloads,
#     so a failed download or extraction leaves a directory that the next run reports as cached data.
#   - matched[0] raises IndexError when neither accuracy pattern occurs in accuracy.txt; the os.system
#     return value is not inspected, so a failing "accuracy" run surfaces only through that IndexError
#     (or through open() if the file was never created).
#   - In the shell command only gt_path is quoted; tmp_ocr_path and accuracy_path are interpolated
#     unquoted. A subprocess call with an argument list would avoid shell quoting altogether.
#   - "[0][:-1]" drops the last character of the matched number - presumably to trim "100.00" to
#     "100.0"; confirm that this truncation (rather than rounding) is intended.
#   - file[:-3] + "txt" assumes every input name ends in a three-character extension such as "pdf".
#   - ocr.txt is opened with "w" and no explicit encoding; several benchmark file names are Cyrillic,
#     so the extracted text is presumably non-ASCII as well - consider encoding="utf-8".
diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index f4289378..a628b0b1 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -19,6 +19,7 @@ jobs: - name: Install dependencies run: | + sudo apt update sudo apt-get install -y libreoffice djvulibre-bin poppler-utils tesseract-ocr libtesseract-dev tesseract-ocr-rus tesseract-ocr-eng python -m pip install --upgrade --no-cache-dir pip setuptools python -m pip install --exists-action=w --no-cache-dir -r requirements.txt diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdfminer_reader/pdfminer_extractor.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdfminer_reader/pdfminer_extractor.py index 3d49949d..1d17f85f 100644 --- a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdfminer_reader/pdfminer_extractor.py +++ b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdfminer_reader/pdfminer_extractor.py @@ -142,7 +142,7 @@ def __get_image(path: str, page_num: int) -> np.ndarray: def __get_interpreter(self) -> Tuple[PDFPageAggregator, PDFPageInterpreter]: rsrcmgr = PDFResourceManager() - laparams = LAParams(line_margin=1.5, line_overlap=0.5, boxes_flow=0.5, word_margin=0.1, detect_vertical=False) # TODO find the best parameters + laparams = LAParams(line_margin=1.5, line_overlap=0.5, boxes_flow=0.5, word_margin=0.1, char_margin=3, detect_vertical=False) device = PDFPageAggregator(rsrcmgr, laparams=laparams) interpreter = PDFPageInterpreter(rsrcmgr, device) return device, interpreter diff --git a/dedoc/scripts/benchmark_pdf_miner.py b/dedoc/scripts/benchmark_pdf_miner.py new file mode 100644 index 00000000..3f77e887 --- /dev/null +++ b/dedoc/scripts/benchmark_pdf_miner.py @@ -0,0 +1,72 @@ +import json +import os +import re +import zipfile +from pathlib import Path +from tempfile import TemporaryDirectory + +import wget + +from dedoc.api.api_utils import json2txt +from dedoc.config import get_config +from dedoc.dedoc_manager import DedocManager + + +URL = 
"https://at.ispras.ru/owncloud/index.php/s/uImxYhliBHU8ei7/download" +URL_GT = "https://at.ispras.ru/owncloud/index.php/s/SXsOTqxGaGO9wL9/download" + +if __name__ == "__main__": + data_dir = Path(get_config()["intermediate_data_path"]) / "benchmark_pdfminer_data" + + if not os.path.isdir(data_dir): + data_dir.mkdir(parents=True) + pdfs_zip_path = str(data_dir / "pdfs.zip") + pdfs_zip_gt_path = str(data_dir / "pdfs_gt.zip") + wget.download(URL, pdfs_zip_path) + wget.download(URL_GT, pdfs_zip_gt_path) + + with zipfile.ZipFile(pdfs_zip_path, 'r') as zip_ref: + zip_ref.extractall(data_dir) + os.remove(pdfs_zip_path) + with zipfile.ZipFile(pdfs_zip_gt_path, 'r') as zip_ref: + zip_ref.extractall(data_dir) + os.remove(pdfs_zip_gt_path) + + print(f"Benchmark data downloaded to {data_dir}") + else: + print(f"Use cached benchmark data from {data_dir}") + + pdfs_path = data_dir / "PdfMiner Params" + pdfs_gt_path = data_dir / "PdfMiner Params GT" + + info = dict() + with TemporaryDirectory() as tmpdir: + manager = DedocManager() + for file in os.listdir(pdfs_path): + result = manager.parse(file_path=str(pdfs_path / file), parameters={"pdf_with_text_layer": "true"}) + txt_content = json2txt(paragraph=result.content.structure) + with (Path(tmpdir) / "ocr.txt").open("w") as f: + f.write(txt_content) + + accuracy_script_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "accuracy")) + gt_path = pdfs_gt_path / (file[:-3] + "txt") + tmp_ocr_path = Path(tmpdir) / "ocr.txt" + accuracy_path = Path(tmpdir) / "accuracy.txt" + if accuracy_path.exists(): + accuracy_path.unlink() + command = f"{accuracy_script_path} \"{gt_path}\" {tmp_ocr_path} >> {accuracy_path}" + os.system(command) + + with open(accuracy_path, "r") as f: + lines = f.readlines() + matched = [line for line in lines if "Accuracy After Correction" in line] + if not matched: + matched = [line for line in lines if "Accuracy\n" in line] + acc_percent = re.findall(r"\d+\.\d+", matched[0])[0][:-1] + info[str(file)] = 
acc_percent + + output_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "resources", "benchmarks")) + with (Path(output_dir) / "benchmark_pdf_miner.json").open("w") as f: + json.dump(info, f, ensure_ascii=False, indent=2) + + print(f"save result in {output_dir}") diff --git a/resources/benchmarks/benchmark_pdf_miner.json b/resources/benchmarks/benchmark_pdf_miner.json new file mode 100644 index 00000000..d244a8c2 --- /dev/null +++ b/resources/benchmarks/benchmark_pdf_miner.json @@ -0,0 +1,13 @@ +{ + "Образец примерного заполнения уведомления об отсутствии цифровых финансовых активов.pdf": "100.0", + "2023 Гоночная Инструкция CR 2023.pdf": "100.0", + "support_182_poisk-dokumentov.pdf": "100.0", + "6.1 Описание проекта Thalamus.pdf": "100.0", + "ECPPM2020_Instructions.pdf": "100.0", + "NOR CHR 2023.pdf": "100.0", + "2-column-state.pdf": "100.0", + "ba-2017.pdf": "100.0", + "Международное и национальное спортивное право портфолио_рус.pdf": "100.0", + "Uvedoml_ESN.pdf": "100.0", + "instruction_gibdd.pdf": "100.0" +} \ No newline at end of file diff --git a/tests/api_tests/test_api_format_pdf_with_text.py b/tests/api_tests/test_api_format_pdf_with_text.py index fa9ef429..b133a6f9 100644 --- a/tests/api_tests/test_api_format_pdf_with_text.py +++ b/tests/api_tests/test_api_format_pdf_with_text.py @@ -115,12 +115,12 @@ def test_pdf_with_2_columns_text(self) -> None: self.assertIn("Keywords", self._get_by_tree_path(tree, "0.4.1.3")["text"]) self.assertIn("Anonymizing Networks, Privacy, Tor, BitTorrent", self._get_by_tree_path(tree, "0.4.1.4")["text"]) - self.assertIn("INTRODUCTION\n", self._get_by_tree_path(tree, "0.5.0.0")["text"]) + self.assertIn("INTRODUCTION\n", self._get_by_tree_path(tree, "0.5.0")["text"]) self.assertIn("The Tor network was designed to provide freedom\n" "of speech by guaranteeing anonymous communications.\n" "Whereas the cryptographic foundations of Tor, based on\n" "onion-routing [3, 9, 22, 24], are known to be robust, 
identity", - self._get_by_tree_path(tree, "0.5.0.1")["text"]) + self._get_by_tree_path(tree, "0.5.0.0")["text"]) def test_pdf_with_2_columns_text_2(self) -> None: file_name = "liters_state.pdf"