From 35c1a52d4373c49778425ce2576b6a36d2cbced8 Mon Sep 17 00:00:00 2001 From: Belyaeva Oksana Date: Mon, 22 Jan 2024 14:26:52 +0300 Subject: [PATCH 1/5] TLDR-585 added TEDS table benchmark --- dedoc/api/api_utils.py | 4 +- dedoc/readers/pdf_reader/pdf_base_reader.py | 13 +- resources/benchmarks/table_benchmark.json | 15 ++ scripts/benchmark_table/benchmark_table.py | 114 ++++++++++++++ scripts/benchmark_table/metric.py | 158 ++++++++++++++++++++ scripts/benchmark_table/requirements.txt | 3 + 6 files changed, 299 insertions(+), 8 deletions(-) create mode 100644 resources/benchmarks/table_benchmark.json create mode 100644 scripts/benchmark_table/benchmark_table.py create mode 100644 scripts/benchmark_table/metric.py create mode 100644 scripts/benchmark_table/requirements.txt diff --git a/dedoc/api/api_utils.py b/dedoc/api/api_utils.py index df8a1286..1287912d 100644 --- a/dedoc/api/api_utils.py +++ b/dedoc/api/api_utils.py @@ -133,7 +133,7 @@ def json2html(text: str, paragraph: TreeNode, tables: Optional[List[Table]], tab if tables is not None and len(tables) > 0: text += "

Tables:

" for table in tables: - text += __table2html(table, table2id) + text += table2html(table, table2id) text += "

 

" return text @@ -201,7 +201,7 @@ def __annotations2html(paragraph: TreeNode, table2id: Dict[str, int]) -> str: return text.replace("\n", "
") -def __table2html(table: Table, table2id: Dict[str, int]) -> str: +def table2html(table: Table, table2id: Dict[str, int]) -> str: uid = table.metadata.uid text = f"

table {table2id[uid]}:

" text += f'\n\n' diff --git a/dedoc/readers/pdf_reader/pdf_base_reader.py b/dedoc/readers/pdf_reader/pdf_base_reader.py index d52e0d3c..ea869675 100644 --- a/dedoc/readers/pdf_reader/pdf_base_reader.py +++ b/dedoc/readers/pdf_reader/pdf_base_reader.py @@ -92,12 +92,7 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure ) lines, scan_tables, attachments, warnings, other_fields = self._parse_document(file_path, params_for_parse) - tables = [] - for scan_table in scan_tables: - metadata = TableMetadata(page_id=scan_table.page_number, uid=scan_table.name, rotated_angle=scan_table.location.rotated_angle) - cells_with_meta = [[CellWithMeta.create_from_cell(cell) for cell in row] for row in scan_table.matrix_cells] - table = Table(metadata=metadata, cells=cells_with_meta) - tables.append(table) + tables = [self.scantable2table(scan_table) for scan_table in scan_tables] if self._can_contain_attachements(file_path) and self.attachment_extractor.with_attachments(parameters): attachments += self.attachment_extractor.extract(file_path=file_path, parameters=parameters) @@ -105,6 +100,12 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure result = UnstructuredDocument(lines=lines, tables=tables, attachments=attachments, warnings=warnings, metadata=other_fields) return self._postprocess(result) + @staticmethod + def scantable2table(table: ScanTable) -> Table: + metadata = TableMetadata(page_id=table.page_number, uid=table.name, rotated_angle=table.location.rotated_angle) + cells_with_meta = [[CellWithMeta.create_from_cell(cell) for cell in row] for row in table.matrix_cells] + return Table(metadata=metadata, cells=cells_with_meta) + def _can_contain_attachements(self, path: str) -> bool: can_contain_attachments = False mime = get_file_mime_type(path) diff --git a/resources/benchmarks/table_benchmark.json b/resources/benchmarks/table_benchmark.json new file mode 100644 index 00000000..1ed4fef7 --- /dev/null +++ b/resources/benchmarks/table_benchmark.json @@ -0,0 +1,15 @@ +{ + "mean": 0.9824606866114314, + "images": { + "example_with_table0_0.png": 0.9873417721518988, + "example_with_table0_1.png": 1.0, + "example_with_table6.png": 1.0, + "example_with_table4.jpg": 1.0, + "example_with_table17.jpg": 0.8536585365853658, + "example_with_table_hor_vert_union.png": 1.0, + "example_with_table1.png": 1.0, + "example_with_table_horizontal_union.jpg": 1.0, + "example_with_table3.png": 1.0, + "example_with_table5.png": 0.9836065573770492 + } +} \ No newline at end of file diff --git a/scripts/benchmark_table/benchmark_table.py b/scripts/benchmark_table/benchmark_table.py new file mode 100644 index 00000000..c3292e87 --- /dev/null +++ b/scripts/benchmark_table/benchmark_table.py @@ -0,0 +1,114 @@ +import os +import zipfile +from pathlib import Path +import json +import pprint + +import numpy as np +import wget + +from dedoc.api.api_utils import table2html +from dedoc.config import get_config +from dedoc.readers import PdfImageReader +from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer import TableRecognizer +from scripts.benchmark_table.metric import TEDS +from tests.test_utils import get_test_config + +path_result = os.path.join(os.path.dirname(__file__), "..", "..", "resources", "benchmarks") +path_result = os.path.abspath(path_result) +os.makedirs(path_result, exist_ok=True) +path_result = os.path.join(path_result, "table_benchmark.json") + +URL = "https://at.ispras.ru/owncloud/index.php/s/Xaf4OyHj6xN2RHH/download" + +table_recognizer = TableRecognizer(config=get_test_config()) +image_reader = PdfImageReader(config=get_test_config()) +teds = TEDS() + + +def call_metric(pred_json: dict, true_json: dict) -> dict: + scores = teds.batch_evaluate(pred_json, true_json) + pp = pprint.PrettyPrinter() + pp.pprint(scores) + + return scores + + +def get_tables(image_path: str) -> str: + document = image_reader.read(image_path) + + for table in document.tables: + table.metadata.uid = "test_id" + table2id = {"test_id": 0} + html_tables = [table2html(table, table2id) for table in document.tables] + + # TODO: while works with one table in an image + return html_tables[0] + + +def make_predict_json(data_path: Path) -> dict: + predict_json = {} + for filename in os.listdir(data_path): + print(filename) + file_path = str(data_path / filename) + + predict_json[filename] = {"html": "" + get_tables(file_path) + ""} + + return predict_json + + +def download_dataset(data_dir: Path) -> None: + + if not os.path.isdir(data_dir): + data_dir.mkdir(parents=True) + pdfs_zip_path = str(data_dir / "benchmark_table_data.zip") + wget.download(URL, pdfs_zip_path) + + with zipfile.ZipFile(pdfs_zip_path, 'r') as zip_ref: + zip_ref.extractall(data_dir) + os.remove(pdfs_zip_path) + + print(f"Benchmark data downloaded to {data_dir}") + else: + print(f"Use cached benchmark data from {data_dir}") + + +def prediction(path_pred: Path, path_images: Path) -> dict: + pred_json = make_predict_json(path_images) + with open(path_pred, "w") as fd: + json.dump(pred_json, fd, indent=2, ensure_ascii=False) + + return pred_json + + +if __name__ == "__main__": + data_dir = Path(get_config()["intermediate_data_path"]) / "benchmark_table_data" + path_images = data_dir / "images" + path_gt = data_dir / "gt.json" + path_pred = data_dir / "pred.json" + download_dataset(data_dir) + + with open(path_gt, "r") as fp: + gt_json = json.load(fp) + ''' + Creating base html (based on method predictions for future labeling) + path_images = data_dir / "images_tmp" + pred_json = prediction("gt_tmp.json", path_images) + ''' + pred_json = prediction(path_pred, path_images) + scores = call_metric(pred_json=pred_json, true_json=gt_json) + + result = dict() + result["mean"] = np.mean([score for score in scores.values()]) + result["images"] = scores + + # save benchmarks + with open(path_result, "w") as fd: + json.dump(result, fd, indent=2, ensure_ascii=False) + + + + + + + diff --git a/scripts/benchmark_table/metric.py b/scripts/benchmark_table/metric.py new file mode 100644 index 00000000..bf3f6bc5 --- /dev/null +++ b/scripts/benchmark_table/metric.py @@ -0,0 +1,158 @@ +# Copyright 2020 IBM +# Author: peter.zhong@au1.ibm.com +# +# This is free software; you can redistribute it and/or modify +# it under the terms of the Apache 2.0 License. +# +# This software is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# Apache 2.0 License for more details. + +# Source: https://github.com/ibm-aur-nlp/PubTabNet + +import distance +from apted import APTED, Config +from apted.helpers import Tree +from lxml import etree, html +from collections import deque + +from tqdm import tqdm + + +class TableTree(Tree): + def __init__(self, tag, colspan=None, rowspan=None, content=None, visible=None, *children): + self.tag = tag + self.colspan = colspan + self.rowspan = rowspan + self.content = content + self.visible = visible + self.children = list(children) + + def bracket(self): + """Show tree using brackets notation""" + if self.tag == 'td': + result = '"tag": %s, "colspan": %d, "rowspan": %d, "text": %s' % \ + (self.tag, self.colspan, self.rowspan, self.content) + else: + result = '"tag": %s' % self.tag + for child in self.children: + result += child.bracket() + return "{{{}}}".format(result) + + +class CustomConfig(Config): + @staticmethod + def maximum(*sequences): + """Get maximum possible value + """ + return max(map(len, sequences)) + + def normalized_distance(self, *sequences) -> float: + """Get distance from 0 to 1 + """ + return float(distance.levenshtein(*sequences)) / self.maximum(*sequences) + + def rename(self, node1: TableTree, node2: TableTree) -> float: + """Compares attributes of trees""" + if not node1.visible or node2.visible: + return 0. + if (node1.tag != node2.tag) or (node1.colspan != node2.colspan) or (node1.rowspan != node2.rowspan): + return 1. + if node1.tag == 'td': + if node1.content or node2.content: + return self.normalized_distance(node1.content, node2.content) + return 0. + + +class TEDS(object): + ''' Tree Edit Distance basead Similarity + ''' + def __init__(self, structure_only=False, n_jobs=1, ignore_nodes=None): + assert isinstance(n_jobs, int) and (n_jobs >= 1), 'n_jobs must be an integer greather than 1' + self.structure_only = structure_only + self.n_jobs = n_jobs + self.ignore_nodes = ignore_nodes + self.__tokens__ = [] + + def tokenize(self, node): + ''' Tokenizes table cells + ''' + self.__tokens__.append('<%s>' % node.tag) + if node.text is not None: + self.__tokens__ += list(node.text) + for n in node.getchildren(): + self.tokenize(n) + if node.tag != 'unk': + self.__tokens__.append('' % node.tag) + if node.tag != 'td' and node.tail is not None: + self.__tokens__ += list(node.tail) + + def load_html_tree(self, node, parent=None): + ''' Converts HTML tree to the format required by apted + ''' + global __tokens__ + if node.tag == 'td': + if self.structure_only: + cell = [] + else: + self.__tokens__ = [] + self.tokenize(node) + cell = self.__tokens__[1:-1].copy() + + try: + new_node = TableTree(tag=node.tag, + colspan=int(node.attrib.get('colspan', '1')), + rowspan=int(node.attrib.get('rowspan', '1')), + content=cell, + visible=False if node.attrib.get('style') == "display: none" else True, *deque()) + except Exception as ex: + print(f"Bad html file. HTML parse exception. Exception's msg: {ex}") + raise ex + else: + new_node = TableTree(node.tag, None, None, None, True, *deque()) + if parent is not None: + parent.children.append(new_node) + if node.tag != 'td': + for n in node.getchildren(): + self.load_html_tree(n, new_node) + if parent is None: + return new_node + + def evaluate(self, pred: str, true: str) -> float: + ''' Computes TEDS score between the prediction and the ground truth of a + given sample + ''' + if (not pred) or (not true): + return 0.0 + parser = html.HTMLParser(remove_comments=True, encoding='utf-8') + pred = html.fromstring(pred, parser=parser) + true = html.fromstring(true, parser=parser) + if pred.xpath('body/table') and true.xpath('body/table'): + pred = pred.xpath('body/table')[0] + true = true.xpath('body/table')[0] + if self.ignore_nodes: + etree.strip_tags(pred, *self.ignore_nodes) + etree.strip_tags(true, *self.ignore_nodes) + n_nodes_pred = len(pred.xpath(".//*")) + n_nodes_true = len(true.xpath(".//*")) + n_nodes = max(n_nodes_pred, n_nodes_true) + tree_pred = self.load_html_tree(pred) + tree_true = self.load_html_tree(true) + + distance = APTED(tree_pred, tree_true, CustomConfig()).compute_edit_distance() + return 1.0 - (float(distance) / n_nodes) + else: + return 0.0 + + def batch_evaluate(self, pred_json, true_json): + ''' Computes TEDS score between the prediction and the ground truth of + a batch of samples + @params pred_json: {'FILENAME': 'HTML CODE', ...} + @params true_json: {'FILENAME': {'html': 'HTML CODE'}, ...} + @output: {'FILENAME': 'TEDS SCORE', ...} + ''' + samples = true_json.keys() + scores = [self.evaluate(pred_json.get(filename, '')['html'], true_json[filename]['html']) for filename in tqdm(samples)] + scores = dict(zip(samples, scores)) + return scores diff --git a/scripts/benchmark_table/requirements.txt b/scripts/benchmark_table/requirements.txt new file mode 100644 index 00000000..99314805 --- /dev/null +++ b/scripts/benchmark_table/requirements.txt @@ -0,0 +1,3 @@ +# for metric TEDS: +apted==1.0.3 +distance==0.1.3 \ No newline at end of file From 00cebfd21d17d68473629fb20a54bfd1f7708300 Mon Sep 17 00:00:00 2001 From: Belyaeva Oksana Date: Tue, 23 Jan 2024 18:19:02 +0300 Subject: [PATCH 2/5] TLDR-585 fixed after review --- .../data_classes/tables/scantable.py | 6 ++ dedoc/readers/pdf_reader/pdf_base_reader.py | 11 +--- scripts/benchmark_table/benchmark_table.py | 61 ++++++++----------- scripts/benchmark_table/metric.py | 5 +- 4 files changed, 34 insertions(+), 49 deletions(-) diff --git a/dedoc/readers/pdf_reader/data_classes/tables/scantable.py b/dedoc/readers/pdf_reader/data_classes/tables/scantable.py index c7c47fe1..4bc057df 100644 --- a/dedoc/readers/pdf_reader/data_classes/tables/scantable.py +++ b/dedoc/readers/pdf_reader/data_classes/tables/scantable.py @@ -5,6 +5,7 @@ import numpy as np from dedocutils.data_structures import BBox +from dedoc.data_structures import CellWithMeta, Table, TableMetadata from dedoc.readers.pdf_reader.data_classes.tables.cell import Cell from dedoc.readers.pdf_reader.data_classes.tables.location import Location @@ -27,6 +28,11 @@ def extended(self, table: "ScanTable") -> None: # extend order self.order = max(self.order, table.order) + def to_table(self) -> Table: + metadata = TableMetadata(page_id=self.page_number, uid=self.name, rotated_angle=self.location.rotated_angle) + cells_with_meta = [[CellWithMeta.create_from_cell(cell) for cell in row] for row in self.matrix_cells] + return Table(metadata=metadata, cells=cells_with_meta) + @staticmethod def get_cells_text(attr_cells: List[List[Cell]]) -> List[List[str]]: attrs = [] diff --git a/dedoc/readers/pdf_reader/pdf_base_reader.py b/dedoc/readers/pdf_reader/pdf_base_reader.py index ea869675..fd6ed93b 100644 --- a/dedoc/readers/pdf_reader/pdf_base_reader.py +++ b/dedoc/readers/pdf_reader/pdf_base_reader.py @@ -13,10 +13,7 @@ import dedoc.utils.parameter_utils as param_utils from dedoc.attachments_extractors.concrete_attachments_extractors.pdf_attachments_extractor import PDFAttachmentsExtractor from dedoc.common.exceptions.bad_file_error import BadFileFormatError -from dedoc.data_structures.cell_with_meta import CellWithMeta from dedoc.data_structures.line_with_meta import LineWithMeta -from dedoc.data_structures.table import Table -from dedoc.data_structures.table_metadata import TableMetadata from dedoc.data_structures.unstructured_document import UnstructuredDocument from dedoc.extensions import recognized_extensions, recognized_mimes from dedoc.readers.base_reader import BaseReader @@ -92,7 +89,7 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure ) lines, scan_tables, attachments, warnings, other_fields = self._parse_document(file_path, params_for_parse) - tables = [self.scantable2table(scan_table) for scan_table in scan_tables] + tables = [scan_table.to_table() for scan_table in scan_tables] if self._can_contain_attachements(file_path) and self.attachment_extractor.with_attachments(parameters): attachments += self.attachment_extractor.extract(file_path=file_path, parameters=parameters) @@ -100,12 +97,6 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure result = UnstructuredDocument(lines=lines, tables=tables, attachments=attachments, warnings=warnings, metadata=other_fields) return self._postprocess(result) - @staticmethod - def scantable2table(table: ScanTable) -> Table: - metadata = TableMetadata(page_id=table.page_number, uid=table.name, rotated_angle=table.location.rotated_angle) - cells_with_meta = [[CellWithMeta.create_from_cell(cell) for cell in row] for row in table.matrix_cells] - return Table(metadata=metadata, cells=cells_with_meta) - def _can_contain_attachements(self, path: str) -> bool: can_contain_attachments = False mime = get_file_mime_type(path) diff --git a/scripts/benchmark_table/benchmark_table.py b/scripts/benchmark_table/benchmark_table.py index c3292e87..a1066d4b 100644 --- a/scripts/benchmark_table/benchmark_table.py +++ b/scripts/benchmark_table/benchmark_table.py @@ -1,9 +1,7 @@ -import os import zipfile from pathlib import Path import json import pprint - import numpy as np import wget @@ -12,17 +10,14 @@ from dedoc.readers import PdfImageReader from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer import TableRecognizer from scripts.benchmark_table.metric import TEDS -from tests.test_utils import get_test_config -path_result = os.path.join(os.path.dirname(__file__), "..", "..", "resources", "benchmarks") -path_result = os.path.abspath(path_result) -os.makedirs(path_result, exist_ok=True) -path_result = os.path.join(path_result, "table_benchmark.json") +path_result = Path(__file__).parent / ".." / "resources" / "benchmarks" +path_result.absolute().mkdir(parents=True, exist_ok=True) URL = "https://at.ispras.ru/owncloud/index.php/s/Xaf4OyHj6xN2RHH/download" -table_recognizer = TableRecognizer(config=get_test_config()) -image_reader = PdfImageReader(config=get_test_config()) +table_recognizer = TableRecognizer(config=get_config()) +image_reader = PdfImageReader(config=get_config()) teds = TEDS() @@ -34,8 +29,8 @@ def call_metric(pred_json: dict, true_json: dict) -> dict: return scores -def get_tables(image_path: str) -> str: - document = image_reader.read(image_path) +def get_tables(image_path: Path) -> str: + document = image_reader.read(str(image_path)) for table in document.tables: table.metadata.uid = "test_id" @@ -48,35 +43,35 @@ def get_tables(image_path: str) -> str: def make_predict_json(data_path: Path) -> dict: predict_json = {} - for filename in os.listdir(data_path): - print(filename) - file_path = str(data_path / filename) + for pathname in Path.iterdir(data_path): + print(pathname) - predict_json[filename] = {"html": "" + get_tables(file_path) + ""} + predict_json[pathname.name] = {"html": "" + get_tables(pathname) + ""} return predict_json def download_dataset(data_dir: Path) -> None: - if not os.path.isdir(data_dir): - data_dir.mkdir(parents=True) - pdfs_zip_path = str(data_dir / "benchmark_table_data.zip") - wget.download(URL, pdfs_zip_path) + if Path.exists(data_dir): + print(f"Use cached benchmark data from {data_dir}") + return - with zipfile.ZipFile(pdfs_zip_path, 'r') as zip_ref: - zip_ref.extractall(data_dir) - os.remove(pdfs_zip_path) + data_dir.mkdir(parents=True, exist_ok=True) + pdfs_zip_path = data_dir / "benchmark_table_data.zip" + wget.download(URL, str(data_dir)) - print(f"Benchmark data downloaded to {data_dir}") - else: - print(f"Use cached benchmark data from {data_dir}") + with zipfile.ZipFile(pdfs_zip_path, 'r') as zip_ref: + zip_ref.extractall(data_dir) + pdfs_zip_path.unlink() + + print(f"Benchmark data downloaded to {data_dir}") def prediction(path_pred: Path, path_images: Path) -> dict: pred_json = make_predict_json(path_images) - with open(path_pred, "w") as fd: - json.dump(pred_json, fd, indent=2, ensure_ascii=False) + with path_pred.open("w") as fd: + json.dump(str(pred_json), fd, indent=2, ensure_ascii=False) return pred_json @@ -103,12 +98,6 @@ def prediction(path_pred: Path, path_images: Path) -> dict: result["images"] = scores # save benchmarks - with open(path_result, "w") as fd: - json.dump(result, fd, indent=2, ensure_ascii=False) - - - - - - - + file_result = path_result / "table_benchmark.json" + with file_result.open("w") as fd: + json.dump(str(file_result), fd, indent=2, ensure_ascii=False) diff --git a/scripts/benchmark_table/metric.py b/scripts/benchmark_table/metric.py index bf3f6bc5..ebf978cc 100644 --- a/scripts/benchmark_table/metric.py +++ b/scripts/benchmark_table/metric.py @@ -32,10 +32,9 @@ def __init__(self, tag, colspan=None, rowspan=None, content=None, visible=None, def bracket(self): """Show tree using brackets notation""" if self.tag == 'td': - result = '"tag": %s, "colspan": %d, "rowspan": %d, "text": %s' % \ - (self.tag, self.colspan, self.rowspan, self.content) + result = f'"tag": {self.tag}, "colspan": {self.colspan}, "rowspan": {self.rowspan}, "text": {self.content}' else: - result = '"tag": %s' % self.tag + result = f'"tag": {self.tag}' for child in self.children: result += child.bracket() return "{{{}}}".format(result) From 8e7f86bcc3a0f5631928b7db200e167162dd2960 Mon Sep 17 00:00:00 2001 From: Belyaeva Oksana Date: Wed, 24 Jan 2024 13:09:36 +0300 Subject: [PATCH 3/5] TLDR-585 fixed bug, include cells's content in metric --- resources/benchmarks/table_benchmark.json | 23 ++++---- scripts/benchmark_table/benchmark_table.py | 15 +++-- scripts/benchmark_table/metric.py | 66 +++++++++++----------- 3 files changed, 54 insertions(+), 50 deletions(-) diff --git a/resources/benchmarks/table_benchmark.json b/resources/benchmarks/table_benchmark.json index 1ed4fef7..d7a9d7c6 100644 --- a/resources/benchmarks/table_benchmark.json +++ b/resources/benchmarks/table_benchmark.json @@ -1,15 +1,16 @@ { - "mean": 0.9824606866114314, + "mode_metric_structure_only": false, + "mean": 0.9468374367023571, "images": { - "example_with_table0_0.png": 0.9873417721518988, - "example_with_table0_1.png": 1.0, - "example_with_table6.png": 1.0, - "example_with_table4.jpg": 1.0, - "example_with_table17.jpg": 0.8536585365853658, - "example_with_table_hor_vert_union.png": 1.0, - "example_with_table1.png": 1.0, - "example_with_table_horizontal_union.jpg": 1.0, - "example_with_table3.png": 1.0, - "example_with_table5.png": 0.9836065573770492 + "example_with_table0_0.png": 0.9525583036909738, + "example_with_table0_1.png": 0.9264351862896008, + "example_with_table6.png": 0.989010989010989, + "example_with_table4.jpg": 0.908436211832951, + "example_with_table17.jpg": 0.8078952936402488, + "example_with_table_hor_vert_union.png": 0.9896091617933723, + "example_with_table1.png": 0.9781560283687943, + "example_with_table_horizontal_union.jpg": 0.9925757575757576, + "example_with_table3.png": 0.9778008866078716, + "example_with_table5.png": 0.9458965482130129 } } \ No newline at end of file diff --git a/scripts/benchmark_table/benchmark_table.py b/scripts/benchmark_table/benchmark_table.py index a1066d4b..9a50b4fa 100644 --- a/scripts/benchmark_table/benchmark_table.py +++ b/scripts/benchmark_table/benchmark_table.py @@ -11,17 +11,17 @@ from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer import TableRecognizer from scripts.benchmark_table.metric import TEDS -path_result = Path(__file__).parent / ".." / "resources" / "benchmarks" +path_result = Path(__file__).parent / ".." / ".." / "resources" / "benchmarks" path_result.absolute().mkdir(parents=True, exist_ok=True) URL = "https://at.ispras.ru/owncloud/index.php/s/Xaf4OyHj6xN2RHH/download" table_recognizer = TableRecognizer(config=get_config()) image_reader = PdfImageReader(config=get_config()) -teds = TEDS() -def call_metric(pred_json: dict, true_json: dict) -> dict: +def call_metric(pred_json: dict, true_json: dict, structure_only: bool = False) -> dict: + teds = TEDS(structure_only=structure_only) scores = teds.batch_evaluate(pred_json, true_json) pp = pprint.PrettyPrinter() pp.pprint(scores) @@ -83,6 +83,8 @@ def prediction(path_pred: Path, path_images: Path) -> dict: path_pred = data_dir / "pred.json" download_dataset(data_dir) + mode_metric_structure_only = False + with open(path_gt, "r") as fp: gt_json = json.load(fp) ''' @@ -90,14 +92,15 @@ def prediction(path_pred: Path, path_images: Path) -> dict: path_images = data_dir / "images_tmp" pred_json = prediction("gt_tmp.json", path_images) ''' - pred_json = prediction(path_pred, path_images) - scores = call_metric(pred_json=pred_json, true_json=gt_json) + pred_json = prediction(path_pred, path_images) + scores = call_metric(pred_json=pred_json, true_json=gt_json, structure_only=mode_metric_structure_only) result = dict() + result["mode_metric_structure_only"] = mode_metric_structure_only result["mean"] = np.mean([score for score in scores.values()]) result["images"] = scores # save benchmarks file_result = path_result / "table_benchmark.json" with file_result.open("w") as fd: - json.dump(str(file_result), fd, indent=2, ensure_ascii=False) + json.dump(result, fd, indent=2, ensure_ascii=False) diff --git a/scripts/benchmark_table/metric.py b/scripts/benchmark_table/metric.py index ebf978cc..1872b414 100644 --- a/scripts/benchmark_table/metric.py +++ b/scripts/benchmark_table/metric.py @@ -30,14 +30,15 @@ def __init__(self, tag, colspan=None, rowspan=None, content=None, visible=None, self.children = list(children) def bracket(self): - """Show tree using brackets notation""" - if self.tag == 'td': + """Show tree using brackets notation + """ + if self.tag == "td": result = f'"tag": {self.tag}, "colspan": {self.colspan}, "rowspan": {self.rowspan}, "text": {self.content}' else: result = f'"tag": {self.tag}' for child in self.children: result += child.bracket() - return "{{{}}}".format(result) + return "{{" + result + "}}" class CustomConfig(Config): @@ -54,44 +55,44 @@ def normalized_distance(self, *sequences) -> float: def rename(self, node1: TableTree, node2: TableTree) -> float: """Compares attributes of trees""" - if not node1.visible or node2.visible: - return 0. if (node1.tag != node2.tag) or (node1.colspan != node2.colspan) or (node1.rowspan != node2.rowspan): return 1. - if node1.tag == 'td': + if node1.tag == "td": + if not node1.visible or not node2.visible: + return 0. if node1.content or node2.content: return self.normalized_distance(node1.content, node2.content) return 0. class TEDS(object): - ''' Tree Edit Distance basead Similarity - ''' + """ Tree Edit Distance based Similarity + """ + def __init__(self, structure_only=False, n_jobs=1, ignore_nodes=None): - assert isinstance(n_jobs, int) and (n_jobs >= 1), 'n_jobs must be an integer greather than 1' + assert isinstance(n_jobs, int) and (n_jobs >= 1), "n_jobs must be an integer greather than 1" self.structure_only = structure_only self.n_jobs = n_jobs self.ignore_nodes = ignore_nodes self.__tokens__ = [] def tokenize(self, node): - ''' Tokenizes table cells - ''' - self.__tokens__.append('<%s>' % node.tag) + """ Tokenizes table cells + """ + self.__tokens__.append(f"<{node.tag}>") if node.text is not None: self.__tokens__ += list(node.text) for n in node.getchildren(): self.tokenize(n) - if node.tag != 'unk': - self.__tokens__.append('' % node.tag) - if node.tag != 'td' and node.tail is not None: + if node.tag != "unk": + self.__tokens__.append(f"") + if node.tag != "td" and node.tail is not None: self.__tokens__ += list(node.tail) def load_html_tree(self, node, parent=None): - ''' Converts HTML tree to the format required by apted - ''' - global __tokens__ - if node.tag == 'td': + """ Converts HTML tree to the format required by apted + """ + if node.tag == "td": if self.structure_only: cell = [] else: @@ -101,10 +102,10 @@ def load_html_tree(self, node, parent=None): try: new_node = TableTree(tag=node.tag, - colspan=int(node.attrib.get('colspan', '1')), - rowspan=int(node.attrib.get('rowspan', '1')), + colspan=int(node.attrib.get("colspan", "1")), + rowspan=int(node.attrib.get("rowspan", "1")), content=cell, - visible=False if node.attrib.get('style') == "display: none" else True, *deque()) + visible=False if node.attrib.get("style") == "display: none" else True, *deque()) except Exception as ex: print(f"Bad html file. HTML parse exception. Exception's msg: {ex}") raise ex @@ -112,24 +113,23 @@ def load_html_tree(self, node, parent=None): new_node = TableTree(node.tag, None, None, None, True, *deque()) if parent is not None: parent.children.append(new_node) - if node.tag != 'td': + if node.tag != "td": for n in node.getchildren(): self.load_html_tree(n, new_node) if parent is None: return new_node def evaluate(self, pred: str, true: str) -> float: - ''' Computes TEDS score between the prediction and the ground truth of a - given sample - ''' + """ Computes TEDS score between the prediction and the ground truth of a given sample + """ if (not pred) or (not true): return 0.0 - parser = html.HTMLParser(remove_comments=True, encoding='utf-8') + parser = html.HTMLParser(remove_comments=True, encoding="utf-8") pred = html.fromstring(pred, parser=parser) true = html.fromstring(true, parser=parser) - if pred.xpath('body/table') and true.xpath('body/table'): - pred = pred.xpath('body/table')[0] - true = true.xpath('body/table')[0] + if pred.xpath("body/table") and true.xpath("body/table"): + pred = pred.xpath("body/table")[0] + true = true.xpath("body/table")[0] if self.ignore_nodes: etree.strip_tags(pred, *self.ignore_nodes) etree.strip_tags(true, *self.ignore_nodes) @@ -145,13 +145,13 @@ def evaluate(self, pred: str, true: str) -> float: return 0.0 def batch_evaluate(self, pred_json, true_json): - ''' Computes TEDS score between the prediction and the ground truth of + """ Computes TEDS score between the prediction and the ground truth of a batch of samples @params pred_json: {'FILENAME': 'HTML CODE', ...} @params true_json: {'FILENAME': {'html': 'HTML CODE'}, ...} @output: {'FILENAME': 'TEDS SCORE', ...} - ''' + """ samples = true_json.keys() - scores = [self.evaluate(pred_json.get(filename, '')['html'], true_json[filename]['html']) for filename in tqdm(samples)] + scores = [self.evaluate(pred_json.get(filename, "")["html"], true_json[filename]["html"]) for filename in tqdm(samples)] scores = dict(zip(samples, scores)) return scores From 764235505e328bf1bfdeefc66d6cfb2c42af3442 Mon Sep 17 00:00:00 2001 From: Belyaeva Oksana Date: Fri, 26 Jan 2024 13:13:49 +0300 Subject: [PATCH 4/5] TLDR-591 added table generation benchmark --- .../table_benchmark_on_generated_data.json | 506 ++++++++++++++++++ scripts/benchmark_table/benchmark_table.py | 82 ++- scripts/benchmark_table/metric.py | 10 +- 3 files changed, 584 insertions(+), 14 deletions(-) create mode 100644 resources/benchmarks/table_benchmark_on_generated_data.json diff --git a/resources/benchmarks/table_benchmark_on_generated_data.json b/resources/benchmarks/table_benchmark_on_generated_data.json new file mode 100644 index 00000000..130bcd28 --- /dev/null +++ b/resources/benchmarks/table_benchmark_on_generated_data.json @@ -0,0 +1,506 @@ +{ + "mode_metric_structure_only": true, + "mean": 0.9467889492889642, + "images": { + "0OEG7D5CXUSXDNEXAZ8A3.png": 0.993103448275862, + "0IS8OPRTM71QYN821WA5S.png": 0.9878048780487805, + "0KX1D4AGMTM3EWR0EF0A5.png": 0.989010989010989, + "0QBK1U71YOHBG5Z23MT7E.png": 0.9916666666666667, + "0DC57AS1OYZ1BRHZHPIO2.png": 0.96, + "0GJE73OG32H2P2SL2AI2J.png": 0.9905660377358491, + "0GYAQKWTI3LN6DNZFM2TZ.png": 0.9904761904761905, + "0GHKLO6LOH5LBTYEUND3S.png": 0.9917355371900827, + "0F831FOUA10K3594FG4IM.png": 0.9896907216494846, + "0XG0I2F0MMZ3QMXWLWFMX.png": 0.9935064935064936, + "0WMTO9U10ILEB9HCX4C0B.png": 0.9863013698630136, + "0C1ZYGFL2YNFM2W3P2KN1.png": 0.9795918367346939, + "0A4G5JAZSJS4BT5LBZ2Q3.png": 0.9850746268656716, + "0I75SMSDR5JSJXF07PN6J.png": 0.9915966386554622, + "0K9EAAIYXSUT80SYF3ML4.png": 0.9836065573770492, + "0PH78O2B9CJAM6MMINZXT.png": 0.9876543209876543, + "1BRZ4ALOZMMEXGR4AVJWG.png": 0.9876543209876543, + "0AFVW6AL3EH9H76ONNDYF.png": 0.9848484848484849, + "0EVCQHN9C65AUYG1UAN3C.png": 0.9696969696969697, + "0DD9D0ILAPJIH77GEVRGP.png": 0.4098360655737705, + "1BHU2JO8ODKS3OL4RIU6A.png": 0.9905660377358491, + "0OG2AZLHJPMBX43O2O9LR.png": 0.989010989010989, + "0M64SMZT9HTN6LXQ4M24T.png": 0.9846153846153847, + "0TY12X0C3U2BPZC81PW66.png": 0.9836065573770492, + "0W109P7LI6B5HIYM3SJ5A.png": 0.9873417721518988, + "0H6272E6S2YUDJWBSWKQN.png": 0.9873417721518988, + "0ZRX97WSSVCVQ3NJ5959P.png": 0.9873417721518988, + "0ZFX4HDI3O7YQFDYRRYKI.png": 0.9722222222222222, + "1A4SDGAXB66WDBW7OUH58.png": 0.9876543209876543, + "0HQVUJMOQRQQ5FIP4PMZF.png": 0.9859154929577465, + "0Q335MQBC8UJJMASJUNWZ.png": 0.9938271604938271, + "0AF02R419WL1YN97ZV144.png": 0.9868421052631579, + "0JWOGY4C0KQ14J958GLYD.png": 0.9922480620155039, + "0JCVUE03Y5YD8A45IOIA5.png": 0.972972972972973, + "0R10PSLELMJ0SPFCXX92A.png": 0.9814814814814815, + "0BF411IVR1HLU1Q44I3K7.png": 0.990990990990991, + "0K9C1HJZ8K3L6CRAQ6VCW.png": 0.9850746268656716, + "0Q2MRICBMAFRV1GRRR5TA.png": 0.5189873417721519, + "0ZK44UG99IWIPKRSCOSJV.png": 0.9795918367346939, + "0S09D3ZPVQ8YOT55XIOE1.png": 0.9887640449438202, + "0KRLZUD3DQAU1DYDU99ZH.png": 0.9882352941176471, + "1ATIOLLN3DOAHKX75560Q.png": 0.5460992907801419, + "0KSBBUINDNN16F2ZLQHV4.png": 0.9915254237288136, + "0P5IE8XH9BN2EGC0DX27Z.png": 0.989010989010989, + "0GJ88Q9SMUOWF3WILKG14.png": 0.98, + "0QYA242XOQ0Y9078UC7NI.png": 0.984375, + "0F4Z8B4S5RV008LHJBW8S.png": 0.9896907216494846, + "0GWJH40B21AJBR1F73FXI.png": 0.9722222222222222, + "0ZO44O69QHTV62QJ3X9KH.png": 0.9883720930232558, + "0VK3KLUJVLAB9SRQDN6EJ.png": 0.45622119815668205, + "0HQHS3BO0IIOJ5L2EP2H4.png": 0.9882352941176471, + "1DBQ2M6XQ66Y2895PYNOM.png": 0.9824561403508771, + "0A4OYW3ZL5QP76IGF0DK0.png": 0.9836065573770492, + "1AFVW6AL3EH9H76ONNDYF.png": 0.9863013698630136, + "0J0JQM9WD7B0RCNKWBC5S.png": 0.972972972972973, + "0OVSLM3WAA36TZQCOL1WS.png": 0.9821428571428571, + "0GMEN2MGE7HN3ROOZQ5YD.png": 0.9879518072289156, + "0AJLKOKRHEVOTGE90GEH6.png": 0.9922480620155039, + "0DUX4YKT5JYJO3Z573OG8.png": 0.967741935483871, + "0A4SDGAXB66WDBW7OUH58.png": 0.9871794871794872, + "0G5S5CXGRLABEYII4QG2Z.png": 0.9887640449438202, + "0YM92E2EEDDGHAUW2YZ8Q.png": 0.9863013698630136, + "0F0E32N4VR4Q9960I0DB8.png": 0.975609756097561, + "0JNQPLSGKLPQ0UAAFYL5T.png": 0.967741935483871, + "0YL4VFF3LUUQITLVU3U9V.png": 0.9902912621359223, + "0ZO2Z3XCHZLB43ARH68WS.png": 0.9803921568627451, + "0SHL8BKLII1AGBZ1SEB4U.png": 0.9876543209876543, + "0KBLEG9N1SBX956ZCIP5I.png": 0.9795918367346939, + "1B1QX4K8U8P9QA3HVLRPN.png": 0.9896907216494846, + "0D0ZG3O9YHMQAPHCD0890.png": 0.9924242424242424, + "0PTMPFGYNVJWO6FCX1QRZ.png": 0.9933333333333333, + "0A7ZA5BA5TPHBN2WP6TT9.png": 0.9904761904761905, + "0DE9UIIVMYH3UK0SYFVUG.png": 0.9767441860465116, + "0U0LFAJATVD9YEC1Z3497.png": 0.9767441860465116, + "1A3YX0911ULBZSCUBNDZS.png": 0.9929577464788732, + "0WRQRWHH2CMV2L4CE3SN9.png": 0.9878048780487805, + "0C1X00FENSOUN2Y08Y3JT.png": 0.9824561403508771, + "0ONRU7A4SU4WAUWF25FRP.png": 0.9878048780487805, + "1DD9D0ILAPJIH77GEVRGP.png": 0.9824561403508771, + "0CBFM7HG55Z7O8F4Y0O0L.png": 0.9905660377358491, + "0C9EM94JJTICVGS6U2T2U.png": 0.9917355371900827, + "0KK57808VO3HNS1AW4CJO.png": 0.9767441860465116, + "0E7XFLPH56MT23HNK3MZ6.png": 0.9886363636363636, + "0J818KH6HIIA83D74FXS3.png": 0.9887640449438202, + "0TLMD42BW0F4NSD9PG19X.png": 0.984375, + "0PSN5QFZWTPA9U05O7MZ3.png": 0.99, + "0GI0JNFJAOXK5OJKRXCND.png": 0.9891304347826086, + "0Z4X1LVZ1K4NE2RR8P7EA.png": 0.9859154929577465, + "0S709DW5AZF9VPCPMVHXB.png": 0.9891304347826086, + "1B87OEX5XX0BHUOQAS50A.png": 0.9767441860465116, + "1AAERNSDA06GDA7OFZVCA.png": 0.9850746268656716, + "1CORAY089OILX2OWIKU1E.png": 0.98, + "0WO86MK2DC2EZUZLSMFA1.png": 0.9911504424778761, + "0KNUPYHEXZYSW1TNZ6I7L.png": 0.9846153846153847, + "0ONF59OAQYX89LAM941E6.png": 0.9859154929577465, + "0E1IVAEMQXKVCH3Q0JCVX.png": 0.9782608695652174, + "0K8YJZK75V8SXL0GIM4SU.png": 0.9896907216494846, + "0LUL2CVQ1HLC1KL6D2VMP.png": 0.4065934065934066, + "0WIEGQEF4G9LN2UM49Y12.png": 0.9876543209876543, + "0AQ9EL10BYBSGJO2RLC6Q.png": 0.9888888888888889, + "0M0WYXRJONRUQ3ZG24MJJ.png": 0.989010989010989, + "0E3XQJO1C4CKR9TNFB4IC.png": 0.9871794871794872, + "0H17CYXGJTHXPQUP51TBI.png": 0.9911504424778761, + "0NK736IIIHGBF52E1UKQ4.png": 0.9859154929577465, + "0DYIHMLOKOR6HNF2XAI8F.png": 0.9836065573770492, + "0BZ5GZPTUSCNBNGBNQZEG.png": 0.9859154929577465, + "0M47PMX0DRIVKCJBYKHPJ.png": 0.9767441860465116, + "00MK8C41M7MW013CJ9SPU.png": 0.9922480620155039, + "0DMXCT01TPF8O33UMENE4.png": 0.9917355371900827, + "0WS9VI6T1X0M5H6D8O67Z.png": 0.9859154929577465, + "0XQ9XQOL15RDKQT4YZUQC.png": 0.5739130434782609, + "1DD8FWYLADAY5EJ3UZUD9.png": 0.9876543209876543, + "0MXPSYD5A5U86BSSZQMJN.png": 0.975609756097561, + "0QAOLXSIIRIRQ3W1OP7Y8.png": 0.9921259842519685, + "1BZ5GZPTUSCNBNGBNQZEG.png": 0.9767441860465116, + "0Y5AIJNHB8DTPQOC92X6P.png": 0.9882352941176471, + "0IL3BP1QRAZ54V54IBK9A.png": 0.9876543209876543, + "0MDCUYD9ASW4AGWD3ZYK5.png": 0.9891304347826086, + "0MON88TOR16AGTBLDTGJC.png": 0.9904761904761905, + "0QVCHWR0EZCMQ5J5P0Z1J.png": 0.9767441860465116, + "0IPJ09DW34Q275Z5CMS1X.png": 0.99, + "1A7ZU26KX6C0LG0D3T3ZS.png": 0.9863013698630136, + "0M49YEV7H4P48EONCBFPS.png": 0.9863013698630136, + "1D34PI1NNCV0AB4WCQMB3.png": 0.9863013698630136, + "0AGYYXV88WJW2FC6FVV3Q.png": 0.9863013698630136, + "0F9W69ODT3GQCQ6F11L2E.png": 0.9767441860465116, + "0Q3RJT1DJMPO9D9BE6JNO.png": 0.9868421052631579, + "0ETQJY2HRGYIBO46BSD3P.png": 0.4503105590062112, + "0BRZ4ALOZMMEXGR4AVJWG.png": 0.43983402489626555, + "0WLG2ZXPFXZGF9RM2Z6N6.png": 0.9871794871794872, + "1BP5KU2XHXZ0C431B4OL9.png": 0.43450479233226835, + "00ZG4J0UMAHQMR57DQ5T7.png": 0.9818181818181818, + "0S5HD36LFVDWLLH6UFK9I.png": 0.9939759036144579, + "0EW4PZW85MH9BS8VI83KZ.png": 0.9848484848484849, + "0EMFKQLMGGAFPLQGUEZSJ.png": 0.96, + "0H4TWDI39J0HRG239GQ10.png": 0.9938650306748467, + "0BSXNNN0LA94101P5D38I.png": 0.9882352941176471, + "0SLVZSD9X7VZPGQU0Q2QN.png": 0.9850746268656716, + "0K6WPSDJC0ICOWFEASYB4.png": 0.9911504424778761, + "0TY3MTJ6YZDE6QI73SH5A.png": 0.9859154929577465, + "0B87OEX5XX0BHUOQAS50A.png": 0.9896907216494846, + "1C1X00FENSOUN2Y08Y3JT.png": 0.967741935483871, + "0KEM29NIZZ7UI3CTN6NEA.png": 0.9896907216494846, + "0JZQMX95783K8QW3ERXSM.png": 0.8827586206896552, + "0R47TY8TMFAL346RUY0LW.png": 0.9696969696969697, + "0EG83QLMPW7MGGMGBYGPD.png": 0.9882352941176471, + "1AGYYXV88WJW2FC6FVV3Q.png": 0.9927536231884058, + "0OF74SYX6Q102JCQ5KELF.png": 0.9896907216494846, + "1BX1I2HS6BLV92NZHV6J1.png": 0.9940828402366864, + "1DDEMI2034QD7F4QRH1IV.png": 0.972972972972973, + "0Z8LGXZ1SMLBHV5T6Y4O9.png": 0.9859154929577465, + "0NGE5XRBD2YHBZFMDL7VD.png": 0.9795918367346939, + "0SX4TWDHV25DCZV3HQEHH.png": 0.9777777777777777, + "00Q04QLVCESVWCSMDAURN.png": 0.9855072463768116, + "0SK696SAQW3MZNDMD4W85.png": 0.984375, + "0F4WBFLG32FAT22W0NGEY.png": 0.45871559633027525, + "0TNFF3RUQ2UL3PRNYF45M.png": 0.9868421052631579, + "1A4OYW3ZL5QP76IGF0DK0.png": 0.9904761904761905, + "0IUNRRJ3JHMEAORR2EXRS.png": 0.9908256880733946, + "0L764EQB3ZGC3FYQ20PR9.png": 0.9863013698630136, + "0XZJ4SZWY0ZOD9QBZP96A.png": 0.9922480620155039, + "0PU3J7NYVCB6XLSJJOEZ9.png": 0.9911504424778761, + "0DDKIN1PFJQTFW1JADVHT.png": 0.9863013698630136, + "1A4G5JAZSJS4BT5LBZ2Q3.png": 0.9836065573770492, + "0DPMX3BRIG9CWZPYKXFWS.png": 0.9921259842519685, + "0N7P792721CFI8EDOCB0N.png": 0.9908256880733946, + "1BIC4PMO7M3ZB8WUC3STJ.png": 0.9933333333333333, + "0XNBY82W4NFSD9GV6ONKU.png": 0.9911504424778761, + "0F3P8XGEMBYESYCYAOQPN.png": 0.9923076923076923, + "00D983SP0WHF6YGMKSHCR.png": 0.9803921568627451, + "0N91H0ZWMHBPPPPON4HUW.png": 0.993006993006993, + "0AWZPWR198XN7U8HY1E32.png": 0.9836065573770492, + "0S0Z9J05KZWNPKUFRD78Z.png": 0.9927007299270073, + "0FCDXM7JS1QEBBY3DCGBM.png": 0.9795918367346939, + "0OEVVJNLZKKW7GOPM188W.png": 0.9615384615384616, + "0Z7FUMCO707ZDI55EG306.png": 0.9878048780487805, + "0DE4P4M2855D754NA8993.png": 0.9722222222222222, + "0UZ81HSUQSHVVGU56NIOG.png": 0.9902912621359223, + "0AAERNSDA06GDA7OFZVCA.png": 0.992, + "0WZXI1YECN77S9GD6GQ4M.png": 0.98989898989899, + "0S8HOU13AW544ALTKAB73.png": 0.989010989010989, + "0AAPDAAK73MRINE7PM0ZJ.png": 0.41628959276018096, + "0UJ2AFVE6RWGTYSB6DKLJ.png": 0.45871559633027525, + "0ISYQEE43TA3O41XMA47A.png": 0.993103448275862, + "0L2E8S3ICCMGPE9PS3RLV.png": 0.9908256880733946, + "0BIC4PMO7M3ZB8WUC3STJ.png": 0.984375, + "0CTFYQFHQ1S1FLIEAPZTB.png": 0.9767441860465116, + "0A9RJA2I3YJT58JR2MEOT.png": 0.9818181818181818, + "0UGHOJ96BTPB57BR0DJS7.png": 0.4505494505494505, + "0TW35WW1PRLL2YKVYWYRM.png": 0.9818181818181818, + "0HTO45RT9NH5KQUCLOV2H.png": 0.9722222222222222, + "0F0TA5W8GO31TXUFMHHTO.png": 0.98989898989899, + "0HWMSCT6L3MCGFJV4OXF8.png": 0.975609756097561, + "0KCIUQNXNE3ZMX5ECY7V3.png": 0.9925925925925926, + "0ET4I24PZATQRKGMGG5KC.png": 0.975609756097561, + "0I6WVEL7V26O3KJJ1GGYF.png": 0.9896907216494846, + "1A2AT7TW5KOMUUAK7TQXT.png": 0.9767441860465116, + "0W8NNJL30MNEY6RTPD6DA.png": 0.9767441860465116, + "0XC0XOHP855H9DFG41W9T.png": 0.9803921568627451, + "1DC57AS1OYZ1BRHZHPIO2.png": 0.9903846153846154, + "00WVVGSQ00B0IZU4OKPHQ.png": 0.9916666666666667, + "0RTI5C20W407SL59RANEM.png": 0.991304347826087, + "0A0DA327P9Y532UTLHE2N.png": 0.9722222222222222, + "0AQZMEU4Q38NKK4USHAC5.png": 0.9896907216494846, + "0U7602J86XPC7AVTSPMWL.png": 0.9878048780487805, + "0DSGAEKSK52RUNGEOGEXP.png": 0.9921259842519685, + "0JD5R5NDJKRRHT1UI6GFW.png": 0.7058823529411764, + "0PQ9OK98A29AC6GEI3DKQ.png": 0.4882352941176471, + "0TA7SVAQC7PKDE8BUP3NF.png": 0.9887640449438202, + "0TQM47CA0F30LG2C0S2KN.png": 0.9887640449438202, + "0HA1FE8828DJ86ZIJUIX4.png": 0.967741935483871, + "0P1Y0C88Y17DSXE616MQN.png": 0.98989898989899, + "0I6GDDWCTMF9V4YLGLBIM.png": 0.4036697247706422, + "0SI6DA6CAXUMFYSXBXIF6.png": 0.9906542056074766, + "0NM9CUQJV6W2N9434O81D.png": 0.9859154929577465, + "0WU8XJP1VJSLZXQ7S43HM.png": 0.9767441860465116, + "0P0WR7JJ9JBXO0HVMDETS.png": 0.975609756097561, + "0ZNTZMWW1X0QZV4AGDHYL.png": 0.9926470588235294, + "0C98HOE9TQ4HZK6DKGF5I.png": 0.989010989010989, + "0JCDZWWAMUR9FRGHL9IVN.png": 0.9911504424778761, + "0PVN50SJP1LUTHE2TID60.png": 0.9926470588235294, + "0D7CMRTBBENLYDO7EWWVZ.png": 0.98, + "0JOTZX26K6UJB6LNVK9RH.png": 0.975609756097561, + "0ZFOZ6UKG7DCCD5HSUIIX.png": 0.9876543209876543, + "0L7V0ZXS2M9JMSBD05I25.png": 0.9873417721518988, + "0G1E97R3QFH7FG9AUAIFB.png": 0.9863013698630136, + "0CORAY089OILX2OWIKU1E.png": 0.984375, + "0EH9JARAL7RYD3CVMM8AZ.png": 0.5185185185185186, + "00KDBG5H22KPNCPCK7L2P.png": 0.9848484848484849, + "00XJ5C1RWIRVID9IPUX8G.png": 0.8, + "0FFJM5ABUDDCT2DOCW2T4.png": 0.9916666666666667, + "0D34PI1NNCV0AB4WCQMB3.png": 0.9896907216494846, + "0X9D7AJTD7S91BNHMQ4L0.png": 0.9876543209876543, + "0W9SN5GJDEWTG3WAPGPDZ.png": 0.9887640449438202, + "0ATIOLLN3DOAHKX75560Q.png": 0.9882352941176471, + "1C9EM94JJTICVGS6U2T2U.png": 0.9883720930232558, + "0TG6BRHGF3C865C2OL6DE.png": 0.9882352941176471, + "1BUP8L4PGVBNQE1GSCGJZ.png": 0.9863013698630136, + "1AJLKOKRHEVOTGE90GEH6.png": 0.989010989010989, + "1C98HOE9TQ4HZK6DKGF5I.png": 0.9859154929577465, + "0IH65GI6IN6RQWJE04YPG.png": 0.9859154929577465, + "0DNHG32KRYJ9PQ7UU1YL5.png": 0.9863013698630136, + "0EV54WP1Y9JDCWMDIT0OM.png": 0.975609756097561, + "0BE3I0HX6XWZQA4EFY99C.png": 0.984375, + "0O7G4HGEK48J2NUB5RCES.png": 0.9882352941176471, + "1BXWVCNXW1Z4N1XG8QOG4.png": 0.9905660377358491, + "0M2V36SUMHY2U8FRS9NYZ.png": 0.4424778761061947, + "0STJA7OMA59TOQ8XQ54G5.png": 0.98, + "0VB0OIQZQXKY5PA111Q8B.png": 0.984375, + "0RBPX6DU1W6LIYA2VRAA4.png": 0.972972972972973, + "0SP3KJJ2HMQZF088NH2DR.png": 0.9904761904761905, + "1D0ZG3O9YHMQAPHCD0890.png": 0.9655172413793104, + "0XZ590ZLZXRB09XIADL9V.png": 0.9934640522875817, + "0QU6QW0KAWVXZ6TL7FVJE.png": 0.9933774834437086, + "0PKH21420YW57OPRJR21R.png": 0.9922480620155039, + "0TX7Y5KWQ2MVU3579QIYH.png": 0.9777777777777777, + "0Y6OW4PMMWG05F4ZFYQ40.png": 0.9767441860465116, + "0EK5DRITVR9G3KDVF1CTJ.png": 0.9876543209876543, + "0DDEMI2034QD7F4QRH1IV.png": 0.9933774834437086, + "0HJXUBEZQCR1DEUQ8V30I.png": 0.9932885906040269, + "0BG5K95UCWQ3JXWC501XA.png": 0.9886363636363636, + "00TNQG8N9T3KUVMZ7AWTB.png": 0.967741935483871, + "0TJSB9YOUAG7C9OZW3U80.png": 0.9848484848484849, + "0SYEGYPSNLKCALCQBPGK2.png": 0.9929577464788732, + "0IP23CAYMTIVE93KLVMRA.png": 0.9824561403508771, + "0KFRN6DX1A6MMGS24B39T.png": 0.9850746268656716, + "1CTFYQFHQ1S1FLIEAPZTB.png": 0.9803921568627451, + "0U9U2Q7VBD1V6HBT7FQKM.png": 0.9923076923076923, + "0S7MUFP120D8OP4ZCCCUV.png": 0.4873417721518988, + "0BXWVCNXW1Z4N1XG8QOG4.png": 0.9873417721518988, + "1A7ZA5BA5TPHBN2WP6TT9.png": 0.9824561403508771, + "1ACY14LU0VWSKDOHEAVZM.png": 0.9924812030075187, + "0MPO1XXHHM8I5BOIT3DB9.png": 0.9876543209876543, + "0RSQ19UNM98CNWII5Q25F.png": 0.975609756097561, + "0EAA9XEBN9W7XDBPK31UZ.png": 0.9803921568627451, + "0U0BR4A64P7CE7YZ57HQ1.png": 0.9911504424778761, + "0XFNT3NMKFW1DB0F2LVY3.png": 0.9916666666666667, + "1AQZMEU4Q38NKK4USHAC5.png": 0.9904761904761905, + "0VGZMTO2VCZVZKGAOHZEU.png": 0.9910714285714286, + "0DBQ2M6XQ66Y2895PYNOM.png": 0.984375, + "0BP5KU2XHXZ0C431B4OL9.png": 0.9811320754716981, + "0PYCGJHF1705P4NTCM8AS.png": 0.9824561403508771, + "0RAGYZ9465I7GLXZXCLCQ.png": 0.9924812030075187, + "1A9560NY0NQ5OVZQQBJRQ.png": 0.4636363636363636, + "0KXDSHWWWYQJBXT2Y6U8S.png": 0.9803921568627451, + "1BF411IVR1HLU1Q44I3K7.png": 0.984375, + "0T1ZL9NSVN3385DR7B86C.png": 0.9824561403508771, + "0SYKTWM1EF4KS646AWQEL.png": 0.9803921568627451, + "0S104IFNSN5EJ31212IOP.png": 0.989010989010989, + "0H2RZUXKBQEVFJ2JT29R4.png": 0.9818181818181818, + "0SVC8WRHPF38HHKBN65YD.png": 0.9926470588235294, + "0HVIW7DPWCJSWJ5PCJDM2.png": 0.9855072463768116, + "0PRIZA7CG2JAL9GTN265B.png": 0.9929577464788732, + "0FXLG8PO267BZPBBXIX4E.png": 0.9922480620155039, + "1B0LNAITDDPPCJ4I6XIWK.png": 0.9868421052631579, + "0YNQ2KZ01B1TWP9FR5DE7.png": 0.45360824742268047, + "0A8AVSZNK6GTNOCBEVFOY.png": 0.9722222222222222, + "0XM8RQF6JQDOTJ5WQVHFE.png": 0.9873417721518988, + "0JBU3LJRDTMJI2XGB6NUE.png": 0.9868421052631579, + "0FKIASN9E4KCZ0JRCAJLQ.png": 0.9917355371900827, + "0A2AT7TW5KOMUUAK7TQXT.png": 0.9882352941176471, + "0QISJETVE3HGF1PMBD1BM.png": 0.9848484848484849, + "0KBOWWQLYSIZ0P4SIZMHJ.png": 0.993421052631579, + "0OMZO818L9AC4U3JJTKGD.png": 0.9863013698630136, + "0IVOAVCWOJ4CA92H7CM1Q.png": 0.9917355371900827, + "0SH9F7EHAT35OVT003OC5.png": 0.3728813559322034, + "0F7BJ4Z9F1R95HUG4RRZD.png": 0.9767441860465116, + "00RJGV4A4UTMTLDEIR1IG.png": 0.975609756097561, + "0BUP8L4PGVBNQE1GSCGJZ.png": 0.967741935483871, + "0B1QX4K8U8P9QA3HVLRPN.png": 0.9923664122137404, + "0IZ8M2UHYSA9H6K8XIOKS.png": 0.9855072463768116, + "0KLEV2650Z6X2DAUO94QK.png": 0.9876543209876543, + "0MRYJGMAVHEDMZ3XSX9XI.png": 0.9871794871794872, + "0I6PWVE3HEK6ZZ5K53UY4.png": 0.9818181818181818, + "1BE3I0HX6XWZQA4EFY99C.png": 0.984375, + "0M7CJCA8K3PX504PNHJRT.png": 0.9883720930232558, + "0ESACK4QILSDBXRS54UK0.png": 0.9795918367346939, + "0KLU5K631Q9RHQOY6771B.png": 0.4444444444444444, + "0RAZV12CY84ZGA4BRZQUC.png": 0.9871794871794872, + "0HZ4TDEJG6BY7B2RTALZK.png": 0.9868421052631579, + "0ROPMUV96VG8PTONLNGV9.png": 0.9887640449438202, + "0L194VI2NIOAX4AUCU2WG.png": 0.9767441860465116, + "0PG6K8IFJM2PHHLA1S4Y6.png": 0.9905660377358491, + "0H5AHQVKHAKQ1W636PLCS.png": 0.9878048780487805, + "0ZAHJJUMYDOQIMIUUFAUD.png": 0.9863013698630136, + "0MO39PWU9N82Y88WNANVM.png": 0.984375, + "0ZSUP0IMF3PK86DIVWQ8V.png": 0.967741935483871, + "0M1B6J5CTPBITI79C68MO.png": 0.9824561403508771, + "0BWJOYJSDHL1XJH6UG2RM.png": 0.9882352941176471, + "0SPYHIS3OEEZ082CFJEGF.png": 0.9871794871794872, + "0A3YX0911ULBZSCUBNDZS.png": 0.9896907216494846, + "0FK1CU21TAIHIR7YWZ2W7.png": 0.9818181818181818, + "0WP1ZBKQCK8W2W0ZXI2Z4.png": 0.7916666666666666, + "1AF02R419WL1YN97ZV144.png": 0.9767441860465116, + "0BKBFKJTQPLQBNIBZSM7E.png": 0.9916666666666667, + "1C1ZYGFL2YNFM2W3P2KN1.png": 0.9871794871794872, + "0IHOYC7KXLECI1F3G1WAF.png": 0.9848484848484849, + "1A0DA327P9Y532UTLHE2N.png": 0.9868421052631579, + "0SK9B35AHQ2OQA1RDKHHP.png": 0.9917355371900827, + "0EECJZYQ42MZLSWPOK9ZH.png": 0.9887640449438202, + "0UFBWJZOD5PBKMVX7G231.png": 0.9824561403508771, + "0OZ6DU5POAFSM589UXX4S.png": 0.9876543209876543, + "0OUIP8MTUSWLFQ6J13VXT.png": 0.967741935483871, + "0NFAI2Z8TAUKU6S7892KH.png": 0.975609756097561, + "0F3VUGWY35HLOJYHPT78G.png": 0.9883720930232558, + "0AYZOGNX998RYQVPWP1OA.png": 0.9846153846153847, + "0UC2QTKS4ITXYK4E6HU9T.png": 0.9939759036144579, + "0KK6YAU45B9B34SSZTAS7.png": 0.9836065573770492, + "0WV2Q54214D8ARYKCMBE0.png": 0.547945205479452, + "0TUDLFORB7K1BVA4U0ULU.png": 0.9917355371900827, + "0XZRML313QJ6X82YZJLYT.png": 0.9848484848484849, + "0ACY14LU0VWSKDOHEAVZM.png": 0.9873417721518988, + "0HH9NAZ1I95NJINORKJIM.png": 0.9795918367346939, + "1AWZPWR198XN7U8HY1E32.png": 0.9795918367346939, + "0TLG8NFY9BXHB15A47OGW.png": 0.9926470588235294, + "1CBFM7HG55Z7O8F4Y0O0L.png": 0.9848484848484849, + "0EV3WT6VJG3QH2HFJEIBA.png": 0.975609756097561, + "0OBPU21JDPO0KPYEQGLFO.png": 0.9722222222222222, + "0MJ27YD7XBYLQKM87RM3Y.png": 0.9887640449438202, + "0BHU2JO8ODKS3OL4RIU6A.png": 0.99, + "0WVB351NNWY8OOQQRRW6F.png": 0.476878612716763, + "1BCT1VG1R4HUK3Q6NMZGU.png": 0.9916666666666667, + "0YJ043WAWUTW4AEMDTD4R.png": 0.9782608695652174, + "0YS08VVMS1YPOHVJOFXXA.png": 0.98989898989899, + "0EWWFSOUCGGD5BK6RKMKO.png": 0.522875816993464, + "0VCTD6BP09MBAXOOM5Y5E.png": 0.975609756097561, + "0S7ZGBZ7OBI15CZS5V95A.png": 0.984375, + "0JJ9O2OQ6O13OAOFM7643.png": 0.99, + "00TXY79AHYWJ7WLXB3VLV.png": 0.9846153846153847, + "0J2UQ7WIZXFK4I5TV9UHW.png": 0.9935064935064936, + "0TYF1PBQCH64LANCKYWY7.png": 0.9859154929577465, + "0SWG2OW7F5RLADFAHJ9A4.png": 0.9882352941176471, + "0RV3TKC89HQD4FRFCTNSK.png": 0.9767441860465116, + "1BQBJ8UFLH7H3JQ965JF6.png": 0.9863013698630136, + "0C70JEJWPOAT1S8RUWCVB.png": 0.972972972972973, + "0RCE6GI0QYPCA15RH6HM7.png": 0.49382716049382713, + "0SB1QV5XRJM6W0HRU4AH7.png": 0.9891304347826086, + "0I1HQDO584A6ODC54PLNA.png": 0.9891304347826086, + "1AWHACFMS9KSHM18INN41.png": 0.9836065573770492, + "1BKBFKJTQPLQBNIBZSM7E.png": 0.9863013698630136, + "0T0Q44ALMC9WURWEESEMP.png": 0.9875, + "1A9RJA2I3YJT58JR2MEOT.png": 0.9615384615384616, + "0DD8FWYLADAY5EJ3UZUD9.png": 0.9868421052631579, + "0F078JDZMTC8C8H2P8IVA.png": 0.9921875, + "0L5KEP1L6K1ALH88LLMEY.png": 0.9795918367346939, + "0U2FXJ2H3K5SQTZNJ1WV1.png": 0.98, + "0U49K9QPO02GF77TU5JB8.png": 0.9863013698630136, + "0A9560NY0NQ5OVZQQBJRQ.png": 0.9836065573770492, + "0MRQ2DF27RW94C36QLLTZ.png": 0.9863013698630136, + "0BCT1VG1R4HUK3Q6NMZGU.png": 0.9795918367346939, + "0GQC64N9E830BWDTF8L0Q.png": 0.9910714285714286, + "0HIESCSLITYADXZHOO7IA.png": 0.989010989010989, + "0FZFGRN9B0WT3XCQMOVPJ.png": 0.9767441860465116, + "00LQMDL10JL253UW69YUO.png": 0.9818181818181818, + "0U79XK18POJ6HCLLOXS4Z.png": 0.9905660377358491, + "0I3RG6GXJ2VILV3BPFIY4.png": 0.9767441860465116, + "0X8PV0Z6SNEKPIPOCP5HR.png": 0.3931034482758621, + "0UFQOEKLIWTX65AY778BD.png": 0.5275590551181102, + "0HZUERFF8VNKXAZLV8RO5.png": 0.9850746268656716, + "0FWXHCMHZ7KG6WYRNWD6Q.png": 0.9922480620155039, + "0A9B6NHM7J57SCT1Z8TAS.png": 0.9861111111111112, + "0WOTQFWQFAEPN0HZ6MYIL.png": 0.9929577464788732, + "0IUNSDMCG8WWVJJ758NN9.png": 0.9887640449438202, + "0XLK4S5OWK77LRNU2JAG9.png": 0.46543778801843316, + "00FMSMFBJU5732FGUTLIF.png": 0.9821428571428571, + "0YOETJE558OS77GHG5L5U.png": 0.9876543209876543, + "0BKXE7HQJOJV0I1LL8YOF.png": 0.9821428571428571, + "0AWHACFMS9KSHM18INN41.png": 0.5416666666666667, + "1BL58Q9DLPBQF73ROGFDX.png": 0.9921875, + "0Q7EACO6OF8WQFZXI1MRQ.png": 0.9896907216494846, + "0R1IOV08YNRVC0KQS84EF.png": 0.9818181818181818, + "0SEF4O8YR8ULW23U32SE6.png": 0.9836065573770492, + "0IQGTS9QZK0ZYRL80GOSD.png": 0.9767441860465116, + "0E00IBZTY74DGR1SSX77L.png": 0.975609756097561, + "0BR0V61AWXYXVQSK6RMY7.png": 0.9911504424778761, + "0MESCFGQYOQNMVWD6B1VU.png": 0.9885057471264368, + "0F3GIMIL9E4UNWEFYLKGV.png": 0.9824561403508771, + "1A8AVSZNK6GTNOCBEVFOY.png": 0.9910714285714286, + "0E7WX1NX5ZKR24SEIUKRN.png": 0.9811320754716981, + "0QZOZCFYQ2TK5C0Q3KN5C.png": 0.5106382978723405, + "0SDC2B1I853GR50G545IX.png": 0.9891304347826086, + "0FEKB24PHTZNT3KIZZVIS.png": 0.9876543209876543, + "0SIW9Q9NWY3TWRC712D4J.png": 0.9876543209876543, + "0JFFFUOFXDOLV2ZGQJAPB.png": 0.9887640449438202, + "0O976W9Y9NDSJ24YV7HU9.png": 0.975609756097561, + "0B0LNAITDDPPCJ4I6XIWK.png": 0.9811320754716981, + "1AAPDAAK73MRINE7PM0ZJ.png": 0.9852941176470589, + "0HC8F1RENJE297WV8RW0N.png": 0.45517241379310347, + "0OXJ4SWAYILOZVQCGO1OB.png": 0.9937106918238994, + "0I3S2Z8YWZ0JOIMKGU51B.png": 0.972972972972973, + "0Y0LZ2LRH7BR5ZDYBTH7U.png": 0.9824561403508771, + "0T0LAS5REAE827IQO0Q9U.png": 0.98989898989899, + "1AQ9EL10BYBSGJO2RLC6Q.png": 0.9868421052631579, + "0L1YL688ZRRPYAJ07UOFQ.png": 0.9911504424778761, + "1BWJOYJSDHL1XJH6UG2RM.png": 0.9922480620155039, + "0TELO9B7QI0QQVFMJXAQ1.png": 0.9896907216494846, + "0XDX2OT3OG575I0U99YAQ.png": 0.54, + "0X49B57NNHU6FEB4J21VY.png": 0.993006993006993, + "0DHJ8WY2XLWKG7K345LAK.png": 0.975609756097561, + "0BQBJ8UFLH7H3JQ965JF6.png": 0.989010989010989, + "00CBN2MRTC48ZY50RUSBW.png": 0.9767441860465116, + "1D7CMRTBBENLYDO7EWWVZ.png": 0.9863013698630136, + "0BX1I2HS6BLV92NZHV6J1.png": 0.9722222222222222, + "0XQE375V4J34MLJYN711T.png": 0.9722222222222222, + "1BKXE7HQJOJV0I1LL8YOF.png": 0.9795918367346939, + "0E3OA2PY1K3B44GN9AS0Y.png": 0.9863013698630136, + "0SCRALC3GPIO2ZD918U8L.png": 0.478021978021978, + "0ITKDLWB7SDGMM8980ZSS.png": 0.9911504424778761, + "1BYRMKANKN4PL6JFPG8AR.png": 0.989010989010989, + "0BYRMKANKN4PL6JFPG8AR.png": 0.9803921568627451, + "0R8W6O2N25AVQI9FQ5IL7.png": 0.972972972972973, + "0J9TV59N7U65CB7YCHD38.png": 0.9922480620155039, + "0VX41MM59ET2MK09202C3.png": 0.9896907216494846, + "1CGP5R7FMVCKR47XK6IVA.png": 0.9896907216494846, + "1BSXNNN0LA94101P5D38I.png": 0.9926470588235294, + "0UMVEM9RUVZDRJRFA1W2V.png": 0.9722222222222222, + "0KPHJHUXB0MS3B9RHL57O.png": 0.9868421052631579, + "0TYH6IN161KXZT369VVWQ.png": 0.9795918367346939, + "0AUTW1OL7IAPO1JH1TQUR.png": 0.984375, + "0GNCKEB99NZ0J9GCAI0TH.png": 0.9850746268656716, + "0CGP5R7FMVCKR47XK6IVA.png": 0.9915966386554622, + "0NNLAUZDCGVKZP852ZJ7X.png": 0.9836065573770492, + "0EJW9DEXTHUR17CZCUPB1.png": 0.9850746268656716, + "0JZRIWIFSATGGFL8P0NZF.png": 0.9908256880733946, + "0VNHMSVYYS2Q0H0VJDNAK.png": 0.9782608695652174, + "0Z2ZZWW84O21E70F5RGIA.png": 0.993103448275862, + "0UODYVKUWDGD6S5D7LNAW.png": 0.9930555555555556, + "0NGNPB7KAJSSKSHQV1KZS.png": 0.9767441860465116, + "0Q3C8N8G8GXV2EP88XEXI.png": 0.9795918367346939, + "0IU89E255WY0KPUD6L7Y9.png": 0.9902912621359223, + "0F22CQYG638LSZROETJ9V.png": 0.9904761904761905, + "0HBX9X0EJVVL4TA9CJ25G.png": 0.9873417721518988, + "0IRDSID7UDBLOIRB9JQ9S.png": 0.9883720930232558, + "0IHCMVD5NO41KSAB3ODC0.png": 0.5213270142180095, + "1AYZOGNX998RYQVPWP1OA.png": 0.9931506849315068, + "0WM2Y66O2ZJA831TN2E7Z.png": 0.9615384615384616, + "0CPW27F5C8I03UQBVBL2Y.png": 0.42500000000000004, + "1BR0V61AWXYXVQSK6RMY7.png": 0.989010989010989, + "0JJPRMSYFQLJKD3JYA1JP.png": 0.9850746268656716, + "0RT937QPOOWU9LKZVU0G3.png": 0.9922480620155039, + "0EFBK546D496KI033ACDF.png": 0.972972972972973, + "0EP1D1EXZC4VOMGZJGQQT.png": 0.9891304347826086, + "0DSQ4IAVY32EHCJ0AJM1Y.png": 0.9824561403508771, + "0F4HFOUP4374O8RL4E914.png": 0.9824561403508771, + "0IMS5FXCTVU6GSCR5CHTK.png": 0.984375, + "0P82SO3E98ECMRNRS62D4.png": 0.9868421052631579, + "1AUTW1OL7IAPO1JH1TQUR.png": 0.9911504424778761, + "0WFIWI83FBAOLU16M27NL.png": 0.9939024390243902, + "1A9B6NHM7J57SCT1Z8TAS.png": 0.9859154929577465, + "0UU3AG1PSZ1H78B6J17PA.png": 0.9882352941176471, + "0UVW81GETVKT5GPM6ZX0S.png": 0.9803921568627451, + "0IKFXKSQ9OA3OCRGQBZFI.png": 0.9795918367346939, + "1C70JEJWPOAT1S8RUWCVB.png": 0.9803921568627451, + "0A7ZU26KX6C0LG0D3T3ZS.png": 0.9818181818181818, + "0BL58Q9DLPBQF73ROGFDX.png": 0.9937106918238994, + "0UQWQMAYVXUFY65GH4ION.png": 0.9836065573770492, + "0R77TU5P7A0F1YTLIGSOA.png": 0.9863013698630136, + "0Q740R8QE6ZAF034ZMGQG.png": 0.9917355371900827, + "1BG5K95UCWQ3JXWC501XA.png": 0.9926470588235294, + "1CPW27F5C8I03UQBVBL2Y.png": 0.9916666666666667 + } +} \ No newline at end of file diff --git a/scripts/benchmark_table/benchmark_table.py b/scripts/benchmark_table/benchmark_table.py index 9a50b4fa..dcdef2d4 100644 --- a/scripts/benchmark_table/benchmark_table.py +++ b/scripts/benchmark_table/benchmark_table.py @@ -2,6 +2,8 @@ from pathlib import Path import json import pprint +from typing import Optional, List + import numpy as np import wget @@ -14,14 +16,12 @@ path_result = Path(__file__).parent / ".." / ".." / "resources" / "benchmarks" path_result.absolute().mkdir(parents=True, exist_ok=True) -URL = "https://at.ispras.ru/owncloud/index.php/s/Xaf4OyHj6xN2RHH/download" - table_recognizer = TableRecognizer(config=get_config()) image_reader = PdfImageReader(config=get_config()) -def call_metric(pred_json: dict, true_json: dict, structure_only: bool = False) -> dict: - teds = TEDS(structure_only=structure_only) +def call_metric(pred_json: dict, true_json: dict, structure_only: bool = False, ignore_nodes: Optional[List] = None) -> dict: + teds = TEDS(structure_only=structure_only, ignore_nodes=ignore_nodes) scores = teds.batch_evaluate(pred_json, true_json) pp = pprint.PrettyPrinter() pp.pprint(scores) @@ -51,15 +51,14 @@ def make_predict_json(data_path: Path) -> dict: return predict_json -def download_dataset(data_dir: Path) -> None: - +def download_dataset(data_dir: Path, name_zip: str, url: str) -> None: if Path.exists(data_dir): print(f"Use cached benchmark data from {data_dir}") return data_dir.mkdir(parents=True, exist_ok=True) - pdfs_zip_path = data_dir / "benchmark_table_data.zip" - wget.download(URL, str(data_dir)) + pdfs_zip_path = data_dir / name_zip + wget.download(url, str(data_dir)) with zipfile.ZipFile(pdfs_zip_path, 'r') as zip_ref: zip_ref.extractall(data_dir) @@ -71,17 +70,19 @@ def download_dataset(data_dir: Path) -> None: def prediction(path_pred: Path, path_images: Path) -> dict: pred_json = make_predict_json(path_images) with path_pred.open("w") as fd: - json.dump(str(pred_json), fd, indent=2, ensure_ascii=False) + json.dump(pred_json, fd, indent=2, ensure_ascii=False) return pred_json -if __name__ == "__main__": +def benchmark_on_our_data(): data_dir = Path(get_config()["intermediate_data_path"]) / "benchmark_table_data" path_images = data_dir / "images" path_gt = data_dir / "gt.json" path_pred = data_dir / "pred.json" - download_dataset(data_dir) + download_dataset(data_dir, + name_zip="benchmark_table_data.zip", + url="https://at.ispras.ru/owncloud/index.php/s/Xaf4OyHj6xN2RHH/download") mode_metric_structure_only = False @@ -104,3 +105,62 @@ def prediction(path_pred: Path, path_images: Path) -> dict: file_result = path_result / "table_benchmark.json" with file_result.open("w") as fd: json.dump(result, fd, indent=2, ensure_ascii=False) + + +def benchmark_on_generated_table(): + """ + Generated data from https://github.com/hassan-mahmood/TIES_DataGeneration + Article generation information https://arxiv.org/pdf/1905.13391.pdf + Note: generate the 1st table tape category + Note: don't use header table tag
, replacing on tag + Note: all generated data (four categories) you can download from + TODO: some tables have a low quality. Should to trace the reason. + All generated data (all categories) we can download from https://at.ispras.ru/owncloud/index.php/s/cjpCIR7I0G4JzZU + """ + + data_dir = Path(get_config()["intermediate_data_path"]) / "visualizeimgs" / "category1" + path_images = data_dir / "img_500" + path_gt = data_dir / "html_500" + download_dataset(data_dir, + name_zip="benchmark_table_data_generated_500_tables_category_1.zip", + url="https://at.ispras.ru/owncloud/index.php/s/gItWxupnF2pve6B/download") + mode_metric_structure_only = True + + # make common ground-truth file + common_gt_json = {} + for pathname in Path.iterdir(path_gt): + image_name = pathname.name.split(".")[0] + '.png' + with open(pathname, "r") as fp: + table_html = fp.read() + # exclude header tags + table_html = table_html.replace("", "") + + common_gt_json[image_name] = {"html": table_html} + + file_common_gt = data_dir / "common_gt.json" + with file_common_gt.open("w") as fd: + json.dump(common_gt_json, fd, indent=2, ensure_ascii=False) + + # calculate metrics + path_pred = data_dir / "pred.json" + + pred_json = prediction(path_pred, path_images) + scores = call_metric(pred_json=pred_json, true_json=common_gt_json, + structure_only=mode_metric_structure_only, + ignore_nodes=['span', 'style', 'head', 'h4']) + + result = dict() + result["mode_metric_structure_only"] = mode_metric_structure_only + result["mean"] = np.mean([score for score in scores.values()]) + result["images"] = scores + + # save benchmarks + file_result = path_result / "table_benchmark_on_generated_data.json" + with file_result.open("w") as fd: + json.dump(result, fd, indent=2, ensure_ascii=False) + + +if __name__ == "__main__": + # benchmark_on_our_data() + benchmark_on_generated_table() diff --git a/scripts/benchmark_table/metric.py b/scripts/benchmark_table/metric.py index 1872b414..ff84a4a7 100644 --- a/scripts/benchmark_table/metric.py +++ b/scripts/benchmark_table/metric.py @@ -32,7 +32,7 @@ def __init__(self, tag, colspan=None, rowspan=None, content=None, visible=None, def bracket(self): """Show tree using brackets notation """ - if self.tag == "td": + if self.tag == "td" or self.tag == 'th': result = f'"tag": {self.tag}, "colspan": {self.colspan}, "rowspan": {self.rowspan}, "text": {self.content}' else: result = f'"tag": {self.tag}' @@ -89,6 +89,10 @@ def tokenize(self, node): if node.tag != "td" and node.tail is not None: self.__tokens__ += list(node.tail) + def get_span(self, node, name_span: str) -> int: + value = int(node.attrib.get(name_span, "1")) + return 1 if value <= 0 else value + def load_html_tree(self, node, parent=None): """ Converts HTML tree to the format required by apted """ @@ -102,8 +106,8 @@ def load_html_tree(self, node, parent=None): try: new_node = TableTree(tag=node.tag, - colspan=int(node.attrib.get("colspan", "1")), - rowspan=int(node.attrib.get("rowspan", "1")), + colspan=self.get_span(node, "colspan"), + rowspan=self.get_span(node, "rowspan"), content=cell, visible=False if node.attrib.get("style") == "display: none" else True, *deque()) except Exception as ex: From ca029ded7aeaeb5cde635a66958be67e19e10539 Mon Sep 17 00:00:00 2001 From: Belyaeva Oksana Date: Mon, 29 Jan 2024 13:19:31 +0300 Subject: [PATCH 5/5] TLDR-585 fixed after review --- scripts/benchmark_table/benchmark_table.py | 27 +++++++++++----------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/scripts/benchmark_table/benchmark_table.py b/scripts/benchmark_table/benchmark_table.py index dcdef2d4..c6cbd7cb 100644 --- a/scripts/benchmark_table/benchmark_table.py +++ b/scripts/benchmark_table/benchmark_table.py @@ -3,7 +3,6 @@ import json import pprint from typing import Optional, List - import numpy as np import wget @@ -19,6 +18,10 @@ table_recognizer = TableRecognizer(config=get_config()) image_reader = PdfImageReader(config=get_config()) +GENERATED_BENCHMARK = "on_generated_data" +OURDATA_BENCHMARK = "on_our_data" +TYPE_BENCHMARK = OURDATA_BENCHMARK + def call_metric(pred_json: dict, true_json: dict, structure_only: bool = False, ignore_nodes: Optional[List] = None) -> dict: teds = TEDS(structure_only=structure_only, ignore_nodes=ignore_nodes) @@ -75,7 +78,7 @@ def prediction(path_pred: Path, path_images: Path) -> dict: return pred_json -def benchmark_on_our_data(): +def benchmark_on_our_data() -> dict: data_dir = Path(get_config()["intermediate_data_path"]) / "benchmark_table_data" path_images = data_dir / "images" path_gt = data_dir / "gt.json" @@ -101,13 +104,10 @@ def benchmark_on_our_data(): result["mean"] = np.mean([score for score in scores.values()]) result["images"] = scores - # save benchmarks - file_result = path_result / "table_benchmark.json" - with file_result.open("w") as fd: - json.dump(result, fd, indent=2, ensure_ascii=False) + return result -def benchmark_on_generated_table(): +def benchmark_on_generated_table() -> dict: """ Generated data from https://github.com/hassan-mahmood/TIES_DataGeneration Article generation information https://arxiv.org/pdf/1905.13391.pdf @@ -155,12 +155,13 @@ def benchmark_on_generated_table(): result["mean"] = np.mean([score for score in scores.values()]) result["images"] = scores - # save benchmarks - file_result = path_result / "table_benchmark_on_generated_data.json" - with file_result.open("w") as fd: - json.dump(result, fd, indent=2, ensure_ascii=False) + return result if __name__ == "__main__": - # benchmark_on_our_data() - benchmark_on_generated_table() + result = benchmark_on_our_data() if TYPE_BENCHMARK == OURDATA_BENCHMARK else benchmark_on_generated_table() + + # save benchmarks + file_result = path_result / f"table_benchmark_{TYPE_BENCHMARK}.json" + with file_result.open("w") as fd: + json.dump(result, fd, indent=2, ensure_ascii=False)