From 35c1a52d4373c49778425ce2576b6a36d2cbced8 Mon Sep 17 00:00:00 2001
From: Belyaeva Oksana <belyaeva@ispras.ru>
Date: Mon, 22 Jan 2024 14:26:52 +0300
Subject: [PATCH 1/5] TLDR-585 added TEDS table benchmark

---
 dedoc/api/api_utils.py                      |   4 +-
 dedoc/readers/pdf_reader/pdf_base_reader.py |  13 +-
 resources/benchmarks/table_benchmark.json   |  15 ++
 scripts/benchmark_table/benchmark_table.py  | 114 ++++++++++++++
 scripts/benchmark_table/metric.py           | 158 ++++++++++++++++++++
 scripts/benchmark_table/requirements.txt    |   3 +
 6 files changed, 299 insertions(+), 8 deletions(-)
 create mode 100644 resources/benchmarks/table_benchmark.json
 create mode 100644 scripts/benchmark_table/benchmark_table.py
 create mode 100644 scripts/benchmark_table/metric.py
 create mode 100644 scripts/benchmark_table/requirements.txt
diff --git a/dedoc/api/api_utils.py b/dedoc/api/api_utils.py
index df8a1286..1287912d 100644
--- a/dedoc/api/api_utils.py
+++ b/dedoc/api/api_utils.py
@@ -133,7 +133,7 @@ def json2html(text: str, paragraph: TreeNode, tables: Optional[List[Table]], tab
     if tables is not None and len(tables) > 0:
         text += "<h3> Tables: </h3>"
         for table in tables:
-            text += __table2html(table, table2id)
+            text += table2html(table, table2id)
             text += "<p>&nbsp;</p>"
     return text
 
@@ -201,7 +201,7 @@ def __annotations2html(paragraph: TreeNode, table2id: Dict[str, int]) -> str:
     return text.replace("\n", "<br>")
 
 
-def __table2html(table: Table, table2id: Dict[str, int]) -> str:
+def table2html(table: Table, table2id: Dict[str, int]) -> str:
     uid = table.metadata.uid
     text = f"<h4> table {table2id[uid]}:</h4>"
     text += f'<table border="1" id={uid} style="border-collapse: collapse; width: 100%;">\n<tbody>\n'
diff --git a/dedoc/readers/pdf_reader/pdf_base_reader.py b/dedoc/readers/pdf_reader/pdf_base_reader.py
index d52e0d3c..ea869675 100644
--- a/dedoc/readers/pdf_reader/pdf_base_reader.py
+++ b/dedoc/readers/pdf_reader/pdf_base_reader.py
@@ -92,12 +92,7 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure
         )
 
         lines, scan_tables, attachments, warnings, other_fields = self._parse_document(file_path, params_for_parse)
-        tables = []
-        for scan_table in scan_tables:
-            metadata = TableMetadata(page_id=scan_table.page_number, uid=scan_table.name, rotated_angle=scan_table.location.rotated_angle)
-            cells_with_meta = [[CellWithMeta.create_from_cell(cell) for cell in row] for row in scan_table.matrix_cells]
-            table = Table(metadata=metadata, cells=cells_with_meta)
-            tables.append(table)
+        tables = [self.scantable2table(scan_table) for scan_table in scan_tables]
 
         if self._can_contain_attachements(file_path) and self.attachment_extractor.with_attachments(parameters):
             attachments += self.attachment_extractor.extract(file_path=file_path, parameters=parameters)
@@ -105,6 +100,12 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure
         result = UnstructuredDocument(lines=lines, tables=tables, attachments=attachments, warnings=warnings, metadata=other_fields)
         return self._postprocess(result)
 
+    @staticmethod
+    def scantable2table(table: ScanTable) -> Table:
+        metadata = TableMetadata(page_id=table.page_number, uid=table.name, rotated_angle=table.location.rotated_angle)
+        cells_with_meta = [[CellWithMeta.create_from_cell(cell) for cell in row] for row in table.matrix_cells]
+        return Table(metadata=metadata, cells=cells_with_meta)
+
     def _can_contain_attachements(self, path: str) -> bool:
         can_contain_attachments = False
         mime = get_file_mime_type(path)
diff --git a/resources/benchmarks/table_benchmark.json b/resources/benchmarks/table_benchmark.json
new file mode 100644
index 00000000..1ed4fef7
--- /dev/null
+++ b/resources/benchmarks/table_benchmark.json
@@ -0,0 +1,15 @@
+{
+  "mean": 0.9824606866114314,
+  "images": {
+    "example_with_table0_0.png": 0.9873417721518988,
+    "example_with_table0_1.png": 1.0,
+    "example_with_table6.png": 1.0,
+    "example_with_table4.jpg": 1.0,
+    "example_with_table17.jpg": 0.8536585365853658,
+    "example_with_table_hor_vert_union.png": 1.0,
+    "example_with_table1.png": 1.0,
+    "example_with_table_horizontal_union.jpg": 1.0,
+    "example_with_table3.png": 1.0,
+    "example_with_table5.png": 0.9836065573770492
+  }
+}
\ No newline at end of file
diff --git a/scripts/benchmark_table/benchmark_table.py b/scripts/benchmark_table/benchmark_table.py
new file mode 100644
index 00000000..c3292e87
--- /dev/null
+++ b/scripts/benchmark_table/benchmark_table.py
@@ -0,0 +1,114 @@
+import os
+import zipfile
+from pathlib import Path
+import json
+import pprint
+
+import numpy as np
+import wget
+
+from dedoc.api.api_utils import table2html
+from dedoc.config import get_config
+from dedoc.readers import PdfImageReader
+from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer import TableRecognizer
+from scripts.benchmark_table.metric import TEDS
+from tests.test_utils import get_test_config
+
+path_result = os.path.join(os.path.dirname(__file__), "..", "..", "resources", "benchmarks")
+path_result = os.path.abspath(path_result)
+os.makedirs(path_result, exist_ok=True)
+path_result = os.path.join(path_result, "table_benchmark.json")
+
+URL = "https://at.ispras.ru/owncloud/index.php/s/Xaf4OyHj6xN2RHH/download"
+
+table_recognizer = TableRecognizer(config=get_test_config())
+image_reader = PdfImageReader(config=get_test_config())
+teds = TEDS()
+
+
+def call_metric(pred_json: dict, true_json: dict) -> dict:
+    scores = teds.batch_evaluate(pred_json, true_json)
+    pp = pprint.PrettyPrinter()
+    pp.pprint(scores)
+
+    return scores
+
+
+def get_tables(image_path: str) -> str:
+    document = image_reader.read(image_path)
+
+    for table in document.tables:
+        table.metadata.uid = "test_id"
+    table2id = {"test_id": 0}
+    html_tables = [table2html(table, table2id) for table in document.tables]
+
+    # TODO: while works with one table in an image
+    return html_tables[0]
+
+
+def make_predict_json(data_path: Path) -> dict:
+    predict_json = {}
+    for filename in os.listdir(data_path):
+        print(filename)
+        file_path = str(data_path / filename)
+
+        predict_json[filename] = {"html": "<html><body>" + get_tables(file_path) + "</body></html>"}
+
+    return predict_json
+
+
+def download_dataset(data_dir: Path) -> None:
+
+    if not os.path.isdir(data_dir):
+        data_dir.mkdir(parents=True)
+        pdfs_zip_path = str(data_dir / "benchmark_table_data.zip")
+        wget.download(URL, pdfs_zip_path)
+
+        with zipfile.ZipFile(pdfs_zip_path, 'r') as zip_ref:
+            zip_ref.extractall(data_dir)
+        os.remove(pdfs_zip_path)
+
+        print(f"Benchmark data downloaded to {data_dir}")
+    else:
+        print(f"Use cached benchmark data from {data_dir}")
+
+
+def prediction(path_pred: Path, path_images: Path) -> dict:
+    pred_json = make_predict_json(path_images)
+    with open(path_pred, "w") as fd:
+        json.dump(pred_json, fd, indent=2, ensure_ascii=False)
+
+    return pred_json
+
+
+if __name__ == "__main__":
+    data_dir = Path(get_config()["intermediate_data_path"]) / "benchmark_table_data"
+    path_images = data_dir / "images"
+    path_gt = data_dir / "gt.json"
+    path_pred = data_dir / "pred.json"
+    download_dataset(data_dir)
+
+    with open(path_gt, "r") as fp:
+        gt_json = json.load(fp)
+    '''
+    Creating base html (based on method predictions for future labeling)
+    path_images = data_dir / "images_tmp"
+    pred_json = prediction("gt_tmp.json", path_images)
+    '''
+    pred_json = prediction(path_pred, path_images)   
+    scores = call_metric(pred_json=pred_json, true_json=gt_json)
+
+    result = dict()
+    result["mean"] = np.mean([score for score in scores.values()])
+    result["images"] = scores
+
+    # save benchmarks
+    with open(path_result, "w") as fd:
+        json.dump(result, fd, indent=2, ensure_ascii=False)
+
+
+
+
+
+
+
diff --git a/scripts/benchmark_table/metric.py b/scripts/benchmark_table/metric.py
new file mode 100644
index 00000000..bf3f6bc5
--- /dev/null
+++ b/scripts/benchmark_table/metric.py
@@ -0,0 +1,158 @@
+# Copyright 2020 IBM
+# Author: peter.zhong@au1.ibm.com
+#
+# This is free software; you can redistribute it and/or modify
+# it under the terms of the Apache 2.0 License.
+#
+# This software is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# Apache 2.0 License for more details.
+
+# Source: https://github.com/ibm-aur-nlp/PubTabNet
+
+import distance
+from apted import APTED, Config
+from apted.helpers import Tree
+from lxml import etree, html
+from collections import deque
+
+from tqdm import tqdm
+
+
+class TableTree(Tree):
+    def __init__(self, tag, colspan=None, rowspan=None, content=None, visible=None, *children):
+        self.tag = tag
+        self.colspan = colspan
+        self.rowspan = rowspan
+        self.content = content
+        self.visible = visible
+        self.children = list(children)
+
+    def bracket(self):
+        """Show tree using brackets notation"""
+        if self.tag == 'td':
+            result = '"tag": %s, "colspan": %d, "rowspan": %d, "text": %s' % \
+                     (self.tag, self.colspan, self.rowspan, self.content)
+        else:
+            result = '"tag": %s' % self.tag
+        for child in self.children:
+            result += child.bracket()
+        return "{{{}}}".format(result)
+
+
+class CustomConfig(Config):
+    @staticmethod
+    def maximum(*sequences):
+        """Get maximum possible value
+        """
+        return max(map(len, sequences))
+
+    def normalized_distance(self, *sequences) -> float:
+        """Get distance from 0 to 1
+        """
+        return float(distance.levenshtein(*sequences)) / self.maximum(*sequences)
+
+    def rename(self, node1: TableTree, node2: TableTree) -> float:
+        """Compares attributes of trees"""
+        if not node1.visible or node2.visible:
+            return 0.
+        if (node1.tag != node2.tag) or (node1.colspan != node2.colspan) or (node1.rowspan != node2.rowspan):
+            return 1.
+        if node1.tag == 'td':
+            if node1.content or node2.content:
+                return self.normalized_distance(node1.content, node2.content)
+        return 0.
+
+
+class TEDS(object):
+    ''' Tree Edit Distance basead Similarity
+    '''
+    def __init__(self, structure_only=False, n_jobs=1, ignore_nodes=None):
+        assert isinstance(n_jobs, int) and (n_jobs >= 1), 'n_jobs must be an integer greather than 1'
+        self.structure_only = structure_only
+        self.n_jobs = n_jobs
+        self.ignore_nodes = ignore_nodes
+        self.__tokens__ = []
+
+    def tokenize(self, node):
+        ''' Tokenizes table cells
+        '''
+        self.__tokens__.append('<%s>' % node.tag)
+        if node.text is not None:
+            self.__tokens__ += list(node.text)
+        for n in node.getchildren():
+            self.tokenize(n)
+        if node.tag != 'unk':
+            self.__tokens__.append('</%s>' % node.tag)
+        if node.tag != 'td' and node.tail is not None:
+            self.__tokens__ += list(node.tail)
+
+    def load_html_tree(self, node, parent=None):
+        ''' Converts HTML tree to the format required by apted
+        '''
+        global __tokens__
+        if node.tag == 'td':
+            if self.structure_only:
+                cell = []
+            else:
+                self.__tokens__ = []
+                self.tokenize(node)
+                cell = self.__tokens__[1:-1].copy()
+
+            try:
+                new_node = TableTree(tag=node.tag,
+                                     colspan=int(node.attrib.get('colspan', '1')),
+                                     rowspan=int(node.attrib.get('rowspan', '1')),
+                                     content=cell,
+                                     visible=False if node.attrib.get('style') == "display: none" else True, *deque())
+            except Exception as ex:
+                print(f"Bad html file. HTML parse exception. Exception's msg: {ex}")
+                raise ex
+        else:
+            new_node = TableTree(node.tag, None, None, None, True, *deque())
+        if parent is not None:
+            parent.children.append(new_node)
+        if node.tag != 'td':
+            for n in node.getchildren():
+                self.load_html_tree(n, new_node)
+        if parent is None:
+            return new_node
+
+    def evaluate(self, pred: str, true: str) -> float:
+        ''' Computes TEDS score between the prediction and the ground truth of a
+            given sample
+        '''
+        if (not pred) or (not true):
+            return 0.0
+        parser = html.HTMLParser(remove_comments=True, encoding='utf-8')
+        pred = html.fromstring(pred, parser=parser)
+        true = html.fromstring(true, parser=parser)
+        if pred.xpath('body/table') and true.xpath('body/table'):
+            pred = pred.xpath('body/table')[0]
+            true = true.xpath('body/table')[0]
+            if self.ignore_nodes:
+                etree.strip_tags(pred, *self.ignore_nodes)
+                etree.strip_tags(true, *self.ignore_nodes)
+            n_nodes_pred = len(pred.xpath(".//*"))
+            n_nodes_true = len(true.xpath(".//*"))
+            n_nodes = max(n_nodes_pred, n_nodes_true)
+            tree_pred = self.load_html_tree(pred)
+            tree_true = self.load_html_tree(true)
+
+            distance = APTED(tree_pred, tree_true, CustomConfig()).compute_edit_distance()
+            return 1.0 - (float(distance) / n_nodes)
+        else:
+            return 0.0
+
+    def batch_evaluate(self, pred_json, true_json):
+        ''' Computes TEDS score between the prediction and the ground truth of
+            a batch of samples
+            @params pred_json: {'FILENAME': 'HTML CODE', ...}
+            @params true_json: {'FILENAME': {'html': 'HTML CODE'}, ...}
+            @output: {'FILENAME': 'TEDS SCORE', ...}
+        '''
+        samples = true_json.keys()
+        scores = [self.evaluate(pred_json.get(filename, '')['html'], true_json[filename]['html']) for filename in tqdm(samples)]
+        scores = dict(zip(samples, scores))
+        return scores
diff --git a/scripts/benchmark_table/requirements.txt b/scripts/benchmark_table/requirements.txt
new file mode 100644
index 00000000..99314805
--- /dev/null
+++ b/scripts/benchmark_table/requirements.txt
@@ -0,0 +1,3 @@
+# for metric TEDS:
+apted==1.0.3
+distance==0.1.3
\ No newline at end of file

From 00cebfd21d17d68473629fb20a54bfd1f7708300 Mon Sep 17 00:00:00 2001
From: Belyaeva Oksana <belyaeva@ispras.ru>
Date: Tue, 23 Jan 2024 18:19:02 +0300
Subject: [PATCH 2/5] TLDR-585 fixed after review

---
 .../data_classes/tables/scantable.py          |  6 ++
 dedoc/readers/pdf_reader/pdf_base_reader.py   | 11 +---
 scripts/benchmark_table/benchmark_table.py    | 61 ++++++++-----------
 scripts/benchmark_table/metric.py             |  5 +-
 4 files changed, 34 insertions(+), 49 deletions(-)

diff --git a/dedoc/readers/pdf_reader/data_classes/tables/scantable.py b/dedoc/readers/pdf_reader/data_classes/tables/scantable.py
index c7c47fe1..4bc057df 100644
--- a/dedoc/readers/pdf_reader/data_classes/tables/scantable.py
+++ b/dedoc/readers/pdf_reader/data_classes/tables/scantable.py
@@ -5,6 +5,7 @@
 import numpy as np
 from dedocutils.data_structures import BBox
 
+from dedoc.data_structures import CellWithMeta, Table, TableMetadata
 from dedoc.readers.pdf_reader.data_classes.tables.cell import Cell
 from dedoc.readers.pdf_reader.data_classes.tables.location import Location
 
@@ -27,6 +28,11 @@ def extended(self, table: "ScanTable") -> None:
         # extend order
         self.order = max(self.order, table.order)
 
+    def to_table(self) -> Table:
+        metadata = TableMetadata(page_id=self.page_number, uid=self.name, rotated_angle=self.location.rotated_angle)
+        cells_with_meta = [[CellWithMeta.create_from_cell(cell) for cell in row] for row in self.matrix_cells]
+        return Table(metadata=metadata, cells=cells_with_meta)
+
     @staticmethod
     def get_cells_text(attr_cells: List[List[Cell]]) -> List[List[str]]:
         attrs = []
diff --git a/dedoc/readers/pdf_reader/pdf_base_reader.py b/dedoc/readers/pdf_reader/pdf_base_reader.py
index ea869675..fd6ed93b 100644
--- a/dedoc/readers/pdf_reader/pdf_base_reader.py
+++ b/dedoc/readers/pdf_reader/pdf_base_reader.py
@@ -13,10 +13,7 @@
 import dedoc.utils.parameter_utils as param_utils
 from dedoc.attachments_extractors.concrete_attachments_extractors.pdf_attachments_extractor import PDFAttachmentsExtractor
 from dedoc.common.exceptions.bad_file_error import BadFileFormatError
-from dedoc.data_structures.cell_with_meta import CellWithMeta
 from dedoc.data_structures.line_with_meta import LineWithMeta
-from dedoc.data_structures.table import Table
-from dedoc.data_structures.table_metadata import TableMetadata
 from dedoc.data_structures.unstructured_document import UnstructuredDocument
 from dedoc.extensions import recognized_extensions, recognized_mimes
 from dedoc.readers.base_reader import BaseReader
@@ -92,7 +89,7 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure
         )
 
         lines, scan_tables, attachments, warnings, other_fields = self._parse_document(file_path, params_for_parse)
-        tables = [self.scantable2table(scan_table) for scan_table in scan_tables]
+        tables = [scan_table.to_table() for scan_table in scan_tables]
 
         if self._can_contain_attachements(file_path) and self.attachment_extractor.with_attachments(parameters):
             attachments += self.attachment_extractor.extract(file_path=file_path, parameters=parameters)
@@ -100,12 +97,6 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure
         result = UnstructuredDocument(lines=lines, tables=tables, attachments=attachments, warnings=warnings, metadata=other_fields)
         return self._postprocess(result)
 
-    @staticmethod
-    def scantable2table(table: ScanTable) -> Table:
-        metadata = TableMetadata(page_id=table.page_number, uid=table.name, rotated_angle=table.location.rotated_angle)
-        cells_with_meta = [[CellWithMeta.create_from_cell(cell) for cell in row] for row in table.matrix_cells]
-        return Table(metadata=metadata, cells=cells_with_meta)
-
     def _can_contain_attachements(self, path: str) -> bool:
         can_contain_attachments = False
         mime = get_file_mime_type(path)
diff --git a/scripts/benchmark_table/benchmark_table.py b/scripts/benchmark_table/benchmark_table.py
index c3292e87..a1066d4b 100644
--- a/scripts/benchmark_table/benchmark_table.py
+++ b/scripts/benchmark_table/benchmark_table.py
@@ -1,9 +1,7 @@
-import os
 import zipfile
 from pathlib import Path
 import json
 import pprint
-
 import numpy as np
 import wget
 
@@ -12,17 +10,14 @@
 from dedoc.readers import PdfImageReader
 from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer import TableRecognizer
 from scripts.benchmark_table.metric import TEDS
-from tests.test_utils import get_test_config
 
-path_result = os.path.join(os.path.dirname(__file__), "..", "..", "resources", "benchmarks")
-path_result = os.path.abspath(path_result)
-os.makedirs(path_result, exist_ok=True)
-path_result = os.path.join(path_result, "table_benchmark.json")
+path_result = Path(__file__).parent / ".." / "resources" / "benchmarks"
+path_result.absolute().mkdir(parents=True, exist_ok=True)
 
 URL = "https://at.ispras.ru/owncloud/index.php/s/Xaf4OyHj6xN2RHH/download"
 
-table_recognizer = TableRecognizer(config=get_test_config())
-image_reader = PdfImageReader(config=get_test_config())
+table_recognizer = TableRecognizer(config=get_config())
+image_reader = PdfImageReader(config=get_config())
 teds = TEDS()
 
 
@@ -34,8 +29,8 @@ def call_metric(pred_json: dict, true_json: dict) -> dict:
     return scores
 
 
-def get_tables(image_path: str) -> str:
-    document = image_reader.read(image_path)
+def get_tables(image_path: Path) -> str:
+    document = image_reader.read(str(image_path))
 
     for table in document.tables:
         table.metadata.uid = "test_id"
@@ -48,35 +43,35 @@ def get_tables(image_path: str) -> str:
 
 def make_predict_json(data_path: Path) -> dict:
     predict_json = {}
-    for filename in os.listdir(data_path):
-        print(filename)
-        file_path = str(data_path / filename)
+    for pathname in Path.iterdir(data_path):
+        print(pathname)
 
-        predict_json[filename] = {"html": "<html><body>" + get_tables(file_path) + "</body></html>"}
+        predict_json[pathname.name] = {"html": "<html><body>" + get_tables(pathname) + "</body></html>"}
 
     return predict_json
 
 
 def download_dataset(data_dir: Path) -> None:
 
-    if not os.path.isdir(data_dir):
-        data_dir.mkdir(parents=True)
-        pdfs_zip_path = str(data_dir / "benchmark_table_data.zip")
-        wget.download(URL, pdfs_zip_path)
+    if Path.exists(data_dir):
+        print(f"Use cached benchmark data from {data_dir}")
+        return
 
-        with zipfile.ZipFile(pdfs_zip_path, 'r') as zip_ref:
-            zip_ref.extractall(data_dir)
-        os.remove(pdfs_zip_path)
+    data_dir.mkdir(parents=True, exist_ok=True)
+    pdfs_zip_path = data_dir / "benchmark_table_data.zip"
+    wget.download(URL, str(data_dir))
 
-        print(f"Benchmark data downloaded to {data_dir}")
-    else:
-        print(f"Use cached benchmark data from {data_dir}")
+    with zipfile.ZipFile(pdfs_zip_path, 'r') as zip_ref:
+        zip_ref.extractall(data_dir)
+    pdfs_zip_path.unlink()
+
+    print(f"Benchmark data downloaded to {data_dir}")
 
 
 def prediction(path_pred: Path, path_images: Path) -> dict:
     pred_json = make_predict_json(path_images)
-    with open(path_pred, "w") as fd:
-        json.dump(pred_json, fd, indent=2, ensure_ascii=False)
+    with path_pred.open("w") as fd:
+        json.dump(str(pred_json), fd, indent=2, ensure_ascii=False)
 
     return pred_json
 
@@ -103,12 +98,6 @@ def prediction(path_pred: Path, path_images: Path) -> dict:
     result["images"] = scores
 
     # save benchmarks
-    with open(path_result, "w") as fd:
-        json.dump(result, fd, indent=2, ensure_ascii=False)
-
-
-
-
-
-
-
+    file_result = path_result / "table_benchmark.json"
+    with file_result.open("w") as fd:
+        json.dump(str(file_result), fd, indent=2, ensure_ascii=False)
diff --git a/scripts/benchmark_table/metric.py b/scripts/benchmark_table/metric.py
index bf3f6bc5..ebf978cc 100644
--- a/scripts/benchmark_table/metric.py
+++ b/scripts/benchmark_table/metric.py
@@ -32,10 +32,9 @@ def __init__(self, tag, colspan=None, rowspan=None, content=None, visible=None,
     def bracket(self):
         """Show tree using brackets notation"""
         if self.tag == 'td':
-            result = '"tag": %s, "colspan": %d, "rowspan": %d, "text": %s' % \
-                     (self.tag, self.colspan, self.rowspan, self.content)
+            result = f'"tag": {self.tag}, "colspan": {self.colspan}, "rowspan": {self.rowspan}, "text": {self.content}'
         else:
-            result = '"tag": %s' % self.tag
+            result = f'"tag": {self.tag}'
         for child in self.children:
             result += child.bracket()
         return "{{{}}}".format(result)

From 8e7f86bcc3a0f5631928b7db200e167162dd2960 Mon Sep 17 00:00:00 2001
From: Belyaeva Oksana <belyaeva@ispras.ru>
Date: Wed, 24 Jan 2024 13:09:36 +0300
Subject: [PATCH 3/5] TLDR-585 fixed bug, include cells' content in metric

---
 resources/benchmarks/table_benchmark.json  | 23 ++++----
 scripts/benchmark_table/benchmark_table.py | 15 +++--
 scripts/benchmark_table/metric.py          | 66 +++++++++++-----------
 3 files changed, 54 insertions(+), 50 deletions(-)

diff --git a/resources/benchmarks/table_benchmark.json b/resources/benchmarks/table_benchmark.json
index 1ed4fef7..d7a9d7c6 100644
--- a/resources/benchmarks/table_benchmark.json
+++ b/resources/benchmarks/table_benchmark.json
@@ -1,15 +1,16 @@
 {
-  "mean": 0.9824606866114314,
+  "mode_metric_structure_only": false,
+  "mean": 0.9468374367023571,
   "images": {
-    "example_with_table0_0.png": 0.9873417721518988,
-    "example_with_table0_1.png": 1.0,
-    "example_with_table6.png": 1.0,
-    "example_with_table4.jpg": 1.0,
-    "example_with_table17.jpg": 0.8536585365853658,
-    "example_with_table_hor_vert_union.png": 1.0,
-    "example_with_table1.png": 1.0,
-    "example_with_table_horizontal_union.jpg": 1.0,
-    "example_with_table3.png": 1.0,
-    "example_with_table5.png": 0.9836065573770492
+    "example_with_table0_0.png": 0.9525583036909738,
+    "example_with_table0_1.png": 0.9264351862896008,
+    "example_with_table6.png": 0.989010989010989,
+    "example_with_table4.jpg": 0.908436211832951,
+    "example_with_table17.jpg": 0.8078952936402488,
+    "example_with_table_hor_vert_union.png": 0.9896091617933723,
+    "example_with_table1.png": 0.9781560283687943,
+    "example_with_table_horizontal_union.jpg": 0.9925757575757576,
+    "example_with_table3.png": 0.9778008866078716,
+    "example_with_table5.png": 0.9458965482130129
   }
 }
\ No newline at end of file
diff --git a/scripts/benchmark_table/benchmark_table.py b/scripts/benchmark_table/benchmark_table.py
index a1066d4b..9a50b4fa 100644
--- a/scripts/benchmark_table/benchmark_table.py
+++ b/scripts/benchmark_table/benchmark_table.py
@@ -11,17 +11,17 @@
 from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer import TableRecognizer
 from scripts.benchmark_table.metric import TEDS
 
-path_result = Path(__file__).parent / ".." / "resources" / "benchmarks"
+path_result = Path(__file__).parent / ".." / ".." / "resources" / "benchmarks"
 path_result.absolute().mkdir(parents=True, exist_ok=True)
 
 URL = "https://at.ispras.ru/owncloud/index.php/s/Xaf4OyHj6xN2RHH/download"
 
 table_recognizer = TableRecognizer(config=get_config())
 image_reader = PdfImageReader(config=get_config())
-teds = TEDS()
 
 
-def call_metric(pred_json: dict, true_json: dict) -> dict:
+def call_metric(pred_json: dict, true_json: dict, structure_only: bool = False) -> dict:
+    teds = TEDS(structure_only=structure_only)
     scores = teds.batch_evaluate(pred_json, true_json)
     pp = pprint.PrettyPrinter()
     pp.pprint(scores)
@@ -83,6 +83,8 @@ def prediction(path_pred: Path, path_images: Path) -> dict:
     path_pred = data_dir / "pred.json"
     download_dataset(data_dir)
 
+    mode_metric_structure_only = False
+
     with open(path_gt, "r") as fp:
         gt_json = json.load(fp)
     '''
@@ -90,14 +92,15 @@ def prediction(path_pred: Path, path_images: Path) -> dict:
     path_images = data_dir / "images_tmp"
     pred_json = prediction("gt_tmp.json", path_images)
     '''
-    pred_json = prediction(path_pred, path_images)   
-    scores = call_metric(pred_json=pred_json, true_json=gt_json)
+    pred_json = prediction(path_pred, path_images)
+    scores = call_metric(pred_json=pred_json, true_json=gt_json, structure_only=mode_metric_structure_only)
 
     result = dict()
+    result["mode_metric_structure_only"] = mode_metric_structure_only
     result["mean"] = np.mean([score for score in scores.values()])
     result["images"] = scores
 
     # save benchmarks
     file_result = path_result / "table_benchmark.json"
     with file_result.open("w") as fd:
-        json.dump(str(file_result), fd, indent=2, ensure_ascii=False)
+        json.dump(result, fd, indent=2, ensure_ascii=False)
diff --git a/scripts/benchmark_table/metric.py b/scripts/benchmark_table/metric.py
index ebf978cc..1872b414 100644
--- a/scripts/benchmark_table/metric.py
+++ b/scripts/benchmark_table/metric.py
@@ -30,14 +30,15 @@ def __init__(self, tag, colspan=None, rowspan=None, content=None, visible=None,
         self.children = list(children)
 
     def bracket(self):
-        """Show tree using brackets notation"""
-        if self.tag == 'td':
+        """Show tree using brackets notation
+        """
+        if self.tag == "td":
             result = f'"tag": {self.tag}, "colspan": {self.colspan}, "rowspan": {self.rowspan}, "text": {self.content}'
         else:
             result = f'"tag": {self.tag}'
         for child in self.children:
             result += child.bracket()
-        return "{{{}}}".format(result)
+        return "{{" + result + "}}"
 
 
 class CustomConfig(Config):
@@ -54,44 +55,44 @@ def normalized_distance(self, *sequences) -> float:
 
     def rename(self, node1: TableTree, node2: TableTree) -> float:
         """Compares attributes of trees"""
-        if not node1.visible or node2.visible:
-            return 0.
         if (node1.tag != node2.tag) or (node1.colspan != node2.colspan) or (node1.rowspan != node2.rowspan):
             return 1.
-        if node1.tag == 'td':
+        if node1.tag == "td":
+            if not node1.visible or not node2.visible:
+                return 0.
             if node1.content or node2.content:
                 return self.normalized_distance(node1.content, node2.content)
         return 0.
 
 
 class TEDS(object):
-    ''' Tree Edit Distance basead Similarity
-    '''
+    """ Tree Edit Distance based Similarity
+    """
+
     def __init__(self, structure_only=False, n_jobs=1, ignore_nodes=None):
-        assert isinstance(n_jobs, int) and (n_jobs >= 1), 'n_jobs must be an integer greather than 1'
+        assert isinstance(n_jobs, int) and (n_jobs >= 1), "n_jobs must be an integer greather than 1"
         self.structure_only = structure_only
         self.n_jobs = n_jobs
         self.ignore_nodes = ignore_nodes
         self.__tokens__ = []
 
     def tokenize(self, node):
-        ''' Tokenizes table cells
-        '''
-        self.__tokens__.append('<%s>' % node.tag)
+        """ Tokenizes table cells
+        """
+        self.__tokens__.append(f"<{node.tag}>")
         if node.text is not None:
             self.__tokens__ += list(node.text)
         for n in node.getchildren():
             self.tokenize(n)
-        if node.tag != 'unk':
-            self.__tokens__.append('</%s>' % node.tag)
-        if node.tag != 'td' and node.tail is not None:
+        if node.tag != "unk":
+            self.__tokens__.append(f"</{node.tag}>")
+        if node.tag != "td" and node.tail is not None:
             self.__tokens__ += list(node.tail)
 
     def load_html_tree(self, node, parent=None):
-        ''' Converts HTML tree to the format required by apted
-        '''
-        global __tokens__
-        if node.tag == 'td':
+        """ Converts HTML tree to the format required by apted
+        """
+        if node.tag == "td":
             if self.structure_only:
                 cell = []
             else:
@@ -101,10 +102,10 @@ def load_html_tree(self, node, parent=None):
 
             try:
                 new_node = TableTree(tag=node.tag,
-                                     colspan=int(node.attrib.get('colspan', '1')),
-                                     rowspan=int(node.attrib.get('rowspan', '1')),
+                                     colspan=int(node.attrib.get("colspan", "1")),
+                                     rowspan=int(node.attrib.get("rowspan", "1")),
                                      content=cell,
-                                     visible=False if node.attrib.get('style') == "display: none" else True, *deque())
+                                     visible=False if node.attrib.get("style") == "display: none" else True, *deque())
             except Exception as ex:
                 print(f"Bad html file. HTML parse exception. Exception's msg: {ex}")
                 raise ex
@@ -112,24 +113,23 @@ def load_html_tree(self, node, parent=None):
             new_node = TableTree(node.tag, None, None, None, True, *deque())
         if parent is not None:
             parent.children.append(new_node)
-        if node.tag != 'td':
+        if node.tag != "td":
             for n in node.getchildren():
                 self.load_html_tree(n, new_node)
         if parent is None:
             return new_node
 
     def evaluate(self, pred: str, true: str) -> float:
-        ''' Computes TEDS score between the prediction and the ground truth of a
-            given sample
-        '''
+        """ Computes TEDS score between the prediction and the ground truth of a given sample
+        """
         if (not pred) or (not true):
             return 0.0
-        parser = html.HTMLParser(remove_comments=True, encoding='utf-8')
+        parser = html.HTMLParser(remove_comments=True, encoding="utf-8")
         pred = html.fromstring(pred, parser=parser)
         true = html.fromstring(true, parser=parser)
-        if pred.xpath('body/table') and true.xpath('body/table'):
-            pred = pred.xpath('body/table')[0]
-            true = true.xpath('body/table')[0]
+        if pred.xpath("body/table") and true.xpath("body/table"):
+            pred = pred.xpath("body/table")[0]
+            true = true.xpath("body/table")[0]
             if self.ignore_nodes:
                 etree.strip_tags(pred, *self.ignore_nodes)
                 etree.strip_tags(true, *self.ignore_nodes)
@@ -145,13 +145,13 @@ def evaluate(self, pred: str, true: str) -> float:
             return 0.0
 
     def batch_evaluate(self, pred_json, true_json):
-        ''' Computes TEDS score between the prediction and the ground truth of
+        """ Computes TEDS score between the prediction and the ground truth of
             a batch of samples
             @params pred_json: {'FILENAME': 'HTML CODE', ...}
             @params true_json: {'FILENAME': {'html': 'HTML CODE'}, ...}
             @output: {'FILENAME': 'TEDS SCORE', ...}
-        '''
+        """
         samples = true_json.keys()
-        scores = [self.evaluate(pred_json.get(filename, '')['html'], true_json[filename]['html']) for filename in tqdm(samples)]
+        scores = [self.evaluate(pred_json.get(filename, "")["html"], true_json[filename]["html"]) for filename in tqdm(samples)]
         scores = dict(zip(samples, scores))
         return scores

From 764235505e328bf1bfdeefc66d6cfb2c42af3442 Mon Sep 17 00:00:00 2001
From: Belyaeva Oksana <belyaeva@ispras.ru>
Date: Fri, 26 Jan 2024 13:13:49 +0300
Subject: [PATCH 4/5] TLDR-591 added table generation benchmark

---
 .../table_benchmark_on_generated_data.json    | 506 ++++++++++++++++++
 scripts/benchmark_table/benchmark_table.py    |  82 ++-
 scripts/benchmark_table/metric.py             |  10 +-
 3 files changed, 584 insertions(+), 14 deletions(-)
 create mode 100644 resources/benchmarks/table_benchmark_on_generated_data.json

diff --git a/resources/benchmarks/table_benchmark_on_generated_data.json b/resources/benchmarks/table_benchmark_on_generated_data.json
new file mode 100644
index 00000000..130bcd28
--- /dev/null
+++ b/resources/benchmarks/table_benchmark_on_generated_data.json
@@ -0,0 +1,506 @@
+{
+  "mode_metric_structure_only": true,
+  "mean": 0.9467889492889642,
+  "images": {
+    "0OEG7D5CXUSXDNEXAZ8A3.png": 0.993103448275862,
+    "0IS8OPRTM71QYN821WA5S.png": 0.9878048780487805,
+    "0KX1D4AGMTM3EWR0EF0A5.png": 0.989010989010989,
+    "0QBK1U71YOHBG5Z23MT7E.png": 0.9916666666666667,
+    "0DC57AS1OYZ1BRHZHPIO2.png": 0.96,
+    "0GJE73OG32H2P2SL2AI2J.png": 0.9905660377358491,
+    "0GYAQKWTI3LN6DNZFM2TZ.png": 0.9904761904761905,
+    "0GHKLO6LOH5LBTYEUND3S.png": 0.9917355371900827,
+    "0F831FOUA10K3594FG4IM.png": 0.9896907216494846,
+    "0XG0I2F0MMZ3QMXWLWFMX.png": 0.9935064935064936,
+    "0WMTO9U10ILEB9HCX4C0B.png": 0.9863013698630136,
+    "0C1ZYGFL2YNFM2W3P2KN1.png": 0.9795918367346939,
+    "0A4G5JAZSJS4BT5LBZ2Q3.png": 0.9850746268656716,
+    "0I75SMSDR5JSJXF07PN6J.png": 0.9915966386554622,
+    "0K9EAAIYXSUT80SYF3ML4.png": 0.9836065573770492,
+    "0PH78O2B9CJAM6MMINZXT.png": 0.9876543209876543,
+    "1BRZ4ALOZMMEXGR4AVJWG.png": 0.9876543209876543,
+    "0AFVW6AL3EH9H76ONNDYF.png": 0.9848484848484849,
+    "0EVCQHN9C65AUYG1UAN3C.png": 0.9696969696969697,
+    "0DD9D0ILAPJIH77GEVRGP.png": 0.4098360655737705,
+    "1BHU2JO8ODKS3OL4RIU6A.png": 0.9905660377358491,
+    "0OG2AZLHJPMBX43O2O9LR.png": 0.989010989010989,
+    "0M64SMZT9HTN6LXQ4M24T.png": 0.9846153846153847,
+    "0TY12X0C3U2BPZC81PW66.png": 0.9836065573770492,
+    "0W109P7LI6B5HIYM3SJ5A.png": 0.9873417721518988,
+    "0H6272E6S2YUDJWBSWKQN.png": 0.9873417721518988,
+    "0ZRX97WSSVCVQ3NJ5959P.png": 0.9873417721518988,
+    "0ZFX4HDI3O7YQFDYRRYKI.png": 0.9722222222222222,
+    "1A4SDGAXB66WDBW7OUH58.png": 0.9876543209876543,
+    "0HQVUJMOQRQQ5FIP4PMZF.png": 0.9859154929577465,
+    "0Q335MQBC8UJJMASJUNWZ.png": 0.9938271604938271,
+    "0AF02R419WL1YN97ZV144.png": 0.9868421052631579,
+    "0JWOGY4C0KQ14J958GLYD.png": 0.9922480620155039,
+    "0JCVUE03Y5YD8A45IOIA5.png": 0.972972972972973,
+    "0R10PSLELMJ0SPFCXX92A.png": 0.9814814814814815,
+    "0BF411IVR1HLU1Q44I3K7.png": 0.990990990990991,
+    "0K9C1HJZ8K3L6CRAQ6VCW.png": 0.9850746268656716,
+    "0Q2MRICBMAFRV1GRRR5TA.png": 0.5189873417721519,
+    "0ZK44UG99IWIPKRSCOSJV.png": 0.9795918367346939,
+    "0S09D3ZPVQ8YOT55XIOE1.png": 0.9887640449438202,
+    "0KRLZUD3DQAU1DYDU99ZH.png": 0.9882352941176471,
+    "1ATIOLLN3DOAHKX75560Q.png": 0.5460992907801419,
+    "0KSBBUINDNN16F2ZLQHV4.png": 0.9915254237288136,
+    "0P5IE8XH9BN2EGC0DX27Z.png": 0.989010989010989,
+    "0GJ88Q9SMUOWF3WILKG14.png": 0.98,
+    "0QYA242XOQ0Y9078UC7NI.png": 0.984375,
+    "0F4Z8B4S5RV008LHJBW8S.png": 0.9896907216494846,
+    "0GWJH40B21AJBR1F73FXI.png": 0.9722222222222222,
+    "0ZO44O69QHTV62QJ3X9KH.png": 0.9883720930232558,
+    "0VK3KLUJVLAB9SRQDN6EJ.png": 0.45622119815668205,
+    "0HQHS3BO0IIOJ5L2EP2H4.png": 0.9882352941176471,
+    "1DBQ2M6XQ66Y2895PYNOM.png": 0.9824561403508771,
+    "0A4OYW3ZL5QP76IGF0DK0.png": 0.9836065573770492,
+    "1AFVW6AL3EH9H76ONNDYF.png": 0.9863013698630136,
+    "0J0JQM9WD7B0RCNKWBC5S.png": 0.972972972972973,
+    "0OVSLM3WAA36TZQCOL1WS.png": 0.9821428571428571,
+    "0GMEN2MGE7HN3ROOZQ5YD.png": 0.9879518072289156,
+    "0AJLKOKRHEVOTGE90GEH6.png": 0.9922480620155039,
+    "0DUX4YKT5JYJO3Z573OG8.png": 0.967741935483871,
+    "0A4SDGAXB66WDBW7OUH58.png": 0.9871794871794872,
+    "0G5S5CXGRLABEYII4QG2Z.png": 0.9887640449438202,
+    "0YM92E2EEDDGHAUW2YZ8Q.png": 0.9863013698630136,
+    "0F0E32N4VR4Q9960I0DB8.png": 0.975609756097561,
+    "0JNQPLSGKLPQ0UAAFYL5T.png": 0.967741935483871,
+    "0YL4VFF3LUUQITLVU3U9V.png": 0.9902912621359223,
+    "0ZO2Z3XCHZLB43ARH68WS.png": 0.9803921568627451,
+    "0SHL8BKLII1AGBZ1SEB4U.png": 0.9876543209876543,
+    "0KBLEG9N1SBX956ZCIP5I.png": 0.9795918367346939,
+    "1B1QX4K8U8P9QA3HVLRPN.png": 0.9896907216494846,
+    "0D0ZG3O9YHMQAPHCD0890.png": 0.9924242424242424,
+    "0PTMPFGYNVJWO6FCX1QRZ.png": 0.9933333333333333,
+    "0A7ZA5BA5TPHBN2WP6TT9.png": 0.9904761904761905,
+    "0DE9UIIVMYH3UK0SYFVUG.png": 0.9767441860465116,
+    "0U0LFAJATVD9YEC1Z3497.png": 0.9767441860465116,
+    "1A3YX0911ULBZSCUBNDZS.png": 0.9929577464788732,
+    "0WRQRWHH2CMV2L4CE3SN9.png": 0.9878048780487805,
+    "0C1X00FENSOUN2Y08Y3JT.png": 0.9824561403508771,
+    "0ONRU7A4SU4WAUWF25FRP.png": 0.9878048780487805,
+    "1DD9D0ILAPJIH77GEVRGP.png": 0.9824561403508771,
+    "0CBFM7HG55Z7O8F4Y0O0L.png": 0.9905660377358491,
+    "0C9EM94JJTICVGS6U2T2U.png": 0.9917355371900827,
+    "0KK57808VO3HNS1AW4CJO.png": 0.9767441860465116,
+    "0E7XFLPH56MT23HNK3MZ6.png": 0.9886363636363636,
+    "0J818KH6HIIA83D74FXS3.png": 0.9887640449438202,
+    "0TLMD42BW0F4NSD9PG19X.png": 0.984375,
+    "0PSN5QFZWTPA9U05O7MZ3.png": 0.99,
+    "0GI0JNFJAOXK5OJKRXCND.png": 0.9891304347826086,
+    "0Z4X1LVZ1K4NE2RR8P7EA.png": 0.9859154929577465,
+    "0S709DW5AZF9VPCPMVHXB.png": 0.9891304347826086,
+    "1B87OEX5XX0BHUOQAS50A.png": 0.9767441860465116,
+    "1AAERNSDA06GDA7OFZVCA.png": 0.9850746268656716,
+    "1CORAY089OILX2OWIKU1E.png": 0.98,
+    "0WO86MK2DC2EZUZLSMFA1.png": 0.9911504424778761,
+    "0KNUPYHEXZYSW1TNZ6I7L.png": 0.9846153846153847,
+    "0ONF59OAQYX89LAM941E6.png": 0.9859154929577465,
+    "0E1IVAEMQXKVCH3Q0JCVX.png": 0.9782608695652174,
+    "0K8YJZK75V8SXL0GIM4SU.png": 0.9896907216494846,
+    "0LUL2CVQ1HLC1KL6D2VMP.png": 0.4065934065934066,
+    "0WIEGQEF4G9LN2UM49Y12.png": 0.9876543209876543,
+    "0AQ9EL10BYBSGJO2RLC6Q.png": 0.9888888888888889,
+    "0M0WYXRJONRUQ3ZG24MJJ.png": 0.989010989010989,
+    "0E3XQJO1C4CKR9TNFB4IC.png": 0.9871794871794872,
+    "0H17CYXGJTHXPQUP51TBI.png": 0.9911504424778761,
+    "0NK736IIIHGBF52E1UKQ4.png": 0.9859154929577465,
+    "0DYIHMLOKOR6HNF2XAI8F.png": 0.9836065573770492,
+    "0BZ5GZPTUSCNBNGBNQZEG.png": 0.9859154929577465,
+    "0M47PMX0DRIVKCJBYKHPJ.png": 0.9767441860465116,
+    "00MK8C41M7MW013CJ9SPU.png": 0.9922480620155039,
+    "0DMXCT01TPF8O33UMENE4.png": 0.9917355371900827,
+    "0WS9VI6T1X0M5H6D8O67Z.png": 0.9859154929577465,
+    "0XQ9XQOL15RDKQT4YZUQC.png": 0.5739130434782609,
+    "1DD8FWYLADAY5EJ3UZUD9.png": 0.9876543209876543,
+    "0MXPSYD5A5U86BSSZQMJN.png": 0.975609756097561,
+    "0QAOLXSIIRIRQ3W1OP7Y8.png": 0.9921259842519685,
+    "1BZ5GZPTUSCNBNGBNQZEG.png": 0.9767441860465116,
+    "0Y5AIJNHB8DTPQOC92X6P.png": 0.9882352941176471,
+    "0IL3BP1QRAZ54V54IBK9A.png": 0.9876543209876543,
+    "0MDCUYD9ASW4AGWD3ZYK5.png": 0.9891304347826086,
+    "0MON88TOR16AGTBLDTGJC.png": 0.9904761904761905,
+    "0QVCHWR0EZCMQ5J5P0Z1J.png": 0.9767441860465116,
+    "0IPJ09DW34Q275Z5CMS1X.png": 0.99,
+    "1A7ZU26KX6C0LG0D3T3ZS.png": 0.9863013698630136,
+    "0M49YEV7H4P48EONCBFPS.png": 0.9863013698630136,
+    "1D34PI1NNCV0AB4WCQMB3.png": 0.9863013698630136,
+    "0AGYYXV88WJW2FC6FVV3Q.png": 0.9863013698630136,
+    "0F9W69ODT3GQCQ6F11L2E.png": 0.9767441860465116,
+    "0Q3RJT1DJMPO9D9BE6JNO.png": 0.9868421052631579,
+    "0ETQJY2HRGYIBO46BSD3P.png": 0.4503105590062112,
+    "0BRZ4ALOZMMEXGR4AVJWG.png": 0.43983402489626555,
+    "0WLG2ZXPFXZGF9RM2Z6N6.png": 0.9871794871794872,
+    "1BP5KU2XHXZ0C431B4OL9.png": 0.43450479233226835,
+    "00ZG4J0UMAHQMR57DQ5T7.png": 0.9818181818181818,
+    "0S5HD36LFVDWLLH6UFK9I.png": 0.9939759036144579,
+    "0EW4PZW85MH9BS8VI83KZ.png": 0.9848484848484849,
+    "0EMFKQLMGGAFPLQGUEZSJ.png": 0.96,
+    "0H4TWDI39J0HRG239GQ10.png": 0.9938650306748467,
+    "0BSXNNN0LA94101P5D38I.png": 0.9882352941176471,
+    "0SLVZSD9X7VZPGQU0Q2QN.png": 0.9850746268656716,
+    "0K6WPSDJC0ICOWFEASYB4.png": 0.9911504424778761,
+    "0TY3MTJ6YZDE6QI73SH5A.png": 0.9859154929577465,
+    "0B87OEX5XX0BHUOQAS50A.png": 0.9896907216494846,
+    "1C1X00FENSOUN2Y08Y3JT.png": 0.967741935483871,
+    "0KEM29NIZZ7UI3CTN6NEA.png": 0.9896907216494846,
+    "0JZQMX95783K8QW3ERXSM.png": 0.8827586206896552,
+    "0R47TY8TMFAL346RUY0LW.png": 0.9696969696969697,
+    "0EG83QLMPW7MGGMGBYGPD.png": 0.9882352941176471,
+    "1AGYYXV88WJW2FC6FVV3Q.png": 0.9927536231884058,
+    "0OF74SYX6Q102JCQ5KELF.png": 0.9896907216494846,
+    "1BX1I2HS6BLV92NZHV6J1.png": 0.9940828402366864,
+    "1DDEMI2034QD7F4QRH1IV.png": 0.972972972972973,
+    "0Z8LGXZ1SMLBHV5T6Y4O9.png": 0.9859154929577465,
+    "0NGE5XRBD2YHBZFMDL7VD.png": 0.9795918367346939,
+    "0SX4TWDHV25DCZV3HQEHH.png": 0.9777777777777777,
+    "00Q04QLVCESVWCSMDAURN.png": 0.9855072463768116,
+    "0SK696SAQW3MZNDMD4W85.png": 0.984375,
+    "0F4WBFLG32FAT22W0NGEY.png": 0.45871559633027525,
+    "0TNFF3RUQ2UL3PRNYF45M.png": 0.9868421052631579,
+    "1A4OYW3ZL5QP76IGF0DK0.png": 0.9904761904761905,
+    "0IUNRRJ3JHMEAORR2EXRS.png": 0.9908256880733946,
+    "0L764EQB3ZGC3FYQ20PR9.png": 0.9863013698630136,
+    "0XZJ4SZWY0ZOD9QBZP96A.png": 0.9922480620155039,
+    "0PU3J7NYVCB6XLSJJOEZ9.png": 0.9911504424778761,
+    "0DDKIN1PFJQTFW1JADVHT.png": 0.9863013698630136,
+    "1A4G5JAZSJS4BT5LBZ2Q3.png": 0.9836065573770492,
+    "0DPMX3BRIG9CWZPYKXFWS.png": 0.9921259842519685,
+    "0N7P792721CFI8EDOCB0N.png": 0.9908256880733946,
+    "1BIC4PMO7M3ZB8WUC3STJ.png": 0.9933333333333333,
+    "0XNBY82W4NFSD9GV6ONKU.png": 0.9911504424778761,
+    "0F3P8XGEMBYESYCYAOQPN.png": 0.9923076923076923,
+    "00D983SP0WHF6YGMKSHCR.png": 0.9803921568627451,
+    "0N91H0ZWMHBPPPPON4HUW.png": 0.993006993006993,
+    "0AWZPWR198XN7U8HY1E32.png": 0.9836065573770492,
+    "0S0Z9J05KZWNPKUFRD78Z.png": 0.9927007299270073,
+    "0FCDXM7JS1QEBBY3DCGBM.png": 0.9795918367346939,
+    "0OEVVJNLZKKW7GOPM188W.png": 0.9615384615384616,
+    "0Z7FUMCO707ZDI55EG306.png": 0.9878048780487805,
+    "0DE4P4M2855D754NA8993.png": 0.9722222222222222,
+    "0UZ81HSUQSHVVGU56NIOG.png": 0.9902912621359223,
+    "0AAERNSDA06GDA7OFZVCA.png": 0.992,
+    "0WZXI1YECN77S9GD6GQ4M.png": 0.98989898989899,
+    "0S8HOU13AW544ALTKAB73.png": 0.989010989010989,
+    "0AAPDAAK73MRINE7PM0ZJ.png": 0.41628959276018096,
+    "0UJ2AFVE6RWGTYSB6DKLJ.png": 0.45871559633027525,
+    "0ISYQEE43TA3O41XMA47A.png": 0.993103448275862,
+    "0L2E8S3ICCMGPE9PS3RLV.png": 0.9908256880733946,
+    "0BIC4PMO7M3ZB8WUC3STJ.png": 0.984375,
+    "0CTFYQFHQ1S1FLIEAPZTB.png": 0.9767441860465116,
+    "0A9RJA2I3YJT58JR2MEOT.png": 0.9818181818181818,
+    "0UGHOJ96BTPB57BR0DJS7.png": 0.4505494505494505,
+    "0TW35WW1PRLL2YKVYWYRM.png": 0.9818181818181818,
+    "0HTO45RT9NH5KQUCLOV2H.png": 0.9722222222222222,
+    "0F0TA5W8GO31TXUFMHHTO.png": 0.98989898989899,
+    "0HWMSCT6L3MCGFJV4OXF8.png": 0.975609756097561,
+    "0KCIUQNXNE3ZMX5ECY7V3.png": 0.9925925925925926,
+    "0ET4I24PZATQRKGMGG5KC.png": 0.975609756097561,
+    "0I6WVEL7V26O3KJJ1GGYF.png": 0.9896907216494846,
+    "1A2AT7TW5KOMUUAK7TQXT.png": 0.9767441860465116,
+    "0W8NNJL30MNEY6RTPD6DA.png": 0.9767441860465116,
+    "0XC0XOHP855H9DFG41W9T.png": 0.9803921568627451,
+    "1DC57AS1OYZ1BRHZHPIO2.png": 0.9903846153846154,
+    "00WVVGSQ00B0IZU4OKPHQ.png": 0.9916666666666667,
+    "0RTI5C20W407SL59RANEM.png": 0.991304347826087,
+    "0A0DA327P9Y532UTLHE2N.png": 0.9722222222222222,
+    "0AQZMEU4Q38NKK4USHAC5.png": 0.9896907216494846,
+    "0U7602J86XPC7AVTSPMWL.png": 0.9878048780487805,
+    "0DSGAEKSK52RUNGEOGEXP.png": 0.9921259842519685,
+    "0JD5R5NDJKRRHT1UI6GFW.png": 0.7058823529411764,
+    "0PQ9OK98A29AC6GEI3DKQ.png": 0.4882352941176471,
+    "0TA7SVAQC7PKDE8BUP3NF.png": 0.9887640449438202,
+    "0TQM47CA0F30LG2C0S2KN.png": 0.9887640449438202,
+    "0HA1FE8828DJ86ZIJUIX4.png": 0.967741935483871,
+    "0P1Y0C88Y17DSXE616MQN.png": 0.98989898989899,
+    "0I6GDDWCTMF9V4YLGLBIM.png": 0.4036697247706422,
+    "0SI6DA6CAXUMFYSXBXIF6.png": 0.9906542056074766,
+    "0NM9CUQJV6W2N9434O81D.png": 0.9859154929577465,
+    "0WU8XJP1VJSLZXQ7S43HM.png": 0.9767441860465116,
+    "0P0WR7JJ9JBXO0HVMDETS.png": 0.975609756097561,
+    "0ZNTZMWW1X0QZV4AGDHYL.png": 0.9926470588235294,
+    "0C98HOE9TQ4HZK6DKGF5I.png": 0.989010989010989,
+    "0JCDZWWAMUR9FRGHL9IVN.png": 0.9911504424778761,
+    "0PVN50SJP1LUTHE2TID60.png": 0.9926470588235294,
+    "0D7CMRTBBENLYDO7EWWVZ.png": 0.98,
+    "0JOTZX26K6UJB6LNVK9RH.png": 0.975609756097561,
+    "0ZFOZ6UKG7DCCD5HSUIIX.png": 0.9876543209876543,
+    "0L7V0ZXS2M9JMSBD05I25.png": 0.9873417721518988,
+    "0G1E97R3QFH7FG9AUAIFB.png": 0.9863013698630136,
+    "0CORAY089OILX2OWIKU1E.png": 0.984375,
+    "0EH9JARAL7RYD3CVMM8AZ.png": 0.5185185185185186,
+    "00KDBG5H22KPNCPCK7L2P.png": 0.9848484848484849,
+    "00XJ5C1RWIRVID9IPUX8G.png": 0.8,
+    "0FFJM5ABUDDCT2DOCW2T4.png": 0.9916666666666667,
+    "0D34PI1NNCV0AB4WCQMB3.png": 0.9896907216494846,
+    "0X9D7AJTD7S91BNHMQ4L0.png": 0.9876543209876543,
+    "0W9SN5GJDEWTG3WAPGPDZ.png": 0.9887640449438202,
+    "0ATIOLLN3DOAHKX75560Q.png": 0.9882352941176471,
+    "1C9EM94JJTICVGS6U2T2U.png": 0.9883720930232558,
+    "0TG6BRHGF3C865C2OL6DE.png": 0.9882352941176471,
+    "1BUP8L4PGVBNQE1GSCGJZ.png": 0.9863013698630136,
+    "1AJLKOKRHEVOTGE90GEH6.png": 0.989010989010989,
+    "1C98HOE9TQ4HZK6DKGF5I.png": 0.9859154929577465,
+    "0IH65GI6IN6RQWJE04YPG.png": 0.9859154929577465,
+    "0DNHG32KRYJ9PQ7UU1YL5.png": 0.9863013698630136,
+    "0EV54WP1Y9JDCWMDIT0OM.png": 0.975609756097561,
+    "0BE3I0HX6XWZQA4EFY99C.png": 0.984375,
+    "0O7G4HGEK48J2NUB5RCES.png": 0.9882352941176471,
+    "1BXWVCNXW1Z4N1XG8QOG4.png": 0.9905660377358491,
+    "0M2V36SUMHY2U8FRS9NYZ.png": 0.4424778761061947,
+    "0STJA7OMA59TOQ8XQ54G5.png": 0.98,
+    "0VB0OIQZQXKY5PA111Q8B.png": 0.984375,
+    "0RBPX6DU1W6LIYA2VRAA4.png": 0.972972972972973,
+    "0SP3KJJ2HMQZF088NH2DR.png": 0.9904761904761905,
+    "1D0ZG3O9YHMQAPHCD0890.png": 0.9655172413793104,
+    "0XZ590ZLZXRB09XIADL9V.png": 0.9934640522875817,
+    "0QU6QW0KAWVXZ6TL7FVJE.png": 0.9933774834437086,
+    "0PKH21420YW57OPRJR21R.png": 0.9922480620155039,
+    "0TX7Y5KWQ2MVU3579QIYH.png": 0.9777777777777777,
+    "0Y6OW4PMMWG05F4ZFYQ40.png": 0.9767441860465116,
+    "0EK5DRITVR9G3KDVF1CTJ.png": 0.9876543209876543,
+    "0DDEMI2034QD7F4QRH1IV.png": 0.9933774834437086,
+    "0HJXUBEZQCR1DEUQ8V30I.png": 0.9932885906040269,
+    "0BG5K95UCWQ3JXWC501XA.png": 0.9886363636363636,
+    "00TNQG8N9T3KUVMZ7AWTB.png": 0.967741935483871,
+    "0TJSB9YOUAG7C9OZW3U80.png": 0.9848484848484849,
+    "0SYEGYPSNLKCALCQBPGK2.png": 0.9929577464788732,
+    "0IP23CAYMTIVE93KLVMRA.png": 0.9824561403508771,
+    "0KFRN6DX1A6MMGS24B39T.png": 0.9850746268656716,
+    "1CTFYQFHQ1S1FLIEAPZTB.png": 0.9803921568627451,
+    "0U9U2Q7VBD1V6HBT7FQKM.png": 0.9923076923076923,
+    "0S7MUFP120D8OP4ZCCCUV.png": 0.4873417721518988,
+    "0BXWVCNXW1Z4N1XG8QOG4.png": 0.9873417721518988,
+    "1A7ZA5BA5TPHBN2WP6TT9.png": 0.9824561403508771,
+    "1ACY14LU0VWSKDOHEAVZM.png": 0.9924812030075187,
+    "0MPO1XXHHM8I5BOIT3DB9.png": 0.9876543209876543,
+    "0RSQ19UNM98CNWII5Q25F.png": 0.975609756097561,
+    "0EAA9XEBN9W7XDBPK31UZ.png": 0.9803921568627451,
+    "0U0BR4A64P7CE7YZ57HQ1.png": 0.9911504424778761,
+    "0XFNT3NMKFW1DB0F2LVY3.png": 0.9916666666666667,
+    "1AQZMEU4Q38NKK4USHAC5.png": 0.9904761904761905,
+    "0VGZMTO2VCZVZKGAOHZEU.png": 0.9910714285714286,
+    "0DBQ2M6XQ66Y2895PYNOM.png": 0.984375,
+    "0BP5KU2XHXZ0C431B4OL9.png": 0.9811320754716981,
+    "0PYCGJHF1705P4NTCM8AS.png": 0.9824561403508771,
+    "0RAGYZ9465I7GLXZXCLCQ.png": 0.9924812030075187,
+    "1A9560NY0NQ5OVZQQBJRQ.png": 0.4636363636363636,
+    "0KXDSHWWWYQJBXT2Y6U8S.png": 0.9803921568627451,
+    "1BF411IVR1HLU1Q44I3K7.png": 0.984375,
+    "0T1ZL9NSVN3385DR7B86C.png": 0.9824561403508771,
+    "0SYKTWM1EF4KS646AWQEL.png": 0.9803921568627451,
+    "0S104IFNSN5EJ31212IOP.png": 0.989010989010989,
+    "0H2RZUXKBQEVFJ2JT29R4.png": 0.9818181818181818,
+    "0SVC8WRHPF38HHKBN65YD.png": 0.9926470588235294,
+    "0HVIW7DPWCJSWJ5PCJDM2.png": 0.9855072463768116,
+    "0PRIZA7CG2JAL9GTN265B.png": 0.9929577464788732,
+    "0FXLG8PO267BZPBBXIX4E.png": 0.9922480620155039,
+    "1B0LNAITDDPPCJ4I6XIWK.png": 0.9868421052631579,
+    "0YNQ2KZ01B1TWP9FR5DE7.png": 0.45360824742268047,
+    "0A8AVSZNK6GTNOCBEVFOY.png": 0.9722222222222222,
+    "0XM8RQF6JQDOTJ5WQVHFE.png": 0.9873417721518988,
+    "0JBU3LJRDTMJI2XGB6NUE.png": 0.9868421052631579,
+    "0FKIASN9E4KCZ0JRCAJLQ.png": 0.9917355371900827,
+    "0A2AT7TW5KOMUUAK7TQXT.png": 0.9882352941176471,
+    "0QISJETVE3HGF1PMBD1BM.png": 0.9848484848484849,
+    "0KBOWWQLYSIZ0P4SIZMHJ.png": 0.993421052631579,
+    "0OMZO818L9AC4U3JJTKGD.png": 0.9863013698630136,
+    "0IVOAVCWOJ4CA92H7CM1Q.png": 0.9917355371900827,
+    "0SH9F7EHAT35OVT003OC5.png": 0.3728813559322034,
+    "0F7BJ4Z9F1R95HUG4RRZD.png": 0.9767441860465116,
+    "00RJGV4A4UTMTLDEIR1IG.png": 0.975609756097561,
+    "0BUP8L4PGVBNQE1GSCGJZ.png": 0.967741935483871,
+    "0B1QX4K8U8P9QA3HVLRPN.png": 0.9923664122137404,
+    "0IZ8M2UHYSA9H6K8XIOKS.png": 0.9855072463768116,
+    "0KLEV2650Z6X2DAUO94QK.png": 0.9876543209876543,
+    "0MRYJGMAVHEDMZ3XSX9XI.png": 0.9871794871794872,
+    "0I6PWVE3HEK6ZZ5K53UY4.png": 0.9818181818181818,
+    "1BE3I0HX6XWZQA4EFY99C.png": 0.984375,
+    "0M7CJCA8K3PX504PNHJRT.png": 0.9883720930232558,
+    "0ESACK4QILSDBXRS54UK0.png": 0.9795918367346939,
+    "0KLU5K631Q9RHQOY6771B.png": 0.4444444444444444,
+    "0RAZV12CY84ZGA4BRZQUC.png": 0.9871794871794872,
+    "0HZ4TDEJG6BY7B2RTALZK.png": 0.9868421052631579,
+    "0ROPMUV96VG8PTONLNGV9.png": 0.9887640449438202,
+    "0L194VI2NIOAX4AUCU2WG.png": 0.9767441860465116,
+    "0PG6K8IFJM2PHHLA1S4Y6.png": 0.9905660377358491,
+    "0H5AHQVKHAKQ1W636PLCS.png": 0.9878048780487805,
+    "0ZAHJJUMYDOQIMIUUFAUD.png": 0.9863013698630136,
+    "0MO39PWU9N82Y88WNANVM.png": 0.984375,
+    "0ZSUP0IMF3PK86DIVWQ8V.png": 0.967741935483871,
+    "0M1B6J5CTPBITI79C68MO.png": 0.9824561403508771,
+    "0BWJOYJSDHL1XJH6UG2RM.png": 0.9882352941176471,
+    "0SPYHIS3OEEZ082CFJEGF.png": 0.9871794871794872,
+    "0A3YX0911ULBZSCUBNDZS.png": 0.9896907216494846,
+    "0FK1CU21TAIHIR7YWZ2W7.png": 0.9818181818181818,
+    "0WP1ZBKQCK8W2W0ZXI2Z4.png": 0.7916666666666666,
+    "1AF02R419WL1YN97ZV144.png": 0.9767441860465116,
+    "0BKBFKJTQPLQBNIBZSM7E.png": 0.9916666666666667,
+    "1C1ZYGFL2YNFM2W3P2KN1.png": 0.9871794871794872,
+    "0IHOYC7KXLECI1F3G1WAF.png": 0.9848484848484849,
+    "1A0DA327P9Y532UTLHE2N.png": 0.9868421052631579,
+    "0SK9B35AHQ2OQA1RDKHHP.png": 0.9917355371900827,
+    "0EECJZYQ42MZLSWPOK9ZH.png": 0.9887640449438202,
+    "0UFBWJZOD5PBKMVX7G231.png": 0.9824561403508771,
+    "0OZ6DU5POAFSM589UXX4S.png": 0.9876543209876543,
+    "0OUIP8MTUSWLFQ6J13VXT.png": 0.967741935483871,
+    "0NFAI2Z8TAUKU6S7892KH.png": 0.975609756097561,
+    "0F3VUGWY35HLOJYHPT78G.png": 0.9883720930232558,
+    "0AYZOGNX998RYQVPWP1OA.png": 0.9846153846153847,
+    "0UC2QTKS4ITXYK4E6HU9T.png": 0.9939759036144579,
+    "0KK6YAU45B9B34SSZTAS7.png": 0.9836065573770492,
+    "0WV2Q54214D8ARYKCMBE0.png": 0.547945205479452,
+    "0TUDLFORB7K1BVA4U0ULU.png": 0.9917355371900827,
+    "0XZRML313QJ6X82YZJLYT.png": 0.9848484848484849,
+    "0ACY14LU0VWSKDOHEAVZM.png": 0.9873417721518988,
+    "0HH9NAZ1I95NJINORKJIM.png": 0.9795918367346939,
+    "1AWZPWR198XN7U8HY1E32.png": 0.9795918367346939,
+    "0TLG8NFY9BXHB15A47OGW.png": 0.9926470588235294,
+    "1CBFM7HG55Z7O8F4Y0O0L.png": 0.9848484848484849,
+    "0EV3WT6VJG3QH2HFJEIBA.png": 0.975609756097561,
+    "0OBPU21JDPO0KPYEQGLFO.png": 0.9722222222222222,
+    "0MJ27YD7XBYLQKM87RM3Y.png": 0.9887640449438202,
+    "0BHU2JO8ODKS3OL4RIU6A.png": 0.99,
+    "0WVB351NNWY8OOQQRRW6F.png": 0.476878612716763,
+    "1BCT1VG1R4HUK3Q6NMZGU.png": 0.9916666666666667,
+    "0YJ043WAWUTW4AEMDTD4R.png": 0.9782608695652174,
+    "0YS08VVMS1YPOHVJOFXXA.png": 0.98989898989899,
+    "0EWWFSOUCGGD5BK6RKMKO.png": 0.522875816993464,
+    "0VCTD6BP09MBAXOOM5Y5E.png": 0.975609756097561,
+    "0S7ZGBZ7OBI15CZS5V95A.png": 0.984375,
+    "0JJ9O2OQ6O13OAOFM7643.png": 0.99,
+    "00TXY79AHYWJ7WLXB3VLV.png": 0.9846153846153847,
+    "0J2UQ7WIZXFK4I5TV9UHW.png": 0.9935064935064936,
+    "0TYF1PBQCH64LANCKYWY7.png": 0.9859154929577465,
+    "0SWG2OW7F5RLADFAHJ9A4.png": 0.9882352941176471,
+    "0RV3TKC89HQD4FRFCTNSK.png": 0.9767441860465116,
+    "1BQBJ8UFLH7H3JQ965JF6.png": 0.9863013698630136,
+    "0C70JEJWPOAT1S8RUWCVB.png": 0.972972972972973,
+    "0RCE6GI0QYPCA15RH6HM7.png": 0.49382716049382713,
+    "0SB1QV5XRJM6W0HRU4AH7.png": 0.9891304347826086,
+    "0I1HQDO584A6ODC54PLNA.png": 0.9891304347826086,
+    "1AWHACFMS9KSHM18INN41.png": 0.9836065573770492,
+    "1BKBFKJTQPLQBNIBZSM7E.png": 0.9863013698630136,
+    "0T0Q44ALMC9WURWEESEMP.png": 0.9875,
+    "1A9RJA2I3YJT58JR2MEOT.png": 0.9615384615384616,
+    "0DD8FWYLADAY5EJ3UZUD9.png": 0.9868421052631579,
+    "0F078JDZMTC8C8H2P8IVA.png": 0.9921875,
+    "0L5KEP1L6K1ALH88LLMEY.png": 0.9795918367346939,
+    "0U2FXJ2H3K5SQTZNJ1WV1.png": 0.98,
+    "0U49K9QPO02GF77TU5JB8.png": 0.9863013698630136,
+    "0A9560NY0NQ5OVZQQBJRQ.png": 0.9836065573770492,
+    "0MRQ2DF27RW94C36QLLTZ.png": 0.9863013698630136,
+    "0BCT1VG1R4HUK3Q6NMZGU.png": 0.9795918367346939,
+    "0GQC64N9E830BWDTF8L0Q.png": 0.9910714285714286,
+    "0HIESCSLITYADXZHOO7IA.png": 0.989010989010989,
+    "0FZFGRN9B0WT3XCQMOVPJ.png": 0.9767441860465116,
+    "00LQMDL10JL253UW69YUO.png": 0.9818181818181818,
+    "0U79XK18POJ6HCLLOXS4Z.png": 0.9905660377358491,
+    "0I3RG6GXJ2VILV3BPFIY4.png": 0.9767441860465116,
+    "0X8PV0Z6SNEKPIPOCP5HR.png": 0.3931034482758621,
+    "0UFQOEKLIWTX65AY778BD.png": 0.5275590551181102,
+    "0HZUERFF8VNKXAZLV8RO5.png": 0.9850746268656716,
+    "0FWXHCMHZ7KG6WYRNWD6Q.png": 0.9922480620155039,
+    "0A9B6NHM7J57SCT1Z8TAS.png": 0.9861111111111112,
+    "0WOTQFWQFAEPN0HZ6MYIL.png": 0.9929577464788732,
+    "0IUNSDMCG8WWVJJ758NN9.png": 0.9887640449438202,
+    "0XLK4S5OWK77LRNU2JAG9.png": 0.46543778801843316,
+    "00FMSMFBJU5732FGUTLIF.png": 0.9821428571428571,
+    "0YOETJE558OS77GHG5L5U.png": 0.9876543209876543,
+    "0BKXE7HQJOJV0I1LL8YOF.png": 0.9821428571428571,
+    "0AWHACFMS9KSHM18INN41.png": 0.5416666666666667,
+    "1BL58Q9DLPBQF73ROGFDX.png": 0.9921875,
+    "0Q7EACO6OF8WQFZXI1MRQ.png": 0.9896907216494846,
+    "0R1IOV08YNRVC0KQS84EF.png": 0.9818181818181818,
+    "0SEF4O8YR8ULW23U32SE6.png": 0.9836065573770492,
+    "0IQGTS9QZK0ZYRL80GOSD.png": 0.9767441860465116,
+    "0E00IBZTY74DGR1SSX77L.png": 0.975609756097561,
+    "0BR0V61AWXYXVQSK6RMY7.png": 0.9911504424778761,
+    "0MESCFGQYOQNMVWD6B1VU.png": 0.9885057471264368,
+    "0F3GIMIL9E4UNWEFYLKGV.png": 0.9824561403508771,
+    "1A8AVSZNK6GTNOCBEVFOY.png": 0.9910714285714286,
+    "0E7WX1NX5ZKR24SEIUKRN.png": 0.9811320754716981,
+    "0QZOZCFYQ2TK5C0Q3KN5C.png": 0.5106382978723405,
+    "0SDC2B1I853GR50G545IX.png": 0.9891304347826086,
+    "0FEKB24PHTZNT3KIZZVIS.png": 0.9876543209876543,
+    "0SIW9Q9NWY3TWRC712D4J.png": 0.9876543209876543,
+    "0JFFFUOFXDOLV2ZGQJAPB.png": 0.9887640449438202,
+    "0O976W9Y9NDSJ24YV7HU9.png": 0.975609756097561,
+    "0B0LNAITDDPPCJ4I6XIWK.png": 0.9811320754716981,
+    "1AAPDAAK73MRINE7PM0ZJ.png": 0.9852941176470589,
+    "0HC8F1RENJE297WV8RW0N.png": 0.45517241379310347,
+    "0OXJ4SWAYILOZVQCGO1OB.png": 0.9937106918238994,
+    "0I3S2Z8YWZ0JOIMKGU51B.png": 0.972972972972973,
+    "0Y0LZ2LRH7BR5ZDYBTH7U.png": 0.9824561403508771,
+    "0T0LAS5REAE827IQO0Q9U.png": 0.98989898989899,
+    "1AQ9EL10BYBSGJO2RLC6Q.png": 0.9868421052631579,
+    "0L1YL688ZRRPYAJ07UOFQ.png": 0.9911504424778761,
+    "1BWJOYJSDHL1XJH6UG2RM.png": 0.9922480620155039,
+    "0TELO9B7QI0QQVFMJXAQ1.png": 0.9896907216494846,
+    "0XDX2OT3OG575I0U99YAQ.png": 0.54,
+    "0X49B57NNHU6FEB4J21VY.png": 0.993006993006993,
+    "0DHJ8WY2XLWKG7K345LAK.png": 0.975609756097561,
+    "0BQBJ8UFLH7H3JQ965JF6.png": 0.989010989010989,
+    "00CBN2MRTC48ZY50RUSBW.png": 0.9767441860465116,
+    "1D7CMRTBBENLYDO7EWWVZ.png": 0.9863013698630136,
+    "0BX1I2HS6BLV92NZHV6J1.png": 0.9722222222222222,
+    "0XQE375V4J34MLJYN711T.png": 0.9722222222222222,
+    "1BKXE7HQJOJV0I1LL8YOF.png": 0.9795918367346939,
+    "0E3OA2PY1K3B44GN9AS0Y.png": 0.9863013698630136,
+    "0SCRALC3GPIO2ZD918U8L.png": 0.478021978021978,
+    "0ITKDLWB7SDGMM8980ZSS.png": 0.9911504424778761,
+    "1BYRMKANKN4PL6JFPG8AR.png": 0.989010989010989,
+    "0BYRMKANKN4PL6JFPG8AR.png": 0.9803921568627451,
+    "0R8W6O2N25AVQI9FQ5IL7.png": 0.972972972972973,
+    "0J9TV59N7U65CB7YCHD38.png": 0.9922480620155039,
+    "0VX41MM59ET2MK09202C3.png": 0.9896907216494846,
+    "1CGP5R7FMVCKR47XK6IVA.png": 0.9896907216494846,
+    "1BSXNNN0LA94101P5D38I.png": 0.9926470588235294,
+    "0UMVEM9RUVZDRJRFA1W2V.png": 0.9722222222222222,
+    "0KPHJHUXB0MS3B9RHL57O.png": 0.9868421052631579,
+    "0TYH6IN161KXZT369VVWQ.png": 0.9795918367346939,
+    "0AUTW1OL7IAPO1JH1TQUR.png": 0.984375,
+    "0GNCKEB99NZ0J9GCAI0TH.png": 0.9850746268656716,
+    "0CGP5R7FMVCKR47XK6IVA.png": 0.9915966386554622,
+    "0NNLAUZDCGVKZP852ZJ7X.png": 0.9836065573770492,
+    "0EJW9DEXTHUR17CZCUPB1.png": 0.9850746268656716,
+    "0JZRIWIFSATGGFL8P0NZF.png": 0.9908256880733946,
+    "0VNHMSVYYS2Q0H0VJDNAK.png": 0.9782608695652174,
+    "0Z2ZZWW84O21E70F5RGIA.png": 0.993103448275862,
+    "0UODYVKUWDGD6S5D7LNAW.png": 0.9930555555555556,
+    "0NGNPB7KAJSSKSHQV1KZS.png": 0.9767441860465116,
+    "0Q3C8N8G8GXV2EP88XEXI.png": 0.9795918367346939,
+    "0IU89E255WY0KPUD6L7Y9.png": 0.9902912621359223,
+    "0F22CQYG638LSZROETJ9V.png": 0.9904761904761905,
+    "0HBX9X0EJVVL4TA9CJ25G.png": 0.9873417721518988,
+    "0IRDSID7UDBLOIRB9JQ9S.png": 0.9883720930232558,
+    "0IHCMVD5NO41KSAB3ODC0.png": 0.5213270142180095,
+    "1AYZOGNX998RYQVPWP1OA.png": 0.9931506849315068,
+    "0WM2Y66O2ZJA831TN2E7Z.png": 0.9615384615384616,
+    "0CPW27F5C8I03UQBVBL2Y.png": 0.42500000000000004,
+    "1BR0V61AWXYXVQSK6RMY7.png": 0.989010989010989,
+    "0JJPRMSYFQLJKD3JYA1JP.png": 0.9850746268656716,
+    "0RT937QPOOWU9LKZVU0G3.png": 0.9922480620155039,
+    "0EFBK546D496KI033ACDF.png": 0.972972972972973,
+    "0EP1D1EXZC4VOMGZJGQQT.png": 0.9891304347826086,
+    "0DSQ4IAVY32EHCJ0AJM1Y.png": 0.9824561403508771,
+    "0F4HFOUP4374O8RL4E914.png": 0.9824561403508771,
+    "0IMS5FXCTVU6GSCR5CHTK.png": 0.984375,
+    "0P82SO3E98ECMRNRS62D4.png": 0.9868421052631579,
+    "1AUTW1OL7IAPO1JH1TQUR.png": 0.9911504424778761,
+    "0WFIWI83FBAOLU16M27NL.png": 0.9939024390243902,
+    "1A9B6NHM7J57SCT1Z8TAS.png": 0.9859154929577465,
+    "0UU3AG1PSZ1H78B6J17PA.png": 0.9882352941176471,
+    "0UVW81GETVKT5GPM6ZX0S.png": 0.9803921568627451,
+    "0IKFXKSQ9OA3OCRGQBZFI.png": 0.9795918367346939,
+    "1C70JEJWPOAT1S8RUWCVB.png": 0.9803921568627451,
+    "0A7ZU26KX6C0LG0D3T3ZS.png": 0.9818181818181818,
+    "0BL58Q9DLPBQF73ROGFDX.png": 0.9937106918238994,
+    "0UQWQMAYVXUFY65GH4ION.png": 0.9836065573770492,
+    "0R77TU5P7A0F1YTLIGSOA.png": 0.9863013698630136,
+    "0Q740R8QE6ZAF034ZMGQG.png": 0.9917355371900827,
+    "1BG5K95UCWQ3JXWC501XA.png": 0.9926470588235294,
+    "1CPW27F5C8I03UQBVBL2Y.png": 0.9916666666666667
+  }
+}
\ No newline at end of file
diff --git a/scripts/benchmark_table/benchmark_table.py b/scripts/benchmark_table/benchmark_table.py
index 9a50b4fa..dcdef2d4 100644
--- a/scripts/benchmark_table/benchmark_table.py
+++ b/scripts/benchmark_table/benchmark_table.py
@@ -2,6 +2,8 @@
 from pathlib import Path
 import json
 import pprint
+from typing import Optional, List
+
 import numpy as np
 import wget
 
@@ -14,14 +16,12 @@
 path_result = Path(__file__).parent / ".." / ".." / "resources" / "benchmarks"
 path_result.absolute().mkdir(parents=True, exist_ok=True)
 
-URL = "https://at.ispras.ru/owncloud/index.php/s/Xaf4OyHj6xN2RHH/download"
-
 table_recognizer = TableRecognizer(config=get_config())
 image_reader = PdfImageReader(config=get_config())
 
 
-def call_metric(pred_json: dict, true_json: dict, structure_only: bool = False) -> dict:
-    teds = TEDS(structure_only=structure_only)
+def call_metric(pred_json: dict, true_json: dict, structure_only: bool = False, ignore_nodes: Optional[List] = None) -> dict:
+    teds = TEDS(structure_only=structure_only, ignore_nodes=ignore_nodes)
     scores = teds.batch_evaluate(pred_json, true_json)
     pp = pprint.PrettyPrinter()
     pp.pprint(scores)
@@ -51,15 +51,14 @@ def make_predict_json(data_path: Path) -> dict:
     return predict_json
 
 
-def download_dataset(data_dir: Path) -> None:
-
+def download_dataset(data_dir: Path, name_zip: str, url: str) -> None:
     if Path.exists(data_dir):
         print(f"Use cached benchmark data from {data_dir}")
         return
 
     data_dir.mkdir(parents=True, exist_ok=True)
-    pdfs_zip_path = data_dir / "benchmark_table_data.zip"
-    wget.download(URL, str(data_dir))
+    pdfs_zip_path = data_dir / name_zip
+    wget.download(url, str(data_dir))
 
     with zipfile.ZipFile(pdfs_zip_path, 'r') as zip_ref:
         zip_ref.extractall(data_dir)
@@ -71,17 +70,19 @@ def download_dataset(data_dir: Path) -> None:
 def prediction(path_pred: Path, path_images: Path) -> dict:
     pred_json = make_predict_json(path_images)
     with path_pred.open("w") as fd:
-        json.dump(str(pred_json), fd, indent=2, ensure_ascii=False)
+        json.dump(pred_json, fd, indent=2, ensure_ascii=False)
 
     return pred_json
 
 
-if __name__ == "__main__":
+def benchmark_on_our_data():
     data_dir = Path(get_config()["intermediate_data_path"]) / "benchmark_table_data"
     path_images = data_dir / "images"
     path_gt = data_dir / "gt.json"
     path_pred = data_dir / "pred.json"
-    download_dataset(data_dir)
+    download_dataset(data_dir,
+                     name_zip="benchmark_table_data.zip",
+                     url="https://at.ispras.ru/owncloud/index.php/s/Xaf4OyHj6xN2RHH/download")
 
     mode_metric_structure_only = False
 
@@ -104,3 +105,62 @@ def prediction(path_pred: Path, path_images: Path) -> dict:
     file_result = path_result / "table_benchmark.json"
     with file_result.open("w") as fd:
         json.dump(result, fd, indent=2, ensure_ascii=False)
+
+
+def benchmark_on_generated_table():
+    """
+    Generated data from https://github.com/hassan-mahmood/TIES_DataGeneration
+    Article generation information https://arxiv.org/pdf/1905.13391.pdf
+    Note: only tables of the 1st category (table type) are generated
+    Note: the header tag <th> is not used, it is replaced with the <td> tag
+    Note: all generated data (four categories) can be downloaded from the link given below
+    TODO: some tables have low quality. The reason should be traced.
+    All generated data (all categories) can be downloaded from https://at.ispras.ru/owncloud/index.php/s/cjpCIR7I0G4JzZU
+    """
+
+    data_dir = Path(get_config()["intermediate_data_path"]) / "visualizeimgs" / "category1"
+    path_images = data_dir / "img_500"
+    path_gt = data_dir / "html_500"
+    download_dataset(data_dir,
+                     name_zip="benchmark_table_data_generated_500_tables_category_1.zip",
+                     url="https://at.ispras.ru/owncloud/index.php/s/gItWxupnF2pve6B/download")
+    mode_metric_structure_only = True
+
+    # make common ground-truth file
+    common_gt_json = {}
+    for pathname in Path.iterdir(path_gt):
+        image_name = pathname.name.split(".")[0] + '.png'
+        with open(pathname, "r") as fp:
+            table_html = fp.read()
+            # exclude header tags
+            table_html = table_html.replace("<th ", "<td ")
+            table_html = table_html.replace("</th>", "</td>")
+
+        common_gt_json[image_name] = {"html": table_html}
+
+    file_common_gt = data_dir / "common_gt.json"
+    with file_common_gt.open("w") as fd:
+        json.dump(common_gt_json, fd, indent=2, ensure_ascii=False)
+
+    # calculate metrics
+    path_pred = data_dir / "pred.json"
+
+    pred_json = prediction(path_pred, path_images)
+    scores = call_metric(pred_json=pred_json, true_json=common_gt_json,
+                         structure_only=mode_metric_structure_only,
+                         ignore_nodes=['span', 'style', 'head', 'h4'])
+
+    result = dict()
+    result["mode_metric_structure_only"] = mode_metric_structure_only
+    result["mean"] = np.mean([score for score in scores.values()])
+    result["images"] = scores
+
+    # save benchmarks
+    file_result = path_result / "table_benchmark_on_generated_data.json"
+    with file_result.open("w") as fd:
+        json.dump(result, fd, indent=2, ensure_ascii=False)
+
+
+if __name__ == "__main__":
+    # benchmark_on_our_data()
+    benchmark_on_generated_table()
diff --git a/scripts/benchmark_table/metric.py b/scripts/benchmark_table/metric.py
index 1872b414..ff84a4a7 100644
--- a/scripts/benchmark_table/metric.py
+++ b/scripts/benchmark_table/metric.py
@@ -32,7 +32,7 @@ def __init__(self, tag, colspan=None, rowspan=None, content=None, visible=None,
     def bracket(self):
         """Show tree using brackets notation
         """
-        if self.tag == "td":
+        if self.tag == "td" or self.tag == 'th':
             result = f'"tag": {self.tag}, "colspan": {self.colspan}, "rowspan": {self.rowspan}, "text": {self.content}'
         else:
             result = f'"tag": {self.tag}'
@@ -89,6 +89,10 @@ def tokenize(self, node):
         if node.tag != "td" and node.tail is not None:
             self.__tokens__ += list(node.tail)
 
+    def get_span(self, node, name_span: str) -> int:
+        value = int(node.attrib.get(name_span, "1"))
+        return 1 if value <= 0 else value
+
     def load_html_tree(self, node, parent=None):
         """ Converts HTML tree to the format required by apted
         """
@@ -102,8 +106,8 @@ def load_html_tree(self, node, parent=None):
 
             try:
                 new_node = TableTree(tag=node.tag,
-                                     colspan=int(node.attrib.get("colspan", "1")),
-                                     rowspan=int(node.attrib.get("rowspan", "1")),
+                                     colspan=self.get_span(node, "colspan"),
+                                     rowspan=self.get_span(node, "rowspan"),
                                      content=cell,
                                      visible=False if node.attrib.get("style") == "display: none" else True, *deque())
             except Exception as ex:

From ca029ded7aeaeb5cde635a66958be67e19e10539 Mon Sep 17 00:00:00 2001
From: Belyaeva Oksana <belyaeva@ispras.ru>
Date: Mon, 29 Jan 2024 13:19:31 +0300
Subject: [PATCH 5/5] TLDR-585 fixed after review

---
 scripts/benchmark_table/benchmark_table.py | 27 +++++++++++-----------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/scripts/benchmark_table/benchmark_table.py b/scripts/benchmark_table/benchmark_table.py
index dcdef2d4..c6cbd7cb 100644
--- a/scripts/benchmark_table/benchmark_table.py
+++ b/scripts/benchmark_table/benchmark_table.py
@@ -3,7 +3,6 @@
 import json
 import pprint
 from typing import Optional, List
-
 import numpy as np
 import wget
 
@@ -19,6 +18,10 @@
 table_recognizer = TableRecognizer(config=get_config())
 image_reader = PdfImageReader(config=get_config())
 
+GENERATED_BENCHMARK = "on_generated_data"
+OURDATA_BENCHMARK = "on_our_data"
+TYPE_BENCHMARK = OURDATA_BENCHMARK
+
 
 def call_metric(pred_json: dict, true_json: dict, structure_only: bool = False, ignore_nodes: Optional[List] = None) -> dict:
     teds = TEDS(structure_only=structure_only, ignore_nodes=ignore_nodes)
@@ -75,7 +78,7 @@ def prediction(path_pred: Path, path_images: Path) -> dict:
     return pred_json
 
 
-def benchmark_on_our_data():
+def benchmark_on_our_data() -> dict:
     data_dir = Path(get_config()["intermediate_data_path"]) / "benchmark_table_data"
     path_images = data_dir / "images"
     path_gt = data_dir / "gt.json"
@@ -101,13 +104,10 @@ def benchmark_on_our_data():
     result["mean"] = np.mean([score for score in scores.values()])
     result["images"] = scores
 
-    # save benchmarks
-    file_result = path_result / "table_benchmark.json"
-    with file_result.open("w") as fd:
-        json.dump(result, fd, indent=2, ensure_ascii=False)
+    return result
 
 
-def benchmark_on_generated_table():
+def benchmark_on_generated_table() -> dict:
     """
     Generated data from https://github.com/hassan-mahmood/TIES_DataGeneration
     Article generation information https://arxiv.org/pdf/1905.13391.pdf
@@ -155,12 +155,13 @@ def benchmark_on_generated_table():
     result["mean"] = np.mean([score for score in scores.values()])
     result["images"] = scores
 
-    # save benchmarks
-    file_result = path_result / "table_benchmark_on_generated_data.json"
-    with file_result.open("w") as fd:
-        json.dump(result, fd, indent=2, ensure_ascii=False)
+    return result
 
 
 if __name__ == "__main__":
-    # benchmark_on_our_data()
-    benchmark_on_generated_table()
+    result = benchmark_on_our_data() if TYPE_BENCHMARK == OURDATA_BENCHMARK else benchmark_on_generated_table()
+
+    # save benchmarks
+    file_result = path_result / f"table_benchmark_{TYPE_BENCHMARK}.json"
+    with file_result.open("w") as fd:
+        json.dump(result, fd, indent=2, ensure_ascii=False)