diff --git a/dedoc/api/api_utils.py b/dedoc/api/api_utils.py index df8a1286..1287912d 100644 --- a/dedoc/api/api_utils.py +++ b/dedoc/api/api_utils.py @@ -133,7 +133,7 @@ def json2html(text: str, paragraph: TreeNode, tables: Optional[List[Table]], tab if tables is not None and len(tables) > 0: text += "

Tables:

" for table in tables: - text += __table2html(table, table2id) + text += table2html(table, table2id) text += "

 

" return text @@ -201,7 +201,7 @@ def __annotations2html(paragraph: TreeNode, table2id: Dict[str, int]) -> str: return text.replace("\n", "
") -def __table2html(table: Table, table2id: Dict[str, int]) -> str: +def table2html(table: Table, table2id: Dict[str, int]) -> str: uid = table.metadata.uid text = f"

table {table2id[uid]}:

" text += f'\n\n' diff --git a/dedoc/readers/pdf_reader/data_classes/tables/scantable.py b/dedoc/readers/pdf_reader/data_classes/tables/scantable.py index c7c47fe1..4bc057df 100644 --- a/dedoc/readers/pdf_reader/data_classes/tables/scantable.py +++ b/dedoc/readers/pdf_reader/data_classes/tables/scantable.py @@ -5,6 +5,7 @@ import numpy as np from dedocutils.data_structures import BBox +from dedoc.data_structures import CellWithMeta, Table, TableMetadata from dedoc.readers.pdf_reader.data_classes.tables.cell import Cell from dedoc.readers.pdf_reader.data_classes.tables.location import Location @@ -27,6 +28,11 @@ def extended(self, table: "ScanTable") -> None: # extend order self.order = max(self.order, table.order) + def to_table(self) -> Table: + metadata = TableMetadata(page_id=self.page_number, uid=self.name, rotated_angle=self.location.rotated_angle) + cells_with_meta = [[CellWithMeta.create_from_cell(cell) for cell in row] for row in self.matrix_cells] + return Table(metadata=metadata, cells=cells_with_meta) + @staticmethod def get_cells_text(attr_cells: List[List[Cell]]) -> List[List[str]]: attrs = [] diff --git a/dedoc/readers/pdf_reader/pdf_base_reader.py b/dedoc/readers/pdf_reader/pdf_base_reader.py index d52e0d3c..fd6ed93b 100644 --- a/dedoc/readers/pdf_reader/pdf_base_reader.py +++ b/dedoc/readers/pdf_reader/pdf_base_reader.py @@ -13,10 +13,7 @@ import dedoc.utils.parameter_utils as param_utils from dedoc.attachments_extractors.concrete_attachments_extractors.pdf_attachments_extractor import PDFAttachmentsExtractor from dedoc.common.exceptions.bad_file_error import BadFileFormatError -from dedoc.data_structures.cell_with_meta import CellWithMeta from dedoc.data_structures.line_with_meta import LineWithMeta -from dedoc.data_structures.table import Table -from dedoc.data_structures.table_metadata import TableMetadata from dedoc.data_structures.unstructured_document import UnstructuredDocument from dedoc.extensions import recognized_extensions, 
recognized_mimes from dedoc.readers.base_reader import BaseReader @@ -92,12 +89,7 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure ) lines, scan_tables, attachments, warnings, other_fields = self._parse_document(file_path, params_for_parse) - tables = [] - for scan_table in scan_tables: - metadata = TableMetadata(page_id=scan_table.page_number, uid=scan_table.name, rotated_angle=scan_table.location.rotated_angle) - cells_with_meta = [[CellWithMeta.create_from_cell(cell) for cell in row] for row in scan_table.matrix_cells] - table = Table(metadata=metadata, cells=cells_with_meta) - tables.append(table) + tables = [scan_table.to_table() for scan_table in scan_tables] if self._can_contain_attachements(file_path) and self.attachment_extractor.with_attachments(parameters): attachments += self.attachment_extractor.extract(file_path=file_path, parameters=parameters) diff --git a/resources/benchmarks/table_benchmark.json b/resources/benchmarks/table_benchmark.json new file mode 100644 index 00000000..d7a9d7c6 --- /dev/null +++ b/resources/benchmarks/table_benchmark.json @@ -0,0 +1,16 @@ +{ + "mode_metric_structure_only": false, + "mean": 0.9468374367023571, + "images": { + "example_with_table0_0.png": 0.9525583036909738, + "example_with_table0_1.png": 0.9264351862896008, + "example_with_table6.png": 0.989010989010989, + "example_with_table4.jpg": 0.908436211832951, + "example_with_table17.jpg": 0.8078952936402488, + "example_with_table_hor_vert_union.png": 0.9896091617933723, + "example_with_table1.png": 0.9781560283687943, + "example_with_table_horizontal_union.jpg": 0.9925757575757576, + "example_with_table3.png": 0.9778008866078716, + "example_with_table5.png": 0.9458965482130129 + } +} \ No newline at end of file diff --git a/resources/benchmarks/table_benchmark_on_generated_data.json b/resources/benchmarks/table_benchmark_on_generated_data.json new file mode 100644 index 00000000..130bcd28 --- /dev/null +++ 
b/resources/benchmarks/table_benchmark_on_generated_data.json @@ -0,0 +1,506 @@ +{ + "mode_metric_structure_only": true, + "mean": 0.9467889492889642, + "images": { + "0OEG7D5CXUSXDNEXAZ8A3.png": 0.993103448275862, + "0IS8OPRTM71QYN821WA5S.png": 0.9878048780487805, + "0KX1D4AGMTM3EWR0EF0A5.png": 0.989010989010989, + "0QBK1U71YOHBG5Z23MT7E.png": 0.9916666666666667, + "0DC57AS1OYZ1BRHZHPIO2.png": 0.96, + "0GJE73OG32H2P2SL2AI2J.png": 0.9905660377358491, + "0GYAQKWTI3LN6DNZFM2TZ.png": 0.9904761904761905, + "0GHKLO6LOH5LBTYEUND3S.png": 0.9917355371900827, + "0F831FOUA10K3594FG4IM.png": 0.9896907216494846, + "0XG0I2F0MMZ3QMXWLWFMX.png": 0.9935064935064936, + "0WMTO9U10ILEB9HCX4C0B.png": 0.9863013698630136, + "0C1ZYGFL2YNFM2W3P2KN1.png": 0.9795918367346939, + "0A4G5JAZSJS4BT5LBZ2Q3.png": 0.9850746268656716, + "0I75SMSDR5JSJXF07PN6J.png": 0.9915966386554622, + "0K9EAAIYXSUT80SYF3ML4.png": 0.9836065573770492, + "0PH78O2B9CJAM6MMINZXT.png": 0.9876543209876543, + "1BRZ4ALOZMMEXGR4AVJWG.png": 0.9876543209876543, + "0AFVW6AL3EH9H76ONNDYF.png": 0.9848484848484849, + "0EVCQHN9C65AUYG1UAN3C.png": 0.9696969696969697, + "0DD9D0ILAPJIH77GEVRGP.png": 0.4098360655737705, + "1BHU2JO8ODKS3OL4RIU6A.png": 0.9905660377358491, + "0OG2AZLHJPMBX43O2O9LR.png": 0.989010989010989, + "0M64SMZT9HTN6LXQ4M24T.png": 0.9846153846153847, + "0TY12X0C3U2BPZC81PW66.png": 0.9836065573770492, + "0W109P7LI6B5HIYM3SJ5A.png": 0.9873417721518988, + "0H6272E6S2YUDJWBSWKQN.png": 0.9873417721518988, + "0ZRX97WSSVCVQ3NJ5959P.png": 0.9873417721518988, + "0ZFX4HDI3O7YQFDYRRYKI.png": 0.9722222222222222, + "1A4SDGAXB66WDBW7OUH58.png": 0.9876543209876543, + "0HQVUJMOQRQQ5FIP4PMZF.png": 0.9859154929577465, + "0Q335MQBC8UJJMASJUNWZ.png": 0.9938271604938271, + "0AF02R419WL1YN97ZV144.png": 0.9868421052631579, + "0JWOGY4C0KQ14J958GLYD.png": 0.9922480620155039, + "0JCVUE03Y5YD8A45IOIA5.png": 0.972972972972973, + "0R10PSLELMJ0SPFCXX92A.png": 0.9814814814814815, + "0BF411IVR1HLU1Q44I3K7.png": 0.990990990990991, + 
"0K9C1HJZ8K3L6CRAQ6VCW.png": 0.9850746268656716, + "0Q2MRICBMAFRV1GRRR5TA.png": 0.5189873417721519, + "0ZK44UG99IWIPKRSCOSJV.png": 0.9795918367346939, + "0S09D3ZPVQ8YOT55XIOE1.png": 0.9887640449438202, + "0KRLZUD3DQAU1DYDU99ZH.png": 0.9882352941176471, + "1ATIOLLN3DOAHKX75560Q.png": 0.5460992907801419, + "0KSBBUINDNN16F2ZLQHV4.png": 0.9915254237288136, + "0P5IE8XH9BN2EGC0DX27Z.png": 0.989010989010989, + "0GJ88Q9SMUOWF3WILKG14.png": 0.98, + "0QYA242XOQ0Y9078UC7NI.png": 0.984375, + "0F4Z8B4S5RV008LHJBW8S.png": 0.9896907216494846, + "0GWJH40B21AJBR1F73FXI.png": 0.9722222222222222, + "0ZO44O69QHTV62QJ3X9KH.png": 0.9883720930232558, + "0VK3KLUJVLAB9SRQDN6EJ.png": 0.45622119815668205, + "0HQHS3BO0IIOJ5L2EP2H4.png": 0.9882352941176471, + "1DBQ2M6XQ66Y2895PYNOM.png": 0.9824561403508771, + "0A4OYW3ZL5QP76IGF0DK0.png": 0.9836065573770492, + "1AFVW6AL3EH9H76ONNDYF.png": 0.9863013698630136, + "0J0JQM9WD7B0RCNKWBC5S.png": 0.972972972972973, + "0OVSLM3WAA36TZQCOL1WS.png": 0.9821428571428571, + "0GMEN2MGE7HN3ROOZQ5YD.png": 0.9879518072289156, + "0AJLKOKRHEVOTGE90GEH6.png": 0.9922480620155039, + "0DUX4YKT5JYJO3Z573OG8.png": 0.967741935483871, + "0A4SDGAXB66WDBW7OUH58.png": 0.9871794871794872, + "0G5S5CXGRLABEYII4QG2Z.png": 0.9887640449438202, + "0YM92E2EEDDGHAUW2YZ8Q.png": 0.9863013698630136, + "0F0E32N4VR4Q9960I0DB8.png": 0.975609756097561, + "0JNQPLSGKLPQ0UAAFYL5T.png": 0.967741935483871, + "0YL4VFF3LUUQITLVU3U9V.png": 0.9902912621359223, + "0ZO2Z3XCHZLB43ARH68WS.png": 0.9803921568627451, + "0SHL8BKLII1AGBZ1SEB4U.png": 0.9876543209876543, + "0KBLEG9N1SBX956ZCIP5I.png": 0.9795918367346939, + "1B1QX4K8U8P9QA3HVLRPN.png": 0.9896907216494846, + "0D0ZG3O9YHMQAPHCD0890.png": 0.9924242424242424, + "0PTMPFGYNVJWO6FCX1QRZ.png": 0.9933333333333333, + "0A7ZA5BA5TPHBN2WP6TT9.png": 0.9904761904761905, + "0DE9UIIVMYH3UK0SYFVUG.png": 0.9767441860465116, + "0U0LFAJATVD9YEC1Z3497.png": 0.9767441860465116, + "1A3YX0911ULBZSCUBNDZS.png": 0.9929577464788732, + "0WRQRWHH2CMV2L4CE3SN9.png": 
0.9878048780487805, + "0C1X00FENSOUN2Y08Y3JT.png": 0.9824561403508771, + "0ONRU7A4SU4WAUWF25FRP.png": 0.9878048780487805, + "1DD9D0ILAPJIH77GEVRGP.png": 0.9824561403508771, + "0CBFM7HG55Z7O8F4Y0O0L.png": 0.9905660377358491, + "0C9EM94JJTICVGS6U2T2U.png": 0.9917355371900827, + "0KK57808VO3HNS1AW4CJO.png": 0.9767441860465116, + "0E7XFLPH56MT23HNK3MZ6.png": 0.9886363636363636, + "0J818KH6HIIA83D74FXS3.png": 0.9887640449438202, + "0TLMD42BW0F4NSD9PG19X.png": 0.984375, + "0PSN5QFZWTPA9U05O7MZ3.png": 0.99, + "0GI0JNFJAOXK5OJKRXCND.png": 0.9891304347826086, + "0Z4X1LVZ1K4NE2RR8P7EA.png": 0.9859154929577465, + "0S709DW5AZF9VPCPMVHXB.png": 0.9891304347826086, + "1B87OEX5XX0BHUOQAS50A.png": 0.9767441860465116, + "1AAERNSDA06GDA7OFZVCA.png": 0.9850746268656716, + "1CORAY089OILX2OWIKU1E.png": 0.98, + "0WO86MK2DC2EZUZLSMFA1.png": 0.9911504424778761, + "0KNUPYHEXZYSW1TNZ6I7L.png": 0.9846153846153847, + "0ONF59OAQYX89LAM941E6.png": 0.9859154929577465, + "0E1IVAEMQXKVCH3Q0JCVX.png": 0.9782608695652174, + "0K8YJZK75V8SXL0GIM4SU.png": 0.9896907216494846, + "0LUL2CVQ1HLC1KL6D2VMP.png": 0.4065934065934066, + "0WIEGQEF4G9LN2UM49Y12.png": 0.9876543209876543, + "0AQ9EL10BYBSGJO2RLC6Q.png": 0.9888888888888889, + "0M0WYXRJONRUQ3ZG24MJJ.png": 0.989010989010989, + "0E3XQJO1C4CKR9TNFB4IC.png": 0.9871794871794872, + "0H17CYXGJTHXPQUP51TBI.png": 0.9911504424778761, + "0NK736IIIHGBF52E1UKQ4.png": 0.9859154929577465, + "0DYIHMLOKOR6HNF2XAI8F.png": 0.9836065573770492, + "0BZ5GZPTUSCNBNGBNQZEG.png": 0.9859154929577465, + "0M47PMX0DRIVKCJBYKHPJ.png": 0.9767441860465116, + "00MK8C41M7MW013CJ9SPU.png": 0.9922480620155039, + "0DMXCT01TPF8O33UMENE4.png": 0.9917355371900827, + "0WS9VI6T1X0M5H6D8O67Z.png": 0.9859154929577465, + "0XQ9XQOL15RDKQT4YZUQC.png": 0.5739130434782609, + "1DD8FWYLADAY5EJ3UZUD9.png": 0.9876543209876543, + "0MXPSYD5A5U86BSSZQMJN.png": 0.975609756097561, + "0QAOLXSIIRIRQ3W1OP7Y8.png": 0.9921259842519685, + "1BZ5GZPTUSCNBNGBNQZEG.png": 0.9767441860465116, + "0Y5AIJNHB8DTPQOC92X6P.png": 
0.9882352941176471, + "0IL3BP1QRAZ54V54IBK9A.png": 0.9876543209876543, + "0MDCUYD9ASW4AGWD3ZYK5.png": 0.9891304347826086, + "0MON88TOR16AGTBLDTGJC.png": 0.9904761904761905, + "0QVCHWR0EZCMQ5J5P0Z1J.png": 0.9767441860465116, + "0IPJ09DW34Q275Z5CMS1X.png": 0.99, + "1A7ZU26KX6C0LG0D3T3ZS.png": 0.9863013698630136, + "0M49YEV7H4P48EONCBFPS.png": 0.9863013698630136, + "1D34PI1NNCV0AB4WCQMB3.png": 0.9863013698630136, + "0AGYYXV88WJW2FC6FVV3Q.png": 0.9863013698630136, + "0F9W69ODT3GQCQ6F11L2E.png": 0.9767441860465116, + "0Q3RJT1DJMPO9D9BE6JNO.png": 0.9868421052631579, + "0ETQJY2HRGYIBO46BSD3P.png": 0.4503105590062112, + "0BRZ4ALOZMMEXGR4AVJWG.png": 0.43983402489626555, + "0WLG2ZXPFXZGF9RM2Z6N6.png": 0.9871794871794872, + "1BP5KU2XHXZ0C431B4OL9.png": 0.43450479233226835, + "00ZG4J0UMAHQMR57DQ5T7.png": 0.9818181818181818, + "0S5HD36LFVDWLLH6UFK9I.png": 0.9939759036144579, + "0EW4PZW85MH9BS8VI83KZ.png": 0.9848484848484849, + "0EMFKQLMGGAFPLQGUEZSJ.png": 0.96, + "0H4TWDI39J0HRG239GQ10.png": 0.9938650306748467, + "0BSXNNN0LA94101P5D38I.png": 0.9882352941176471, + "0SLVZSD9X7VZPGQU0Q2QN.png": 0.9850746268656716, + "0K6WPSDJC0ICOWFEASYB4.png": 0.9911504424778761, + "0TY3MTJ6YZDE6QI73SH5A.png": 0.9859154929577465, + "0B87OEX5XX0BHUOQAS50A.png": 0.9896907216494846, + "1C1X00FENSOUN2Y08Y3JT.png": 0.967741935483871, + "0KEM29NIZZ7UI3CTN6NEA.png": 0.9896907216494846, + "0JZQMX95783K8QW3ERXSM.png": 0.8827586206896552, + "0R47TY8TMFAL346RUY0LW.png": 0.9696969696969697, + "0EG83QLMPW7MGGMGBYGPD.png": 0.9882352941176471, + "1AGYYXV88WJW2FC6FVV3Q.png": 0.9927536231884058, + "0OF74SYX6Q102JCQ5KELF.png": 0.9896907216494846, + "1BX1I2HS6BLV92NZHV6J1.png": 0.9940828402366864, + "1DDEMI2034QD7F4QRH1IV.png": 0.972972972972973, + "0Z8LGXZ1SMLBHV5T6Y4O9.png": 0.9859154929577465, + "0NGE5XRBD2YHBZFMDL7VD.png": 0.9795918367346939, + "0SX4TWDHV25DCZV3HQEHH.png": 0.9777777777777777, + "00Q04QLVCESVWCSMDAURN.png": 0.9855072463768116, + "0SK696SAQW3MZNDMD4W85.png": 0.984375, + 
"0F4WBFLG32FAT22W0NGEY.png": 0.45871559633027525, + "0TNFF3RUQ2UL3PRNYF45M.png": 0.9868421052631579, + "1A4OYW3ZL5QP76IGF0DK0.png": 0.9904761904761905, + "0IUNRRJ3JHMEAORR2EXRS.png": 0.9908256880733946, + "0L764EQB3ZGC3FYQ20PR9.png": 0.9863013698630136, + "0XZJ4SZWY0ZOD9QBZP96A.png": 0.9922480620155039, + "0PU3J7NYVCB6XLSJJOEZ9.png": 0.9911504424778761, + "0DDKIN1PFJQTFW1JADVHT.png": 0.9863013698630136, + "1A4G5JAZSJS4BT5LBZ2Q3.png": 0.9836065573770492, + "0DPMX3BRIG9CWZPYKXFWS.png": 0.9921259842519685, + "0N7P792721CFI8EDOCB0N.png": 0.9908256880733946, + "1BIC4PMO7M3ZB8WUC3STJ.png": 0.9933333333333333, + "0XNBY82W4NFSD9GV6ONKU.png": 0.9911504424778761, + "0F3P8XGEMBYESYCYAOQPN.png": 0.9923076923076923, + "00D983SP0WHF6YGMKSHCR.png": 0.9803921568627451, + "0N91H0ZWMHBPPPPON4HUW.png": 0.993006993006993, + "0AWZPWR198XN7U8HY1E32.png": 0.9836065573770492, + "0S0Z9J05KZWNPKUFRD78Z.png": 0.9927007299270073, + "0FCDXM7JS1QEBBY3DCGBM.png": 0.9795918367346939, + "0OEVVJNLZKKW7GOPM188W.png": 0.9615384615384616, + "0Z7FUMCO707ZDI55EG306.png": 0.9878048780487805, + "0DE4P4M2855D754NA8993.png": 0.9722222222222222, + "0UZ81HSUQSHVVGU56NIOG.png": 0.9902912621359223, + "0AAERNSDA06GDA7OFZVCA.png": 0.992, + "0WZXI1YECN77S9GD6GQ4M.png": 0.98989898989899, + "0S8HOU13AW544ALTKAB73.png": 0.989010989010989, + "0AAPDAAK73MRINE7PM0ZJ.png": 0.41628959276018096, + "0UJ2AFVE6RWGTYSB6DKLJ.png": 0.45871559633027525, + "0ISYQEE43TA3O41XMA47A.png": 0.993103448275862, + "0L2E8S3ICCMGPE9PS3RLV.png": 0.9908256880733946, + "0BIC4PMO7M3ZB8WUC3STJ.png": 0.984375, + "0CTFYQFHQ1S1FLIEAPZTB.png": 0.9767441860465116, + "0A9RJA2I3YJT58JR2MEOT.png": 0.9818181818181818, + "0UGHOJ96BTPB57BR0DJS7.png": 0.4505494505494505, + "0TW35WW1PRLL2YKVYWYRM.png": 0.9818181818181818, + "0HTO45RT9NH5KQUCLOV2H.png": 0.9722222222222222, + "0F0TA5W8GO31TXUFMHHTO.png": 0.98989898989899, + "0HWMSCT6L3MCGFJV4OXF8.png": 0.975609756097561, + "0KCIUQNXNE3ZMX5ECY7V3.png": 0.9925925925925926, + "0ET4I24PZATQRKGMGG5KC.png": 
0.975609756097561, + "0I6WVEL7V26O3KJJ1GGYF.png": 0.9896907216494846, + "1A2AT7TW5KOMUUAK7TQXT.png": 0.9767441860465116, + "0W8NNJL30MNEY6RTPD6DA.png": 0.9767441860465116, + "0XC0XOHP855H9DFG41W9T.png": 0.9803921568627451, + "1DC57AS1OYZ1BRHZHPIO2.png": 0.9903846153846154, + "00WVVGSQ00B0IZU4OKPHQ.png": 0.9916666666666667, + "0RTI5C20W407SL59RANEM.png": 0.991304347826087, + "0A0DA327P9Y532UTLHE2N.png": 0.9722222222222222, + "0AQZMEU4Q38NKK4USHAC5.png": 0.9896907216494846, + "0U7602J86XPC7AVTSPMWL.png": 0.9878048780487805, + "0DSGAEKSK52RUNGEOGEXP.png": 0.9921259842519685, + "0JD5R5NDJKRRHT1UI6GFW.png": 0.7058823529411764, + "0PQ9OK98A29AC6GEI3DKQ.png": 0.4882352941176471, + "0TA7SVAQC7PKDE8BUP3NF.png": 0.9887640449438202, + "0TQM47CA0F30LG2C0S2KN.png": 0.9887640449438202, + "0HA1FE8828DJ86ZIJUIX4.png": 0.967741935483871, + "0P1Y0C88Y17DSXE616MQN.png": 0.98989898989899, + "0I6GDDWCTMF9V4YLGLBIM.png": 0.4036697247706422, + "0SI6DA6CAXUMFYSXBXIF6.png": 0.9906542056074766, + "0NM9CUQJV6W2N9434O81D.png": 0.9859154929577465, + "0WU8XJP1VJSLZXQ7S43HM.png": 0.9767441860465116, + "0P0WR7JJ9JBXO0HVMDETS.png": 0.975609756097561, + "0ZNTZMWW1X0QZV4AGDHYL.png": 0.9926470588235294, + "0C98HOE9TQ4HZK6DKGF5I.png": 0.989010989010989, + "0JCDZWWAMUR9FRGHL9IVN.png": 0.9911504424778761, + "0PVN50SJP1LUTHE2TID60.png": 0.9926470588235294, + "0D7CMRTBBENLYDO7EWWVZ.png": 0.98, + "0JOTZX26K6UJB6LNVK9RH.png": 0.975609756097561, + "0ZFOZ6UKG7DCCD5HSUIIX.png": 0.9876543209876543, + "0L7V0ZXS2M9JMSBD05I25.png": 0.9873417721518988, + "0G1E97R3QFH7FG9AUAIFB.png": 0.9863013698630136, + "0CORAY089OILX2OWIKU1E.png": 0.984375, + "0EH9JARAL7RYD3CVMM8AZ.png": 0.5185185185185186, + "00KDBG5H22KPNCPCK7L2P.png": 0.9848484848484849, + "00XJ5C1RWIRVID9IPUX8G.png": 0.8, + "0FFJM5ABUDDCT2DOCW2T4.png": 0.9916666666666667, + "0D34PI1NNCV0AB4WCQMB3.png": 0.9896907216494846, + "0X9D7AJTD7S91BNHMQ4L0.png": 0.9876543209876543, + "0W9SN5GJDEWTG3WAPGPDZ.png": 0.9887640449438202, + "0ATIOLLN3DOAHKX75560Q.png": 
0.9882352941176471, + "1C9EM94JJTICVGS6U2T2U.png": 0.9883720930232558, + "0TG6BRHGF3C865C2OL6DE.png": 0.9882352941176471, + "1BUP8L4PGVBNQE1GSCGJZ.png": 0.9863013698630136, + "1AJLKOKRHEVOTGE90GEH6.png": 0.989010989010989, + "1C98HOE9TQ4HZK6DKGF5I.png": 0.9859154929577465, + "0IH65GI6IN6RQWJE04YPG.png": 0.9859154929577465, + "0DNHG32KRYJ9PQ7UU1YL5.png": 0.9863013698630136, + "0EV54WP1Y9JDCWMDIT0OM.png": 0.975609756097561, + "0BE3I0HX6XWZQA4EFY99C.png": 0.984375, + "0O7G4HGEK48J2NUB5RCES.png": 0.9882352941176471, + "1BXWVCNXW1Z4N1XG8QOG4.png": 0.9905660377358491, + "0M2V36SUMHY2U8FRS9NYZ.png": 0.4424778761061947, + "0STJA7OMA59TOQ8XQ54G5.png": 0.98, + "0VB0OIQZQXKY5PA111Q8B.png": 0.984375, + "0RBPX6DU1W6LIYA2VRAA4.png": 0.972972972972973, + "0SP3KJJ2HMQZF088NH2DR.png": 0.9904761904761905, + "1D0ZG3O9YHMQAPHCD0890.png": 0.9655172413793104, + "0XZ590ZLZXRB09XIADL9V.png": 0.9934640522875817, + "0QU6QW0KAWVXZ6TL7FVJE.png": 0.9933774834437086, + "0PKH21420YW57OPRJR21R.png": 0.9922480620155039, + "0TX7Y5KWQ2MVU3579QIYH.png": 0.9777777777777777, + "0Y6OW4PMMWG05F4ZFYQ40.png": 0.9767441860465116, + "0EK5DRITVR9G3KDVF1CTJ.png": 0.9876543209876543, + "0DDEMI2034QD7F4QRH1IV.png": 0.9933774834437086, + "0HJXUBEZQCR1DEUQ8V30I.png": 0.9932885906040269, + "0BG5K95UCWQ3JXWC501XA.png": 0.9886363636363636, + "00TNQG8N9T3KUVMZ7AWTB.png": 0.967741935483871, + "0TJSB9YOUAG7C9OZW3U80.png": 0.9848484848484849, + "0SYEGYPSNLKCALCQBPGK2.png": 0.9929577464788732, + "0IP23CAYMTIVE93KLVMRA.png": 0.9824561403508771, + "0KFRN6DX1A6MMGS24B39T.png": 0.9850746268656716, + "1CTFYQFHQ1S1FLIEAPZTB.png": 0.9803921568627451, + "0U9U2Q7VBD1V6HBT7FQKM.png": 0.9923076923076923, + "0S7MUFP120D8OP4ZCCCUV.png": 0.4873417721518988, + "0BXWVCNXW1Z4N1XG8QOG4.png": 0.9873417721518988, + "1A7ZA5BA5TPHBN2WP6TT9.png": 0.9824561403508771, + "1ACY14LU0VWSKDOHEAVZM.png": 0.9924812030075187, + "0MPO1XXHHM8I5BOIT3DB9.png": 0.9876543209876543, + "0RSQ19UNM98CNWII5Q25F.png": 0.975609756097561, + 
"0EAA9XEBN9W7XDBPK31UZ.png": 0.9803921568627451, + "0U0BR4A64P7CE7YZ57HQ1.png": 0.9911504424778761, + "0XFNT3NMKFW1DB0F2LVY3.png": 0.9916666666666667, + "1AQZMEU4Q38NKK4USHAC5.png": 0.9904761904761905, + "0VGZMTO2VCZVZKGAOHZEU.png": 0.9910714285714286, + "0DBQ2M6XQ66Y2895PYNOM.png": 0.984375, + "0BP5KU2XHXZ0C431B4OL9.png": 0.9811320754716981, + "0PYCGJHF1705P4NTCM8AS.png": 0.9824561403508771, + "0RAGYZ9465I7GLXZXCLCQ.png": 0.9924812030075187, + "1A9560NY0NQ5OVZQQBJRQ.png": 0.4636363636363636, + "0KXDSHWWWYQJBXT2Y6U8S.png": 0.9803921568627451, + "1BF411IVR1HLU1Q44I3K7.png": 0.984375, + "0T1ZL9NSVN3385DR7B86C.png": 0.9824561403508771, + "0SYKTWM1EF4KS646AWQEL.png": 0.9803921568627451, + "0S104IFNSN5EJ31212IOP.png": 0.989010989010989, + "0H2RZUXKBQEVFJ2JT29R4.png": 0.9818181818181818, + "0SVC8WRHPF38HHKBN65YD.png": 0.9926470588235294, + "0HVIW7DPWCJSWJ5PCJDM2.png": 0.9855072463768116, + "0PRIZA7CG2JAL9GTN265B.png": 0.9929577464788732, + "0FXLG8PO267BZPBBXIX4E.png": 0.9922480620155039, + "1B0LNAITDDPPCJ4I6XIWK.png": 0.9868421052631579, + "0YNQ2KZ01B1TWP9FR5DE7.png": 0.45360824742268047, + "0A8AVSZNK6GTNOCBEVFOY.png": 0.9722222222222222, + "0XM8RQF6JQDOTJ5WQVHFE.png": 0.9873417721518988, + "0JBU3LJRDTMJI2XGB6NUE.png": 0.9868421052631579, + "0FKIASN9E4KCZ0JRCAJLQ.png": 0.9917355371900827, + "0A2AT7TW5KOMUUAK7TQXT.png": 0.9882352941176471, + "0QISJETVE3HGF1PMBD1BM.png": 0.9848484848484849, + "0KBOWWQLYSIZ0P4SIZMHJ.png": 0.993421052631579, + "0OMZO818L9AC4U3JJTKGD.png": 0.9863013698630136, + "0IVOAVCWOJ4CA92H7CM1Q.png": 0.9917355371900827, + "0SH9F7EHAT35OVT003OC5.png": 0.3728813559322034, + "0F7BJ4Z9F1R95HUG4RRZD.png": 0.9767441860465116, + "00RJGV4A4UTMTLDEIR1IG.png": 0.975609756097561, + "0BUP8L4PGVBNQE1GSCGJZ.png": 0.967741935483871, + "0B1QX4K8U8P9QA3HVLRPN.png": 0.9923664122137404, + "0IZ8M2UHYSA9H6K8XIOKS.png": 0.9855072463768116, + "0KLEV2650Z6X2DAUO94QK.png": 0.9876543209876543, + "0MRYJGMAVHEDMZ3XSX9XI.png": 0.9871794871794872, + "0I6PWVE3HEK6ZZ5K53UY4.png": 
0.9818181818181818, + "1BE3I0HX6XWZQA4EFY99C.png": 0.984375, + "0M7CJCA8K3PX504PNHJRT.png": 0.9883720930232558, + "0ESACK4QILSDBXRS54UK0.png": 0.9795918367346939, + "0KLU5K631Q9RHQOY6771B.png": 0.4444444444444444, + "0RAZV12CY84ZGA4BRZQUC.png": 0.9871794871794872, + "0HZ4TDEJG6BY7B2RTALZK.png": 0.9868421052631579, + "0ROPMUV96VG8PTONLNGV9.png": 0.9887640449438202, + "0L194VI2NIOAX4AUCU2WG.png": 0.9767441860465116, + "0PG6K8IFJM2PHHLA1S4Y6.png": 0.9905660377358491, + "0H5AHQVKHAKQ1W636PLCS.png": 0.9878048780487805, + "0ZAHJJUMYDOQIMIUUFAUD.png": 0.9863013698630136, + "0MO39PWU9N82Y88WNANVM.png": 0.984375, + "0ZSUP0IMF3PK86DIVWQ8V.png": 0.967741935483871, + "0M1B6J5CTPBITI79C68MO.png": 0.9824561403508771, + "0BWJOYJSDHL1XJH6UG2RM.png": 0.9882352941176471, + "0SPYHIS3OEEZ082CFJEGF.png": 0.9871794871794872, + "0A3YX0911ULBZSCUBNDZS.png": 0.9896907216494846, + "0FK1CU21TAIHIR7YWZ2W7.png": 0.9818181818181818, + "0WP1ZBKQCK8W2W0ZXI2Z4.png": 0.7916666666666666, + "1AF02R419WL1YN97ZV144.png": 0.9767441860465116, + "0BKBFKJTQPLQBNIBZSM7E.png": 0.9916666666666667, + "1C1ZYGFL2YNFM2W3P2KN1.png": 0.9871794871794872, + "0IHOYC7KXLECI1F3G1WAF.png": 0.9848484848484849, + "1A0DA327P9Y532UTLHE2N.png": 0.9868421052631579, + "0SK9B35AHQ2OQA1RDKHHP.png": 0.9917355371900827, + "0EECJZYQ42MZLSWPOK9ZH.png": 0.9887640449438202, + "0UFBWJZOD5PBKMVX7G231.png": 0.9824561403508771, + "0OZ6DU5POAFSM589UXX4S.png": 0.9876543209876543, + "0OUIP8MTUSWLFQ6J13VXT.png": 0.967741935483871, + "0NFAI2Z8TAUKU6S7892KH.png": 0.975609756097561, + "0F3VUGWY35HLOJYHPT78G.png": 0.9883720930232558, + "0AYZOGNX998RYQVPWP1OA.png": 0.9846153846153847, + "0UC2QTKS4ITXYK4E6HU9T.png": 0.9939759036144579, + "0KK6YAU45B9B34SSZTAS7.png": 0.9836065573770492, + "0WV2Q54214D8ARYKCMBE0.png": 0.547945205479452, + "0TUDLFORB7K1BVA4U0ULU.png": 0.9917355371900827, + "0XZRML313QJ6X82YZJLYT.png": 0.9848484848484849, + "0ACY14LU0VWSKDOHEAVZM.png": 0.9873417721518988, + "0HH9NAZ1I95NJINORKJIM.png": 0.9795918367346939, + 
"1AWZPWR198XN7U8HY1E32.png": 0.9795918367346939, + "0TLG8NFY9BXHB15A47OGW.png": 0.9926470588235294, + "1CBFM7HG55Z7O8F4Y0O0L.png": 0.9848484848484849, + "0EV3WT6VJG3QH2HFJEIBA.png": 0.975609756097561, + "0OBPU21JDPO0KPYEQGLFO.png": 0.9722222222222222, + "0MJ27YD7XBYLQKM87RM3Y.png": 0.9887640449438202, + "0BHU2JO8ODKS3OL4RIU6A.png": 0.99, + "0WVB351NNWY8OOQQRRW6F.png": 0.476878612716763, + "1BCT1VG1R4HUK3Q6NMZGU.png": 0.9916666666666667, + "0YJ043WAWUTW4AEMDTD4R.png": 0.9782608695652174, + "0YS08VVMS1YPOHVJOFXXA.png": 0.98989898989899, + "0EWWFSOUCGGD5BK6RKMKO.png": 0.522875816993464, + "0VCTD6BP09MBAXOOM5Y5E.png": 0.975609756097561, + "0S7ZGBZ7OBI15CZS5V95A.png": 0.984375, + "0JJ9O2OQ6O13OAOFM7643.png": 0.99, + "00TXY79AHYWJ7WLXB3VLV.png": 0.9846153846153847, + "0J2UQ7WIZXFK4I5TV9UHW.png": 0.9935064935064936, + "0TYF1PBQCH64LANCKYWY7.png": 0.9859154929577465, + "0SWG2OW7F5RLADFAHJ9A4.png": 0.9882352941176471, + "0RV3TKC89HQD4FRFCTNSK.png": 0.9767441860465116, + "1BQBJ8UFLH7H3JQ965JF6.png": 0.9863013698630136, + "0C70JEJWPOAT1S8RUWCVB.png": 0.972972972972973, + "0RCE6GI0QYPCA15RH6HM7.png": 0.49382716049382713, + "0SB1QV5XRJM6W0HRU4AH7.png": 0.9891304347826086, + "0I1HQDO584A6ODC54PLNA.png": 0.9891304347826086, + "1AWHACFMS9KSHM18INN41.png": 0.9836065573770492, + "1BKBFKJTQPLQBNIBZSM7E.png": 0.9863013698630136, + "0T0Q44ALMC9WURWEESEMP.png": 0.9875, + "1A9RJA2I3YJT58JR2MEOT.png": 0.9615384615384616, + "0DD8FWYLADAY5EJ3UZUD9.png": 0.9868421052631579, + "0F078JDZMTC8C8H2P8IVA.png": 0.9921875, + "0L5KEP1L6K1ALH88LLMEY.png": 0.9795918367346939, + "0U2FXJ2H3K5SQTZNJ1WV1.png": 0.98, + "0U49K9QPO02GF77TU5JB8.png": 0.9863013698630136, + "0A9560NY0NQ5OVZQQBJRQ.png": 0.9836065573770492, + "0MRQ2DF27RW94C36QLLTZ.png": 0.9863013698630136, + "0BCT1VG1R4HUK3Q6NMZGU.png": 0.9795918367346939, + "0GQC64N9E830BWDTF8L0Q.png": 0.9910714285714286, + "0HIESCSLITYADXZHOO7IA.png": 0.989010989010989, + "0FZFGRN9B0WT3XCQMOVPJ.png": 0.9767441860465116, + "00LQMDL10JL253UW69YUO.png": 
0.9818181818181818, + "0U79XK18POJ6HCLLOXS4Z.png": 0.9905660377358491, + "0I3RG6GXJ2VILV3BPFIY4.png": 0.9767441860465116, + "0X8PV0Z6SNEKPIPOCP5HR.png": 0.3931034482758621, + "0UFQOEKLIWTX65AY778BD.png": 0.5275590551181102, + "0HZUERFF8VNKXAZLV8RO5.png": 0.9850746268656716, + "0FWXHCMHZ7KG6WYRNWD6Q.png": 0.9922480620155039, + "0A9B6NHM7J57SCT1Z8TAS.png": 0.9861111111111112, + "0WOTQFWQFAEPN0HZ6MYIL.png": 0.9929577464788732, + "0IUNSDMCG8WWVJJ758NN9.png": 0.9887640449438202, + "0XLK4S5OWK77LRNU2JAG9.png": 0.46543778801843316, + "00FMSMFBJU5732FGUTLIF.png": 0.9821428571428571, + "0YOETJE558OS77GHG5L5U.png": 0.9876543209876543, + "0BKXE7HQJOJV0I1LL8YOF.png": 0.9821428571428571, + "0AWHACFMS9KSHM18INN41.png": 0.5416666666666667, + "1BL58Q9DLPBQF73ROGFDX.png": 0.9921875, + "0Q7EACO6OF8WQFZXI1MRQ.png": 0.9896907216494846, + "0R1IOV08YNRVC0KQS84EF.png": 0.9818181818181818, + "0SEF4O8YR8ULW23U32SE6.png": 0.9836065573770492, + "0IQGTS9QZK0ZYRL80GOSD.png": 0.9767441860465116, + "0E00IBZTY74DGR1SSX77L.png": 0.975609756097561, + "0BR0V61AWXYXVQSK6RMY7.png": 0.9911504424778761, + "0MESCFGQYOQNMVWD6B1VU.png": 0.9885057471264368, + "0F3GIMIL9E4UNWEFYLKGV.png": 0.9824561403508771, + "1A8AVSZNK6GTNOCBEVFOY.png": 0.9910714285714286, + "0E7WX1NX5ZKR24SEIUKRN.png": 0.9811320754716981, + "0QZOZCFYQ2TK5C0Q3KN5C.png": 0.5106382978723405, + "0SDC2B1I853GR50G545IX.png": 0.9891304347826086, + "0FEKB24PHTZNT3KIZZVIS.png": 0.9876543209876543, + "0SIW9Q9NWY3TWRC712D4J.png": 0.9876543209876543, + "0JFFFUOFXDOLV2ZGQJAPB.png": 0.9887640449438202, + "0O976W9Y9NDSJ24YV7HU9.png": 0.975609756097561, + "0B0LNAITDDPPCJ4I6XIWK.png": 0.9811320754716981, + "1AAPDAAK73MRINE7PM0ZJ.png": 0.9852941176470589, + "0HC8F1RENJE297WV8RW0N.png": 0.45517241379310347, + "0OXJ4SWAYILOZVQCGO1OB.png": 0.9937106918238994, + "0I3S2Z8YWZ0JOIMKGU51B.png": 0.972972972972973, + "0Y0LZ2LRH7BR5ZDYBTH7U.png": 0.9824561403508771, + "0T0LAS5REAE827IQO0Q9U.png": 0.98989898989899, + "1AQ9EL10BYBSGJO2RLC6Q.png": 0.9868421052631579, + 
"0L1YL688ZRRPYAJ07UOFQ.png": 0.9911504424778761, + "1BWJOYJSDHL1XJH6UG2RM.png": 0.9922480620155039, + "0TELO9B7QI0QQVFMJXAQ1.png": 0.9896907216494846, + "0XDX2OT3OG575I0U99YAQ.png": 0.54, + "0X49B57NNHU6FEB4J21VY.png": 0.993006993006993, + "0DHJ8WY2XLWKG7K345LAK.png": 0.975609756097561, + "0BQBJ8UFLH7H3JQ965JF6.png": 0.989010989010989, + "00CBN2MRTC48ZY50RUSBW.png": 0.9767441860465116, + "1D7CMRTBBENLYDO7EWWVZ.png": 0.9863013698630136, + "0BX1I2HS6BLV92NZHV6J1.png": 0.9722222222222222, + "0XQE375V4J34MLJYN711T.png": 0.9722222222222222, + "1BKXE7HQJOJV0I1LL8YOF.png": 0.9795918367346939, + "0E3OA2PY1K3B44GN9AS0Y.png": 0.9863013698630136, + "0SCRALC3GPIO2ZD918U8L.png": 0.478021978021978, + "0ITKDLWB7SDGMM8980ZSS.png": 0.9911504424778761, + "1BYRMKANKN4PL6JFPG8AR.png": 0.989010989010989, + "0BYRMKANKN4PL6JFPG8AR.png": 0.9803921568627451, + "0R8W6O2N25AVQI9FQ5IL7.png": 0.972972972972973, + "0J9TV59N7U65CB7YCHD38.png": 0.9922480620155039, + "0VX41MM59ET2MK09202C3.png": 0.9896907216494846, + "1CGP5R7FMVCKR47XK6IVA.png": 0.9896907216494846, + "1BSXNNN0LA94101P5D38I.png": 0.9926470588235294, + "0UMVEM9RUVZDRJRFA1W2V.png": 0.9722222222222222, + "0KPHJHUXB0MS3B9RHL57O.png": 0.9868421052631579, + "0TYH6IN161KXZT369VVWQ.png": 0.9795918367346939, + "0AUTW1OL7IAPO1JH1TQUR.png": 0.984375, + "0GNCKEB99NZ0J9GCAI0TH.png": 0.9850746268656716, + "0CGP5R7FMVCKR47XK6IVA.png": 0.9915966386554622, + "0NNLAUZDCGVKZP852ZJ7X.png": 0.9836065573770492, + "0EJW9DEXTHUR17CZCUPB1.png": 0.9850746268656716, + "0JZRIWIFSATGGFL8P0NZF.png": 0.9908256880733946, + "0VNHMSVYYS2Q0H0VJDNAK.png": 0.9782608695652174, + "0Z2ZZWW84O21E70F5RGIA.png": 0.993103448275862, + "0UODYVKUWDGD6S5D7LNAW.png": 0.9930555555555556, + "0NGNPB7KAJSSKSHQV1KZS.png": 0.9767441860465116, + "0Q3C8N8G8GXV2EP88XEXI.png": 0.9795918367346939, + "0IU89E255WY0KPUD6L7Y9.png": 0.9902912621359223, + "0F22CQYG638LSZROETJ9V.png": 0.9904761904761905, + "0HBX9X0EJVVL4TA9CJ25G.png": 0.9873417721518988, + "0IRDSID7UDBLOIRB9JQ9S.png": 
0.9883720930232558, + "0IHCMVD5NO41KSAB3ODC0.png": 0.5213270142180095, + "1AYZOGNX998RYQVPWP1OA.png": 0.9931506849315068, + "0WM2Y66O2ZJA831TN2E7Z.png": 0.9615384615384616, + "0CPW27F5C8I03UQBVBL2Y.png": 0.42500000000000004, + "1BR0V61AWXYXVQSK6RMY7.png": 0.989010989010989, + "0JJPRMSYFQLJKD3JYA1JP.png": 0.9850746268656716, + "0RT937QPOOWU9LKZVU0G3.png": 0.9922480620155039, + "0EFBK546D496KI033ACDF.png": 0.972972972972973, + "0EP1D1EXZC4VOMGZJGQQT.png": 0.9891304347826086, + "0DSQ4IAVY32EHCJ0AJM1Y.png": 0.9824561403508771, + "0F4HFOUP4374O8RL4E914.png": 0.9824561403508771, + "0IMS5FXCTVU6GSCR5CHTK.png": 0.984375, + "0P82SO3E98ECMRNRS62D4.png": 0.9868421052631579, + "1AUTW1OL7IAPO1JH1TQUR.png": 0.9911504424778761, + "0WFIWI83FBAOLU16M27NL.png": 0.9939024390243902, + "1A9B6NHM7J57SCT1Z8TAS.png": 0.9859154929577465, + "0UU3AG1PSZ1H78B6J17PA.png": 0.9882352941176471, + "0UVW81GETVKT5GPM6ZX0S.png": 0.9803921568627451, + "0IKFXKSQ9OA3OCRGQBZFI.png": 0.9795918367346939, + "1C70JEJWPOAT1S8RUWCVB.png": 0.9803921568627451, + "0A7ZU26KX6C0LG0D3T3ZS.png": 0.9818181818181818, + "0BL58Q9DLPBQF73ROGFDX.png": 0.9937106918238994, + "0UQWQMAYVXUFY65GH4ION.png": 0.9836065573770492, + "0R77TU5P7A0F1YTLIGSOA.png": 0.9863013698630136, + "0Q740R8QE6ZAF034ZMGQG.png": 0.9917355371900827, + "1BG5K95UCWQ3JXWC501XA.png": 0.9926470588235294, + "1CPW27F5C8I03UQBVBL2Y.png": 0.9916666666666667 + } +} \ No newline at end of file diff --git a/scripts/benchmark_table/benchmark_table.py b/scripts/benchmark_table/benchmark_table.py new file mode 100644 index 00000000..c6cbd7cb --- /dev/null +++ b/scripts/benchmark_table/benchmark_table.py @@ -0,0 +1,167 @@ +import zipfile +from pathlib import Path +import json +import pprint +from typing import Optional, List +import numpy as np +import wget + +from dedoc.api.api_utils import table2html +from dedoc.config import get_config +from dedoc.readers import PdfImageReader +from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer import 
from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer import TableRecognizer
from scripts.benchmark_table.metric import TEDS

# The benchmark report is written two directory levels above this script, into resources/benchmarks.
path_result = Path(__file__).parent / ".." / ".." / "resources" / "benchmarks"
path_result.absolute().mkdir(parents=True, exist_ok=True)

table_recognizer = TableRecognizer(config=get_config())
image_reader = PdfImageReader(config=get_config())

GENERATED_BENCHMARK = "on_generated_data"
OURDATA_BENCHMARK = "on_our_data"
# Selects the benchmark executed by the ``__main__`` section; it is also a part of the report file name.
TYPE_BENCHMARK = OURDATA_BENCHMARK


def call_metric(pred_json: dict, true_json: dict, structure_only: bool = False, ignore_nodes: Optional[List] = None) -> dict:
    """
    Compute the TEDS score for every sample of the ground truth and pretty-print the scores.

    :param pred_json: predictions in the form {file name: {"html": html code}}
    :param true_json: ground truth in the same form
    :param structure_only: passed to TEDS, if True the text of the cells isn't compared
    :param ignore_nodes: passed to TEDS, tags that are stripped from both tables before the comparison
    :return: dictionary {file name: TEDS score}
    """
    teds = TEDS(structure_only=structure_only, ignore_nodes=ignore_nodes)
    scores = teds.batch_evaluate(pred_json, true_json)
    pprint.PrettyPrinter().pprint(scores)

    return scores


def get_tables(image_path: Path) -> str:
    """
    Read the image with the image reader and convert the first recognized table to html.

    :param image_path: path to the image with a table
    :return: html of the first table or an empty string if the reader found no tables
    """
    document = image_reader.read(str(image_path))
    if not document.tables:
        # Without this guard an image without recognized tables stops the whole benchmark with IndexError.
        return ""

    # TODO: while works with one table in an image
    table = document.tables[0]
    table.metadata.uid = "test_id"
    return table2html(table, {"test_id": 0})


def make_predict_json(data_path: Path) -> dict:
    """
    Recognize a table on each file of the directory.

    :param data_path: directory with images
    :return: dictionary {file name: {"html": html document with the recognized table}}
    """
    predict_json = {}
    for pathname in data_path.iterdir():
        print(pathname)

        # TEDS.evaluate searches the table by the path "body/table", so the table is wrapped into a full html document.
        predict_json[pathname.name] = {"html": "<html><body>" + get_tables(pathname) + "</body></html>"}

    return predict_json


def download_dataset(data_dir: Path, name_zip: str, url: str) -> None:
    """
    Download the archive with benchmark data and unpack it into data_dir.
    Nothing is downloaded if data_dir already exists.

    :param data_dir: directory for the unpacked data
    :param name_zip: file name used for the downloaded archive (the archive is removed after unpacking)
    :param url: url of the archive
    """
    if data_dir.exists():
        print(f"Use cached benchmark data from {data_dir}")
        return

    data_dir.mkdir(parents=True, exist_ok=True)
    pdfs_zip_path = data_dir / name_zip
    # Download to the explicit file path: with a directory as the target the file name is chosen by wget
    # and may differ from name_zip.
    wget.download(url, str(pdfs_zip_path))

    with zipfile.ZipFile(pdfs_zip_path, "r") as zip_ref:
        zip_ref.extractall(data_dir)
    pdfs_zip_path.unlink()

    print(f"Benchmark data downloaded to {data_dir}")


def prediction(path_pred: Path, path_images: Path) -> dict:
    """
    Make predictions for all images of the directory and save them to a json file.

    :param path_pred: path of the json file for saving the predictions
    :param path_images: directory with images
    :return: the saved predictions {file name: {"html": html code}}
    """
    pred_json = make_predict_json(path_images)
    # ensure_ascii=False writes non-ASCII characters as is, so the encoding is fixed explicitly.
    with path_pred.open("w", encoding="utf-8") as fd:
        json.dump(pred_json, fd, indent=2, ensure_ascii=False)

    return pred_json


def _make_result(scores: dict, mode_metric_structure_only: bool) -> dict:
    """Build the benchmark report: the metric mode, the mean score and the score of each image."""
    return {
        "mode_metric_structure_only": mode_metric_structure_only,
        # The mean of an empty list is NaN, which isn't a valid json number, so 0.0 is used when there are no samples.
        "mean": float(np.mean(list(scores.values()))) if scores else 0.0,
        "images": scores,
    }


def benchmark_on_our_data() -> dict:
    """
    Run the benchmark on the labeled dataset (directory "images" and file "gt.json" of the downloaded archive).
    The text of the cells is taken into account (structure_only=False).

    :return: report with keys "mode_metric_structure_only", "mean" and "images"
    """
    data_dir = Path(get_config()["intermediate_data_path"]) / "benchmark_table_data"
    path_images = data_dir / "images"
    path_gt = data_dir / "gt.json"
    path_pred = data_dir / "pred.json"
    download_dataset(data_dir,
                     name_zip="benchmark_table_data.zip",
                     url="https://at.ispras.ru/owncloud/index.php/s/Xaf4OyHj6xN2RHH/download")

    mode_metric_structure_only = False

    with open(path_gt, "r", encoding="utf-8") as fp:
        gt_json = json.load(fp)

    # Creating base html (based on method predictions for future labeling):
    #   path_images = data_dir / "images_tmp"
    #   pred_json = prediction(Path("gt_tmp.json"), path_images)
    pred_json = prediction(path_pred, path_images)
    scores = call_metric(pred_json=pred_json, true_json=gt_json, structure_only=mode_metric_structure_only)

    return _make_result(scores, mode_metric_structure_only)


def benchmark_on_generated_table() -> dict:
    """
    Run the benchmark on generated tables, only the structure of the tables is compared (structure_only=True).

    Generated data from https://github.com/hassan-mahmood/TIES_DataGeneration
    Article with information about the generation: https://arxiv.org/pdf/1905.13391.pdf
    Note: tables of the 1st category are used
    Note: the header cell tag <th> isn't used, it is replaced by the tag <td>
    Note: all generated data (four categories) can be downloaded from https://at.ispras.ru/owncloud/index.php/s/cjpCIR7I0G4JzZU
    TODO: some tables have a low quality, the reason should be found.

    :return: report with keys "mode_metric_structure_only", "mean" and "images"
    """

    data_dir = Path(get_config()["intermediate_data_path"]) / "visualizeimgs" / "category1"
    path_images = data_dir / "img_500"
    path_gt = data_dir / "html_500"
    download_dataset(data_dir,
                     name_zip="benchmark_table_data_generated_500_tables_category_1.zip",
                     url="https://at.ispras.ru/owncloud/index.php/s/gItWxupnF2pve6B/download")
    mode_metric_structure_only = True

    # make common ground-truth file
    common_gt_json = {}
    for pathname in path_gt.iterdir():
        image_name = pathname.name.split(".")[0] + ".png"
        with open(pathname, "r", encoding="utf-8") as fp:
            table_html = fp.read()
        # exclude header tags: TEDS builds cells only from <td> (full tag names are matched to keep <thead> untouched)
        table_html = table_html.replace("<th ", "<td ").replace("<th>", "<td>").replace("</th>", "</td>")

        common_gt_json[image_name] = {"html": table_html}

    file_common_gt = data_dir / "common_gt.json"
    with file_common_gt.open("w", encoding="utf-8") as fd:
        json.dump(common_gt_json, fd, indent=2, ensure_ascii=False)

    # calculate metrics
    path_pred = data_dir / "pred.json"

    pred_json = prediction(path_pred, path_images)
    scores = call_metric(pred_json=pred_json, true_json=common_gt_json,
                         structure_only=mode_metric_structure_only,
                         ignore_nodes=["span", "style", "head", "h4"])

    return _make_result(scores, mode_metric_structure_only)


if __name__ == "__main__":
    result = benchmark_on_our_data() if TYPE_BENCHMARK == OURDATA_BENCHMARK else benchmark_on_generated_table()

    # save benchmarks
    file_result = path_result / f"table_benchmark_{TYPE_BENCHMARK}.json"
    with file_result.open("w", encoding="utf-8") as fd:
        json.dump(result, fd, indent=2, ensure_ascii=False)

# patch metadata: diff --git a/scripts/benchmark_table/metric.py
# patch metadata: b/scripts/benchmark_table/metric.py new file mode 100644 index 00000000..ff84a4a7 --- /dev/null +++ b/scripts/benchmark_table/metric.py @@ -0,0 +1,161 @@
# Copyright 2020 IBM
# Author: peter.zhong@au1.ibm.com
#
# This is free software; you can redistribute it and/or modify
# it under the terms of the Apache 2.0 License.
#
# This software is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# Apache 2.0 License for more details.

# Source: https://github.com/ibm-aur-nlp/PubTabNet

from collections import deque  # noqa: F401 (not used by the code below, kept to leave the module namespace unchanged)
from typing import List, Optional

import distance
from apted import APTED, Config
from apted.helpers import Tree
from lxml import etree, html
from tqdm import tqdm


class TableTree(Tree):
    """
    Node of the table tree in the format required by apted.
    """

    def __init__(self, tag: str, colspan: Optional[int] = None, rowspan: Optional[int] = None, content: Optional[List[str]] = None,
                 visible: Optional[bool] = None, *children: "TableTree") -> None:
        """
        :param tag: name of the html tag
        :param colspan: column span of the cell (None for non-cell nodes)
        :param rowspan: row span of the cell (None for non-cell nodes)
        :param content: tokens of the cell content (None for non-cell nodes)
        :param visible: False for cells marked as invisible, their content isn't compared
        :param children: child nodes
        """
        self.tag = tag
        self.colspan = colspan
        self.rowspan = rowspan
        self.content = content
        self.visible = visible
        self.children = list(children)

    def bracket(self) -> str:
        """Show tree using brackets notation
        """
        if self.tag in ("td", "th"):
            result = f'"tag": {self.tag}, "colspan": {self.colspan}, "rowspan": {self.rowspan}, "text": {self.content}'
        else:
            result = f'"tag": {self.tag}'
        result += "".join(child.bracket() for child in self.children)
        # each node is enclosed in a single pair of braces
        return "{" + result + "}"


class CustomConfig(Config):
    """
    Configuration of apted with the cost of node renaming suited for table trees.
    """

    @staticmethod
    def maximum(*sequences) -> int:
        """Get maximum possible value
        """
        return max(map(len, sequences))

    def normalized_distance(self, *sequences) -> float:
        """Get distance from 0 to 1
        """
        longest = self.maximum(*sequences)
        if longest == 0:
            # all sequences are empty, so they are equal (and the division below is impossible)
            return 0.0
        return float(distance.levenshtein(*sequences)) / longest

    def rename(self, node1: TableTree, node2: TableTree) -> float:
        """
        Compares attributes of trees

        :return: 1 if tags or spans differ; for "td" nodes 0 if one of the cells is invisible, otherwise
            the normalized Levenshtein distance between the contents (0 if both contents are empty); 0 for other equal nodes
        """
        if (node1.tag != node2.tag) or (node1.colspan != node2.colspan) or (node1.rowspan != node2.rowspan):
            return 1.
        if node1.tag == "td":
            if not node1.visible or not node2.visible:
                return 0.
            if node1.content or node2.content:
                return self.normalized_distance(node1.content, node2.content)
        return 0.


class TEDS(object):
    """ Tree Edit Distance based Similarity
    """

    def __init__(self, structure_only: bool = False, n_jobs: int = 1, ignore_nodes: Optional[List[str]] = None) -> None:
        """
        :param structure_only: if True, the content of the cells is ignored and only the structure is compared
        :param n_jobs: positive integer, it is only stored (the evaluation in this class is sequential)
        :param ignore_nodes: tags that are stripped from both tables before the comparison
        """
        assert isinstance(n_jobs, int) and (n_jobs >= 1), "n_jobs must be an integer greater than or equal to 1"
        self.structure_only = structure_only
        self.n_jobs = n_jobs
        self.ignore_nodes = ignore_nodes
        self.__tokens__ = []

    def tokenize(self, node) -> None:
        """ Tokenizes table cells

        Tokens are appended to self.__tokens__: the opening tag, the characters of the text, tokens of the children,
        the closing tag (except for the "unk" tag) and the characters of the tail (except for the "td" tag).
        """
        self.__tokens__.append(f"<{node.tag}>")
        if node.text is not None:
            self.__tokens__ += list(node.text)
        for n in node:
            self.tokenize(n)
        if node.tag != "unk":
            self.__tokens__.append(f"</{node.tag}>")
        if node.tag != "td" and node.tail is not None:
            self.__tokens__ += list(node.tail)

    def get_span(self, node, name_span: str) -> int:
        """
        Get the value of the span attribute of the node.

        :param node: html node
        :param name_span: name of the attribute ("colspan" or "rowspan")
        :return: value of the attribute, 1 if the attribute is absent or its value isn't positive
        """
        value = int(node.attrib.get(name_span, "1"))
        return max(value, 1)

    def load_html_tree(self, node, parent: Optional[TableTree] = None) -> Optional[TableTree]:
        """ Converts HTML tree to the format required by apted

        :param node: html node to convert
        :param parent: already converted parent of the node, the new node is appended to its children
        :return: the converted node if parent is None, otherwise None
        """
        if node.tag == "td":
            if self.structure_only:
                cell = []
            else:
                self.__tokens__ = []
                self.tokenize(node)
                # the first and the last tokens are the opening and the closing tags of the cell itself
                cell = self.__tokens__[1:-1].copy()

            try:
                new_node = TableTree(tag=node.tag,
                                     colspan=self.get_span(node, "colspan"),
                                     rowspan=self.get_span(node, "rowspan"),
                                     content=cell,
                                     visible=node.attrib.get("style") != "display: none")
            except Exception as ex:
                print(f"Bad html file. HTML parse exception. Exception's msg: {ex}")
                raise
        else:
            new_node = TableTree(node.tag, None, None, None, True)
        if parent is not None:
            parent.children.append(new_node)
        if node.tag != "td":
            # children of a cell are already represented by its tokens
            for n in node:
                self.load_html_tree(n, new_node)
        if parent is None:
            return new_node

    def evaluate(self, pred: str, true: str) -> float:
        """ Computes TEDS score between the prediction and the ground truth of a given sample

        :param pred: html document with the predicted table
        :param true: html document with the ground truth table
        :return: 1 - (tree edit distance / maximal number of nodes of two tables);
            0 if one of the documents is empty or doesn't contain a table at the path "body/table"
        """
        if (not pred) or (not true):
            return 0.0
        parser = html.HTMLParser(remove_comments=True, encoding="utf-8")
        pred_tables = html.fromstring(pred, parser=parser).xpath("body/table")
        true_tables = html.fromstring(true, parser=parser).xpath("body/table")
        if not pred_tables or not true_tables:
            return 0.0

        # only the first table of each document is compared
        pred_table, true_table = pred_tables[0], true_tables[0]
        if self.ignore_nodes:
            etree.strip_tags(pred_table, *self.ignore_nodes)
            etree.strip_tags(true_table, *self.ignore_nodes)
        n_nodes = max(len(pred_table.xpath(".//*")), len(true_table.xpath(".//*")))
        if n_nodes == 0:
            # both tables have no nested nodes: the trees are equal, and the normalization below is impossible
            return 1.0

        tree_pred = self.load_html_tree(pred_table)
        tree_true = self.load_html_tree(true_table)
        edit_distance = APTED(tree_pred, tree_true, CustomConfig()).compute_edit_distance()
        return 1.0 - (float(edit_distance) / n_nodes)

    def batch_evaluate(self, pred_json: dict, true_json: dict) -> dict:
        """ Computes TEDS score between the prediction and the ground truth of
            a batch of samples
            @params pred_json: {'FILENAME': {'html': 'HTML CODE'}, ...}
            @params true_json: {'FILENAME': {'html': 'HTML CODE'}, ...}
            @output: {'FILENAME': 'TEDS SCORE', ...}
        """
        scores = {}
        for filename in tqdm(true_json):
            # a sample without a prediction gets an empty html, its score is 0
            pred_html = pred_json.get(filename, {}).get("html", "")
            scores[filename] = self.evaluate(pred_html, true_json[filename]["html"])
        return scores

# patch metadata: diff --git a/scripts/benchmark_table/requirements.txt b/scripts/benchmark_table/requirements.txt new file mode 100644 index 00000000..99314805 --- /dev/null +++ b/scripts/benchmark_table/requirements.txt @@ -0,0 +1,3 @@ +# for metric
TEDS: +apted==1.0.3 +distance==0.1.3 \ No newline at end of file