diff --git a/.flake8 b/.flake8 index d7afb7d1..555b4381 100644 --- a/.flake8 +++ b/.flake8 @@ -16,12 +16,15 @@ exclude = resources, venv, build, - dedoc.egg-info - docs/_build + dedoc.egg-info, + docs/_build, + scripts/fintoc2022/metric.py # ANN101 - type annotations for self +# T201 - prints found +# JS101 - Multi-line container not broken after opening character ignore = ANN101 per-file-ignores = scripts/*:T201 - scripts/benchmark_pdf_performance*:JS101,T201 + scripts/benchmark_pdf_performance*:JS101 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 76ee04b4..0f439368 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,7 +3,7 @@ repos: rev: 5.0.4 hooks: - id: flake8 - exclude: \.github|.*__init__\.py|resources|docs|venv|build|dedoc\.egg-info + exclude: \.github|.*__init__\.py|resources|docs|venv|build|dedoc\.egg-info|scripts/fintoc2022/metric.py args: - "--config=.flake8" additional_dependencies: [ diff --git a/README.md b/README.md index a4c02e92..519ebb7b 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,10 @@ # Dedoc +[![License](http://img.shields.io/:license-apache-blue.svg)](http://www.apache.org/licenses/LICENSE-2.0.html) [![Documentation Status](https://readthedocs.org/projects/dedoc/badge/?version=latest)](https://dedoc.readthedocs.io/en/latest/?badge=latest) +[![GitHub release](https://img.shields.io/github/release/ispras/dedoc.svg)](https://github.com/ispras/dedoc/releases/) +[![Demo dedoc-readme.hf.space](https://img.shields.io/website-up-down-green-red/https/huggingface.co/spaces/dedoc/README.svg)](https://dedoc-readme.hf.space) +[![Docker Hub](https://img.shields.io/docker/pulls/dedocproject/dedoc.svg)](https://hub.docker.com/r/dedocproject/dedoc/ "Docker Pulls") ![Dedoc](https://github.com/ispras/dedoc/raw/master/dedoc_logo.png) @@ -39,26 +43,26 @@ In 2022, the system won a grant to support the development of promising AI proje ## Document format description The system processes different document formats. 
The main formats are listed below: -| Format group | Description | -|-----------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| Office formats | DOCX, XLSX, PPTX and formats that canbe converted to them. Handling of these for-mats is held by analysis of format inner rep-resentation and using specialized libraries ([python-docx](https://python-docx.readthedocs.io/en/latest/), [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/)) | -| HTML, EML, MHTML | HTML documents are parsed using tagsanalysis, HTML handler is used for han-dling documents of other formats in thisgroup | -| TXT | Only raw textual content is analyzed | -| Archives | Attachments of the archive are analyzed | | -| PDF,document images | Copyable PDF documents (with a textual layer) can be handled using [pdfminer-six](https://pdfminersix.readthedocs.io/en/latest/) library or [tabby](https://github.com/sunveil/ispras_tbl_extr) software. 
Non-copyable PDF documents or imagesare handled using [Tesseract-OCR](https://github.com/tesseract-ocr/tesseract), machine learning methods (including neural network methods) and [image processing methods](https://opencv.org/) | +| Format group | Description | +|----------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| Office formats | DOCX, XLSX, PPTX and formats that can be converted to them. Handling of these formats is held by analysis of format inner representation and using specialized libraries ([python-docx](https://python-docx.readthedocs.io/en/latest/), [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/)) | +| HTML, EML, MHTML | HTML documents are parsed using tags analysis, HTML handler is used for handling documents of other formats in this group | +| TXT | Only raw textual content is analyzed | +| Archives | Attachments of the archive are analyzed | | +| PDF, document images | Copyable PDF documents (with a textual layer) can be handled using [pdfminer-six](https://pdfminersix.readthedocs.io/en/latest/) library or [tabby](https://github.com/sunveil/ispras_tbl_extr) software. Non-copyable PDF documents or images are handled using [Tesseract-OCR](https://github.com/tesseract-ocr/tesseract), machine learning methods (including neural network methods) and [image processing methods](https://opencv.org/) | ## Examples of processed scanned documents * Dedoc can only process scanned black and white documents, such as technical specifications, regulations, articles, etc. 
-Document examples - +Document examples + * In particular, dedoc recognizes tabular information only from tables with explicit boundaries. Here are examples of documents that can be processed by an dedoc's image handler: -Table parsing example - +Table parsing example + * The system also automatically detects and corrects the orientation of scanned documents -## Example of structure extractor -Law structure example -Tz structure example +## Examples of structure extractors +Law structure example +Tz structure example ## Impact @@ -66,25 +70,26 @@ This project may be useful as a first step of automatic document analysis pipeli Dedoc is in demand for information analytic systems, information leak monitoring systems, as well as for natural language processing systems. The library is intended for application use by developers of systems for automatic analysis and structuring of electronic documents, including for further search in electronic documents. -# Online-Documentation -Relevant documentation of the dedoc is available [here](https://dedoc.readthedocs.io/en/latest/) +# Documentation +Relevant documentation of dedoc is available [here](https://dedoc.readthedocs.io/en/latest/) # Demo -You can try dedoc's demo: https://dedoc-readme.hf.space. -We have a video to demonstrate how to use the system: https://www.youtube.com/watch?v=ZUnPYV8rd9A. 
+* You can try the [dedoc demo](https://dedoc-readme.hf.space) +* You can watch a [video about dedoc](https://www.youtube.com/watch?v=ZUnPYV8rd9A) -![Web_interface](docs/source/_static/web_interface.png) +![Web_interface](https://github.com/ispras/dedoc/raw/master/docs/source/_static/web_interface.png) -![dedoc_demo](docs/source/_static/dedoc_short.gif) +![dedoc_demo](https://github.com/ispras/dedoc/raw/master/docs/source/_static/dedoc_short.gif) -# Some our publications +# Publications related to dedoc -* Article on [Habr](https://habr.com/ru/companies/isp_ras/articles/779390/), where we describe our system in detail -* [Our article](https://aclanthology.org/2022.fnp-1.13.pdf) from the FINTOC 2022 competition. We are the winners :smiley: :trophy:! +* Article [ISPRAS@FinTOC-2022 shared task: Two-stage TOC generation model](https://aclanthology.org/2022.fnp-1.13.pdf) for the [FinTOC 2022 Shared Task](https://wp.lancs.ac.uk/cfie/fintoc2022/). We are the winners :smiley: :trophy:! +* Article on habr.com [Dedoc: как автоматически извлечь из текстового документа всё и даже немного больше](https://habr.com/ru/companies/isp_ras/articles/779390/) in Russian (2023) +* Article [Dedoc: A Universal System for Extracting Content and Logical Structure From Textual Documents](https://ieeexplore.ieee.org/abstract/document/10508151/) in English (2023) # Installation instructions -**************************************** + This project has REST Api and you can run it in Docker container. Also, dedoc can be installed as a library via `pip`. There are two ways to install and run dedoc as a web application or a library that are described below. 
diff --git a/VERSION b/VERSION index 61618788..fae692e4 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.2 \ No newline at end of file +2.2.1 \ No newline at end of file diff --git a/dedoc/api/api_args.py b/dedoc/api/api_args.py index 1c260b37..f139733f 100644 --- a/dedoc/api/api_args.py +++ b/dedoc/api/api_args.py @@ -7,7 +7,7 @@ @dataclass class QueryParameters: # type of document structure parsing - document_type: str = Form("", enum=["", "law", "tz", "diploma"], description="Document domain") + document_type: str = Form("", enum=["", "law", "tz", "diploma", "article", "fintoc"], description="Document domain") structure_type: str = Form("tree", enum=["linear", "tree"], description="Output structure type") return_format: str = Form("json", enum=["json", "html", "plain_text", "tree", "collapsed_tree", "ujson", "pretty_json"], description="Response representation, most types (except json) are used for debug purposes only") @@ -29,7 +29,7 @@ class QueryParameters: # pdf handling pdf_with_text_layer: str = Form("auto_tabby", enum=["true", "false", "auto", "auto_tabby", "tabby"], description="Extract text from a text layer of PDF or using OCR methods for image-like documents") - language: str = Form("rus+eng", enum=["rus+eng", "rus", "eng"], description="Recognition language") + language: str = Form("rus+eng", enum=["rus+eng", "rus", "eng", "fra", "spa"], description="Recognition language") pages: str = Form(":", description='Page numbers range for reading PDF or images, "left:right" means read pages from left to right') is_one_column_document: str = Form("auto", enum=["auto", "true", "false"], description='One or multiple column document, "auto" - predict number of page columns automatically') diff --git a/dedoc/api/api_utils.py b/dedoc/api/api_utils.py index 1287912d..dd27cfc1 100644 --- a/dedoc/api/api_utils.py +++ b/dedoc/api/api_utils.py @@ -3,6 +3,7 @@ from dedoc.data_structures import LineMetadata from dedoc.data_structures.concrete_annotations.bold_annotation 
import BoldAnnotation from dedoc.data_structures.concrete_annotations.italic_annotation import ItalicAnnotation +from dedoc.data_structures.concrete_annotations.reference_annotation import ReferenceAnnotation from dedoc.data_structures.concrete_annotations.strike_annotation import StrikeAnnotation from dedoc.data_structures.concrete_annotations.subscript_annotation import SubscriptAnnotation from dedoc.data_structures.concrete_annotations.superscript_annotation import SuperscriptAnnotation @@ -116,7 +117,7 @@ def json2html(text: str, paragraph: TreeNode, tables: Optional[List[Table]], tab if table2id is None: table2id = {table.metadata.uid: table_id for table_id, table in enumerate(tables)} - ptext = __annotations2html(paragraph, table2id) + ptext = __annotations2html(paragraph, table2id, tabs=tabs) if paragraph.metadata.hierarchy_level.line_type in [HierarchyLevel.header, HierarchyLevel.root]: ptext = f"{ptext.strip()}" @@ -125,7 +126,10 @@ def json2html(text: str, paragraph: TreeNode, tables: Optional[List[Table]], tab else: ptext = ptext.strip() - text += f'

{" " * tabs} {ptext} id = {paragraph.node_id} ; type = {paragraph.metadata.hierarchy_level.line_type}

' + ptext = f'

{" " * tabs} {ptext} id = {paragraph.node_id} ; type = {paragraph.metadata.hierarchy_level.line_type}

' + if hasattr(paragraph.metadata, "uid"): + ptext = f'
{ptext}
' + text += ptext for subparagraph in paragraph.subparagraphs: text = json2html(text=text, paragraph=subparagraph, tables=None, tabs=tabs + 4, table2id=table2id) @@ -157,6 +161,9 @@ def __value2tag(name: str, value: str) -> str: if name == UnderlinedAnnotation.name: return "u" + if name == ReferenceAnnotation.name: + return "a" + if value.startswith("heading "): level = value[len("heading "):] return "h" + level if level.isdigit() and int(level) < 7 else "strong" @@ -164,7 +171,7 @@ def __value2tag(name: str, value: str) -> str: return value -def __annotations2html(paragraph: TreeNode, table2id: Dict[str, int]) -> str: +def __annotations2html(paragraph: TreeNode, table2id: Dict[str, int], tabs: int = 0) -> str: indexes = dict() for annotation in paragraph.annotations: @@ -177,7 +184,7 @@ def __annotations2html(paragraph: TreeNode, table2id: Dict[str, int]) -> str: SubscriptAnnotation.name, SuperscriptAnnotation.name, UnderlinedAnnotation.name] - check_annotations = bool_annotations + ["table"] + check_annotations = bool_annotations + ["table", "reference"] if name not in check_annotations and not value.startswith("heading "): continue elif name in bool_annotations and annotation.value == "False": @@ -187,10 +194,13 @@ def __annotations2html(paragraph: TreeNode, table2id: Dict[str, int]) -> str: indexes.setdefault(annotation.start, "") indexes.setdefault(annotation.end, "") if name == "table": - indexes[annotation.start] += f'

table#{table2id[tag]}

' + indexes[annotation.end] += f' (table {table2id[tag]})' + elif name == "reference": + indexes[annotation.start] += f'<{tag} href="#{value}">' + indexes[annotation.end] = f"" + indexes[annotation.end] else: - indexes[annotation.start] += "<" + tag + ">" - indexes[annotation.end] = "" + indexes[annotation.end] + indexes[annotation.start] += f"<{tag}>" + indexes[annotation.end] = f"" + indexes[annotation.end] insert_tags = sorted([(index, tag) for index, tag in indexes.items()], reverse=True) text = paragraph.text @@ -198,12 +208,13 @@ def __annotations2html(paragraph: TreeNode, table2id: Dict[str, int]) -> str: for index, tag in insert_tags: text = text[:index] + tag + text[index:] - return text.replace("\n", "
") + return text.replace("\n", f'
{" " * tabs}') def table2html(table: Table, table2id: Dict[str, int]) -> str: uid = table.metadata.uid - text = f"

table {table2id[uid]}:

" + table_title = f" {table.metadata.title}" if table.metadata.title else "" + text = f"

table {table2id[uid]}:{table_title}

" text += f'\n\n' for row in table.cells: text += "\n" diff --git a/dedoc/api/schema/document_metadata.py b/dedoc/api/schema/document_metadata.py index 197bfbc1..4d814fc3 100644 --- a/dedoc/api/schema/document_metadata.py +++ b/dedoc/api/schema/document_metadata.py @@ -1,5 +1,3 @@ -from typing import Optional - from pydantic import BaseModel, Extra, Field @@ -18,4 +16,3 @@ class Config: created_time: int = Field(description="Creation time of the document in the UnixTime format", example=1590579805) access_time: int = Field(description="File access time in the UnixTime format", example=1590579805) file_type: str = Field(description="Mime type of the file", example="application/vnd.oasis.opendocument.text") - other_fields: Optional[dict] = Field(description="Other optional fields") diff --git a/dedoc/api/schema/line_metadata.py b/dedoc/api/schema/line_metadata.py index 0c08dabe..37e893d8 100644 --- a/dedoc/api/schema/line_metadata.py +++ b/dedoc/api/schema/line_metadata.py @@ -13,4 +13,3 @@ class Config: paragraph_type: str = Field(description="Type of the document line/paragraph (header, list_item, list) and etc.", example="raw_text") page_id: int = Field(description="Page number of the line/paragraph beginning", example=0) line_id: Optional[int] = Field(description="Line number", example=1) - other_fields: Optional[dict] = Field(description="Some other fields") diff --git a/dedoc/api/web/index.html b/dedoc/api/web/index.html index 5ca05cec..055ef58b 100644 --- a/dedoc/api/web/index.html +++ b/dedoc/api/web/index.html @@ -38,6 +38,7 @@

Type of document structure parsing

+ document_type

@@ -137,6 +138,8 @@

PDF handling

+ + language

diff --git a/dedoc/attachments_handler/attachments_handler.py b/dedoc/attachments_handler/attachments_handler.py index 1935e5d2..b657dd88 100644 --- a/dedoc/attachments_handler/attachments_handler.py +++ b/dedoc/attachments_handler/attachments_handler.py @@ -72,7 +72,7 @@ def handle_attachments(self, document_parser: "DedocManager", document: Unstruct # return empty ParsedDocument with Meta information parsed_file = self.__get_empty_document(document_parser=document_parser, attachment=attachment, parameters=parameters_copy) - parsed_file.metadata.set_uid(attachment.uid) + parsed_file.metadata.uid = attachment.uid attachments.append(parsed_file) return attachments diff --git a/dedoc/data_structures/concrete_annotations/reference_annotation.py b/dedoc/data_structures/concrete_annotations/reference_annotation.py index e629ba8b..52a45f1d 100644 --- a/dedoc/data_structures/concrete_annotations/reference_annotation.py +++ b/dedoc/data_structures/concrete_annotations/reference_annotation.py @@ -27,7 +27,7 @@ class ReferenceAnnotation(Annotation): page_id=10, line_id=189, tag_hierarchy_level=HierarchyLevel(level1=2, level2=0, paragraph_type="bibliography_item")), - other_fields={"uid": "97cfac39-f0e3-11ee-b81c-b88584b4e4a1"} + uid="97cfac39-f0e3-11ee-b81c-b88584b4e4a1" ), annotations=[] ) diff --git a/dedoc/data_structures/document_metadata.py b/dedoc/data_structures/document_metadata.py index 134ba6a4..beec9c56 100644 --- a/dedoc/data_structures/document_metadata.py +++ b/dedoc/data_structures/document_metadata.py @@ -1,4 +1,5 @@ import uuid +from typing import Dict, Union from dedoc.api.schema.document_metadata import DocumentMetadata as ApiDocumentMetadata from dedoc.data_structures.serializable import Serializable @@ -17,8 +18,8 @@ def __init__(self, created_time: int, access_time: int, file_type: str, - other_fields: dict = None, - uid: str = None) -> None: + uid: str = None, + **kwargs: Dict[str, Union[str, int, float]]) -> None: """ :param uid: document unique 
identifier (useful for attached files) :param file_name: original document name (before rename and conversion, so it can contain non-ascii symbols, spaces and so on) @@ -28,7 +29,6 @@ def __init__(self, :param created_time: time of the creation in unixtime :param access_time: time of the last access to the file in unixtime :param file_type: mime type of the file - :param other_fields: additional fields of user metadata """ self.file_name = file_name self.temporary_file_name = temporary_file_name @@ -37,32 +37,9 @@ def __init__(self, self.created_time = created_time self.access_time = access_time self.file_type = file_type - self.other_fields = {} - if other_fields is not None and len(other_fields) > 0: - self.extend_other_fields(other_fields) - self.uid = f"doc_uid_auto_{uuid.uuid1()}" if uid is None else uid - - def set_uid(self, uid: str) -> None: - self.uid = uid # noqa - - def extend_other_fields(self, new_fields: dict) -> None: - """ - Add new attributes to the class and to the other_fields dictionary. 
- - :param new_fields: fields to add - """ - assert (new_fields is not None) - assert (len(new_fields) > 0) - - for key, value in new_fields.items(): + for key, value in kwargs.items(): setattr(self, key, value) - self.other_fields[key] = value + self.uid = f"doc_uid_auto_{uuid.uuid1()}" if uid is None else uid def to_api_schema(self) -> ApiDocumentMetadata: - api_document_metadata = ApiDocumentMetadata(uid=self.uid, file_name=self.file_name, temporary_file_name=self.temporary_file_name, size=self.size, - modified_time=self.modified_time, created_time=self.created_time, access_time=self.access_time, - file_type=self.file_type, other_fields=self.other_fields) - if self.other_fields is not None: - for (key, value) in self.other_fields.items(): - setattr(api_document_metadata, key, value) - return api_document_metadata + return ApiDocumentMetadata(**vars(self)) diff --git a/dedoc/data_structures/line_metadata.py b/dedoc/data_structures/line_metadata.py index 19b6730a..e9be87a3 100644 --- a/dedoc/data_structures/line_metadata.py +++ b/dedoc/data_structures/line_metadata.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Dict, Optional, Union from dedoc.api.schema.line_metadata import LineMetadata as ApiLineMetadata from dedoc.data_structures.hierarchy_level import HierarchyLevel @@ -15,7 +15,7 @@ def __init__(self, line_id: Optional[int], tag_hierarchy_level: Optional[HierarchyLevel] = None, hierarchy_level: Optional[HierarchyLevel] = None, - other_fields: Optional[dict] = None) -> None: + **kwargs: Dict[str, Union[str, int, float]]) -> None: """ :param page_id: page number where paragraph starts, the numeration starts from page 0 :param line_id: line number inside the entire document, the numeration starts from line 0 @@ -23,33 +23,19 @@ def __init__(self, (usually information got from tags e.g. 
in docx or html readers) :param hierarchy_level: the hierarchy level of the line extracted by some of the structure extractors - the result type and level of the line. The lower the level of the hierarchy, the closer it is to the root, it's used to construct document tree. - :param other_fields: additional fields of user metadata """ self.tag_hierarchy_level = HierarchyLevel(None, None, can_be_multiline=True, line_type=HierarchyLevel.unknown) \ if tag_hierarchy_level is None else tag_hierarchy_level self.hierarchy_level = hierarchy_level self.page_id = page_id self.line_id = line_id - self.__other_fields = {} - if other_fields is not None and len(other_fields) > 0: - self.extend_other_fields(other_fields) - - def extend_other_fields(self, new_fields: dict) -> None: - """ - Add new attributes to the class and to the other_fields dictionary. - - :param new_fields: fields to add - """ - assert (new_fields is not None) - assert (len(new_fields) > 0) - - for key, value in new_fields.items(): + for key, value in kwargs.items(): setattr(self, key, value) - self.__other_fields[key] = value def to_api_schema(self) -> ApiLineMetadata: paragraph_type = self.hierarchy_level.line_type if self.hierarchy_level is not None else HierarchyLevel.raw_text - api_line_metadata = ApiLineMetadata(page_id=self.page_id, line_id=self.line_id, paragraph_type=paragraph_type, other_fields=self.__other_fields) - for key, value in self.__other_fields.items(): - setattr(api_line_metadata, key, value) + api_line_metadata = ApiLineMetadata(page_id=self.page_id, line_id=self.line_id, paragraph_type=paragraph_type) + for key, value in vars(self).items(): + if not hasattr(api_line_metadata, key) and key not in ("tag_hierarchy_level", "hierarchy_level"): + setattr(api_line_metadata, key, value) return api_line_metadata diff --git a/dedoc/dedoc_manager.py b/dedoc/dedoc_manager.py index 301498d4..5600dd29 100644 --- a/dedoc/dedoc_manager.py +++ b/dedoc/dedoc_manager.py @@ -105,8 +105,8 @@ def 
__parse_no_error_handling(self, file_path: str, parameters: Dict[str, str]) # Step 3 - Adding meta-information metadata = self.document_metadata_extractor.extract(file_path=tmp_file_path, converted_filename=os.path.basename(converted_file_path), - original_filename=file_name, parameters=parameters, other_fields=unstructured_document.metadata) - unstructured_document.metadata = metadata + original_filename=file_name, parameters=parameters) + unstructured_document.metadata = {**unstructured_document.metadata, **metadata} self.logger.info(f"Add metadata of file {file_name}") # Step 4 - Extract structure diff --git a/dedoc/download_models.py b/dedoc/download_models.py index 643cf30e..b520a7df 100644 --- a/dedoc/download_models.py +++ b/dedoc/download_models.py @@ -15,7 +15,8 @@ scan_orientation_efficient_net_b0="9ea283f3d346ae4fdd82463a9f60b5369a3ffb58", font_classifier="db4481ad60ab050cbb42079b64f97f9e431feb07", paragraph_classifier="00bf989876cec171c1cf9859a6b712af6445e864", - line_type_classifiers="2e498d1ec82b72c1a96ba0d25344b71402997013" + line_type_classifiers="2e498d1ec82b72c1a96ba0d25344b71402997013", + fintoc_classifiers="42f8ada99a5da608139b078c93bebfffc5b30263" ) @@ -42,6 +43,14 @@ def download(resources_path: str) -> None: repo_name="line_type_classifiers", hub_name=f"{classifier_type}.pkl.gz") + fintoc_classifiers_resources_path = os.path.join(resources_path, "fintoc_classifiers") + for language in ("en", "fr", "sp"): + for classifier_type in ("target", "binary"): + download_from_hub(out_dir=fintoc_classifiers_resources_path, + out_name=f"{classifier_type}_classifier_{language}.pkg.gz", + repo_name="fintoc_classifiers", + hub_name=f"{classifier_type}_classifier_{language}_txt_layer.pkg.gz") + if __name__ == "__main__": resources_path = get_config()["resources_path"] diff --git a/dedoc/manager_config.py b/dedoc/manager_config.py index 679db954..35815ecf 100644 --- a/dedoc/manager_config.py +++ b/dedoc/manager_config.py @@ -1,7 +1,5 @@ from typing import 
Optional -from dedoc.readers.article_reader.article_reader import ArticleReader - def _get_manager_config(config: dict) -> dict: """ @@ -23,6 +21,7 @@ def _get_manager_config(config: dict) -> dict: from dedoc.metadata_extractors.concrete_metadata_extractors.pdf_metadata_extractor import PdfMetadataExtractor from dedoc.metadata_extractors.metadata_extractor_composition import MetadataExtractorComposition from dedoc.readers.archive_reader.archive_reader import ArchiveReader + from dedoc.readers.article_reader.article_reader import ArticleReader from dedoc.readers.csv_reader.csv_reader import CSVReader from dedoc.readers.docx_reader.docx_reader import DocxReader from dedoc.readers.email_reader.email_reader import EmailReader @@ -41,9 +40,11 @@ def _get_manager_config(config: dict) -> dict: from dedoc.structure_constructors.concrete_structure_constructors.linear_constructor import LinearConstructor from dedoc.structure_constructors.concrete_structure_constructors.tree_constructor import TreeConstructor from dedoc.structure_constructors.structure_constructor_composition import StructureConstructorComposition + from dedoc.structure_extractors.concrete_structure_extractors.article_structure_extractor import ArticleStructureExtractor from dedoc.structure_extractors.concrete_structure_extractors.classifying_law_structure_extractor import ClassifyingLawStructureExtractor from dedoc.structure_extractors.concrete_structure_extractors.default_structure_extractor import DefaultStructureExtractor from dedoc.structure_extractors.concrete_structure_extractors.diploma_structure_extractor import DiplomaStructureExtractor + from dedoc.structure_extractors.concrete_structure_extractors.fintoc_structure_extractor import FintocStructureExtractor from dedoc.structure_extractors.concrete_structure_extractors.foiv_law_structure_extractor import FoivLawStructureExtractor from dedoc.structure_extractors.concrete_structure_extractors.law_structure_excractor import LawStructureExtractor from 
dedoc.structure_extractors.concrete_structure_extractors.tz_structure_extractor import TzStructureExtractor @@ -93,7 +94,9 @@ def _get_manager_config(config: dict) -> dict: DefaultStructureExtractor.document_type: DefaultStructureExtractor(config=config), DiplomaStructureExtractor.document_type: DiplomaStructureExtractor(config=config), TzStructureExtractor.document_type: TzStructureExtractor(config=config), - ClassifyingLawStructureExtractor.document_type: ClassifyingLawStructureExtractor(extractors=law_extractors, config=config) + ClassifyingLawStructureExtractor.document_type: ClassifyingLawStructureExtractor(extractors=law_extractors, config=config), + ArticleStructureExtractor.document_type: ArticleStructureExtractor(config=config), + FintocStructureExtractor.document_type: FintocStructureExtractor(config=config) } return dict( diff --git a/dedoc/metadata_extractors/abstract_metadata_extractor.py b/dedoc/metadata_extractors/abstract_metadata_extractor.py index 3aa74bfe..02b1a8e4 100644 --- a/dedoc/metadata_extractors/abstract_metadata_extractor.py +++ b/dedoc/metadata_extractors/abstract_metadata_extractor.py @@ -11,8 +11,7 @@ def can_extract(self, file_path: str, converted_filename: Optional[str] = None, original_filename: Optional[str] = None, - parameters: Optional[dict] = None, - other_fields: Optional[dict] = None) -> bool: + parameters: Optional[dict] = None) -> bool: """ Check if this extractor can handle the given file. Return True if the extractor can handle it and False otherwise. Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.extract` documentation to get the information about parameters. @@ -24,8 +23,7 @@ def extract(self, file_path: str, converted_filename: Optional[str] = None, original_filename: Optional[str] = None, - parameters: Optional[dict] = None, - other_fields: Optional[dict] = None) -> dict: + parameters: Optional[dict] = None) -> dict: """ Extract metadata from file if possible, i.e. 
method :meth:`can_extract` returned True. @@ -35,7 +33,6 @@ def extract(self, by default it's a name from the file_path. Converted file should be located in the same directory as the file before converting. :param original_filename: name of the file before renaming (if dedoc manager is used), by default it's a name from the file_path :param parameters: additional parameters for document parsing, see :ref:`parameters_description` for more details - :param other_fields: other fields that should be added to the document's metadata :return: dict with metadata information about the document """ pass diff --git a/dedoc/metadata_extractors/concrete_metadata_extractors/base_metadata_extractor.py b/dedoc/metadata_extractors/concrete_metadata_extractors/base_metadata_extractor.py index 0e467760..2fc984a0 100644 --- a/dedoc/metadata_extractors/concrete_metadata_extractors/base_metadata_extractor.py +++ b/dedoc/metadata_extractors/concrete_metadata_extractors/base_metadata_extractor.py @@ -32,8 +32,7 @@ def can_extract(self, file_path: str, converted_filename: Optional[str] = None, original_filename: Optional[str] = None, - parameters: Optional[dict] = None, - other_fields: Optional[dict] = None) -> bool: + parameters: Optional[dict] = None) -> bool: """ This extractor can handle any file so the method always returns True. Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.can_extract` documentation to get the information about parameters. @@ -44,8 +43,7 @@ def extract(self, file_path: str, converted_filename: Optional[str] = None, original_filename: Optional[str] = None, - parameters: Optional[dict] = None, - other_fields: Optional[dict] = None) -> dict: + parameters: Optional[dict] = None) -> dict: """ Gets the basic meta-information about the file. Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.extract` documentation to get the information about parameters. 
@@ -55,14 +53,9 @@ def extract(self, meta_info = self._get_base_meta_information(file_dir, file_name, original_filename) if parameters.get("is_attached", False) and str(parameters.get("return_base64", "false")).lower() == "true": - other_fields = {} if other_fields is None else other_fields + with open(os.path.join(file_dir, converted_filename), "rb") as file: + meta_info["base64_encode"] = b64encode(file.read()).decode("utf-8") - path = os.path.join(file_dir, converted_filename) - with open(path, "rb") as file: - other_fields["base64_encode"] = b64encode(file.read()).decode("utf-8") - - if other_fields is not None and len(other_fields) > 0: - meta_info["other_fields"] = other_fields return meta_info @staticmethod diff --git a/dedoc/metadata_extractors/concrete_metadata_extractors/docx_metadata_extractor.py b/dedoc/metadata_extractors/concrete_metadata_extractors/docx_metadata_extractor.py index be0964c2..cab05fa3 100644 --- a/dedoc/metadata_extractors/concrete_metadata_extractors/docx_metadata_extractor.py +++ b/dedoc/metadata_extractors/concrete_metadata_extractors/docx_metadata_extractor.py @@ -30,8 +30,7 @@ def can_extract(self, file_path: str, converted_filename: Optional[str] = None, original_filename: Optional[str] = None, - parameters: Optional[dict] = None, - other_fields: Optional[dict] = None) -> bool: + parameters: Optional[dict] = None) -> bool: """ Check if the document has .docx extension. Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.can_extract` documentation to get the information about parameters. @@ -43,8 +42,7 @@ def extract(self, file_path: str, converted_filename: Optional[str] = None, original_filename: Optional[str] = None, - parameters: Optional[dict] = None, - other_fields: Optional[dict] = None) -> dict: + parameters: Optional[dict] = None) -> dict: """ Add the predefined list of metadata for the docx documents. 
Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.extract` documentation to get the information about parameters. @@ -52,19 +50,14 @@ def extract(self, parameters = {} if parameters is None else parameters file_dir, file_name, converted_filename, original_filename = self._get_names(file_path, converted_filename, original_filename) - result = super().extract(file_path=file_path, converted_filename=converted_filename, original_filename=original_filename, parameters=parameters, - other_fields=other_fields) + base_fields = super().extract(file_path=file_path, converted_filename=converted_filename, original_filename=original_filename, parameters=parameters) + docx_fields = self._get_docx_fields(os.path.join(file_dir, converted_filename)) - file_path = os.path.join(file_dir, converted_filename) - docx_other_fields = self._get_docx_fields(file_path) - - result["other_fields"] = {**result.get("other_fields", {}), **docx_other_fields} + result = {**base_fields, **docx_fields} return result def __convert_date(self, date: Optional[datetime]) -> Optional[int]: - if date is not None: - return int(date.timestamp()) - return None + return None if date is None else int(date.timestamp()) def _get_docx_fields(self, file_path: str) -> dict: assert os.path.isfile(file_path) diff --git a/dedoc/metadata_extractors/concrete_metadata_extractors/image_metadata_extractor.py b/dedoc/metadata_extractors/concrete_metadata_extractors/image_metadata_extractor.py index 465c9dea..60bec824 100644 --- a/dedoc/metadata_extractors/concrete_metadata_extractors/image_metadata_extractor.py +++ b/dedoc/metadata_extractors/concrete_metadata_extractors/image_metadata_extractor.py @@ -52,8 +52,7 @@ def can_extract(self, file_path: str, converted_filename: Optional[str] = None, original_filename: Optional[str] = None, - parameters: Optional[dict] = None, - other_fields: Optional[dict] = None) -> bool: + parameters: Optional[dict] = None) -> bool: """ Check if the document has image-like 
extension (".png", ".jpg", ".jpeg"). Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.can_extract` documentation to get the information about parameters. @@ -65,20 +64,16 @@ def extract(self, file_path: str, converted_filename: Optional[str] = None, original_filename: Optional[str] = None, - parameters: Optional[dict] = None, - other_fields: Optional[dict] = None) -> dict: + parameters: Optional[dict] = None) -> dict: """ Add the predefined list of metadata for images. Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.extract` documentation to get the information about parameters. """ file_dir, file_name, converted_filename, original_filename = self._get_names(file_path, converted_filename, original_filename) - result = super().extract(file_path=file_path, converted_filename=converted_filename, original_filename=original_filename, parameters=parameters, - other_fields=other_fields) + base_fields = super().extract(file_path=file_path, converted_filename=converted_filename, original_filename=original_filename, parameters=parameters) - path = os.path.join(file_dir, converted_filename) - exif_fields = self._get_exif(path) - if len(exif_fields) > 0: - result["other_fields"] = {**result.get("other_fields", {}), **exif_fields} + exif_fields = self._get_exif(os.path.join(file_dir, converted_filename)) + result = {**base_fields, **exif_fields} return result def __encode_exif(self, exif: Union[str, bytes]) -> Optional[str]: diff --git a/dedoc/metadata_extractors/concrete_metadata_extractors/note_metadata_extarctor.py b/dedoc/metadata_extractors/concrete_metadata_extractors/note_metadata_extarctor.py index e0dc4b6e..7c33e290 100644 --- a/dedoc/metadata_extractors/concrete_metadata_extractors/note_metadata_extarctor.py +++ b/dedoc/metadata_extractors/concrete_metadata_extractors/note_metadata_extarctor.py @@ -21,8 +21,7 @@ def can_extract(self, file_path: str, converted_filename: Optional[str] = None, original_filename: 
Optional[str] = None, - parameters: Optional[dict] = None, - other_fields: Optional[dict] = None) -> bool: + parameters: Optional[dict] = None) -> bool: """ Check if the document has .note.pickle extension. Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.can_extract` documentation to get the information about parameters. @@ -34,8 +33,7 @@ def extract(self, file_path: str, converted_filename: Optional[str] = None, original_filename: Optional[str] = None, - parameters: Optional[dict] = None, - other_fields: Optional[dict] = None) -> dict: + parameters: Optional[dict] = None) -> dict: """ Add the predefined list of metadata for the .note.pickle documents. Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.extract` documentation to get the information about parameters. @@ -47,16 +45,13 @@ def extract(self, with open(file_path, "rb") as infile: note_dict = pickle.load(infile) - fields = {"author": note_dict["author"]} - other_fields = {**other_fields, **fields} if other_fields is not None else fields - meta_info = dict(file_name=original_filename, file_type="note", size=note_dict["size"], access_time=note_dict["modified_time"], created_time=note_dict["created_time"], modified_time=note_dict["modified_time"], - other_fields=other_fields) + author=note_dict["author"]) return meta_info except Exception: raise BadFileFormatError(f"Bad note file:\n file_name = {os.path.basename(file_path)}. 
Seems note-format is broken") diff --git a/dedoc/metadata_extractors/concrete_metadata_extractors/pdf_metadata_extractor.py b/dedoc/metadata_extractors/concrete_metadata_extractors/pdf_metadata_extractor.py index e3502e44..78fc2ac6 100644 --- a/dedoc/metadata_extractors/concrete_metadata_extractors/pdf_metadata_extractor.py +++ b/dedoc/metadata_extractors/concrete_metadata_extractors/pdf_metadata_extractor.py @@ -44,8 +44,7 @@ def can_extract(self, file_path: str, converted_filename: Optional[str] = None, original_filename: Optional[str] = None, - parameters: Optional[dict] = None, - other_fields: Optional[dict] = None) -> bool: + parameters: Optional[dict] = None) -> bool: """ Check if the document has .pdf extension. Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.can_extract` documentation to get the information about parameters. @@ -57,19 +56,15 @@ def extract(self, file_path: str, converted_filename: Optional[str] = None, original_filename: Optional[str] = None, - parameters: Optional[dict] = None, - other_fields: Optional[dict] = None) -> dict: + parameters: Optional[dict] = None) -> dict: """ Add the predefined list of metadata for the pdf documents. Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.extract` documentation to get the information about parameters. 
""" file_dir, file_name, converted_filename, original_filename = self._get_names(file_path, converted_filename, original_filename) - result = super().extract(file_path=file_path, converted_filename=converted_filename, original_filename=original_filename, parameters=parameters, - other_fields=other_fields) - path = os.path.join(file_dir, converted_filename) - pdf_fields = self._get_pdf_info(path) - if len(pdf_fields) > 0: - result["other_fields"] = {**result.get("other_fields", {}), **pdf_fields} + base_fields = super().extract(file_path=file_path, converted_filename=converted_filename, original_filename=original_filename, parameters=parameters) + pdf_fields = self._get_pdf_info(os.path.join(file_dir, converted_filename)) + result = {**base_fields, **pdf_fields} return result def _get_pdf_info(self, path: str) -> dict: diff --git a/dedoc/metadata_extractors/metadata_extractor_composition.py b/dedoc/metadata_extractors/metadata_extractor_composition.py index ba46c4b0..8505dcf3 100644 --- a/dedoc/metadata_extractors/metadata_extractor_composition.py +++ b/dedoc/metadata_extractors/metadata_extractor_composition.py @@ -21,16 +21,13 @@ def extract(self, file_path: str, converted_filename: Optional[str] = None, original_filename: Optional[str] = None, - parameters: Optional[dict] = None, - other_fields: Optional[dict] = None) -> dict: + parameters: Optional[dict] = None) -> dict: """ Extract metadata using one of the extractors if suitable extractor was found. Look to the method :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.extract` of the class :class:`~dedoc.metadata_extractors.AbstractMetadataExtractor` documentation to get the information about method's parameters. 
""" for extractor in self.extractors: - if extractor.can_extract(file_path=file_path, converted_filename=converted_filename, original_filename=original_filename, parameters=parameters, - other_fields=other_fields): - return extractor.extract(file_path=file_path, converted_filename=converted_filename, original_filename=original_filename, parameters=parameters, - other_fields=other_fields) + if extractor.can_extract(file_path=file_path, converted_filename=converted_filename, original_filename=original_filename, parameters=parameters): + return extractor.extract(file_path=file_path, converted_filename=converted_filename, original_filename=original_filename, parameters=parameters) raise Exception(f"Can't extract metadata from from file {os.path.basename(file_path)}") diff --git a/dedoc/readers/article_reader/article_reader.py b/dedoc/readers/article_reader/article_reader.py index f2169452..fcb21cfb 100644 --- a/dedoc/readers/article_reader/article_reader.py +++ b/dedoc/readers/article_reader/article_reader.py @@ -11,6 +11,7 @@ from dedoc.data_structures.unstructured_document import UnstructuredDocument from dedoc.extensions import recognized_mimes from dedoc.readers.base_reader import BaseReader +from dedoc.structure_extractors.feature_extractors.list_features.list_utils import get_dotted_item_depth from dedoc.utils.parameter_utils import get_param_document_type from dedoc.utils.utils import get_mime_extension @@ -33,7 +34,8 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure using beautifulsoup library. As a result, the method fills the class :class:`~dedoc.data_structures.UnstructuredDocument`. Article reader adds additional information to the `tag_hierarchy_level` of :class:`~dedoc.data_structures.LineMetadata`. - The method extracts information about ``authors``, ``bibliography items``, ``sections``, and ``tables``. 
+ The method extracts information about ``authors``, ``keywords``, ``bibliography items``, ``sections``, and ``tables``. + In table cells, ``colspan`` attribute can be filled according to the GROBID's "cols" attribute. You can find more information about the extracted information from GROBID system on the page :ref:`article_structure`. Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters. @@ -51,12 +53,13 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure self.logger.warning(warning) return UnstructuredDocument(tables=[], lines=[], attachments=[], warnings=[warning]) - soup = BeautifulSoup(response.text, features="lxml") + soup = BeautifulSoup(response.text, features="xml") lines = self.__parse_title(soup) - if soup.biblstruct is not None: - authors = soup.biblstruct.find_all("author") + if soup.biblStruct is not None: + authors = soup.biblStruct.find_all("author") lines += [line for author in authors for line in self.__parse_author(author)] + lines += self.__parse_keywords(soup.keywords) bib_lines, bib2uid = self.__parse_bibliography(soup) tables, table2uid = self.__parse_tables(soup) @@ -129,17 +132,19 @@ def __create_line(self, text: str, hierarchy_level_id: Optional[int] = None, par hierarchy_level = HierarchyLevel(level_1=hierarchy_level_id, level_2=0, can_be_multiline=False, line_type=paragraph_type) return LineWithMeta(line=text, - metadata=LineMetadata(page_id=0, line_id=0, tag_hierarchy_level=hierarchy_level, other_fields=other_fields), + metadata=LineMetadata(page_id=0, line_id=0, tag_hierarchy_level=hierarchy_level, **other_fields), annotations=annotations) def __parse_affiliation(self, affiliation_tag: Tag) -> List[LineWithMeta]: lines = [self.__create_line(text=affiliation_tag.get("key"), hierarchy_level_id=2, paragraph_type="author_affiliation")] - if affiliation_tag.orgname: - lines.append(self.__create_line(text=self.__tag2text(affiliation_tag.orgname), 
hierarchy_level_id=3, paragraph_type="org_name")) + if affiliation_tag.orgName: + lines.append(self.__create_line(text=self.__tag2text(affiliation_tag.orgName), hierarchy_level_id=3, paragraph_type="org_name")) if affiliation_tag.address: - lines.append(self.__create_line(text=affiliation_tag.address.text, hierarchy_level_id=3, paragraph_type="address")) + lines.append(self.__create_line(text=self.__remove_newlines(affiliation_tag.address).get_text(separator=", "), + hierarchy_level_id=3, + paragraph_type="address")) return lines @@ -169,11 +174,11 @@ def __parse_author(self, author_tag: Tag) -> List[LineWithMeta]: """ lines = [self.__create_line(text="", hierarchy_level_id=1, paragraph_type="author")] - first_name = self.__get_tag_by_hierarchy_path(author_tag, ["persname", "forename"]) + first_name = self.__get_tag_by_hierarchy_path(author_tag, ["persName", "forename"]) if first_name: lines.append(self.__create_line(text=first_name, hierarchy_level_id=2, paragraph_type="author_first_name")) - surname = self.__get_tag_by_hierarchy_path(author_tag, ["persname", "surname"]) + surname = self.__get_tag_by_hierarchy_path(author_tag, ["persName", "surname"]) if surname: lines.append(self.__create_line(text=surname, hierarchy_level_id=2, paragraph_type="author_surname")) @@ -187,6 +192,21 @@ def __parse_author(self, author_tag: Tag) -> List[LineWithMeta]: return lines + def __parse_keywords(self, keywords_tag: Tag) -> List[LineWithMeta]: + """ + + Multi-Object Tracking + Data Association + Survey + + """ + if keywords_tag is None: + return [] + + lines = [self.__create_line(text="", hierarchy_level_id=1, paragraph_type="keywords")] + lines += [self.__create_line(text=item.text, hierarchy_level_id=2, paragraph_type="keyword") for item in keywords_tag.find_all("term")] + return lines + def __create_line_with_refs(self, content: List[Tuple[str, Tag]], bib2uid: dict, table2uid: dict) -> LineWithMeta: text = "" start = 0 @@ -219,20 +239,31 @@ def __parse_text(self, soup: Tag, 
bib2uid: dict, table2uid: dict) -> List[LineWi lines.append(self.__create_line(text="Abstract", hierarchy_level_id=1, paragraph_type="abstract")) lines.append(self.__create_line(text=self.__tag2text(abstract))) - for text in soup.find_all("text"): - for part in text.find_all("div"): - # TODO: Beautifulsoup doesn't read tags from input XML file. WTF! - # As a result we lose section number in text (see example above) - # Need to fix this in the future. - number = part.head.get("n") + " " if part.head else "" - line_text = str(part.contents[0]) if len(part.contents) > 0 else None - if line_text is not None and len(line_text) > 0: - lines.append(self.__create_line(text=number + line_text, hierarchy_level_id=1, paragraph_type="section")) - for subpart in part.find_all("p"): - if subpart.string is not None: - lines.append(self.__create_line_with_refs(subpart.string, bib2uid, table2uid)) - elif subpart.contents and len(subpart.contents) > 0: - lines.append(self.__create_line_with_refs(subpart.contents, bib2uid, table2uid)) + for part in soup.body.find_all("div"): + lines.extend(self.__parse_section(part, bib2uid, table2uid)) + + for other_text_type in ("acknowledgement", "annex"): + for text_tag in soup.find_all("div", attrs={"type": other_text_type}): + for part in text_tag.find_all("div"): + lines.extend(self.__parse_section(part, bib2uid, table2uid)) + + return lines + + def __parse_section(self, section_tag: Tag, bib2uid: dict, table2uid: dict) -> List[LineWithMeta]: + lines = [] + number = section_tag.head.get("n") if section_tag.head else "" + number = number + " " if number else "" + section_depth = get_dotted_item_depth(number) + section_depth = section_depth if section_depth > 0 else 1 + + line_text = section_tag.head.string if section_tag.head else None + if line_text is not None and len(line_text) > 0: + lines.append(self.__create_line(text=number + line_text, hierarchy_level_id=section_depth, paragraph_type="section")) + for subpart in 
section_tag.find_all("p"): + if subpart.string is not None: + lines.append(self.__create_line_with_refs(subpart.string + "\n", bib2uid, table2uid)) + elif subpart.contents and len(subpart.contents) > 0: + lines.append(self.__create_line_with_refs(subpart.contents, bib2uid, table2uid)) return lines @@ -265,12 +296,26 @@ def __parse_tables(self, soup: Tag) -> Tuple[List[Table], dict]: tag_tables = soup.find_all("figure", {"type": "table"}) for table in tag_tables: - row_cells = [] + table_cells = [] head = table.contents[0] if len(table.contents) > 0 and isinstance(table.contents[0], str) else self.__tag2text(table.head) - title = head + self.__tag2text(table.figdesc) + title = head + self.__tag2text(table.figDesc) for row in table.table.find_all("row"): - row_cells.append([CellWithMeta(lines=[self.__create_line(self.__tag2text(cell))]) for cell in row.find_all("cell")]) - tables.append(Table(cells=row_cells, metadata=TableMetadata(page_id=0, title=title))) + row_cells = [] + for cell in row.find_all("cell"): + cell_text = self.__create_line(self.__tag2text(cell)) + colspan = int(cell.get("cols", 1)) + row_cells.append(CellWithMeta(lines=[cell_text], colspan=colspan)) + + if colspan > 1: + row_cells.extend([CellWithMeta(lines=[cell_text], invisible=True) for _ in range(colspan - 1)]) + + table_cells.append(row_cells) + + # ignore empty tables + if len(table_cells) == 0: + continue + + tables.append(Table(cells=table_cells, metadata=TableMetadata(page_id=0, title=title))) table2uid["#" + table.get("xml:id")] = tables[-1].metadata.uid return tables, table2uid @@ -310,12 +355,12 @@ def __parse_bibliography(self, soup: Tag) -> Tuple[List[LineWithMeta], dict]: # according GROBID description level_2_paragraph_type = {"a": "title", "j": "title_journal", "s": "title_series", "m": "title_conference_proceedings"} - bibliography = soup.find("listbibl", recursive=True) + bibliography = soup.find("listBibl", recursive=True) lines.append(self.__create_line(text="bibliography", 
hierarchy_level_id=1, paragraph_type="bibliography")) if not bibliography: return lines, cites - bib_items = bibliography.find_all("biblstruct") + bib_items = bibliography.find_all("biblStruct") if not bib_items: return lines, cites @@ -331,19 +376,19 @@ def __parse_bibliography(self, soup: Tag) -> Tuple[List[LineWithMeta], dict]: lines.append(self.__create_line(text=self.__tag2text(title), hierarchy_level_id=3, paragraph_type=paragraph_type)) lines += [ # parse bib authors - self.__create_line(text=author.get_text(), hierarchy_level_id=3, paragraph_type="author") + self.__create_line(text=self.__remove_newlines(author).get_text(separator=" "), hierarchy_level_id=3, paragraph_type="author") for author in bib_item.find_all("author", recursive=True) if author ] lines += [ # parse biblScope self.__create_line(text=self.__tag2text(bibl_scope), hierarchy_level_id=3, paragraph_type="biblScope_volume") - for bibl_scope in bib_item.find_all("biblscope", {"unit": "volume"}, recursive=True) if bibl_scope + for bibl_scope in bib_item.find_all("biblScope", {"unit": "volume"}, recursive=True) if bibl_scope ] try: lines += [ # parse values self.__create_line(text=f"{bibl_scope.get('from')}-{bibl_scope.get('to')}", hierarchy_level_id=3, paragraph_type="biblScope_page") - for bibl_scope in bib_item.find_all("biblscope", {"unit": "page"}, recursive=True) if bibl_scope + for bibl_scope in bib_item.find_all("biblScope", {"unit": "page"}, recursive=True) if bibl_scope ] finally: self.logger.warning("Grobid parsing warning: was non-standard format") @@ -363,3 +408,9 @@ def __parse_bibliography(self, soup: Tag) -> Tuple[List[LineWithMeta], dict]: def __parse_title(self, soup: Tag) -> List[LineWithMeta]: return [self.__create_line(text=self.__tag2text(soup.title), hierarchy_level_id=0, paragraph_type="root")] + + def __remove_newlines(self, tag: Tag) -> Tag: + for item in tag: + if not isinstance(item, Tag): + item.extract() + return tag diff --git 
a/dedoc/readers/html_reader/html_reader.py b/dedoc/readers/html_reader/html_reader.py index 83fb2085..bea5af70 100644 --- a/dedoc/readers/html_reader/html_reader.py +++ b/dedoc/readers/html_reader/html_reader.py @@ -83,7 +83,7 @@ def __handle_block(self, tag: Union[Tag], filepath_hash: str, handle_invisible_t block_lines = self.__handle_single_tag(tag=tag, filepath_hash=filepath_hash, uid=tag_uid, table=table) for line in block_lines: if not getattr(line.metadata, "html_tag", None): - line.metadata.extend_other_fields({"html_tag": tag.name}) + line.metadata.html_tag = tag.name return block_lines def __handle_single_tag(self, tag: Tag, filepath_hash: str, uid: str, table: Optional[bool] = False) -> List[LineWithMeta]: @@ -97,7 +97,7 @@ def __handle_single_tag(self, tag: Tag, filepath_hash: str, uid: str, table: Opt line_type = HierarchyLevel.unknown if header_level == 0 else HierarchyLevel.header tag_uid = hashlib.md5((uid + text).encode()).hexdigest() line = self.__make_line(line=text, line_type=line_type, header_level=header_level, uid=tag_uid, filepath_hash=filepath_hash, annotations=annotations) - line.metadata.extend_other_fields({"html_tag": tag.name}) + line.metadata.html_tag = tag.name return [line] def __read_blocks(self, block: Tag, filepath_hash: str = "", handle_invisible_table: bool = False, table: Optional[bool] = False, diff --git a/dedoc/readers/pdf_reader/pdf_base_reader.py b/dedoc/readers/pdf_reader/pdf_base_reader.py index 8372fb92..4dd00c9b 100644 --- a/dedoc/readers/pdf_reader/pdf_base_reader.py +++ b/dedoc/readers/pdf_reader/pdf_base_reader.py @@ -53,7 +53,7 @@ class PdfBaseReader(BaseReader): def __init__(self, *, config: Optional[dict] = None) -> None: super().__init__(config=config) - self.config["n_jobs"] = config.get("n_jobs", 1) + self.config["n_jobs"] = self.config.get("n_jobs", 1) self.table_recognizer = TableRecognizer(config=self.config) self.metadata_extractor = LineMetadataExtractor(config=self.config) self.attachment_extractor = 
PDFAttachmentsExtractor(config=self.config) @@ -88,13 +88,13 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure attachments_dir=attachments_dir ) - lines, scan_tables, attachments, warnings, other_fields = self._parse_document(file_path, params_for_parse) + lines, scan_tables, attachments, warnings, metadata = self._parse_document(file_path, params_for_parse) tables = [scan_table.to_table() for scan_table in scan_tables] if param_utils.get_param_with_attachments(parameters) and self.attachment_extractor.can_extract(file_path): attachments += self.attachment_extractor.extract(file_path=file_path, parameters=parameters) - result = UnstructuredDocument(lines=lines, tables=tables, attachments=attachments, warnings=warnings, metadata=other_fields) + result = UnstructuredDocument(lines=lines, tables=tables, attachments=attachments, warnings=warnings, metadata=metadata) return self._postprocess(result) def _parse_document(self, path: str, parameters: ParametersForParseDoc) -> ( diff --git a/dedoc/structure_extractors/__init__.py b/dedoc/structure_extractors/__init__.py index 404d915c..20f6d350 100644 --- a/dedoc/structure_extractors/__init__.py +++ b/dedoc/structure_extractors/__init__.py @@ -4,11 +4,12 @@ from .concrete_structure_extractors.article_structure_extractor import ArticleStructureExtractor from .concrete_structure_extractors.classifying_law_structure_extractor import ClassifyingLawStructureExtractor from .concrete_structure_extractors.diploma_structure_extractor import DiplomaStructureExtractor +from .concrete_structure_extractors.fintoc_structure_extractor import FintocStructureExtractor from .concrete_structure_extractors.foiv_law_structure_extractor import FoivLawStructureExtractor from .concrete_structure_extractors.law_structure_excractor import LawStructureExtractor from .concrete_structure_extractors.tz_structure_extractor import TzStructureExtractor from .structure_extractor_composition import 
StructureExtractorComposition __all__ = ['AbstractStructureExtractor', 'AbstractLawStructureExtractor', 'ArticleStructureExtractor', 'ClassifyingLawStructureExtractor', - 'DefaultStructureExtractor', 'DiplomaStructureExtractor', 'FoivLawStructureExtractor', 'LawStructureExtractor', 'TzStructureExtractor', - 'StructureExtractorComposition'] + 'DefaultStructureExtractor', 'DiplomaStructureExtractor', 'FintocStructureExtractor', 'FoivLawStructureExtractor', 'LawStructureExtractor', + 'TzStructureExtractor', 'StructureExtractorComposition'] diff --git a/dedoc/structure_extractors/concrete_structure_extractors/fintoc_structure_extractor.py b/dedoc/structure_extractors/concrete_structure_extractors/fintoc_structure_extractor.py new file mode 100644 index 00000000..0d78c783 --- /dev/null +++ b/dedoc/structure_extractors/concrete_structure_extractors/fintoc_structure_extractor.py @@ -0,0 +1,134 @@ +import os +import re +from typing import Dict, List, Optional, Tuple, Union + +import pandas as pd + +from dedoc.config import get_config +from dedoc.data_structures import HierarchyLevel, LineWithMeta, UnstructuredDocument +from dedoc.structure_extractors import AbstractStructureExtractor +from dedoc.structure_extractors.feature_extractors.fintoc_feature_extractor import FintocFeatureExtractor +from dedoc.structure_extractors.feature_extractors.toc_feature_extractor import TOCFeatureExtractor +from dedoc.structure_extractors.line_type_classifiers.fintoc_classifier import FintocClassifier + + +class FintocStructureExtractor(AbstractStructureExtractor): + """ + This class is an implementation of the TOC extractor for the `FinTOC 2022 Shared task `_. + The code is a modification of the winner's solution (ISP RAS team). + + This structure extractor is used for English, French and Spanish financial prospects in PDF format (with a textual layer). + It is recommended to use :class:`~dedoc.readers.PdfTxtlayerReader` to obtain document lines. 
+ You can find the more detailed description of this type of structure in the section :ref:`fintoc_structure`. + """ + document_type = "fintoc" + + def __init__(self, *, config: Optional[dict] = None) -> None: + super().__init__(config=config) + from dedoc.readers import PdfTxtlayerReader # to exclude circular imports + self.pdf_reader = PdfTxtlayerReader(config=self.config) + self.toc_extractor = TOCFeatureExtractor() + self.features_extractor = FintocFeatureExtractor() + self.languages = ("en", "fr", "sp") + path = os.path.join(get_config()["resources_path"], "fintoc_classifiers") + self.classifiers = {language: FintocClassifier(language=language, weights_dir_path=path) for language in self.languages} + self.toc_item_regexp = re.compile(r'"([^"]+)" (\d+)') + self.empty_string_regexp = re.compile(r"^\s*\n$") + + def extract(self, document: UnstructuredDocument, parameters: Optional[dict] = None, file_path: Optional[str] = None) -> UnstructuredDocument: + """ + According to the `FinTOC 2022 `_ title detection task, lines are classified as titles and non-titles. + The information about titles is saved in ``line.metadata.hierarchy_level`` (:class:`~dedoc.data_structures.HierarchyLevel` class): + + - Title lines have ``HierarchyLevel.header`` type, and their depth (``HierarchyLevel.level_2``) is similar to \ + the depth of TOC item from the FinTOC 2022 TOC generation task. + - Non-title lines have ``HierarchyLevel.raw_text`` type, and their depth isn't obtained. + + :param document: document content that has been received from some of the readers (:class:`~dedoc.readers.PdfTxtlayerReader` is recommended). + :param parameters: for this structure extractor, "language" parameter is used for setting document's language, e.g. ``parameters={"language": "en"}``. \ + The following options are supported: + + * "en", "eng" - English (default); + * "fr", "fra" - French; + * "sp", "spa" - Spanish. + :param file_path: path to the file on disk. 
+ :return: document content with added additional information about title/non-title lines and hierarchy levels of titles. + """ + parameters = {} if parameters is None else parameters + language = self.__get_param_language(parameters=parameters) + + features, documents = self.get_features(documents_dict={file_path: document.lines}) + predictions = self.classifiers[language].predict(features) + lines: List[LineWithMeta] = documents[0] + assert len(lines) == len(predictions) + + for line, prediction in zip(lines, predictions): + if prediction > 0: + line.metadata.hierarchy_level = HierarchyLevel(level_1=1, level_2=prediction, line_type=HierarchyLevel.header, can_be_multiline=True) + else: + line.metadata.hierarchy_level = HierarchyLevel.create_raw_text() + document.lines = lines + + return document + + def __get_param_language(self, parameters: dict) -> str: + language = parameters.get("language", "en") + + if language in ("en", "eng", "rus+eng"): + return "en" + + if language in ("fr", "fra"): + return "fr" + + if language in ("sp", "spa"): + return "sp" + + if language not in self.languages: + self.logger.warning(f"Language {language} is not supported by this extractor. 
Use default language (en)") + return "en" + + def get_features(self, documents_dict: Dict[str, List[LineWithMeta]]) -> Tuple[pd.DataFrame, List[List[LineWithMeta]]]: + toc_lines, documents = [], [] + for file_path, document_lines in documents_dict.items(): + toc_lines.append(self.__get_toc(file_path=file_path)) + documents.append(self.__filter_lines(document_lines)) + features = self.features_extractor.transform(documents=documents, toc_lines=toc_lines) + return features, documents + + def __filter_lines(self, lines: List[LineWithMeta]) -> List[LineWithMeta]: + special_unicode_symbols = [u"\uf0b7", u"\uf0d8", u"\uf084", u"\uf0a7", u"\uf0f0", u"\x83"] + + lines = [line for line in lines if not self.empty_string_regexp.match(line.line)] + for line in lines: + for ch in special_unicode_symbols: + line.set_line(line.line.replace(ch, "")) + + return lines + + def __get_toc(self, file_path: Optional[str]) -> List[Dict[str, Union[LineWithMeta, str]]]: + """ + Try to get TOC from PDF automatically. If TOC wasn't extracted automatically, it is extracted using regular expressions. 
+ """ + if file_path is None or not file_path.lower().endswith(".pdf"): + return [] + + toc = self.__get_automatic_toc(path=file_path) + if len(toc) > 0: + self.logger.info(f"Got automatic TOC from {os.path.basename(file_path)}") + return toc + + parameters = {"is_one_column_document": "True", "need_header_footer_analysis": "True", "pages": ":10"} + lines = self.pdf_reader.read(file_path=file_path, parameters=parameters).lines + return self.toc_extractor.get_toc(lines) + + def __get_automatic_toc(self, path: str) -> List[Dict[str, Union[LineWithMeta, str]]]: + result = [] + with os.popen(f'pdftocio -p "{path}"') as out: + toc = out.readlines() + + for line in toc: + match = self.toc_item_regexp.match(line.strip()) + if match: + result.append({"line": LineWithMeta(match.group(1)), "page": match.group(2)}) + + return result diff --git a/dedoc/structure_extractors/feature_extractors/fintoc_feature_extractor.py b/dedoc/structure_extractors/feature_extractors/fintoc_feature_extractor.py new file mode 100644 index 00000000..82e53111 --- /dev/null +++ b/dedoc/structure_extractors/feature_extractors/fintoc_feature_extractor.py @@ -0,0 +1,158 @@ +import re +from collections import defaultdict +from typing import Dict, Iterator, List, Optional, Tuple + +import pandas as pd +from Levenshtein._levenshtein import ratio + +from dedoc.data_structures.line_with_meta import LineWithMeta +from dedoc.structure_extractors.feature_extractors.abstract_extractor import AbstractFeatureExtractor +from dedoc.structure_extractors.feature_extractors.list_features.list_features_extractor import ListFeaturesExtractor +from dedoc.structure_extractors.feature_extractors.list_features.prefix.any_letter_prefix import AnyLetterPrefix +from dedoc.structure_extractors.feature_extractors.list_features.prefix.bracket_prefix import BracketPrefix +from dedoc.structure_extractors.feature_extractors.list_features.prefix.bracket_roman_prefix import BracketRomanPrefix +from 
dedoc.structure_extractors.feature_extractors.list_features.prefix.bullet_prefix import BulletPrefix +from dedoc.structure_extractors.feature_extractors.list_features.prefix.dotted_prefix import DottedPrefix +from dedoc.structure_extractors.feature_extractors.list_features.prefix.empty_prefix import EmptyPrefix +from dedoc.structure_extractors.feature_extractors.list_features.prefix.letter_prefix import LetterPrefix +from dedoc.structure_extractors.feature_extractors.list_features.prefix.roman_prefix import RomanPrefix +from dedoc.structure_extractors.feature_extractors.paired_feature_extractor import PairedFeatureExtractor +from dedoc.structure_extractors.feature_extractors.toc_feature_extractor import TOCFeatureExtractor +from dedoc.structure_extractors.feature_extractors.utils_feature_extractor import normalization_by_min_max +from dedoc.structure_extractors.hierarchy_level_builders.utils_reg import regexps_year + + +class FintocFeatureExtractor(AbstractFeatureExtractor): + + def __init__(self) -> None: + self.paired_feature_extractor = PairedFeatureExtractor() + self.prefix_list = [BulletPrefix, AnyLetterPrefix, LetterPrefix, BracketPrefix, BracketRomanPrefix, DottedPrefix, RomanPrefix] + self.list_feature_extractors = [ + ListFeaturesExtractor(window_size=10, prefix_list=self.prefix_list), + ListFeaturesExtractor(window_size=25, prefix_list=self.prefix_list), + ListFeaturesExtractor(window_size=100, prefix_list=self.prefix_list) + ] + self.prefix2number = {prefix.name: i for i, prefix in enumerate(self.prefix_list, start=1)} + self.prefix2number[EmptyPrefix.name] = 0 + + def parameters(self) -> dict: + return {} + + def fit(self, documents: List[List[LineWithMeta]], y: Optional[List[str]] = None) -> "AbstractFeatureExtractor": + return self + + def transform(self, documents: List[List[LineWithMeta]], y: Optional[List[str]] = None, toc_lines: Optional[List[List[dict]]] = None) -> pd.DataFrame: + assert len(documents) > 0 + result_matrix = 
pd.concat([self.__process_document(document, d_toc_lines) for document, d_toc_lines in zip(documents, toc_lines)], ignore_index=True) + result_matrix = pd.concat([result_matrix, self.paired_feature_extractor.transform(documents)], axis=1) + features = sorted(result_matrix.columns) + result_matrix = result_matrix[features].astype(float) + return result_matrix[features] + + def __process_document(self, lines: List[LineWithMeta], toc: Optional[list] = None) -> pd.DataFrame: + features_df = pd.DataFrame(self.__look_at_prev_line(document=lines, n=1)) + features_df["line_relative_length"] = self.__get_line_relative_length(lines) + + list_features = pd.concat([f_e.one_document(lines)[1] for f_e in self.list_feature_extractors], axis=1) + + page_ids = [line.metadata.page_id for line in lines] + start_page, finish_page = (min(page_ids), max(page_ids)) if page_ids else (0, 0) + + total_lines = len(lines) + one_line_features_dict = defaultdict(list) + for line in lines: + for item in self.__one_line_features(line, total_lines, start_page=start_page, finish_page=finish_page, toc=toc): + feature_name, feature = item[0], item[1] + one_line_features_dict[feature_name].append(feature) + + one_line_features_df = pd.DataFrame(one_line_features_dict) + one_line_features_df["font_size"] = self._normalize_features(one_line_features_df.font_size) + + one_line_features_df = self.prev_next_line_features(one_line_features_df, 3, 3) + result_matrix = pd.concat([one_line_features_df, features_df, list_features], axis=1) + result_matrix["page_id"] = [line.metadata.page_id for line in lines] + return result_matrix + + def __look_at_prev_line(self, document: List[LineWithMeta], n: int = 1) -> Dict[str, List]: + """ + Look at previous line and compare with current line + + :param document: list of lines + :param n: previous line number to look + :return: dict of features + """ + res = defaultdict(list) + for line_id, _ in enumerate(document): + if line_id >= n: + prev_line = document[line_id - 
n] + res["prev_line_ends"].append(prev_line.line.endswith((".", ";"))) + res["prev_ends_with_colon"].append(prev_line.line.endswith(":")) + res["prev_is_space"].append(prev_line.line.lower().isspace()) + else: + res["prev_line_ends"].append(False) + res["prev_ends_with_colon"].append(False) + res["prev_is_space"].append(False) + return res + + def __get_line_relative_length(self, lines: List[LineWithMeta]) -> List[float]: + max_len = max([len(line.line) for line in lines]) + relative_lengths = [len(line.line) / max_len for line in lines] + return relative_lengths + + def __one_line_features(self, line: LineWithMeta, total_lines: int, start_page: int, finish_page: int, toc: Optional[list]) -> Iterator[tuple]: + yield "normalized_page_id", normalization_by_min_max(line.metadata.page_id, min_v=start_page, max_v=finish_page) + yield "indentation", self._get_indentation(line) + yield "spacing", self._get_spacing(line) + yield "bold", self._get_bold(line) + yield "italic", self._get_italic(line) + yield from self._get_color(line) + yield "font_size", self._get_size(line) + + yield "line_id", normalization_by_min_max(line.metadata.line_id, min_v=0, max_v=total_lines) + yield "num_year_regexp", len(regexps_year.findall(line.line)) + yield "endswith_dot", line.line.endswith(".") + yield "endswith_semicolon", line.line.endswith(";") + yield "endswith_colon", line.line.endswith(":") + yield "endswith_comma", line.line.endswith(",") + yield "startswith_bracket", line.line.strip().startswith(("(", "{")) + + bracket_cnt = 0 + for char in line.line: + if char == "(": + bracket_cnt += 1 + elif char == ")": + bracket_cnt = max(0, bracket_cnt - 1) + yield "bracket_num", bracket_cnt + + probable_toc_title = re.sub(r"[\s:]", "", line.line).lower() + yield "is_toc_title", probable_toc_title in TOCFeatureExtractor.titles + yield from self.__find_in_toc(line, toc) + + line_length = len(line.line) + 1 + yield "supper_percent", sum((1 for letter in line.line if letter.isupper())) / 
line_length + yield "letter_percent", sum((1 for letter in line.line if letter.isalpha())) / line_length + yield "number_percent", sum((1 for letter in line.line if letter.isnumeric())) / line_length + yield "words_number", len(line.line.split()) + + def __find_in_toc(self, line: LineWithMeta, toc: Optional[List[dict]]) -> Iterator[Tuple[str, int]]: + if toc is None: + yield "is_toc", 0 + yield "in_toc", 0 + yield "toc_exists", 0 + else: + is_toc, in_toc, toc_exists = 0, 0, int(len(toc) > 0) + line_text = line.line.lower().strip() + for item in toc: + if ratio(line_text, item["line"].line.lower()) < 0.8: + continue + # toc entry found + try: + is_toc = 0 if line.metadata.page_id + 1 == int(item["page"]) else 1 + in_toc = 1 if line.metadata.page_id + 1 == int(item["page"]) else 0 + except TypeError: + pass + break + + yield "is_toc", is_toc + yield "in_toc", in_toc + yield "toc_exists", toc_exists diff --git a/dedoc/structure_extractors/feature_extractors/paired_feature_extractor.py b/dedoc/structure_extractors/feature_extractors/paired_feature_extractor.py new file mode 100755 index 00000000..130a5560 --- /dev/null +++ b/dedoc/structure_extractors/feature_extractors/paired_feature_extractor.py @@ -0,0 +1,87 @@ +import json +from typing import List, Optional + +import numpy as np +import pandas as pd +from scipy.stats._multivariate import method + +from dedoc.data_structures.concrete_annotations.bbox_annotation import BBoxAnnotation +from dedoc.data_structures.concrete_annotations.size_annotation import SizeAnnotation +from dedoc.data_structures.line_with_meta import LineWithMeta +from dedoc.structure_extractors.feature_extractors.abstract_extractor import AbstractFeatureExtractor +from dedoc.utils.utils import flatten + + +class PairedFeatureExtractor(AbstractFeatureExtractor): + """ + This class is used as an auxiliary feature extractor to the main extractor. + It allows to add "raw" features related to the lines importance. 
+ Based on one line property (size, indentation) it computes a raw line's depth inside the document tree. + + Example: + For lines + line1 (size=16) + line2 (size=14) + line3 (size=12) + line4 (size=12) + line5 (size=14) + line6 (size=12) + We will obtain a feature vector (raw_depth_size) + [0, 1, 2, 2, 1, 2] + """ + + def parameters(self) -> dict: + return {} + + def fit(self, documents: List[List[LineWithMeta]], y: Optional[List[str]] = None) -> "AbstractFeatureExtractor": + return self + + def transform(self, documents: List[List[LineWithMeta]], y: Optional[List[str]] = None) -> pd.DataFrame: + df = pd.DataFrame() + df["raw_depth_size"] = list(flatten([self._handle_one_document(document, self.__get_size) for document in documents])) + df["raw_depth_indentation"] = list(flatten([self._handle_one_document(document, self._get_indentation) for document in documents])) + return df + + def _handle_one_document(self, document: List[LineWithMeta], get_feature: method) -> List[int]: + if len(document) == 0: + return [] + if len(document) == 1: + return [0] + + features = [get_feature(line) for line in document] + std = np.std(features) + result = [] + stack = [] + + for line in document: + while len(stack) > 0 and self.__compare_lines(stack[-1], line, get_feature, std) <= 0: # noqa + stack.pop() + result.append(len(stack)) + stack.append(line) + + return result + + def __get_size(self, line: LineWithMeta) -> float: + annotations = line.annotations + size_annotation = [annotation for annotation in annotations if annotation.name == SizeAnnotation.name] + if len(size_annotation) > 0: + return float(size_annotation[0].value) + + bbox_annotation = [annotation for annotation in annotations if annotation.name == BBoxAnnotation.name] + if len(bbox_annotation) > 0: + bbox = json.loads(bbox_annotation[0].value) + return bbox["height"] + + return 0 + + def __compare_lines(self, first_line: LineWithMeta, second_line: LineWithMeta, get_feature: method, threshold: float = 0) -> int: + 
first_feature = get_feature(first_line) + second_feature = get_feature(second_line) + + if first_feature > second_feature + threshold: + return 1 + + if second_feature > first_feature + threshold: + return -1 + + return 0 diff --git a/dedoc/structure_extractors/feature_extractors/toc_feature_extractor.py b/dedoc/structure_extractors/feature_extractors/toc_feature_extractor.py index 28fab042..a0000e0a 100644 --- a/dedoc/structure_extractors/feature_extractors/toc_feature_extractor.py +++ b/dedoc/structure_extractors/feature_extractors/toc_feature_extractor.py @@ -1,5 +1,5 @@ import re -from typing import List, Optional, Tuple, Union +from typing import Dict, List, Optional, Tuple, Union import numpy as np from Levenshtein._levenshtein import ratio @@ -17,11 +17,11 @@ class TOCFeatureExtractor: "indice", "índice", "contenidos", "tabladecontenido" # spanish ) - def get_toc(self, document: List[LineWithMeta]) -> List[dict]: + def get_toc(self, document: List[LineWithMeta]) -> List[Dict[str, Union[LineWithMeta, str]]]: """ Finds the table of contents in the given document Returns: - list of dictionaries with toc item and page number where it is located: {"line", "page"} + list of dictionaries with toc item (LineWithMeta) and page number where it is located: {"line", "page"} """ corrected_lines, marks = self.__get_probable_toc(document) diff --git a/dedoc/structure_extractors/line_type_classifiers/fintoc_classifier.py b/dedoc/structure_extractors/line_type_classifiers/fintoc_classifier.py new file mode 100755 index 00000000..9e00e819 --- /dev/null +++ b/dedoc/structure_extractors/line_type_classifiers/fintoc_classifier.py @@ -0,0 +1,95 @@ +import gzip +import logging +import os +import pickle +from typing import Dict, List, Optional, Union + +import numpy as np +import pandas as pd +import xgbfir +from xgboost import XGBClassifier + +from dedoc.download_models import download_from_hub + + +class FintocClassifier: + """ + Classifier of financial documents for the FinTOC 
2022 Shared task (https://wp.lancs.ac.uk/cfie/fintoc2022/). + Lines are classified in two stages: + 1. Binary classification title/not title (title detection task) + 2. Classification of title lines into title depth classes (1-6) (TOC generation task) + + More important lines have a lesser depth. + As a result: + 1. For non-title lines, classifier returns -1. + 2. For title lines, classifier returns their depth (from 1 to 6). + """ + + def __init__(self, language: str, weights_dir_path: Optional[str] = None) -> None: + """ + :param language: language of data ("en", "fr", "sp") + :param weights_dir_path: path to directory with trained models weights + """ + self.weights_dir_path = weights_dir_path + self.language = language + self.classifiers = {"binary": None, "target": None} + + def predict(self, features: pd.DataFrame) -> List[int]: + """ + Two-staged classification: title/not title and depth classification for titles. + For non-title lines, classifier returns -1, for title lines, classifier returns their depth (from 1 to 6). 
+ """ + binary_predictions = self.binary_classifier.predict(features) + # binary_predictions = [True, False, ...], target predictions are predicted only for True items + target_predictions = self.target_classifier.predict(features[binary_predictions]) + result = np.ones_like(binary_predictions) * -1 + result[binary_predictions] = target_predictions + # return list [1, 2, 3, -1, -1, ...], where positive values mean headers depth, -1 mean non-header lines + return list(result) + + def fit(self, + binary_classifier_parameters: Dict[str, Union[int, float, str]], + target_classifier_parameters: Dict[str, Union[int, float, str]], + features: pd.DataFrame, + features_names: List[str]) -> None: + self.classifiers["binary"] = XGBClassifier(**binary_classifier_parameters) + self.classifiers["target"] = XGBClassifier(**target_classifier_parameters) + self.binary_classifier.fit(features[features_names], features.label != -1) + self.target_classifier.fit(features[features_names][features.label != -1], features.label[features.label != -1]) + + def save(self, classifiers_dir_path: str, features_importances_dir_path: str, logger: logging.Logger, features_names: List[str], reader: str) -> None: + os.makedirs(classifiers_dir_path, exist_ok=True) + for classifier_type in ("binary", "target"): + with gzip.open(os.path.join(classifiers_dir_path, f"{classifier_type}_classifier_{self.language}_{reader}.pkg.gz"), "wb") as output_file: + pickle.dump(self.classifiers[classifier_type], output_file) + logger.info(f"Classifiers were saved in {classifiers_dir_path} directory") + + os.makedirs(features_importances_dir_path, exist_ok=True) + for classifier_type in ("binary", "target"): + xgbfir.saveXgbFI(self.classifiers[classifier_type], feature_names=features_names, + OutputXlsxFile=os.path.join(features_importances_dir_path, f"feature_importances_{classifier_type}_{self.language}_{reader}.xlsx")) + logger.info(f"Features importances were saved in {features_importances_dir_path} directory") + + 
@property + def binary_classifier(self) -> XGBClassifier: + return self.__lazy_load_weights("binary") + + @property + def target_classifier(self) -> XGBClassifier: + return self.__lazy_load_weights("target") + + def __lazy_load_weights(self, classifier_type: str) -> XGBClassifier: + if self.classifiers[classifier_type] is None: + assert self.weights_dir_path is not None + file_name = f"{classifier_type}_classifier_{self.language}.pkg.gz" + classifier_path = os.path.join(self.weights_dir_path, file_name) + if not os.path.isfile(classifier_path): + download_from_hub(out_dir=self.weights_dir_path, + out_name=file_name, + repo_name="fintoc_classifiers", + hub_name=f"{classifier_type}_classifier_{self.language}_txt_layer.pkg.gz") + + with gzip.open(classifier_path, "rb") as input_file: + self.classifiers[classifier_type] = pickle.load(file=input_file) + + return self.classifiers[classifier_type] diff --git a/docs/source/_static/add_new_structure_type/article_classifier_000000_UX6.json b/docs/source/_static/add_new_structure_type/article_classifier_000000_UX6.json index c7e3da40..881a3c21 100644 --- a/docs/source/_static/add_new_structure_type/article_classifier_000000_UX6.json +++ b/docs/source/_static/add_new_structure_type/article_classifier_000000_UX6.json @@ -33,8 +33,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 10005, - "_LineMetadata__other_fields": {} + "line_id": 10005 }, "_annotations": [ { @@ -184,8 +183,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 10007, - "_LineMetadata__other_fields": {} + "line_id": 10007 }, "_annotations": [ { @@ -279,8 +277,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 10012, - "_LineMetadata__other_fields": {} + "line_id": 10012 }, "_annotations": [ { @@ -437,8 +434,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 10018, - "_LineMetadata__other_fields": {} + "line_id": 10018 }, "_annotations": [ { @@ -588,8 +584,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 10027, - 
"_LineMetadata__other_fields": {} + "line_id": 10027 }, "_annotations": [ { @@ -781,8 +776,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 10029, - "_LineMetadata__other_fields": {} + "line_id": 10029 }, "_annotations": [ { @@ -876,8 +870,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 10040, - "_LineMetadata__other_fields": {} + "line_id": 10040 }, "_annotations": [ { @@ -1097,8 +1090,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 10053, - "_LineMetadata__other_fields": {} + "line_id": 10053 }, "_annotations": [ { @@ -1346,8 +1338,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 10066, - "_LineMetadata__other_fields": {} + "line_id": 10066 }, "_annotations": [ { @@ -1595,8 +1586,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 10077, - "_LineMetadata__other_fields": {} + "line_id": 10077 }, "_annotations": [ { @@ -1816,8 +1806,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 10087, - "_LineMetadata__other_fields": {} + "line_id": 10087 }, "_annotations": [ { @@ -2023,8 +2012,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 10096, - "_LineMetadata__other_fields": {} + "line_id": 10096 }, "_annotations": [ { @@ -2216,8 +2204,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 10107, - "_LineMetadata__other_fields": {} + "line_id": 10107 }, "_annotations": [ { @@ -2437,8 +2424,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 10116, - "_LineMetadata__other_fields": {} + "line_id": 10116 }, "_annotations": [ { @@ -2630,8 +2616,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 10126, - "_LineMetadata__other_fields": {} + "line_id": 10126 }, "_annotations": [ { @@ -2837,8 +2822,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 10139, - "_LineMetadata__other_fields": {} + "line_id": 10139 }, "_annotations": [ { @@ -3086,8 +3070,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 10149, - "_LineMetadata__other_fields": {} + "line_id": 10149 }, "_annotations": 
[ { @@ -3293,8 +3276,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 10154, - "_LineMetadata__other_fields": {} + "line_id": 10154 }, "_annotations": [ { @@ -3430,8 +3412,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 10165, - "_LineMetadata__other_fields": {} + "line_id": 10165 }, "_annotations": [ { @@ -3651,8 +3632,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 10172, - "_LineMetadata__other_fields": {} + "line_id": 10172 }, "_annotations": [ { @@ -3816,8 +3796,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 10174, - "_LineMetadata__other_fields": {} + "line_id": 10174 }, "_annotations": [ { @@ -3911,8 +3890,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 10186, - "_LineMetadata__other_fields": {} + "line_id": 10186 }, "_annotations": [ { @@ -4146,8 +4124,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 10200, - "_LineMetadata__other_fields": {} + "line_id": 10200 }, "_annotations": [ { @@ -4409,8 +4386,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 10213, - "_LineMetadata__other_fields": {} + "line_id": 10213 }, "_annotations": [ { @@ -4658,8 +4634,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 10227, - "_LineMetadata__other_fields": {} + "line_id": 10227 }, "_annotations": [ { @@ -4921,8 +4896,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 10238, - "_LineMetadata__other_fields": {} + "line_id": 10238 }, "_annotations": [ { @@ -5142,8 +5116,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 10252, - "_LineMetadata__other_fields": {} + "line_id": 10252 }, "_annotations": [ { @@ -5405,8 +5378,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 10265, - "_LineMetadata__other_fields": {} + "line_id": 10265 }, "_annotations": [ { @@ -5654,8 +5626,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 10278, - "_LineMetadata__other_fields": {} + "line_id": 10278 }, "_annotations": [ { @@ -5903,8 +5874,7 @@ "line_type": "raw_text" }, "page_id": 0, 
- "line_id": 10284, - "_LineMetadata__other_fields": {} + "line_id": 10284 }, "_annotations": [ { @@ -6054,8 +6024,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 10296, - "_LineMetadata__other_fields": {} + "line_id": 10296 }, "_annotations": [ { @@ -6289,8 +6258,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 10308, - "_LineMetadata__other_fields": {} + "line_id": 10308 }, "_annotations": [ { @@ -6524,8 +6492,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 10320, - "_LineMetadata__other_fields": {} + "line_id": 10320 }, "_annotations": [ { @@ -6759,8 +6726,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 10335, - "_LineMetadata__other_fields": {} + "line_id": 10335 }, "_annotations": [ { @@ -7036,8 +7002,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 10348, - "_LineMetadata__other_fields": {} + "line_id": 10348 }, "_annotations": [ { @@ -7285,8 +7250,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 10361, - "_LineMetadata__other_fields": {} + "line_id": 10361 }, "_annotations": [ { @@ -7534,8 +7498,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 10375, - "_LineMetadata__other_fields": {} + "line_id": 10375 }, "_annotations": [ { @@ -7797,8 +7760,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 10383, - "_LineMetadata__other_fields": {} + "line_id": 10383 }, "_annotations": [ { @@ -7976,8 +7938,7 @@ "line_type": "list_item" }, "page_id": 0, - "line_id": 10389, - "_LineMetadata__other_fields": {} + "line_id": 10389 }, "_annotations": [ { @@ -8127,8 +8088,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 10401, - "_LineMetadata__other_fields": {} + "line_id": 10401 }, "_annotations": [ { @@ -8362,8 +8322,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 10402, - "_LineMetadata__other_fields": {} + "line_id": 10402 }, "_annotations": [ { @@ -8450,8 +8409,7 @@ "line_type": "raw_text" }, "page_id": 1, - "line_id": 20011, - "_LineMetadata__other_fields": {} + 
"line_id": 20011 }, "_annotations": [ { @@ -8685,8 +8643,7 @@ "line_type": "raw_text" }, "page_id": 1, - "line_id": 20024, - "_LineMetadata__other_fields": {} + "line_id": 20024 }, "_annotations": [ { @@ -8934,8 +8891,7 @@ "line_type": "raw_text" }, "page_id": 1, - "line_id": 20038, - "_LineMetadata__other_fields": {} + "line_id": 20038 }, "_annotations": [ { @@ -9197,8 +9153,7 @@ "line_type": "raw_text" }, "page_id": 1, - "line_id": 20048, - "_LineMetadata__other_fields": {} + "line_id": 20048 }, "_annotations": [ { @@ -9404,8 +9359,7 @@ "line_type": "raw_text" }, "page_id": 1, - "line_id": 20059, - "_LineMetadata__other_fields": {} + "line_id": 20059 }, "_annotations": [ { @@ -9625,8 +9579,7 @@ "line_type": "raw_text" }, "page_id": 1, - "line_id": 20072, - "_LineMetadata__other_fields": {} + "line_id": 20072 }, "_annotations": [ { @@ -9874,8 +9827,7 @@ "line_type": "raw_text" }, "page_id": 1, - "line_id": 20073, - "_LineMetadata__other_fields": {} + "line_id": 20073 }, "_annotations": [ { @@ -9955,8 +9907,7 @@ "line_type": "raw_text" }, "page_id": 1, - "line_id": 20086, - "_LineMetadata__other_fields": {} + "line_id": 20086 }, "_annotations": [ { @@ -10204,8 +10155,7 @@ "line_type": "raw_text" }, "page_id": 1, - "line_id": 20098, - "_LineMetadata__other_fields": {} + "line_id": 20098 }, "_annotations": [ { @@ -10439,8 +10389,7 @@ "line_type": "raw_text" }, "page_id": 1, - "line_id": 20109, - "_LineMetadata__other_fields": {} + "line_id": 20109 }, "_annotations": [ { @@ -10660,8 +10609,7 @@ "line_type": "raw_text" }, "page_id": 1, - "line_id": 20121, - "_LineMetadata__other_fields": {} + "line_id": 20121 }, "_annotations": [ { @@ -10895,8 +10843,7 @@ "line_type": "raw_text" }, "page_id": 1, - "line_id": 20134, - "_LineMetadata__other_fields": {} + "line_id": 20134 }, "_annotations": [ { @@ -11144,8 +11091,7 @@ "line_type": "raw_text" }, "page_id": 1, - "line_id": 20143, - "_LineMetadata__other_fields": {} + "line_id": 20143 }, "_annotations": [ { @@ -11337,8 
+11283,7 @@ "line_type": "raw_text" }, "page_id": 1, - "line_id": 20156, - "_LineMetadata__other_fields": {} + "line_id": 20156 }, "_annotations": [ { @@ -11586,8 +11531,7 @@ "line_type": "raw_text" }, "page_id": 1, - "line_id": 20171, - "_LineMetadata__other_fields": {} + "line_id": 20171 }, "_annotations": [ { @@ -11863,8 +11807,7 @@ "line_type": "raw_text" }, "page_id": 1, - "line_id": 20182, - "_LineMetadata__other_fields": {} + "line_id": 20182 }, "_annotations": [ { @@ -12084,8 +12027,7 @@ "line_type": "raw_text" }, "page_id": 1, - "line_id": 20196, - "_LineMetadata__other_fields": {} + "line_id": 20196 }, "_annotations": [ { @@ -12347,8 +12289,7 @@ "line_type": "raw_text" }, "page_id": 1, - "line_id": 20209, - "_LineMetadata__other_fields": {} + "line_id": 20209 }, "_annotations": [ { @@ -12596,8 +12537,7 @@ "line_type": "raw_text" }, "page_id": 1, - "line_id": 20219, - "_LineMetadata__other_fields": {} + "line_id": 20219 }, "_annotations": [ { @@ -12803,8 +12743,7 @@ "line_type": "raw_text" }, "page_id": 1, - "line_id": 20232, - "_LineMetadata__other_fields": {} + "line_id": 20232 }, "_annotations": [ { @@ -13052,8 +12991,7 @@ "line_type": "raw_text" }, "page_id": 1, - "line_id": 20243, - "_LineMetadata__other_fields": {} + "line_id": 20243 }, "_annotations": [ { @@ -13273,8 +13211,7 @@ "line_type": "raw_text" }, "page_id": 1, - "line_id": 20245, - "_LineMetadata__other_fields": {} + "line_id": 20245 }, "_annotations": [ { @@ -13368,8 +13305,7 @@ "line_type": "raw_text" }, "page_id": 1, - "line_id": 20253, - "_LineMetadata__other_fields": {} + "line_id": 20253 }, "_annotations": [ { @@ -13547,8 +13483,7 @@ "line_type": "raw_text" }, "page_id": 1, - "line_id": 20262, - "_LineMetadata__other_fields": {} + "line_id": 20262 }, "_annotations": [ { @@ -13747,8 +13682,7 @@ "line_type": "raw_text" }, "page_id": 2, - "line_id": 30003, - "_LineMetadata__other_fields": {} + "line_id": 30003 }, "_annotations": [ { @@ -13870,8 +13804,7 @@ "line_type": "raw_text" }, 
"page_id": 2, - "line_id": 30014, - "_LineMetadata__other_fields": {} + "line_id": 30014 }, "_annotations": [ { @@ -14091,8 +14024,7 @@ "line_type": "raw_text" }, "page_id": 2, - "line_id": 30028, - "_LineMetadata__other_fields": {} + "line_id": 30028 }, "_annotations": [ { @@ -14368,8 +14300,7 @@ "line_type": "raw_text" }, "page_id": 2, - "line_id": 30041, - "_LineMetadata__other_fields": {} + "line_id": 30041 }, "_annotations": [ { @@ -14617,8 +14548,7 @@ "line_type": "raw_text" }, "page_id": 2, - "line_id": 30056, - "_LineMetadata__other_fields": {} + "line_id": 30056 }, "_annotations": [ { @@ -14894,8 +14824,7 @@ "line_type": "raw_text" }, "page_id": 2, - "line_id": 30069, - "_LineMetadata__other_fields": {} + "line_id": 30069 }, "_annotations": [ { @@ -15143,8 +15072,7 @@ "line_type": "raw_text" }, "page_id": 2, - "line_id": 30079, - "_LineMetadata__other_fields": {} + "line_id": 30079 }, "_annotations": [ { @@ -15350,8 +15278,7 @@ "line_type": "raw_text" }, "page_id": 2, - "line_id": 30087, - "_LineMetadata__other_fields": {} + "line_id": 30087 }, "_annotations": [ { @@ -15529,8 +15456,7 @@ "line_type": "raw_text" }, "page_id": 2, - "line_id": 30098, - "_LineMetadata__other_fields": {} + "line_id": 30098 }, "_annotations": [ { @@ -15750,8 +15676,7 @@ "line_type": "raw_text" }, "page_id": 2, - "line_id": 30108, - "_LineMetadata__other_fields": {} + "line_id": 30108 }, "_annotations": [ { @@ -15957,8 +15882,7 @@ "line_type": "raw_text" }, "page_id": 2, - "line_id": 30121, - "_LineMetadata__other_fields": {} + "line_id": 30121 }, "_annotations": [ { @@ -16206,8 +16130,7 @@ "line_type": "raw_text" }, "page_id": 2, - "line_id": 30136, - "_LineMetadata__other_fields": {} + "line_id": 30136 }, "_annotations": [ { @@ -16483,8 +16406,7 @@ "line_type": "raw_text" }, "page_id": 2, - "line_id": 30150, - "_LineMetadata__other_fields": {} + "line_id": 30150 }, "_annotations": [ { @@ -16746,8 +16668,7 @@ "line_type": "raw_text" }, "page_id": 2, - "line_id": 30162, - 
"_LineMetadata__other_fields": {} + "line_id": 30162 }, "_annotations": [ { @@ -16981,8 +16902,7 @@ "line_type": "raw_text" }, "page_id": 2, - "line_id": 30173, - "_LineMetadata__other_fields": {} + "line_id": 30173 }, "_annotations": [ { @@ -17202,8 +17122,7 @@ "line_type": "raw_text" }, "page_id": 2, - "line_id": 30184, - "_LineMetadata__other_fields": {} + "line_id": 30184 }, "_annotations": [ { @@ -17423,8 +17342,7 @@ "line_type": "raw_text" }, "page_id": 2, - "line_id": 30197, - "_LineMetadata__other_fields": {} + "line_id": 30197 }, "_annotations": [ { @@ -17672,8 +17590,7 @@ "line_type": "raw_text" }, "page_id": 2, - "line_id": 30211, - "_LineMetadata__other_fields": {} + "line_id": 30211 }, "_annotations": [ { @@ -17935,8 +17852,7 @@ "line_type": "raw_text" }, "page_id": 2, - "line_id": 30222, - "_LineMetadata__other_fields": {} + "line_id": 30222 }, "_annotations": [ { @@ -18156,8 +18072,7 @@ "line_type": "raw_text" }, "page_id": 2, - "line_id": 30233, - "_LineMetadata__other_fields": {} + "line_id": 30233 }, "_annotations": [ { @@ -18377,8 +18292,7 @@ "line_type": "raw_text" }, "page_id": 2, - "line_id": 30243, - "_LineMetadata__other_fields": {} + "line_id": 30243 }, "_annotations": [ { @@ -18584,8 +18498,7 @@ "line_type": "raw_text" }, "page_id": 2, - "line_id": 30245, - "_LineMetadata__other_fields": {} + "line_id": 30245 }, "_annotations": [ { @@ -18679,8 +18592,7 @@ "line_type": "list_item" }, "page_id": 2, - "line_id": 30255, - "_LineMetadata__other_fields": {} + "line_id": 30255 }, "_annotations": [ { @@ -18886,8 +18798,7 @@ "line_type": "list_item" }, "page_id": 2, - "line_id": 30263, - "_LineMetadata__other_fields": {} + "line_id": 30263 }, "_annotations": [ { @@ -19065,8 +18976,7 @@ "line_type": "list_item" }, "page_id": 2, - "line_id": 30270, - "_LineMetadata__other_fields": {} + "line_id": 30270 }, "_annotations": [ { @@ -19230,8 +19140,7 @@ "line_type": "raw_text" }, "page_id": 2, - "line_id": 30278, - "_LineMetadata__other_fields": {} + 
"line_id": 30278 }, "_annotations": [ { @@ -19409,8 +19318,7 @@ "line_type": "raw_text" }, "page_id": 2, - "line_id": 30284, - "_LineMetadata__other_fields": {} + "line_id": 30284 }, "_annotations": [ { @@ -19560,8 +19468,7 @@ "line_type": "raw_text" }, "page_id": 3, - "line_id": 40004, - "_LineMetadata__other_fields": {} + "line_id": 40004 }, "_annotations": [ { @@ -19697,8 +19604,7 @@ "line_type": "raw_text" }, "page_id": 3, - "line_id": 40016, - "_LineMetadata__other_fields": {} + "line_id": 40016 }, "_annotations": [ { @@ -19932,8 +19838,7 @@ "line_type": "raw_text" }, "page_id": 3, - "line_id": 40032, - "_LineMetadata__other_fields": {} + "line_id": 40032 }, "_annotations": [ { @@ -20223,8 +20128,7 @@ "line_type": "raw_text" }, "page_id": 3, - "line_id": 40040, - "_LineMetadata__other_fields": {} + "line_id": 40040 }, "_annotations": [ { @@ -20402,8 +20306,7 @@ "line_type": "raw_text" }, "page_id": 3, - "line_id": 40053, - "_LineMetadata__other_fields": {} + "line_id": 40053 }, "_annotations": [ { @@ -20651,8 +20554,7 @@ "line_type": "raw_text" }, "page_id": 3, - "line_id": 40066, - "_LineMetadata__other_fields": {} + "line_id": 40066 }, "_annotations": [ { @@ -20900,8 +20802,7 @@ "line_type": "raw_text" }, "page_id": 3, - "line_id": 40080, - "_LineMetadata__other_fields": {} + "line_id": 40080 }, "_annotations": [ { @@ -21163,8 +21064,7 @@ "line_type": "raw_text" }, "page_id": 3, - "line_id": 40095, - "_LineMetadata__other_fields": {} + "line_id": 40095 }, "_annotations": [ { @@ -21440,8 +21340,7 @@ "line_type": "raw_text" }, "page_id": 3, - "line_id": 40106, - "_LineMetadata__other_fields": {} + "line_id": 40106 }, "_annotations": [ { @@ -21661,8 +21560,7 @@ "line_type": "raw_text" }, "page_id": 3, - "line_id": 40121, - "_LineMetadata__other_fields": {} + "line_id": 40121 }, "_annotations": [ { @@ -21938,8 +21836,7 @@ "line_type": "raw_text" }, "page_id": 3, - "line_id": 40123, - "_LineMetadata__other_fields": {} + "line_id": 40123 }, "_annotations": [ { @@ 
-22033,8 +21930,7 @@ "line_type": "raw_text" }, "page_id": 3, - "line_id": 40139, - "_LineMetadata__other_fields": {} + "line_id": 40139 }, "_annotations": [ { @@ -22324,8 +22220,7 @@ "line_type": "raw_text" }, "page_id": 3, - "line_id": 40152, - "_LineMetadata__other_fields": {} + "line_id": 40152 }, "_annotations": [ { @@ -22573,8 +22468,7 @@ "line_type": "raw_text" }, "page_id": 3, - "line_id": 40163, - "_LineMetadata__other_fields": {} + "line_id": 40163 }, "_annotations": [ { @@ -22794,8 +22688,7 @@ "line_type": "raw_text" }, "page_id": 3, - "line_id": 40174, - "_LineMetadata__other_fields": {} + "line_id": 40174 }, "_annotations": [ { @@ -23015,8 +22908,7 @@ "line_type": "raw_text" }, "page_id": 3, - "line_id": 40189, - "_LineMetadata__other_fields": {} + "line_id": 40189 }, "_annotations": [ { @@ -23292,8 +23184,7 @@ "line_type": "raw_text" }, "page_id": 3, - "line_id": 40202, - "_LineMetadata__other_fields": {} + "line_id": 40202 }, "_annotations": [ { @@ -23541,8 +23432,7 @@ "line_type": "raw_text" }, "page_id": 3, - "line_id": 40213, - "_LineMetadata__other_fields": {} + "line_id": 40213 }, "_annotations": [ { @@ -23762,8 +23652,7 @@ "line_type": "raw_text" }, "page_id": 3, - "line_id": 40222, - "_LineMetadata__other_fields": {} + "line_id": 40222 }, "_annotations": [ { @@ -23955,8 +23844,7 @@ "line_type": "raw_text" }, "page_id": 3, - "line_id": 40236, - "_LineMetadata__other_fields": {} + "line_id": 40236 }, "_annotations": [ { @@ -24218,8 +24106,7 @@ "line_type": "raw_text" }, "page_id": 3, - "line_id": 40245, - "_LineMetadata__other_fields": {} + "line_id": 40245 }, "_annotations": [ { @@ -24411,8 +24298,7 @@ "line_type": "raw_text" }, "page_id": 3, - "line_id": 40249, - "_LineMetadata__other_fields": {} + "line_id": 40249 }, "_annotations": [ { @@ -24534,8 +24420,7 @@ "line_type": "raw_text" }, "page_id": 3, - "line_id": 40261, - "_LineMetadata__other_fields": {} + "line_id": 40261 }, "_annotations": [ { @@ -24769,8 +24654,7 @@ "line_type": 
"raw_text" }, "page_id": 3, - "line_id": 40273, - "_LineMetadata__other_fields": {} + "line_id": 40273 }, "_annotations": [ { @@ -25004,8 +24888,7 @@ "line_type": "raw_text" }, "page_id": 3, - "line_id": 40285, - "_LineMetadata__other_fields": {} + "line_id": 40285 }, "_annotations": [ { @@ -25239,8 +25122,7 @@ "line_type": "raw_text" }, "page_id": 3, - "line_id": 40301, - "_LineMetadata__other_fields": {} + "line_id": 40301 }, "_annotations": [ { @@ -25530,8 +25412,7 @@ "line_type": "raw_text" }, "page_id": 3, - "line_id": 40309, - "_LineMetadata__other_fields": {} + "line_id": 40309 }, "_annotations": [ { @@ -25709,8 +25590,7 @@ "line_type": "raw_text" }, "page_id": 3, - "line_id": 40324, - "_LineMetadata__other_fields": {} + "line_id": 40324 }, "_annotations": [ { @@ -25986,8 +25866,7 @@ "line_type": "raw_text" }, "page_id": 3, - "line_id": 40337, - "_LineMetadata__other_fields": {} + "line_id": 40337 }, "_annotations": [ { @@ -26235,8 +26114,7 @@ "line_type": "raw_text" }, "page_id": 3, - "line_id": 40352, - "_LineMetadata__other_fields": {} + "line_id": 40352 }, "_annotations": [ { @@ -26512,8 +26390,7 @@ "line_type": "raw_text" }, "page_id": 3, - "line_id": 40363, - "_LineMetadata__other_fields": {} + "line_id": 40363 }, "_annotations": [ { @@ -26733,8 +26610,7 @@ "line_type": "raw_text" }, "page_id": 3, - "line_id": 40376, - "_LineMetadata__other_fields": {} + "line_id": 40376 }, "_annotations": [ { @@ -26982,8 +26858,7 @@ "line_type": "raw_text" }, "page_id": 3, - "line_id": 40389, - "_LineMetadata__other_fields": {} + "line_id": 40389 }, "_annotations": [ { @@ -27231,8 +27106,7 @@ "line_type": "raw_text" }, "page_id": 3, - "line_id": 40394, - "_LineMetadata__other_fields": {} + "line_id": 40394 }, "_annotations": [ { @@ -27368,8 +27242,7 @@ "line_type": "raw_text" }, "page_id": 3, - "line_id": 40405, - "_LineMetadata__other_fields": {} + "line_id": 40405 }, "_annotations": [ { @@ -27589,8 +27462,7 @@ "line_type": "raw_text" }, "page_id": 3, - "line_id": 
40418, - "_LineMetadata__other_fields": {} + "line_id": 40418 }, "_annotations": [ { @@ -27838,8 +27710,7 @@ "line_type": "raw_text" }, "page_id": 3, - "line_id": 40430, - "_LineMetadata__other_fields": {} + "line_id": 40430 }, "_annotations": [ { @@ -28073,8 +27944,7 @@ "line_type": "raw_text" }, "page_id": 3, - "line_id": 40444, - "_LineMetadata__other_fields": {} + "line_id": 40444 }, "_annotations": [ { @@ -28336,8 +28206,7 @@ "line_type": "raw_text" }, "page_id": 3, - "line_id": 40457, - "_LineMetadata__other_fields": {} + "line_id": 40457 }, "_annotations": [ { @@ -28585,8 +28454,7 @@ "line_type": "raw_text" }, "page_id": 3, - "line_id": 40464, - "_LineMetadata__other_fields": {} + "line_id": 40464 }, "_annotations": [ { @@ -28750,8 +28618,7 @@ "line_type": "raw_text" }, "page_id": 3, - "line_id": 40475, - "_LineMetadata__other_fields": {} + "line_id": 40475 }, "_annotations": [ { @@ -28971,8 +28838,7 @@ "line_type": "raw_text" }, "page_id": 3, - "line_id": 40486, - "_LineMetadata__other_fields": {} + "line_id": 40486 }, "_annotations": [ { @@ -29192,8 +29058,7 @@ "line_type": "raw_text" }, "page_id": 3, - "line_id": 40495, - "_LineMetadata__other_fields": {} + "line_id": 40495 }, "_annotations": [ { @@ -29385,8 +29250,7 @@ "line_type": "raw_text" }, "page_id": 4, - "line_id": 50011, - "_LineMetadata__other_fields": {} + "line_id": 50011 }, "_annotations": [ { @@ -29620,8 +29484,7 @@ "line_type": "raw_text" }, "page_id": 4, - "line_id": 50022, - "_LineMetadata__other_fields": {} + "line_id": 50022 }, "_annotations": [ { @@ -29841,8 +29704,7 @@ "line_type": "raw_text" }, "page_id": 4, - "line_id": 50034, - "_LineMetadata__other_fields": {} + "line_id": 50034 }, "_annotations": [ { @@ -30076,8 +29938,7 @@ "line_type": "raw_text" }, "page_id": 4, - "line_id": 50048, - "_LineMetadata__other_fields": {} + "line_id": 50048 }, "_annotations": [ { @@ -30339,8 +30200,7 @@ "line_type": "raw_text" }, "page_id": 4, - "line_id": 50058, - "_LineMetadata__other_fields": {} 
+ "line_id": 50058 }, "_annotations": [ { @@ -30546,8 +30406,7 @@ "line_type": "raw_text" }, "page_id": 4, - "line_id": 50067, - "_LineMetadata__other_fields": {} + "line_id": 50067 }, "_annotations": [ { @@ -30739,8 +30598,7 @@ "line_type": "raw_text" }, "page_id": 4, - "line_id": 50076, - "_LineMetadata__other_fields": {} + "line_id": 50076 }, "_annotations": [ { @@ -30932,8 +30790,7 @@ "line_type": "list_item" }, "page_id": 4, - "line_id": 50088, - "_LineMetadata__other_fields": {} + "line_id": 50088 }, "_annotations": [ { @@ -31167,8 +31024,7 @@ "line_type": "list_item" }, "page_id": 4, - "line_id": 50102, - "_LineMetadata__other_fields": {} + "line_id": 50102 }, "_annotations": [ { @@ -31430,8 +31286,7 @@ "line_type": "raw_text" }, "page_id": 4, - "line_id": 50109, - "_LineMetadata__other_fields": {} + "line_id": 50109 }, "_annotations": [ { @@ -31595,8 +31450,7 @@ "line_type": "raw_text" }, "page_id": 4, - "line_id": 50122, - "_LineMetadata__other_fields": {} + "line_id": 50122 }, "_annotations": [ { @@ -31844,8 +31698,7 @@ "line_type": "raw_text" }, "page_id": 4, - "line_id": 50134, - "_LineMetadata__other_fields": {} + "line_id": 50134 }, "_annotations": [ { @@ -32079,8 +31932,7 @@ "line_type": "raw_text" }, "page_id": 4, - "line_id": 50139, - "_LineMetadata__other_fields": {} + "line_id": 50139 }, "_annotations": [ { @@ -32216,8 +32068,7 @@ "line_type": "raw_text" }, "page_id": 4, - "line_id": 50142, - "_LineMetadata__other_fields": {} + "line_id": 50142 }, "_annotations": [ { @@ -32325,8 +32176,7 @@ "line_type": "raw_text" }, "page_id": 4, - "line_id": 50155, - "_LineMetadata__other_fields": {} + "line_id": 50155 }, "_annotations": [ { @@ -32574,8 +32424,7 @@ "line_type": "raw_text" }, "page_id": 4, - "line_id": 50164, - "_LineMetadata__other_fields": {} + "line_id": 50164 }, "_annotations": [ { @@ -32767,8 +32616,7 @@ "line_type": "raw_text" }, "page_id": 4, - "line_id": 50175, - "_LineMetadata__other_fields": {} + "line_id": 50175 }, "_annotations": [ { 
@@ -32988,8 +32836,7 @@ "line_type": "raw_text" }, "page_id": 4, - "line_id": 50186, - "_LineMetadata__other_fields": {} + "line_id": 50186 }, "_annotations": [ { @@ -33209,8 +33056,7 @@ "line_type": "raw_text" }, "page_id": 4, - "line_id": 50198, - "_LineMetadata__other_fields": {} + "line_id": 50198 }, "_annotations": [ { @@ -33444,8 +33290,7 @@ "line_type": "raw_text" }, "page_id": 4, - "line_id": 50211, - "_LineMetadata__other_fields": {} + "line_id": 50211 }, "_annotations": [ { @@ -33693,8 +33538,7 @@ "line_type": "raw_text" }, "page_id": 4, - "line_id": 50220, - "_LineMetadata__other_fields": {} + "line_id": 50220 }, "_annotations": [ { @@ -33886,8 +33730,7 @@ "line_type": "raw_text" }, "page_id": 4, - "line_id": 50231, - "_LineMetadata__other_fields": {} + "line_id": 50231 }, "_annotations": [ { @@ -34107,8 +33950,7 @@ "line_type": "raw_text" }, "page_id": 4, - "line_id": 50241, - "_LineMetadata__other_fields": {} + "line_id": 50241 }, "_annotations": [ { @@ -34314,8 +34156,7 @@ "line_type": "raw_text" }, "page_id": 4, - "line_id": 50255, - "_LineMetadata__other_fields": {} + "line_id": 50255 }, "_annotations": [ { @@ -34577,8 +34418,7 @@ "line_type": "raw_text" }, "page_id": 4, - "line_id": 50267, - "_LineMetadata__other_fields": {} + "line_id": 50267 }, "_annotations": [ { @@ -34812,8 +34652,7 @@ "line_type": "raw_text" }, "page_id": 4, - "line_id": 50281, - "_LineMetadata__other_fields": {} + "line_id": 50281 }, "_annotations": [ { @@ -35075,8 +34914,7 @@ "line_type": "raw_text" }, "page_id": 4, - "line_id": 50293, - "_LineMetadata__other_fields": {} + "line_id": 50293 }, "_annotations": [ { @@ -35310,8 +35148,7 @@ "line_type": "raw_text" }, "page_id": 4, - "line_id": 50304, - "_LineMetadata__other_fields": {} + "line_id": 50304 }, "_annotations": [ { @@ -35531,8 +35368,7 @@ "line_type": "raw_text" }, "page_id": 4, - "line_id": 50313, - "_LineMetadata__other_fields": {} + "line_id": 50313 }, "_annotations": [ { @@ -35724,8 +35560,7 @@ "line_type": 
"raw_text" }, "page_id": 4, - "line_id": 50322, - "_LineMetadata__other_fields": {} + "line_id": 50322 }, "_annotations": [ { @@ -35917,8 +35752,7 @@ "line_type": "raw_text" }, "page_id": 4, - "line_id": 50325, - "_LineMetadata__other_fields": {} + "line_id": 50325 }, "_annotations": [ { @@ -36026,8 +35860,7 @@ "line_type": "raw_text" }, "page_id": 4, - "line_id": 50338, - "_LineMetadata__other_fields": {} + "line_id": 50338 }, "_annotations": [ { @@ -36275,8 +36108,7 @@ "line_type": "raw_text" }, "page_id": 4, - "line_id": 50350, - "_LineMetadata__other_fields": {} + "line_id": 50350 }, "_annotations": [ { @@ -36510,8 +36342,7 @@ "line_type": "raw_text" }, "page_id": 4, - "line_id": 50361, - "_LineMetadata__other_fields": {} + "line_id": 50361 }, "_annotations": [ { @@ -36731,8 +36562,7 @@ "line_type": "raw_text" }, "page_id": 4, - "line_id": 50372, - "_LineMetadata__other_fields": {} + "line_id": 50372 }, "_annotations": [ { @@ -36952,8 +36782,7 @@ "line_type": "raw_text" }, "page_id": 4, - "line_id": 50383, - "_LineMetadata__other_fields": {} + "line_id": 50383 }, "_annotations": [ { @@ -37173,8 +37002,7 @@ "line_type": "raw_text" }, "page_id": 4, - "line_id": 50395, - "_LineMetadata__other_fields": {} + "line_id": 50395 }, "_annotations": [ { @@ -37408,8 +37236,7 @@ "line_type": "raw_text" }, "page_id": 4, - "line_id": 50396, - "_LineMetadata__other_fields": {} + "line_id": 50396 }, "_annotations": [ { @@ -37489,8 +37316,7 @@ "line_type": "raw_text" }, "page_id": 4, - "line_id": 50408, - "_LineMetadata__other_fields": {} + "line_id": 50408 }, "_annotations": [ { @@ -37724,8 +37550,7 @@ "line_type": "raw_text" }, "page_id": 4, - "line_id": 50419, - "_LineMetadata__other_fields": {} + "line_id": 50419 }, "_annotations": [ { @@ -37945,8 +37770,7 @@ "line_type": "raw_text" }, "page_id": 4, - "line_id": 50432, - "_LineMetadata__other_fields": {} + "line_id": 50432 }, "_annotations": [ { @@ -38194,8 +38018,7 @@ "line_type": "raw_text" }, "page_id": 4, - "line_id": 
50443, - "_LineMetadata__other_fields": {} + "line_id": 50443 }, "_annotations": [ { @@ -38415,8 +38238,7 @@ "line_type": "raw_text" }, "page_id": 4, - "line_id": 50449, - "_LineMetadata__other_fields": {} + "line_id": 50449 }, "_annotations": [ { @@ -38566,8 +38388,7 @@ "line_type": "raw_text" }, "page_id": 5, - "line_id": 60008, - "_LineMetadata__other_fields": {} + "line_id": 60008 }, "_annotations": [ { @@ -38759,8 +38580,7 @@ "line_type": "raw_text" }, "page_id": 5, - "line_id": 60022, - "_LineMetadata__other_fields": {} + "line_id": 60022 }, "_annotations": [ { @@ -39022,8 +38842,7 @@ "line_type": "raw_text" }, "page_id": 5, - "line_id": 60035, - "_LineMetadata__other_fields": {} + "line_id": 60035 }, "_annotations": [ { @@ -39271,8 +39090,7 @@ "line_type": "raw_text" }, "page_id": 5, - "line_id": 60045, - "_LineMetadata__other_fields": {} + "line_id": 60045 }, "_annotations": [ { @@ -39478,8 +39296,7 @@ "line_type": "raw_text" }, "page_id": 5, - "line_id": 60060, - "_LineMetadata__other_fields": {} + "line_id": 60060 }, "_annotations": [ { @@ -39755,8 +39572,7 @@ "line_type": "raw_text" }, "page_id": 5, - "line_id": 60075, - "_LineMetadata__other_fields": {} + "line_id": 60075 }, "_annotations": [ { @@ -40032,8 +39848,7 @@ "line_type": "raw_text" }, "page_id": 5, - "line_id": 60086, - "_LineMetadata__other_fields": {} + "line_id": 60086 }, "_annotations": [ { @@ -40253,8 +40068,7 @@ "line_type": "raw_text" }, "page_id": 5, - "line_id": 60087, - "_LineMetadata__other_fields": {} + "line_id": 60087 }, "_annotations": [ { @@ -40334,8 +40148,7 @@ "line_type": "raw_text" }, "page_id": 5, - "line_id": 60091, - "_LineMetadata__other_fields": {} + "line_id": 60091 }, "_annotations": [ { @@ -40457,8 +40270,7 @@ "line_type": "raw_text" }, "page_id": 5, - "line_id": 60104, - "_LineMetadata__other_fields": {} + "line_id": 60104 }, "_annotations": [ { @@ -40706,8 +40518,7 @@ "line_type": "raw_text" }, "page_id": 5, - "line_id": 60115, - "_LineMetadata__other_fields": {} 
+ "line_id": 60115 }, "_annotations": [ { @@ -40927,8 +40738,7 @@ "line_type": "raw_text" }, "page_id": 5, - "line_id": 60128, - "_LineMetadata__other_fields": {} + "line_id": 60128 }, "_annotations": [ { @@ -41176,8 +40986,7 @@ "line_type": "raw_text" }, "page_id": 5, - "line_id": 60140, - "_LineMetadata__other_fields": {} + "line_id": 60140 }, "_annotations": [ { @@ -41411,8 +41220,7 @@ "line_type": "raw_text" }, "page_id": 5, - "line_id": 60154, - "_LineMetadata__other_fields": {} + "line_id": 60154 }, "_annotations": [ { @@ -41674,8 +41482,7 @@ "line_type": "raw_text" }, "page_id": 5, - "line_id": 60162, - "_LineMetadata__other_fields": {} + "line_id": 60162 }, "_annotations": [ { @@ -41853,8 +41660,7 @@ "line_type": "raw_text" }, "page_id": 5, - "line_id": 60174, - "_LineMetadata__other_fields": {} + "line_id": 60174 }, "_annotations": [ { @@ -42088,8 +41894,7 @@ "line_type": "raw_text" }, "page_id": 5, - "line_id": 60186, - "_LineMetadata__other_fields": {} + "line_id": 60186 }, "_annotations": [ { @@ -42323,8 +42128,7 @@ "line_type": "raw_text" }, "page_id": 5, - "line_id": 60198, - "_LineMetadata__other_fields": {} + "line_id": 60198 }, "_annotations": [ { @@ -42558,8 +42362,7 @@ "line_type": "raw_text" }, "page_id": 5, - "line_id": 60211, - "_LineMetadata__other_fields": {} + "line_id": 60211 }, "_annotations": [ { @@ -42807,8 +42610,7 @@ "line_type": "raw_text" }, "page_id": 5, - "line_id": 60222, - "_LineMetadata__other_fields": {} + "line_id": 60222 }, "_annotations": [ { @@ -43028,8 +42830,7 @@ "line_type": "raw_text" }, "page_id": 5, - "line_id": 60235, - "_LineMetadata__other_fields": {} + "line_id": 60235 }, "_annotations": [ { @@ -43277,8 +43078,7 @@ "line_type": "raw_text" }, "page_id": 5, - "line_id": 60249, - "_LineMetadata__other_fields": {} + "line_id": 60249 }, "_annotations": [ { @@ -43540,8 +43340,7 @@ "line_type": "raw_text" }, "page_id": 5, - "line_id": 60264, - "_LineMetadata__other_fields": {} + "line_id": 60264 }, "_annotations": [ { 
@@ -43817,8 +43616,7 @@ "line_type": "raw_text" }, "page_id": 5, - "line_id": 60279, - "_LineMetadata__other_fields": {} + "line_id": 60279 }, "_annotations": [ { @@ -44094,8 +43892,7 @@ "line_type": "raw_text" }, "page_id": 5, - "line_id": 60292, - "_LineMetadata__other_fields": {} + "line_id": 60292 }, "_annotations": [ { @@ -44343,8 +44140,7 @@ "line_type": "raw_text" }, "page_id": 5, - "line_id": 60306, - "_LineMetadata__other_fields": {} + "line_id": 60306 }, "_annotations": [ { @@ -44606,8 +44402,7 @@ "line_type": "raw_text" }, "page_id": 5, - "line_id": 60317, - "_LineMetadata__other_fields": {} + "line_id": 60317 }, "_annotations": [ { @@ -44827,8 +44622,7 @@ "line_type": "raw_text" }, "page_id": 5, - "line_id": 60327, - "_LineMetadata__other_fields": {} + "line_id": 60327 }, "_annotations": [ { @@ -45034,8 +44828,7 @@ "line_type": "raw_text" }, "page_id": 5, - "line_id": 60340, - "_LineMetadata__other_fields": {} + "line_id": 60340 }, "_annotations": [ { @@ -45283,8 +45076,7 @@ "line_type": "raw_text" }, "page_id": 5, - "line_id": 60349, - "_LineMetadata__other_fields": {} + "line_id": 60349 }, "_annotations": [ { @@ -45476,8 +45268,7 @@ "line_type": "raw_text" }, "page_id": 5, - "line_id": 60360, - "_LineMetadata__other_fields": {} + "line_id": 60360 }, "_annotations": [ { @@ -45697,8 +45488,7 @@ "line_type": "raw_text" }, "page_id": 5, - "line_id": 60370, - "_LineMetadata__other_fields": {} + "line_id": 60370 }, "_annotations": [ { @@ -45904,8 +45694,7 @@ "line_type": "raw_text" }, "page_id": 5, - "line_id": 60381, - "_LineMetadata__other_fields": {} + "line_id": 60381 }, "_annotations": [ { @@ -46125,8 +45914,7 @@ "line_type": "raw_text" }, "page_id": 5, - "line_id": 60392, - "_LineMetadata__other_fields": {} + "line_id": 60392 }, "_annotations": [ { @@ -46346,8 +46134,7 @@ "line_type": "raw_text" }, "page_id": 5, - "line_id": 60406, - "_LineMetadata__other_fields": {} + "line_id": 60406 }, "_annotations": [ { @@ -46609,8 +46396,7 @@ "line_type": 
"raw_text" }, "page_id": 5, - "line_id": 60416, - "_LineMetadata__other_fields": {} + "line_id": 60416 }, "_annotations": [ { @@ -46816,8 +46602,7 @@ "line_type": "raw_text" }, "page_id": 5, - "line_id": 60428, - "_LineMetadata__other_fields": {} + "line_id": 60428 }, "_annotations": [ { @@ -47051,8 +46836,7 @@ "line_type": "raw_text" }, "page_id": 5, - "line_id": 60441, - "_LineMetadata__other_fields": {} + "line_id": 60441 }, "_annotations": [ { @@ -47300,8 +47084,7 @@ "line_type": "raw_text" }, "page_id": 5, - "line_id": 60448, - "_LineMetadata__other_fields": {} + "line_id": 60448 }, "_annotations": [ { @@ -47465,8 +47248,7 @@ "line_type": "raw_text" }, "page_id": 5, - "line_id": 60461, - "_LineMetadata__other_fields": {} + "line_id": 60461 }, "_annotations": [ { @@ -47714,8 +47496,7 @@ "line_type": "raw_text" }, "page_id": 5, - "line_id": 60472, - "_LineMetadata__other_fields": {} + "line_id": 60472 }, "_annotations": [ { @@ -47935,8 +47716,7 @@ "line_type": "raw_text" }, "page_id": 5, - "line_id": 60484, - "_LineMetadata__other_fields": {} + "line_id": 60484 }, "_annotations": [ { @@ -48170,8 +47950,7 @@ "line_type": "raw_text" }, "page_id": 5, - "line_id": 60495, - "_LineMetadata__other_fields": {} + "line_id": 60495 }, "_annotations": [ { @@ -48391,8 +48170,7 @@ "line_type": "raw_text" }, "page_id": 5, - "line_id": 60504, - "_LineMetadata__other_fields": {} + "line_id": 60504 }, "_annotations": [ { @@ -48584,8 +48362,7 @@ "line_type": "raw_text" }, "page_id": 6, - "line_id": 70011, - "_LineMetadata__other_fields": {} + "line_id": 70011 }, "_annotations": [ { @@ -48819,8 +48596,7 @@ "line_type": "raw_text" }, "page_id": 6, - "line_id": 70017, - "_LineMetadata__other_fields": {} + "line_id": 70017 }, "_annotations": [ { @@ -48970,8 +48746,7 @@ "line_type": "raw_text" }, "page_id": 6, - "line_id": 70029, - "_LineMetadata__other_fields": {} + "line_id": 70029 }, "_annotations": [ { @@ -49205,8 +48980,7 @@ "line_type": "raw_text" }, "page_id": 6, - "line_id": 
70041, - "_LineMetadata__other_fields": {} + "line_id": 70041 }, "_annotations": [ { @@ -49440,8 +49214,7 @@ "line_type": "raw_text" }, "page_id": 6, - "line_id": 70054, - "_LineMetadata__other_fields": {} + "line_id": 70054 }, "_annotations": [ { @@ -49689,8 +49462,7 @@ "line_type": "raw_text" }, "page_id": 6, - "line_id": 70067, - "_LineMetadata__other_fields": {} + "line_id": 70067 }, "_annotations": [ { @@ -49938,8 +49710,7 @@ "line_type": "raw_text" }, "page_id": 6, - "line_id": 70080, - "_LineMetadata__other_fields": {} + "line_id": 70080 }, "_annotations": [ { @@ -50187,8 +49958,7 @@ "line_type": "raw_text" }, "page_id": 6, - "line_id": 70094, - "_LineMetadata__other_fields": {} + "line_id": 70094 }, "_annotations": [ { @@ -50450,8 +50220,7 @@ "line_type": "raw_text" }, "page_id": 6, - "line_id": 70105, - "_LineMetadata__other_fields": {} + "line_id": 70105 }, "_annotations": [ { @@ -50671,8 +50440,7 @@ "line_type": "raw_text" }, "page_id": 6, - "line_id": 70117, - "_LineMetadata__other_fields": {} + "line_id": 70117 }, "_annotations": [ { @@ -50906,8 +50674,7 @@ "line_type": "raw_text" }, "page_id": 6, - "line_id": 70122, - "_LineMetadata__other_fields": {} + "line_id": 70122 }, "_annotations": [ { @@ -51043,8 +50810,7 @@ "line_type": "raw_text" }, "page_id": 6, - "line_id": 70126, - "_LineMetadata__other_fields": {} + "line_id": 70126 }, "_annotations": [ { @@ -51166,8 +50932,7 @@ "line_type": "raw_text" }, "page_id": 6, - "line_id": 70139, - "_LineMetadata__other_fields": {} + "line_id": 70139 }, "_annotations": [ { @@ -51415,8 +51180,7 @@ "line_type": "raw_text" }, "page_id": 6, - "line_id": 70149, - "_LineMetadata__other_fields": {} + "line_id": 70149 }, "_annotations": [ { @@ -51622,8 +51386,7 @@ "line_type": "raw_text" }, "page_id": 6, - "line_id": 70159, - "_LineMetadata__other_fields": {} + "line_id": 70159 }, "_annotations": [ { @@ -51829,8 +51592,7 @@ "line_type": "raw_text" }, "page_id": 6, - "line_id": 70168, - "_LineMetadata__other_fields": {} 
+ "line_id": 70168 }, "_annotations": [ { @@ -52022,8 +51784,7 @@ "line_type": "raw_text" }, "page_id": 6, - "line_id": 70180, - "_LineMetadata__other_fields": {} + "line_id": 70180 }, "_annotations": [ { @@ -52257,8 +52018,7 @@ "line_type": "raw_text" }, "page_id": 6, - "line_id": 70192, - "_LineMetadata__other_fields": {} + "line_id": 70192 }, "_annotations": [ { @@ -52492,8 +52252,7 @@ "line_type": "raw_text" }, "page_id": 6, - "line_id": 70203, - "_LineMetadata__other_fields": {} + "line_id": 70203 }, "_annotations": [ { @@ -52713,8 +52472,7 @@ "line_type": "raw_text" }, "page_id": 6, - "line_id": 70216, - "_LineMetadata__other_fields": {} + "line_id": 70216 }, "_annotations": [ { @@ -52962,8 +52720,7 @@ "line_type": "raw_text" }, "page_id": 6, - "line_id": 70224, - "_LineMetadata__other_fields": {} + "line_id": 70224 }, "_annotations": [ { @@ -53141,8 +52898,7 @@ "line_type": "raw_text" }, "page_id": 6, - "line_id": 70237, - "_LineMetadata__other_fields": {} + "line_id": 70237 }, "_annotations": [ { @@ -53390,8 +53146,7 @@ "line_type": "raw_text" }, "page_id": 6, - "line_id": 70249, - "_LineMetadata__other_fields": {} + "line_id": 70249 }, "_annotations": [ { @@ -53625,8 +53380,7 @@ "line_type": "raw_text" }, "page_id": 6, - "line_id": 70258, - "_LineMetadata__other_fields": {} + "line_id": 70258 }, "_annotations": [ { @@ -53818,8 +53572,7 @@ "line_type": "raw_text" }, "page_id": 6, - "line_id": 70271, - "_LineMetadata__other_fields": {} + "line_id": 70271 }, "_annotations": [ { @@ -54067,8 +53820,7 @@ "line_type": "raw_text" }, "page_id": 6, - "line_id": 70284, - "_LineMetadata__other_fields": {} + "line_id": 70284 }, "_annotations": [ { @@ -54316,8 +54068,7 @@ "line_type": "raw_text" }, "page_id": 6, - "line_id": 70297, - "_LineMetadata__other_fields": {} + "line_id": 70297 }, "_annotations": [ { @@ -54565,8 +54316,7 @@ "line_type": "raw_text" }, "page_id": 6, - "line_id": 70313, - "_LineMetadata__other_fields": {} + "line_id": 70313 }, "_annotations": [ { 
diff --git a/docs/source/_static/code_examples/dedoc_creating_dedoc_document.py b/docs/source/_static/code_examples/dedoc_creating_dedoc_document.py index b0069517..40a18c5d 100644 --- a/docs/source/_static/code_examples/dedoc_creating_dedoc_document.py +++ b/docs/source/_static/code_examples/dedoc_creating_dedoc_document.py @@ -7,7 +7,7 @@ hierarchy_level = HierarchyLevel(level_1=0, level_2=0, line_type="header", can_be_multiline=True) -metadata = LineMetadata(page_id=0, line_id=1, tag_hierarchy_level=None, hierarchy_level=hierarchy_level, other_fields=None) +metadata = LineMetadata(page_id=0, line_id=1, tag_hierarchy_level=None, hierarchy_level=hierarchy_level) annotations = [LinkedTextAnnotation(start=0, end=5, value="Now the line isn't so simple :)"), BoldAnnotation(start=7, end=10, value="True")] super_line = LineWithMeta(text, metadata=metadata, annotations=annotations) diff --git a/docs/source/_static/code_examples/dedoc_return_format.py b/docs/source/_static/code_examples/dedoc_return_format.py index 8d22432f..1d5ec0b7 100644 --- a/docs/source/_static/code_examples/dedoc_return_format.py +++ b/docs/source/_static/code_examples/dedoc_return_format.py @@ -55,6 +55,16 @@ def with_parsed_attachments_example() -> dict: return json.loads(result) +def article_example() -> dict: + with open("test_dir/article.pdf", "rb") as file: + files = {"file": ("article.pdf", file)} + r = requests.post("http://localhost:1231/upload", files=files, data=dict(document_type="article")) + result = r.content.decode("utf-8") + + assert r.status_code == 200 + return json.loads(result) + + if __name__ == "__main__": with open("../json_format_examples/basic_example.json", "w") as f: json.dump(basic_example(), f, indent=2, ensure_ascii=False) @@ -70,3 +80,6 @@ def with_parsed_attachments_example() -> dict: with open("../json_format_examples/with_parsed_attachments.json", "w") as f: json.dump(with_parsed_attachments_example(), f, indent=2, ensure_ascii=False) + + with 
open("../json_format_examples/article_example.json", "w") as f: + json.dump(article_example(), f, indent=2, ensure_ascii=False) diff --git a/docs/source/_static/code_examples/dedoc_usage_tutorial.py b/docs/source/_static/code_examples/dedoc_usage_tutorial.py index 671a5ee6..8af1c6b1 100644 --- a/docs/source/_static/code_examples/dedoc_usage_tutorial.py +++ b/docs/source/_static/code_examples/dedoc_usage_tutorial.py @@ -64,10 +64,10 @@ metadata_extractor = DocxMetadataExtractor() metadata_extractor.can_extract(file_path) # True document.metadata = metadata_extractor.extract(file_path) -document.metadata # {'file_name': 'example.docx', 'file_type': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'size': 373795, -# 'access_time': 1686825619, 'created_time': 1686825617, 'modified_time': 1686823541, 'other_fields': {'document_subject': '', 'keywords': '', -# 'category': '', 'comments': '', 'author': '', 'last_modified_by': '', 'created_date': 1568725611, 'modified_date': 1686752726, -# 'last_printed_date': None}} +document.metadata # {'file_name': 'example.docx', 'temporary_file_name': 'example.docx', +# 'file_type': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'size': 373839, 'access_time': 1713964145, +# 'created_time': 1713958120, 'modified_time': 1709111749, 'document_subject': '', 'keywords': '', 'category': '', 'comments': '', 'author': '', +# 'last_modified_by': 'python-docx', 'created_date': None, 'modified_date': 1714635406, 'last_printed_date': None} """Using attachments extractors""" diff --git a/docs/source/_static/code_examples/test_dir/article.pdf b/docs/source/_static/code_examples/test_dir/article.pdf new file mode 100644 index 00000000..6c74f192 Binary files /dev/null and b/docs/source/_static/code_examples/test_dir/article.pdf differ diff --git a/docs/source/_static/json_format_examples/article_example.json b/docs/source/_static/json_format_examples/article_example.json index 712c5841..41be6abf 
100644 --- a/docs/source/_static/json_format_examples/article_example.json +++ b/docs/source/_static/json_format_examples/article_example.json @@ -7,8 +7,7 @@ "metadata": { "paragraph_type": "root", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [ { @@ -18,8 +17,7 @@ "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [ { @@ -29,8 +27,7 @@ "metadata": { "paragraph_type": "author_first_name", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, @@ -41,8 +38,7 @@ "metadata": { "paragraph_type": "author_surname", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, @@ -53,8 +49,7 @@ "metadata": { "paragraph_type": "author_affiliation", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [ { @@ -64,20 +59,18 @@ "metadata": { "paragraph_type": "org_name", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { "node_id": "0.0.2.1", - "text": "\n45 rue dUlm\n75005\nParis\n", + "text": "45 rue dUlm, 75005, Paris", "annotations": [], "metadata": { "paragraph_type": "address", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] } @@ -90,8 +83,7 @@ "metadata": { "paragraph_type": "author_affiliation", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [ { @@ -101,20 +93,18 @@ "metadata": { "paragraph_type": "org_name", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { "node_id": "0.0.3.1", - "text": "\n4 Avenue des Louvresses\n92230\nGennevilliers\n", + "text": "4 Avenue des Louvresses, 92230, Gennevilliers", "annotations": [], "metadata": { "paragraph_type": "address", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] } @@ -129,8 +119,7 @@ "metadata": { 
"paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [ { @@ -140,8 +129,7 @@ "metadata": { "paragraph_type": "author_first_name", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, @@ -152,8 +140,7 @@ "metadata": { "paragraph_type": "author_surname", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, @@ -164,8 +151,7 @@ "metadata": { "paragraph_type": "author_affiliation", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [ { @@ -175,20 +161,18 @@ "metadata": { "paragraph_type": "org_name", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { "node_id": "0.1.2.1", - "text": "\nBelgium\n", + "text": "Belgium", "annotations": [], "metadata": { "paragraph_type": "address", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] } @@ -203,8 +187,7 @@ "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [ { @@ -214,8 +197,7 @@ "metadata": { "paragraph_type": "author_first_name", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, @@ -226,8 +208,7 @@ "metadata": { "paragraph_type": "author_surname", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, @@ -238,8 +219,7 @@ "metadata": { "paragraph_type": "author_affiliation", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [ { @@ -249,20 +229,18 @@ "metadata": { "paragraph_type": "org_name", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { "node_id": "0.2.2.1", - "text": "\nBelgium\n", + "text": "Belgium", "annotations": [], "metadata": { "paragraph_type": "address", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, 
"subparagraphs": [] } @@ -277,8 +255,7 @@ "metadata": { "paragraph_type": "abstract", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [ { @@ -288,8 +265,7 @@ "metadata": { "paragraph_type": "raw_text", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] } @@ -297,13 +273,12 @@ }, { "node_id": "0.4", - "text": "Introduction", + "text": "1 Introduction", "annotations": [], "metadata": { "paragraph_type": "section", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [ { @@ -313,207 +288,206 @@ { "start": 92, "end": 95, - "name": "bibliography_ref", - "value": "bac4e44c-f290-11ee-a6ed-b88584b4e4a1" + "name": "reference", + "value": "10981248-0872-11ef-b95c-0242ac120002" }, { "start": 95, "end": 98, - "name": "bibliography_ref", - "value": "bac4e4bb-f290-11ee-a6ed-b88584b4e4a1" + "name": "reference", + "value": "109a0b84-0872-11ef-b95c-0242ac120002" }, { "start": 201, "end": 205, - "name": "bibliography_ref", - "value": "bac4e4ab-f290-11ee-a6ed-b88584b4e4a1" + "name": "reference", + "value": "1099b8be-0872-11ef-b95c-0242ac120002" }, { "start": 205, "end": 208, - "name": "bibliography_ref", - "value": "bac4e551-f290-11ee-a6ed-b88584b4e4a1" + "name": "reference", + "value": "109c6370-0872-11ef-b95c-0242ac120002" }, { "start": 208, "end": 211, - "name": "bibliography_ref", - "value": "bac4e5cd-f290-11ee-a6ed-b88584b4e4a1" + "name": "reference", + "value": "109eab44-0872-11ef-b95c-0242ac120002" }, { "start": 211, "end": 214, - "name": "bibliography_ref", - "value": "bac4e5dd-f290-11ee-a6ed-b88584b4e4a1" + "name": "reference", + "value": "109ef5e0-0872-11ef-b95c-0242ac120002" }, { "start": 846, "end": 850, - "name": "bibliography_ref", - "value": "bac4e584-f290-11ee-a6ed-b88584b4e4a1" + "name": "reference", + "value": "109d578a-0872-11ef-b95c-0242ac120002" }, { "start": 850, "end": 853, - "name": "bibliography_ref", - "value": "bac4e602-f290-11ee-a6ed-b88584b4e4a1" + 
"name": "reference", + "value": "109f92d4-0872-11ef-b95c-0242ac120002" }, { "start": 942, "end": 946, - "name": "bibliography_ref", - "value": "bac4e516-f290-11ee-a6ed-b88584b4e4a1" + "name": "reference", + "value": "109b8ad6-0872-11ef-b95c-0242ac120002" }, { "start": 1055, "end": 1059, - "name": "bibliography_ref", - "value": "bac4e4c5-f290-11ee-a6ed-b88584b4e4a1" + "name": "reference", + "value": "109a34ec-0872-11ef-b95c-0242ac120002" }, { "start": 1550, "end": 1554, - "name": "bibliography_ref", - "value": "bac4e501-f290-11ee-a6ed-b88584b4e4a1" + "name": "reference", + "value": "109b29c4-0872-11ef-b95c-0242ac120002" }, { "start": 1619, "end": 1623, - "name": "bibliography_ref", - "value": "bac4e480-f290-11ee-a6ed-b88584b4e4a1" + "name": "reference", + "value": "10990112-0872-11ef-b95c-0242ac120002" }, { "start": 1623, "end": 1626, - "name": "bibliography_ref", - "value": "bac4e49b-f290-11ee-a6ed-b88584b4e4a1" + "name": "reference", + "value": "10997246-0872-11ef-b95c-0242ac120002" }, { "start": 1683, "end": 1686, - "name": "bibliography_ref", - "value": "bac4e49b-f290-11ee-a6ed-b88584b4e4a1" + "name": "reference", + "value": "10997246-0872-11ef-b95c-0242ac120002" }, { "start": 1626, "end": 1629, - "name": "bibliography_ref", - "value": "bac4e571-f290-11ee-a6ed-b88584b4e4a1" + "name": "reference", + "value": "109cf646-0872-11ef-b95c-0242ac120002" }, { "start": 1629, "end": 1632, - "name": "bibliography_ref", - "value": "bac4e5ec-f290-11ee-a6ed-b88584b4e4a1" + "name": "reference", + "value": "109f2bbe-0872-11ef-b95c-0242ac120002" }, { "start": 1929, "end": 1933, - "name": "bibliography_ref", - "value": "bac4e5ec-f290-11ee-a6ed-b88584b4e4a1" + "name": "reference", + "value": "109f2bbe-0872-11ef-b95c-0242ac120002" }, { "start": 1632, "end": 1635, - "name": "bibliography_ref", - "value": "bac4e5f6-f290-11ee-a6ed-b88584b4e4a1" + "name": "reference", + "value": "109f6200-0872-11ef-b95c-0242ac120002" }, { "start": 1689, "end": 1692, - "name": "bibliography_ref", - 
"value": "bac4e5f6-f290-11ee-a6ed-b88584b4e4a1" + "name": "reference", + "value": "109f6200-0872-11ef-b95c-0242ac120002" }, { "start": 1635, "end": 1638, - "name": "bibliography_ref", - "value": "bac4e634-f290-11ee-a6ed-b88584b4e4a1" + "name": "reference", + "value": "10a07d98-0872-11ef-b95c-0242ac120002" }, { "start": 1692, "end": 1695, - "name": "bibliography_ref", - "value": "bac4e634-f290-11ee-a6ed-b88584b4e4a1" + "name": "reference", + "value": "10a07d98-0872-11ef-b95c-0242ac120002" }, { "start": 1638, "end": 1641, - "name": "bibliography_ref", - "value": "bac4e63d-f290-11ee-a6ed-b88584b4e4a1" + "name": "reference", + "value": "10a0afc0-0872-11ef-b95c-0242ac120002" }, { "start": 1677, "end": 1680, - "name": "bibliography_ref", - "value": "bac4e42a-f290-11ee-a6ed-b88584b4e4a1" + "name": "reference", + "value": "1097209a-0872-11ef-b95c-0242ac120002" }, { "start": 1680, "end": 1683, - "name": "bibliography_ref", - "value": "bac4e46d-f290-11ee-a6ed-b88584b4e4a1" + "name": "reference", + "value": "10989a06-0872-11ef-b95c-0242ac120002" }, { "start": 1686, "end": 1689, - "name": "bibliography_ref", - "value": "bac4e539-f290-11ee-a6ed-b88584b4e4a1" + "name": "reference", + "value": "109c08ee-0872-11ef-b95c-0242ac120002" }, { "start": 3412, "end": 3416, - "name": "bibliography_ref", - "value": "bac4e539-f290-11ee-a6ed-b88584b4e4a1" + "name": "reference", + "value": "109c08ee-0872-11ef-b95c-0242ac120002" }, { "start": 4544, "end": 4548, - "name": "bibliography_ref", - "value": "bac4e539-f290-11ee-a6ed-b88584b4e4a1" + "name": "reference", + "value": "109c08ee-0872-11ef-b95c-0242ac120002" }, { "start": 5206, "end": 5210, - "name": "bibliography_ref", - "value": "bac4e539-f290-11ee-a6ed-b88584b4e4a1" + "name": "reference", + "value": "109c08ee-0872-11ef-b95c-0242ac120002" }, { "start": 6249, "end": 6253, - "name": "bibliography_ref", - "value": "bac4e539-f290-11ee-a6ed-b88584b4e4a1" + "name": "reference", + "value": "109c08ee-0872-11ef-b95c-0242ac120002" }, { "start": 
2381, "end": 2385, - "name": "bibliography_ref", - "value": "bac4e499-f290-11ee-a6ed-b88584b4e4a1" + "name": "reference", + "value": "10996f12-0872-11ef-b95c-0242ac120002" }, { "start": 2405, "end": 2408, - "name": "bibliography_ref", - "value": "bac4e461-f290-11ee-a6ed-b88584b4e4a1" + "name": "reference", + "value": "1098687e-0872-11ef-b95c-0242ac120002" }, { "start": 2640, "end": 2643, - "name": "bibliography_ref", - "value": "bac4e43e-f290-11ee-a6ed-b88584b4e4a1" + "name": "reference", + "value": "1097c3a6-0872-11ef-b95c-0242ac120002" }, { "start": 3306, "end": 3310, - "name": "bibliography_ref", - "value": "bac4e4b2-f290-11ee-a6ed-b88584b4e4a1" + "name": "reference", + "value": "1099dbe6-0872-11ef-b95c-0242ac120002" } ], "metadata": { "paragraph_type": "raw_text", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] } @@ -521,61 +495,59 @@ }, { "node_id": "0.5", - "text": "Methodology & limitations", + "text": "2 Methodology & limitations", "annotations": [], "metadata": { "paragraph_type": "section", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [ { "node_id": "0.5.0", - "text": "The main goal of this paper is to provide sound techniques to evaluate how leakage-resilient PRGs/PRFs and masking combine. In this section, we provide a brief description of the methodology we will use for this purpose, and underline its limitations. The two main components, namely performance and security evaluations, are detailed in Sections 3 and 4, and then combined in Section 5. Our proposal essentially holds in five steps that we detail below.1. Fix the target security level. In the following, we will take the AES Rijndael with 128-bit key as case study. Since a small security degradation due to side-channel attacks is unavoidable, we will consider 120-bit, 100-bit and 80-bit target security levels for illustration. 
We do not go below 80-bit keys since it typically corresponds to current short-term security levels [9].2. Choose an implementation. Given a cryptographic algorithm, this essentially corresponds to the selection of a technology and possibly a set of countermeasures to incorporate in the designs to evaluate. In the following, we will consider both software and hardware implementations for illustration, since they lead to significantly different performance and security levels. As for countermeasures, different types of masking schemes will be considered.3. Evaluate performances / extract a cost function. Given an implementation, different metrics can be selected for this purpose (such as code size, RAM, or cycle count in software and area, frequency, throughput or power consumption in hardware). Both for software and hardware implementations, we will use combined functions, namely the \"code size × cycle count\" product and the \"area / throughput\" ratio. While our methodology would be perfectly applicable to other choices of metrics, we believe they are an interesting starting point to capture the efficiency of our different implementations. In particular for the hardware cases, such metrics are less dependent on the serial vs. parallel nature of the target architectures (see [26], Section 2).4. Evaluate security / extract the maximum number of measurements. This central part of our analysis first requires to select the attacks from which we will evaluate security. In the following, we will consider the \"standard DPA attacks\" described in [31] for this purpose. Furthermore, we will investigate them in the profiled setting of template attacks (i.e. assuming that the adversary can build a precise model for the leakage function) [6]. This choice is motivated by the goal of approaching worst-case evaluations [56]. Based on these attacks, we will estimate the security graphs introduced in [61], i.e. 
compute the adversaries' success rates in function of their time complexity and number of measurements. From a given security level (e.g. 120-bit time complexity), we will finally extract the maximum number of measurements per key tolerated, as can be bounded by the PRG construction1 .", + "text": "The main goal of this paper is to provide sound techniques to evaluate how leakage-resilient PRGs/PRFs and masking combine. In this section, we provide a brief description of the methodology we will use for this purpose, and underline its limitations. The two main components, namely performance and security evaluations, are detailed in Sections 3 and 4, and then combined in Section 5. Our proposal essentially holds in five steps that we detail below.\n1. Fix the target security level. In the following, we will take the AES Rijndael with 128-bit key as case study. Since a small security degradation due to side-channel attacks is unavoidable, we will consider 120-bit, 100-bit and 80-bit target security levels for illustration. We do not go below 80-bit keys since it typically corresponds to current short-term security levels [9].2. Choose an implementation. Given a cryptographic algorithm, this essentially corresponds to the selection of a technology and possibly a set of countermeasures to incorporate in the designs to evaluate. In the following, we will consider both software and hardware implementations for illustration, since they lead to significantly different performance and security levels. As for countermeasures, different types of masking schemes will be considered.\n3. Evaluate performances / extract a cost function. Given an implementation, different metrics can be selected for this purpose (such as code size, RAM, or cycle count in software and area, frequency, throughput or power consumption in hardware). 
Both for software and hardware implementations, we will use combined functions, namely the \"code size × cycle count\" product and the \"area / throughput\" ratio. While our methodology would be perfectly applicable to other choices of metrics, we believe they are an interesting starting point to capture the efficiency of our different implementations. In particular for the hardware cases, such metrics are less dependent on the serial vs. parallel nature of the target architectures (see [26], Section 2).4. Evaluate security / extract the maximum number of measurements. This central part of our analysis first requires to select the attacks from which we will evaluate security. In the following, we will consider the \"standard DPA attacks\" described in [31] for this purpose. Furthermore, we will investigate them in the profiled setting of template attacks (i.e. assuming that the adversary can build a precise model for the leakage function) [6]. This choice is motivated by the goal of approaching worst-case evaluations [56]. Based on these attacks, we will estimate the security graphs introduced in [61], i.e. compute the adversaries' success rates in function of their time complexity and number of measurements. From a given security level (e.g. 
120-bit time complexity), we will finally extract the maximum number of measurements per key tolerated, as can be bounded by the PRG construction1 .", "annotations": [ { - "start": 833, - "end": 836, - "name": "bibliography_ref", - "value": "bac4e46b-f290-11ee-a6ed-b88584b4e4a1" + "start": 834, + "end": 837, + "name": "reference", + "value": "10988c82-0872-11ef-b95c-0242ac120002" }, { - "start": 2027, - "end": 2031, - "name": "bibliography_ref", - "value": "bac4e4f7-f290-11ee-a6ed-b88584b4e4a1" + "start": 2029, + "end": 2033, + "name": "reference", + "value": "109b1452-0872-11ef-b95c-0242ac120002" }, { - "start": 2295, - "end": 2299, - "name": "bibliography_ref", - "value": "bac4e51d-f290-11ee-a6ed-b88584b4e4a1" + "start": 2297, + "end": 2301, + "name": "reference", + "value": "109baa66-0872-11ef-b95c-0242ac120002" }, { - "start": 2486, - "end": 2489, - "name": "bibliography_ref", - "value": "bac4e456-f290-11ee-a6ed-b88584b4e4a1" + "start": 2488, + "end": 2491, + "name": "reference", + "value": "10982954-0872-11ef-b95c-0242ac120002" }, { - "start": 2566, - "end": 2570, - "name": "bibliography_ref", - "value": "bac4e5e6-f290-11ee-a6ed-b88584b4e4a1" + "start": 2568, + "end": 2572, + "name": "reference", + "value": "109f1e26-0872-11ef-b95c-0242ac120002" }, { - "start": 2647, - "end": 2651, - "name": "bibliography_ref", - "value": "bac4e61b-f290-11ee-a6ed-b88584b4e4a1" + "start": 2649, + "end": 2653, + "name": "reference", + "value": "109fee1e-0872-11ef-b95c-0242ac120002" } ], "metadata": { "paragraph_type": "raw_text", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] } @@ -588,19 +560,17 @@ "metadata": { "paragraph_type": "section", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [ { "node_id": "0.6.0", - "text": "Compute a global cost metric (possibly with an application constraint). 
In case of security-bounded implementations, the previous security evaluation can be used to estimate how frequently one has to \"re-key\" within a leakageresilient construction. From this estimate, we derive the average number of AES encryptions to execute per 128-bit output. By multiplying this number with the cost function of our performance evaluations, we obtain a global metric for the implementation of an AES-based design ensuring a given security level. In case of security-unbounded implementations, re-keying is not sufficient to maintain the target security level independent of the number of measurements performed by the adversary. So the cost functions have to be combined with an application constraint, stating the maximum number of measurements that can be tolerated to maintain this security level.Quite naturally, such a methodology is limited in the same way as any performance and security evaluation. From the performance point-of-view, our investigations only apply to a representative subset of the (large) set of AES designs published in the literature. Because of place constraints, we first paid attention to state-of-the-art implementations and countermeasures, but applying our methodology to more examples is naturally feasible (and desirable). A very similar statement holds for security evaluations. Namely, we considered standard DPA attacks as a starting point, and because they typically correspond to the state-of-the-art in research and evaluation laboratories. Yet, cryptanalytic progresses can always appear2 . Besides, countermeasures such as masking may rely on physical assumptions that are difficult to compare rigorously (since highly technology-dependent), as will be detailed next with the case of \"glitches\".Note that these limitations are to a large extent inherent to the problem we tackle, and our results also correspond to the best we can hope in this respect. 
Hence, more than the practical conclusions that we draw in the following sections (that are of course important for current engineers willing to implement physically secure designs), it is the fact that we are able to compare the performance vs. security tradeoffs corresponding to the combination of leakage-resilient constructions with masking that is the most important contribution of this work. Indeed, these comparisons are dependent on the state-of-the-art implementations and attacks that are considered to be relevant for the selected algorithm.", + "text": "Compute a global cost metric (possibly with an application constraint). In case of security-bounded implementations, the previous security evaluation can be used to estimate how frequently one has to \"re-key\" within a leakageresilient construction. From this estimate, we derive the average number of AES encryptions to execute per 128-bit output. By multiplying this number with the cost function of our performance evaluations, we obtain a global metric for the implementation of an AES-based design ensuring a given security level. In case of security-unbounded implementations, re-keying is not sufficient to maintain the target security level independent of the number of measurements performed by the adversary. So the cost functions have to be combined with an application constraint, stating the maximum number of measurements that can be tolerated to maintain this security level.\nQuite naturally, such a methodology is limited in the same way as any performance and security evaluation. From the performance point-of-view, our investigations only apply to a representative subset of the (large) set of AES designs published in the literature. Because of place constraints, we first paid attention to state-of-the-art implementations and countermeasures, but applying our methodology to more examples is naturally feasible (and desirable). A very similar statement holds for security evaluations. 
Namely, we considered standard DPA attacks as a starting point, and because they typically correspond to the state-of-the-art in research and evaluation laboratories. Yet, cryptanalytic progresses can always appear2 . Besides, countermeasures such as masking may rely on physical assumptions that are difficult to compare rigorously (since highly technology-dependent), as will be detailed next with the case of \"glitches\".Note that these limitations are to a large extent inherent to the problem we tackle, and our results also correspond to the best we can hope in this respect. Hence, more than the practical conclusions that we draw in the following sections (that are of course important for current engineers willing to implement physically secure designs), it is the fact that we are able to compare the performance vs. security tradeoffs corresponding to the combination of leakage-resilient constructions with masking that is the most important contribution of this work. Indeed, these comparisons are dependent on the state-of-the-art implementations and attacks that are considered to be relevant for the selected algorithm.\n", "annotations": [], "metadata": { "paragraph_type": "raw_text", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] } @@ -608,103 +578,101 @@ }, { "node_id": "0.7", - "text": "Performance evaluations", + "text": "3 Performance evaluations", "annotations": [], "metadata": { "paragraph_type": "section", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [ { "node_id": "0.7.0", - "text": "In this section, we provide our performance evaluations for unprotected and masked AES designs. As previously mentioned, we will consider both software and hardware examples for this purpose. In this context, the main challenge is to find implementations that are (reasonably) comparable. 
This turned out to be relatively easy in the software case, for which we selected a couple of implementations in 8-bit microcontrollers, i.e. typical targets for side-channel analysis. By contrast, finding implementations in the same technology turns out to be more challenging in hardware: transistor sizes have evolved from (more than) 130µm to (less than) 65ηm over the last 15 years (i.e. the period over which most countermeasures against side-channel attacks have been proposed). Hence, published performance evaluations for side-channel protected designs are rarely comparable. Yet, we could find several designs in a recent FPGA technology, namely the Xilinx Virtex-5 devices (that are based on a 65ηm process).The performances of the implementations we will analyze are summarized in Table 1. As previously mentioned, our software cost function is the frequently considered \"code size × cycle count\" metric, while we use the \"area / throughput\" ratio in the hardware (FPGA) case. As for the countermeasures evaluated, we first focused on the higher-order masking scheme proposed by Rivain and Prouff at CHES 2010, which can be considered as the state-of-the-art in software [53]. We then added the CHES 2011 polynomial masking scheme of Prouff and Roche [45] (and its implementation in [20]), as a typical example of \"glitchresistant\" solution relying on secret sharing and multiparty computation (see the discussion in the next paragraph). A similar variety of countermeasures is proposed in hardware, where we also consider an efficient but glitch-sensitive implementation proposed in [48], and a threshold AES implementation that is one of the most promising solutions to deal with glitches in this case [36]. Note that this latter implementation is based on an 8-bit architecture (rather than a 128-bit one for the others). 
So although our cost function is aimed at making comparisons between different architectures more reflective of the algorithms' and countermeasures' performances, more serial implementations as this one generally pay a small overhead due to their more complex control logic.Physical assumptions and glitches. As explicit in Table 1, countermeasures against side-channel attacks always rely on a number of physical assumptions.In the case of masking, a central one is that the leakage of the shares manipulated by the target implementation should be independent of each other [22]. Glitches, that are transient signals appearing during the computations in certain (e.g. CMOS) implementations, are a typical physical default that can cause this assumption to fail, as first put forward by Mangard et al. in [32]. There are two possible solutions to deal with such physical defaults: either by making explicit to cryptographic engineers that they have to prevent glitches at the physical level, or by designing countermeasures that can cope with glitches.Interestingly, the first solution is one aspect where hardware and software implementations significantly differ. Namely, while it is usually possible to ensure independent leakages in masked software, by ensuring a sufficient time separation between the manipulation of the shares, it is extremely difficult to avoid glitches in hardware [33]. Yet, even in hardware it is generally expected that the \"glitch signal\" will be more difficult to exploit by adversaries, especially if designers pay attention to this issue [35]. In this context, the main question is to determine the amplitude of this signal: if sufficiently reduced in front of the measurement noise, it may turn out that a glitch-sensitive masked implementation leads to improved security levels (compared to an unprotected one). 
Since this amplitude is highly technology-dependent, we will use it as a parameter to analyze the security of our hardware implementations in the next sections. Yet, we recall that it is a safe practice to focus on glitch-resistant implementations when it comes to hardware. Besides, we note that glitches are not the only physical default that may cause the independent leakage assumption to be contradicted in practice [42,51].", + "text": "In this section, we provide our performance evaluations for unprotected and masked AES designs. As previously mentioned, we will consider both software and hardware examples for this purpose. In this context, the main challenge is to find implementations that are (reasonably) comparable. This turned out to be relatively easy in the software case, for which we selected a couple of implementations in 8-bit microcontrollers, i.e. typical targets for side-channel analysis. By contrast, finding implementations in the same technology turns out to be more challenging in hardware: transistor sizes have evolved from (more than) 130µm to (less than) 65ηm over the last 15 years (i.e. the period over which most countermeasures against side-channel attacks have been proposed). Hence, published performance evaluations for side-channel protected designs are rarely comparable. Yet, we could find several designs in a recent FPGA technology, namely the Xilinx Virtex-5 devices (that are based on a 65ηm process).\nThe performances of the implementations we will analyze are summarized in Table 1. As previously mentioned, our software cost function is the frequently considered \"code size × cycle count\" metric, while we use the \"area / throughput\" ratio in the hardware (FPGA) case. As for the countermeasures evaluated, we first focused on the higher-order masking scheme proposed by Rivain and Prouff at CHES 2010, which can be considered as the state-of-the-art in software [53]. 
We then added the CHES 2011 polynomial masking scheme of Prouff and Roche [45] (and its implementation in [20]), as a typical example of \"glitchresistant\" solution relying on secret sharing and multiparty computation (see the discussion in the next paragraph). A similar variety of countermeasures is proposed in hardware, where we also consider an efficient but glitch-sensitive implementation proposed in [48], and a threshold AES implementation that is one of the most promising solutions to deal with glitches in this case [36]. Note that this latter implementation is based on an 8-bit architecture (rather than a 128-bit one for the others). So although our cost function is aimed at making comparisons between different architectures more reflective of the algorithms' and countermeasures' performances, more serial implementations as this one generally pay a small overhead due to their more complex control logic.Physical assumptions and glitches. As explicit in Table 1, countermeasures against side-channel attacks always rely on a number of physical assumptions.In the case of masking, a central one is that the leakage of the shares manipulated by the target implementation should be independent of each other [22]. Glitches, that are transient signals appearing during the computations in certain (e.g. CMOS) implementations, are a typical physical default that can cause this assumption to fail, as first put forward by Mangard et al. in [32]. There are two possible solutions to deal with such physical defaults: either by making explicit to cryptographic engineers that they have to prevent glitches at the physical level, or by designing countermeasures that can cope with glitches.Interestingly, the first solution is one aspect where hardware and software implementations significantly differ. 
Namely, while it is usually possible to ensure independent leakages in masked software, by ensuring a sufficient time separation between the manipulation of the shares, it is extremely difficult to avoid glitches in hardware [33]. Yet, even in hardware it is generally expected that the \"glitch signal\" will be more difficult to exploit by adversaries, especially if designers pay attention to this issue [35]. In this context, the main question is to determine the amplitude of this signal: if sufficiently reduced in front of the measurement noise, it may turn out that a glitch-sensitive masked implementation leads to improved security levels (compared to an unprotected one). Since this amplitude is highly technology-dependent, we will use it as a parameter to analyze the security of our hardware implementations in the next sections. Yet, we recall that it is a safe practice to focus on glitch-resistant implementations when it comes to hardware. Besides, we note that glitches are not the only physical default that may cause the independent leakage assumption to be contradicted in practice [42,51].", "annotations": [ { - "start": 1088, - "end": 1089, + "start": 1089, + "end": 1090, "name": "table", - "value": "d2ce350a-25be-4d05-9061-6f1d4cf8bdd1" + "value": "1c9f98e6-e1f8-49f3-8bf7-24022f2d1939" }, { - "start": 2456, - "end": 2457, + "start": 2457, + "end": 2458, "name": "table", - "value": "d2ce350a-25be-4d05-9061-6f1d4cf8bdd1" + "value": "1c9f98e6-e1f8-49f3-8bf7-24022f2d1939" }, { - "start": 1472, - "end": 1476, - "name": "bibliography_ref", - "value": "bac4e5cd-f290-11ee-a6ed-b88584b4e4a1" + "start": 1473, + "end": 1477, + "name": "reference", + "value": "109eab44-0872-11ef-b95c-0242ac120002" }, { - "start": 1552, - "end": 1556, - "name": "bibliography_ref", - "value": "bac4e58b-f290-11ee-a6ed-b88584b4e4a1" + "start": 1553, + "end": 1557, + "name": "reference", + "value": "109d73e6-0872-11ef-b95c-0242ac120002" }, { - "start": 1584, - "end": 1588, - "name": 
"bibliography_ref", - "value": "bac4e4c5-f290-11ee-a6ed-b88584b4e4a1" + "start": 1585, + "end": 1589, + "name": "reference", + "value": "109a34ec-0872-11ef-b95c-0242ac120002" }, { - "start": 1885, - "end": 1889, - "name": "bibliography_ref", - "value": "bac4e5a0-f290-11ee-a6ed-b88584b4e4a1" + "start": 1886, + "end": 1890, + "name": "reference", + "value": "109df136-0872-11ef-b95c-0242ac120002" }, { - "start": 2005, - "end": 2009, - "name": "bibliography_ref", - "value": "bac4e549-f290-11ee-a6ed-b88584b4e4a1" + "start": 2006, + "end": 2010, + "name": "reference", + "value": "109c4dea-0872-11ef-b95c-0242ac120002" }, { - "start": 2701, - "end": 2705, - "name": "bibliography_ref", - "value": "bac4e4d6-f290-11ee-a6ed-b88584b4e4a1" + "start": 2702, + "end": 2706, + "name": "reference", + "value": "109a7a92-0872-11ef-b95c-0242ac120002" }, { - "start": 2931, - "end": 2935, - "name": "bibliography_ref", - "value": "bac4e526-f290-11ee-a6ed-b88584b4e4a1" + "start": 2932, + "end": 2936, + "name": "reference", + "value": "109bcd5c-0872-11ef-b95c-0242ac120002" }, { - "start": 3517, - "end": 3521, - "name": "bibliography_ref", - "value": "bac4e531-f290-11ee-a6ed-b88584b4e4a1" + "start": 3518, + "end": 3522, + "name": "reference", + "value": "109bf85e-0872-11ef-b95c-0242ac120002" }, { - "start": 3697, - "end": 3701, - "name": "bibliography_ref", - "value": "bac4e541-f290-11ee-a6ed-b88584b4e4a1" + "start": 3698, + "end": 3702, + "name": "reference", + "value": "109c2608-0872-11ef-b95c-0242ac120002" }, { - "start": 4394, - "end": 4398, - "name": "bibliography_ref", - "value": "bac4e575-f290-11ee-a6ed-b88584b4e4a1" + "start": 4395, + "end": 4399, + "name": "reference", + "value": "109d0ba4-0872-11ef-b95c-0242ac120002" }, { - "start": 4398, - "end": 4401, - "name": "bibliography_ref", - "value": "bac4e5bc-f290-11ee-a6ed-b88584b4e4a1" + "start": 4399, + "end": 4402, + "name": "reference", + "value": "109e67a6-0872-11ef-b95c-0242ac120002" } ], "metadata": { "paragraph_type": "raw_text", 
"page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] } @@ -712,13 +680,12 @@ }, { "node_id": "0.8", - "text": "Security evaluations", + "text": "4 Security evaluations", "annotations": [], "metadata": { "paragraph_type": "section", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [ { @@ -728,7219 +695,6481 @@ { "start": 520, "end": 523, - "name": "bibliography_ref", - "value": "bac4e463-f290-11ee-a6ed-b88584b4e4a1" + "name": "reference", + "value": "10986b4e-0872-11ef-b95c-0242ac120002" }, { "start": 818, "end": 822, - "name": "bibliography_ref", - "value": "bac4e505-f290-11ee-a6ed-b88584b4e4a1" + "name": "reference", + "value": "109b3b6c-0872-11ef-b95c-0242ac120002" }, { "start": 1045, "end": 1048, - "name": "bibliography_ref", - "value": "bac4e44c-f290-11ee-a6ed-b88584b4e4a1" + "name": "reference", + "value": "10981248-0872-11ef-b95c-0242ac120002" }, { "start": 2048, "end": 2052, - "name": "bibliography_ref", - "value": "bac4e4cd-f290-11ee-a6ed-b88584b4e4a1" + "name": "reference", + "value": "109a4482-0872-11ef-b95c-0242ac120002" }, { "start": 2520, "end": 2524, - "name": "bibliography_ref", - "value": "bac4e516-f290-11ee-a6ed-b88584b4e4a1" + "name": "reference", + "value": "109b8ad6-0872-11ef-b95c-0242ac120002" }, { "start": 2604, "end": 2608, - "name": "bibliography_ref", - "value": "bac4e5e6-f290-11ee-a6ed-b88584b4e4a1" + "name": "reference", + "value": "109f1e26-0872-11ef-b95c-0242ac120002" }, { "start": 3482, "end": 3486, - "name": "bibliography_ref", - "value": "bac4e602-f290-11ee-a6ed-b88584b4e4a1" + "name": "reference", + "value": "109f92d4-0872-11ef-b95c-0242ac120002" }, { "start": 3864, "end": 3868, - "name": "bibliography_ref", - "value": "bac4e4a3-f290-11ee-a6ed-b88584b4e4a1" + "name": "reference", + "value": "10999406-0872-11ef-b95c-0242ac120002" }, { "start": 3868, "end": 3871, - "name": "bibliography_ref", - "value": "bac4e5c4-f290-11ee-a6ed-b88584b4e4a1" + "name": 
"reference", + "value": "109e782c-0872-11ef-b95c-0242ac120002" } ], "metadata": { "paragraph_type": "raw_text", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] - } - ] - }, - { - "node_id": "0.9", - "text": "Evaluation setups", - "annotations": [], - "metadata": { - "paragraph_type": "section", - "page_id": 0, - "line_id": 0, - "other_fields": {} - }, - "subparagraphs": [ + }, { - "node_id": "0.9.0", - "text": "We will consider two types of setups in our evaluations: one for software, one for hardware. As illustrated in Figure 3 in the case of a Boolean-masked S-box implementation with two shares, the main difference is that the software performs all the operations sequentially, while the hardware performs them in parallel. We will further assume that the leakage of parallel operations is summed [40]. As previously mentioned, we will illustrate our analyses with a Hamming weight leakage function. Additionally, we will consider a noise variance of 10, corresponding to a Signal-to-Noise Ratio of 0.2 (as defined in [29]) 3 . This is a typical value, both for software implementations [11] and FPGA measurement boards [25].Let us denote the AES S-box as S, a byte of plaintext and key as x i and k i (respectively), the random shares used in masking as r j i (before the S-box) and m j i (after the S-box), the Hamming weight function as HW, the bitwise XOR as ⊕, the field multiplication used in polynomial masking as ⊗, and Gaussiandistributed noise random variables N j i . From these notations, we can specify the list of all our target implementations as summarized in Table 2.A couple of observations are worth being underlined as we now discuss.First, and as already mentioned, the main difference between software and hardware implementations is the number of exploitable leakage samples: there is a single such sample per plaintext in hardware while there are 16×(N m +1) ones in software (with N m the number of masks). 
Next, we only considered glitches in hardware (since it is generally possible to ensure independent leakage in software, by ensuring a sufficient time separation between the manipulation of the shares). We assumed that \"first-order glitches\" can appear in our Boolean-masked FPGA implementation, and modeled the impact of the mask as an additive binomial noise in this case. We further assumed that the amplitude of this first-order signal was reduced according to a factor f . This factor corresponds to the parameter used to quantify the amplitude of the glitches mentioned in the previous section. Note that this modeling is sound because the complexity of a first-order DPA only depends on the value of its SNR (which is equivalent to correlation and information theoretic metrics in this case, as proven in [31]). So even leakage functions deviating from the Hamming weight abstraction would lead to similar trends. Since the threshold implementation in [36] guarantees the absence of firstorder glitches, we only analyzed the possibility of second-order glitches for this one, and modeled them in the same way as just described (i.e. by considering the second mask M 2 i as an additive binomial noise, and reducing the amplitude of the second-order signal by a factor f ). Third, the chosen-plaintext construction of [34] is only applicable in hardware. Furthermore, we only evaluated its impact for the unprotected implementation, and the 1-mask Boolean one with glitches. As will become clear in the next section, this is because the data complexity bound to 256 (that is the maximum tolerated by design in this case) is only relevant when successful side-channel attacks occur for such small complexities (which was only observed for implementations with first-order signal).For convenience, we denoted each implementation in our experiments with three letters. The first one corresponds to the type of scenario considered, i.e. with Known (K) or carefully Chosen (C) plaintexts. 
The second one indicates [20,45]2nd-order KP whether we are in a Software (S) or Hardware (H) case study. The third one corresponds to the type of countermeasure selected, i.e. Unprotected (U), 1-or 2-mask Boolean (B 1 , B 2 ), 1-mask Polynomial (P 1 ) and 2-mask threshold (T 2 ). The additional star signals finally reflect the presence of (first-order or secondorder) glitches. For example, KHB * 1 is an AES design protected with a 1-mask Boolean scheme, implemented in an imperfect hardware leading to first-order glitches, and analyzed in the context of known (uniform) plaintexts.", - "annotations": [ - { - "start": 392, - "end": 396, - "name": "bibliography_ref", - "value": "bac4e568-f290-11ee-a6ed-b88584b4e4a1" - }, - { - "start": 613, - "end": 617, - "name": "bibliography_ref", - "value": "bac4e50e-f290-11ee-a6ed-b88584b4e4a1" - }, - { - "start": 682, - "end": 686, - "name": "bibliography_ref", - "value": "bac4e476-f290-11ee-a6ed-b88584b4e4a1" - }, - { - "start": 715, - "end": 719, - "name": "bibliography_ref", - "value": "bac4e4ee-f290-11ee-a6ed-b88584b4e4a1" - }, - { - "start": 2339, - "end": 2343, - "name": "bibliography_ref", - "value": "bac4e51d-f290-11ee-a6ed-b88584b4e4a1" - }, - { - "start": 2486, - "end": 2490, - "name": "bibliography_ref", - "value": "bac4e549-f290-11ee-a6ed-b88584b4e4a1" - }, - { - "start": 2850, - "end": 2854, - "name": "bibliography_ref", - "value": "bac4e539-f290-11ee-a6ed-b88584b4e4a1" - }, - { - "start": 3542, - "end": 3546, - "name": "bibliography_ref", - "value": "bac4e4c5-f290-11ee-a6ed-b88584b4e4a1" - }, - { - "start": 3546, - "end": 3548, - "name": "bibliography_ref", - "value": "bac4e58b-f290-11ee-a6ed-b88584b4e4a1" - }, - { - "start": 1177, - "end": 1178, - "name": "table", - "value": "6e093372-d147-4245-8aab-08ed5fe5c072" - } - ], + "node_id": "0.8.1", + "text": "4.1 Evaluation setups", + "annotations": [], "metadata": { - "paragraph_type": "raw_text", + "paragraph_type": "section", "page_id": 0, - "line_id": 0, - 
"other_fields": {} + "line_id": 0 }, - "subparagraphs": [] - } - ] - }, - { - "node_id": "0.10", - "text": "Template attacks and security graphs", - "annotations": [], - "metadata": { - "paragraph_type": "section", - "page_id": 0, - "line_id": 0, - "other_fields": {} - }, - "subparagraphs": [ - { - "node_id": "0.10.0", - "text": "Given the leakage functions defined in Table 2, a template attack first requires to build a leakage model. In the following, and for each byte of the AES master key, we will consider Gaussian templates for unprotected implementations, and Gaussian for masked implementations. Let us denote the probability density function of a Gaussian distribution taken on input z, with mean µ (resp. mean vector µ) and variance σ 2 (resp. covariance matrix Σ) as N (z|µ, σ 2 ) (resp. N (z|µ, Σ)). This notation directly leads to models of the form:Prfor (software and hardware) unprotected implementations and:Prfor (software and hardware) masked implementations with two shares. The formula naturally extends to more shares, by just adding more sums over the masks. Note that in these models, all the noise (including the algorithmic one in hardware implementations) is captured by the Gaussian distribution 4 . Given these models, the template adversary will accumulate information on the key bytes k i , by computing products of probabilities corresponding to multiple plaintexts. Doing so and for each key byte, he will produce lists of 256 probabilities corresponding each possible candidate ki , defined as follows:i ],with the leakage vector L (j) respectively corresponding to l (j) i (resp. l (j) ) in the context of Equ. 1 (resp. Equ. 2) and l 1,(j) i , l 2,(j) i (resp. l (j) ) in the context of Equ. 3 (resp. Equ. 4) The number of measurements is given by q in Equ. 5. Next and for each target implementation, we will repeat 100 experiments. 
And for each value of q in these experiments, use a rank estimation algorithm to evaluate the time complexity needed to recover the full AES master key [61]. Eventually, we will build \"security graphs\" where the attack probability of success is provided in function of a time complexity and a number of measurements.Iterative DPA against constructions with carefully chosen plaintexts. Note that while standard DPA attacks are adequate to analyze the security of unprotected and masked implementations in a known-plaintext scenario, their divide-and-conquer strategy hardly applies to the PRF in [34], with carefullychosen plaintexts leading to key-dependent algorithmic noise. This is because the (maximum 256) constants c j used in this proposal are such that all 16 bytes are always identical. Hence, a standard DPA will provide a single list of probabilities, containing information about the 16 AES key bytes at once. In this case, we additionally considered the iterative DPA described in this previous reference, which essentially works by successively removing the algorithmic noise generated by the best-rated key bytes. While such an attack can only work under the assumption that the adversary has an very precise leakage model in hand, we use it as a representative of worst-case attack against such a construction.", - "annotations": [ - { - "start": 45, - "end": 46, - "name": "table", - "value": "6e093372-d147-4245-8aab-08ed5fe5c072" - }, - { - "start": 1693, - "end": 1697, - "name": "bibliography_ref", - "value": "bac4e61b-f290-11ee-a6ed-b88584b4e4a1" - }, + "subparagraphs": [ { - "start": 2137, - "end": 2141, - "name": "bibliography_ref", - "value": "bac4e539-f290-11ee-a6ed-b88584b4e4a1" + "node_id": "0.8.1.0", + "text": "We will consider two types of setups in our evaluations: one for software, one for hardware. 
As illustrated in Figure 3 in the case of a Boolean-masked S-box implementation with two shares, the main difference is that the software performs all the operations sequentially, while the hardware performs them in parallel. We will further assume that the leakage of parallel operations is summed [40]. As previously mentioned, we will illustrate our analyses with a Hamming weight leakage function. Additionally, we will consider a noise variance of 10, corresponding to a Signal-to-Noise Ratio of 0.2 (as defined in [29]) 3 . This is a typical value, both for software implementations [11] and FPGA measurement boards [25].Let us denote the AES S-box as S, a byte of plaintext and key as x i and k i (respectively), the random shares used in masking as r j i (before the S-box) and m j i (after the S-box), the Hamming weight function as HW, the bitwise XOR as ⊕, the field multiplication used in polynomial masking as ⊗, and Gaussiandistributed noise random variables N j i . From these notations, we can specify the list of all our target implementations as summarized in Table 2.A couple of observations are worth being underlined as we now discuss.\nFirst, and as already mentioned, the main difference between software and hardware implementations is the number of exploitable leakage samples: there is a single such sample per plaintext in hardware while there are 16×(N m +1) ones in software (with N m the number of masks). Next, we only considered glitches in hardware (since it is generally possible to ensure independent leakage in software, by ensuring a sufficient time separation between the manipulation of the shares). We assumed that \"first-order glitches\" can appear in our Boolean-masked FPGA implementation, and modeled the impact of the mask as an additive binomial noise in this case. We further assumed that the amplitude of this first-order signal was reduced according to a factor f . 
This factor corresponds to the parameter used to quantify the amplitude of the glitches mentioned in the previous section. Note that this modeling is sound because the complexity of a first-order DPA only depends on the value of its SNR (which is equivalent to correlation and information theoretic metrics in this case, as proven in [31]). So even leakage functions deviating from the Hamming weight abstraction would lead to similar trends. Since the threshold implementation in [36] guarantees the absence of firstorder glitches, we only analyzed the possibility of second-order glitches for this one, and modeled them in the same way as just described (i.e. by considering the second mask M 2 i as an additive binomial noise, and reducing the amplitude of the second-order signal by a factor f ). Third, the chosen-plaintext construction of [34] is only applicable in hardware. Furthermore, we only evaluated its impact for the unprotected implementation, and the 1-mask Boolean one with glitches. As will become clear in the next section, this is because the data complexity bound to 256 (that is the maximum tolerated by design in this case) is only relevant when successful side-channel attacks occur for such small complexities (which was only observed for implementations with first-order signal).For convenience, we denoted each implementation in our experiments with three letters. The first one corresponds to the type of scenario considered, i.e. with Known (K) or carefully Chosen (C) plaintexts. The second one indicates [20,45]2nd-order KP whether we are in a Software (S) or Hardware (H) case study. The third one corresponds to the type of countermeasure selected, i.e. Unprotected (U), 1-or 2-mask Boolean (B 1 , B 2 ), 1-mask Polynomial (P 1 ) and 2-mask threshold (T 2 ). The additional star signals finally reflect the presence of (first-order or secondorder) glitches. 
For example, KHB * 1 is an AES design protected with a 1-mask Boolean scheme, implemented in an imperfect hardware leading to first-order glitches, and analyzed in the context of known (uniform) plaintexts.\n", + "annotations": [ + { + "start": 392, + "end": 396, + "name": "reference", + "value": "109ce200-0872-11ef-b95c-0242ac120002" + }, + { + "start": 613, + "end": 617, + "name": "reference", + "value": "109b5dfe-0872-11ef-b95c-0242ac120002" + }, + { + "start": 682, + "end": 686, + "name": "reference", + "value": "1098d606-0872-11ef-b95c-0242ac120002" + }, + { + "start": 715, + "end": 719, + "name": "reference", + "value": "109ae9dc-0872-11ef-b95c-0242ac120002" + }, + { + "start": 2340, + "end": 2344, + "name": "reference", + "value": "109baa66-0872-11ef-b95c-0242ac120002" + }, + { + "start": 2487, + "end": 2491, + "name": "reference", + "value": "109c4dea-0872-11ef-b95c-0242ac120002" + }, + { + "start": 2851, + "end": 2855, + "name": "reference", + "value": "109c08ee-0872-11ef-b95c-0242ac120002" + }, + { + "start": 3543, + "end": 3547, + "name": "reference", + "value": "109a34ec-0872-11ef-b95c-0242ac120002" + }, + { + "start": 3547, + "end": 3549, + "name": "reference", + "value": "109d73e6-0872-11ef-b95c-0242ac120002" + }, + { + "start": 1177, + "end": 1178, + "name": "table", + "value": "355d0fa4-7326-4228-a163-31c56483f80d" + } + ], + "metadata": { + "paragraph_type": "raw_text", + "page_id": 0, + "line_id": 0 + }, + "subparagraphs": [] } - ], + ] + }, + { + "node_id": "0.8.2", + "text": "4.2 Template attacks and security graphs", + "annotations": [], "metadata": { - "paragraph_type": "raw_text", + "paragraph_type": "section", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, - "subparagraphs": [] - } - ] - }, - { - "node_id": "0.11", - "text": "Experimental results", - "annotations": [], - "metadata": { - "paragraph_type": "section", - "page_id": 0, - "line_id": 0, - "other_fields": {} - }, - "subparagraphs": [ - { - "node_id": "0.11.0", 
- "text": "For illustration, the security graph of the AES implementation KHB 1 is given in Figure 4, where we additionally provide the maximum number of measurements tolerated to maintain security levels corresponding to 2 120 , 2 100 and 2 80 time complexity. All the implementations in Table 2 have been similarly evaluated and the result of these experiments are in Appendix A, Figures 8 to 13. Note that in the aforementioned case of iterative DPA (Appendix A, Figure 14), the adversary recovers the AES key bytes but still has to find their position within the AES state, which (roughly) corresponds to 16! ≈ 2 44 possibilities [2].", - "annotations": [ - { - "start": 284, - "end": 285, - "name": "table", - "value": "6e093372-d147-4245-8aab-08ed5fe5c072" - }, + "subparagraphs": [ { - "start": 623, - "end": 626, - "name": "bibliography_ref", - "value": "bac4e432-f290-11ee-a6ed-b88584b4e4a1" + "node_id": "0.8.2.0", + "text": "Given the leakage functions defined in Table 2, a template attack first requires to build a leakage model. In the following, and for each byte of the AES master key, we will consider Gaussian templates for unprotected implementations, and Gaussian for masked implementations. Let us denote the probability density function of a Gaussian distribution taken on input z, with mean µ (resp. mean vector µ) and variance σ 2 (resp. covariance matrix Σ) as N (z|µ, σ 2 ) (resp. N (z|µ, Σ)). This notation directly leads to models of the form:Pr\nfor (software and hardware) unprotected implementations and:\nPr\nfor (software and hardware) masked implementations with two shares. The formula naturally extends to more shares, by just adding more sums over the masks. Note that in these models, all the noise (including the algorithmic one in hardware implementations) is captured by the Gaussian distribution 4 . 
Given these models, the template adversary will accumulate information on the key bytes k i , by computing products of probabilities corresponding to multiple plaintexts. Doing so and for each key byte, he will produce lists of 256 probabilities corresponding each possible candidate ki , defined as follows:i ],\nwith the leakage vector L (j) respectively corresponding to l (j) i (resp. l (j) ) in the context of Equ. 1 (resp. Equ. 2) and l 1,(j) i , l 2,(j) i (resp. l (j) ) in the context of Equ. 3 (resp. Equ. 4) The number of measurements is given by q in Equ. 5. Next and for each target implementation, we will repeat 100 experiments. And for each value of q in these experiments, use a rank estimation algorithm to evaluate the time complexity needed to recover the full AES master key [61]. Eventually, we will build \"security graphs\" where the attack probability of success is provided in function of a time complexity and a number of measurements.Iterative DPA against constructions with carefully chosen plaintexts. Note that while standard DPA attacks are adequate to analyze the security of unprotected and masked implementations in a known-plaintext scenario, their divide-and-conquer strategy hardly applies to the PRF in [34], with carefullychosen plaintexts leading to key-dependent algorithmic noise. This is because the (maximum 256) constants c j used in this proposal are such that all 16 bytes are always identical. Hence, a standard DPA will provide a single list of probabilities, containing information about the 16 AES key bytes at once. In this case, we additionally considered the iterative DPA described in this previous reference, which essentially works by successively removing the algorithmic noise generated by the best-rated key bytes. 
While such an attack can only work under the assumption that the adversary has an very precise leakage model in hand, we use it as a representative of worst-case attack against such a construction.", + "annotations": [ + { + "start": 45, + "end": 46, + "name": "table", + "value": "355d0fa4-7326-4228-a163-31c56483f80d" + }, + { + "start": 1697, + "end": 1701, + "name": "reference", + "value": "109fee1e-0872-11ef-b95c-0242ac120002" + }, + { + "start": 2141, + "end": 2145, + "name": "reference", + "value": "109c08ee-0872-11ef-b95c-0242ac120002" + } + ], + "metadata": { + "paragraph_type": "raw_text", + "page_id": 0, + "line_id": 0 + }, + "subparagraphs": [] } - ], + ] + }, + { + "node_id": "0.8.3", + "text": "4.3 Experimental results", + "annotations": [], "metadata": { - "paragraph_type": "raw_text", + "paragraph_type": "section", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, - "subparagraphs": [] + "subparagraphs": [ + { + "node_id": "0.8.3.0", + "text": "For illustration, the security graph of the AES implementation KHB 1 is given in Figure 4, where we additionally provide the maximum number of measurements tolerated to maintain security levels corresponding to 2 120 , 2 100 and 2 80 time complexity. All the implementations in Table 2 have been similarly evaluated and the result of these experiments are in Appendix A, Figures 8 to 13. Note that in the aforementioned case of iterative DPA (Appendix A, Figure 14), the adversary recovers the AES key bytes but still has to find their position within the AES state, which (roughly) corresponds to 16! 
≈ 2 44 possibilities [2].", + "annotations": [ + { + "start": 284, + "end": 285, + "name": "table", + "value": "355d0fa4-7326-4228-a163-31c56483f80d" + }, + { + "start": 623, + "end": 626, + "name": "reference", + "value": "109771bc-0872-11ef-b95c-0242ac120002" + } + ], + "metadata": { + "paragraph_type": "raw_text", + "page_id": 0, + "line_id": 0 + }, + "subparagraphs": [] + } + ] } ] }, { - "node_id": "0.12", - "text": "Security vs. performance tradeoffs", + "node_id": "0.9", + "text": "5 Security vs. performance tradeoffs", "annotations": [], "metadata": { "paragraph_type": "section", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [ { - "node_id": "0.12.0", - "text": "We now combine the results in the previous sections to answer our main question. Namely, what is the best way to exploit masking and/or leakage-resilient primitives to resist standard DPA in hardware and software implementations?", + "node_id": "0.9.0", + "text": "We now combine the results in the previous sections to answer our main question. Namely, what is the best way to exploit masking and/or leakage-resilient primitives to resist standard DPA in hardware and software implementations?\n", "annotations": [], "metadata": { "paragraph_type": "raw_text", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] - } - ] - }, - { - "node_id": "0.13", - "text": "Leakage-resilient PRGs", - "annotations": [], - "metadata": { - "paragraph_type": "section", - "page_id": 0, - "line_id": 0, - "other_fields": {} - }, - "subparagraphs": [ + }, { - "node_id": "0.13.0", - "text": "Let M be the maximum number of measurements tolerated to maintain a given security level for one of the implementations in section 4. The re-keying in leakage-resilient PRGs is such that it is exactly this number M that is limited by design (i.e. the value N in Figure 1 bounds M for the adversary), hence directly leading to security-bounded implementations. 
The global cost metric we use in this case can be written as M M -1 × cost f unction, where the first factor corresponds to the average number of AES encryptions that are used to produce each 128-bit output string, and the second one is the cost function of Table 1.A comparison of different leakage-resilient PRG implementations in software (i.e. based on different unprotected and protected AES implementations) is given in Figure 5 for 80-bit and 120-bit security levels (the results for 100-bit security are in Appendix A, Figure 15, left). The main observation in this context is that the straightforward implementation of the PRG with an unprotected AES design is the most efficient solution. This is mainly because moving from the smallest M value (i.e. M = 2, as imposed by the 120-bit security level in the unprotected case -see Figure 8-left) to large ones (e.g. M > 1000 for masked implementations) can only lead to a gain factor of 2 for the global cost metric, which is not justified in view of the performance overheads due to the masking. For a similar reason (i.e. the limited interest of increasing M ), the global cost metric is essentially independent of the target security level in the figure. In other words, there is little interest in decreasing this security level since it leads to poor performance improvements. 
The hardware implementations in Appendix A, Figures 15-right and 16 lead to essentially similar intuitions, as also witnessed by the limited impact of decreasing the amplitude of the glitch signal with the f factor (see the KHB * 1 and KHT * 2 implementations for which f = 10 in the latter figures).", - "annotations": [ - { - "start": 624, - "end": 625, - "name": "table", - "value": "d2ce350a-25be-4d05-9061-6f1d4cf8bdd1" - } - ], + "node_id": "0.9.1", + "text": "5.1 Leakage-resilient PRGs", + "annotations": [], "metadata": { - "paragraph_type": "raw_text", + "paragraph_type": "section", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, - "subparagraphs": [] - } - ] - }, - { - "node_id": "0.14", - "text": "Leakage-resilient PRFs", - "annotations": [], - "metadata": { - "paragraph_type": "section", - "page_id": 0, - "line_id": 0, - "other_fields": {} - }, - "subparagraphs": [ - { - "node_id": "0.14.0", - "text": "Security-unbounded implementations. Let us now consider (stateless) leakage-resilient PRFs. As already mentioned, those constructions only bound the adversary's data complexity. The main observation in this case is that if random plaintexts are considered, such implementations can only be security-unbounded (with the slight cautionary note that we give below). This fact can be easily explained when the PRF is instantiated with an unprotected software implementation of the AES. What happens then is that the adversary can repeat his measurements to get rid of the physical noise, and consequently move from the security graph of Appendix A, Figure 8-left to the one of Appendix A, Figure 13-right. Such a \"repeating\" attack is exactly the one already mentioned in [34] to argue that bounded data complexity is not enough to bound (computational) security. In fact, it similarly applies to masked implementations. The only difference is that the adversary will not average his measurements, but rather combine them as in Equation 5. 
This is because given a leakage function, e.g. the Hamming weight one that leads to 9 distinguishable events, the distribution of the measurements in a masked implementation will lead to the same number of distinguishable events: the only difference is that more sampling will be necessary to distinguish them (see the appendices in [60] for a plot of these distributions). So if the number of measurements is not bounded, attacks with low time complexities as in Appendix A, Figure 13 right will always exist.One important consequence is that using the PRF construction in this context is essentially useless for all the AES implementations we consider in this paper. The only way to maintain a target security level for such stateless primitives is to limit the number of measurements by putting a constraint on the lifetime of the system. And this lifetime will be selected according to the maximum number of measurements tolerated that can be extracted from our security graphs, which now highly depends on the countermeasure selected. In other words, we can only evaluate the cost function and the security level attained independently in this case, as illustrated in Figure 6 for our software instances (the 100-bit security level is again given in Appendix A, Figure 17-left). Here, we naturally come back to the standard result that Boolean (resp. polynomial) masking increases security at the cost of performance overheads that are roughly quadratic (resp. cubic) in the number of shares. Note that the security level of the 1-mask polynomial scheme is higher than the 2-mask Boolean one for the noise variance we consider, which is consistent with the previous work of Roche and Prouff [54]. Similar conclusions are obtained with hardware implementations (Appendix A, Figure 17-right and Appendix A, Figure 18), for which the impact of glitches is now clearly visible. 
For example, a factor f = 10 essentially multiplies the number of measurements by f for the Boolean masking with first-order glitches, and f 2 for the threshold implementation with second-order glitches. Cautionary note. The statement that stateless leakage-resilient PRFs can only be security unbounded if known plaintexts are considered essentially relates to the fact that repeated measurements allow removing the effect of the noise and the masks in a leaking implementation. Yet, this claim should be slightly mitigated in the case of algorithmic noise in hardware implementations. Indeed, this part of the noise can only be averaged up to the data complexity bound that is imposed by the PRF design. Taking the example of our hardware implementations where all 16 S-boxes are manipulated in parallel, the SNR corresponding to algorithmic noise can be computed as the ratio between the variance of a uniformly distributed 8-bit values's Hamming weight (i.e. 2) and the variance of 15 such values (i.e. 30). Averaging this noise over M plaintexts will lead to SNRs of 1 15/M , which is already larger than 17 if M = 256 (i.e. a noise level for which the security graph will be extremely close to the worst case one of Appendix A, Figure 13-right). So although there is a \"gray area\" where a leakage-resilient PRF implemented in hardware can be (weakly) security-bounded, these contexts are of quite limited interest because the will imply bounds on the data complexity that are below 256, i.e. they anyway lead to less efficient solutions than the tweaked construction that we investigate in the next subsection.Security-bounded implementations. As just discussed, stateless primitives hardly lead to security bounded implementations if physical and algorithmic noise can be averaged -which is straightforwardly feasible in a known plaintext scenario. 
The tweaked construction in [34] aims at avoiding such a weakness by preventing the averaging of the algorithmic noise, thanks to the combined effect of hardware parallelism and carefully chosen plaintexts leading to keydependencies in this noise. Since only the physical noise can be averaged in this case, the bounded data complexity that the leakage-resilient PRF guarantees consequently leads to security-bounded implementations again. This is illustrated both by the standard DPAs (such as in Appendix A, Figures 10-right and 12-left) and the iterative attacks (such as in Appendix A, Figure 13) that can be performed against this PRF 5 . As in Section 5.1, we extracted the maximum data complexity D from these graphs, and produced as global cost metric:where the first factor corresponds to the (rounded) average number of AES encryptions needed to produce a 128-bit output, and the second one is the cost function of Table 1. A comparison of our different leakage-resilient PRFs instantiated with a hardware implementation of the AES and chosen plaintexts is given in Figure 7. Here again, we observe that the most efficient solution is to consider an unprotected design. Interestingly, we also observe that for the unprotected AES, the iterative attack is the worst case for the 80-bit security level (where it forces the re-keying after 97 plaintexts vs. 256 for the standard DPA), while the standard DPA is the worst-case for the 120-bit security level (where it forces the re-keying after 10 plaintexts vs. 37 for the iterative attack). This nicely fits the intuition that iterative attacks become more powerful as the data complexity increases, i.e. when the additional time complexity corresponding to the enumeration of a permutation over 16 bytes becomes small compared to the time complexity required to recover the 16 AES key bytes (unordered). 
", - "annotations": [ - { - "start": 768, - "end": 772, - "name": "bibliography_ref", - "value": "bac4e539-f290-11ee-a6ed-b88584b4e4a1" - }, - { - "start": 4800, - "end": 4804, - "name": "bibliography_ref", - "value": "bac4e539-f290-11ee-a6ed-b88584b4e4a1" - }, - { - "start": 1369, - "end": 1373, - "name": "bibliography_ref", - "value": "bac4e610-f290-11ee-a6ed-b88584b4e4a1" - }, - { - "start": 2732, - "end": 2736, - "name": "bibliography_ref", - "value": "bac4e5d7-f290-11ee-a6ed-b88584b4e4a1" - }, + "subparagraphs": [ { - "start": 5703, - "end": 5704, - "name": "table", - "value": "d2ce350a-25be-4d05-9061-6f1d4cf8bdd1" + "node_id": "0.9.1.0", + "text": "Let M be the maximum number of measurements tolerated to maintain a given security level for one of the implementations in section 4. The re-keying in leakage-resilient PRGs is such that it is exactly this number M that is limited by design (i.e. the value N in Figure 1 bounds M for the adversary), hence directly leading to security-bounded implementations. The global cost metric we use in this case can be written as M M -1 × cost f unction, where the first factor corresponds to the average number of AES encryptions that are used to produce each 128-bit output string, and the second one is the cost function of Table 1.A comparison of different leakage-resilient PRG implementations in software (i.e. based on different unprotected and protected AES implementations) is given in Figure 5 for 80-bit and 120-bit security levels (the results for 100-bit security are in Appendix A, Figure 15, left). The main observation in this context is that the straightforward implementation of the PRG with an unprotected AES design is the most efficient solution. This is mainly because moving from the smallest M value (i.e. M = 2, as imposed by the 120-bit security level in the unprotected case -see Figure 8-left) to large ones (e.g. 
M > 1000 for masked implementations) can only lead to a gain factor of 2 for the global cost metric, which is not justified in view of the performance overheads due to the masking. For a similar reason (i.e. the limited interest of increasing M ), the global cost metric is essentially independent of the target security level in the figure. In other words, there is little interest in decreasing this security level since it leads to poor performance improvements. The hardware implementations in Appendix A, Figures 15-right and 16 lead to essentially similar intuitions, as also witnessed by the limited impact of decreasing the amplitude of the glitch signal with the f factor (see the KHB * 1 and KHT * 2 implementations for which f = 10 in the latter figures).", + "annotations": [ + { + "start": 624, + "end": 625, + "name": "table", + "value": "1c9f98e6-e1f8-49f3-8bf7-24022f2d1939" + } + ], + "metadata": { + "paragraph_type": "raw_text", + "page_id": 0, + "line_id": 0 + }, + "subparagraphs": [] } - ], + ] + }, + { + "node_id": "0.9.2", + "text": "5.2 Leakage-resilient PRFs", + "annotations": [], "metadata": { - "paragraph_type": "raw_text", + "paragraph_type": "section", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, - "subparagraphs": [] + "subparagraphs": [ + { + "node_id": "0.9.2.0", + "text": "Security-unbounded implementations. Let us now consider (stateless) leakage-resilient PRFs. As already mentioned, those constructions only bound the adversary's data complexity. The main observation in this case is that if random plaintexts are considered, such implementations can only be security-unbounded (with the slight cautionary note that we give below). This fact can be easily explained when the PRF is instantiated with an unprotected software implementation of the AES. 
What happens then is that the adversary can repeat his measurements to get rid of the physical noise, and consequently move from the security graph of Appendix A, Figure 8-left to the one of Appendix A, Figure 13-right. Such a \"repeating\" attack is exactly the one already mentioned in [34] to argue that bounded data complexity is not enough to bound (computational) security. In fact, it similarly applies to masked implementations. The only difference is that the adversary will not average his measurements, but rather combine them as in Equation 5. This is because given a leakage function, e.g. the Hamming weight one that leads to 9 distinguishable events, the distribution of the measurements in a masked implementation will lead to the same number of distinguishable events: the only difference is that more sampling will be necessary to distinguish them (see the appendices in [60] for a plot of these distributions). So if the number of measurements is not bounded, attacks with low time complexities as in Appendix A, Figure 13 right will always exist.One important consequence is that using the PRF construction in this context is essentially useless for all the AES implementations we consider in this paper. The only way to maintain a target security level for such stateless primitives is to limit the number of measurements by putting a constraint on the lifetime of the system. And this lifetime will be selected according to the maximum number of measurements tolerated that can be extracted from our security graphs, which now highly depends on the countermeasure selected. In other words, we can only evaluate the cost function and the security level attained independently in this case, as illustrated in Figure 6 for our software instances (the 100-bit security level is again given in Appendix A, Figure 17-left). Here, we naturally come back to the standard result that Boolean (resp. 
polynomial) masking increases security at the cost of performance overheads that are roughly quadratic (resp. cubic) in the number of shares. Note that the security level of the 1-mask polynomial scheme is higher than the 2-mask Boolean one for the noise variance we consider, which is consistent with the previous work of Roche and Prouff [54]. Similar conclusions are obtained with hardware implementations (Appendix A, Figure 17-right and Appendix A, Figure 18), for which the impact of glitches is now clearly visible. For example, a factor f = 10 essentially multiplies the number of measurements by f for the Boolean masking with first-order glitches, and f 2 for the threshold implementation with second-order glitches. Cautionary note. The statement that stateless leakage-resilient PRFs can only be security unbounded if known plaintexts are considered essentially relates to the fact that repeated measurements allow removing the effect of the noise and the masks in a leaking implementation. Yet, this claim should be slightly mitigated in the case of algorithmic noise in hardware implementations. Indeed, this part of the noise can only be averaged up to the data complexity bound that is imposed by the PRF design. Taking the example of our hardware implementations where all 16 S-boxes are manipulated in parallel, the SNR corresponding to algorithmic noise can be computed as the ratio between the variance of a uniformly distributed 8-bit values's Hamming weight (i.e. 2) and the variance of 15 such values (i.e. 30). Averaging this noise over M plaintexts will lead to SNRs of 1 15/M , which is already larger than 17 if M = 256 (i.e. a noise level for which the security graph will be extremely close to the worst case one of Appendix A, Figure 13-right). 
So although there is a \"gray area\" where a leakage-resilient PRF implemented in hardware can be (weakly) security-bounded, these contexts are of quite limited interest because the will imply bounds on the data complexity that are below 256, i.e. they anyway lead to less efficient solutions than the tweaked construction that we investigate in the next subsection.Security-bounded implementations. As just discussed, stateless primitives hardly lead to security bounded implementations if physical and algorithmic noise can be averaged -which is straightforwardly feasible in a known plaintext scenario. The tweaked construction in [34] aims at avoiding such a weakness by preventing the averaging of the algorithmic noise, thanks to the combined effect of hardware parallelism and carefully chosen plaintexts leading to keydependencies in this noise. Since only the physical noise can be averaged in this case, the bounded data complexity that the leakage-resilient PRF guarantees consequently leads to security-bounded implementations again. This is illustrated both by the standard DPAs (such as in Appendix A, Figures 10-right and 12-left) and the iterative attacks (such as in Appendix A, Figure 13) that can be performed against this PRF 5 . As in Section 5.1, we extracted the maximum data complexity D from these graphs, and produced as global cost metric:where the first factor corresponds to the (rounded) average number of AES encryptions needed to produce a 128-bit output, and the second one is the cost function of Table 1. A comparison of our different leakage-resilient PRFs instantiated with a hardware implementation of the AES and chosen plaintexts is given in Figure 7. Here again, we observe that the most efficient solution is to consider an unprotected design. Interestingly, we also observe that for the unprotected AES, the iterative attack is the worst case for the 80-bit security level (where it forces the re-keying after 97 plaintexts vs. 
256 for the standard DPA), while the standard DPA is the worst-case for the 120-bit security level (where it forces the re-keying after 10 plaintexts vs. 37 for the iterative attack). This nicely fits the intuition that iterative attacks become more powerful as the data complexity increases, i.e. when the additional time complexity corresponding to the enumeration of a permutation over 16 bytes becomes small compared to the time complexity required to recover the 16 AES key bytes (unordered). ", + "annotations": [ + { + "start": 768, + "end": 772, + "name": "reference", + "value": "109c08ee-0872-11ef-b95c-0242ac120002" + }, + { + "start": 4800, + "end": 4804, + "name": "reference", + "value": "109c08ee-0872-11ef-b95c-0242ac120002" + }, + { + "start": 1369, + "end": 1373, + "name": "reference", + "value": "109fc63c-0872-11ef-b95c-0242ac120002" + }, + { + "start": 2732, + "end": 2736, + "name": "reference", + "value": "109ed6dc-0872-11ef-b95c-0242ac120002" + }, + { + "start": 5703, + "end": 5704, + "name": "table", + "value": "1c9f98e6-e1f8-49f3-8bf7-24022f2d1939" + } + ], + "metadata": { + "paragraph_type": "raw_text", + "page_id": 0, + "line_id": 0 + }, + "subparagraphs": [] + } + ] } ] }, { - "node_id": "0.15", - "text": "Conclusion", + "node_id": "0.10", + "text": "6 Conclusion", "annotations": [], "metadata": { "paragraph_type": "section", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [ { - "node_id": "0.15.0", + "node_id": "0.10.0", "text": "The results in this work essentially show that masking and leakage-resilient constructions hardly combine constructively. For (stateful) PRGs, our experiments indicate that both for software and hardware implementations, a leakageresilient design instantiated with an unprotected AES is the most efficient solution to reach any given security level. 
For stateless PRFs, they rather show that a bounded data complexity guarantee is (mostly) ineffective in bounding the (computational) complexity of the best attacks. So implementing masking and limiting the lifetime of the cryptographic implementation is the best solution in this case. Nevertheless, the chosen-plaintext tweak proposed in [34] is an interesting exception to this conclusion, as it leads to security-bounded hardware implementations for stateless primitives that are particularly interesting from an application point-of-view, e.g. for re-synchronization, challenge-response protocols, . . . Beyond the further analysis of such constructions, their extension to software implementations is an interesting scope for further research. In this respect, the combination of a chosen-plaintext leakage-resilient PRF with the shuffling countermeasure in [62] seems promising, as it could \"emulate\" the keydependent algorithmic noise ensuring security bounds in hardware. ", "annotations": [ { "start": 690, "end": 694, - "name": "bibliography_ref", - "value": "bac4e539-f290-11ee-a6ed-b88584b4e4a1" + "name": "reference", + "value": "109c08ee-0872-11ef-b95c-0242ac120002" }, { "start": 1214, "end": 1218, - "name": "bibliography_ref", - "value": "bac4e623-f290-11ee-a6ed-b88584b4e4a1" + "name": "reference", + "value": "10a004ee-0872-11ef-b95c-0242ac120002" } ], "metadata": { "paragraph_type": "raw_text", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] } ] }, { - "node_id": "0.16", + "node_id": "0.11", "text": "A Additional figures", "annotations": [], "metadata": { "paragraph_type": "section", "page_id": 0, - "line_id": 0, - "other_fields": {} - }, - "subparagraphs": [] - }, - { - "node_id": "0.17", - "text": "\n", - "annotations": [], - "metadata": { - "paragraph_type": "section", - "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [ { - "node_id": "0.17.0", - "text": "Acknowledgements. F.-X. 
Standaert is an associate researcher of the . Work funded in parts by the through the project (CRASH) and the grant B- project.", - "annotations": [], - "metadata": { - "paragraph_type": "raw_text", - "page_id": 0, - "line_id": 0, - "other_fields": {} - }, - "subparagraphs": [] - } - ] - }, - { - "node_id": "0.18", - "text": "

Acknowledgements. F.-X. Standaert is an associate researcher of the Belgian Fund for Scientific Research (FNRS-F.R.S.). Work funded in parts by the European Commission through the ERC project 280141 (CRASH) and the European ISEC action grant HOME/2010/ISEC/AG/INT-011 B-CCENTRE project.

", - "annotations": [], - "metadata": { - "paragraph_type": "section", - "page_id": 0, - "line_id": 0, - "other_fields": {} - }, - "subparagraphs": [ - { - "node_id": "0.18.0", + "node_id": "0.11.0", "text": "Acknowledgements. F.-X. Standaert is an associate researcher of the . Work funded in parts by the through the project (CRASH) and the grant B- project.", "annotations": [], "metadata": { "paragraph_type": "raw_text", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] } ] }, { - "node_id": "0.19", - "text": "\n", - "annotations": [], - "metadata": { - "paragraph_type": "section", - "page_id": 0, - "line_id": 0, - "other_fields": {} - }, - "subparagraphs": [] - }, - { - "node_id": "0.20", + "node_id": "0.12", "text": "bibliography", "annotations": [], "metadata": { "paragraph_type": "bibliography", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [ { - "node_id": "0.20.0", + "node_id": "0.12.0", "text": "", "annotations": [], "metadata": { "paragraph_type": "bibliography_item", "page_id": 0, "line_id": 0, - "other_fields": { - "uid": "bac4e42a-f290-11ee-a6ed-b88584b4e4a1" - }, - "uid": "bac4e42a-f290-11ee-a6ed-b88584b4e4a1" + "uid": "1097209a-0872-11ef-b95c-0242ac120002" }, "subparagraphs": [ { - "node_id": "0.20.0.0", + "node_id": "0.12.0.0", "text": "Leakage-resilient symmetric encryption via re-keying", "annotations": [], "metadata": { "paragraph_type": "title", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.0.1", + "node_id": "0.12.0.1", "text": "Bertoni and Coron", "annotations": [], "metadata": { "paragraph_type": "title_conference_proceedings", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.0.2", - "text": "\nMichelAbdalla\n", + "node_id": "0.12.0.2", + "text": "Michel Abdalla", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, 
- "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.0.3", - "text": "\nSoniaBelaïd\n", + "node_id": "0.12.0.3", + "text": "Sonia Belaïd", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.0.4", - "text": "\nPierre-AlainFouque\n", + "node_id": "0.12.0.4", + "text": "Pierre-Alain Fouque", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.0.5", + "node_id": "0.12.0.5", "text": "4", "annotations": [], "metadata": { "paragraph_type": "biblScope_volume", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.0.6", + "node_id": "0.12.0.6", "text": "471-488", "annotations": [], "metadata": { "paragraph_type": "biblScope_page", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] } ] }, { - "node_id": "0.20.1", + "node_id": "0.12.1", "text": "", "annotations": [], "metadata": { "paragraph_type": "bibliography_item", "page_id": 0, "line_id": 0, - "other_fields": { - "uid": "bac4e432-f290-11ee-a6ed-b88584b4e4a1" - }, - "uid": "bac4e432-f290-11ee-a6ed-b88584b4e4a1" + "uid": "109771bc-0872-11ef-b95c-0242ac120002" }, "subparagraphs": [ { - "node_id": "0.20.1.0", + "node_id": "0.12.1.0", "text": "Towards fresh re-keying with leakage-resilient PRFs: Cipher design principles and analysis", "annotations": [], "metadata": { "paragraph_type": "title", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.1.1", + "node_id": "0.12.1.1", "text": "Cryptology ePrint Archive", "annotations": [], "metadata": { "paragraph_type": "title_journal", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": 
"0.20.1.2", - "text": "\nSoniaBelaïd\n", + "node_id": "0.12.1.2", + "text": "Sonia Belaïd", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.1.3", - "text": "\nFabrizioDe Santis\n", + "node_id": "0.12.1.3", + "text": "Fabrizio De Santis", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.1.4", - "text": "\nJohannHeyszl\n", + "node_id": "0.12.1.4", + "text": "Johann Heyszl", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.1.5", - "text": "\nStefanMangard\n", + "node_id": "0.12.1.5", + "text": "Stefan Mangard", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.1.6", - "text": "\nMarcelMedwed\n", + "node_id": "0.12.1.6", + "text": "Marcel Medwed", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.1.7", - "text": "\nJorn-MarcSchmidt\n", + "node_id": "0.12.1.7", + "text": "Jorn-Marc Schmidt", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.1.8", - "text": "\nFrancois-XavierStandaert\n", + "node_id": "0.12.1.8", + "text": "Francois-Xavier Standaert", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.1.9", - "text": "\nStefanTillich\n", + "node_id": "0.12.1.9", + "text": "Stefan Tillich", "annotations": [], 
"metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.1.10", + "node_id": "0.12.1.10", "text": "2013. 2013", "annotations": [], "metadata": { "paragraph_type": "date", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] } ] }, { - "node_id": "0.20.2", + "node_id": "0.12.2", "text": "", "annotations": [], "metadata": { "paragraph_type": "bibliography_item", "page_id": 0, "line_id": 0, - "other_fields": { - "uid": "bac4e43e-f290-11ee-a6ed-b88584b4e4a1" - }, - "uid": "bac4e43e-f290-11ee-a6ed-b88584b4e4a1" + "uid": "1097c3a6-0872-11ef-b95c-0242ac120002" }, "subparagraphs": [ { - "node_id": "0.20.2.0", + "node_id": "0.12.2.0", "text": "Implementing \"practical leakage-resilient cryptography", "annotations": [], "metadata": { "paragraph_type": "title", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.2.1", + "node_id": "0.12.2.1", "text": "CHES 2012 Rump Session Talk", "annotations": [], "metadata": { "paragraph_type": "title_conference_proceedings", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.2.2", - "text": "\nJDaniel\n", + "node_id": "0.12.2.2", + "text": "J Daniel", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.2.3", - "text": "\nBernstein\n", + "node_id": "0.12.2.3", + "text": "Bernstein", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.2.4", + "node_id": "0.12.2.4", "text": "September 2012", "annotations": [], "metadata": { "paragraph_type": "date", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] } ] }, { - 
"node_id": "0.20.3", + "node_id": "0.12.3", "text": "", "annotations": [], "metadata": { "paragraph_type": "bibliography_item", "page_id": 0, "line_id": 0, - "other_fields": { - "uid": "bac4e444-f290-11ee-a6ed-b88584b4e4a1" - }, - "uid": "bac4e444-f290-11ee-a6ed-b88584b4e4a1" + "uid": "1097e69c-0872-11ef-b95c-0242ac120002" }, "subparagraphs": [ { - "node_id": "0.20.3.0", + "node_id": "0.12.3.0", "text": "Cryptographic Hardware and Embedded Systems -CHES 2013 -15th International Workshop", "annotations": [], "metadata": { "paragraph_type": "title_conference_proceedings", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.3.1", + "node_id": "0.12.3.1", "text": "Lecture Notes in Computer Science", "annotations": [], "metadata": { "paragraph_type": "title_series", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.3.2", - "text": "\nGuidoBertoni\n", + "node_id": "0.12.3.2", + "text": "Guido Bertoni", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.3.3", - "text": "\nJean-SébastienCoron\n", + "node_id": "0.12.3.3", + "text": "Jean-Sébastien Coron", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.3.4", + "node_id": "0.12.3.4", "text": "8086", "annotations": [], "metadata": { "paragraph_type": "biblScope_volume", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.3.5", + "node_id": "0.12.3.5", "text": "Springer", "annotations": [], "metadata": { "paragraph_type": "publisher", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.3.6", + "node_id": "0.12.3.6", "text": "August 20-23, 
2013. 2013", "annotations": [], "metadata": { "paragraph_type": "date", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] } ] }, { - "node_id": "0.20.4", + "node_id": "0.12.4", "text": "", "annotations": [], "metadata": { "paragraph_type": "bibliography_item", "page_id": 0, "line_id": 0, - "other_fields": { - "uid": "bac4e44c-f290-11ee-a6ed-b88584b4e4a1" - }, - "uid": "bac4e44c-f290-11ee-a6ed-b88584b4e4a1" + "uid": "10981248-0872-11ef-b95c-0242ac120002" }, "subparagraphs": [ { - "node_id": "0.20.4.0", + "node_id": "0.12.4.0", "text": "Towards sound approaches to counteract power-analysis attacks", "annotations": [], "metadata": { "paragraph_type": "title", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.4.1", + "node_id": "0.12.4.1", "text": "Wiener", "annotations": [], "metadata": { "paragraph_type": "title_conference_proceedings", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.4.2", - "text": "\nSureshChari\n", + "node_id": "0.12.4.2", + "text": "Suresh Chari", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.4.3", - "text": "\nCharanjitSJutla\n", + "node_id": "0.12.4.3", + "text": "Charanjit S Jutla", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.4.4", - "text": "\nRJosyula\n", + "node_id": "0.12.4.4", + "text": "R Josyula", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.4.5", - "text": "\nPankajRao\n", + "node_id": "0.12.4.5", + "text": "Pankaj Rao", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 
0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.4.6", - "text": "\nRohatgi\n", + "node_id": "0.12.4.6", + "text": "Rohatgi", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.4.7", + "node_id": "0.12.4.7", "text": "63", "annotations": [], "metadata": { "paragraph_type": "biblScope_volume", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.4.8", + "node_id": "0.12.4.8", "text": "398-412", "annotations": [], "metadata": { "paragraph_type": "biblScope_page", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] } ] }, { - "node_id": "0.20.5", + "node_id": "0.12.5", "text": "", "annotations": [], "metadata": { "paragraph_type": "bibliography_item", "page_id": 0, "line_id": 0, - "other_fields": { - "uid": "bac4e456-f290-11ee-a6ed-b88584b4e4a1" - }, - "uid": "bac4e456-f290-11ee-a6ed-b88584b4e4a1" + "uid": "10982954-0872-11ef-b95c-0242ac120002" }, "subparagraphs": [ { - "node_id": "0.20.5.0", + "node_id": "0.12.5.0", "text": "Template attacks", "annotations": [], "metadata": { "paragraph_type": "title", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.5.1", + "node_id": "0.12.5.1", "text": "CHES", "annotations": [], "metadata": { "paragraph_type": "title_conference_proceedings", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.5.2", + "node_id": "0.12.5.2", "text": "Lecture Notes in Computer Science", "annotations": [], "metadata": { "paragraph_type": "title_series", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.5.3", - "text": "\nSureshChari\n", + "node_id": "0.12.5.3", + "text": "Suresh Chari", 
"annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.5.4", - "text": "\nJosyulaRRao\n", + "node_id": "0.12.5.4", + "text": "Josyula R Rao", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.5.5", - "text": "\nPankajRohatgi\n", + "node_id": "0.12.5.5", + "text": "Pankaj Rohatgi", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.5.6", + "node_id": "0.12.5.6", "text": "2523", "annotations": [], "metadata": { "paragraph_type": "biblScope_volume", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.5.7", + "node_id": "0.12.5.7", "text": "13-28", "annotations": [], "metadata": { "paragraph_type": "biblScope_page", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.5.8", + "node_id": "0.12.5.8", "text": "Springer", "annotations": [], "metadata": { "paragraph_type": "publisher", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.5.9", + "node_id": "0.12.5.9", "text": "2002", "annotations": [], "metadata": { "paragraph_type": "date", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] } ] }, { - "node_id": "0.20.6", + "node_id": "0.12.6", "text": "", "annotations": [], "metadata": { "paragraph_type": "bibliography_item", "page_id": 0, "line_id": 0, - "other_fields": { - "uid": "bac4e461-f290-11ee-a6ed-b88584b4e4a1" - }, - "uid": "bac4e461-f290-11ee-a6ed-b88584b4e4a1" + "uid": "1098687e-0872-11ef-b95c-0242ac120002" }, "subparagraphs": [ { - "node_id": "0.20.6.0", + "node_id": "0.12.6.0", 
"text": "Common Criteria Portal", "annotations": [], "metadata": { "paragraph_type": "title_conference_proceedings", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] } ] }, { - "node_id": "0.20.7", + "node_id": "0.12.7", "text": "", "annotations": [], "metadata": { "paragraph_type": "bibliography_item", "page_id": 0, "line_id": 0, - "other_fields": { - "uid": "bac4e463-f290-11ee-a6ed-b88584b4e4a1" - }, - "uid": "bac4e463-f290-11ee-a6ed-b88584b4e4a1" + "uid": "10986b4e-0872-11ef-b95c-0242ac120002" }, "subparagraphs": [ { - "node_id": "0.20.7.0", + "node_id": "0.12.7.0", "text": "Side channel cryptanalysis of a higher order masking scheme", "annotations": [], "metadata": { "paragraph_type": "title", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.7.1", + "node_id": "0.12.7.1", "text": "Paillier and Verbauwhede", "annotations": [], "metadata": { "paragraph_type": "title_conference_proceedings", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.7.2", - "text": "\nJean-SébastienCoron\n", + "node_id": "0.12.7.2", + "text": "Jean-Sébastien Coron", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.7.3", - "text": "\nEmmanuelProuff\n", + "node_id": "0.12.7.3", + "text": "Emmanuel Prouff", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.7.4", - "text": "\nMatthieuRivain\n", + "node_id": "0.12.7.4", + "text": "Matthieu Rivain", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.7.5", + "node_id": "0.12.7.5", "text": "38", "annotations": [], 
"metadata": { "paragraph_type": "biblScope_volume", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.7.6", + "node_id": "0.12.7.6", "text": "28-44", "annotations": [], "metadata": { "paragraph_type": "biblScope_page", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] } ] }, { - "node_id": "0.20.8", + "node_id": "0.12.8", "text": "", "annotations": [], "metadata": { "paragraph_type": "bibliography_item", "page_id": 0, "line_id": 0, - "other_fields": { - "uid": "bac4e46b-f290-11ee-a6ed-b88584b4e4a1" - }, - "uid": "bac4e46b-f290-11ee-a6ed-b88584b4e4a1" + "uid": "10988c82-0872-11ef-b95c-0242ac120002" }, "subparagraphs": [ { - "node_id": "0.20.8.0", + "node_id": "0.12.8.0", "text": "Cryptographic Key Length Recommendation", "annotations": [], "metadata": { "paragraph_type": "title_conference_proceedings", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] } ] }, { - "node_id": "0.20.9", + "node_id": "0.12.9", "text": "", "annotations": [], "metadata": { "paragraph_type": "bibliography_item", "page_id": 0, "line_id": 0, - "other_fields": { - "uid": "bac4e46d-f290-11ee-a6ed-b88584b4e4a1" - }, - "uid": "bac4e46d-f290-11ee-a6ed-b88584b4e4a1" + "uid": "10989a06-0872-11ef-b95c-0242ac120002" }, "subparagraphs": [ { - "node_id": "0.20.9.0", + "node_id": "0.12.9.0", "text": "Leakage-resilient pseudorandom functions and side-channel attacks on feistel networks", "annotations": [], "metadata": { "paragraph_type": "title", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.9.1", + "node_id": "0.12.9.1", "text": "Lecture Notes in Computer Science", "annotations": [], "metadata": { "paragraph_type": "title_series", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.9.2", - "text": "\nYevgeniyDodis\n", + "node_id": 
"0.12.9.2", + "text": "Yevgeniy Dodis", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.9.3", - "text": "\nKrzysztofPietrzak\n", + "node_id": "0.12.9.3", + "text": "Krzysztof Pietrzak", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.9.4", + "node_id": "0.12.9.4", "text": "6223", "annotations": [], "metadata": { "paragraph_type": "biblScope_volume", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.9.5", + "node_id": "0.12.9.5", "text": "21-40", "annotations": [], "metadata": { "paragraph_type": "biblScope_page", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.9.6", + "node_id": "0.12.9.6", "text": "Springer", "annotations": [], "metadata": { "paragraph_type": "publisher", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.9.7", + "node_id": "0.12.9.7", "text": "2010", "annotations": [], "metadata": { "paragraph_type": "date", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] } ] }, { - "node_id": "0.20.10", + "node_id": "0.12.10", "text": "", "annotations": [], "metadata": { "paragraph_type": "bibliography_item", "page_id": 0, "line_id": 0, - "other_fields": { - "uid": "bac4e476-f290-11ee-a6ed-b88584b4e4a1" - }, - "uid": "bac4e476-f290-11ee-a6ed-b88584b4e4a1" + "uid": "1098d606-0872-11ef-b95c-0242ac120002" }, "subparagraphs": [ { - "node_id": "0.20.10.0", + "node_id": "0.12.10.0", "text": "Loïc van Oldeneel tot Oldenzeel, and Nicolas Veyrat-Charvillon. 
Efficient removal of random delays from embedded software implementations using hidden markov models", "annotations": [], "metadata": { "paragraph_type": "title", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.10.1", + "node_id": "0.12.10.1", "text": "Lecture Notes in Computer Science", "annotations": [], "metadata": { "paragraph_type": "title_series", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.10.2", - "text": "\nFrançoisDurvaux\n", + "node_id": "0.12.10.2", + "text": "François Durvaux", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.10.3", - "text": "\nMathieuRenauld\n", + "node_id": "0.12.10.3", + "text": "Mathieu Renauld", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.10.4", - "text": "\nFrançois-XavierStandaert\n", + "node_id": "0.12.10.4", + "text": "François-Xavier Standaert", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.10.5", + "node_id": "0.12.10.5", "text": "7771", "annotations": [], "metadata": { "paragraph_type": "biblScope_volume", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.10.6", + "node_id": "0.12.10.6", "text": "123-140", "annotations": [], "metadata": { "paragraph_type": "biblScope_page", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.10.7", + "node_id": "0.12.10.7", "text": "Springer", "annotations": [], "metadata": { "paragraph_type": "publisher", "page_id": 0, - "line_id": 0, - "other_fields": {} + 
"line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.10.8", + "node_id": "0.12.10.8", "text": "2012", "annotations": [], "metadata": { "paragraph_type": "date", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] } ] }, { - "node_id": "0.20.11", + "node_id": "0.12.11", "text": "", "annotations": [], "metadata": { "paragraph_type": "bibliography_item", "page_id": 0, "line_id": 0, - "other_fields": { - "uid": "bac4e480-f290-11ee-a6ed-b88584b4e4a1" - }, - "uid": "bac4e480-f290-11ee-a6ed-b88584b4e4a1" + "uid": "10990112-0872-11ef-b95c-0242ac120002" }, "subparagraphs": [ { - "node_id": "0.20.11.0", + "node_id": "0.12.11.0", "text": "Leakage-resilient cryptography", "annotations": [], "metadata": { "paragraph_type": "title", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.11.1", + "node_id": "0.12.11.1", "text": "FOCS", "annotations": [], "metadata": { "paragraph_type": "title_conference_proceedings", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.11.2", - "text": "\nStefanDziembowski\n", + "node_id": "0.12.11.2", + "text": "Stefan Dziembowski", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.11.3", - "text": "\nKrzysztofPietrzak\n", + "node_id": "0.12.11.3", + "text": "Krzysztof Pietrzak", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.11.4", + "node_id": "0.12.11.4", "text": "293-302", "annotations": [], "metadata": { "paragraph_type": "biblScope_page", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.11.5", + "node_id": "0.12.11.5", "text": "IEEE Computer Society", "annotations": [], 
"metadata": { "paragraph_type": "publisher", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.11.6", + "node_id": "0.12.11.6", "text": "2008", "annotations": [], "metadata": { "paragraph_type": "date", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] } ] }, { - "node_id": "0.20.12", + "node_id": "0.12.12", "text": "", "annotations": [], "metadata": { "paragraph_type": "bibliography_item", "page_id": 0, "line_id": 0, - "other_fields": { - "uid": "bac4e488-f290-11ee-a6ed-b88584b4e4a1" - }, - "uid": "bac4e488-f290-11ee-a6ed-b88584b4e4a1" + "uid": "109929c6-0872-11ef-b95c-0242ac120002" }, "subparagraphs": [ { - "node_id": "0.20.12.0", + "node_id": "0.12.12.0", "text": "François-Xavier Standaert, and Loïc van Oldeneel tot Oldenzeel. Compact implementation and performance evaluation of block ciphers in ATtiny devices", "annotations": [], "metadata": { "paragraph_type": "title", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.12.1", + "node_id": "0.12.12.1", "text": "Lecture Notes in Computer Science", "annotations": [], "metadata": { "paragraph_type": "title_series", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.12.2", - "text": "\nThomasEisenbarth\n", + "node_id": "0.12.12.2", + "text": "Thomas Eisenbarth", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.12.3", - "text": "\nZhengGong\n", + "node_id": "0.12.12.3", + "text": "Zheng Gong", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.12.4", - "text": "\nTimGüneysu\n", + "node_id": "0.12.12.4", + "text": "Tim Güneysu", "annotations": [], 
"metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.12.5", - "text": "\nStefanHeyse\n", + "node_id": "0.12.12.5", + "text": "Stefan Heyse", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.12.6", - "text": "\nSebastiaanIndesteege\n", + "node_id": "0.12.12.6", + "text": "Sebastiaan Indesteege", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.12.7", - "text": "\nStéphanieKerckhof\n", + "node_id": "0.12.12.7", + "text": "Stéphanie Kerckhof", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.12.8", - "text": "\nFrançoisKoeune\n", + "node_id": "0.12.12.8", + "text": "François Koeune", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.12.9", - "text": "\nTomislavNad\n", + "node_id": "0.12.12.9", + "text": "Tomislav Nad", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.12.10", - "text": "\nThomasPlos\n", + "node_id": "0.12.12.10", + "text": "Thomas Plos", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.12.11", - "text": "\nFrancescoRegazzoni\n", + "node_id": "0.12.12.11", + "text": "Francesco Regazzoni", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, 
"subparagraphs": [] }, { - "node_id": "0.20.12.12", + "node_id": "0.12.12.12", "text": "7374", "annotations": [], "metadata": { "paragraph_type": "biblScope_volume", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.12.13", + "node_id": "0.12.12.13", "text": "172-187", "annotations": [], "metadata": { "paragraph_type": "biblScope_page", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.12.14", + "node_id": "0.12.12.14", "text": "Springer", "annotations": [], "metadata": { "paragraph_type": "publisher", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.12.15", + "node_id": "0.12.12.15", "text": "2012", "annotations": [], "metadata": { "paragraph_type": "date", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] } ] }, { - "node_id": "0.20.13", + "node_id": "0.12.13", "text": "", "annotations": [], "metadata": { "paragraph_type": "bibliography_item", "page_id": 0, "line_id": 0, - "other_fields": { - "uid": "bac4e499-f290-11ee-a6ed-b88584b4e4a1" - }, - "uid": "bac4e499-f290-11ee-a6ed-b88584b4e4a1" + "uid": "10996f12-0872-11ef-b95c-0242ac120002" }, "subparagraphs": [ { - "node_id": "0.20.13.0", + "node_id": "0.12.13.0", "text": "Europay Mastercard Visa", "annotations": [], "metadata": { "paragraph_type": "title_conference_proceedings", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] } ] }, { - "node_id": "0.20.14", + "node_id": "0.12.14", "text": "", "annotations": [], "metadata": { "paragraph_type": "bibliography_item", "page_id": 0, "line_id": 0, - "other_fields": { - "uid": "bac4e49b-f290-11ee-a6ed-b88584b4e4a1" - }, - "uid": "bac4e49b-f290-11ee-a6ed-b88584b4e4a1" + "uid": "10997246-0872-11ef-b95c-0242ac120002" }, "subparagraphs": [ { - "node_id": "0.20.14.0", + "node_id": "0.12.14.0", "text": "Practical 
leakageresilient symmetric cryptography", "annotations": [], "metadata": { "paragraph_type": "title", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.14.1", + "node_id": "0.12.14.1", "text": "Prouff and Schaumont", "annotations": [], "metadata": { "paragraph_type": "title_conference_proceedings", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.14.2", - "text": "\nSebastianFaust\n", + "node_id": "0.12.14.2", + "text": "Sebastian Faust", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.14.3", - "text": "\nKrzysztofPietrzak\n", + "node_id": "0.12.14.3", + "text": "Krzysztof Pietrzak", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.14.4", - "text": "\nJoachimSchipper\n", + "node_id": "0.12.14.4", + "text": "Joachim Schipper", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.14.5", + "node_id": "0.12.14.5", "text": "46", "annotations": [], "metadata": { "paragraph_type": "biblScope_volume", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.14.6", + "node_id": "0.12.14.6", "text": "213-232", "annotations": [], "metadata": { "paragraph_type": "biblScope_page", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] } ] }, { - "node_id": "0.20.15", + "node_id": "0.12.15", "text": "", "annotations": [], "metadata": { "paragraph_type": "bibliography_item", "page_id": 0, "line_id": 0, - "other_fields": { - "uid": "bac4e4a3-f290-11ee-a6ed-b88584b4e4a1" - }, - "uid": 
"bac4e4a3-f290-11ee-a6ed-b88584b4e4a1" + "uid": "10999406-0872-11ef-b95c-0242ac120002" }, "subparagraphs": [ { - "node_id": "0.20.15.0", + "node_id": "0.12.15.0", "text": "A statistical model for dpa with novel algorithmic confusion analysis", "annotations": [], "metadata": { "paragraph_type": "title", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.15.1", + "node_id": "0.12.15.1", "text": "Prouff and Schaumont", "annotations": [], "metadata": { "paragraph_type": "title_conference_proceedings", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.15.2", - "text": "\nYunsiFei\n", + "node_id": "0.12.15.2", + "text": "Yunsi Fei", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.15.3", - "text": "\nQiasiLuo\n", + "node_id": "0.12.15.3", + "text": "Qiasi Luo", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.15.4", - "text": "\nAAdamDing\n", + "node_id": "0.12.15.4", + "text": "A Adam Ding", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.15.5", + "node_id": "0.12.15.5", "text": "46", "annotations": [], "metadata": { "paragraph_type": "biblScope_volume", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.15.6", + "node_id": "0.12.15.6", "text": "233-250", "annotations": [], "metadata": { "paragraph_type": "biblScope_page", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] } ] }, { - "node_id": "0.20.16", + "node_id": "0.12.16", "text": "", "annotations": [], "metadata": { 
"paragraph_type": "bibliography_item", "page_id": 0, "line_id": 0, - "other_fields": { - "uid": "bac4e4ab-f290-11ee-a6ed-b88584b4e4a1" - }, - "uid": "bac4e4ab-f290-11ee-a6ed-b88584b4e4a1" + "uid": "1099b8be-0872-11ef-b95c-0242ac120002" }, "subparagraphs": [ { - "node_id": "0.20.16.0", + "node_id": "0.12.16.0", "text": "Thwarting higherorder side channel analysis with additive and multiplicative maskings", "annotations": [], "metadata": { "paragraph_type": "title_conference_proceedings", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.16.1", - "text": "\nLaurieGenelle\n", + "node_id": "0.12.16.1", + "text": "Laurie Genelle", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.16.2", - "text": "\nEmmanuelProuff\n", + "node_id": "0.12.16.2", + "text": "Emmanuel Prouff", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.16.3", - "text": "\nMichaëlQuisquater\n", + "node_id": "0.12.16.3", + "text": "Michaël Quisquater", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.16.4", + "node_id": "0.12.16.4", "text": "43", "annotations": [], "metadata": { "paragraph_type": "biblScope_volume", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.16.5", + "node_id": "0.12.16.5", "text": "240-255", "annotations": [], "metadata": { "paragraph_type": "biblScope_page", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] } ] }, { - "node_id": "0.20.17", + "node_id": "0.12.17", "text": "", "annotations": [], "metadata": { "paragraph_type": "bibliography_item", 
"page_id": 0, "line_id": 0, - "other_fields": { - "uid": "bac4e4b2-f290-11ee-a6ed-b88584b4e4a1" - }, - "uid": "bac4e4b2-f290-11ee-a6ed-b88584b4e4a1" + "uid": "1099dbe6-0872-11ef-b95c-0242ac120002" }, "subparagraphs": [ { - "node_id": "0.20.17.0", + "node_id": "0.12.17.0", "text": "How to construct random functions (extended abstract)", "annotations": [], "metadata": { "paragraph_type": "title", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.17.1", + "node_id": "0.12.17.1", "text": "FOCS", "annotations": [], "metadata": { "paragraph_type": "title_conference_proceedings", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.17.2", - "text": "\nOdedGoldreich\n", + "node_id": "0.12.17.2", + "text": "Oded Goldreich", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.17.3", - "text": "\nShafiGoldwasser\n", + "node_id": "0.12.17.3", + "text": "Shafi Goldwasser", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.17.4", - "text": "\nSilvioMicali\n", + "node_id": "0.12.17.4", + "text": "Silvio Micali", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.17.5", + "node_id": "0.12.17.5", "text": "464-479", "annotations": [], "metadata": { "paragraph_type": "biblScope_page", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.17.6", + "node_id": "0.12.17.6", "text": "IEEE Computer Society", "annotations": [], "metadata": { "paragraph_type": "publisher", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, 
"subparagraphs": [] }, { - "node_id": "0.20.17.7", + "node_id": "0.12.17.7", "text": "1984", "annotations": [], "metadata": { "paragraph_type": "date", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] } ] }, { - "node_id": "0.20.18", + "node_id": "0.12.18", "text": "", "annotations": [], "metadata": { "paragraph_type": "bibliography_item", "page_id": 0, "line_id": 0, - "other_fields": { - "uid": "bac4e4bb-f290-11ee-a6ed-b88584b4e4a1" - }, - "uid": "bac4e4bb-f290-11ee-a6ed-b88584b4e4a1" + "uid": "109a0b84-0872-11ef-b95c-0242ac120002" }, "subparagraphs": [ { - "node_id": "0.20.18.0", + "node_id": "0.12.18.0", "text": "Des and differential power analysis (the \"duplication\" method)", "annotations": [], "metadata": { "paragraph_type": "title", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.18.1", + "node_id": "0.12.18.1", "text": "CHES", "annotations": [], "metadata": { "paragraph_type": "title_conference_proceedings", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.18.2", + "node_id": "0.12.18.2", "text": "Lecture Notes in Computer Science", "annotations": [], "metadata": { "paragraph_type": "title_series", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.18.3", - "text": "\nLouisGoubin\n", + "node_id": "0.12.18.3", + "text": "Louis Goubin", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.18.4", - "text": "\nJacquesPatarin\n", + "node_id": "0.12.18.4", + "text": "Jacques Patarin", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.18.5", + "node_id": "0.12.18.5", "text": "1717", 
"annotations": [], "metadata": { "paragraph_type": "biblScope_volume", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.18.6", + "node_id": "0.12.18.6", "text": "158-172", "annotations": [], "metadata": { "paragraph_type": "biblScope_page", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.18.7", + "node_id": "0.12.18.7", "text": "Springer", "annotations": [], "metadata": { "paragraph_type": "publisher", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.18.8", + "node_id": "0.12.18.8", "text": "1999", "annotations": [], "metadata": { "paragraph_type": "date", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] } ] }, { - "node_id": "0.20.19", + "node_id": "0.12.19", "text": "", "annotations": [], "metadata": { "paragraph_type": "bibliography_item", "page_id": 0, "line_id": 0, - "other_fields": { - "uid": "bac4e4c5-f290-11ee-a6ed-b88584b4e4a1" - }, - "uid": "bac4e4c5-f290-11ee-a6ed-b88584b4e4a1" + "uid": "109a34ec-0872-11ef-b95c-0242ac120002" }, "subparagraphs": [ { - "node_id": "0.20.19.0", + "node_id": "0.12.19.0", "text": "Masking vs. 
multiparty computation: How large is the gap for the AES?", "annotations": [], "metadata": { "paragraph_type": "title", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.19.1", + "node_id": "0.12.19.1", "text": "Bertoni and Coron", "annotations": [], "metadata": { "paragraph_type": "title_conference_proceedings", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.19.2", - "text": "\nVincentGrosso\n", + "node_id": "0.12.19.2", + "text": "Vincent Grosso", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.19.3", - "text": "\nFrançois-XavierStandaert\n", + "node_id": "0.12.19.3", + "text": "François-Xavier Standaert", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.19.4", - "text": "\nSebastianFaust\n", + "node_id": "0.12.19.4", + "text": "Sebastian Faust", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.19.5", + "node_id": "0.12.19.5", "text": "4", "annotations": [], "metadata": { "paragraph_type": "biblScope_volume", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.19.6", + "node_id": "0.12.19.6", "text": "400-416", "annotations": [], "metadata": { "paragraph_type": "biblScope_page", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] } ] }, { - "node_id": "0.20.20", + "node_id": "0.12.20", "text": "", "annotations": [], "metadata": { "paragraph_type": "bibliography_item", "page_id": 0, "line_id": 0, - "other_fields": { - "uid": "bac4e4cd-f290-11ee-a6ed-b88584b4e4a1" - }, - 
"uid": "bac4e4cd-f290-11ee-a6ed-b88584b4e4a1" + "uid": "109a4482-0872-11ef-b95c-0242ac120002" }, "subparagraphs": [ { - "node_id": "0.20.20.0", + "node_id": "0.12.20.0", "text": "An AES smart card implementation resistant to power analysis attacks", "annotations": [], "metadata": { "paragraph_type": "title", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.20.1", + "node_id": "0.12.20.1", "text": "Lecture Notes in Computer Science", "annotations": [], "metadata": { "paragraph_type": "title_series", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.20.2", - "text": "\nChristophHerbst\n", + "node_id": "0.12.20.2", + "text": "Christoph Herbst", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.20.3", - "text": "\nElisabethOswald\n", + "node_id": "0.12.20.3", + "text": "Elisabeth Oswald", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.20.4", - "text": "\nStefanMangard\n", + "node_id": "0.12.20.4", + "text": "Stefan Mangard", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.20.5", + "node_id": "0.12.20.5", "text": "3989", "annotations": [], "metadata": { "paragraph_type": "biblScope_volume", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.20.6", + "node_id": "0.12.20.6", "text": "239-252", "annotations": [], "metadata": { "paragraph_type": "biblScope_page", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.20.7", + "node_id": "0.12.20.7", "text": 
"2006", "annotations": [], "metadata": { "paragraph_type": "date", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] } ] }, { - "node_id": "0.20.21", + "node_id": "0.12.21", "text": "", "annotations": [], "metadata": { "paragraph_type": "bibliography_item", "page_id": 0, "line_id": 0, - "other_fields": { - "uid": "bac4e4d6-f290-11ee-a6ed-b88584b4e4a1" - }, - "uid": "bac4e4d6-f290-11ee-a6ed-b88584b4e4a1" + "uid": "109a7a92-0872-11ef-b95c-0242ac120002" }, "subparagraphs": [ { - "node_id": "0.20.21.0", + "node_id": "0.12.21.0", "text": "Private circuits: Securing hardware against probing attacks", "annotations": [], "metadata": { "paragraph_type": "title", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.21.1", + "node_id": "0.12.21.1", "text": "Lecture Notes in Computer Science", "annotations": [], "metadata": { "paragraph_type": "title_series", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.21.2", - "text": "\nYuvalIshai\n", + "node_id": "0.12.21.2", + "text": "Yuval Ishai", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.21.3", - "text": "\nAmitSahai\n", + "node_id": "0.12.21.3", + "text": "Amit Sahai", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.21.4", - "text": "\nDavidWagner\n", + "node_id": "0.12.21.4", + "text": "David Wagner", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.21.5", + "node_id": "0.12.21.5", "text": "2729", "annotations": [], "metadata": { "paragraph_type": "biblScope_volume", "page_id": 0, - "line_id": 
0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.21.6", + "node_id": "0.12.21.6", "text": "463-481", "annotations": [], "metadata": { "paragraph_type": "biblScope_page", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.21.7", + "node_id": "0.12.21.7", "text": "Springer", "annotations": [], "metadata": { "paragraph_type": "publisher", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.21.8", + "node_id": "0.12.21.8", "text": "2003", "annotations": [], "metadata": { "paragraph_type": "date", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] } ] }, { - "node_id": "0.20.22", + "node_id": "0.12.22", "text": "", "annotations": [], "metadata": { "paragraph_type": "bibliography_item", "page_id": 0, "line_id": 0, - "other_fields": { - "uid": "bac4e4e0-f290-11ee-a6ed-b88584b4e4a1" - }, - "uid": "bac4e4e0-f290-11ee-a6ed-b88584b4e4a1" + "uid": "109aa800-0872-11ef-b95c-0242ac120002" }, "subparagraphs": [ { - "node_id": "0.20.22.0", + "node_id": "0.12.22.0", "text": "Advances in Cryptology -EU-ROCRYPT 2013, 32nd Annual International Conference on the Theory and Applications of Cryptographic Techniques", "annotations": [], "metadata": { "paragraph_type": "title_conference_proceedings", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.22.1", + "node_id": "0.12.22.1", "text": "Lecture Notes in Computer Science", "annotations": [], "metadata": { "paragraph_type": "title_series", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.22.2", - "text": "\nThomasJohansson\n", + "node_id": "0.12.22.2", + "text": "Thomas Johansson", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, 
"subparagraphs": [] }, { - "node_id": "0.20.22.3", - "text": "\nPhongQNguyen\n", + "node_id": "0.12.22.3", + "text": "Phong Q Nguyen", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.22.4", + "node_id": "0.12.22.4", "text": "7881", "annotations": [], "metadata": { "paragraph_type": "biblScope_volume", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.22.5", + "node_id": "0.12.22.5", "text": "Springer", "annotations": [], "metadata": { "paragraph_type": "publisher", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.22.6", + "node_id": "0.12.22.6", "text": "May 26-30, 2013. 2013", "annotations": [], "metadata": { "paragraph_type": "date", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] } ] }, { - "node_id": "0.20.23", + "node_id": "0.12.23", "text": "", "annotations": [], "metadata": { "paragraph_type": "bibliography_item", "page_id": 0, "line_id": 0, - "other_fields": { - "uid": "bac4e4e8-f290-11ee-a6ed-b88584b4e4a1" - }, - "uid": "bac4e4e8-f290-11ee-a6ed-b88584b4e4a1" + "uid": "109acb32-0872-11ef-b95c-0242ac120002" }, "subparagraphs": [ { - "node_id": "0.20.23.0", + "node_id": "0.12.23.0", "text": "Advances in Cryptology -EUROCRYPT 2009, 28th Annual International Conference on the Theory and Applications of Cryptographic Techniques", "annotations": [], "metadata": { "paragraph_type": "title_conference_proceedings", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.23.1", + "node_id": "0.12.23.1", "text": "Lecture Notes in Computer Science", "annotations": [], "metadata": { "paragraph_type": "title_series", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": 
"0.20.23.2", + "node_id": "0.12.23.2", "text": "5479", "annotations": [], "metadata": { "paragraph_type": "biblScope_volume", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.23.3", + "node_id": "0.12.23.3", "text": "Springer", "annotations": [], "metadata": { "paragraph_type": "publisher", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.23.4", + "node_id": "0.12.23.4", "text": "April 26-30, 2009. 2009", "annotations": [], "metadata": { "paragraph_type": "date", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] } ] }, { - "node_id": "0.20.24", + "node_id": "0.12.24", "text": "", "annotations": [], "metadata": { "paragraph_type": "bibliography_item", "page_id": 0, "line_id": 0, - "other_fields": { - "uid": "bac4e4ee-f290-11ee-a6ed-b88584b4e4a1" - }, - "uid": "bac4e4ee-f290-11ee-a6ed-b88584b4e4a1" + "uid": "109ae9dc-0872-11ef-b95c-0242ac120002" }, "subparagraphs": [ { - "node_id": "0.20.24.0", + "node_id": "0.12.24.0", "text": "Evaluation of DPA characteristics of sasebo for board level simulation", "annotations": [], "metadata": { "paragraph_type": "title", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.24.1", + "node_id": "0.12.24.1", "text": "proceedings of COSADE 2010, 4 pages", "annotations": [], "metadata": { "paragraph_type": "title_conference_proceedings", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.24.2", - "text": "\nToshihiroKatashita\n", + "node_id": "0.12.24.2", + "text": "Toshihiro Katashita", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.24.3", - "text": "\nAkashiSatoh\n", + "node_id": "0.12.24.3", + "text": "Akashi Satoh", 
"annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.24.4", - "text": "\nKatsuyaKikuchi\n", + "node_id": "0.12.24.4", + "text": "Katsuya Kikuchi", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.24.5", - "text": "\nHiroshiNakagawa\n", + "node_id": "0.12.24.5", + "text": "Hiroshi Nakagawa", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.24.6", - "text": "\nMasahiroAoyagi\n", + "node_id": "0.12.24.6", + "text": "Masahiro Aoyagi", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.24.7", + "node_id": "0.12.24.7", "text": "February 2011", "annotations": [], "metadata": { "paragraph_type": "date", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] } ] }, { - "node_id": "0.20.25", + "node_id": "0.12.25", "text": "", "annotations": [], "metadata": { "paragraph_type": "bibliography_item", "page_id": 0, "line_id": 0, - "other_fields": { - "uid": "bac4e4f7-f290-11ee-a6ed-b88584b4e4a1" - }, - "uid": "bac4e4f7-f290-11ee-a6ed-b88584b4e4a1" + "uid": "109b1452-0872-11ef-b95c-0242ac120002" }, "subparagraphs": [ { - "node_id": "0.20.25.0", + "node_id": "0.12.25.0", "text": "Towards green cryptography: A comparison of lightweight ciphers from the energy viewpoint", "annotations": [], "metadata": { "paragraph_type": "title", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.25.1", + "node_id": "0.12.25.1", "text": "Prouff and Schaumont", "annotations": [], "metadata": { "paragraph_type": 
"title_conference_proceedings", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.25.2", - "text": "\nStéphanieKerckhof\n", + "node_id": "0.12.25.2", + "text": "Stéphanie Kerckhof", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.25.3", - "text": "\nFrançoisDurvaux\n", + "node_id": "0.12.25.3", + "text": "François Durvaux", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.25.4", - "text": "\nCédricHocquet\n", + "node_id": "0.12.25.4", + "text": "Cédric Hocquet", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.25.5", - "text": "\nDavidBol\n", + "node_id": "0.12.25.5", + "text": "David Bol", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.25.6", - "text": "\nFrançois-XavierStandaert\n", + "node_id": "0.12.25.6", + "text": "François-Xavier Standaert", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.25.7", + "node_id": "0.12.25.7", "text": "46", "annotations": [], "metadata": { "paragraph_type": "biblScope_volume", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.25.8", + "node_id": "0.12.25.8", "text": "390-407", "annotations": [], "metadata": { "paragraph_type": "biblScope_page", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] } ] }, { - "node_id": "0.20.26", + "node_id": "0.12.26", 
"text": "", "annotations": [], "metadata": { "paragraph_type": "bibliography_item", "page_id": 0, "line_id": 0, - "other_fields": { - "uid": "bac4e501-f290-11ee-a6ed-b88584b4e4a1" - }, - "uid": "bac4e501-f290-11ee-a6ed-b88584b4e4a1" + "uid": "109b29c4-0872-11ef-b95c-0242ac120002" }, "subparagraphs": [ { - "node_id": "0.20.26.0", + "node_id": "0.12.26.0", "text": "Leak resistant cryptographic indexed key update", "annotations": [], "metadata": { "paragraph_type": "title_conference_proceedings", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.26.1", - "text": "\nCPaul\n", + "node_id": "0.12.26.1", + "text": "C Paul", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.26.2", - "text": "\nKocher\n", + "node_id": "0.12.26.2", + "text": "Kocher", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] } ] }, { - "node_id": "0.20.27", + "node_id": "0.12.27", "text": "", "annotations": [], "metadata": { "paragraph_type": "bibliography_item", "page_id": 0, "line_id": 0, - "other_fields": { - "uid": "bac4e505-f290-11ee-a6ed-b88584b4e4a1" - }, - "uid": "bac4e505-f290-11ee-a6ed-b88584b4e4a1" + "uid": "109b3b6c-0872-11ef-b95c-0242ac120002" }, "subparagraphs": [ { - "node_id": "0.20.27.0", + "node_id": "0.12.27.0", "text": "Differential power analysis", "annotations": [], "metadata": { "paragraph_type": "title", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.27.1", + "node_id": "0.12.27.1", "text": "Wiener", "annotations": [], "metadata": { "paragraph_type": "title_conference_proceedings", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.27.2", - "text": "\nCPaul\n", + 
"node_id": "0.12.27.2", + "text": "C Paul", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.27.3", - "text": "\nJoshuaKocher\n", + "node_id": "0.12.27.3", + "text": "Joshua Kocher", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.27.4", - "text": "\nBenjaminJaffe\n", + "node_id": "0.12.27.4", + "text": "Benjamin Jaffe", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.27.5", - "text": "\nJun\n", + "node_id": "0.12.27.5", + "text": "Jun", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.27.6", + "node_id": "0.12.27.6", "text": "63", "annotations": [], "metadata": { "paragraph_type": "biblScope_volume", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.27.7", + "node_id": "0.12.27.7", "text": "388-397", "annotations": [], "metadata": { "paragraph_type": "biblScope_page", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] } ] }, { - "node_id": "0.20.28", + "node_id": "0.12.28", "text": "", "annotations": [], "metadata": { "paragraph_type": "bibliography_item", "page_id": 0, "line_id": 0, - "other_fields": { - "uid": "bac4e50e-f290-11ee-a6ed-b88584b4e4a1" - }, - "uid": "bac4e50e-f290-11ee-a6ed-b88584b4e4a1" + "uid": "109b5dfe-0872-11ef-b95c-0242ac120002" }, "subparagraphs": [ { - "node_id": "0.20.28.0", + "node_id": "0.12.28.0", "text": "Hardware countermeasures against DPA ? 
a statistical analysis of their effectiveness", "annotations": [], "metadata": { "paragraph_type": "title", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.28.1", + "node_id": "0.12.28.1", "text": "Lecture Notes in Computer Science", "annotations": [], "metadata": { "paragraph_type": "title_series", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.28.2", - "text": "\nStefanMangard\n", + "node_id": "0.12.28.2", + "text": "Stefan Mangard", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.28.3", + "node_id": "0.12.28.3", "text": "2964", "annotations": [], "metadata": { "paragraph_type": "biblScope_volume", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.28.4", + "node_id": "0.12.28.4", "text": "222-235", "annotations": [], "metadata": { "paragraph_type": "biblScope_page", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.28.5", + "node_id": "0.12.28.5", "text": "Springer", "annotations": [], "metadata": { "paragraph_type": "publisher", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.28.6", + "node_id": "0.12.28.6", "text": "2004", "annotations": [], "metadata": { "paragraph_type": "date", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] } ] }, { - "node_id": "0.20.29", + "node_id": "0.12.29", "text": "", "annotations": [], "metadata": { "paragraph_type": "bibliography_item", "page_id": 0, "line_id": 0, - "other_fields": { - "uid": "bac4e516-f290-11ee-a6ed-b88584b4e4a1" - }, - "uid": "bac4e516-f290-11ee-a6ed-b88584b4e4a1" + "uid": "109b8ad6-0872-11ef-b95c-0242ac120002" }, "subparagraphs": [ 
{ - "node_id": "0.20.29.0", + "node_id": "0.12.29.0", "text": "Power analysis attacksrevealing the secrets of smart cards", "annotations": [], "metadata": { "paragraph_type": "title_conference_proceedings", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.29.1", - "text": "\nStefanMangard\n", + "node_id": "0.12.29.1", + "text": "Stefan Mangard", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.29.2", - "text": "\nElisabethOswald\n", + "node_id": "0.12.29.2", + "text": "Elisabeth Oswald", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.29.3", - "text": "\nThomasPopp\n", + "node_id": "0.12.29.3", + "text": "Thomas Popp", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.29.4", + "node_id": "0.12.29.4", "text": "Springer", "annotations": [], "metadata": { "paragraph_type": "publisher", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.29.5", + "node_id": "0.12.29.5", "text": "2007", "annotations": [], "metadata": { "paragraph_type": "date", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] } ] }, { - "node_id": "0.20.30", + "node_id": "0.12.30", "text": "", "annotations": [], "metadata": { "paragraph_type": "bibliography_item", "page_id": 0, "line_id": 0, - "other_fields": { - "uid": "bac4e51d-f290-11ee-a6ed-b88584b4e4a1" - }, - "uid": "bac4e51d-f290-11ee-a6ed-b88584b4e4a1" + "uid": "109baa66-0872-11ef-b95c-0242ac120002" }, "subparagraphs": [ { - "node_id": "0.20.30.0", + "node_id": "0.12.30.0", "text": "One for allall for one: unifying 
standard differential power analysis attacks", "annotations": [], "metadata": { "paragraph_type": "title", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.30.1", + "node_id": "0.12.30.1", "text": "IET Information Security", "annotations": [], "metadata": { "paragraph_type": "title_journal", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.30.2", - "text": "\nStefanMangard\n", + "node_id": "0.12.30.2", + "text": "Stefan Mangard", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.30.3", - "text": "\nElisabethOswald\n", + "node_id": "0.12.30.3", + "text": "Elisabeth Oswald", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.30.4", - "text": "\nFrançois-XavierStandaert\n", + "node_id": "0.12.30.4", + "text": "François-Xavier Standaert", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.30.5", + "node_id": "0.12.30.5", "text": "5", "annotations": [], "metadata": { "paragraph_type": "biblScope_volume", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.30.6", + "node_id": "0.12.30.6", "text": "100-110", "annotations": [], "metadata": { "paragraph_type": "biblScope_page", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.30.7", + "node_id": "0.12.30.7", "text": "2011", "annotations": [], "metadata": { "paragraph_type": "date", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] } ] }, { - "node_id": "0.20.31", + 
"node_id": "0.12.31", "text": "", "annotations": [], "metadata": { "paragraph_type": "bibliography_item", "page_id": 0, "line_id": 0, - "other_fields": { - "uid": "bac4e526-f290-11ee-a6ed-b88584b4e4a1" - }, - "uid": "bac4e526-f290-11ee-a6ed-b88584b4e4a1" + "uid": "109bcd5c-0872-11ef-b95c-0242ac120002" }, "subparagraphs": [ { - "node_id": "0.20.31.0", + "node_id": "0.12.31.0", "text": "Side-channel leakage of masked cmos gates", "annotations": [], "metadata": { "paragraph_type": "title", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.31.1", + "node_id": "0.12.31.1", "text": "Lecture Notes in Computer Science", "annotations": [], "metadata": { "paragraph_type": "title_series", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.31.2", - "text": "\nStefanMangard\n", + "node_id": "0.12.31.2", + "text": "Stefan Mangard", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.31.3", - "text": "\nThomasPopp\n", + "node_id": "0.12.31.3", + "text": "Thomas Popp", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.31.4", - "text": "\nMBerndt\n", + "node_id": "0.12.31.4", + "text": "M Berndt", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.31.5", - "text": "\nGammel\n", + "node_id": "0.12.31.5", + "text": "Gammel", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.31.6", + "node_id": "0.12.31.6", "text": "3376", "annotations": [], "metadata": { "paragraph_type": 
"biblScope_volume", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.31.7", + "node_id": "0.12.31.7", "text": "351-365", "annotations": [], "metadata": { "paragraph_type": "biblScope_page", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.31.8", + "node_id": "0.12.31.8", "text": "Springer", "annotations": [], "metadata": { "paragraph_type": "publisher", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.31.9", + "node_id": "0.12.31.9", "text": "2005", "annotations": [], "metadata": { "paragraph_type": "date", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] } ] }, { - "node_id": "0.20.32", + "node_id": "0.12.32", "text": "", "annotations": [], "metadata": { "paragraph_type": "bibliography_item", "page_id": 0, "line_id": 0, - "other_fields": { - "uid": "bac4e531-f290-11ee-a6ed-b88584b4e4a1" - }, - "uid": "bac4e531-f290-11ee-a6ed-b88584b4e4a1" + "uid": "109bf85e-0872-11ef-b95c-0242ac120002" }, "subparagraphs": [ { - "node_id": "0.20.32.0", + "node_id": "0.12.32.0", "text": "Successfully attacking masked AES hardware implementations", "annotations": [], "metadata": { "paragraph_type": "title", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.32.1", + "node_id": "0.12.32.1", "text": "Rao and Sunar", "annotations": [], "metadata": { "paragraph_type": "title_conference_proceedings", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.32.2", - "text": "\nStefanMangard\n", + "node_id": "0.12.32.2", + "text": "Stefan Mangard", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.32.3", - "text": 
"\nNorbertPramstaller\n", + "node_id": "0.12.32.3", + "text": "Norbert Pramstaller", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.32.4", - "text": "\nElisabethOswald\n", + "node_id": "0.12.32.4", + "text": "Elisabeth Oswald", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.32.5", + "node_id": "0.12.32.5", "text": "47", "annotations": [], "metadata": { "paragraph_type": "biblScope_volume", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.32.6", + "node_id": "0.12.32.6", "text": "157-171", "annotations": [], "metadata": { "paragraph_type": "biblScope_page", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] } ] }, { - "node_id": "0.20.33", + "node_id": "0.12.33", "text": "", "annotations": [], "metadata": { "paragraph_type": "bibliography_item", "page_id": 0, "line_id": 0, - "other_fields": { - "uid": "bac4e539-f290-11ee-a6ed-b88584b4e4a1" - }, - "uid": "bac4e539-f290-11ee-a6ed-b88584b4e4a1" + "uid": "109c08ee-0872-11ef-b95c-0242ac120002" }, "subparagraphs": [ { - "node_id": "0.20.33.0", + "node_id": "0.12.33.0", "text": "Towards superexponential side-channel security with efficient leakage-resilient PRFs", "annotations": [], "metadata": { "paragraph_type": "title", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.33.1", + "node_id": "0.12.33.1", "text": "Prouff and Schaumont", "annotations": [], "metadata": { "paragraph_type": "title_conference_proceedings", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.33.2", - "text": "\nMarcelMedwed\n", + "node_id": "0.12.33.2", + "text": "Marcel 
Medwed", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.33.3", - "text": "\nFrançois-XavierStandaert\n", + "node_id": "0.12.33.3", + "text": "François-Xavier Standaert", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.33.4", - "text": "\nAntoineJoux\n", + "node_id": "0.12.33.4", + "text": "Antoine Joux", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.33.5", + "node_id": "0.12.33.5", "text": "46", "annotations": [], "metadata": { "paragraph_type": "biblScope_volume", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.33.6", + "node_id": "0.12.33.6", "text": "193-212", "annotations": [], "metadata": { "paragraph_type": "biblScope_page", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] } ] }, { - "node_id": "0.20.34", + "node_id": "0.12.34", "text": "", "annotations": [], "metadata": { "paragraph_type": "bibliography_item", "page_id": 0, "line_id": 0, - "other_fields": { - "uid": "bac4e541-f290-11ee-a6ed-b88584b4e4a1" - }, - "uid": "bac4e541-f290-11ee-a6ed-b88584b4e4a1" + "uid": "109c2608-0872-11ef-b95c-0242ac120002" }, "subparagraphs": [ { - "node_id": "0.20.34.0", + "node_id": "0.12.34.0", "text": "Glitch-free implementation of masking in modern FPGAs", "annotations": [], "metadata": { "paragraph_type": "title", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.34.1", + "node_id": "0.12.34.1", "text": "HOST", "annotations": [], "metadata": { "paragraph_type": "title_conference_proceedings", "page_id": 0, - "line_id": 0, - 
"other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.34.2", - "text": "\nAmirMoradi\n", + "node_id": "0.12.34.2", + "text": "Amir Moradi", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.34.3", - "text": "\nOliverMischke\n", + "node_id": "0.12.34.3", + "text": "Oliver Mischke", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.34.4", + "node_id": "0.12.34.4", "text": "89-95", "annotations": [], "metadata": { "paragraph_type": "biblScope_page", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.34.5", + "node_id": "0.12.34.5", "text": "IEEE", "annotations": [], "metadata": { "paragraph_type": "publisher", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.34.6", + "node_id": "0.12.34.6", "text": "2012", "annotations": [], "metadata": { "paragraph_type": "date", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] } ] }, { - "node_id": "0.20.35", + "node_id": "0.12.35", "text": "", "annotations": [], "metadata": { "paragraph_type": "bibliography_item", "page_id": 0, "line_id": 0, - "other_fields": { - "uid": "bac4e549-f290-11ee-a6ed-b88584b4e4a1" - }, - "uid": "bac4e549-f290-11ee-a6ed-b88584b4e4a1" + "uid": "109c4dea-0872-11ef-b95c-0242ac120002" }, "subparagraphs": [ { - "node_id": "0.20.35.0", + "node_id": "0.12.35.0", "text": "Pushing the limits: A very compact and a threshold implementation of AES", "annotations": [], "metadata": { "paragraph_type": "title_conference_proceedings", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.35.1", - "text": "\nAmirMoradi\n", + 
"node_id": "0.12.35.1", + "text": "Amir Moradi", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.35.2", - "text": "\nAxelPoschmann\n", + "node_id": "0.12.35.2", + "text": "Axel Poschmann", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.35.3", - "text": "\nSanLing\n", + "node_id": "0.12.35.3", + "text": "San Ling", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.35.4", - "text": "\nChristofPaar\n", + "node_id": "0.12.35.4", + "text": "Christof Paar", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.35.5", - "text": "\nHuaxiongWang\n", + "node_id": "0.12.35.5", + "text": "Huaxiong Wang", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.35.6", + "node_id": "0.12.35.6", "text": "69-88", "annotations": [], "metadata": { "paragraph_type": "biblScope_page", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] } ] }, { - "node_id": "0.20.36", + "node_id": "0.12.36", "text": "", "annotations": [], "metadata": { "paragraph_type": "bibliography_item", "page_id": 0, "line_id": 0, - "other_fields": { - "uid": "bac4e551-f290-11ee-a6ed-b88584b4e4a1" - }, - "uid": "bac4e551-f290-11ee-a6ed-b88584b4e4a1" + "uid": "109c6370-0872-11ef-b95c-0242ac120002" }, "subparagraphs": [ { - "node_id": "0.20.36.0", + "node_id": "0.12.36.0", "text": "A side-channel analysis resistant description of the AES S-Box", "annotations": [], "metadata": { 
"paragraph_type": "title", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.36.1", + "node_id": "0.12.36.1", "text": "Lecture Notes in Computer Science", "annotations": [], "metadata": { "paragraph_type": "title_series", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.36.2", - "text": "\nElisabethOswald\n", + "node_id": "0.12.36.2", + "text": "Elisabeth Oswald", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.36.3", - "text": "\nStefanMangard\n", + "node_id": "0.12.36.3", + "text": "Stefan Mangard", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.36.4", - "text": "\nNorbertPramstaller\n", + "node_id": "0.12.36.4", + "text": "Norbert Pramstaller", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.36.5", - "text": "\nVincentRijmen\n", + "node_id": "0.12.36.5", + "text": "Vincent Rijmen", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.36.6", + "node_id": "0.12.36.6", "text": "3557", "annotations": [], "metadata": { "paragraph_type": "biblScope_volume", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.36.7", + "node_id": "0.12.36.7", "text": "413-423", "annotations": [], "metadata": { "paragraph_type": "biblScope_page", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.36.8", + "node_id": "0.12.36.8", "text": "Springer", 
"annotations": [], "metadata": { "paragraph_type": "publisher", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.36.9", + "node_id": "0.12.36.9", "text": "2005", "annotations": [], "metadata": { "paragraph_type": "date", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] } ] }, { - "node_id": "0.20.37", + "node_id": "0.12.37", "text": "", "annotations": [], "metadata": { "paragraph_type": "bibliography_item", "page_id": 0, "line_id": 0, - "other_fields": { - "uid": "bac4e55c-f290-11ee-a6ed-b88584b4e4a1" - }, - "uid": "bac4e55c-f290-11ee-a6ed-b88584b4e4a1" + "uid": "109ca5c4-0872-11ef-b95c-0242ac120002" }, "subparagraphs": [ { - "node_id": "0.20.37.0", + "node_id": "0.12.37.0", "text": "Cryptographic Hardware and Embedded Systems -CHES 2007, 9th International Workshop", "annotations": [], "metadata": { "paragraph_type": "title_conference_proceedings", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.37.1", + "node_id": "0.12.37.1", "text": "Lecture Notes in Computer Science", "annotations": [], "metadata": { "paragraph_type": "title_series", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.37.2", + "node_id": "0.12.37.2", "text": "4727", "annotations": [], "metadata": { "paragraph_type": "biblScope_volume", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.37.3", + "node_id": "0.12.37.3", "text": "Springer", "annotations": [], "metadata": { "paragraph_type": "publisher", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.37.4", + "node_id": "0.12.37.4", "text": "September 10-13, 2007. 
2007", "annotations": [], "metadata": { "paragraph_type": "date", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] } ] }, { - "node_id": "0.20.38", + "node_id": "0.12.38", "text": "", "annotations": [], "metadata": { "paragraph_type": "bibliography_item", "page_id": 0, "line_id": 0, - "other_fields": { - "uid": "bac4e562-f290-11ee-a6ed-b88584b4e4a1" - }, - "uid": "bac4e562-f290-11ee-a6ed-b88584b4e4a1" + "uid": "109cc388-0872-11ef-b95c-0242ac120002" }, "subparagraphs": [ { - "node_id": "0.20.38.0", + "node_id": "0.12.38.0", "text": "Advances in Cryptology -EUROCRYPT 2011 -30th Annual International Conference on the Theory and Applications of Cryptographic Techniques", "annotations": [], "metadata": { "paragraph_type": "title_conference_proceedings", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.38.1", + "node_id": "0.12.38.1", "text": "Lecture Notes in Computer Science", "annotations": [], "metadata": { "paragraph_type": "title_series", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.38.2", + "node_id": "0.12.38.2", "text": "6632", "annotations": [], "metadata": { "paragraph_type": "biblScope_volume", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.38.3", + "node_id": "0.12.38.3", "text": "Springer", "annotations": [], "metadata": { "paragraph_type": "publisher", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.38.4", + "node_id": "0.12.38.4", "text": "May 15-19, 2011. 
2011", "annotations": [], "metadata": { "paragraph_type": "date", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] } ] }, { - "node_id": "0.20.39", + "node_id": "0.12.39", "text": "", "annotations": [], "metadata": { "paragraph_type": "bibliography_item", "page_id": 0, "line_id": 0, - "other_fields": { - "uid": "bac4e568-f290-11ee-a6ed-b88584b4e4a1" - }, - "uid": "bac4e568-f290-11ee-a6ed-b88584b4e4a1" + "uid": "109ce200-0872-11ef-b95c-0242ac120002" }, "subparagraphs": [ { - "node_id": "0.20.39.0", + "node_id": "0.12.39.0", "text": "Improved higher-order side-channel attacks with FPGA experiments", "annotations": [], "metadata": { "paragraph_type": "title", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.39.1", + "node_id": "0.12.39.1", "text": "Rao and Sunar", "annotations": [], "metadata": { "paragraph_type": "title_conference_proceedings", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.39.2", - "text": "\nEricPeeters\n", + "node_id": "0.12.39.2", + "text": "Eric Peeters", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.39.3", - "text": "\nFrançois-XavierStandaert\n", + "node_id": "0.12.39.3", + "text": "François-Xavier Standaert", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.39.4", - "text": "\nNicolasDonckers\n", + "node_id": "0.12.39.4", + "text": "Nicolas Donckers", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.39.5", - "text": "\nJean-JacquesQuisquater\n", + "node_id": "0.12.39.5", + "text": "Jean-Jacques 
Quisquater", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.39.6", + "node_id": "0.12.39.6", "text": "47", "annotations": [], "metadata": { "paragraph_type": "biblScope_volume", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.39.7", + "node_id": "0.12.39.7", "text": "309-323", "annotations": [], "metadata": { "paragraph_type": "biblScope_page", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] } ] }, { - "node_id": "0.20.40", + "node_id": "0.12.40", "text": "", "annotations": [], "metadata": { "paragraph_type": "bibliography_item", "page_id": 0, "line_id": 0, - "other_fields": { - "uid": "bac4e571-f290-11ee-a6ed-b88584b4e4a1" - }, - "uid": "bac4e571-f290-11ee-a6ed-b88584b4e4a1" + "uid": "109cf646-0872-11ef-b95c-0242ac120002" }, "subparagraphs": [ { - "node_id": "0.20.40.0", + "node_id": "0.12.40.0", "text": "A leakage-resilient mode of operation", "annotations": [], "metadata": { "paragraph_type": "title_conference_proceedings", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.40.1", - "text": "\nKrzysztofPietrzak\n", + "node_id": "0.12.40.1", + "text": "Krzysztof Pietrzak", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.40.2", + "node_id": "0.12.40.2", "text": "462-482", "annotations": [], "metadata": { "paragraph_type": "biblScope_page", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] } ] }, { - "node_id": "0.20.41", + "node_id": "0.12.41", "text": "", "annotations": [], "metadata": { "paragraph_type": "bibliography_item", "page_id": 0, "line_id": 0, - "other_fields": { - "uid": 
"bac4e575-f290-11ee-a6ed-b88584b4e4a1" - }, - "uid": "bac4e575-f290-11ee-a6ed-b88584b4e4a1" + "uid": "109d0ba4-0872-11ef-b95c-0242ac120002" }, "subparagraphs": [ { - "node_id": "0.20.41.0", + "node_id": "0.12.41.0", "text": "Evaluation of the masked logic style mdpl on a prototype chip", "annotations": [], "metadata": { "paragraph_type": "title", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.41.1", + "node_id": "0.12.41.1", "text": "Paillier and Verbauwhede", "annotations": [], "metadata": { "paragraph_type": "title_conference_proceedings", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.41.2", - "text": "\nThomasPopp\n", + "node_id": "0.12.41.2", + "text": "Thomas Popp", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.41.3", - "text": "\nMarioKirschbaum\n", + "node_id": "0.12.41.3", + "text": "Mario Kirschbaum", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.41.4", - "text": "\nThomasZefferer\n", + "node_id": "0.12.41.4", + "text": "Thomas Zefferer", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.41.5", - "text": "\nStefanMangard\n", + "node_id": "0.12.41.5", + "text": "Stefan Mangard", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.41.6", + "node_id": "0.12.41.6", "text": "38", "annotations": [], "metadata": { "paragraph_type": "biblScope_volume", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] 
}, { - "node_id": "0.20.41.7", + "node_id": "0.12.41.7", "text": "81-94", "annotations": [], "metadata": { "paragraph_type": "biblScope_page", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] } ] }, { - "node_id": "0.20.42", + "node_id": "0.12.42", "text": "", "annotations": [], "metadata": { "paragraph_type": "bibliography_item", "page_id": 0, "line_id": 0, - "other_fields": { - "uid": "bac4e57e-f290-11ee-a6ed-b88584b4e4a1" - }, - "uid": "bac4e57e-f290-11ee-a6ed-b88584b4e4a1" + "uid": "109d2ba2-0872-11ef-b95c-0242ac120002" }, "subparagraphs": [ { - "node_id": "0.20.42.0", + "node_id": "0.12.42.0", "text": "Cryptographic Hardware and Embedded Systems -CHES 2011 -13th International Workshop", "annotations": [], "metadata": { "paragraph_type": "title_conference_proceedings", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.42.1", + "node_id": "0.12.42.1", "text": "Lecture Notes in Computer Science", "annotations": [], "metadata": { "paragraph_type": "title_series", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.42.2", + "node_id": "0.12.42.2", "text": "6917", "annotations": [], "metadata": { "paragraph_type": "biblScope_volume", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.42.3", + "node_id": "0.12.42.3", "text": "Springer", "annotations": [], "metadata": { "paragraph_type": "publisher", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.42.4", + "node_id": "0.12.42.4", "text": "September 28 -October 1, 2011. 
2011", "annotations": [], "metadata": { "paragraph_type": "date", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] } ] }, { - "node_id": "0.20.43", + "node_id": "0.12.43", "text": "", "annotations": [], "metadata": { "paragraph_type": "bibliography_item", "page_id": 0, "line_id": 0, - "other_fields": { - "uid": "bac4e584-f290-11ee-a6ed-b88584b4e4a1" - }, - "uid": "bac4e584-f290-11ee-a6ed-b88584b4e4a1" + "uid": "109d578a-0872-11ef-b95c-0242ac120002" }, "subparagraphs": [ { - "node_id": "0.20.43.0", + "node_id": "0.12.43.0", "text": "Masking against side-channel attacks: A formal security proof", "annotations": [], "metadata": { "paragraph_type": "title", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.43.1", + "node_id": "0.12.43.1", "text": "Johansson and Nguyen", "annotations": [], "metadata": { "paragraph_type": "title_conference_proceedings", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.43.2", - "text": "\nEmmanuelProuff\n", + "node_id": "0.12.43.2", + "text": "Emmanuel Prouff", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.43.3", - "text": "\nMatthieuRivain\n", + "node_id": "0.12.43.3", + "text": "Matthieu Rivain", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.43.4", + "node_id": "0.12.43.4", "text": "23", "annotations": [], "metadata": { "paragraph_type": "biblScope_volume", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.43.5", + "node_id": "0.12.43.5", "text": "142-159", "annotations": [], "metadata": { "paragraph_type": "biblScope_page", "page_id": 0, - "line_id": 0, - 
"other_fields": {} + "line_id": 0 }, "subparagraphs": [] } ] }, { - "node_id": "0.20.44", + "node_id": "0.12.44", "text": "", "annotations": [], "metadata": { "paragraph_type": "bibliography_item", "page_id": 0, "line_id": 0, - "other_fields": { - "uid": "bac4e58b-f290-11ee-a6ed-b88584b4e4a1" - }, - "uid": "bac4e58b-f290-11ee-a6ed-b88584b4e4a1" + "uid": "109d73e6-0872-11ef-b95c-0242ac120002" }, "subparagraphs": [ { - "node_id": "0.20.44.0", + "node_id": "0.12.44.0", "text": "Higher-order glitches free implementation of the AES using secure multi-party computation protocols", "annotations": [], "metadata": { "paragraph_type": "title_conference_proceedings", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.44.1", - "text": "\nEmmanuelProuff\n", + "node_id": "0.12.44.1", + "text": "Emmanuel Prouff", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.44.2", - "text": "\nThomasRoche\n", + "node_id": "0.12.44.2", + "text": "Thomas Roche", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.44.3", + "node_id": "0.12.44.3", "text": "43", "annotations": [], "metadata": { "paragraph_type": "biblScope_volume", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.44.4", + "node_id": "0.12.44.4", "text": "63-78", "annotations": [], "metadata": { "paragraph_type": "biblScope_page", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] } ] }, { - "node_id": "0.20.45", + "node_id": "0.12.45", "text": "", "annotations": [], "metadata": { "paragraph_type": "bibliography_item", "page_id": 0, "line_id": 0, - "other_fields": { - "uid": "bac4e591-f290-11ee-a6ed-b88584b4e4a1" - }, - "uid": 
"bac4e591-f290-11ee-a6ed-b88584b4e4a1" + "uid": "109da014-0872-11ef-b95c-0242ac120002" }, "subparagraphs": [ { - "node_id": "0.20.45.0", + "node_id": "0.12.45.0", "text": "Cryptographic Hardware and Embedded Systems -CHES 2012 -14th International Workshop", "annotations": [], "metadata": { "paragraph_type": "title_conference_proceedings", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.45.1", + "node_id": "0.12.45.1", "text": "Lecture Notes in Computer Science", "annotations": [], "metadata": { "paragraph_type": "title_series", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.45.2", + "node_id": "0.12.45.2", "text": "7428", "annotations": [], "metadata": { "paragraph_type": "biblScope_volume", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.45.3", + "node_id": "0.12.45.3", "text": "Springer", "annotations": [], "metadata": { "paragraph_type": "publisher", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.45.4", + "node_id": "0.12.45.4", "text": "September 9-12, 2012. 
2012", "annotations": [], "metadata": { "paragraph_type": "date", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] } ] }, { - "node_id": "0.20.46", + "node_id": "0.12.46", "text": "", "annotations": [], "metadata": { "paragraph_type": "bibliography_item", "page_id": 0, "line_id": 0, - "other_fields": { - "uid": "bac4e597-f290-11ee-a6ed-b88584b4e4a1" - }, - "uid": "bac4e597-f290-11ee-a6ed-b88584b4e4a1" + "uid": "109dccba-0872-11ef-b95c-0242ac120002" }, "subparagraphs": [ { - "node_id": "0.20.46.0", + "node_id": "0.12.46.0", "text": "Cryptographic Hardware and Embedded Systems -CHES 2005, 7th International Workshop", "annotations": [], "metadata": { "paragraph_type": "title_conference_proceedings", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.46.1", + "node_id": "0.12.46.1", "text": "Lecture Notes in Computer Science", "annotations": [], "metadata": { "paragraph_type": "title_series", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.46.2", - "text": "\nRJosyula\n", + "node_id": "0.12.46.2", + "text": "R Josyula", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.46.3", - "text": "\nBerkRao\n", + "node_id": "0.12.46.3", + "text": "Berk Rao", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.46.4", - "text": "\nSunar\n", + "node_id": "0.12.46.4", + "text": "Sunar", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.46.5", + "node_id": "0.12.46.5", "text": "3659", "annotations": [], "metadata": { "paragraph_type": "biblScope_volume", 
"page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.46.6", + "node_id": "0.12.46.6", "text": "Springer", "annotations": [], "metadata": { "paragraph_type": "publisher", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.46.7", + "node_id": "0.12.46.7", "text": "August 29 -September 1, 2005. 2005", "annotations": [], "metadata": { "paragraph_type": "date", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] } ] }, { - "node_id": "0.20.47", + "node_id": "0.12.47", "text": "", "annotations": [], "metadata": { "paragraph_type": "bibliography_item", "page_id": 0, "line_id": 0, - "other_fields": { - "uid": "bac4e5a0-f290-11ee-a6ed-b88584b4e4a1" - }, - "uid": "bac4e5a0-f290-11ee-a6ed-b88584b4e4a1" + "uid": "109df136-0872-11ef-b95c-0242ac120002" }, "subparagraphs": [ { - "node_id": "0.20.47.0", + "node_id": "0.12.47.0", "text": "FPGA implementations of the AES masked against power analysis attacks", "annotations": [], "metadata": { "paragraph_type": "title", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.47.1", + "node_id": "0.12.47.1", "text": "proceedings of COSADE 2011", "annotations": [], "metadata": { "paragraph_type": "title_conference_proceedings", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.47.2", - "text": "\nFrancescoRegazzoni\n", + "node_id": "0.12.47.2", + "text": "Francesco Regazzoni", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.47.3", - "text": "\nWangYi\n", + "node_id": "0.12.47.3", + "text": "Wang Yi", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, 
"subparagraphs": [] }, { - "node_id": "0.20.47.4", - "text": "\nFrançois-XavierStandaert\n", + "node_id": "0.12.47.4", + "text": "François-Xavier Standaert", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.47.5", + "node_id": "0.12.47.5", "text": "56-66", "annotations": [], "metadata": { "paragraph_type": "biblScope_page", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.47.6", + "node_id": "0.12.47.6", "text": "February 2011", "annotations": [], "metadata": { "paragraph_type": "date", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] } ] }, { - "node_id": "0.20.48", + "node_id": "0.12.48", "text": "", "annotations": [], "metadata": { "paragraph_type": "bibliography_item", "page_id": 0, "line_id": 0, - "other_fields": { - "uid": "bac4e5a8-f290-11ee-a6ed-b88584b4e4a1" - }, - "uid": "bac4e5a8-f290-11ee-a6ed-b88584b4e4a1" + "uid": "109e1472-0872-11ef-b95c-0242ac120002" }, "subparagraphs": [ { - "node_id": "0.20.48.0", + "node_id": "0.12.48.0", "text": "Algebraic side-channel attacks", "annotations": [], "metadata": { "paragraph_type": "title", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.48.1", + "node_id": "0.12.48.1", "text": "Lecture Notes in Computer Science", "annotations": [], "metadata": { "paragraph_type": "title_series", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.48.2", - "text": "\nMathieuRenauld\n", + "node_id": "0.12.48.2", + "text": "Mathieu Renauld", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.48.3", - "text": "\nFrançois-XavierStandaert\n", + "node_id": "0.12.48.3", 
+ "text": "François-Xavier Standaert", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.48.4", + "node_id": "0.12.48.4", "text": "6151", "annotations": [], "metadata": { "paragraph_type": "biblScope_volume", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.48.5", + "node_id": "0.12.48.5", "text": "393-410", "annotations": [], "metadata": { "paragraph_type": "biblScope_page", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.48.6", + "node_id": "0.12.48.6", "text": "Springer", "annotations": [], "metadata": { "paragraph_type": "publisher", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.48.7", + "node_id": "0.12.48.7", "text": "2009", "annotations": [], "metadata": { "paragraph_type": "date", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] } ] }, { - "node_id": "0.20.49", + "node_id": "0.12.49", "text": "", "annotations": [], "metadata": { "paragraph_type": "bibliography_item", "page_id": 0, "line_id": 0, - "other_fields": { - "uid": "bac4e5b1-f290-11ee-a6ed-b88584b4e4a1" - }, - "uid": "bac4e5b1-f290-11ee-a6ed-b88584b4e4a1" + "uid": "109e3df8-0872-11ef-b95c-0242ac120002" }, "subparagraphs": [ { - "node_id": "0.20.49.0", + "node_id": "0.12.49.0", "text": "Algebraic side-channel attacks on the AES: Why time also matters in DPA", "annotations": [], "metadata": { "paragraph_type": "title", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.49.1", + "node_id": "0.12.49.1", "text": "CHES", "annotations": [], "metadata": { "paragraph_type": "title_conference_proceedings", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] 
}, { - "node_id": "0.20.49.2", + "node_id": "0.12.49.2", "text": "Lecture Notes in Computer Science", "annotations": [], "metadata": { "paragraph_type": "title_series", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.49.3", - "text": "\nFrançois-XavierMathieu Renauld\n", + "node_id": "0.12.49.3", + "text": "François-Xavier Mathieu Renauld", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.49.4", - "text": "\nNicolasStandaert\n", + "node_id": "0.12.49.4", + "text": "Nicolas Standaert", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.49.5", - "text": "\nVeyrat-Charvillon\n", + "node_id": "0.12.49.5", + "text": "Veyrat-Charvillon", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.49.6", + "node_id": "0.12.49.6", "text": "5747", "annotations": [], "metadata": { "paragraph_type": "biblScope_volume", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.49.7", + "node_id": "0.12.49.7", "text": "97-111", "annotations": [], "metadata": { "paragraph_type": "biblScope_page", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.49.8", + "node_id": "0.12.49.8", "text": "Springer", "annotations": [], "metadata": { "paragraph_type": "publisher", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.49.9", + "node_id": "0.12.49.9", "text": "2009", "annotations": [], "metadata": { "paragraph_type": "date", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, 
"subparagraphs": [] } ] }, { - "node_id": "0.20.50", + "node_id": "0.12.50", "text": "", "annotations": [], "metadata": { "paragraph_type": "bibliography_item", "page_id": 0, "line_id": 0, - "other_fields": { - "uid": "bac4e5bc-f290-11ee-a6ed-b88584b4e4a1" - }, - "uid": "bac4e5bc-f290-11ee-a6ed-b88584b4e4a1" + "uid": "109e67a6-0872-11ef-b95c-0242ac120002" }, "subparagraphs": [ { - "node_id": "0.20.50.0", + "node_id": "0.12.50.0", "text": "A formal study of power variability issues and sidechannel attacks for nanoscale devices", "annotations": [], "metadata": { "paragraph_type": "title_conference_proceedings", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.50.1", - "text": "\nFrançois-XavierMathieu Renauld\n", + "node_id": "0.12.50.1", + "text": "François-Xavier Mathieu Renauld", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.50.2", - "text": "\nNicolasStandaert\n", + "node_id": "0.12.50.2", + "text": "Nicolas Standaert", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.50.3", - "text": "\nDinaVeyrat-Charvillon\n", + "node_id": "0.12.50.3", + "text": "Dina Veyrat-Charvillon", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.50.4", - "text": "\nDenisKamel\n", + "node_id": "0.12.50.4", + "text": "Denis Kamel", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.50.5", - "text": "\nFlandre\n", + "node_id": "0.12.50.5", + "text": "Flandre", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - 
"line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.50.6", + "node_id": "0.12.50.6", "text": "109-128", "annotations": [], "metadata": { "paragraph_type": "biblScope_page", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] } ] }, { - "node_id": "0.20.51", + "node_id": "0.12.51", "text": "", "annotations": [], "metadata": { "paragraph_type": "bibliography_item", "page_id": 0, "line_id": 0, - "other_fields": { - "uid": "bac4e5c4-f290-11ee-a6ed-b88584b4e4a1" - }, - "uid": "bac4e5c4-f290-11ee-a6ed-b88584b4e4a1" + "uid": "109e782c-0872-11ef-b95c-0242ac120002" }, "subparagraphs": [ { - "node_id": "0.20.51.0", + "node_id": "0.12.51.0", "text": "On the exact success rate of side channel analysis in the gaussian model", "annotations": [], "metadata": { "paragraph_type": "title", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.51.1", + "node_id": "0.12.51.1", "text": "Selected Areas in Cryptography", "annotations": [], "metadata": { "paragraph_type": "title_conference_proceedings", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.51.2", + "node_id": "0.12.51.2", "text": "Lecture Notes in Computer Science", "annotations": [], "metadata": { "paragraph_type": "title_series", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.51.3", - "text": "\nMatthieuRivain\n", + "node_id": "0.12.51.3", + "text": "Matthieu Rivain", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.51.4", + "node_id": "0.12.51.4", "text": "5381", "annotations": [], "metadata": { "paragraph_type": "biblScope_volume", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - 
"node_id": "0.20.51.5", + "node_id": "0.12.51.5", "text": "165-183", "annotations": [], "metadata": { "paragraph_type": "biblScope_page", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.51.6", + "node_id": "0.12.51.6", "text": "Springer", "annotations": [], "metadata": { "paragraph_type": "publisher", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.51.7", + "node_id": "0.12.51.7", "text": "2008", "annotations": [], "metadata": { "paragraph_type": "date", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] } ] }, { - "node_id": "0.20.52", + "node_id": "0.12.52", "text": "", "annotations": [], "metadata": { "paragraph_type": "bibliography_item", "page_id": 0, "line_id": 0, - "other_fields": { - "uid": "bac4e5cd-f290-11ee-a6ed-b88584b4e4a1" - }, - "uid": "bac4e5cd-f290-11ee-a6ed-b88584b4e4a1" + "uid": "109eab44-0872-11ef-b95c-0242ac120002" }, "subparagraphs": [ { - "node_id": "0.20.52.0", + "node_id": "0.12.52.0", "text": "Provably secure higher-order masking of AES", "annotations": [], "metadata": { "paragraph_type": "title", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.52.1", + "node_id": "0.12.52.1", "text": "CHES", "annotations": [], "metadata": { "paragraph_type": "title_conference_proceedings", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.52.2", + "node_id": "0.12.52.2", "text": "Lecture Notes in Computer Science", "annotations": [], "metadata": { "paragraph_type": "title_series", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.52.3", - "text": "\nMatthieuRivain\n", + "node_id": "0.12.52.3", + "text": "Matthieu Rivain", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - 
"line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.52.4", - "text": "\nEmmanuelProuff\n", + "node_id": "0.12.52.4", + "text": "Emmanuel Prouff", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.52.5", + "node_id": "0.12.52.5", "text": "6225", "annotations": [], "metadata": { "paragraph_type": "biblScope_volume", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.52.6", + "node_id": "0.12.52.6", "text": "413-427", "annotations": [], "metadata": { "paragraph_type": "biblScope_page", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.52.7", + "node_id": "0.12.52.7", "text": "Springer", "annotations": [], "metadata": { "paragraph_type": "publisher", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.52.8", + "node_id": "0.12.52.8", "text": "2010", "annotations": [], "metadata": { "paragraph_type": "date", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] } ] }, { - "node_id": "0.20.53", + "node_id": "0.12.53", "text": "", "annotations": [], "metadata": { "paragraph_type": "bibliography_item", "page_id": 0, "line_id": 0, - "other_fields": { - "uid": "bac4e5d7-f290-11ee-a6ed-b88584b4e4a1" - }, - "uid": "bac4e5d7-f290-11ee-a6ed-b88584b4e4a1" + "uid": "109ed6dc-0872-11ef-b95c-0242ac120002" }, "subparagraphs": [ { - "node_id": "0.20.53.0", + "node_id": "0.12.53.0", "text": "Higher-order glitches free implementation of the AES using secure multi-party computation protocols extended version", "annotations": [], "metadata": { "paragraph_type": "title", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.53.1", + "node_id": 
"0.12.53.1", "text": "Cryptology ePrint Archive", "annotations": [], "metadata": { "paragraph_type": "title_journal", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.53.2", - "text": "\nThomasRoche\n", + "node_id": "0.12.53.2", + "text": "Thomas Roche", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.53.3", - "text": "\nEmmanuelProuff\n", + "node_id": "0.12.53.3", + "text": "Emmanuel Prouff", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.53.4", + "node_id": "0.12.53.4", "text": "2011/413, 2011", "annotations": [], "metadata": { "paragraph_type": "date", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] } ] }, { - "node_id": "0.20.54", + "node_id": "0.12.54", "text": "", "annotations": [], "metadata": { "paragraph_type": "bibliography_item", "page_id": 0, "line_id": 0, - "other_fields": { - "uid": "bac4e5dd-f290-11ee-a6ed-b88584b4e4a1" - }, - "uid": "bac4e5dd-f290-11ee-a6ed-b88584b4e4a1" + "uid": "109ef5e0-0872-11ef-b95c-0242ac120002" }, "subparagraphs": [ { - "node_id": "0.20.54.0", + "node_id": "0.12.54.0", "text": "Higher order masking of the AES", "annotations": [], "metadata": { "paragraph_type": "title", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.54.1", + "node_id": "0.12.54.1", "text": "Lecture Notes in Computer Science", "annotations": [], "metadata": { "paragraph_type": "title_series", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.54.2", - "text": "\nKaiSchramm\n", + "node_id": "0.12.54.2", + "text": "Kai Schramm", "annotations": [], "metadata": { "paragraph_type": 
"author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.54.3", - "text": "\nChristofPaar\n", + "node_id": "0.12.54.3", + "text": "Christof Paar", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.54.4", + "node_id": "0.12.54.4", "text": "3860", "annotations": [], "metadata": { "paragraph_type": "biblScope_volume", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.54.5", + "node_id": "0.12.54.5", "text": "208-225", "annotations": [], "metadata": { "paragraph_type": "biblScope_page", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.54.6", + "node_id": "0.12.54.6", "text": "Springer", "annotations": [], "metadata": { "paragraph_type": "publisher", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.54.7", + "node_id": "0.12.54.7", "text": "2006", "annotations": [], "metadata": { "paragraph_type": "date", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] } ] }, { - "node_id": "0.20.55", + "node_id": "0.12.55", "text": "", "annotations": [], "metadata": { "paragraph_type": "bibliography_item", "page_id": 0, "line_id": 0, - "other_fields": { - "uid": "bac4e5e6-f290-11ee-a6ed-b88584b4e4a1" - }, - "uid": "bac4e5e6-f290-11ee-a6ed-b88584b4e4a1" + "uid": "109f1e26-0872-11ef-b95c-0242ac120002" }, "subparagraphs": [ { - "node_id": "0.20.55.0", + "node_id": "0.12.55.0", "text": "A unified framework for the analysis of side-channel key recovery attacks", "annotations": [], "metadata": { "paragraph_type": "title_conference_proceedings", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.55.1", - "text": 
"\nFrançois-XavierStandaert\n", + "node_id": "0.12.55.1", + "text": "François-Xavier Standaert", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.55.2", - "text": "\nTalMalkin\n", + "node_id": "0.12.55.2", + "text": "Tal Malkin", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.55.3", - "text": "\nMotiYung\n", + "node_id": "0.12.55.3", + "text": "Moti Yung", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.55.4", + "node_id": "0.12.55.4", "text": "443-461", "annotations": [], "metadata": { "paragraph_type": "biblScope_page", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] } ] }, { - "node_id": "0.20.56", + "node_id": "0.12.56", "text": "", "annotations": [], "metadata": { "paragraph_type": "bibliography_item", "page_id": 0, "line_id": 0, - "other_fields": { - "uid": "bac4e5ec-f290-11ee-a6ed-b88584b4e4a1" - }, - "uid": "bac4e5ec-f290-11ee-a6ed-b88584b4e4a1" + "uid": "109f2bbe-0872-11ef-b95c-0242ac120002" }, "subparagraphs": [ { - "node_id": "0.20.56.0", + "node_id": "0.12.56.0", "text": "Leakage-resilient symmetric cryptography under empirically verifiable assumptions", "annotations": [], "metadata": { "paragraph_type": "title", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.56.1", + "node_id": "0.12.56.1", "text": "Lecture Notes in Computer Science", "annotations": [], "metadata": { "paragraph_type": "title_series", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.56.2", - "text": "\nFrançois-XavierStandaert\n", + "node_id": 
"0.12.56.2", + "text": "François-Xavier Standaert", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.56.3", - "text": "\nOlivierPereira\n", + "node_id": "0.12.56.3", + "text": "Olivier Pereira", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.56.4", - "text": "\nYuYu\n", + "node_id": "0.12.56.4", + "text": "Yu Yu", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.56.5", + "node_id": "0.12.56.5", "text": "8042", "annotations": [], "metadata": { "paragraph_type": "biblScope_volume", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.56.6", + "node_id": "0.12.56.6", "text": "335-352", "annotations": [], "metadata": { "paragraph_type": "biblScope_page", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.56.7", + "node_id": "0.12.56.7", "text": "Springer", "annotations": [], "metadata": { "paragraph_type": "publisher", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.56.8", + "node_id": "0.12.56.8", "text": "2013", "annotations": [], "metadata": { "paragraph_type": "date", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] } ] }, { - "node_id": "0.20.57", + "node_id": "0.12.57", "text": "", "annotations": [], "metadata": { "paragraph_type": "bibliography_item", "page_id": 0, "line_id": 0, - "other_fields": { - "uid": "bac4e5f6-f290-11ee-a6ed-b88584b4e4a1" - }, - "uid": "bac4e5f6-f290-11ee-a6ed-b88584b4e4a1" + "uid": "109f6200-0872-11ef-b95c-0242ac120002" }, "subparagraphs": [ { - 
"node_id": "0.20.57.0", + "node_id": "0.12.57.0", "text": "Leakage resilient cryptography in practice", "annotations": [], "metadata": { "paragraph_type": "title", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.57.1", + "node_id": "0.12.57.1", "text": "Towards Hardware-Intrinsic Security, Information Security and Cryptography", "annotations": [], "metadata": { "paragraph_type": "title_conference_proceedings", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.57.2", - "text": "\nFrançois-XavierStandaert\n", + "node_id": "0.12.57.2", + "text": "François-Xavier Standaert", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.57.3", - "text": "\nOlivierPereira\n", + "node_id": "0.12.57.3", + "text": "Olivier Pereira", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.57.4", - "text": "\nYuYu\n", + "node_id": "0.12.57.4", + "text": "Yu Yu", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.57.5", - "text": "\nJean-JacquesQuisquater\n", + "node_id": "0.12.57.5", + "text": "Jean-Jacques Quisquater", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.57.6", - "text": "\nMotiYung\n", + "node_id": "0.12.57.6", + "text": "Moti Yung", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.57.7", - "text": "\nElisabethOswald\n", + "node_id": 
"0.12.57.7", + "text": "Elisabeth Oswald", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.57.8", + "node_id": "0.12.57.8", "text": "99-134", "annotations": [], "metadata": { "paragraph_type": "biblScope_page", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.57.9", + "node_id": "0.12.57.9", "text": "Springer", "annotations": [], "metadata": { "paragraph_type": "publisher", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.57.10", + "node_id": "0.12.57.10", "text": "2010", "annotations": [], "metadata": { "paragraph_type": "date", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] } ] }, { - "node_id": "0.20.58", + "node_id": "0.12.58", "text": "", "annotations": [], "metadata": { "paragraph_type": "bibliography_item", "page_id": 0, "line_id": 0, - "other_fields": { - "uid": "bac4e602-f290-11ee-a6ed-b88584b4e4a1" - }, - "uid": "bac4e602-f290-11ee-a6ed-b88584b4e4a1" + "uid": "109f92d4-0872-11ef-b95c-0242ac120002" }, "subparagraphs": [ { - "node_id": "0.20.58.0", + "node_id": "0.12.58.0", "text": "The world is not enough: Another look on second-order DPA", "annotations": [], "metadata": { "paragraph_type": "title", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.58.1", + "node_id": "0.12.58.1", "text": "Lecture Notes in Computer Science", "annotations": [], "metadata": { "paragraph_type": "title_series", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.58.2", - "text": "\nFrançois-XavierStandaert\n", + "node_id": "0.12.58.2", + "text": "François-Xavier Standaert", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - 
"other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.58.3", - "text": "\nNicolasVeyrat-Charvillon\n", + "node_id": "0.12.58.3", + "text": "Nicolas Veyrat-Charvillon", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.58.4", - "text": "\nElisabethOswald\n", + "node_id": "0.12.58.4", + "text": "Elisabeth Oswald", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.58.5", - "text": "\nBenediktGierlichs\n", + "node_id": "0.12.58.5", + "text": "Benedikt Gierlichs", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.58.6", - "text": "\nMarcelMedwed\n", + "node_id": "0.12.58.6", + "text": "Marcel Medwed", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.58.7", - "text": "\nMarkusKasper\n", + "node_id": "0.12.58.7", + "text": "Markus Kasper", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.58.8", - "text": "\nStefanMangard\n", + "node_id": "0.12.58.8", + "text": "Stefan Mangard", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.58.9", + "node_id": "0.12.58.9", "text": "6477", "annotations": [], "metadata": { "paragraph_type": "biblScope_volume", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.58.10", + "node_id": "0.12.58.10", "text": "112-129", 
"annotations": [], "metadata": { "paragraph_type": "biblScope_page", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.58.11", + "node_id": "0.12.58.11", "text": "Springer", "annotations": [], "metadata": { "paragraph_type": "publisher", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.58.12", + "node_id": "0.12.58.12", "text": "2010", "annotations": [], "metadata": { "paragraph_type": "date", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] } ] }, { - "node_id": "0.20.59", + "node_id": "0.12.59", "text": "", "annotations": [], "metadata": { "paragraph_type": "bibliography_item", "page_id": 0, "line_id": 0, - "other_fields": { - "uid": "bac4e610-f290-11ee-a6ed-b88584b4e4a1" - }, - "uid": "bac4e610-f290-11ee-a6ed-b88584b4e4a1" + "uid": "109fc63c-0872-11ef-b95c-0242ac120002" }, "subparagraphs": [ { - "node_id": "0.20.59.0", + "node_id": "0.12.59.0", "text": "The world is not enough: Another look on second-order DPA", "annotations": [], "metadata": { "paragraph_type": "title", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.59.1", + "node_id": "0.12.59.1", "text": "Cryptology ePrint Archive", "annotations": [], "metadata": { "paragraph_type": "title_journal", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.59.2", - "text": "\nFrancois-XavierStandaert\n", + "node_id": "0.12.59.2", + "text": "Francois-Xavier Standaert", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.59.3", - "text": "\nNicolasVeyrat-Charvillon\n", + "node_id": "0.12.59.3", + "text": "Nicolas Veyrat-Charvillon", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - 
"line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.59.4", - "text": "\nElisabethOswald\n", + "node_id": "0.12.59.4", + "text": "Elisabeth Oswald", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.59.5", - "text": "\nBenediktGierlichs\n", + "node_id": "0.12.59.5", + "text": "Benedikt Gierlichs", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.59.6", - "text": "\nMarcelMedwed\n", + "node_id": "0.12.59.6", + "text": "Marcel Medwed", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.59.7", - "text": "\nMarkusKasper\n", + "node_id": "0.12.59.7", + "text": "Markus Kasper", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.59.8", - "text": "\nStefanMangard\n", + "node_id": "0.12.59.8", + "text": "Stefan Mangard", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.59.9", + "node_id": "0.12.59.9", "text": "2010/180. 
2010", "annotations": [], "metadata": { "paragraph_type": "date", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] } ] }, { - "node_id": "0.20.60", + "node_id": "0.12.60", "text": "", "annotations": [], "metadata": { "paragraph_type": "bibliography_item", "page_id": 0, "line_id": 0, - "other_fields": { - "uid": "bac4e61b-f290-11ee-a6ed-b88584b4e4a1" - }, - "uid": "bac4e61b-f290-11ee-a6ed-b88584b4e4a1" + "uid": "109fee1e-0872-11ef-b95c-0242ac120002" }, "subparagraphs": [ { - "node_id": "0.20.60.0", + "node_id": "0.12.60.0", "text": "Security evaluations beyond computing power", "annotations": [], "metadata": { "paragraph_type": "title", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.60.1", + "node_id": "0.12.60.1", "text": "Johansson and Nguyen", "annotations": [], "metadata": { "paragraph_type": "title_conference_proceedings", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.60.2", - "text": "\nNicolasVeyrat-Charvillon\n", + "node_id": "0.12.60.2", + "text": "Nicolas Veyrat-Charvillon", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.60.3", - "text": "\nBenoîtGérard\n", + "node_id": "0.12.60.3", + "text": "Benoît Gérard", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.60.4", - "text": "\nFrançois-XavierStandaert\n", + "node_id": "0.12.60.4", + "text": "François-Xavier Standaert", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.60.5", + "node_id": "0.12.60.5", "text": "23", "annotations": [], "metadata": { "paragraph_type": 
"biblScope_volume", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.60.6", + "node_id": "0.12.60.6", "text": "126-141", "annotations": [], "metadata": { "paragraph_type": "biblScope_page", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] } ] }, { - "node_id": "0.20.61", + "node_id": "0.12.61", "text": "", "annotations": [], "metadata": { "paragraph_type": "bibliography_item", "page_id": 0, "line_id": 0, - "other_fields": { - "uid": "bac4e623-f290-11ee-a6ed-b88584b4e4a1" - }, - "uid": "bac4e623-f290-11ee-a6ed-b88584b4e4a1" + "uid": "10a004ee-0872-11ef-b95c-0242ac120002" }, "subparagraphs": [ { - "node_id": "0.20.61.0", + "node_id": "0.12.61.0", "text": "Shuffling against side-channel attacks: A comprehensive study with cautionary note", "annotations": [], "metadata": { "paragraph_type": "title", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.61.1", + "node_id": "0.12.61.1", "text": "Lecture Notes in Computer Science", "annotations": [], "metadata": { "paragraph_type": "title_series", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.61.2", - "text": "\nNicolasVeyrat-Charvillon\n", + "node_id": "0.12.61.2", + "text": "Nicolas Veyrat-Charvillon", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.61.3", - "text": "\nMarcelMedwed\n", + "node_id": "0.12.61.3", + "text": "Marcel Medwed", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.61.4", - "text": "\nStéphanieKerckhof\n", + "node_id": "0.12.61.4", + "text": "Stéphanie Kerckhof", "annotations": [], "metadata": { "paragraph_type": "author", 
"page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.61.5", - "text": "\nFrançois-XavierStandaert\n", + "node_id": "0.12.61.5", + "text": "François-Xavier Standaert", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.61.6", + "node_id": "0.12.61.6", "text": "7658", "annotations": [], "metadata": { "paragraph_type": "biblScope_volume", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.61.7", + "node_id": "0.12.61.7", "text": "740-757", "annotations": [], "metadata": { "paragraph_type": "biblScope_page", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.61.8", + "node_id": "0.12.61.8", "text": "Springer", "annotations": [], "metadata": { "paragraph_type": "publisher", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.61.9", + "node_id": "0.12.61.9", "text": "2012", "annotations": [], "metadata": { "paragraph_type": "date", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] } ] }, { - "node_id": "0.20.62", + "node_id": "0.12.62", "text": "", "annotations": [], "metadata": { "paragraph_type": "bibliography_item", "page_id": 0, "line_id": 0, - "other_fields": { - "uid": "bac4e62e-f290-11ee-a6ed-b88584b4e4a1" - }, - "uid": "bac4e62e-f290-11ee-a6ed-b88584b4e4a1" + "uid": "10a05430-0872-11ef-b95c-0242ac120002" }, "subparagraphs": [ { - "node_id": "0.20.62.0", + "node_id": "0.12.62.0", "text": "Advances in Cryptology -CRYPTO '99, 19th Annual International Cryptology Conference", "annotations": [], "metadata": { "paragraph_type": "title_conference_proceedings", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": 
"0.20.62.1", + "node_id": "0.12.62.1", "text": "Lecture Notes in Computer Science", "annotations": [], "metadata": { "paragraph_type": "title_series", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.62.2", + "node_id": "0.12.62.2", "text": "1666", "annotations": [], "metadata": { "paragraph_type": "biblScope_volume", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.62.3", + "node_id": "0.12.62.3", "text": "Springer", "annotations": [], "metadata": { "paragraph_type": "publisher", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.62.4", + "node_id": "0.12.62.4", "text": "August 15-19, 1999. 1999", "annotations": [], "metadata": { "paragraph_type": "date", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] } ] }, { - "node_id": "0.20.63", + "node_id": "0.12.63", "text": "", "annotations": [], "metadata": { "paragraph_type": "bibliography_item", "page_id": 0, "line_id": 0, - "other_fields": { - "uid": "bac4e634-f290-11ee-a6ed-b88584b4e4a1" - }, - "uid": "bac4e634-f290-11ee-a6ed-b88584b4e4a1" + "uid": "10a07d98-0872-11ef-b95c-0242ac120002" }, "subparagraphs": [ { - "node_id": "0.20.63.0", + "node_id": "0.12.63.0", "text": "Practical leakage-resilient pseudorandom objects with minimum public randomness", "annotations": [], "metadata": { "paragraph_type": "title", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.63.1", + "node_id": "0.12.63.1", "text": "Lecture Notes in Computer Science", "annotations": [], "metadata": { "paragraph_type": "title_series", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.63.2", - "text": "\nYuYu\n", + "node_id": "0.12.63.2", + "text": "Yu Yu", "annotations": [], "metadata": { 
"paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.63.3", - "text": "\nFrançois-XavierStandaert\n", + "node_id": "0.12.63.3", + "text": "François-Xavier Standaert", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.63.4", + "node_id": "0.12.63.4", "text": "7779", "annotations": [], "metadata": { "paragraph_type": "biblScope_volume", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.63.5", + "node_id": "0.12.63.5", "text": "223-238", "annotations": [], "metadata": { "paragraph_type": "biblScope_page", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.63.6", + "node_id": "0.12.63.6", "text": "Springer", "annotations": [], "metadata": { "paragraph_type": "publisher", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.63.7", + "node_id": "0.12.63.7", "text": "2013", "annotations": [], "metadata": { "paragraph_type": "date", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] } ] }, { - "node_id": "0.20.64", + "node_id": "0.12.64", "text": "", "annotations": [], "metadata": { "paragraph_type": "bibliography_item", "page_id": 0, "line_id": 0, - "other_fields": { - "uid": "bac4e63d-f290-11ee-a6ed-b88584b4e4a1" - }, - "uid": "bac4e63d-f290-11ee-a6ed-b88584b4e4a1" + "uid": "10a0afc0-0872-11ef-b95c-0242ac120002" }, "subparagraphs": [ { - "node_id": "0.20.64.0", + "node_id": "0.12.64.0", "text": "Practical leakage-resilient pseudorandom generators", "annotations": [], "metadata": { "paragraph_type": "title", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.64.1", + "node_id": 
"0.12.64.1", "text": "ACM Conference on Computer and Communications Security", "annotations": [], "metadata": { "paragraph_type": "title_conference_proceedings", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.64.2", - "text": "\nYuYu\n", + "node_id": "0.12.64.2", + "text": "Yu Yu", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.64.3", - "text": "\nFrançois-XavierStandaert\n", + "node_id": "0.12.64.3", + "text": "François-Xavier Standaert", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.64.4", - "text": "\nOlivierPereira\n", + "node_id": "0.12.64.4", + "text": "Olivier Pereira", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.64.5", - "text": "\nMotiYung\n", + "node_id": "0.12.64.5", + "text": "Moti Yung", "annotations": [], "metadata": { "paragraph_type": "author", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.64.6", + "node_id": "0.12.64.6", "text": "141-151", "annotations": [], "metadata": { "paragraph_type": "biblScope_page", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.64.7", + "node_id": "0.12.64.7", "text": "ACM", "annotations": [], "metadata": { "paragraph_type": "publisher", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] }, { - "node_id": "0.20.64.8", + "node_id": "0.12.64.8", "text": "2010", "annotations": [], "metadata": { "paragraph_type": "date", "page_id": 0, - "line_id": 0, - "other_fields": {} + "line_id": 0 }, "subparagraphs": [] } @@ 
-8469,8 +7698,8 @@ ], "metadata": { "page_id": 0, - "uid": "d2ce350a-25be-4d05-9061-6f1d4cf8bdd1", - "rotated_angle": 0, + "uid": "1c9f98e6-e1f8-49f3-8bf7-24022f2d1939", + "rotated_angle": 0.0, "title": "Table 1 .Performance of some illustrative AES implementations." } }, @@ -8599,34 +7828,30 @@ ], "metadata": { "page_id": 0, - "uid": "6e093372-d147-4245-8aab-08ed5fe5c072", - "rotated_angle": 0, + "uid": "355d0fa4-7326-4228-a163-31c56483f80d", + "rotated_angle": 0.0, "title": "Table 2 .List of our target implementations." } } ] }, "metadata": { - "uid": "doc_uid_auto_bac4e6c0-f290-11ee-a6ed-b88584b4e4a1", + "uid": "doc_uid_auto_10a91390-0872-11ef-b95c-0242ac120002", "file_name": "article.pdf", - "temporary_file_name": "1712241389_9.pdf", + "temporary_file_name": "1714647121_744.pdf", "size": 2919334, - "modified_time": 1712241389, - "created_time": 1712241389, - "access_time": 1712241389, + "modified_time": 1714647121, + "created_time": 1714647121, + "access_time": 1714647121, "file_type": "application/pdf", - "other_fields": { - "producer": "MiKTeX pdfTeX-1.40.11", - "creator": "TeX", - "creation_date": 1392998486, - "modification_date": 1392998486 - }, "producer": "MiKTeX pdfTeX-1.40.11", "creator": "TeX", "creation_date": 1392998486, "modification_date": 1392998486 }, - "version": "", - "warnings": [], + "version": "2.2", + "warnings": [ + "use GROBID (version: 0.8.0)" + ], "attachments": [] } \ No newline at end of file diff --git a/docs/source/_static/json_format_examples/basic_example.json b/docs/source/_static/json_format_examples/basic_example.json index 4900c286..138f564c 100644 --- a/docs/source/_static/json_format_examples/basic_example.json +++ b/docs/source/_static/json_format_examples/basic_example.json @@ -1,16 +1,13 @@ { - "version": "0.11.2", - "warnings": [], "content": { "structure": { "node_id": "0", "text": "", "annotations": [], "metadata": { - "page_id": 0, - "line_id": 0, "paragraph_type": "root", - "other_fields": {} + "page_id": 0, + 
"line_id": 0 }, "subparagraphs": [ { @@ -55,10 +52,9 @@ } ], "metadata": { - "page_id": 0, - "line_id": 0, "paragraph_type": "header", - "other_fields": {} + "page_id": 0, + "line_id": 0 }, "subparagraphs": [ { @@ -103,10 +99,9 @@ } ], "metadata": { - "page_id": 0, - "line_id": 1, "paragraph_type": "header", - "other_fields": {} + "page_id": 0, + "line_id": 1 }, "subparagraphs": [ { @@ -131,18 +126,42 @@ "name": "spacing", "value": "120" }, + { + "start": 11, + "end": 38, + "name": "spacing", + "value": "140" + }, + { + "start": 38, + "end": 50, + "name": "spacing", + "value": "0" + }, { "start": 0, "end": 37, "name": "style", "value": "Body Text" }, + { + "start": 37, + "end": 50, + "name": "style", + "value": "Normal" + }, { "start": 0, "end": 25, "name": "size", "value": "16.0" }, + { + "start": 25, + "end": 30, + "name": "size", + "value": "10.0" + }, { "start": 30, "end": 37, @@ -150,10 +169,10 @@ "value": "16.0" }, { - "start": 11, - "end": 38, - "name": "spacing", - "value": "140" + "start": 38, + "end": 50, + "name": "size", + "value": "14.0" }, { "start": 11, @@ -167,42 +186,17 @@ "name": "italic", "value": "True" }, - { - "start": 25, - "end": 30, - "name": "size", - "value": "10.0" - }, { "start": 11, "end": 37, "name": "table", "value": "3a327789721e09b3fa6fd9560f3ee263" - }, - { - "start": 37, - "end": 50, - "name": "style", - "value": "Normal" - }, - { - "start": 38, - "end": 50, - "name": "spacing", - "value": "0" - }, - { - "start": 38, - "end": 50, - "name": "size", - "value": "14.0" } ], "metadata": { - "page_id": 0, - "line_id": 2, "paragraph_type": "raw_text", - "other_fields": {} + "page_id": 0, + "line_id": 2 }, "subparagraphs": [] }, @@ -211,10 +205,9 @@ "text": "", "annotations": [], "metadata": { - "page_id": 0, - "line_id": 6, "paragraph_type": "list", - "other_fields": {} + "page_id": 0, + "line_id": 6 }, "subparagraphs": [ { @@ -253,10 +246,9 @@ } ], "metadata": { - "page_id": 0, - "line_id": 6, "paragraph_type": "list_item", - 
"other_fields": {} + "page_id": 0, + "line_id": 6 }, "subparagraphs": [] }, @@ -293,21 +285,51 @@ "end": 14, "name": "size", "value": "14.0" - }, - { - "start": 0, - "end": 14, - "name": "attachment", - "value": "attach_0ba300a6-5e07-11ee-9889-0242ac120002" } ], "metadata": { - "page_id": 0, - "line_id": 7, "paragraph_type": "list_item", - "other_fields": {} + "page_id": 0, + "line_id": 7 }, - "subparagraphs": [] + "subparagraphs": [ + { + "node_id": "0.0.0.1.1.0", + "text": "\n", + "annotations": [ + { + "start": 0, + "end": 1, + "name": "indentation", + "value": "0" + }, + { + "start": 0, + "end": 1, + "name": "alignment", + "value": "left" + }, + { + "start": 0, + "end": 1, + "name": "spacing", + "value": "0" + }, + { + "start": 0, + "end": 1, + "name": "style", + "value": "Normal" + } + ], + "metadata": { + "paragraph_type": "raw_text", + "page_id": 0, + "line_id": 8 + }, + "subparagraphs": [] + } + ] } ] } @@ -359,8 +381,8 @@ ] } ], - "colspan": 2, "rowspan": 1, + "colspan": 2, "invisible": false }, { @@ -401,8 +423,8 @@ ] } ], - "colspan": 1, "rowspan": 1, + "colspan": 1, "invisible": true } ], @@ -445,8 +467,8 @@ ] } ], - "colspan": 1, "rowspan": 2, + "colspan": 1, "invisible": false }, { @@ -487,8 +509,8 @@ ] } ], - "colspan": 1, "rowspan": 1, + "colspan": 1, "invisible": false } ], @@ -531,8 +553,8 @@ ] } ], - "colspan": 1, "rowspan": 1, + "colspan": 1, "invisible": true }, { @@ -573,8 +595,8 @@ ] } ], - "colspan": 1, "rowspan": 1, + "colspan": 1, "invisible": false } ], @@ -617,8 +639,8 @@ ] } ], - "colspan": 1, "rowspan": 1, + "colspan": 1, "invisible": false }, { @@ -659,28 +681,29 @@ ] } ], - "colspan": 1, "rowspan": 1, + "colspan": 1, "invisible": false } ] ], "metadata": { - "uid": "3a327789721e09b3fa6fd9560f3ee263", "page_id": null, - "rotated_angle": 0.0 + "uid": "3a327789721e09b3fa6fd9560f3ee263", + "rotated_angle": 0.0, + "title": "" } } ] }, "metadata": { - "uid": "doc_uid_auto_0bb4e406-5e07-11ee-9889-0242ac120002", + "uid": 
"doc_uid_auto_ff95f898-0871-11ef-b95c-0242ac120002", "file_name": "example_return_format.docx", - "temporary_file_name": "1695909484_339.docx", + "temporary_file_name": "1714647118_806.docx", "size": 21270, - "modified_time": 1695909484, - "created_time": 1695909484, - "access_time": 1695909484, + "modified_time": 1714647118, + "created_time": 1714647118, + "access_time": 1714647118, "file_type": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "document_subject": "", "keywords": "", @@ -690,18 +713,9 @@ "last_modified_by": "", "created_date": 1568725611, "modified_date": 1686912636, - "last_printed_date": null, - "other_fields": { - "document_subject": "", - "keywords": "", - "category": "", - "comments": "", - "author": "", - "last_modified_by": "", - "created_date": 1568725611, - "modified_date": 1686912636, - "last_printed_date": null - } + "last_printed_date": null }, + "version": "2.2", + "warnings": [], "attachments": [] } \ No newline at end of file diff --git a/docs/source/_static/json_format_examples/linear_structure_type.json b/docs/source/_static/json_format_examples/linear_structure_type.json index 535aa687..ba557091 100644 --- a/docs/source/_static/json_format_examples/linear_structure_type.json +++ b/docs/source/_static/json_format_examples/linear_structure_type.json @@ -1,16 +1,13 @@ { - "version": "0.11.2", - "warnings": [], "content": { "structure": { "node_id": "0", "text": "", "annotations": [], "metadata": { - "page_id": 0, - "line_id": 0, "paragraph_type": "root", - "other_fields": {} + "page_id": 0, + "line_id": 0 }, "subparagraphs": [ { @@ -55,10 +52,9 @@ } ], "metadata": { - "page_id": 0, - "line_id": 0, "paragraph_type": "header", - "other_fields": {} + "page_id": 0, + "line_id": 0 }, "subparagraphs": [] }, @@ -104,10 +100,9 @@ } ], "metadata": { - "page_id": 0, - "line_id": 1, "paragraph_type": "header", - "other_fields": {} + "page_id": 0, + "line_id": 1 }, "subparagraphs": [] }, @@ -147,10 +142,9 @@ } ], 
"metadata": { - "page_id": 0, - "line_id": 2, "paragraph_type": "raw_text", - "other_fields": {} + "page_id": 0, + "line_id": 2 }, "subparagraphs": [] }, @@ -188,6 +182,12 @@ "name": "size", "value": "16.0" }, + { + "start": 14, + "end": 19, + "name": "size", + "value": "10.0" + }, { "start": 19, "end": 26, @@ -206,12 +206,6 @@ "name": "italic", "value": "True" }, - { - "start": 14, - "end": 19, - "name": "size", - "value": "10.0" - }, { "start": 0, "end": 26, @@ -220,10 +214,9 @@ } ], "metadata": { - "page_id": 0, - "line_id": 3, "paragraph_type": "raw_text", - "other_fields": {} + "page_id": 0, + "line_id": 3 }, "subparagraphs": [] }, @@ -257,10 +250,9 @@ } ], "metadata": { - "page_id": 0, - "line_id": 4, "paragraph_type": "raw_text", - "other_fields": {} + "page_id": 0, + "line_id": 4 }, "subparagraphs": [] }, @@ -300,10 +292,9 @@ } ], "metadata": { - "page_id": 0, - "line_id": 5, "paragraph_type": "raw_text", - "other_fields": {} + "page_id": 0, + "line_id": 5 }, "subparagraphs": [] }, @@ -343,10 +334,9 @@ } ], "metadata": { - "page_id": 0, - "line_id": 6, "paragraph_type": "list_item", - "other_fields": {} + "page_id": 0, + "line_id": 6 }, "subparagraphs": [] }, @@ -383,19 +373,84 @@ "end": 14, "name": "size", "value": "14.0" + } + ], + "metadata": { + "paragraph_type": "list_item", + "page_id": 0, + "line_id": 7 + }, + "subparagraphs": [] + }, + { + "node_id": "0.8", + "text": "\n", + "annotations": [ + { + "start": 0, + "end": 1, + "name": "indentation", + "value": "0" }, { "start": 0, - "end": 14, - "name": "attachment", - "value": "attach_75d13b70-5df1-11ee-bfc1-0242ac120002" + "end": 1, + "name": "alignment", + "value": "left" + }, + { + "start": 0, + "end": 1, + "name": "spacing", + "value": "0" + }, + { + "start": 0, + "end": 1, + "name": "style", + "value": "Normal" } ], "metadata": { + "paragraph_type": "raw_text", "page_id": 0, - "line_id": 7, - "paragraph_type": "list_item", - "other_fields": {} + "line_id": 8 + }, + "subparagraphs": [] + }, + { + 
"node_id": "0.9", + "text": "", + "annotations": [ + { + "start": 0, + "end": 0, + "name": "indentation", + "value": "0" + }, + { + "start": 0, + "end": 0, + "name": "alignment", + "value": "left" + }, + { + "start": 0, + "end": 0, + "name": "spacing", + "value": "0" + }, + { + "start": 0, + "end": 0, + "name": "style", + "value": "Normal" + } + ], + "metadata": { + "paragraph_type": "raw_text", + "page_id": 0, + "line_id": 9 }, "subparagraphs": [] } @@ -409,22 +464,84 @@ "lines": [ { "text": "Table header", - "annotations": [] + "annotations": [ + { + "start": 0, + "end": 12, + "name": "indentation", + "value": "0" + }, + { + "start": 0, + "end": 12, + "name": "alignment", + "value": "center" + }, + { + "start": 0, + "end": 12, + "name": "spacing", + "value": "0" + }, + { + "start": 0, + "end": 12, + "name": "style", + "value": "Table Contents" + }, + { + "start": 0, + "end": 12, + "name": "size", + "value": "14.0" + } + ] } ], - "colspan": 2, "rowspan": 1, + "colspan": 2, "invisible": false }, { "lines": [ { "text": "Table header", - "annotations": [] + "annotations": [ + { + "start": 0, + "end": 12, + "name": "indentation", + "value": "0" + }, + { + "start": 0, + "end": 12, + "name": "alignment", + "value": "center" + }, + { + "start": 0, + "end": 12, + "name": "spacing", + "value": "0" + }, + { + "start": 0, + "end": 12, + "name": "style", + "value": "Table Contents" + }, + { + "start": 0, + "end": 12, + "name": "size", + "value": "14.0" + } + ] } ], - "colspan": 1, "rowspan": 1, + "colspan": 1, "invisible": true } ], @@ -433,22 +550,84 @@ "lines": [ { "text": "Vertically merged cells", - "annotations": [] + "annotations": [ + { + "start": 0, + "end": 23, + "name": "indentation", + "value": "0" + }, + { + "start": 0, + "end": 23, + "name": "alignment", + "value": "left" + }, + { + "start": 0, + "end": 23, + "name": "spacing", + "value": "0" + }, + { + "start": 0, + "end": 23, + "name": "style", + "value": "Table Contents" + }, + { + "start": 0, + "end": 23, + 
"name": "size", + "value": "14.0" + } + ] } ], - "colspan": 1, "rowspan": 2, + "colspan": 1, "invisible": false }, { "lines": [ { "text": "Text 1", - "annotations": [] + "annotations": [ + { + "start": 0, + "end": 6, + "name": "indentation", + "value": "0" + }, + { + "start": 0, + "end": 6, + "name": "alignment", + "value": "left" + }, + { + "start": 0, + "end": 6, + "name": "spacing", + "value": "0" + }, + { + "start": 0, + "end": 6, + "name": "style", + "value": "Table Contents" + }, + { + "start": 0, + "end": 6, + "name": "size", + "value": "14.0" + } + ] } ], - "colspan": 1, "rowspan": 1, + "colspan": 1, "invisible": false } ], @@ -457,22 +636,84 @@ "lines": [ { "text": "Vertically merged cells", - "annotations": [] + "annotations": [ + { + "start": 0, + "end": 23, + "name": "indentation", + "value": "0" + }, + { + "start": 0, + "end": 23, + "name": "alignment", + "value": "left" + }, + { + "start": 0, + "end": 23, + "name": "spacing", + "value": "0" + }, + { + "start": 0, + "end": 23, + "name": "style", + "value": "Table Contents" + }, + { + "start": 0, + "end": 23, + "name": "size", + "value": "14.0" + } + ] } ], - "colspan": 1, "rowspan": 1, + "colspan": 1, "invisible": true }, { "lines": [ { "text": "Text 2", - "annotations": [] + "annotations": [ + { + "start": 0, + "end": 6, + "name": "indentation", + "value": "0" + }, + { + "start": 0, + "end": 6, + "name": "alignment", + "value": "left" + }, + { + "start": 0, + "end": 6, + "name": "spacing", + "value": "0" + }, + { + "start": 0, + "end": 6, + "name": "style", + "value": "Table Contents" + }, + { + "start": 0, + "end": 6, + "name": "size", + "value": "14.0" + } + ] } ], - "colspan": 1, "rowspan": 1, + "colspan": 1, "invisible": false } ], @@ -481,42 +722,105 @@ "lines": [ { "text": "Text 3", - "annotations": [] + "annotations": [ + { + "start": 0, + "end": 6, + "name": "indentation", + "value": "0" + }, + { + "start": 0, + "end": 6, + "name": "alignment", + "value": "left" + }, + { + "start": 0, + "end": 
6, + "name": "spacing", + "value": "0" + }, + { + "start": 0, + "end": 6, + "name": "style", + "value": "Table Contents" + }, + { + "start": 0, + "end": 6, + "name": "size", + "value": "14.0" + } + ] } ], - "colspan": 1, "rowspan": 1, + "colspan": 1, "invisible": false }, { "lines": [ { "text": "Text 4", - "annotations": [] + "annotations": [ + { + "start": 0, + "end": 6, + "name": "indentation", + "value": "0" + }, + { + "start": 0, + "end": 6, + "name": "alignment", + "value": "left" + }, + { + "start": 0, + "end": 6, + "name": "spacing", + "value": "0" + }, + { + "start": 0, + "end": 6, + "name": "style", + "value": "Table Contents" + }, + { + "start": 0, + "end": 6, + "name": "size", + "value": "14.0" + } + ] } ], - "colspan": 1, "rowspan": 1, + "colspan": 1, "invisible": false } ] ], "metadata": { - "uid": "3a327789721e09b3fa6fd9560f3ee263", "page_id": null, - "rotated_angle": 0.0 + "uid": "3a327789721e09b3fa6fd9560f3ee263", + "rotated_angle": 0.0, + "title": "" } } ] }, "metadata": { - "uid": "doc_uid_auto_75e45e94-5df1-11ee-bfc1-0242ac120002", + "uid": "doc_uid_auto_ffa700ca-0871-11ef-b95c-0242ac120002", "file_name": "example_return_format.docx", - "temporary_file_name": "1695900214_259.docx", + "temporary_file_name": "1714647118_760.docx", "size": 21270, - "modified_time": 1695900213, - "created_time": 1695900213, - "access_time": 1695900214, + "modified_time": 1714647118, + "created_time": 1714647118, + "access_time": 1714647118, "file_type": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "document_subject": "", "keywords": "", @@ -526,18 +830,9 @@ "last_modified_by": "", "created_date": 1568725611, "modified_date": 1686912636, - "last_printed_date": null, - "other_fields": { - "document_subject": "", - "keywords": "", - "category": "", - "comments": "", - "author": "", - "last_modified_by": "", - "created_date": 1568725611, - "modified_date": 1686912636, - "last_printed_date": null - } + "last_printed_date": null }, + 
"version": "2.2", + "warnings": [], "attachments": [] } \ No newline at end of file diff --git a/docs/source/_static/json_format_examples/with_attachments.json b/docs/source/_static/json_format_examples/with_attachments.json index dfa76ac6..e3104660 100644 --- a/docs/source/_static/json_format_examples/with_attachments.json +++ b/docs/source/_static/json_format_examples/with_attachments.json @@ -1,16 +1,13 @@ { - "version": "0.11.2", - "warnings": [], "content": { "structure": { "node_id": "0", "text": "", "annotations": [], "metadata": { - "page_id": 0, - "line_id": 0, "paragraph_type": "root", - "other_fields": {} + "page_id": 0, + "line_id": 0 }, "subparagraphs": [ { @@ -55,10 +52,9 @@ } ], "metadata": { - "page_id": 0, - "line_id": 0, "paragraph_type": "header", - "other_fields": {} + "page_id": 0, + "line_id": 0 }, "subparagraphs": [ { @@ -103,10 +99,9 @@ } ], "metadata": { - "page_id": 0, - "line_id": 1, "paragraph_type": "header", - "other_fields": {} + "page_id": 0, + "line_id": 1 }, "subparagraphs": [ { @@ -131,18 +126,42 @@ "name": "spacing", "value": "120" }, + { + "start": 11, + "end": 38, + "name": "spacing", + "value": "140" + }, + { + "start": 38, + "end": 50, + "name": "spacing", + "value": "0" + }, { "start": 0, "end": 37, "name": "style", "value": "Body Text" }, + { + "start": 37, + "end": 50, + "name": "style", + "value": "Normal" + }, { "start": 0, "end": 25, "name": "size", "value": "16.0" }, + { + "start": 25, + "end": 30, + "name": "size", + "value": "10.0" + }, { "start": 30, "end": 37, @@ -150,10 +169,10 @@ "value": "16.0" }, { - "start": 11, - "end": 38, - "name": "spacing", - "value": "140" + "start": 38, + "end": 50, + "name": "size", + "value": "14.0" }, { "start": 11, @@ -167,42 +186,17 @@ "name": "italic", "value": "True" }, - { - "start": 25, - "end": 30, - "name": "size", - "value": "10.0" - }, { "start": 11, "end": 37, "name": "table", "value": "3a327789721e09b3fa6fd9560f3ee263" - }, - { - "start": 37, - "end": 50, - "name": 
"style", - "value": "Normal" - }, - { - "start": 38, - "end": 50, - "name": "spacing", - "value": "0" - }, - { - "start": 38, - "end": 50, - "name": "size", - "value": "14.0" } ], "metadata": { - "page_id": 0, - "line_id": 2, "paragraph_type": "raw_text", - "other_fields": {} + "page_id": 0, + "line_id": 2 }, "subparagraphs": [] }, @@ -211,10 +205,9 @@ "text": "", "annotations": [], "metadata": { - "page_id": 0, - "line_id": 6, "paragraph_type": "list", - "other_fields": {} + "page_id": 0, + "line_id": 6 }, "subparagraphs": [ { @@ -253,10 +246,9 @@ } ], "metadata": { - "page_id": 0, - "line_id": 6, "paragraph_type": "list_item", - "other_fields": {} + "page_id": 0, + "line_id": 6 }, "subparagraphs": [] }, @@ -298,14 +290,13 @@ "start": 0, "end": 14, "name": "attachment", - "value": "attach_0bd61680-5e07-11ee-9889-0242ac120002" + "value": "attach_7098fafc-e566-46d5-9125-adb6e9b047d8" } ], "metadata": { - "page_id": 0, - "line_id": 7, "paragraph_type": "list_item", - "other_fields": {} + "page_id": 0, + "line_id": 7 }, "subparagraphs": [] } @@ -359,8 +350,8 @@ ] } ], - "colspan": 2, "rowspan": 1, + "colspan": 2, "invisible": false }, { @@ -401,8 +392,8 @@ ] } ], - "colspan": 1, "rowspan": 1, + "colspan": 1, "invisible": true } ], @@ -445,8 +436,8 @@ ] } ], - "colspan": 1, "rowspan": 2, + "colspan": 1, "invisible": false }, { @@ -487,8 +478,8 @@ ] } ], - "colspan": 1, "rowspan": 1, + "colspan": 1, "invisible": false } ], @@ -531,8 +522,8 @@ ] } ], - "colspan": 1, "rowspan": 1, + "colspan": 1, "invisible": true }, { @@ -573,8 +564,8 @@ ] } ], - "colspan": 1, "rowspan": 1, + "colspan": 1, "invisible": false } ], @@ -617,8 +608,8 @@ ] } ], - "colspan": 1, "rowspan": 1, + "colspan": 1, "invisible": false }, { @@ -659,28 +650,29 @@ ] } ], - "colspan": 1, "rowspan": 1, + "colspan": 1, "invisible": false } ] ], "metadata": { - "uid": "3a327789721e09b3fa6fd9560f3ee263", "page_id": null, - "rotated_angle": 0.0 + "uid": "3a327789721e09b3fa6fd9560f3ee263", + "rotated_angle": 
0.0, + "title": "" } } ] }, "metadata": { - "uid": "doc_uid_auto_0be919ce-5e07-11ee-9889-0242ac120002", + "uid": "doc_uid_auto_ffbdff46-0871-11ef-b95c-0242ac120002", "file_name": "example_return_format.docx", - "temporary_file_name": "1695909485_143.docx", + "temporary_file_name": "1714647118_747.docx", "size": 21270, - "modified_time": 1695909485, - "created_time": 1695909485, - "access_time": 1695909485, + "modified_time": 1714647118, + "created_time": 1714647118, + "access_time": 1714647118, "file_type": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "document_subject": "", "keywords": "", @@ -690,49 +682,38 @@ "last_modified_by": "", "created_date": 1568725611, "modified_date": 1686912636, - "last_printed_date": null, - "other_fields": { - "document_subject": "", - "keywords": "", - "category": "", - "comments": "", - "author": "", - "last_modified_by": "", - "created_date": 1568725611, - "modified_date": 1686912636, - "last_printed_date": null - } + "last_printed_date": null }, + "version": "2.2", + "warnings": [], "attachments": [ { - "version": "0.11.2", - "warnings": [], "content": { "structure": { "node_id": "0", "text": "", "annotations": [], "metadata": { - "page_id": 0, - "line_id": 0, "paragraph_type": "root", - "other_fields": {} + "page_id": 0, + "line_id": 0 }, "subparagraphs": [] }, "tables": [] }, "metadata": { - "uid": "attach_0bd61680-5e07-11ee-9889-0242ac120002", + "uid": "attach_7098fafc-e566-46d5-9125-adb6e9b047d8", "file_name": "image1.png", - "temporary_file_name": "1695909485_135.png", + "temporary_file_name": "1714647118_301.png", "size": 14874, - "modified_time": 1695909485, - "created_time": 1695909485, - "access_time": 1695909485, - "file_type": "image/png", - "other_fields": {} + "modified_time": 1714647118, + "created_time": 1714647118, + "access_time": 1714647118, + "file_type": "image/png" }, + "version": "2.2", + "warnings": [], "attachments": [] } ] diff --git 
a/docs/source/_static/json_format_examples/with_base64_attachments.json b/docs/source/_static/json_format_examples/with_base64_attachments.json index 638471b6..ccb536f5 100644 --- a/docs/source/_static/json_format_examples/with_base64_attachments.json +++ b/docs/source/_static/json_format_examples/with_base64_attachments.json @@ -1,16 +1,13 @@ { - "version": "0.11.2", - "warnings": [], "content": { "structure": { "node_id": "0", "text": "", "annotations": [], "metadata": { - "page_id": 0, - "line_id": 0, "paragraph_type": "root", - "other_fields": {} + "page_id": 0, + "line_id": 0 }, "subparagraphs": [ { @@ -55,10 +52,9 @@ } ], "metadata": { - "page_id": 0, - "line_id": 0, "paragraph_type": "header", - "other_fields": {} + "page_id": 0, + "line_id": 0 }, "subparagraphs": [ { @@ -103,10 +99,9 @@ } ], "metadata": { - "page_id": 0, - "line_id": 1, "paragraph_type": "header", - "other_fields": {} + "page_id": 0, + "line_id": 1 }, "subparagraphs": [ { @@ -131,18 +126,42 @@ "name": "spacing", "value": "120" }, + { + "start": 11, + "end": 38, + "name": "spacing", + "value": "140" + }, + { + "start": 38, + "end": 50, + "name": "spacing", + "value": "0" + }, { "start": 0, "end": 37, "name": "style", "value": "Body Text" }, + { + "start": 37, + "end": 50, + "name": "style", + "value": "Normal" + }, { "start": 0, "end": 25, "name": "size", "value": "16.0" }, + { + "start": 25, + "end": 30, + "name": "size", + "value": "10.0" + }, { "start": 30, "end": 37, @@ -150,10 +169,10 @@ "value": "16.0" }, { - "start": 11, - "end": 38, - "name": "spacing", - "value": "140" + "start": 38, + "end": 50, + "name": "size", + "value": "14.0" }, { "start": 11, @@ -167,42 +186,17 @@ "name": "italic", "value": "True" }, - { - "start": 25, - "end": 30, - "name": "size", - "value": "10.0" - }, { "start": 11, "end": 37, "name": "table", "value": "3a327789721e09b3fa6fd9560f3ee263" - }, - { - "start": 37, - "end": 50, - "name": "style", - "value": "Normal" - }, - { - "start": 38, - "end": 50, - 
"name": "spacing", - "value": "0" - }, - { - "start": 38, - "end": 50, - "name": "size", - "value": "14.0" } ], "metadata": { - "page_id": 0, - "line_id": 2, "paragraph_type": "raw_text", - "other_fields": {} + "page_id": 0, + "line_id": 2 }, "subparagraphs": [] }, @@ -211,10 +205,9 @@ "text": "", "annotations": [], "metadata": { - "page_id": 0, - "line_id": 6, "paragraph_type": "list", - "other_fields": {} + "page_id": 0, + "line_id": 6 }, "subparagraphs": [ { @@ -253,10 +246,9 @@ } ], "metadata": { - "page_id": 0, - "line_id": 6, "paragraph_type": "list_item", - "other_fields": {} + "page_id": 0, + "line_id": 6 }, "subparagraphs": [] }, @@ -298,14 +290,13 @@ "start": 0, "end": 14, "name": "attachment", - "value": "attach_0bef9bf0-5e07-11ee-9889-0242ac120002" + "value": "attach_e2f42908-09a9-40fc-9b75-2e1413abc275" } ], "metadata": { - "page_id": 0, - "line_id": 7, "paragraph_type": "list_item", - "other_fields": {} + "page_id": 0, + "line_id": 7 }, "subparagraphs": [] } @@ -359,8 +350,8 @@ ] } ], - "colspan": 2, "rowspan": 1, + "colspan": 2, "invisible": false }, { @@ -401,8 +392,8 @@ ] } ], - "colspan": 1, "rowspan": 1, + "colspan": 1, "invisible": true } ], @@ -445,8 +436,8 @@ ] } ], - "colspan": 1, "rowspan": 2, + "colspan": 1, "invisible": false }, { @@ -487,8 +478,8 @@ ] } ], - "colspan": 1, "rowspan": 1, + "colspan": 1, "invisible": false } ], @@ -531,8 +522,8 @@ ] } ], - "colspan": 1, "rowspan": 1, + "colspan": 1, "invisible": true }, { @@ -573,8 +564,8 @@ ] } ], - "colspan": 1, "rowspan": 1, + "colspan": 1, "invisible": false } ], @@ -617,8 +608,8 @@ ] } ], - "colspan": 1, "rowspan": 1, + "colspan": 1, "invisible": false }, { @@ -659,28 +650,29 @@ ] } ], - "colspan": 1, "rowspan": 1, + "colspan": 1, "invisible": false } ] ], "metadata": { - "uid": "3a327789721e09b3fa6fd9560f3ee263", "page_id": null, - "rotated_angle": 0.0 + "uid": "3a327789721e09b3fa6fd9560f3ee263", + "rotated_angle": 0.0, + "title": "" } } ] }, "metadata": { - "uid": 
"doc_uid_auto_0bfffd38-5e07-11ee-9889-0242ac120002", + "uid": "doc_uid_auto_ffd5115e-0871-11ef-b95c-0242ac120002", "file_name": "example_return_format.docx", - "temporary_file_name": "1695909485_393.docx", + "temporary_file_name": "1714647118_665.docx", "size": 21270, - "modified_time": 1695909485, - "created_time": 1695909485, - "access_time": 1695909485, + "modified_time": 1714647118, + "created_time": 1714647118, + "access_time": 1714647118, "file_type": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "document_subject": "", "keywords": "", @@ -690,52 +682,39 @@ "last_modified_by": "", "created_date": 1568725611, "modified_date": 1686912636, - "last_printed_date": null, - "other_fields": { - "document_subject": "", - "keywords": "", - "category": "", - "comments": "", - "author": "", - "last_modified_by": "", - "created_date": 1568725611, - "modified_date": 1686912636, - "last_printed_date": null - } + "last_printed_date": null }, + "version": "2.2", + "warnings": [], "attachments": [ { - "version": "0.11.2", - "warnings": [], "content": { "structure": { "node_id": "0", "text": "", "annotations": [], "metadata": { - "page_id": 0, - "line_id": 0, "paragraph_type": "root", - "other_fields": {} + "page_id": 0, + "line_id": 0 }, "subparagraphs": [] }, "tables": [] }, "metadata": { - "uid": "attach_0bef9bf0-5e07-11ee-9889-0242ac120002", + "uid": "attach_e2f42908-09a9-40fc-9b75-2e1413abc275", "file_name": "image1.png", - "temporary_file_name": "1695909485_423.png", + "temporary_file_name": "1714647118_610.png", "size": 14874, - "modified_time": 1695909485, - "created_time": 1695909485, - "access_time": 1695909485, + "modified_time": 1714647118, + "created_time": 1714647118, + "access_time": 1714647118, "file_type": "image/png", - "base64_encode": "", - "other_fields": { - "base64_encode": "" - } + "base64_encode": "" }, + "version": "2.2", + "warnings": [], "attachments": [] } ] diff --git 
a/docs/source/_static/json_format_examples/with_parsed_attachments.json b/docs/source/_static/json_format_examples/with_parsed_attachments.json index 0b8c165e..b8022980 100644 --- a/docs/source/_static/json_format_examples/with_parsed_attachments.json +++ b/docs/source/_static/json_format_examples/with_parsed_attachments.json @@ -1,16 +1,13 @@ { - "version": "0.11.2", - "warnings": [], "content": { "structure": { "node_id": "0", "text": "", "annotations": [], "metadata": { - "page_id": 0, - "line_id": 0, "paragraph_type": "root", - "other_fields": {} + "page_id": 0, + "line_id": 0 }, "subparagraphs": [ { @@ -55,10 +52,9 @@ } ], "metadata": { - "page_id": 0, - "line_id": 0, "paragraph_type": "header", - "other_fields": {} + "page_id": 0, + "line_id": 0 }, "subparagraphs": [ { @@ -103,10 +99,9 @@ } ], "metadata": { - "page_id": 0, - "line_id": 1, "paragraph_type": "header", - "other_fields": {} + "page_id": 0, + "line_id": 1 }, "subparagraphs": [ { @@ -131,18 +126,42 @@ "name": "spacing", "value": "120" }, + { + "start": 11, + "end": 38, + "name": "spacing", + "value": "140" + }, + { + "start": 38, + "end": 50, + "name": "spacing", + "value": "0" + }, { "start": 0, "end": 37, "name": "style", "value": "Body Text" }, + { + "start": 37, + "end": 50, + "name": "style", + "value": "Normal" + }, { "start": 0, "end": 25, "name": "size", "value": "16.0" }, + { + "start": 25, + "end": 30, + "name": "size", + "value": "10.0" + }, { "start": 30, "end": 37, @@ -150,10 +169,10 @@ "value": "16.0" }, { - "start": 11, - "end": 38, - "name": "spacing", - "value": "140" + "start": 38, + "end": 50, + "name": "size", + "value": "14.0" }, { "start": 11, @@ -167,42 +186,17 @@ "name": "italic", "value": "True" }, - { - "start": 25, - "end": 30, - "name": "size", - "value": "10.0" - }, { "start": 11, "end": 37, "name": "table", "value": "3a327789721e09b3fa6fd9560f3ee263" - }, - { - "start": 37, - "end": 50, - "name": "style", - "value": "Normal" - }, - { - "start": 38, - "end": 50, - 
"name": "spacing", - "value": "0" - }, - { - "start": 38, - "end": 50, - "name": "size", - "value": "14.0" } ], "metadata": { - "page_id": 0, - "line_id": 2, "paragraph_type": "raw_text", - "other_fields": {} + "page_id": 0, + "line_id": 2 }, "subparagraphs": [] }, @@ -211,10 +205,9 @@ "text": "", "annotations": [], "metadata": { - "page_id": 0, - "line_id": 6, "paragraph_type": "list", - "other_fields": {} + "page_id": 0, + "line_id": 6 }, "subparagraphs": [ { @@ -253,10 +246,9 @@ } ], "metadata": { - "page_id": 0, - "line_id": 6, "paragraph_type": "list_item", - "other_fields": {} + "page_id": 0, + "line_id": 6 }, "subparagraphs": [] }, @@ -298,14 +290,13 @@ "start": 0, "end": 14, "name": "attachment", - "value": "attach_0c051e4e-5e07-11ee-9889-0242ac120002" + "value": "attach_a6dcd6be-89d4-48aa-85c0-77828e3cfa13" } ], "metadata": { - "page_id": 0, - "line_id": 7, "paragraph_type": "list_item", - "other_fields": {} + "page_id": 0, + "line_id": 7 }, "subparagraphs": [] } @@ -359,8 +350,8 @@ ] } ], - "colspan": 2, "rowspan": 1, + "colspan": 2, "invisible": false }, { @@ -401,8 +392,8 @@ ] } ], - "colspan": 1, "rowspan": 1, + "colspan": 1, "invisible": true } ], @@ -445,8 +436,8 @@ ] } ], - "colspan": 1, "rowspan": 2, + "colspan": 1, "invisible": false }, { @@ -487,8 +478,8 @@ ] } ], - "colspan": 1, "rowspan": 1, + "colspan": 1, "invisible": false } ], @@ -531,8 +522,8 @@ ] } ], - "colspan": 1, "rowspan": 1, + "colspan": 1, "invisible": true }, { @@ -573,8 +564,8 @@ ] } ], - "colspan": 1, "rowspan": 1, + "colspan": 1, "invisible": false } ], @@ -617,8 +608,8 @@ ] } ], - "colspan": 1, "rowspan": 1, + "colspan": 1, "invisible": false }, { @@ -659,28 +650,29 @@ ] } ], - "colspan": 1, "rowspan": 1, + "colspan": 1, "invisible": false } ] ], "metadata": { - "uid": "3a327789721e09b3fa6fd9560f3ee263", "page_id": null, - "rotated_angle": 0.0 + "uid": "3a327789721e09b3fa6fd9560f3ee263", + "rotated_angle": 0.0, + "title": "" } } ] }, "metadata": { - "uid": 
"doc_uid_auto_0c30f938-5e07-11ee-9889-0242ac120002", + "uid": "doc_uid_auto_ffe992f0-0871-11ef-b95c-0242ac120002", "file_name": "example_return_format.docx", - "temporary_file_name": "1695909485_745.docx", + "temporary_file_name": "1714647118_591.docx", "size": 21270, - "modified_time": 1695909485, - "created_time": 1695909485, - "access_time": 1695909485, + "modified_time": 1714647118, + "created_time": 1714647118, + "access_time": 1714647118, "file_type": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "document_subject": "", "keywords": "", @@ -690,33 +682,21 @@ "last_modified_by": "", "created_date": 1568725611, "modified_date": 1686912636, - "last_printed_date": null, - "other_fields": { - "document_subject": "", - "keywords": "", - "category": "", - "comments": "", - "author": "", - "last_modified_by": "", - "created_date": 1568725611, - "modified_date": 1686912636, - "last_printed_date": null - } + "last_printed_date": null }, + "version": "2.2", + "warnings": [], "attachments": [ { - "version": "0.11.2", - "warnings": [], "content": { "structure": { "node_id": "0", "text": "", "annotations": [], "metadata": { - "page_id": 0, - "line_id": 0, "paragraph_type": "root", - "other_fields": {} + "page_id": 0, + "line_id": 0 }, "subparagraphs": [ { @@ -735,18 +715,24 @@ "name": "confidence", "value": "0.96" }, - { - "start": 0, - "end": 3, - "name": "bounding box", - "value": "{\"x_top_left\": 0.05417276720351391, \"y_top_left\": 0.27358490566037735, \"width\": 0.0527086383601757, \"height\": 0.12264150943396226, \"page_width\": 683, \"page_height\": 106}" - }, { "start": 4, "end": 9, "name": "confidence", "value": "0.95" }, + { + "start": 18, + "end": 26, + "name": "confidence", + "value": "0.77" + }, + { + "start": 0, + "end": 3, + "name": "bounding box", + "value": "{\"x_top_left\": 0.05417276720351391, \"y_top_left\": 0.27358490566037735, \"width\": 0.0527086383601757, \"height\": 0.12264150943396226, \"page_width\": 683, 
\"page_height\": 106}" + }, { "start": 4, "end": 9, @@ -759,12 +745,6 @@ "name": "bounding box", "value": "{\"x_top_left\": 0.212298682284041, \"y_top_left\": 0.27358490566037735, \"width\": 0.11859443631039532, \"height\": 0.12264150943396226, \"page_width\": 683, \"page_height\": 106}" }, - { - "start": 18, - "end": 26, - "name": "confidence", - "value": "0.77" - }, { "start": 18, "end": 26, @@ -773,9 +753,9 @@ }, { "start": 0, - "end": 27, - "name": "bounding box", - "value": "{\"x_top_left\": 0.05417276720351391, \"y_top_left\": 0.27358490566037735, \"width\": 0.445095168374817, \"height\": 0.12264150943396226, \"page_width\": 683, \"page_height\": 106}" + "end": 9, + "name": "bold", + "value": "True" }, { "start": 0, @@ -797,10 +777,9 @@ } ], "metadata": { - "page_id": 0, - "line_id": 0, "paragraph_type": "raw_text", - "other_fields": {} + "page_id": 0, + "line_id": 0 }, "subparagraphs": [] }, @@ -809,10 +788,9 @@ "text": "", "annotations": [], "metadata": { - "page_id": 0, - "line_id": 1, "paragraph_type": "list", - "other_fields": {} + "page_id": 0, + "line_id": 1 }, "subparagraphs": [ { @@ -825,24 +803,12 @@ "name": "confidence", "value": "0.93" }, - { - "start": 0, - "end": 2, - "name": "bounding box", - "value": "{\"x_top_left\": 0.05710102489019034, \"y_top_left\": 0.4811320754716981, \"width\": 0.020497803806734993, \"height\": 0.16037735849056603, \"page_width\": 683, \"page_height\": 106}" - }, { "start": 3, "end": 9, "name": "confidence", "value": "0.81" }, - { - "start": 3, - "end": 9, - "name": "bounding box", - "value": "{\"x_top_left\": 0.08345534407027819, \"y_top_left\": 0.4811320754716981, \"width\": 0.0629575402635432, \"height\": 0.12264150943396226, \"page_width\": 683, \"page_height\": 106}" - }, { "start": 10, "end": 14, @@ -873,6 +839,18 @@ "name": "confidence", "value": "0.96" }, + { + "start": 0, + "end": 2, + "name": "bounding box", + "value": "{\"x_top_left\": 0.05710102489019034, \"y_top_left\": 0.4811320754716981, \"width\": 
0.020497803806734993, \"height\": 0.16037735849056603, \"page_width\": 683, \"page_height\": 106}" + }, + { + "start": 3, + "end": 9, + "name": "bounding box", + "value": "{\"x_top_left\": 0.08345534407027819, \"y_top_left\": 0.4811320754716981, \"width\": 0.0629575402635432, \"height\": 0.12264150943396226, \"page_width\": 683, \"page_height\": 106}" + }, { "start": 10, "end": 14, @@ -904,10 +882,10 @@ "value": "{\"x_top_left\": 0.4216691068814056, \"y_top_left\": 0.5, \"width\": 0.0424597364568082, \"height\": 0.10377358490566038, \"page_width\": 683, \"page_height\": 106}" }, { - "start": 0, - "end": 38, - "name": "bounding box", - "value": "{\"x_top_left\": 0.05710102489019034, \"y_top_left\": 0.4811320754716981, \"width\": 0.40702781844802344, \"height\": 0.16037735849056603, \"page_width\": 683, \"page_height\": 106}" + "start": 25, + "end": 37, + "name": "bold", + "value": "True" }, { "start": 0, @@ -929,10 +907,9 @@ } ], "metadata": { - "page_id": 0, - "line_id": 1, "paragraph_type": "list_item", - "other_fields": {} + "page_id": 0, + "line_id": 1 }, "subparagraphs": [] }, @@ -946,12 +923,6 @@ "name": "confidence", "value": "0.94" }, - { - "start": 0, - "end": 2, - "name": "bounding box", - "value": "{\"x_top_left\": 0.05417276720351391, \"y_top_left\": 0.6981132075471698, \"width\": 0.02342606149341142, \"height\": 0.16037735849056603, \"page_width\": 683, \"page_height\": 106}" - }, { "start": 3, "end": 9, @@ -1030,6 +1001,18 @@ "name": "confidence", "value": "0.96" }, + { + "start": 49, + "end": 52, + "name": "confidence", + "value": "0.97" + }, + { + "start": 0, + "end": 2, + "name": "bounding box", + "value": "{\"x_top_left\": 0.05417276720351391, \"y_top_left\": 0.6981132075471698, \"width\": 0.02342606149341142, \"height\": 0.16037735849056603, \"page_width\": 683, \"page_height\": 106}" + }, { "start": 3, "end": 9, @@ -1078,12 +1061,6 @@ "name": "bounding box", "value": "{\"x_top_left\": 0.5666178623718887, \"y_top_left\": 0.7169811320754716, 
\"width\": 0.048316251830161056, \"height\": 0.10377358490566038, \"page_width\": 683, \"page_height\": 106}" }, - { - "start": 49, - "end": 52, - "name": "confidence", - "value": "0.97" - }, { "start": 49, "end": 52, @@ -1121,10 +1098,22 @@ "value": "{\"x_top_left\": 0.8843338213762811, \"y_top_left\": 0.6981132075471698, \"width\": 0.055636896046852125, \"height\": 0.12264150943396226, \"page_width\": 683, \"page_height\": 106}" }, { - "start": 0, - "end": 76, - "name": "bounding box", - "value": "{\"x_top_left\": 0.05417276720351391, \"y_top_left\": 0.6886792452830188, \"width\": 0.8857979502196194, \"height\": 0.16981132075471697, \"page_width\": 683, \"page_height\": 106}" + "start": 25, + "end": 29, + "name": "bold", + "value": "True" + }, + { + "start": 35, + "end": 48, + "name": "bold", + "value": "True" + }, + { + "start": 56, + "end": 61, + "name": "bold", + "value": "True" }, { "start": 0, @@ -1146,10 +1135,9 @@ } ], "metadata": { - "page_id": 0, - "line_id": 2, "paragraph_type": "list_item", - "other_fields": {} + "page_id": 0, + "line_id": 2 }, "subparagraphs": [] } @@ -1160,23 +1148,20 @@ "tables": [] }, "metadata": { - "uid": "attach_0c051e4e-5e07-11ee-9889-0242ac120002", + "uid": "attach_a6dcd6be-89d4-48aa-85c0-77828e3cfa13", "file_name": "image1.png", - "temporary_file_name": "1695909485_118.png", + "temporary_file_name": "1714647118_126.png", "size": 14874, - "modified_time": 1695909485, - "created_time": 1695909485, - "access_time": 1695909485, + "modified_time": 1714647118, + "created_time": 1714647118, + "access_time": 1714647118, "file_type": "image/png", "rotated_page_angles": [ - 0 - ], - "other_fields": { - "rotated_page_angles": [ - 0 - ] - } + 0.0 + ] }, + "version": "2.2", + "warnings": [], "attachments": [] } ] diff --git a/docs/source/_static/structure_examples/fintoc.pdf b/docs/source/_static/structure_examples/fintoc.pdf new file mode 100644 index 00000000..bceb2735 Binary files /dev/null and 
b/docs/source/_static/structure_examples/fintoc.pdf differ diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst index a49a089d..6a636b40 100644 --- a/docs/source/changelog.rst +++ b/docs/source/changelog.rst @@ -1,6 +1,16 @@ Changelog ========= +v2.2.1 (2024-05-03) +------------------- +Release note: `v2.2.1 `_ + +* Added `fintoc` structure type for parsing financial prospects according to the `FinTOC 2022 Shared task `_ (`FintocStructureExtractor`). +* Fixed small bugs in `ArticleReader`: colspan for tables, keywords, sections numbering, etc. +* Added references to nodes and fixed small bugs in the HTML output representation (return_format="html"). +* Removed `other_fields` from `LineMetadata` and `DocumentMetadata`. +* Update `README.md`. + v2.2 (2023-04-17) ----------------- Release note: `v2.2 `_ @@ -15,7 +25,7 @@ v2.1.1 (2024-03-21) ------------------- Release note: `v2.1.1 `_ -* Update README.md. +* Update `README.md`. * Update table and time benchmarks. * Re-label line-classifier datasets (law, tz, diploma, paragraphs datasets). * Update tasker creators (for the labeling system). diff --git a/docs/source/dedoc_api_usage/api.rst b/docs/source/dedoc_api_usage/api.rst index b06b345d..ee68c29f 100644 --- a/docs/source/dedoc_api_usage/api.rst +++ b/docs/source/dedoc_api_usage/api.rst @@ -70,7 +70,7 @@ Api parameters description * - :cspan:`3` **Type of document structure parsing** * - document_type - - other, law, tz, diploma + - other, law, tz, diploma, article, fintoc - other - Type of the document structure according to specific domain. @@ -80,7 +80,8 @@ Api parameters description * **law** -- Russian laws (:ref:`law_structure`); * **tz** -- Russian technical specifications (:ref:`tz_structure`); * **diploma** -- Russian thesis (:ref:`diploma_structure`); - * **article** -- scientific article (:ref:`article_structure`).
+ * **article** -- scientific article (:ref:`article_structure`); + * **fintoc** -- English, French and Spanish financial prospects (:ref:`fintoc_structure`). This type is used for choosing a specific structure extractor (and, in some cases, a specific reader). @@ -215,13 +216,15 @@ Api parameters description It is highly recommended to use this option value for any PDF document parsing. * - language - - rus, eng, rus+eng + - rus, eng, rus+eng, fra, spa - rus+eng - Language of the parsed PDF document without a textual layer. The following values are available: * **rus** -- Russian; * **eng** -- English; - * **rus+eng** -- both Russian and English. + * **rus+eng** -- both Russian and English; + * **fra** -- French (for fintoc structure type); + * **spa** -- Spanish (for fintoc structure type). * - pages - :, start:, :end, start:end diff --git a/docs/source/dedoc_api_usage/api_schema.rst b/docs/source/dedoc_api_usage/api_schema.rst index e6c1c228..ea8d5b8a 100644 --- a/docs/source/dedoc_api_usage/api_schema.rst +++ b/docs/source/dedoc_api_usage/api_schema.rst @@ -29,7 +29,6 @@ Json schema of the output is also available during dedoc application running on .. autoattribute:: created_time .. autoattribute:: access_time .. autoattribute:: file_type - .. autoattribute:: other_fields .. autoclass:: dedoc.api.schema.TreeNode @@ -49,7 +48,6 @@ Json schema of the output is also available during dedoc application running on .. autoattribute:: paragraph_type .. autoattribute:: page_id .. autoattribute:: line_id - .. autoattribute:: other_fields .. autoclass:: dedoc.api.schema.Table diff --git a/docs/source/dedoc_api_usage/return_format.rst b/docs/source/dedoc_api_usage/return_format.rst index 250b7c8a..e74c0dec 100644 --- a/docs/source/dedoc_api_usage/return_format.rst +++ b/docs/source/dedoc_api_usage/return_format.rst @@ -41,7 +41,7 @@ The beginning of the document's content: .. 
literalinclude:: ../_static/json_format_examples/basic_example.json :language: json - :lines: 4-24 + :lines: 2-22 The key "node_id" means the level of the line in a document tree. The amount of numbers separated by dot shows the depth of the line inside the document tree, @@ -52,25 +52,25 @@ so it's is a subparagraph of the line with text "Document example": .. literalinclude:: ../_static/json_format_examples/basic_example.json :language: json - :lines: 63-66 + :lines: 59-62 The beginning of the document's tables: .. literalinclude:: ../_static/json_format_examples/basic_example.json :language: json - :lines: 320-346 + :lines: 342-368 The beginning of the document's metadata: .. literalinclude:: ../_static/json_format_examples/basic_example.json :language: json - :lines: 676-684 + :lines: 699-707 The document's attachments: .. literalinclude:: ../_static/json_format_examples/basic_example.json :language: json - :lines: 706 + :lines: 720 As we see, the ``attachments`` field is empty because the option ``with_attachments`` is set to ``"false"`` by default (see :ref:`table_parameters`). @@ -91,14 +91,14 @@ The beginning of the document's content is the same as in the previous example w .. literalinclude:: ../_static/json_format_examples/linear_structure_type.json :language: json - :lines: 4-24 + :lines: 2-22 But the next document line isn't a subparagraph of the document's title (line with text "Document example"), it has the same level in the document's tree hierarchy. .. literalinclude:: ../_static/json_format_examples/linear_structure_type.json :language: json - :lines: 66-68 + :lines: 62-63 All remaining document lines have the same level as well. @@ -118,7 +118,7 @@ Unlike the previous examples, in this case we have ``attachments`` field filled: .. 
literalinclude:: ../_static/json_format_examples/with_attachments.json :language: json - :lines: 706-738 + :lines: 689-719 Example with base64 attachments ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -136,7 +136,7 @@ The only difference is in the attachment's metadata: attachment's content is enc .. literalinclude:: ../_static/json_format_examples/with_base64_attachments.json :language: json - :lines: 706-741 + :lines: 689-720 Example with parsed attachments ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -156,4 +156,4 @@ The beginning of the document's attachments: .. literalinclude:: ../_static/json_format_examples/with_parsed_attachments.json :language: json - :lines: 706-731 + :lines: 689-711 diff --git a/docs/source/index.rst b/docs/source/index.rst index 92582bec..779a6adb 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -209,6 +209,7 @@ Currently the following domains can be handled: * Russian technical specifications (:ref:`structure description `). * Russian thesis for bachelor or master degree (:ref:`structure description `). * English scientific articles (:ref:`structure description `). + * English, French and Spanish financial prospects (:ref:`structure description `). For a document of unknown or unsupported domain there is an option to use default structure extractor (``document_type=other`` at :ref:`api_parameters`), the default document structure described :ref:`here `. @@ -257,6 +258,7 @@ For a document of unknown or unsupported domain there is an option to use defaul structure_types/tz structure_types/diploma structure_types/article + structure_types/fintoc .. toctree:: diff --git a/docs/source/modules/structure_extractors.rst b/docs/source/modules/structure_extractors.rst index 08655f06..86ded2c3 100644 --- a/docs/source/modules/structure_extractors.rst +++ b/docs/source/modules/structure_extractors.rst @@ -58,3 +58,9 @@ dedoc.structure_extractors :members: .. autoattribute:: document_type + +.. 
autoclass:: dedoc.structure_extractors.FintocStructureExtractor + :show-inheritance: + :members: + + .. autoattribute:: document_type diff --git a/docs/source/parameters/pdf_handling.rst b/docs/source/parameters/pdf_handling.rst index b3781b2d..60ad0096 100644 --- a/docs/source/parameters/pdf_handling.rst +++ b/docs/source/parameters/pdf_handling.rst @@ -49,16 +49,19 @@ PDF and images handling It is highly recommended to use this option value for any PDF document parsing. * - language - - rus, eng, rus+eng + - rus, eng, rus+eng, fra, spa - rus+eng - * :meth:`dedoc.DedocManager.parse` * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfImageReader.read` * :meth:`dedoc.readers.ReaderComposition.read` + * :meth:`dedoc.structure_extractors.FintocStructureExtractor.extract` - Language of the document without a textual layer. The following values are available: * **rus** -- Russian; * **eng** -- English; - * **rus+eng** -- both Russian and English. + * **rus+eng** -- both Russian and English; + * **fra** -- French (for :class:`~dedoc.structure_extractors.FintocStructureExtractor`); + * **spa** -- Spanish (for :class:`~dedoc.structure_extractors.FintocStructureExtractor`). * - pages - :, start:, :end, start:end diff --git a/docs/source/parameters/structure_type.rst b/docs/source/parameters/structure_type.rst index 842f6afc..09b592c2 100644 --- a/docs/source/parameters/structure_type.rst +++ b/docs/source/parameters/structure_type.rst @@ -15,23 +15,25 @@ Structure type configuring - Description * - document_type - - other, law, tz, diploma + - other, law, tz, diploma, article, fintoc - other - * :meth:`dedoc.DedocManager.parse` * :meth:`dedoc.structure_extractors.StructureExtractorComposition.extract` - Type of the document structure according to specific domain. 
If you use default manager config for :class:`~dedoc.DedocManager`, then the following options are available: - * **other** -- structure for document of any domain (:ref:`other_structure`) + * **other** -- structure for document of any domain (:ref:`other_structure`). In this case, :class:`~dedoc.structure_extractors.DefaultStructureExtractor` is used. - * **law** -- Russian laws (:ref:`law_structure`) + * **law** -- Russian laws (:ref:`law_structure`). In this case, :class:`~dedoc.structure_extractors.ClassifyingLawStructureExtractor` is used. - * **tz** -- Russian technical specifications (:ref:`tz_structure`) + * **tz** -- Russian technical specifications (:ref:`tz_structure`). In this case, :class:`~dedoc.structure_extractors.TzStructureExtractor` is used. - * **diploma** -- Russian thesis (:ref:`diploma_structure`) + * **diploma** -- Russian thesis (:ref:`diploma_structure`). In this case, :class:`~dedoc.structure_extractors.DiplomaStructureExtractor` is used. - * **article** -- scientific article (:ref:`article_structure`) + * **article** -- scientific article (:ref:`article_structure`). In this case, :class:`~dedoc.readers.ArticleReader` and :class:`~dedoc.structure_extractors.ArticleStructureExtractor` are used. + * **fintoc** -- English, French and Spanish financial prospects (:ref:`fintoc_structure`). + In this case, :class:`~dedoc.structure_extractors.FintocStructureExtractor` is used. 
If you use your custom configuration, look to the documentation of :class:`~dedoc.structure_extractors.StructureExtractorComposition` diff --git a/docs/source/structure_types/article.rst b/docs/source/structure_types/article.rst index 8de477e9..2add03f1 100644 --- a/docs/source/structure_types/article.rst +++ b/docs/source/structure_types/article.rst @@ -27,6 +27,7 @@ There are the following line types in the article structure type: * ``root``; * ``author`` (includes ``author_first_name``, ``author_surname``, ``email``); + * ``keywords`` (includes ``keyword``); * ``author_affiliation`` (includes ``org_name``, ``address``); * ``abstract``; * ``section``; @@ -106,11 +107,17 @@ Below is a description of nodes in the output tree: .. literalinclude:: ../_static/json_format_examples/article_example.json :language: json - :lines: 125-198 + :lines: 115-182 + + * **keywords** node (if it exists) is a child node of the node ``root``. + + ``keywords`` node contains ``keyword`` nodes as children. Each ``keyword`` node contains the text of one key word item. * **abstract** is the article's abstract section ( tag in GROBID's output). - * **section**: nodes of article sections (for example "Introduction", "Conclusion", "V Experiments ..." etc.). This type of node has a subnode ``raw_text``. ``section`` nodes are children of a node ``root``. + * **section**: nodes of article sections (for example "Introduction", "Conclusion", "V Experiments ..." etc.). This type of node has a subnode ``raw_text``. + + ``section`` nodes are children of a node ``root`` and may be nested (e.g., section "2.1. Datasets" is nested in the section "2. Related work"). * **bibliography** is the article's bibliography list which contains only ``bibliography_item`` nodes. @@ -169,11 +176,11 @@ Below is a description of nodes in the output tree: All ``bibliography_item`` nodes are children of the ``bibliography`` node. The example of the bibliography item parsing of the article in dedoc: - ..
example of "node_id": "0.20.5" + .. example of "node_id": "0.12.5" .. literalinclude:: ../_static/json_format_examples/article_example.json :language: json - :lines: 1745-1880 + :lines: 1591-1713 * **bibliography references**: bibliography references in annotations of the article's text. @@ -186,25 +193,25 @@ Below is a description of nodes in the output tree: Example of a bibliography reference in dedoc is given below. There is a textual node with two bibliography references (with two annotations): - .. example of "node_id": "0.15.0" + .. example of "node_id": "0.10.0" .. literalinclude:: ../_static/json_format_examples/article_example.json :language: json - :lines: 1085-1109 + :lines: 1038-1061 In the example, the annotations reference two ``bibliography_item`` nodes: - .. example of "node_id": "0.20.33" + .. example of "node_id": "0.12.33" .. literalinclude:: ../_static/json_format_examples/article_example.json :language: json - :lines: 4581-4593 + :lines: 4144-4153 - .. example of "node_id": "0.20.61" + .. example of "node_id": "0.12.61" .. literalinclude:: ../_static/json_format_examples/article_example.json :language: json - :lines: 7501-7513 + :lines: 6774-6783 * **raw_text**: node referring to a simple document line. diff --git a/docs/source/structure_types/fintoc.rst b/docs/source/structure_types/fintoc.rst new file mode 100644 index 00000000..bb6d2e2d --- /dev/null +++ b/docs/source/structure_types/fintoc.rst @@ -0,0 +1,142 @@ +.. _fintoc_structure: + +FinTOC structure type +===================== + +This structure type is used for analysis of English, French and Spanish financial prospects in PDF format +according to the `FinTOC 2022 Shared task `_. +You can see the :download:`example <../_static/structure_examples/fintoc.pdf>` of the document of this structure type. 
+ +According to the FinTOC 2022 Shared task, there were two subtasks to be solved: + + * **Title detection (TD)** -- selecting from all lines of the document only those that should be included in the table of contents. + * **Table of contents (TOC) generation** -- identifying the nesting depths of the selected titles. + +Based on these tasks, we propose the FinTOC structure type with **header** and **raw_text** node types. +The detailed description of each node type: + + * **header** -- title nodes (from the title detection task). Titles can be nested, so their depth is determined according to the + TOC generation task. **header** nodes can have other **header** nodes or **raw_text** nodes as children nodes. + * **raw_text** -- non-title nodes. Unlike the result of the TOC generation task, + the resulting document tree also includes non-title lines. **raw_text** node refers to a simple document line. + It has the least importance in the document tree hierarchy, so it is situated in the leaves of the tree. + It is nested in the node corresponding to the previous line with a more important type. + + +The documents for the FinTOC 2022 Shared task are PDF files with a textual layer, +so it is recommended to use :class:`~dedoc.readers.PdfTxtlayerReader` or :class:`~dedoc.readers.PdfTabbyReader` for their parsing +(``pdf_with_text_layer="true"`` or ``pdf_with_text_layer="tabby"`` in the :ref:`API parameters `). + +.. note:: + + During the structure extraction step, we use classifiers trained on data extracted by :class:`~dedoc.readers.PdfTxtlayerReader` -- + usage of :class:`~dedoc.readers.PdfTxtlayerReader` or ``pdf_with_text_layer="true"`` is preferable. + +The training dataset contains English, French, and Spanish documents, so three language options are available ("en", "fr", "sp"). +It is possible to set the document's language using the ``language`` option in parameters (e.g., ``parameters={"language": "en"}``).
+ +To obtain FinTOC structure, we use our method described in `our article `_ +(winners of FinTOC 2022 Shared task!). +The results of our method for different languages and readers are given in the :ref:`table below ` +(they slightly changed since the competition finished). +The name of each experiment consists of the reader type ("tabby" -- :class:`~dedoc.readers.PdfTabbyReader`, +"txt_layer" -- :class:`~dedoc.readers.PdfTxtlayerReader`) +and the document's language ("en" -- English, "fr" -- French, "sp" -- Spanish). +As in the FinTOC 2022 Shared task, we use two metrics for results evaluation (metrics from the `article `_): +**TD** -- F1 measure for the title detection task, **TOC** -- harmonic mean of Inex F1 score and Inex level accuracy for the TOC generation task. + +.. _fintoc_results_table: + +.. list-table:: The results from 3-fold cross-validation on the FinTOC 2022 training dataset + :widths: 20 10 10 10 10 10 10 10 10 + :header-rows: 1 + + * - Name + - TD 0 + - TD 1 + - TD 2 + - TD mean + - TOC 0 + - TOC 1 + - TOC 2 + - TOC mean + * - **en_tabby** + - 0.811 + - 0.833 + - 0.864 + - **0.836** + - 56.5 + - 58.0 + - 64.9 + - **59.8** + * - **en_txt_layer** + - 0.821 + - 0.853 + - 0.833 + - **0.836** + - 57.8 + - 62.1 + - 57.8 + - **59.2** + * - **fr_tabby** + - 0.753 + - 0.744 + - 0.782 + - **0.759** + - 51.2 + - 47.9 + - 51.5 + - **50.2** + * - **fr_txt_layer** + - 0.740 + - 0.794 + - 0.766 + - **0.767** + - 45.6 + - 52.2 + - 50.1 + - **49.3** + * - **sp_tabby** + - 0.606 + - 0.622 + - 0.599 + - **0.609** + - 37.1 + - 43.6 + - 43.4 + - **41.3** + * - **sp_txt_layer** + - 0.629 + - 0.667 + - 0.446 + - **0.581** + - 46.4 + - 48.8 + - 30.7 + - **41.9** + +.. seealso:: + + Please see our article `ISPRAS@FinTOC-2022 shared task: Two-stage TOC generation model `_ + to get more information about the FinTOC 2022 Shared task and our method of solving it. + We will be grateful, if you cite our work (see citation in BibTeX format below). + +.. 
code-block:: RST + + @inproceedings{bogatenkova-etal-2022-ispras, + title = "{ISPRAS}@{F}in{TOC}-2022 Shared Task: Two-stage {TOC} Generation Model", + author = "Bogatenkova, Anastasiia and + Belyaeva, Oksana Vladimirovna and + Perminov, Andrew Igorevich and + Kozlov, Ilya Sergeevich", + editor = "El-Haj, Mahmoud and + Rayson, Paul and + Zmandar, Nadhem", + booktitle = "Proceedings of the 4th Financial Narrative Processing Workshop @LREC2022", + month = jun, + year = "2022", + address = "Marseille, France", + publisher = "European Language Resources Association", + url = "https://aclanthology.org/2022.fnp-1.13", + pages = "89--94" + } diff --git a/labeling/tests/data/laws/law_classifier_000000_Bhw.json b/labeling/tests/data/laws/law_classifier_000000_Bhw.json index eff7ffaf..81d3e477 100644 --- a/labeling/tests/data/laws/law_classifier_000000_Bhw.json +++ b/labeling/tests/data/laws/law_classifier_000000_Bhw.json @@ -21,8 +21,7 @@ "line_type": "root" }, "page_id": 0, - "line_id": 4, - "_LineMetadata__other_fields": {} + "line_id": 4 }, "_annotations": [ { @@ -69,8 +68,7 @@ "line_type": "root" }, "page_id": 0, - "line_id": 6, - "_LineMetadata__other_fields": {} + "line_id": 6 }, "_annotations": [ { @@ -117,8 +115,7 @@ "line_type": "root" }, "page_id": 0, - "line_id": 8, - "_LineMetadata__other_fields": {} + "line_id": 8 }, "_annotations": [ { @@ -165,8 +162,7 @@ "line_type": "root" }, "page_id": 0, - "line_id": 10, - "_LineMetadata__other_fields": {} + "line_id": 10 }, "_annotations": [ { @@ -213,8 +209,7 @@ "line_type": "root" }, "page_id": 0, - "line_id": 12, - "_LineMetadata__other_fields": {} + "line_id": 12 }, "_annotations": [ { @@ -261,8 +256,7 @@ "line_type": "root" }, "page_id": 0, - "line_id": 13, - "_LineMetadata__other_fields": {} + "line_id": 13 }, "_annotations": [ { @@ -309,8 +303,7 @@ "line_type": "root" }, "page_id": 0, - "line_id": 14, - "_LineMetadata__other_fields": {} + "line_id": 14 }, "_annotations": [ { @@ -357,8 +350,7 @@ "line_type": "root" 
}, "page_id": 0, - "line_id": 15, - "_LineMetadata__other_fields": {} + "line_id": 15 }, "_annotations": [ { @@ -405,8 +397,7 @@ "line_type": "root" }, "page_id": 0, - "line_id": 16, - "_LineMetadata__other_fields": {} + "line_id": 16 }, "_annotations": [ { @@ -453,8 +444,7 @@ "line_type": "root" }, "page_id": 0, - "line_id": 17, - "_LineMetadata__other_fields": {} + "line_id": 17 }, "_annotations": [ { @@ -501,8 +491,7 @@ "line_type": "root" }, "page_id": 0, - "line_id": 18, - "_LineMetadata__other_fields": {} + "line_id": 18 }, "_annotations": [ { @@ -549,8 +538,7 @@ "line_type": "root" }, "page_id": 0, - "line_id": 19, - "_LineMetadata__other_fields": {} + "line_id": 19 }, "_annotations": [ { @@ -597,8 +585,7 @@ "line_type": "root" }, "page_id": 0, - "line_id": 20, - "_LineMetadata__other_fields": {} + "line_id": 20 }, "_annotations": [ { @@ -645,8 +632,7 @@ "line_type": "root" }, "page_id": 0, - "line_id": 21, - "_LineMetadata__other_fields": {} + "line_id": 21 }, "_annotations": [ { @@ -693,8 +679,7 @@ "line_type": "root" }, "page_id": 0, - "line_id": 22, - "_LineMetadata__other_fields": {} + "line_id": 22 }, "_annotations": [ { @@ -741,8 +726,7 @@ "line_type": "root" }, "page_id": 0, - "line_id": 23, - "_LineMetadata__other_fields": {} + "line_id": 23 }, "_annotations": [ { @@ -789,8 +773,7 @@ "line_type": "root" }, "page_id": 0, - "line_id": 24, - "_LineMetadata__other_fields": {} + "line_id": 24 }, "_annotations": [ { @@ -837,8 +820,7 @@ "line_type": "root" }, "page_id": 0, - "line_id": 25, - "_LineMetadata__other_fields": {} + "line_id": 25 }, "_annotations": [ { @@ -885,8 +867,7 @@ "line_type": "root" }, "page_id": 0, - "line_id": 27, - "_LineMetadata__other_fields": {} + "line_id": 27 }, "_annotations": [ { @@ -933,8 +914,7 @@ "line_type": "root" }, "page_id": 0, - "line_id": 28, - "_LineMetadata__other_fields": {} + "line_id": 28 }, "_annotations": [ { @@ -981,8 +961,7 @@ "line_type": "root" }, "page_id": 0, - "line_id": 29, - 
"_LineMetadata__other_fields": {} + "line_id": 29 }, "_annotations": [ { @@ -1029,8 +1008,7 @@ "line_type": "root" }, "page_id": 0, - "line_id": 30, - "_LineMetadata__other_fields": {} + "line_id": 30 }, "_annotations": [ { @@ -1077,8 +1055,7 @@ "line_type": "root" }, "page_id": 0, - "line_id": 31, - "_LineMetadata__other_fields": {} + "line_id": 31 }, "_annotations": [ { @@ -1125,8 +1102,7 @@ "line_type": "root" }, "page_id": 0, - "line_id": 32, - "_LineMetadata__other_fields": {} + "line_id": 32 }, "_annotations": [ { @@ -1173,8 +1149,7 @@ "line_type": "root" }, "page_id": 0, - "line_id": 33, - "_LineMetadata__other_fields": {} + "line_id": 33 }, "_annotations": [ { @@ -1221,8 +1196,7 @@ "line_type": "root" }, "page_id": 0, - "line_id": 34, - "_LineMetadata__other_fields": {} + "line_id": 34 }, "_annotations": [ { @@ -1269,8 +1243,7 @@ "line_type": "root" }, "page_id": 0, - "line_id": 35, - "_LineMetadata__other_fields": {} + "line_id": 35 }, "_annotations": [ { @@ -1317,8 +1290,7 @@ "line_type": "root" }, "page_id": 0, - "line_id": 36, - "_LineMetadata__other_fields": {} + "line_id": 36 }, "_annotations": [ { @@ -1365,8 +1337,7 @@ "line_type": "root" }, "page_id": 0, - "line_id": 37, - "_LineMetadata__other_fields": {} + "line_id": 37 }, "_annotations": [ { @@ -1413,8 +1384,7 @@ "line_type": "root" }, "page_id": 0, - "line_id": 38, - "_LineMetadata__other_fields": {} + "line_id": 38 }, "_annotations": [ { @@ -1461,8 +1431,7 @@ "line_type": "root" }, "page_id": 0, - "line_id": 39, - "_LineMetadata__other_fields": {} + "line_id": 39 }, "_annotations": [ { @@ -1509,8 +1478,7 @@ "line_type": "root" }, "page_id": 0, - "line_id": 40, - "_LineMetadata__other_fields": {} + "line_id": 40 }, "_annotations": [ { @@ -1557,8 +1525,7 @@ "line_type": "article" }, "page_id": 0, - "line_id": 45, - "_LineMetadata__other_fields": {} + "line_id": 45 }, "_annotations": [ { @@ -1605,8 +1572,7 @@ "line_type": "article" }, "page_id": 0, - "line_id": 46, - "_LineMetadata__other_fields": 
{} + "line_id": 46 }, "_annotations": [ { @@ -1653,8 +1619,7 @@ "line_type": "articlePart" }, "page_id": 0, - "line_id": 48, - "_LineMetadata__other_fields": {} + "line_id": 48 }, "_annotations": [ { @@ -1701,8 +1666,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 49, - "_LineMetadata__other_fields": {} + "line_id": 49 }, "_annotations": [ { @@ -1749,8 +1713,7 @@ "line_type": "articlePart" }, "page_id": 0, - "line_id": 51, - "_LineMetadata__other_fields": {} + "line_id": 51 }, "_annotations": [ { @@ -1797,8 +1760,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 52, - "_LineMetadata__other_fields": {} + "line_id": 52 }, "_annotations": [ { @@ -1845,8 +1807,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 53, - "_LineMetadata__other_fields": {} + "line_id": 53 }, "_annotations": [ { @@ -1893,8 +1854,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 54, - "_LineMetadata__other_fields": {} + "line_id": 54 }, "_annotations": [ { @@ -1941,8 +1901,7 @@ "line_type": "article" }, "page_id": 0, - "line_id": 56, - "_LineMetadata__other_fields": {} + "line_id": 56 }, "_annotations": [ { @@ -1989,8 +1948,7 @@ "line_type": "articlePart" }, "page_id": 0, - "line_id": 58, - "_LineMetadata__other_fields": {} + "line_id": 58 }, "_annotations": [ { @@ -2037,8 +1995,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 59, - "_LineMetadata__other_fields": {} + "line_id": 59 }, "_annotations": [ { @@ -2085,8 +2042,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 60, - "_LineMetadata__other_fields": {} + "line_id": 60 }, "_annotations": [ { @@ -2133,8 +2089,7 @@ "line_type": "item" }, "page_id": 0, - "line_id": 62, - "_LineMetadata__other_fields": {} + "line_id": 62 }, "_annotations": [ { @@ -2181,8 +2136,7 @@ "line_type": "item" }, "page_id": 0, - "line_id": 64, - "_LineMetadata__other_fields": {} + "line_id": 64 }, "_annotations": [ { @@ -2229,8 +2183,7 @@ "line_type": "articlePart" }, "page_id": 0, - "line_id": 66, - 
"_LineMetadata__other_fields": {} + "line_id": 66 }, "_annotations": [ { @@ -2277,8 +2230,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 67, - "_LineMetadata__other_fields": {} + "line_id": 67 }, "_annotations": [ { @@ -2325,8 +2277,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 68, - "_LineMetadata__other_fields": {} + "line_id": 68 }, "_annotations": [ { @@ -2373,8 +2324,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 69, - "_LineMetadata__other_fields": {} + "line_id": 69 }, "_annotations": [ { @@ -2421,8 +2371,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 70, - "_LineMetadata__other_fields": {} + "line_id": 70 }, "_annotations": [ { @@ -2469,8 +2418,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 71, - "_LineMetadata__other_fields": {} + "line_id": 71 }, "_annotations": [ { @@ -2517,8 +2465,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 72, - "_LineMetadata__other_fields": {} + "line_id": 72 }, "_annotations": [ { @@ -2565,8 +2512,7 @@ "line_type": "article" }, "page_id": 0, - "line_id": 74, - "_LineMetadata__other_fields": {} + "line_id": 74 }, "_annotations": [ { @@ -2613,8 +2559,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 76, - "_LineMetadata__other_fields": {} + "line_id": 76 }, "_annotations": [ { @@ -2661,8 +2606,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 77, - "_LineMetadata__other_fields": {} + "line_id": 77 }, "_annotations": [ { @@ -2709,8 +2653,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 78, - "_LineMetadata__other_fields": {} + "line_id": 78 }, "_annotations": [ { @@ -2757,8 +2700,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 79, - "_LineMetadata__other_fields": {} + "line_id": 79 }, "_annotations": [ { @@ -2805,8 +2747,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 80, - "_LineMetadata__other_fields": {} + "line_id": 80 }, "_annotations": [ { @@ -2853,8 +2794,7 @@ "line_type": "chapter" }, "page_id": 0, - 
"line_id": 82, - "_LineMetadata__other_fields": {} + "line_id": 82 }, "_annotations": [ { @@ -2901,8 +2841,7 @@ "line_type": "chapter" }, "page_id": 0, - "line_id": 83, - "_LineMetadata__other_fields": {} + "line_id": 83 }, "_annotations": [ { @@ -2949,8 +2888,7 @@ "line_type": "article" }, "page_id": 0, - "line_id": 85, - "_LineMetadata__other_fields": {} + "line_id": 85 }, "_annotations": [ { @@ -2997,8 +2935,7 @@ "line_type": "article" }, "page_id": 0, - "line_id": 86, - "_LineMetadata__other_fields": {} + "line_id": 86 }, "_annotations": [ { @@ -3045,8 +2982,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 88, - "_LineMetadata__other_fields": {} + "line_id": 88 }, "_annotations": [ { @@ -3093,8 +3029,7 @@ "line_type": "article" }, "page_id": 0, - "line_id": 90, - "_LineMetadata__other_fields": {} + "line_id": 90 }, "_annotations": [ { @@ -3141,8 +3076,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 92, - "_LineMetadata__other_fields": {} + "line_id": 92 }, "_annotations": [ { @@ -3189,8 +3123,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 93, - "_LineMetadata__other_fields": {} + "line_id": 93 }, "_annotations": [ { @@ -3237,8 +3170,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 95, - "_LineMetadata__other_fields": {} + "line_id": 95 }, "_annotations": [ { @@ -3285,8 +3217,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 96, - "_LineMetadata__other_fields": {} + "line_id": 96 }, "_annotations": [ { @@ -3333,8 +3264,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 97, - "_LineMetadata__other_fields": {} + "line_id": 97 }, "_annotations": [ { @@ -3381,8 +3311,7 @@ "line_type": "article" }, "page_id": 0, - "line_id": 99, - "_LineMetadata__other_fields": {} + "line_id": 99 }, "_annotations": [ { @@ -3429,8 +3358,7 @@ "line_type": "article" }, "page_id": 0, - "line_id": 100, - "_LineMetadata__other_fields": {} + "line_id": 100 }, "_annotations": [ { @@ -3477,8 +3405,7 @@ "line_type": "raw_text" }, 
"page_id": 0, - "line_id": 102, - "_LineMetadata__other_fields": {} + "line_id": 102 }, "_annotations": [ { @@ -3525,8 +3452,7 @@ "line_type": "article" }, "page_id": 0, - "line_id": 104, - "_LineMetadata__other_fields": {} + "line_id": 104 }, "_annotations": [ { @@ -3573,8 +3499,7 @@ "line_type": "article" }, "page_id": 0, - "line_id": 105, - "_LineMetadata__other_fields": {} + "line_id": 105 }, "_annotations": [ { @@ -3621,8 +3546,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 107, - "_LineMetadata__other_fields": {} + "line_id": 107 }, "_annotations": [ { @@ -3669,8 +3593,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 108, - "_LineMetadata__other_fields": {} + "line_id": 108 }, "_annotations": [ { @@ -3717,8 +3640,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 110, - "_LineMetadata__other_fields": {} + "line_id": 110 }, "_annotations": [ { @@ -3765,8 +3687,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 111, - "_LineMetadata__other_fields": {} + "line_id": 111 }, "_annotations": [ { @@ -3813,8 +3734,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 112, - "_LineMetadata__other_fields": {} + "line_id": 112 }, "_annotations": [ { @@ -3861,8 +3781,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 113, - "_LineMetadata__other_fields": {} + "line_id": 113 }, "_annotations": [ { @@ -3909,8 +3828,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 115, - "_LineMetadata__other_fields": {} + "line_id": 115 }, "_annotations": [ { @@ -3957,8 +3875,7 @@ "line_type": "article" }, "page_id": 0, - "line_id": 117, - "_LineMetadata__other_fields": {} + "line_id": 117 }, "_annotations": [ { @@ -4005,8 +3922,7 @@ "line_type": "article" }, "page_id": 0, - "line_id": 118, - "_LineMetadata__other_fields": {} + "line_id": 118 }, "_annotations": [ { @@ -4053,8 +3969,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 120, - "_LineMetadata__other_fields": {} + "line_id": 120 }, "_annotations": [ { @@ -4101,8 
+4016,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 121, - "_LineMetadata__other_fields": {} + "line_id": 121 }, "_annotations": [ { @@ -4149,8 +4063,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 122, - "_LineMetadata__other_fields": {} + "line_id": 122 }, "_annotations": [ { @@ -4197,8 +4110,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 124, - "_LineMetadata__other_fields": {} + "line_id": 124 }, "_annotations": [ { @@ -4245,8 +4157,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 125, - "_LineMetadata__other_fields": {} + "line_id": 125 }, "_annotations": [ { @@ -4293,8 +4204,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 126, - "_LineMetadata__other_fields": {} + "line_id": 126 }, "_annotations": [ { @@ -4341,8 +4251,7 @@ "line_type": "article" }, "page_id": 0, - "line_id": 128, - "_LineMetadata__other_fields": {} + "line_id": 128 }, "_annotations": [ { @@ -4389,8 +4298,7 @@ "line_type": "article" }, "page_id": 0, - "line_id": 129, - "_LineMetadata__other_fields": {} + "line_id": 129 }, "_annotations": [ { @@ -4437,8 +4345,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 131, - "_LineMetadata__other_fields": {} + "line_id": 131 }, "_annotations": [ { @@ -4485,8 +4392,7 @@ "line_type": "article" }, "page_id": 0, - "line_id": 133, - "_LineMetadata__other_fields": {} + "line_id": 133 }, "_annotations": [ { @@ -4533,8 +4439,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 135, - "_LineMetadata__other_fields": {} + "line_id": 135 }, "_annotations": [ { @@ -4581,8 +4486,7 @@ "line_type": "article" }, "page_id": 0, - "line_id": 137, - "_LineMetadata__other_fields": {} + "line_id": 137 }, "_annotations": [ { @@ -4629,8 +4533,7 @@ "line_type": "article" }, "page_id": 0, - "line_id": 138, - "_LineMetadata__other_fields": {} + "line_id": 138 }, "_annotations": [ { @@ -4677,8 +4580,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 140, - "_LineMetadata__other_fields": {} + "line_id": 
140 }, "_annotations": [ { @@ -4725,8 +4627,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 141, - "_LineMetadata__other_fields": {} + "line_id": 141 }, "_annotations": [ { @@ -4773,8 +4674,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 142, - "_LineMetadata__other_fields": {} + "line_id": 142 }, "_annotations": [ { @@ -4821,8 +4721,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 143, - "_LineMetadata__other_fields": {} + "line_id": 143 }, "_annotations": [ { @@ -4869,8 +4768,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 145, - "_LineMetadata__other_fields": {} + "line_id": 145 }, "_annotations": [ { @@ -4917,8 +4815,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 146, - "_LineMetadata__other_fields": {} + "line_id": 146 }, "_annotations": [ { @@ -4965,8 +4862,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 147, - "_LineMetadata__other_fields": {} + "line_id": 147 }, "_annotations": [ { @@ -5013,8 +4909,7 @@ "line_type": "article" }, "page_id": 0, - "line_id": 149, - "_LineMetadata__other_fields": {} + "line_id": 149 }, "_annotations": [ { @@ -5061,8 +4956,7 @@ "line_type": "article" }, "page_id": 0, - "line_id": 150, - "_LineMetadata__other_fields": {} + "line_id": 150 }, "_annotations": [ { @@ -5109,8 +5003,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 152, - "_LineMetadata__other_fields": {} + "line_id": 152 }, "_annotations": [ { @@ -5157,8 +5050,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 153, - "_LineMetadata__other_fields": {} + "line_id": 153 }, "_annotations": [ { @@ -5205,8 +5097,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 154, - "_LineMetadata__other_fields": {} + "line_id": 154 }, "_annotations": [ { @@ -5253,8 +5144,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 155, - "_LineMetadata__other_fields": {} + "line_id": 155 }, "_annotations": [ { @@ -5301,8 +5191,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 156, - 
"_LineMetadata__other_fields": {} + "line_id": 156 }, "_annotations": [ { @@ -5349,8 +5238,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 157, - "_LineMetadata__other_fields": {} + "line_id": 157 }, "_annotations": [ { @@ -5397,8 +5285,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 159, - "_LineMetadata__other_fields": {} + "line_id": 159 }, "_annotations": [ { @@ -5445,8 +5332,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 160, - "_LineMetadata__other_fields": {} + "line_id": 160 }, "_annotations": [ { @@ -5493,8 +5379,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 161, - "_LineMetadata__other_fields": {} + "line_id": 161 }, "_annotations": [ { @@ -5541,8 +5426,7 @@ "line_type": "article" }, "page_id": 0, - "line_id": 163, - "_LineMetadata__other_fields": {} + "line_id": 163 }, "_annotations": [ { @@ -5589,8 +5473,7 @@ "line_type": "article" }, "page_id": 0, - "line_id": 164, - "_LineMetadata__other_fields": {} + "line_id": 164 }, "_annotations": [ { @@ -5637,8 +5520,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 166, - "_LineMetadata__other_fields": {} + "line_id": 166 }, "_annotations": [ { @@ -5685,8 +5567,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 167, - "_LineMetadata__other_fields": {} + "line_id": 167 }, "_annotations": [ { @@ -5733,8 +5614,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 168, - "_LineMetadata__other_fields": {} + "line_id": 168 }, "_annotations": [ { @@ -5781,8 +5661,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 169, - "_LineMetadata__other_fields": {} + "line_id": 169 }, "_annotations": [ { @@ -5829,8 +5708,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 171, - "_LineMetadata__other_fields": {} + "line_id": 171 }, "_annotations": [ { @@ -5877,8 +5755,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 172, - "_LineMetadata__other_fields": {} + "line_id": 172 }, "_annotations": [ { @@ -5925,8 +5802,7 @@ "line_type": 
"raw_text" }, "page_id": 0, - "line_id": 173, - "_LineMetadata__other_fields": {} + "line_id": 173 }, "_annotations": [ { @@ -5973,8 +5849,7 @@ "line_type": "article" }, "page_id": 0, - "line_id": 175, - "_LineMetadata__other_fields": {} + "line_id": 175 }, "_annotations": [ { @@ -6021,8 +5896,7 @@ "line_type": "article" }, "page_id": 0, - "line_id": 176, - "_LineMetadata__other_fields": {} + "line_id": 176 }, "_annotations": [ { @@ -6069,8 +5943,7 @@ "line_type": "articlePart" }, "page_id": 0, - "line_id": 178, - "_LineMetadata__other_fields": {} + "line_id": 178 }, "_annotations": [ { @@ -6117,8 +5990,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 179, - "_LineMetadata__other_fields": {} + "line_id": 179 }, "_annotations": [ { @@ -6165,8 +6037,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 181, - "_LineMetadata__other_fields": {} + "line_id": 181 }, "_annotations": [ { @@ -6213,8 +6084,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 182, - "_LineMetadata__other_fields": {} + "line_id": 182 }, "_annotations": [ { @@ -6261,8 +6131,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 183, - "_LineMetadata__other_fields": {} + "line_id": 183 }, "_annotations": [ { @@ -6309,8 +6178,7 @@ "line_type": "articlePart" }, "page_id": 0, - "line_id": 185, - "_LineMetadata__other_fields": {} + "line_id": 185 }, "_annotations": [ { @@ -6357,8 +6225,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 186, - "_LineMetadata__other_fields": {} + "line_id": 186 }, "_annotations": [ { @@ -6405,8 +6272,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 187, - "_LineMetadata__other_fields": {} + "line_id": 187 }, "_annotations": [ { @@ -6453,8 +6319,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 189, - "_LineMetadata__other_fields": {} + "line_id": 189 }, "_annotations": [ { @@ -6501,8 +6366,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 190, - "_LineMetadata__other_fields": {} + "line_id": 190 }, 
"_annotations": [ { @@ -6549,8 +6413,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 192, - "_LineMetadata__other_fields": {} + "line_id": 192 }, "_annotations": [ { @@ -6597,8 +6460,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 193, - "_LineMetadata__other_fields": {} + "line_id": 193 }, "_annotations": [ { @@ -6645,8 +6507,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 194, - "_LineMetadata__other_fields": {} + "line_id": 194 }, "_annotations": [ { @@ -6693,8 +6554,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 195, - "_LineMetadata__other_fields": {} + "line_id": 195 }, "_annotations": [ { @@ -6741,8 +6601,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 196, - "_LineMetadata__other_fields": {} + "line_id": 196 }, "_annotations": [ { @@ -6789,8 +6648,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 198, - "_LineMetadata__other_fields": {} + "line_id": 198 }, "_annotations": [ { @@ -6837,8 +6695,7 @@ "line_type": "article" }, "page_id": 0, - "line_id": 200, - "_LineMetadata__other_fields": {} + "line_id": 200 }, "_annotations": [ { @@ -6885,8 +6742,7 @@ "line_type": "article" }, "page_id": 0, - "line_id": 201, - "_LineMetadata__other_fields": {} + "line_id": 201 }, "_annotations": [ { @@ -6933,8 +6789,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 203, - "_LineMetadata__other_fields": {} + "line_id": 203 }, "_annotations": [ { @@ -6981,8 +6836,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 204, - "_LineMetadata__other_fields": {} + "line_id": 204 }, "_annotations": [ { @@ -7029,8 +6883,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 206, - "_LineMetadata__other_fields": {} + "line_id": 206 }, "_annotations": [ { @@ -7077,8 +6930,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 207, - "_LineMetadata__other_fields": {} + "line_id": 207 }, "_annotations": [ { @@ -7125,8 +6977,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 208, - 
"_LineMetadata__other_fields": {} + "line_id": 208 }, "_annotations": [ { @@ -7173,8 +7024,7 @@ "line_type": "article" }, "page_id": 0, - "line_id": 210, - "_LineMetadata__other_fields": {} + "line_id": 210 }, "_annotations": [ { @@ -7221,8 +7071,7 @@ "line_type": "article" }, "page_id": 0, - "line_id": 211, - "_LineMetadata__other_fields": {} + "line_id": 211 }, "_annotations": [ { @@ -7269,8 +7118,7 @@ "line_type": "articlePart" }, "page_id": 0, - "line_id": 213, - "_LineMetadata__other_fields": {} + "line_id": 213 }, "_annotations": [ { @@ -7317,8 +7165,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 214, - "_LineMetadata__other_fields": {} + "line_id": 214 }, "_annotations": [ { @@ -7365,8 +7212,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 215, - "_LineMetadata__other_fields": {} + "line_id": 215 }, "_annotations": [ { @@ -7413,8 +7259,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 216, - "_LineMetadata__other_fields": {} + "line_id": 216 }, "_annotations": [ { @@ -7461,8 +7306,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 217, - "_LineMetadata__other_fields": {} + "line_id": 217 }, "_annotations": [ { @@ -7509,8 +7353,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 218, - "_LineMetadata__other_fields": {} + "line_id": 218 }, "_annotations": [ { @@ -7557,8 +7400,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 219, - "_LineMetadata__other_fields": {} + "line_id": 219 }, "_annotations": [ { @@ -7605,8 +7447,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 220, - "_LineMetadata__other_fields": {} + "line_id": 220 }, "_annotations": [ { @@ -7653,8 +7494,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 221, - "_LineMetadata__other_fields": {} + "line_id": 221 }, "_annotations": [ { @@ -7701,8 +7541,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 222, - "_LineMetadata__other_fields": {} + "line_id": 222 }, "_annotations": [ { @@ -7749,8 +7588,7 @@ "line_type": 
"raw_text" }, "page_id": 0, - "line_id": 224, - "_LineMetadata__other_fields": {} + "line_id": 224 }, "_annotations": [ { @@ -7797,8 +7635,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 225, - "_LineMetadata__other_fields": {} + "line_id": 225 }, "_annotations": [ { @@ -7845,8 +7682,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 226, - "_LineMetadata__other_fields": {} + "line_id": 226 }, "_annotations": [ { @@ -7893,8 +7729,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 227, - "_LineMetadata__other_fields": {} + "line_id": 227 }, "_annotations": [ { @@ -7941,8 +7776,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 228, - "_LineMetadata__other_fields": {} + "line_id": 228 }, "_annotations": [ { @@ -7989,8 +7823,7 @@ "line_type": "articlePart" }, "page_id": 0, - "line_id": 230, - "_LineMetadata__other_fields": {} + "line_id": 230 }, "_annotations": [ { @@ -8037,8 +7870,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 231, - "_LineMetadata__other_fields": {} + "line_id": 231 }, "_annotations": [ { @@ -8085,8 +7917,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 232, - "_LineMetadata__other_fields": {} + "line_id": 232 }, "_annotations": [ { @@ -8133,8 +7964,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 233, - "_LineMetadata__other_fields": {} + "line_id": 233 }, "_annotations": [ { @@ -8181,8 +8011,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 235, - "_LineMetadata__other_fields": {} + "line_id": 235 }, "_annotations": [ { @@ -8229,8 +8058,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 236, - "_LineMetadata__other_fields": {} + "line_id": 236 }, "_annotations": [ { @@ -8277,8 +8105,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 237, - "_LineMetadata__other_fields": {} + "line_id": 237 }, "_annotations": [ { @@ -8325,8 +8152,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 238, - "_LineMetadata__other_fields": {} + "line_id": 238 }, 
"_annotations": [ { @@ -8373,8 +8199,7 @@ "line_type": "articlePart" }, "page_id": 0, - "line_id": 240, - "_LineMetadata__other_fields": {} + "line_id": 240 }, "_annotations": [ { @@ -8421,8 +8246,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 241, - "_LineMetadata__other_fields": {} + "line_id": 241 }, "_annotations": [ { @@ -8469,8 +8293,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 242, - "_LineMetadata__other_fields": {} + "line_id": 242 }, "_annotations": [ { @@ -8517,8 +8340,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 243, - "_LineMetadata__other_fields": {} + "line_id": 243 }, "_annotations": [ { @@ -8565,8 +8387,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 244, - "_LineMetadata__other_fields": {} + "line_id": 244 }, "_annotations": [ { @@ -8613,8 +8434,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 245, - "_LineMetadata__other_fields": {} + "line_id": 245 }, "_annotations": [ { @@ -8661,8 +8481,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 247, - "_LineMetadata__other_fields": {} + "line_id": 247 }, "_annotations": [ { @@ -8709,8 +8528,7 @@ "line_type": "raw_text" }, "page_id": 0, - "line_id": 248, - "_LineMetadata__other_fields": {} + "line_id": 248 }, "_annotations": [ { diff --git a/requirements.txt b/requirements.txt index 30469034..10af796e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,6 +12,7 @@ olefile~=0.46 opencv-python>=4.5.5.64,<4.6.0 orjson>=3.8.11,<=3.9.5 pandas>=1.4.1,<=1.9.0 +pdf.tocgen>=1.3.0,<=1.3.4 pdf2image==1.10.0 #1.14.0 - there are converting artifacts '№' != '№\n\x0c' pdfminer.six==20211012 piexif==1.1.3 diff --git a/resources/benchmarks/fintoc_scores.html b/resources/benchmarks/fintoc_scores.html new file mode 100644 index 00000000..50624b7d --- /dev/null +++ b/resources/benchmarks/fintoc_scores.html @@ -0,0 +1,83 @@ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
TD 0TD 1TD 2TD meanTOC 0TOC 1TOC 2TOC mean
en_tabby0.8115220.8337980.8642390.83652056.558.064.959.800000
en_txt_layer0.8213600.8532580.8336230.83608157.862.157.859.233333
fr_tabby0.7534090.7442320.7821690.75993751.247.951.550.200000
fr_txt_layer0.7405300.7944600.7660590.76701645.652.250.149.300000
sp_tabby0.6067180.6228390.5990940.60955037.143.643.441.366667
sp_txt_layer0.6290520.6679760.4468270.58128546.448.830.741.966667
\ No newline at end of file diff --git a/scripts/benchmark_pdf_performance.py b/scripts/benchmark_pdf_performance.py index c3fa48af..c5701034 100644 --- a/scripts/benchmark_pdf_performance.py +++ b/scripts/benchmark_pdf_performance.py @@ -95,7 +95,7 @@ def main() -> None: assert args.loops > 0, "The number of repetitions of testing one file must be positive" - print(f'Run pdf performance benchmark with next pdf options: {", ".join(args.pdf_options)}') + print(f'Run pdf performance benchmark with next pdf options: {", ".join(args.pdf_options)}') # noqa configs = [{}] if args.parameters: diff --git a/scripts/fintoc2022/__init__.py b/scripts/fintoc2022/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/scripts/fintoc2022/benchmark_fintoc.py b/scripts/fintoc2022/benchmark_fintoc.py new file mode 100644 index 00000000..62040cb5 --- /dev/null +++ b/scripts/fintoc2022/benchmark_fintoc.py @@ -0,0 +1,43 @@ +""" +Merge results for all classifiers after running training script `train_fintoc_classifier.py`. +Results are represented in a table (.html file) in benchmarks directory. + +Results are obtained from cross-validation on the training set from FinTOC 2022 Shared task (https://wp.lancs.ac.uk/cfie/fintoc2022/). +Three languages are supported: English, French and Spanish ("en", "fr", "sp"). +Two readers are used: `PdfTabbyReader` and `PdfTxtlayerReader` ("tabby", "txt_layer"). 
+""" +import json +import os +from collections import defaultdict + +import pandas as pd + +if __name__ == "__main__": + scores_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "resources", "fintoc2022", "scores")) + benchmarks_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "resources", "benchmarks")) + + assert os.path.isdir(scores_dir), "Directory with scores doesn't exist, run `train_fintoc_classifier.py` beforehand" + + all_scores_dict = defaultdict(list) + names = [] + for scores_file in sorted(os.listdir(scores_dir)): + if not scores_file.endswith(".json"): + continue + + # files are named like: scores_en_tabby.json + with open(os.path.join(scores_dir, scores_file), "r") as f: + scores_dict = json.load(f) + + names.append(scores_file[len("scores_"):-len(".json")]) + + for i, td_score in enumerate(scores_dict["td_scores"]): + all_scores_dict[f"TD {i}"].append(td_score) + all_scores_dict["TD mean"].append(scores_dict["td_mean"]) + + for i, toc_score in enumerate(scores_dict["toc_scores"]): + all_scores_dict[f"TOC {i}"].append(toc_score) + all_scores_dict["TOC mean"].append(scores_dict["toc_mean"]) + + scores_df = pd.DataFrame(all_scores_dict, index=names) + with open(os.path.join(benchmarks_dir, "fintoc_scores.html"), "w") as f: + f.write(scores_df.to_html()) diff --git a/scripts/fintoc2022/dataset_loader.py b/scripts/fintoc2022/dataset_loader.py new file mode 100755 index 00000000..9d9560e2 --- /dev/null +++ b/scripts/fintoc2022/dataset_loader.py @@ -0,0 +1,124 @@ +import gzip +import json +import logging +import os +import pickle +import shutil +import tempfile +import zipfile +from collections import Counter, defaultdict +from typing import Dict, List + +import wget +from Levenshtein._levenshtein import ratio + +from dedoc.config import get_config +from dedoc.readers import PdfTabbyReader, PdfTxtlayerReader +from train_dataset.data_structures.line_with_label import LineWithLabel + + +class 
FintocLineWithLabelExtractor: + """ + Create LineWithLabel from documents and their annotations + """ + + def __init__(self) -> None: + self.readers = {"tabby": PdfTabbyReader(), "txt_layer": PdfTxtlayerReader()} + + def get_lines(self, file_name: str, file_path: str, gt_path: str, reader_name: str) -> List[LineWithLabel]: + """ + Extract lines from PDF document, create labels for lines from the groundtruth file given by FinTOC. + Labeled lines are matched to the lines extracted by dedoc using Levenshtein distance (threshold=0.8). + + :param file_name: name of the file (PDF, json) + :param file_path: path to the PDF document + :param gt_path: path to the groundtruth JSON file with labels + :param reader_name: ("tabby", "txt_layer") - type of PDF reader used for lines extraction + :return: document in form of list of lines with labels + """ + document = self.readers[reader_name].read(file_path, parameters={"need_header_footer_analysis": "True"}) + + labeled_lines = defaultdict(list) + with open(gt_path) as gt_file: + for labeled_line in json.load(gt_file): + labeled_lines[labeled_line["page"] - 1].append(labeled_line) + + result = [] + for line in document.lines: + page_candidates = [(ratio(line.line, labeled_line["text"]), labeled_line) for labeled_line in labeled_lines[line.metadata.page_id]] + best_line = max(page_candidates, key=lambda t: t[0], default=(0, {})) + depth = best_line[1]["depth"] if len(page_candidates) > 0 and best_line[0] > 0.8 else "-1" + result.append(LineWithLabel(line=line.line, metadata=line.metadata, annotations=line.annotations, label=str(depth), group=file_name, uid=line.uid)) + + return sorted(result, key=lambda x: (x.metadata.page_id, x.metadata.line_id)) + + +class FintocDatasetLoader: + """ + Class for downloading data from the cloud, distributing lines into document groups and sorting them. + Returns data in form of document lines with their labels. 
+ """ + def __init__(self, dataset_dir: str, logger: logging.Logger) -> None: + """ + :param dataset_dir: path to the directory where to store downloaded dataset + :param logger: logger for logging details of dataset loading + """ + self.dataset_dir = dataset_dir + self.logger = logger + self.config = get_config() + self.data_url = "https://at.ispras.ru/owncloud/index.php/s/EZfm71WimN2h7rC/download" + self.line_extractor = FintocLineWithLabelExtractor() + + def get_data(self, language: str, reader_name: str, use_cache: bool = True) -> Dict[str, List[LineWithLabel]]: + """ + Download data from a cloud at `self.data_url` and sort document lines. + + :param language: ("en", "fr", "sp") - language group + :param reader_name: ("tabby", "txt_layer") - type of reader for lines extraction from PDF + :param use_cache: whether to use cached data (if dataset is already downloaded) or download it anyway + :return: dict of documents {document path: document}, where document is a list of lines with labels of the training dataset + """ + archive_path = os.path.join(self.dataset_dir, "dataset.zip") + if not os.path.isfile(archive_path): + os.makedirs(self.dataset_dir, exist_ok=True) + self.logger.info("Start download dataset") + wget.download(self.data_url, archive_path) + self.logger.info(f"Finish download dataset to {archive_path}") + + pkl_path = os.path.join(self.dataset_dir, f"lines_{language}_{reader_name}.pkl.gz") + + if os.path.isfile(pkl_path) and use_cache: + with gzip.open(pkl_path) as input_file: + parsed_files = pickle.load(input_file) + self.logger.info(f"Data were loaded from the local disk: {len(parsed_files)} files") + return parsed_files + + result = self.__read_pdf_lines(archive_path, language, reader_name) + + with gzip.open(pkl_path, "wb") as out: + pickle.dump(obj=result, file=out) + self.logger.info(Counter([line.label for document in result.values() for line in document])) + return result + + def __read_pdf_lines(self, archive_path: str, language: str, 
reader_name: str) -> Dict[str, List[LineWithLabel]]: + with zipfile.ZipFile(archive_path, "r") as zip_ref: + zip_ref.extractall(self.dataset_dir) + + data_dir = os.path.join(self.dataset_dir, "data", language) + pdf_dir = os.path.join(data_dir, "pdf") + gt_dir = os.path.join(data_dir, "annots") + pdf_files = {pdf_file[:-len(".pdf")]: os.path.join(pdf_dir, pdf_file) for pdf_file in os.listdir(pdf_dir) if pdf_file.endswith(".pdf")} + gt_files = {gt_file[:-len(".pdf.fintoc4.json")]: os.path.join(gt_dir, gt_file) for gt_file in os.listdir(gt_dir) if gt_file.endswith(".json")} + assert set(pdf_files) == set(gt_files) + + result = {} + with tempfile.TemporaryDirectory() as tmp_dir: + for file_name in pdf_files: + pdf_tmp_path = os.path.join(tmp_dir, file_name) + ".pdf" + shutil.copy(pdf_files[file_name], pdf_tmp_path) + try: + document = self.line_extractor.get_lines(file_name=file_name, file_path=pdf_tmp_path, gt_path=gt_files[file_name], reader_name=reader_name) + result[pdf_files[file_name]] = document + except Exception as e: + self.logger.warning(f"Failed to read {file_name} by {reader_name}, error: {e}") + return result diff --git a/scripts/fintoc2022/metric.py b/scripts/fintoc2022/metric.py new file mode 100755 index 00000000..ad6695b7 --- /dev/null +++ b/scripts/fintoc2022/metric.py @@ -0,0 +1,624 @@ +""" +This is a python3 script that rewrites the score function used in Book Structure +Extraction Competition @ ICDAR2013 +(https://www.cs.helsinki.fi/u/doucet/papers/ICDAR2013.pdf). +It uses a classic levenshtein distance implemented by +https://pypi.org/project/python-Levenshtein/ instead of a customized levenshtein +distance. +It is used to score participants in FinTOC2020 shared task. 
+ +------ +INSTALL +------ +pip install python-Levenshtein + +------ +USAGE +------ +python metric.fintoc2.py--gt_folder --submission_folder + + and are paths to folders containing JSON files: +[ + { + "text": String, # text of the TOC item/entry + "id": Int # identifer of the item/entry corresponding to its order in the TOC + "depth": Int # hierarchical level of the item + "page": Int # the (physical) page number where the item appears + } + +] +""" + +import argparse +import csv +import json +import logging +import os +from abc import ABC, abstractmethod +from glob import glob +from operator import itemgetter + +import Levenshtein +import numpy as np + +JSON_EXTENSION = ".fintoc4.json" +VERBOSE = True +STRING_THRESHOLD = 0.85 + + +class TOCJson: + def __init__(self, json_file): + self.parse(json_file) + + def parse(self, json_file): + with open(json_file, "r", encoding="utf-8") as infile: + content = json.load(infile) + self.entries = [] + for dict_entry in content: + self.entries.append(Title(dict_entry["text"], dict_entry["page"], dict_entry["id"], dict_entry["depth"])) + + +class Title: + def __init__(self, text, page_nb, id_, depth): + self.text = text + self.page_nb = page_nb + self.id_ = id_ + self.depth = depth + self.matched = False + + def __repr__(self): + return f"page={self.page_nb} title={repr(self.text)}" + + def compare_page_nb(self, entry): + if isinstance(entry.page_nb, str): + entry.page_nb = int(entry.page_nb) + if self.page_nb == entry.page_nb: + return 0 + if self.page_nb > entry.page_nb: + return 1 + return -1 + + def compare_depth(self, entry): + if str(self.depth) == entry.depth: + return 0 + if str(self.depth) > entry.depth: + return 1 + return -1 + + +class ICDARMetric(ABC): + + def __init__(self): + self.correct = 0 + self.added = 0 + self.missed = 0 + self.mismatch = 0 + self.p_per_doc = {} + self.r_per_doc = {} + self.f_per_doc = {} + self.title_acc_per_doc = {} + + def compute_prf(self): + self.compute_p() + self.compute_r() + try: + 
self.f_score = 2 * self.prec * self.reca / (self.prec + self.reca) + except ZeroDivisionError: + self.f_score = 0 + return self.prec, self.reca, self.f_score + + def compute_p(self): + try: + self.prec = self.correct / (self.correct + self.added + self.mismatch) + except ZeroDivisionError: + self.prec = 0 + + def compute_r(self): + try: + self.reca = self.correct / (self.correct + self.missed + self.mismatch) + except ZeroDivisionError: + self.reca = 0 + + @abstractmethod + def initialize_stats(self): + self.correct = 0 + self.added = 0 + self.missed = 0 + self.mismatch = 0 + self.prec = 0.0 + self.reca = 0.0 + self.f_score = 0.0 + self.title_acc = 0.0 + + @abstractmethod + def get_title_acc(self, *args): + pass + + def format_float_percent(self, float_nb): + return "%.1f" % (100 * float_nb) + + def format_res(self): + out = ["%6s" % self.format_float_percent(self.prec)] + out.append("%6s" % self.format_float_percent(self.reca)) + out.append("%6s" % self.format_float_percent(self.f_score)) + out.append("%6s" % self.format_float_percent(self.title_acc)) + return out + + def compute_avg_p(self): + return np.mean(list(self.p_per_doc.values())) + + def compute_std_p(self): + return np.std(list(self.p_per_doc.values())) + + def compute_avg_r(self): + return np.mean(list(self.r_per_doc.values())) + + def compute_std_r(self): + return np.std(list(self.r_per_doc.values())) + + def compute_avg_f(self): + return np.mean(list(self.f_per_doc.values())) + + def compute_std_f(self): + return np.std(list(self.f_per_doc.values())) + + def compute_avg_title_acc(self): + return np.mean(list(self.title_acc_per_doc.values())) + + def compute_std_title_acc(self): + return np.std(list(self.title_acc_per_doc.values())) + + +class InexMetric(ICDARMetric): + + def __init__(self): + super().__init__() + self.level_correct = 0 + self.level_acc = 0.0 + self.level_acc_per_doc = {} + + def initialize_stats(self): + super().initialize_stats() + self.level_correct = 0 + self.level_acc = 0.0 + + 
def get_level_acc(self, nb_valid_links): + try: + self.level_acc = self.level_correct / nb_valid_links + except ZeroDivisionError: + self.level_acc = 0.0 + return self.level_acc + + def get_title_acc(self, nb_valid_links): + try: + self.title_acc = self.correct / nb_valid_links + except ZeroDivisionError: + self.title_acc = 0.0 + return self.title_acc + + def format_res(self): + out = super().format_res() + out.append("%6s" % self.format_float_percent(self.level_acc)) + return out + + def compute_avg_level_acc(self): + return np.mean(list(self.level_acc_per_doc.values())) + + def compute_std_level_acc(self): + return np.std(list(self.level_acc_per_doc.values())) + + +class XeroxMetric(ICDARMetric): + + def __init__(self): + super().__init__() + self.text_sim = 0 + + def initialize_stats(self): + super().initialize_stats() + self.text_sim = 0 + + def get_title_acc(self): + try: + self.title_acc = self.text_sim / float(self.correct) + except ZeroDivisionError: + self.title_acc = 0.0 + return self.title_acc + + +class Stats: + + def __init__(self): + self.ok_per_doc = {} + self.pbttl_per_doc = {} + self.pblvl_per_doc = {} + self.err_per_doc = {} + self.miss_per_doc = {} + + def compute_sum_ok(self): + return sum(list(self.ok_per_doc.values())) + + def compute_sum_pbttl(self): + return sum(list(self.pbttl_per_doc.values())) + + def compute_sum_pblvl(self): + return sum(list(self.pblvl_per_doc.values())) + + def compute_sum_err(self): + return sum(list(self.err_per_doc.values())) + + def compute_sum_miss(self): + return sum(list(self.miss_per_doc.values())) + + +class Writer: + + def __init__(self): + self.toc_rows = self.format_icdar_heading() + self.td_rows = self.format_td_heading() + + @classmethod + def format_icdar_heading(self): + out = [ + "Doc", "Xrx-P", "Xrx-R", "Xrx-F1", "Xrx-Title acc", "Inex08-P", "Inex08-R", + "Inex08-F1", "Inex08-Title acc", "Inex08-Level acc", "Ok", "PbTtl", + "PbLvl", "Err", "Miss", "book id" + ] + return [out] + + @classmethod + def 
format_td_heading(self): + out = ["Doc", "Prec", "Rec", "F1", "Book id"] + return [out] + + def dump_all(self): + self.dump_toc() + self.dump_td() + + def dump_toc(self): + with open("toc_report.csv", "w", encoding="utf-8") as outfile: + writer = csv.writer(outfile, dialect=csv.excel, delimiter="\t") + writer.writerows(self.toc_rows) + + def dump_td(self): + with open("td_report.csv", "w", encoding="utf-8") as outfile: + writer = csv.writer(outfile, dialect=csv.excel, delimiter="\t") + writer.writerows(self.td_rows) + + +def score_title_detection(toc1, toc2, log): + correct = 0 + for entry1 in toc1.entries: + res = find_matching_entry(entry1.text, toc2) + if res is not None: + index, match_score = res + matched_text = toc2.entries[index].text + msg = "Gt title %s is matched to %s (score=%.4g)" % (repr(entry1.text), repr(matched_text), match_score) + log.info(msg) + entry1.matched = True + toc2.entries[index].matched = True + correct += 1 + else: + log.info(f"Gt title {repr(entry1.text)} is not matched to any submission title") + for entry in toc2.entries: + if not entry.matched: + log.info(f"{entry} in submission not matched") + added = len([entry for entry in toc2.entries if not entry.matched]) + missed = len([entry for entry in toc1.entries if not entry.matched]) + log.info("nb of added titles: %i", added) + log.info("nb of missed titles: %i", missed) + log.info("nb of correct titles: %i", correct) + # return score + try: + prec = correct / (correct + added) + except ZeroDivisionError: + prec = 0.0 + try: + reca = correct / (correct + missed) + except ZeroDivisionError: + reca = 0.0 + try: + f1_score = 2 * prec * reca / (prec + reca) + except ZeroDivisionError: + f1_score = 0.0 + return prec, reca, f1_score + + +def find_matching_entry(text, toc): + if len(toc.entries) == 0: + return None + similarities = [] + for entry in toc.entries: + if not entry.matched: + similarities.append(Levenshtein.ratio(text, entry.text)) + else: + similarities.append(0) + index, 
match_score = max(enumerate(similarities), key=itemgetter(1)) + if match_score > STRING_THRESHOLD: + return index, match_score + return None + + +def update_icdar_stats(toc1, toc2, inex_metric, xerox_metric, log): + i1, i2 = 0, 0 + if len(toc1.entries) > 0 and len(toc2.entries) > 0: + entry1 = toc1.entries[i1] + entry2 = toc2.entries[i2] + while True: + link_result = entry1.compare_page_nb(entry2) + try: + if link_result == 0: + xerox_metric.correct += 1 + text_similarity = Levenshtein.ratio(entry1.text, entry2.text) + xerox_metric.text_sim += text_similarity + if text_similarity > STRING_THRESHOLD: + inex_metric.correct += 1 + else: + if VERBOSE: + log.info(f"TITLE ERROR: {entry1} <--> {repr(entry2.text)}") + inex_metric.mismatch += 1 + depth_result = entry1.compare_depth(entry2) + if depth_result == 0: + inex_metric.level_correct += 1 + i1 += 1 + i2 += 1 + entry1 = toc1.entries[i1] + entry2 = toc2.entries[i2] + elif link_result < 0: + inex_metric.missed += 1 + xerox_metric.missed += 1 + if VERBOSE: + log.info(f"MISS: {entry1}") + i1 += 1 + entry1 = toc1.entries[i1] + else: + inex_metric.added += 1 + xerox_metric.added += 1 + if VERBOSE: + log.info(f"ADDED: {entry2}") + i2 += 1 + entry2 = toc2.entries[i2] + except IndexError: + break + # take into account remaining entries in gt + while i1 < len(toc1.entries): + if VERBOSE: + entry1 = toc1.entries[i1] + log.info(f"MISS: {entry1}") + i1 += 1 + inex_metric.missed += 1 + xerox_metric.missed += 1 + # take into account remaining entries in submission + while i2 < len(toc2.entries): + if VERBOSE: + entry2 = toc2.entries[i2] + log.info(f"ADDED: {entry2}") + i2 += 1 + inex_metric.added += 1 + xerox_metric.added += 1 + + +def score(folder1, folder2): + def get_docnames(folder, ext): + out = [] + for ele in ls(folder, ext): + out.append(basename(ele, ext)) + return out + + docnames1 = get_docnames(folder1, JSON_EXTENSION) + docnames2 = get_docnames(folder2, JSON_EXTENSION) + docnames = list(set(docnames1) & set(docnames2)) 
+ n_missing_docs = len([ele for ele in docnames1 if ele not in docnames2]) + n_added_docs = len([ele for ele in docnames2 if ele not in docnames1]) + writer = Writer() + doc_id = 0 + # TOC generation metrics + inex = InexMetric() + xerox = XeroxMetric() + count = Stats() + # Title detection metrics + td_prec = dict(zip(docnames, [None] * len(docnames))) + td_reca = dict(zip(docnames, [None] * len(docnames))) + td_f1 = dict(zip(docnames, [None] * len(docnames))) + # loggers + toc_logger = get_logger("toc", "toc.log") + td_logger = get_logger("td", "td.log") + for json1 in ls(folder1, JSON_EXTENSION): + xerox.initialize_stats() + inex.initialize_stats() + toc1 = TOCJson(json1) + docname = basename(json1, JSON_EXTENSION) + if VERBOSE: + toc_logger.info(f"\n\nCOMPARING {docname}") + td_logger.info(f"\n\nCOMPARING {docname}") + json2 = os.path.join(folder2, docname + JSON_EXTENSION) + if not os.path.isfile(json2): + toc_logger.info(f"{docname} missing from submission") + td_logger.info(f"{docname} missing from submission") + else: + # Title detection + toc2 = TOCJson(json2) + td_prec[docname], td_reca[docname], td_f1[docname] = score_title_detection(toc1, toc2, td_logger) + writer.td_rows.append([doc_id, td_prec[docname], td_reca[docname], td_f1[docname], docname]) + # TOC generation + update_icdar_stats(toc1, toc2, inex, xerox, toc_logger) + # compute stats + count.ok_per_doc[docname] = xerox.correct + count.pbttl_per_doc[docname] = xerox.correct - inex.correct + count.pblvl_per_doc[docname] = xerox.correct - inex.level_correct + count.err_per_doc[docname] = xerox.added + count.miss_per_doc[docname] = xerox.missed + # compute Xerox score + xerox.compute_prf() + xerox.p_per_doc[docname] = xerox.prec + xerox.r_per_doc[docname] = xerox.reca + xerox.f_per_doc[docname] = xerox.f_score + xerox.title_acc_per_doc[docname] = xerox.get_title_acc() + # compute Inex score + inex.compute_prf() + inex.p_per_doc[docname] = inex.prec + inex.r_per_doc[docname] = inex.reca + 
inex.f_per_doc[docname] = inex.f_score + inex.title_acc_per_doc[docname] = inex.get_title_acc(xerox.correct) + inex.level_acc_per_doc[docname] = inex.get_level_acc(xerox.correct) + # result row + writer.toc_rows.append(get_row_result(doc_id, docname, xerox, inex)) + doc_id += 1 + # get avg and std scores + writer.toc_rows.append(get_avg_row(xerox, inex, count)) + writer.toc_rows.append(get_std_row(xerox, inex)) + writer.td_rows.append(get_avg_row(td_prec, td_reca, td_f1)) + writer.td_rows.append(get_std_row(td_prec, td_reca, td_f1)) + # get stats about missing and added docs + writer.toc_rows.append( + [f"Done: {len(docnames)} comparisons for {len(docnames1)} in groundtruth and {len(docnames2)} in submission"]) + if n_missing_docs: + writer.toc_rows.append([f"{n_missing_docs} docs missing from submission"]) + if n_added_docs: + writer.toc_rows.append([f"{n_added_docs} additional docs in submission (ignored)"]) + # dump + writer.dump_all() + + +def get_row_result(doc_id, doc, xerox, inex): + out = ["%4s " % doc_id] + out.extend(xerox.format_res()) + out.extend(inex.format_res()) + out.append("%7s" % xerox.correct) + out.append("%7s" % (xerox.correct - inex.correct)) + out.append("%7s" % (xerox.correct - inex.level_correct)) + out.append("%7s" % xerox.added) + out.append("%7s" % xerox.missed) + out.append("%s" % doc) + return out + + +""" +https://medium.com/practo-engineering/function-overloading-in-python-94a8b10d1e08 +""" +registry = {} + + +class MultiMethod(object): + def __init__(self, name): + self.name = name + self.typemap = {} + + def __call__(self, *args): + types = tuple(arg.__class__ for arg in args) + function = self.typemap.get(types) + if function is None: + raise TypeError("no match") + return function(*args) + + def register(self, types, function): + self.typemap[types] = function + + +def overload(*types): + def register(function): + name = function.__name__ + mm = registry.get(name) + if mm is None: + mm = registry[name] = MultiMethod(name) + 
mm.register(types, function) + return mm + + return register + + +""" +https://medium.com/practo-engineering/function-overloading-in-python-94a8b10d1e08 +""" + + +@overload(XeroxMetric, InexMetric, Stats) +def get_avg_row(xerox, inex, count): + out = [] + out.append("%4s " % "AVG") + # xerox + out.append("%6s" % xerox.format_float_percent(xerox.compute_avg_p())) + out.append("%6s" % xerox.format_float_percent(xerox.compute_avg_r())) + out.append("%6s" % xerox.format_float_percent(xerox.compute_avg_f())) + out.append("%6s" % xerox.format_float_percent(xerox.compute_avg_title_acc())) + # inex + out.append("%6s" % inex.format_float_percent(inex.compute_avg_p())) + out.append("%6s" % inex.format_float_percent(inex.compute_avg_r())) + out.append("%6s" % inex.format_float_percent(inex.compute_avg_f())) + out.append("%6s" % inex.format_float_percent(inex.compute_avg_title_acc())) + out.append("%6s" % inex.format_float_percent(inex.compute_avg_level_acc())) + # count stats + out.append("%7s" % (count.compute_sum_ok())) + out.append("%7s" % (count.compute_sum_pbttl())) + out.append("%7s" % (count.compute_sum_pblvl())) + out.append("%7s" % (count.compute_sum_err())) + out.append("%7s" % (count.compute_sum_miss())) + return out + + +@overload(XeroxMetric, InexMetric) +def get_std_row(xerox, inex): + out = ["%4s " % "sdev"] + # xerox + out.append("%6s" % xerox.format_float_percent(xerox.compute_std_p())) + out.append("%6s" % xerox.format_float_percent(xerox.compute_std_r())) + out.append("%6s" % xerox.format_float_percent(xerox.compute_std_f())) + out.append("%6s" % xerox.format_float_percent(xerox.compute_std_title_acc())) + # inex + out.append("%6s" % inex.format_float_percent(inex.compute_std_p())) + out.append("%6s" % inex.format_float_percent(inex.compute_std_r())) + out.append("%6s" % inex.format_float_percent(inex.compute_std_f())) + out.append("%6s" % inex.format_float_percent(inex.compute_std_title_acc())) + out.append("%6s" % 
inex.format_float_percent(inex.compute_std_level_acc())) + return out + + +@overload(dict, dict, dict) +def get_avg_row(td_prec, td_reca, td_f1): + return [ + "AVG", + np.mean(list(td_prec.values())), + np.mean(list(td_reca.values())), + np.mean(list(td_f1.values())) + ] + + +@overload(dict, dict, dict) +def get_std_row(td_prec, td_reca, td_f1): + return [ + "stdev", + np.std(list(td_prec.values())), + np.std(list(td_reca.values())), + np.std(list(td_f1.values())) + ] + + +def get_logger(name, path_to_log, level=logging.ERROR): + handler = logging.FileHandler(path_to_log, mode="w") + formatter = logging.Formatter("%(message)s") + handler.setFormatter(formatter) + logger = logging.getLogger(name) + logger.setLevel(level) + logger.addHandler(handler) + return logger + + +def basename(path, ext): + return os.path.basename(path).split(ext)[0] + + +def ls(folder, ext): + pattern = os.path.join(folder, "*" + ext) + return glob(pattern) + + +if __name__ == "__main__": + PARSER = argparse.ArgumentParser(description="This is the scoring script used for FincTOC2021. It outputs two csv " + "reports, one for title detection, and another for toc generation. It also logs " + "information in two separate log files.") + PARSER.add_argument("--gt_folder", required=True, type=str, + help="path to folder containing groundtruth files (one groundtruth file in json format per document") + PARSER.add_argument("--submission_folder", required=True, type=str, + help="path to folder containing submission files (one submission file in json format per document") + ARGS = PARSER.parse_args() + score(ARGS.gt_folder, ARGS.submission_folder) diff --git a/scripts/fintoc2022/train_fintoc_classifier.py b/scripts/fintoc2022/train_fintoc_classifier.py new file mode 100644 index 00000000..ddee19da --- /dev/null +++ b/scripts/fintoc2022/train_fintoc_classifier.py @@ -0,0 +1,46 @@ +""" +Training script for the FinTOC 2022 Shared task (https://wp.lancs.ac.uk/cfie/fintoc2022/). 
+The code is a modification of the winner's solution (ISP RAS team). +""" +import argparse +import logging +import os + +from scripts.fintoc2022.trainer import FintocTrainer + +clf_params = { + "en_binary": dict(random_state=42, learning_rate=0.25, max_depth=5, n_estimators=400, colsample_bynode=0.8, colsample_bytree=0.5, tree_method="hist"), + "fr_binary": dict(random_state=42, learning_rate=0.1, max_depth=5, n_estimators=800, colsample_bynode=0.5, colsample_bytree=0.8, tree_method="approx"), + "sp_binary": dict(random_state=42, learning_rate=0.25, max_depth=4, n_estimators=600, colsample_bynode=0.5, colsample_bytree=0.5, tree_method="approx"), + "en_target": dict(random_state=42, learning_rate=0.07, max_depth=4, n_estimators=800, colsample_bynode=1, colsample_bytree=1, tree_method="hist"), + "fr_target": dict(random_state=42, learning_rate=0.4, max_depth=5, n_estimators=800, colsample_bynode=1, colsample_bytree=0.5, tree_method="exact"), + "sp_target": dict(random_state=42, learning_rate=0.25, max_depth=3, n_estimators=600, colsample_bynode=0.5, colsample_bytree=1, tree_method="hist") +} + + +if __name__ == "__main__": + base_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "resources", "fintoc2022")) + os.makedirs(base_dir, exist_ok=True) + + parser = argparse.ArgumentParser() + parser.add_argument("--language", choices=["en", "fr", "sp"], help="Language of training data", default="en") + parser.add_argument("--reader", choices=["tabby", "txt_layer"], help="Type of PDF reader used for lines extraction", default="tabby") + parser.add_argument("--cross_val", type=bool, help="Whether to do a cross-validation", default=True) + parser.add_argument("--n_splits", type=int, help="Number of splits for cross-validation", default=3) + args = parser.parse_args() + + trainer = FintocTrainer( + data_url="https://at.ispras.ru/owncloud/index.php/s/EZfm71WimN2h7rC/download", + logger=logging.getLogger(), + language=args.language, + 
reader_name=args.reader, + n_splits=args.n_splits, + classifiers_dir_path=os.path.join(base_dir, "classifiers"), + scores_dir_path=os.path.join(base_dir, "scores"), + features_importances_dir_path=os.path.join(base_dir, "features_importances"), + tmp_dir="/tmp/fintoc/", # path where dataset and predicted jsons will be saved + binary_classifier_parameters=clf_params[f"{args.language}_binary"], + target_classifier_parameters=clf_params[f"{args.language}_target"] + ) + + trainer.fit(cross_val=args.cross_val) diff --git a/scripts/fintoc2022/trainer.py b/scripts/fintoc2022/trainer.py new file mode 100644 index 00000000..aab3fb20 --- /dev/null +++ b/scripts/fintoc2022/trainer.py @@ -0,0 +1,176 @@ +import hashlib +import json +import logging +import os +import shutil +from statistics import mean +from typing import Dict, List, Optional, Union + +import pandas as pd +from sklearn.model_selection import GroupKFold +from tqdm import tqdm + +from dedoc.structure_extractors.concrete_structure_extractors.fintoc_structure_extractor import FintocStructureExtractor +from dedoc.structure_extractors.feature_extractors.fintoc_feature_extractor import FintocFeatureExtractor +from dedoc.structure_extractors.line_type_classifiers.fintoc_classifier import FintocClassifier +from dedoc.utils.utils import flatten +from scripts.fintoc2022.dataset_loader import FintocDatasetLoader +from scripts.fintoc2022.metric import score +from scripts.fintoc2022.utils import create_json_result, get_values_from_csv + + +class FintocTrainer: + """ + Class to train and evaluate classifiers for the FinTOC 2022 Shared task (https://wp.lancs.ac.uk/cfie/fintoc2022/). + The code is a modification of the winner's solution (ISP RAS team). 
+ """ + def __init__(self, + data_url: str, + logger: logging.Logger, + language: str, + reader_name: str, + classifiers_dir_path: str, + scores_dir_path: str, + features_importances_dir_path: str, + tmp_dir: str, + binary_classifier_parameters: dict = None, + target_classifier_parameters: dict = None, + n_splits: int = 3) -> None: + """ + :param data_url: url to download training data for FintocDatasetLoader + :param logger: logger for logging details of classifier training + :param language: ("en", "fr", "sp") - language of data + :param reader_name: ("tabby", "txt_layer") - type of reader for lines extraction from PDF + :param classifiers_dir_path: path to the directory where to save trained classifiers + :param scores_dir_path: path to the directory where to save final scores during cross-validation + :param features_importances_dir_path: path to the directory where to save XLSX files with information about most important features for classifiers + :param tmp_dir: path to temporary directory for saving the dataset and output json files with predictions + :param binary_classifier_parameters: parameters to pass to xgboost.XGBClassifier for classification header/non-header + :param target_classifier_parameters: parameters to pass to xgboost.XGBClassifier for lines depth classification + :param n_splits: number of splits for cross-validation + """ + self.logger = logger + self.language = language + self.reader_name = reader_name + self.feature_extractor = FintocFeatureExtractor() + self.structure_extractor = FintocStructureExtractor() + + self.binary_classifier_parameters = {} if binary_classifier_parameters is None else binary_classifier_parameters + self.target_classifier_parameters = {} if target_classifier_parameters is None else target_classifier_parameters + self.classifier = FintocClassifier(language=self.language) + + self.tmp_dir = tmp_dir + os.makedirs(self.tmp_dir, exist_ok=True) + self.scores_dir_path = scores_dir_path + 
self.features_importances_dir_path = features_importances_dir_path + self.classifiers_dir_path = classifiers_dir_path + + self.data_url = data_url + url_hash = hashlib.md5(self.data_url.encode()).hexdigest() + self.dataset_dir = os.path.join(self.tmp_dir, f"dataset_{url_hash}") + self.data_loader = FintocDatasetLoader(dataset_dir=self.dataset_dir, logger=logger) + + self.n_splits = n_splits + self.additional_features_fields = ("line", "label", "group", "uid") + + def fit(self, cross_val: bool = True, use_cache: bool = True) -> None: + """ + 1 - Load data by `self.data_url` if needed, extract lines from PDF by chosen reader by `reader_name` if needed (FintocDatasetLoader). + 2 - Extract a feature matrix for extracted document lines (FintocFeatureExtractor). + 3 - Do a cross-validation if needed. + 4 - Train resulting classifiers (binary, target) and save them to `self.classifiers_dir_path` (FintocClassifier). + + :param cross_val: whether to do cross-validation or not + :param use_cache: whether to use cached extracted lines as training data + """ + # obtain training data + self.logger.info("Get data for training and evaluation") + data = self.data_loader.get_data(language=self.language, reader_name=self.reader_name, use_cache=use_cache) + + # create feature matrix + self.logger.info("Create a feature matrix") + features, documents = self.structure_extractor.get_features(documents_dict=data) + self.logger.info(f"Features shape: {features.shape}") + for feature_field in self.additional_features_fields: + features[feature_field] = [getattr(line, feature_field) for line in flatten(documents)] + features["label"] = features["label"].astype(int) + features_names = self.__get_features_names(features) + + # cross-validation using fintoc metric + gt_dir = os.path.join(self.dataset_dir, "data", self.language, "annots") + scores = self.__cross_validate(features=features, gt_dir=gt_dir) if cross_val else None + + # train resulting classifiers on all data + 
self.logger.info("Start training resulting classifiers on all data") + self.classifier.fit(self.binary_classifier_parameters, self.target_classifier_parameters, features=features, features_names=features_names) + self.__save(features_names=features_names, scores=scores) + + def __get_features_names(self, features_df: pd.DataFrame) -> List[str]: + features_names = [col for col in features_df.columns if col not in self.additional_features_fields] + return features_names + + def __cross_validate(self, features: pd.DataFrame, gt_dir: str) -> Dict[str, Union[List[float], float]]: + self.logger.info("Start cross-validation") + features_names = self.__get_features_names(features) + results_path = os.path.join(self.scores_dir_path, f"cross_val_results_{self.language}_{self.reader_name}") + os.makedirs(results_path, exist_ok=True) + + kf = GroupKFold(n_splits=self.n_splits) + json_results_dir = os.path.join(self.tmp_dir, f"json_results_{self.language}_{self.reader_name}") + + result_scores = {"td_scores": [], "toc_scores": []} + for i, (train_index, val_index) in tqdm(enumerate(kf.split(features, groups=features.group)), total=self.n_splits): + df_train = features.loc[train_index] + df_val = features.loc[val_index] + self.classifier.fit(self.binary_classifier_parameters, self.target_classifier_parameters, features=df_train, features_names=features_names) + predicted_classes = self.classifier.predict(df_val[features_names]) + result_dict = create_json_result(df_val, predicted_classes) + + if os.path.isdir(json_results_dir): + shutil.rmtree(json_results_dir) + os.makedirs(json_results_dir) + + tmp_gt_dir, predictions_dir = os.path.join(json_results_dir, "groundtruth"), os.path.join(json_results_dir, "predictions") + os.makedirs(tmp_gt_dir) + os.makedirs(predictions_dir) + + for doc_name, result in result_dict.items(): + gt_doc_name = doc_name + ".pdf.fintoc4.json" + if gt_doc_name not in os.listdir(gt_dir): + self.logger.warning(f"{gt_doc_name} is not found in groundtruth") + 
continue + with open(os.path.join(predictions_dir, gt_doc_name), "w") as json_file: + json.dump(result, json_file, indent=2) + shutil.copy(os.path.join(gt_dir, gt_doc_name), os.path.join(tmp_gt_dir, gt_doc_name)) + score(tmp_gt_dir, predictions_dir) + + path_scores = os.path.join(results_path, str(i)) + os.makedirs(path_scores, exist_ok=True) + for file_name in ["td.log", "toc.log", "td_report.csv", "toc_report.csv"]: + shutil.move(file_name, os.path.join(path_scores, file_name)) + + f1, inex_f1 = get_values_from_csv(path_scores) + result_scores["td_scores"].append(f1) + result_scores["toc_scores"].append(inex_f1) + self.logger.info(f'Iteration {i}:\ntd={result_scores["td_scores"][-1]}\ntoc={result_scores["toc_scores"][-1]}') + + result_scores["td_mean"] = mean(result_scores["td_scores"]) + result_scores["toc_mean"] = mean(result_scores["toc_scores"]) + return result_scores + + def __save(self, features_names: List[str], scores: Optional[Dict[str, Union[List[float], float]]]) -> None: + + if scores is not None: + os.makedirs(self.scores_dir_path, exist_ok=True) + scores_path = os.path.join(self.scores_dir_path, f"scores_{self.language}_{self.reader_name}.json") + with open(scores_path, "w") as f: + json.dump(scores, f, indent=2) + self.logger.info(f"Scores were saved in {scores_path}") + + self.classifier.save( + classifiers_dir_path=self.classifiers_dir_path, + features_importances_dir_path=self.features_importances_dir_path, + features_names=features_names, + logger=self.logger, + reader=self.reader_name + ) diff --git a/scripts/fintoc2022/utils.py b/scripts/fintoc2022/utils.py new file mode 100755 index 00000000..b9354a7b --- /dev/null +++ b/scripts/fintoc2022/utils.py @@ -0,0 +1,46 @@ +import os +from collections import defaultdict +from typing import Dict, List, Tuple, Union + +import pandas as pd + + +def create_json_result(data: pd.DataFrame, predictions: List[int]) -> Dict[str, List[Dict[str, Union[str, int]]]]: + """ + Creates dictionary with TOCs for each 
document: {"doc_name": TOC}. + TOC is a following list of dictionaries: + [ + { + "text": String, # text of the TOC item/entry + "id": Int # identifier of the item/entry corresponding to its order in the TOC + "depth": Int # hierarchical level of the item + "page": Int # the (physical) page number where the item appears + } + ] + """ + uid2line = { + item[1].uid: { + "text": item[1].line.strip() if isinstance(item[1].line, str) else "", + "page": int(item[1].page_id + 1), + "group": item[1].group + } for item in data.iterrows() + } + result = defaultdict(list) + assert data.shape[0] == len(predictions) + for i, (line_uid, prediction) in enumerate(zip(data.uid, predictions)): + line = uid2line[line_uid] + if line["text"] == "" or prediction == -1: + continue + # TODO crop text lines containing colon + result[line["group"]].append({"id": i, "depth": str(prediction), "text": line["text"], "page": line["page"]}) + return result + + +def get_values_from_csv(dir_path: str) -> Tuple[float, float]: + td_name = "td_report.csv" + toc_name = "toc_report.csv" + td_df = pd.read_csv(os.path.join(dir_path, td_name), delimiter="\t") + toc_df = pd.read_csv(os.path.join(dir_path, toc_name), delimiter="\t") + f1 = td_df[td_df["Doc"] == "AVG"]["F1"].item() + inex_f1 = toc_df[toc_df["Doc"] == " AVG "]["Inex08-F1"].item() + return f1, inex_f1 diff --git a/scripts/test_words_bbox_extraction.py b/scripts/test_words_bbox_extraction.py index 9dde8702..37b4fc36 100644 --- a/scripts/test_words_bbox_extraction.py +++ b/scripts/test_words_bbox_extraction.py @@ -181,7 +181,7 @@ def test_table_word_extraction(self) -> None: for file_name in file_names: result = self._send_request(file_name, data=dict()) - page_angle = result["metadata"]["other_fields"]["rotated_page_angles"][0] + page_angle = result["metadata"]["rotated_page_angles"][0] image = cv2.imread(self._get_abs_path(file_name)) image = rotate_image(image, page_angle) @@ -215,7 +215,7 @@ def test_document_table_split_last_column(self) -> 
None: structure = result["content"]["structure"] word_annotations = self.__get_words_annotation(structure) image = cv2.imread(self._get_abs_path(filename)) - image = rotate_image(image, result["metadata"]["other_fields"].get("rotated_page_angles", [0.])[0]) + image = rotate_image(image, result["metadata"].get("rotated_page_angles", [0.])[0]) image = self.__draw_word_annotations(image, word_annotations) tables = result["content"]["tables"] if len(tables) > 0: diff --git a/tests/api_tests/content_checker.py b/tests/api_tests/content_checker.py index 9de0edf4..ab5dd6b9 100644 --- a/tests/api_tests/content_checker.py +++ b/tests/api_tests/content_checker.py @@ -80,8 +80,6 @@ def __check_metadata(self, metadata: dict) -> None: self.assertIsInstance(metadata["access_time"], int) if "file_type" in metadata: self.assertIsInstance(metadata["file_type"], str) - if "other_fields" in metadata: - self.assertIsInstance(metadata["other_fields"], dict) def _check_english_doc(self, result: dict) -> None: content = result["content"] diff --git a/tests/api_tests/test_api_doctype_article.py b/tests/api_tests/test_api_doctype_article.py index 508e2574..bef10773 100644 --- a/tests/api_tests/test_api_doctype_article.py +++ b/tests/api_tests/test_api_doctype_article.py @@ -34,24 +34,28 @@ def test_article(self) -> None: self.assertEqual("org_name", self._get_by_tree_path(tree, "0.2.2.0")["metadata"]["paragraph_type"]) self.assertEqual("ICTEAM/ELEN/Crypto Group", self._get_by_tree_path(tree, "0.2.2.0")["text"]) + # check section + self.assertEqual("section", self._get_by_tree_path(tree, "0.4")["metadata"]["paragraph_type"]) + self.assertEqual("1 Introduction", self._get_by_tree_path(tree, "0.4")["text"]) + # check bibliography list - self.assertEqual("bibliography", self._get_by_tree_path(tree, "0.20")["metadata"]["paragraph_type"]) - self.assertEqual(65, len(self._get_by_tree_path(tree, "0.20")["subparagraphs"])) + self.assertEqual("bibliography", self._get_by_tree_path(tree, 
"0.12")["metadata"]["paragraph_type"]) + self.assertEqual(65, len(self._get_by_tree_path(tree, "0.12")["subparagraphs"])) # check bib_item 1 recognizing - self.assertEqual("title", self._get_by_tree_path(tree, "0.20.0.0")["metadata"]["paragraph_type"]) - self.assertEqual("Leakage-resilient symmetric encryption via re-keying", self._get_by_tree_path(tree, "0.20.0.0")["text"]) - self.assertEqual("title_conference_proceedings", self._get_by_tree_path(tree, "0.20.0.1")["metadata"]["paragraph_type"]) - self.assertEqual("Bertoni and Coron", self._get_by_tree_path(tree, "0.20.0.1")["text"]) - self.assertEqual("author", self._get_by_tree_path(tree, "0.20.0.2")["metadata"]["paragraph_type"]) # author 1 - self.assertEqual("\nMichelAbdalla\n", self._get_by_tree_path(tree, "0.20.0.2")["text"]) - self.assertEqual("biblScope_volume", self._get_by_tree_path(tree, "0.20.0.5")["metadata"]["paragraph_type"]) # author 1 - self.assertEqual("4", self._get_by_tree_path(tree, "0.20.0.5")["text"]) - self.assertEqual("biblScope_page", self._get_by_tree_path(tree, "0.20.0.6")["metadata"]["paragraph_type"]) # author 1 - self.assertEqual("471-488", self._get_by_tree_path(tree, "0.20.0.6")["text"]) + self.assertEqual("title", self._get_by_tree_path(tree, "0.12.0.0")["metadata"]["paragraph_type"]) + self.assertEqual("Leakage-resilient symmetric encryption via re-keying", self._get_by_tree_path(tree, "0.12.0.0")["text"]) + self.assertEqual("title_conference_proceedings", self._get_by_tree_path(tree, "0.12.0.1")["metadata"]["paragraph_type"]) + self.assertEqual("Bertoni and Coron", self._get_by_tree_path(tree, "0.12.0.1")["text"]) + self.assertEqual("author", self._get_by_tree_path(tree, "0.12.0.2")["metadata"]["paragraph_type"]) # author 1 + self.assertEqual("Michel Abdalla", self._get_by_tree_path(tree, "0.12.0.2")["text"]) + self.assertEqual("biblScope_volume", self._get_by_tree_path(tree, "0.12.0.5")["metadata"]["paragraph_type"]) # author 1 + self.assertEqual("4", 
self._get_by_tree_path(tree, "0.12.0.5")["text"]) + self.assertEqual("biblScope_page", self._get_by_tree_path(tree, "0.12.0.6")["metadata"]["paragraph_type"]) # author 1 + self.assertEqual("471-488", self._get_by_tree_path(tree, "0.12.0.6")["text"]) # check cite on bib_item - bibliography_item_uuid = self._get_by_tree_path(tree, "0.20.57")["metadata"]["uid"] # checking on [58] references + bibliography_item_uuid = self._get_by_tree_path(tree, "0.12.57")["metadata"]["uid"] # checking on [58] references section = self._get_by_tree_path(tree, "0.4.0") bibliography_refs_in_text = [ann for ann in section["annotations"] if ann["name"] == "reference" and ann["value"] == bibliography_item_uuid] # We must found two refs [58] in Introduction section diff --git a/tests/api_tests/test_api_doctype_fintoc.py b/tests/api_tests/test_api_doctype_fintoc.py new file mode 100644 index 00000000..7a70ca56 --- /dev/null +++ b/tests/api_tests/test_api_doctype_fintoc.py @@ -0,0 +1,64 @@ +from api_tests.abstract_api_test import AbstractTestApiDocReader + + +class TestApiFintoc(AbstractTestApiDocReader): + + def test_article_en(self) -> None: + file_name = "fintoc/prospectus_en.pdf" + result = self._send_request(file_name, dict(document_type="fintoc", pdf_with_text_layer="true")) + + tree = result["content"]["structure"] + self._check_tree_sanity(tree) + + # headers + node = self._get_by_tree_path(tree, "0.0") + self.assertEqual("header", node["metadata"]["paragraph_type"]) + self.assertEqual("Key Information Document (KID)", node["text"].strip()) + node = self._get_by_tree_path(tree, "0.0.0") + self.assertEqual("header", node["metadata"]["paragraph_type"]) + self.assertEqual("PURPOSE", node["text"].strip()) + + # raw text + node = self._get_by_tree_path(tree, "0.0.0.0") + self.assertEqual("raw_text", node["metadata"]["paragraph_type"]) + self.assertTrue(node["text"].startswith("This document provides")) + + def test_article_fr(self) -> None: + file_name = "fintoc/prospectus_fr.pdf" + result 
= self._send_request(file_name, dict(document_type="fintoc", pdf_with_text_layer="true", language="fr")) + + tree = result["content"]["structure"] + self._check_tree_sanity(tree) + + # headers + node = self._get_by_tree_path(tree, "0.0") + self.assertEqual("header", node["metadata"]["paragraph_type"]) + self.assertEqual("INFORMATIONS CLES POUR L’INVESTISSEUR", node["text"].strip()) + node = self._get_by_tree_path(tree, "0.1") + self.assertEqual("header", node["metadata"]["paragraph_type"]) + self.assertEqual("Prospectus", node["text"].strip()) + + # raw text + node = self._get_by_tree_path(tree, "0.1.0") + self.assertEqual("raw_text", node["metadata"]["paragraph_type"]) + self.assertEqual("OPCVM relevant de la directive européenne 2009/65/CE", node["text"].strip()) + + def test_article_sp(self) -> None: + file_name = "fintoc/prospectus_sp.pdf" + result = self._send_request(file_name, dict(document_type="fintoc", pdf_with_text_layer="true", language="sp")) + + tree = result["content"]["structure"] + self._check_tree_sanity(tree) + + # headers + node = self._get_by_tree_path(tree, "0.0") + self.assertEqual("header", node["metadata"]["paragraph_type"]) + self.assertEqual("INFORME ANUAL", node["text"].strip()) + node = self._get_by_tree_path(tree, "0.0.1") + self.assertEqual("header", node["metadata"]["paragraph_type"]) + self.assertEqual("ÍNDICE.", node["text"].strip()) + + # raw text + node = self._get_by_tree_path(tree, "0.0.0") + self.assertEqual("raw_text", node["metadata"]["paragraph_type"]) + self.assertTrue(node["text"].startswith("2015")) diff --git a/tests/api_tests/test_api_format_docx.py b/tests/api_tests/test_api_format_docx.py index 779100cc..a5825653 100644 --- a/tests/api_tests/test_api_format_docx.py +++ b/tests/api_tests/test_api_format_docx.py @@ -13,13 +13,12 @@ def test_docx_metadata(self) -> None: file_name = "english_doc.docx" result = self._send_request(file_name) metadata = result["metadata"] - docx_metadata = metadata["other_fields"] - 
self.assertEqual("Тема", docx_metadata["document_subject"]) - self.assertEqual("анализ естественных языков", docx_metadata["keywords"]) - self.assertEqual("курсовая работа", docx_metadata["category"]) - self.assertEqual("на 3 потянет", docx_metadata["comments"]) - self.assertEqual("Андрей Пышкин", docx_metadata["author"]) - self.assertEqual("Андреус Пышкинус", docx_metadata["last_modified_by"]) + self.assertEqual("Тема", metadata["document_subject"]) + self.assertEqual("анализ естественных языков", metadata["keywords"]) + self.assertEqual("курсовая работа", metadata["category"]) + self.assertEqual("на 3 потянет", metadata["comments"]) + self.assertEqual("Андрей Пышкин", metadata["author"]) + self.assertEqual("Андреус Пышкинус", metadata["last_modified_by"]) def test_docx(self) -> None: file_name = "example.docx" @@ -154,4 +153,4 @@ def __check_doc_like(self, result: dict) -> None: self.assertTrue(metadata["modified_time"] is not None) self.assertTrue(metadata["created_time"] is not None) self.assertTrue(metadata["access_time"] is not None) - self.assertIn("modified_date", metadata["other_fields"]) + self.assertIn("modified_date", metadata) diff --git a/tests/api_tests/test_api_format_pdf.py b/tests/api_tests/test_api_format_pdf.py index a78026bd..ff42a1e8 100644 --- a/tests/api_tests/test_api_format_pdf.py +++ b/tests/api_tests/test_api_format_pdf.py @@ -94,9 +94,9 @@ def test_images(self) -> None: def test_image_metadata(self) -> None: file_name = "orient_3.png" result = self._send_request(file_name) - exif = result["metadata"]["other_fields"] - self.assertEqual(exif["exif_image_width"], 1654) - self.assertEqual(exif["exif_image_height"], 2338) + self.assertEqual(result["metadata"]["exif_image_width"], 1654) + self.assertEqual(result["metadata"]["exif_image_height"], 2338) + self.assertIn("rotated_page_angles", result["metadata"]) def test_image_binarization(self) -> None: result = self._send_request("01_МФО_Наклон.jpg", data=dict(need_binarization="true")) diff 
--git a/tests/data/fintoc/prospectus_en.pdf b/tests/data/fintoc/prospectus_en.pdf new file mode 100644 index 00000000..cf54b365 Binary files /dev/null and b/tests/data/fintoc/prospectus_en.pdf differ diff --git a/tests/data/fintoc/prospectus_fr.pdf b/tests/data/fintoc/prospectus_fr.pdf new file mode 100644 index 00000000..a05c517c Binary files /dev/null and b/tests/data/fintoc/prospectus_fr.pdf differ diff --git a/tests/data/fintoc/prospectus_sp.pdf b/tests/data/fintoc/prospectus_sp.pdf new file mode 100644 index 00000000..e89f6890 Binary files /dev/null and b/tests/data/fintoc/prospectus_sp.pdf differ