diff --git a/dedoc/api/schema/table_metadata.py b/dedoc/api/schema/table_metadata.py
index 779af066..53299a16 100644
--- a/dedoc/api/schema/table_metadata.py
+++ b/dedoc/api/schema/table_metadata.py
@@ -10,3 +10,4 @@ class TableMetadata(BaseModel):
page_id: Optional[int] = Field(description="Number of the page where the table starts", example=0)
uid: str = Field(description="Unique identifier of the table", example="e8ba5523-8546-4804-898c-2f4835a1804f")
rotated_angle: float = Field(description="Value of the rotation angle (in degrees) by which the table was rotated during recognition", example=1.0)
+ title: str = Field(description="Table's title")
diff --git a/dedoc/api/web/index.html b/dedoc/api/web/index.html
index 6a947230..5ca05cec 100644
--- a/dedoc/api/web/index.html
+++ b/dedoc/api/web/index.html
@@ -37,6 +37,7 @@
Type of document structure parsing
+            <option value="article">article</option>
document_type
diff --git a/dedoc/attachments_extractors/concrete_attachments_extractors/docx_attachments_extractor.py b/dedoc/attachments_extractors/concrete_attachments_extractors/docx_attachments_extractor.py
index 1c307409..8919c890 100644
--- a/dedoc/attachments_extractors/concrete_attachments_extractors/docx_attachments_extractor.py
+++ b/dedoc/attachments_extractors/concrete_attachments_extractors/docx_attachments_extractor.py
@@ -29,7 +29,7 @@ def can_extract(self,
"""
Checks if this extractor can get attachments from the document (it should have .docx extension)
"""
- extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
+ mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
return extension.lower() in recognized_extensions.docx_like_format or mime in recognized_mimes.docx_like_format
def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[AttachedFile]:
diff --git a/dedoc/attachments_extractors/concrete_attachments_extractors/excel_attachments_extractor.py b/dedoc/attachments_extractors/concrete_attachments_extractors/excel_attachments_extractor.py
index cf5cfefa..96834db6 100644
--- a/dedoc/attachments_extractors/concrete_attachments_extractors/excel_attachments_extractor.py
+++ b/dedoc/attachments_extractors/concrete_attachments_extractors/excel_attachments_extractor.py
@@ -22,7 +22,7 @@ def can_extract(self,
"""
Checks if this extractor can get attachments from the document (it should have .xlsx extension)
"""
- extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
+ mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
return extension.lower() in recognized_extensions.excel_like_format or mime in recognized_mimes.excel_like_format
def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[AttachedFile]:
diff --git a/dedoc/attachments_extractors/concrete_attachments_extractors/json_attachment_extractor.py b/dedoc/attachments_extractors/concrete_attachments_extractors/json_attachment_extractor.py
index 39e11c69..83fd572a 100644
--- a/dedoc/attachments_extractors/concrete_attachments_extractors/json_attachment_extractor.py
+++ b/dedoc/attachments_extractors/concrete_attachments_extractors/json_attachment_extractor.py
@@ -22,7 +22,7 @@ def can_extract(self,
"""
Checks if this extractor can get attachments from the document (it should have .json extension)
"""
- extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
+ mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
return extension.lower().endswith(".json")
def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[AttachedFile]:
diff --git a/dedoc/attachments_extractors/concrete_attachments_extractors/pptx_attachments_extractor.py b/dedoc/attachments_extractors/concrete_attachments_extractors/pptx_attachments_extractor.py
index 34acdef4..4b9ecb54 100644
--- a/dedoc/attachments_extractors/concrete_attachments_extractors/pptx_attachments_extractor.py
+++ b/dedoc/attachments_extractors/concrete_attachments_extractors/pptx_attachments_extractor.py
@@ -22,7 +22,7 @@ def can_extract(self,
"""
Checks if this extractor can get attachments from the document (it should have .pptx extension)
"""
- extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
+ mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
return extension.lower() in recognized_extensions.pptx_like_format or mime in recognized_mimes.pptx_like_format
def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[AttachedFile]:
diff --git a/dedoc/config.py b/dedoc/config.py
index f3c374eb..06d98894 100644
--- a/dedoc/config.py
+++ b/dedoc/config.py
@@ -43,7 +43,10 @@
# TESSERACT OCR confidence threshold ( values: [-1 - undefined; 0.0 : 100.0 % - confidence value)
ocr_conf_threshold=40.0,
# max depth of document structure tree
- recursion_deep_subparagraphs=30
+ recursion_deep_subparagraphs=30,
+
+ # -------------------------------------------EXTERNAL SERVICES SETTINGS---------------------------------------------
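+ # number of attempts to reach the GROBID service when ArticleReader is initialized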
+ grobid_max_connection_attempts=3
)
diff --git a/dedoc/converters/concrete_converters/binary_converter.py b/dedoc/converters/concrete_converters/binary_converter.py
index 46142cff..ba7741cf 100644
--- a/dedoc/converters/concrete_converters/binary_converter.py
+++ b/dedoc/converters/concrete_converters/binary_converter.py
@@ -23,7 +23,7 @@ def can_convert(self,
"""
Checks if the document is image-like (e.g. it has .bmp, .jpg, .tiff, etc. extension) and has `mime=application/octet-stream`.
"""
- extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
+ mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
return mime == "application/octet-stream" and extension in supported_image_types
def convert(self, file_path: str, parameters: Optional[dict] = None) -> str:
diff --git a/dedoc/converters/concrete_converters/docx_converter.py b/dedoc/converters/concrete_converters/docx_converter.py
index 3b50416a..ad8855ec 100644
--- a/dedoc/converters/concrete_converters/docx_converter.py
+++ b/dedoc/converters/concrete_converters/docx_converter.py
@@ -22,7 +22,7 @@ def can_convert(self,
"""
Checks if the document is docx-like, e.g. it has .doc, .rtf or .odt extension.
"""
- extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
+ mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
return extension.lower() in converted_extensions.docx_like_format or mime in converted_mimes.docx_like_format
def convert(self, file_path: str, parameters: Optional[dict] = None) -> str:
diff --git a/dedoc/converters/concrete_converters/excel_converter.py b/dedoc/converters/concrete_converters/excel_converter.py
index 1396a12b..8aaa8809 100644
--- a/dedoc/converters/concrete_converters/excel_converter.py
+++ b/dedoc/converters/concrete_converters/excel_converter.py
@@ -22,7 +22,7 @@ def can_convert(self,
"""
Checks if the document is xlsx-like, e.g. it has .xls or .ods extension.
"""
- extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
+ mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
return extension.lower() in converted_extensions.excel_like_format or mime in converted_mimes.excel_like_format
def convert(self, file_path: str, parameters: Optional[dict] = None) -> str:
diff --git a/dedoc/converters/concrete_converters/pdf_converter.py b/dedoc/converters/concrete_converters/pdf_converter.py
index f0b929e8..01483d4c 100644
--- a/dedoc/converters/concrete_converters/pdf_converter.py
+++ b/dedoc/converters/concrete_converters/pdf_converter.py
@@ -22,7 +22,7 @@ def can_convert(self,
"""
Checks if the document is pdf-like, e.g. it has .djvu extension.
"""
- extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
+ mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
return extension.lower() in converted_extensions.pdf_like_format or mime in converted_mimes.pdf_like_format
def convert(self, file_path: str, parameters: Optional[dict] = None) -> str:
diff --git a/dedoc/converters/concrete_converters/png_converter.py b/dedoc/converters/concrete_converters/png_converter.py
index cb50245d..6044c970 100644
--- a/dedoc/converters/concrete_converters/png_converter.py
+++ b/dedoc/converters/concrete_converters/png_converter.py
@@ -25,7 +25,7 @@ def can_convert(self,
"""
Checks if the document is image-like, e.g. it has .bmp, .jpg, .tiff, etc. extension.
"""
- extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
+ mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
return extension.lower() in converted_extensions.image_like_format or mime in converted_mimes.image_like_format
def convert(self, file_path: str, parameters: Optional[dict] = None) -> str:
diff --git a/dedoc/converters/concrete_converters/pptx_converter.py b/dedoc/converters/concrete_converters/pptx_converter.py
index d1e7aec3..afce2b94 100644
--- a/dedoc/converters/concrete_converters/pptx_converter.py
+++ b/dedoc/converters/concrete_converters/pptx_converter.py
@@ -22,7 +22,7 @@ def can_convert(self,
"""
Checks if the document is pptx-like, e.g. it has .ppt or .odp extension.
"""
- extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
+ mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
return extension.lower() in converted_extensions.pptx_like_format or mime in converted_mimes.pptx_like_format
def convert(self, file_path: str, parameters: Optional[dict] = None) -> str:
diff --git a/dedoc/converters/concrete_converters/txt_converter.py b/dedoc/converters/concrete_converters/txt_converter.py
index b1543fa0..1f384d59 100644
--- a/dedoc/converters/concrete_converters/txt_converter.py
+++ b/dedoc/converters/concrete_converters/txt_converter.py
@@ -23,7 +23,7 @@ def can_convert(self,
"""
Checks if the document is txt-like, e.g. it has .xml extension.
"""
- extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
+ mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
return extension.lower() in converted_extensions.txt_like_format or mime in converted_mimes.txt_like_format
def convert(self, file_path: str, parameters: Optional[dict] = None) -> str:
diff --git a/dedoc/converters/converter_composition.py b/dedoc/converters/converter_composition.py
index cf12c2ed..63543d4e 100644
--- a/dedoc/converters/converter_composition.py
+++ b/dedoc/converters/converter_composition.py
@@ -29,7 +29,7 @@ def convert(self, file_path: str, parameters: Optional[dict] = None) -> str:
:param parameters: parameters of converting, see :ref:`parameters_description` for more details
:return: path of converted file if conversion was executed else path of the original file
"""
- extension, mime = get_mime_extension(file_path=file_path)
+ mime, extension = get_mime_extension(file_path=file_path)
converted_file_path = file_path
for converter in self.converters:
diff --git a/dedoc/data_structures/concrete_annotations/__init__.py b/dedoc/data_structures/concrete_annotations/__init__.py
index 529acaa0..264abda0 100644
--- a/dedoc/data_structures/concrete_annotations/__init__.py
+++ b/dedoc/data_structures/concrete_annotations/__init__.py
@@ -15,7 +15,8 @@
from .superscript_annotation import SuperscriptAnnotation
from .table_annotation import TableAnnotation
from .underlined_annotation import UnderlinedAnnotation
+from .reference_annotation import ReferenceAnnotation
__all__ = ['AlignmentAnnotation', 'AttachAnnotation', 'BBoxAnnotation', 'BoldAnnotation', 'ColorAnnotation', 'ConfidenceAnnotation',
'IndentationAnnotation', 'ItalicAnnotation', 'LinkedTextAnnotation', 'SizeAnnotation', 'SpacingAnnotation', 'StrikeAnnotation',
- 'StyleAnnotation', 'SubscriptAnnotation', 'SuperscriptAnnotation', 'TableAnnotation', 'UnderlinedAnnotation']
+ 'StyleAnnotation', 'SubscriptAnnotation', 'SuperscriptAnnotation', 'TableAnnotation', 'UnderlinedAnnotation', 'ReferenceAnnotation']
diff --git a/dedoc/data_structures/concrete_annotations/reference_annotation.py b/dedoc/data_structures/concrete_annotations/reference_annotation.py
new file mode 100644
index 00000000..e629ba8b
--- /dev/null
+++ b/dedoc/data_structures/concrete_annotations/reference_annotation.py
@@ -0,0 +1,43 @@
+from dedoc.data_structures.annotation import Annotation
+
+
+class ReferenceAnnotation(Annotation):
+ """
+ This annotation marks a place in the document text that refers to another line of the document (for example, a bibliography item).
+
+ Example of usage for document_type="article": a link to a bibliography item (:class:`~dedoc.data_structures.LineWithMeta`).
+
+ LineWithMeta:
+
+ .. code-block:: python
+
+     LineWithMeta(  # the line with the reference annotation
+         line="As for the PRF, we use the tree-based construction from Goldreich, Goldwasser and Micali [18]",
+         metadata=LineMetadata(page_id=0, line_id=32),
+         annotations=[ReferenceAnnotation(start=90, end=92, value="97cfac39-f0e3-11ee-b81c-b88584b4e4a1"), ...]
+     )
+
+ other LineWithMeta:
+
+ .. code-block:: python
+
+     LineWithMeta(  # the line referenced by the previous one
+         line="some text (can be empty)",
+         metadata=LineMetadata(
+             page_id=10,
+             line_id=189,
+             tag_hierarchy_level=HierarchyLevel(level_1=2, level_2=0, can_be_multiline=False, line_type="bibliography_item"),
+             other_fields={"uid": "97cfac39-f0e3-11ee-b81c-b88584b4e4a1"}
+         ),
+         annotations=[]
+     )
+ """
+ name = "reference"
+
+ def __init__(self, value: str, start: int, end: int) -> None:
+ """
+ :param value: unique identifier of the line to which this annotation refers
+ :param start: start of the annotated text with a link
+ :param end: end of the annotated text with a link
+ """
+ super().__init__(start=start, end=end, name=ReferenceAnnotation.name, value=value, is_mergeable=False)
diff --git a/dedoc/data_structures/line_metadata.py b/dedoc/data_structures/line_metadata.py
index 504c5110..19b6730a 100644
--- a/dedoc/data_structures/line_metadata.py
+++ b/dedoc/data_structures/line_metadata.py
@@ -30,9 +30,9 @@ def __init__(self,
self.hierarchy_level = hierarchy_level
self.page_id = page_id
self.line_id = line_id
+ self.__other_fields = {}
if other_fields is not None and len(other_fields) > 0:
self.extend_other_fields(other_fields)
- self.__other_fields = {}
def extend_other_fields(self, new_fields: dict) -> None:
"""
diff --git a/dedoc/data_structures/line_with_meta.py b/dedoc/data_structures/line_with_meta.py
index ca954573..798a1712 100644
--- a/dedoc/data_structures/line_with_meta.py
+++ b/dedoc/data_structures/line_with_meta.py
@@ -136,7 +136,8 @@ def set_line(self, line: str) -> None:
self._line = line
def __repr__(self) -> str:
- return f"LineWithMeta({self.line[:65]})"
+ return (f"LineWithMeta({self.line[:65]}, "
+ f"tagHL={self.metadata.tag_hierarchy_level.level_1, self.metadata.tag_hierarchy_level.level_2, self.metadata.tag_hierarchy_level.line_type})")
def __add__(self, other: Union["LineWithMeta", str]) -> "LineWithMeta":
assert isinstance(other, (LineWithMeta, str))
diff --git a/dedoc/data_structures/table_metadata.py b/dedoc/data_structures/table_metadata.py
index a70ab2b4..fc934d9a 100644
--- a/dedoc/data_structures/table_metadata.py
+++ b/dedoc/data_structures/table_metadata.py
@@ -9,15 +9,17 @@ class TableMetadata(Serializable):
"""
This class holds the information about table unique identifier, rotation angle (if table has been rotated - for images) and so on.
"""
- def __init__(self, page_id: Optional[int], uid: Optional[str] = None, rotated_angle: float = 0.0) -> None:
+ def __init__(self, page_id: Optional[int], uid: Optional[str] = None, rotated_angle: float = 0.0, title: str = "") -> None:
"""
:param page_id: number of the page where table starts
:param uid: unique identifier of the table
:param rotated_angle: value of the rotation angle by which the table was rotated during recognition
+ :param title: table's title
"""
self.page_id = page_id
self.uid = str(uuid.uuid4()) if not uid else uid
self.rotated_angle = rotated_angle
+ self.title = title
def to_api_schema(self) -> ApiTableMetadata:
- return ApiTableMetadata(uid=self.uid, page_id=self.page_id, rotated_angle=self.rotated_angle)
+ return ApiTableMetadata(uid=self.uid, page_id=self.page_id, rotated_angle=self.rotated_angle, title=self.title)
diff --git a/dedoc/manager_config.py b/dedoc/manager_config.py
index 6854c6f4..679db954 100644
--- a/dedoc/manager_config.py
+++ b/dedoc/manager_config.py
@@ -1,5 +1,7 @@
from typing import Optional
+from dedoc.readers.article_reader.article_reader import ArticleReader
+
def _get_manager_config(config: dict) -> dict:
"""
@@ -57,6 +59,7 @@ def _get_manager_config(config: dict) -> dict:
BinaryConverter(config=config)
]
readers = [
+ ArticleReader(config=config),
DocxReader(config=config),
ExcelReader(config=config),
PptxReader(config=config),
diff --git a/dedoc/readers/__init__.py b/dedoc/readers/__init__.py
index 7c6cce29..2d96fdae 100644
--- a/dedoc/readers/__init__.py
+++ b/dedoc/readers/__init__.py
@@ -1,4 +1,5 @@
from .archive_reader.archive_reader import ArchiveReader
+from .article_reader.article_reader import ArticleReader
from .base_reader import BaseReader
from .csv_reader.csv_reader import CSVReader
from .docx_reader.docx_reader import DocxReader
@@ -17,6 +18,6 @@
from .reader_composition import ReaderComposition
from .txt_reader.raw_text_reader import RawTextReader
-__all__ = ['ArchiveReader', 'BaseReader', 'CSVReader', 'DocxReader', 'EmailReader', 'ExcelReader', 'HtmlReader', 'JsonReader', 'MhtmlReader',
+__all__ = ['ArchiveReader', 'ArticleReader', 'BaseReader', 'CSVReader', 'DocxReader', 'EmailReader', 'ExcelReader', 'HtmlReader', 'JsonReader', 'MhtmlReader',
'NoteReader', 'PptxReader', 'ReaderComposition', 'RawTextReader',
'PdfBaseReader', 'PdfImageReader', 'PdfTabbyReader', 'PdfTxtlayerReader', 'PdfAutoReader']
diff --git a/dedoc/readers/archive_reader/archive_reader.py b/dedoc/readers/archive_reader/archive_reader.py
index d8831b58..589014ac 100644
--- a/dedoc/readers/archive_reader/archive_reader.py
+++ b/dedoc/readers/archive_reader/archive_reader.py
@@ -29,7 +29,7 @@ def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None,
Check if the document extension is suitable for this reader.
Look to the documentation of :meth:`~dedoc.readers.BaseReader.can_read` to get information about the method's parameters.
"""
- extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
+ mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
return extension.lower() in recognized_extensions.archive_like_format or mime in recognized_mimes.archive_like_format
def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument:
diff --git a/dedoc/readers/article_reader/__init__.py b/dedoc/readers/article_reader/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/dedoc/readers/article_reader/article_reader.py b/dedoc/readers/article_reader/article_reader.py
new file mode 100644
index 00000000..f2169452
--- /dev/null
+++ b/dedoc/readers/article_reader/article_reader.py
@@ -0,0 +1,365 @@
+import os
+import time
+from typing import Dict, List, Optional, Tuple
+
+import requests
+from bs4 import BeautifulSoup, Tag
+
+from dedoc.data_structures import Annotation, CellWithMeta, HierarchyLevel, LineMetadata, Table, TableAnnotation, TableMetadata
+from dedoc.data_structures.concrete_annotations.reference_annotation import ReferenceAnnotation
+from dedoc.data_structures.line_with_meta import LineWithMeta
+from dedoc.data_structures.unstructured_document import UnstructuredDocument
+from dedoc.extensions import recognized_mimes
+from dedoc.readers.base_reader import BaseReader
+from dedoc.utils.parameter_utils import get_param_document_type
+from dedoc.utils.utils import get_mime_extension
+
+
+class ArticleReader(BaseReader):
+ """
+ This class is used for parsing scientific articles with the .pdf extension using the `GROBID <https://grobid.readthedocs.io/en/latest/>`_ system.
+ """
+
+ def __init__(self, config: Optional[dict] = None) -> None:
+ super().__init__(config=config)
+ self.grobid_url = f"http://{os.environ.get('GROBID_HOST', 'localhost')}:{os.environ.get('GROBID_PORT', '8070')}"
+ self.url = f"{self.grobid_url}/api/processFulltextDocument"
+ self.grobid_is_alive = False
+ self.__update_grobid_alive(self.grobid_url, max_attempts=self.config.get("grobid_max_connection_attempts", 3))
+
+ def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument:
+ """
+ The method calls the GROBID service method ``/api/processFulltextDocument`` and analyzes its result (the recognized article
+ in the XML/TEI format) using the beautifulsoup library.
+ As a result, the method fills the class :class:`~dedoc.data_structures.UnstructuredDocument`.
+ The article reader adds additional information to the `tag_hierarchy_level` of :class:`~dedoc.data_structures.LineMetadata`.
+ The method extracts information about ``authors``, ``bibliography items``, ``sections``, and ``tables``.
+ You can find more information about the data extracted from the GROBID system on the page :ref:`article_structure`.
+
+ Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters.
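+
+ A minimal usage sketch (assuming a GROBID service is reachable and ``article.pdf`` exists):
+
+ .. code-block:: python
+
+     from dedoc.readers import ArticleReader
+
+     reader = ArticleReader(config={})
+     document = reader.read(file_path="article.pdf", parameters={"document_type": "article"})
+     print(document.lines[0].line)  # the article title (root line)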
+ """
+ with open(file_path, "rb") as file:
+ files = {"input": file}
+ try:
+ response = requests.post(self.url, files=files)
+ if response.status_code != 200:
+ warning = f"GROBID returns code {response.status_code}."
+ self.logger.warning(warning)
+ return UnstructuredDocument(tables=[], lines=[], attachments=[], warnings=[warning])
+ except requests.exceptions.ConnectionError as ex:
+ warning = f"GROBID doesn't response. Check GROBID service on {self.url}. Exception' msg: {ex}"
+ self.logger.warning(warning)
+ return UnstructuredDocument(tables=[], lines=[], attachments=[], warnings=[warning])
+
+ soup = BeautifulSoup(response.text, features="lxml")
+ lines = self.__parse_title(soup)
+
+ if soup.biblstruct is not None:
+ authors = soup.biblstruct.find_all("author")
+ lines += [line for author in authors for line in self.__parse_author(author)]
+
+ bib_lines, bib2uid = self.__parse_bibliography(soup)
+ tables, table2uid = self.__parse_tables(soup)
+
+ lines += self.__parse_text(soup, bib2uid, table2uid)
+ lines.extend(bib_lines)
+
+ return UnstructuredDocument(tables=tables, lines=lines, attachments=[], warnings=["use GROBID (version: 0.8.0)"])
+
+ def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None, extension: Optional[str] = None, parameters: Optional[dict] = None) -> bool:
+ """
+ Check if:
+
+ * the document extension is suitable for this reader (.pdf);
+ * parameter "document_type" is "article";
+ * GROBID service is running on port 8070.
+
+ Look to the documentation of :meth:`~dedoc.readers.BaseReader.can_read` to get information about the method's parameters.
+ """
+ if get_param_document_type(parameters) != "article":
+ return False
+
+ self.__update_grobid_alive(self.grobid_url, max_attempts=1)
+ if not self.grobid_is_alive:
+ return False
+
+ mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
+ return mime in recognized_mimes.pdf_like_format and extension.lower() == ".pdf"
+
+ def __update_grobid_alive(self, grobid_url: str, max_attempts: int = 2) -> None:
+ if self.grobid_is_alive:
+ return
+
+ attempt = max_attempts
+ while attempt > 0:
+ try:
+ response = requests.get(f"{grobid_url}/api/isalive")
+ if response.status_code == 200:
+ self.logger.info(f"GROBID up on {grobid_url}.")
+ self.grobid_is_alive = True
+ return
+ except requests.exceptions.ConnectionError as ex:
+ self.logger.warning(f"GROBID doesn't response. Check GROBID service on {self.url}. Exception's msg: {ex}")
+ time.sleep(5)
+ attempt -= 1
+
+ self.grobid_is_alive = False
+
+ def __get_tag_by_hierarchy_path(self, source: Tag, hierarchy_path: List[str]) -> str:
+ cur_tag = source
+ for path_item in hierarchy_path:
+ cur_tag = cur_tag.find(path_item)
+ if cur_tag is None:
+ # tag not found
+ return ""
+
+ return ArticleReader.__tag2text(cur_tag)
+
+ def __create_line(self, text: str, hierarchy_level_id: Optional[int] = None, paragraph_type: Optional[str] = None,
+ annotations: Optional[List[Annotation]] = None, other_fields: Optional[Dict] = None) -> LineWithMeta:
+ # TODO: check and improve
+ if other_fields is None:
+ other_fields = {}
+ assert text is not None
+ assert isinstance(text, str)
+
+ if hierarchy_level_id is None or paragraph_type is None:
+ hierarchy_level = HierarchyLevel.create_raw_text()
+ else:
+ hierarchy_level = HierarchyLevel(level_1=hierarchy_level_id, level_2=0, can_be_multiline=False, line_type=paragraph_type)
+
+ return LineWithMeta(line=text,
+ metadata=LineMetadata(page_id=0, line_id=0, tag_hierarchy_level=hierarchy_level, other_fields=other_fields),
+ annotations=annotations)
+
+ def __parse_affiliation(self, affiliation_tag: Tag) -> List[LineWithMeta]:
+ lines = [self.__create_line(text=affiliation_tag.get("key", ""), hierarchy_level_id=2, paragraph_type="author_affiliation")]
+
+ if affiliation_tag.orgname:
+ lines.append(self.__create_line(text=self.__tag2text(affiliation_tag.orgname), hierarchy_level_id=3, paragraph_type="org_name"))
+
+ if affiliation_tag.address:
+ lines.append(self.__create_line(text=affiliation_tag.address.text, hierarchy_level_id=3, paragraph_type="address"))
+
+ return lines
+
+ def __parse_author(self, author_tag: Tag) -> List[LineWithMeta]:
+ """
+
+ Example:
+
+ SoniaBelaïd
+
+ École Normale Supérieure
+
+ 45 rue dUlm
+ 75005
+ Paris
+
+
+
+ Thales Communications & Security
+
+ 4 Avenue des Louvresses
+ 92230
+ Gennevilliers
+
+
+
+ """
+ lines = [self.__create_line(text="", hierarchy_level_id=1, paragraph_type="author")]
+
+ first_name = self.__get_tag_by_hierarchy_path(author_tag, ["persname", "forename"])
+ if first_name:
+ lines.append(self.__create_line(text=first_name, hierarchy_level_id=2, paragraph_type="author_first_name"))
+
+ surname = self.__get_tag_by_hierarchy_path(author_tag, ["persname", "surname"])
+ if surname:
+ lines.append(self.__create_line(text=surname, hierarchy_level_id=2, paragraph_type="author_surname"))
+
+ lines += [
+ self.__create_line(text=email.get_text(), hierarchy_level_id=3, paragraph_type="email")
+ for email in author_tag.find_all("email") if email
+ ]
+
+ affiliations = author_tag.find_all("affiliation")
+ lines += [line for affiliation in affiliations for line in self.__parse_affiliation(affiliation)]
+
+ return lines
+
+ def __create_line_with_refs(self, content: List[Tuple[str, Tag]], bib2uid: dict, table2uid: dict) -> LineWithMeta:
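+ # content is the list of direct children of a <p> tag: plain strings and <ref> tags;
+ # <ref> children produce reference/table annotations, and all pieces are concatenated into the line text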
+ text = ""
+ start = 0
+ annotations = []
+
+ for subpart in content:
+ if isinstance(subpart, Tag) and subpart.name == "ref":
+ target = subpart.get("target")
+ sub_text = subpart.string or ""  # the text of a <ref> tag may be missing
+ if subpart.get("type") == "bibr" and target in bib2uid:
+ annotations.append(ReferenceAnnotation(value=bib2uid[target], start=start, end=start + len(sub_text)))
+ if subpart.get("type") == "table" and target in table2uid:
+ annotations.append(TableAnnotation(name=table2uid[target], start=start, end=start + len(sub_text)))
+ else:
+ sub_text = subpart if isinstance(subpart, str) else ""
+
+ text += sub_text
+ start += len(sub_text)
+
+ return self.__create_line(text=text, hierarchy_level_id=None, paragraph_type=None, annotations=annotations)
+
+ def __parse_text(self, soup: Tag, bib2uid: dict, table2uid: dict) -> List[LineWithMeta]:
+ """
+ Example of section XML tag:
+
Preprocessing
...
...
+ """
+ lines = []
+
+ abstract = soup.find("abstract")
+ lines.append(self.__create_line(text="Abstract", hierarchy_level_id=1, paragraph_type="abstract"))
+ lines.append(self.__create_line(text=self.__tag2text(abstract.p if abstract else None)))
+
+ for text in soup.find_all("text"):
+ for part in text.find_all("div"):
+ # TODO: BeautifulSoup drops some tags when parsing the input XML file,
+ #  so the section number can be lost in the text (see the example above).
+ #  Need to fix this in the future.
+ number = part.head.get("n", "") + " " if part.head else ""
+ line_text = str(part.contents[0]) if len(part.contents) > 0 else None
+ if line_text is not None and len(line_text) > 0:
+ lines.append(self.__create_line(text=number + line_text, hierarchy_level_id=1, paragraph_type="section"))
+ for subpart in part.find_all("p"):
+     if subpart.contents:
+         lines.append(self.__create_line_with_refs(subpart.contents, bib2uid, table2uid))
+
+ return lines
+
+ @staticmethod
+ def __tag2text(tag: Tag) -> str:
+ return "" if not tag or not tag.string else tag.string
+
+ def __parse_tables(self, soup: Tag) -> Tuple[List[Table], dict]:
+ """
+ Example Table with table's ref:
+ -----------------------------------------------
+ Table Reference Example:
+ 1
+ ...
+ Table Example:
+
+ Table 1 .
+
+ Performance of some illustrative AES implementations.
+
+ """
+ tables = []
+ table2uid = {}
+
+ tag_tables = soup.find_all("figure", {"type": "table"})
+ for table in tag_tables:
+ row_cells = []
+ head = table.contents[0] if len(table.contents) > 0 and isinstance(table.contents[0], str) else self.__tag2text(table.head)
+ title = (head + " " + self.__tag2text(table.figdesc)).strip()
+ for row in table.table.find_all("row"):
+ row_cells.append([CellWithMeta(lines=[self.__create_line(self.__tag2text(cell))]) for cell in row.find_all("cell")])
+ tables.append(Table(cells=row_cells, metadata=TableMetadata(page_id=0, title=title)))
+ table2uid["#" + table.get("xml:id")] = tables[-1].metadata.uid
+
+ return tables, table2uid
+
+ def __parse_bibliography(self, soup: Tag) -> Tuple[List[LineWithMeta], dict]:
+ """
+ Reference Example:
+ [6]
+ ...
+
+
+
+ Leakage-resilient symmetric encryption via re-keying
+
+ MichelAbdalla
+
+
+ SoniaBelaïd
+
+
+ Pierre-AlainFouque
+
+
+
+ Bertoni and Coron
+
+ 4
+
+
+
+
+
+ """
+ lines = []
+ cites = {} # bib_item_grobid_uid: line_uid
+
+ # according GROBID description
+ level_2_paragraph_type = {"a": "title", "j": "title_journal", "s": "title_series", "m": "title_conference_proceedings"}
+
+ bibliography = soup.find("listbibl", recursive=True)
+ lines.append(self.__create_line(text="bibliography", hierarchy_level_id=1, paragraph_type="bibliography"))
+ if not bibliography:
+ return lines, cites
+
+ bib_items = bibliography.find_all("biblstruct")
+ if not bib_items:
+ return lines, cites
+
+ # parse bibliography items
+ for bib_item in bib_items:
+     bib_line = self.__create_line(text="", hierarchy_level_id=2, paragraph_type="bibliography_item")
+     bib_line.metadata.extend_other_fields({"uid": bib_line.uid})
+     cites["#" + bib_item.get("xml:id")] = bib_line.uid
+     lines.append(bib_line)
+
+ # parse bib titles
+ for title in bib_item.find_all("title", recursive=True):
+     paragraph_type = level_2_paragraph_type.get(title.get("level"))
+     if paragraph_type:
+         lines.append(self.__create_line(text=self.__tag2text(title), hierarchy_level_id=3, paragraph_type=paragraph_type))
+
+ lines += [ # parse bib authors
+ self.__create_line(text=author.get_text(), hierarchy_level_id=3, paragraph_type="author")
+ for author in bib_item.find_all("author", recursive=True) if author
+ ]
+
+ lines += [ # parse biblScope
+ self.__create_line(text=self.__tag2text(bibl_scope), hierarchy_level_id=3, paragraph_type="biblScope_volume")
+ for bibl_scope in bib_item.find_all("biblscope", {"unit": "volume"}, recursive=True) if bibl_scope
+ ]
+
+ try:
+     lines += [  # parse page ranges
+         self.__create_line(text=f"{bibl_scope.get('from')}-{bibl_scope.get('to')}", hierarchy_level_id=3, paragraph_type="biblScope_page")
+         for bibl_scope in bib_item.find_all("biblscope", {"unit": "page"}, recursive=True) if bibl_scope
+     ]
+ except Exception:
+     self.logger.warning("GROBID parsing warning: non-standard biblScope page format")
+
+ lines += [  # parse DOI (there may be more than one)
+ self.__create_line(text=self.__tag2text(idno), hierarchy_level_id=3, paragraph_type="DOI")
+ for idno in bib_item.find_all("idno", recursive=True) if idno
+ ]
+
+ if bib_item.publisher:
+ lines.append(self.__create_line(text=self.__tag2text(bib_item.publisher), hierarchy_level_id=3, paragraph_type="publisher"))
+
+ if bib_item.date:
+ lines.append(self.__create_line(text=self.__tag2text(bib_item.date), hierarchy_level_id=3, paragraph_type="date"))
+
+ return lines, cites
+
+ def __parse_title(self, soup: Tag) -> List[LineWithMeta]:
+ return [self.__create_line(text=self.__tag2text(soup.title), hierarchy_level_id=0, paragraph_type="root")]
diff --git a/dedoc/readers/csv_reader/csv_reader.py b/dedoc/readers/csv_reader/csv_reader.py
index d1de64ed..e2e09453 100644
--- a/dedoc/readers/csv_reader/csv_reader.py
+++ b/dedoc/readers/csv_reader/csv_reader.py
@@ -25,7 +25,7 @@ def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None,
Check if the document extension is suitable for this reader.
Look to the documentation of :meth:`~dedoc.readers.BaseReader.can_read` to get information about the method's parameters.
"""
- extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
+ mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
return extension.lower() in recognized_extensions.csv_like_format
def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument:
diff --git a/dedoc/readers/docx_reader/docx_reader.py b/dedoc/readers/docx_reader/docx_reader.py
index 6e0eabde..3d4c9028 100644
--- a/dedoc/readers/docx_reader/docx_reader.py
+++ b/dedoc/readers/docx_reader/docx_reader.py
@@ -26,7 +26,7 @@ def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None,
Check if the document extension is suitable for this reader.
Look to the documentation of :meth:`~dedoc.readers.BaseReader.can_read` to get information about the method's parameters.
"""
- extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
+ mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
return extension.lower() in recognized_extensions.docx_like_format or mime in recognized_mimes.docx_like_format
def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument:
diff --git a/dedoc/readers/email_reader/email_reader.py b/dedoc/readers/email_reader/email_reader.py
index 7a239e31..448d13e0 100644
--- a/dedoc/readers/email_reader/email_reader.py
+++ b/dedoc/readers/email_reader/email_reader.py
@@ -33,7 +33,7 @@ def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None,
Check if the document extension or mime is suitable for this reader.
Look to the documentation of :meth:`~dedoc.readers.BaseReader.can_read` to get information about the method's parameters.
"""
- extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
+ mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
return file_path.lower().endswith(".eml") or mime == "message/rfc822"
def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument:
diff --git a/dedoc/readers/excel_reader/excel_reader.py b/dedoc/readers/excel_reader/excel_reader.py
index 4fc8e12c..6e882a50 100644
--- a/dedoc/readers/excel_reader/excel_reader.py
+++ b/dedoc/readers/excel_reader/excel_reader.py
@@ -33,7 +33,7 @@ def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None,
Check if the document extension is suitable for this reader.
Look to the documentation of :meth:`~dedoc.readers.BaseReader.can_read` to get information about the method's parameters.
"""
- extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
+ mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
return extension.lower() in recognized_extensions.excel_like_format or mime in recognized_mimes.excel_like_format
def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument:
diff --git a/dedoc/readers/html_reader/html_reader.py b/dedoc/readers/html_reader/html_reader.py
index 4a2668bf..83fb2085 100644
--- a/dedoc/readers/html_reader/html_reader.py
+++ b/dedoc/readers/html_reader/html_reader.py
@@ -33,7 +33,7 @@ def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None,
Check if the document extension is suitable for this reader.
Look to the documentation of :meth:`~dedoc.readers.BaseReader.can_read` to get information about the method's parameters.
"""
- extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
+ mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
return extension.lower() in [".html", ".shtml"] or mime in ["text/html"]
def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument:
diff --git a/dedoc/readers/json_reader/json_reader.py b/dedoc/readers/json_reader/json_reader.py
index f408674f..767542cf 100644
--- a/dedoc/readers/json_reader/json_reader.py
+++ b/dedoc/readers/json_reader/json_reader.py
@@ -28,7 +28,7 @@ def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None,
Check if the document extension is suitable for this reader (it has .json extension).
Look to the documentation of :meth:`~dedoc.readers.BaseReader.can_read` to get information about the method's parameters.
"""
- extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
+ mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
return extension.lower().endswith(".json")
def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument:
diff --git a/dedoc/readers/mhtml_reader/mhtml_reader.py b/dedoc/readers/mhtml_reader/mhtml_reader.py
index ea980dec..f08e84e4 100644
--- a/dedoc/readers/mhtml_reader/mhtml_reader.py
+++ b/dedoc/readers/mhtml_reader/mhtml_reader.py
@@ -32,7 +32,7 @@ def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None,
Check if the document extension is suitable for this reader.
Look to the documentation of :meth:`~dedoc.readers.BaseReader.can_read` to get information about the method's parameters.
"""
- extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
+ mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
return extension.lower().endswith(tuple(self.mhtml_extensions))
def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument:
diff --git a/dedoc/readers/note_reader/note_reader.py b/dedoc/readers/note_reader/note_reader.py
index 836a98bb..e1e15b90 100644
--- a/dedoc/readers/note_reader/note_reader.py
+++ b/dedoc/readers/note_reader/note_reader.py
@@ -22,7 +22,7 @@ def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None,
Check if the document extension is suitable for this reader.
Look to the documentation of :meth:`~dedoc.readers.BaseReader.can_read` to get information about the method's parameters.
"""
- extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
+ mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
return extension.lower().endswith(".note.pickle")
def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument:
diff --git a/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py b/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py
index e2c3ad37..c47e7e7d 100644
--- a/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py
+++ b/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py
@@ -47,7 +47,7 @@ def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None,
It is recommended to use `pdf_with_text_layer=auto_tabby` because it's faster and allows to get better results.
You can look to :ref:`pdf_handling_parameters` to get more information about `parameters` dictionary possible arguments.
"""
- extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
+ mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
if not (mime in recognized_mimes.pdf_like_format or extension.lower() == ".pdf"):
return False
diff --git a/dedoc/readers/pdf_reader/pdf_base_reader.py b/dedoc/readers/pdf_reader/pdf_base_reader.py
index 41e9a310..8372fb92 100644
--- a/dedoc/readers/pdf_reader/pdf_base_reader.py
+++ b/dedoc/readers/pdf_reader/pdf_base_reader.py
@@ -131,7 +131,7 @@ def _parse_document(self, path: str, parameters: ParametersForParseDoc) -> (
prev_line = None
for line in all_lines_with_links:
- line.metadata.tag_hierarchy_level = DefaultStructureExtractor.get_list_hl_with_regexp(line, prev_line)
+ line.metadata.tag_hierarchy_level = DefaultStructureExtractor.get_hl_list_using_regexp(line, prev_line)
prev_line = line
all_lines_with_paragraphs = self.paragraph_extractor.extract(all_lines_with_links)
diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py b/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py
index fd2cf6ff..53edd2e1 100644
--- a/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py
+++ b/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py
@@ -55,7 +55,7 @@ def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None,
Look to the documentation of :meth:`~dedoc.readers.BaseReader.can_read` to get information about the method's parameters.
You can also see :ref:`pdf_handling_parameters` to get more information about `parameters` dictionary possible arguments.
"""
- extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
+ mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
return mime in recognized_mimes.pdf_like_format or mime in recognized_mimes.image_like_format or \
file_path.lower().endswith(tuple(recognized_extensions.image_like_format)) or extension.lower().replace(".", "") in supported_image_types
diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py
index 42a2b478..00e5e552 100644
--- a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py
+++ b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py
@@ -66,8 +66,7 @@ def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None,
Look to the documentation of :meth:`~dedoc.readers.BaseReader.can_read` to get information about the method's parameters.
"""
- parameters = {} if parameters is None else parameters
- extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
+ mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
return (mime in recognized_mimes.pdf_like_format or extension.lower().endswith("pdf")) and get_param_pdf_with_txt_layer(parameters) == "tabby"
def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument:
@@ -265,7 +264,7 @@ def __get_tag(self, line: LineWithMeta, prev_line: Optional[LineWithMeta], line_
return HierarchyLevel(1, header_level, False, line_type)
if line_type == "litem": # TODO automatic list depth and merge list items from multiple lines
- return DefaultStructureExtractor.get_list_hl_with_regexp(line, prev_line)
+ return DefaultStructureExtractor.get_hl_list_using_regexp(line, prev_line)
return HierarchyLevel(None, None, True, line_type)
diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py
index 86277bf0..97e55a03 100644
--- a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py
+++ b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py
@@ -34,8 +34,7 @@ def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None,
Look to the documentation of :meth:`~dedoc.readers.BaseReader.can_read` to get information about the method's parameters.
"""
- parameters = {} if parameters is None else parameters
- extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
+ mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
return (mime in recognized_mimes.pdf_like_format or extension.lower().endswith("pdf")) and get_param_pdf_with_txt_layer(parameters) == "true"
def _process_one_page(self,
diff --git a/dedoc/readers/pptx_reader/pptx_reader.py b/dedoc/readers/pptx_reader/pptx_reader.py
index a3040b62..6dc77551 100644
--- a/dedoc/readers/pptx_reader/pptx_reader.py
+++ b/dedoc/readers/pptx_reader/pptx_reader.py
@@ -34,7 +34,7 @@ def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None,
Check if the document extension is suitable for this reader.
Look to the documentation of :meth:`~dedoc.readers.BaseReader.can_read` to get information about the method's parameters.
"""
- extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
+ mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
return extension.lower() in recognized_extensions.pptx_like_format or mime in recognized_mimes.pptx_like_format
def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument:
diff --git a/dedoc/readers/reader_composition.py b/dedoc/readers/reader_composition.py
index 9cf0aec3..2bac6917 100644
--- a/dedoc/readers/reader_composition.py
+++ b/dedoc/readers/reader_composition.py
@@ -30,7 +30,7 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure
:return: intermediate representation of the document with lines, tables and attachments
"""
file_name = os.path.basename(file_path)
- extension, mime = get_mime_extension(file_path=file_path)
+ mime, extension = get_mime_extension(file_path=file_path)
for reader in self.readers:
if reader.can_read(file_path=file_path, mime=mime, extension=extension, parameters=parameters):
diff --git a/dedoc/readers/txt_reader/raw_text_reader.py b/dedoc/readers/txt_reader/raw_text_reader.py
index 33ffe656..58bb1164 100644
--- a/dedoc/readers/txt_reader/raw_text_reader.py
+++ b/dedoc/readers/txt_reader/raw_text_reader.py
@@ -30,7 +30,7 @@ def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None,
Check if the document extension is suitable for this reader.
Look to the documentation of :meth:`~dedoc.readers.BaseReader.can_read` to get information about the method's parameters.
"""
- extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
+ mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
return extension.lower().endswith((".txt", "txt.gz"))
def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument:
@@ -70,7 +70,7 @@ def _get_lines_with_meta(self, path: str, encoding: str) -> List[LineWithMeta]:
indent_annotation = self.__get_indent_annotation(line)
line_with_meta = LineWithMeta(line=line, metadata=metadata, annotations=[spacing_annotation, indent_annotation], uid=uid)
- line_with_meta.metadata.tag_hierarchy_level = DefaultStructureExtractor.get_list_hl_with_regexp(line_with_meta, prev_line)
+ line_with_meta.metadata.tag_hierarchy_level = DefaultStructureExtractor.get_hl_list_using_regexp(line_with_meta, prev_line)
prev_line = line_with_meta
lines.append(line_with_meta)
diff --git a/dedoc/structure_extractors/__init__.py b/dedoc/structure_extractors/__init__.py
index 2e3e9132..404d915c 100644
--- a/dedoc/structure_extractors/__init__.py
+++ b/dedoc/structure_extractors/__init__.py
@@ -1,12 +1,14 @@
from .abstract_structure_extractor import AbstractStructureExtractor
+from .concrete_structure_extractors.default_structure_extractor import DefaultStructureExtractor
from .concrete_structure_extractors.abstract_law_structure_extractor import AbstractLawStructureExtractor
+from .concrete_structure_extractors.article_structure_extractor import ArticleStructureExtractor
from .concrete_structure_extractors.classifying_law_structure_extractor import ClassifyingLawStructureExtractor
-from .concrete_structure_extractors.default_structure_extractor import DefaultStructureExtractor
from .concrete_structure_extractors.diploma_structure_extractor import DiplomaStructureExtractor
from .concrete_structure_extractors.foiv_law_structure_extractor import FoivLawStructureExtractor
from .concrete_structure_extractors.law_structure_excractor import LawStructureExtractor
from .concrete_structure_extractors.tz_structure_extractor import TzStructureExtractor
from .structure_extractor_composition import StructureExtractorComposition
-__all__ = ['AbstractStructureExtractor', 'AbstractLawStructureExtractor', 'ClassifyingLawStructureExtractor', 'DefaultStructureExtractor',
- 'DiplomaStructureExtractor', 'FoivLawStructureExtractor', 'LawStructureExtractor', 'TzStructureExtractor', 'StructureExtractorComposition']
+__all__ = ['AbstractStructureExtractor', 'AbstractLawStructureExtractor', 'ArticleStructureExtractor', 'ClassifyingLawStructureExtractor',
+ 'DefaultStructureExtractor', 'DiplomaStructureExtractor', 'FoivLawStructureExtractor', 'LawStructureExtractor', 'TzStructureExtractor',
+ 'StructureExtractorComposition']
diff --git a/dedoc/structure_extractors/concrete_structure_extractors/article_structure_extractor.py b/dedoc/structure_extractors/concrete_structure_extractors/article_structure_extractor.py
new file mode 100644
index 00000000..4ef6d4e8
--- /dev/null
+++ b/dedoc/structure_extractors/concrete_structure_extractors/article_structure_extractor.py
@@ -0,0 +1,35 @@
+from typing import List, Optional
+
+from dedoc.data_structures import HierarchyLevel, UnstructuredDocument
+from dedoc.data_structures.line_with_meta import LineWithMeta
+from dedoc.structure_extractors import AbstractStructureExtractor
+
+
+class ArticleStructureExtractor(AbstractStructureExtractor):
+ """
+ This class corresponds to the `GROBID <https://grobid.readthedocs.io/en/latest/>`_ article structure extraction.
+
+ This class saves all tag_hierarchy_levels received from the :class:`~dedoc.readers.ArticleReader` \
+ without using the postprocessing step (without using regular expressions).
+
+ You can find the description of this type of structure in the section :ref:`article_structure`.
+ """
+ document_type = "article"
+
+ def extract(self, document: UnstructuredDocument, parameters: Optional[dict] = None) -> UnstructuredDocument:
+ """
+ Extract article structure from the given document and add additional information to the lines' metadata.
+ To get the information about the method's parameters look at the documentation of the class \
+ :class:`~dedoc.structure_extractors.AbstractStructureExtractor`.
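+
+ A minimal sketch of the intended usage (assuming ``document`` was produced by :class:`~dedoc.readers.ArticleReader`):
+
+ .. code-block:: python
+
+     from dedoc.structure_extractors import ArticleStructureExtractor
+
+     extractor = ArticleStructureExtractor(config={})
+     document = extractor.extract(document=document, parameters={})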
+ """
+ for line in document.lines:
+     if line.metadata.tag_hierarchy_level is None:
+         line.metadata.hierarchy_level = HierarchyLevel.create_raw_text()
+     else:
+         line.metadata.hierarchy_level = line.metadata.tag_hierarchy_level
+     assert line.metadata.hierarchy_level is not None
+
+ return document
+
+ def _postprocess(self, lines: List[LineWithMeta], paragraph_type: List[str], regexps: List, excluding_regexps: List) -> List[LineWithMeta]:
+ return lines
diff --git a/dedoc/structure_extractors/concrete_structure_extractors/default_structure_extractor.py b/dedoc/structure_extractors/concrete_structure_extractors/default_structure_extractor.py
index 9ce18e0e..da6e40cf 100644
--- a/dedoc/structure_extractors/concrete_structure_extractors/default_structure_extractor.py
+++ b/dedoc/structure_extractors/concrete_structure_extractors/default_structure_extractor.py
@@ -35,7 +35,7 @@ def extract(self, document: UnstructuredDocument, parameters: Optional[dict] = N
line.metadata.tag_hierarchy_level = HierarchyLevel.create_unknown()
if line.metadata.tag_hierarchy_level.line_type == HierarchyLevel.unknown:
- line.metadata.hierarchy_level = self.get_list_hl_with_regexp(line, previous_line)
+ line.metadata.hierarchy_level = self.get_hl_list_using_regexp(line, previous_line)
else:
line.metadata.hierarchy_level = self.__get_hl_with_tag(line)
@@ -61,7 +61,7 @@ def __get_hl_with_tag(self, line: LineWithMeta) -> HierarchyLevel:
return line.metadata.tag_hierarchy_level
@staticmethod
- def get_list_hl_with_regexp(line: LineWithMeta, previous_line: Optional[LineWithMeta]) -> HierarchyLevel:
+ def get_hl_list_using_regexp(line: LineWithMeta, previous_line: Optional[LineWithMeta]) -> HierarchyLevel:
prefix = get_prefix(DefaultStructureExtractor.prefix_list, line)
# TODO dotted list without space after numbering, like "1.Some text"
diff --git a/dedoc/structure_extractors/hierarchy_level_builders/diploma_builder/body_builder.py b/dedoc/structure_extractors/hierarchy_level_builders/diploma_builder/body_builder.py
index 85f3006d..7f26fad1 100644
--- a/dedoc/structure_extractors/hierarchy_level_builders/diploma_builder/body_builder.py
+++ b/dedoc/structure_extractors/hierarchy_level_builders/diploma_builder/body_builder.py
@@ -44,7 +44,7 @@ def get_lines_with_hierarchy(self, lines_with_labels: List[Tuple[LineWithMeta, s
elif prediction == "raw_text":
line = self.__postprocess_raw_text(line, init_hl_depth)
if not (line.metadata.hierarchy_level is not None and line.metadata.hierarchy_level.line_type == "named_item"):
- line.metadata.hierarchy_level = DefaultStructureExtractor.get_list_hl_with_regexp(line, previous_raw_text_line)
+ line.metadata.hierarchy_level = DefaultStructureExtractor.get_hl_list_using_regexp(line, previous_raw_text_line)
previous_raw_text_line = line
else:
line.metadata.hierarchy_level = HierarchyLevel.create_raw_text()
diff --git a/dedoc/utils/parameter_utils.py b/dedoc/utils/parameter_utils.py
index da72704a..ece1cf56 100644
--- a/dedoc/utils/parameter_utils.py
+++ b/dedoc/utils/parameter_utils.py
@@ -19,6 +19,13 @@ def get_param_language(parameters: Optional[dict]) -> str:
return language
+def get_param_document_type(parameters: Optional[dict]) -> str:
+ if parameters is None:
+ return "other"
+ document_type = str(parameters.get("document_type", "other")).lower()
+ return document_type
+
+
def get_param_orient_analysis_cells(parameters: Optional[dict]) -> bool:
if parameters is None:
return False
diff --git a/docker-compose.yml b/docker-compose.yml
index 904c36d8..85378db9 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -12,10 +12,13 @@ services:
- 1231:1231
environment:
DOCREADER_PORT: 1231
+ GROBID_HOST: "grobid"
+ GROBID_PORT: 8070
test:
depends_on:
+ - grobid
- dedoc
build:
context: .
@@ -24,7 +27,14 @@ services:
environment:
DOC_READER_HOST: "dedoc"
DOCREADER_PORT: 1231
+ GROBID_HOST: "grobid"
+ GROBID_PORT: 8070
is_test: $test
PYTHONPATH: $PYTHONPATH:/dedoc_root/tests:/dedoc_root
command:
bash dedoc_root/tests/run_tests_in_docker.sh
+
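+ # GROBID service used by ArticleReader to parse scientific articles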
+ grobid:
+ image: "lfoppiano/grobid:0.8.0"
+ ports:
+ - 8070:8070
diff --git a/docs/source/_static/code_examples/djvu_converter.py b/docs/source/_static/code_examples/djvu_converter.py
index 192f889f..79415696 100644
--- a/docs/source/_static/code_examples/djvu_converter.py
+++ b/docs/source/_static/code_examples/djvu_converter.py
@@ -15,7 +15,7 @@ def can_convert(self,
extension: Optional[str] = None,
mime: Optional[str] = None,
parameters: Optional[dict] = None) -> bool:
- extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
+ _, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
return extension == ".djvu"
def convert(self, file_path: str, parameters: Optional[dict] = None) -> str:
diff --git a/docs/source/_static/code_examples/pdf_attachment_extractor.py b/docs/source/_static/code_examples/pdf_attachment_extractor.py
index e28a7a2e..57d1739e 100644
--- a/docs/source/_static/code_examples/pdf_attachment_extractor.py
+++ b/docs/source/_static/code_examples/pdf_attachment_extractor.py
@@ -15,7 +15,7 @@ def can_extract(self,
extension: Optional[str] = None,
mime: Optional[str] = None,
parameters: Optional[dict] = None) -> bool:
- extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
+ mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
return extension in recognized_extensions.pdf_like_format or mime in recognized_mimes.pdf_like_format
def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[AttachedFile]:
diff --git a/docs/source/_static/code_examples/pdf_reader.py b/docs/source/_static/code_examples/pdf_reader.py
index b588ae65..e6b01ef1 100644
--- a/docs/source/_static/code_examples/pdf_reader.py
+++ b/docs/source/_static/code_examples/pdf_reader.py
@@ -21,7 +21,7 @@ def __init__(self, config: Optional[dict] = None) -> None:
self.attachment_extractor = PdfAttachmentsExtractor(config=self.config)
def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None, extension: Optional[str] = None, parameters: Optional[dict] = None) -> bool:
- extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
+ mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
return extension in recognized_extensions.pdf_like_format or mime in recognized_mimes.pdf_like_format
def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument:
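
Note: the same one-line fix recurs in the docs code examples above: get_mime_extension returns the MIME type first and the extension second, so unpacking the pair as (extension, mime) silently swapped the two values and made both membership checks test the wrong strings. A small illustration (the import path is an assumption):

```python
from dedoc.utils.utils import get_mime_extension  # import path assumed

# Correct order: MIME type first, extension second.
mime, extension = get_mime_extension(file_path="paper.pdf")
print(mime)       # e.g. "application/pdf"
print(extension)  # e.g. ".pdf"
```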
diff --git a/docs/source/_static/json_format_examples/article_example.json b/docs/source/_static/json_format_examples/article_example.json
new file mode 100644
index 00000000..712c5841
--- /dev/null
+++ b/docs/source/_static/json_format_examples/article_example.json
@@ -0,0 +1,8632 @@
+{
+ "content": {
+ "structure": {
+ "node_id": "0",
+ "text": "Masking and Leakage-Resilient Primitives: One, the Other(s) or Both?",
+ "annotations": [],
+ "metadata": {
+ "paragraph_type": "root",
+ "page_id": 0,
+ "line_id": 0,
+ "other_fields": {}
+ },
+ "subparagraphs": [
+ {
+ "node_id": "0.0",
+ "text": "",
+ "annotations": [],
+ "metadata": {
+ "paragraph_type": "author",
+ "page_id": 0,
+ "line_id": 0,
+ "other_fields": {}
+ },
+ "subparagraphs": [
+ {
+ "node_id": "0.0.0",
+ "text": "Sonia",
+ "annotations": [],
+ "metadata": {
+ "paragraph_type": "author_first_name",
+ "page_id": 0,
+ "line_id": 0,
+ "other_fields": {}
+ },
+ "subparagraphs": []
+ },
+ {
+ "node_id": "0.0.1",
+ "text": "Belaïd",
+ "annotations": [],
+ "metadata": {
+ "paragraph_type": "author_surname",
+ "page_id": 0,
+ "line_id": 0,
+ "other_fields": {}
+ },
+ "subparagraphs": []
+ },
+ {
+ "node_id": "0.0.2",
+ "text": "aff0",
+ "annotations": [],
+ "metadata": {
+ "paragraph_type": "author_affiliation",
+ "page_id": 0,
+ "line_id": 0,
+ "other_fields": {}
+ },
+ "subparagraphs": [
+ {
+ "node_id": "0.0.2.0",
+ "text": "École Normale Supérieure",
+ "annotations": [],
+ "metadata": {
+ "paragraph_type": "org_name",
+ "page_id": 0,
+ "line_id": 0,
+ "other_fields": {}
+ },
+ "subparagraphs": []
+ },
+ {
+ "node_id": "0.0.2.1",
+ "text": "\n45 rue dUlm\n75005\nParis\n",
+ "annotations": [],
+ "metadata": {
+ "paragraph_type": "address",
+ "page_id": 0,
+ "line_id": 0,
+ "other_fields": {}
+ },
+ "subparagraphs": []
+ }
+ ]
+ },
+ {
+ "node_id": "0.0.3",
+ "text": "aff1",
+ "annotations": [],
+ "metadata": {
+ "paragraph_type": "author_affiliation",
+ "page_id": 0,
+ "line_id": 0,
+ "other_fields": {}
+ },
+ "subparagraphs": [
+ {
+ "node_id": "0.0.3.0",
+ "text": "Thales Communications & Security",
+ "annotations": [],
+ "metadata": {
+ "paragraph_type": "org_name",
+ "page_id": 0,
+ "line_id": 0,
+ "other_fields": {}
+ },
+ "subparagraphs": []
+ },
+ {
+ "node_id": "0.0.3.1",
+ "text": "\n4 Avenue des Louvresses\n92230\nGennevilliers\n",
+ "annotations": [],
+ "metadata": {
+ "paragraph_type": "address",
+ "page_id": 0,
+ "line_id": 0,
+ "other_fields": {}
+ },
+ "subparagraphs": []
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "node_id": "0.1",
+ "text": "",
+ "annotations": [],
+ "metadata": {
+ "paragraph_type": "author",
+ "page_id": 0,
+ "line_id": 0,
+ "other_fields": {}
+ },
+ "subparagraphs": [
+ {
+ "node_id": "0.1.0",
+ "text": "Vincent",
+ "annotations": [],
+ "metadata": {
+ "paragraph_type": "author_first_name",
+ "page_id": 0,
+ "line_id": 0,
+ "other_fields": {}
+ },
+ "subparagraphs": []
+ },
+ {
+ "node_id": "0.1.1",
+ "text": "Grosso",
+ "annotations": [],
+ "metadata": {
+ "paragraph_type": "author_surname",
+ "page_id": 0,
+ "line_id": 0,
+ "other_fields": {}
+ },
+ "subparagraphs": []
+ },
+ {
+ "node_id": "0.1.2",
+ "text": "aff2",
+ "annotations": [],
+ "metadata": {
+ "paragraph_type": "author_affiliation",
+ "page_id": 0,
+ "line_id": 0,
+ "other_fields": {}
+ },
+ "subparagraphs": [
+ {
+ "node_id": "0.1.2.0",
+ "text": "ICTEAM/ELEN/Crypto Group",
+ "annotations": [],
+ "metadata": {
+ "paragraph_type": "org_name",
+ "page_id": 0,
+ "line_id": 0,
+ "other_fields": {}
+ },
+ "subparagraphs": []
+ },
+ {
+ "node_id": "0.1.2.1",
+ "text": "\nBelgium\n",
+ "annotations": [],
+ "metadata": {
+ "paragraph_type": "address",
+ "page_id": 0,
+ "line_id": 0,
+ "other_fields": {}
+ },
+ "subparagraphs": []
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "node_id": "0.2",
+ "text": "",
+ "annotations": [],
+ "metadata": {
+ "paragraph_type": "author",
+ "page_id": 0,
+ "line_id": 0,
+ "other_fields": {}
+ },
+ "subparagraphs": [
+ {
+ "node_id": "0.2.0",
+ "text": "François",
+ "annotations": [],
+ "metadata": {
+ "paragraph_type": "author_first_name",
+ "page_id": 0,
+ "line_id": 0,
+ "other_fields": {}
+ },
+ "subparagraphs": []
+ },
+ {
+ "node_id": "0.2.1",
+ "text": "Xavier-Standaert",
+ "annotations": [],
+ "metadata": {
+ "paragraph_type": "author_surname",
+ "page_id": 0,
+ "line_id": 0,
+ "other_fields": {}
+ },
+ "subparagraphs": []
+ },
+ {
+ "node_id": "0.2.2",
+ "text": "aff2",
+ "annotations": [],
+ "metadata": {
+ "paragraph_type": "author_affiliation",
+ "page_id": 0,
+ "line_id": 0,
+ "other_fields": {}
+ },
+ "subparagraphs": [
+ {
+ "node_id": "0.2.2.0",
+ "text": "ICTEAM/ELEN/Crypto Group",
+ "annotations": [],
+ "metadata": {
+ "paragraph_type": "org_name",
+ "page_id": 0,
+ "line_id": 0,
+ "other_fields": {}
+ },
+ "subparagraphs": []
+ },
+ {
+ "node_id": "0.2.2.1",
+ "text": "\nBelgium\n",
+ "annotations": [],
+ "metadata": {
+ "paragraph_type": "address",
+ "page_id": 0,
+ "line_id": 0,
+ "other_fields": {}
+ },
+ "subparagraphs": []
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "node_id": "0.3",
+ "text": "Abstract",
+ "annotations": [],
+ "metadata": {
+ "paragraph_type": "abstract",
+ "page_id": 0,
+ "line_id": 0,
+ "other_fields": {}
+ },
+ "subparagraphs": [
+ {
+ "node_id": "0.3.0",
+ "text": "Securing cryptographic implementations against side-channel attacks is one of the most important challenges in modern cryptography. Many countermeasures have been introduced for this purpose, and analyzed in specialized security models. Formal solutions have also been proposed to extend the guarantees of provable security to physically observable devices. Masking and leakage-resilient cryptography are probably the most investigated and best understood representatives of these two approaches. Unfortunately, claims whether one, the other or their combination provides better security at lower cost remained vague so far. In this paper, we provide the first comprehensive treatment of this important problem. For this purpose, we analyze whether cryptographic implementations can be security-bounded, in the sense that the time complexity of the best side-channel attack is lower-bounded, independent of the number of measurements performed. Doing so, we first put forward a significant difference between stateful primitives such as leakage-resilient PRGs (that easily ensure bounded security), and stateless ones such as leakage-resilient PRFs (that hardly do). We then show that in practice, leakage-resilience alone provides the best security vs. performance tradeoff when bounded security is achievable, while masking alone is the solution of choice otherwise. That is, we highlight that one (x)or the other approach should be privileged, which contradicts the usual intuition that physical security is best obtained by combining countermeasures.",
+ "annotations": [],
+ "metadata": {
+ "paragraph_type": "raw_text",
+ "page_id": 0,
+ "line_id": 0,
+ "other_fields": {}
+ },
+ "subparagraphs": []
+ }
+ ]
+ },
+ {
+ "node_id": "0.4",
+ "text": "Introduction",
+ "annotations": [],
+ "metadata": {
+ "paragraph_type": "section",
+ "page_id": 0,
+ "line_id": 0,
+ "other_fields": {}
+ },
+ "subparagraphs": [
+ {
+ "node_id": "0.4.0",
+            "text": "Masking is a frequently considered solution to improve security against sidechannel attacks [5,19]. A large number of papers investigated its application to smart card implementations of the AES (e.g. [17,37,53,55]). It essentially randomizes all the sensitive variables in a cryptographic device, by splitting them into d shares, and performs all computations on these shares afterwards. The resulting process is expected to improve physical security since if the masking scheme is carefully implemented (i.e. if the leakages of all the shares are independent), higher-order moments of the leakage distribution have to be estimated to reveal key-dependent information. It has been shown that the number of measurements needed to perform a successful DPA (Differential Power Analysis) increases exponentially with the number of shares (see, e.g. [44,59]).One limitation of masking is that (as most countermeasures against sidechannel attacks [30]) it \"only\" reduces the amount of information leakage, at the cost of sometimes strong performance overheads [20]. Another line of work, next denoted as leakage-resilient cryptography, followed a complementary approach and tried to make the exploitation of this information more difficult (e.g. computationally). For this purpose, the main assumption is that the information leakage per iteration is limited in some sense. When applied in the context of symmetric cryptography, most instances of leakage-resilient constructions rely on re-keying strategies for this purpose, as first suggested by Kocher [27]. Examples of primitives include Pseudo-Random Generators (PRGs) [12,15,41,57,58,64,65] and Pseudo-Random Functions (PRFs) [1,10,15,34,58,64].The topic of leakage resilience has given rise to quite animated debates in the cryptographic community. Several assumptions have been proposed, and the quest for models that adequately capture physical reality is still ongoing (see [57] for a recent discussion). Yet, and independent of the relevance of the proofs obtained within these models, a more pragmatic problem is to find out the security levels of leakage-resilient constructions in front of standard side-channel adversaries (i.e. the same as the ones considered in security evaluations for masking). That is, are these primitives useful to help cryptographic designers to pass current certification procedures (e.g. EMVco [14] or Common Criteria [7])?Unfortunately, claims in one or the other direction remained vague so far. The main reason is that, as hinted by Bernstein in a CHES 2012 rump session talk, substantiated answers require to consider both security and performances [3], i.e. two qualities that are generally hard to quantify. In this paper, we aim to contribute to this issue and provide tools allowing to determine the best way to reach a given security level in different (software and hardware) scenarios, within the limits of what empirical evaluations can provide. For this purpose, we will consider the AES-based PRG and PRF illustrated in Figures 1 and2, respectively. For every key k i , the PRG produces a key k i+1 and N -1 strings y i 1 , y i 2 , . . . , y i N -1 , both obtained by encrypting N public plaintexts p i j with k i . As for the PRF, we use the tree-based construction from Goldreich, Goldwasser and Micali [18], where each step incorporates log 2 [N ] input bits and generates k i+1 = AES ki (p i j ). \nFollowing [34], the last stage is optionally completed by a whitening step, in order to limit the data complexity of attacks targeting the PRF output to one (e.g. when using large N values, typically). Quite naturally, there is a simple security versus efficiency tradeoff for both types of constructions. In the first (PRG) case, we produce a 128-bit output stream every N N -1 AES encryptions. In the second (PRF) case, we produce a 128-bit output every 128 log(N ) AES encryptions (+1 if output whitening is used). The details of these primitives are not necessary for the understanding of this work. The only important feature in our discussions is that the PRG construction is stateful while the PRF one is stateless. As a result, the PRG limits the number of measurements that a side-channel adversary can perform with the same key, while the PRF limits his data complexity (i.e. the number of plaintexts that can be observed). In practice, it means that in this latter case, the same measurement can be repeated multiple times, e.g. in order to get rid of the physical noise through averaging. As already discussed by Medwed et al. in [34], Section 3, this may lead to significant difference in terms of security against DPA.In order to compare and combine these two primitives with masking, we investigate whether they can lead to security-bounded implementations, i.e. implementations such that the time complexity of the best side-channel attack remains bounded independent of the number of measurements performed by the adversary. Doing so, we first show that the stateful leakage-resilient PRG in Figure 1 naturally leads to such implementations. By contrast, this guarantee is harder to reach with (stateless) leakage-resilient PRFs such as in Figure 2. The tweaked construction proposed in [34] (that takes advantage of hardware parallelism) is in fact the only security-bounded PRF we found in our experiments. Next, we put forward that better security at lower cost is obtained by using the leakage-resilient PRG alone (i.e. without masking), while masking alone is the most efficient solution for improving the security of stateless primitives whenever the implementations cannot be security-bounded. Therefore, our results underline that both masking and leakage-resilient primitives can be useful ingredients in the design of physically secure designs. But they also lead to the counterintuitive observation that sometimes (in fact, frequently), these solutions are better used separately, hence contradicting the usual intuition that security against side-channel attacks is best obtained via a combination of countermeasures.Admittedly, these results are only obtained for a set of side-channel attacks that are representative of the state-of-the-art. Hence, positive observations such as made for the tweaked construction in [34] are not proven: they only indicate that the cryptanalysis of such schemes may be hard with current knowledge. In the same lines, the differences between leakage-resilient PRGs and PRFs do not contradict their proofs: they only indicate that the (crucial) assumption of bounded leakage can imply different challenges for hardware designers. Hence, instantiating these primitives with the same AES implementation can lead to different security levels (even if the same N value is used in both cases).",
+ "annotations": [
+ {
+ "start": 92,
+ "end": 95,
+ "name": "bibliography_ref",
+ "value": "bac4e44c-f290-11ee-a6ed-b88584b4e4a1"
+ },
+ {
+ "start": 95,
+ "end": 98,
+ "name": "bibliography_ref",
+ "value": "bac4e4bb-f290-11ee-a6ed-b88584b4e4a1"
+ },
+ {
+ "start": 201,
+ "end": 205,
+ "name": "bibliography_ref",
+ "value": "bac4e4ab-f290-11ee-a6ed-b88584b4e4a1"
+ },
+ {
+ "start": 205,
+ "end": 208,
+ "name": "bibliography_ref",
+ "value": "bac4e551-f290-11ee-a6ed-b88584b4e4a1"
+ },
+ {
+ "start": 208,
+ "end": 211,
+ "name": "bibliography_ref",
+ "value": "bac4e5cd-f290-11ee-a6ed-b88584b4e4a1"
+ },
+ {
+ "start": 211,
+ "end": 214,
+ "name": "bibliography_ref",
+ "value": "bac4e5dd-f290-11ee-a6ed-b88584b4e4a1"
+ },
+ {
+ "start": 846,
+ "end": 850,
+ "name": "bibliography_ref",
+ "value": "bac4e584-f290-11ee-a6ed-b88584b4e4a1"
+ },
+ {
+ "start": 850,
+ "end": 853,
+ "name": "bibliography_ref",
+ "value": "bac4e602-f290-11ee-a6ed-b88584b4e4a1"
+ },
+ {
+ "start": 942,
+ "end": 946,
+ "name": "bibliography_ref",
+ "value": "bac4e516-f290-11ee-a6ed-b88584b4e4a1"
+ },
+ {
+ "start": 1055,
+ "end": 1059,
+ "name": "bibliography_ref",
+ "value": "bac4e4c5-f290-11ee-a6ed-b88584b4e4a1"
+ },
+ {
+ "start": 1550,
+ "end": 1554,
+ "name": "bibliography_ref",
+ "value": "bac4e501-f290-11ee-a6ed-b88584b4e4a1"
+ },
+ {
+ "start": 1619,
+ "end": 1623,
+ "name": "bibliography_ref",
+ "value": "bac4e480-f290-11ee-a6ed-b88584b4e4a1"
+ },
+ {
+ "start": 1623,
+ "end": 1626,
+ "name": "bibliography_ref",
+ "value": "bac4e49b-f290-11ee-a6ed-b88584b4e4a1"
+ },
+ {
+ "start": 1683,
+ "end": 1686,
+ "name": "bibliography_ref",
+ "value": "bac4e49b-f290-11ee-a6ed-b88584b4e4a1"
+ },
+ {
+ "start": 1626,
+ "end": 1629,
+ "name": "bibliography_ref",
+ "value": "bac4e571-f290-11ee-a6ed-b88584b4e4a1"
+ },
+ {
+ "start": 1629,
+ "end": 1632,
+ "name": "bibliography_ref",
+ "value": "bac4e5ec-f290-11ee-a6ed-b88584b4e4a1"
+ },
+ {
+ "start": 1929,
+ "end": 1933,
+ "name": "bibliography_ref",
+ "value": "bac4e5ec-f290-11ee-a6ed-b88584b4e4a1"
+ },
+ {
+ "start": 1632,
+ "end": 1635,
+ "name": "bibliography_ref",
+ "value": "bac4e5f6-f290-11ee-a6ed-b88584b4e4a1"
+ },
+ {
+ "start": 1689,
+ "end": 1692,
+ "name": "bibliography_ref",
+ "value": "bac4e5f6-f290-11ee-a6ed-b88584b4e4a1"
+ },
+ {
+ "start": 1635,
+ "end": 1638,
+ "name": "bibliography_ref",
+ "value": "bac4e634-f290-11ee-a6ed-b88584b4e4a1"
+ },
+ {
+ "start": 1692,
+ "end": 1695,
+ "name": "bibliography_ref",
+ "value": "bac4e634-f290-11ee-a6ed-b88584b4e4a1"
+ },
+ {
+ "start": 1638,
+ "end": 1641,
+ "name": "bibliography_ref",
+ "value": "bac4e63d-f290-11ee-a6ed-b88584b4e4a1"
+ },
+ {
+ "start": 1677,
+ "end": 1680,
+ "name": "bibliography_ref",
+ "value": "bac4e42a-f290-11ee-a6ed-b88584b4e4a1"
+ },
+ {
+ "start": 1680,
+ "end": 1683,
+ "name": "bibliography_ref",
+ "value": "bac4e46d-f290-11ee-a6ed-b88584b4e4a1"
+ },
+ {
+ "start": 1686,
+ "end": 1689,
+ "name": "bibliography_ref",
+ "value": "bac4e539-f290-11ee-a6ed-b88584b4e4a1"
+ },
+ {
+ "start": 3412,
+ "end": 3416,
+ "name": "bibliography_ref",
+ "value": "bac4e539-f290-11ee-a6ed-b88584b4e4a1"
+ },
+ {
+ "start": 4544,
+ "end": 4548,
+ "name": "bibliography_ref",
+ "value": "bac4e539-f290-11ee-a6ed-b88584b4e4a1"
+ },
+ {
+ "start": 5206,
+ "end": 5210,
+ "name": "bibliography_ref",
+ "value": "bac4e539-f290-11ee-a6ed-b88584b4e4a1"
+ },
+ {
+ "start": 6249,
+ "end": 6253,
+ "name": "bibliography_ref",
+ "value": "bac4e539-f290-11ee-a6ed-b88584b4e4a1"
+ },
+ {
+ "start": 2381,
+ "end": 2385,
+ "name": "bibliography_ref",
+ "value": "bac4e499-f290-11ee-a6ed-b88584b4e4a1"
+ },
+ {
+ "start": 2405,
+ "end": 2408,
+ "name": "bibliography_ref",
+ "value": "bac4e461-f290-11ee-a6ed-b88584b4e4a1"
+ },
+ {
+ "start": 2640,
+ "end": 2643,
+ "name": "bibliography_ref",
+ "value": "bac4e43e-f290-11ee-a6ed-b88584b4e4a1"
+ },
+ {
+ "start": 3306,
+ "end": 3310,
+ "name": "bibliography_ref",
+ "value": "bac4e4b2-f290-11ee-a6ed-b88584b4e4a1"
+ }
+ ],
+ "metadata": {
+ "paragraph_type": "raw_text",
+ "page_id": 0,
+ "line_id": 0,
+ "other_fields": {}
+ },
+ "subparagraphs": []
+ }
+ ]
+ },
+ {
+ "node_id": "0.5",
+ "text": "Methodology & limitations",
+ "annotations": [],
+ "metadata": {
+ "paragraph_type": "section",
+ "page_id": 0,
+ "line_id": 0,
+ "other_fields": {}
+ },
+ "subparagraphs": [
+ {
+ "node_id": "0.5.0",
+ "text": "The main goal of this paper is to provide sound techniques to evaluate how leakage-resilient PRGs/PRFs and masking combine. In this section, we provide a brief description of the methodology we will use for this purpose, and underline its limitations. The two main components, namely performance and security evaluations, are detailed in Sections 3 and 4, and then combined in Section 5. Our proposal essentially holds in five steps that we detail below.1. Fix the target security level. In the following, we will take the AES Rijndael with 128-bit key as case study. Since a small security degradation due to side-channel attacks is unavoidable, we will consider 120-bit, 100-bit and 80-bit target security levels for illustration. We do not go below 80-bit keys since it typically corresponds to current short-term security levels [9].2. Choose an implementation. Given a cryptographic algorithm, this essentially corresponds to the selection of a technology and possibly a set of countermeasures to incorporate in the designs to evaluate. In the following, we will consider both software and hardware implementations for illustration, since they lead to significantly different performance and security levels. As for countermeasures, different types of masking schemes will be considered.3. Evaluate performances / extract a cost function. Given an implementation, different metrics can be selected for this purpose (such as code size, RAM, or cycle count in software and area, frequency, throughput or power consumption in hardware). Both for software and hardware implementations, we will use combined functions, namely the \"code size × cycle count\" product and the \"area / throughput\" ratio. While our methodology would be perfectly applicable to other choices of metrics, we believe they are an interesting starting point to capture the efficiency of our different implementations. In particular for the hardware cases, such metrics are less dependent on the serial vs. parallel nature of the target architectures (see [26], Section 2).4. Evaluate security / extract the maximum number of measurements. This central part of our analysis first requires to select the attacks from which we will evaluate security. In the following, we will consider the \"standard DPA attacks\" described in [31] for this purpose. Furthermore, we will investigate them in the profiled setting of template attacks (i.e. assuming that the adversary can build a precise model for the leakage function) [6]. This choice is motivated by the goal of approaching worst-case evaluations [56]. Based on these attacks, we will estimate the security graphs introduced in [61], i.e. compute the adversaries' success rates in function of their time complexity and number of measurements. From a given security level (e.g. 120-bit time complexity), we will finally extract the maximum number of measurements per key tolerated, as can be bounded by the PRG construction1 .",
+ "annotations": [
+ {
+ "start": 833,
+ "end": 836,
+ "name": "bibliography_ref",
+ "value": "bac4e46b-f290-11ee-a6ed-b88584b4e4a1"
+ },
+ {
+ "start": 2027,
+ "end": 2031,
+ "name": "bibliography_ref",
+ "value": "bac4e4f7-f290-11ee-a6ed-b88584b4e4a1"
+ },
+ {
+ "start": 2295,
+ "end": 2299,
+ "name": "bibliography_ref",
+ "value": "bac4e51d-f290-11ee-a6ed-b88584b4e4a1"
+ },
+ {
+ "start": 2486,
+ "end": 2489,
+ "name": "bibliography_ref",
+ "value": "bac4e456-f290-11ee-a6ed-b88584b4e4a1"
+ },
+ {
+ "start": 2566,
+ "end": 2570,
+ "name": "bibliography_ref",
+ "value": "bac4e5e6-f290-11ee-a6ed-b88584b4e4a1"
+ },
+ {
+ "start": 2647,
+ "end": 2651,
+ "name": "bibliography_ref",
+ "value": "bac4e61b-f290-11ee-a6ed-b88584b4e4a1"
+ }
+ ],
+ "metadata": {
+ "paragraph_type": "raw_text",
+ "page_id": 0,
+ "line_id": 0,
+ "other_fields": {}
+ },
+ "subparagraphs": []
+ }
+ ]
+ },
+ {
+ "node_id": "0.6",
+ "text": "5.",
+ "annotations": [],
+ "metadata": {
+ "paragraph_type": "section",
+ "page_id": 0,
+ "line_id": 0,
+ "other_fields": {}
+ },
+ "subparagraphs": [
+ {
+ "node_id": "0.6.0",
+ "text": "Compute a global cost metric (possibly with an application constraint). In case of security-bounded implementations, the previous security evaluation can be used to estimate how frequently one has to \"re-key\" within a leakageresilient construction. From this estimate, we derive the average number of AES encryptions to execute per 128-bit output. By multiplying this number with the cost function of our performance evaluations, we obtain a global metric for the implementation of an AES-based design ensuring a given security level. In case of security-unbounded implementations, re-keying is not sufficient to maintain the target security level independent of the number of measurements performed by the adversary. So the cost functions have to be combined with an application constraint, stating the maximum number of measurements that can be tolerated to maintain this security level.Quite naturally, such a methodology is limited in the same way as any performance and security evaluation. From the performance point-of-view, our investigations only apply to a representative subset of the (large) set of AES designs published in the literature. Because of place constraints, we first paid attention to state-of-the-art implementations and countermeasures, but applying our methodology to more examples is naturally feasible (and desirable). A very similar statement holds for security evaluations. Namely, we considered standard DPA attacks as a starting point, and because they typically correspond to the state-of-the-art in research and evaluation laboratories. Yet, cryptanalytic progresses can always appear2 . Besides, countermeasures such as masking may rely on physical assumptions that are difficult to compare rigorously (since highly technology-dependent), as will be detailed next with the case of \"glitches\".Note that these limitations are to a large extent inherent to the problem we tackle, and our results also correspond to the best we can hope in this respect. Hence, more than the practical conclusions that we draw in the following sections (that are of course important for current engineers willing to implement physically secure designs), it is the fact that we are able to compare the performance vs. security tradeoffs corresponding to the combination of leakage-resilient constructions with masking that is the most important contribution of this work. Indeed, these comparisons are dependent on the state-of-the-art implementations and attacks that are considered to be relevant for the selected algorithm.",
+ "annotations": [],
+ "metadata": {
+ "paragraph_type": "raw_text",
+ "page_id": 0,
+ "line_id": 0,
+ "other_fields": {}
+ },
+ "subparagraphs": []
+ }
+ ]
+ },
+ {
+ "node_id": "0.7",
+ "text": "Performance evaluations",
+ "annotations": [],
+ "metadata": {
+ "paragraph_type": "section",
+ "page_id": 0,
+ "line_id": 0,
+ "other_fields": {}
+ },
+ "subparagraphs": [
+ {
+ "node_id": "0.7.0",
+            "text": "In this section, we provide our performance evaluations for unprotected and masked AES designs. As previously mentioned, we will consider both software and hardware examples for this purpose. In this context, the main challenge is to find implementations that are (reasonably) comparable. This turned out to be relatively easy in the software case, for which we selected a couple of implementations in 8-bit microcontrollers, i.e. typical targets for side-channel analysis. By contrast, finding implementations in the same technology turns out to be more challenging in hardware: transistor sizes have evolved from (more than) 130µm to (less than) 65ηm over the last 15 years (i.e. the period over which most countermeasures against side-channel attacks have been proposed). Hence, published performance evaluations for side-channel protected designs are rarely comparable. Yet, we could find several designs in a recent FPGA technology, namely the Xilinx Virtex-5 devices (that are based on a 65ηm process).The performances of the implementations we will analyze are summarized in Table 1. As previously mentioned, our software cost function is the frequently considered \"code size × cycle count\" metric, while we use the \"area / throughput\" ratio in the hardware (FPGA) case. As for the countermeasures evaluated, we first focused on the higher-order masking scheme proposed by Rivain and Prouff at CHES 2010, which can be considered as the state-of-the-art in software [53]. We then added the CHES 2011 polynomial masking scheme of Prouff and Roche [45] (and its implementation in [20]), as a typical example of \"glitchresistant\" solution relying on secret sharing and multiparty computation (see the discussion in the next paragraph). A similar variety of countermeasures is proposed in hardware, where we also consider an efficient but glitch-sensitive implementation proposed in [48], and a threshold AES implementation that is one of the most promising solutions to deal with glitches in this case [36]. Note that this latter implementation is based on an 8-bit architecture (rather than a 128-bit one for the others). So although our cost function is aimed at making comparisons between different architectures more reflective of the algorithms' and countermeasures' performances, more serial implementations as this one generally pay a small overhead due to their more complex control logic.Physical assumptions and glitches. As explicit in Table 1, countermeasures against side-channel attacks always rely on a number of physical assumptions.In the case of masking, a central one is that the leakage of the shares manipulated by the target implementation should be independent of each other [22]. Glitches, that are transient signals appearing during the computations in certain (e.g. CMOS) implementations, are a typical physical default that can cause this assumption to fail, as first put forward by Mangard et al. in [32]. There are two possible solutions to deal with such physical defaults: either by making explicit to cryptographic engineers that they have to prevent glitches at the physical level, or by designing countermeasures that can cope with glitches.Interestingly, the first solution is one aspect where hardware and software implementations significantly differ. Namely, while it is usually possible to ensure independent leakages in masked software, by ensuring a sufficient time separation between the manipulation of the shares, it is extremely difficult to avoid glitches in hardware [33]. \nYet, even in hardware it is generally expected that the \"glitch signal\" will be more difficult to exploit by adversaries, especially if designers pay attention to this issue [35]. In this context, the main question is to determine the amplitude of this signal: if sufficiently reduced in front of the measurement noise, it may turn out that a glitch-sensitive masked implementation leads to improved security levels (compared to an unprotected one). Since this amplitude is highly technology-dependent, we will use it as a parameter to analyze the security of our hardware implementations in the next sections. Yet, we recall that it is a safe practice to focus on glitch-resistant implementations when it comes to hardware. Besides, we note that glitches are not the only physical default that may cause the independent leakage assumption to be contradicted in practice [42,51].",
+ "annotations": [
+ {
+ "start": 1088,
+ "end": 1089,
+ "name": "table",
+ "value": "d2ce350a-25be-4d05-9061-6f1d4cf8bdd1"
+ },
+ {
+ "start": 2456,
+ "end": 2457,
+ "name": "table",
+ "value": "d2ce350a-25be-4d05-9061-6f1d4cf8bdd1"
+ },
+ {
+ "start": 1472,
+ "end": 1476,
+ "name": "bibliography_ref",
+ "value": "bac4e5cd-f290-11ee-a6ed-b88584b4e4a1"
+ },
+ {
+ "start": 1552,
+ "end": 1556,
+ "name": "bibliography_ref",
+ "value": "bac4e58b-f290-11ee-a6ed-b88584b4e4a1"
+ },
+ {
+ "start": 1584,
+ "end": 1588,
+ "name": "bibliography_ref",
+ "value": "bac4e4c5-f290-11ee-a6ed-b88584b4e4a1"
+ },
+ {
+ "start": 1885,
+ "end": 1889,
+ "name": "bibliography_ref",
+ "value": "bac4e5a0-f290-11ee-a6ed-b88584b4e4a1"
+ },
+ {
+ "start": 2005,
+ "end": 2009,
+ "name": "bibliography_ref",
+ "value": "bac4e549-f290-11ee-a6ed-b88584b4e4a1"
+ },
+ {
+ "start": 2701,
+ "end": 2705,
+ "name": "bibliography_ref",
+ "value": "bac4e4d6-f290-11ee-a6ed-b88584b4e4a1"
+ },
+ {
+ "start": 2931,
+ "end": 2935,
+ "name": "bibliography_ref",
+ "value": "bac4e526-f290-11ee-a6ed-b88584b4e4a1"
+ },
+ {
+ "start": 3517,
+ "end": 3521,
+ "name": "bibliography_ref",
+ "value": "bac4e531-f290-11ee-a6ed-b88584b4e4a1"
+ },
+ {
+ "start": 3697,
+ "end": 3701,
+ "name": "bibliography_ref",
+ "value": "bac4e541-f290-11ee-a6ed-b88584b4e4a1"
+ },
+ {
+ "start": 4394,
+ "end": 4398,
+ "name": "bibliography_ref",
+ "value": "bac4e575-f290-11ee-a6ed-b88584b4e4a1"
+ },
+ {
+ "start": 4398,
+ "end": 4401,
+ "name": "bibliography_ref",
+ "value": "bac4e5bc-f290-11ee-a6ed-b88584b4e4a1"
+ }
+ ],
+ "metadata": {
+ "paragraph_type": "raw_text",
+ "page_id": 0,
+ "line_id": 0,
+ "other_fields": {}
+ },
+ "subparagraphs": []
+ }
+ ]
+ },
+ {
+ "node_id": "0.8",
+ "text": "Security evaluations",
+ "annotations": [],
+ "metadata": {
+ "paragraph_type": "section",
+ "page_id": 0,
+ "line_id": 0,
+ "other_fields": {}
+ },
+ "subparagraphs": [
+ {
+ "node_id": "0.8.0",
+            "text": "We now move to the core of our analysis, namely the security evaluation of different implementations. For this purpose, we first need to discuss the type of security evaluation we will conduct, which can be viewed as a tradeoff between generality and informativeness. That is, one ideally wants to reach general conclusions in the sense that they are independent of the underlying device technology. A typical solution for this purpose is to evaluate the \"security order\" of a countermeasure, as defined by Coron et al. [8]. Informally, the security order corresponds to the largest moment in the leakage probability distributions that is key-independent (hence from which no information can be extracted). For example, an unprotected implementation can be attacked by computing mean values (i.e. first-order moments) [28]. By contrast, the hope of masking is to ensure that adversaries will have to estimate higher-order moments, which is expected to increase the data complexity required to extract information, as first shown by Chari et al. [5]. Evaluating the order is interesting because under the independent leakage assumption mentioned in the last section, it can be done based on the mathematical description of a countermeasure only. Of course, the informativeness of such an abstract evaluation is limited since (1) it indeed does not allow testing whether the independent leakage assumption is fulfilled, and (2) even if this assumption is fulfilled, there is no strict correspondance between the security order and the security level of an implementation (e.g. measured with a probability of success corresponding to some bounded complexities). This is because already for masking (i.e. the countermeasure that aims at increasing the security order), and even if independent leakages are observed in practice, the actual complexity of a side-channel attack highly depends on the amount of noise in the measurements. And of course, there are also countermeasures that simply do not aim at increasing the security order, e.g. shuffling [21].One appealing way to mitigate the second issue is to perform so-called \"simulated attacks\". This essentially requires to model the leakage corresponding to different sensitive operations in an idealized implementation. For example, a usual approximation is to consider that all the intermediate values during a cryptographic computation (such as the S-boxes inputs and outputs for a block cipher) leak the sum of their Hamming weight and a Gaussian distributed noise [30]. It is then possible to accurately estimate the evaluation metrics proposed in [56] (i.e. mutual information, success rate, guessing entropy) from these mathematically generated leakages. Furthermore, one can use the noise variance as a security parameter and analyze its impact on the time and data complexity of successful attacks. Quite naturally, such an alternative still does not solve the first issue (i.e. the independent leakage assumption), for which the only possibility is to evaluate the real measurements of an actual implementation, in a given technology. This latter solution is admittedly the most informative, but also the least general, and is quite intensive for comparison purposes (since it requires to have access to source codes, target devices and measurement setups for all the designs to evaluate). Interestingly, it has been shown that simulated attacks can be quite close to real ones in the context of standard DPA and masking [59]. \nSo since our goal is to show that there exist realistic scenarios where leakage-resilient PRGs/PRFs and masking are useful ingredients to reach a given security level at the lowest cost, we will use this type of evaluations in the following.Note finally that performing simulated attacks could not be replaced by computing explicit formulae for the success rate such as, e.g. [16,52]. Indeed, these formulae only predict subkey (typically key bytes) recoveries while we consider security graphs for full 128-bit master keys. Beside, they are only applicable to unprotected devices so far, and hardly capture masked implementations and the effect of key-dependent algorithmic noise as we will consider next.",
+ "annotations": [
+ {
+ "start": 520,
+ "end": 523,
+ "name": "bibliography_ref",
+ "value": "bac4e463-f290-11ee-a6ed-b88584b4e4a1"
+ },
+ {
+ "start": 818,
+ "end": 822,
+ "name": "bibliography_ref",
+ "value": "bac4e505-f290-11ee-a6ed-b88584b4e4a1"
+ },
+ {
+ "start": 1045,
+ "end": 1048,
+ "name": "bibliography_ref",
+ "value": "bac4e44c-f290-11ee-a6ed-b88584b4e4a1"
+ },
+ {
+ "start": 2048,
+ "end": 2052,
+ "name": "bibliography_ref",
+ "value": "bac4e4cd-f290-11ee-a6ed-b88584b4e4a1"
+ },
+ {
+ "start": 2520,
+ "end": 2524,
+ "name": "bibliography_ref",
+ "value": "bac4e516-f290-11ee-a6ed-b88584b4e4a1"
+ },
+ {
+ "start": 2604,
+ "end": 2608,
+ "name": "bibliography_ref",
+ "value": "bac4e5e6-f290-11ee-a6ed-b88584b4e4a1"
+ },
+ {
+ "start": 3482,
+ "end": 3486,
+ "name": "bibliography_ref",
+ "value": "bac4e602-f290-11ee-a6ed-b88584b4e4a1"
+ },
+ {
+ "start": 3864,
+ "end": 3868,
+ "name": "bibliography_ref",
+ "value": "bac4e4a3-f290-11ee-a6ed-b88584b4e4a1"
+ },
+ {
+ "start": 3868,
+ "end": 3871,
+ "name": "bibliography_ref",
+ "value": "bac4e5c4-f290-11ee-a6ed-b88584b4e4a1"
+ }
+ ],
+ "metadata": {
+ "paragraph_type": "raw_text",
+ "page_id": 0,
+ "line_id": 0,
+ "other_fields": {}
+ },
+ "subparagraphs": []
+ }
+ ]
+ },
+ {
+ "node_id": "0.9",
+ "text": "Evaluation setups",
+ "annotations": [],
+ "metadata": {
+ "paragraph_type": "section",
+ "page_id": 0,
+ "line_id": 0,
+ "other_fields": {}
+ },
+ "subparagraphs": [
+ {
+ "node_id": "0.9.0",
+            "text": "We will consider two types of setups in our evaluations: one for software, one for hardware. As illustrated in Figure 3 in the case of a Boolean-masked S-box implementation with two shares, the main difference is that the software performs all the operations sequentially, while the hardware performs them in parallel. We will further assume that the leakage of parallel operations is summed [40]. As previously mentioned, we will illustrate our analyses with a Hamming weight leakage function. Additionally, we will consider a noise variance of 10, corresponding to a Signal-to-Noise Ratio of 0.2 (as defined in [29]) 3 . This is a typical value, both for software implementations [11] and FPGA measurement boards [25].Let us denote the AES S-box as S, a byte of plaintext and key as x i and k i (respectively), the random shares used in masking as r j i (before the S-box) and m j i (after the S-box), the Hamming weight function as HW, the bitwise XOR as ⊕, the field multiplication used in polynomial masking as ⊗, and Gaussiandistributed noise random variables N j i . From these notations, we can specify the list of all our target implementations as summarized in Table 2.A couple of observations are worth being underlined as we now discuss.First, and as already mentioned, the main difference between software and hardware implementations is the number of exploitable leakage samples: there is a single such sample per plaintext in hardware while there are 16×(N m +1) ones in software (with N m the number of masks). Next, we only considered glitches in hardware (since it is generally possible to ensure independent leakage in software, by ensuring a sufficient time separation between the manipulation of the shares). We assumed that \"first-order glitches\" can appear in our Boolean-masked FPGA implementation, and modeled the impact of the mask as an additive binomial noise in this case. We further assumed that the amplitude of this first-order signal was reduced according to a factor f . This factor corresponds to the parameter used to quantify the amplitude of the glitches mentioned in the previous section. Note that this modeling is sound because the complexity of a first-order DPA only depends on the value of its SNR (which is equivalent to correlation and information theoretic metrics in this case, as proven in [31]). So even leakage functions deviating from the Hamming weight abstraction would lead to similar trends. Since the threshold implementation in [36] guarantees the absence of firstorder glitches, we only analyzed the possibility of second-order glitches for this one, and modeled them in the same way as just described (i.e. by considering the second mask M 2 i as an additive binomial noise, and reducing the amplitude of the second-order signal by a factor f ). Third, the chosen-plaintext construction of [34] is only applicable in hardware. Furthermore, we only evaluated its impact for the unprotected implementation, and the 1-mask Boolean one with glitches. As will become clear in the next section, this is because the data complexity bound to 256 (that is the maximum tolerated by design in this case) is only relevant when successful side-channel attacks occur for such small complexities (which was only observed for implementations with first-order signal).For convenience, we denoted each implementation in our experiments with three letters. The first one corresponds to the type of scenario considered, i.e. with Known (K) or carefully Chosen (C) plaintexts. \nThe second one indicates [20,45]2nd-order KP whether we are in a Software (S) or Hardware (H) case study. The third one corresponds to the type of countermeasure selected, i.e. Unprotected (U), 1-or 2-mask Boolean (B 1 , B 2 ), 1-mask Polynomial (P 1 ) and 2-mask threshold (T 2 ). The additional star signals finally reflect the presence of (first-order or secondorder) glitches. For example, KHB * 1 is an AES design protected with a 1-mask Boolean scheme, implemented in an imperfect hardware leading to first-order glitches, and analyzed in the context of known (uniform) plaintexts.",
+ "annotations": [
+ {
+ "start": 392,
+ "end": 396,
+ "name": "bibliography_ref",
+ "value": "bac4e568-f290-11ee-a6ed-b88584b4e4a1"
+ },
+ {
+ "start": 613,
+ "end": 617,
+ "name": "bibliography_ref",
+ "value": "bac4e50e-f290-11ee-a6ed-b88584b4e4a1"
+ },
+ {
+ "start": 682,
+ "end": 686,
+ "name": "bibliography_ref",
+ "value": "bac4e476-f290-11ee-a6ed-b88584b4e4a1"
+ },
+ {
+ "start": 715,
+ "end": 719,
+ "name": "bibliography_ref",
+ "value": "bac4e4ee-f290-11ee-a6ed-b88584b4e4a1"
+ },
+ {
+ "start": 2339,
+ "end": 2343,
+ "name": "bibliography_ref",
+ "value": "bac4e51d-f290-11ee-a6ed-b88584b4e4a1"
+ },
+ {
+ "start": 2486,
+ "end": 2490,
+ "name": "bibliography_ref",
+ "value": "bac4e549-f290-11ee-a6ed-b88584b4e4a1"
+ },
+ {
+ "start": 2850,
+ "end": 2854,
+ "name": "bibliography_ref",
+ "value": "bac4e539-f290-11ee-a6ed-b88584b4e4a1"
+ },
+ {
+ "start": 3542,
+ "end": 3546,
+ "name": "bibliography_ref",
+ "value": "bac4e4c5-f290-11ee-a6ed-b88584b4e4a1"
+ },
+ {
+ "start": 3546,
+ "end": 3548,
+ "name": "bibliography_ref",
+ "value": "bac4e58b-f290-11ee-a6ed-b88584b4e4a1"
+ },
+ {
+ "start": 1177,
+ "end": 1178,
+ "name": "table",
+ "value": "6e093372-d147-4245-8aab-08ed5fe5c072"
+ }
+ ],
+ "metadata": {
+ "paragraph_type": "raw_text",
+ "page_id": 0,
+ "line_id": 0,
+ "other_fields": {}
+ },
+ "subparagraphs": []
+ }
+ ]
+ },
+ {
+ "node_id": "0.10",
+ "text": "Template attacks and security graphs",
+ "annotations": [],
+ "metadata": {
+ "paragraph_type": "section",
+ "page_id": 0,
+ "line_id": 0,
+ "other_fields": {}
+ },
+ "subparagraphs": [
+ {
+ "node_id": "0.10.0",
+ "text": "Given the leakage functions defined in Table 2, a template attack first requires to build a leakage model. In the following, and for each byte of the AES master key, we will consider Gaussian templates for unprotected implementations, and Gaussian for masked implementations. Let us denote the probability density function of a Gaussian distribution taken on input z, with mean µ (resp. mean vector µ) and variance σ 2 (resp. covariance matrix Σ) as N (z|µ, σ 2 ) (resp. N (z|µ, Σ)). This notation directly leads to models of the form:Prfor (software and hardware) unprotected implementations and:Prfor (software and hardware) masked implementations with two shares. The formula naturally extends to more shares, by just adding more sums over the masks. Note that in these models, all the noise (including the algorithmic one in hardware implementations) is captured by the Gaussian distribution 4 . Given these models, the template adversary will accumulate information on the key bytes k i , by computing products of probabilities corresponding to multiple plaintexts. Doing so and for each key byte, he will produce lists of 256 probabilities corresponding each possible candidate ki , defined as follows:i ],with the leakage vector L (j) respectively corresponding to l (j) i (resp. l (j) ) in the context of Equ. 1 (resp. Equ. 2) and l 1,(j) i , l 2,(j) i (resp. l (j) ) in the context of Equ. 3 (resp. Equ. 4) The number of measurements is given by q in Equ. 5. Next and for each target implementation, we will repeat 100 experiments. And for each value of q in these experiments, use a rank estimation algorithm to evaluate the time complexity needed to recover the full AES master key [61]. Eventually, we will build \"security graphs\" where the attack probability of success is provided in function of a time complexity and a number of measurements.Iterative DPA against constructions with carefully chosen plaintexts. Note that while standard DPA attacks are adequate to analyze the security of unprotected and masked implementations in a known-plaintext scenario, their divide-and-conquer strategy hardly applies to the PRF in [34], with carefullychosen plaintexts leading to key-dependent algorithmic noise. This is because the (maximum 256) constants c j used in this proposal are such that all 16 bytes are always identical. Hence, a standard DPA will provide a single list of probabilities, containing information about the 16 AES key bytes at once. In this case, we additionally considered the iterative DPA described in this previous reference, which essentially works by successively removing the algorithmic noise generated by the best-rated key bytes. While such an attack can only work under the assumption that the adversary has an very precise leakage model in hand, we use it as a representative of worst-case attack against such a construction.",
+ "annotations": [
+ {
+ "start": 45,
+ "end": 46,
+ "name": "table",
+ "value": "6e093372-d147-4245-8aab-08ed5fe5c072"
+ },
+ {
+ "start": 1693,
+ "end": 1697,
+ "name": "bibliography_ref",
+ "value": "bac4e61b-f290-11ee-a6ed-b88584b4e4a1"
+ },
+ {
+ "start": 2137,
+ "end": 2141,
+ "name": "bibliography_ref",
+ "value": "bac4e539-f290-11ee-a6ed-b88584b4e4a1"
+ }
+ ],
+ "metadata": {
+ "paragraph_type": "raw_text",
+ "page_id": 0,
+ "line_id": 0,
+ "other_fields": {}
+ },
+ "subparagraphs": []
+ }
+ ]
+ },
+ {
+ "node_id": "0.11",
+ "text": "Experimental results",
+ "annotations": [],
+ "metadata": {
+ "paragraph_type": "section",
+ "page_id": 0,
+ "line_id": 0,
+ "other_fields": {}
+ },
+ "subparagraphs": [
+ {
+ "node_id": "0.11.0",
+ "text": "For illustration, the security graph of the AES implementation KHB 1 is given in Figure 4, where we additionally provide the maximum number of measurements tolerated to maintain security levels corresponding to 2 120 , 2 100 and 2 80 time complexity. All the implementations in Table 2 have been similarly evaluated and the result of these experiments are in Appendix A, Figures 8 to 13. Note that in the aforementioned case of iterative DPA (Appendix A, Figure 14), the adversary recovers the AES key bytes but still has to find their position within the AES state, which (roughly) corresponds to 16! ≈ 2 44 possibilities [2].",
+ "annotations": [
+ {
+ "start": 284,
+ "end": 285,
+ "name": "table",
+ "value": "6e093372-d147-4245-8aab-08ed5fe5c072"
+ },
+ {
+ "start": 623,
+ "end": 626,
+ "name": "bibliography_ref",
+ "value": "bac4e432-f290-11ee-a6ed-b88584b4e4a1"
+ }
+ ],
+ "metadata": {
+ "paragraph_type": "raw_text",
+ "page_id": 0,
+ "line_id": 0,
+ "other_fields": {}
+ },
+ "subparagraphs": []
+ }
+ ]
+ },
+ {
+ "node_id": "0.12",
+ "text": "Security vs. performance tradeoffs",
+ "annotations": [],
+ "metadata": {
+ "paragraph_type": "section",
+ "page_id": 0,
+ "line_id": 0,
+ "other_fields": {}
+ },
+ "subparagraphs": [
+ {
+ "node_id": "0.12.0",
+ "text": "We now combine the results in the previous sections to answer our main question. Namely, what is the best way to exploit masking and/or leakage-resilient primitives to resist standard DPA in hardware and software implementations?",
+ "annotations": [],
+ "metadata": {
+ "paragraph_type": "raw_text",
+ "page_id": 0,
+ "line_id": 0,
+ "other_fields": {}
+ },
+ "subparagraphs": []
+ }
+ ]
+ },
+ {
+ "node_id": "0.13",
+ "text": "Leakage-resilient PRGs",
+ "annotations": [],
+ "metadata": {
+ "paragraph_type": "section",
+ "page_id": 0,
+ "line_id": 0,
+ "other_fields": {}
+ },
+ "subparagraphs": [
+ {
+ "node_id": "0.13.0",
+ "text": "Let M be the maximum number of measurements tolerated to maintain a given security level for one of the implementations in section 4. The re-keying in leakage-resilient PRGs is such that it is exactly this number M that is limited by design (i.e. the value N in Figure 1 bounds M for the adversary), hence directly leading to security-bounded implementations. The global cost metric we use in this case can be written as M M -1 × cost f unction, where the first factor corresponds to the average number of AES encryptions that are used to produce each 128-bit output string, and the second one is the cost function of Table 1.A comparison of different leakage-resilient PRG implementations in software (i.e. based on different unprotected and protected AES implementations) is given in Figure 5 for 80-bit and 120-bit security levels (the results for 100-bit security are in Appendix A, Figure 15, left). The main observation in this context is that the straightforward implementation of the PRG with an unprotected AES design is the most efficient solution. This is mainly because moving from the smallest M value (i.e. M = 2, as imposed by the 120-bit security level in the unprotected case -see Figure 8-left) to large ones (e.g. M > 1000 for masked implementations) can only lead to a gain factor of 2 for the global cost metric, which is not justified in view of the performance overheads due to the masking. For a similar reason (i.e. the limited interest of increasing M ), the global cost metric is essentially independent of the target security level in the figure. In other words, there is little interest in decreasing this security level since it leads to poor performance improvements. The hardware implementations in Appendix A, Figures 15-right and 16 lead to essentially similar intuitions, as also witnessed by the limited impact of decreasing the amplitude of the glitch signal with the f factor (see the KHB * 1 and KHT * 2 implementations for which f = 10 in the latter figures).",
+ "annotations": [
+ {
+ "start": 624,
+ "end": 625,
+ "name": "table",
+ "value": "d2ce350a-25be-4d05-9061-6f1d4cf8bdd1"
+ }
+ ],
+ "metadata": {
+ "paragraph_type": "raw_text",
+ "page_id": 0,
+ "line_id": 0,
+ "other_fields": {}
+ },
+ "subparagraphs": []
+ }
+ ]
+ },
+ {
+ "node_id": "0.14",
+ "text": "Leakage-resilient PRFs",
+ "annotations": [],
+ "metadata": {
+ "paragraph_type": "section",
+ "page_id": 0,
+ "line_id": 0,
+ "other_fields": {}
+ },
+ "subparagraphs": [
+ {
+ "node_id": "0.14.0",
+            "text": "Security-unbounded implementations. Let us now consider (stateless) leakage-resilient PRFs. As already mentioned, those constructions only bound the adversary's data complexity. The main observation in this case is that if random plaintexts are considered, such implementations can only be security-unbounded (with the slight cautionary note that we give below). This fact can be easily explained when the PRF is instantiated with an unprotected software implementation of the AES. What happens then is that the adversary can repeat his measurements to get rid of the physical noise, and consequently move from the security graph of Appendix A, Figure 8-left to the one of Appendix A, Figure 13-right. Such a \"repeating\" attack is exactly the one already mentioned in [34] to argue that bounded data complexity is not enough to bound (computational) security. In fact, it similarly applies to masked implementations. The only difference is that the adversary will not average his measurements, but rather combine them as in Equation 5. This is because given a leakage function, e.g. the Hamming weight one that leads to 9 distinguishable events, the distribution of the measurements in a masked implementation will lead to the same number of distinguishable events: the only difference is that more sampling will be necessary to distinguish them (see the appendices in [60] for a plot of these distributions). So if the number of measurements is not bounded, attacks with low time complexities as in Appendix A, Figure 13 right will always exist.One important consequence is that using the PRF construction in this context is essentially useless for all the AES implementations we consider in this paper. The only way to maintain a target security level for such stateless primitives is to limit the number of measurements by putting a constraint on the lifetime of the system. And this lifetime will be selected according to the maximum number of measurements tolerated that can be extracted from our security graphs, which now highly depends on the countermeasure selected. In other words, we can only evaluate the cost function and the security level attained independently in this case, as illustrated in Figure 6 for our software instances (the 100-bit security level is again given in Appendix A, Figure 17-left). Here, we naturally come back to the standard result that Boolean (resp. polynomial) masking increases security at the cost of performance overheads that are roughly quadratic (resp. cubic) in the number of shares. Note that the security level of the 1-mask polynomial scheme is higher than the 2-mask Boolean one for the noise variance we consider, which is consistent with the previous work of Roche and Prouff [54]. Similar conclusions are obtained with hardware implementations (Appendix A, Figure 17-right and Appendix A, Figure 18), for which the impact of glitches is now clearly visible. For example, a factor f = 10 essentially multiplies the number of measurements by f for the Boolean masking with first-order glitches, and f 2 for the threshold implementation with second-order glitches. Cautionary note. The statement that stateless leakage-resilient PRFs can only be security unbounded if known plaintexts are considered essentially relates to the fact that repeated measurements allow removing the effect of the noise and the masks in a leaking implementation. Yet, this claim should be slightly mitigated in the case of algorithmic noise in hardware implementations. \nIndeed, this part of the noise can only be averaged up to the data complexity bound that is imposed by the PRF design. Taking the example of our hardware implementations where all 16 S-boxes are manipulated in parallel, the SNR corresponding to algorithmic noise can be computed as the ratio between the variance of a uniformly distributed 8-bit values's Hamming weight (i.e. 2) and the variance of 15 such values (i.e. 30). Averaging this noise over M plaintexts will lead to SNRs of 1 15/M , which is already larger than 17 if M = 256 (i.e. a noise level for which the security graph will be extremely close to the worst case one of Appendix A, Figure 13-right). So although there is a \"gray area\" where a leakage-resilient PRF implemented in hardware can be (weakly) security-bounded, these contexts are of quite limited interest because the will imply bounds on the data complexity that are below 256, i.e. they anyway lead to less efficient solutions than the tweaked construction that we investigate in the next subsection.Security-bounded implementations. As just discussed, stateless primitives hardly lead to security bounded implementations if physical and algorithmic noise can be averaged -which is straightforwardly feasible in a known plaintext scenario. The tweaked construction in [34] aims at avoiding such a weakness by preventing the averaging of the algorithmic noise, thanks to the combined effect of hardware parallelism and carefully chosen plaintexts leading to keydependencies in this noise. Since only the physical noise can be averaged in this case, the bounded data complexity that the leakage-resilient PRF guarantees consequently leads to security-bounded implementations again. This is illustrated both by the standard DPAs (such as in Appendix A, Figures 10-right and 12-left) and the iterative attacks (such as in Appendix A, Figure 13) that can be performed against this PRF 5 . As in Section 5.1, we extracted the maximum data complexity D from these graphs, and produced as global cost metric:where the first factor corresponds to the (rounded) average number of AES encryptions needed to produce a 128-bit output, and the second one is the cost function of Table 1. A comparison of our different leakage-resilient PRFs instantiated with a hardware implementation of the AES and chosen plaintexts is given in Figure 7. Here again, we observe that the most efficient solution is to consider an unprotected design. Interestingly, we also observe that for the unprotected AES, the iterative attack is the worst case for the 80-bit security level (where it forces the re-keying after 97 plaintexts vs. 256 for the standard DPA), while the standard DPA is the worst-case for the 120-bit security level (where it forces the re-keying after 10 plaintexts vs. 37 for the iterative attack). This nicely fits the intuition that iterative attacks become more powerful as the data complexity increases, i.e. when the additional time complexity corresponding to the enumeration of a permutation over 16 bytes becomes small compared to the time complexity required to recover the 16 AES key bytes (unordered). ",
+ "annotations": [
+ {
+ "start": 768,
+ "end": 772,
+ "name": "bibliography_ref",
+ "value": "bac4e539-f290-11ee-a6ed-b88584b4e4a1"
+ },
+ {
+ "start": 4800,
+ "end": 4804,
+ "name": "bibliography_ref",
+ "value": "bac4e539-f290-11ee-a6ed-b88584b4e4a1"
+ },
+ {
+ "start": 1369,
+ "end": 1373,
+ "name": "bibliography_ref",
+ "value": "bac4e610-f290-11ee-a6ed-b88584b4e4a1"
+ },
+ {
+ "start": 2732,
+ "end": 2736,
+ "name": "bibliography_ref",
+ "value": "bac4e5d7-f290-11ee-a6ed-b88584b4e4a1"
+ },
+ {
+ "start": 5703,
+ "end": 5704,
+ "name": "table",
+ "value": "d2ce350a-25be-4d05-9061-6f1d4cf8bdd1"
+ }
+ ],
+ "metadata": {
+ "paragraph_type": "raw_text",
+ "page_id": 0,
+ "line_id": 0,
+ "other_fields": {}
+ },
+ "subparagraphs": []
+ }
+ ]
+ },
+ {
+ "node_id": "0.15",
+ "text": "Conclusion",
+ "annotations": [],
+ "metadata": {
+ "paragraph_type": "section",
+ "page_id": 0,
+ "line_id": 0,
+ "other_fields": {}
+ },
+ "subparagraphs": [
+ {
+ "node_id": "0.15.0",
+ "text": "The results in this work essentially show that masking and leakage-resilient constructions hardly combine constructively. For (stateful) PRGs, our experiments indicate that both for software and hardware implementations, a leakageresilient design instantiated with an unprotected AES is the most efficient solution to reach any given security level. For stateless PRFs, they rather show that a bounded data complexity guarantee is (mostly) ineffective in bounding the (computational) complexity of the best attacks. So implementing masking and limiting the lifetime of the cryptographic implementation is the best solution in this case. Nevertheless, the chosen-plaintext tweak proposed in [34] is an interesting exception to this conclusion, as it leads to security-bounded hardware implementations for stateless primitives that are particularly interesting from an application point-of-view, e.g. for re-synchronization, challenge-response protocols, . . . Beyond the further analysis of such constructions, their extension to software implementations is an interesting scope for further research. In this respect, the combination of a chosen-plaintext leakage-resilient PRF with the shuffling countermeasure in [62] seems promising, as it could \"emulate\" the keydependent algorithmic noise ensuring security bounds in hardware. ",
+ "annotations": [
+ {
+ "start": 690,
+ "end": 694,
+ "name": "bibliography_ref",
+ "value": "bac4e539-f290-11ee-a6ed-b88584b4e4a1"
+ },
+ {
+ "start": 1214,
+ "end": 1218,
+ "name": "bibliography_ref",
+ "value": "bac4e623-f290-11ee-a6ed-b88584b4e4a1"
+ }
+ ],
+ "metadata": {
+ "paragraph_type": "raw_text",
+ "page_id": 0,
+ "line_id": 0,
+ "other_fields": {}
+ },
+ "subparagraphs": []
+ }
+ ]
+ },
+ {
+ "node_id": "0.16",
+ "text": "A Additional figures",
+ "annotations": [],
+ "metadata": {
+ "paragraph_type": "section",
+ "page_id": 0,
+ "line_id": 0,
+ "other_fields": {}
+ },
+ "subparagraphs": []
+ },
+ {
+ "node_id": "0.17",
+ "text": "\n",
+ "annotations": [],
+ "metadata": {
+ "paragraph_type": "section",
+ "page_id": 0,
+ "line_id": 0,
+ "other_fields": {}
+ },
+ "subparagraphs": [
+ {
+ "node_id": "0.17.0",
+ "text": "Acknowledgements. F.-X. Standaert is an associate researcher of the . Work funded in parts by the through the project (CRASH) and the grant B- project.",
+ "annotations": [],
+ "metadata": {
+ "paragraph_type": "raw_text",
+ "page_id": 0,
+ "line_id": 0,
+ "other_fields": {}
+ },
+ "subparagraphs": []
+ }
+ ]
+ },
+ {
+ "node_id": "0.18",
+ "text": "
Acknowledgements. F.-X. Standaert is an associate researcher of the Belgian Fund for Scientific Research (FNRS-F.R.S.). Work funded in parts by the European Commission through the ERC project 280141 (CRASH) and the European ISEC action grant HOME/2010/ISEC/AG/INT-011 B-CCENTRE project.