diff --git a/backend/src/app/preprocessing/pipeline/steps/text/extract_text_from_html_and_create_source_mapping.py b/backend/src/app/preprocessing/pipeline/steps/text/extract_text_from_html_and_create_source_mapping.py index 798b1efc0..887272b3a 100644 --- a/backend/src/app/preprocessing/pipeline/steps/text/extract_text_from_html_and_create_source_mapping.py +++ b/backend/src/app/preprocessing/pipeline/steps/text/extract_text_from_html_and_create_source_mapping.py @@ -1,14 +1,20 @@ import re from html.parser import HTMLParser from itertools import accumulate -from typing import Dict, List, Union +from typing import List, Optional, TypedDict from app.preprocessing.pipeline.model.pipeline_cargo import PipelineCargo from app.preprocessing.pipeline.model.text.preprotextdoc import PreProTextDoc +class Text(TypedDict): + text: str + start: int + end: int + + class CustomLineHTMLParser(HTMLParser): - result: List[Dict[str, Union[str, int]]] + result: List[Text] def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -23,7 +29,7 @@ def current_index(self): line, char = self.getpos() return self.line_lengths[line - 1] + char - def __call__(self, data: str) -> List[Dict[str, Union[str, int]]]: + def __call__(self, data: str) -> List[Text]: self.reset() self.line_lengths = [0] + list( accumulate(len(line) for line in data.splitlines(keepends=True)) @@ -37,21 +43,13 @@ class HTMLTextMapper(CustomLineHTMLParser): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.result = [] - self.text = { - "text": "", - "start": 0, - "end": 0, - } + self.text: Optional[Text] = None self.end_spaces = 0 def reset(self): super().reset() self.result = [] - self.text = { - "text": "", - "start": 0, - "end": 0, - } + self.text = None def handle_data(self, data: str): # only add text if it is not only whitespaces! @@ -68,6 +66,7 @@ def handle_data(self, data: str): self.text = { "text": data.strip(), "start": self.current_index + start_spaces, + "end": -1, } def handle_starttag(self, tag, attrs): @@ -80,14 +79,11 @@ def handle_comment(self, data): self.text_end() def text_end(self): - self.text["end"] = self.current_index - self.end_spaces - self.result.append(self.text) - self.text = { - "text": "", - "start": 0, - "end": 0, - } - self.end_spaces = 0 + if self.text: + self.text["end"] = self.current_index - self.end_spaces + self.result.append(self.text) + self.text = None + self.end_spaces = 0 def close(self): super().close()