diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1584267192..a4018c1b25 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,12 @@
+## 0.16.6-dev0
+
+### Enhancements
+- **Every `<table>` tag is considered to be ontology.Table** Added special handling for tables in HTML partitioning. This change is made to improve the accuracy of table extraction from HTML documents.
+
+### Features
+
+### Fixes
+
## 0.16.5
### Enhancements
diff --git a/test_unstructured/documents/test_mappings.py b/test_unstructured/documents/test_mappings.py
new file mode 100644
index 0000000000..029bb544fb
--- /dev/null
+++ b/test_unstructured/documents/test_mappings.py
@@ -0,0 +1,53 @@
+from collections import defaultdict
+from typing import Dict, Type
+
+from unstructured.documents import elements, ontology
+from unstructured.documents.mappings import (
+ ALL_ONTOLOGY_ELEMENT_TYPES,
+ HTML_TAG_TO_DEFAULT_ELEMENT_TYPE_MAP,
+ ONTOLOGY_CLASS_TO_UNSTRUCTURED_ELEMENT_TYPE,
+ get_all_subclasses,
+)
+from unstructured.documents.ontology import OntologyElement
+
+
+def _get_exclusive_html_tags() -> dict[str, Type[OntologyElement]]:
+    """
+    Return each HTML tag allowed by exactly one OntologyElement type, keyed to that type.
+    """
+    types_per_tag: Dict[str, list[Type[OntologyElement]]] = defaultdict(list)
+    for ontology_type in ALL_ONTOLOGY_ELEMENT_TYPES:
+        for html_tag in ontology_type().allowed_tags:
+            types_per_tag[html_tag].append(ontology_type)
+
+    exclusive_tags: dict[str, Type[OntologyElement]] = {}
+    for html_tag, candidate_types in types_per_tag.items():
+        if len(candidate_types) == 1:  # tags shared by several types are not exclusive
+            exclusive_tags[html_tag] = candidate_types[0]
+    return exclusive_tags
+
+
+def test_if_all_exclusive_html_tags_are_mapped_to_ontology_elements():
+    # A tag allowed by exactly one ontology type must default to that very type.
+    for html_tag, ontology_type in _get_exclusive_html_tags().items():
+        assert html_tag in HTML_TAG_TO_DEFAULT_ELEMENT_TYPE_MAP
+        assert HTML_TAG_TO_DEFAULT_ELEMENT_TYPE_MAP[html_tag] == ontology_type
+
+
+def test_all_expected_ontology_types_are_subclasses_of_OntologyElement():
+    for default_type in HTML_TAG_TO_DEFAULT_ELEMENT_TYPE_MAP.values():  # default type per tag
+        assert issubclass(default_type, OntologyElement)
+
+
+def test_ontology_to_unstructured_mapping_has_valid_types():
+    """Values must be unstructured Element classes and keys must be ontology classes."""
+    mapping = ONTOLOGY_CLASS_TO_UNSTRUCTURED_ELEMENT_TYPE
+    for ontology_type in mapping:
+        unstructured_type = mapping[ontology_type]
+        assert issubclass(unstructured_type, elements.Element)
+        assert issubclass(ontology_type, ontology.OntologyElement)
+
+
+def test_all_ontology_elements_are_defined_in_mapping_to_unstructured():
+    for ontology_type in get_all_subclasses(ontology.OntologyElement):  # each subclass found
+        assert ontology_type in ONTOLOGY_CLASS_TO_UNSTRUCTURED_ELEMENT_TYPE
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index f4096753b3..a03340d1af 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.16.5" # pragma: no cover
+__version__ = "0.16.6-dev0" # pragma: no cover
diff --git a/unstructured/documents/mappings.py b/unstructured/documents/mappings.py
index 98b9f6b4f8..d6b45114ac 100644
--- a/unstructured/documents/mappings.py
+++ b/unstructured/documents/mappings.py
@@ -5,10 +5,10 @@
of parsed documents
"""
-from collections import defaultdict
from typing import Any, Dict, Type
-from unstructured.documents.ontology import OntologyElement
+from unstructured.documents import elements, ontology
+from unstructured.documents.elements import Element
def get_all_subclasses(cls) -> list[Any]:
@@ -30,25 +30,9 @@ def get_all_subclasses(cls) -> list[Any]:
return all_subclasses
-def get_exclusive_html_tags() -> dict[str, Type[OntologyElement]]:
+def get_ontology_to_unstructured_type_mapping() -> dict[Type[ontology.OntologyElement], Type[Element]]:
"""
- Get a mapping of HTML tags to their exclusive OntologyElement types.
- """
- html_tag_to_element_type_mappings: Dict[str, list[Type[OntologyElement]]] = defaultdict(list)
- for element_type in ALL_ONTOLOGY_ELEMENT_TYPES:
- for tag in element_type().allowed_tags:
- html_tag_to_element_type_mappings[tag].append(element_type)
-
- return {
- tag: element_types[0]
- for tag, element_types in html_tag_to_element_type_mappings.items()
- if len(element_types) == 1
- }
-
-
-def get_ontology_to_unstructured_type_mapping() -> dict[str, str]:
- """
- Get a mapping of ontology element names to unstructured type names.
+ Get a mapping of ontology element to unstructured type.
The dictionary here was created base on ontology mapping json
Can be generated via the following code:
@@ -63,97 +47,131 @@ def get_ontology_to_unstructured_type_mapping() -> dict[str, str]:
```
Returns:
- dict: A dictionary where keys are ontology element class names
- and values are unstructured type names.
+ dict: A dictionary where keys are ontology element classes
+ and values are unstructured types.
"""
ontology_to_unstructured_class_mapping = {
- "Document": "UncategorizedText",
- "Section": "UncategorizedText",
- "Page": "UncategorizedText",
- "Column": "UncategorizedText",
- "Paragraph": "NarrativeText",
- "Header": "Header",
- "Footer": "Footer",
- "Sidebar": "UncategorizedText",
- "PageBreak": "PageBreak",
- "Title": "Title",
- "Subtitle": "Title",
- "Heading": "Title",
- "NarrativeText": "NarrativeText",
- "Quote": "NarrativeText",
- "Footnote": "UncategorizedText",
- "Caption": "FigureCaption",
- "PageNumber": "PageNumber",
- "UncategorizedText": "UncategorizedText",
- "OrderedList": "UncategorizedText",
- "UnorderedList": "UncategorizedText",
- "DefinitionList": "UncategorizedText",
- "ListItem": "ListItem",
- "Table": "Table",
- "TableRow": "Table",
- "TableCell": "Table",
- "TableCellHeader": "Table",
- "TableBody": "Table",
- "TableHeader": "Table",
- "Image": "Image",
- "Figure": "Image",
- "Video": "UncategorizedText",
- "Audio": "UncategorizedText",
- "Barcode": "Image",
- "QRCode": "Image",
- "Logo": "Image",
- "CodeBlock": "CodeSnippet",
- "InlineCode": "CodeSnippet",
- "Formula": "Formula",
- "Equation": "Formula",
- "FootnoteReference": "UncategorizedText",
- "Citation": "UncategorizedText",
- "Bibliography": "UncategorizedText",
- "Glossary": "UncategorizedText",
- "Author": "UncategorizedText",
- "MetaDate": "UncategorizedText",
- "Keywords": "UncategorizedText",
- "Abstract": "NarrativeText",
- "Hyperlink": "UncategorizedText",
- "TableOfContents": "UncategorizedText",
- "Index": "UncategorizedText",
- "Form": "UncategorizedText",
- "FormField": "UncategorizedText",
- "FormFieldValue": "UncategorizedText",
- "Checkbox": "UncategorizedText",
- "RadioButton": "UncategorizedText",
- "Button": "UncategorizedText",
- "Comment": "UncategorizedText",
- "Highlight": "UncategorizedText",
- "RevisionInsertion": "UncategorizedText",
- "RevisionDeletion": "UncategorizedText",
- "Address": "Address",
- "EmailAddress": "EmailAddress",
- "PhoneNumber": "UncategorizedText",
- "CalendarDate": "UncategorizedText",
- "Time": "UncategorizedText",
- "Currency": "UncategorizedText",
- "Measurement": "UncategorizedText",
- "Letterhead": "Header",
- "Signature": "UncategorizedText",
- "Watermark": "UncategorizedText",
- "Stamp": "UncategorizedText",
+ ontology.Document: elements.Text,
+ ontology.Section: elements.Text,
+ ontology.Page: elements.Text,
+ ontology.Column: elements.Text,
+ ontology.Paragraph: elements.NarrativeText,
+ ontology.Header: elements.Header,
+ ontology.Footer: elements.Footer,
+ ontology.Sidebar: elements.Text,
+ ontology.PageBreak: elements.PageBreak,
+ ontology.Title: elements.Title,
+ ontology.Subtitle: elements.Title,
+ ontology.Heading: elements.Title,
+ ontology.NarrativeText: elements.NarrativeText,
+ ontology.Quote: elements.NarrativeText,
+ ontology.Footnote: elements.Text,
+ ontology.Caption: elements.FigureCaption,
+ ontology.PageNumber: elements.PageNumber,
+ ontology.UncategorizedText: elements.Text,
+ ontology.OrderedList: elements.Text,
+ ontology.UnorderedList: elements.Text,
+ ontology.DefinitionList: elements.Text,
+ ontology.ListItem: elements.ListItem,
+ ontology.Table: elements.Table,
+ ontology.TableRow: elements.Table,
+ ontology.TableCell: elements.Table,
+ ontology.TableCellHeader: elements.Table,
+ ontology.TableBody: elements.Table,
+ ontology.TableHeader: elements.Table,
+ ontology.Image: elements.Image,
+ ontology.Figure: elements.Image,
+ ontology.Video: elements.Text,
+ ontology.Audio: elements.Text,
+ ontology.Barcode: elements.Image,
+ ontology.QRCode: elements.Image,
+ ontology.Logo: elements.Image,
+ ontology.CodeBlock: elements.CodeSnippet,
+ ontology.InlineCode: elements.CodeSnippet,
+ ontology.Formula: elements.Formula,
+ ontology.Equation: elements.Formula,
+ ontology.FootnoteReference: elements.Text,
+ ontology.Citation: elements.Text,
+ ontology.Bibliography: elements.Text,
+ ontology.Glossary: elements.Text,
+ ontology.Author: elements.Text,
+ ontology.MetaDate: elements.Text,
+ ontology.Keywords: elements.Text,
+ ontology.Abstract: elements.NarrativeText,
+ ontology.Hyperlink: elements.Text,
+ ontology.TableOfContents: elements.Text,
+ ontology.Index: elements.Text,
+ ontology.Form: elements.Text,
+ ontology.FormField: elements.Text,
+ ontology.FormFieldValue: elements.Text,
+ ontology.Checkbox: elements.Text,
+ ontology.RadioButton: elements.Text,
+ ontology.Button: elements.Text,
+ ontology.Comment: elements.Text,
+ ontology.Highlight: elements.Text,
+ ontology.RevisionInsertion: elements.Text,
+ ontology.RevisionDeletion: elements.Text,
+ ontology.Address: elements.Address,
+ ontology.EmailAddress: elements.EmailAddress,
+ ontology.PhoneNumber: elements.Text,
+ ontology.CalendarDate: elements.Text,
+ ontology.Time: elements.Text,
+ ontology.Currency: elements.Text,
+ ontology.Measurement: elements.Text,
+ ontology.Letterhead: elements.Header,
+ ontology.Signature: elements.Text,
+ ontology.Watermark: elements.Text,
+ ontology.Stamp: elements.Text,
}
return ontology_to_unstructured_class_mapping
-ALL_ONTOLOGY_ELEMENT_TYPES = get_all_subclasses(OntologyElement)
-HTML_TAG_AND_CSS_NAME_TO_ELEMENT_TYPE_MAP: Dict[tuple[str, str], Type[OntologyElement]] = {
+ALL_ONTOLOGY_ELEMENT_TYPES = get_all_subclasses(ontology.OntologyElement)
+HTML_TAG_AND_CSS_NAME_TO_ELEMENT_TYPE_MAP: Dict[tuple[str, str], Type[ontology.OntologyElement]] = {
(tag, element_type().css_class_name): element_type
for element_type in ALL_ONTOLOGY_ELEMENT_TYPES
for tag in element_type().allowed_tags
}
-CSS_CLASS_TO_ELEMENT_TYPE_MAP: Dict[str, Type[OntologyElement]] = {
+CSS_CLASS_TO_ELEMENT_TYPE_MAP: Dict[str, Type[ontology.OntologyElement]] = {
element_type().css_class_name: element_type
for element_type in ALL_ONTOLOGY_ELEMENT_TYPES
for tag in element_type().allowed_tags
}
-EXCLUSIVE_HTML_TAG_TO_ELEMENT_TYPE_MAP: Dict[str, Type[OntologyElement]] = get_exclusive_html_tags()
-ONTOLOGY_CLASS_NAME_TO_UNSTRUCTURED_ELEMENT_TYPE_NAME = get_ontology_to_unstructured_type_mapping()
+HTML_TAG_TO_DEFAULT_ELEMENT_TYPE_MAP: Dict[str, Type[ontology.OntologyElement]] = {
+ "body": ontology.Document,
+ "footer": ontology.Footer,
+ "aside": ontology.Sidebar,
+ "hr": ontology.PageBreak,
+ "h3": ontology.Heading,
+ "h4": ontology.Heading,
+ "h5": ontology.Heading,
+ "h6": ontology.Heading,
+ "blockquote": ontology.Quote,
+ "figcaption": ontology.Caption,
+ "ol": ontology.OrderedList,
+ "li": ontology.ListItem,
+ "tbody": ontology.TableBody,
+ "thead": ontology.TableHeader,
+ "tr": ontology.TableRow,
+ "td": ontology.TableCell,
+ "th": ontology.TableCellHeader,
+ "figure": ontology.Figure,
+ "video": ontology.Video,
+ "audio": ontology.Audio,
+ "pre": ontology.CodeBlock,
+ "sub": ontology.FootnoteReference,
+ "cite": ontology.Citation,
+ "nav": ontology.Index,
+ "form": ontology.Form,
+ "label": ontology.FormField,
+ "button": ontology.Button,
+ "mark": ontology.Highlight,
+ "ins": ontology.RevisionInsertion,
+ "del": ontology.RevisionDeletion,
+ "address": ontology.Address,
+ "table": ontology.Table,  # every <table> tag is treated as ontology.Table
+}  # looked up by tag name when the CSS class yields no ontology type
+
+ONTOLOGY_CLASS_TO_UNSTRUCTURED_ELEMENT_TYPE = get_ontology_to_unstructured_type_mapping()
diff --git a/unstructured/partition/html/transformations.py b/unstructured/partition/html/transformations.py
index 6054eba670..0f12c967bb 100644
--- a/unstructured/partition/html/transformations.py
+++ b/unstructured/partition/html/transformations.py
@@ -7,45 +7,24 @@
from bs4 import BeautifulSoup, Tag
-from unstructured.documents.elements import (
- TYPE_TO_TEXT_ELEMENT_MAP,
- Element,
- ElementMetadata,
- Text,
-)
+from unstructured.documents import elements, ontology
from unstructured.documents.mappings import (
CSS_CLASS_TO_ELEMENT_TYPE_MAP,
- EXCLUSIVE_HTML_TAG_TO_ELEMENT_TYPE_MAP,
HTML_TAG_AND_CSS_NAME_TO_ELEMENT_TYPE_MAP,
- ONTOLOGY_CLASS_NAME_TO_UNSTRUCTURED_ELEMENT_TYPE_NAME,
-)
-from unstructured.documents.ontology import (
- Bibliography,
- Citation,
- Document,
- ElementTypeEnum,
- Footnote,
- FootnoteReference,
- Glossary,
- Hyperlink,
- NarrativeText,
- OntologyElement,
- Page,
- Paragraph,
- Quote,
- UncategorizedText,
+ HTML_TAG_TO_DEFAULT_ELEMENT_TYPE_MAP,
+ ONTOLOGY_CLASS_TO_UNSTRUCTURED_ELEMENT_TYPE,
)
RECURSION_LIMIT = 50
def ontology_to_unstructured_elements(
- ontology_element: OntologyElement,
+ ontology_element: ontology.OntologyElement,
parent_id: str = None,
page_number: int = None,
depth: int = 0,
filename: str | None = None,
-) -> list[Element]:
+) -> list[elements.Element]:
"""
Converts an OntologyElement object to a list of unstructured Element objects.
@@ -70,18 +49,18 @@ def ontology_to_unstructured_elements(
list[Element]: A list of unstructured Element objects.
"""
elements_to_return = []
- if ontology_element.elementType == ElementTypeEnum.layout and depth <= RECURSION_LIMIT:
+ if ontology_element.elementType == ontology.ElementTypeEnum.layout and depth <= RECURSION_LIMIT:
- if page_number is None and isinstance(ontology_element, Page):
+ if page_number is None and isinstance(ontology_element, ontology.Page):
page_number = ontology_element.page_number
- if not isinstance(ontology_element, Document):
+ if not isinstance(ontology_element, ontology.Document):
elements_to_return += [
- Text(
+ elements.Text(
text="",
element_id=ontology_element.id,
detection_origin="vlm_partitioner",
- metadata=ElementMetadata(
+ metadata=elements.ElementMetadata(
parent_id=parent_id,
text_as_html=ontology_element.to_html(add_children=False),
page_number=page_number,
@@ -96,7 +75,7 @@ def ontology_to_unstructured_elements(
child,
parent_id=ontology_element.id,
page_number=page_number,
- depth=0 if isinstance(ontology_element, Document) else depth + 1,
+ depth=0 if isinstance(ontology_element, ontology.Document) else depth + 1,
filename=filename,
)
children += child
@@ -104,10 +83,7 @@ def ontology_to_unstructured_elements(
combined_children = combine_inline_elements(children)
elements_to_return += combined_children
else:
- unstructured_element_class_name = ONTOLOGY_CLASS_NAME_TO_UNSTRUCTURED_ELEMENT_TYPE_NAME[
- ontology_element.__class__.__name__
- ]
- element_class = TYPE_TO_TEXT_ELEMENT_MAP[unstructured_element_class_name]
+ element_class = ONTOLOGY_CLASS_TO_UNSTRUCTURED_ELEMENT_TYPE[ontology_element.__class__]
html_code_of_ontology_element = ontology_element.to_html()
element_text = ontology_element.to_text()
@@ -115,7 +91,7 @@ def ontology_to_unstructured_elements(
text=element_text,
element_id=ontology_element.id,
detection_origin="vlm_partitioner",
- metadata=ElementMetadata(
+ metadata=elements.ElementMetadata(
parent_id=parent_id,
text_as_html=html_code_of_ontology_element,
page_number=page_number,
@@ -128,7 +104,7 @@ def ontology_to_unstructured_elements(
return elements_to_return
-def combine_inline_elements(elements: list[Element]) -> list[Element]:
+def combine_inline_elements(elements: list[elements.Element]) -> list[elements.Element]:
"""
Combines consecutive inline elements into a single element. Inline elements
can be also combined with text elements.
@@ -168,7 +144,9 @@ def combine_inline_elements(elements: list[Element]) -> list[Element]:
return result_elements
-def can_unstructured_elements_be_merged(current_element: Element, next_element: Element) -> bool:
+def can_unstructured_elements_be_merged(
+ current_element: elements.Element, next_element: elements.Element
+) -> bool:
"""
Elements can be merged when:
- They are on the same level in the HTML tree
@@ -200,20 +178,20 @@ def can_unstructured_elements_be_merged(current_element: Element, next_element:
return True
-def is_text_element(ontology_element: OntologyElement) -> bool:
+def is_text_element(ontology_element: ontology.OntologyElement) -> bool:
"""Categories or classes that we want to combine with inline text"""
text_classes = [
- NarrativeText,
- Quote,
- Paragraph,
- Footnote,
- FootnoteReference,
- Citation,
- Bibliography,
- Glossary,
+ ontology.NarrativeText,
+ ontology.Quote,
+ ontology.Paragraph,
+ ontology.Footnote,
+ ontology.FootnoteReference,
+ ontology.Citation,
+ ontology.Bibliography,
+ ontology.Glossary,
]
- text_categories = [ElementTypeEnum.metadata]
+ text_categories = [ontology.ElementTypeEnum.metadata]
if any(isinstance(ontology_element, class_) for class_ in text_classes):
return True
@@ -224,11 +202,14 @@ def is_text_element(ontology_element: OntologyElement) -> bool:
return False
-def is_inline_element(ontology_element: OntologyElement) -> bool:
+def is_inline_element(ontology_element: ontology.OntologyElement) -> bool:
"""Categories or classes that we want to combine with text elements"""
- inline_classes = [Hyperlink]
- inline_categories = [ElementTypeEnum.specialized_text, ElementTypeEnum.annotation]
+ inline_classes = [ontology.Hyperlink]
+ inline_categories = [
+ ontology.ElementTypeEnum.specialized_text,
+ ontology.ElementTypeEnum.annotation,
+ ]
if any(isinstance(ontology_element, class_) for class_ in inline_classes):
return True
@@ -239,7 +220,9 @@ def is_inline_element(ontology_element: OntologyElement) -> bool:
return False
-def unstructured_elements_to_ontology(unstructured_elements: Sequence[Element]) -> OntologyElement:
+def unstructured_elements_to_ontology(
+ unstructured_elements: Sequence[elements.Element],
+) -> ontology.OntologyElement:
"""
Converts a sequence of unstructured Element objects to an OntologyElement object.
@@ -260,10 +243,10 @@ def unstructured_elements_to_ontology(unstructured_elements: Sequence[Element])
document_element_id = unstructured_elements[0].metadata.parent_id
if document_element_id is None:
- document_element_id = OntologyElement.generate_unique_id()
+ document_element_id = ontology.OntologyElement.generate_unique_id()
unstructured_elements[0].metadata.parent_id = document_element_id
- id_to_element_mapping[document_element_id] = Document(
+ id_to_element_mapping[document_element_id] = ontology.Document(
additional_attributes={"id": document_element_id}
)
@@ -288,7 +271,7 @@ def unstructured_elements_to_ontology(unstructured_elements: Sequence[Element])
return root_element
-def parse_html_to_ontology(html_code: str) -> OntologyElement:
+def parse_html_to_ontology(html_code: str) -> ontology.OntologyElement:
"""
Parses the given HTML code and converts it into an Element object.
@@ -356,7 +339,9 @@ def remove_empty_tags(soup):
return str(soup)
-def parse_html_to_ontology_element(soup: Tag, recursion_depth: int = 1) -> OntologyElement | None:
+def parse_html_to_ontology_element(
+ soup: Tag, recursion_depth: int = 1
+) -> ontology.OntologyElement | None:
"""
Converts a BeautifulSoup Tag object into an OntologyElement object. This function is recursive.
First tries to recognize a class from Unstructured Ontology, then if class is matched tries
@@ -375,7 +360,7 @@ def parse_html_to_ontology_element(soup: Tag, recursion_depth: int = 1) -> Ontol
escaped_attrs = get_escaped_attributes(soup)
if soup.name == "br": # Note(Pluto) should it be <br>?
- return Paragraph(
+ return ontology.Paragraph(
text="",
css_class_name=None,
html_tag_name="br",
@@ -383,9 +368,9 @@ def parse_html_to_ontology_element(soup: Tag, recursion_depth: int = 1) -> Ontol
)
has_children = (
- (ontology_class != UncategorizedText)
+ (ontology_class != ontology.UncategorizedText)
and any(isinstance(content, Tag) for content in soup.contents)
- or ontology_class().elementType == ElementTypeEnum.layout
+ or ontology_class().elementType == ontology.ElementTypeEnum.layout
)
should_unwrap_html = has_children and recursion_depth <= RECURSION_LIMIT
@@ -395,7 +380,7 @@ def parse_html_to_ontology_element(soup: Tag, recursion_depth: int = 1) -> Ontol
(
parse_html_to_ontology_element(child, recursion_depth=recursion_depth + 1)
if isinstance(child, Tag)
- else Paragraph(text=str(child).strip())
+ else ontology.Paragraph(text=str(child).strip())
)
for child in soup.children
if str(child).strip()
@@ -414,7 +399,9 @@ def parse_html_to_ontology_element(soup: Tag, recursion_depth: int = 1) -> Ontol
return output_element
-def extract_tag_and_ontology_class_from_tag(soup: Tag) -> tuple[str, Type[OntologyElement]]:
+def extract_tag_and_ontology_class_from_tag(
+ soup: Tag,
+) -> tuple[str, Type[ontology.OntologyElement]]:
"""
Extracts the HTML tag and corresponding ontology class
from a BeautifulSoup Tag object. The CSS class is prioritized over
@@ -445,8 +432,8 @@ def extract_tag_and_ontology_class_from_tag(soup: Tag) -> tuple[str, Type[Ontolo
html_tag = element_class().allowed_tags[0]
# Scenario 3: CSS class incorrect, but HTML tag correct and exclusive in ontology
- if not element_class and soup.name in EXCLUSIVE_HTML_TAG_TO_ELEMENT_TYPE_MAP:
- html_tag, element_class = soup.name, EXCLUSIVE_HTML_TAG_TO_ELEMENT_TYPE_MAP[soup.name]
+ if not element_class and soup.name in HTML_TAG_TO_DEFAULT_ELEMENT_TYPE_MAP:
+ html_tag, element_class = soup.name, HTML_TAG_TO_DEFAULT_ELEMENT_TYPE_MAP[soup.name]
# Scenario 4: CSS class incorrect, HTML tag incorrect
# Fallback to default UncategorizedText
@@ -455,7 +442,7 @@ def extract_tag_and_ontology_class_from_tag(soup: Tag) -> tuple[str, Type[Ontolo
# e.g. parent=FormField soup.name=input -> element=FormFieldInput
html_tag = "span"
- element_class = UncategorizedText
+ element_class = ontology.UncategorizedText
return html_tag, element_class