diff --git a/HISTORY.md b/HISTORY.md
index 13640879..2c1e59e8 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -2,6 +2,10 @@
## Unreleased
+### Improvements
+
+- The html format now supports importing from HTML content (#243)
+
### Changes
- The html export format does not depend on MarkupPy any longer, therefore the
diff --git a/docs/formats.rst b/docs/formats.rst
index 919394b7..636f7309 100644
--- a/docs/formats.rst
+++ b/docs/formats.rst
@@ -97,9 +97,21 @@ install Tablib with ``pip install "tablib[pandas]"`` to make the format availabl
html
====
-The ``html`` format is currently export-only. The exports produce an HTML page
-with the data in a ``<table>``. If headers have been set, they will be used as
-table headers.
+The exports produce an HTML page with the data in a ``<table>``. If headers have
+been set, they will be used as table headers (``thead``).
+
+When you import HTML, you can specify a specific table to import by providing
+the ``table_id`` argument::
+
+ import tablib
+
+ tablib.import_set(your_html, format='html', table_id='some_table_id')
+
+Otherwise, the first table found will be imported.
+
+.. versionchanged:: 3.6.0
+
+ The ability to import HTML was added. The dependency on MarkupPy was dropped.
jira
====
diff --git a/src/tablib/formats/_html.py b/src/tablib/formats/_html.py
index 373620d2..b8b9e700 100644
--- a/src/tablib/formats/_html.py
+++ b/src/tablib/formats/_html.py
@@ -1,5 +1,6 @@
""" Tablib - HTML export support.
"""
+from html.parser import HTMLParser
from xml.etree import ElementTree as ET
@@ -48,3 +49,66 @@ def export_book(cls, databook):
result += '\n'
return result
+
+    @classmethod
+    def import_set(cls, dset, in_stream, table_id=None, **kwargs):
+        """Fill ``dset`` from a table found in HTML content.
+
+        ``dset`` is wiped first, then the text read from ``in_stream`` is fed
+        to a ``TablibHTMLParser`` which fills the dataset while parsing.
+
+        :param dset: dataset to fill; it is wiped before parsing.
+        :param in_stream: object whose ``read()`` returns the HTML to parse.
+        :param table_id: optional ``id`` attribute the table must carry;
+            when falsy, the first table encountered is imported.
+        :raises ValueError: if no suitable table was found in the input.
+        """
+        dset.wipe()
+        html_parser = TablibHTMLParser(dset, table_id=table_id)
+        html_parser.feed(in_stream.read())
+
+        # Guard clauses: nothing more to do once a table has been consumed.
+        if html_parser.table_found:
+            return
+        if not table_id:
+            raise ValueError('No found in input HTML')
+        raise ValueError(f'No found with id="{table_id}" in input HTML')
+
+
+class TablibHTMLParser(HTMLParser):
+    """Streaming HTML parser that copies one ``<table>`` into a dataset.
+
+    The first ``<table>`` start tag seen (or, when ``table_id`` is given, the
+    first one whose ``id`` attribute equals it) is imported; every other
+    top-level table is ignored.  Cells inside ``<thead>`` become the dataset
+    headers, all other cells are collected row by row and appended to the
+    dataset.
+
+    HTML allows the end tags of ``td``, ``th``, ``tr`` and ``thead`` to be
+    omitted, so an open cell/row/header section is also closed implicitly
+    when the element that follows it starts or when the table ends.
+
+    Tables nested inside a cell of the imported table are not imported as
+    rows: their text is simply folded into the text of the enclosing cell.
+
+    :param dataset: object receiving the result; rows are passed to its
+        ``append()`` and the header list is assigned to its ``headers``.
+    :param table_id: optional ``id`` of the table to import.
+
+    Remaining positional and keyword arguments go to ``HTMLParser``.
+    """
+
+    def __init__(self, dataset, *args, table_id=None, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.dset = dataset
+        self.table_id = table_id
+        self.table_found = False  # a matching <table> start tag was seen
+        self.table_open = False   # currently inside the imported table
+        self.thead_open = False   # currently inside its <thead>
+        self.cell_open = False    # currently inside a <td>/<th>
+        self.headers = []
+        self.current_row = []
+        self.current_data = ''
+        # Number of <table> elements currently open inside the imported one.
+        self._nested_tables = 0
+
+    def _end_cell(self):
+        """Store the text of the open cell, if any, as a header or row cell."""
+        if not self.cell_open:
+            return
+        if self.thead_open:
+            self.headers.append(self.current_data)
+        else:
+            self.current_row.append(self.current_data)
+        self.cell_open = False
+        self.current_data = ''
+
+    def _end_row(self):
+        """Close any open cell and append the pending row to the dataset."""
+        self._end_cell()
+        if self.current_row:
+            self.dset.append(self.current_row)
+            self.current_row = []
+
+    def _end_thead(self):
+        """Leave the header section, if open, and publish the headers."""
+        if not self.thead_open:
+            return
+        self._end_cell()
+        self.thead_open = False
+        self.dset.headers = self.headers
+
+    def handle_starttag(self, tag, attrs):
+        if tag == 'table':
+            if self.table_open:
+                # A table inside the imported one: remember the depth so its
+                # own rows/cells and its </table> are not mistaken for ours.
+                self._nested_tables += 1
+            elif not self.table_found and (
+                not self.table_id or dict(attrs).get('id') == self.table_id
+            ):
+                self.table_open = True
+                self.table_found = True
+            return
+        if not self.table_open or self._nested_tables:
+            return
+        if tag in ('thead', 'tbody', 'tfoot'):
+            # A new section implicitly ends the previous row and the header.
+            self._end_row()
+            self._end_thead()
+            if tag == 'thead':
+                self.thead_open = True
+        elif tag == 'tr':
+            # A new row implicitly ends a row whose </tr> was omitted.
+            self._end_row()
+        elif tag in ('td', 'th'):
+            # A new cell implicitly ends a cell whose end tag was omitted.
+            self._end_cell()
+            self.cell_open = True
+
+    def handle_endtag(self, tag):
+        if not self.table_open:
+            return
+        if tag == 'table':
+            if self._nested_tables:
+                self._nested_tables -= 1
+                return
+            # End of the imported table: flush whatever is still pending.
+            self._end_row()
+            self._end_thead()
+            self.table_open = False
+        elif self._nested_tables:
+            # Structural tags of a nested table do not affect our rows.
+            return
+        elif tag == 'thead':
+            self._end_row()
+            self._end_thead()
+        elif tag in ('tr', 'tbody', 'tfoot'):
+            self._end_row()
+        elif tag in ('td', 'th'):
+            self._end_cell()
+
+    def handle_data(self, data):
+        # Only text located inside a cell of the imported table is kept.
+        if self.cell_open:
+            self.current_data += data
diff --git a/tests/test_tablib.py b/tests/test_tablib.py
index 11a92fa6..408d0449 100755
--- a/tests/test_tablib.py
+++ b/tests/test_tablib.py
@@ -667,6 +667,54 @@ def test_html_databook_export(self):
f"Founders
{self.founders_html}Founders
{self.founders_html}"
)
+    def test_html_import(self):
+        """Importing the founders HTML fixture yields its headers and rows."""
+        expected_headers = ['first_name', 'last_name', 'gpa']
+        expected_rows = [
+            ('John', 'Adams', '90'),
+            ('George', 'Washington', '67'),
+            ('Thomas', 'Jefferson', '50'),
+        ]
+
+        data.html = self.founders_html
+
+        self.assertEqual(expected_headers, data.headers)
+        self.assertEqual(expected_rows, data[:])
+
+    def test_html_import_no_headers(self):
+        """A table without a ``thead`` imports its rows and sets no headers."""
+        # The fixture must contain real table markup: cell values are taken
+        # from <td> elements, and input without a <table> raises ValueError.
+        data.html = """
+            <table>
+            <tr><td>John</td><td>Adams</td><td>90</td></tr>
+            <tr><td>George</td><td>Washington</td><td>67</td></tr>
+            </table>
+        """
+
+        self.assertIsNone(data.headers)
+        self.assertEqual([
+            ('John', 'Adams', '90'),
+            ('George', 'Washington', '67'),
+        ], data[:])
+
+    def test_html_import_no_table(self):
+        """Importing content that holds no table raises ``ValueError``."""
+        markup = ""
+
+        with self.assertRaises(ValueError) as raised:
+            data.html = markup
+        # Checked after the ``with`` block so the assertion actually runs.
+        self.assertEqual('No found in input HTML', str(raised.exception))
+
+    def test_html_import_table_id(self):
+        """A table with a specific id can be targeted for import."""
+        # A decoy table with another id comes first, so the test only passes
+        # when ``table_id`` really selects the table instead of the first one.
+        html_input = """
+            <html><body>
+            <table id="ignore">
+            <thead><tr><th>IGNORE</th></tr></thead>
+            <tr><td>IGNORE</td></tr>
+            </table>
+            <table id="import">
+            <thead><tr><th>first_name</th><th>last_name</th></tr></thead>
+            <tr><td>John</td><td>Adams</td></tr>
+            </table>
+            </body></html>
+        """
+        dataset = tablib.import_set(html_input, format="html", table_id="import")
+        self.assertEqual(['first_name', 'last_name'], dataset.headers)
+        self.assertEqual([('John', 'Adams')], dataset[:])
+
class RSTTests(BaseTestCase):
def test_rst_force_grid(self):