From 2dec898527ea7eeea6695efc8a4e639c306b0592 Mon Sep 17 00:00:00 2001
From: Claude Paroz <claude@2xlibre.net>
Date: Sun, 2 Jul 2023 12:00:41 +0200
Subject: [PATCH] Fixes #243 - Support import_set for html input

---
 HISTORY.md                  |  4 +++
 docs/formats.rst            | 18 +++++++++--
 src/tablib/formats/_html.py | 64 +++++++++++++++++++++++++++++++++++++
 tests/test_tablib.py        | 48 ++++++++++++++++++++++++++++
 4 files changed, 131 insertions(+), 3 deletions(-)
diff --git a/HISTORY.md b/HISTORY.md
index 13640879..2c1e59e8 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -2,6 +2,10 @@
 
 ## Unreleased
 
+### Improvements
+
+- The html format now supports importing from HTML content (#243)
+
 ### Changes
 
 - The html export format does not depend on MarkupPy any longer, therefore the
diff --git a/docs/formats.rst b/docs/formats.rst
index 919394b7..636f7309 100644
--- a/docs/formats.rst
+++ b/docs/formats.rst
@@ -97,9 +97,21 @@ install Tablib with ``pip install "tablib[pandas]"`` to make the format availabl
 html
 ====
 
-The ``html`` format is currently export-only. The exports produce an HTML page
-with the data in a ``<table>``. If headers have been set, they will be used as
-table headers.
+The exports produce an HTML page with the data in a ``<table>``. If headers have
+been set, they will be used as table headers (``thead``).
+
+When you import HTML, you can select a specific table to import by providing
+the ``table_id`` argument::
+
+    import tablib
+
+    tablib.import_set(your_html, format='html', table_id='some_table_id')
+
+Otherwise, the first table found will be imported.
+
+.. versionchanged:: 3.6.0
+
+    The ability to import HTML was added. The dependency on MarkupPy was dropped.
 
 jira
 ====
diff --git a/src/tablib/formats/_html.py b/src/tablib/formats/_html.py
index 373620d2..ecc3a3d9 100644
--- a/src/tablib/formats/_html.py
+++ b/src/tablib/formats/_html.py
@@ -1,5 +1,6 @@
 """ Tablib - HTML export support.
 """
+from html.parser import HTMLParser
 from xml.etree import ElementTree as ET
 
 
@@ -48,3 +49,108 @@ def export_book(cls, databook):
             result += '\n'
 
         return result
+
+    @classmethod
+    def import_set(cls, dset, in_stream, table_id=None, **kwargs):
+        """Fill ``dset`` with the content of an HTML ``<table>``.
+
+        :param dset: dataset to populate; ``wipe()`` is called on it first.
+        :param in_stream: file-like object whose ``read()`` returns the HTML.
+        :param table_id: ``id`` attribute of the table to import; when not
+            given, the first table of the document is imported.
+        :raises ValueError: if no matching ``<table>`` is found.
+        """
+        dset.wipe()
+        parser = TablibHTMLParser(dset, table_id=table_id)
+        parser.feed(in_stream.read())
+        if not parser.table_found:
+            if table_id:
+                raise ValueError(f'No <table> found with id="{table_id}" in input HTML')
+            else:
+                raise ValueError('No <table> found in input HTML')
+
+
+class TablibHTMLParser(HTMLParser):
+    """Collect the headers and rows of one HTML ``<table>`` into a dataset.
+
+    The imported table is the first ``<table>`` met or, when ``table_id`` is
+    given, the first one whose ``id`` attribute equals it. Cells read inside
+    ``<thead>`` become the dataset headers; elsewhere, each ``<tr>`` holding
+    at least one cell is appended as a row. Tables nested in the imported one
+    are not split into rows: only their text is kept, as part of the
+    enclosing cell.
+    """
+
+    def __init__(self, dataset, *args, table_id=None, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.dset = dataset
+        self.table_id = table_id
+        self.table_found = False
+        self.table_open = False
+        # Count of <table> tags currently open inside the imported table.
+        self.nested_tables = 0
+        self.thead_open = False
+        self.cell_open = False
+        self.headers = []
+        self.current_row = []
+        self.current_data = ''
+
+    def handle_starttag(self, tag, attrs):
+        """Track the opening of the table, its header section and its cells."""
+        if (
+            tag == 'table' and not self.table_found and
+            (not self.table_id or dict(attrs).get('id') == self.table_id)
+        ):
+            self.table_open = True
+            self.table_found = True
+        elif not self.table_open:
+            return
+        elif tag == 'table':
+            self.nested_tables += 1
+        elif self.nested_tables:
+            # Inside a nested table: its structure is deliberately ignored.
+            return
+        elif tag == 'thead':
+            self.thead_open = True
+        elif tag in ('tbody', 'tfoot'):
+            # </thead> may be omitted in HTML; a new section ends the header.
+            self._close_thead()
+        elif tag in ('td', 'th'):
+            self.cell_open = True
+
+    def handle_endtag(self, tag):
+        """Store finished cells, rows and headers of the imported table."""
+        if not self.table_open:
+            return
+        if tag == 'table':
+            if self.nested_tables:
+                # Closing a nested table must not end the imported one.
+                self.nested_tables -= 1
+            else:
+                self._close_thead()
+                self.table_open = False
+        elif self.nested_tables:
+            return
+        elif tag == 'thead':
+            self._close_thead()
+        elif tag == 'tr' and self.current_row:
+            self.dset.append(self.current_row)
+            self.current_row = []
+        elif tag in ('td', 'th'):
+            if self.thead_open:
+                self.headers.append(self.current_data)
+            else:
+                self.current_row.append(self.current_data)
+            self.cell_open = False
+            self.current_data = ''
+
+    def handle_data(self, data):
+        """Accumulate the text found inside the currently open cell."""
+        if self.cell_open:
+            self.current_data += data
+
+    def _close_thead(self):
+        """Leave the header section, handing the headers to the dataset."""
+        if self.thead_open:
+            self.thead_open = False
+            self.dset.headers = self.headers
diff --git a/tests/test_tablib.py b/tests/test_tablib.py
index 11a92fa6..408d0449 100755
--- a/tests/test_tablib.py
+++ b/tests/test_tablib.py
@@ -667,6 +667,57 @@ def test_html_databook_export(self):
             f"<h3>Founders</h3>{self.founders_html}<h3>Founders</h3>{self.founders_html}"
         )
 
+    def test_html_import(self):
+        """The founders HTML fixture is imported with its headers and rows."""
+        data.html = self.founders_html
+
+        self.assertEqual(['first_name', 'last_name', 'gpa'], data.headers)
+        self.assertEqual([
+            ('John', 'Adams', '90'),
+            ('George', 'Washington', '67'),
+            ('Thomas', 'Jefferson', '50'),
+        ], data[:])
+
+    def test_html_import_no_headers(self):
+        """Without ``<thead>``, headers stay unset and inline markup is dropped."""
+        data.html = """
+            <table>
+            <tr><td>John</td><td><i>Adams</i></td><td>90</td></tr>
+            <tr><td>George</td><td><i>Wash</i>ington</td><td>67</td></tr>
+            </table>
+        """
+
+        self.assertIsNone(data.headers)
+        self.assertEqual([
+            ('John', 'Adams', '90'),
+            ('George', 'Washington', '67'),
+        ], data[:])
+
+    def test_html_import_no_table(self):
+        """Importing HTML that contains no table raises ``ValueError``."""
+        html = "<html><body></body></html>"
+
+        with self.assertRaises(ValueError) as exc:
+            data.html = html
+        self.assertEqual('No <table> found in input HTML', str(exc.exception))
+
+    def test_html_import_table_id(self):
+        """A table with a specific id can be targeted for import."""
+        html_input = """
+            <html><body>
+            <table id="ignore">
+              <thead><tr><th>IGNORE</th></tr></thead><tr><td>IGNORE</td></tr>
+            </table>
+            <table id="import">
+              <thead><tr><th>first_name</th><th>last_name</th></tr></thead>
+              <tr><td>John</td><td>Adams</td></tr>
+            </table>
+            </body></html>
+        """
+        dataset = tablib.import_set(html_input, format="html", table_id="import")
+        self.assertEqual(['first_name', 'last_name'], dataset.headers)
+        self.assertEqual([('John', 'Adams')], dataset[:])
+
 
 class RSTTests(BaseTestCase):
     def test_rst_force_grid(self):