From 2dec898527ea7eeea6695efc8a4e639c306b0592 Mon Sep 17 00:00:00 2001 From: Claude Paroz Date: Sun, 2 Jul 2023 12:00:41 +0200 Subject: [PATCH] Fixes #243 - Support import_set for html input --- HISTORY.md | 4 +++ docs/formats.rst | 18 +++++++++-- src/tablib/formats/_html.py | 64 +++++++++++++++++++++++++++++++++++++ tests/test_tablib.py | 48 ++++++++++++++++++++++++++++ 4 files changed, 131 insertions(+), 3 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 13640879..2c1e59e8 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -2,6 +2,10 @@ ## Unreleased +### Improvements + +- The html format now supports importing from HTML content (#243) + ### Changes - The html export format does not depend on MarkupPy any longer, therefore the diff --git a/docs/formats.rst b/docs/formats.rst index 919394b7..636f7309 100644 --- a/docs/formats.rst +++ b/docs/formats.rst @@ -97,9 +97,21 @@ install Tablib with ``pip install "tablib[pandas]"`` to make the format availabl html ==== -The ``html`` format is currently export-only. The exports produce an HTML page -with the data in a ``<table>``. If headers have been set, they will be used as -table headers. +The exports produce an HTML page with the data in a ``<table>
@classmethod
def import_set(cls, dset, in_stream, table_id=None, **kwargs):
    """Populate ``dset`` from the first matching ``<table>`` in HTML input.

    The dataset is wiped and then filled in place; nothing is returned.

    :param dset: dataset object to fill; it must provide ``wipe()`` and
        whatever ``TablibHTMLParser`` calls on it.
    :param in_stream: readable object whose ``read()`` returns the HTML text.
    :param table_id: optional ``id`` attribute of the table to import. When
        it is not given, the first table found in the document is used.
    :param kwargs: accepted for signature compatibility and ignored here.
    :raises ValueError: if no (matching) table is present in the input.
    """
    dset.wipe()
    parser = TablibHTMLParser(dset, table_id=table_id)
    parser.feed(in_stream.read())
    # HTMLParser may keep trailing text buffered until it is told the input
    # is complete; close() forces that remaining data to be processed.
    parser.close()

    if parser.table_found:
        return
    if table_id:
        raise ValueError(f'No <table> found with id="{table_id}" in input HTML')
    raise ValueError('No <table> found in input HTML')
class TablibHTMLParser(HTMLParser):
    """Fill a dataset from one ``<table>`` of an HTML document.

    The first table is imported, or the first table whose ``id`` attribute
    equals ``table_id`` when one is given. Cells found inside ``<thead>``
    become the dataset headers; every other non-empty row is appended to the
    dataset as a list of cell texts.

    The parser tolerates omitted ``</td>``, ``</th>``, ``</tr>`` and
    ``</thead>`` end tags: a pending cell or row is committed when the next
    cell, row or section starts, or when the table ends. Tables nested inside
    a cell do not end the import; their text is folded into the enclosing
    cell.
    """

    def __init__(self, dataset, *args, table_id=None, **kwargs):
        """Create a parser writing into ``dataset``.

        :param dataset: object receiving ``append(row)`` calls and a
            ``headers`` attribute assignment.
        :param table_id: optional ``id`` of the table to import.
        Remaining arguments are forwarded to ``HTMLParser``.
        """
        super().__init__(*args, **kwargs)
        self.dset = dataset
        self.table_id = table_id
        self.table_found = False
        self.table_open = False
        self.thead_open = False
        self.cell_open = False
        self.headers = []
        self.current_row = []
        self.current_data = ''
        # Number of <table> elements currently open inside the target table.
        self._nested_depth = 0

    def handle_starttag(self, tag, attrs):
        """Track table structure; start tags also close implied elements."""
        if not self.table_open:
            if (
                tag == 'table' and not self.table_found and
                (not self.table_id or dict(attrs).get('id') == self.table_id)
            ):
                self.table_open = True
                self.table_found = True
            return

        if tag == 'table':
            self._nested_depth += 1
        elif self._nested_depth:
            # Structure of nested tables is ignored; only their text is kept.
            return
        elif tag == 'thead':
            self.thead_open = True
        elif tag in ('tbody', 'tfoot'):
            # A new section implicitly ends an unclosed <thead>.
            self._close_thead()
        elif tag == 'tr':
            # A new row implicitly ends the previous one.
            self._close_row()
        elif tag in ('td', 'th'):
            # A new cell implicitly ends the previous one.
            self._close_cell()
            self.cell_open = True

    def handle_endtag(self, tag):
        """Commit cells, rows and headers as their elements are closed."""
        if not self.table_open:
            return

        if tag == 'table':
            if self._nested_depth:
                self._nested_depth -= 1
                return
            # End of the imported table: flush whatever is still pending.
            self._close_thead()
            self._close_row()
            self.table_open = False
        elif self._nested_depth:
            return
        elif tag == 'thead':
            self._close_thead()
        elif tag == 'tr':
            self._close_row()
        elif tag in ('td', 'th'):
            self._close_cell()

    def handle_data(self, data):
        """Accumulate text that belongs to the currently open cell."""
        if self.cell_open:
            self.current_data += data

    def _close_cell(self):
        """Store the open cell's text as a header or as a row value."""
        if not self.cell_open:
            return
        if self.thead_open:
            self.headers.append(self.current_data)
        else:
            self.current_row.append(self.current_data)
        self.cell_open = False
        self.current_data = ''

    def _close_row(self):
        """Append the pending row to the dataset, if it has any cell."""
        self._close_cell()
        if self.current_row:
            self.dset.append(self.current_row)
            self.current_row = []

    def _close_thead(self):
        """Leave the header section and publish the collected headers."""
        if not self.thead_open:
            return
        self._close_cell()
        self.thead_open = False
        self.dset.headers = self.headers

Founders

{self.founders_html}

Founders

{self.founders_html}" ) + def test_html_import(self): + data.html = self.founders_html + + self.assertEqual(['first_name', 'last_name', 'gpa'], data.headers) + self.assertEqual([ + ('John', 'Adams', '90'), + ('George', 'Washington', '67'), + ('Thomas', 'Jefferson', '50'), + ], data[:]) + + def test_html_import_no_headers(self): + data.html = """ +
+ " + " +
JohnAdams90
GeorgeWashington67
+ """ + + self.assertIsNone(data.headers) + self.assertEqual([ + ('John', 'Adams', '90'), + ('George', 'Washington', '67'), + ], data[:]) + + def test_html_import_no_table(self): + html = "" + + with self.assertRaises(ValueError) as exc: + data.html = html + self.assertEqual('No found in input HTML', str(exc.exception)) + + def test_html_import_table_id(self): + """A table with a specific id can be targeted for import.""" + html_input = """ + +
+ +
IGNORE
IGNORE
+ + + " +
first_namelast_name
JohnAdams
+ + """ + dataset = tablib.import_set(html_input, format="html", table_id="import") + self.assertEqual(['first_name', 'last_name'], dataset.headers) + self.assertEqual([('John', 'Adams')], dataset[:]) + class RSTTests(BaseTestCase): def test_rst_force_grid(self):