Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixes #243 - Support import_set for html input #555

Merged
merged 1 commit into from
Jul 3, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions HISTORY.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@

## Unreleased

### Improvements

- The html format now supports importing from HTML content (#243)

### Changes

- The html export format does not depend on MarkupPy any longer, therefore the
Expand Down
18 changes: 15 additions & 3 deletions docs/formats.rst
Original file line number Diff line number Diff line change
Expand Up @@ -97,9 +97,21 @@ install Tablib with ``pip install "tablib[pandas]"`` to make the format availabl
html
====

The ``html`` format is currently export-only. The exports produce an HTML page
with the data in a ``<table>``. If headers have been set, they will be used as
table headers.
The exports produce an HTML page with the data in a ``<table>``. If headers have
been set, they will be used as table headers (``thead``).

When you import HTML, you can select which table to import by providing
the ``table_id`` argument::

import tablib

tablib.import_set(your_html, format='html', table_id='some_table_id')

Otherwise, the first table found will be imported.

.. versionchanged:: 3.6.0

The ability to import HTML was added. The dependency on MarkupPy was dropped.

jira
====
Expand Down
64 changes: 64 additions & 0 deletions src/tablib/formats/_html.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
""" Tablib - HTML export support.
"""
from html.parser import HTMLParser
from xml.etree import ElementTree as ET


Expand Down Expand Up @@ -48,3 +49,66 @@ def export_book(cls, databook):
result += '\n'

return result

@classmethod
def import_set(cls, dset, in_stream, table_id=None, **kwargs):
"""Returns dataset from HTML content."""

dset.wipe()
parser = TablibHTMLParser(dset, table_id=table_id)
parser.feed(in_stream.read())
if not parser.table_found:
if table_id:
raise ValueError(f'No <table> found with id="{table_id}" in input HTML')
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shall we add a simple test to cover this line and check it executes as expected?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good suggestion, updated.

else:
raise ValueError('No <table> found in input HTML')


class TablibHTMLParser(HTMLParser):
    """Feed HTML into this parser to load one ``<table>`` into a dataset.

    The first ``<table>`` encountered is imported, or, when ``table_id`` is
    truthy, the first ``<table>`` whose ``id`` attribute equals it.  Cells
    found inside ``<thead>`` are collected as headers and assigned to
    ``dataset.headers`` when the ``<thead>`` ends; every other row of cells
    is passed to ``dataset.append()`` as a list of strings.

    ``<table>`` elements nested inside the imported table do not start or
    end rows/cells of the imported table; their text is folded into the
    enclosing cell.  End tags that HTML allows to be omitted (``</td>``,
    ``</th>``, ``</tr>``, ``</thead>``) are closed implicitly when the next
    cell, row or table section starts, or when the table ends.

    After feeding, ``table_found`` tells whether a matching table was seen.
    """

    def __init__(self, dataset, *args, table_id=None, **kwargs):
        """Create a parser that writes into ``dataset``.

        :param dataset: object receiving the data; it must accept assignment
            to ``headers`` and provide ``append(row)``.
        :param table_id: optional ``id`` of the table to import; when falsy
            the first table in the document is used.

        Remaining positional and keyword arguments go to ``HTMLParser``.
        """
        super().__init__(*args, **kwargs)
        self.dset = dataset
        self.table_id = table_id
        self.table_found = False
        self.table_open = False
        self.thead_open = False
        self.cell_open = False
        self.headers = []
        self.current_row = []
        self.current_data = ''
        # Number of currently open <table> elements nested inside the table
        # being imported; while non-zero, structural tags are ignored.
        self._nested_tables = 0

    def _close_cell(self):
        """Commit the open cell, if any, to the headers or the current row."""
        if not self.cell_open:
            return
        target = self.headers if self.thead_open else self.current_row
        target.append(self.current_data)
        self.cell_open = False
        self.current_data = ''

    def _close_row(self):
        """Commit the pending data row, if it has any cell, to the dataset."""
        self._close_cell()
        if self.current_row:
            self.dset.append(self.current_row)
            # Rebind instead of clearing: the dataset was handed this list.
            self.current_row = []

    def _close_thead(self):
        """Leave the header section and publish the collected headers."""
        if not self.thead_open:
            return
        # Commit a still-open header cell while thead_open is still True so
        # that it lands in the headers rather than in a data row.
        self._close_cell()
        self.thead_open = False
        self.dset.headers = self.headers

    def handle_starttag(self, tag, attrs):
        """Track table, section, row and cell openings."""
        if not self.table_open:
            if (
                tag == 'table' and not self.table_found and
                (not self.table_id or dict(attrs).get('id') == self.table_id)
            ):
                self.table_open = True
                self.table_found = True
            return

        if tag == 'table':
            self._nested_tables += 1
            return
        if self._nested_tables:
            return

        if tag == 'thead':
            self._close_row()
            self.thead_open = True
        elif tag in ('tbody', 'tfoot'):
            # A following section implicitly ends <thead> and any open row.
            self._close_thead()
            self._close_row()
        elif tag == 'tr':
            self._close_row()
        elif tag in ('td', 'th'):
            # A new cell implicitly ends the previous one.
            self._close_cell()
            self.cell_open = True

    def handle_endtag(self, tag):
        """Commit cells, rows and headers as their elements are closed."""
        if not self.table_open:
            return
        if tag == 'table':
            if self._nested_tables:
                self._nested_tables -= 1
                return
            # Flush whatever is still pending before leaving the table.
            self._close_thead()
            self._close_row()
            self.table_open = False
        elif self._nested_tables:
            return
        elif tag == 'thead':
            self._close_thead()
        elif tag == 'tr':
            self._close_row()
        elif tag in ('td', 'th'):
            self._close_cell()

    def handle_data(self, data):
        """Accumulate text while a cell of the imported table is open."""
        if self.cell_open:
            self.current_data += data
53 changes: 53 additions & 0 deletions tests/test_tablib.py
Original file line number Diff line number Diff line change
Expand Up @@ -667,6 +667,59 @@ def test_html_databook_export(self):
f"<h3>Founders</h3>{self.founders_html}<h3>Founders</h3>{self.founders_html}"
)

def test_html_import(self):
    """Assigning HTML to ``data.html`` loads both headers and rows."""
    data.html = self.founders_html

    expected_headers = ['first_name', 'last_name', 'gpa']
    expected_rows = [
        ('John', 'Adams', '90'),
        ('George', 'Washington', '67'),
        ('Thomas', 'Jefferson', '50'),
    ]
    self.assertEqual(expected_headers, data.headers)
    self.assertEqual(expected_rows, data[:])

def test_html_import_no_headers(self):
    """A table without ``<thead>`` imports rows only; inline markup inside
    a cell contributes its text to that cell."""
    # The fixture previously carried a stray '"' after each </tr>; it was
    # harmless only because text outside cells is ignored by the importer.
    data.html = """
        <table>
        <tr><td>John</td><td><i>Adams</i></td><td>90</td></tr>
        <tr><td>George</td><td><i>Wash</i>ington</td><td>67</td></tr>
        </table>
    """

    self.assertIsNone(data.headers)
    self.assertEqual([
        ('John', 'Adams', '90'),
        ('George', 'Washington', '67'),
    ], data[:])

def test_html_import_no_table(self):
    """Importing HTML that contains no ``<table>`` raises ``ValueError``."""
    with self.assertRaises(ValueError) as exc:
        data.html = "<html><body></body></html>"

    # Checked after the ``with`` block: nothing following the raising
    # statement inside the block would run.
    message = str(exc.exception)
    self.assertEqual('No <table> found in input HTML', message)

def test_html_import_table_id(self):
    """A table with a specific id can be targeted for import."""
    # Fixture fixes: removed a stray '"' after </tr> and closed <body>
    # before <html> so the document is properly nested.
    html_input = """
        <html><body>
        <table id="ignore">
        <thead><tr><th>IGNORE</th></tr></thead><tr><td>IGNORE</td></tr>
        </table>
        <table id="import">
        <thead><tr><th>first_name</th><th>last_name</th></tr></thead>
        <tr><td>John</td><td>Adams</td></tr>
        </table>
        </body></html>
    """
    dataset = tablib.import_set(html_input, format="html", table_id="import")
    self.assertEqual(['first_name', 'last_name'], dataset.headers)
    self.assertEqual([('John', 'Adams')], dataset[:])

    # If the id is not found, an error is raised
    with self.assertRaises(ValueError) as exc:
        tablib.import_set(html_input, format="html", table_id="notfound")
    self.assertEqual('No <table> found with id="notfound" in input HTML', str(exc.exception))


class RSTTests(BaseTestCase):
def test_rst_force_grid(self):
Expand Down