From fcee87132acce5f81e253b4caf1337e413392476 Mon Sep 17 00:00:00 2001 From: klothe Date: Tue, 5 Jul 2016 13:53:24 -0400 Subject: [PATCH 1/3] add html.import_set and tests --- tablib/formats/_html.py | 25 +++++++++++++++++++ test_tablib.py | 53 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 78 insertions(+) diff --git a/tablib/formats/_html.py b/tablib/formats/_html.py index 655e74c5..f72e6fe7 100644 --- a/tablib/formats/_html.py +++ b/tablib/formats/_html.py @@ -11,6 +11,7 @@ else: from cStringIO import StringIO from tablib.packages import markup +import bs4 import tablib from tablib.compat import unicode @@ -68,3 +69,27 @@ def export_book(databook): wrapper.write('\n') return stream.getvalue().decode('utf-8') + + +def import_set(dset, in_stream, headers=True, **kwargs): + dset.wipe() + text = in_stream.read() + tables = bs4.BeautifulSoup(markup=text).find_all('table') + if len(tables) != 1: + raise ValueError('Expected 1 table, found %s' % len(tables)) + table = tables[0] + + if table.thead.tr: + dset.headers = [ + x.string for x in table.thead.tr.find_all('th', recursive=False)] + + # this finds rows inside <thead>, <tbody>, <tfoot> also.
+ for i, row in enumerate(table.find_all('tr')): + # skip first row if it was used for the headers + if i == 0 and dset.headers: + continue + dset.append( + [cell.get_text() for cell in row.find_all('td', recursive=False)]) + + print dset + diff --git a/test_tablib.py b/test_tablib.py index be41ee71..cfaa791c 100755 --- a/test_tablib.py +++ b/test_tablib.py @@ -6,6 +6,9 @@ import unittest import sys import os + +from StringIO import StringIO + import tablib from tablib.compat import markup, unicode, is_py3 from tablib.core import Row @@ -296,6 +299,56 @@ def test_html_export_none_value(self): self.assertEqual(html, d.html) + def test_html_import(self): + html = markup.page() + html.table.open() + html.thead.open() + + html.tr(markup.oneliner.th(self.founders.headers)) + html.thead.close() + + for founder in self.founders: + html.tr(markup.oneliner.td(founder)) + + html.table.close() + html = StringIO(str(html)) + + data.html = html + + self.assertEqual(['first_name', 'last_name', 'gpa'], data.headers) + self.assertEqual([ + ('John', 'Adams', '90'), + ('George', 'Washington', '67'), + ('Thomas', 'Jefferson', '50'), + ], data[:]) + + def test_html_import_no_headers(self): + html = markup.page() + html.table.open() + html.thead.open() + + for founder in self.founders: + html.tr(markup.oneliner.td(founder)) + + html.table.close() + html = StringIO(str(html)) + + data.html = html + + self.assertEqual(None, data.headers) + self.assertEqual([ + ('John', 'Adams', '90'), + ('George', 'Washington', '67'), + ('Thomas', 'Jefferson', '50'), + ], data[:]) + + def test_html_import_no_table(self): + html = StringIO(str(markup.page())) + + with self.assertRaises(ValueError) as e: + data.html = html + self.assertEqual('Expected 1 table, found 0', e.exception.message) + def test_latex_export(self): """LaTeX export""" From 59898105e6b48972a38cff6f9efde3e5d764c6f8 Mon Sep 17 00:00:00 2001 From: klothe Date: Tue, 5 Jul 2016 14:09:55 -0400 Subject: [PATCH 2/3] remove unused argument to 
html.import_set --- tablib/formats/_html.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tablib/formats/_html.py b/tablib/formats/_html.py index f72e6fe7..8389b944 100644 --- a/tablib/formats/_html.py +++ b/tablib/formats/_html.py @@ -71,7 +71,7 @@ def export_book(databook): return stream.getvalue().decode('utf-8') -def import_set(dset, in_stream, headers=True, **kwargs): +def import_set(dset, in_stream, **kwargs): dset.wipe() text = in_stream.read() tables = bs4.BeautifulSoup(markup=text).find_all('table') From 2874a2466bcc862e220dc43747db6a9a72f2dbf6 Mon Sep 17 00:00:00 2001 From: klothe Date: Tue, 5 Jul 2016 14:15:27 -0400 Subject: [PATCH 3/3] remove print --- tablib/formats/_html.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tablib/formats/_html.py b/tablib/formats/_html.py index 8389b944..0ce0b5c3 100644 --- a/tablib/formats/_html.py +++ b/tablib/formats/_html.py @@ -91,5 +91,3 @@ def import_set(dset, in_stream, **kwargs): dset.append( [cell.get_text() for cell in row.find_all('td', recursive=False)]) - print dset -