-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmungy.py
111 lines (101 loc) · 3.65 KB
/
mungy.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
from pyquery import PyQuery as pq
import json
class MalformedHTMLError(Exception):
def __init__(self, msg):
self.msg = msg
class Munger():
"""Designed to be fluent/chainable."""
def to_json(self, data):
return json.dumps(data)
def html_to_lists(self, pqhtml):
"""Takes a chunk of html and generates multiple lists
from all found UL's on the page.
"""
# TODO: Add OL support
list_html = []
for k, ul in enumerate(pq(pqhtml).find('ul')):
list_html.append(self.list_to_list(pq(ul)))
return list_html
def dict_to_list(self, data):
ls = []
for _, v in data.iteritems():
ls.append(v)
return ls
def list_to_list(self, list_html):
"""Takes a single ul node and converts it to a
list of list-items"""
list_items = []
for li in pq(list_html).find('li'):
list_items.append(pq(li).text())
return list_items
def table_to_dict_values(self, table_html):
"""Takes table html and converts into a
dictionary with the following format:
{
"0": {
"thead1": "td1"
"thead2": "td2"
},
"1": {
"thead1": "td1"
"thead2": "td2"
}
}
"""
data = {}
headings = []
# Try proper theader formatting first.
ths = pq(table_html).find('thead th')
if not ths:
ths = pq(table_html).find('th')
if not ths:
ths = pq(table_html).find('thead').children()
if not ths:
raise MalformedHTMLError('Invalid html. No THs could be found.')
for th in ths:
headings.append('_'.join(pq(th).text().strip().lower().split(' ')))
trs = pq(table_html).find('tbody tr')
if not trs:
trs = pq(table_html).find('tr')
if not trs:
raise MalformedHTMLError('Invalid html. No TRs could be found.')
for k, tr in enumerate(trs):
tr_data = {}
for k1, td in enumerate(pq(tr).children()):
try:
tr_data[headings[k1]] = pq(td).text().strip().lower()
except IndexError:
continue
data[k] = tr_data
return data
def table_to_dict_indices(self, table_html):
# Sort by indices
data = {}
"""
Example output
data = {
[0: (th1, [td1, tdn]),
[1: (th2, [td2, tdn]),
}
"""
# TODO: handle images
# Handle plain tables, no headings
if not pq(table_html).find('tbody') and not \
pq(table_html).find('thead'):
for k, row in enumerate(pq(table_html).find('tr')):
data[k] = []
for k1, cell in enumerate(pq(row).find('td')):
data[k].append(pq(cell).text().strip().lower())
else:
# Handle traditionally (properly) formatted tables
for _, heading in enumerate(pq(table_html).find('thead th')):
data[k] = (pq(heading).text().strip().lower(), [])
for _, row in enumerate(pq(table_html).find('tbody tr')):
for k1, cell in enumerate(pq(row).find('td')):
try:
# Nested tables, etc will just
# merged into the single td
data[k1][1].append(pq(cell).text().strip().lower())
except KeyError:
continue
return data