-
Notifications
You must be signed in to change notification settings - Fork 0
/
getschool.py
107 lines (92 loc) · 3.09 KB
/
getschool.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
#!/usr/bin/python3
import re
import json
from lxml import etree
import codecs
def x_to_h(a):
    """Return the lowercase hexadecimal digits of the non-negative number ``a``.

    Despite what the name suggests, the conversion goes from a number to its
    base-16 text form. The result has no ``0x`` prefix and no padding.

    Args:
        a: Non-negative number. A fractional part, if any, is truncated.

    Returns:
        The hexadecimal digits as a string. Zero yields the empty string,
        which is the historical behaviour of this helper.

    Raises:
        ValueError: If ``a`` is negative.
    """
    value = int(a)
    if value < 0:
        raise ValueError("x_to_h() requires a non-negative number")
    if value == 0:
        return ""
    # format() works in exact integer arithmetic. Repeated true division
    # (``a /= 16``) goes through floats and drops low digits of big values.
    return format(value, "x")
def get_td2(html, end):
    """Copy the label/value pairs of the lower table into ``end``.

    The page is parsed with ``etree.HTML`` and the first two ``<dl>`` lists
    under ``div.basic-info.cmn-clearfix`` are read. Every ``<dt>`` supplies a
    key (its first direct text node with all whitespace removed) and the
    ``<dd>`` at the same position supplies the value via ``get_str_2``.

    Args:
        html: HTML source of the page.
        end: Dict that is updated in place; nothing is returned.
    """
    root = etree.HTML(html)
    lists = root.xpath("//div[@class='basic-info cmn-clearfix']/dl")
    first, second = lists[0], lists[1]
    labels = first.xpath("./dt") + second.xpath("./dt")
    cells = first.xpath("./dd") + second.xpath("./dd")
    # NOTE(review): the last <dt> is never visited, matching the original
    # loop bound ``len(keys) - 1`` -- confirm that this is intended.
    for position, label in enumerate(labels[:-1]):
        raw_name = label.xpath("./text()")[0]
        name = re.sub(r"\s", "", raw_name)
        end[name] = get_str_2(cells[position])
def get_td(html, end):
    """Copy the label/value pairs of the upper table into ``end``.

    The page is parsed with ``etree.HTML``; the ``<dl>`` entries of the first
    two ``<div>`` children of ``div.baseBox`` are read. For each entry the
    first ``<dt>`` text node (whitespace removed) is the key and the text of
    its ``<dd>`` elements, extracted by ``get_str``, is the value.

    Args:
        html: HTML source of the page.
        end: Dict that is updated in place; nothing is returned.
    """
    root = etree.HTML(html)
    columns = root.xpath("//div[@class='baseBox']/div")
    entries = columns[0].xpath("./dl") + columns[1].xpath("./dl")
    for entry in entries:
        raw_name = entry.xpath("./dt/text()")[0]
        name = re.sub(r"\s", "", raw_name)  # drop every whitespace character
        end[name] = get_str(entry.xpath("./dd"))
def get_str_2(t_oo):
    """Return the plain text of one element of the lower table.

    The element is serialized with ``etree.tostring`` and cleaned up: anything
    that looks like a tag, bracketed numeric markers such as ``[1]``, and
    newlines are removed. Decimal character references (``&#NNNN;``) are then
    replaced by the characters they denote. Named entities such as ``&amp;``
    are left untouched.

    Args:
        t_oo: Element accepted by ``etree.tostring``.

    Returns:
        The cleaned text as a ``str``.
    """
    markup = etree.tostring(t_oo).decode("UTF-8", "strict")
    markup = re.sub(r"<.*?>", "", markup)
    markup = re.sub(r"\[\d*\]", "", markup)
    markup = re.sub(r"\n", "", markup)
    # Decode each reference directly with chr(). Building "\uXXXX" escapes and
    # running them through the unicode-escape codec broke code points above
    # U+FFFF (more than four hex digits) and also reinterpreted any literal
    # backslash that happened to be in the page text.
    return re.sub(r"&#(\d+);", lambda ref: chr(int(ref.group(1))), markup)
def get_str(t_o):
    """Return the concatenated plain text of the elements of the upper table.

    Each element is serialized with ``etree.tostring`` and cleaned up:
    anything that looks like a tag, bracketed numeric markers such as ``[1]``,
    and newlines are removed. Decimal character references (``&#NNNN;``) are
    then replaced by the characters they denote. Named entities such as
    ``&amp;`` are left untouched.

    Args:
        t_o: Iterable of elements accepted by ``etree.tostring``.

    Returns:
        The cleaned text of all elements joined in order; ``""`` when ``t_o``
        is empty.
    """
    pieces = []
    for element in t_o:
        markup = etree.tostring(element).decode("UTF-8", "strict")
        markup = re.sub(r"<.*?>", "", markup)
        markup = re.sub(r"\[\d*\]", "", markup)
        markup = re.sub(r"\n", "", markup)
        # Decode each reference directly with chr(). Building "\uXXXX"
        # escapes and running them through the unicode-escape codec broke
        # code points above U+FFFF and mangled literal backslashes.
        pieces.append(
            re.sub(r"&#(\d+);", lambda ref: chr(int(ref.group(1))), markup)
        )
    return "".join(pieces)
def parse_all(html):
    """Parse ``html`` and return one dict holding the entries of both tables.

    The lower table (``get_td2``) is read first and the upper table
    (``get_td``) second, so on a key collision the upper table's value wins.

    Args:
        html: HTML source of the page.

    Returns:
        A new dict mapping each extracted label to its text.
    """
    collected = dict()
    for fill in (get_td2, get_td):
        fill(html, collected)
    return collected
def get_str_fail(t_o):
    """Walk the given elements and gather all of their text.

    Known limitation (from the original author's remark): for each element
    the text directly inside it is emitted before the text of its child
    elements, so the characters can come out in the wrong order.

    Args:
        t_o: Iterable of elements supporting ``xpath``.

    Returns:
        The gathered text as a single ``str``.
    """
    pieces = []
    for node in t_o:
        # Direct text nodes first, each stripped of surrounding whitespace.
        pieces.extend(fragment.strip() for fragment in node.xpath("./text()"))
        children = node.xpath("./*")
        if isinstance(children, list):
            pieces.append(get_str(children))
    return "".join(pieces)