-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhtml2md.py
142 lines (113 loc) · 3.72 KB
/
html2md.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
from html.parser import HTMLParser
class Html2Markdown(HTMLParser):
    """Translate HTML into Markdown text through ``HTMLParser`` callbacks.

    Feed markup with ``feed()`` (then ``close()``) and read the converted
    text from the ``output`` property.
    """

    # raw markdown text accumulated by the handlers
    __output = ''
    # unique token standing for "the next text node" inside __content
    __placeholder = '3f4ec2b893ce4f1ab0f4c0861ef9dae7'
    # template applied to the next text node (set by <a href> and <code>)
    __content = ''
    # True between <figcaption> and </figcaption>: plain text is dropped
    __hidden = False
    # False between <li> and </li>: <p> and <br> emit no line break
    __newline = True
    # True between <blockquote> and </blockquote>: <li> emits '>' not '* '
    __prefix = False
    # switched off between <code> and </code>; written but never read here
    __no_space = True
    # True between <pre> and </pre>: <code> uses the multi-line fence
    __in_pre = False
    # replacement rules per tag - 0: start tag, 1: end tag
    # (shared by every instance, so it must never be mutated)
    __rule_replacement = {
        'a': ('', ''),
        'blockquote': ('\n', '\n'),
        'code': (' ``` ', ' ``` '),
        'div': ('', '\n'),
        'h1': ('# ', '\n'),
        'h2': ('## ', '\n'),
        'h3': ('### ', '\n'),
        'h4': ('#### ', '\n'),
        'h5': ('##### ', '\n'),
        'h6': ('###### ', '\n'),
        'hr': ('', ' ----- \n'),
        'img': ('', '\n\n'),
        'p': ('', '\n'),
        'pre': ('', '\n'),
        'strong': (' **', '** '),
        'ul': ('\n', '\n'),
    }
    # fence used for <code> while inside <pre>
    __pre_code_replacement = (' ``` \n', ' \n ``` ')
    # tags that have no end tag; they may arrive with or without a
    # trailing slash and must be treated the same way in both spellings
    __void_tags = frozenset(('br', 'hr', 'img'))

    @property
    def output(self):
        """Return the accumulated markdown after whitespace clean-up."""
        r = self.__output
        # NOTE(review): this removes every space, including the one after
        # '#'/'*' markers and those inside code; it also means the
        # '``` ```' clean-up below can never match. Kept as-is because the
        # intended behaviour cannot be determined from this file - confirm.
        r = r.replace(' ', '')
        r = r.replace('\n\n\n', '\n')
        r = r.replace('****', '')
        r = r.replace('``` ```', '')
        return r.strip()

    def default_parse(self, tag, alone):
        """Append the table-driven replacement for ``tag``.

        ``alone is True`` selects the start-tag text, anything else the
        end-tag text. Tags without a rule add nothing.
        """
        rule = self.__rule_replacement.get(tag)
        if rule is None:
            return
        if tag == 'code' and self.__in_pre:
            rule = self.__pre_code_replacement
        self.__output += rule[0 if alone is True else 1]

    def handle_starttag(self, tag, attrs):
        """Update parser state for an opening tag and emit its prefix."""
        if tag in self.__void_tags:
            # '<br>' / '<img ...>' without a trailing slash land here
            # rather than in handle_startendtag; treat both alike.
            self.handle_startendtag(tag, attrs)
            return
        if tag == 'a':
            for key, val in attrs:
                # a valueless 'href' has val None; skip it so the link
                # target is never rendered as the text 'None'
                if key == 'href' and val is not None:
                    self.__content = f'[{self.__placeholder}]({val})'
        elif tag == 'blockquote':
            self.__prefix = True
        elif tag == 'code':
            self.__content = self.__placeholder
            self.__no_space = False
        elif tag == 'figcaption':
            self.__hidden = True
        elif tag == 'li':
            self.__output += '>' if self.__prefix is True else '* '
            self.__newline = False
        elif tag == 'p':
            if self.__newline is True:
                self.__output += '\n'
        elif tag == 'pre':
            self.__in_pre = True
        self.default_parse(tag, True)

    def handle_data(self, data):
        """Append a text node, applying any pending template."""
        data = data.replace('\xa0', '')
        if self.__content:
            self.__output += self.__content.replace(self.__placeholder, data)
            self.__content = ''
        elif self.__hidden is False:
            self.__output += data

    def handle_endtag(self, tag):
        """Restore parser state for a closing tag and emit its suffix."""
        if tag in ('a', 'code'):
            # an element that closed without text must not turn the next
            # unrelated text node into a link / code span
            self.__content = ''
        if tag == 'blockquote':
            self.__output += '\n'
            self.__prefix = False
        elif tag == 'code':
            self.__no_space = True
        elif tag == 'figcaption':
            self.__hidden = False
        elif tag == 'li':
            self.__newline = True
        elif tag == 'pre':
            self.__in_pre = False
        self.default_parse(tag, False)

    def handle_startendtag(self, tag, attrs):
        """Handle a tag that has no separate end tag (br, hr, img, ...)."""
        if tag == 'br':
            if self.__newline is True:
                self.__output += '\n'
        elif tag == 'img':
            url = self.__image_url(attrs)
            if url:
                self.__output += f'![]({url})'
        # such a tag is complete at once, so emit its end-tag replacement
        self.default_parse(tag, False)

    def __image_url(self, attrs):
        """Return the image address: 'data-src' if set, otherwise 'src'."""
        found = {key: val for key, val in attrs if val}
        return found.get('data-src') or found.get('src')