-
Notifications
You must be signed in to change notification settings - Fork 0
/
HtmlToMarkdown.py
86 lines (70 loc) · 2.32 KB
/
HtmlToMarkdown.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
from html2text import HTML2Text, html2text # pip install html2text
import os
import requests
class HtmlToMarkdown(object):
    """Convert HTML (a URL or a local .html file) to Markdown or plain text.

    Attributes:
        source: URL or file path of the HTML document to convert.
        html_handle: ``HTML2Text`` converter instance used by ``text()``.
    """

    def __init__(self, source):
        """Store the source and create the converter.

        Args:
            source: URL (anything starting with "http") or path to an
                HTML file.
        """
        self.source = source
        self.html_handle = HTML2Text()

    @property
    def source_type(self):
        """Return ``'http'`` if the source starts with "http", else ``'file'``.

        Leading/trailing whitespace is ignored for this check only.
        """
        return 'http' if self.source.strip().startswith('http') else 'file'

    @property
    def content(self):
        """Return the raw HTML of the source as a string.

        For an ``'http'`` source the page is fetched with ``requests``
        through the hard-coded local SOCKS5 proxy and decoded as UTF-8;
        on any failure the exception is printed and ``''`` is returned
        (best-effort). Any other source is read as a UTF-8 text file.

        Raises:
            OSError: If a file source cannot be opened.
        """
        if self.source_type != 'http':
            # Read the instance's own source (not a module-level global)
            # and close the file deterministically.
            with open(self.source, 'r', encoding='UTF-8') as html_file:
                return html_file.read()

        headers = {
            'Referer': 'https://www.google.com/',
            'sec-fetch-mode': 'navigate',
            'upgrade-insecure-requests': '1',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'
        }
        proxies = {
            # Authenticated form: 'https': 'socks5://user:pass@host:port',
            'http': 'socks5://127.0.0.1:1086',
            'https': 'socks5://127.0.0.1:1086',
        }
        try:
            response = requests.get(
                self.source,
                headers=headers,
                proxies=proxies,
                timeout=(3.05, 27)  # (connection, read)
            )
            return response.content.decode('utf-8')
        except Exception as e:
            # Deliberate best-effort: report the failure and fall back to
            # an empty document instead of propagating.
            print(e)
            return ''

    def markdown(self):
        """Convert the source to Markdown.

        Returns:
            The Markdown rendering of ``content`` as a string.
        """
        # NOTE(review): these flags are set on self.html_handle, but the
        # conversion below goes through the module-level html2text()
        # helper, which (per the html2text docs) uses its own converter --
        # so they presumably do not affect this method's output. They do
        # persist on the handle and therefore apply to later text() calls.
        # Confirm which behavior is intended before changing it.
        self.html_handle.ignore_links = True
        self.html_handle.ignore_images = True
        return html2text(self.content)

    def text(self):
        """Convert the source to text with links stripped.

        Returns:
            The output of ``html_handle.handle`` for ``content``.
        """
        self.html_handle.ignore_links = True
        return self.html_handle.handle(self.content)
# Demo run: fetch one page and print its Markdown rendering.
# Alternative sources (swap into `source` as needed):
#   'https://www.baidu.com/'
#   'https://www.twitter.com/'
#   'https://litets.com/article/2019/4/3/103.html'
#   './baidu.html'
#   'https://zh.wikipedia.org/wiki/%E9%9F%93%E5%9C%8B%E9%83%A8%E7%BD%B2%E8%96%A9%E5%BE%B7%E5%8F%8D%E5%B0%8E%E5%BD%88%E7%B3%BB%E7%B5%B1%E4%BA%8B%E4%BB%B6'
#   'https://www.runoob.com/html/html-tables.html'
source = 'https://www.google.com/'
obj = HtmlToMarkdown(source)
# Other views of the same object: obj.source, obj.content, obj.text()
print(obj.markdown())