forked from merwin-asm/OpenCrawler
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrobots_txt.py
84 lines (55 loc) · 1.55 KB
/
robots_txt.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
"""
Open Crawler - robots.txt loader
"""
import requests as r
def get_robot_txt(site, proxies):
    """Fetch the robots.txt served by the host of ``site``.

    Args:
        site: A URL or bare host, optionally followed by a path. Any
            "https://" / "http://" text is removed and everything before the
            first "/" is used as the host name.
        proxies: Proxy mapping passed unchanged to ``requests.get``.

    Returns:
        The body of ``https://<host>/robots.txt`` when the server answers with
        status 200; otherwise "" (also when the request itself fails).
    """
    host = site.replace("https://", "").replace("http://", "").split("/")[0]
    robot_file_url = "https://" + host + "/robots.txt"

    try:
        res = r.get(robot_file_url, timeout=3, proxies=proxies)
    except r.RequestException:
        # Best effort: an unreachable or misbehaving host is treated as having
        # no robots.txt. Only request-level failures are swallowed, so
        # KeyboardInterrupt / SystemExit still propagate.
        return ""

    if res.status_code != 200:
        return ""
    return res.text
def get_lines(txt):
    """Return the meaningful lines of a robots.txt body.

    Comments are removed with ``remove_comments``; each remaining line is then
    stripped of surrounding whitespace and blank lines are dropped.

    Args:
        txt: Raw robots.txt content as a single string.

    Returns:
        A list of non-empty, whitespace-trimmed lines in their original order.
    """
    # Stripping also removes the "\r" left by CRLF files and the spaces left
    # in front of a trailing "# comment", both of which would otherwise break
    # exact comparisons on the line text.
    trimmed = (line.strip() for line in remove_comments(txt))
    return [line for line in trimmed if line]
def remove_comments(txt):
    """Split ``txt`` on newlines and cut every line at its first "#".

    Args:
        txt: Text to process as a single string.

    Returns:
        One list entry per input line, holding the part before the first "#"
        (the whole line when it has none). Lines are not trimmed, and a line
        that is only a comment becomes "".
    """
    return [row.partition("#")[0] for row in txt.split("\n")]
def disallowed(site, proxies):
    """Return the Disallow paths that a site's robots.txt sets for all crawlers.

    The robots.txt is fetched with ``get_robot_txt`` and split into lines with
    ``get_lines``. Only the first group that names the "*" user agent is read.

    Args:
        site: URL or host whose robots.txt should be inspected.
        proxies: Proxy mapping forwarded to ``get_robot_txt``.

    Returns:
        A list of path strings taken from the ``Disallow`` rules of the "*"
        group, in file order. Empty when there is no robots.txt, no "*" group,
        or the group disallows nothing.
    """
    lines = get_lines(get_robot_txt(site, proxies))

    paths = []
    recording = False   # inside a group that applies to "*"
    seen_rule = False   # that group has already had at least one rule line
    for line in lines:
        field, sep, value = line.partition(":")
        if not sep:
            continue  # not a "field: value" line
        field = field.strip().lower()  # field names are case-insensitive
        value = value.strip()

        if field == "user-agent":
            # Consecutive User-agent lines share one group, so the "*" group
            # only ends once a rule has been read after its header.
            if recording and seen_rule:
                break
            if value == "*":
                recording = True
        elif recording:
            seen_rule = True
            # An empty Disallow means "nothing is disallowed": record no path.
            if field == "disallow" and value:
                paths.append(value)
    return paths
if __name__ == "__main__":
    # Manual checks, left disabled so that running this module directly does
    # nothing. Both helpers take a `proxies` argument, which ends up in
    # requests.get (passing None presumably means "no explicit proxies" --
    # confirm against the callers):
    # print(get_robot_txt("https://developers.google.com/search/docs/crawling-indexing/robots/create-robots-txt", None))
    # print(disallowed("https://www.google.com/robots.txt", None))
    pass