-
Notifications
You must be signed in to change notification settings - Fork 3
/
01.download_1.py
80 lines (69 loc) · 2.36 KB
/
01.download_1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
#!/usr/bin/python3
import os
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas
import time
# Download every article listed in the metadata CSV from aisixiang.com and
# save its text to "<availability>-<id>-<first CSV column>.txt".
#
# Availability codes appended to Av.txt, one "<code>_<id>," line per article:
#   0 - page fetched but it has no element with id "content2"
#   1 - article text found
#   3 - the article page could not be fetched


def _append_line(path, line):
    """Append ``line`` to the text file at ``path``.

    The ``with`` block guarantees the handle is flushed and closed even if
    the write raises, so a long scrape does not accumulate open log files.
    """
    with open(path, 'a', encoding="utf-8") as log:
        log.write(line)


def _download(url):
    """Return the raw bytes served at ``url``, or ``None`` if the request fails.

    ``except Exception`` keeps the scrape best-effort for any network or
    HTTP error while still letting ``KeyboardInterrupt``/``SystemExit``
    propagate, so Ctrl-C stops the script instead of being recorded as a
    broken URL. The caller is responsible for logging the failure.
    """
    try:
        with urlopen(url) as response:
            return response.read()
    except Exception:
        return None


def _fetch_paged_text(article_id, url, last_page):
    """Return the concatenated text of pages 1..``last_page`` of an article.

    Page ``k`` lives at ``<url without ".html">-<k>.html``. A page that
    cannot be fetched, or that has no ``id="content2"`` element, is recorded
    in broken2.txt as "<id>-<page url>," and skipped.
    """
    stem = url[:-len(".html")]
    parts = []
    for k in range(1, last_page + 1):
        page_url = stem + '-' + str(k) + '.html'
        print(page_url)
        body = _download(page_url)
        content = None
        if body is not None:
            content = BeautifulSoup(body, "html.parser").find(id="content2")
        if content is None:
            broken = str(article_id) + '-' + page_url + ',' + '\n'
            _append_line("broken2.txt", broken)
            print(broken)
        else:
            parts.append(content.get_text())
        if body is not None:
            # Pause between successful requests to go easy on the server.
            time.sleep(1)
    return ''.join(parts)


# Read the catalogue metadata produced by 00.get_metadata.R.
D0 = pandas.read_csv("all_aisixiang_2017-05-24.csv")
# After an unexpected interruption, set j to the row index to resume from.
j = 0
D = D0[j:]

# ``row`` is the positional index into D0 of the article being processed;
# it starts at j because D is D0 with the first j rows dropped.
for row, i in enumerate(D['ID'], start=j):
    Url = "http://www.aisixiang.com/data/" + str(i) + ".html"
    print(Url)

    html = _download(Url)
    if html is None:
        Broken = str(i) + '-' + Url + ',' + '\n'
        _append_line("broken-new.txt", Broken)
        print(Broken)
        _append_line("Av.txt", str(3) + '_' + str(i) + ',' + '\n')
        continue

    Soup = BeautifulSoup(html, "html.parser")
    Article = Soup.find(id="content2")
    Article_page = ''
    if Article is None:
        Availability = 0
    else:
        Availability = 1
        Page = Soup.find(class_="list_page")
        if Page is None:
            # Single-page article: the text is already on this page.
            Article_page = Article.get_text()
        else:
            # Presumably the second-to-last link in the pager holds the
            # highest page number (the last one being a "next" link) --
            # TODO confirm against the site's markup.
            N = int(Page.find_all("a")[-2].get_text())
            Article_page = _fetch_paged_text(i, Url, N)

    # str() guards against a non-string first column (e.g. a missing value
    # read as NaN); '/' is removed so it is not treated as a path separator.
    Name = str(Availability) + '-' + str(i) + '-' + str(D0.iloc[row, 0]) + '.txt'
    Name = Name.replace('/', '')
    # Explicit UTF-8 so the article text is written the same way on every
    # platform regardless of the locale's default encoding.
    with open(Name, 'w', encoding="utf-8") as f:
        f.write(Article_page)
    print(Name + '\n')
    time.sleep(1)
    _append_line("Av.txt", str(Availability) + '_' + str(i) + ',' + '\n')