1
1
#!/usr/bin/env python
2
2
# encoding=utf-8
3
- import requests ,re
3
+ import requests
4
+ import re
4
5
import codecs
5
6
from bs4 import BeautifulSoup
6
7
from openpyxl import Workbook
7
8
# Entry page of the listing; also used as the prefix for next-page links.
DOWNLOAD_URL = 'http://movie.douban.com/top250/'

# Path the workbook is saved to once scraping is finished.
dest_filename = '电影.xlsx'

# One workbook for the whole run; rows are written to its active sheet.
wb = Workbook()
ws1 = wb.active
ws1.title = "电影top250"
@@ -24,56 +25,58 @@ def download_page(url):
24
25
def get_li(doc):
    """Parse one listing page into parallel column lists.

    Args:
        doc: HTML markup of a listing page.

    Returns:
        A 5-tuple ``(name, star_con, score, info_list, next_url)``.
        The four lists are parallel (one entry per ``<li>`` in the
        ``ol.grid_view`` list): movie title, the text node containing
        '评价' (the vote-count text), rating text, and the short quote
        ('无' when the item has none). ``next_url`` is ``DOWNLOAD_URL``
        plus the ``href`` of the next-page link, or ``None`` when the
        page has no such link.
    """
    soup = BeautifulSoup(doc, 'html.parser')
    ol = soup.find('ol', class_='grid_view')
    # A page without the movie list (empty or unexpected markup) yields
    # no rows instead of failing with AttributeError on None.
    items = ol.find_all('li') if ol is not None else []

    name = []       # titles
    star_con = []   # vote-count text
    score = []      # ratings
    info_list = []  # short quotes

    for item in items:
        detail = item.find('div', attrs={'class': 'hd'})
        movie_name = detail.find(
            'span', attrs={'class': 'title'}).get_text()
        level_star = item.find(
            'span', attrs={'class': 'rating_num'}).get_text()
        star = item.find('div', attrs={'class': 'star'})
        star_num = star.find(text=re.compile('评价'))
        info = item.find('span', attrs={'class': 'inq'})

        name.append(movie_name)
        star_con.append(star_num)
        score.append(level_star)
        # Not every movie has a short quote; use a placeholder so the
        # four lists stay the same length.
        info_list.append(info.get_text() if info else '无')

    # The pagination span may be missing altogether, and on the last page
    # it carries no <a>; both cases mean there is no next page.
    next_span = soup.find('span', attrs={'class': 'next'})
    page = next_span.find('a') if next_span is not None else None
    next_url = DOWNLOAD_URL + page['href'] if page else None
    return name, star_con, score, info_list, next_url
52
54
53
55
54
56
def main():
    """Crawl every listing page and save the collected rows to the workbook.

    Starts at ``DOWNLOAD_URL`` and keeps following the next-page URL
    returned by ``get_li`` until it is falsy. Each movie is then written
    as one row of ``ws1`` (columns A-D: title, vote-count text, rating,
    short quote) and the workbook is saved to ``dest_filename``.
    """
    url = DOWNLOAD_URL
    name = []
    star_con = []
    score = []
    info = []
    while url:
        doc = download_page(url)
        movie, star, level_num, info_list, url = get_li(doc)
        name.extend(movie)
        star_con.extend(star)
        score.extend(level_num)
        info.extend(info_list)

    # Number rows by position. Looking the row up with name.index(title)
    # returns the first match, so two movies sharing a title would be
    # written to the same row (and the lookup is quadratic overall).
    rows = zip(name, star_con, score, info)
    for row, (title, votes, rating, quote) in enumerate(rows, start=1):
        ws1['A%s' % row] = title
        ws1['B%s' % row] = votes
        ws1['C%s' % row] = rating
        ws1['D%s' % row] = quote
    wb.save(filename=dest_filename)
77
79
80
+
78
81
if __name__ == '__main__' :
79
82
main ()
0 commit comments