# testparse.py
import csv

import requests
from bs4 import BeautifulSoup

# Dependencies:
#   pip install beautifulsoup4
#   pip install lxml

# Plan
# 1. Find out how many result pages there are
# 2. Build the URLs of the result pages
# 3. Collect the review data from each page
def get_html(url):
    """Download the page at `url` and return its HTML as text."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_0_1) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/87.0.4280.88 Safari/537.36'
    }
    r = requests.get(url, headers=headers)
    return r.text
def get_total_pages(html):
    """Read the total number of result pages from the pagination block."""
    soup = BeautifulSoup(html, 'lxml')
    # The last pagination link points at the final page, e.g. ".../otzyvy/?page=NN"
    pages = soup.find('ul', class_='aRWg5').find_all('a', class_='_2xCrO')[-1].get('href')
    total_pages = pages.split('=')[1]
    return int(total_pages)
def write_csv(data):
    """Append one review as a row of reviews_data_base.csv."""
    # newline='' avoids blank rows on Windows; UTF-8 keeps Cyrillic text intact
    with open('reviews_data_base.csv', 'a', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow((
            data['company'], data['city'], data['product'], data['header'],
            data['full_text'], data['mark'], data['date'], data['link'],
        ))
def get_page_data(html):
    """Parse every review card on a result page and save its fields to the CSV."""
    soup = BeautifulSoup(html, 'lxml')
    reviews = soup.find('div', class_='_1ZC3Z').find_all('div', class_='_227VT')
    for review in reviews:
        # The company slug is taken from the href of the first link in the card
        name = review.find('div', class_='_3bNvn').find_all('a')[0].get('href').split('/')[2]
        if 'rosgosstrah' in name:
            # Only reviews that are not about Rosgosstrakh are kept
            continue

        # Fields: company, city, product, header, full_text, mark, date, link.
        # Each field is wrapped in try/except so one missing element
        # does not drop the whole review.
        try:
            company = review.find('div', class_='_3bNvn').find_all('a')[0].get('href').split('/')[2]
        except Exception:
            company = ''
        try:
            city = review.find('div', class_='_3bNvn').find_all('div', class_='_2iHTj')[0].text.split(', ')[1]
        except Exception:
            city = ''
        try:
            product = review.find('div', class_='Ws2f2').find_all('span', class_='_1RwOX')[0].text
        except Exception:
            product = ''
        try:
            header = review.find('a', class_='mrfZC').find_all('div', class_='_3SgnA _2mg0e')[0].text
        except Exception:
            header = ''
        try:
            full_text = review.find('div', class_='_3p0dD').find_all('p')[0].text
        except Exception:
            full_text = ''
        try:
            mark = int(review.find('div', class_='_3bNvn').find_all('span', class_='_1OBr6')[0].text)
        except Exception:
            mark = ''
        try:
            # The meta line is comma-separated; items 2 and 3 hold the date and time
            meta = review.find('div', class_='_3bNvn').find_all('div', class_='_2iHTj')[0].text
            date = meta.split(', ')[2] + ' ' + meta.split(', ')[3]
        except Exception:
            date = ''
        try:
            link = 'https://www.sravni.ru' + review.find('a', class_='mrfZC').get('href')
        except Exception:
            link = ''

        data = {
            'company': company,
            'city': city,
            'product': product,
            'header': header,
            'full_text': full_text,
            'mark': mark,
            'date': date,
            'link': link,
        }
        write_csv(data)
def main():
    url = "https://www.sravni.ru/strakhovye-kompanii/otzyvy/"
    base_url = "https://www.sravni.ru/strakhovye-kompanii/otzyvy/?"
    page_part = "page="

    total_pages = get_total_pages(get_html(url))
    # range() is half-open, so +1 is needed to include the last page
    for i in range(1, total_pages + 1):
        url_gen = base_url + page_part + str(i)
        html = get_html(url_gen)
        get_page_data(html)


if __name__ == '__main__':
    main()
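
# A minimal way to run the scraper, assuming the dependencies listed above are
# installed and the script is saved as testparse.py:
#
#   python testparse.py
#
# Each parsed review is appended as one row to reviews_data_base.csv in the
# current working directory.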