# crawling.py
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from openpyxl import Workbook
from bs4 import BeautifulSoup
import time
import datetime
import os
import pandas as pd
from tqdm import tqdm
"""
네이버 리뷰에서 음식점을 정하고
해당 음식점에 리류를 작성한 유저의 닉네임과 링크를 크롤링
이후 엑셀에 저장
"""
def function1(url):
    cnt = 1
    # Create the workbook
    xlsx = Workbook()
    # Store results in a sheet named 'output'
    list_sheet = xlsx.create_sheet('output')
    list_sheet.append(['nickname', 'link'])
    now = datetime.datetime.now()
    # Run headless so no browser window appears while crawling
    options = webdriver.ChromeOptions()
    options.add_argument("headless")
    try:
        driver = webdriver.Chrome(options=options)
        # Open the restaurant page
        driver.get(url)
        driver.implicitly_wait(2)
        count = 0
        try:
            # Keep clicking "more" to load the profile link of every
            # reviewer who wrote a review for this restaurant
            while True:
                count += 1
                # Click the "more" element via its XPath
                driver.find_element(By.XPATH, '//*[@id="app-root"]/div/div/div/div[6]/div[2]/div[3]/div[2]/div/a/span').click()
                time.sleep(0.04)
                if count == 500:
                    break
        except Exception as e:
            print(e)
            print(count)
        time.sleep(1)
        html = driver.page_source
        bs = BeautifulSoup(html, 'lxml')
        reviews = bs.select('li.YeINN')
        # Extract each reviewer's nickname and profile link
        for r in reviews:
            nickname = r.select_one('div.VYGLG')
            link = r.select_one('a.p24Ki')['href']
            # Guard against a missing nickname element
            nickname = nickname.text if nickname else ''
            # Append the reviewer's nickname and link to the sheet
            list_sheet.append([nickname, link])
            time.sleep(0.06)
        driver.quit()
        time.sleep(0.06)
        # Save the file
        file_name = 'reviewer_link_' + now.strftime('%Y-%m-%d_%H-%M-%S') + '.xlsx'
        xlsx.save(file_name)
        cnt += 1
    except Exception as e:
        print("function1 error occurred")
        print(e)
        file_name = 'reviewer_link_' + now.strftime('%Y-%m-%d_%H-%M-%S') + '.xlsx'
        xlsx.save(file_name)
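
# Usage sketch for function1. The URL below is a hypothetical placeholder,
# not taken from the original script; pass a Naver Place restaurant review
# page URL. The call writes reviewer_link_<timestamp>.xlsx with one
# (nickname, link) row per reviewer.
# function1("https://m.place.naver.com/restaurant/12345678/review/visitor")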
"""
리뷰어의 개인 프로필 링크 접근
작성한 모든 리뷰를 볼 수 있는 페이지 이동
"""
def review_2(url_path):
    # Link to the reviewer's personal profile
    url = url_path
    options = webdriver.ChromeOptions()
    options.add_argument("headless")
    try:
        driver = webdriver.Chrome(options=options)
        driver.get(url)
        # Click the button that opens the reviewer's full review list
        reviewer_url_xpath = WebDriverWait(driver, 3).until(
            EC.presence_of_element_located((By.XPATH, "/html/body/div[1]/div/div[2]/div[1]/div[3]/div/ul/li[1]/button")))
        reviewer_url_xpath.click()
        time.sleep(1.2)
        get_url = driver.current_url
        driver.quit()
        return get_url
    except Exception as e:
        #print(e)
        return 0
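
# Usage sketch for review_2. The profile URL is a hypothetical placeholder;
# the function returns the URL of the reviewer's review-list page, or 0 if
# the button could not be found within the timeout.
# reviews_url = review_2("https://m.place.naver.com/my/abcdef/review")
# if reviews_url != 0:
#     print(reviews_url)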
"""
리뷰를 바탕으로 데이터 구축
"""
def function3(df_):
    now = datetime.datetime.now()
    # Save results to Excel
    xlsx = Workbook()
    # Store results in a sheet named 'output'
    list_sheet = xlsx.create_sheet('output')
    # Columns to crawl
    list_sheet.append(['name', 'restaurant', 'content', 'type', 'address', 'date', 'url'])
    # Crawling options
    options = webdriver.ChromeOptions()
    options.add_argument("headless")
    options.add_argument('window-size=1920x1080')
    options.add_argument("disable-gpu")
    # Start from each reviewer's profile link
    count = 0
    for url_ in tqdm(df_['link']):
        count += 1
        # Save an intermediate checkpoint every 100 links
        if count % 100 == 0:
            file_name = f'naver_review_{count}_.xlsx'
            xlsx.save(file_name)
        try:
            url = review_2(url_)
            if url == 0:
                continue
            driver = webdriver.Chrome(options=options)
            driver.get(url)
            time.sleep(1.5)
            html = driver.page_source
            bs = BeautifulSoup(html, 'lxml')
            # Parse the rendered HTML rather than driving each page element directly
            name = bs.select_one('header._2nqODz>div.jA_lkM>button.wTaI4v>h1._2LIPHf')
            name = name.text if name else ''
            # Scroll to the bottom so all reviews load for prolific reviewers
            for i in range(50):
                driver.find_element(By.TAG_NAME, 'body').send_keys(Keys.PAGE_DOWN)
                time.sleep(0.04)
            html = driver.page_source
            bs = BeautifulSoup(html, 'lxml')
            # Select each review block
            reviews = bs.select('div._27tH92')
            # Extract the fields we need from each review
            for r in reviews:
                restaurant = r.select_one('div.MF77ib>button.wTaI4v>span._1QGRWW')
                content = r.select_one('div._3-ITu7')
                type_ = r.select('div._2vBfgu>span.wzFIfJ')[0]
                address = r.select('div._2vBfgu>span.wzFIfJ')[1]
                date = r.select('div.pOj49R>div._15xwjO>div.hol3Ic>div>span._3nNYBi>time')[0]
                # Guard against missing elements
                restaurant = restaurant.text if restaurant else ''
                content = content.text if content else ''
                type_ = type_.text if type_ else ''
                address = address.text if address else ''
                date = date.text if date else ''
                list_sheet.append([name, restaurant, content, type_, address, date, url])
            time.sleep(0.06)
            driver.quit()
        except Exception:
            continue
    file_name = 'naver_review_full.xlsx'
    xlsx.save(file_name)
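
# Usage sketch for function3. It assumes a DataFrame with a 'link' column,
# e.g. the concatenated output of the function1 workbooks:
# df = pd.read_excel('concatenated_output.xlsx')
# function3(df)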

def read_all_excel_files_in_current_path(sheet_name='output'):
    current_path = os.getcwd()
    all_files = os.listdir(current_path)
    excel_files = [file for file in all_files if file.endswith(('.xlsx', '.xls'))]
    if not excel_files:
        print("No Excel files found in the current directory.")
        return None
    all_dataframes = []
    for excel_file in excel_files:
        file_path = os.path.join(current_path, excel_file)
        try:
            df = pd.read_excel(file_path, sheet_name=sheet_name)
            all_dataframes.append(df)
        except Exception as e:
            print(f"Error reading sheet '{sheet_name}' from '{excel_file}': {str(e)}")
    return all_dataframes

def concat_and_save_to_excel(dataframes, output_file='concatenated_output.xlsx'):
    concatenated_df = pd.concat(dataframes, ignore_index=True)
    concatenated_df.to_excel(output_file, index=False, engine='openpyxl')
    print(f"Concatenated data saved to {output_file}")

if __name__ == "__main__":
    now = datetime.datetime.now()
    print("Start time ", end='')
    print(now)
    current_path = os.getcwd()
    all_files = os.listdir(current_path)
    excel_files = [file for file in all_files if file.endswith(('.xlsx', '.xls'))]
    #import IPython; IPython.embed(colors='Linux'); exit(1)
    if "concatenated_output.xlsx" in all_files:
        df = pd.read_excel("concatenated_output.xlsx")
        print("concatenated_output.xlsx already exists!")
        function3(df)
    elif excel_files:
        print("Excel files exist!")
        result_dataframes = read_all_excel_files_in_current_path()
        concat_and_save_to_excel(result_dataframes)
        df = pd.read_excel("concatenated_output.xlsx")
        function3(df)
    else:
        # Restaurant URLs to crawl (fill in before running)
        text_path = [
        ]
        # Collect reviewer links from each restaurant URL - function1
        for url_path in text_path:
            function1(url_path)
        print("function1 end")
        result_dataframes = read_all_excel_files_in_current_path()
        concat_and_save_to_excel(result_dataframes)
        df = pd.read_excel("concatenated_output.xlsx")
        # function3
        function3(df)
    now = datetime.datetime.now()
    print("End time ", end='')
    print(now)