scrap.py
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
import time
from bs4 import BeautifulSoup
from urllib.request import urlopen
import pandas as pd
# launch Chrome and open the Joongang Ilbo main page
# (Selenium 4 style: elements are located via By instead of find_element_by_*)
driver = webdriver.Chrome(service=Service("path to chromedriver"))
url = "http://joongang.joins.com"
driver.get(url)

# open the search box and search for '코로나' (coronavirus)
driver.find_element(By.CLASS_NAME, 'icon_search').click()
search_input = driver.find_element(By.ID, "searchKeyword")
search_input.send_keys('코로나')
driver.find_element(By.ID, "btnSearch").click()

# expand the result list twice to reveal the pagination controls
driver.find_element(By.CLASS_NAME, "view_more").click()
driver.find_element(By.CLASS_NAME, "view_more").click()
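# Note: the crawl loop below relies on fixed time.sleep() calls, which are
# fragile if the site is slow. An explicit wait is usually more robust --
# a minimal sketch, assuming the result list keeps its ul.list_default markup:
#   from selenium.webdriver.support.ui import WebDriverWait
#   from selenium.webdriver.support import expected_conditions as EC
#   WebDriverWait(driver, 15).until(EC.presence_of_element_located(
#       (By.CSS_SELECTOR, "ul.list_default > li")))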
link = []
for i in range(1, 41):
    print('page {} ...'.format(i))
    time.sleep(15)

    # parse the current result page and collect every article URL
    html = driver.page_source
    soup = BeautifulSoup(html, "html.parser")
    items = soup.select("ul.list_default > li")
    for j in items:
        href = j.find('a', {'target': '_blank'})['href']
        link.append(href)

    # move on to the next result page (no navigation needed after the last one)
    if i == 40:
        continue
    if i % 10 == 0:
        # last page of a block (10, 20, 30) -> jump to the next block
        driver.find_element(By.CLASS_NAME, "btn_next").click()
    elif i == 1:
        # page 1 -> page 2 (the first page has an extra div in its layout)
        driver.find_element(By.XPATH, '//*[@id="content"]/div[3]/div[3]/div/a[1]').click()
    elif i < 10:
        # page n -> page n+1 (2 <= n <= 9)
        driver.find_element(By.XPATH, '//*[@id="content"]/div[2]/div[3]/div/a[{}]'.format(i)).click()
    else:
        # within a later block, click the link for the next page
        driver.find_element(By.XPATH, '//*[@id="content"]/div[2]/div[3]/div/a[{}]'.format(i % 10 + 1)).click()
# the browser is no longer needed; deduplicate the URLs and save them to CSV
driver.quit()
link = list(set(link))
links = pd.Series(link)
links.to_csv("../news_link.csv", index=False)
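# The saved links could be reloaded later without re-crawling -- a sketch,
# assuming pandas writes the unnamed Series under its default '0' header:
#   link = pd.read_csv("../news_link.csv")["0"].tolist()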
# sanity check: confirm the article body selector works on the first link
html = urlopen(link[0])
soup = BeautifulSoup(html, 'html.parser')
print(soup.select_one('div.article_body').get_text())
# download every article body and append it to a single text file
with open("/Users/solhee/data/news_joongang.txt", "a", encoding="utf-8") as file:
    for i, v in enumerate(link):
        print('processing article {} ...'.format(i + 1))
        print('url :', v)
        html = urlopen(v)
        soup = BeautifulSoup(html, 'html.parser')
        text = soup.select_one('div.article_body').get_text()
        print(text)
        file.write(text)
        time.sleep(30)
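# Note: an article without a div.article_body (e.g. a video-only page) would
# raise an AttributeError in the loop above; a guard could skip such pages:
#   body = soup.select_one('div.article_body')
#   if body is None:
#       continue
#   text = body.get_text()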