official_vn_news_crawl.py
from selenium import webdriver
from selenium.webdriver.common.by import By
from tqdm.notebook import tqdm_notebook
import time
import csv
from selenium.webdriver.edge.service import Service
from selenium.webdriver.edge.options import Options
# Configure EdgeOptions for headless mode
edge_options = Options()
edge_options.add_argument('--headless')  # the Options.headless attribute is deprecated in recent Selenium releases
# Set the WebDriver executable path (should point to the msedgedriver binary)
webdriver_path = '/home/hieu/Desktop/edge_driver/'
# Create the WebDriver instance
driver = webdriver.Edge(service=Service(webdriver_path), options=edge_options)
# specify the URL of the website to crawl
url = "https://vnexpress.net/"
# maximize the browser window
driver.maximize_window()
# navigate to the URL
driver.get(url)
# wait for the page to load
time.sleep(2)
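# Note: the fixed sleep is a simple heuristic. A sketch of an explicit-wait
# alternative (WebDriverWait and expected_conditions are standard Selenium
# helpers; the 10-second timeout is an illustrative choice, not from the
# original script):
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC
# WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, 'body')))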
# Open the file and read the lines
# with open("../20_order_sub_cat.txt", "r") as file:
#     lines = file.readlines()
#     file.close()

# with open('py_20_news_links.txt', 'w') as f:
#     # Iterate over each line in the file
#     for url in tqdm_notebook(lines, desc="List all links", leave=True):
#         # Remove any leading or trailing whitespace
#         url = url.strip()
#         driver.get(url)
#         try:
#             articles_frame = driver.find_element(By.ID, 'automation_TV0')
#             all_articles_links = articles_frame.find_elements(By.CLASS_NAME, 'thumb-art')
#             for article_link in all_articles_links:
#                 links_in_a_tag = article_link.find_elements(By.TAG_NAME, 'a')
#                 for link_a in links_in_a_tag:
#                     href = link_a.get_attribute('href')
#                     if href and href.startswith('https://vnexpress.net/'):
#                         # article_urls.append(href)
#                         f.write(href + '\n')
#                         print(href)
#         except Exception as e:
#             print(url)
#             print(e)
#             pass
# Define the CSV file path
csv_file_path = 'official_1_data.csv'
field_names = ['Category', 'Sub Category', 'Title', 'Description', 'Content']
# Open the file and read the lines (the with-block closes it automatically)
with open("./official_1_news_urls.txt", "r") as file:
    lines = file.readlines()
# Create the CSV file and write the header row
with open(csv_file_path, 'w', newline='') as file:
    writer = csv.DictWriter(file, fieldnames=field_names)
    writer.writeheader()
    # Iterate over each URL in the file
    for url in tqdm_notebook(lines, desc="List all links", leave=True):
        # Remove any leading or trailing whitespace
        url = url.strip()
        driver.get(url)
        time.sleep(1)
        print(driver.current_url)
        try:
            # Crawl the data
            article_category = driver.find_element(By.CSS_SELECTOR, '#dark_theme > section.section.page-detail.top-detail > div > div.sidebar-1 > div.header-content.width_common > ul > li:nth-child(1) > a')
            article_sub_category = driver.find_element(By.CSS_SELECTOR, '#dark_theme > section.section.page-detail.top-detail > div > div.sidebar-1 > div.header-content.width_common > ul > li:nth-child(2) > a')
            # article_time = driver.find_element(By.CSS_SELECTOR, '#dark_theme > section.section.page-detail.top-detail > div > div.sidebar-1 > div.header-content.width_common > span')
            article_title = driver.find_element(By.CSS_SELECTOR, '#dark_theme > section.section.page-detail.top-detail > div > div.sidebar-1 > h1')
            article_des = driver.find_element(By.CSS_SELECTOR, '#dark_theme > section.section.page-detail.top-detail > div > div.sidebar-1 > p')
            article_text = driver.find_element(By.CSS_SELECTOR, '#dark_theme > section.section.page-detail.top-detail > div > div.sidebar-1 > article')
            article_paragraphs = article_text.find_elements(By.CSS_SELECTOR, 'p.Normal')
            content = '\n'.join([e.text for e in article_paragraphs])
            # Create a dictionary with the crawled data
            article_data = {
                'Category': article_category.text,
                'Sub Category': article_sub_category.text,
                'Title': article_title.text,
                'Description': article_des.text,
                'Content': content
            }
            # Append the data to the CSV file
            writer.writerow(article_data)
            # Print the data
            print(f"Cat: {article_category.text}")
            print(f"Sub Cat: {article_sub_category.text}")
            print(f"Title: {article_title.text}")
            print(f"Des: {article_des.text}")
            print(f"Content: {content}")
        except Exception as e:
            print(f"Error URL is: {url}")
            print(e)
print(f"Data has been exported to '{csv_file_path}'.")
# Close the browser
driver.quit()
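# A minimal sketch of how the exported CSV could be inspected afterwards
# (assumes pandas is installed; not part of the original crawl flow):
# import pandas as pd
# df = pd.read_csv('official_1_data.csv')
# print(df.head())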