-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrap_to_json.py
89 lines (67 loc) · 2.43 KB
/
scrap_to_json.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import json
from datetime import datetime
import requests
from bs4 import BeautifulSoup
import lxml
# Root of the site being scraped; relative hrefs are appended to it verbatim.
BASE_URL = "https://quotes.toscrape.com"

# Module-level accumulators filled in while scraping:
#   quotes_json       - one dict per quote
#   authors_json      - one dict per author page
#   authors_link_list - hrefs of author pages (a set, so duplicates collapse)
quotes_json, authors_json = [], []
authors_link_list = set()
def parse_quotes(soup):
    """Collect every quote found in *soup* into the module-level ``quotes_json``.

    Each ``<div class="quote">`` becomes one dict with three keys:
    ``tags`` (text of every ``<a class="tag">`` inside it), ``quote``
    (text of its ``<span class="text">``) and ``author`` (text of its
    ``<small class="author">``).

    Args:
        soup: Parsed page exposing ``find_all``; the elements it returns
            must expose ``find``/``find_all``/``get_text``.

    Raises:
        AttributeError: If a quote block lacks the text span or the author
            element, because ``find`` then yields ``None``.
    """
    for block in soup.find_all("div", {"class": "quote"}):
        tag_names = [
            anchor.get_text() for anchor in block.find_all("a", {"class": "tag"})
        ]
        quote_text = block.find("span", {"class": "text"}).get_text()
        author_name = block.find("small", {"class": "author"}).get_text()
        quotes_json.append(
            {"tags": tag_names, "quote": quote_text, "author": author_name}
        )
def parse_authors():
    """Download each collected author page and record its details.

    For every href in the module-level ``authors_link_list`` the page at
    ``BASE_URL + href`` is fetched and parsed, and a dict with the keys
    ``fullname``, ``born_date``, ``born_location`` and ``description`` is
    appended to the module-level ``authors_json``.

    Raises:
        requests.RequestException: If a request times out, fails, or the
            server answers with an HTTP error status.
        AttributeError: If a page lacks one of the expected elements,
            because ``find`` then yields ``None``.
    """
    # Iterate in sorted order: plain set iteration over strings can differ
    # from run to run, which would reshuffle the output for no reason.
    for author_path in sorted(authors_link_list):
        # Without a timeout a stalled connection would block forever.
        auth_response = requests.get(f"{BASE_URL}{author_path}", timeout=10)
        # Fail loudly on 4xx/5xx instead of parsing an error page.
        auth_response.raise_for_status()
        auth_soup = BeautifulSoup(auth_response.content, "lxml")

        author_description = auth_soup.find(
            "div", {"class": "author-description"}
        ).get_text(strip=True)
        author_name = auth_soup.find("h3", {"class": "author-title"}).get_text()
        author_born_date = auth_soup.find(
            "span", {"class": "author-born-date"}
        ).get_text()
        author_born_location = auth_soup.find(
            "span", {"class": "author-born-location"}
        ).get_text()

        authors_json.append(
            {
                "fullname": author_name,
                "born_date": author_born_date,
                "born_location": author_born_location,
                "description": author_description,
            }
        )
def write_json(file_name, items_list):
    """Serialize *items_list* to ``<file_name>.json`` as UTF-8 text.

    The output is indented by four spaces and non-ASCII characters are
    written as-is rather than as ``\\uXXXX`` escapes. An existing file at
    the target path is overwritten.

    Args:
        file_name: Target path without the ``.json`` suffix.
        items_list: JSON-serializable object to write.
    """
    target_path = f"{file_name}.json"
    with open(target_path, mode="w", encoding="utf-8") as out_handle:
        json.dump(items_list, out_handle, ensure_ascii=False, indent=4)
def main():
    """Scrape every listing page, then the author pages, and write both to JSON.

    Starting at ``BASE_URL``, each listing page is fetched and parsed:
    its quotes are collected via ``parse_quotes`` and every class-less
    anchor whose href contains ``"author"`` is added to the module-level
    ``authors_link_list``. Pagination follows the link inside
    ``<li class="next">`` until no such link is found. Afterwards
    ``parse_authors`` is run and the results are written to
    ``quotes.json`` and ``authors.json``.

    Raises:
        requests.RequestException: If a listing request times out, fails,
            or the server answers with an HTTP error status.
    """
    next_page = ""
    while next_page is not None:
        # Without a timeout a stalled connection would block forever.
        response = requests.get(f"{BASE_URL}{next_page}", timeout=10)
        # Fail loudly on 4xx/5xx instead of silently ending with partial data.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "lxml")
        parse_quotes(soup)

        # Only anchors without a class attribute are considered.
        for link in soup.find_all("a", {"class": None}):
            href = link.get("href")
            # An anchor may have no href at all; skip it rather than fail
            # on the substring test.
            if href and "author" in href:
                authors_link_list.add(href)

        # Follow the "next" pagination link; any missing piece ends the loop.
        next_item = soup.find("li", {"class": "next"})
        next_anchor = next_item.find("a") if next_item is not None else None
        next_page = next_anchor.get("href") if next_anchor is not None else None

    parse_authors()
    write_json("quotes", quotes_json)
    write_json("authors", authors_json)
if __name__ == "__main__":
    # Run the scraper and report the elapsed wall-clock time in whole seconds.
    start = datetime.now()
    main()
    # total_seconds() spans the whole interval; the .seconds attribute holds
    # only the sub-day component and would wrap around after 24 hours.
    print(int((datetime.now() - start).total_seconds()))