Skip to content

Commit 386c931

Browse files
author
manycoding
committed
Refactor
1 parent e4802a2 commit 386c931

File tree

2 files changed

+94
-80
lines changed

2 files changed

+94
-80
lines changed

amazon_parser.py

+8-80
Original file line numberDiff line numberDiff line change
@@ -1,90 +1,18 @@
1-
import re
2-
import router
3-
import numpy as np
41
import matplotlib.pyplot as plt
5-
from bs4 import BeautifulSoup
6-
from operator import itemgetter
7-
8-
9-
def get_department_urls():
10-
"""Get links of all first level departments."""
11-
soup = BeautifulSoup(router.do_get(
12-
"https://www.amazon.com/Best-Sellers/zgbs/").text, "html.parser")
13-
14-
return [a.attrs["href"] for a in soup.find_all(
15-
"a",
16-
href=re.compile("https://www.amazon.com/Best-Sellers-"))]
17-
18-
19-
def get_bestsellers_data(url):
20-
"""Parse name, rating, reviews and price from department url"""
21-
# print(url)
22-
dep_bestsellers = []
23-
b_soup = BeautifulSoup(router.do_get(url).text, "html.parser")
24-
# Find first bestseller data
25-
bestsellers = b_soup.find_all("div", class_="zg_itemImmersion")
26-
for b in bestsellers:
27-
name = b.find("a").text.strip()
28-
rating = float(b.find("a", href=re.compile(
29-
"/product-reviews")).text.split()[0])
30-
reviews = int(b.find("a", class_="a-size-small").text.
31-
replace(",", ""))
32-
price = b.find("span", class_="p13n-sc-price").text[1:]
33-
34-
if reviews > 300:
35-
b = {"name": name, "rating": rating, "reviews": reviews,
36-
"price": float(price)}
37-
dep_bestsellers.append(b)
38-
39-
return dep_bestsellers
40-
41-
42-
def get_percentile(list, p):
43-
a = np.array(list)
44-
return np.percentile(a, 80)
45-
46-
47-
def get_bestsellers(urls):
48-
"""Create list from each department"""
49-
bestsellers = []
50-
for u in urls:
51-
try:
52-
b = get_bestsellers_data(u)
53-
bestsellers += b
54-
except Exception as e:
55-
print("Skipping {}".format(u))
56-
print(str(e))
57-
return bestsellers
58-
59-
60-
def filter_data(dict, key, value):
61-
return list(filter(lambda b: b['price'] < value, dict))
2+
from parser import Parser
623

634

645
def main():
65-
dep_urls = get_department_urls()
66-
bestsellers = get_bestsellers(dep_urls)
67-
print("Found {} bestsellers".format(len(bestsellers)))
68-
69-
# Remove duplicates
70-
bestsellers = [dict(t) for t in set([tuple(d.items())
71-
for d in bestsellers])]
72-
print("Removed duplcates\n {}".format(len(bestsellers)))
73-
74-
sorted_b = sorted(bestsellers, key=itemgetter('price', 'reviews'))
75-
p = get_percentile([b['price'] for b in sorted_b], 95)
76-
print("Percentile value is: {}".format(p))
77-
78-
# Filter by percentile
79-
filtered_b = filter_data(dict=sorted_b, key='price', value=p)
80-
print("Filtered to {}".format(len(filtered_b)))
6+
p = Parser()
7+
b_data = p.get_data()
818

82-
x_values = [b['price'] for b in filtered_b]
83-
y_values = [b['reviews'] for b in filtered_b]
9+
x_values = [b['price'] for b in b_data]
10+
y_values = [b['reviews'] for b in b_data]
8411

8512
# Print the name of top item
86-
top_name = [b['name'] for b in filtered_b if b['reviews'] == max(y_values)]
87-
print("The most reviewed is {}".format(top_name))
13+
print(
14+
"The most reviewed is\n {name} {reviews} {rating} {price}".
15+
format(**p.top))
8816

8917
# Set the size of the plotting window.
9018
plt.figure(dpi=128, figsize=(10, 6))

parser.py

+86
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
import re
2+
import router
3+
import numpy as np
4+
from bs4 import BeautifulSoup
5+
from operator import itemgetter
6+
7+
8+
class Parser:
    """Scrape Amazon Best Sellers, collect bestseller data and filter it.

    Usage: ``Parser().get_data()`` returns a de-duplicated, price-sorted
    list of bestseller dicts filtered below the ``PERCENTILE``-th price
    percentile.  After ``get_data()`` runs, ``self.top`` holds the
    most-reviewed item seen.
    """

    # Price percentile used as the upper cutoff when filtering.
    PERCENTILE = 95

    def get_department_urls(self):
        """Get links of all first level departments."""
        soup = BeautifulSoup(router.do_get(
            "https://www.amazon.com/Best-Sellers/zgbs/").text, "html.parser")

        return [a.attrs["href"] for a in soup.find_all(
            "a",
            href=re.compile("https://www.amazon.com/Best-Sellers-"))]

    def get_department_bestsellers(self, url):
        """Parse name, rating, reviews and price from a department url.

        Only items with more than 300 reviews are kept.  Also tracks the
        most-reviewed item seen so far in ``self.top`` / ``self.max_reviews``
        (initialised by ``get_bestsellers``).
        """
        dep_bestsellers = []
        b_soup = BeautifulSoup(router.do_get(url).text, "html.parser")
        # Each bestseller entry lives in a zg_itemImmersion container.
        for item in b_soup.find_all("div", class_="zg_itemImmersion"):
            name = item.find("a").text.strip()
            # Rating text begins with the numeric score -> take first token.
            rating = float(item.find("a", href=re.compile(
                "/product-reviews")).text.split()[0])
            # Review counts use thousands separators, e.g. "1,234".
            reviews = int(item.find("a", class_="a-size-small").text
                          .replace(",", ""))
            # Price text starts with a currency symbol; drop it.
            price = item.find("span", class_="p13n-sc-price").text[1:]

            if reviews > 300:
                b = {"name": name, "rating": rating, "reviews": reviews,
                     "price": float(price)}
                # Track the most reviewed item across all departments.
                if reviews > self.max_reviews:
                    self.top = b
                    self.max_reviews = reviews
                dep_bestsellers.append(b)

        return dep_bestsellers

    def get_percentile(self, list, p):
        """Return the p-th percentile of the given values.

        BUG FIX: the percentile was hard-coded to 80, silently ignoring
        ``p`` even though callers pass ``PERCENTILE`` (95) and the log
        message in ``get_data`` claims the 95th percentile is used.
        NOTE(review): parameter name ``list`` shadows the builtin; kept
        unchanged for backward compatibility with keyword callers.
        """
        return np.percentile(np.array(list), p)

    def get_bestsellers(self, urls):
        """Collect bestsellers from every department url, skipping failures."""
        self.max_reviews = 0  # reset most-reviewed tracking for this run
        bestsellers = []
        for u in urls:
            try:
                bestsellers += self.get_department_bestsellers(u)
            except Exception as e:
                # Best-effort scraping: a broken department page is skipped,
                # not fatal, so one bad page doesn't lose the whole run.
                print("Skipping {}".format(u))
                print(str(e))
        return bestsellers

    def filter_data(self, dict, key, value):
        """Return items whose ``key`` field is strictly below ``value``.

        BUG FIX: the field was hard-coded to ``'price'``, silently
        ignoring the ``key`` argument.
        NOTE(review): parameter names ``dict``/``key`` shadow builtins;
        kept unchanged because ``get_data`` passes them as keywords.
        """
        return [b for b in dict if b[key] < value]

    def get_data(self):
        """Scrape, de-duplicate, sort and percentile-filter the bestsellers."""
        dep_urls = self.get_department_urls()
        bestsellers = self.get_bestsellers(dep_urls)

        # Remove duplicates: items can appear in several departments.
        # Dicts aren't hashable, so round-trip through tuples of items.
        bestsellers = [dict(t) for t in {tuple(d.items())
                                         for d in bestsellers}]
        print("Found {} bestsellers".format(len(bestsellers)))

        sorted_b = sorted(bestsellers, key=itemgetter('price', 'reviews'))
        p = self.get_percentile([b['price'] for b in sorted_b],
                                self.PERCENTILE)
        print("Percentile value is: {}".format(p))

        # Keep only items cheaper than the percentile cutoff.
        filtered_b = self.filter_data(dict=sorted_b, key='price', value=p)
        print("Filtered to {} by {} percentile".format(
            len(filtered_b), self.PERCENTILE))

        return filtered_b

0 commit comments

Comments
 (0)