|
1 |
| -import re |
2 |
| -import router |
3 |
| -import numpy as np |
4 | 1 | import matplotlib.pyplot as plt
|
5 |
| -from bs4 import BeautifulSoup |
6 |
| -from operator import itemgetter |
7 |
| - |
8 |
| - |
9 |
def get_department_urls():
    """Collect the link of every top-level bestseller department.

    Fetches the Amazon Best Sellers landing page and returns the
    ``href`` of each anchor pointing at a department bestseller list.
    """
    response = router.do_get("https://www.amazon.com/Best-Sellers/zgbs/")
    soup = BeautifulSoup(response.text, "html.parser")
    # Department links all share this URL prefix.
    pattern = re.compile("https://www.amazon.com/Best-Sellers-")
    anchors = soup.find_all("a", href=pattern)
    return [anchor.attrs["href"] for anchor in anchors]
17 |
| - |
18 |
| - |
19 |
def get_bestsellers_data(url, min_reviews=300):
    """Parse name, rating, reviews and price for each bestseller at *url*.

    Args:
        url: Department bestseller page URL.
        min_reviews: Keep only items with strictly more reviews than this
            threshold. Defaults to 300, preserving the original behavior.

    Returns:
        list[dict]: One dict per qualifying item, with keys ``name``,
        ``rating``, ``reviews`` and ``price``.
    """
    dep_bestsellers = []
    b_soup = BeautifulSoup(router.do_get(url).text, "html.parser")
    # Each bestseller entry lives in its own "zg_itemImmersion" div.
    # (Fix: the original reused ``b`` for both the soup element and the
    # result dict, clobbering the loop variable mid-iteration.)
    for item in b_soup.find_all("div", class_="zg_itemImmersion"):
        name = item.find("a").text.strip()
        # Rating text starts with the numeric score; keep the first token.
        rating = float(item.find("a", href=re.compile(
            "/product-reviews")).text.split()[0])
        # Review counts are rendered with thousands separators.
        reviews = int(item.find("a", class_="a-size-small").text.
                      replace(",", ""))
        # Drop the leading currency symbol from the price text.
        price = item.find("span", class_="p13n-sc-price").text[1:]

        if reviews > min_reviews:
            dep_bestsellers.append({"name": name, "rating": rating,
                                    "reviews": reviews,
                                    "price": float(price)})

    return dep_bestsellers
40 |
| - |
41 |
| - |
42 |
def get_percentile(list, p):
    """Return the *p*-th percentile of the given values.

    Args:
        list: Sequence of numeric values. (NOTE: shadows the builtin
            ``list``; name kept so the positional interface is unchanged.)
        p: Percentile to compute, in the range [0, 100].

    Returns:
        float: The p-th percentile of the values.
    """
    # Bug fix: the percentile was hard-coded to 80, silently ignoring
    # the ``p`` argument (callers pass 95).
    return np.percentile(np.array(list), p)
45 |
| - |
46 |
| - |
47 |
def get_bestsellers(urls):
    """Aggregate bestseller records from every department URL.

    A department page that fails to parse is skipped with a console
    notice so one bad page does not abort the whole crawl.
    """
    collected = []
    for url in urls:
        try:
            collected.extend(get_bestsellers_data(url))
        except Exception as err:
            # Deliberate best-effort: log the failure and keep going.
            print("Skipping {}".format(url))
            print(str(err))
    return collected
58 |
| - |
59 |
| - |
60 |
def filter_data(dict, key, value):
    """Return the records whose *key* field is strictly below *value*.

    Args:
        dict: List of record dicts. (NOTE: shadows the builtin ``dict``;
            name kept because the caller passes it by keyword.)
        key: Field name to compare on.
        value: Exclusive upper bound for the field.

    Returns:
        list: Matching records, original order preserved.
    """
    # Bug fix: the compared field was hard-coded to 'price', silently
    # ignoring the ``key`` argument.
    return [record for record in dict if record[key] < value]
| 2 | +from parser import Parser |
62 | 3 |
|
63 | 4 |
|
64 | 5 | def main():
|
65 |
| - dep_urls = get_department_urls() |
66 |
| - bestsellers = get_bestsellers(dep_urls) |
67 |
| - print("Found {} bestsellers".format(len(bestsellers))) |
68 |
| - |
69 |
| - # Remove duplicates |
70 |
| - bestsellers = [dict(t) for t in set([tuple(d.items()) |
71 |
| - for d in bestsellers])] |
72 |
| - print("Removed duplcates\n {}".format(len(bestsellers))) |
73 |
| - |
74 |
| - sorted_b = sorted(bestsellers, key=itemgetter('price', 'reviews')) |
75 |
| - p = get_percentile([b['price'] for b in sorted_b], 95) |
76 |
| - print("Percentile value is: {}".format(p)) |
77 |
| - |
78 |
| - # Filter by percentile |
79 |
| - filtered_b = filter_data(dict=sorted_b, key='price', value=p) |
80 |
| - print("Filtered to {}".format(len(filtered_b))) |
| 6 | + p = Parser() |
| 7 | + b_data = p.get_data() |
81 | 8 |
|
82 |
| - x_values = [b['price'] for b in filtered_b] |
83 |
| - y_values = [b['reviews'] for b in filtered_b] |
| 9 | + x_values = [b['price'] for b in b_data] |
| 10 | + y_values = [b['reviews'] for b in b_data] |
84 | 11 |
|
85 | 12 | # Print the name of top item
|
86 |
| - top_name = [b['name'] for b in filtered_b if b['reviews'] == max(y_values)] |
87 |
| - print("The most reviewed is {}".format(top_name)) |
| 13 | + print( |
| 14 | + "The most reviewed is\n {name} {reviews} {rating} {price}". |
| 15 | + format(**p.top)) |
88 | 16 |
|
89 | 17 | # Set the size of the plotting window.
|
90 | 18 | plt.figure(dpi=128, figsize=(10, 6))
|
|
0 commit comments