forked from fgscivittaro/ebay
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcollect_links.py
138 lines (103 loc) · 3.83 KB
/
collect_links.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import requests
import re
from bs4 import BeautifulSoup
from scrape_page import get_soup
from scrape_page import get_title
def collect_featured_links():
    """
    Scrapes the eBay home page and returns the links to each featured collection

    Returns:
        A list with the href of the first anchor found inside each
        'no-lazy' div of the home page. Divs that contain no anchor, or
        whose anchor has no (or an empty) href, are skipped.
    """
    soup = get_soup('http://www.ebay.com/')
    featured_links = []

    for container in soup.find_all('div', attrs={'class': 'no-lazy'}):
        anchor = container.find('a')
        # find() yields None when the div holds no <a> tag; calling .get() on
        # it would abort the whole scrape with an AttributeError.
        if anchor is None:
            continue

        href = anchor.get('href')
        # An anchor without an href would put None into the result, which
        # callers then try to use as a URL.
        if href:
            featured_links.append(href)

    return featured_links
def _collection_ajax_url(url):
    """
    Builds the ajax URL for a collection page URL of the form
    '<scheme>://<host>/cln/<editor>/.../<collection code>'.

    Raises:
        ValueError: if the URL has no '/cln/' part or lacks the editor or
        collection code segments.
    """
    # Ignore any query string, fragment or trailing slash so that the last
    # path segment really is the collection code.
    path = url.split('?', 1)[0].split('#', 1)[0].rstrip('/')

    marker = '/cln/'
    marker_at = path.find(marker)
    if marker_at == -1:
        raise ValueError("Not a collection URL: %r" % (url,))

    # Locating '/cln/' instead of slicing at a fixed offset keeps this working
    # for both http:// and https:// URLs.
    segments = path[marker_at + len(marker):].split('/')
    if len(segments) < 2 or not segments[0] or not segments[-1]:
        raise ValueError("Not a collection URL: %r" % (url,))

    editor = segments[0]
    col_code = segments[-1]
    return 'http://www.ebay.com/cln/_ajax/2/%s/%s' % (editor, col_code)


def collect_featured_products(url):
    """
    Takes in the url of a featured collection and returns all the product links
    within the collection.

    Args:
        url: URL of the collection page; the editor name is expected right
            after '/cln/' and the collection code as the last path segment.

    Returns:
        A list of unique product hrefs, in the order they were first seen
        (links from the page itself first, then links from the ajax request).

    Raises:
        ValueError: if the ajax URL cannot be derived from `url`.
    """
    soup = get_soup(url)
    product_links = []

    # Iterates through all the thumbnails found within the HTML code and
    # appends their links to product_links
    for thumb in soup.find_all('div', attrs={'class': 'itemThumb'}):
        anchor = thumb.find('a')
        # Skip thumbnails without an anchor or href instead of failing with
        # an AttributeError or collecting None.
        if anchor is None:
            continue
        href = anchor.get('href')
        if href:
            product_links.append(href)

    # Generates the URL of an xml ajax request responsible for retrieving some
    # but not all of the product links
    lxml_url = _collection_ajax_url(url)
    limiter = {'itemsPerPage': '30'}
    response = requests.get(lxml_url, params=limiter)
    lxml_soup = BeautifulSoup(response.content, 'lxml')

    # Retrieves all the URLs that the xml code is responsible for
    lxml_links = [a["href"] for a in lxml_soup.select(
        "div.itemThumb div.itemImg.image.lazy-image a[href]")]

    # Merges the lists and drops duplicates, since there is some overlap
    # between the two. First-seen order is kept so the result is stable
    # from run to run.
    all_links = []
    seen = set()
    for link in product_links + lxml_links:
        if link not in seen:
            seen.add(link)
            all_links.append(link)

    # Single parenthesised argument: prints the same text whether print is a
    # statement or a function.
    print(str(len(all_links)) + " links added")

    return all_links
def collect_all_featured_links():
    """
    Walks over every collection URL from collect_featured_links and gathers
    the product links that collect_featured_products returns for each one
    into a single combined list. Links are not de-duplicated across
    collections.
    """
    combined = []

    for collection_url in collect_featured_links():
        combined.extend(collect_featured_products(collection_url))

    print("Added all featured links to list")

    return combined
def collect_bad_links(link_list):
    """
    Returns the set of links in link_list that point to bad listings.

    Some links are for listings that have already ended and must be removed
    before further scraping. A link counts as bad when its page contains one
    of the "ended" phrases below, or when get_title reports "N/A" for it.
    """
    ended_patterns = (
        re.compile(r'This listing has ended'),
        re.compile(r'This listing was ended'),
        re.compile(r'Bidding has ended'),
    )
    bad_links = set()

    for link in link_list:
        print("Checking link")

        soup = get_soup(link)
        # True as soon as any of the phrases is found in the page text
        listing_ended = any(
            soup.find(text=pattern) for pattern in ended_patterns)
        title = get_title(soup)

        if listing_ended or title == "N/A":
            bad_links.add(link)
            print("Bad link added")

    return bad_links
def remove_bad_links_from_link_list(bad_links, link_list):
    """
    Returns a new list holding the entries of link_list that do not appear in
    bad_links. Order and duplicates of the remaining links are kept, and
    link_list itself is left unmodified.
    """
    return [link for link in link_list if link not in bad_links]
def remove_old_links(old_list, new_list):
    """
    Compares two link lists and returns the links of new_list that are absent
    from old_list, keeping the order (and any duplicates) of new_list.
    """
    # Set membership keeps each lookup cheap regardless of old_list's size
    already_known = set(old_list)
    return [link for link in new_list if link not in already_known]