scrapeSearchPages.py
# This script uses Selenium to scrape a list of users from each city,
# create a list of URLs for their profiles, and take any data available
# about them from the search page alone. This uses Selenium because search
# pages are dynamically loaded using AJAX.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import csv
import random
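# A headless Chrome session is also an option here (a sketch, assuming Selenium 4+
# and a local Chrome install; swap it in for the plain webdriver.Chrome() call below):
#   options = webdriver.ChromeOptions()
#   options.add_argument('--headless=new')
#   driver = webdriver.Chrome(options=options)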
driver = webdriver.Chrome()
# Create/load files to store data
csv_file = open('profileURLs.csv', 'a', newline='')
writer = csv.writer(csv_file)
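# Each row appended below holds, in order:
# user_url, responseSpeed, refsandfriends, languages, couchStatus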
#with open('completecities.txt','r') as finishedCityFile:
# finishedcities = finishedCityFile.readlines()
#finishedcities = list(map(lambda x: x.strip('\n'), finishedcities))
# To avoid the 250-page limit, scrape the first 5000 profiles from lots of cities instead of only one
print('Scraping list of cities for URLs')
cityurls = []
driver.get('https://www.couchsurfing.com/places')
citylist = driver.find_elements(By.XPATH, '//section[@class="cs-sitemap-region"]//li//a')
for city in citylist:
    cityurls += [city.get_attribute('href') + '/accommodation']
random.shuffle(cityurls)
print("Found " + str(len(cityurls)) + " locations to scrape")
for citypage in cityurls:
    print('Scraping ' + citypage)
    # Check the list of cities already fetched and skip this one if it is already in the data
    with open('completecities.txt', 'r') as finishedCityFile:
        finishedcities = finishedCityFile.readlines()
    finishedcities = list(map(lambda x: x.strip('\n'), finishedcities))
    print("Cities finished: " + str(len(finishedcities)) + " of " + str(len(cityurls)))
    if citypage in finishedcities:
        print("This city has already been done")
        continue
    # Load the current city's results page
    driver.get(citypage)
    wait_urls = WebDriverWait(driver, 10)
    index = 1
    while index <= 50:  # Fetch information from each set of results and then move on to the next set
        try:
            url_list = []
            print('Scraping page ' + str(index))
            index += 1
            # Make a dictionary of user profiles and information available on the results page:
            userboxes = wait_urls.until(EC.presence_of_all_elements_located(
                (By.XPATH, '//div[@class="user-card__content mod-host"]')))
            #print(str(len(userboxes)) + ' userboxes on this page')
            for userbox in userboxes:  # For each profile URL on this page of results
                # Extract useful information from each user's box
                user_url = userbox.find_elements(
                    By.XPATH, ".//a[@class='user-card__profile-link']")[0].get_attribute('href')
                responseSpeed = userbox.find_element(
                    By.XPATH, './/span[@class="user-card__response-time text mod-gray"]').text
                refsandfriends = userbox.find_element(
                    By.XPATH, './/div[@class="user-card__stats"]/div').text
                #nRefs = refsandfriends.split('\n')[0]
                #nFriends = refsandfriends.split('\n')[1]
                try:
                    languages = userbox.find_element(
                        By.XPATH, './/span[@class="user-card__languages mod-1-line"]').text
                except Exception:
                    languages = ''
                couchStatus = userbox.find_element(By.XPATH, './/span[contains(.,"Guests")]').text
                # Make a dictionary for each user profile:
                user_deets = {'user_url': user_url, 'responseSpeed': responseSpeed,
                              'refsandfriends': refsandfriends, 'languages': languages,
                              'couchStatus': couchStatus}
                writer.writerow(user_deets.values())
            time.sleep(10)
            # Click the Next Page button to load the next set of results
            wait_button = WebDriverWait(driver, 5)
            next_button = wait_button.until(EC.element_to_be_clickable(
                (By.XPATH, '//button[@aria-label="Next Page"]')))
            next_button.click()
        except Exception as e:
            # No clickable Next Page button (or another error): stop paging this city
            print(e)
            break
    # Record this city as finished so later runs skip it
    with open('completecities.txt', 'a') as finishedCityFile:
        finishedCityFile.write(citypage + '\n')
driver.quit()
csv_file.close()