-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrape.rb
40 lines (32 loc) · 1.01 KB
/
scrape.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
require 'rubygems'
require 'mechanize'
agent = Mechanize.new

# Fetch the Wikipedia page that lists the actresses.
page = agent.get('http://en.wikipedia.org/wiki/List_of_American_film_actresses')

# Lookup table: link title => URL formed by appending the link's href to the
# Wikipedia host.
actress = {}
page.search('.column-width ul li a').each do |link|
  title = link['title']
  href = link['href']
  # An anchor lacking either attribute yields nil here; building the URL from
  # a nil href would raise TypeError, and a nil title is not a usable name.
  next unless title && href

  actress[title] = "http://en.wikipedia.org#{href}"
end
# Pattern for Twitter hrefs: "<anything>:" followed by non-word characters,
# the literal host "twitter.com", one non-word character, and 1-15 trailing
# characters. The dot is escaped so that only a literal "." is accepted.
TWITTER_HREF = /^.+:\W+twitter\.com\W[a-zA-Z0-9_\W]{1,15}$/

# Fetches an actress page and looks for a link whose href matches TWITTER_HREF.
#
# @param actor_url [String] URL of the page to fetch
# @param actor_name [String] name placed first in the returned array
# @return [Array] [actor_name, href] for the first matching link,
#   or [actor_name] when the page has no matching link
def parse_find_twitter(actor_url, actor_name)
  page = Mechanize.new.get(actor_url)
  # Scan the page's links once and reuse the result instead of searching twice.
  twitter_link = page.links_with(href: TWITTER_HREF).first
  twitter_link ? [actor_name, twitter_link.href] : [actor_name]
end
# Write one "name | twitter_url" line per actress to output.txt.
# The block form of File.open closes the file even if a page fetch raises.
File.open('output.txt', 'w') do |target|
  actress.each_with_index do |(name, url), count|
    puts "Processed #{name} - #{count}"
    # Fetch each page only once; twitter_url is nil (written as an empty
    # string) when the result carries no link.
    actor_name, twitter_url = parse_find_twitter(url, name)
    target.write("#{actor_name} | #{twitter_url}\n")
  end
end