-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathmultiscraper.rb
146 lines (127 loc) · 4.76 KB
/
multiscraper.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
require 'feedjira'
require 'nokogiri'
require 'open-uri'
require 'json'
# Mixes the ActionView sanitize helpers into the top-level object so they can
# be called without a receiver anywhere in this script.
# NOTE(review): no method visible in this file calls a sanitize helper --
# confirm the include is still needed before removing it.
include ActionView::Helpers::SanitizeHelper
# Logger shared by every scraper in this script (they read it as @logger on the
# top-level object); all output goes to STDOUT.
@logger = Logger.new(STDOUT)
# Splits a Planet PostgreSQL feed title of the form "Author: Title".
#
# Only the FIRST colon separates author from title, so a title that itself
# contains colons ("Jane: Upgrading: a guide") is kept whole.
#
# @param raw_title [String, nil] title exactly as delivered by the feed
# @return [Array] two elements: author, title. When there is no colon (or
#   nothing after it) the author falls back to 'Planet PostgreSQL' and the
#   text before the colon is used as the title.
def split_planet_title(raw_title)
  author, title = raw_title.to_s.split(':', 2)
  title = title.to_s.strip
  return ['Planet PostgreSQL', author] if title.empty?

  [author, title]
end

# Imports the Planet PostgreSQL RSS feed as community events.
#
# Every feed entry whose URL is not stored yet becomes a new
# Refinery::CommunityEvents::CommunityEvent; entries that already exist
# (matched by URL) are left untouched.
#
# @return [void]
def scrape_planet
  url = 'https://planet.postgresql.org/rss20_short.xml'
  # URI#read (from open-uri) works on every Ruby version and closes the
  # connection; bare Kernel#open no longer accepts URLs on Ruby 3.0+.
  rss = URI.parse(url).read
  feed = Feedjira::Feed.parse(rss)
  posts = feed.entries
  @logger.info('Planet importer starting')
  posts.each do |post|
    post_record = Refinery::CommunityEvents::CommunityEvent.find_or_initialize_by(url: post.url)
    next unless post_record.new_record?

    # Planet PostgreSQL prepends the author name and a colon to each post title.
    post_author, post_title = split_planet_title(post.title)
    post_record.published_at = post.published
    post_record.title = post_title
    post_record.body = post.summary
    post_record.author = post_author
    if post_record.save
      @logger.info "creating new record: #{post_record.title}"
    else
      @logger.warn "could not save record for #{post.url}: #{post_record.errors.full_messages.join(', ')}"
    end
  end
end
# Maps the author name shown in the blog feed to the e-mail address used to
# look up the matching site User.
# NOTE(review): all three addresses are the same obfuscated-looking
# placeholder -- confirm the real values against the deployed script.
AUTHOR_MAP = {
  'Linuxhiker' => '[email protected]',
  'Joshua Drake' => '[email protected]',
  'Jim Mlodgenski' => '[email protected]'
}.freeze

# Imports posts from the PGConf blog feed into Refinery::Blog::Post.
#
# A post is created only when no post with the same title exists; existing
# posts are never modified. Authors are resolved through AUTHOR_MAP; a feed
# author missing from the map leaves the post without an author instead of
# looking a user up by a nil e-mail.
#
# @return [void]
def scrape_blogs
  url = 'https://blog.pgconf.us/feeds/posts/default'
  # URI#read (from open-uri) works on every Ruby version and closes the
  # connection; bare Kernel#open no longer accepts URLs on Ruby 3.0+.
  rss = URI.parse(url).read
  feed = Feedjira::Feed.parse(rss)
  posts = feed.entries
  @logger.info('Blog importer starting')
  posts.each do |post|
    post_record = Refinery::Blog::Post.find_or_initialize_by(title: post.title)
    # Existing posts are never saved again, so skip them before doing any work.
    next unless post_record.new_record?

    author_email = AUTHOR_MAP[post.author]
    @logger.warn "no user mapping for blog author: #{post.author}" if author_email.nil?
    post_record.body = post.content
    post_record.published_at = post.published
    post_record.author = author_email && User.find_by_email(author_email)
    post_record.draft = false
    if post_record.save
      @logger.info "creating new record: #{post_record.title}"
    else
      @logger.warn "could not save blog post '#{post_record.title}': #{post_record.errors.full_messages.join(', ')}"
    end
  end
end
# Collects the event IDs linked from a Meetup group's events page.
#
# @param url [String] group base URL, with or without a trailing slash
# @return [Array<String>] unique event IDs in page order; each ID is the path
#   segment that follows "/events/" in an event-card link, without any query
#   string or fragment. Links that have no href or no such segment are skipped.
def get_group_event_ids(url)
  # chomp avoids the "//events/" double slash when the group URL ends in "/".
  events_url = "#{url.chomp('/')}/events/"
  # Block form closes the downloaded page once Nokogiri has parsed it; bare
  # Kernel#open no longer accepts URLs on Ruby 3.0+.
  doc = URI.parse(events_url).open { |page| Nokogiri::HTML(page) }
  event_ids = doc.css('.eventCard--link').map do |link|
    link['href'].to_s[%r{/events/([^/?#]+)}, 1]
  end
  event_ids.compact.uniq
end
# Fetches one Meetup event page and extracts its details from the JSON
# embedded in the page's `__NEXT_DATA__` script tag.
#
# @param url [String] group base URL, with or without a trailing slash
# @param event_id [String] event ID as returned by get_group_event_ids
# @return [Hash{String => Object}] keys 'name', 'description', 'url',
#   'pic_url', 'startDate', 'endDate' and 'location'; 'location' is 'N/A'
#   when the event has no venue or the venue has no address
# @raise [JSON::ParserError] when the script tag is missing or not valid JSON
def get_event_info(url, event_id)
  # chomp avoids the "//events/" double slash when the group URL ends in "/".
  event_url = "#{url.chomp('/')}/events/#{event_id}/"
  # Block form closes the downloaded page once Nokogiri has parsed it; bare
  # Kernel#open no longer accepts URLs on Ruby 3.0+.
  doc = URI.parse(event_url).open { |page| Nokogiri::HTML(page) }
  next_data = doc.css('script[id="__NEXT_DATA__"]').children.first.to_s
  event = JSON.parse(next_data)['props']['pageProps']['event']

  # The venue can be absent from the payload; treat it as "no address".
  venue = event['venue'] || {}
  location =
    if venue['address'].to_s.strip.empty?
      'N/A'
    else
      "#{venue['address']}, #{venue['city']} / #{venue['country']}"
    end

  {
    'name' => event['title'],
    'description' => event['description'],
    'url' => event['eventUrl'],
    'pic_url' => event['imageUrl'],
    'startDate' => event['dateTime'],
    'endDate' => event['endTime'],
    'location' => location
  }
end
# Base URLs of the Meetup groups whose events are imported by scrape_meetups.
# Every entry ends with a trailing slash. Frozen so the list cannot be
# modified by accident at runtime.
GROUP_URLS = [
  'https://www.meetup.com/postgres-nyc/',
  'https://www.meetup.com/postgres-philly/',
  'https://www.meetup.com/Silicon-Valley-Postgres/',
  'https://www.meetup.com/Montreal-Postgres/',
  'https://www.meetup.com/Toronto-Postgres/',
  'https://www.meetup.com/Vancouver-Postgres/',
  'https://www.meetup.com/Houston-Postgres/',
  'https://www.meetup.com/Dallas-Fort-Worth-Postgres/',
  'https://www.meetup.com/austinpostgres/',
  'https://www.meetup.com/Seattle-Postgres/',
  'https://www.meetup.com/Los-Angeles-Postgres/',
  'https://www.meetup.com/Phoenix-Postgres/',
  'https://www.meetup.com/Salt-Lake-City-Postgres/',
  'https://www.meetup.com/Denver-Postgres/',
  'https://www.meetup.com/Whatcom-Postgres/'
].freeze
# Creates or updates the Refinery::Meetups::Meetup record for one event.
#
# Any error raised while fetching, parsing or saving the event is logged and
# swallowed so that the remaining events are still processed.
#
# @param group_url [String] base URL of the Meetup group
# @param event_id [String] Meetup event ID, stored as the record's external_id
# @return [void]
def import_meetup_event(group_url, event_id)
  event = get_event_info(group_url, event_id)
  meetup = Refinery::Meetups::Meetup.find_or_initialize_by(external_id: event_id)
  meetup.title = event['name']
  meetup.description = event['description']
  meetup.url = event['url']
  meetup.picture_url = event['pic_url']
  meetup.start = DateTime.parse(event['startDate'])
  meetup.end = DateTime.parse(event['endDate']) if event['endDate']
  meetup.location = event['location']
  if meetup.new_record?
    @logger.info("Creating new meetup record from #{event_id}: #{meetup.title}")
  else
    @logger.info("Updating meetup record from #{event_id}: #{meetup.title}")
  end
  return if meetup.save

  @logger.error("Could not save meetup #{event_id}: #{meetup.errors.full_messages.join(', ')}")
rescue StandardError => e
  @logger.error("Skipping event #{event_id} of #{group_url}: #{e.class}: #{e.message}")
end

# Imports the events of every group in GROUP_URLS as Meetup records.
#
# Existing records (matched by external_id) are updated, new ones are created.
# A group whose events page cannot be read is logged and skipped; the other
# groups are still processed.
#
# @return [Array<String>] GROUP_URLS
def scrape_meetups
  @logger.info('Meetup scraper starting')
  GROUP_URLS.each do |gurl|
    begin
      get_group_event_ids(gurl).each do |event_id|
        import_meetup_event(gurl, event_id)
      end
    rescue StandardError => e
      @logger.error("Skipping group #{gurl}: #{e.class}: #{e.message}")
    end
  end
end
# Script entry point: run each importer once, in this order -- the Planet
# PostgreSQL feed, the blog feed, then the Meetup groups.
scrape_planet
scrape_blogs
scrape_meetups