Skip to content

Commit

Permalink
Splits fetching data from the web (repositories, identities, contribu…
Browse files Browse the repository at this point in the history
…tors and so on) into parallel batches - introducing parallel gem #321
  • Loading branch information
bartoszmajsak committed Jul 25, 2016
1 parent f492066 commit 7a38885
Show file tree
Hide file tree
Showing 9 changed files with 159 additions and 145 deletions.
42 changes: 23 additions & 19 deletions Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -29,26 +29,30 @@

# Gem dependencies for the site build.
# NOTE(review): this listing is a rendered diff, so removed (double-quoted) and
# added (single-quoted) lines are interleaved and most gems appear twice.
# Presumably only the single-quoted entries belong in the resulting Gemfile —
# confirm against the repository before applying.
source 'https://rubygems.org'

# awestruct is resolved from a sibling checkout (../awestruct); the pinned
# release and the git source are kept below as commented-out alternatives.
#gem "awestruct", "0.5.7"
gem "awestruct", :path => '../awestruct'
#gem "awestruct", :git => 'git@github.com:awestruct/awestruct.git'
#gem 'awestruct', '0.5.7'
gem 'awestruct', :path => '../awestruct'
#gem 'awestruct', :git => 'git@github.com:awestruct/awestruct.git'
gem 'git', '1.2.9.1' # newer versions incorrectly read UTF-8 encoded commits (author names, for example)
gem 'asciidoctor'
gem 'haml-contrib'
gem 'bootstrap-sass', '< 3.0'
gem "puma"
gem "rest-client", ">= 2.0.0"
gem "hpricot"
gem "RedCloth"
gem "redcarpet"
gem "coffee-script"
gem "uglifier"
gem "htmlcompressor"
gem "compass", "0.12.7"
gem "ri_cal", "0.8.8"
gem "tzinfo", "0.3.33"
gem "therubyracer", "0.11.3"
gem "jruby-openssl", "0.7.7", :platforms => :jruby
gem "rb-inotify", :platforms => [:ruby, :jruby]
gem "versionomy"
gem 'puma'
gem 'rest-client', '>= 2.0.0'
gem 'hpricot'
gem 'RedCloth'
gem 'redcarpet'
gem 'coffee-script'
gem 'uglifier'

gem 'htmlcompressor'
gem 'compass', '0.12.7'
gem 'ri_cal', '0.8.8'
gem 'tzinfo', '0.3.33'
gem 'therubyracer', '0.11.3'
gem 'jruby-openssl', '0.7.7', :platforms => :jruby
gem 'rb-inotify', :platforms => [:ruby, :jruby]
gem 'versionomy'
gem 'rspec', '>= 2.9'
gem "nokogiri", "1.6.8"
gem 'nokogiri', '1.6.8'
# Added by this commit: 'parallel' is required by the _ext extensions that
# fetch remote data (github, jira, lanyrd, identities/github).
# 'ruby-progressbar' presumably backs the progress: option those extensions
# pass to Parallel.each — TODO confirm.
gem 'ruby-progressbar'
gem 'parallel'
6 changes: 4 additions & 2 deletions _ext/github.rb
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
# -*- encoding : utf-8 -*-
require 'parallel'

module Awestruct::Extensions::Github
DEFAULT_BASE_URL = 'https://api.github.com'

Expand Down Expand Up @@ -27,7 +29,7 @@ def execute(site)
milestones_data = RestClient.get milestones_url, :accept => 'application/json',
:cache_key => "github/milestones_project-#{@project_key}.json", :cache_expiry_age => DURATION_1_DAY

milestones_data.content.each do |m|
Parallel.each(milestones_data.content, progress: "Fetching milestones of [#{milestones_url}] ") { |m|
release_key = m['title']
release_key = "#{@prefix_version}_#{release_key}" unless @prefix_version.nil?

Expand All @@ -49,7 +51,7 @@ def execute(site)
end

site.release_notes[release_key] = release_notes
end
}

end
end
Expand Down
2 changes: 1 addition & 1 deletion _ext/identities.rb
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ def unique_by_emails(contributors)

def lookup_by_contributor(contributor)
identity = self.find {|e| e.contributor and e.contributor.emails and e.contributor.emails.include? contributor.email }
# identity = lookup_by_emails(contributor.email)
# identity = lookup_by_emails(contributor.email) if identity.nil?
# identity = lookup_by_name(contributor.name) if identity.nil?
if identity.nil?
# Indication that we have a mismatched account
Expand Down
15 changes: 8 additions & 7 deletions _ext/identities/github.rb
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# -*- encoding : utf-8 -*-
require 'parallel'

module Identities
module GitHub
Expand Down Expand Up @@ -35,17 +36,17 @@ def add_repository(repository)

# Registers a match filter by appending it to @match_filters; collect later
# iterates over every registered filter.
#
# match_filter - the filter object to register (its interface is not visible
#                in this excerpt).
#
# The commented-out lines are a disabled debugging aid that would dump the
# filter as YAML to /tmp/committers.yml. Both the old ('w') and new
# ('w:UTF-8') variants of the File.open line appear because this excerpt is a
# rendered diff.
def add_match_filter(match_filter)
@match_filters << match_filter
#File.open('/tmp/committers.yml', 'w') do |out|
#File.open('/tmp/committers.yml', 'w:UTF-8') do |out|
# YAML.dump(match_filter, out)
#end
end

def collect(identities)
visited = []
@repositories.each do |r|
visited = Parallel.each(@repositories, progress:
'Processing contributors from GitHub') { |r|
url = CONTRIBUTORS_URL_TEMPLATE % [ r.owner, r.path ]
contributors = RestClient.get url, :accept => 'application/json'
contributors.content.each do |acct|
contributors.content.each { |acct|
github_id = acct['login'].downcase
author = nil
@match_filters.each do |filter|
Expand All @@ -58,9 +59,9 @@ def collect(identities)
end
identity = identities.lookup_by_github_id(github_id, true)
github_acct_to_identity(acct, author, identity)
visited << github_id
end
end
github_id
}
}.flatten

# github doesn't keep perfect records of contributors, so handle those omitted contributors
@match_filters.each do |filter|
Expand Down
4 changes: 2 additions & 2 deletions _ext/identities/gravatar.rb
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def crawl(identity)
return
end
url = API_URL_TEMPLATE % hash
response = RestClient.get(url, :user_agent => "rest-client") do |rsp, req, res, &blk|
response = RestClient.get(url, :user_agent => "rest-client", :accept => 'application/json') do |rsp, req, res, &blk|
if rsp.code.eql? 404
rsp = RestClient::Response.create('{}', rsp.net_http_res, req)
rsp.instance_variable_set(:@code, 200)
Expand All @@ -33,7 +33,7 @@ def crawl(identity)
end
end

data = JSON.parse response
data = response.content
if data.empty?
return
end
Expand Down
8 changes: 5 additions & 3 deletions _ext/jira.rb
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# -*- encoding : utf-8 -*-
require 'parallel'

module Awestruct::Extensions::Jira
DEFAULT_BASE_URL = 'https://issues.jboss.org'
Expand Down Expand Up @@ -29,7 +30,8 @@ def execute(site)
url = @base_url + (PROJECT_PATH_TEMPLATE % @project_key)
project_data = RestClient.get url, :accept => 'application/json',
:cache_key => "jira/project-#{@project_key}.json", :cache_expiry_age => DURATION_1_DAY
project_data.content['versions'].each do |v|
Parallel.each(project_data.content['versions'],
progress: "Fetching release notes from JIRA of [#{url}]") { |v|
next if !v['released']
release_key = v['name']
release_key = "#{@prefix_version}_#{release_key}" unless @prefix_version.nil?
Expand All @@ -51,7 +53,7 @@ def execute(site)
end

site.release_notes[release_key] = release_notes
end
}
end
end

Expand All @@ -67,7 +69,7 @@ def execute(site)
url = @base_url + (COMPONENTS_PATH_TEMPLATE % @project_key)
components = RestClient.get url, :accept => 'application/json',
:cache_key => "jira/components-#{@project_key}.json"
components.content.each do |c|
Parallel.each(components.content, progress: "Fetching component leads data from JIRA of of [#{url}]") do |c|
component_data = RestClient.get(c['self'], :accept => 'application/json',
:cache_key => "jira/component-#{@project_key}-#{c['id']}.json").content
if component_data.has_key? 'lead' and component_data['description'] =~ / :: ([^ ]+)$/
Expand Down
28 changes: 17 additions & 11 deletions _ext/lanyrd.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
require 'tzinfo'
require 'nokogiri'
require 'rest-client'
require 'parallel'
require_relative 'common.rb'
##
# Lanyrd is an Awestruct extension module for interacting with lanyrd.com
Expand Down Expand Up @@ -60,19 +61,24 @@ def execute(site)
# context=future only works for conferences atm...waiting for session search support
search_url = "#{@base}/search/?type=session&q=#{@term}"

sessions = []
sessions = Hamster::Set.new

pages = []

page1 = Nokogiri::HTML(getOrCache(File.join(@lanyrd_tmp, "search-#{@term}-1.html"), search_url))
pages << page1
pages = Hamster::Set.new
begin
page1 = Nokogiri::HTML(getOrCache(File.join(@lanyrd_tmp, "search-#{@term}-1.html"), search_url))
rescue => e
puts e
site.sessions = []
return
end
pages = pages << page1

extract_pages(page1, pages, search_url)
pages.each do |page|
Parallel.each(pages, progress: "Extracting sessions details from Lanyrd" ) do |page|
extract_sessions(page, sessions)
end

site.sessions = sessions.sort_by{|session| session.start_datetime}
sessions = sessions.sort_by{|session| session.start_datetime}
site.sessions = sessions.size.times.map { |i| session.at(i) }
end

# Find all Pages in a 'root' Page
Expand All @@ -83,11 +89,11 @@ def extract_pages(root, pages, search_url)
root.css('div[@class*=pagination]').each do |p|
last_page_index = Integer(p.search('li').last.at('a').inner_text) +1
end
for index in 2...last_page_index
Parallel.each(2...last_page_index, progress: "Fetching Arquillian sessions from Lanyrd") { |index|
pageinated_url = "#{search_url}&page=#{index}"
pageX = Nokogiri::HTML(getOrCache(File.join(@lanyrd_tmp, "search-#{@term}-#{index}.html"), pageinated_url))
pages << pageX
end
}
end

# Find all sessions in Page
Expand Down Expand Up @@ -166,7 +172,7 @@ def extract_sessions(page, sessions)
session.speakers << {'name' => name, 'username' => username }
end

sessions << session
sessions = sessions << session
end
end
end
Expand Down
Loading

0 comments on commit 7a38885

Please sign in to comment.