Skip to content

Commit d6eae8e

Browse files
authored
Merge branch 'master' into update_npm_crawler
2 parents e737585 + 040af67 commit d6eae8e

12 files changed

+1813
-1589
lines changed

Gemfile

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
source "http://rubygems.org"
22

3-
gem 'versioneye-core', :git => 'https://github.com/versioneye/versioneye-core.git', :tag => 'v12.8.0'
3+
gem 'versioneye-core', :git => 'https://github.com/versioneye/versioneye-core.git', :tag => 'v12.8.4'
44
#gem 'versioneye-core' , :path => "../versioneye-core"
55

66
gem 'rufus-scheduler', '3.4.2'

Gemfile.lock

+8-8
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
GIT
22
remote: https://github.com/versioneye/versioneye-core.git
3-
revision: d46cd326d5eeab115f56bad5df5632228a87b5d3
4-
tag: v12.8.0
3+
revision: 0aedeaefa11a4cd159713f827af96f26673cfd5b
4+
tag: v12.8.4
55
specs:
6-
versioneye-core (12.8.0)
6+
versioneye-core (12.8.4)
77
actionmailer (~> 5.1.0)
88
aws-sdk (~> 2.10.7)
99
bunny (~> 2.7.0)
@@ -64,13 +64,13 @@ GEM
6464
public_suffix (~> 2.0, >= 2.0.2)
6565
amq-protocol (2.2.0)
6666
ansi (1.5.0)
67-
aws-sdk (2.10.16)
68-
aws-sdk-resources (= 2.10.16)
69-
aws-sdk-core (2.10.16)
67+
aws-sdk (2.10.17)
68+
aws-sdk-resources (= 2.10.17)
69+
aws-sdk-core (2.10.17)
7070
aws-sigv4 (~> 1.0)
7171
jmespath (~> 1.0)
72-
aws-sdk-resources (2.10.16)
73-
aws-sdk-core (= 2.10.16)
72+
aws-sdk-resources (2.10.17)
73+
aws-sdk-core (= 2.10.17)
7474
aws-sigv4 (1.0.1)
7575
bson (4.2.2)
7676
builder (3.2.3)

lib/versioneye/crawl.rb

+3-3
Original file line numberDiff line numberDiff line change
@@ -106,14 +106,14 @@ def self.parse_url url_text
106106
end
107107

108108
def self.fetch url
109-
HTTParty.get url, { timeout: 5 }
109+
HTTParty.get url, { timeout: 20 }
110110
rescue
111111
logger.error "failed to fetch data from #{url}"
112112
nil
113113
end
114114

115115

116-
def self.fetch_json( url, ttl = 5, symbolize = true)
116+
def self.fetch_json( url, ttl = 20, symbolize = true)
117117
res = Timeout::timeout(ttl) { HTTParty.get(url) }
118118
if res.code != 200
119119
self.logger.error "Failed to fetch JSON doc from: #{url} - #{res}"
@@ -127,7 +127,7 @@ def self.fetch_json( url, ttl = 5, symbolize = true)
127127
nil
128128
end
129129

130-
def self.post_json(url, options, ttl = 5)
130+
def self.post_json(url, options, ttl = 20)
131131
res = Timeout::timeout(ttl) { HTTParty.post(url, options) }
132132
if res.code > 205
133133
logger.error "Failed to post data to the url: #{url}, #{res.code} - #{res.message}\n#{options}"

lib/versioneye/crawlers/crates_crawler.rb

+50-20
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,15 @@ def self.crawl( api_key = nil )
3434
end
3535

3636

37+
def self.recrawl
38+
api_key = fetch_api_key
39+
return if api_key.to_s.empty?
40+
41+
Product.where(language: Product::A_LANGUAGE_RUST).to_a.each do |prod|
42+
crawl_package(prod[:prod_key], api_key, false)
43+
end
44+
end
45+
3746
def self.crawl_product_list(api_key, page_nr = 1, per_page = 100)
3847
logger.info "going to crawl product list from #{page_nr}, per_page: #{per_page}"
3948

@@ -86,9 +95,22 @@ def self.crawl_package(product_id, api_key = nil, ignore_existing = true)
8695

8796
def self.process_versions( product_db, product_doc, api_key, ignore_existing )
8897
owners = fetch_product_owners( api_key, product_db.prod_key )
89-
product_license = product_doc[:crate][:license].to_s
98+
if product_doc.nil?
99+
logger.error "process_versions: product json response cant be empty"
100+
return
101+
end
102+
103+
if product_doc.has_key?(:versions) == false
104+
logger.error "process_versions: product json has no :verisons field\n#{product_doc}"
105+
end
106+
90107
product_doc[:versions].each do |version_doc|
91108
version_num = version_doc[:num].to_s.strip
109+
if version_num.empty?
110+
log.error "process_versions: version_doc has no :num field\n#{version_doc}"
111+
next
112+
end
113+
92114
if !product_db.version_by_number( version_num ).nil? && ignore_existing
93115
logger.info "process_versions: #{product_db.prod_key}:#{version_num} exist already"
94116
next
@@ -100,7 +122,7 @@ def self.process_versions( product_db, product_doc, api_key, ignore_existing )
100122
next
101123
end
102124

103-
upsert_version_licenses( product_db, version_db.version, product_license )
125+
upsert_version_licenses( product_db, version_db.version, version_doc[:license] )
104126
upsert_version_links( product_db, version_db.version, product_doc[:crate] )
105127
upsert_version_archive( product_db, version_db.version, version_doc[:dl_path] )
106128
upset_version_devs( product_db, version_db, owners )
@@ -123,7 +145,7 @@ def self.crawl_dependencies( product_db, version_db, api_key )
123145
end
124146

125147
logger.info "crawl_dependencies: fetching version details for #{product_db.prod_key} - #{version}"
126-
dep_docs = fetch_version_dependencies( api_key, product_db[:prod_key], version )
148+
dep_docs = fetch_version_dependencies( api_key, product_db.prod_key, version )
127149
dep_docs.to_a.each do |dep_doc|
128150
upsert_product_dependency(product_db, version, dep_doc)
129151
end
@@ -133,6 +155,11 @@ def self.crawl_dependencies( product_db, version_db, api_key )
133155
#-- persistance helpers
134156

135157
def self.upsert_product(product_doc)
158+
if product_doc.nil?
159+
logger.error "upsert_product: API response had no :crate subdocument"
160+
return
161+
end
162+
136163
prod_key = product_doc[:id].to_s.strip
137164
prod_key_dc = prod_key.downcase
138165
product_db = Product.where(
@@ -216,6 +243,11 @@ def self.upsert_product_owner(product_db, owner_doc, version_num)
216243

217244

218245
def self.upsert_version_licenses(product_db, version_label, license_label)
246+
if license_label.nil? or license_label.empty?
247+
logger.error "upsert_version_licenses: missing license of #{product_db}/#{version_label}"
248+
return
249+
end
250+
219251
licenses = license_label.to_s.strip.split('/')
220252
licenses.to_a.each do |license|
221253
self.upsert_version_license(product_db, version_label, license)
@@ -228,12 +260,12 @@ def self.upsert_version_licenses(product_db, version_label, license_label)
228260
def self.upsert_version_license(product_db, version_label, license_name)
229261
license_name = license_name.to_s.strip
230262

231-
lic_db = License.where(
263+
lic_db = License.find_or_create_by(
232264
language: product_db[:language],
233265
prod_key: product_db[:prod_key],
234266
version: version_label,
235267
name: license_name
236-
).first_or_create
268+
)
237269

238270
lic_db.update(source: 'crates')
239271
lic_db.save
@@ -243,14 +275,6 @@ def self.upsert_version_license(product_db, version_label, license_name)
243275

244276

245277
def self.upsert_product_dependency(product_db, version_id, dep_doc)
246-
dep_db = Dependency.where(
247-
prod_type: A_TYPE_CARGO,
248-
language: product_db.language,
249-
prod_key: product_db.prod_key,
250-
prod_version: version_id,
251-
dep_prod_key: dep_doc[:crate_id],
252-
).first_or_create
253-
254278
scope = if dep_doc[:optional]
255279
Dependency::A_SCOPE_OPTIONAL
256280
elsif dep_doc[:target] == 'test'
@@ -259,12 +283,18 @@ def self.upsert_product_dependency(product_db, version_id, dep_doc)
259283
Dependency::A_SCOPE_COMPILE
260284
end
261285

262-
dep_db.update(
286+
dep_db = Dependency.find_or_create_by(
287+
prod_type: A_TYPE_CARGO,
288+
language: product_db.language,
289+
prod_key: product_db.prod_key,
290+
prod_version: version_id,
291+
292+
dep_prod_key: dep_doc[:crate_id],
263293
version: dep_doc[:req],
264-
name: dep_doc[:crate_id],
265294
scope: scope
266295
)
267-
296+
dep_db.name = dep_doc[:crate_id]
297+
dep_db.save
268298
dep_db
269299
end
270300

@@ -284,12 +314,12 @@ def self.upsert_version_links(product_db, version_id, product_doc)
284314

285315

286316
def self.upsert_version_link(product_db, version_id, name, url)
287-
url_db = Versionlink.where(
317+
url_db = Versionlink.find_or_create_by(
288318
language: product_db[:language],
289319
prod_key: product_db[:prod_key],
290320
version_id: version_id,
291321
link: url.to_s.strip
292-
).first_or_create
322+
)
293323

294324
url_db.update(name: name.to_s.strip)
295325
url_db.save
@@ -301,12 +331,12 @@ def self.upsert_version_archive(product_db, version_id, dl_path)
301331
pkg_name = "#{product_db[:prod_key]}-#{version_id}.crate"
302332
url = "#{API_HOST}/#{dl_path}"
303333

304-
url_db = Versionarchive.where(
334+
url_db = Versionarchive.find_or_create_by(
305335
language: product_db[:language],
306336
prod_key: product_db[:prod_key],
307337
version_id: version_id,
308338
name: pkg_name
309-
).first_or_create
339+
)
310340

311341
url_db.update(link: url)
312342
url_db.save

lib/versioneye/crawlers/nuget_crawler.rb

+39-11
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ class NugetCrawler < Versioneye::Crawl
88
A_PROFILE_URL = "https://www.nuget.org/profiles"
99
A_LANGUAGE_CSHARP = Product::A_LANGUAGE_CSHARP
1010
A_TYPE_NUGET = Project::A_TYPE_NUGET
11+
A_TIMEOUT = 30 # seconds
1112

1213

1314
def self.logger
@@ -46,7 +47,7 @@ def self.crawl_last_x_days( x_days = 10 )
4647
# otherwise will only crawl catalogs published on the date_txt
4748
# date_txt format: YYYY-mm-dd or any other format supported by DateTime.parse
4849
def self.crawl(date_txt = nil)
49-
catalog = self.fetch_json("#{A_NUGET_URL}#{A_CATALOG_PATH}")
50+
catalog = self.fetch_json("#{A_NUGET_URL}#{A_CATALOG_PATH}", A_TIMEOUT)
5051
if catalog.nil?
5152
self.logger.error "crawl: Failed to fetch the Nuget catalog."
5253
return nil
@@ -80,7 +81,7 @@ def self.crawl_catalog_page(the_page)
8081

8182
self.logger.info "Crawling catalog page: #{the_page[:@id]} - items: #{the_page[:count]}"
8283

83-
page_items = fetch_json the_page[:@id]
84+
page_items = fetch_json( the_page[:@id], A_TIMEOUT )
8485
if page_items.nil?
8586
logger.warn "crawl_catalog_page: failed to fetch items on the catalog page: #{the_page}"
8687
return
@@ -95,7 +96,7 @@ def self.crawl_package( the_package )
9596
return
9697
end
9798

98-
doc = fetch_json the_package[:@id]
99+
doc = fetch_json( the_package[:@id], A_TIMEOUT)
99100
if doc.nil?
100101
logger.warn "crawl_package: failed to fetch package details from: #{the_package}"
101102
return
@@ -221,6 +222,15 @@ def self.create_new_version(product, product_doc)
221222
release_dt = parse_date_string( publish_date_label )
222223
end
223224

225+
#if package has no release or created date - then use commit Date
226+
if release_dt.nil?
227+
publish_date_label = product_doc[:'catalog:commitTimeStamp']
228+
release_dt = parse_date_string(publish_date_label)
229+
230+
logger.warn "create_new_version: no `created` or `published` fields\n#{product}\n#{product_doc}"
231+
logger.warn "\twill use commitTimeStamp instead - #{publish_date_label}"
232+
end
233+
224234
version_db = Version.new({
225235
version: product_doc[:version],
226236
released_at: release_dt,
@@ -362,15 +372,33 @@ def self.upsert_artefact(product, version)
362372

363373

364374
def self.upsert_artefact_sha(product, version, sha, sha_method)
365-
artefact = Artefact.find_or_create_by(
366-
:language => product.language,
367-
:prod_key => product.prod_key,
368-
:version => version,
369-
:prod_type => product.prod_type,
370-
:packaging => 'nupkg',
371-
:sha_value => sha,
372-
:sha_method => sha_method )
375+
artefact = Artefact.where(sha_value: sha).first
376+
if artefact and artefact[:prod_key] != product[:prod_key]
377+
logger.error "#-- CRITICAL ERROR ---------------------------------------------------"
378+
logger.error "upsert_artefact_sha: sha `#{sha}` belongs to other product #{artefact}"
379+
logger.error "\tcurrent_product: #{product} => #{version}, #{sha_method}"
380+
return false
381+
end
382+
383+
artefact ||= Artefact.new(
384+
:language => product.language,
385+
:prod_key => product.prod_key,
386+
:prod_type => product.prod_type
387+
)
388+
389+
artefact.update(
390+
version: version.to_s.strip,
391+
packaging: 'nupkg',
392+
sha_value: sha,
393+
sha_method: sha_method
394+
)
395+
396+
373397
artefact.save
398+
rescue => e
399+
logger.error "upsert_artefact_sha: failed to save #{product} sha #{sha_method}:#{sha}"
400+
logger.error "\treason: #{e.message}"
401+
false
374402
end
375403

376404

spec/fixtures/vcr_cassettes/crates/empty_product_details.yml

+2-2
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

spec/fixtures/vcr_cassettes/crates/empty_products_list.yml

+2-2
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)