Skip to content

Commit f7aa245

Browse files
committed
add licence matcher by url; tuned li filter for AWS pages;
1 parent a74684f commit f7aa245

File tree

7 files changed

+4250
-44
lines changed

7 files changed

+4250
-44
lines changed

.gitignore

-1
Original file line numberDiff line numberDiff line change
@@ -51,4 +51,3 @@ pkg
5151
log/*.log
5252
scripts/set_vars_for_dev_reiz.sh
5353
tmp/*
54-
Gemfile.lock

Gemfile.lock

+290
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,290 @@
1+
GIT
2+
remote: https://github.com/versioneye/versioneye-core.git
3+
revision: 0d56a9b3262020916020d87d70b455bd794aa56f
4+
tag: v9.5.24
5+
specs:
6+
versioneye-core (9.5.24)
7+
actionmailer (~> 4.2.6)
8+
aws-sdk (~> 2.4.0)
9+
bundler (~> 1.12.5)
10+
bunny (~> 2.4.0)
11+
cocoapods-core (~> 0.39.0)
12+
dalli (~> 2.7.0)
13+
etcd (~> 0.3.0)
14+
httparty (~> 0.13.1)
15+
mongoid (~> 5.1.0)
16+
naturalsorter (~> 3.0.14)
17+
net-ldap (~> 0.15.0)
18+
nokogiri (~> 1.6.8)
19+
oauth (~> 0.5.0)
20+
octokit (~> 4.3.0)
21+
pdfkit (~> 0.8.0)
22+
persistent_httparty (~> 0.1.0)
23+
semverly (~> 1.0.0)
24+
stripe (~> 1.48.0)
25+
tire (~> 0.6.2)
26+
will_paginate_mongoid (= 2.0.1)
27+
wkhtmltopdf-binary (~> 0.9.9.1)
28+
29+
GEM
30+
remote: http://rubygems.org/
31+
specs:
32+
actionmailer (4.2.7)
33+
actionpack (= 4.2.7)
34+
actionview (= 4.2.7)
35+
activejob (= 4.2.7)
36+
mail (~> 2.5, >= 2.5.4)
37+
rails-dom-testing (~> 1.0, >= 1.0.5)
38+
actionpack (4.2.7)
39+
actionview (= 4.2.7)
40+
activesupport (= 4.2.7)
41+
rack (~> 1.6)
42+
rack-test (~> 0.6.2)
43+
rails-dom-testing (~> 1.0, >= 1.0.5)
44+
rails-html-sanitizer (~> 1.0, >= 1.0.2)
45+
actionview (4.2.7)
46+
activesupport (= 4.2.7)
47+
builder (~> 3.1)
48+
erubis (~> 2.7.0)
49+
rails-dom-testing (~> 1.0, >= 1.0.5)
50+
rails-html-sanitizer (~> 1.0, >= 1.0.2)
51+
activejob (4.2.7)
52+
activesupport (= 4.2.7)
53+
globalid (>= 0.3.0)
54+
activemodel (4.2.7)
55+
activesupport (= 4.2.7)
56+
builder (~> 3.1)
57+
activesupport (4.2.7)
58+
i18n (~> 0.7)
59+
json (~> 1.7, >= 1.7.7)
60+
minitest (~> 5.1)
61+
thread_safe (~> 0.3, >= 0.3.4)
62+
tzinfo (~> 1.1)
63+
addressable (2.4.0)
64+
amq-protocol (2.0.1)
65+
ansi (1.5.0)
66+
aws-sdk (2.4.3)
67+
aws-sdk-resources (= 2.4.3)
68+
aws-sdk-core (2.4.3)
69+
jmespath (~> 1.0)
70+
aws-sdk-resources (2.4.3)
71+
aws-sdk-core (= 2.4.3)
72+
bson (4.1.1)
73+
builder (3.2.2)
74+
bunny (2.4.0)
75+
amq-protocol (>= 2.0.1)
76+
capybara (2.7.1)
77+
addressable
78+
mime-types (>= 1.16)
79+
nokogiri (>= 1.3.3)
80+
rack (>= 1.0.0)
81+
rack-test (>= 0.5.4)
82+
xpath (~> 2.0)
83+
cocoapods-core (0.39.0)
84+
activesupport (>= 4.0.2)
85+
fuzzy_match (~> 2.0.4)
86+
nap (~> 1.0)
87+
crack (0.4.3)
88+
safe_yaml (~> 1.0.0)
89+
dalli (2.7.6)
90+
database_cleaner (1.5.3)
91+
descendants_tracker (0.0.4)
92+
thread_safe (~> 0.3, >= 0.3.1)
93+
diff-lcs (1.2.5)
94+
docile (1.1.5)
95+
domain_name (0.5.20160615)
96+
unf (>= 0.0.5, < 1.0.0)
97+
erubis (2.7.0)
98+
etcd (0.3.0)
99+
mixlib-log
100+
factory_girl (4.7.0)
101+
activesupport (>= 3.0.0)
102+
fakeweb (1.3.0)
103+
faraday (0.9.2)
104+
multipart-post (>= 1.2, < 3)
105+
fuzzy_match (2.0.4)
106+
gene_pool (1.4.1)
107+
thread_safe
108+
git (1.3.0)
109+
github_api (0.14.4)
110+
addressable (~> 2.4.0)
111+
descendants_tracker (~> 0.0.4)
112+
faraday (~> 0.8, < 0.10)
113+
hashie (>= 3.4)
114+
oauth2 (~> 1.0.0)
115+
globalid (0.3.7)
116+
activesupport (>= 4.1.0)
117+
hashdiff (0.3.0)
118+
hashie (3.4.4)
119+
hashr (0.0.22)
120+
highline (1.7.8)
121+
http-cookie (1.0.2)
122+
domain_name (~> 0.5)
123+
httparty (0.13.7)
124+
json (~> 1.8)
125+
multi_xml (>= 0.5.2)
126+
i18n (0.7.0)
127+
jeweler (2.1.1)
128+
builder
129+
bundler (>= 1.0)
130+
git (>= 1.2.5)
131+
github_api
132+
highline (>= 1.6.15)
133+
nokogiri (>= 1.5.10)
134+
rake
135+
rdoc
136+
semver
137+
jmespath (1.3.1)
138+
json (1.8.3)
139+
jwt (1.5.4)
140+
loofah (2.0.3)
141+
nokogiri (>= 1.5.9)
142+
mail (2.6.4)
143+
mime-types (>= 1.16, < 4)
144+
mime-types (2.99.2)
145+
mini_portile2 (2.1.0)
146+
minitest (5.9.0)
147+
mixlib-log (1.6.0)
148+
mongo (2.2.7)
149+
bson (~> 4.0)
150+
mongoid (5.1.3)
151+
activemodel (~> 4.0)
152+
mongo (~> 2.1)
153+
origin (~> 2.2)
154+
tzinfo (>= 0.3.37)
155+
multi_json (1.12.1)
156+
multi_xml (0.5.5)
157+
multipart-post (2.0.0)
158+
nap (1.1.0)
159+
narray (0.6.1.2)
160+
naturalsorter (3.0.14)
161+
net-ldap (0.15.0)
162+
netrc (0.11.0)
163+
nokogiri (1.6.8)
164+
mini_portile2 (~> 2.1.0)
165+
pkg-config (~> 1.1.7)
166+
oauth (0.5.1)
167+
oauth2 (1.0.0)
168+
faraday (>= 0.8, < 0.10)
169+
jwt (~> 1.0)
170+
multi_json (~> 1.3)
171+
multi_xml (~> 0.5)
172+
rack (~> 1.2)
173+
octokit (4.3.0)
174+
sawyer (~> 0.7.0, >= 0.5.3)
175+
origin (2.2.0)
176+
pdfkit (0.8.2)
177+
persistent_http (1.0.6)
178+
gene_pool (>= 1.3)
179+
persistent_httparty (0.1.2)
180+
httparty (~> 0.9)
181+
persistent_http (< 2)
182+
pkg-config (1.1.7)
183+
rack (1.6.4)
184+
rack-test (0.6.3)
185+
rack (>= 1.0)
186+
rails-deprecated_sanitizer (1.0.3)
187+
activesupport (>= 4.2.0.alpha)
188+
rails-dom-testing (1.0.7)
189+
activesupport (>= 4.2.0.beta, < 5.0)
190+
nokogiri (~> 1.6.0)
191+
rails-deprecated_sanitizer (>= 1.0.1)
192+
rails-html-sanitizer (1.0.3)
193+
loofah (~> 2.0)
194+
rake (11.2.2)
195+
rdoc (4.2.2)
196+
json (~> 1.4)
197+
rest-client (1.8.0)
198+
http-cookie (>= 1.0.2, < 2.0)
199+
mime-types (>= 1.16, < 3.0)
200+
netrc (~> 0.7)
201+
rspec (3.5.0)
202+
rspec-core (~> 3.5.0)
203+
rspec-expectations (~> 3.5.0)
204+
rspec-mocks (~> 3.5.0)
205+
rspec-core (3.5.1)
206+
rspec-support (~> 3.5.0)
207+
rspec-expectations (3.5.0)
208+
diff-lcs (>= 1.2.0, < 2.0)
209+
rspec-support (~> 3.5.0)
210+
rspec-mocks (3.5.0)
211+
diff-lcs (>= 1.2.0, < 2.0)
212+
rspec-support (~> 3.5.0)
213+
rspec-support (3.5.0)
214+
rspec_junit_formatter (0.2.3)
215+
builder (< 4)
216+
rspec-core (>= 2, < 4, != 2.12.0)
217+
rufus-scheduler (3.2.1)
218+
safe_yaml (1.0.4)
219+
sawyer (0.7.0)
220+
addressable (>= 2.3.5, < 2.5)
221+
faraday (~> 0.8, < 0.10)
222+
semver (1.0.1)
223+
semverly (1.0.0)
224+
shoulda (3.5.0)
225+
shoulda-context (~> 1.0, >= 1.0.1)
226+
shoulda-matchers (>= 1.4.1, < 3.0)
227+
shoulda-context (1.2.1)
228+
shoulda-matchers (2.8.0)
229+
activesupport (>= 3.0.0)
230+
simplecov (0.12.0)
231+
docile (~> 1.1.0)
232+
json (>= 1.8, < 3)
233+
simplecov-html (~> 0.10.0)
234+
simplecov-html (0.10.0)
235+
stripe (1.48.0)
236+
rest-client (>= 1.4, < 3.0)
237+
tf-idf-similarity (0.1.5)
238+
unicode_utils (~> 1.4)
239+
thread_safe (0.3.5)
240+
tire (0.6.2)
241+
activemodel (>= 3.0)
242+
activesupport
243+
ansi
244+
hashr (~> 0.0.19)
245+
multi_json (~> 1.3)
246+
rake
247+
rest-client (~> 1.6)
248+
tzinfo (1.2.2)
249+
thread_safe (~> 0.1)
250+
unf (0.1.4)
251+
unf_ext
252+
unf_ext (0.0.7.2)
253+
unicode_utils (1.4.0)
254+
vcr (3.0.3)
255+
webmock (2.1.0)
256+
addressable (>= 2.3.6)
257+
crack (>= 0.3.2)
258+
hashdiff
259+
will_paginate (3.1.0)
260+
will_paginate_mongoid (2.0.1)
261+
mongoid
262+
will_paginate (~> 3.0)
263+
wkhtmltopdf-binary (0.9.9.3)
264+
xpath (2.0.0)
265+
nokogiri (~> 1.3)
266+
267+
PLATFORMS
268+
ruby
269+
270+
DEPENDENCIES
271+
bundler (~> 1.12.0)
272+
capybara (~> 2.7.1)
273+
database_cleaner (~> 1.5.1)
274+
factory_girl (~> 4.7.0)
275+
fakeweb (~> 1.3.0)
276+
jeweler (~> 2.1.1)
277+
narray (= 0.6.1.2)
278+
rdoc (~> 4.2.0)
279+
rspec (~> 3.5.0)
280+
rspec_junit_formatter (= 0.2.3)
281+
rufus-scheduler (= 3.2.1)
282+
shoulda
283+
simplecov (~> 0.12.0)
284+
tf-idf-similarity (= 0.1.5)
285+
vcr (~> 3.0.1)
286+
versioneye-core!
287+
webmock (~> 2.1.0)
288+
289+
BUNDLED WITH
290+
1.12.5

data/licenses.json

+1,913-1
Large diffs are not rendered by default.

lib/versioneye/crawlers/license_crawler.rb

+25-17
Original file line numberDiff line numberDiff line change
@@ -40,8 +40,25 @@ def self.crawl_unidentified_urls(language)
4040
licenses = License.where(language: language, spdx_identifier: nil)
4141
licenses.to_a.each do |lic_db|
4242
n += 1
43-
updated_lic = crawl_license_file(lic_matcher, lic_db)
44-
failed += 1 if updated_lic.nil?
43+
#first try to match by url without doing http request
44+
spdx_id, score = lic_matcher.match_url(lic_db[:url])
45+
if spdx_id.nil?
46+
logger.info "crawl_unidentified_urls: going to fetch license text from #{lic_db[:url]}"
47+
spdx_id, score = crawl_license_file(lic_matcher, lic_db)
48+
else
49+
logger.info "crawl_unidentified_urls: found match by url - #{spdx_id}, #{lic_db[:url]}"
50+
end
51+
52+
if spdx_id.nil?
53+
logger.warn "crawl_unidentified_urls: detect no license for #{lic_db[:url]}"
54+
failed += 1
55+
next
56+
end
57+
58+
lic_db.update(
59+
spdx_identifier: spdx_id,
60+
name: spdx_id
61+
)
4562
end
4663

4764
logger.info "crawl_unidentified_urls: done! crawled #{n} licenses, skipped: #{failed}"
@@ -53,14 +70,14 @@ def self.crawl_license_file(lic_matcher, lic_db)
5370
url = lic_db[:url].to_s.strip
5471
if url.empty?
5572
logger.warn "crawl_license_file: no url for #{prod_id}."
56-
return
73+
return []
5774
end
5875

5976
res = fetch url
6077
if res.nil? or res.code != 200
6178
logger.error "crawl_license_file: failed to read #{prod_id} data from: #{url} - #{res.try(:code)}"
6279
lic_db.update(name: 'unknown')
63-
return
80+
return []
6481
end
6582

6683
matches = case res.headers["content-type"]
@@ -76,21 +93,17 @@ def self.crawl_license_file(lic_matcher, lic_db)
7693
best_match = matches.to_a.first
7794
if best_match.nil? or best_match.empty?
7895
logger.warn "crawl_license_file: found no match for #{prod_id}, #{url}"
79-
return
96+
return []
8097
end
8198

8299
if best_match.last < A_MIN_SCORE_CONFIDENCE
83100
logger.warn "crawl_license_file: #{prod_id} best match had too low score #{best_match}, #{url}"
84-
return
101+
return []
85102
end
86103

87104
spdx_id, score = best_match
88-
if url =~ /mit/i and spdx_id != 'MIT'
89-
logger.error "crawl_license_file: didnt detect MIT for #{prod_id}, #{matches}, #{url}"
90-
spdx_id, score = 'MIT', 1.0
91-
end
92105

93-
#TODO: if these special cases gets bigger -> refactor own method or change solution
106+
#TODO: if these special cases gets bigger -> refactor to License.match_url
94107
case url
95108
when 'http://jquery.org/license'
96109
spdx_id, score = 'MIT', 1.0 #Jquery license page doesnt include any license text
@@ -99,12 +112,7 @@ def self.crawl_license_file(lic_matcher, lic_db)
99112
end
100113

101114
logger.debug "crawl_license_file: best match for #{prod_id} => #{spdx_id}:#{score} #{url}"
102-
lic_db.update(
103-
spdx_identifier: spdx_id,
104-
name: spdx_id
105-
)
106-
107-
return lic_db
115+
[spdx_id, score]
108116
end
109117

110118

0 commit comments

Comments
 (0)