Skip to content

Commit 28cebcd

Browse files
committed
Update tests
1 parent 99b1876 commit 28cebcd

File tree

4 files changed

+54
-31
lines changed

4 files changed

+54
-31
lines changed

Dockerfile

+3
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
FROM versioneye/ruby-base:2.4.32
22
MAINTAINER Robert Reiz <[email protected]>
33

4+
RUN rm -Rf /app; \
5+
mkdir /app
6+
47
ADD . /app
58

69
RUN cp /app/supervisord.conf /etc/supervisord.conf; \

lib/versioneye/utils/license_matcher.rb

+22-22
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ def initialize(files_path = DEFAULT_CORPUS_FILES_PATH, license_json_file = LICEN
3838

3939
def init_id_idx(licenses_json_doc)
4040
idx = {}
41-
licenses_json_doc.to_a.each do |spdx_item|
41+
licenses_json_doc.to_a.each do |spdx_item|
4242
lic_id = spdx_item[:id].to_s.downcase
4343
idx[lic_id] = spdx_item[:id]
4444
end
@@ -148,12 +148,12 @@ def match_url(the_url)
148148
def match_rules(text, early_exit = false)
149149
matches = []
150150
text = preprocess_text(text)
151-
151+
152152
#if text is already spdx_id, then shortcut matching
153153
if @rules.has_key?(text.downcase)
154-
return [[text.downcase, 1.0]]
154+
return [[text.downcase, 1.0]]
155155
end
156-
156+
157157
text += ' ' # required to make wordborder matcher to work with 1word texts
158158
@rules.each do |spdx_id, rules|
159159
match_res = matches_any_rule?(rules, text)
@@ -176,7 +176,7 @@ def match_rules(text, early_exit = false)
176176
def ignore?(lic_text)
177177
ignore_rules = get_ignore_rules
178178
m = matches_any_rule?(ignore_rules, lic_text.to_s)
179-
not m.nil?
179+
not m.nil?
180180
end
181181

182182
def matches_any_rule?(rules, license_name)
@@ -196,19 +196,19 @@ def matches_any_rule?(rules, license_name)
196196
#-- helpers
197197
def preprocess_text(text)
198198
text = safe_encode(text)
199-
199+
200200
#remove markdown url tags
201201
text = text.gsub(/\[.+?\]\(.+?\)/, ' ')
202202

203203
#remove spam words
204204
text.gsub!(/\bTHE\b/i, '')
205-
205+
206206
#remove some XML garbage
207207
text = text.gsub(/\<\!\[CDATA.*?\]\]\>/, ' ').to_s
208208
text = text.gsub(/\<\!--.+?--\>/, ' ').to_s
209209
text = text.gsub(/<\!\[CDATA.+?\]>/, ' ').to_s
210210

211-
return text.to_s.strip.gsub(/\s+/, ' ')
211+
return text.to_s.strip.gsub(/\s+/, ' ')
212212
end
213213

214214
def preprocess_html(html_text)
@@ -292,13 +292,13 @@ def rank_text_and_rules_matches(text_results, rules_results)
292292
t_max = t_scores.max
293293

294294
r_total = rules_results.reduce(0) {|acc, i| acc += i[3]; acc}
295-
295+
296296
text_results.to_a.each do |t|
297297
#find match in rules results and calc rankscore
298298
lic_id = t[0].downcase
299299
r = rules_results.to_a.find {|x| x[0].downcase == lic_id }
300300
next if r.nil? #there was no matching rules
301-
301+
302302
t_score = t[1] #score of confidence
303303
r_score = r[3] #length of match
304304

@@ -374,7 +374,7 @@ def safe_encode(txt)
374374
def init_rules(license_json_doc)
375375
rules = {}
376376
rules = build_rules_from_spdx_json(license_json_doc)
377-
377+
378378
get_custom_rules.each do |spdx_id, custom_rules_array|
379379
spdx_id = spdx_id.to_s.strip.downcase
380380

@@ -419,7 +419,7 @@ def build_spdx_item_rules(spdx_item)
419419
rules << Regexp.new("\\b[\\(]?https?:\/\/(www\.)?#{lic_url}[\/]?[\\)]?\\b".gsub(/\s+/, ''), Regexp::IGNORECASE)
420420
end
421421

422-
422+
423423
spdx_name = preprocess_text(spdx_item[:name])
424424
spdx_name.gsub!(/\(.+?\)/, '') #remove SPDX ids in the license names
425425
spdx_name.gsub!(/\./, '\\.') #mark version dots as not regex selector
@@ -429,7 +429,7 @@ def build_spdx_item_rules(spdx_item)
429429
spdx_name.gsub!(/\s+/, '\\s\\+') #replace spaces with space selector
430430

431431
rules << Regexp.new("\\b#{spdx_name}\\b", Regexp::IGNORECASE)
432-
432+
433433
#use spdx_id in full-text match only if it's unique and unambiguous (unlike ids such as MIT, Fair, Glide)
434434
spdx_id = spdx_item[:id].to_s.strip.downcase
435435
if spdx_id.match /[\d|-]|ware\z/
@@ -463,7 +463,7 @@ def get_ignore_rules
463463
/^\(c\)\s+\d{2,4}\d/,
464464
/^LICENSE\s*$/i, /^FREE\s*$/i, /\ASee\sLicense\s*\b/i, /^TODO\s*$/i, /^FREEWARE\s*$/i,
465465
/^All\srights\sreserved\s*$/i, /^COPYING\s*$/i, /^OTHER\s*$/i, /^NONE\s*$/i, /^DUAL\s*$/i,
466-
/^KEEP\s+IT\s+REAL\s*\b/i, /\ABE\s+REAL\s*\z/i, /\APrivate\s*\z/i, /\ACommercial\s*\z/i,
466+
/^KEEP\s+IT\s+REAL\s*\b/i, /\ABE\s+REAL\s*\z/i, /\APrivate\s*\z/i, /\ACommercial\s*\z/i,
467467
/\ASee\s+LICENSE\s+file\b/i, /\ASee\sthe\sLICENSE\b/i, /\ALICEN[C|S]E\s*\z/i,
468468
/^PUBLIC\s*$/i, /^see file LICENSE\s*$/i, /^__license__\s*$/i,
469469
/\bLIEULA\b/i, /\AEULA\s*\z/i, /^qQuickLicen[c|s]e\b/i, /^For\sfun\b/i, /\AVarious\s*\z/i,
@@ -666,7 +666,7 @@ def get_custom_rules
666666
],
667667
"ECL-1.0" => [ /\bECL[-|\s|_]?v?1\.0\b/i, /\bECL[-|\s|_]?v?1\b/i ],
668668
"ECL-2.0" => [
669-
/\bECL[-|\s|_]?v?2\.0\b/i, /\bECL[-|\s|_]?v?2\b/i,
669+
/\bECL[-|\s|_]?v?2\.0\b/i, /\bECL[-|\s|_]?v?2\b/i,
670670
/\bEDUCATIONAL\s+COMMUNITY\s+LICENSE[,]?\sVERSION\s2\.0\b/i
671671
],
672672
"EFL-1.0" => [/\bEFL[-|\s|_]?v?1\.0\b/i, /\bEFL[-|\s|_]?v?1\b/i ],
@@ -677,7 +677,7 @@ def get_custom_rules
677677
/\bEiffel\sForum\sLicense\b/i
678678
],
679679
"EPL-1.0" => [
680-
/\bEPL[-|\s|_]?v?1\.0\b/i, /\bEPL[-|\s|_]?v?1\b/i,
680+
/\bEPL[-|\s|_]?v?1\.0\b/i, /\bEPL[-|\s|_]?v?1\b/i,
681681
/\bECLIPSE\s+PUBLIC\s+LICENSE\s+[v]?1\.0\b/i,
682682
/\bECLIPSE\s+PUBLIC\s+LICENSE\b/i,
683683
/^ECLIPSE$/i, /\AEPL\s*\z/
@@ -700,7 +700,7 @@ def get_custom_rules
700700
/\b[\(]?FDL[\)]?\b/
701701
],
702702
"GPL-1.0" => [
703-
/\bGPL[-|\s|_]?v?1\.0\b/i, /\bGPL[-|\s|_]?v?1\b/i,
703+
/\bGPL[-|\s|_]?v?1\.0\b/i, /\bGPL[-|\s|_]?v?1\b/i,
704704
/\bGNU\sPUBLIC\sLICEN[S|C]E\sv?1\b/i
705705
],
706706
"GPL-2.0" => [
@@ -747,8 +747,8 @@ def get_custom_rules
747747
/\bLESSER\sGENERAL\sPUBLIC\sLICENSE[\,]?\sVersion\s2\.1[\,]?\b/i,
748748
/\bLESSER\sGENERAL\sPUBLIC\sLICENSE[\,]?\sv?2\.1\b/i
749749
],
750-
"LGPL-3.0" => [/\bLGPL[-|\s|_]?v?3\.0\b/i, /\bLGPL[-|\s|_]?v?3[\+]?\b/i,
751-
/\bLGLP[\s|-|v]?3\.0\b/i, /^LPLv3\s*$/, /\bLPGL[-|\s|_]?v?3[\+]?\b/i,
750+
"LGPL-3.0" => [/\bLGPL[-|\s|_]?v?3\.0\b/i, /\bLGPL[-|\s|_]?v?3[\+]?\b/i,
751+
/\bLGLP[\s|-|v]?3\.0\b/i, /^LPLv3\s*$/, /\bLPGL[-|\s|_]?v?3[\+]?\b/i,
752752
/\bLESSER\s+GENERAL\s+PUBLIC\s+License\s+[v]?3\b/i,
753753
/\bLesser\sGeneral\sPublic\sLicense\sv?\.?\s+3\.0\b/i,
754754
/\bhttps?:\/\/www\.gnu\.org\/copyleft\/lesser.html\b/i,
@@ -774,7 +774,7 @@ def get_custom_rules
774774
/\bMPL[-|\s|\_]?v?1\.1\b/i,
775775
],
776776
"MPL-2.0" => [
777-
/\bMPL[-|\s|\_]?v?2\.0\b/i, /\bMPL[-|\s|\_]?v?2\b/i,
777+
/\bMPL[-|\s|\_]?v?2\.0\b/i, /\bMPL[-|\s|\_]?v?2\b/i,
778778
/\bMOZILLA\s+PUBLIC\s+LICENSE\s+2\.0\b/i,
779779
/\bMozilla\sPublic\sLicense[\,]?\s+v?[\.]?\s*2\.0\b/i,
780780
/\bMOZILLA\s+PUBLIC\s+LICENSE[,]?\s+version\s+2\.0\b/i,
@@ -831,7 +831,7 @@ def get_custom_rules
831831
/\bRPL[-|\s|_]?v?1\.5\b/i, /\ARPL\s*\z/i,
832832
/\bhttps?:\/\/www\.opensource\.org\/licenses\/rpl\.php\b/i
833833
],
834-
"Ruby" => [/\bRUBY\sLICEN[S|C]E\b/i, /\ARUBY\b/i, /\bRUBY\'s\b/i],
834+
"Ruby" => [/\bRUBY\sLICEN[S|C]E\b/i, /\ARUBY\b/i, /\bRUBY\'s\b/i],
835835
"QPL-1.0" => [/\bQPL[-|\s|_]?v?1\.0\b/i,
836836
/\bQT\sPublic\sLicen[c|s]e\b/i,
837837
/\bPyQ\sGeneral\sLicense\b/i],
@@ -863,7 +863,7 @@ def get_custom_rules
863863
"ZPL-1.1" => [/\bZPL[-|\s|\_]?v?1\.1\b/i, /\bZPL[-|\s|\_]?v?1(?!\.)\b/i,
864864
/\bZPL[-|\s|\_]?1\.0\b/i],
865865
"ZPL-2.1" => [
866-
/\bZPL[-|\s|\/|_|]?v?2\.1\b/i, /\bZPL[-|\s|_]?v?2(?!\.)\b/i,
866+
/\bZPL[-|\s|\/|_|]?v?2\.1\b/i, /\bZPL[-|\s|_]?v?2(?!\.)\b/i,
867867
/\bZPL\s+2\.\d\b/i, /\bZOPE\s+PUBLIC\s+LICENSE\b/i,
868868
/\bZPL\s?$/i
869869
],

scripts/runtests_local.sh

+1-1
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ echo "Going to run all specs"
44
export RAILS_ENV="test"
55
echo "Rails mode: $RAILS_ENV"
66

7-
rspec --format documentation
7+
rspec spec/* --format documentation
88

99
export RAILS_ENV="development"
1010
echo "Rails mode: $RAILS_ENV"

spec/versioneye/utils/license_matcher_spec.rb

+28-8
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33

44
describe LicenseMatcher do
55
lic_matcher = LicenseMatcher.new
6-
let(:licenses_json_path){'data/spdx_licenses/licenses.json'}
6+
let(:licenses_json_path){'data/spdx_licenses/licenses.json'}
77
let(:corpus_path){ 'data/spdx_licenses/plain' }
88
let(:spec_path){ 'spec/fixtures/files/licenses' }
99

@@ -12,7 +12,7 @@
1212
let(:lgpl_txt){ File.read("#{corpus_path}/LGPL-2.0") }
1313
let(:bsd3_txt){ File.read("#{corpus_path}/BSD-3-Clause") }
1414
let(:dotnet_txt){ File.read('data/custom_licenses/ms_dotnet') }
15-
let(:mit_issue11){ File.read("#{spec_path}/mit_issue11.txt")}
15+
let(:mit_issue11){ File.read("#{spec_path}/mit_issue11.txt")}
1616

1717
it "finds correct matches for text files" do
1818
expect( lic_matcher.match_text(mit_txt).first.first ).to eq("mit")
@@ -26,7 +26,7 @@
2626
it "matches MIT license so it could fix the issue#11" do
2727
res = lic_matcher.match_text(mit_issue11)
2828
expect( res.size ).to eq(3)
29-
29+
3030
spdx_id, score = res.first
3131
expect( spdx_id ).to eq("mit")
3232
expect( score ).to be > 0.9
@@ -57,15 +57,15 @@
5757
expect( spdx_id ).to eq('apache-2.0')
5858
expect( score ).to be > min_score
5959

60-
spdx_id, score = lic_matcher.match_html(apache_plex).first
60+
spdx_id, score = lic_matcher.match_html(apache_plex).first
6161
expect( spdx_id ).to eq('apache-2.0')
6262
expect( score ).to be > min_score
6363

6464
spdx_id, score = lic_matcher.match_html(bsd_fparsec).first
6565
expect( spdx_id ).to eq('bsd-3-clear')
6666
expect( score ).to be > min_score
6767

68-
spdx_id, score = lic_matcher.match_html(mit_ooi).first
68+
spdx_id, score = lic_matcher.match_html(mit_ooi).first
6969
expect( spdx_id ).to eq('mit')
7070
expect( score ).to be > min_score
7171

@@ -77,16 +77,36 @@
7777

7878
spdx_id, score = lic_matcher.match_html(cpol).first
7979
expect( spdx_id ).to eq('cpol-1.02')
80-
expect( score ).to be > min_score
80+
expect( score ).to be > min_score
8181
end
8282

8383
it "matches all the license files in the corpuse correctly" do
8484
lic_matcher.spdx_ids.each do |lic_id|
8585
lic_id = lic_id.downcase
8686
next if lic_id == 'ms_dotnet' or lic_id == 'cpol-1.02'
87-
8887

89-
lic_txt = File.read "#{corpus_path}/#{lic_id}"
88+
lic_txt = nil
89+
begin
90+
lic_txt = File.read "#{corpus_path}/#{lic_id}"
91+
rescue => e
92+
p e.message
93+
end
94+
95+
if lic_txt.nil?
96+
begin
97+
lic_txt = File.read "#{corpus_path}/#{lic_id.upcase}"
98+
rescue => e
99+
p e.message
100+
end
101+
end
102+
103+
if lic_txt.nil?
104+
begin
105+
lic_txt = File.read "#{corpus_path}/#{lic_id.downcase}"
106+
rescue => e
107+
p e.message
108+
end
109+
end
90110

91111
res = lic_matcher.match_text(lic_txt)
92112
#p "#{lic_id} => #{res} "

0 commit comments

Comments
 (0)