@@ -38,7 +38,7 @@ def initialize(files_path = DEFAULT_CORPUS_FILES_PATH, license_json_file = LICEN
38
38
39
39
def init_id_idx ( licenses_json_doc )
40
40
idx = { }
41
- licenses_json_doc . to_a . each do |spdx_item |
41
+ licenses_json_doc . to_a . each do |spdx_item |
42
42
lic_id = spdx_item [ :id ] . to_s . downcase
43
43
idx [ lic_id ] = spdx_item [ :id ]
44
44
end
@@ -148,12 +148,12 @@ def match_url(the_url)
148
148
def match_rules ( text , early_exit = false )
149
149
matches = [ ]
150
150
text = preprocess_text ( text )
151
-
151
+
152
152
#if text is already spdx_id, then shortcut matching
153
153
if @rules . has_key? ( text . downcase )
154
- return [ [ text . downcase , 1.0 ] ]
154
+ return [ [ text . downcase , 1.0 ] ]
155
155
end
156
-
156
+
157
157
text += ' ' # required to make wordborder matcher to work with 1word texts
158
158
@rules . each do |spdx_id , rules |
159
159
match_res = matches_any_rule? ( rules , text )
@@ -176,7 +176,7 @@ def match_rules(text, early_exit = false)
176
176
# Tells whether a license text should be skipped.
#
# The text is stringified with `to_s` and checked against the rules returned
# by `get_ignore_rules`; the answer is true whenever `matches_any_rule?`
# hands back anything other than nil.
#
# @param lic_text [#to_s] license text or name to check
# @return [Boolean] true when the ignore rules produced a non-nil result
def ignore?(lic_text)
  !matches_any_rule?(get_ignore_rules, lic_text.to_s).nil?
end
181
181
182
182
def matches_any_rule? ( rules , license_name )
@@ -196,19 +196,19 @@ def matches_any_rule?(rules, license_name)
196
196
#-- helpers
197
197
# Cleans a piece of license text before it is matched against rules.
#
# The input is first passed through `safe_encode` (defined elsewhere in this
# file), then a fixed sequence of substitutions is applied in order, and
# finally the result is trimmed and every whitespace run is collapsed into a
# single space.
#
# @param text [Object] raw text; whatever `safe_encode` accepts
# @return [String] the cleaned, single-spaced text
def preprocess_text(text)
  # [pattern, replacement] pairs; the order matters and mirrors the cleanup steps.
  substitutions = [
    [/\[.+?\]\(.+?\)/, ' '],       # remove markdown url tags
    [/\bTHE\b/i, ''],              # remove spam words
    [/\<\!\[CDATA.*?\]\]\>/, ' '], # remove XML garbage: CDATA sections
    [/\<\!--.+?--\>/, ' '],        # remove XML garbage: comments
    [/<\!\[CDATA.+?\]>/, ' ']      # remove XML garbage: CDATA closed by a single bracket
  ]

  cleaned = substitutions.reduce(safe_encode(text)) do |acc, (pattern, replacement)|
    acc.gsub(pattern, replacement)
  end

  cleaned.to_s.strip.gsub(/\s+/, ' ')
end
213
213
214
214
def preprocess_html ( html_text )
@@ -292,13 +292,13 @@ def rank_text_and_rules_matches(text_results, rules_results)
292
292
t_max = t_scores . max
293
293
294
294
r_total = rules_results . reduce ( 0 ) { |acc , i | acc += i [ 3 ] ; acc }
295
-
295
+
296
296
text_results . to_a . each do |t |
297
297
#find match in rules results and calc rankscore
298
298
lic_id = t [ 0 ] . downcase
299
299
r = rules_results . to_a . find { |x | x [ 0 ] . downcase == lic_id }
300
300
next if r . nil? #there was no matching rules
301
-
301
+
302
302
t_score = t [ 1 ] #score of confidence
303
303
r_score = r [ 3 ] #length of match
304
304
@@ -374,7 +374,7 @@ def safe_encode(txt)
374
374
def init_rules ( license_json_doc )
375
375
rules = { }
376
376
rules = build_rules_from_spdx_json ( license_json_doc )
377
-
377
+
378
378
get_custom_rules . each do |spdx_id , custom_rules_array |
379
379
spdx_id = spdx_id . to_s . strip . downcase
380
380
@@ -419,7 +419,7 @@ def build_spdx_item_rules(spdx_item)
419
419
rules << Regexp . new ( "\\ b[\\ (]?https?:\/ \/ (www\. )?#{ lic_url } [\/ ]?[\\ )]?\\ b" . gsub ( /\s +/ , '' ) , Regexp ::IGNORECASE )
420
420
end
421
421
422
-
422
+
423
423
spdx_name = preprocess_text ( spdx_item [ :name ] )
424
424
spdx_name . gsub! ( /\( .+?\) / , '' ) #remove SPDX ids in the license names
425
425
spdx_name . gsub! ( /\. / , '\\.' ) #mark version dots as not regex selector
@@ -429,7 +429,7 @@ def build_spdx_item_rules(spdx_item)
429
429
spdx_name . gsub! ( /\s +/ , '\\s\\+' ) #replace spaces with space selector
430
430
431
431
rules << Regexp . new ( "\\ b#{ spdx_name } \\ b" , Regexp ::IGNORECASE )
432
-
432
+
433
433
#use spdx_id in full-text match only if it's unique and doesn't cause ambiguity like MIT, Fair, Glide
434
434
spdx_id = spdx_item [ :id ] . to_s . strip . downcase
435
435
if spdx_id . match /[\d |-]|ware\z /
@@ -463,7 +463,7 @@ def get_ignore_rules
463
463
/^\( c\) \s +\d {2,4}\d / ,
464
464
/^LICENSE\s *$/i , /^FREE\s *$/i , /\A See\s License\s *\b /i , /^TODO\s *$/i , /^FREEWARE\s *$/i ,
465
465
/^All\s rights\s reserved\s *$/i , /^COPYING\s *$/i , /^OTHER\s *$/i , /^NONE\s *$/i , /^DUAL\s *$/i ,
466
- /^KEEP\s +IT\s +REAL\s *\b /i , /\A BE\s +REAL\s *\z /i , /\A Private\s *\z /i , /\A Commercial\s *\z /i ,
466
+ /^KEEP\s +IT\s +REAL\s *\b /i , /\A BE\s +REAL\s *\z /i , /\A Private\s *\z /i , /\A Commercial\s *\z /i ,
467
467
/\A See\s +LICENSE\s +file\b /i , /\A See\s the\s LICENSE\b /i , /\A LICEN[C|S]E\s *\z /i ,
468
468
/^PUBLIC\s *$/i , /^see file LICENSE\s *$/i , /^__license__\s *$/i ,
469
469
/\b LIEULA\b /i , /\A EULA\s *\z /i , /^qQuickLicen[c|s]e\b /i , /^For\s fun\b /i , /\A Various\s *\z /i ,
@@ -666,7 +666,7 @@ def get_custom_rules
666
666
] ,
667
667
"ECL-1.0" => [ /\b ECL[-|\s |_]?v?1\. 0\b /i , /\b ECL[-|\s |_]?v?1\b /i ] ,
668
668
"ECL-2.0" => [
669
- /\b ECL[-|\s |_]?v?2\. 0\b /i , /\b ECL[-|\s |_]?v?2\b /i ,
669
+ /\b ECL[-|\s |_]?v?2\. 0\b /i , /\b ECL[-|\s |_]?v?2\b /i ,
670
670
/\b EDUCATIONAL\s +COMMUNITY\s +LICENSE[,]?\s VERSION\s 2\. 0\b /i
671
671
] ,
672
672
"EFL-1.0" => [ /\b EFL[-|\s |_]?v?1\. 0\b /i , /\b EFL[-|\s |_]?v?1\b /i ] ,
@@ -677,7 +677,7 @@ def get_custom_rules
677
677
/\b Eiffel\s Forum\s License\b /i
678
678
] ,
679
679
"EPL-1.0" => [
680
- /\b EPL[-|\s |_]?v?1\. 0\b /i , /\b EPL[-|\s |_]?v?1\b /i ,
680
+ /\b EPL[-|\s |_]?v?1\. 0\b /i , /\b EPL[-|\s |_]?v?1\b /i ,
681
681
/\b ECLIPSE\s +PUBLIC\s +LICENSE\s +[v]?1\. 0\b /i ,
682
682
/\b ECLIPSE\s +PUBLIC\s +LICENSE\b /i ,
683
683
/^ECLIPSE$/i , /\A EPL\s *\z /
@@ -700,7 +700,7 @@ def get_custom_rules
700
700
/\b [\( ]?FDL[\) ]?\b /
701
701
] ,
702
702
"GPL-1.0" => [
703
- /\b GPL[-|\s |_]?v?1\. 0\b /i , /\b GPL[-|\s |_]?v?1\b /i ,
703
+ /\b GPL[-|\s |_]?v?1\. 0\b /i , /\b GPL[-|\s |_]?v?1\b /i ,
704
704
/\b GNU\s PUBLIC\s LICEN[S|C]E\s v?1\b /i
705
705
] ,
706
706
"GPL-2.0" => [
@@ -747,8 +747,8 @@ def get_custom_rules
747
747
/\b LESSER\s GENERAL\s PUBLIC\s LICENSE[\, ]?\s Version\s 2\. 1[\, ]?\b /i ,
748
748
/\b LESSER\s GENERAL\s PUBLIC\s LICENSE[\, ]?\s v?2\. 1\b /i
749
749
] ,
750
- "LGPL-3.0" => [ /\b LGPL[-|\s |_]?v?3\. 0\b /i , /\b LGPL[-|\s |_]?v?3[\+ ]?\b /i ,
751
- /\b LGLP[\s |-|v]?3\. 0\b /i , /^LPLv3\s *$/ , /\b LPGL[-|\s |_]?v?3[\+ ]?\b /i ,
750
+ "LGPL-3.0" => [ /\b LGPL[-|\s |_]?v?3\. 0\b /i , /\b LGPL[-|\s |_]?v?3[\+ ]?\b /i ,
751
+ /\b LGLP[\s |-|v]?3\. 0\b /i , /^LPLv3\s *$/ , /\b LPGL[-|\s |_]?v?3[\+ ]?\b /i ,
752
752
/\b LESSER\s +GENERAL\s +PUBLIC\s +License\s +[v]?3\b /i ,
753
753
/\b Lesser\s General\s Public\s License\s v?\. ?\s +3\. 0\b /i ,
754
754
/\b https?:\/ \/ www\. gnu\. org\/ copyleft\/ lesser.html\b /i ,
@@ -774,7 +774,7 @@ def get_custom_rules
774
774
/\b MPL[-|\s |\_ ]?v?1\. 1\b /i ,
775
775
] ,
776
776
"MPL-2.0" => [
777
- /\b MPL[-|\s |\_ ]?v?2\. 0\b /i , /\b MPL[-|\s |\_ ]?v?2\b /i ,
777
+ /\b MPL[-|\s |\_ ]?v?2\. 0\b /i , /\b MPL[-|\s |\_ ]?v?2\b /i ,
778
778
/\b MOZILLA\s +PUBLIC\s +LICENSE\s +2\. 0\b /i ,
779
779
/\b Mozilla\s Public\s License[\, ]?\s +v?[\. ]?\s *2\. 0\b /i ,
780
780
/\b MOZILLA\s +PUBLIC\s +LICENSE[,]?\s +version\s +2\. 0\b /i ,
@@ -831,7 +831,7 @@ def get_custom_rules
831
831
/\b RPL[-|\s |_]?v?1\. 5\b /i , /\A RPL\s *\z /i ,
832
832
/\b https?:\/ \/ www\. opensource\. org\/ licenses\/ rpl\. php\b /i
833
833
] ,
834
- "Ruby" => [ /\b RUBY\s LICEN[S|C]E\b /i , /\A RUBY\b /i , /\b RUBY\' s\b /i ] ,
834
+ "Ruby" => [ /\b RUBY\s LICEN[S|C]E\b /i , /\A RUBY\b /i , /\b RUBY\' s\b /i ] ,
835
835
"QPL-1.0" => [ /\b QPL[-|\s |_]?v?1\. 0\b /i ,
836
836
/\b QT\s Public\s Licen[c|s]e\b /i ,
837
837
/\b PyQ\s General\s License\b /i ] ,
@@ -863,7 +863,7 @@ def get_custom_rules
863
863
"ZPL-1.1" => [ /\b ZPL[-|\s |\_ ]?v?1\. 1\b /i , /\b ZPL[-|\s |\_ ]?v?1(?!\. )\b /i ,
864
864
/\b ZPL[-|\s |\_ ]?1\. 0\b /i ] ,
865
865
"ZPL-2.1" => [
866
- /\b ZPL[-|\s |\/ |_|]?v?2\. 1\b /i , /\b ZPL[-|\s |_]?v?2(?!\. )\b /i ,
866
+ /\b ZPL[-|\s |\/ |_|]?v?2\. 1\b /i , /\b ZPL[-|\s |_]?v?2(?!\. )\b /i ,
867
867
/\b ZPL\s +2\. \d \b /i , /\b ZOPE\s +PUBLIC\s +LICENSE\b /i ,
868
868
/\b ZPL\s ?$/i
869
869
] ,
0 commit comments