-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcommon_word_matcher.go
87 lines (81 loc) · 1.46 KB
/
common_word_matcher.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
package langdetect
// IsCommonWord reports whether word is listed in commonWordMap for the
// language la.
//
// The word is looked up exactly as given; no case folding or trimming is
// performed here. A word that is absent from the table is common in no
// language.
func IsCommonWord(la Language, word string) bool {
	candidates, known := commonWordMap[word]
	if !known {
		return false
	}

	want := la.Number()
	for i := range candidates {
		if candidates[i] == want {
			return true
		}
	}
	return false
}
// commonWordMatcherState accumulates common-word statistics over a text.
//
// total is the number of words fed to addWord, and totalMatches is the
// number of per-language hits recorded across all of those words.
//
// counts holds one saturating byte counter per language number. The array
// has 0xFF (255) slots, so the valid indices are 0..254.
type commonWordMatcherState struct {
	total, totalMatches int

	counts [0xFF]byte
}
// addWord records a single word.
//
// Every call increments total. When the word is present in commonWordMap,
// each language listed for it gets its counter bumped and totalMatches
// grows by one per bumped counter. A counter that has already reached 0xFF
// is left alone and contributes nothing further to totalMatches.
func (c *commonWordMatcherState) addWord(word []byte) {
	c.total++

	langs, ok := commonWordMap[string(word)]
	if !ok {
		return
	}
	for _, lang := range langs {
		if c.counts[lang] == 0xFF {
			// Saturated: skip so the byte counter cannot wrap around.
			continue
		}
		c.counts[lang]++
		c.totalMatches++
	}
}
// score returns the mismatch score for the language numbered num, in the
// range 0..1; lower is better.
//
// With fewer than 5 recorded matches there is not enough evidence to rank
// anything, so the worst score (1.0) is returned. The same worst score is
// returned when num has no slot in counts: without this guard such a num
// would index past the end of the array and panic.
func (c *commonWordMatcherState) score(num uint8) float64 {
	if c.totalMatches < 5 {
		return 1.0
	}
	if int(num) >= len(c.counts) {
		return 1.0
	}
	return 1 - (float64(c.counts[num]) / float64(c.totalMatches))
}
// lowerkillpunctuation normalizes bs in place for word splitting.
//
//   - ASCII upper-case letters 'A'..'Z' are lower-cased.
//   - ASCII lower-case letters and the apostrophe are kept as they are.
//   - Every other ASCII byte (digits, punctuation, whitespace, control
//     characters) is replaced by a single space.
//   - Bytes >= 0x80 are left untouched. In UTF-8 every byte of a multi-byte
//     sequence is >= 0x80, so a pass that only rewrites ASCII bytes cannot
//     damage non-ASCII characters and needs no look-ahead or skipping.
//
// The length of bs never changes.
func lowerkillpunctuation(bs []byte) {
	for i, c := range bs {
		switch {
		case c >= 0x80:
			// Part of a multi-byte UTF-8 sequence (or a stray high byte):
			// keep as is.
		case c == '\'':
			// Apostrophes stay so contractions such as "don't" survive.
		case 'A' <= c && c <= 'Z':
			// Setting bit 5 maps an ASCII upper-case letter to lower case.
			bs[i] = c | 0x20
		case 'a' <= c && c <= 'z':
			// Already lower case.
		default:
			bs[i] = ' '
		}
	}
}
// processText splits bs into words and feeds each one to addWord.
//
// bs itself is not modified: the text is copied, the copy is normalized in
// place by lowerkillpunctuation, and the result is cut at every space byte.
// Runs of consecutive spaces produce no empty words.
func (c *commonWordMatcherState) processText(bs []byte) {
	tgt := make([]byte, len(bs))
	copy(tgt, bs)
	lowerkillpunctuation(tgt)

	start := 0 // index of the first byte of the word being scanned
	for i, b := range tgt {
		if b != ' ' {
			continue
		}
		if start != i {
			c.addWord(tgt[start:i])
		}
		start = i + 1
	}
	// Flush the final word: text that does not end in a space would
	// otherwise lose its last word, since words are only emitted on a space.
	if start < len(tgt) {
		c.addWord(tgt[start:])
	}
}