forked from healeycodes/crane-search
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsearch.go
141 lines (126 loc) · 3.13 KB
/
search.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
// Uses code from Simple Full-Text Search engine
// Copied/Modified from https://artem.krylysov.com/blog/2020/07/28/lets-build-a-full-text-search-engine/
// LICENSE: https://github.com/akrylysov/simplefts/blob/master/LICENSE
package search
import (
"strings"
"unicode"
snowballeng "github.com/kljensen/snowball/english"
)
// Document represents a text file
type Document struct {
	Title string // display title for the document
	URL   string // location the document came from / links to
	Text  string // full body text; this is what Add tokenizes and indexes
	ID    int    // numeric identifier stored in the inverted index's posting lists
}
// Result is a search result item
type Result struct {
	Title string // display title, copied from the matching Document
	URL   string // link target, copied from the matching Document
	ID    int    // matching Document's ID
}
// Index is an inverted Index. It maps tokens to document IDs.
// Each posting list holds IDs in insertion order with consecutive
// duplicates suppressed (see Add). Intersection requires ascending,
// duplicate-free lists, which holds provided documents are added in
// ascending ID order — NOTE(review): confirm callers guarantee that.
type Index map[string][]int
// Store contains results and their index
type Store struct {
	Index   Index    // inverted index over the documents' text
	Results []Result // result metadata, addressable by the IDs the index yields
}
// Add indexes every document in docs: each token produced by Analyze on
// a document's text gets that document's ID appended to its posting list.
func (index Index) Add(docs []Document) {
	for _, doc := range docs {
		for _, tok := range Analyze(doc.Text) {
			postings := index[tok]
			// Tokens from one document arrive consecutively, so a repeated
			// token within the same document shows up as the last appended
			// ID — skip it to keep the posting list duplicate-free.
			if n := len(postings); n > 0 && postings[n-1] == doc.ID {
				continue
			}
			index[tok] = append(postings, doc.ID)
		}
	}
}
// Intersection returns the set Intersection between a and b.
// a and b have to be sorted in ascending order and contain no duplicates.
func Intersection(a []int, b []int) []int {
	// The result can never be larger than the SMALLER input, so size the
	// capacity by min — the previous version over-allocated using max.
	minLen := len(a)
	if len(b) < minLen {
		minLen = len(b)
	}
	r := make([]int, 0, minLen)
	// Classic two-pointer merge: advance whichever side is behind;
	// on equality record the value and advance both.
	var i, j int
	for i < len(a) && j < len(b) {
		switch {
		case a[i] < b[j]:
			i++
		case a[i] > b[j]:
			j++
		default:
			r = append(r, a[i])
			i++
			j++
		}
	}
	return r
}
// Search queries the index for the given text. It returns the IDs of
// documents containing every analyzed token; nil means no match.
// NOTE(review): for a single-token query the returned slice aliases the
// index's internal posting list — callers should not mutate it.
func (index Index) Search(text string) []int {
	var matched []int
	first := true
	for _, token := range Analyze(text) {
		ids, ok := index[token]
		if !ok {
			// A token with no postings rules out every document.
			return nil
		}
		if first {
			matched = ids
			first = false
		} else {
			matched = Intersection(matched, ids)
		}
	}
	return matched
}
// Tokenize splits text into tokens, breaking on every rune that is
// neither a letter nor a number.
func Tokenize(text string) []string {
	isSeparator := func(r rune) bool {
		return !(unicode.IsLetter(r) || unicode.IsNumber(r))
	}
	return strings.FieldsFunc(text, isSeparator)
}
// Analyze analyzes the text and returns a slice of tokens.
func Analyze(text string) []string {
tokens := Tokenize(text)
tokens = LowercaseFilter(tokens)
tokens = StopwordFilter(tokens)
tokens = StemmerFilter(tokens)
return tokens
}
// LowercaseFilter returns a new slice containing every token from
// tokens converted to lower case.
func LowercaseFilter(tokens []string) []string {
	lowered := make([]string, 0, len(tokens))
	for _, tok := range tokens {
		lowered = append(lowered, strings.ToLower(tok))
	}
	return lowered
}
// stopwords is the fixed set of common English words removed before
// indexing and searching. Hoisted to package level so the map is built
// once instead of on every StopwordFilter call.
var stopwords = map[string]struct{}{
	"a": {}, "and": {}, "be": {}, "have": {}, "i": {},
	"in": {}, "of": {}, "that": {}, "the": {}, "to": {},
}

// StopwordFilter returns a slice of tokens with stop words removed.
func StopwordFilter(tokens []string) []string {
	r := make([]string, 0, len(tokens))
	for _, token := range tokens {
		if _, ok := stopwords[token]; !ok {
			r = append(r, token)
		}
	}
	return r
}
// StemmerFilter returns a new slice in which each token is replaced by
// its Snowball English stem.
func StemmerFilter(tokens []string) []string {
	stemmed := make([]string, 0, len(tokens))
	for _, tok := range tokens {
		// second argument false — NOTE(review): per the snowball package
		// this controls whether stop words are stemmed; confirm intent.
		stemmed = append(stemmed, snowballeng.Stem(tok, false))
	}
	return stemmed
}