-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathfilter.go
187 lines (174 loc) · 4.87 KB
/
filter.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
// Package bloom provides a Bloom filter implementation.
//
// Bloom filters
//
// A Bloom filter is a fast and space-efficient probabilistic data structure
// used to test set membership.
//
// A membership test returns either ”likely member” or ”definitely not
// a member”. Only false positives can occur: an element that has been added
// to the filter will always be identified as ”likely member”.
//
// The probabilities of different outcomes of a membership test at
// a false-positives rate of 1/100 are:
//
// Test(s) true false
// --------------------------------------
// s has been added 1 0
// s has not been added 0.01 0.99
//
// Elements can be added, but not removed. With more elements in the filter,
// the probability of false positives increases.
//
// Performance
//
// A full filter with a false-positives rate of 1/p uses roughly
// 0.26ln(p) bytes per element and performs ⌈1.4ln(p)⌉ bit array lookups
// per test:
//
// p bytes lookups
// -------------------------
// 4 0.4 2
// 8 0.5 3
// 16 0.7 4
// 32 0.9 5
// 64 1.1 6
// 128 1.3 7
// 256 1.5 8
// 512 1.6 9
// 1024 1.8 10
//
// Each membership test makes a single call to a 128-bit hash function.
// This improves speed without increasing the false-positives rate
// as shown by Kirsch and Mitzenmacher.
//
// Limitations
//
// This implementation is not intended for cryptographic use.
//
// The internal data representation is different for big-endian
// and little-endian machines.
//
// Typical use case
//
// The Basics example contains a typcial use case:
// a blacklist of shady websites.
//
package bloom
import (
"math"
)
const (
shift = 6
mask = 0x3f
)
// Filter represents a Bloom filter.
type Filter struct {
data []uint64 // Bit array, the length is a power of 2.
lookups int // Lookups per query
count int64 // Estimate number of elements
}
// New creates an empty Bloom filter with room for n elements
// at a false-positives rate less than 1/p.
func New(n int, p int) *Filter {
minWords := int(0.0325 * math.Log(float64(p)) * float64(n))
words := 1
for words < minWords {
words *= 2
}
return &Filter{
data: make([]uint64, words),
lookups: int(1.4*math.Log(float64(p)) + 1),
}
}
// AddByte adds b to the filter and tells if b was already a likely member.
func (f *Filter) AddByte(b []byte) bool {
return f.add(hash(b))
}
// Add adds s to the filter and tells if s was already a likely member.
func (f *Filter) Add(s string) bool {
return f.add(hashString(s))
}
func (f *Filter) add(h1, h2 uint64) bool {
trunc := uint64(len(f.data))<<shift - 1
member := true
for i := f.lookups; i > 0; i-- {
h1 += h2
n := h1 & trunc
k, b := n>>shift, uint64(1<<uint(n&mask))
if f.data[k]&b == 0 {
member = false
f.data[k] |= b
}
}
if !member {
f.count++
}
return member
}
// TestByte tells if b is a likely member of the filter.
// If true, b is probably a member; if false, b is definitely not a member.
func (f *Filter) TestByte(b []byte) bool {
return f.test(hash(b))
}
// Test tells if s is a likely member of the filter.
// If true, s is probably a member; if false, s is definitely not a member.
func (f *Filter) Test(s string) bool {
return f.test(hashString(s))
}
func (f *Filter) test(h1, h2 uint64) bool {
trunc := uint64(len(f.data))<<shift - 1
for i := f.lookups; i > 0; i-- {
h1 += h2
n := h1 & trunc
k, b := n>>shift, uint64(1<<uint(n&mask))
if f.data[k]&b == 0 {
return false
}
}
return true
}
// Count returns an estimate of the number of elements in the filter.
func (f *Filter) Count() int64 {
return f.count
}
// Union returns a new Bloom filter that consists of all elements
// that belong to either f1 or f2. The two filters must be of
// the same size n and have the same false-positives rate p.
//
// The resulting filter is the same as the filter created
// from scratch using the union of the two sets.
func (f1 *Filter) Union(f2 *Filter) *Filter {
if len(f1.data) != len(f2.data) || f1.lookups != f2.lookups {
panic("operation requires filters of the same type")
}
len := len(f1.data)
res := &Filter{
data: make([]uint64, len),
lookups: f1.lookups,
}
bitCount := 0
for i := 0; i < len; i++ {
w := f1.data[i] | f2.data[i]
res.data[i] = w
bitCount += count(w)
}
// Estimate the number of elements from the bitCount.
m := 64 * float64(len)
n := m / float64(f1.lookups) * math.Log(m/(m-float64(bitCount)))
res.count = int64(n)
return res
}
// count returns the number of nonzero bits in w.
func count(w uint64) int {
// Adapted from github.com/yourbasic/bit/funcs.go.
const maxw = 1<<64 - 1
const bpw = 64
w -= (w >> 1) & (maxw / 3)
w = w&(maxw/15*3) + (w>>2)&(maxw/15*3)
w += w >> 4
w &= maxw / 255 * 15
w *= maxw / 255
w >>= (bpw/8 - 1) * 8
return int(w)
}