-
Notifications
You must be signed in to change notification settings - Fork 13
/
bottomk.go
142 lines (109 loc) · 2.91 KB
/
bottomk.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
// Package minhash implements the bottom-k sketch for streaming set similarity.
/*
For more information, see:
http://research.neustar.biz/2012/07/09/sketch-of-the-day-k-minimum-values/
MinHashing:
http://infolab.stanford.edu/~ullman/mmds/ch3.pdf
https://en.wikipedia.org/wiki/MinHash
BottomK:
http://www.math.tau.ac.il/~haimk/papers/p225-cohen.pdf
http://cohenwang.org/edith/Papers/metrics394-cohen.pdf
http://www.mpi-inf.mpg.de/~rgemulla/publications/beyer07distinct.pdf
This package works best when provided with a strong 64-bit hash function, such as CityHash, Spooky, Murmur3, or SipHash.
*/
package minhash
import (
"container/heap"
"math"
"sort"
)
// intHeap is a slice of 64-bit hash values implementing the method set that
// container/heap expects. Its ordering is inverted, so the heap package
// maintains it as a max-heap with the largest value at index 0.
type intHeap []uint64

// Len reports how many values the heap currently holds.
func (hp intHeap) Len() int { return len(hp) }

// Less deliberately answers "is element a greater than element b": inverting
// the comparison is what turns the heap package's min-heap into a max-heap.
func (hp intHeap) Less(a, b int) bool { return hp[b] < hp[a] }

// Swap exchanges the values stored at positions a and b.
func (hp intHeap) Swap(a, b int) { hp[a], hp[b] = hp[b], hp[a] }

// Push appends x to the end of the slice. x must hold a uint64; any other
// dynamic type makes the type assertion panic.
func (hp *intHeap) Push(x interface{}) {
	v := x.(uint64)
	*hp = append(*hp, v)
}

// Pop removes the last value of the slice and returns it.
func (hp *intHeap) Pop() interface{} {
	s := *hp
	last := len(s) - 1
	tail := s[last]
	*hp = s[:last]
	return tail
}
// BottomK is a bottom-k sketch of a set: of the hash values produced for the
// pushed elements it retains at most size of the smallest ones.
type BottomK struct {
	// size is the maximum number of hash values kept in minimums.
	size int
	// h is the hash function applied to every pushed element.
	h Hash64
	// minimums holds the retained hash values as a max-heap, so index 0 is
	// the largest of them.
	minimums *intHeap
}
// NewBottomK builds an empty bottom-k sketch that hashes pushed elements with
// h and retains at most k hash values.
func NewBottomK(h Hash64, k int) *BottomK {
	retained := intHeap{}
	sketch := BottomK{size: k, h: h}
	sketch.minimums = &retained
	return &sketch
}
// Push adds an element to the set.
//
// b is hashed with the sketch's hash function and the hash is retained only
// if it belongs to the size smallest distinct hashes seen so far. Pushing an
// element whose hash is already retained is a no-op, so repeated elements do
// not occupy several slots of the sketch. A sketch created with a
// non-positive size retains nothing.
//
// A hash of 0 is ignored; presumably this keeps the retained maximum, which
// Cardinality divides by, from ever being zero.
func (m *BottomK) Push(b []byte) {
	if m.size <= 0 {
		// Nothing can be retained; also avoids indexing an empty heap below.
		return
	}
	v := m.h(b)
	if v == 0 {
		return
	}

	full := len(*m.minimums) >= m.size
	if full && v >= (*m.minimums)[0] {
		// Not smaller than the largest retained hash: outside the bottom k.
		return
	}

	// The sketch describes a set, so a hash may be stored only once. The scan
	// is linear in the number of retained hashes (at most size) and, once the
	// sketch is full, only runs for hashes below the current maximum.
	for _, kept := range *m.minimums {
		if kept == v {
			return
		}
	}

	if !full {
		heap.Push(m.minimums, v)
		return
	}
	// Replace the largest retained hash with the new, smaller one.
	(*m.minimums)[0] = v
	heap.Fix(m.minimums, 0)
}
// Merge folds the hashes retained by m2 into m, so that m becomes the sketch
// of the union of both sets. m2 is not modified.
//
// A hash that m already holds is skipped: re-inserting it would evict a
// legitimate larger minimum and leave the same value in the sketch twice,
// which is not the bottom-k of the union. A receiver created with a
// non-positive size retains nothing.
func (m *BottomK) Merge(m2 *BottomK) {
	if m.size <= 0 {
		// Nothing can be retained; also avoids indexing an empty heap below.
		return
	}

	// Hashes known to be in m (or already rejected as too large). Values
	// evicted during the merge are left in the set on purpose: they are at
	// least as large as the current maximum and could never be inserted.
	present := make(map[uint64]struct{}, len(*m.minimums))
	for _, v := range *m.minimums {
		present[v] = struct{}{}
	}

	for _, v := range *m2.minimums {
		if _, dup := present[v]; dup {
			continue
		}
		if len(*m.minimums) < m.size {
			heap.Push(m.minimums, v)
			present[v] = struct{}{}
			continue
		}
		if v < (*m.minimums)[0] {
			// Replace the largest retained hash with the smaller one.
			(*m.minimums)[0] = v
			heap.Fix(m.minimums, 0)
			present[v] = struct{}{}
		}
	}
}
// Cardinality estimates the cardinality of the set.
//
// An empty sketch reports 0. While the sketch holds fewer than size hashes
// nothing has been discarded, so the number of retained hashes is returned
// directly. Once the sketch is full the estimate is (k-1) / U, where k is
// the number of retained hashes and U is the largest of them scaled into
// (0, 1] by dividing by math.MaxUint64.
func (m *BottomK) Cardinality() int {
	mins := *m.minimums
	n := len(mins)
	if n == 0 {
		// No hashes retained; indexing the heap root below would panic.
		return 0
	}
	if n < m.size {
		// Under-filled sketch: the estimator formula does not apply (it
		// would, for instance, report 0 for a single retained hash).
		return n
	}
	// mins[0] is the heap root, i.e. the largest retained hash.
	kth := float64(mins[0]) / float64(math.MaxUint64)
	return int(float64(n-1) / kth)
}
// Signature returns the retained hash values as a freshly allocated slice
// sorted from largest to smallest. The sketch itself is left untouched, and
// the caller may modify the result freely.
func (m *BottomK) Signature() []uint64 {
	src := *m.minimums
	sig := make([]uint64, len(src))
	copy(sig, src)
	// Descending order, matching the inverted ordering of the heap type.
	sort.Slice(sig, func(a, b int) bool { return sig[a] > sig[b] })
	return sig
}
// Similarity computes an estimate for the similarity between the two sets.
//
// The estimate is the number of retained hash values the two sketches have
// in common, counted with multiplicity, divided by the larger of the two
// retained counts. Neither sketch is modified.
//
// It panics if the sketches were configured with different sizes. When both
// sketches are empty the division is 0/0 and the result is NaN.
func (m *BottomK) Similarity(m2 *BottomK) float64 {
	if m.size != m2.size {
		panic("minhash minimums size mismatch")
	}
	ours, theirs := *m.minimums, *m2.minimums

	// How many copies of each of our hashes are still unmatched.
	unmatched := make(map[uint64]int, len(ours))
	for _, hv := range ours {
		unmatched[hv]++
	}

	var shared int
	for _, hv := range theirs {
		if unmatched[hv] > 0 {
			unmatched[hv]--
			shared++
		}
	}

	denom := len(theirs)
	if len(ours) > denom {
		denom = len(ours)
	}
	return float64(shared) / float64(denom)
}