-
Notifications
You must be signed in to change notification settings - Fork 0
/
hll.go
121 lines (105 loc) · 3.43 KB
/
hll.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
package hll
import (
"errors"
"github.com/spaolacci/murmur3"
"hash"
"math"
"math/bits"
)
type HLL struct {
numRegisterBits int
registers []int
murmur32 hash.Hash32
}
// Returns new HLL instance configured to use the final 6 bits to denote the register (64 register total)
func NewHLL() HLL {
return NewHLLWithRegisterBits(6)
}
// Returns new HLL instance with the specified number of register bits
func NewHLLWithRegisterBits(numRegisterBits int) HLL {
numRegisters := int(math.Exp2(float64(numRegisterBits)))
registers := make([]int, numRegisters)
murmur32 := murmur3.New32()
hllInstance := HLL{numRegisterBits, registers, murmur32}
return hllInstance
}
// Add 32 bit hash to HLL
func (hll HLL) AddHash(hashedValue uint32) {
// bit mask to fetch bits representing register index to update
maskRegisterBits := ^uint32(0) >> uint32(32-hll.numRegisterBits)
registerIndex := uint32(hashedValue & maskRegisterBits)
remainingBits := hashedValue >> uint32(hll.numRegisterBits)
numRemainingBits := 32 - hll.numRegisterBits
trailingZeroes := bits.TrailingZeros32(remainingBits)
registerValue := 0
if trailingZeroes > numRemainingBits {
registerValue = numRemainingBits + 1
} else {
registerValue = trailingZeroes + 1
}
hll.registers[registerIndex] = int(math.Max(float64(hll.registers[registerIndex]), float64(registerValue)))
}
// Add string value to the HLL. MurmurHash is used to get 32 bit hash of string bytes.
func (hll HLL) AddString(value string) {
hll.murmur32.Write([]byte(value))
hashedValue := hll.murmur32.Sum32()
hll.AddHash(hashedValue)
}
// Computes the count/cardinality from the instance's register values
func (hll HLL) Count() float64 {
harmonicMean := 0.0
numZeroRegisters := 0.0
for _, registerVal := range hll.registers {
harmonicMean += math.Pow(2.0, -1*float64(registerVal))
if registerVal == 0 {
numZeroRegisters += 1.0
}
}
harmonicMean = 1.0 / harmonicMean
// TODO: figure out what alpha param means
estimate := getAlphaByNumRegisters(len(hll.registers)) * math.Pow(float64(len(hll.registers)), float64(2)) * float64(harmonicMean)
count := 0.0
// small range correction
if estimate <= (5.0/2.0)*float64(len(hll.registers)) {
if numZeroRegisters == 0 {
count = estimate
} else {
count = math.Round(float64(len(hll.registers)) * math.Log(float64(len(hll.registers))/numZeroRegisters))
}
return count
}
if estimate <= 1.0/30.0*math.Exp2(32.0) {
// intermediate range, no correction
count = estimate
} else {
// large range correction
// TODO: re-use 2^32
count = -1 * math.Pow(2.0, 32.0) * math.Log2(1-estimate/math.Pow(2, 32))
}
return count
}
// Merges two HLL instances by computing max(HLL1.register[i], HLL2.register[i]) for i in [0, numRegisters - 1]. Both
// HLLs must have the same number of register bits.
func (hll HLL) Merge(other HLL) error {
// verify that num register bits are equal
if hll.numRegisterBits != other.numRegisterBits {
return errors.New("hll: can't merge HLLs with different number of registers")
}
for index, registerVal := range other.registers {
hll.registers[index] = int(math.Max(float64(registerVal), float64(hll.registers[index])))
}
return nil
}
func getAlphaByNumRegisters(numRegisters int) float64 {
var alpha float64
if numRegisters == 16 {
alpha = 0.673
} else if numRegisters == 32 {
alpha = 0.697
} else if numRegisters == 64 {
alpha = 0.709
} else {
alpha = 0.7213 / (1 + 1.079/float64(numRegisters))
}
return alpha
}