forked from eclesh/hyperloglog
-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathhyperloglog.go
147 lines (134 loc) · 3.91 KB
/
hyperloglog.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
// Package hyperloglog implements the HyperLogLog algorithm for
// cardinality estimation. In English: it counts things. It counts
// things using very small amounts of memory compared to the number of
// objects it is counting.
//
// For a full description of the algorithm, see the paper HyperLogLog:
// the analysis of a near-optimal cardinality estimation algorithm by
// Flajolet, et. al.
package hyperloglog
import (
"fmt"
"math"
"math/bits"
)
const (
exp32 = 1 << 32 // 2^32
)
// A HyperLogLog is a deterministic cardinality estimator. This version
// exports its fields so that it is suitable for saving eg. to a database.
type HyperLogLog struct {
M uint // Number of registers
B uint32 // Number of bits used to determine register index
Alpha float64 // Bias correction constant
Registers []uint8
}
// Compute bias correction alpha_m.
func getAlpha(m uint) (result float64) {
switch m {
case 16:
result = 0.673
case 32:
result = 0.697
case 64:
result = 0.709
default:
result = 0.7213 / (1.0 + 1.079/float64(m))
}
return result
}
// New creates a HyperLogLog with the given number of registers. More
// registers leads to lower error in your estimated count, at the
// expense of memory.
//
// Choose a power of two number of registers, depending on the amount
// of memory you're willing to use and the error you're willing to
// tolerate. Each register uses one byte of memory.
//
// Standard error will be: σ ≈ 1.04 / sqrt(registers)
// The estimates provided by hyperloglog are expected to be within σ, 2σ, 3σ
// of the exact count in respectively 65%, 95%, 99% of all the cases.
func New(registers uint) (*HyperLogLog, error) {
if registers == 0 {
panic("cannot have zero registers")
}
if (registers & (registers - 1)) != 0 {
return nil, fmt.Errorf("number of registers %d not a power of two", registers)
}
h := &HyperLogLog{}
h.M = registers
h.B = uint32(math.Log2(float64(registers)))
h.Alpha = getAlpha(registers)
h.Registers = make([]uint8, h.M)
return h, nil
}
// Reset all internal variables and set the count to zero.
func (h *HyperLogLog) Reset() {
for i := range h.Registers {
h.Registers[i] = 0
}
}
// Add to the count. val should be a 32 bit unsigned integer from a
// good hash function.
func (h *HyperLogLog) Add(val uint32) {
k := 32 - h.B
slice := (val << h.B) | (1 << (h.B - 1))
r := uint8(bits.LeadingZeros32(slice) + 1)
j := val >> uint(k)
if r > h.Registers[j] {
h.Registers[j] = r
}
}
// Count returns the estimated cardinality.
func (h *HyperLogLog) Count() uint64 {
return h.count(true)
}
// CountWithoutLargeRangeCorrection returns the estimated cardinality, without applying
// the large range correction proposed by Flajolet et al. as it can lead to significant
// overcounting.
//
// See https://github.com/DataDog/hyperloglog/pull/15
func (h *HyperLogLog) CountWithoutLargeRangeCorrection() uint64 {
return h.count(false)
}
func (h *HyperLogLog) count(withLargeRangeCorrection bool) uint64 {
sum := 0.0
m := float64(h.M)
for _, val := range h.Registers {
sum += 1.0 / float64(int(1)<<val)
}
estimate := h.Alpha * m * m / sum
if estimate <= 5.0/2.0*m {
// Small range correction
v := 0
for _, r := range h.Registers {
if r == 0 {
v++
}
}
if v > 0 {
estimate = m * math.Log(m/float64(v))
}
} else if estimate > 1.0/30.0*exp32 && withLargeRangeCorrection {
// Large range correction
estimate = -exp32 * math.Log(1-estimate/exp32)
}
return uint64(estimate)
}
// Merge another HyperLogLog into this one. The number of registers in
// each must be the same.
func (h *HyperLogLog) Merge(other *HyperLogLog) error {
if h.M != other.M {
return fmt.Errorf("number of registers doesn't match: %d != %d",
h.M, other.M)
}
// Trigger boundary check once for h.Registers
registers := h.Registers
_ = registers[len(other.Registers)-1]
for j, r := range other.Registers {
if r > registers[j] {
registers[j] = r
}
}
return nil
}