Skip to content

Commit d479841

Browse files
committed
[compression][gorilla] Add notes from paper & time
- implemented (or copied) the double delta for time
1 parent a5e454f commit d479841

File tree

6 files changed

+257
-3
lines changed

6 files changed

+257
-3
lines changed

Makefile

+1-1
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ test:
2121

2222
.PHONY: fmt
2323
fmt:
24-
goimports -d -l -w $(PKGST)
24+
goimports -d -l -w $(PKGST) ./playground
2525

2626
.PHONY: generate
2727
generate:

doc/compression/README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -4,5 +4,5 @@ Compression for both time series data and the meta data in index
44

55
## TODO
66

7-
- [ ] tsz
7+
- [ ] gorilla
88
- [ ] https://roaringbitmap.org/ I think used by both influxdb and m3 (and maybe more)

doc/compression/gorilla.md

+25
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
# Facebook Gorilla
2+
3+
## Paper
4+
5+
http://www.vldb.org/pvldb/vol8/p1816-teller.pdf
6+
7+
4.1 Time series compression
8+
9+
- timestamps and values are compressed separately using information about previous values
10+
- but they are put into same byte stream
11+
12+
4.1.1 Compressing time stamps
13+
14+
- first value is aligned to two hour window
15+
- second value is stored as the delta from the first value in 14 bits, because 14 bits cover 16384 seconds (about 4.5 hours)
16+
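- later values store the delta of delta using a dictionary of variable-length prefixes; the value range of each dictionary entry was chosen by sampling real time series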
17+
18+
4.1.2 Compressing values
19+
20+
- first XOR w/ previous value
21+
- variable length encoding
22+
23+
## Beringei
24+
25+
https://github.com/facebookarchive/beringei

doc/database/akumuli/protocol.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ this is very similar to InfluxDB's style
5454
````json
5555
{
5656
"select": "balancers.cpuload",
57-
"range": {
57+
"range": {
5858
"from": "20120102T123000.000000",
5959
"to": "20190102T123010.000000"
6060
}

go.sum

+13
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ github.com/inconshreveable/mousetrap v1.0.0/go.mod h1:PxqpIevigyE2G7u3NXJIT2ANyt
1717
github.com/kballard/go-shellquote v0.0.0-20180428030007-95032a82bc51/go.mod h1:CzGEWj7cYgsdH8dAjBGEr58BoE7ScuLd+fwFZ44+/x8=
1818
github.com/kisielk/errcheck v1.2.0/go.mod h1:/BMXB+zMLi60iA8Vv6Ksmxu/1UDYcXs4uQLJ+jE2L00=
1919
github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
20+
github.com/magiconair/properties v1.8.0 h1:LLgXmsheXeRoUOBOjtwPQCWIYqM/LU1ayDtDePerRcY=
2021
github.com/magiconair/properties v1.8.0/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ=
2122
github.com/mitchellh/go-homedir v1.1.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0=
2223
github.com/mitchellh/mapstructure v1.1.2/go.mod h1:FVVH3fgwuzCH5S8UJGiWEs2h04kUh9fWfEaFds41c1Y=
@@ -37,9 +38,21 @@ github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81P
3738
github.com/ugorji/go/codec v0.0.0-20181204163529-d75b2dcb6bc8/go.mod h1:VFNgLljTbGfSG7qAOspJ7OScBnGdDN/yBr0sguwnwf0=
3839
github.com/xordataexchange/crypt v0.0.3-0.20170626215501-b2862e3d0a77/go.mod h1:aYKd//L2LvnjZzWKhF00oedf4jCCReLcmhLdhm1A27Q=
3940
golang.org/x/crypto v0.0.0-20181203042331-505ab145d0a9/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4=
41+
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
42+
golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
43+
golang.org/x/mod v0.1.1-0.20191105210325-c90efee705ee/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg=
44+
golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
45+
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
46+
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
4047
golang.org/x/sys v0.0.0-20181205085412-a5c9d58dba9a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
48+
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
49+
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
4150
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
51+
golang.org/x/tools v0.0.0-20181030221726-6c7e314b6563 h1:NIou6eNFigscvKJmsbyez16S2cIS6idossORlFtSt2E=
4252
golang.org/x/tools v0.0.0-20181030221726-6c7e314b6563/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
53+
golang.org/x/tools v0.0.0-20200204230316-67a4523381ef h1:mdhEDFpO1Tfj7PXIflIuP1tbXt4rJgHIvbzdh62SARw=
54+
golang.org/x/tools v0.0.0-20200204230316-67a4523381ef/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28=
55+
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
4356
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
4457
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
4558
gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=

playground/gorilla/gorilla_test.go

+216
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,216 @@
1+
package gorilla_test
2+
3+
import (
4+
"encoding/binary"
5+
"testing"
6+
"time"
7+
8+
"github.com/stretchr/testify/assert"
9+
)
10+
11+
// a quick hack implementation for the algorithm described in gorilla paper
12+
13+
// bits is an append-only bit stream that fills every byte starting from its
// most significant bit. It is based on
// https://github.com/dgryski/go-tsz/blob/master/bstream.go
// and allows writing individual bit(s).
//
// The zero value is not usable because i must start at -1, use newBits.
type bits struct {
	buf []byte // underlying bytes
	i   int    // index of the last byte in buf, -1 while buf is empty
	// remain is the number of bits that are still unused in buf[i].
	// NOTE: the reason we use remain instead of used is that it makes appending bits easier
	// byte = byte | 1 << (remain - 1)
	remain uint8
}

// newBits returns an empty bit stream.
func newBits() *bits {
	// 0 byte and 0 bits, i is -1 so the first grow moves it to index 0
	return &bits{i: -1}
}

// writeBit appends a single bit and grows buf by one byte when the last byte
// has no unused bits left.
func (b *bits) writeBit(bit bool) {
	if b.remain == 0 {
		b.buf = append(b.buf, 0)
		b.i++
		b.remain = 8
	}
	if bit {
		b.buf[b.i] |= 1 << (b.remain - 1)
	}
	b.remain--
}

// writeByte appends all 8 bits of byt. When earlier writes did not end on a
// byte boundary, byt is split between the last byte and a newly appended one.
func (b *bits) writeByte(byt byte) {
	// fast path, previous writes are aligned to byte boundary
	if b.remain == 0 {
		b.buf = append(b.buf, byt)
		b.i++
		return
	}

	// e.g. b.remain = 6, the high 6 bits of byt fill the last byte and the
	// low 2 bits become the top of the new byte
	// last byte [u, u, 0, 1, 2, 3, 4, 5]   new byte [6, 7, -, -, -, -, -, -]
	b.buf[b.i] |= byt >> (8 - b.remain)
	b.buf = append(b.buf, byt<<b.remain)
	b.i++
	// no need to update b.remain, the new byte has as many unused bits as the old one had
}

// writeBits appends the low n bits of u, most significant bit first.
// n == 0 writes nothing. When n > 64, u is treated as an n bit number, i.e.
// n-64 zero bits are written in front of the 64 bits of u.
func (b *bits) writeBits(u uint64, n uint) {
	// n is unsigned, without this padding 64 - n would wrap around for n > 64
	// and the shift below would silently throw away u.
	for ; n > 64; n-- {
		b.writeBit(false)
	}

	// move the n bits we want to the top of u so we can peel them off from the left
	u <<= 64 - n
	for ; n >= 8; n -= 8 {
		b.writeByte(byte(u >> 56))
		u <<= 8
	}
	for ; n > 0; n-- {
		b.writeBit(u>>63 == 1)
		u <<= 1
	}
}
73+
74+
func TestBits(t *testing.T) {
75+
t.Run("writeBit", func(t *testing.T) {
76+
bs := newBits()
77+
for i := 0; i < 8; i++ {
78+
bs.writeBit(true)
79+
}
80+
assert.Equal(t, bs.remain, uint8(0))
81+
assert.Equal(t, bs.buf[0], byte(0b1111_1111))
82+
bs.writeByte(8)
83+
assert.Equal(t, bs.buf[1], byte(8))
84+
bs.writeBit(true)
85+
bs.writeByte(1)
86+
assert.Equal(t, bs.buf[2], byte(0b1000_0000))
87+
assert.Equal(t, bs.buf[3], byte(0b1000_0000))
88+
})
89+
90+
t.Run("writeBits", func(t *testing.T) {
91+
bs := newBits()
92+
bs.writeBits(20, 32)
93+
assert.Equal(t, bs.buf[0], byte(0))
94+
assert.Equal(t, bs.buf[1], byte(0))
95+
assert.Equal(t, bs.buf[2], byte(0))
96+
assert.Equal(t, bs.buf[3], byte(20))
97+
assert.Equal(t, len(bs.buf), 4)
98+
assert.Equal(t, bs.remain, uint8(0))
99+
assert.Equal(t, bs.i, 3)
100+
})
101+
102+
}
103+
104+
// encoder encodes time stream, i.e. it does not mix value into same stream
105+
type encoder struct {
106+
bs bits
107+
start uint64
108+
prevTime uint64
109+
delta uint64
110+
}
111+
112+
func newEncoder(start uint64) *encoder {
113+
bs := newBits()
114+
bs.writeBits(start, 64)
115+
return &encoder{
116+
bs: *bs,
117+
start: start,
118+
prevTime: 0,
119+
}
120+
}
121+
122+
func (e *encoder) write(tm uint64) {
123+
// first value since start, write using delta
124+
if e.prevTime == 0 {
125+
delta := tm - e.start
126+
e.prevTime = tm
127+
e.bs.writeBits(delta, 14)
128+
e.delta = delta
129+
return
130+
}
131+
132+
// TODO: delta is positive if time comes in order, dod can be negative because interval
133+
// double delta
134+
delta := tm - e.prevTime
135+
dod := int64(delta - e.delta)
136+
e.delta = delta
137+
switch {
138+
case dod == 0:
139+
e.bs.writeBit(false)
140+
case dod <= 64 && dod >= -63:
141+
e.bs.writeBits(0b10, 2)
142+
e.bs.writeBits(uint64(dod), 7)
143+
case dod <= 256 && dod > -255:
144+
e.bs.writeBits(0b110, 3)
145+
e.bs.writeBits(uint64(dod), 9)
146+
case dod <= 2048 && dod > -2047:
147+
e.bs.writeBits(0b1110, 4)
148+
e.bs.writeBits(uint64(dod), 12)
149+
default:
150+
e.bs.writeBits(0b1111, 4)
151+
e.bs.writeBits(uint64(dod), 32)
152+
}
153+
e.prevTime = tm
154+
}
155+
156+
func TestDoubleDelta(t *testing.T) {
157+
// Figure 2 in paper, start is aligned to 2 hour window
158+
start := mtime("2015-03-24T02:00:00Z")
159+
t1 := mtime("2015-03-24T02:01:02Z")
160+
t2 := mtime("2015-03-24T02:02:02Z")
161+
t3 := mtime("2015-03-24T02:03:02Z")
162+
enc := newEncoder(start)
163+
enc.write(t1)
164+
enc.write(t2)
165+
enc.write(t3)
166+
// first 64 bytes is the header
167+
var b8 [8]byte
168+
binary.BigEndian.PutUint64(b8[:], start)
169+
assert.Equal(t, enc.bs.buf[0], b8[0])
170+
assert.Equal(t, enc.bs.buf[7], b8[7])
171+
// the next 14 bits is the first time using delta
172+
// 62 is 111110, first 8 bits is empty, next 6 bits is the value
173+
assert.Equal(t, byte(0), enc.bs.buf[8])
174+
assert.Equal(t, byte(62), enc.bs.buf[9]>>2)
175+
// the first double delta encoded value, dict is 10, value is -2
176+
assert.Equal(t, byte(0b10), enc.bs.buf[9]&0b11)
177+
// TODO: value is 7 bit ... e, I need a bit reader implementation
178+
//assert.Equal(t, byte(-2), enc.bs.buf[10] )
179+
//assert.Equal(t, byte(t2-t1), enc.bs.buf[9]>>2)
180+
}
181+
182+
// subu64 subtracts b from a as unsigned integers and returns the result
// converted to int64, so a difference that wrapped around below zero comes
// back as a negative number.
func subu64(a, b uint64) int64 {
	diff := a - b
	return int64(diff)
}
185+
186+
func TestUint64(t *testing.T) {
187+
// ./gorilla_test.go:164:23: constant -1 overflows uint64
188+
//a := int64(uint64(1) - uint64(2))
189+
//t.Log(a)
190+
// TODO: does this unsigned subtraction produce signed integer work in other languages?
191+
assert.Equal(t, subu64(1, 2), int64(-1))
192+
193+
// cast is using the same bytes, but
194+
a := int64(-1)
195+
b := uint64(a)
196+
c := int64(a)
197+
t.Log(a, b, c) // -1 18446744073709551615 -1
198+
}
199+
200+
// mtime parses s as a RFC3339 time and returns it as unix epoch in seconds.
// It panics when s can not be parsed, so it should only be used with literals in tests.
//
// Both the "Z" form and the numeric offset form are accepted, e.g.
//
//	"2015-09-15T14:00:12-00:00"
//	"2015-09-15T14:00:13Z"
//
// See https://github.com/golang/go/issues/9346 for why time.RFC3339 itself is
// not a valid time string: the layout has both Z and an offset because the
// spec allows either one, while a real time string can only have one of them.
func mtime(s string) uint64 {
	parsed, err := time.Parse(time.RFC3339, s)
	if err == nil {
		return uint64(parsed.Unix())
	}
	panic(err)
}

0 commit comments

Comments
 (0)