[compression][gorilla] Add notes from paper & time

at15 · at15 · commit d47984130759 · 2020-02-04T20:10:10.000-08:00
- implemented (or copied) the double delta for time
diff --git a/Makefile b/Makefile
@@ -21,7 +21,7 @@ test:
 
 .PHONY: fmt
 fmt:
-	goimports -d -l -w $(PKGST)
+	goimports -d -l -w $(PKGST) ./playground
 
 .PHONY: generate
 generate:
diff --git a/doc/compression/README.md b/doc/compression/README.md
@@ -4,5 +4,5 @@ Compression for both time series data and the meta data in index
 
 ## TODO
 
-- [ ] tsz
+- [ ] gorilla
 - [ ] https://roaringbitmap.org/ I think used by both influxdb and m3 (and maybe more)
diff --git a/doc/compression/gorilla.md b/doc/compression/gorilla.md
@@ -0,0 +1,25 @@
+# Facebook Gorilla
+
+## Paper
+
+http://www.vldb.org/pvldb/vol8/p1816-teller.pdf
+
+4.1 Time series compression
+
+- timestamps and values are compressed separately using information about previous values
+  - but they are put into same byte stream
+
+4.1.1 Compressing time stamps 
+
+- first value is aligned to two hour window
+- second value is stored as a delta from the first value; the delta is 14 bits because 2^14 = 16384 seconds (~4.5h), which is enough to cover the two hour window
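+- later values use the delta of delta, encoded with a variable length prefix (a small "dictionary" of ranges); the ranges of the dictionary were chosen by sampling real time series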
+
+4.1.2 Compressing values
+
+- first XOR w/ previous value
+- variable length encoding
+ 
+## Beringei
+
+https://github.com/facebookarchive/beringei
diff --git a/doc/database/akumuli/protocol.md b/doc/database/akumuli/protocol.md
@@ -54,7 +54,7 @@ this is very similar to InfluxDB's style
 ````json
 {
     "select": "balancers.cpuload",
-		 "range": {
+		"range": {
         "from": "20120102T123000.000000",
         "to":   "20190102T123010.000000"
     }
diff --git a/go.sum b/go.sum
@@ -17,6 +17,7 @@ github.com/inconshreveable/mousetrap v1.0.0/go.mod h1:PxqpIevigyE2G7u3NXJIT2ANyt
 github.com/kballard/go-shellquote v0.0.0-20180428030007-95032a82bc51/go.mod h1:CzGEWj7cYgsdH8dAjBGEr58BoE7ScuLd+fwFZ44+/x8=
 github.com/kisielk/errcheck v1.2.0/go.mod h1:/BMXB+zMLi60iA8Vv6Ksmxu/1UDYcXs4uQLJ+jE2L00=
 github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
+github.com/magiconair/properties v1.8.0 h1:LLgXmsheXeRoUOBOjtwPQCWIYqM/LU1ayDtDePerRcY=
 github.com/magiconair/properties v1.8.0/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ=
 github.com/mitchellh/go-homedir v1.1.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0=
 github.com/mitchellh/mapstructure v1.1.2/go.mod h1:FVVH3fgwuzCH5S8UJGiWEs2h04kUh9fWfEaFds41c1Y=
@@ -37,9 +38,21 @@ github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81P
 github.com/ugorji/go/codec v0.0.0-20181204163529-d75b2dcb6bc8/go.mod h1:VFNgLljTbGfSG7qAOspJ7OScBnGdDN/yBr0sguwnwf0=
 github.com/xordataexchange/crypt v0.0.3-0.20170626215501-b2862e3d0a77/go.mod h1:aYKd//L2LvnjZzWKhF00oedf4jCCReLcmhLdhm1A27Q=
 golang.org/x/crypto v0.0.0-20181203042331-505ab145d0a9/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4=
+golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
+golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
+golang.org/x/mod v0.1.1-0.20191105210325-c90efee705ee/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg=
+golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
+golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
+golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
 golang.org/x/sys v0.0.0-20181205085412-a5c9d58dba9a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
+golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
+golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
+golang.org/x/tools v0.0.0-20181030221726-6c7e314b6563 h1:NIou6eNFigscvKJmsbyez16S2cIS6idossORlFtSt2E=
 golang.org/x/tools v0.0.0-20181030221726-6c7e314b6563/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
+golang.org/x/tools v0.0.0-20200204230316-67a4523381ef h1:mdhEDFpO1Tfj7PXIflIuP1tbXt4rJgHIvbzdh62SARw=
+golang.org/x/tools v0.0.0-20200204230316-67a4523381ef/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28=
+golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
 gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
 gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
 gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
diff --git a/playground/gorilla/gorilla_test.go b/playground/gorilla/gorilla_test.go
@@ -0,0 +1,216 @@
+package gorilla_test
+
+import (
+	"encoding/binary"
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/assert"
+)
+
+// a quick hack implementation for the algorithm described in gorilla paper
+
+// bits is based on https://github.com/dgryski/go-tsz/blob/master/bstream.go
+// it allows you to read/write individual bit(s)
+// NOTE: only the write helpers (writeBit, writeByte, writeBits) exist in this file so far
+type bits struct {
+	buf []byte // underlying bytes
+	i   int    // index of last byte, -1 while buf is empty (see newBits)
+	// remain is the number of bits that are still free in buf[i], 0 means the byte is full.
+	// Bits are filled from the most significant bit down to the least significant bit.
+	// NOTE: the reason we use remain instead of used because it makes appending bits easier
+	// byte = byte | 1 << (remain - 1)
+	remain uint8
+}
+
+// newBits returns an empty bit stream.
+// There is no byte and no free bit yet, i starts at -1 so the first write
+// grows buf and moves i to 0.
+func newBits() *bits {
+	stream := &bits{i: -1}
+	stream.buf = make([]byte, 0)
+	stream.remain = 0
+	return stream
+}
+
+// writeBit appends a single bit, true is 1 and false is 0.
+// A new zeroed byte is appended when the current byte has no free bit left.
+func (b *bits) writeBit(bit bool) {
+	if b.remain == 0 {
+		// current byte is full (or buf is empty), start a new one
+		b.buf = append(b.buf, 0)
+		b.i++
+		b.remain = 8
+	}
+	// claim the highest free bit first, after the decrement remain is its position
+	b.remain--
+	if bit {
+		b.buf[b.i] |= 1 << b.remain
+	}
+}
+
+// writeByte appends 8 bits. When previous writes did not end on a byte boundary
+// the byte is split between the current byte and a newly appended byte.
+func (b *bits) writeByte(byt byte) {
+	if b.remain != 0 {
+		// e.g. b.remain = 6, 2 bits of the current byte are used
+		// the high 6 bits of byt fill the low 6 bits of the current byte
+		// the low 2 bits of byt become the high 2 bits of the new byte
+		used := 8 - b.remain
+		b.buf[b.i] |= byt >> used
+		b.buf = append(b.buf, byt<<b.remain)
+		b.i++
+		// b.remain stays the same, the new byte has as many free bits as the old one had
+		return
+	}
+
+	// fast path, previous writes are aligned to byte boundary
+	b.buf = append(b.buf, byt)
+	b.i++
+}
+
+// writeBits appends the lowest n bits of u, most significant bit first.
+// Whole bytes go through writeByte, the remaining (less than 8) bits through writeBit.
+func (b *bits) writeBits(u uint64, n uint) {
+	// move the n bits we care about to the top of u so they can be peeled off from the left
+	u <<= 64 - n
+	for ; n >= 8; n -= 8 {
+		b.writeByte(byte(u >> 56))
+		u <<= 8
+	}
+
+	for ; n > 0; n-- {
+		b.writeBit(u>>63 == 1)
+		u <<= 1
+	}
+}
+
+// TestBits checks the byte layout produced by the bit writer.
+// NOTE: assert.Equal takes (t, expected, actual), the expected value comes first
+// so the failure message reports the right side as expected.
+func TestBits(t *testing.T) {
+	t.Run("writeBit", func(t *testing.T) {
+		bs := newBits()
+		for i := 0; i < 8; i++ {
+			bs.writeBit(true)
+		}
+		// 8 bits fill exactly one byte
+		assert.Equal(t, uint8(0), bs.remain)
+		assert.Equal(t, byte(0b1111_1111), bs.buf[0])
+		// aligned write, the byte is appended as is
+		bs.writeByte(8)
+		assert.Equal(t, byte(8), bs.buf[1])
+		// unaligned write, 1 bit is used so the byte is split 7 + 1
+		bs.writeBit(true)
+		bs.writeByte(1)
+		assert.Equal(t, byte(0b1000_0000), bs.buf[2])
+		assert.Equal(t, byte(0b1000_0000), bs.buf[3])
+	})
+
+	t.Run("writeBits", func(t *testing.T) {
+		bs := newBits()
+		// 20 in 32 bits is 3 empty bytes followed by 20
+		bs.writeBits(20, 32)
+		assert.Equal(t, byte(0), bs.buf[0])
+		assert.Equal(t, byte(0), bs.buf[1])
+		assert.Equal(t, byte(0), bs.buf[2])
+		assert.Equal(t, byte(20), bs.buf[3])
+		assert.Equal(t, 4, len(bs.buf))
+		assert.Equal(t, uint8(0), bs.remain)
+		assert.Equal(t, 3, bs.i)
+	})
+}
+
+// encoder encodes time stream, i.e. it does not mix value into same stream
+type encoder struct {
+	bs       bits   // output bit stream, newEncoder writes start into it as a 64 bit header
+	start    uint64 // start time of the stream, the first written time is stored as delta to it
+	prevTime uint64 // last time passed to write, 0 means nothing has been written yet
+	delta    uint64 // last delta between two times, used by write to compute the delta of delta
+}
+
+// newEncoder creates an encoder whose stream begins with start written in full 64 bits.
+// prevTime and delta are left as zero, write treats prevTime == 0 as "no value yet".
+func newEncoder(start uint64) *encoder {
+	header := newBits()
+	header.writeBits(start, 64)
+
+	enc := &encoder{start: start}
+	enc.bs = *header
+	return enc
+}
+
+// write appends tm to the stream.
+//
+// The first call stores tm - start in 14 bits. Later calls store the delta of delta (dod)
+// using the variable length ranges from section 4.1.1 of the gorilla paper, all ranges
+// are inclusive on both sides:
+//
+//	dod == 0        '0'
+//	[-63, 64]       '10'   followed by 7 bits
+//	[-255, 256]     '110'  followed by 9 bits
+//	[-2047, 2048]   '1110' followed by 12 bits
+//	otherwise       '1111' followed by 32 bits
+//
+// There is no overflow check, writeBits only keeps the lowest bits of the value.
+func (e *encoder) write(tm uint64) {
+	// first value since start, write using delta
+	if e.prevTime == 0 {
+		delta := tm - e.start
+		e.prevTime = tm
+		e.bs.writeBits(delta, 14)
+		e.delta = delta
+		return
+	}
+
+	// TODO: delta is positive if time comes in order, dod can be negative because
+	// the interval between two points can become smaller than the previous interval
+	// double delta
+	delta := tm - e.prevTime
+	// the unsigned subtraction wraps around, the cast to int64 gives back the signed difference
+	dod := int64(delta - e.delta)
+	e.delta = delta
+	switch {
+	case dod == 0:
+		e.bs.writeBit(false)
+	case dod <= 64 && dod >= -63:
+		e.bs.writeBits(0b10, 2)
+		e.bs.writeBits(uint64(dod), 7)
+	case dod <= 256 && dod >= -255:
+		e.bs.writeBits(0b110, 3)
+		e.bs.writeBits(uint64(dod), 9)
+	case dod <= 2048 && dod >= -2047:
+		e.bs.writeBits(0b1110, 4)
+		e.bs.writeBits(uint64(dod), 12)
+	default:
+		e.bs.writeBits(0b1111, 4)
+		e.bs.writeBits(uint64(dod), 32)
+	}
+	e.prevTime = tm
+}
+
+// TestDoubleDelta encodes the times from Figure 2 of the gorilla paper and checks
+// the produced bits by hand because there is no bit reader yet.
+func TestDoubleDelta(t *testing.T) {
+	// Figure 2 in paper, start is aligned to 2 hour window
+	start := mtime("2015-03-24T02:00:00Z")
+	t1 := mtime("2015-03-24T02:01:02Z")
+	t2 := mtime("2015-03-24T02:02:02Z")
+	t3 := mtime("2015-03-24T02:03:02Z")
+	enc := newEncoder(start)
+	enc.write(t1)
+	enc.write(t2)
+	enc.write(t3)
+	// first 64 bits (8 bytes) is the header, start is written most significant byte first
+	var b8 [8]byte
+	binary.BigEndian.PutUint64(b8[:], start)
+	assert.Equal(t, b8[:], enc.bs.buf[:8])
+	// the next 14 bits is the first time using delta, t1 - start is 62
+	// 62 is 111110, first 8 bits is empty, next 6 bits is the value
+	assert.Equal(t, byte(0), enc.bs.buf[8])
+	assert.Equal(t, byte(62), enc.bs.buf[9]>>2)
+	// the first double delta encoded value, dict is 10, value is -2
+	assert.Equal(t, byte(0b10), enc.bs.buf[9]&0b11)
+	// -2 in 7 bits is 1111110, it takes the high 7 bits of the next byte
+	assert.Equal(t, byte(0b111_1110), enc.bs.buf[10]>>1)
+	// t3 - t2 is the same as t2 - t1, dod is 0 and is written as a single 0 bit
+	assert.Equal(t, byte(0), enc.bs.buf[10]&0b1)
+	assert.Equal(t, 11, len(enc.bs.buf))
+	// TODO: I need a bit reader implementation to decode and compare the values directly
+}
+
+// subu64 subtracts b from a using unsigned arithmetic (which wraps around) and
+// reinterprets the result as a signed integer, e.g. subu64(1, 2) is -1.
+func subu64(a, b uint64) int64 {
+	diff := a - b
+	return int64(diff)
+}
+
+// TestUint64 shows that subtracting unsigned integers and casting the result
+// to a signed integer gives the signed difference.
+func TestUint64(t *testing.T) {
+	// the same thing written with constants does not compile
+	// ./gorilla_test.go:164:23: constant -1 overflows uint64
+	//a := int64(uint64(1) - uint64(2))
+	//t.Log(a)
+	// TODO: does this unsigned subtraction produce signed integer work in other languages?
+	assert.Equal(t, int64(-1), subu64(1, 2))
+
+	// cast is using the same bytes, but they are interpreted differently,
+	// casting to unsigned and back gives the original value
+	a := int64(-1)
+	b := uint64(a)
+	c := int64(b)
+	t.Log(a, b, c) // -1 18446744073709551615 -1
+	assert.Equal(t, a, c)
+}
+
+// mtime returns the unix epoch (in seconds) for a RFC3339 string, it panics if the
+// string can't be parsed.
+//
+// From https://github.com/golang/go/issues/9346
+// The time.RFC3339 format is a case where the format string itself isn't a valid time. You can't have a Z and an offset in the time string, but the format string has both because the spec can contain either type of timezone specification.
+//
+// Both of these are valid RFC3339 times:
+//
+//	"2015-09-15T14:00:12-00:00"
+//	"2015-09-15T14:00:13Z"
+//
+// And the time package needs to be able to parse them both using the same RFC3339 format string.
+func mtime(s string) uint64 {
+	parsed, parseErr := time.Parse(time.RFC3339, s)
+	if parseErr == nil {
+		return uint64(parsed.Unix())
+	}
+	panic(parseErr)
+}

Original file line number	Diff line number	Diff line change
`@@ -54,7 +54,7 @@ this is very similar to InfluxDB's style`
`54`	`54`	````json
`55`	`55`	`{`
`56`	`56`	`"select": "balancers.cpuload",`
`57`		`- "range": {`
	`57`	`+ "range": {`
`58`	`58`	`"from": "20120102T123000.000000",`
`59`	`59`	`"to": "20190102T123010.000000"`
`60`	`60`	`}`