io.go

// Copyright 2023 the Blobloom authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package blobloom

import (
	"bytes"
	"encoding/binary"
	"errors"
	"fmt"
	"io"
	"strings"
	"sync/atomic"
)

// maxCommentLen is the capacity in bytes of the comment field of a dump:
// what remains of the 64-byte header after the 20 bytes taken up by the
// magic string, version, block count and hash count.
const maxCommentLen = 44

// Dump serializes f to w in the binary format understood by a Loader and
// reports how many bytes it wrote to w.
//
// The comment is stored in the header next to the filter data. It may hold
// arbitrary data, within the limits laid out by the format description,
// and can be used, for example, to record which hash function is to be
// used with the Filter.
func Dump(w io.Writer, f *Filter, comment string) (int64, error) {
	blocks, nhashes := f.b, f.k
	return dump(w, blocks, nhashes, comment)
}

// DumpSync is the SyncFilter counterpart of Dump.
//
// Modifications made to f by other goroutines while DumpSync is running
// may not show up in the dump. Callers that need to prevent this must
// provide their own synchronization.
//
// The output uses the same format as Dump's and does not record that it
// was produced from a SyncFilter.
func DumpSync(w io.Writer, f *SyncFilter, comment string) (n int64, err error) {
	n, err = dump(w, f.b, f.k, comment)
	return n, err
}

// dump writes a filter consisting of the blocks b and nhashes hash functions
// to w in the format that a Loader accepts: a 64-byte header followed by one
// 64-byte record per block. It returns the number of bytes written to w.
//
// Nothing is written, and (0, err) is returned, when b is empty, nhashes is
// zero, or comment is longer than maxCommentLen bytes or contains a zero
// byte. If a write to w fails, dump stops and returns the number of bytes
// written so far together with w's error.
func dump(w io.Writer, b []block, nhashes int, comment string) (n int64, err error) {
	switch {
	case len(b) == 0 || nhashes == 0:
		err = errors.New("blobloom: won't dump uninitialized Filter")
	case len(comment) > maxCommentLen:
		err = fmt.Errorf("blobloom: comment of length %d too long", len(comment))
	case strings.IndexByte(comment, 0) != -1:
		// Quote the comment itself. Passing its length to %q would print
		// that number as a character literal instead of the offending text.
		err = fmt.Errorf("blobloom: comment %q contains zero byte", comment)
	}
	if err != nil {
		return 0, err
	}

	// Header layout: magic at offset 0, version at 8 (left as zero), block
	// count minus one at 12, hash count at 16, zero-padded comment at 20.
	var buf [64]byte
	copy(buf[:8], "blobloom")
	// As documented in the comment for Loader, we store one less than the
	// number of blocks. This way, we can use the otherwise invalid value 0
	// and store 2³² blocks instead of at most 2³²-1.
	binary.LittleEndian.PutUint32(buf[12:], uint32(len(b)-1))
	binary.LittleEndian.PutUint32(buf[16:], uint32(nhashes))
	copy(buf[20:], comment)

	k, err := w.Write(buf[:])
	n = int64(k)
	if err != nil {
		return n, err
	}

	// buf is reused for the blocks. Each limb is read with an atomic load,
	// as this function also serves DumpSync, whose filter may be modified by
	// other goroutines while it is being dumped.
	for i := range b {
		for j := range b[i] {
			x := atomic.LoadUint32(&b[i][j])
			binary.LittleEndian.PutUint32(buf[4*j:], x)
		}
		k, err = w.Write(buf[:])
		n += int64(k)
		if err != nil {
			break
		}
	}

	return n, err
}

// A Loader reads a Filter or SyncFilter from an io.Reader.
//
// A Loader accepts the binary format produced by Dump. The format starts
// with a 64-byte header:
//   - the string "blobloom", in ASCII (bytes 0-7);
//   - a four-byte version number, which must be zero (bytes 8-11);
//   - the number of Bloom filter blocks, minus one, as a 32-bit integer
//     (bytes 12-15);
//   - the number of hashes, as a 32-bit integer (bytes 16-19);
//   - a comment of at most 44 non-zero bytes, padded to 44 bytes with zeros
//     (bytes 20-63).
//
// After the header come the 512-bit blocks, divided into sixteen 32-bit limbs.
// All integers are little-endian.
type Loader struct {
	buf [64]byte  // scratch space: holds the header, then each block as it is read
	r   io.Reader // source of the dump
	err error     // NOTE(review): never set or read in the code visible here

	Comment string // Comment field. Filled in by NewLoader.
	nblocks uint64 // block count: the header value plus one, so up to 2³²
	nhashes int    // hash count from the header
}

// NewLoader reads the 64-byte format header from r, validates it, and
// returns a Loader from which a Filter can subsequently be loaded.
//
// It returns a nil Loader and an error if the header cannot be read in
// full, does not start with the "blobloom" magic, has a non-zero version,
// records zero hashes, or carries a malformed comment.
func NewLoader(r io.Reader) (*Loader, error) {
	l := &Loader{r: r}
	if err := l.fillbuf(); err != nil {
		return nil, err
	}

	header := l.buf[:]
	version := binary.LittleEndian.Uint32(header[8:])
	// dump stores the block count minus one, hence the +1 here.
	l.nblocks = uint64(binary.LittleEndian.Uint32(header[12:])) + 1
	l.nhashes = int(binary.LittleEndian.Uint32(header[16:]))

	if string(header[:8]) != "blobloom" {
		return nil, errors.New("blobloom: not a Bloom filter dump")
	}
	if version != 0 {
		return nil, errors.New("blobloom: unsupported dump version")
	}
	if l.nhashes == 0 {
		return nil, errors.New("blobloom: zero hashes in Bloom filter dump")
	}

	comment, err := checkComment(header[20:])
	if err != nil {
		return nil, err
	}
	l.Comment = string(comment)
	return l, nil
}

// Load ORs the Loader's filter into f, making f the union of the two, and
// returns f. When f is nil, Load allocates a new Filter with the block and
// hash counts recorded in the dump and fills that instead.
//
// A non-nil f must have the same number of blocks and hashes as the dump.
// On any error the returned Filter is nil; if the error occurred while
// reading from the Loader, a non-nil f may have been partially updated and
// is then in an inconsistent state.
func (l *Loader) Load(f *Filter) (*Filter, error) {
	switch {
	case f != nil:
		if err := l.checkBitsAndHashes(len(f.b), f.k); err != nil {
			return nil, err
		}
	default:
		nbits := BlockBits * l.nblocks
		if nbits > MaxBits {
			return nil, fmt.Errorf("blobloom: %d blocks is too large", l.nblocks)
		}
		f = New(nbits, l.nhashes)
	}

	// Each fillbuf call reads one 64-byte block of the dump into l.buf.
	for i := range f.b {
		if err := l.fillbuf(); err != nil {
			return nil, err
		}

		limbs := f.b[i][:]
		for j := range limbs {
			limbs[j] |= binary.LittleEndian.Uint32(l.buf[4*j:])
		}
	}

	return f, nil
}

// LoadSync ORs the Loader's filter into f, making f the union of the two,
// and returns f. When f is nil, LoadSync allocates a new SyncFilter with the
// block and hash counts recorded in the dump and fills that instead.
// Otherwise, LoadSync may run concurrently with other modifications to f.
//
// A non-nil f must have the same number of blocks and hashes as the dump.
// On any error the returned SyncFilter is nil; if the error occurred while
// reading from the Loader, a non-nil f may have been partially updated and
// is then in an inconsistent state.
func (l *Loader) LoadSync(f *SyncFilter) (*SyncFilter, error) {
	switch {
	case f != nil:
		if err := l.checkBitsAndHashes(len(f.b), f.k); err != nil {
			return nil, err
		}
	default:
		nbits := BlockBits * l.nblocks
		if nbits > MaxBits {
			return nil, fmt.Errorf("blobloom: %d blocks is too large", l.nblocks)
		}
		f = NewSync(nbits, l.nhashes)
	}

	// Each fillbuf call reads one 64-byte block of the dump into l.buf.
	for i := range f.b {
		if err := l.fillbuf(); err != nil {
			return nil, err
		}

		limbs := f.b[i][:]
		for j := range limbs {
			word := &limbs[j]
			bits := binary.LittleEndian.Uint32(l.buf[4*j:])

			// Atomic OR: retry the compare-and-swap until no other
			// goroutine has changed the word in between.
			cur := atomic.LoadUint32(word)
			for !atomic.CompareAndSwapUint32(word, cur, cur|bits) {
				cur = atomic.LoadUint32(word)
			}
		}
	}

	return f, nil
}

// checkBitsAndHashes verifies that a filter with nblocks blocks and nhashes
// hashes has the same shape as the dump described by l's header. It returns
// nil when both counts match and a descriptive error otherwise; the block
// count is checked before the hash count.
func (l *Loader) checkBitsAndHashes(nblocks, nhashes int) error {
	switch {
	// Compare as uint64: l.nblocks can be as large as 2³², which would be
	// truncated by a conversion to int where int is 32 bits wide.
	case uint64(nblocks) != l.nblocks:
		return fmt.Errorf("blobloom: Filter has %d blocks, but dump has %d", nblocks, l.nblocks)
	case nhashes != l.nhashes:
		return fmt.Errorf("blobloom: Filter has %d hashes, but dump has %d", nhashes, l.nhashes)
	}
	return nil
}

// fillbuf reads exactly len(l.buf) bytes from l.r into l.buf. The input
// ending before the buffer is full is always reported as
// io.ErrUnexpectedEOF, including when no bytes at all could be read.
func (l *Loader) fillbuf() error {
	if _, err := io.ReadFull(l.r, l.buf[:]); err != nil {
		if err == io.EOF {
			return io.ErrUnexpectedEOF
		}
		return err
	}
	return nil
}

// checkComment validates the zero-padded comment field p and strips its
// padding. The comment ends at the first zero byte, and every byte after
// that must be zero as well; otherwise an error is returned together with
// a nil slice. The result is a prefix of p, not a copy.
func checkComment(p []byte) ([]byte, error) {
	end := bytes.IndexByte(p, 0)
	if end < 0 {
		// No padding at all: the comment fills the whole field.
		return p, nil
	}
	for _, c := range p[end+1:] {
		if c != 0 {
			return nil, fmt.Errorf("blobloom: comment block %q contains zero byte", p)
		}
	}
	return p[:end], nil
}