diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a84aece..ebd14eb 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -23,6 +23,8 @@ jobs: run: cargo fmt --all --check - name: Rust Lint - Clippy run: cargo clippy --all-features --all-targets + - name: Rust Lint - Docs + run: cargo doc --no-deps --all-features --document-private-items - name: Rust Test run: cargo test --workspace --all-features diff --git a/Cargo.toml b/Cargo.toml index 487d1bb..45143c3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,6 +19,10 @@ name = "onpair" warnings = "deny" missing_docs = "deny" +[lints.rustdoc] +broken_intra_doc_links = "deny" +private_intra_doc_links = "deny" + [lints.clippy] all = { level = "deny", priority = -1 } if_then_some_else_none = { level = "deny" } diff --git a/README.md b/README.md index f24e1ad..a91e762 100644 --- a/README.md +++ b/README.md @@ -3,10 +3,14 @@ OnPair is a dictionary-based string compression algorithm designed for on-disk and in-memory database workloads that need both strong compression ratios and fast random access to individual values. It builds its dictionary in a single sequential pass by incrementally merging frequent adjacent substrings, achieving compression comparable to BPE while being substantially faster and more memory-efficient. -## Format +## Interchange format -The binary layout of a compressed column — dictionary bytes, dictionary -offsets, and codes — is specified in [docs/binary-format.md](docs/binary-format.md). +OnPair defines a shared in-memory representation — the *plain interchange form* +that independent implementations exchange so a column produced by one is +readable by another. It fixes the buffers (dictionary bytes, dictionary +offsets, codes, and row offsets) and their invariants; denser internal +encodings and on-disk serialization are out of scope. See +[docs/interchange-format.md](docs/interchange-format.md). ## References diff --git a/benches/tpch.rs b/benches/tpch.rs index 22a5660..1b33dbb 100644 --- a/benches/tpch.rs +++ b/benches/tpch.rs @@ -19,7 +19,7 @@ // // Run with: cargo bench --bench tpch // -// Targets the slim public API in PUBLIC_API.md +// Targets the slim public API // (`compress` / `decompress` free fns + `Column::as_parts()`). use std::collections::HashMap; diff --git a/benchmarks/onpair-bench/README.md b/benchmarks/onpair-bench/README.md index 2425fc9..64ce7c5 100644 --- a/benchmarks/onpair-bench/README.md +++ b/benchmarks/onpair-bench/README.md @@ -24,7 +24,7 @@ The bench is a uv workspace member of the repo-root `pyproject.toml`. Sync once from the repo root, then drive it with `uv run`: ```bash -# from /Users/joeisaacs/git/spiraldb/onpair (one-time): +# from the repo root (one-time): uv sync # drop a corpus in: diff --git a/docs/binary-format.md b/docs/interchange-format.md similarity index 99% rename from docs/binary-format.md rename to docs/interchange-format.md index 7fbfd2d..d649e4f 100644 --- a/docs/binary-format.md +++ b/docs/interchange-format.md @@ -347,7 +347,8 @@ A column is conformant if and only if all of the following hold. - Every token length `o_{i+1} - o_i` is in `1 ..= MAX_TOKEN_SIZE`. - All 256 single-byte tokens are present (completeness, §3). - No two tokens are equal (uniqueness, §3). -- `dict_bytes_len >= o_N + MAX_TOKEN_SIZE` (the read-padding bound, §3.1). +- `dict_bytes_len >= o_{N-1} + MAX_TOKEN_SIZE` (the read-padding bound, §3.1; + `o_{N-1}` is the offset of the last token). - `is_sorted` is `0` or `1`; if `1`, tokens are strictly increasing in bytewise-lexicographic order. diff --git a/src/column.rs b/src/column.rs index 72a7274..1030c7c 100644 --- a/src/column.rs +++ b/src/column.rs @@ -28,13 +28,14 @@ pub struct Column { pub codes: Vec, /// `R + 1` offsets into `codes` delimiting the `R` input rows: row `r`'s /// codes are `codes[code_offsets[r]..code_offsets[r + 1]]`. The compressor - /// emits these because a token may span a row boundary, so the row - /// structure cannot be recovered from the codes alone. + /// emits these because the codes are a flat concatenation with no in-band + /// row delimiter, so the row structure cannot be recovered from the codes + /// alone. pub code_offsets: Vec, } /// Borrowed view of the data the decoder needs, consumed by -/// [`crate::decompress`] and [`crate::decompress_into`]. +/// [`fn@crate::decompress`] and [`crate::decompress_into`]. /// Downstream consumers deserializing from storage build this via struct /// literal — there is no constructor. #[derive(Copy, Clone, Debug)] @@ -56,7 +57,7 @@ pub struct Parts<'a> { impl Column { /// Zero-copy view over this column's decode arrays. Pass directly to - /// [`crate::decompress`] or [`crate::decompress_into`]. `code_offsets` is + /// [`fn@crate::decompress`] or [`crate::decompress_into`]. `code_offsets` is /// compressor metadata and is not part of the view. #[inline] pub fn as_parts(&self) -> Parts<'_> { diff --git a/src/decompress/fat.rs b/src/decompress/fat.rs index 1a7dd50..eb8ab76 100644 --- a/src/decompress/fat.rs +++ b/src/decompress/fat.rs @@ -4,11 +4,9 @@ //! "Fat" token table layout. //! //! Each token is materialized into a 16-byte-strided row, so a decode load -//! addresses `data + code * 16` straight from the code — replacing the -//! `code → entry → dict[offset]` dependent-load chain of the -//! [`super::DecodeEntry`] layout with a single independent load. Costs -//! `dict_tokens * 16` bytes of table; whether that pays is a cache-residency -//! question the [`super::plan`] index decides per host. +//! addresses `data + code * 16` straight from the code — a single independent +//! load, with no `code → entry → dict[offset]` indirection. Costs +//! `dict_tokens * 16` bytes of table, rebuilt once per decode call. //! //! Loop structure: a 16-byte over-copy fast region ([`super::scalar::copy16`]) //! plus an exact, length-aware tail. diff --git a/src/decompress/mod.rs b/src/decompress/mod.rs index 272de0d..a052e69 100644 --- a/src/decompress/mod.rs +++ b/src/decompress/mod.rs @@ -583,8 +583,8 @@ mod tests { } /// Exercise the full decode width sweep against a corpus large enough to - /// drive the batched AVX-512 prefix, the scalar 16-byte remainder, and the - /// exact tail in a single call. + /// drive the 16-byte over-copy fast region and the exact, length-aware + /// tail in a single call. #[test] fn decompress_matches_input_across_widths() { let mut bytes = Vec::new(); diff --git a/src/lpm.rs b/src/lpm.rs index 57a98bc..ecb4f01 100644 --- a/src/lpm.rs +++ b/src/lpm.rs @@ -266,7 +266,7 @@ impl LongestPrefixMatcher { /// prefix's length. /// /// Precondition: `!data.is_empty()` and the matcher contains every - /// single-byte token (always true after [`new`] or [`from_dictionary`] + /// single-byte token (always true after [`Self::new`] or [`Self::from_dictionary`] /// with a complete dictionary). #[inline] pub fn find_longest_match(&self, data: &[u8]) -> (Token, usize) { diff --git a/src/parser.rs b/src/parser.rs index ddb40d8..190f555 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -31,7 +31,7 @@ impl Parser { /// Train a dictionary against `bytes` / `offsets` and build the matching /// LPM. `offsets` has length `n + 1`. Returns [`Error::InvalidArg`] if /// `offsets` is empty or its last (maximum) offset cannot be represented in - /// `usize` or exceeds `bytes.len()` — see [`validate_offsets`]. The `cfg` + /// `usize` or exceeds `bytes.len()`. The `cfg` /// is valid by construction ([`Bits`](crate::Bits) / /// [`Threshold`](crate::Threshold)). pub fn train(bytes: &[u8], offsets: &[O], cfg: Config) -> Result { @@ -52,8 +52,8 @@ impl Parser { /// Encode `bytes` / `offsets` using this parser. The dictionary is cloned /// into the returned [`Column`] so the column is fully decode-self- /// contained — the strings need not be the corpus the parser was trained - /// on. Returns [`Error::InvalidArg`] on invalid offsets — see - /// [`validate_offsets`]. + /// on. Returns [`Error::InvalidArg`] if `offsets` is empty or its last + /// offset cannot be represented in `usize` or exceeds `bytes.len()`. pub fn parse(&self, bytes: &[u8], offsets: &[O]) -> Result, Error> { validate_offsets(bytes, offsets)?; Ok(self.parse_unchecked(bytes, offsets)) @@ -81,9 +81,9 @@ impl Parser { /// Encode every string into a flat `Vec` of codes plus per-row /// `code_offsets`. Offset `[i]..[i + 1]` indexes the codes for row `i`. The -/// offsets are compressor metadata — a token may span a row boundary, so the -/// row structure cannot be recovered from the codes alone — and are not needed -/// to decode the column as one flat stream. +/// offsets are compressor metadata — the codes are a flat concatenation with no +/// in-band row delimiter, so the row structure cannot be recovered from the +/// codes alone — and are not needed to decode the column as one flat stream. pub(crate) fn encode_strings( bytes: &[u8], offsets: &[O],