Skip to content

Commit 538d089

Browse files
authored
IPC Prototype (#181)
Very early prototype: serde is not yet correctly implemented for arrays, but the overall structure is almost there.
1 parent 4b59675 commit 538d089

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

110 files changed

+2992
-946
lines changed

Cargo.lock

+327-120
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

+3
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ members = [
1111
"vortex-dict",
1212
"vortex-error",
1313
"vortex-fastlanes",
14+
"vortex-flatbuffers",
15+
"vortex-ipc",
1416
"vortex-ree",
1517
"vortex-roaring",
1618
"vortex-schema",
@@ -47,6 +49,7 @@ criterion = { version = "0.5.1", features = ["html_reports"] }
4749
croaring = "1.0.1"
4850
divan = "0.1.14"
4951
flatbuffers = "23.5.26"
52+
flexbuffers = "2.0.0"
5053
flatc = "0.2.2"
5154
half = { version = "^2", features = ["std", "num-traits"] }
5255
hashbrown = "0.14.3"

README.md

+31-17
Original file line numberDiff line numberDiff line change
@@ -159,8 +159,15 @@ without prior discussion infeasible. If you are interested in contributing, plea
159159

160160
This repo uses submodules for non-Rust dependencies (e.g., for the zig fastlanez repo). Before building make sure to run
161161

162-
* `git submodule update --init --recursive`
163-
* `./zigup` (this will install the zig version required by fastlanez)
162+
```bash
163+
git submodule update --init --recursive
164+
165+
# Install the zig version required by fastlanez
166+
./zigup
167+
168+
# Install Rye from https://rye-up.com, and setup the virtualenv
169+
rye sync
170+
```
164171

165172
## License
166173

@@ -172,24 +179,31 @@ This project is inspired by and--in some cases--directly based upon the existing
172179
and OSS developers.
173180

174181
In particular, the following academic papers greatly influenced the development:
175-
* Maximilian Kuschewski, David Sauerwein, Adnan Alhomssi, and Viktor Leis. 2023. [BtrBlocks: Efficient Columnar Compression
176-
for Data Lakes](https://www.cs.cit.tum.de/fileadmin/w00cfj/dis/papers/btrblocks.pdf). Proc. ACM Manag. Data 1, 2,
177-
Article 118 (June 2023), 14 pages. https://doi.org/10.1145/3589263
178-
* Azim Afroozeh and Peter Boncz. [The FastLanes Compression Layout: Decoding >100 Billion Integers per Second with Scalar
179-
Code](https://www.vldb.org/pvldb/vol16/p2132-afroozeh.pdf). PVLDB, 16(9): 2132 - 2144, 2023.
180-
* Peter Boncz, Thomas Neumann, and Viktor Leis. [FSST: Fast Random Access String
181-
Compression](https://www.vldb.org/pvldb/vol13/p2649-boncz.pdf).
182-
PVLDB, 13(11): 2649-2661, 2020.
183-
* Azim Afroozeh, Leonardo X. Kuffo, and Peter Boncz. 2023. [ALP: Adaptive Lossless floating-Point
184-
Compression](https://ir.cwi.nl/pub/33334/33334.pdf). Proc. ACM
185-
Manag. Data 1, 4 (SIGMOD), Article 230 (December 2023), 26 pages. https://doi.org/10.1145/3626717
182+
183+
* Maximilian Kuschewski, David Sauerwein, Adnan Alhomssi, and Viktor Leis.
184+
2023. [BtrBlocks: Efficient Columnar Compression
185+
for Data Lakes](https://www.cs.cit.tum.de/fileadmin/w00cfj/dis/papers/btrblocks.pdf). Proc. ACM Manag. Data 1,
186+
2,
187+
Article 118 (June 2023), 14 pages. https://doi.org/10.1145/3589263
188+
* Azim Afroozeh and Peter
189+
Boncz. [The FastLanes Compression Layout: Decoding >100 Billion Integers per Second with Scalar
190+
Code](https://www.vldb.org/pvldb/vol16/p2132-afroozeh.pdf). PVLDB, 16(9): 2132 - 2144, 2023.
191+
* Peter Boncz, Thomas Neumann, and Viktor Leis. [FSST: Fast Random Access String
192+
Compression](https://www.vldb.org/pvldb/vol13/p2649-boncz.pdf).
193+
PVLDB, 13(11): 2649-2661, 2020.
194+
* Azim Afroozeh, Leonardo X. Kuffo, and Peter Boncz. 2023. [ALP: Adaptive Lossless floating-Point
195+
Compression](https://ir.cwi.nl/pub/33334/33334.pdf). Proc. ACM
196+
Manag. Data 1, 4 (SIGMOD), Article 230 (December 2023), 26 pages. https://doi.org/10.1145/3626717
186197

187198
Additionally, we benefited greatly from:
188-
* the collected OSS work of [Daniel Lemire](https://github.com/lemire), such as [FastPFor](https://github.com/lemire/FastPFor),
189-
and [StreamVByte](https://github.com/lemire/streamvbyte).
190-
* the [parquet2](https://github.com/jorgecarleitao/parquet2) project by [Jorge Leitao](https://github.com/jorgecarleitao).
199+
200+
* the collected OSS work of [Daniel Lemire](https://github.com/lemire), such
201+
as [FastPFor](https://github.com/lemire/FastPFor),
202+
and [StreamVByte](https://github.com/lemire/streamvbyte).
203+
* the [parquet2](https://github.com/jorgecarleitao/parquet2) project
204+
by [Jorge Leitao](https://github.com/jorgecarleitao).
191205
* the public discussions around choices of compression codecs, as well as the C++ implementations thereof,
192-
from [duckdb](https://github.com/duckdb/duckdb).
206+
from [duckdb](https://github.com/duckdb/duckdb).
193207
* the existence, ideas, & implementation of the [Apache Arrow](https://arrow.apache.org) project.
194208
* the [Velox](https://github.com/facebookincubator/velox) project and discussions with its maintainers.
195209

bench-vortex/Cargo.toml

+1
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ vortex-datetime = { path = "../vortex-datetime" }
2323
vortex-dict = { path = "../vortex-dict" }
2424
vortex-error = { path = "../vortex-error", features = ["parquet"] }
2525
vortex-fastlanes = { path = "../vortex-fastlanes" }
26+
vortex-ipc = { path = "../vortex-ipc" }
2627
vortex-ree = { path = "../vortex-ree" }
2728
vortex-roaring = { path = "../vortex-roaring" }
2829
vortex-schema = { path = "../vortex-schema" }

bench-vortex/src/bin/ipc.rs

+46
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
use log::LevelFilter;
2+
use std::fs::File;
3+
4+
use bench_vortex::reader::open_vortex;
5+
use bench_vortex::setup_logger;
6+
use bench_vortex::taxi_data::taxi_data_vortex;
7+
use vortex::array::primitive::PrimitiveArray;
8+
use vortex::array::Array;
9+
use vortex::compute::take::take;
10+
use vortex::serde::context::SerdeContext;
11+
use vortex_error::VortexResult;
12+
use vortex_ipc::iter::FallibleLendingIterator;
13+
use vortex_ipc::reader::StreamReader;
14+
use vortex_ipc::writer::StreamWriter;
15+
16+
/// Round-trips the taxi dataset through the prototype IPC stream format.
///
/// Steps, in order:
/// 1. Configure logging at `Error` level.
/// 2. Open the taxi Vortex array via `open_vortex(&taxi_data_vortex())` and print it.
/// 3. Create `bench-vortex/data/ipc.vortex` and write the array to it with a
///    `StreamWriter` built from a default `SerdeContext`.
/// 4. Re-open the same file, wrap it in a `StreamReader`, take the first array
///    reader, print its dtype, and for every chunk print the chunk and the
///    result of `take` with indices `[0, 1, 0, 1]`.
///
/// # Errors
/// Errors from opening the array, creating/opening the file, constructing the
/// writer/reader, writing the array, and fetching the first array reader are
/// propagated with `?`.
///
/// # Panics
/// - if the stream yields no array (`ipc_reader.next()?.unwrap()`),
/// - if reading a chunk fails (`array_reader.next().unwrap()`),
/// - if `take` fails (`.unwrap()`).
///
/// NOTE(review): the output path is spelled out twice (once for `ipc`, once in
/// `File::create`); the two literals must be kept in sync by hand.
/// NOTE(review): `writer` and `write` are still in scope when the file is
/// re-opened for reading; whether everything has reached the file by then
/// depends on `StreamWriter` -- TODO confirm.
pub fn main() -> VortexResult<()> {
17+
setup_logger(LevelFilter::Error);
18+
19+
let array = open_vortex(&taxi_data_vortex())?;
20+
println!("Array {}", &array);
21+
22+
//let ipc = idempotent("ipc.vortex", |path| {
23+
let ipc = "bench-vortex/data/ipc.vortex";
24+
let mut write = File::create("bench-vortex/data/ipc.vortex")?;
25+
let ctx = SerdeContext::default();
26+
let mut writer = StreamWriter::try_new(&mut write, ctx)?;
27+
writer.write(&array)?;
28+
//})?;
29+
30+
// Now try to read from the IPC stream.
31+
let mut read = File::open(ipc)?;
32+
let mut ipc_reader = StreamReader::try_new(&mut read)?;
33+
34+
// We know we only wrote a single array.
35+
// TODO(ngates): create an option to skip the multi-array reader?
36+
let mut array_reader = ipc_reader.next()?.unwrap();
37+
println!("DType: {:?}", array_reader.dtype());
38+
// Read some number of chunks from the stream.
39+
while let Some(chunk) = array_reader.next().unwrap() {
40+
println!("VIEW: {}", (&chunk as &dyn Array));
41+
let taken = take(&chunk, &PrimitiveArray::from(vec![0, 1, 0, 1])).unwrap();
42+
println!("Taken: {}", &taken);
43+
}
44+
45+
Ok(())
46+
}

bench-vortex/src/lib.rs

+2-1
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,10 @@ use crate::taxi_data::taxi_data_parquet;
1313
use vortex::array::chunked::ChunkedArray;
1414
use vortex::array::downcast::DowncastArrayBuiltin;
1515
use vortex::array::IntoArray;
16-
use vortex::array::{Array, ArrayRef, EncodingRef, ENCODINGS};
16+
use vortex::array::{Array, ArrayRef};
1717
use vortex::arrow::FromArrowType;
1818
use vortex::compress::{CompressConfig, CompressCtx};
19+
use vortex::encoding::{EncodingRef, ENCODINGS};
1920
use vortex::formatter::display_tree;
2021
use vortex_alp::ALPEncoding;
2122
use vortex_datetime::DateTimeEncoding;

flatbuffers.build.rs

+55
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
use flatc::flatc;
2+
use std::env;
3+
use std::ffi::OsStr;
4+
use std::path::{Path, PathBuf};
5+
use std::process::Command;
6+
7+
use walkdir::WalkDir;
8+
9+
/// Build script entry point: runs `flatc` over every `.fbs` file found under
/// `$CARGO_MANIFEST_DIR/flatbuffers` and writes the generated Rust into
/// `$OUT_DIR/flatbuffers`.
///
/// Each discovered `.fbs` file is also reported to Cargo through
/// `rerun_if_changed`, so the script re-runs when a schema changes.
///
/// `flatc` is invoked with:
/// - `--rust`
/// - `--filename-suffix ""`
/// - `-I <flatbuffers_dir>/../../`
/// - `--include-prefix flatbuffers::deps`
/// - `-o <out_dir>/flatbuffers`
/// - the collected `.fbs` paths
///
/// # Panics
/// - if `CARGO_MANIFEST_DIR` or `OUT_DIR` is unset or cannot be canonicalized,
/// - if the `flatc` process cannot be spawned (`status().unwrap()`),
/// - if `flatc` exits unsuccessfully ("Failed to run flatc").
///
/// NOTE(review): directory-walk errors are dropped by `filter_map(|e| e.ok())`,
/// so an unreadable entry is skipped silently rather than failing the build.
fn main() {
10+
let flatbuffers_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap())
11+
.canonicalize()
12+
.expect("Failed to canonicalize CARGO_MANIFEST_DIR")
13+
.join("flatbuffers");
14+
let out_dir = PathBuf::from(env::var("OUT_DIR").unwrap())
15+
.canonicalize()
16+
.expect("Failed to canonicalize OUT_DIR");
17+
18+
let fbs_files = WalkDir::new(&flatbuffers_dir)
19+
.into_iter()
20+
.filter_map(|e| e.ok())
21+
.filter(|e| e.path().extension() == Some(OsStr::new("fbs")))
22+
.map(|e| {
23+
rerun_if_changed(e.path());
24+
e.path().to_path_buf()
25+
})
26+
.collect::<Vec<_>>();
27+
28+
if !Command::new(flatc())
29+
.arg("--rust")
30+
.arg("--filename-suffix")
31+
.arg("")
32+
.arg("-I")
33+
.arg(flatbuffers_dir.join("../../"))
34+
.arg("--include-prefix")
35+
.arg("flatbuffers::deps")
36+
.arg("-o")
37+
.arg(out_dir.join("flatbuffers"))
38+
.args(fbs_files)
39+
.status()
40+
.unwrap()
41+
.success()
42+
{
43+
panic!("Failed to run flatc");
44+
}
45+
}
46+
47+
/// Emits a `cargo:rerun-if-changed=<path>` directive on stdout for `path`,
/// using its canonicalized form.
///
/// # Panics
/// - if `path` cannot be canonicalized ("failed to canonicalize ..."),
/// - if the canonicalized path is not valid UTF-8 (`to_str().unwrap()`).
///
/// NOTE(review): the panic message itself calls `path.to_str().unwrap()`, so a
/// non-UTF-8 path that also fails to canonicalize panics inside the closure
/// with the `unwrap` message instead of the intended one.
fn rerun_if_changed(path: &Path) {
48+
println!(
49+
"cargo:rerun-if-changed={}",
50+
path.canonicalize()
51+
.unwrap_or_else(|_| panic!("failed to canonicalize {}", path.to_str().unwrap()))
52+
.to_str()
53+
.unwrap()
54+
);
55+
}

pyvortex/src/array.rs

+2-1
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,8 @@ use vortex::array::sparse::{SparseArray, SparseEncoding};
1010
use vortex::array::struct_::{StructArray, StructEncoding};
1111
use vortex::array::varbin::{VarBinArray, VarBinEncoding};
1212
use vortex::array::varbinview::{VarBinViewArray, VarBinViewEncoding};
13-
use vortex::array::{Array, ArrayKind, ArrayRef, EncodingRef};
13+
use vortex::array::{Array, ArrayKind, ArrayRef};
14+
use vortex::encoding::EncodingRef;
1415
use vortex_alp::{ALPArray, ALPEncoding};
1516
use vortex_dict::{DictArray, DictEncoding};
1617
use vortex_fastlanes::{

pyvortex/src/compress.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
use pyo3::types::PyType;
22
use pyo3::{pyclass, pyfunction, pymethods, Py, PyResult, Python};
33
use std::sync::Arc;
4-
use vortex::array::ENCODINGS;
4+
use vortex::encoding::ENCODINGS;
55

66
use vortex::compress::{CompressConfig, CompressCtx};
77

pyvortex/src/lib.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ fn _lib(_py: Python, m: &PyModule) -> PyResult<()> {
2323

2424
debug!(
2525
"Discovered encodings: {:?}",
26-
vortex::array::ENCODINGS
26+
vortex::encoding::ENCODINGS
2727
.iter()
2828
.map(|e| e.id().to_string())
2929
.collect::<Vec<String>>()

requirements-dev.lock

+51-1
Original file line numberDiff line numberDiff line change
@@ -5,54 +5,104 @@
55
# pre: false
66
# features: []
77
# all-features: false
8+
# with-sources: false
89

910
-e file:pyvortex
1011
-e file:.
1112
babel==2.14.0
13+
# via mkdocs-material
1214
bracex==2.4
15+
# via wcmatch
1316
certifi==2024.2.2
17+
# via requests
1418
charset-normalizer==3.3.2
19+
# via requests
1520
click==8.1.7
21+
# via mkdocs
1622
colorama==0.4.6
23+
# via mkdocs-material
1724
ghp-import==2.1.0
25+
# via mkdocs
1826
idna==3.6
27+
# via requests
1928
importlib-metadata==7.0.1
29+
# via mike
2030
importlib-resources==6.1.2
31+
# via mike
2132
iniconfig==2.0.0
33+
# via pytest
2234
jinja2==3.1.3
35+
# via mike
36+
# via mkdocs
37+
# via mkdocs-material
2338
markdown==3.5.2
39+
# via mkdocs
40+
# via mkdocs-material
41+
# via pymdown-extensions
2442
markupsafe==2.1.5
43+
# via jinja2
44+
# via mkdocs
2545
maturin==1.4.0
2646
mergedeep==1.3.4
47+
# via mkdocs
2748
mike==2.0.0
2849
mkdocs==1.5.3
50+
# via mike
51+
# via mkdocs-include-markdown-plugin
52+
# via mkdocs-material
2953
mkdocs-include-markdown-plugin==6.0.4
3054
mkdocs-material==9.5.12
3155
mkdocs-material-extensions==1.3.1
56+
# via mkdocs-material
3257
numpy==1.26.4
58+
# via pyarrow
3359
packaging==23.2
60+
# via mkdocs
61+
# via pytest
3462
paginate==0.5.6
63+
# via mkdocs-material
3564
pathspec==0.12.1
65+
# via mkdocs
3666
platformdirs==4.2.0
67+
# via mkdocs
3768
pluggy==1.4.0
69+
# via pytest
3870
py-cpuinfo==9.0.0
71+
# via pytest-benchmark
3972
pyarrow==15.0.0
4073
pygments==2.17.2
74+
# via mkdocs-material
4175
pymdown-extensions==10.7
76+
# via mkdocs-material
4277
pyparsing==3.1.1
78+
# via mike
4379
pytest==7.4.0
80+
# via pytest-benchmark
4481
pytest-benchmark==4.0.0
4582
python-dateutil==2.9.0
83+
# via ghp-import
4684
pyyaml==6.0.1
85+
# via mike
86+
# via mkdocs
87+
# via pymdown-extensions
88+
# via pyyaml-env-tag
4789
pyyaml-env-tag==0.1
90+
# via mkdocs
4891
regex==2023.12.25
92+
# via mkdocs-material
4993
requests==2.31.0
94+
# via mkdocs-material
5095
ruff==0.2.2
5196
six==1.16.0
97+
# via python-dateutil
5298
urllib3==2.2.1
99+
# via requests
53100
verspec==0.1.0
101+
# via mike
54102
watchdog==4.0.0
103+
# via mkdocs
55104
wcmatch==8.5.1
105+
# via mkdocs-include-markdown-plugin
56106
zipp==3.17.0
57-
# The following packages are considered to be unsafe in a requirements file:
107+
# via importlib-metadata
58108
pip==24.0

requirements.lock

+1
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
# pre: false
66
# features: []
77
# all-features: false
8+
# with-sources: false
89

910
-e file:pyvortex
1011
-e file:.

vortex-alp/src/array.rs

+8-2
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,14 @@
11
use std::sync::{Arc, RwLock};
22

3-
use vortex::array::{Array, ArrayKind, ArrayRef, Encoding, EncodingId, EncodingRef};
3+
use vortex::array::{Array, ArrayKind, ArrayRef};
44
use vortex::compress::EncodingCompression;
5+
use vortex::compute::ArrayCompute;
6+
use vortex::encoding::{Encoding, EncodingId, EncodingRef};
57
use vortex::formatter::{ArrayDisplay, ArrayFormatter};
6-
use vortex::impl_array;
78
use vortex::serde::{ArraySerde, EncodingSerde};
89
use vortex::stats::{Stats, StatsSet};
910
use vortex::validity::{ArrayValidity, Validity};
11+
use vortex::{impl_array, ArrayWalker};
1012
use vortex_error::{vortex_bail, vortex_err, VortexResult};
1113
use vortex_schema::{DType, IntWidth, Signedness};
1214

@@ -115,6 +117,10 @@ impl Array for ALPArray {
115117
fn serde(&self) -> Option<&dyn ArraySerde> {
116118
Some(self)
117119
}
120+
121+
/// Visits this array's single child, the array returned by `self.encoded()`,
/// and returns whatever `walker.visit_child` returns.
fn walk(&self, walker: &mut dyn ArrayWalker) -> VortexResult<()> {
122+
walker.visit_child(self.encoded())
123+
}
118124
}
119125

120126
impl ArrayDisplay for ALPArray {

vortex-alp/src/lib.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ pub use alp::*;
22
pub use array::*;
33

44
use linkme::distributed_slice;
5-
use vortex::array::{EncodingRef, ENCODINGS};
5+
use vortex::encoding::{EncodingRef, ENCODINGS};
66

77
mod alp;
88
mod array;

vortex-alp/src/serde.rs

+4
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,10 @@ impl ArraySerde for ALPArray {
1313
ctx.write_fixed_slice([self.exponents().e, self.exponents().f])?;
1414
ctx.write(self.encoded())
1515
}
16+
17+
/// Returns the array's metadata as two bytes: the exponents `e` then `f`.
///
/// This always returns `Ok(Some(..))`. The two values are the same pair, in
/// the same order, that `write` emits via `write_fixed_slice`.
fn metadata(&self) -> VortexResult<Option<Vec<u8>>> {
18+
Ok(Some(vec![self.exponents().e, self.exponents().f]))
19+
}
1620
}
1721

1822
impl EncodingSerde for ALPEncoding {

0 commit comments

Comments
 (0)