Skip to content

Commit d25b534

Browse files
committed
feat(datafusion): support metadata tables for Datafusion
Signed-off-by: xxchan <[email protected]>
1 parent e5bdbfc commit d25b534

File tree

13 files changed

+473
-61
lines changed

13 files changed

+473
-61
lines changed

Diff for: Cargo.lock

+5
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Diff for: Cargo.toml

+1
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,7 @@ serde_derive = "1"
9191
serde_json = "1"
9292
serde_repr = "0.1.16"
9393
serde_with = "3.4"
94+
strum = "0.26"
9495
tempfile = "3.8"
9596
tokio = { version = "1", default-features = false }
9697
typed-builder = "0.20"

Diff for: crates/iceberg/Cargo.toml

+2-1
Original file line numberDiff line numberDiff line change
@@ -78,15 +78,16 @@ serde_derive = { workspace = true }
7878
serde_json = { workspace = true }
7979
serde_repr = { workspace = true }
8080
serde_with = { workspace = true }
81+
strum = { workspace = true, features = ["derive"] }
8182
tokio = { workspace = true, optional = true }
8283
typed-builder = { workspace = true }
8384
url = { workspace = true }
8485
uuid = { workspace = true }
8586
zstd = { workspace = true }
87+
expect-test = { workspace = true }
8688

8789
[dev-dependencies]
8890
ctor = { workspace = true }
89-
expect-test = { workspace = true }
9091
iceberg-catalog-memory = { workspace = true }
9192
iceberg_test_utils = { path = "../test_utils", features = ["tests"] }
9293
pretty_assertions = { workspace = true }

Diff for: crates/iceberg/src/lib.rs

+1
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@ pub mod transform;
8383
mod runtime;
8484

8585
pub mod arrow;
86+
pub mod test_utils;
8687
mod utils;
8788
pub mod writer;
8889

Diff for: crates/iceberg/src/metadata_scan.rs

+39-48
Original file line numberDiff line numberDiff line change
@@ -35,9 +35,46 @@ use crate::Result;
3535
/// - <https://github.com/apache/iceberg/blob/ac865e334e143dfd9e33011d8cf710b46d91f1e5/core/src/main/java/org/apache/iceberg/MetadataTableType.java#L23-L39>
3636
/// - <https://iceberg.apache.org/docs/latest/spark-queries/#querying-with-sql>
3737
/// - <https://py.iceberg.apache.org/api/#inspecting-tables>
38-
#[derive(Debug)]
38+
#[derive(Debug, Clone)]
3939
pub struct MetadataTable(Table);
4040

41+
/// Metadata table type.
42+
#[derive(Debug, Clone, strum::EnumIter)]
43+
pub enum MetadataTableType {
44+
/// [`SnapshotsTable`]
45+
Snapshots,
46+
/// [`ManifestsTable`]
47+
Manifests,
48+
}
49+
50+
impl MetadataTableType {
51+
/// Returns the string representation of the metadata table type.
52+
pub fn as_str(&self) -> &str {
53+
match self {
54+
MetadataTableType::Snapshots => "snapshots",
55+
MetadataTableType::Manifests => "manifests",
56+
}
57+
}
58+
59+
/// Returns all the metadata table types.
60+
pub fn all_types() -> impl Iterator<Item = Self> {
61+
use strum::IntoEnumIterator;
62+
Self::iter()
63+
}
64+
}
65+
66+
impl TryFrom<&str> for MetadataTableType {
67+
type Error = String;
68+
69+
fn try_from(value: &str) -> std::result::Result<Self, String> {
70+
match value {
71+
"snapshots" => Ok(Self::Snapshots),
72+
"manifests" => Ok(Self::Manifests),
73+
_ => Err(format!("invalid metadata table type: {value}")),
74+
}
75+
}
76+
}
77+
4178
impl MetadataTable {
4279
/// Creates a new metadata scan.
4380
pub(super) fn new(table: Table) -> Self {
@@ -262,53 +299,7 @@ mod tests {
262299

263300
use super::*;
264301
use crate::scan::tests::TableTestFixture;
265-
266-
/// Snapshot testing to check the resulting record batch.
267-
///
268-
/// - `expected_schema/data`: put `expect![[""]]` as a placeholder,
269-
/// and then run test with `UPDATE_EXPECT=1 cargo test` to automatically update the result,
270-
/// or use rust-analyzer (see [video](https://github.com/rust-analyzer/expect-test)).
271-
/// Check the doc of [`expect_test`] for more details.
272-
/// - `ignore_check_columns`: Some columns are not stable, so we can skip them.
273-
/// - `sort_column`: The order of the data might be non-deterministic, so we can sort it by a column.
274-
fn check_record_batch(
275-
record_batch: RecordBatch,
276-
expected_schema: Expect,
277-
expected_data: Expect,
278-
ignore_check_columns: &[&str],
279-
sort_column: Option<&str>,
280-
) {
281-
let mut columns = record_batch.columns().to_vec();
282-
if let Some(sort_column) = sort_column {
283-
let column = record_batch.column_by_name(sort_column).unwrap();
284-
let indices = arrow_ord::sort::sort_to_indices(column, None, None).unwrap();
285-
columns = columns
286-
.iter()
287-
.map(|column| arrow_select::take::take(column.as_ref(), &indices, None).unwrap())
288-
.collect_vec();
289-
}
290-
291-
expected_schema.assert_eq(&format!(
292-
"{}",
293-
record_batch.schema().fields().iter().format(",\n")
294-
));
295-
expected_data.assert_eq(&format!(
296-
"{}",
297-
record_batch
298-
.schema()
299-
.fields()
300-
.iter()
301-
.zip_eq(columns)
302-
.map(|(field, column)| {
303-
if ignore_check_columns.contains(&field.name().as_str()) {
304-
format!("{}: (skipped)", field.name())
305-
} else {
306-
format!("{}: {:?}", field.name(), column)
307-
}
308-
})
309-
.format(",\n")
310-
));
311-
}
302+
use crate::test_utils::check_record_batch;
312303

313304
#[test]
314305
fn test_snapshots_table() {

Diff for: crates/iceberg/src/test_utils.rs

+71
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
//! Test utilities.
19+
//! This module is pub just for internal testing.
20+
//! It is subject to change and is not intended to be used by external users.
21+
22+
use arrow_array::RecordBatch;
23+
use expect_test::Expect;
24+
use itertools::Itertools;
25+
26+
/// Snapshot testing to check the resulting record batch.
27+
///
28+
/// - `expected_schema/data`: put `expect![[""]]` as a placeholder,
29+
/// and then run test with `UPDATE_EXPECT=1 cargo test` to automatically update the result,
30+
/// or use rust-analyzer (see [video](https://github.com/rust-analyzer/expect-test)).
31+
/// Check the doc of [`expect_test`] for more details.
32+
/// - `ignore_check_columns`: Some columns are not stable, so we can skip them.
33+
/// - `sort_column`: The order of the data might be non-deterministic, so we can sort it by a column.
34+
pub fn check_record_batch(
35+
record_batch: RecordBatch,
36+
expected_schema: Expect,
37+
expected_data: Expect,
38+
ignore_check_columns: &[&str],
39+
sort_column: Option<&str>,
40+
) {
41+
let mut columns = record_batch.columns().to_vec();
42+
if let Some(sort_column) = sort_column {
43+
let column = record_batch.column_by_name(sort_column).unwrap();
44+
let indices = arrow_ord::sort::sort_to_indices(column, None, None).unwrap();
45+
columns = columns
46+
.iter()
47+
.map(|column| arrow_select::take::take(column.as_ref(), &indices, None).unwrap())
48+
.collect_vec();
49+
}
50+
51+
expected_schema.assert_eq(&format!(
52+
"{}",
53+
record_batch.schema().fields().iter().format(",\n")
54+
));
55+
expected_data.assert_eq(&format!(
56+
"{}",
57+
record_batch
58+
.schema()
59+
.fields()
60+
.iter()
61+
.zip_eq(columns)
62+
.map(|(field, column)| {
63+
if ignore_check_columns.contains(&field.name().as_str()) {
64+
format!("{}: (skipped)", field.name())
65+
} else {
66+
format!("{}: {:?}", field.name(), column)
67+
}
68+
})
69+
.format(",\n")
70+
));
71+
}

Diff for: crates/integrations/datafusion/Cargo.toml

+1
Original file line numberDiff line numberDiff line change
@@ -37,5 +37,6 @@ iceberg = { workspace = true }
3737
tokio = { workspace = true }
3838

3939
[dev-dependencies]
40+
expect-test = { workspace = true }
4041
iceberg-catalog-memory = { workspace = true }
4142
tempfile = { workspace = true }
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
use datafusion::catalog::TableProvider;
19+
use datafusion::physical_expr::EquivalenceProperties;
20+
use datafusion::physical_plan::execution_plan::{Boundedness, EmissionType};
21+
use datafusion::physical_plan::stream::RecordBatchStreamAdapter;
22+
use datafusion::physical_plan::{DisplayAs, ExecutionPlan, Partitioning, PlanProperties};
23+
24+
use crate::metadata_table::IcebergMetadataTableProvider;
25+
26+
/// Physical [`ExecutionPlan`] node that scans an Iceberg metadata table
/// (e.g. snapshots, manifests) and yields its rows as a record-batch stream.
#[derive(Debug)]
pub struct IcebergMetadataScan {
    /// Provider that produces the metadata record batch; cloned into the
    /// future returned by `execute`.
    provider: IcebergMetadataTableProvider,
    /// Precomputed plan properties: one unknown partition, bounded,
    /// incrementally-emitted output.
    properties: PlanProperties,
}
31+
32+
impl IcebergMetadataScan {
33+
pub fn new(provider: IcebergMetadataTableProvider) -> Self {
34+
let properties = PlanProperties::new(
35+
EquivalenceProperties::new(provider.schema()),
36+
Partitioning::UnknownPartitioning(1),
37+
EmissionType::Incremental,
38+
Boundedness::Bounded,
39+
);
40+
Self {
41+
provider,
42+
properties,
43+
}
44+
}
45+
}
46+
47+
impl DisplayAs for IcebergMetadataScan {
48+
fn fmt_as(
49+
&self,
50+
_t: datafusion::physical_plan::DisplayFormatType,
51+
f: &mut std::fmt::Formatter,
52+
) -> std::fmt::Result {
53+
write!(f, "IcebergMetadataScan")
54+
}
55+
}
56+
57+
impl ExecutionPlan for IcebergMetadataScan {
    /// Stable node name used in plan displays and metrics.
    fn name(&self) -> &str {
        "IcebergMetadataScan"
    }

    fn as_any(&self) -> &dyn std::any::Any {
        self
    }

    fn properties(&self) -> &PlanProperties {
        &self.properties
    }

    /// Leaf node: this scan has no child plans.
    fn children(&self) -> Vec<&std::sync::Arc<dyn ExecutionPlan>> {
        vec![]
    }

    /// Leaf node: returns itself unchanged; `_children` is ignored because
    /// `children()` is always empty.
    fn with_new_children(
        self: std::sync::Arc<Self>,
        _children: Vec<std::sync::Arc<dyn ExecutionPlan>>,
    ) -> datafusion::error::Result<std::sync::Arc<dyn ExecutionPlan>> {
        Ok(self)
    }

    /// Produces a single-batch stream by wrapping the provider's async scan
    /// in a one-shot stream.
    ///
    /// `_partition` is ignored; the plan declares exactly one partition
    /// (`UnknownPartitioning(1)`), so DataFusion only calls this with 0.
    fn execute(
        &self,
        _partition: usize,
        _context: std::sync::Arc<datafusion::execution::TaskContext>,
    ) -> datafusion::error::Result<datafusion::execution::SendableRecordBatchStream> {
        // Clone the provider so the future owns its data; the scan itself
        // runs lazily when the returned stream is polled.
        let batch_fut = self.provider.clone().scan();
        let schema = self.provider.schema();
        let stream = futures::stream::once(batch_fut);
        Ok(Box::pin(RecordBatchStreamAdapter::new(schema, stream)))
    }
}

Diff for: crates/integrations/datafusion/src/physical_plan/mod.rs

+1
Original file line numberDiff line numberDiff line change
@@ -16,4 +16,5 @@
1616
// under the License.
1717

1818
pub(crate) mod expr_to_predicate;
19+
pub(crate) mod metadata_scan;
1920
pub(crate) mod scan;

0 commit comments

Comments
 (0)