Find a way to communicate the ordering of a file back with the existi… #13933

Closed · wants to merge 11 commits

8 changes: 8 additions & 0 deletions datafusion-examples/examples/custom_file_format.rs
@@ -105,6 +105,14 @@ impl FileFormat for TSVFileFormat {
.await
}

async fn infer_file_ordering(
&self,
_store: &Arc<dyn ObjectStore>,
_object: &ObjectMeta,
) -> Option<String> {
None
}

async fn create_physical_plan(
&self,
state: &SessionState,
184 changes: 178 additions & 6 deletions datafusion/core/src/dataframe/mod.rs
@@ -1977,7 +1977,13 @@ mod tests {

use crate::prelude::{CsvReadOptions, NdJsonReadOptions, ParquetReadOptions};
use arrow::array::Int32Array;
use datafusion_common::{assert_batches_eq, Constraint, Constraints, ScalarValue};
use arrow::util::pretty::pretty_format_batches;
use arrow_array::TimestampNanosecondArray;
use arrow_schema::TimeUnit;
use datafusion_common::{
assert_batches_eq, assert_contains, assert_not_contains, Constraint, Constraints,
ScalarValue,
};
use datafusion_common_runtime::SpawnedTask;
use datafusion_expr::expr::WindowFunction;
use datafusion_expr::{
@@ -1989,6 +1995,7 @@ mod tests {
use datafusion_functions_window::nth_value::first_value_udwf;
use datafusion_physical_expr::expressions::Column;
use datafusion_physical_plan::{get_plan_string, ExecutionPlanProperties};
use rand::Rng;
use sqlparser::ast::NullTreatment;
use tempfile::TempDir;

@@ -4136,11 +4143,6 @@ mod tests {
let df = ctx.sql("SELECT * FROM data").await?;
let results = df.collect().await?;

let df_explain = ctx.sql("explain SELECT a FROM data").await?;
let explain_result = df_explain.collect().await?;

println!("explain_result {:?}", explain_result);

assert_batches_eq!(
&[
"+---+---+",
@@ -4327,4 +4329,174 @@ mod tests {
);
Ok(())
}

#[tokio::test]
async fn write_parquet_with_order_metadata() -> Result<()> {
let schema = Arc::new(Schema::new(vec![
Field::new("id", DataType::Int64, false),
Field::new(
"timestamp",
DataType::Timestamp(TimeUnit::Nanosecond, None),
false,
),
]));

let tmp_dir = TempDir::new()?;

// Ordering inference only works when collect_statistics is enabled
let ctx = SessionContext::new_with_config(
SessionConfig::default()
.set_bool("datafusion.execution.collect_statistics", true),
);

// Write randomly generated data to Parquet
let num_rows = 1000;
let mut rng = rand::thread_rng();
let ids: Vec<i64> = (0..num_rows).collect();
let timestamps: Vec<i64> = (0..num_rows)
.map(|_| rng.gen_range(1_700_000_000_000..1_800_000_000_000))
.collect();

let id_array = Arc::new(Int64Array::from(ids));
let timestamp_array = Arc::new(TimestampNanosecondArray::from(timestamps));

let batch =
RecordBatch::try_new(schema.clone(), vec![id_array, timestamp_array])?;

let file = tmp_dir.path().join("testSorted.parquet");
let write_df = ctx.read_batch(batch)?;

write_df
.clone()
.write_parquet(
file.to_str().unwrap(),
DataFrameWriteOptions::new()
.with_sort_by(vec![col("timestamp").sort(true, false)]),
None,
)
.await?;

// Create the external table without a WITH ORDER clause
let sql_str =
"create external table sortData(id INT, timestamp TIMESTAMP) stored as parquet location'"
.to_owned()
+ file.to_str().unwrap()
+ "'";

ctx.sql(sql_str.as_str()).await?.collect().await?;

let sql_result = ctx
.sql("SELECT * FROM sortData order by timestamp")
.await?
.explain(false, false)?
.collect()
.await?;

let formatted = pretty_format_batches(&sql_result).unwrap().to_string();
// Assert we have the output_ordering in the explain plan
assert_contains!(
formatted.as_str(),
"output_ordering=[timestamp@1 ASC NULLS LAST]"
);

// Assert the plan contains no SortExec; the optimizer removes the sort using the inferred ordering
assert_not_contains!(formatted.as_str(), "SortExec");

// Test the multi-column sort case
write_df
.clone()
.write_parquet(
file.to_str().unwrap(),
DataFrameWriteOptions::new().with_sort_by(vec![
col("timestamp").sort(true, false),
col("id").sort(true, false),
]),
None,
)
.await?;

let sql_result = ctx
.sql("SELECT * FROM sortData")
.await?
.explain(false, false)?
.collect()
.await?;

let formatted = pretty_format_batches(&sql_result).unwrap().to_string();
// Assert we have the output_ordering in the explain plan
assert_contains!(
formatted.as_str(),
"output_ordering=[timestamp@1 ASC NULLS LAST, id@0 ASC NULLS LAST]"
);

// Assert the plan contains no SortExec; the optimizer removes the sort using the inferred ordering
assert_not_contains!(formatted.as_str(), "SortExec");
Ok(())
}

#[tokio::test]
async fn write_parquet_without_order_metadata() -> Result<()> {
let schema = Arc::new(Schema::new(vec![
Field::new("id", DataType::Int64, false),
Field::new(
"timestamp",
DataType::Timestamp(TimeUnit::Nanosecond, None),
false,
),
]));

let tmp_dir = TempDir::new()?;

let ctx = SessionContext::new();

// Write randomly generated data to Parquet
let num_rows = 1000;
let mut rng = rand::thread_rng();
let ids: Vec<i64> = (0..num_rows).collect();
let timestamps: Vec<i64> = (0..num_rows)
.map(|_| rng.gen_range(1_700_000_000_000..1_800_000_000_000))
.collect();

let id_array = Arc::new(Int64Array::from(ids));
let timestamp_array = Arc::new(TimestampNanosecondArray::from(timestamps));

let batch =
RecordBatch::try_new(schema.clone(), vec![id_array, timestamp_array])?;

let file = tmp_dir.path().join("testSorted.parquet");
let write_df = ctx.read_batch(batch)?;

write_df
.clone()
.write_parquet(file.to_str().unwrap(), DataFrameWriteOptions::new(), None)
.await?;

// Create the external table without a WITH ORDER clause
let sql_str =
"create external table sortData(id INT, timestamp TIMESTAMP) stored as parquet location'"
.to_owned()
+ file.to_str().unwrap()
+ "'";

ctx.sql(sql_str.as_str()).await?.collect().await?;

let sql_result = ctx
.sql("SELECT * FROM sortData order by timestamp")
.await?
.explain(false, false)?
.collect()
.await?;

let formatted = pretty_format_batches(&sql_result).unwrap().to_string();
// Assert the explain plan has no output_ordering because collect_statistics is not enabled
assert_not_contains!(
formatted.as_str(),
"output_ordering=[timestamp@1 ASC NULLS LAST]"
);

// Assert the plan contains a SortExec; without ordering metadata the optimizer cannot remove it
assert_contains!(formatted.as_str(), "SortExec");
Ok(())
}
}
9 changes: 9 additions & 0 deletions datafusion/core/src/datasource/file_format/arrow.rs
@@ -165,6 +165,15 @@ impl FileFormat for ArrowFormat {
Ok(Statistics::new_unknown(&table_schema))
}

async fn infer_file_ordering(
&self,
_store: &Arc<dyn ObjectStore>,
_object: &ObjectMeta,
) -> Option<String> {
// TODO: inferring file ordering for Arrow files is not yet supported
None
}

async fn create_physical_plan(
&self,
_state: &SessionState,
9 changes: 9 additions & 0 deletions datafusion/core/src/datasource/file_format/avro.rs
@@ -134,6 +134,15 @@ impl FileFormat for AvroFormat {
Ok(Arc::new(merged_schema))
}

async fn infer_file_ordering(
&self,
_store: &Arc<dyn ObjectStore>,
_object: &ObjectMeta,
) -> Option<String> {
// TODO: inferring file ordering for Avro files is not supported
None
}

async fn infer_stats(
&self,
_state: &SessionState,
9 changes: 9 additions & 0 deletions datafusion/core/src/datasource/file_format/csv.rs
@@ -406,6 +406,15 @@ impl FileFormat for CsvFormat {
Ok(Statistics::new_unknown(&table_schema))
}

async fn infer_file_ordering(
&self,
_store: &Arc<dyn ObjectStore>,
_object: &ObjectMeta,
) -> Option<String> {
// Inferring file ordering for CSV files is not supported
None
}

async fn create_physical_plan(
&self,
state: &SessionState,
9 changes: 9 additions & 0 deletions datafusion/core/src/datasource/file_format/json.rs
@@ -242,6 +242,15 @@ impl FileFormat for JsonFormat {
Ok(Statistics::new_unknown(&table_schema))
}

async fn infer_file_ordering(
&self,
_store: &Arc<dyn ObjectStore>,
_object: &ObjectMeta,
) -> Option<String> {
// Inferring file ordering for JSON files is not supported
None
}

async fn create_physical_plan(
&self,
_state: &SessionState,
32 changes: 26 additions & 6 deletions datafusion/core/src/datasource/file_format/mod.rs
@@ -31,17 +31,16 @@ pub mod options;
pub mod parquet;
pub mod write;

use std::any::Any;
use std::collections::{HashMap, VecDeque};
use std::fmt::{self, Debug, Display};
use std::sync::Arc;
use std::task::Poll;

use crate::arrow::datatypes::SchemaRef;
use crate::datasource::physical_plan::{FileScanConfig, FileSinkConfig};
use crate::error::Result;
use crate::execution::context::SessionState;
use crate::physical_plan::{ExecutionPlan, Statistics};
use std::any::Any;
use std::collections::{HashMap, VecDeque};
use std::fmt::{self, Debug, Display};
use std::sync::Arc;
use std::task::Poll;

use arrow_array::RecordBatch;
use arrow_schema::{ArrowError, DataType, Field, FieldRef, Schema};
@@ -123,6 +122,22 @@ pub trait FileFormat: Send + Sync + Debug {
object: &ObjectMeta,
) -> Result<Statistics>;

/// Infers the file ordering for a given object store and object meta.
///
/// # Arguments
///
/// * `store` - A reference to the object store.
/// * `object` - A reference to the object meta.
///
/// # Returns
///
/// An optional string describing the file ordering, e.g.
/// `"timestamp ASC NULLS LAST, id ASC NULLS LAST"`, or `None` if no
/// ordering could be inferred.
async fn infer_file_ordering(
&self,
store: &Arc<dyn ObjectStore>,
object: &ObjectMeta,
) -> Option<String>;

/// Take a list of files and convert it to the appropriate executor
/// according to this file format.
async fn create_physical_plan(
@@ -398,6 +413,11 @@ pub fn file_type_to_format(
}
}

/// Check if the file format is parquet
pub fn is_file_parquet_format(file_format: &Arc<dyn FileType>) -> bool {
file_format.get_ext() == "parquet"
}

/// Create a new field with the specified data type, copying the other
/// properties from the input field
fn field_with_new_type(field: &FieldRef, new_type: DataType) -> FieldRef {
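
A custom format can opt in by implementing the new trait method. A minimal sketch (method body only; the format and the hard-coded ordering string are illustrative, while the built-in non-Parquet formats above simply return None):

async fn infer_file_ordering(
    &self,
    _store: &Arc<dyn ObjectStore>,
    _object: &ObjectMeta,
) -> Option<String> {
    // This hypothetical format knows its ordering ahead of time
    Some("timestamp ASC NULLS LAST".to_string())
}
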
31 changes: 31 additions & 0 deletions datafusion/core/src/datasource/file_format/parquet.rs
@@ -397,6 +397,37 @@ impl FileFormat for ParquetFormat {
Ok(stats)
}

async fn infer_file_ordering(
&self,
store: &Arc<dyn ObjectStore>,
object: &ObjectMeta,
) -> Option<String> {
// Fetch metadata
let metadata =
fetch_parquet_metadata(store.as_ref(), object, self.metadata_size_hint())
.await
.map_err(|e| ParquetError::General(format!("Failed to fetch Parquet metadata: {e}")))
.ok()?; // Fall back to no inferred ordering instead of failing the scan

let file_metadata = metadata.file_metadata();

// Convert Parquet schema to Arrow schema
let file_schema = parquet_to_arrow_schema(
file_metadata.schema_descr(),
file_metadata.key_value_metadata(),
)
.map_err(|e| {
ParquetError::General(format!(
"Failed to convert Parquet schema to Arrow schema: {e}"
))
})
.ok()?; // Fall back to no inferred ordering on conversion errors

// Look up the "DATAFUSION_ORDER_BY" key in the Arrow schema metadata
file_schema.metadata().get("DATAFUSION_ORDER_BY").cloned()
}

async fn create_physical_plan(
&self,
_state: &SessionState,
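
To sanity-check the round trip outside of DataFusion, a written file can be inspected with the parquet crate. This is a sketch that assumes the writer stored DATAFUSION_ORDER_BY as key-value metadata which the Arrow schema conversion surfaces as schema metadata, and that it runs in a Result-returning context; the path is illustrative:

use std::fs::File;
use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;

// Open the file produced by write_parquet and read the ordering key, if any
let file = File::open("/tmp/testSorted.parquet")?;
let builder = ParquetRecordBatchReaderBuilder::try_new(file)?;
let order_by = builder.schema().metadata().get("DATAFUSION_ORDER_BY").cloned();
println!("DATAFUSION_ORDER_BY = {order_by:?}");
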
63 changes: 58 additions & 5 deletions datafusion/core/src/datasource/listing/table.rs
@@ -17,11 +17,10 @@

//! The table implementation.
use std::collections::HashMap;
use std::{any::Any, str::FromStr, sync::Arc};

use super::helpers::{expr_applicable_for_cols, pruned_partition_list, split_files};
use super::{ListingTableUrl, PartitionedFile};
use std::collections::HashMap;
use std::{any::Any, str::FromStr, sync::Arc};

use crate::datasource::{
create_ordering,
@@ -35,7 +34,7 @@ use crate::execution::context::SessionState;
use datafusion_catalog::TableProvider;
use datafusion_common::{config_err, DataFusionError, Result};
use datafusion_expr::dml::InsertOp;
use datafusion_expr::{utils::conjunction, Expr, TableProviderFilterPushDown};
use datafusion_expr::{col, utils::conjunction, Expr, TableProviderFilterPushDown};
use datafusion_expr::{SortExpr, TableType};
use datafusion_physical_plan::{empty::EmptyExec, ExecutionPlan, Statistics};

@@ -54,6 +53,7 @@ use datafusion_physical_expr::{

use async_trait::async_trait;
use datafusion_catalog::Session;
use datafusion_expr::expr::Sort;
use datafusion_physical_expr_common::sort_expr::LexRequirement;
use futures::{future, stream, StreamExt, TryStreamExt};
use itertools::Itertools;
@@ -860,7 +860,60 @@ impl TableProvider for ListingTable {
return Ok(Arc::new(EmptyExec::new(projected_schema)));
}

let output_ordering = self.try_create_output_ordering()?;
let mut output_ordering = self.try_create_output_ordering()?;

let store = if let Some(url) = self.table_paths.first() {
Some(session_state.runtime_env().object_store(url)?)
} else {
None
};

// Only the first file is inspected to infer the ordering
// TODO: this is a bit of a hack
let object_meta = match partitioned_file_lists.first() {
Some(file_list) => file_list.first().map(|file| &file.object_meta),
None => None,
};

// If the output ordering is empty, try to infer it from the file
if output_ordering.is_empty() && store.is_some() && object_meta.is_some() {
let sort_by_value = self
.options()
.format
.infer_file_ordering(&store.unwrap(), object_meta.unwrap())
.await;

if let Some(sort_by_value) = sort_by_value {
// Split the input into individual sort expressions separated by commas
let sort_expressions: Vec<&str> =
sort_by_value.split(',').map(str::trim).collect();

let mut sort_order = vec![];

for sort_expr in sort_expressions {
// Split each expression into components (e.g., "timestamp ASC NULLS LAST")
let tokens: Vec<&str> = sort_expr.split_whitespace().collect();
if tokens.is_empty() {
continue; // Skip empty tokens
}
// Parse the expression, direction, and nulls ordering
let expr = tokens[0].to_string();
let asc = tokens
.get(1)
.map_or(true, |&t| t.eq_ignore_ascii_case("ASC")); // Default to ASC
// "NULLS FIRST" spans two whitespace-separated tokens; default to NULLS LAST
let nulls_first = tokens.get(2).zip(tokens.get(3)).map_or(false, |(&n, &f)| {
n.eq_ignore_ascii_case("NULLS") && f.eq_ignore_ascii_case("FIRST")
});

// Create a Sort object
let sort = Sort::new(col(expr), asc, nulls_first);
sort_order.push(sort);
}

output_ordering = create_ordering(&self.table_schema, &[sort_order])?
}
}

match state
.config_options()
.execution
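
For example, a stored value of "timestamp ASC NULLS LAST, id ASC NULLS LAST" is split on commas and whitespace into two sort expressions, equivalent to this sketch:

use datafusion_expr::{col, expr::Sort};

// What the parsing above produces for "timestamp ASC NULLS LAST, id ASC NULLS LAST"
let sort_order = vec![
    Sort::new(col("timestamp"), /* asc */ true, /* nulls_first */ false),
    Sort::new(col("id"), true, false),
];
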
30 changes: 25 additions & 5 deletions datafusion/core/src/physical_planner.rs
@@ -21,7 +21,7 @@ use std::borrow::Cow;
use std::collections::HashMap;
use std::sync::Arc;

use crate::datasource::file_format::file_type_to_format;
use crate::datasource::file_format::{file_type_to_format, is_file_parquet_format};
use crate::datasource::listing::ListingTableUrl;
use crate::datasource::physical_plan::FileSinkConfig;
use crate::datasource::source_as_provider;
@@ -65,6 +65,7 @@ use arrow::compute::SortOptions;
use arrow::datatypes::{Schema, SchemaRef};
use arrow_array::builder::StringBuilder;
use arrow_array::RecordBatch;
use async_trait::async_trait;
use datafusion_common::display::ToStringifiedPlan;
use datafusion_common::{
exec_err, internal_datafusion_err, internal_err, not_impl_err, plan_err, DFSchema,
@@ -83,15 +84,14 @@ use datafusion_expr::{
use datafusion_physical_expr::aggregate::{AggregateExprBuilder, AggregateFunctionExpr};
use datafusion_physical_expr::expressions::Literal;
use datafusion_physical_expr::LexOrdering;
use datafusion_physical_optimizer::PhysicalOptimizerRule;
use datafusion_physical_plan::placeholder_row::PlaceholderRowExec;
use datafusion_physical_plan::unnest::ListUnnest;
use datafusion_sql::utils::window_expr_common_partition_keys;

use async_trait::async_trait;
use datafusion_physical_optimizer::PhysicalOptimizerRule;
use futures::{StreamExt, TryStreamExt};
use itertools::{multiunzip, Itertools};
use log::{debug, trace};
use regex::Regex;
use sqlparser::ast::NullTreatment;
use tokio::sync::Mutex;

@@ -532,8 +532,28 @@ impl DefaultPhysicalPlanner {
keep_partition_by_columns,
};

let mut source_option_tuples = source_option_tuples.clone();

if is_file_parquet_format(file_type) {
if let LogicalPlan::Sort(Sort { expr, .. }) = input.as_ref() {
// Strip the table qualifier from each sort expression, e.g.
// "unsorted_source_table.col1 ASC NULLS LAST" -> "col1 ASC NULLS LAST"
let re = Regex::new(r"^[^.]+\.(.*)$").unwrap();
let sort_value = expr
.iter()
.map(|e| re.replace(e.to_string().as_str(), "$1").to_string())
.collect::<Vec<String>>()
.join(", ");

source_option_tuples.insert(
"format.metadata::DATAFUSION_ORDER_BY".to_string(),
sort_value,
);
}
}

let sink_format = file_type_to_format(file_type)?
.create(session_state, source_option_tuples)?;
.create(session_state, &source_option_tuples)?;

sink_format
.create_writer_physical_plan(input_exec, session_state, config, None)
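
A quick check of what the qualifier-stripping regex does (sketch, assuming sort expressions display with a single table qualifier):

use regex::Regex;

let re = Regex::new(r"^[^.]+\.(.*)$").unwrap();
assert_eq!(
    re.replace("unsorted_source_table.col1 ASC NULLS LAST", "$1"),
    "col1 ASC NULLS LAST"
);
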
43 changes: 43 additions & 0 deletions datafusion/sqllogictest/test_files/copy.slt
@@ -632,3 +632,46 @@ COPY source_table to '/tmp/table.parquet' (row_group_size 55 + 102);
# Copy using execution.keep_partition_by_columns with an invalid value
query error DataFusion error: Invalid or Unsupported Configuration: provided value for 'execution.keep_partition_by_columns' was not recognized: "invalid_value"
COPY source_table to '/tmp/table.parquet' OPTIONS (execution.keep_partition_by_columns invalid_value);



# Copy from a table with an ORDER BY clause

statement ok
create table unsorted_source_table(col1 integer, col2 varchar) as values (8, 'Done'), (1, 'Foo'), (8, 'Cat'), (2, 'Bar');

query TT
EXPLAIN COPY (select * from unsorted_source_table order by col1, col2) to 'test_files/scratch/copy/sortTable/sorted.parquet' STORED AS PARQUET
----
logical_plan
01)CopyTo: format=parquet output_url=test_files/scratch/copy/sortTable/sorted.parquet options: ()
02)--Sort: unsorted_source_table.col1 ASC NULLS LAST, unsorted_source_table.col2 ASC NULLS LAST
03)----TableScan: unsorted_source_table projection=[col1, col2]
physical_plan
01)DataSinkExec: sink=ParquetSink(file_groups=[])
02)--SortExec: expr=[col1@0 ASC NULLS LAST, col2@1 ASC NULLS LAST], preserve_partitioning=[false]
03)----MemoryExec: partitions=1, partition_sizes=[1]

query I
COPY (select * from unsorted_source_table order by col1, col2) to 'test_files/scratch/copy/sortTable/sorted.parquet' STORED AS PARQUET
----
4

statement ok
CREATE EXTERNAL TABLE sortedParquetTable STORED AS PARQUET
LOCATION 'test_files/scratch/copy/sortTable/'

query TT
explain select * from sortedParquetTable;
----
logical_plan TableScan: sortedparquettable projection=[col1, col2]
physical_plan ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/copy/sortTable/sorted.parquet]]}, projection=[col1, col2], output_ordering=[col1@0 ASC NULLS LAST, col2@1 ASC NULLS LAST]


query IT
select * from sortedParquetTable;
----
1 Foo
2 Bar
8 Cat
8 Done