diff --git a/docs/source/user-guide/common-operations/functions.rst b/docs/source/user-guide/common-operations/functions.rst index 12097be8f..d458d3eb0 100644 --- a/docs/source/user-guide/common-operations/functions.rst +++ b/docs/source/user-guide/common-operations/functions.rst @@ -129,3 +129,24 @@ The function :py:func:`~datafusion.functions.in_list` allows to check a column f .limit(20) .to_pandas() ) + + +Handling Missing Values +===================== + +DataFusion provides methods to handle missing values in DataFrames: + +fill_null +--------- + +The ``fill_null()`` method replaces NULL values in specified columns with a provided value: + +.. code-block:: python + + # Fill all NULL values with 0 where possible + df = df.fill_null(0) + + # Fill NULL values only in specific string columns + df = df.fill_null("missing", subset=["name", "category"]) + +The fill value will be cast to match each column's type. If casting fails for a column, that column remains unchanged. diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index 26fe8f453..a1df7e080 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -37,6 +37,8 @@ except ImportError: from typing_extensions import deprecated # Python 3.12 +from datafusion._internal import DataFrame as DataFrameInternal +from datafusion.expr import Expr, SortExpr, sort_or_default from datafusion.plan import ExecutionPlan, LogicalPlan from datafusion.record_batch import RecordBatchStream @@ -53,8 +55,6 @@ from enum import Enum -from datafusion.expr import Expr, SortExpr, sort_or_default - # excerpt from deltalake # https://github.com/apache/datafusion-python/pull/981#discussion_r1905619163 @@ -869,3 +869,25 @@ def within_limit(df: DataFrame, limit: int) -> DataFrame: DataFrame: After applying func to the original dataframe. """ return func(self, *args) + + def fill_null(self, value: Any, subset: list[str] | None = None) -> DataFrame: + """Fill null values in specified columns with a value. + + Args: + value: Value to replace nulls with. Will be cast to match column type. + subset: Optional list of column names to fill. If None, fills all columns. + + Returns: + DataFrame with null values replaced where type casting is possible + + Examples: + >>> df = df.fill_null(0) # Fill all nulls with 0 where possible + >>> # Fill nulls in specific string columns + >>> df = df.fill_null("missing", subset=["name", "category"]) + + Notes: + - Only fills nulls in columns where the value can be cast to the column type + - For columns where casting fails, the original column is kept unchanged + - For columns not in subset, the original column is kept unchanged + """ + return DataFrame(self.df.fill_null(value, subset)) diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index e01308c86..dd5f962b2 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -14,6 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +import datetime import os import re from typing import Any @@ -119,6 +120,38 @@ def clean_formatter_state(): reset_formatter() +@pytest.fixture +def null_df(): + """Create a DataFrame with null values of different types.""" + ctx = SessionContext() + + # Create a RecordBatch with nulls across different types + batch = pa.RecordBatch.from_arrays( + [ + pa.array([1, None, 3, None], type=pa.int64()), + pa.array([4.5, 6.7, None, None], type=pa.float64()), + pa.array(["a", None, "c", None], type=pa.string()), + pa.array([True, None, False, None], type=pa.bool_()), + pa.array( + [10957, None, 18993, None], type=pa.date32() + ), # 2000-01-01, null, 2022-01-01, null + pa.array( + [946684800000, None, 1640995200000, None], type=pa.date64() + ), # 2000-01-01, null, 2022-01-01, null + ], + names=[ + "int_col", + "float_col", + "str_col", + "bool_col", + "date32_col", + "date64_col", + ], + ) + + return ctx.create_dataframe([[batch]]) + + # custom style for testing with html formatter class CustomStyleProvider: def get_cell_style(self) -> str: @@ -1794,3 +1827,236 @@ def test_html_formatter_manual_format_html(clean_formatter_state): assert "