From 62fdc85b390119388ea303b9092b9138108dfdae Mon Sep 17 00:00:00 2001 From: Alexander Beedie Date: Thu, 12 Dec 2024 13:03:45 +0400 Subject: [PATCH 1/2] feat(python): Streamline creation of empty frame from `Schema` --- py-polars/polars/schema.py | 37 ++++++++++++++++++++++++++++- py-polars/tests/unit/test_schema.py | 22 +++++++++++++++++ 2 files changed, 58 insertions(+), 1 deletion(-) diff --git a/py-polars/polars/schema.py b/py-polars/polars/schema.py index fb1b8268bf2f..f36a0ff19019 100644 --- a/py-polars/polars/schema.py +++ b/py-polars/polars/schema.py @@ -3,7 +3,7 @@ import sys from collections import OrderedDict from collections.abc import Mapping -from typing import TYPE_CHECKING, Union +from typing import TYPE_CHECKING, Literal, Union, overload from polars._typing import PythonDataType from polars.datatypes import DataType, DataTypeClass, is_polars_dtype @@ -12,6 +12,8 @@ if TYPE_CHECKING: from collections.abc import Iterable + from polars import DataFrame, LazyFrame + if sys.version_info >= (3, 10): from typing import TypeAlias else: @@ -152,6 +154,39 @@ def dtypes(self) -> list[DataType]: """ return list(self.values()) + @overload + def empty_frame(self, *, eager: Literal[False] = ...) -> LazyFrame: ... + + @overload + def empty_frame(self, *, eager: Literal[True]) -> DataFrame: ... + + def empty_frame(self, *, eager: bool = True) -> DataFrame | LazyFrame: + """ + Create an empty DataFrame (or LazyFrame) from this Schema. + + Parameters + ---------- + eager + If True, create a DataFrame; otherwise, create a LazyFrame. 
+ + Examples + -------- + >>> s = pl.Schema({"x": pl.Int32(), "y": pl.String()}) + >>> s.empty_frame() + shape: (0, 2) + ┌─────┬─────┐ + │ x ┆ y │ + │ --- ┆ --- │ + │ i32 ┆ str │ + ╞═════╪═════╡ + └─────┴─────┘ + >>> s.empty_frame(eager=False) # doctest: +IGNORE_RESULT + + """ + from polars import DataFrame, LazyFrame + + return DataFrame(schema=self) if eager else LazyFrame(schema=self) + def len(self) -> int: """ Get the number of schema entries. diff --git a/py-polars/tests/unit/test_schema.py b/py-polars/tests/unit/test_schema.py index 7ebae8e22f01..5856678fb22b 100644 --- a/py-polars/tests/unit/test_schema.py +++ b/py-polars/tests/unit/test_schema.py @@ -24,6 +24,28 @@ def test_schema() -> None: pl.Schema({"foo": pl.String, "bar": pl.List}) +@pytest.mark.parametrize( + "schema", + [ + pl.Schema(), + pl.Schema({"foo": pl.Int8()}), + pl.Schema({"foo": pl.Datetime("us"), "bar": pl.String()}), + pl.Schema( + { + "foo": pl.UInt32(), + "bar": pl.Categorical("physical"), + "baz": pl.Struct({"x": pl.Int64(), "y": pl.Float64()}), + } + ), + ], +) +def test_schema_empty_frame(schema: pl.Schema) -> None: + assert_frame_equal( + schema.empty_frame(), + pl.DataFrame(schema=schema), + ) + + def test_schema_equality() -> None: s1 = pl.Schema({"foo": pl.Int8(), "bar": pl.Float64()}) s2 = pl.Schema({"foo": pl.Int8(), "bar": pl.String()}) From d6b634609c7b2406f752335415f4558f895d81cf Mon Sep 17 00:00:00 2001 From: alexander-beedie Date: Fri, 13 Dec 2024 09:37:23 +0400 Subject: [PATCH 2/2] change `empty_frame` to `to_frame` --- py-polars/polars/schema.py | 12 +++++------- py-polars/tests/unit/test_schema.py | 18 ++++++++++-------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/py-polars/polars/schema.py b/py-polars/polars/schema.py index f36a0ff19019..35d72dd6d493 100644 --- a/py-polars/polars/schema.py +++ b/py-polars/polars/schema.py @@ -19,7 +19,6 @@ else: from typing_extensions import TypeAlias - if sys.version_info >= (3, 10): def _required_init_args(tp: 
DataTypeClass) -> bool: @@ -37,7 +36,6 @@ def _required_init_args(tp: DataTypeClass) -> bool: BaseSchema = OrderedDict[str, DataType] SchemaInitDataType: TypeAlias = Union[DataType, DataTypeClass, PythonDataType] - __all__ = ["Schema"] @@ -155,12 +153,12 @@ def dtypes(self) -> list[DataType]: return list(self.values()) @overload - def empty_frame(self, *, eager: Literal[False] = ...) -> LazyFrame: ... + def to_frame(self, *, eager: Literal[False]) -> LazyFrame: ... @overload - def empty_frame(self, *, eager: Literal[True]) -> DataFrame: ... + def to_frame(self, *, eager: Literal[True] = ...) -> DataFrame: ... - def empty_frame(self, *, eager: bool = True) -> DataFrame | LazyFrame: + def to_frame(self, *, eager: bool = True) -> DataFrame | LazyFrame: """ Create an empty DataFrame (or LazyFrame) from this Schema. @@ -172,7 +170,7 @@ def empty_frame(self, *, eager: bool = True) -> DataFrame | LazyFrame: Examples -------- >>> s = pl.Schema({"x": pl.Int32(), "y": pl.String()}) - >>> s.empty_frame() + >>> s.to_frame() shape: (0, 2) ┌─────┬─────┐ │ x ┆ y │ @@ -180,7 +178,7 @@ def empty_frame(self, *, eager: bool = True) -> DataFrame | LazyFrame: │ i32 ┆ str │ ╞═════╪═════╡ └─────┴─────┘ - >>> s.empty_frame(eager=False) # doctest: +IGNORE_RESULT + >>> s.to_frame(eager=False) # doctest: +IGNORE_RESULT """ from polars import DataFrame, LazyFrame diff --git a/py-polars/tests/unit/test_schema.py b/py-polars/tests/unit/test_schema.py index 5856678fb22b..bdfc4bd21195 100644 --- a/py-polars/tests/unit/test_schema.py +++ b/py-polars/tests/unit/test_schema.py @@ -41,7 +41,7 @@ def test_schema() -> None: ) def test_schema_empty_frame(schema: pl.Schema) -> None: assert_frame_equal( - schema.empty_frame(), + schema.to_frame(), pl.DataFrame(schema=schema), ) @@ -270,13 +270,15 @@ def test_lazy_agg_lit_explode() -> None: assert_frame_equal(q.collect(), pl.DataFrame({"k": 1, "o": [[1]]}, schema=schema)) # type: ignore[arg-type] -@pytest.mark.parametrize("expr_op", [ - "approx_n_unique", 
"arg_max", "arg_min", "bitwise_and", "bitwise_or", - "bitwise_xor", "count", "entropy", "first", "has_nulls", "implode", "kurtosis", - "last", "len", "lower_bound", "max", "mean", "median", "min", "n_unique", "nan_max", - "nan_min", "null_count", "product", "sample", "skew", "std", "sum", "upper_bound", - "var" -]) # fmt: skip +@pytest.mark.parametrize( + "expr_op", [ + "approx_n_unique", "arg_max", "arg_min", "bitwise_and", "bitwise_or", + "bitwise_xor", "count", "entropy", "first", "has_nulls", "implode", "kurtosis", + "last", "len", "lower_bound", "max", "mean", "median", "min", "n_unique", "nan_max", + "nan_min", "null_count", "product", "sample", "skew", "std", "sum", "upper_bound", + "var" + ] +) # fmt: skip @pytest.mark.parametrize("lhs", [pl.col("b"), pl.lit(1, dtype=pl.Int64).alias("b")]) def test_lazy_agg_to_scalar_schema_19752(lhs: pl.Expr, expr_op: str) -> None: op = getattr(pl.Expr, expr_op)