Skip to content

Commit

Permalink
Implement minimal built-in checks for Ibis backend (#1885)
Browse files Browse the repository at this point in the history
* Implement minimal built-in checks for Ibis backend

Signed-off-by: Deepyaman Datta <[email protected]>

* Implement `Column` validation for the Ibis backend

Signed-off-by: Deepyaman Datta <[email protected]>

* Promote check object to table during preprocessing

Signed-off-by: Deepyaman Datta <[email protected]>

* Remove extraneous fixture for backend registration

Signed-off-by: Deepyaman Datta <[email protected]>

* Resolve lint (unused imports, undefined variables)

Signed-off-by: Deepyaman Datta <[email protected]>

* Partially standardize docstrings of builtin checks

Signed-off-by: Deepyaman Datta <[email protected]>

* Fix the `preprocess` docstrings copied from pandas

Signed-off-by: Deepyaman Datta <[email protected]>

* Format pandera/backends/ibis/checks.py using Black

Signed-off-by: Deepyaman Datta <[email protected]>

---------

Signed-off-by: Deepyaman Datta <[email protected]>
  • Loading branch information
deepyaman authored Jan 13, 2025
1 parent ffe6c55 commit d68e842
Show file tree
Hide file tree
Showing 10 changed files with 259 additions and 55 deletions.
7 changes: 7 additions & 0 deletions pandera/api/ibis/components.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from pandera.api.ibis.types import IbisDtypeInputTypes
from pandera.backends.ibis.register import register_ibis_backends
from pandera.engines import ibis_engine
from pandera.utils import is_regex


class Column(ComponentSchema[ir.Table]):
Expand Down Expand Up @@ -111,6 +112,12 @@ def dtype(self):
def dtype(self, value) -> None:
self._dtype = ibis_engine.Engine.dtype(value) if value else None

@property
def selector(self):
    """Column-matching selector for this component.

    When the column is flagged as a regex matcher but its declared name is
    a plain (non-regex) string, anchor the name so it only matches exactly;
    otherwise return the name untouched.
    """
    name_is_plain = self.name is not None and not is_regex(self.name)
    if name_is_plain and self.regex:
        return f"^{self.name}$"
    return self.name

def set_name(self, name: str):
"""Set or modify the name of a column object.
Expand Down
25 changes: 25 additions & 0 deletions pandera/backends/ibis/builtin_checks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
"""Built-in checks for Ibis."""

from typing import Any, TypeVar

import ibis.expr.types as ir

from pandera.api.extensions import register_builtin_check
from pandera.api.ibis.types import IbisData

T = TypeVar("T")


@register_builtin_check(
    aliases=["eq"],
    error="equal_to({value})",
)
def equal_to(data: IbisData, value: Any) -> ir.Table:
    """Ensure all elements of a data container equal a certain value.

    :param data: NamedTuple ``IbisData`` containing the table and column name
        for the check. The key to access the table is "table", and the key to
        access the column name is "key".
    :param value: Values in this Ibis data structure must be
        equal to this value.
    """
    column = data.table[data.key]
    return column == value
13 changes: 7 additions & 6 deletions pandera/backends/ibis/checks.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
"""Check backend for Ibis."""

from functools import partial
from typing import Optional

from typing import Optional, Union

import ibis
import ibis.expr.types as ir
Expand Down Expand Up @@ -41,11 +40,13 @@ def aggregate(self, check_obj: ir.Table):
"""Implements aggregation behavior for check object."""
raise NotImplementedError

def preprocess(self, check_obj: ir.Table, key: Optional[str]):
def preprocess(
    self, check_obj: Union[ir.Column, ir.Table], key: Optional[str]
):
    """Preprocesses a check object before applying the check function.

    :param check_obj: Ibis ``Column`` or ``Table`` to be validated.
    :param key: column name of interest, if any.
    :return: the check object as a ``Table``.
    """
    # This handles the case of Column validation by promoting it to
    # a Table with a single column. Table inputs are unaffected.
    return check_obj.as_table()

def apply(self, check_obj: IbisData):
"""Apply the check function to a check object."""
Expand Down
2 changes: 2 additions & 0 deletions pandera/backends/ibis/register.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,12 @@ def register_ibis_backends():
from pandera.api.checks import Check
from pandera.api.ibis.components import Column
from pandera.api.ibis.container import DataFrameSchema
from pandera.backends.ibis import builtin_checks
from pandera.backends.ibis.components import ColumnBackend
from pandera.backends.ibis.container import DataFrameSchemaBackend
from pandera.backends.ibis.checks import IbisCheckBackend

DataFrameSchema.register_backend(ir.Table, DataFrameSchemaBackend)
Column.register_backend(ir.Table, ColumnBackend)
Check.register_backend(ir.Table, IbisCheckBackend)
Check.register_backend(ir.Column, IbisCheckBackend)
6 changes: 3 additions & 3 deletions pandera/backends/pandas/builtin_checks.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""Pandas implementation of built-in checks"""
"""Built-in checks for pandas."""

import sys
import operator
Expand Down Expand Up @@ -282,8 +282,8 @@ def str_length(
) -> PandasData:
"""Ensure that the length of strings is within a specified range.
:param min_value: Minimum length of strings (default: no minimum)
:param max_value: Maximum length of strings (default: no maximum)
:param min_value: Minimum length of strings. No minimum by default.
:param max_value: Maximum length of strings. No maximum by default.
"""
str_len = data.str.len()
if min_value is None and max_value is None:
Expand Down
82 changes: 41 additions & 41 deletions pandera/backends/polars/builtin_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,9 @@
def equal_to(data: PolarsData, value: Any) -> pl.LazyFrame:
"""Ensure all elements of a data container equal a certain value.
:param data: NamedTuple PolarsData contains the dataframe and column name for the check. The keys
to access the dataframe is "dataframe" and column name using "key".
:param value: values in this polars data structure must be
:param data: NamedTuple PolarsData contains the dataframe and column name for the check. The key
to access the dataframe is "dataframe", and the key to access the column name is "key".
:param value: Values in this Polars data structure must be
equal to this value.
"""
return data.lazyframe.select(pl.col(data.key).eq(value))
Expand All @@ -33,9 +33,9 @@ def equal_to(data: PolarsData, value: Any) -> pl.LazyFrame:
def not_equal_to(data: PolarsData, value: Any) -> pl.LazyFrame:
"""Ensure no elements of a data container equals a certain value.
:param data: NamedTuple PolarsData contains the dataframe and column name for the check. The keys
to access the dataframe is "dataframe" and column name using "key".
:param value: This value must not occur in the checked
:param data: NamedTuple PolarsData contains the dataframe and column name for the check. The key
to access the dataframe is "dataframe", and the key to access the column name is "key".
:param value: This value must not occur in the checked data structure.
"""
return data.lazyframe.select(pl.col(data.key).ne(value))

Expand All @@ -49,10 +49,10 @@ def greater_than(data: PolarsData, min_value: Any) -> pl.LazyFrame:
Ensure values of a data container are strictly greater than a minimum
value.
:param data: NamedTuple PolarsData contains the dataframe and column name for the check. The keys
to access the dataframe is "dataframe" and column name using "key".
:param data: NamedTuple PolarsData contains the dataframe and column name for the check. The key
to access the dataframe is "dataframe", and the key to access the column name is "key".
:param min_value: Lower bound to be exceeded. Must be
a type comparable to the dtype of the series datatype of Polars
a type comparable to the dtype of the series datatype of Polars.
"""
return data.lazyframe.select(pl.col(data.key).gt(min_value))

Expand All @@ -64,10 +64,10 @@ def greater_than(data: PolarsData, min_value: Any) -> pl.LazyFrame:
def greater_than_or_equal_to(data: PolarsData, min_value: Any) -> pl.LazyFrame:
"""Ensure all values are greater or equal a certain value.
:param data: NamedTuple PolarsData contains the dataframe and column name for the check. The keys
to access the dataframe is "dataframe" and column name using "key".
:param data: NamedTuple PolarsData contains the dataframe and column name for the check. The key
to access the dataframe is "dataframe", and the key to access the column name is "key".
:param min_value: Allowed minimum value for values of a series. Must be
a type comparable to the dtype of the series datatype of Polars
a type comparable to the dtype of the series datatype of Polars.
"""
return data.lazyframe.select(pl.col(data.key).ge(min_value))

Expand All @@ -79,10 +79,10 @@ def greater_than_or_equal_to(data: PolarsData, min_value: Any) -> pl.LazyFrame:
def less_than(data: PolarsData, max_value: Any) -> pl.LazyFrame:
"""Ensure values of a series are strictly below a maximum value.
:param data: NamedTuple PolarsData contains the dataframe and column name for the check. The keys
to access the dataframe is "dataframe" and column name using "key".
:param data: NamedTuple PolarsData contains the dataframe and column name for the check. The key
to access the dataframe is "dataframe", and the key to access the column name is "key".
:param max_value: All elements of a series must be strictly smaller
than this. Must be a type comparable to the dtype of the series datatype of Polars
than this. Must be a type comparable to the dtype of the series datatype of Polars.
"""
return data.lazyframe.select(pl.col(data.key).lt(max_value))

Expand All @@ -94,8 +94,8 @@ def less_than(data: PolarsData, max_value: Any) -> pl.LazyFrame:
def less_than_or_equal_to(data: PolarsData, max_value: Any) -> pl.LazyFrame:
"""Ensure values of a series are strictly below a maximum value.
:param data: NamedTuple PolarsData contains the dataframe and column name for the check. The keys
to access the dataframe is "dataframe" and column name using "key".
:param data: NamedTuple PolarsData contains the dataframe and column name for the check. The key
to access the dataframe is "dataframe", and the key to access the column name is "key".
:param max_value: Upper bound not to be exceeded. Must be a type comparable to the dtype of the
series datatype of Polars
"""
Expand All @@ -118,8 +118,8 @@ def in_range(
Both endpoints must be a type comparable to the dtype of the
series datatype of Polars
:param data: NamedTuple PolarsData contains the dataframe and column name for the check. The keys
to access the dataframe is "dataframe" and column name using "key".
:param data: NamedTuple PolarsData contains the dataframe and column name for the check. The key
to access the dataframe is "dataframe", and the key to access the column name is "key".
:param min_value: Left / lower endpoint of the interval.
:param max_value: Right / upper endpoint of the interval. Must not be
smaller than min_value.
Expand Down Expand Up @@ -150,8 +150,8 @@ def isin(data: PolarsData, allowed_values: Iterable) -> pl.LazyFrame:
in allowed_values at least once can meet this condition. If you
want to check for substrings use :meth:`Check.str_contains`.
:param data: NamedTuple PolarsData contains the dataframe and column name for the check. The keys
to access the dataframe is "dataframe" and column name using "key".
:param data: NamedTuple PolarsData contains the dataframe and column name for the check. The key
to access the dataframe is "dataframe", and the key to access the column name is "key".
:param allowed_values: The set of allowed values. May be any iterable.
"""
return data.lazyframe.select(pl.col(data.key).is_in(allowed_values))
Expand All @@ -169,7 +169,7 @@ def notin(data: PolarsData, forbidden_values: Iterable) -> pl.LazyFrame:
design.
:param data: NamedTuple PolarsData contains the dataframe and column name for the check. The keys
to access the dataframe is "dataframe" and column name using "key".
to access the dataframe is "dataframe" and column name using "key".
:param forbidden_values: The set of values which should not occur. May
be any iterable.
"""
Expand All @@ -187,9 +187,9 @@ def str_matches(
) -> pl.LazyFrame:
"""Ensure that string starts with a match of a regular expression pattern.
:param data: NamedTuple PolarsData contains the dataframe and column name for the check. The keys
to access the dataframe is "dataframe" and column name using "key".
:param pattern: Regular expression pattern to use for matching
:param data: NamedTuple PolarsData contains the dataframe and column name for the check. The key
to access the dataframe is "dataframe", and the key to access the column name is "key".
:param pattern: Regular expression pattern to use for matching.
"""
pattern = pattern.pattern if isinstance(pattern, re.Pattern) else pattern
if not pattern.startswith("^"):
Expand All @@ -208,9 +208,9 @@ def str_contains(
) -> pl.LazyFrame:
"""Ensure that a pattern can be found in the string.
:param data: NamedTuple PolarsData contains the dataframe and column name for the check. The keys
to access the dataframe is "dataframe" and column name using "key".
:param pattern: Regular expression pattern to use for searching
:param data: NamedTuple PolarsData contains the dataframe and column name for the check. The key
to access the dataframe is "dataframe", and the key to access the column name is "key".
:param pattern: Regular expression pattern to use for searching.
"""

pattern = pattern.pattern if isinstance(pattern, re.Pattern) else pattern
Expand All @@ -225,9 +225,9 @@ def str_contains(
def str_startswith(data: PolarsData, string: str) -> pl.LazyFrame:
"""Ensure that all values start with a certain string.
:param data: NamedTuple PolarsData contains the dataframe and column name for the check. The keys
to access the dataframe is "dataframe" and column name using "key".
:param string: String all values should start with
:param data: NamedTuple PolarsData contains the dataframe and column name for the check. The key
to access the dataframe is "dataframe", and the key to access the column name is "key".
:param string: String all values should start with.
"""

return data.lazyframe.select(pl.col(data.key).str.starts_with(string))
Expand All @@ -237,9 +237,9 @@ def str_startswith(data: PolarsData, string: str) -> pl.LazyFrame:
def str_endswith(data: PolarsData, string: str) -> pl.LazyFrame:
"""Ensure that all values end with a certain string.
:param data: NamedTuple PolarsData contains the dataframe and column name for the check. The keys
to access the dataframe is "dataframe" and column name using "key".
:param string: String all values should end with
:param data: NamedTuple PolarsData contains the dataframe and column name for the check. The key
to access the dataframe is "dataframe", and the key to access the column name is "key".
:param string: String all values should end with.
"""
return data.lazyframe.select(pl.col(data.key).str.ends_with(string))

Expand All @@ -254,10 +254,10 @@ def str_length(
) -> pl.LazyFrame:
"""Ensure that the length of strings is within a specified range.
:param data: NamedTuple PolarsData contains the dataframe and column name for the check. The keys
to access the dataframe is "dataframe" and column name using "key".
:param min_value: Minimum length of strings (including) (default: no minimum)
:param max_value: Maximum length of strings (including) (default: no maximum)
:param data: NamedTuple PolarsData contains the dataframe and column name for the check. The key
to access the dataframe is "dataframe", and the key to access the column name is "key".
:param min_value: Minimum length of strings (inclusive). (default: no minimum)
:param max_value: Maximum length of strings (inclusive). (default: no maximum)
"""
if min_value is None and max_value is None:
raise ValueError(
Expand Down Expand Up @@ -285,9 +285,9 @@ def unique_values_eq(data: PolarsData, values: Iterable) -> bool:
In contrast with :func:`isin`, this check makes sure that all the items
in the ``values`` iterable are contained within the series.
:param data: NamedTuple PolarsData contains the dataframe and column name for the check. The keys
to access the dataframe is "dataframe" and column name using "key".
:param values: The set of values that must be present. May be any iterable.
:param data: NamedTuple PolarsData contains the dataframe and column name for the check. The key
to access the dataframe is "dataframe", and the key to access the column name is "key".
:param values: The set of values that must be present. May be any iterable.
"""

return (
Expand Down
2 changes: 0 additions & 2 deletions pandera/backends/polars/checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,6 @@ def aggregate(self, check_obj: pl.LazyFrame):

def preprocess(self, check_obj: pl.LazyFrame, key: Optional[str]):
    """Preprocesses a check object before applying the check function.

    :param check_obj: ``LazyFrame`` to be validated.
    :param key: column name of interest, if any.
    :return: the check object unchanged — LazyFrame inputs need no conversion.
    """
    return check_obj

def apply(self, check_obj: PolarsData):
Expand Down
2 changes: 1 addition & 1 deletion pandera/backends/pyspark/builtin_checks.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""PySpark implementation of built-in checks"""
"""Built-in checks for PySpark."""

from typing import Any, Iterable, TypeVar

Expand Down
Loading

0 comments on commit d68e842

Please sign in to comment.