Skip to content

Commit

Permalink
Implement minimal built-in checks for Ibis backend (#1885)
Browse files Browse the repository at this point in the history
* Implement minimal built-in checks for Ibis backend

Signed-off-by: Deepyaman Datta <[email protected]>

* Implement `Column` validation for the Ibis backend

Signed-off-by: Deepyaman Datta <[email protected]>

* Promote check object to table during preprocessing

Signed-off-by: Deepyaman Datta <[email protected]>

* Remove extraneous fixture for backend registration

Signed-off-by: Deepyaman Datta <[email protected]>

* Resolve lint (unused imports, undefined variables)

Signed-off-by: Deepyaman Datta <[email protected]>

* Partially standardize docstrings of builtin checks

Signed-off-by: Deepyaman Datta <[email protected]>

* Fix the `preprocess` docstrings copied from pandas

Signed-off-by: Deepyaman Datta <[email protected]>

* Format pandera/backends/ibis/checks.py using Black

Signed-off-by: Deepyaman Datta <[email protected]>

---------

Signed-off-by: Deepyaman Datta <[email protected]>
  • Loading branch information
deepyaman authored Jan 13, 2025
1 parent ffe6c55 commit d68e842
Show file tree
Hide file tree
Showing 10 changed files with 259 additions and 55 deletions.
7 changes: 7 additions & 0 deletions pandera/api/ibis/components.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from pandera.api.ibis.types import IbisDtypeInputTypes
from pandera.backends.ibis.register import register_ibis_backends
from pandera.engines import ibis_engine
from pandera.utils import is_regex


class Column(ComponentSchema[ir.Table]):
Expand Down Expand Up @@ -111,6 +112,12 @@ def dtype(self):
def dtype(self, value) -> None:
self._dtype = ibis_engine.Engine.dtype(value) if value else None

@property
def selector(self):
    """Column-matching selector for this component.

    When the column is flagged as a regex matcher but its declared name is
    a plain (non-regex) string, anchor the name so it only matches exactly;
    otherwise return the name untouched.
    """
    name_is_plain = self.name is not None and not is_regex(self.name)
    if name_is_plain and self.regex:
        return f"^{self.name}$"
    return self.name

def set_name(self, name: str):
"""Set or modify the name of a column object.
Expand Down
25 changes: 25 additions & 0 deletions pandera/backends/ibis/builtin_checks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
"""Built-in checks for Ibis."""

from typing import Any, TypeVar

import ibis.expr.types as ir

from pandera.api.extensions import register_builtin_check
from pandera.api.ibis.types import IbisData

T = TypeVar("T")


@register_builtin_check(
    aliases=["eq"],
    error="equal_to({value})",
)
def equal_to(data: IbisData, value: Any) -> ir.Table:
    """Ensure all elements of a data container equal a certain value.

    :param data: NamedTuple ``IbisData`` containing the table and column name
        for the check. The key to access the table is "table", and the key to
        access the column name is "key".
    :param value: Values in this Ibis data structure must be
        equal to this value.
    """
    column = data.table[data.key]
    return column == value
13 changes: 7 additions & 6 deletions pandera/backends/ibis/checks.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
"""Check backend for Ibis."""

from functools import partial
from typing import Optional

from typing import Optional, Union

import ibis
import ibis.expr.types as ir
Expand Down Expand Up @@ -41,11 +40,13 @@ def aggregate(self, check_obj: ir.Table):
"""Implements aggregation behavior for check object."""
raise NotImplementedError

def preprocess(self, check_obj: ir.Table, key: Optional[str]):
def preprocess(
    self, check_obj: Union[ir.Column, ir.Table], key: Optional[str]
):
    """Preprocesses a check object before applying the check function.

    :param check_obj: Ibis ``Column`` or ``Table`` to be validated.
    :param key: column name of interest, if any.
    :return: the check object as a ``Table``.
    """
    # This handles the case of Column validation by promoting it to
    # a Table with a single column. Table inputs are unaffected.
    return check_obj.as_table()

def apply(self, check_obj: IbisData):
"""Apply the check function to a check object."""
Expand Down
2 changes: 2 additions & 0 deletions pandera/backends/ibis/register.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,12 @@ def register_ibis_backends():
from pandera.api.checks import Check
from pandera.api.ibis.components import Column
from pandera.api.ibis.container import DataFrameSchema
from pandera.backends.ibis import builtin_checks
from pandera.backends.ibis.components import ColumnBackend
from pandera.backends.ibis.container import DataFrameSchemaBackend
from pandera.backends.ibis.checks import IbisCheckBackend

DataFrameSchema.register_backend(ir.Table, DataFrameSchemaBackend)
Column.register_backend(ir.Table, ColumnBackend)
Check.register_backend(ir.Table, IbisCheckBackend)
Check.register_backend(ir.Column, IbisCheckBackend)
6 changes: 3 additions & 3 deletions pandera/backends/pandas/builtin_checks.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""Pandas implementation of built-in checks"""
"""Built-in checks for pandas."""

import sys
import operator
Expand Down Expand Up @@ -282,8 +282,8 @@ def str_length(
) -> PandasData:
"""Ensure that the length of strings is within a specified range.
:param min_value: Minimum length of strings (default: no minimum)
:param max_value: Maximum length of strings (default: no maximum)
:param min_value: Minimum length of strings. No minimum by default.
:param max_value: Maximum length of strings. No maximum by default.
"""
str_len = data.str.len()
if min_value is None and max_value is None:
Expand Down
82 changes: 41 additions & 41 deletions pandera/backends/polars/builtin_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,9 @@
def equal_to(data: PolarsData, value: Any) -> pl.LazyFrame:
"""Ensure all elements of a data container equal a certain value.
:param data: NamedTuple PolarsData contains the dataframe and column name for the check. The keys
to access the dataframe is "dataframe" and column name using "key".
:param value: values in this polars data structure must be
:param data: NamedTuple PolarsData contains the dataframe and column name for the check. The key
to access the dataframe is "dataframe", and the key to access the column name is "key".
:param value: Values in this Polars data structure must be
equal to this value.
"""
return data.lazyframe.select(pl.col(data.key).eq(value))
Expand All @@ -33,9 +33,9 @@ def equal_to(data: PolarsData, value: Any) -> pl.LazyFrame:
def not_equal_to(data: PolarsData, value: Any) -> pl.LazyFrame:
"""Ensure no elements of a data container equals a certain value.
:param data: NamedTuple PolarsData contains the dataframe and column name for the check. The keys
to access the dataframe is "dataframe" and column name using "key".
:param value: This value must not occur in the checked
:param data: NamedTuple PolarsData contains the dataframe and column name for the check. The key
to access the dataframe is "dataframe", and the key to access the column name is "key".
:param value: This value must not occur in the checked data structure.
"""
return data.lazyframe.select(pl.col(data.key).ne(value))

Expand All @@ -49,10 +49,10 @@ def greater_than(data: PolarsData, min_value: Any) -> pl.LazyFrame:
Ensure values of a data container are strictly greater than a minimum
value.
:param data: NamedTuple PolarsData contains the dataframe and column name for the check. The keys
to access the dataframe is "dataframe" and column name using "key".
:param data: NamedTuple PolarsData contains the dataframe and column name for the check. The key
to access the dataframe is "dataframe", and the key to access the column name is "key".
:param min_value: Lower bound to be exceeded. Must be
a type comparable to the dtype of the series datatype of Polars
a type comparable to the dtype of the series datatype of Polars.
"""
return data.lazyframe.select(pl.col(data.key).gt(min_value))

Expand All @@ -64,10 +64,10 @@ def greater_than(data: PolarsData, min_value: Any) -> pl.LazyFrame:
def greater_than_or_equal_to(data: PolarsData, min_value: Any) -> pl.LazyFrame:
"""Ensure all values are greater or equal a certain value.
:param data: NamedTuple PolarsData contains the dataframe and column name for the check. The keys
to access the dataframe is "dataframe" and column name using "key".
:param data: NamedTuple PolarsData contains the dataframe and column name for the check. The key
to access the dataframe is "dataframe", and the key to access the column name is "key".
:param min_value: Allowed minimum value for values of a series. Must be
a type comparable to the dtype of the series datatype of Polars
a type comparable to the dtype of the series datatype of Polars.
"""
return data.lazyframe.select(pl.col(data.key).ge(min_value))

Expand All @@ -79,10 +79,10 @@ def greater_than_or_equal_to(data: PolarsData, min_value: Any) -> pl.LazyFrame:
def less_than(data: PolarsData, max_value: Any) -> pl.LazyFrame:
"""Ensure values of a series are strictly below a maximum value.
:param data: NamedTuple PolarsData contains the dataframe and column name for the check. The keys
to access the dataframe is "dataframe" and column name using "key".
:param data: NamedTuple PolarsData contains the dataframe and column name for the check. The key
to access the dataframe is "dataframe", and the key to access the column name is "key".
:param max_value: All elements of a series must be strictly smaller
than this. Must be a type comparable to the dtype of the series datatype of Polars
than this. Must be a type comparable to the dtype of the series datatype of Polars.
"""
return data.lazyframe.select(pl.col(data.key).lt(max_value))

Expand All @@ -94,8 +94,8 @@ def less_than(data: PolarsData, max_value: Any) -> pl.LazyFrame:
def less_than_or_equal_to(data: PolarsData, max_value: Any) -> pl.LazyFrame:
"""Ensure values of a series are strictly below a maximum value.
:param data: NamedTuple PolarsData contains the dataframe and column name for the check. The keys
to access the dataframe is "dataframe" and column name using "key".
:param data: NamedTuple PolarsData contains the dataframe and column name for the check. The key
to access the dataframe is "dataframe", and the key to access the column name is "key".
:param max_value: Upper bound not to be exceeded. Must be a type comparable to the dtype of the
series datatype of Polars
"""
Expand All @@ -118,8 +118,8 @@ def in_range(
Both endpoints must be a type comparable to the dtype of the
series datatype of Polars
:param data: NamedTuple PolarsData contains the dataframe and column name for the check. The keys
to access the dataframe is "dataframe" and column name using "key".
:param data: NamedTuple PolarsData contains the dataframe and column name for the check. The key
to access the dataframe is "dataframe", and the key to access the column name is "key".
:param min_value: Left / lower endpoint of the interval.
:param max_value: Right / upper endpoint of the interval. Must not be
smaller than min_value.
Expand Down Expand Up @@ -150,8 +150,8 @@ def isin(data: PolarsData, allowed_values: Iterable) -> pl.LazyFrame:
in allowed_values at least once can meet this condition. If you
want to check for substrings use :meth:`Check.str_contains`.
:param data: NamedTuple PolarsData contains the dataframe and column name for the check. The keys
to access the dataframe is "dataframe" and column name using "key".
:param data: NamedTuple PolarsData contains the dataframe and column name for the check. The key
to access the dataframe is "dataframe", and the key to access the column name is "key".
:param allowed_values: The set of allowed values. May be any iterable.
"""
return data.lazyframe.select(pl.col(data.key).is_in(allowed_values))
Expand All @@ -169,7 +169,7 @@ def notin(data: PolarsData, forbidden_values: Iterable) -> pl.LazyFrame:
design.
:param data: NamedTuple PolarsData contains the dataframe and column name for the check. The keys
to access the dataframe is "dataframe" and column name using "key".
to access the dataframe is "dataframe" and column name using "key".
:param forbidden_values: The set of values which should not occur. May
be any iterable.
"""
Expand All @@ -187,9 +187,9 @@ def str_matches(
) -> pl.LazyFrame:
"""Ensure that string starts with a match of a regular expression pattern.
:param data: NamedTuple PolarsData contains the dataframe and column name for the check. The keys
to access the dataframe is "dataframe" and column name using "key".
:param pattern: Regular expression pattern to use for matching
:param data: NamedTuple PolarsData contains the dataframe and column name for the check. The key
to access the dataframe is "dataframe", and the key to access the column name is "key".
:param pattern: Regular expression pattern to use for matching.
"""
pattern = pattern.pattern if isinstance(pattern, re.Pattern) else pattern
if not pattern.startswith("^"):
Expand All @@ -208,9 +208,9 @@ def str_contains(
) -> pl.LazyFrame:
"""Ensure that a pattern can be found in the string.
:param data: NamedTuple PolarsData contains the dataframe and column name for the check. The keys
to access the dataframe is "dataframe" and column name using "key".
:param pattern: Regular expression pattern to use for searching
:param data: NamedTuple PolarsData contains the dataframe and column name for the check. The key
to access the dataframe is "dataframe", and the key to access the column name is "key".
:param pattern: Regular expression pattern to use for searching.
"""

pattern = pattern.pattern if isinstance(pattern, re.Pattern) else pattern
Expand All @@ -225,9 +225,9 @@ def str_contains(
def str_startswith(data: PolarsData, string: str) -> pl.LazyFrame:
"""Ensure that all values start with a certain string.
:param data: NamedTuple PolarsData contains the dataframe and column name for the check. The keys
to access the dataframe is "dataframe" and column name using "key".
:param string: String all values should start with
:param data: NamedTuple PolarsData contains the dataframe and column name for the check. The key
to access the dataframe is "dataframe", and the key to access the column name is "key".
:param string: String all values should start with.
"""

return data.lazyframe.select(pl.col(data.key).str.starts_with(string))
Expand All @@ -237,9 +237,9 @@ def str_startswith(data: PolarsData, string: str) -> pl.LazyFrame:
def str_endswith(data: PolarsData, string: str) -> pl.LazyFrame:
"""Ensure that all values end with a certain string.
:param data: NamedTuple PolarsData contains the dataframe and column name for the check. The keys
to access the dataframe is "dataframe" and column name using "key".
:param string: String all values should end with
:param data: NamedTuple PolarsData contains the dataframe and column name for the check. The key
to access the dataframe is "dataframe", and the key to access the column name is "key".
:param string: String all values should end with.
"""
return data.lazyframe.select(pl.col(data.key).str.ends_with(string))

Expand All @@ -254,10 +254,10 @@ def str_length(
) -> pl.LazyFrame:
"""Ensure that the length of strings is within a specified range.
:param data: NamedTuple PolarsData contains the dataframe and column name for the check. The keys
to access the dataframe is "dataframe" and column name using "key".
:param min_value: Minimum length of strings (including) (default: no minimum)
:param max_value: Maximum length of strings (including) (default: no maximum)
:param data: NamedTuple PolarsData contains the dataframe and column name for the check. The key
to access the dataframe is "dataframe", and the key to access the column name is "key".
:param min_value: Minimum length of strings (inclusive). (default: no minimum)
:param max_value: Maximum length of strings (inclusive). (default: no maximum)
"""
if min_value is None and max_value is None:
raise ValueError(
Expand Down Expand Up @@ -285,9 +285,9 @@ def unique_values_eq(data: PolarsData, values: Iterable) -> bool:
In contrast with :func:`isin`, this check makes sure that all the items
in the ``values`` iterable are contained within the series.
:param data: NamedTuple PolarsData contains the dataframe and column name for the check. The keys
to access the dataframe is "dataframe" and column name using "key".
:param values: The set of values that must be present. May be any iterable.
:param data: NamedTuple PolarsData contains the dataframe and column name for the check. The key
to access the dataframe is "dataframe", and the key to access the column name is "key".
:param values: The set of values that must be present. May be any iterable.
"""

return (
Expand Down
2 changes: 0 additions & 2 deletions pandera/backends/polars/checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,6 @@ def aggregate(self, check_obj: pl.LazyFrame):

def preprocess(self, check_obj: pl.LazyFrame, key: Optional[str]):
    """Preprocesses a check object before applying the check function.

    :param check_obj: ``LazyFrame`` to be validated.
    :param key: column name of interest, if any.
    :return: the check object unchanged — LazyFrame inputs need no conversion.
    """
    return check_obj

def apply(self, check_obj: PolarsData):
Expand Down
2 changes: 1 addition & 1 deletion pandera/backends/pyspark/builtin_checks.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""PySpark implementation of built-in checks"""
"""Built-in checks for PySpark."""

from typing import Any, Iterable, TypeVar

Expand Down
Loading

0 comments on commit d68e842

Please sign in to comment.