Skip to content

Commit

Permalink
TEST-modin-project#6016: make sure eval_general doesn't expect exce…
Browse files Browse the repository at this point in the history
…ptions by default (modin-project#6954)

Signed-off-by: Anatoly Myachev <[email protected]>
  • Loading branch information
anmyachev authored Mar 14, 2024
1 parent a9bdcbe commit c753436
Show file tree
Hide file tree
Showing 10 changed files with 162 additions and 90 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -457,7 +457,10 @@ def _read_csv_check_support(
+ "'infer' header values",
)
if isinstance(parse_dates, list) and not set(parse_dates).issubset(names):
raise ValueError("Missing column provided to 'parse_dates'")
missing_columns = set(parse_dates) - set(names)
raise ValueError(
f"Missing column provided to 'parse_dates': '{', '.join(missing_columns)}'"
)

empty_pandas_df = pandas.read_csv(
**dict(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -296,6 +296,7 @@ def test_read_csv_datetime(
engine,
parse_dates,
names,
request,
):
parse_dates_unsupported = isinstance(parse_dates, dict) or (
isinstance(parse_dates, list)
Expand All @@ -311,9 +312,27 @@ def test_read_csv_datetime(
skip_exc_type_check = parse_dates_unsupported and engine == "arrow"
if skip_exc_type_check:
pytest.xfail(reason="https://github.com/modin-project/modin/issues/7012")

raising_exceptions = None
if "names1-parse_dates2" in request.node.callspec.id:
raising_exceptions = ValueError(
"Missing column provided to 'parse_dates': 'col2'"
)
elif (
"names1-parse_dates5-None" in request.node.callspec.id
or "names1-parse_dates4-None" in request.node.callspec.id
):
raising_exceptions = ValueError(
"Missing column provided to 'parse_dates': 'col2, col3'"
)
elif "None-parse_dates3" in request.node.callspec.id:
raising_exceptions = ValueError(
"Missing column provided to 'parse_dates': 'c2'"
)
eval_io(
fn_name="read_csv",
md_extra_kwargs={"engine": engine},
raising_exceptions=raising_exceptions,
# read_csv kwargs
filepath_or_buffer=pytest.csvs_names["test_read_csv_regular"],
parse_dates=parse_dates,
Expand Down Expand Up @@ -509,7 +528,7 @@ def test_dup_names(self, names):
["i1", "i2", "a"],
],
)
def test_reset_index(self, names):
def test_reset_index(self, names, request):
index = pandas.MultiIndex.from_tuples(
[(i, j, k) for i in range(2) for j in range(3) for k in range(4)],
names=names,
Expand All @@ -519,7 +538,12 @@ def applier(lib):
df = lib.DataFrame(self.data, index=index) + 1
return df.reset_index()

eval_general(pd, pandas, applier)
raising_exceptions = None
if "names3" in request.node.callspec.id:
raising_exceptions = ValueError("cannot insert i1, already exists")
elif "names4" in request.node.callspec.id:
raising_exceptions = ValueError("cannot insert a, already exists")
eval_general(pd, pandas, applier, raising_exceptions=raising_exceptions)

@pytest.mark.parametrize("is_multiindex", [True, False])
def test_reset_index_multicolumns(self, is_multiindex):
Expand Down Expand Up @@ -1327,7 +1351,7 @@ def groupby(df, **kwargs):
@pytest.mark.parametrize("n", [10, -10])
@pytest.mark.parametrize("invert", [True, False])
@pytest.mark.parametrize("select", [True, False])
@pytest.mark.parametrize("ascending", [None, True, False])
@pytest.mark.parametrize("ascending", [True, False])
def test_head_tail(self, op, n, invert, select, ascending):
def head(df, **kwargs):
if invert:
Expand Down
4 changes: 4 additions & 0 deletions modin/experimental/pandas/test/test_io_exp.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,9 @@ def _pandas_read_csv_glob(path, storage_options):
).reset_index(drop=True)
return pandas_df

raising_exceptions = None
if "anon" in storage_options_extra:
raising_exceptions = PermissionError("Forbidden")
eval_general(
pd,
pandas,
Expand All @@ -237,6 +240,7 @@ def _pandas_read_csv_glob(path, storage_options):
else _pandas_read_csv_glob(s3_path, **kwargs)
),
storage_options=s3_storage_options | storage_options_extra,
raising_exceptions=raising_exceptions,
)


Expand Down
54 changes: 50 additions & 4 deletions modin/pandas/test/dataframe/test_binary.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@
@pytest.mark.parametrize(
"other",
[
lambda df: 4,
lambda df, axis: 4,
lambda df, axis: df.iloc[0] if axis == "columns" else list(df[df.columns[0]]),
lambda df, axis: {
label: idx + 1
Expand Down Expand Up @@ -114,14 +114,19 @@ def test___rdivmod__():
*("truediv", "rtruediv", "mul", "rmul", "floordiv", "rfloordiv"),
],
)
def test_math_functions_fill_value(other, fill_value, op):
def test_math_functions_fill_value(other, fill_value, op, request):
data = test_data["int_data"]
modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data)

raising_exceptions = None
if "check_different_index" in request.node.callspec.id and fill_value == 3.0:
raising_exceptions = NotImplementedError("fill_value 3.0 not supported.")

eval_general(
modin_df,
pandas_df,
lambda df: getattr(df, op)(other(df), axis=0, fill_value=fill_value),
raising_exceptions=raising_exceptions,
# This test causes an empty slice to be generated thus triggering:
# https://github.com/modin-project/modin/issues/5974
comparator_kwargs={"check_dtypes": get_current_execution() != "BaseOnPython"},
Expand Down Expand Up @@ -178,7 +183,7 @@ def test_math_alias(math_op, alias):
@pytest.mark.parametrize("other", ["as_left", 4, 4.0, "a"])
@pytest.mark.parametrize("op", ["eq", "ge", "gt", "le", "lt", "ne"])
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_comparison(data, op, other):
def test_comparison(data, op, other, request):
def operation(df):
df = getattr(df, op)(df if other == "as_left" else other)
if other == "as_left" and StorageFormat.get() == "Hdk":
Expand All @@ -187,9 +192,20 @@ def operation(df):
df = df.sort_index(axis=1)
return df

raising_exceptions = None
if "int_data" in request.node.callspec.id and other == "a":
pytest.xfail(reason="https://github.com/modin-project/modin/issues/7019")
elif "float_nan_data" in request.node.callspec.id and other == "a":
raising_exceptions = TypeError(
"Invalid comparison between dtype=float64 and str"
)
if StorageFormat.get() == "Hdk":
pytest.xfail(reason="https://github.com/modin-project/modin/issues/7019")

eval_general(
*create_test_dfs(data),
operation=operation,
raising_exceptions=raising_exceptions,
)


Expand Down Expand Up @@ -344,6 +360,9 @@ def test_mismatched_row_partitions(is_idx_aligned, op_type, is_more_other_partit
lambda df: (
df / modin_df1.a if isinstance(df, pd.DataFrame) else df / pandas_df1.a
),
raising_exceptions=ValueError(
"cannot reindex on an axis with duplicate labels"
),
)
return

Expand Down Expand Up @@ -492,13 +511,40 @@ def test_non_commutative_multiply():
pytest.param(3.5, id="float scalar"),
],
)
def test_arithmetic_with_tricky_dtypes(val1, val2, op):
def test_arithmetic_with_tricky_dtypes(val1, val2, op, request):
modin_df1, pandas_df1 = create_test_dfs(val1)
modin_df2, pandas_df2 = (
create_test_dfs(val2) if isinstance(val2, list) else (val2, val2)
)

raising_exceptions = None
if (
"bool-bool" in request.node.callspec.id
or "bool scalar-bool" in request.node.callspec.id
) and op in [
"pow",
"rpow",
"truediv",
"rtruediv",
"floordiv",
"rfloordiv",
]:
op_name = op[1:] if op.startswith("r") else op
raising_exceptions = NotImplementedError(
f"operator '{op_name}' not implemented for bool dtypes"
)
elif (
"bool-bool" in request.node.callspec.id
or "bool scalar-bool" in request.node.callspec.id
) and op in ["sub", "rsub"]:
raising_exceptions = TypeError(
"numpy boolean subtract, the `-` operator, is not supported, "
+ "use the bitwise_xor, the `^` operator, or the logical_xor function instead."
)

eval_general(
(modin_df1, modin_df2),
(pandas_df1, pandas_df2),
lambda dfs: getattr(dfs[0], op)(dfs[1]),
raising_exceptions=raising_exceptions,
)
11 changes: 6 additions & 5 deletions modin/pandas/test/dataframe/test_join_sort.py
Original file line number Diff line number Diff line change
Expand Up @@ -384,6 +384,7 @@ def test_merge(test_data, test_data2):
pandas_df,
lambda df: df.merge(ms if isinstance(df, pd.DataFrame) else ps),
comparator=comparator,
raising_exceptions=ValueError("Cannot merge a Series without a name"),
)

# merge a Series with a name
Expand Down Expand Up @@ -538,9 +539,7 @@ def test_merge_on_single_index(left_index, right_index):


@pytest.mark.parametrize("axis", [0, 1])
@pytest.mark.parametrize(
"ascending", bool_arg_values, ids=arg_keys("ascending", bool_arg_keys)
)
@pytest.mark.parametrize("ascending", [False, True])
@pytest.mark.parametrize("na_position", ["first", "last"], ids=["first", "last"])
def test_sort_index(axis, ascending, na_position):
data = test_data["float_nan_data"]
Expand Down Expand Up @@ -620,8 +619,10 @@ def test_sort_multiindex(sort_remaining):
@pytest.mark.parametrize("axis", axis_values, ids=axis_keys)
@pytest.mark.parametrize(
"ascending",
bool_arg_values + ["list_first_True", "list_first_False"],
ids=arg_keys("ascending", bool_arg_keys + ["list_first_True", "list_first_False"]),
[False, True] + ["list_first_True", "list_first_False"],
ids=arg_keys(
"ascending", ["False", "True"] + ["list_first_True", "list_first_False"]
),
)
@pytest.mark.parametrize(
"inplace", bool_arg_values, ids=arg_keys("inplace", bool_arg_keys)
Expand Down
2 changes: 1 addition & 1 deletion modin/pandas/test/dataframe/test_map_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -1333,7 +1333,7 @@ def test_insert(data):
eval_insert(
pd.DataFrame(columns=list("ab")),
pandas.DataFrame(columns=list("ab")),
col=lambda df: df.columns[0],
col="Series insert",
value=lambda df: df[df.columns[0]],
)
eval_insert(
Expand Down
50 changes: 30 additions & 20 deletions modin/pandas/test/dataframe/test_udf.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
import numpy as np
import pandas
import pytest
from pandas._libs.lib import no_default
from pandas.core.dtypes.common import is_list_like

import modin.pandas as pd
Expand Down Expand Up @@ -60,13 +59,13 @@
def test_agg_dict():
md_df, pd_df = create_test_dfs(test_data_values[0])
agg_dict = {pd_df.columns[0]: "sum", pd_df.columns[-1]: ("sum", "count")}
eval_general(md_df, pd_df, lambda df: df.agg(agg_dict), raising_exceptions=True)
eval_general(md_df, pd_df, lambda df: df.agg(agg_dict))

agg_dict = {
"new_col1": (pd_df.columns[0], "sum"),
"new_col2": (pd_df.columns[-1], "count"),
}
eval_general(md_df, pd_df, lambda df: df.agg(**agg_dict), raising_exceptions=True)
eval_general(md_df, pd_df, lambda df: df.agg(**agg_dict))


@pytest.mark.parametrize("axis", [0, 1])
Expand All @@ -76,10 +75,19 @@ def test_agg_dict():
ids=agg_func_keys + agg_func_except_keys,
)
@pytest.mark.parametrize("op", ["agg", "apply"])
def test_agg_apply(axis, func, op):
def test_agg_apply(axis, func, op, request):
raising_exceptions = None
if "sum sum" in request.node.callspec.id:
raising_exceptions = pandas.errors.SpecificationError(
"Function names must be unique if there is no new column names assigned"
)
elif "should raise AssertionError" in request.node.callspec.id:
# FIXME: https://github.com/modin-project/modin/issues/7031
raising_exceptions = False
eval_general(
*create_test_dfs(test_data["float_nan_data"]),
lambda df: getattr(df, op)(func, axis),
raising_exceptions=raising_exceptions,
)


Expand All @@ -90,10 +98,19 @@ def test_agg_apply(axis, func, op):
ids=agg_func_keys + agg_func_except_keys,
)
@pytest.mark.parametrize("op", ["agg", "apply"])
def test_agg_apply_axis_names(axis, func, op):
def test_agg_apply_axis_names(axis, func, op, request):
raising_exceptions = None
if "sum sum" in request.node.callspec.id:
raising_exceptions = pandas.errors.SpecificationError(
"Function names must be unique if there is no new column names assigned"
)
elif "should raise AssertionError" in request.node.callspec.id:
# FIXME: https://github.com/modin-project/modin/issues/7031
raising_exceptions = False
eval_general(
*create_test_dfs(test_data["int_data"]),
lambda df: getattr(df, op)(func, axis),
raising_exceptions=raising_exceptions,
)


Expand Down Expand Up @@ -130,23 +147,22 @@ def test_apply_key_error(func):
eval_general(
*create_test_dfs(test_data["int_data"]),
lambda df: df.apply({"row": func}, axis=1),
raising_exceptions=KeyError("Column(s) ['row'] do not exist"),
)


@pytest.mark.parametrize("axis", [0, 1])
@pytest.mark.parametrize("level", [no_default, None, -1, 0, 1])
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@pytest.mark.parametrize("func", ["kurt", "count", "sum", "mean", "all", "any"])
def test_apply_text_func_with_level(level, data, func, axis):
func_kwargs = {"level": level, "axis": axis}
def test_apply_text_func(data, func, axis):
func_kwargs = {"axis": axis}
rows_number = len(next(iter(data.values()))) # length of the first data column
level_0 = np.random.choice([0, 1, 2], rows_number)
level_1 = np.random.choice([3, 4, 5], rows_number)
index = pd.MultiIndex.from_arrays([level_0, level_1])

eval_general(
pd.DataFrame(data, index=index),
pandas.DataFrame(data, index=index),
*create_test_dfs(data, index=index),
lambda df, *args, **kwargs: df.apply(func, *args, **kwargs),
**func_kwargs,
)
Expand Down Expand Up @@ -448,8 +464,6 @@ def test_query_named_index():
eval_general(
*(df.set_index("col1") for df in create_test_dfs(test_data["int_data"])),
lambda df: df.query("col1 % 2 == 0 | col3 % 2 == 1"),
# work around https://github.com/modin-project/modin/issues/6016
raising_exceptions=(Exception,),
)


Expand All @@ -460,8 +474,6 @@ def test_query_named_multiindex():
for df in create_test_dfs(test_data["int_data"])
),
lambda df: df.query("col1 % 2 == 1 | col3 % 2 == 1"),
# work around https://github.com/modin-project/modin/issues/6016
raising_exceptions=(Exception,),
)


Expand All @@ -474,8 +486,6 @@ def make_df(without_index):
eval_general(
*(make_df(df) for df in create_test_dfs(test_data["int_data"])),
lambda df: df.query("ilevel_0 % 2 == 0 | ilevel_1 % 2 == 1 | col4 % 2 == 1"),
# work around https://github.com/modin-project/modin/issues/6016
raising_exceptions=(Exception,),
)


Expand Down Expand Up @@ -514,9 +524,9 @@ def test_query_with_element_access_issue_4580(engine):

@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@pytest.mark.parametrize(
"func",
agg_func_values + agg_func_except_values,
ids=agg_func_keys + agg_func_except_keys,
"func", [lambda x: x + 1, [np.sqrt, np.exp]], ids=["lambda", "list_udfs"]
)
def test_transform(data, func):
def test_transform(data, func, request):
if "list_udfs" in request.node.callspec.id:
pytest.xfail(reason="https://github.com/modin-project/modin/issues/6998")
eval_general(*create_test_dfs(data), lambda df: df.transform(func))
Loading

0 comments on commit c753436

Please sign in to comment.