Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix failing tests with Pandas 1.3.0 (was: Add logic to deal with ABCIndexClass being renamed to ABCIndex) #218

Merged
merged 8 commits into from
Jul 7, 2021
49 changes: 44 additions & 5 deletions text_extensions_for_pandas/array/span.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,14 @@
from memoized_property import memoized_property
# noinspection PyProtectedMember
from pandas.api.types import is_bool_dtype
from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries
from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries
try:
    from pandas.core.dtypes.generic import ABCIndexClass
except ImportError:
    # ABCIndexClass was renamed to ABCIndex in Pandas 1.3; alias it on import
    # so the rest of this module can keep using the old name.
    # noinspection PyUnresolvedReferences
    from pandas.core.dtypes.generic import ABCIndex as ABCIndexClass
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You could do from pandas.core.dtypes.generic import ABCIndex as ABCIndexClass

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, I should have thought of that. Will put in a follow-on PR with that edit.

from pandas.core.indexers import check_array_indexer

# Internal imports
Expand Down Expand Up @@ -319,6 +326,7 @@ def __from_arrow__(self, extension_array):
SpanArray.
"""
from text_extensions_for_pandas.array.arrow_conversion import arrow_to_span

return arrow_to_span(extension_array)


Expand Down Expand Up @@ -862,15 +870,46 @@ def is_single_document(self) -> bool:
:return: True if there is at least one span in the array and every span is over the
same target text.
"""
# NOTE: For legacy reasons, this method is currently inconsistent with the method
# by the same name in TokenSpanArray. TokenSpanArray.is_single_document() returns
# True on an empty array, while SpanArray.is_single_document() returns false.
if len(self) == 0:
# If there are zero spans, then there are zero documents.
return False
elif self._string_table.num_things == 1:
return True
# Only one string; make sure that this array has a non-null value
for b in self._begins:
if b != Span.NULL_OFFSET_VALUE:
return True
# All nulls --> zero spans
return False
else:
# More than one string in the StringTable and at least one span. Check whether
# every span has the same text ID.
return not np.any(self._text_ids[0] != self._text_ids)
# More than one string in the StringTable and at least one span.
return self._is_single_document_slow_path()

def _is_single_document_slow_path(self) -> bool:
    """Reliable (scan-based) check of whether every span in this array is over
    the same document, ignoring null spans.

    :return: True if all non-null spans share one text ID; False when the
        array holds only nulls (i.e. zero documents).
    """
    # Locate the text ID of the first non-null row.
    reference_id = None
    for begin, text_id in zip(self._begins, self._text_ids):
        if begin != Span.NULL_OFFSET_VALUE:
            reference_id = text_id
            break
    if reference_id is None:
        # Every row is null --> no documents at all.
        return False
    # There is a second document exactly when some non-null row carries a
    # text ID different from the reference row's ID.
    row_is_not_null = np.not_equal(self._begins, Span.NULL_OFFSET_VALUE)
    id_differs = np.not_equal(self._text_ids, reference_id)
    return not np.any(np.logical_and(row_is_not_null, id_differs))

def split_by_document(self) -> List["SpanArray"]:
"""
Expand Down
34 changes: 30 additions & 4 deletions text_extensions_for_pandas/array/tensor.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,14 @@
import numpy as np
import pandas as pd
from pandas.compat import set_function_name
from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries
from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries
try:
    from pandas.core.dtypes.generic import ABCIndexClass
except ImportError:
    # ABCIndexClass was renamed to ABCIndex in Pandas 1.3; alias it on import
    # so the rest of this module can keep using the old name.
    # noinspection PyUnresolvedReferences
    from pandas.core.dtypes.generic import ABCIndex as ABCIndexClass
from pandas.core.indexers import check_array_indexer, validate_indices

""" Begin Patching of ExtensionArrayFormatter """
Expand Down Expand Up @@ -342,7 +349,12 @@ def isna(self) -> np.array:
for information about this method.
"""
if self._tensor.dtype.type is np.object_:
return self._tensor == None
# Avoid comparing with __eq__ because the elements of the tensor may do
# something funny with that operation.
result_list = [
self._tensor[i] is None for i in range(len(self))
]
return np.array(result_list, dtype=bool)
elif self._tensor.dtype.type is np.str_:
return np.all(self._tensor == "", axis=-1)
else:
Expand Down Expand Up @@ -475,6 +487,11 @@ def astype(self, dtype, copy=True):
return dtype.construct_array_type()._from_sequence(values, copy=False)
else:
return values
elif pd.api.types.is_object_dtype(dtype):
# Interpret astype(object) as "cast to an array of numpy arrays"
values = np.empty(len(self), dtype=object)
for i in range(len(self)):
values[i] = self._tensor[i]
else:
values = self._tensor.astype(dtype, copy=copy)
return values
Expand Down Expand Up @@ -516,15 +533,24 @@ def __getitem__(self, item) -> Union["TensorArray", "TensorElement"]:
See docstring in `Extension Array` class in `pandas/core/arrays/base.py`
for information about this method.
"""
# Return scalar if single value is selected, a TensorElement for single array element,
# or TensorArray for slice
# Return scalar if single value is selected, a TensorElement for single array
# element, or TensorArray for slice
if isinstance(item, int):
value = self._tensor[item]
if np.isscalar(value):
return value
else:
return TensorElement(value)
else:
# BEGIN workaround for Pandas issue #42430
if (pd.__version__ == "1.3.0" and isinstance(item, tuple) and len(item) > 1
and item[0] == Ellipsis):
if len(item) > 2:
# Hopefully this case is not possible, but can't be sure
raise ValueError(f"Workaround Pandas issue #42430 not implemented "
f"for tuple length > 2")
item = item[1]
# END workaround for issue #42430
if isinstance(item, TensorArray):
item = np.asarray(item)
item = check_array_indexer(self, item)
Expand Down
17 changes: 15 additions & 2 deletions text_extensions_for_pandas/array/test_tensor.py
Original file line number Diff line number Diff line change
Expand Up @@ -1015,7 +1015,11 @@ def test_reindex(self, data, na_value):


class TestPandasSetitem(base.BaseSetitemTests):
    # test_setitem_series is disabled until Pandas issue #42437 is fixed;
    # Text Extensions for Pandas issue #221 tracks a workaround.
    @pytest.mark.skip(reason="See Pandas issue #42437")
    def test_setitem_series(self, data, full_indexer):
        super().test_setitem_series(data, full_indexer)


class TestPandasMissing(base.BaseMissingTests):
Expand Down Expand Up @@ -1047,15 +1051,24 @@ def test_arith_series_with_scalar(self, data, all_arithmetic_operators):
s = pd.Series(data[1:]) # Avoid zero values for div
self.check_opname(s, op_name, s.iloc[0], exc=self.series_scalar_exc)

def test_arith_frame_with_scalar(self, data, all_arithmetic_operators):
    """Override of the base extension test that leaves out element 0 of the
    fixture data so that division operators never see a zero divisor."""
    # frame & scalar
    frame = pd.DataFrame({"A": data[1:]})  # skip data[0] to avoid div by zero
    self.check_opname(
        frame, all_arithmetic_operators, data[0], exc=self.frame_scalar_exc
    )

def test_arith_series_with_array(self, data, all_arithmetic_operators):
""" Override because creates Series from list of TensorElements as dtype=object."""
""" Override because creates Series from list of TensorElements as
dtype=object."""
# ndarray & other series
op_name = all_arithmetic_operators
s = pd.Series(data[1:]) # Avoid zero values for div
self.check_opname(
s, op_name, pd.Series([s.iloc[0]] * len(s), dtype=TensorDtype()), exc=self.series_array_exc
)


@pytest.mark.skip(reason="TensorArray does not error on ops")
def test_error(self, data, all_arithmetic_operators):
# other specific errors tested in the TensorArray specific tests
Expand Down
39 changes: 35 additions & 4 deletions text_extensions_for_pandas/array/token_span.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,15 @@
from memoized_property import memoized_property
# noinspection PyProtectedMember
from pandas.api.types import is_bool_dtype
from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries
from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries
try:
    from pandas.core.dtypes.generic import ABCIndexClass
except ImportError:
    # ABCIndexClass was renamed to ABCIndex in Pandas 1.3; alias it on import
    # so the rest of this module can keep using the old name.
    # noinspection PyUnresolvedReferences
    from pandas.core.dtypes.generic import ABCIndex as ABCIndexClass

from pandas.core.indexers import check_array_indexer

from text_extensions_for_pandas.array.span import (
Expand Down Expand Up @@ -130,9 +138,13 @@ def __init__(self, tokens: Any, begin_token: int, end_token: int):
)
if end_token > len(tokens) + 1:
raise ValueError(
f"End token offset of {begin_token} larger than "
f"End token offset of {end_token} larger than "
f"number of tokens + 1 ({len(tokens)} + 1)"
)
if len(tokens) == 0 and begin_token != TokenSpan.NULL_OFFSET_VALUE:
raise ValueError(
f"Tried to create a non-null TokenSpan over an empty list of tokens."
)
if TokenSpan.NULL_OFFSET_VALUE == begin_token:
if TokenSpan.NULL_OFFSET_VALUE != end_token:
raise ValueError(
Expand Down Expand Up @@ -471,6 +483,7 @@ def __setitem__(self, key: Union[int, np.ndarray, list, slice], value: Any) -> N
((isinstance(value, Sequence) and isinstance(value[0], TokenSpan)) or
isinstance(value, TokenSpanArray))):
for k, v in zip(key, value):
self._tokens[k] = v.tokens
self._begin_tokens[k] = v.begin_token
self._end_tokens[k] = v.end_token
else:
Expand Down Expand Up @@ -607,7 +620,8 @@ def isna(self) -> np.array:
See docstring in `ExtensionArray` class in `pandas/core/arrays/base.py`
for information about this method.
"""
return self.nulls_mask
# isna() of an ExtensionArray must return a copy that the caller can scribble on.
return self.nulls_mask.copy()

def copy(self) -> "TokenSpanArray":
"""
Expand Down Expand Up @@ -959,14 +973,31 @@ def is_single_document(self) -> bool:
:return: True if every span in this array is over the same target text
or if there are zero spans in this array.
"""
# NOTE: For legacy reasons, this method is currently inconsistent with the method
# by the same name in SpanArray. TokenSpanArray.is_single_document() returns
# True on an empty array, while SpanArray.is_single_document() returns False.
if len(self) == 0:
# If there are zero spans, we consider there to be one document with the
# document text being whatever is the document text for our tokens.
return True
else:
# More than one tokenization and at least one span. Check whether
# every span has the same text.
return not np.any(self.target_text[0] != self.target_text)

# Find the first text ID that is not NA
first_text_id = None
for b, t in zip(self._begins, self._text_ids):
if b != Span.NULL_OFFSET_VALUE:
first_text_id = t
break
if first_text_id is None:
# Special case: All NAs --> Zero documents
return True
return not np.any(np.logical_and(
# Row is not null...
np.not_equal(self._begins, Span.NULL_OFFSET_VALUE),
# ...and is over a different text than the first row's text ID
np.not_equal(self._text_ids, first_text_id)))

def split_by_document(self) -> List["SpanArray"]:
"""
Expand Down