Skip to content

Commit 1a002c0

Browse files
authored
Merge pull request #218 from frreiss/branch-abc
Fix failing tests with Pandas 1.3.0 (was: Add logic to deal with ABCIndexClass being renamed to ABCIndex)
2 parents e414fcd + a238051 commit 1a002c0

File tree

4 files changed

+124
-15
lines changed

4 files changed

+124
-15
lines changed

text_extensions_for_pandas/array/span.py

+44-5
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,14 @@
3030
from memoized_property import memoized_property
3131
# noinspection PyProtectedMember
3232
from pandas.api.types import is_bool_dtype
33-
from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries
33+
from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries
34+
try:
35+
from pandas.core.dtypes.generic import ABCIndexClass
36+
except ImportError:
37+
# ABCIndexClass changed to ABCIndex in Pandas 1.3
38+
# noinspection PyUnresolvedReferences
39+
from pandas.core.dtypes.generic import ABCIndex
40+
ABCIndexClass = ABCIndex
3441
from pandas.core.indexers import check_array_indexer
3542

3643
# Internal imports
@@ -319,6 +326,7 @@ def __from_arrow__(self, extension_array):
319326
SpanArray.
320327
"""
321328
from text_extensions_for_pandas.array.arrow_conversion import arrow_to_span
329+
322330
return arrow_to_span(extension_array)
323331

324332

@@ -862,15 +870,46 @@ def is_single_document(self) -> bool:
862870
:return: True if there is at least one span in the array and every span is over the
863871
same target text.
864872
"""
873+
# NOTE: For legacy reasons, this method is currently inconsistent with the method
874+
# by the same name in TokenSpanArray. TokenSpanArray.is_single_document() returns
875+
# True on an empty array, while SpanArray.is_single_document() returns False.
865876
if len(self) == 0:
866877
# If there are zero spans, then there are zero documents.
867878
return False
868879
elif self._string_table.num_things == 1:
869-
return True
880+
# Only one string; make sure that this array has a non-null value
881+
for b in self._begins:
882+
if b != Span.NULL_OFFSET_VALUE:
883+
return True
884+
# All nulls --> zero spans
885+
return False
870886
else:
871-
# More than one string in the StringTable and at least one span. Check whether
872-
# every span has the same text ID.
873-
return not np.any(self._text_ids[0] != self._text_ids)
887+
# More than one string in the StringTable and at least one span.
888+
return self._is_single_document_slow_path()
889+
890+
def _is_single_document_slow_path(self) -> bool:
891+
# Slow but reliable way to test whether everything in this SpanArray is from
892+
# the same document.
893+
# Checks whether every span has the same text ID.
894+
# Ignores NAs when making this comparison.
895+
896+
# First we need to find the first text ID that is not NA
897+
first_text_id = None
898+
for b, t in zip(self._begins, self._text_ids):
899+
if b != Span.NULL_OFFSET_VALUE:
900+
first_text_id = t
901+
break
902+
if first_text_id is None:
903+
# Special case: All NAs --> Zero documents
904+
return False
905+
return not np.any(
906+
np.logical_and(
907+
# Row is not null...
908+
np.not_equal(self._begins, Span.NULL_OFFSET_VALUE),
909+
# ...and is over a different text than the first non-null row's text ID
910+
np.not_equal(self._text_ids, first_text_id),
911+
)
912+
)
874913

875914
def split_by_document(self) -> List["SpanArray"]:
876915
"""

text_extensions_for_pandas/array/tensor.py

+30-4
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,14 @@
2929
import numpy as np
3030
import pandas as pd
3131
from pandas.compat import set_function_name
32-
from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries
32+
from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries
33+
try:
34+
from pandas.core.dtypes.generic import ABCIndexClass
35+
except ImportError:
36+
# ABCIndexClass changed to ABCIndex in Pandas 1.3
37+
# noinspection PyUnresolvedReferences
38+
from pandas.core.dtypes.generic import ABCIndex
39+
ABCIndexClass = ABCIndex
3340
from pandas.core.indexers import check_array_indexer, validate_indices
3441

3542
""" Begin Patching of ExtensionArrayFormatter """
@@ -342,7 +349,12 @@ def isna(self) -> np.array:
342349
for information about this method.
343350
"""
344351
if self._tensor.dtype.type is np.object_:
345-
return self._tensor == None
352+
# Avoid comparing with __eq__ because the elements of the tensor may do
353+
# something funny with that operation.
354+
result_list = [
355+
self._tensor[i] is None for i in range(len(self))
356+
]
357+
return np.array(result_list, dtype=bool)
346358
elif self._tensor.dtype.type is np.str_:
347359
return np.all(self._tensor == "", axis=-1)
348360
else:
@@ -475,6 +487,11 @@ def astype(self, dtype, copy=True):
475487
return dtype.construct_array_type()._from_sequence(values, copy=False)
476488
else:
477489
return values
490+
elif pd.api.types.is_object_dtype(dtype):
491+
# Interpret astype(object) as "cast to an array of numpy arrays"
492+
values = np.empty(len(self), dtype=object)
493+
for i in range(len(self)):
494+
values[i] = self._tensor[i]
478495
else:
479496
values = self._tensor.astype(dtype, copy=copy)
480497
return values
@@ -516,15 +533,24 @@ def __getitem__(self, item) -> Union["TensorArray", "TensorElement"]:
516533
See docstring in `Extension Array` class in `pandas/core/arrays/base.py`
517534
for information about this method.
518535
"""
519-
# Return scalar if single value is selected, a TensorElement for single array element,
520-
# or TensorArray for slice
536+
# Return scalar if single value is selected, a TensorElement for single array
537+
# element, or TensorArray for slice
521538
if isinstance(item, int):
522539
value = self._tensor[item]
523540
if np.isscalar(value):
524541
return value
525542
else:
526543
return TensorElement(value)
527544
else:
545+
# BEGIN workaround for Pandas issue #42430
546+
if (pd.__version__ == "1.3.0" and isinstance(item, tuple) and len(item) > 1
547+
and item[0] == Ellipsis):
548+
if len(item) > 2:
549+
# Hopefully this case is not possible, but can't be sure
550+
raise ValueError(f"Workaround Pandas issue #42430 not implemented "
551+
f"for tuple length > 2")
552+
item = item[1]
553+
# END workaround for issue #42430
528554
if isinstance(item, TensorArray):
529555
item = np.asarray(item)
530556
item = check_array_indexer(self, item)

text_extensions_for_pandas/array/test_tensor.py

+15-2
Original file line numberDiff line numberDiff line change
@@ -1015,7 +1015,11 @@ def test_reindex(self, data, na_value):
10151015

10161016

10171017
class TestPandasSetitem(base.BaseSetitemTests):
1018-
pass
1018+
# Temporarily disabled until Pandas issue #42437 is fixed
1019+
# See Text Extensions for Pandas issue #221 for a workaround.
1020+
@pytest.mark.skip(reason="See Pandas issue #42437")
1021+
def test_setitem_series(self, data, full_indexer):
1022+
super().test_setitem_series(data, full_indexer)
10191023

10201024

10211025
class TestPandasMissing(base.BaseMissingTests):
@@ -1047,15 +1051,24 @@ def test_arith_series_with_scalar(self, data, all_arithmetic_operators):
10471051
s = pd.Series(data[1:]) # Avoid zero values for div
10481052
self.check_opname(s, op_name, s.iloc[0], exc=self.series_scalar_exc)
10491053

1054+
def test_arith_frame_with_scalar(self, data, all_arithmetic_operators):
1055+
""" Override to prevent div by zero warning."""
1056+
# frame & scalar
1057+
op_name = all_arithmetic_operators
1058+
df = pd.DataFrame({"A": data[1:]}) # Avoid zero values for div
1059+
self.check_opname(df, op_name, data[0], exc=self.frame_scalar_exc)
1060+
10501061
def test_arith_series_with_array(self, data, all_arithmetic_operators):
1051-
""" Override because creates Series from list of TensorElements as dtype=object."""
1062+
""" Override because creates Series from list of TensorElements as
1063+
dtype=object."""
10521064
# ndarray & other series
10531065
op_name = all_arithmetic_operators
10541066
s = pd.Series(data[1:]) # Avoid zero values for div
10551067
self.check_opname(
10561068
s, op_name, pd.Series([s.iloc[0]] * len(s), dtype=TensorDtype()), exc=self.series_array_exc
10571069
)
10581070

1071+
10591072
@pytest.mark.skip(reason="TensorArray does not error on ops")
10601073
def test_error(self, data, all_arithmetic_operators):
10611074
# other specific errors tested in the TensorArray specific tests

text_extensions_for_pandas/array/token_span.py

+35-4
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,15 @@
2929
from memoized_property import memoized_property
3030
# noinspection PyProtectedMember
3131
from pandas.api.types import is_bool_dtype
32-
from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries
32+
from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries
33+
try:
34+
from pandas.core.dtypes.generic import ABCIndexClass
35+
except ImportError:
36+
# ABCIndexClass changed to ABCIndex in Pandas 1.3
37+
# noinspection PyUnresolvedReferences
38+
from pandas.core.dtypes.generic import ABCIndex
39+
ABCIndexClass = ABCIndex
40+
3341
from pandas.core.indexers import check_array_indexer
3442

3543
from text_extensions_for_pandas.array.span import (
@@ -130,9 +138,13 @@ def __init__(self, tokens: Any, begin_token: int, end_token: int):
130138
)
131139
if end_token > len(tokens) + 1:
132140
raise ValueError(
133-
f"End token offset of {begin_token} larger than "
141+
f"End token offset of {end_token} larger than "
134142
f"number of tokens + 1 ({len(tokens)} + 1)"
135143
)
144+
if len(tokens) == 0 and begin_token != TokenSpan.NULL_OFFSET_VALUE:
145+
raise ValueError(
146+
f"Tried to create a non-null TokenSpan over an empty list of tokens."
147+
)
136148
if TokenSpan.NULL_OFFSET_VALUE == begin_token:
137149
if TokenSpan.NULL_OFFSET_VALUE != end_token:
138150
raise ValueError(
@@ -471,6 +483,7 @@ def __setitem__(self, key: Union[int, np.ndarray, list, slice], value: Any) -> N
471483
((isinstance(value, Sequence) and isinstance(value[0], TokenSpan)) or
472484
isinstance(value, TokenSpanArray))):
473485
for k, v in zip(key, value):
486+
self._tokens[k] = v.tokens
474487
self._begin_tokens[k] = v.begin_token
475488
self._end_tokens[k] = v.end_token
476489
else:
@@ -607,7 +620,8 @@ def isna(self) -> np.array:
607620
See docstring in `ExtensionArray` class in `pandas/core/arrays/base.py`
608621
for information about this method.
609622
"""
610-
return self.nulls_mask
623+
# isna() of an ExtensionArray must return a copy that the caller can scribble on.
624+
return self.nulls_mask.copy()
611625

612626
def copy(self) -> "TokenSpanArray":
613627
"""
@@ -959,14 +973,31 @@ def is_single_document(self) -> bool:
959973
:return: True if every span in this array is over the same target text
960974
or if there are zero spans in this array.
961975
"""
976+
# NOTE: For legacy reasons, this method is currently inconsistent with the method
977+
# by the same name in SpanArray. TokenSpanArray.is_single_document() returns
978+
# True on an empty array, while SpanArray.is_single_document() returns False.
962979
if len(self) == 0:
963980
# If there are zero spans, we consider there to be one document with the
964981
# document text being whatever is the document text for our tokens.
965982
return True
966983
else:
967984
# More than one tokenization and at least one span. Check whether
968985
# every span has the same text.
969-
return not np.any(self.target_text[0] != self.target_text)
986+
987+
# Find the first text ID that is not NA
988+
first_text_id = None
989+
for b, t in zip(self._begins, self._text_ids):
990+
if b != Span.NULL_OFFSET_VALUE:
991+
first_text_id = t
992+
break
993+
if first_text_id is None:
994+
# Special case: All NAs --> same answer as for an empty array (True)
995+
return True
996+
return not np.any(np.logical_and(
997+
# Row is not null...
998+
np.not_equal(self._begins, Span.NULL_OFFSET_VALUE),
999+
# ...and is over a different text than the first non-null row's text ID
1000+
np.not_equal(self._text_ids, first_text_id)))
9701001

9711002
def split_by_document(self) -> List["SpanArray"]:
9721003
"""

0 commit comments

Comments
 (0)