Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix failing tests with Pandas 1.3.0 (was: Add logic to deal with ABCIndexClass being renamed to ABCIndex) #218

Merged
merged 8 commits into from
Jul 7, 2021
49 changes: 44 additions & 5 deletions text_extensions_for_pandas/array/span.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,14 @@
from memoized_property import memoized_property
# noinspection PyProtectedMember
from pandas.api.types import is_bool_dtype
from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries
from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries
try:
    from pandas.core.dtypes.generic import ABCIndexClass
except ImportError:
    # ABCIndexClass was renamed to ABCIndex in Pandas 1.3; alias it on import
    # so the rest of this module can keep using the old name.
    # noinspection PyUnresolvedReferences
    from pandas.core.dtypes.generic import ABCIndex as ABCIndexClass
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You could do from pandas.core.dtypes.generic import ABCIndex as ABCIndexClass

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, I should have thought of that. Will put in a follow-on PR with that edit.

from pandas.core.indexers import check_array_indexer

# Internal imports
Expand Down Expand Up @@ -319,6 +326,7 @@ def __from_arrow__(self, extension_array):
SpanArray.
"""
from text_extensions_for_pandas.array.arrow_conversion import arrow_to_span

return arrow_to_span(extension_array)


Expand Down Expand Up @@ -862,15 +870,46 @@ def is_single_document(self) -> bool:
:return: True if there is at least one span in the array and every span is over the
same target text.
"""
# NOTE: For legacy reasons, this method is currently inconsistent with the method
# by the same name in TokenSpanArray. TokenSpanArray.is_single_document() returns
# True on an empty array, while SpanArray.is_single_document() returns false.
if len(self) == 0:
# If there are zero spans, then there are zero documents.
return False
elif self._string_table.num_things == 1:
return True
# Only one string; make sure that this array has a non-null value
for b in self._begins:
if b != Span.NULL_OFFSET_VALUE:
return True
# All nulls --> zero spans
return False
else:
# More than one string in the StringTable and at least one span. Check whether
# every span has the same text ID.
return not np.any(self._text_ids[0] != self._text_ids)
# More than one string in the StringTable and at least one span.
return self._is_single_document_slow_path()

def _is_single_document_slow_path(self) -> bool:
    """Reliable (scan-based) check of whether every span in this array is over
    the same document, ignoring null spans.

    :return: True if all non-null spans share one text ID; False when the
        array holds only nulls (i.e. zero documents).
    """
    # Locate the text ID of the first non-null row.
    reference_id = None
    for begin, text_id in zip(self._begins, self._text_ids):
        if begin != Span.NULL_OFFSET_VALUE:
            reference_id = text_id
            break
    if reference_id is None:
        # Every row is null --> no documents at all.
        return False
    # There is a second document exactly when some non-null row carries a
    # text ID different from the reference row's ID.
    row_is_not_null = np.not_equal(self._begins, Span.NULL_OFFSET_VALUE)
    id_differs = np.not_equal(self._text_ids, reference_id)
    return not np.any(np.logical_and(row_is_not_null, id_differs))

def split_by_document(self) -> List["SpanArray"]:
"""
Expand Down
34 changes: 30 additions & 4 deletions text_extensions_for_pandas/array/tensor.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,14 @@
import numpy as np
import pandas as pd
from pandas.compat import set_function_name
from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries
from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries
try:
    from pandas.core.dtypes.generic import ABCIndexClass
except ImportError:
    # ABCIndexClass was renamed to ABCIndex in Pandas 1.3; alias it on import
    # so the rest of this module can keep using the old name.
    # noinspection PyUnresolvedReferences
    from pandas.core.dtypes.generic import ABCIndex as ABCIndexClass
from pandas.core.indexers import check_array_indexer, validate_indices

""" Begin Patching of ExtensionArrayFormatter """
Expand Down Expand Up @@ -342,7 +349,12 @@ def isna(self) -> np.array:
for information about this method.
"""
if self._tensor.dtype.type is np.object_:
return self._tensor == None
# Avoid comparing with __eq__ because the elements of the tensor may do
# something funny with that operation.
result_list = [
self._tensor[i] is None for i in range(len(self))
]
return np.array(result_list, dtype=bool)
elif self._tensor.dtype.type is np.str_:
return np.all(self._tensor == "", axis=-1)
else:
Expand Down Expand Up @@ -475,6 +487,11 @@ def astype(self, dtype, copy=True):
return dtype.construct_array_type()._from_sequence(values, copy=False)
else:
return values
elif pd.api.types.is_object_dtype(dtype):
# Interpret astype(object) as "cast to an array of numpy arrays"
values = np.empty(len(self), dtype=object)
for i in range(len(self)):
values[i] = self._tensor[i]
else:
values = self._tensor.astype(dtype, copy=copy)
return values
Expand Down Expand Up @@ -516,15 +533,24 @@ def __getitem__(self, item) -> Union["TensorArray", "TensorElement"]:
See docstring in `Extension Array` class in `pandas/core/arrays/base.py`
for information about this method.
"""
# Return scalar if single value is selected, a TensorElement for single array element,
# or TensorArray for slice
# Return scalar if single value is selected, a TensorElement for single array
# element, or TensorArray for slice
if isinstance(item, int):
value = self._tensor[item]
if np.isscalar(value):
return value
else:
return TensorElement(value)
else:
# BEGIN workaround for Pandas issue #42430
if (pd.__version__ == "1.3.0" and isinstance(item, tuple) and len(item) > 1
and item[0] == Ellipsis):
if len(item) > 2:
# Hopefully this case is not possible, but can't be sure
raise ValueError(f"Workaround Pandas issue #42430 not implemented "
f"for tuple length > 2")
item = item[1]
# END workaround for issue #42430
if isinstance(item, TensorArray):
item = np.asarray(item)
item = check_array_indexer(self, item)
Expand Down
17 changes: 15 additions & 2 deletions text_extensions_for_pandas/array/test_tensor.py
Original file line number Diff line number Diff line change
Expand Up @@ -1015,7 +1015,11 @@ def test_reindex(self, data, na_value):


class TestPandasSetitem(base.BaseSetitemTests):
    # test_setitem_series is disabled until Pandas issue #42437 is fixed;
    # Text Extensions for Pandas issue #221 tracks a workaround.
    @pytest.mark.skip(reason="See Pandas issue #42437")
    def test_setitem_series(self, data, full_indexer):
        super().test_setitem_series(data, full_indexer)


class TestPandasMissing(base.BaseMissingTests):
Expand Down Expand Up @@ -1047,15 +1051,24 @@ def test_arith_series_with_scalar(self, data, all_arithmetic_operators):
s = pd.Series(data[1:]) # Avoid zero values for div
self.check_opname(s, op_name, s.iloc[0], exc=self.series_scalar_exc)

def test_arith_frame_with_scalar(self, data, all_arithmetic_operators):
    """Override of the base extension test that leaves out element 0 of the
    fixture data so that division operators never see a zero divisor."""
    # frame & scalar
    frame = pd.DataFrame({"A": data[1:]})  # skip data[0] to avoid div by zero
    self.check_opname(
        frame, all_arithmetic_operators, data[0], exc=self.frame_scalar_exc
    )

def test_arith_series_with_array(self, data, all_arithmetic_operators):
""" Override because creates Series from list of TensorElements as dtype=object."""
""" Override because creates Series from list of TensorElements as
dtype=object."""
# ndarray & other series
op_name = all_arithmetic_operators
s = pd.Series(data[1:]) # Avoid zero values for div
self.check_opname(
s, op_name, pd.Series([s.iloc[0]] * len(s), dtype=TensorDtype()), exc=self.series_array_exc
)


@pytest.mark.skip(reason="TensorArray does not error on ops")
def test_error(self, data, all_arithmetic_operators):
# other specific errors tested in the TensorArray specific tests
Expand Down
39 changes: 35 additions & 4 deletions text_extensions_for_pandas/array/token_span.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,15 @@
from memoized_property import memoized_property
# noinspection PyProtectedMember
from pandas.api.types import is_bool_dtype
from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries
from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries
try:
    from pandas.core.dtypes.generic import ABCIndexClass
except ImportError:
    # ABCIndexClass was renamed to ABCIndex in Pandas 1.3; alias it on import
    # so the rest of this module can keep using the old name.
    # noinspection PyUnresolvedReferences
    from pandas.core.dtypes.generic import ABCIndex as ABCIndexClass

from pandas.core.indexers import check_array_indexer

from text_extensions_for_pandas.array.span import (
Expand Down Expand Up @@ -130,9 +138,13 @@ def __init__(self, tokens: Any, begin_token: int, end_token: int):
)
if end_token > len(tokens) + 1:
raise ValueError(
f"End token offset of {begin_token} larger than "
f"End token offset of {end_token} larger than "
f"number of tokens + 1 ({len(tokens)} + 1)"
)
if len(tokens) == 0 and begin_token != TokenSpan.NULL_OFFSET_VALUE:
raise ValueError(
f"Tried to create a non-null TokenSpan over an empty list of tokens."
)
if TokenSpan.NULL_OFFSET_VALUE == begin_token:
if TokenSpan.NULL_OFFSET_VALUE != end_token:
raise ValueError(
Expand Down Expand Up @@ -471,6 +483,7 @@ def __setitem__(self, key: Union[int, np.ndarray, list, slice], value: Any) -> N
((isinstance(value, Sequence) and isinstance(value[0], TokenSpan)) or
isinstance(value, TokenSpanArray))):
for k, v in zip(key, value):
self._tokens[k] = v.tokens
self._begin_tokens[k] = v.begin_token
self._end_tokens[k] = v.end_token
else:
Expand Down Expand Up @@ -607,7 +620,8 @@ def isna(self) -> np.array:
See docstring in `ExtensionArray` class in `pandas/core/arrays/base.py`
for information about this method.
"""
return self.nulls_mask
# isna() of an ExtensionArray must return a copy that the caller can scribble on.
return self.nulls_mask.copy()

def copy(self) -> "TokenSpanArray":
"""
Expand Down Expand Up @@ -959,14 +973,31 @@ def is_single_document(self) -> bool:
:return: True if every span in this array is over the same target text
or if there are zero spans in this array.
"""
# NOTE: For legacy reasons, this method is currently inconsistent with the method
# by the same name in SpanArray. TokenSpanArray.is_single_document() returns
# True on an empty array, while SpanArray.is_single_document() returns False.
if len(self) == 0:
# If there are zero spans, we consider there to be one document with the
# document text being whatever is the document text for our tokens.
return True
else:
# More than one tokenization and at least one span. Check whether
# every span has the same text.
return not np.any(self.target_text[0] != self.target_text)

# Find the first text ID that is not NA
first_text_id = None
for b, t in zip(self._begins, self._text_ids):
if b != Span.NULL_OFFSET_VALUE:
first_text_id = t
break
if first_text_id is None:
# Special case: All NAs --> Zero documents
return True
return not np.any(np.logical_and(
# Row is not null...
np.not_equal(self._begins, Span.NULL_OFFSET_VALUE),
# ...and is over a different text than the first row's text ID
np.not_equal(self._text_ids, first_text_id)))

def split_by_document(self) -> List["SpanArray"]:
"""
Expand Down