Skip to content

Commit 0558a3c

Browse files
TomAugspurgerjreback
authored andcommitted
API: DataFrame.sparse accessor (#25682)
1 parent 304d8d4 commit 0558a3c

File tree

7 files changed

+423
-86
lines changed

7 files changed

+423
-86
lines changed

doc/source/reference/frame.rst

+23
Original file line numberDiff line numberDiff line change
@@ -311,6 +311,29 @@ specific plotting methods of the form ``DataFrame.plot.<kind>``.
311311
DataFrame.boxplot
312312
DataFrame.hist
313313

314+
315+
.. _api.frame.sparse:
316+
317+
Sparse Accessor
318+
~~~~~~~~~~~~~~~
319+
320+
Sparse-dtype specific methods and attributes are provided under the
321+
``DataFrame.sparse`` accessor.
322+
323+
.. autosummary::
324+
:toctree: api/
325+
:template: autosummary/accessor_attribute.rst
326+
327+
DataFrame.sparse.density
328+
329+
.. autosummary::
330+
:toctree: api/
331+
332+
DataFrame.sparse.from_spmatrix
333+
DataFrame.sparse.to_coo
334+
DataFrame.sparse.to_dense
335+
336+
314337
Serialization / IO / Conversion
315338
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
316339
.. autosummary::

doc/source/whatsnew/v0.25.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ Other Enhancements
3535
- :meth:`RangeIndex.union` now supports the ``sort`` argument. If ``sort=False`` an unsorted ``Int64Index`` is always returned. ``sort=None`` is the default and returns a monotonically increasing ``RangeIndex`` if possible or a sorted ``Int64Index`` if not (:issue:`24471`)
3636
- :meth:`TimedeltaIndex.intersection` now also supports the ``sort`` keyword (:issue:`24471`)
3737
- :meth:`DataFrame.rename` now supports the ``errors`` argument to raise errors when attempting to rename nonexistent keys (:issue:`13473`)
38+
- Added :ref:`api.frame.sparse` for working with a ``DataFrame`` whose values are sparse (:issue:`25681`)
3839
- :class:`RangeIndex` has gained :attr:`~RangeIndex.start`, :attr:`~RangeIndex.stop`, and :attr:`~RangeIndex.step` attributes (:issue:`25710`)
3940
- :class:`datetime.timezone` objects are now supported as arguments to timezone methods and constructors (:issue:`25065`)
4041
- :meth:`DataFrame.query` and :meth:`DataFrame.eval` now supports quoting column names with backticks to refer to names with spaces (:issue:`6508`)

pandas/core/arrays/sparse.py

+252-11
Original file line numberDiff line numberDiff line change
@@ -697,6 +697,55 @@ def _simple_new(
697697
new._dtype = dtype
698698
return new
699699

700+
@classmethod
def from_spmatrix(cls, data):
    """
    Create a SparseArray from a scipy.sparse matrix.

    .. versionadded:: 0.25.0

    Parameters
    ----------
    data : scipy.sparse.spmatrix
        This should be a SciPy sparse matrix where the size
        of the second dimension is 1. In other words, a
        sparse matrix with a single column. Any format that can be
        converted to CSC is accepted.

    Returns
    -------
    SparseArray

    Raises
    ------
    ValueError
        If `data` does not have exactly one column.

    Examples
    --------
    >>> import scipy.sparse
    >>> mat = scipy.sparse.coo_matrix((4, 1))
    >>> pd.SparseArray.from_spmatrix(mat)
    [0.0, 0.0, 0.0, 0.0]
    Fill: 0.0
    IntIndex
    Indices: array([], dtype=int32)
    """
    length, ncol = data.shape

    if ncol != 1:
        raise ValueError(
            "'data' must have a single column, not '{}'".format(ncol)
        )

    # Our sparse index classes require that the positions be strictly
    # increasing. Normalising to a canonical CSC copy gives row indices
    # that are sorted and free of duplicates, and keeps ``data`` aligned
    # with ``indices`` even when the matrix stores explicit zeros
    # (``nonzero()`` would drop those positions but ``.data`` would not).
    # ``copy=True`` so the in-place canonicalisation below never touches
    # the caller's matrix.
    data = data.tocsc(copy=True)
    data.sum_duplicates()
    arr = data.data
    idx = data.indices

    # The fill value is the zero of the stored dtype, as a Python scalar.
    zero = np.array(0, dtype=arr.dtype).item()
    dtype = SparseDtype(arr.dtype, zero)
    index = IntIndex(length, idx)

    return cls._simple_new(arr, index, dtype)
748+
700749
def __array__(self, dtype=None, copy=True):
701750
fill_value = self.fill_value
702751

@@ -1906,27 +1955,32 @@ def _make_index(length, indices, kind):
19061955
# ----------------------------------------------------------------------------
19071956
# Accessor
19081957

1958+
1959+
class BaseAccessor:
    """
    Common base for the ``.sparse`` accessors.

    Stores the wrapped object on ``_parent`` and then runs ``_validate``
    on it. Subclasses must override ``_validate``; the shared
    ``_validation_msg`` is the message they raise for non-sparse data.
    """

    _validation_msg = "Can only use the '.sparse' accessor with Sparse data."

    def __init__(self, data=None):
        # The parent is stored before validation runs.
        self._parent = data
        self._validate(data)

    def _validate(self, data):
        """Check that `data` is acceptable; must be overridden."""
        raise NotImplementedError
1968+
1969+
19091970
@delegate_names(SparseArray, ['npoints', 'density', 'fill_value',
19101971
'sp_values'],
19111972
typ='property')
1912-
class SparseAccessor(PandasDelegate):
1973+
class SparseAccessor(BaseAccessor, PandasDelegate):
19131974
"""
19141975
Accessor for sparse data, and conversion to and from other sparse matrix data types.
19151976
"""
19161977

1917-
def __init__(self, data=None):
1918-
self._validate(data)
1919-
# Store the Series since we need that for to_coo
1920-
self._parent = data
1921-
1922-
@staticmethod
1923-
def _validate(data):
1978+
def _validate(self, data):
19241979
if not isinstance(data.dtype, SparseDtype):
1925-
msg = "Can only use the '.sparse' accessor with Sparse data."
1926-
raise AttributeError(msg)
1980+
raise AttributeError(self._validation_msg)
19271981

19281982
def _delegate_property_get(self, name, *args, **kwargs):
1929-
return getattr(self._parent.values, name)
1983+
return getattr(self._parent.array, name)
19301984

19311985
def _delegate_method(self, name, *args, **kwargs):
19321986
if name == 'from_coo':
@@ -2040,3 +2094,190 @@ def to_coo(self, row_levels=(0, ), column_levels=(1, ), sort_labels=False):
20402094
column_levels,
20412095
sort_labels=sort_labels)
20422096
return A, rows, columns
2097+
2098+
def to_dense(self):
    """
    Convert a Series from sparse values to dense.

    .. versionadded:: 0.25.0

    Returns
    -------
    Series
        A Series with the same values, index and name as the parent,
        stored as a dense array.

    Examples
    --------
    >>> series = pd.Series(pd.SparseArray([0, 1, 0]))
    >>> series
    0    0
    1    1
    2    0
    dtype: Sparse[int64, 0]

    >>> series.sparse.to_dense()
    0    0
    1    1
    2    0
    dtype: int64
    """
    from pandas import Series

    parent = self._parent
    dense_values = parent.array.to_dense()
    return Series(dense_values, index=parent.index, name=parent.name)
2128+
2129+
2130+
class SparseFrameAccessor(BaseAccessor, PandasDelegate):
    """
    DataFrame accessor for sparse data.

    .. versionadded:: 0.25.0
    """

    def _validate(self, data):
        # Every column must be sparse for the accessor to be usable.
        dtypes = data.dtypes
        if not all(isinstance(t, SparseDtype) for t in dtypes):
            raise AttributeError(self._validation_msg)

    @classmethod
    def from_spmatrix(cls, data, index=None, columns=None):
        """
        Create a new DataFrame from a scipy sparse matrix.

        .. versionadded:: 0.25.0

        Parameters
        ----------
        data : scipy.sparse.spmatrix
            Must be convertible to csc format.
        index, columns : Index, optional
            Row and column labels to use for the resulting DataFrame.
            Defaults to a RangeIndex.

        Returns
        -------
        DataFrame
            Each column of the DataFrame is stored as a
            :class:`SparseArray`.

        Raises
        ------
        ValueError
            If the length of `index` or `columns` does not match the
            shape of `data`.

        Examples
        --------
        >>> import scipy.sparse
        >>> mat = scipy.sparse.eye(3)
        >>> pd.DataFrame.sparse.from_spmatrix(mat)
             0    1    2
        0  1.0  0.0  0.0
        1  0.0  1.0  0.0
        2  0.0  0.0  1.0
        """
        from pandas import DataFrame

        data = data.tocsc()
        index, columns = cls._prep_index(data, index, columns)
        sparrays = [
            SparseArray.from_spmatrix(data[:, i])
            for i in range(data.shape[1])
        ]
        # Build with positional keys and assign the labels afterwards so
        # that duplicate column labels do not collapse in the dict.
        data = dict(enumerate(sparrays))
        result = DataFrame(data, index=index)
        result.columns = columns
        return result

    def to_dense(self):
        """
        Convert a DataFrame with sparse values to dense.

        .. versionadded:: 0.25.0

        Returns
        -------
        DataFrame
            A DataFrame with the same values stored as dense arrays.

        Examples
        --------
        >>> df = pd.DataFrame({"A": pd.SparseArray([0, 1, 0])})
        >>> df.sparse.to_dense()
           A
        0  0
        1  1
        2  0
        """
        from pandas import DataFrame

        # Key by position rather than by label: a label-keyed dict would
        # silently drop all but one of any duplicated column labels.
        data = {
            pos: column.array.to_dense()
            for pos, (_, column) in enumerate(self._parent.items())
        }
        result = DataFrame(data, index=self._parent.index)
        result.columns = self._parent.columns
        return result

    def to_coo(self):
        """
        Return the contents of the frame as a sparse SciPy COO matrix.

        .. versionadded:: 0.25.0

        Returns
        -------
        coo_matrix : scipy.sparse.spmatrix
            If the caller is heterogeneous and contains booleans or objects,
            the result will be of dtype=object. See Notes.

        Raises
        ------
        ImportError
            If SciPy is not installed.
        ValueError
            If any column has a fill value other than 0; positions absent
            from a COO matrix are implicitly 0, so such a column cannot
            be represented faithfully.

        Notes
        -----
        The dtype will be the lowest-common-denominator type (implicit
        upcasting); that is to say if the dtypes (even of numeric types)
        are mixed, the one that accommodates all will be chosen.

        e.g. If the dtypes are float16 and float32, dtype will be upcast to
        float32. By numpy.find_common_type convention, mixing int64 and
        uint64 will result in a float64 dtype.
        """
        try:
            from scipy.sparse import coo_matrix
        except ImportError as err:
            raise ImportError('Scipy is not installed') from err

        dtype = find_common_type(self._parent.dtypes)
        if isinstance(dtype, SparseDtype):
            dtype = dtype.subtype

        cols, rows, datas = [], [], []
        # Iterate (label, column) pairs positionally; looking columns up
        # by label would return a DataFrame for duplicated labels.
        for col, (_, s) in enumerate(self._parent.items()):
            sp_arr = s.array
            if sp_arr.fill_value != 0:
                raise ValueError(
                    'fill value must be 0 when converting to COO matrix'
                )
            row = sp_arr.sp_index.to_int_index().indices
            cols.append(np.repeat(col, len(row)))
            rows.append(row)
            datas.append(sp_arr.sp_values.astype(dtype, copy=False))

        cols = np.concatenate(cols)
        rows = np.concatenate(rows)
        datas = np.concatenate(datas)
        return coo_matrix((datas, (rows, cols)), shape=self._parent.shape)

    @property
    def density(self) -> float:
        """
        Ratio of non-sparse points to total (dense) data points
        represented in the DataFrame.

        Computed as the mean of the per-column densities.
        """
        return np.mean([column.array.density
                        for _, column in self._parent.items()])

    @staticmethod
    def _prep_index(data, index, columns):
        """
        Fill in default row/column labels for `data` and check lengths.

        Returns the ``(index, columns)`` pair; raises ValueError when a
        supplied label sequence does not match the shape of `data`.
        """
        import pandas.core.indexes.base as ibase

        N, K = data.shape
        if index is None:
            index = ibase.default_index(N)
        if columns is None:
            columns = ibase.default_index(K)

        if len(columns) != K:
            raise ValueError('Column length mismatch: {columns} vs. {K}'
                             .format(columns=len(columns), K=K))
        if len(index) != N:
            raise ValueError('Index length mismatch: {index} vs. {N}'
                             .format(index=len(index), N=N))
        return index, columns

pandas/core/frame.py

+2
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333

3434
from pandas.compat import PY36, raise_with_traceback
3535
from pandas.compat.numpy import function as nv
36+
from pandas.core.arrays.sparse import SparseFrameAccessor
3637
from pandas.core.dtypes.cast import (
3738
maybe_upcast,
3839
cast_scalar_to_array,
@@ -8027,6 +8028,7 @@ def isin(self, values):
80278028
plot = CachedAccessor("plot", gfx.FramePlotMethods)
80288029
hist = gfx.hist_frame
80298030
boxplot = gfx.boxplot_frame
8031+
sparse = CachedAccessor("sparse", SparseFrameAccessor)
80308032

80318033

80328034
DataFrame._setup_axes(['index', 'columns'], info_axis=1, stat_axis=0,

0 commit comments

Comments
 (0)