@@ -697,6 +697,55 @@ def _simple_new(
697
697
new ._dtype = dtype
698
698
return new
699
699
700
@classmethod
def from_spmatrix(cls, data):
    """
    Create a SparseArray from a scipy.sparse matrix.

    .. versionadded:: 0.25.0

    Parameters
    ----------
    data : scipy.sparse.spmatrix
        This should be a SciPy sparse matrix where the size
        of the second dimension is 1. In other words, a
        sparse matrix with a single column.

    Returns
    -------
    SparseArray

    Raises
    ------
    ValueError
        If `data` does not have exactly one column.

    Notes
    -----
    Entries that share a position are summed, and explicitly stored
    zeros are kept as stored values. `data` itself is not modified.

    Examples
    --------
    >>> import scipy.sparse
    >>> mat = scipy.sparse.coo_matrix((4, 1))
    >>> pd.arrays.SparseArray.from_spmatrix(mat)
    [0.0, 0.0, 0.0, 0.0]
    Fill: 0.0
    IntIndex
    Indices: array([], dtype=int32)
    """
    length, ncol = data.shape

    if ncol != 1:
        raise ValueError(
            "'data' must have a single column, not '{}'".format(ncol)
        )

    # Our sparse index classes require that the positions be strictly
    # increasing. Take values and positions from the same canonical CSC
    # representation so they always line up one-to-one: pairing
    # ``data.data`` with ``data.nonzero()`` does not, because ``nonzero``
    # drops explicitly stored zeros while ``.data`` keeps them.
    # Work on a copy so the in-place normalisation below never touches
    # the caller's matrix.
    data = data.tocsc(copy=True)
    data.sum_duplicates()
    data.sort_indices()
    arr = data.data
    idx = data.indices

    zero = np.array(0, dtype=arr.dtype).item()
    dtype = SparseDtype(arr.dtype, zero)
    index = IntIndex(length, idx)

    return cls._simple_new(arr, index, dtype)
748
+
700
749
def __array__ (self , dtype = None , copy = True ):
701
750
fill_value = self .fill_value
702
751
@@ -1906,27 +1955,32 @@ def _make_index(length, indices, kind):
1906
1955
# ----------------------------------------------------------------------------
1907
1956
# Accessor
1908
1957
1958
+
1959
class BaseAccessor:
    """
    Common base for the ``.sparse`` accessors.

    Stores the wrapped object on ``_parent`` and then runs it through the
    ``_validate`` hook, which subclasses are expected to override.
    """

    # Message subclasses raise with when the wrapped data is rejected.
    _validation_msg = "Can only use the '.sparse' accessor with Sparse data."

    def __init__(self, data=None):
        # The parent is stored before validation runs.
        self._parent = data
        self._validate(data)

    def _validate(self, data):
        """Validation hook; the base implementation is not implemented."""
        raise NotImplementedError()
1968
+
1969
+
1909
1970
@delegate_names (SparseArray , ['npoints' , 'density' , 'fill_value' ,
1910
1971
'sp_values' ],
1911
1972
typ = 'property' )
1912
- class SparseAccessor (PandasDelegate ):
1973
+ class SparseAccessor (BaseAccessor , PandasDelegate ):
1913
1974
"""
1914
1975
Accessor for sparse data, with conversion to and from other sparse matrix data types.
1915
1976
"""
1916
1977
1917
- def __init__ (self , data = None ):
1918
- self ._validate (data )
1919
- # Store the Series since we need that for to_coo
1920
- self ._parent = data
1921
-
1922
- @staticmethod
1923
- def _validate (data ):
1978
def _validate(self, data):
    """
    Reject data whose dtype is not a SparseDtype.

    Raises
    ------
    AttributeError
        With ``self._validation_msg`` when ``data.dtype`` is not sparse.
    """
    if isinstance(data.dtype, SparseDtype):
        return
    raise AttributeError(self._validation_msg)
1927
1981
1928
1982
def _delegate_property_get(self, name, *args, **kwargs):
    """
    Return attribute `name` of the parent's backing ``.array``.

    Any extra positional or keyword arguments are accepted and ignored.
    """
    backing_array = self._parent.array
    return getattr(backing_array, name)
1930
1984
1931
1985
def _delegate_method (self , name , * args , ** kwargs ):
1932
1986
if name == 'from_coo' :
@@ -2040,3 +2094,190 @@ def to_coo(self, row_levels=(0, ), column_levels=(1, ), sort_labels=False):
2040
2094
column_levels ,
2041
2095
sort_labels = sort_labels )
2042
2096
return A , rows , columns
2097
+
2098
def to_dense(self):
    """
    Convert a Series from sparse values to dense.

    .. versionadded:: 0.25.0

    Returns
    -------
    Series
        A Series with the same values, index and name, stored as a
        dense array.

    Examples
    --------
    >>> series = pd.Series(pd.arrays.SparseArray([0, 1, 0]))
    >>> series
    0    0
    1    1
    2    0
    dtype: Sparse[int64, 0]

    >>> series.sparse.to_dense()
    0    0
    1    1
    2    0
    dtype: int64
    """
    from pandas import Series

    parent = self._parent
    dense_values = parent.array.to_dense()
    return Series(dense_values, index=parent.index, name=parent.name)
2128
+
2129
+
2130
class SparseFrameAccessor(BaseAccessor, PandasDelegate):
    """
    DataFrame accessor for sparse data.

    .. versionadded:: 0.25.0
    """

    def _validate(self, data):
        """
        Reject a DataFrame unless every column has a SparseDtype.

        Raises
        ------
        AttributeError
            With ``self._validation_msg`` if any column is not sparse.
        """
        dtypes = data.dtypes
        if not all(isinstance(t, SparseDtype) for t in dtypes):
            raise AttributeError(self._validation_msg)

    @classmethod
    def from_spmatrix(cls, data, index=None, columns=None):
        """
        Create a new DataFrame from a scipy sparse matrix.

        .. versionadded:: 0.25.0

        Parameters
        ----------
        data : scipy.sparse.spmatrix
            Must be convertible to csc format.
        index, columns : Index, optional
            Row and column labels to use for the resulting DataFrame.
            Defaults to a RangeIndex.

        Returns
        -------
        DataFrame
            Each column of the DataFrame is stored as a
            :class:`SparseArray`.

        Raises
        ------
        ValueError
            If the length of `index` or `columns` does not match the
            shape of `data`.

        Examples
        --------
        >>> import scipy.sparse
        >>> mat = scipy.sparse.eye(3)
        >>> pd.DataFrame.sparse.from_spmatrix(mat)
             0    1    2
        0  1.0  0.0  0.0
        1  0.0  1.0  0.0
        2  0.0  0.0  1.0
        """
        from pandas import DataFrame

        data = data.tocsc()
        index, columns = cls._prep_index(data, index, columns)
        sparrays = [
            SparseArray.from_spmatrix(data[:, i])
            for i in range(data.shape[1])
        ]
        # Key the columns by position first and attach the requested
        # labels afterwards, so the labels never have to act as dict keys.
        data = dict(enumerate(sparrays))
        result = DataFrame(data, index=index)
        result.columns = columns
        return result

    def to_dense(self):
        """
        Convert a DataFrame with sparse values to dense.

        .. versionadded:: 0.25.0

        Returns
        -------
        DataFrame
            A DataFrame with the same values stored as dense arrays.

        Examples
        --------
        >>> df = pd.DataFrame({"A": pd.arrays.SparseArray([0, 1, 0])})
        >>> df.sparse.to_dense()
           A
        0  0
        1  1
        2  0
        """
        from pandas import DataFrame

        data = {k: v.array.to_dense()
                for k, v in self._parent.items()}
        return DataFrame(data,
                         index=self._parent.index,
                         columns=self._parent.columns)

    def to_coo(self):
        """
        Return the contents of the frame as a sparse SciPy COO matrix.

        .. versionadded:: 0.25.0

        Returns
        -------
        coo_matrix : scipy.sparse.spmatrix
            If the caller is heterogeneous and contains booleans or objects,
            the result will be of dtype=object. See Notes.

        Raises
        ------
        ImportError
            If SciPy is not installed.
        ValueError
            If any column has a non-zero fill value. A COO matrix can only
            represent missing entries as zero.

        Notes
        -----
        The dtype will be the lowest-common-denominator type (implicit
        upcasting); that is to say if the dtypes (even of numeric types)
        are mixed, the one that accommodates all will be chosen.

        e.g. If the dtypes are float16 and float32, dtype will be upcast to
        float32. By numpy.find_common_type convention, mixing int64 and
        uint64 will result in a float64 dtype.
        """
        try:
            from scipy.sparse import coo_matrix
        except ImportError as err:
            raise ImportError('Scipy is not installed') from err

        # Pass a plain list: the helper expects a sequence of dtypes.
        dtype = find_common_type(list(self._parent.dtypes))
        if isinstance(dtype, SparseDtype):
            dtype = dtype.subtype

        cols, rows, datas = [], [], []
        # Iterate positionally via items() so duplicated column labels
        # still yield exactly one Series per column.
        for col, (_, s) in enumerate(self._parent.items()):
            sp_arr = s.array
            # Unstored entries of a COO matrix are implicit zeros, so any
            # other fill value would be silently misrepresented.
            if sp_arr.fill_value != 0:
                raise ValueError(
                    'fill value must be 0 when converting to COO matrix'
                )
            row = sp_arr.sp_index.to_int_index().indices
            cols.append(np.repeat(col, len(row)))
            rows.append(row)
            datas.append(sp_arr.sp_values.astype(dtype, copy=False))

        cols = np.concatenate(cols)
        rows = np.concatenate(rows)
        datas = np.concatenate(datas)
        return coo_matrix((datas, (rows, cols)), shape=self._parent.shape)

    @property
    def density(self) -> float:
        """
        Ratio of non-sparse points to total (dense) data points
        represented in the DataFrame.
        """
        return np.mean([column.array.density
                        for _, column in self._parent.items()])

    @staticmethod
    def _prep_index(data, index, columns):
        """
        Fill in default row/column labels and check their lengths.

        Parameters
        ----------
        data : object with a two-element ``shape``
        index, columns : sized label containers or None
            ``None`` is replaced by a RangeIndex of the matching length.

        Returns
        -------
        tuple
            ``(index, columns)``.

        Raises
        ------
        ValueError
            If a label container's length does not match ``data.shape``.
        """
        from pandas import RangeIndex

        N, K = data.shape
        if index is None:
            index = RangeIndex(N)
        if columns is None:
            columns = RangeIndex(K)

        if len(columns) != K:
            raise ValueError('Column length mismatch: {columns} vs. {K}'
                             .format(columns=len(columns), K=K))
        if len(index) != N:
            raise ValueError('Index length mismatch: {index} vs. {N}'
                             .format(index=len(index), N=N))
        return index, columns
0 commit comments