Skip to content

Commit 34e9a42

Browse files
committed
FEAT: implemented dta adapter using Pandas
1 parent 34ac534 commit 34e9a42

File tree

1 file changed

+81
-2
lines changed

1 file changed

+81
-2
lines changed

larray_editor/arrayadapter.py

Lines changed: 81 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3682,8 +3682,7 @@ class PyReadstatSas7BdatPathAdapter(AbstractPyReadStatPathAdapter):
36823682
READ_FUNC_NAME = 'read_sas7bdat'
36833683

36843684

3685-
@path_adapter_for('.dta', 'pyreadstat')
3686-
class DtaPathAdapter(AbstractPyReadStatPathAdapter):
3685+
class PyReadstatDtaPathAdapter(AbstractPyReadStatPathAdapter):
    """pyreadstat-backed adapter for Stata ``.dta`` files.

    The class only customizes ``READ_FUNC_NAME``; everything else is
    inherited from ``AbstractPyReadStatPathAdapter``.
    """

    # presumably the name of the pyreadstat function the base class calls
    # to read the file -- confirm in AbstractPyReadStatPathAdapter
    READ_FUNC_NAME = 'read_dta'
36883687

36893688

@@ -3781,6 +3780,75 @@ def open(cls, fpath):
37813780
return pd.read_sas(fpath, iterator=True, encoding='infer')
37823781

37833782

3783+
def _select_stata_columns(self, data, columns):
    """Replacement for the Pandas ``StataReader._do_select_columns`` method.

    Fixes column selection in Pandas StataReader: in the original version,
    only the first column selection of a reader works. This version keeps a
    copy of the per-variable metadata lists of the *full* data set the first
    time it is called and always derives the metadata of the selected
    columns from those full lists.

    Parameters
    ----------
    self : StataReader
        Reader the function is bound to once installed on its class.
    data : DataFrame
        Data containing all the columns of the data set.
    columns : sequence of str
        Names of the columns to keep.

    Returns
    -------
    DataFrame
        ``data`` restricted to ``columns``.

    Raises
    ------
    ValueError
        If ``columns`` contains duplicates or names absent from ``data``.
    """
    # first selection on this reader: at this point the lists still
    # describe every variable, so remember them
    if not hasattr(self, '_full_dtyplist'):
        self._full_dtyplist = self._dtyplist
        self._full_typlist = self._typlist
        self._full_fmtlist = self._fmtlist
        self._full_lbllist = self._lbllist

    column_set = set(columns)
    if len(column_set) != len(columns):
        raise ValueError("columns contains duplicate entries")
    unmatched = column_set.difference(data.columns)
    if unmatched:
        joined = ", ".join(unmatched)
        raise ValueError(
            "The following columns were not "
            f"found in the Stata data set: {joined}"
        )
    # Copy information for retained columns for later processing
    get_loc = data.columns.get_loc
    col_indices = [get_loc(col) for col in columns]
    self._dtyplist = [self._full_dtyplist[i] for i in col_indices]
    self._typlist = [self._full_typlist[i] for i in col_indices]
    self._fmtlist = [self._full_fmtlist[i] for i in col_indices]
    self._lbllist = [self._full_lbllist[i] for i in col_indices]
    self._column_selector_set = True
    return data[columns]


@adapter_for('pandas.io.stata.StataReader')
class PandasStataReaderAdapter(AbstractColumnarAdapter):
    """Adapter displaying the content of a Pandas ``StataReader``.

    Data is read from the reader chunk by chunk (see ``get_values``). The
    adapter relies on private attributes of the reader (``_nobs``,
    ``_nvar``, ``_varlist``, ``_lines_read``, ...).
    """

    def __init__(self, data, attributes=None):
        super().__init__(data, attributes=attributes)
        reader = data
        reader._ensure_open()

        # monkey-patch Pandas StataReader to fix column selection (only
        # the first column selection of a reader works in the original
        # version). The patch is installed on the class of the reader, so
        # it affects all readers of that class.
        type(reader)._do_select_columns = _select_stata_columns

    def shape2d(self):
        """Return (number of observations, number of variables)."""
        reader = self.data
        return reader._nobs, reader._nvar

    def get_hlabels_values(self, start, stop):
        """Return a single row of labels: variable names start to stop."""
        return [self.data._varlist[start:stop]]

    def get_values(self, h_start, v_start, h_stop, v_stop):
        """Return rows v_start:v_stop of columns h_start:h_stop as an array.

        The result is a 2D numpy array; its dtype is ``object`` when the
        dtypes of the selected columns cannot be promoted to a common one.
        """
        reader = self.data
        columns = reader._varlist[h_start:h_stop]

        # position the reader on the first requested row by setting its
        # (private) count of lines already read
        reader._lines_read = v_start
        chunk = reader.read(v_stop - v_start, columns=columns)

        # iterate over the columns actually selected: slicing _varlist
        # clamps h_stop to the number of variables, so h_stop - h_start
        # could be larger than the number of columns in the chunk
        chunk_columns = [chunk.iloc[:, i].values
                         for i in range(len(columns))]
        try:
            return np.stack(chunk_columns, axis=1)
        except np.exceptions.DTypePromotionError:
            # columns with incompatible dtypes: fall back to object
            return np.stack(chunk_columns, axis=1, dtype=object)
3841+
3842+
3843+
class PandasDTAPathAdapter(PandasStataReaderAdapter):
    """Adapter for Stata file paths, built on ``PandasStataReaderAdapter``."""

    @classmethod
    def open(cls, fpath):
        """Open *fpath* with Pandas and return a ``StataReader``."""
        from pandas import read_stata

        # with iterator=True, Pandas gives back a StataReader rather than
        # a DataFrame
        return read_stata(fpath, iterator=True)
3850+
3851+
37843852
@path_adapter_for('.sas7bdat')
37853853
def dispatch_sas7bdat_path_adapter(fpath):
37863854
# the pandas adapter is first as it is (much) faster for reading the first
@@ -3792,6 +3860,17 @@ def dispatch_sas7bdat_path_adapter(fpath):
37923860
})
37933861

37943862

3863+
@path_adapter_for('.dta')
def dispatch_dta_path_adapter(fpath):
    """Dispatch '.dta' paths to an adapter class.

    Delegates to ``dispatch_file_suffix_by_available_module`` with the
    candidate adapter classes keyed by the name of the module they rely on.
    """
    # the pandas adapter is first as it is (much) faster for reading large
    # files. In practice, Pandas is always available because it is currently
    # a hard dependency of larray-editor
    adapters_by_module = {
        'pandas': PandasDTAPathAdapter,
        'pyreadstat': PyReadstatDtaPathAdapter,
    }
    return dispatch_file_suffix_by_available_module('dta', adapters_by_module)
3872+
3873+
37953874
@adapter_for('pstats.Stats')
37963875
class ProfilingStatsAdapter(AbstractColumnarAdapter):
37973876
# we display everything except callers

0 commit comments

Comments
 (0)