@@ -3682,8 +3682,7 @@ class PyReadstatSas7BdatPathAdapter(AbstractPyReadStatPathAdapter):
36823682 READ_FUNC_NAME = 'read_sas7bdat'
36833683
36843684
3685- @path_adapter_for ('.dta' , 'pyreadstat' )
3686- class DtaPathAdapter (AbstractPyReadStatPathAdapter ):
class PyReadstatDtaPathAdapter(AbstractPyReadStatPathAdapter):
    """Path adapter for Stata ``.dta`` files backed by pyreadstat.

    Only the name of the reading function differs from the base class.
    """

    # presumably looked up by name on the pyreadstat module by the base
    # class -- verify against AbstractPyReadStatPathAdapter
    READ_FUNC_NAME = 'read_dta'
36883687
36893688
@@ -3781,6 +3780,75 @@ def open(cls, fpath):
37813780 return pd .read_sas (fpath , iterator = True , encoding = 'infer' )
37823781
37833782
@adapter_for('pandas.io.stata.StataReader')
class PandasStataReaderAdapter(AbstractColumnarAdapter):
    """Adapter exposing a pandas ``StataReader`` as a 2D columnar source.

    Rectangular windows of values are read on demand by repositioning the
    reader and asking it for a limited number of rows and columns.

    NOTE(review): this class relies on private ``StataReader`` attributes
    (``_ensure_open``, ``_nobs``, ``_nvar``, ``_varlist``, ``_lines_read``,
    ``_dtyplist``, ``_typlist``, ``_fmtlist``, ``_lbllist``,
    ``_column_selector_set``) -- re-check them when upgrading pandas.
    """

    def __init__(self, data, attributes=None):
        """Wrap *data* (the reader) and patch its class' column selection.

        Parameters
        ----------
        data
            The reader object to adapt (registered for
            ``pandas.io.stata.StataReader``).
        attributes
            Passed through unchanged to the parent constructor.
        """
        super().__init__(data, attributes=attributes)
        reader = data
        reader._ensure_open()

        # monkey-patch Pandas StataReader to fix column selection (only
        # the first column selection of a reader works in the original version)
        def _do_select_columns(self, data, columns):
            # Keep a copy of the per-column metadata of the *full* file the
            # first time we are called, so that each later selection is
            # computed from the complete lists rather than from the result
            # of the previous selection.
            if not hasattr(self, '_full_dtyplist'):
                self._full_dtyplist = self._dtyplist
                self._full_typlist = self._typlist
                self._full_fmtlist = self._fmtlist
                self._full_lbllist = self._lbllist

            column_set = set(columns)
            if len(column_set) != len(columns):
                raise ValueError("columns contains duplicate entries")
            unmatched = column_set.difference(data.columns)
            if unmatched:
                joined = ", ".join(unmatched)
                raise ValueError(
                    "The following columns were not "
                    f"found in the Stata data set: {joined}"
                )
            # Copy information for retained columns for later processing
            get_loc = data.columns.get_loc
            col_indices = [get_loc(col) for col in columns]
            self._dtyplist = [self._full_dtyplist[i] for i in col_indices]
            self._typlist = [self._full_typlist[i] for i in col_indices]
            self._fmtlist = [self._full_fmtlist[i] for i in col_indices]
            self._lbllist = [self._full_lbllist[i] for i in col_indices]
            self._column_selector_set = True
            return data[columns]

        # The patch is installed on the reader's class, so it affects every
        # instance of that class, not only the one wrapped here.
        reader.__class__._do_select_columns = _do_select_columns

    def shape2d(self):
        """Return ``(reader._nobs, reader._nvar)``.

        Presumably the number of observations (rows) and of variables
        (columns) of the file -- names suggest so, confirm against pandas.
        """
        reader = self.data
        return reader._nobs, reader._nvar

    def get_hlabels_values(self, start, stop):
        """Return a one-element list holding ``_varlist[start:stop]``."""
        return [self.data._varlist[start:stop]]

    def get_values(self, h_start, v_start, h_stop, v_stop):
        """Read the window ``[v_start:v_stop, h_start:h_stop]`` as an array.

        Parameters
        ----------
        h_start, h_stop
            Bounds used to slice the reader's ``_varlist``.
        v_start, v_stop
            ``v_start`` is the position the reader is moved to and
            ``v_stop - v_start`` is the number of rows requested.

        Returns
        -------
        numpy.ndarray
            The selected columns stacked along axis 1. When numpy cannot
            find a common dtype for them, an ``object`` array is returned
            instead.
        """
        reader = self.data
        columns = reader._varlist[h_start:h_stop]

        # Move the reader to the first requested row before reading.
        reader._lines_read = v_start
        chunk = reader.read(v_stop - v_start, columns=columns)

        # Iterate over the columns which were actually selected: the slice
        # above silently clamps to the available variables, so using
        # h_stop - h_start here could index past the last column of chunk.
        chunk_columns = [chunk.iloc[:, i].values
                         for i in range(len(columns))]
        try:
            return np.stack(chunk_columns, axis=1)
        except np.exceptions.DTypePromotionError:
            return np.stack(chunk_columns, axis=1, dtype=object)
3841+
3842+
class PandasDTAPathAdapter(PandasStataReaderAdapter):
    """Path-based variant of the pandas Stata reader adapter."""

    @classmethod
    def open(cls, fpath):
        """Open *fpath* with ``pandas.read_stata`` and return the result."""
        import pandas as pd

        # Asking for an iterator makes pandas hand back a StataReader
        # rather than a DataFrame.
        stata_reader = pd.read_stata(fpath, iterator=True)
        return stata_reader
3850+
3851+
37843852@path_adapter_for ('.sas7bdat' )
37853853def dispatch_sas7bdat_path_adapter (fpath ):
37863854 # the pandas adapter is first as it (much) faster for reading the first
@@ -3792,6 +3860,17 @@ def dispatch_sas7bdat_path_adapter(fpath):
37923860 })
37933861
37943862
@path_adapter_for('.dta')
def dispatch_dta_path_adapter(fpath):
    """Choose the adapter class used for ``.dta`` paths.

    *fpath* is not inspected here; the choice is delegated entirely to
    ``dispatch_file_suffix_by_available_module``, whose result is returned
    as-is.
    """
    # Pandas comes first because it is (much) faster when reading large
    # files. In practice it is always available, since it is currently a
    # hard dependency of larray-editor.
    adapters_by_module = {
        'pandas': PandasDTAPathAdapter,
        'pyreadstat': PyReadstatDtaPathAdapter,
    }
    return dispatch_file_suffix_by_available_module('dta', adapters_by_module)
3872+
3873+
37953874@adapter_for ('pstats.Stats' )
37963875class ProfilingStatsAdapter (AbstractColumnarAdapter ):
37973876 # we display everything except callers
0 commit comments