Update engine argument for read_df_parquet and read_ddf_parquet
mjclawar committed May 19, 2019
1 parent 2c3322b commit 71ba46a
Showing 3 changed files with 15 additions and 6 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,7 @@
+## [2.3.0] 2019-05-06
+### Changes
+- `read_df_parquet` and `read_ddf_parquet` now take an optional `engine` argument, allowing either the `pyarrow` or `fastparquet` engine to be used for reading parquet files.
+
 ## [2.2.0] 2019-05-06
 ### Changes
 - Updates `slackclient` dependency to `2.0.1` and handles migration of api to v2 (https://github.com/slackapi/python-slackclient/wiki/Migrating-to-2.x)
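Hypothetical usage sketch of the new argument (the file path, column names, and the `sd_utils.sd_load` import below are illustrative assumptions):

```python
from sd_utils import sd_load

# Default behavior is unchanged: pyarrow is used when no engine is passed
df = sd_load.read_df_parquet('/tmp/data.parquet', columns=['id', 'value'])

# The engine can now be chosen explicitly
df_pa = sd_load.read_df_parquet('/tmp/data.parquet', engine='pyarrow')

# Per the changelog, 'fastparquet' is also accepted, provided the fastparquet
# package is installed and accepts the keyword arguments being forwarded:
# df_fp = sd_load.read_df_parquet('/tmp/data.parquet', engine='fastparquet')
```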
15 changes: 10 additions & 5 deletions sd_utils/sd_load.py
@@ -156,15 +156,18 @@ def read_df_excel(file_path, *,


 @sd_log.log_func
-def read_df_parquet(file_path: str, columns: Optional[Iterable[str]]=None,
-                    use_threads: bool=True, **pyarrow_kwargs) -> T_DF:
+def read_df_parquet(file_path: str,
+                    columns: Optional[Iterable[str]]=None,
+                    use_threads: bool = True,
+                    engine: str='pyarrow',
+                    **pyarrow_kwargs) -> T_DF:
     assert isinstance(file_path, str), '{} does not exist'.format(file_path)
     assert os.path.exists(file_path), 'file does not exist at {}'.format(file_path)
     assert columns is None or isinstance(columns, (list, tuple))
     assert columns is None or all(isinstance(c, str) for c in columns)
     assert isinstance(use_threads, bool)

-    df = pandas.read_parquet(file_path, engine='pyarrow', use_threads=use_threads, columns=columns,
+    df = pandas.read_parquet(file_path, engine=engine, use_threads=use_threads, columns=columns,
                              **pyarrow_kwargs)

     # df = pyarrow.parquet.read_table(file_path, nthreads=n_threads, columns=columns,
@@ -175,14 +178,16 @@ def read_df_parquet(file_path: str, columns: Optional[Iterable[str]]=None,


 @sd_log.log_func
-def read_ddf_parquet(file_path: str, columns: Optional[Iterable[str]]=None,
+def read_ddf_parquet(file_path: str,
+                     columns: Optional[Iterable[str]] = None,
+                     engine: str='pyarrow',
                      **dd_kwargs) -> T_DDF:
     assert isinstance(file_path, str), '{} does not exist'.format(file_path)
     assert os.path.exists(file_path), 'file does not exist at {}'.format(file_path)
     assert columns is None or isinstance(columns, (list, tuple))
     assert columns is None or all(isinstance(c, str) for c in columns)

-    return dask.dataframe.read_parquet(path=file_path, columns=columns, engine='arrow',
+    return dask.dataframe.read_parquet(path=file_path, columns=columns, engine=engine,
                                        **dd_kwargs)


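A similar sketch for the dask reader, which previously hard-coded `engine='arrow'`; the path and columns are again illustrative, and the `fastparquet` package must be installed for the engine shown here:

```python
from sd_utils import sd_load

# Lazily read a parquet dataset with an explicitly chosen engine
ddf = sd_load.read_ddf_parquet('/tmp/dataset.parquet',
                               columns=['id', 'value'],
                               engine='fastparquet')

result = ddf.compute()  # materialize into a pandas DataFrame when needed
```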
2 changes: 1 addition & 1 deletion setup.py
@@ -13,7 +13,7 @@

 setup(
     name='SDUtils',
-    version='2.2.0',
+    version='2.3.0',
     packages=['sd_utils'],
     license='(c) 2017- StratoDem Analytics. All rights reserved.',
     description='StratoDem utilities',