Skip to content

Commit

Permalink
v1.1.1 updates (#13)
Browse files Browse the repository at this point in the history
* fix operator assignment

* fix inner sampling method

* version bump

* bump up render limit

* update changelog

* replace pd.NA for np.nan; add more metadata for frontend

* don't serialize lists of type objects

* add extra dataframe size information for metadata
  • Loading branch information
shouples authored Jul 22, 2022
1 parent 8f5a61c commit 04bf5a9
Show file tree
Hide file tree
Showing 7 changed files with 106 additions and 24 deletions.
23 changes: 23 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,29 @@
All notable changes will be documented here.

---
## `1.1.1`
_2022-07-22_
### Added
- Additional metadata sent to frontends to triage issues with output sizes and `dx` settings
### Fixed
- `simple`/`enhanced` display modes no longer raise JSON errors trying to serialize `pd.NA` values
- `SAMPLE_METHOD` returning incorrect value (`True` instead of `DXSampleMethod`) when compared with `COLUMN_SAMPLE_METHOD` and `ROW_SAMPLE_METHOD`

## `1.1.0`
_2022-07-22_
### **Added**
- Direct support for `application/vnd.dataresource+json` media type display formatting
- reverting all settings to `pandas` defaults with `dx.reset()` or switching the `DISPLAY_MODE` setting to `default`
- `pydantic` dependency for BaseSettings use
- `pandas`-inspired `dx.set_option(setting_name, setting_value)`
- `dx.set_display_mode()` convenience function for globally switching between `simple` (simpleTable/DEX), `enhanced` (GRID), and `default` (vanilla pandas)
- Auto-truncating rows and columns of `pd.DataFrame` objects based on `DISPLAY_MAX_ROWS`, `DISPLAY_MAX_COLUMNS`, and `MAX_RENDER_SIZE_BYTES` (1MB default) size limits before rendering (for `simple` & `enhanced` display modes), with blueprintjs flavored warnings
- `SAMPLING_MODE` setting to better control how truncating happens ("first", "last", "outer", "inner", and "random" options)
- `RANDOM_SEED` setting for random sampling

### **Fixed**
- Support for non-string column and index values (possibly temporary) to allow `build_table_schema` to work with `pd.MultiIndex` values

## `1.0.4`
_2022-05-06_
### **Fixed**
Expand Down
2 changes: 1 addition & 1 deletion dx/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,6 @@
from .formatters import *
from .settings import *

__version__ = "1.1.0"
__version__ = "1.1.1"

set_display_mode("simple")
32 changes: 24 additions & 8 deletions dx/formatters/dataresource.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from dx.formatters.utils import (
stringify_columns,
stringify_indices,
truncate_if_too_big,
truncate_and_describe,
)
from dx.settings import settings

Expand Down Expand Up @@ -71,28 +71,44 @@ def format_dataresource(df: pd.DataFrame, display_id: str) -> tuple:
if isinstance(display_df.index, pd.MultiIndex):
display_df = stringify_indices(display_df)

body = {
# build_table_schema() also doesn't like pd.NAs
display_df.fillna(np.nan, inplace=True)

payload_body = {
"schema": build_table_schema(display_df),
"data": display_df.reset_index().to_dict("records"),
"datalink": {},
}
if display_id is not None:
body["datalink"]["display_id"] = display_id
payload = {dataresource_settings.DATARESOURCE_MEDIA_TYPE: body}
metadata = {
dataresource_settings.DATARESOURCE_MEDIA_TYPE: {"display_id": display_id}
payload = {dataresource_settings.DATARESOURCE_MEDIA_TYPE: payload_body}

metadata_body = {
"datalink": {
"dataframe_info": {},
"dx_settings": settings.json(exclude={"RENDERABLE_OBJECTS": True}),
},
}
metadata = {dataresource_settings.DATARESOURCE_MEDIA_TYPE: metadata_body}

if display_id is not None:
payload_body["datalink"]["display_id"] = display_id
metadata_body["datalink"]["display_id"] = display_id

return (payload, metadata)


def _render_dataresource(df, display_id) -> tuple:
    """Truncate, format, and display a dataframe as the dataresource media type.

    Returns a (payload, metadata) tuple; the metadata carries the
    before/after truncation info under the "datalink" key.
    """
    # NOTE(review): the next two lines are a diff-view artifact — presumably
    # the first is the pre-commit line and the second its replacement
    # (truncate_and_describe additionally returns size/shape metadata);
    # confirm against the rendered diff before treating this as final code.
    df = truncate_if_too_big(df)
    df, dataframe_info = truncate_and_describe(df)
    payload, metadata = format_dataresource(df, display_id)
    # attach the truncation stats so the frontend can triage size issues
    metadata[dataresource_settings.DATARESOURCE_MEDIA_TYPE]["datalink"][
        "dataframe_info"
    ] = dataframe_info

    # don't pass a dataframe in here, otherwise you'll get recursion errors
    with pd.option_context(
        "html.table_schema", dataresource_settings.DATARESOURCE_HTML_TABLE_SCHEMA
    ):
        ipydisplay(payload, raw=True, display_id=display_id)

    return (payload, metadata)


Expand Down
29 changes: 23 additions & 6 deletions dx/formatters/dx.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from dx.formatters.utils import (
stringify_columns,
stringify_indices,
truncate_if_too_big,
truncate_and_describe,
)
from dx.settings import settings

Expand All @@ -29,6 +29,7 @@ class DXSettings(BaseSettings):

class Config:
validate_assignment = True # we need this to enforce `allow_mutation`
json_encoders = {type: lambda t: str(t)}


@lru_cache
Expand Down Expand Up @@ -68,25 +69,41 @@ def format_dx(df: pd.DataFrame, display_id: str) -> tuple:
if isinstance(display_df.index, pd.MultiIndex):
display_df = stringify_indices(display_df)

# build_table_schema() also doesn't like pd.NAs
display_df.fillna(np.nan, inplace=True)

# this will include the `df.index` by default (e.g. slicing/sampling)
body = {
payload_body = {
"schema": build_table_schema(display_df),
"data": display_df.reset_index().transpose().values.tolist(),
"datalink": {},
}
payload = {dx_settings.DX_MEDIA_TYPE: payload_body}

metadata_body = {
"datalink": {
"dataframe_info": {},
"dx_settings": settings.json(exclude={"RENDERABLE_OBJECTS": True}),
},
}
metadata = {dx_settings.DX_MEDIA_TYPE: metadata_body}

if display_id is not None:
body["datalink"]["display_id"] = display_id
payload = {dx_settings.DX_MEDIA_TYPE: body}
metadata = {dx_settings.DX_MEDIA_TYPE: {"display_id": display_id}}
payload_body["datalink"]["display_id"] = display_id
metadata_body["datalink"]["display_id"] = display_id

return (payload, metadata)


def _render_dx(df, display_id) -> tuple:
    """Truncate, format, and display a dataframe as the DX media type.

    Returns a (payload, metadata) tuple; the metadata carries the
    before/after truncation info under the "datalink" key.
    """
    # NOTE(review): the next two lines are a diff-view artifact — presumably
    # the first is the pre-commit line and the second its replacement
    # (truncate_and_describe additionally returns size/shape metadata);
    # confirm against the rendered diff before treating this as final code.
    df = truncate_if_too_big(df)
    df, dataframe_info = truncate_and_describe(df)
    payload, metadata = format_dx(df, display_id)
    # attach the truncation stats so the frontend can triage size issues
    metadata[dx_settings.DX_MEDIA_TYPE]["datalink"]["dataframe_info"] = dataframe_info

    # don't pass a dataframe in here, otherwise you'll get recursion errors
    with pd.option_context("html.table_schema", dx_settings.DX_HTML_TABLE_SCHEMA):
        ipydisplay(payload, raw=True, display_id=display_id)

    return (payload, metadata)


Expand Down
40 changes: 33 additions & 7 deletions dx/formatters/utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import sys
from typing import Tuple

import pandas as pd

Expand Down Expand Up @@ -99,10 +100,11 @@ def reduce_df(df: pd.DataFrame, orig_num_rows: int = 0) -> pd.DataFrame:
def sample_columns(df: pd.DataFrame, num_cols: int) -> pd.DataFrame:
"""
Samples a dataframe to a specified number of rows
based on Settings.SAMPLING_METHOD.
based on Settings.SAMPLING_METHOD, or
Settings.COLUMN_SAMPLING_METHOD if specified.
"""
sampling = settings.SAMPLING_METHOD
if col_sampling := settings.COLUMN_SAMPLING_METHOD != sampling:
if (col_sampling := settings.COLUMN_SAMPLING_METHOD) != sampling:
sampling = col_sampling

# transposing here to treat columns like rows to take advantage of
Expand All @@ -125,10 +127,11 @@ def sample_columns(df: pd.DataFrame, num_cols: int) -> pd.DataFrame:
def sample_rows(df: pd.DataFrame, num_rows: int) -> pd.DataFrame:
"""
Samples a dataframe to a specified number of rows
based on Settings.SAMPLING_METHOD.
based on Settings.SAMPLING_METHOD, or
Settings.ROW_SAMPLING_METHOD if specified.
"""
sampling = settings.SAMPLING_METHOD
if row_sampling := settings.ROW_SAMPLING_METHOD != sampling:
if (row_sampling := settings.ROW_SAMPLING_METHOD) != sampling:
sampling = row_sampling

if sampling == DXSamplingMethod.random:
Expand Down Expand Up @@ -182,9 +185,7 @@ def sample_inner(df: pd.DataFrame, num: int) -> pd.DataFrame:
Example: sampling inner 8 of 20 rows:
[......XXXXXXXX......]
"""
# rounding down since we'll be adding one filler row
# as well as using the index
middle_index = int(len(df) / 2) - 1
middle_index = int(len(df) / 2)
inner_buffer = int(num / 2)
middle_start = middle_index - inner_buffer
middle_end = middle_index + inner_buffer
Expand Down Expand Up @@ -231,3 +232,28 @@ def stringify_multiindex(vals):

def stringify_indices(df: pd.DataFrame) -> pd.DataFrame:
    """Convert the dataframe's index values to strings.

    Reuses stringify_columns() by transposing so the index becomes
    the columns, stringifying, then transposing back.
    """
    flipped = df.transpose()
    stringified = stringify_columns(flipped)
    return stringified.transpose()


def truncate_and_describe(df: pd.DataFrame) -> Tuple[pd.DataFrame, dict]:
    """Reduce the dataframe's size, if necessary, and describe the change.

    Returns the (possibly truncated) dataframe together with a dict of
    shape/size information captured before and after truncation.

    NOTE(review): sys.getsizeof() is a shallow measure — it does not follow
    references into the column data — so the byte counts here understate
    the true footprint; presumably adequate for frontend triage.
    """
    rows_before, cols_before = df.shape
    bytes_before = sys.getsizeof(df)

    df = truncate_if_too_big(df)

    rows_after, cols_after = df.shape
    bytes_after = sys.getsizeof(df)

    dataframe_info = {
        "orig_size_bytes": bytes_before,
        "orig_num_rows": rows_before,
        "orig_num_cols": cols_before,
        "truncated_size_bytes": bytes_after,
        "truncated_num_rows": rows_after,
        "truncated_num_cols": cols_after,
    }
    return df, dataframe_info
2 changes: 1 addition & 1 deletion dx/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ class Settings(BaseSettings):
HTML_TABLE_SCHEMA: bool = False
MEDIA_TYPE: str = "application/vnd.dataresource+json"

MAX_RENDER_SIZE_BYTES: int = 1 * MB
MAX_RENDER_SIZE_BYTES: int = 100 * MB
RENDERABLE_OBJECTS: List[type] = [pd.DataFrame, np.ndarray]

# what percentage of the dataset to remove during each truncation
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "dx"
version = "1.1.0"
version = "1.1.1"
description = "Python wrapper for Data Explorer"
authors = ["Dave Shoup <[email protected]>", "Kyle Kelley <[email protected]>"]
readme = "README.md"
Expand Down

0 comments on commit 04bf5a9

Please sign in to comment.