Skip to content

Commit

Permalink
v1.1.1 updates (#13)
Browse files Browse the repository at this point in the history
* fix operator assignment

* fix inner sampling method

* version bump

* bump up render limit

* update changelog

* replace pd.NA for np.nan; add more metadata for frontend

* don't serialize lists of type objects

* add extra dataframe size information for metadata
  • Loading branch information
shouples authored Jul 22, 2022
1 parent 8f5a61c commit 04bf5a9
Show file tree
Hide file tree
Showing 7 changed files with 106 additions and 24 deletions.
23 changes: 23 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,29 @@
All notable changes will be documented here.

---
## `1.1.1`
_2022-07-22_
### Added
- Additional metadata sent to frontends to triage issues with output sizes and `dx` settings
### Fixed
- `simple`/`enhanced` display modes no longer raise JSON errors trying to serialize `pd.NA` values
- `SAMPLE_METHOD` returning incorrect value (`True` instead of `DXSampleMethod`) when compared with `COLUMN_SAMPLE_METHOD` and `ROW_SAMPLE_METHOD`

## `1.1.0`
_2022-07-22_
### **Added**
- Direct support for `application/vnd.dataresource+json` media type display formatting
- reverting all settings to `pandas` defaults with `dx.reset()` or switching the `DISPLAY_MODE` setting to `default`
- `pydantic` dependency for BaseSettings use
- `pandas`-inspired `dx.set_option(setting_name, setting_value)`
- `dx.set_display_mode()` convenience function for globally switching between `simple` (simpleTable/DEX), `enhanced` (GRID), and `default` (vanilla pandas)
- Auto-truncating rows and columns of `pd.DataFrame` objects based on `DISPLAY_MAX_ROWS`, `DISPLAY_MAX_COLUMNS`, and `MAX_RENDER_SIZE_BYTES` (1MB default) size limits before rendering (for `simple` & `enhanced` display modes), with blueprintjs flavored warnings
- `SAMPLING_MODE` setting to better control how truncating happens ("first", "last", "outer", "inner", and "random" options)
- `RANDOM_SEED` setting for random sampling

### **Fixed**
- Support for non-string column and index values (possibly temporary) to allow `build_table_schema` to work with `pd.MultiIndex` values

## `1.0.4`
_2022-05-06_
### **Fixed**
Expand Down
2 changes: 1 addition & 1 deletion dx/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,6 @@
from .formatters import *
from .settings import *

__version__ = "1.1.0"
__version__ = "1.1.1"

set_display_mode("simple")
32 changes: 24 additions & 8 deletions dx/formatters/dataresource.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from dx.formatters.utils import (
stringify_columns,
stringify_indices,
truncate_if_too_big,
truncate_and_describe,
)
from dx.settings import settings

Expand Down Expand Up @@ -71,28 +71,44 @@ def format_dataresource(df: pd.DataFrame, display_id: str) -> tuple:
if isinstance(display_df.index, pd.MultiIndex):
display_df = stringify_indices(display_df)

body = {
# build_table_schema() also doesn't like pd.NAs
display_df.fillna(np.nan, inplace=True)

payload_body = {
"schema": build_table_schema(display_df),
"data": display_df.reset_index().to_dict("records"),
"datalink": {},
}
if display_id is not None:
body["datalink"]["display_id"] = display_id
payload = {dataresource_settings.DATARESOURCE_MEDIA_TYPE: body}
metadata = {
dataresource_settings.DATARESOURCE_MEDIA_TYPE: {"display_id": display_id}
payload = {dataresource_settings.DATARESOURCE_MEDIA_TYPE: payload_body}

metadata_body = {
"datalink": {
"dataframe_info": {},
"dx_settings": settings.json(exclude={"RENDERABLE_OBJECTS": True}),
},
}
metadata = {dataresource_settings.DATARESOURCE_MEDIA_TYPE: metadata_body}

if display_id is not None:
payload_body["datalink"]["display_id"] = display_id
metadata_body["datalink"]["display_id"] = display_id

return (payload, metadata)


def _render_dataresource(df, display_id) -> tuple:
    """Truncate, format, and display a dataframe as the dataresource media type.

    Returns a (payload, metadata) tuple; the metadata carries the
    before/after truncation info under the "datalink" key.
    """
    # NOTE(review): the next two lines are a diff-view artifact — presumably
    # the first is the pre-commit line and the second its replacement
    # (truncate_and_describe additionally returns size/shape metadata);
    # confirm against the rendered diff before treating this as final code.
    df = truncate_if_too_big(df)
    df, dataframe_info = truncate_and_describe(df)
    payload, metadata = format_dataresource(df, display_id)
    # attach the truncation stats so the frontend can triage size issues
    metadata[dataresource_settings.DATARESOURCE_MEDIA_TYPE]["datalink"][
        "dataframe_info"
    ] = dataframe_info

    # don't pass a dataframe in here, otherwise you'll get recursion errors
    with pd.option_context(
        "html.table_schema", dataresource_settings.DATARESOURCE_HTML_TABLE_SCHEMA
    ):
        ipydisplay(payload, raw=True, display_id=display_id)

    return (payload, metadata)


Expand Down
29 changes: 23 additions & 6 deletions dx/formatters/dx.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from dx.formatters.utils import (
stringify_columns,
stringify_indices,
truncate_if_too_big,
truncate_and_describe,
)
from dx.settings import settings

Expand All @@ -29,6 +29,7 @@ class DXSettings(BaseSettings):

class Config:
validate_assignment = True # we need this to enforce `allow_mutation`
json_encoders = {type: lambda t: str(t)}


@lru_cache
Expand Down Expand Up @@ -68,25 +69,41 @@ def format_dx(df: pd.DataFrame, display_id: str) -> tuple:
if isinstance(display_df.index, pd.MultiIndex):
display_df = stringify_indices(display_df)

# build_table_schema() also doesn't like pd.NAs
display_df.fillna(np.nan, inplace=True)

# this will include the `df.index` by default (e.g. slicing/sampling)
body = {
payload_body = {
"schema": build_table_schema(display_df),
"data": display_df.reset_index().transpose().values.tolist(),
"datalink": {},
}
payload = {dx_settings.DX_MEDIA_TYPE: payload_body}

metadata_body = {
"datalink": {
"dataframe_info": {},
"dx_settings": settings.json(exclude={"RENDERABLE_OBJECTS": True}),
},
}
metadata = {dx_settings.DX_MEDIA_TYPE: metadata_body}

if display_id is not None:
body["datalink"]["display_id"] = display_id
payload = {dx_settings.DX_MEDIA_TYPE: body}
metadata = {dx_settings.DX_MEDIA_TYPE: {"display_id": display_id}}
payload_body["datalink"]["display_id"] = display_id
metadata_body["datalink"]["display_id"] = display_id

return (payload, metadata)


def _render_dx(df, display_id) -> tuple:
    """Truncate, format, and display a dataframe as the DX media type.

    Returns a (payload, metadata) tuple; the metadata carries the
    before/after truncation info under the "datalink" key.
    """
    # NOTE(review): the next two lines are a diff-view artifact — presumably
    # the first is the pre-commit line and the second its replacement
    # (truncate_and_describe additionally returns size/shape metadata);
    # confirm against the rendered diff before treating this as final code.
    df = truncate_if_too_big(df)
    df, dataframe_info = truncate_and_describe(df)
    payload, metadata = format_dx(df, display_id)
    # attach the truncation stats so the frontend can triage size issues
    metadata[dx_settings.DX_MEDIA_TYPE]["datalink"]["dataframe_info"] = dataframe_info

    # don't pass a dataframe in here, otherwise you'll get recursion errors
    with pd.option_context("html.table_schema", dx_settings.DX_HTML_TABLE_SCHEMA):
        ipydisplay(payload, raw=True, display_id=display_id)

    return (payload, metadata)


Expand Down
40 changes: 33 additions & 7 deletions dx/formatters/utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import sys
from typing import Tuple

import pandas as pd

Expand Down Expand Up @@ -99,10 +100,11 @@ def reduce_df(df: pd.DataFrame, orig_num_rows: int = 0) -> pd.DataFrame:
def sample_columns(df: pd.DataFrame, num_cols: int) -> pd.DataFrame:
"""
Samples a dataframe to a specified number of rows
based on Settings.SAMPLING_METHOD.
based on Settings.SAMPLING_METHOD, or
Settings.COLUMN_SAMPLING_METHOD if specified.
"""
sampling = settings.SAMPLING_METHOD
if col_sampling := settings.COLUMN_SAMPLING_METHOD != sampling:
if (col_sampling := settings.COLUMN_SAMPLING_METHOD) != sampling:
sampling = col_sampling

# transposing here to treat columns like rows to take advantage of
Expand All @@ -125,10 +127,11 @@ def sample_columns(df: pd.DataFrame, num_cols: int) -> pd.DataFrame:
def sample_rows(df: pd.DataFrame, num_rows: int) -> pd.DataFrame:
"""
Samples a dataframe to a specified number of rows
based on Settings.SAMPLING_METHOD.
based on Settings.SAMPLING_METHOD, or
Settings.ROW_SAMPLING_METHOD if specified.
"""
sampling = settings.SAMPLING_METHOD
if row_sampling := settings.ROW_SAMPLING_METHOD != sampling:
if (row_sampling := settings.ROW_SAMPLING_METHOD) != sampling:
sampling = row_sampling

if sampling == DXSamplingMethod.random:
Expand Down Expand Up @@ -182,9 +185,7 @@ def sample_inner(df: pd.DataFrame, num: int) -> pd.DataFrame:
Example: sampling inner 8 of 20 rows:
[......XXXXXXXX......]
"""
# rounding down since we'll be adding one filler row
# as well as using the index
middle_index = int(len(df) / 2) - 1
middle_index = int(len(df) / 2)
inner_buffer = int(num / 2)
middle_start = middle_index - inner_buffer
middle_end = middle_index + inner_buffer
Expand Down Expand Up @@ -231,3 +232,28 @@ def stringify_multiindex(vals):

def stringify_indices(df: pd.DataFrame) -> pd.DataFrame:
    """Convert the dataframe's index values to strings.

    Reuses stringify_columns() by transposing so the index becomes
    the columns, stringifying, then transposing back.
    """
    flipped = df.transpose()
    stringified = stringify_columns(flipped)
    return stringified.transpose()


def truncate_and_describe(df: pd.DataFrame) -> Tuple[pd.DataFrame, dict]:
    """Reduce the dataframe's size, if necessary, and describe the change.

    Returns the (possibly truncated) dataframe together with a dict of
    shape/size information captured before and after truncation.

    NOTE(review): sys.getsizeof() is a shallow measure — it does not follow
    references into the column data — so the byte counts here understate
    the true footprint; presumably adequate for frontend triage.
    """
    rows_before, cols_before = df.shape
    bytes_before = sys.getsizeof(df)

    df = truncate_if_too_big(df)

    rows_after, cols_after = df.shape
    bytes_after = sys.getsizeof(df)

    dataframe_info = {
        "orig_size_bytes": bytes_before,
        "orig_num_rows": rows_before,
        "orig_num_cols": cols_before,
        "truncated_size_bytes": bytes_after,
        "truncated_num_rows": rows_after,
        "truncated_num_cols": cols_after,
    }
    return df, dataframe_info
2 changes: 1 addition & 1 deletion dx/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ class Settings(BaseSettings):
HTML_TABLE_SCHEMA: bool = False
MEDIA_TYPE: str = "application/vnd.dataresource+json"

MAX_RENDER_SIZE_BYTES: int = 1 * MB
MAX_RENDER_SIZE_BYTES: int = 100 * MB
RENDERABLE_OBJECTS: List[type] = [pd.DataFrame, np.ndarray]

# what percentage of the dataset to remove during each truncation
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "dx"
version = "1.1.0"
version = "1.1.1"
description = "Python wrapper for Data Explorer"
authors = ["Dave Shoup <[email protected]>", "Kyle Kelley <[email protected]>"]
readme = "README.md"
Expand Down

0 comments on commit 04bf5a9

Please sign in to comment.