ENH: Fix by in DataFrame.plot.hist and DataFrame.plot.box (#28373)

charlesdong1991 · web-flow · commit d3a018d770a8 · 2021-07-12T09:10:44.000-04:00
diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
@@ -29,6 +29,7 @@ enhancement2
 
 Other enhancements
 ^^^^^^^^^^^^^^^^^^
+- Add support for assigning values to ``by`` argument in :meth:`DataFrame.plot.hist` and :meth:`DataFrame.plot.box` (:issue:`15079`)
 - :meth:`Series.sample`, :meth:`DataFrame.sample`, and :meth:`.GroupBy.sample` now accept a ``np.random.Generator`` as input to ``random_state``. A generator will be more performant, especially with ``replace=False`` (:issue:`38100`)
 -  Additional options added to :meth:`.Styler.bar` to control alignment and display (:issue:`26070`)
 - :meth:`Series.ewm`, :meth:`DataFrame.ewm`, now support a ``method`` argument with a ``'table'`` option that performs the windowing operation over an entire :class:`DataFrame`. See :ref:`Window Overview <window.overview>` for performance and functional benefits (:issue:`42273`)
diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py
@@ -1237,6 +1237,11 @@ def box(self, by=None, **kwargs):
         ----------
         by : str or sequence
             Column in the DataFrame to group by.
+
+            .. versionchanged:: 1.4.0
+
+               Previously, `by` is silently ignore and makes no groupings
+
         **kwargs
             Additional keywords are documented in
             :meth:`DataFrame.plot`.
@@ -1278,6 +1283,11 @@ def hist(self, by=None, bins=10, **kwargs):
         ----------
         by : str or sequence, optional
             Column in the DataFrame to group by.
+
+            .. versionchanged:: 1.4.0
+
+               Previously, `by` is silently ignore and makes no groupings
+
         bins : int, default 10
             Number of histogram bins to be used.
         **kwargs
@@ -1309,6 +1319,16 @@ def hist(self, by=None, bins=10, **kwargs):
             ...     columns = ['one'])
             >>> df['two'] = df['one'] + np.random.randint(1, 7, 6000)
             >>> ax = df.plot.hist(bins=12, alpha=0.5)
+
+        A grouped histogram can be generated by providing the parameter `by` (which
+        can be a column name, or a list of column names):
+
+        .. plot::
+            :context: close-figs
+
+            >>> age_list = [8, 10, 12, 14, 72, 74, 76, 78, 20, 25, 30, 35, 60, 85]
+            >>> df = pd.DataFrame({"gender": list("MMMMMMMMFFFFFF"), "age": age_list})
+            >>> ax = df.plot.hist(column=["age"], by="gender", figsize=(10, 8))
         """
         return self(kind="hist", by=by, bins=bins, **kwargs)
 
diff --git a/pandas/plotting/_matplotlib/boxplot.py b/pandas/plotting/_matplotlib/boxplot.py
@@ -18,6 +18,7 @@
     LinePlot,
     MPLPlot,
 )
+from pandas.plotting._matplotlib.groupby import create_iter_data_given_by
 from pandas.plotting._matplotlib.style import get_standard_colors
 from pandas.plotting._matplotlib.tools import (
     create_subplots,
@@ -135,18 +136,37 @@ def _make_plot(self):
         if self.subplots:
             self._return_obj = pd.Series(dtype=object)
 
-            for i, (label, y) in enumerate(self._iter_data()):
+            # Re-create iterated data if `by` is assigned by users
+            data = (
+                create_iter_data_given_by(self.data, self._kind)
+                if self.by is not None
+                else self.data
+            )
+
+            for i, (label, y) in enumerate(self._iter_data(data=data)):
                 ax = self._get_ax(i)
                 kwds = self.kwds.copy()
 
+                # When by is applied, show title for subplots to know which group it is
+                # just like df.boxplot, and need to apply T on y to provide right input
+                if self.by is not None:
+                    y = y.T
+                    ax.set_title(pprint_thing(label))
+
+                    # When `by` is assigned, the ticklabels will become unique grouped
+                    # values, instead of label which is used as subtitle in this case.
+                    ticklabels = [
+                        pprint_thing(col) for col in self.data.columns.levels[0]
+                    ]
+                else:
+                    ticklabels = [pprint_thing(label)]
+
                 ret, bp = self._plot(
                     ax, y, column_num=i, return_type=self.return_type, **kwds
                 )
                 self.maybe_color_bp(bp)
                 self._return_obj[label] = ret
-
-                label = [pprint_thing(label)]
-                self._set_ticklabels(ax, label)
+                self._set_ticklabels(ax, ticklabels)
         else:
             y = self.data.values.T
             ax = self._get_ax(0)
diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py
@@ -9,6 +9,7 @@
 from matplotlib.artist import Artist
 import numpy as np
 
+from pandas._typing import IndexLabel
 from pandas.errors import AbstractMethodError
 from pandas.util._decorators import cache_readonly
 
@@ -38,10 +39,12 @@
 )
 
 import pandas.core.common as com
+from pandas.core.frame import DataFrame
 
 from pandas.io.formats.printing import pprint_thing
 from pandas.plotting._matplotlib.compat import mpl_ge_3_0_0
 from pandas.plotting._matplotlib.converter import register_pandas_matplotlib_converters
+from pandas.plotting._matplotlib.groupby import reconstruct_data_with_by
 from pandas.plotting._matplotlib.style import get_standard_colors
 from pandas.plotting._matplotlib.timeseries import (
     decorate_axes,
@@ -99,7 +102,7 @@ def __init__(
         self,
         data,
         kind=None,
-        by=None,
+        by: IndexLabel | None = None,
         subplots=False,
         sharex=None,
         sharey=False,
@@ -124,13 +127,42 @@ def __init__(
         table=False,
         layout=None,
         include_bool=False,
+        column: IndexLabel | None = None,
         **kwds,
     ):
 
         import matplotlib.pyplot as plt
 
         self.data = data
-        self.by = by
+
+        # if users assign an empty list or tuple, raise `ValueError`
+        # similar to current `df.box` and `df.hist` APIs.
+        if by in ([], ()):
+            raise ValueError("No group keys passed!")
+        self.by = com.maybe_make_list(by)
+
+        # Assign the rest of columns into self.columns if by is explicitly defined
+        # while column is not, only need `columns` in hist/box plot when it's DF
+        # TODO: Might deprecate `column` argument in future PR (#28373)
+        if isinstance(data, DataFrame):
+            if column:
+                self.columns = com.maybe_make_list(column)
+            else:
+                if self.by is None:
+                    self.columns = [
+                        col for col in data.columns if is_numeric_dtype(data[col])
+                    ]
+                else:
+                    self.columns = [
+                        col
+                        for col in data.columns
+                        if col not in self.by and is_numeric_dtype(data[col])
+                    ]
+
+        # For `hist` plot, need to get grouped original data before `self.data` is
+        # updated later
+        if self.by is not None and self._kind == "hist":
+            self._grouped = data.groupby(self.by)
 
         self.kind = kind
 
@@ -139,7 +171,9 @@ def __init__(
         self.subplots = subplots
 
         if sharex is None:
-            if ax is None:
+
+            # if by is defined, subplots are used and sharex should be False
+            if ax is None and by is None:
                 self.sharex = True
             else:
                 # if we get an axis, the users should do the visibility
@@ -273,8 +307,15 @@ def _iter_data(self, data=None, keep_index=False, fillna=None):
 
     @property
     def nseries(self) -> int:
+
+        # When `by` is explicitly assigned, grouped data size will be defined, and
+        # this will determine number of subplots to have, aka `self.nseries`
         if self.data.ndim == 1:
             return 1
+        elif self.by is not None and self._kind == "hist":
+            return len(self._grouped)
+        elif self.by is not None and self._kind == "box":
+            return len(self.columns)
         else:
             return self.data.shape[1]
 
@@ -420,6 +461,14 @@ def _compute_plot_data(self):
             if label is None and data.name is None:
                 label = "None"
             data = data.to_frame(name=label)
+        elif self._kind in ("hist", "box"):
+            cols = self.columns if self.by is None else self.columns + self.by
+            data = data.loc[:, cols]
+
+        # GH15079 reconstruct data if by is defined
+        if self.by is not None:
+            self.subplots = True
+            data = reconstruct_data_with_by(self.data, by=self.by, cols=self.columns)
 
         # GH16953, _convert is needed as fallback, for ``Series``
         # with ``dtype == object``
diff --git a/pandas/plotting/_matplotlib/groupby.py b/pandas/plotting/_matplotlib/groupby.py
@@ -0,0 +1,127 @@
+from __future__ import annotations
+
+import numpy as np
+
+from pandas._typing import (
+    Dict,
+    IndexLabel,
+)
+
+from pandas.core.dtypes.missing import remove_na_arraylike
+
+from pandas import (
+    DataFrame,
+    MultiIndex,
+    Series,
+    concat,
+)
+
+
+def create_iter_data_given_by(
+    data: DataFrame, kind: str = "hist"
+) -> Dict[str, DataFrame | Series]:
+    """
+    Create data for iteration given `by` is assigned or not, and it is only
+    used in both hist and boxplot.
+
+    If `by` is assigned, return a dictionary of DataFrames in which the key of
+    dictionary is the values in groups.
+    If `by` is not assigned, return input as is, and this preserves current
+    status of iter_data.
+
+    Parameters
+    ----------
+    data : reformatted grouped data from `_compute_plot_data` method.
+    kind : str, plot kind. This function is only used for `hist` and `box` plots.
+
+    Returns
+    -------
+    iter_data : DataFrame or Dictionary of DataFrames
+
+    Examples
+    --------
+    If `by` is assigned:
+
+    >>> import numpy as np
+    >>> tuples = [('h1', 'a'), ('h1', 'b'), ('h2', 'a'), ('h2', 'b')]
+    >>> mi = MultiIndex.from_tuples(tuples)
+    >>> value = [[1, 3, np.nan, np.nan],
+    ...          [3, 4, np.nan, np.nan], [np.nan, np.nan, 5, 6]]
+    >>> data = DataFrame(value, columns=mi)
+    >>> create_iter_data_given_by(data)
+    {'h1': DataFrame({'a': [1, 3, np.nan], 'b': [3, 4, np.nan]}),
+     'h2': DataFrame({'a': [np.nan, np.nan, 5], 'b': [np.nan, np.nan, 6]})}
+    """
+
+    # For `hist` plot, before transformation, the values in level 0 are values
+    # in groups and subplot titles, and later used for column subselection and
+    # iteration; For `box` plot, values in level 1 are column names to show,
+    # and are used for iteration and as subplots titles.
+    if kind == "hist":
+        level = 0
+    else:
+        level = 1
+
+    # Select sub-columns based on the value of level of MI, and if `by` is
+    # assigned, data must be a MI DataFrame
+    assert isinstance(data.columns, MultiIndex)
+    return {
+        col: data.loc[:, data.columns.get_level_values(level) == col]
+        for col in data.columns.levels[level]
+    }
+
+
+def reconstruct_data_with_by(
+    data: DataFrame, by: IndexLabel, cols: IndexLabel
+) -> DataFrame:
+    """
+    Internal function to group data, and reassign multiindex column names onto the
+    result in order to let grouped data be used in _compute_plot_data method.
+
+    Parameters
+    ----------
+    data : Original DataFrame to plot
+    by : grouped `by` parameter selected by users
+    cols : columns of data set (excluding columns used in `by`)
+
+    Returns
+    -------
+    Output is the reconstructed DataFrame with MultiIndex columns. The first level
+    of MI is unique values of groups, and second level of MI is the columns
+    selected by users.
+
+    Examples
+    --------
+    >>> d = {'h': ['h1', 'h1', 'h2'], 'a': [1, 3, 5], 'b': [3, 4, 6]}
+    >>> df = DataFrame(d)
+    >>> reconstruct_data_with_by(df, by='h', cols=['a', 'b'])
+       h1      h2
+       a   b   a   b
+    0  1   3   NaN NaN
+    1  3   4   NaN NaN
+    2  NaN NaN 5   6
+    """
+    grouped = data.groupby(by)
+
+    data_list = []
+    for key, group in grouped:
+        columns = MultiIndex.from_product([[key], cols])
+        sub_group = group[cols]
+        sub_group.columns = columns
+        data_list.append(sub_group)
+
+    data = concat(data_list, axis=1)
+    return data
+
+
+def reformat_hist_y_given_by(
+    y: Series | np.ndarray, by: IndexLabel | None
+) -> Series | np.ndarray:
+    """Internal function to reformat y given `by` is applied or not for hist plot.
+
+    If by is None, input y is 1-d with NaN removed; and if by is not None, groupby
+    will take place and input y is multi-dimensional array.
+    """
+    if by is not None and len(y.shape) > 1:
+        return np.array([remove_na_arraylike(col) for col in y.T]).T
+    return remove_na_arraylike(y)
diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py
diff --git a/pandas/tests/plotting/frame/test_hist_box_by.py b/pandas/tests/plotting/frame/test_hist_box_by.py