Skip to content

plot_distribution

yohou.plotting.exploration.plot_distribution(df, *, columns=None, n_bins=50, show_kde=True, groups=None, facet_by='member', facet_n_cols=2, color_palette=None, show_legend=True, title=None, x_label=None, y_label=None, width=None, height=None, bar_opacity=0.6, kde_width=2.5, kde_points=200, histnorm='probability density')

Plot histogram with optional KDE overlay for one or more columns.

Parameters

Name Type Description Default
df DataFrame

Input DataFrame with 'time' column and numeric columns to plot.

required
columns str | list[str] | None

Column(s) to plot. If None, uses all numeric columns except 'time'.

None
n_bins int

Number of histogram bins.

50
show_kde bool

Whether to overlay a kernel density estimate curve.

True
groups list[str] | None

Panel group prefixes to plot. Creates separate subplots per group.

None
facet_by Literal['group', 'member'] | None

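Faceting axis for panel data. "group" creates one subplot per group, "member" one per member. None currently falls back to "member" (the implementation substitutes it before building the facet grid), so it does not disable faceting. Ignored for non-panel data.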

"member"
facet_n_cols int

Number of columns in facet grid.

2
color_palette list[str] | None

Custom color palette as hex codes.

None
show_legend bool

Whether to show the legend.

True
title str | None

Plot title.

None
x_label str | None

X-axis label.

None
y_label str | None

Y-axis label.

None
width int | None

Plot width in pixels.

None
height int | None

Plot height in pixels.

None
bar_opacity float

Opacity of the histogram bars (0.0 to 1.0).

0.6
kde_width float

Width of the KDE line in pixels.

2.5
kde_points int

Number of points used to evaluate the KDE curve.

200
histnorm str

Histogram normalization mode (passed to Plotly).

"probability density"

Returns

Type Description
Figure

Plotly figure object.

Raises

Type Description
TypeError

If df is not a Polars DataFrame.

ValueError

If DataFrame is empty, missing 'time' column, or specified columns don't exist.

Examples

>>> import polars as pl
>>> from yohou.plotting import plot_distribution
>>> df = pl.DataFrame({
...     "time": pl.date_range(pl.date(2020, 1, 1), pl.date(2020, 12, 31), "1mo", eager=True),
...     "y": [100, 120, 115, 130, 140, 135, 150, 160, 155, 170, 180, 175],
... })
>>> fig = plot_distribution(df, columns="y")
>>> len(fig.data) > 0
True

See Also

plot_boxplot : Plot boxplots grouped by time periods.

plot_time_series : Plot basic time series.

Source Code

Show/Hide source
def plot_distribution(
    df: pl.DataFrame,
    *,
    columns: str | list[str] | None = None,
    n_bins: int = 50,
    show_kde: bool = True,
    groups: list[str] | None = None,
    facet_by: Literal["group", "member"] | None = "member",
    facet_n_cols: int = 2,
    color_palette: list[str] | None = None,
    show_legend: bool = True,
    title: str | None = None,
    x_label: str | None = None,
    y_label: str | None = None,
    width: int | None = None,
    height: int | None = None,
    bar_opacity: float = 0.6,
    kde_width: float = 2.5,
    kde_points: int = 200,
    histnorm: str = "probability density",
) -> go.Figure:
    """
    Plot histogram with optional KDE overlay for one or more columns.

    Parameters
    ----------
    df : pl.DataFrame
        Input DataFrame with 'time' column and numeric columns to plot.
    columns : str | list[str] | None, default=None
        Column(s) to plot. If None, uses all numeric columns except 'time'.
    n_bins : int, default=50
        Number of histogram bins.
    show_kde : bool, default=True
        Whether to overlay a kernel density estimate curve.  The curve is
        skipped for a column that has fewer than two non-null values or
        whose values have no finite, non-zero spread (e.g. a constant
        column), because a KDE cannot be fitted to such a sample.
    groups : list[str] | None, default=None
        Panel group prefixes to plot. Creates separate subplots per group.
    facet_by : Literal["group", "member"] | None, default="member"
        Faceting axis for panel data.  ``"group"`` creates one subplot per
        group, ``"member"`` one per member.  ``None`` currently falls back
        to ``"member"``.  Ignored for non-panel data.
    facet_n_cols : int, default=2
        Number of columns in facet grid.
    color_palette : list[str] | None, default=None
        Custom color palette as hex codes.
    show_legend : bool, default=True
        Whether to show the legend.
    title : str | None, default=None
        Plot title.  Defaults to ``"Distribution"``.
    x_label : str | None, default=None
        X-axis label.  Defaults to ``"Value"``.
    y_label : str | None, default=None
        Y-axis label.  Defaults to ``"Density"``.
    width : int | None, default=None
        Plot width in pixels.
    height : int | None, default=None
        Plot height in pixels.
    bar_opacity : float, default=0.6
        Opacity of the histogram bars (0.0 to 1.0).
    kde_width : float, default=2.5
        Width of the KDE line in pixels.
    kde_points : int, default=200
        Number of points used to evaluate the KDE curve.
    histnorm : str, default="probability density"
        Histogram normalization mode (passed to Plotly).

    Returns
    -------
    go.Figure
        Plotly figure object.

    Raises
    ------
    TypeError
        If df is not a Polars DataFrame.
    ValueError
        If DataFrame is empty, missing 'time' column, or specified columns don't exist.

    Examples
    --------
    >>> import polars as pl
    >>> from yohou.plotting import plot_distribution

    >>> df = pl.DataFrame({
    ...     "time": pl.date_range(pl.date(2020, 1, 1), pl.date(2020, 12, 31), "1mo", eager=True),
    ...     "y": [100, 120, 115, 130, 140, 135, 150, 160, 155, 170, 180, 175],
    ... })
    >>> fig = plot_distribution(df, columns="y")
    >>> len(fig.data) > 0
    True

    See Also
    --------
    [`plot_boxplot`][yohou.plotting.plot_boxplot] : Plot boxplots grouped by time periods.
    [`plot_time_series`][yohou.plotting.plot_time_series] : Plot basic time series.
    """
    # Validate inputs
    validate_plotting_data(df, min_rows=2)
    validate_plotting_params(width=width, height=height)

    # When the caller gave neither groups nor columns and the frame is
    # detected as panel data, switch to the panel path by making ``groups``
    # a (non-None) empty list.
    if groups is None and columns is None and _auto_detect_panel(df):
        groups = []

    if groups is not None:
        _panel_cols = resolve_panel_columns(df, groups, columns)
        _panel_colors = resolve_color_palette(color_palette, len(_panel_cols))
        _legend_tracker = LegendTracker(show_legend=show_legend)

        def _render_distribution(ctx: RenderContext) -> None:
            """Render histogram + KDE for a single panel column."""
            base = [c for c in ctx.sub_df.columns if c != "time"][0]
            values = ctx.sub_df[base].drop_nulls().to_numpy()
            _c = _panel_colors[ctx.entity_idx]
            ctx.fig.add_trace(
                go.Histogram(
                    x=values,
                    nbinsx=n_bins,
                    marker={"color": _c},
                    opacity=bar_opacity,
                    histnorm=histnorm,
                    name=ctx.display_name,
                    legendgroup=ctx.display_name,
                    showlegend=_legend_tracker.should_show(ctx.display_name),
                    hovertemplate=(
                        f"<b>{ctx.display_name}</b><br>Value: %{{x:.2f}}<br>Density: %{{y:.4f}}<extra></extra>"
                    ),
                ),
                row=ctx.row,
                col=ctx.col,
            )
            if show_kde:
                # The KDE shares the histogram's legend group so both
                # traces toggle together from a single legend entry.
                _add_kde_trace(
                    ctx.fig,
                    values,
                    row=ctx.row,
                    col=ctx.col,
                    color=_c,
                    label=ctx.display_name,
                    line_width=kde_width,
                    n_points=kde_points,
                    legendgroup=ctx.display_name,
                )

        # ``None`` is mapped to "member" here, so the panel path always facets.
        effective_facet_by = facet_by or "member"
        fig = facet_figure(
            df,
            _render_distribution,
            groups=groups,
            columns=columns,
            facet_by=effective_facet_by,
            facet_n_cols=facet_n_cols,
            title=title or "Distribution",
            x_label=x_label or "Value",
            y_label=y_label or "Density",
            width=width,
            height=height,
            shared_xaxes=False,
        )
        fig.update_layout(showlegend=show_legend)
        return fig

    # Non-panel case: column-mode facet_figure
    plot_columns = validate_plotting_data(df, columns=columns, exclude=["time"], include_categorical=True)
    _colors = resolve_color_palette(color_palette, len(plot_columns))
    _col_colors = dict(zip(plot_columns, _colors, strict=False))

    def _render_distribution(ctx: RenderContext) -> None:
        """Render histogram + optional KDE for one column into a subplot."""
        base = ctx.display_name
        col_color = _col_colors[base]
        series = ctx.sub_df[base].drop_nulls()
        if is_categorical_dtype(series.dtype):
            # Categorical columns get a bar chart of category counts,
            # most frequent first; no KDE is drawn for them.
            counts = series.value_counts().sort("count", descending=True)
            cat_col = [c for c in counts.columns if c != "count"][0]
            ctx.fig.add_trace(
                go.Bar(
                    x=[str(v) for v in counts[cat_col].to_list()],
                    y=counts["count"].to_list(),
                    marker={"color": col_color},
                    opacity=bar_opacity,
                    name=base,
                    hovertemplate=f"<b>{base}</b><br>%{{x}}<br>Count: %{{y}}<extra></extra>",
                ),
                row=ctx.row,
                col=ctx.col,
            )
            return

        values = series.to_numpy()
        ctx.fig.add_trace(
            go.Histogram(
                x=values,
                nbinsx=n_bins,
                marker={"color": col_color},
                opacity=bar_opacity,
                histnorm=histnorm,
                name=base,
                hovertemplate=(f"<b>{base}</b><br>Value: %{{x:.2f}}<br>Density: %{{y:.4f}}<extra></extra>"),
            ),
            row=ctx.row,
            col=ctx.col,
        )
        if show_kde:
            _add_kde_trace(
                ctx.fig,
                values,
                row=ctx.row,
                col=ctx.col,
                color=col_color,
                label=base,
                line_width=kde_width,
                n_points=kde_points,
            )

    fig = facet_figure(
        df,
        _render_distribution,
        columns=plot_columns,
        facet_n_cols=facet_n_cols,
        title=title or "Distribution",
        x_label=x_label or "Value",
        y_label=y_label or "Density",
        width=width,
        height=height,
        shared_xaxes=False,
    )
    fig.update_layout(showlegend=show_legend)

    return fig


def _add_kde_trace(
    fig: go.Figure,
    values: np.ndarray,
    *,
    row: int,
    col: int,
    color: str,
    label: str,
    line_width: float,
    n_points: int,
    legendgroup: str | None = None,
) -> None:
    """Overlay a Gaussian KDE line for ``values`` on one subplot of ``fig``.

    The overlay is optional decoration, so samples a KDE cannot be fitted to
    are skipped silently instead of failing the whole figure.

    Parameters
    ----------
    fig : go.Figure
        Figure that receives the line trace.
    values : np.ndarray
        One-dimensional numeric sample with nulls already removed.
    row, col : int
        Subplot position passed through to ``fig.add_trace``.
    color : str
        Line color.
    label : str
        Series name; the trace is named ``"<label> KDE"``.
    line_width : float
        Line width in pixels.
    n_points : int
        Number of evenly spaced points, between the sample minimum and
        maximum, at which the KDE is evaluated.
    legendgroup : str | None, default=None
        Legend group for the trace.  Left unset when None.
    """
    if len(values) < 2:
        return

    lo, hi = float(values.min()), float(values.max())
    # A sample with zero spread has a singular covariance, which makes
    # gaussian_kde raise LinAlgError.  The finiteness checks also reject
    # NaN/inf bounds, for which no meaningful evaluation grid exists.
    if not (np.isfinite(lo) and np.isfinite(hi) and lo < hi):
        return

    from scipy.stats import gaussian_kde  # noqa: PLC0415

    try:
        kde = gaussian_kde(values)
    except np.linalg.LinAlgError:
        # Numerically degenerate sample that slipped past the range check.
        return

    x_range = np.linspace(lo, hi, n_points)
    # Only forward legendgroup when given so that the non-panel trace is
    # built with exactly the same properties as before.
    extra = {} if legendgroup is None else {"legendgroup": legendgroup}
    fig.add_trace(
        go.Scatter(
            x=x_range,
            y=kde(x_range),
            mode="lines",
            line={"color": color, "width": line_width},
            name=f"{label} KDE",
            showlegend=False,
            hovertemplate=(f"<b>{label} KDE</b><br>Value: %{{x:.2f}}<br>Density: %{{y:.4f}}<extra></extra>"),
            **extra,
        ),
        row=row,
        col=col,
    )

Tutorials

The following example notebooks use this component:

  • Exploratory Visualization


    Visualization

    Exploratory time series visualisation with raw series plots, rolling statistics overlays, seasonal overlays, subseries diagnostics, distribution boxplots, missing data pattern auditing, outlier detection, and resampling comparison.

    View · Open in marimo