Skip to content

plot_missing_data

yohou.plotting.exploration.plot_missing_data(df, *, columns=None, kind='heatmap', groups=None, facet_by='member', facet_n_cols=2, color_palette=None, color_missing='#DC2626', color_present='#059669', show_legend=True, title=None, x_label=None, y_label=None, width=None, height=None, show_percentages=True, time_aggregation=None, sampling_interval=None)

Visualize missing data patterns over time.

Parameters

Name Type Description Default
df DataFrame

Input DataFrame with 'time' column and numeric columns.

required
columns str | list[str] | None

Column(s) to check for missing data. If None, checks all columns except 'time'.

None
kind (heatmap, bars, matrix)

Visualization kind: "heatmap" draws a time x columns grid showing missing/present; "bars" draws a bar chart of the missing percentage per column; "matrix" draws a binary matrix (missingno-style, time on x-axis).

"heatmap"
groups list[str] | None

Panel group prefixes to plot.

None
facet_by Literal['group', 'member'] | None

Faceting axis for panel data. "group" creates one subplot per group, "member" one per member. None disables faceting. Ignored for non-panel data.

"member"
facet_n_cols int

Number of columns in facet grid.

2
color_palette list[str] | None

Custom colour hex codes used for bars traces. When None the default yohou palette is used. Heatmap/matrix kinds always use color_missing / color_present.

None
color_missing str

Color for missing values (red).

"#DC2626"
color_present str

Color for present values (green).

"#059669"
show_legend bool

Whether to show the legend.

True
title str | None

Plot title.

None
x_label str | None

X-axis label.

None
y_label str | None

Y-axis label.

None
width int | None

Plot width in pixels.

None
height int | None

Plot height in pixels.

None
show_percentages bool

Whether to display percentage labels on bars.

True
time_aggregation str | None

Aggregate to time periods before checking. Polars duration string (e.g. "1d", "1w", "1mo").

None
sampling_interval str | None

Expected sampling frequency of the time series (e.g. "1h", "1d"). When provided, the DataFrame is reindexed to the full expected time range before counting missing values, so that absent timestamps (gap rows) are also detected as missing. Only fixed-length intervals (convertible to timedelta) are supported; variable-length intervals such as "1mo" raise ValueError.

None

Returns

Type Description
Figure

Plotly figure object.

Examples

>>> import polars as pl
>>> from yohou.plotting import plot_missing_data
>>> # Create sample data with missing values
>>> df = pl.DataFrame({
...     "time": pl.date_range(pl.date(2020, 1, 1), pl.date(2020, 1, 10), "1d", eager=True),
...     "y": [10, None, 30, 40, None, 60, 70, 80, None, 100],
... })
>>> # Bar chart
>>> fig = plot_missing_data(df, kind="bars")
>>> len(fig.data) > 0
True

See Also

plot_time_series : Plot basic time series.

Source Code

Show/Hide source
 887
 888
 889
 890
 891
 892
 893
 894
 895
 896
 897
 898
 899
 900
 901
 902
 903
 904
 905
 906
 907
 908
 909
 910
 911
 912
 913
 914
 915
 916
 917
 918
 919
 920
 921
 922
 923
 924
 925
 926
 927
 928
 929
 930
 931
 932
 933
 934
 935
 936
 937
 938
 939
 940
 941
 942
 943
 944
 945
 946
 947
 948
 949
 950
 951
 952
 953
 954
 955
 956
 957
 958
 959
 960
 961
 962
 963
 964
 965
 966
 967
 968
 969
 970
 971
 972
 973
 974
 975
 976
 977
 978
 979
 980
 981
 982
 983
 984
 985
 986
 987
 988
 989
 990
 991
 992
 993
 994
 995
 996
 997
 998
 999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
def plot_missing_data(
    df: pl.DataFrame,
    *,
    columns: str | list[str] | None = None,
    kind: Literal["heatmap", "bars", "matrix"] = "heatmap",
    groups: list[str] | None = None,
    facet_by: Literal["group", "member"] | None = "member",
    facet_n_cols: int = 2,
    color_palette: list[str] | None = None,
    color_missing: str = "#DC2626",
    color_present: str = "#059669",
    show_legend: bool = True,
    title: str | None = None,
    x_label: str | None = None,
    y_label: str | None = None,
    width: int | None = None,
    height: int | None = None,
    show_percentages: bool = True,
    time_aggregation: str | None = None,
    sampling_interval: str | None = None,
) -> go.Figure:
    """
    Visualize missing data patterns over time.

    Parameters
    ----------
    df : pl.DataFrame
        Input DataFrame with 'time' column and numeric columns.
    columns : str | list[str] | None, default=None
        Column(s) to check for missing data. If None, checks all columns except 'time'.
    kind : {"heatmap", "bars", "matrix"}, default="heatmap"
        Visualization kind:
        - "heatmap": time x columns grid showing missing/present
        - "bars": bar chart of missing percentage per column
        - "matrix": binary matrix (missingno-style, time on x-axis)
    groups : list[str] | None, default=None
        Panel group prefixes to plot.  When both ``groups`` and ``columns``
        are ``None`` and the frame is auto-detected as panel data, the panel
        code path is used.
    facet_by : Literal["group", "member"] | None, default="member"
        Faceting axis for panel data.  ``"group"`` creates one subplot per
        group, ``"member"`` one per member.  In the panel code path ``None``
        falls back to ``"member"``.  Ignored for non-panel data.
    facet_n_cols : int, default=2
        Number of columns in facet grid.
    color_palette : list[str] | None, default=None
        Custom colour hex codes used for bars traces. When ``None``
        the default yohou palette is used.  Heatmap/matrix kinds
        always use ``color_missing`` / ``color_present``.
    color_missing : str, default="#DC2626"
        Color for missing values (red).
    color_present : str, default="#059669"
        Color for present values (green).
    show_legend : bool, default=True
        Whether to show the legend.
    title : str | None, default=None
        Plot title.  Defaults to ``"Missing Data"``.
    x_label : str | None, default=None
        X-axis label.  Defaults to ``"Column"`` for bars and ``"Time"`` for
        the non-panel heatmap/matrix.
    y_label : str | None, default=None
        Y-axis label.  Defaults to ``"Missing (%)"`` for bars and
        ``"Column"`` for the non-panel heatmap/matrix.
    width : int | None, default=None
        Plot width in pixels.
    height : int | None, default=None
        Plot height in pixels.
    show_percentages : bool, default=True
        Whether to display percentage labels on bars.
    time_aggregation : str | None, default=None
        Aggregate to time periods before checking. Polars duration string
        (e.g. ``"1d"``, ``"1w"``, ``"1mo"``).  For non-panel data this is
        only applied when ``kind="heatmap"``: a period is flagged as missing
        when any row inside it is null.  For panel data the value is
        forwarded to the panel heatmap/matrix renderer.
    sampling_interval : str | None, default=None
        Expected sampling frequency of the time series (e.g. ``"1h"``,
        ``"1d"``).  When provided, the DataFrame is reindexed to the full
        expected time range before counting missing values, so that absent
        timestamps (gap rows) are also detected as missing.  Only
        fixed-length intervals (convertible to ``timedelta``) are supported;
        variable-length intervals such as ``"1mo"`` raise ``ValueError``.

    Returns
    -------
    go.Figure
        Plotly figure object.

    Raises
    ------
    ValueError
        If ``sampling_interval`` is a variable-length interval, or if
        ``kind`` is not one of ``"heatmap"``, ``"bars"`` or ``"matrix"``.

    Examples
    --------
    >>> import polars as pl
    >>> from yohou.plotting import plot_missing_data

    >>> # Create sample data with missing values
    >>> df = pl.DataFrame({
    ...     "time": pl.date_range(pl.date(2020, 1, 1), pl.date(2020, 1, 10), "1d", eager=True),
    ...     "y": [10, None, 30, 40, None, 60, 70, 80, None, 100],
    ... })

    >>> # Bar chart
    >>> fig = plot_missing_data(df, kind="bars")
    >>> len(fig.data) > 0
    True

    See Also
    --------
    [`plot_time_series`][yohou.plotting.plot_time_series] : Plot basic time series.
    """
    # Validate inputs
    validate_plotting_data(df)
    validate_plotting_params(width=width, height=height)

    # Reindex to full time range when sampling_interval is given so that
    # absent timestamps (gap rows) appear as nulls in the analysis.
    if sampling_interval is not None:
        td = interval_to_timedelta(sampling_interval)
        if td is None:
            msg = (
                f"sampling_interval={sampling_interval!r} is a variable-length "
                "interval and cannot be used for reindexing. Use a fixed-length "
                "interval such as '1h' or '1d'."
            )
            raise ValueError(msg)
        # NOTE(review): the generated range is joined on "time" as-is; this
        # presumably requires df["time"] to have the same dtype as the range
        # produced by pl.datetime_range - confirm for Date / non-default
        # time-unit columns.
        full_range = pl.DataFrame({
            "time": pl.datetime_range(
                df["time"].min(),
                df["time"].max(),
                interval=sampling_interval,
                eager=True,
            ),  # ty: ignore[no-matching-overload]
        })
        df = full_range.join(df, on="time", how="left")

    # An empty ``groups`` list switches to the panel code path without
    # restricting which groups are plotted.
    if groups is None and columns is None and _auto_detect_panel(df):
        groups = []

    if groups is not None:
        effective_facet_by = facet_by or "member"

        if kind == "bars":
            tracker = LegendTracker(show_legend)
            color_mgr = PanelColorManager(color_palette)

            def _render_missing(ctx: RenderContext) -> None:
                """Render missing data count bar chart for a single column."""
                base = [c for c in ctx.sub_df.columns if c != "time"][0]
                total = len(ctx.sub_df)
                mc = ctx.sub_df[base].null_count()
                pct = (mc / total) * 100 if total > 0 else 0
                text = f"{pct:.1f}%" if show_percentages else None
                ctx.fig.add_trace(
                    go.Bar(
                        x=[base],
                        y=[pct],
                        name=ctx.display_name,
                        legendgroup=ctx.display_name,
                        showlegend=tracker.should_show(ctx.display_name),
                        marker={"color": color_mgr.get_color(ctx.display_name)},
                        text=[text] if text else None,
                        textposition="auto" if text else None,
                    ),
                    row=ctx.row,
                    col=ctx.col,
                )

            return facet_figure(
                df,
                _render_missing,
                groups=groups,
                columns=columns,
                facet_by=effective_facet_by,
                facet_n_cols=facet_n_cols,
                title=title or "Missing Data",
                x_label=x_label or "Column",
                y_label=y_label or "Missing (%)",
                width=width,
                height=height,
                shared_xaxes=False,
            )

        # Panel heatmap / matrix - custom subplot logic
        if kind not in ("heatmap", "matrix"):
            msg = f"Unknown kind: {kind}. Valid options: heatmap, bars, matrix"
            raise ValueError(msg)
        return _panel_heatmap_missing(
            df,
            kind=kind,
            groups=groups,
            columns=columns,
            facet_by=effective_facet_by,
            facet_n_cols=facet_n_cols,
            color_missing=color_missing,
            color_present=color_present,
            time_aggregation=time_aggregation,
            title=title,
            x_label=x_label,
            y_label=y_label,
            width=width,
            height=height,
        )

    # Resolve columns
    plot_columns = validate_plotting_data(df, columns=columns, exclude=["time"], include_categorical=True)

    if kind == "bars":
        # Column-mode facet_figure for bar chart
        _colors = resolve_color_palette(color_palette, len(plot_columns))
        _col_colors = dict(zip(plot_columns, _colors, strict=False))
        total_rows = len(df)

        def _render_missing(ctx: RenderContext) -> None:
            """Render missing-data bar for one column into a subplot."""
            base = ctx.display_name
            col_color = _col_colors[base]
            missing = ctx.sub_df[base].null_count()
            # Guard against an empty frame, mirroring the panel renderer.
            pct = (missing / total_rows) * 100 if total_rows > 0 else 0
            text = f"{pct:.1f}%" if show_percentages else None
            ctx.fig.add_trace(
                go.Bar(
                    x=[base],
                    y=[pct],
                    name=base,
                    marker={"color": col_color},
                    text=[text] if text else None,
                    textposition="auto" if text else None,
                    hovertemplate="<b>%{x}</b><br>Missing: %{y:.1f}%<extra></extra>",
                ),
                row=ctx.row,
                col=ctx.col,
            )

        fig = facet_figure(
            df,
            _render_missing,
            columns=plot_columns,
            facet_n_cols=facet_n_cols,
            title=title or "Missing Data",
            x_label=x_label or "Column",
            y_label=y_label or "Missing (%)",
            width=width,
            height=height,
            shared_xaxes=False,
        )
        fig.update_layout(showlegend=show_legend)
        return fig

    if kind == "heatmap" and time_aggregation:
        # One cell per (column, period): 1 if any row in the period is null.
        x_vals, z_data = _missing_flags_by_period(df, plot_columns, time_aggregation)
    elif kind in ("heatmap", "matrix"):
        # One cell per (column, timestamp): 1=missing, 0=present.
        x_vals = df["time"].to_list()
        z_data = _missing_flags_by_row(df, plot_columns)
    else:
        msg = f"Unknown kind: {kind}. Valid options: heatmap, bars, matrix"
        raise ValueError(msg)

    fig = go.Figure()
    fig.add_trace(
        _missing_heatmap_trace(
            z_data,
            x_vals,
            plot_columns,
            color_missing=color_missing,
            color_present=color_present,
        )
    )

    if x_label is None:
        x_label = "Time"
    if y_label is None:
        y_label = "Column"

    # Apply layout
    fig = apply_default_layout(
        fig,
        title=title or "Missing Data",
        x_label=x_label,
        y_label=y_label,
        width=width,
        height=height,
    )
    fig.update_layout(showlegend=show_legend)

    return fig


def _missing_flags_by_row(df: pl.DataFrame, columns: list[str]) -> list[list[int]]:
    """Return, per column, a 0/1 list marking which rows of ``df`` are null."""
    return [df[col].is_null().cast(pl.Int8).to_list() for col in columns]


def _missing_flags_by_period(
    df: pl.DataFrame,
    columns: list[str],
    time_aggregation: str,
) -> tuple[list[str], list[list[int]]]:
    """Return period labels and, per column, 0/1 flags for periods containing a null."""
    df_agg = df.with_columns([pl.col("time").dt.truncate(time_aggregation).alias("period")])
    periods = df_agg.select("period").unique().sort("period")["period"].to_list()
    # Slice the frame once per period; the slice is independent of the column
    # being inspected, so it is reused for every column below.
    period_frames = [df_agg.filter(pl.col("period") == period) for period in periods]
    z_data = [
        [1 if frame[col].null_count() > 0 else 0 for frame in period_frames]
        for col in columns
    ]
    return [str(p) for p in periods], z_data


def _missing_heatmap_trace(
    z_data: list[list[int]],
    x_vals: list,
    columns: list[str],
    *,
    color_missing: str,
    color_present: str,
) -> go.Heatmap:
    """Build the binary missing/present heatmap trace shared by heatmap and matrix kinds."""
    # Custom text matrix so the hover shows "Missing"/"Present" instead of 1/0.
    text_data = [["Missing" if v == 1 else "Present" for v in row] for row in z_data]
    return go.Heatmap(
        z=z_data,
        x=x_vals,
        y=columns,
        colorscale=[[0, color_present], [1, color_missing]],
        # Pin the colour range to the 0/1 encoding. With auto-scaling, a
        # uniform matrix (nothing missing, or everything missing) has a
        # degenerate range and its cells would not map onto the colorscale
        # end points.
        zmin=0,
        zmax=1,
        showscale=False,
        customdata=text_data,
        hovertemplate="<b>%{y}</b><br>%{x}<br>%{customdata}<extra></extra>",
    )

Tutorials

The following example notebooks use this component:

  • How to Clean Time Series Data


    Data-Features

    End-to-end data cleaning pipeline combining SimpleTimeImputer and SeasonalImputer for missing values with OutlierThresholdHandler for anomaly clipping.

    View · Open in marimo

  • Exploratory Visualization


    Visualization

    Exploratory time series visualisation with raw series plots, rolling statistics overlays, seasonal overlays, subseries diagnostics, distribution boxplots, missing data pattern auditing, outlier detection, and resampling comparison.

    View · Open in marimo