Skip to content

OutlierPercentileHandler

yohou.preprocessing.outlier.OutlierPercentileHandler

Bases: BaseTransformer

Handle outliers based on percentile thresholds.

Values outside the specified percentile range are either clipped to the percentile values or set to NaN. Percentiles are computed during fit.

Parameters

Name Type Description Default
low float or None

Lower percentile (0-100). Values below this percentile are handled. If None, no lower bound is applied.

None
high float or None

Upper percentile (0-100). Values above this percentile are handled. If None, no upper bound is applied.

None
strategy {"clip", "nan"}

How to handle outliers: "clip" replaces outliers with the percentile values; "nan" replaces outliers with NaN.

"clip"

Attributes

Name Type Description
thresholds_ dict

Dictionary mapping column names to (low_value, high_value) tuples.

Examples

>>> import polars as pl
>>> from datetime import datetime, timedelta
>>> import numpy as np
>>> from yohou.preprocessing import OutlierPercentileHandler
>>> np.random.seed(42)
>>> X = pl.DataFrame({
...     "time": [datetime(2020, 1, 1) + timedelta(days=i) for i in range(100)],
...     "value": np.random.randn(100).tolist(),
... })
>>> # Clip to 5th-95th percentile
>>> handler = OutlierPercentileHandler(low=5, high=95)
>>> handler.fit(X)
OutlierPercentileHandler(high=95, low=5)
>>> X_handled = handler.transform(X)
>>> "time" in X_handled.columns
True
>>> # Clip to the interquartile range (25th-75th percentile)
>>> handler = OutlierPercentileHandler(low=25, high=75)
>>> handler.fit(X)
OutlierPercentileHandler(...)
>>> X_handled = handler.transform(X)
>>> len(X_handled)
100

See Also

Source Code

Show/Hide source
class OutlierPercentileHandler(BaseTransformer):
    """Clip or blank out values that fall outside per-column percentile bounds.

    During fit, the requested lower and/or upper percentile is computed for
    every non-``"time"`` column. During transform, values outside those
    bounds are either replaced by the bound itself or replaced by NaN,
    depending on ``strategy``.

    Parameters
    ----------
    low : float or None, default=None
        Lower percentile (0-100). Values below this percentile are handled.
        If None, no lower bound is applied.
    high : float or None, default=None
        Upper percentile (0-100). Values above this percentile are handled.
        If None, no upper bound is applied.
    strategy : {"clip", "nan"}, default="clip"
        How to handle outliers:

        - "clip": Replace outliers with percentile values
        - "nan": Replace outliers with NaN

    Attributes
    ----------
    thresholds_ : dict
        Dictionary mapping column names to (low_value, high_value) tuples.
        An entry is None when the matching percentile was not requested or
        when the column had no non-null, non-NaN values at fit time.

    Examples
    --------
    >>> import polars as pl
    >>> from datetime import datetime, timedelta
    >>> import numpy as np
    >>> from yohou.preprocessing import OutlierPercentileHandler

    >>> np.random.seed(42)
    >>> X = pl.DataFrame({
    ...     "time": [datetime(2020, 1, 1) + timedelta(days=i) for i in range(100)],
    ...     "value": np.random.randn(100).tolist(),
    ... })

    >>> # Clip to 5th-95th percentile
    >>> handler = OutlierPercentileHandler(low=5, high=95)
    >>> handler.fit(X)
    OutlierPercentileHandler(high=95, low=5)
    >>> X_handled = handler.transform(X)
    >>> "time" in X_handled.columns
    True

    >>> # Clip to the interquartile range (25th-75th percentile)
    >>> handler = OutlierPercentileHandler(low=25, high=75)
    >>> handler.fit(X)  # doctest: +ELLIPSIS
    OutlierPercentileHandler(...)
    >>> X_handled = handler.transform(X)
    >>> len(X_handled)
    100

    See Also
    --------
    - [`OutlierThresholdHandler`][yohou.preprocessing.outlier.OutlierThresholdHandler] : Handle outliers based on fixed thresholds.

    """

    _valid_strategies = {"clip", "nan"}

    # Both percentiles share the same closed [0, 100] range and may be None.
    _parameter_constraints: dict = {
        "low": [Interval(numbers.Real, 0, 100, closed="both"), None],
        "high": [Interval(numbers.Real, 0, 100, closed="both"), None],
        "strategy": [StrOptions(_valid_strategies)],
    }

    _tags = {"stateful": False, "invertible": False}

    def __init__(
        self,
        low: float | None = None,
        high: float | None = None,
        strategy: str = "clip",
    ):
        self.low = low
        self.high = high
        self.strategy = strategy

    def _fit(self, X: pl.DataFrame, y: pl.DataFrame | None = None) -> None:
        """Compute the percentile bounds of every non-``"time"`` column.

        Parameters
        ----------
        X : pl.DataFrame
            Input time series; the ``"time"`` column is skipped.
        y : pl.DataFrame or None, default=None
            Not used by this method.

        Raises
        ------
        ValueError
            If both percentiles are given and ``low`` exceeds ``high``.

        """
        lower_pct = self.low
        upper_pct = self.high

        # The ordering can only be violated when both percentiles are given.
        both_given = lower_pct is not None and upper_pct is not None
        if both_given and lower_pct > upper_pct:
            raise ValueError(f"low ({self.low}) must be <= high ({self.high})")

        feature_cols = [name for name in X.columns if name != "time"]
        self.thresholds_: dict[str, tuple[float | None, float | None]] = {}

        for name in feature_cols:
            # Percentiles are taken over valid observations only.
            values = X[name].drop_nulls().drop_nans().to_numpy()
            has_values = len(values) > 0

            lower_bound = (
                float(np.percentile(values, lower_pct))
                if has_values and lower_pct is not None
                else None
            )
            upper_bound = (
                float(np.percentile(values, upper_pct))
                if has_values and upper_pct is not None
                else None
            )

            self.thresholds_[name] = (lower_bound, upper_bound)

    def _transform(self, X: pl.DataFrame) -> pl.DataFrame:
        """Apply the fitted percentile bounds to ``X``.

        Parameters
        ----------
        X : pl.DataFrame
            Validated input time series.

        Returns
        -------
        pl.DataFrame
            Transformed time series.

        """

        def bounds_for(col_name):
            # Columns without fitted thresholds get no bounds at all.
            return self.thresholds_.get(col_name, (None, None))

        return _apply_outlier_handling(X, self.strategy, bounds_for)

    def get_feature_names_out(self, input_features: list[str] | None = None) -> list[str]:
        """Return the names of the output features.

        Parameters
        ----------
        input_features : list of str or None, default=None
            Column names of the input features.  If ``None``, uses the
            feature names seen during ``fit``.

        Returns
        -------
        list of str
            Output feature names after transformation, as a new list.

        """
        check_is_fitted(self, ["feature_names_in_"])
        return list(_check_feature_names_in(self, input_features))

Methods

get_feature_names_out(input_features=None)

Get output feature names for transformation.

Parameters
Name Type Description Default
input_features list of str or None

Column names of the input features. If None, uses the feature names seen during fit.

None
Returns
Type Description
list of str

Output feature names after transformation.

Source Code
Show/Hide source
def get_feature_names_out(self, input_features: list[str] | None = None) -> list[str]:
    """Return the names of the output features.

    Parameters
    ----------
    input_features : list of str or None, default=None
        Column names of the input features.  If ``None``, uses the
        feature names seen during ``fit``.

    Returns
    -------
    list of str
        Output feature names after transformation, as a new list.

    """
    # Requires the ``feature_names_in_`` attribute to be present.
    check_is_fitted(self, ["feature_names_in_"])
    return list(_check_feature_names_in(self, input_features))

Tutorials

The following example notebooks use this component:

  • How to Clean Time Series Data


    Data-Features

    End-to-end data cleaning pipeline combining SimpleTimeImputer and SeasonalImputer for missing values with OutlierThresholdHandler for anomaly clipping.

    View · Open in marimo

  • How to Handle Outliers in a Forecasting Pipeline


    Data-Features

    Detect and clip outliers with OutlierThresholdHandler and OutlierPercentileHandler, then see how outliers affect conformal prediction intervals.

    View · Open in marimo