Skip to content

SlidingWindowSplitter

yohou.model_selection.split.SlidingWindowSplitter

Bases: BaseSplitter

Sliding window time series cross-validation splitter.

Both training and test windows slide forward with fixed sizes. This is useful when recent data is more relevant than distant past (concept drift), or when simulating production scenarios with fixed-size training windows.

Parameters

Name Type Description Default
n_splits int

Number of cross-validation folds. Must be at least 2.

3
train_size int

Number of samples in each training window. When None (default), the training size is computed automatically so that exactly n_splits folds fit the data: train_size = n_samples - test_size - (n_splits - 1) * stride. When both n_splits and train_size are provided explicitly, they must be consistent with the data length; otherwise a ValueError is raised during split().

None
test_size int

Number of samples in each test window.

10
stride int

Number of samples to move forward between splits. If None, defaults to test_size (non-overlapping windows).

None

Attributes

Name Type Description
train_size_ int

Resolved training window size, set during split(). Equal to train_size when provided explicitly, or computed from n_splits and the data length.

Examples

>>> import polars as pl
>>> from datetime import datetime, timedelta
>>> from yohou.model_selection import SlidingWindowSplitter
>>>
>>> # Create time series
>>> time = [datetime(2020, 1, 1) + timedelta(days=i) for i in range(100)]
>>> y = pl.DataFrame({"time": time, "value": range(100)})
>>>
>>> # 5-fold sliding window with 10-day test windows
>>> splitter = SlidingWindowSplitter(n_splits=5, test_size=10)
>>> splits = list(splitter.split(y))
>>> len(splits)
5
>>>
>>> # First split: training size computed automatically
>>> train, test = splits[0]
>>> len(test)
10
>>>
>>> # All training windows have the same size
>>> all(len(tr) == len(splits[0][0]) for tr, _ in splits)
True

Notes

  • Training and test windows have fixed sizes
  • Windows slide forward by stride samples
  • Useful for concept drift scenarios
  • When train_size is omitted, it is recomputed from n_splits and the data length every time split() is run

See Also

  • ExpandingWindowSplitter: Growing training window splitter

Source Code

Show/Hide source
class SlidingWindowSplitter(BaseSplitter):
    """Time series cross-validation splitter with fixed-size sliding windows.

    Every fold pairs a training window with the test window that
    immediately follows it.  Both windows keep a constant length and are
    shifted forward together from one fold to the next.  This suits
    situations where recent observations matter more than the distant
    past (concept drift), or where production retraining on a fixed-size
    history is being simulated.

    Parameters
    ----------
    n_splits : int, default=3
        Number of cross-validation folds. Must be at least 2.
    train_size : int, default=None
        Number of samples in each training window.  When ``None``
        (default), the size is derived so that exactly ``n_splits``
        folds fit the data:
        ``train_size = n_samples - test_size - (n_splits - 1) * stride``.
        When ``train_size`` is given explicitly, it must be compatible
        with ``n_splits`` and the data length; otherwise a
        ``ValueError`` is raised while ``split()`` runs.
    test_size : int, default=10
        Number of samples in each test window.
    stride : int, default=None
        Number of samples to move forward between splits. If None,
        `test_size` is used (non-overlapping test windows).

    Attributes
    ----------
    train_size_ : int
        Resolved training window size, assigned while ``split()`` runs.
        Equal to ``train_size`` when that was provided explicitly,
        otherwise derived from ``n_splits`` and the data length.

    Examples
    --------
    >>> import polars as pl
    >>> from datetime import datetime, timedelta
    >>> from yohou.model_selection import SlidingWindowSplitter
    >>>
    >>> # Create time series
    >>> time = [datetime(2020, 1, 1) + timedelta(days=i) for i in range(100)]
    >>> y = pl.DataFrame({"time": time, "value": range(100)})
    >>>
    >>> # 5-fold sliding window with 10-day test windows
    >>> splitter = SlidingWindowSplitter(n_splits=5, test_size=10)
    >>> splits = list(splitter.split(y))
    >>> len(splits)
    5
    >>>
    >>> # First split: training size computed automatically
    >>> train, test = splits[0]
    >>> len(test)
    10
    >>>
    >>> # All training windows have the same size
    >>> all(len(tr) == len(splits[0][0]) for tr, _ in splits)
    True

    Notes
    -----
    - Training and test windows have fixed sizes
    - Windows slide forward by `stride` samples
    - Useful for concept drift scenarios
    - When ``train_size`` is omitted, it is recomputed from ``n_splits``
      and the data length every time ``split()`` is run
    - ``split()`` is a generator, so data validation and the assignment
      of ``train_size_`` happen when iteration starts, not when
      ``split()`` is called

    See Also
    --------
    - [`ExpandingWindowSplitter`][yohou.model_selection.split.ExpandingWindowSplitter] : Growing training window splitter

    """

    # Accepted value ranges per constructor argument; ``None`` is allowed
    # only for the two parameters that can be derived automatically.
    _parameter_constraints: dict = {
        "n_splits": [Interval(numbers.Integral, 2, None, closed="left")],
        "train_size": [Interval(numbers.Integral, 1, None, closed="left"), None],
        "test_size": [Interval(numbers.Integral, 1, None, closed="left")],
        "stride": [Interval(numbers.Integral, 1, None, closed="left"), None],
    }

    _tags: ClassVar[dict[str, Any]] = {"splitter_type": "sliding"}

    def __init__(
        self,
        n_splits: int = 3,
        *,
        train_size: int | None = None,
        test_size: int = 10,
        stride: int | None = None,
    ) -> None:
        self.n_splits = n_splits
        self.train_size = train_size
        self.test_size = test_size
        self.stride = stride

        # Reject out-of-range arguments at construction time.
        self._validate_params()

    def split(
        self,
        y: pl.DataFrame,
        X_actual: pl.DataFrame | None = None,
    ) -> Iterator[tuple[np.ndarray[Any, np.dtype[np.intp]], np.ndarray[Any, np.dtype[np.intp]]]]:
        """Yield ``(train, test)`` index pairs for each sliding-window fold.

        This is a generator: validation, the resolution of
        ``train_size_`` and any resulting error happen when iteration
        starts, not when the method is called.

        Parameters
        ----------
        y : pl.DataFrame
            Target time series used to generate train/test split indices.
            Must have a ``"time"`` column.
        X_actual : pl.DataFrame or None, default=None
            Actual features.  Not used for splitting but accepted for
            API consistency.

        Yields
        ------
        train : ndarray
            Training set row indices for that split.
        test : ndarray
            Test set row indices for that split.

        """
        y, X_actual = validate_splitter_data(self, y=y, X_actual=X_actual)

        # Store the resolved window length as a fitted attribute; it is
        # either the explicit ``train_size`` or derived from ``n_splits``.
        self.train_size_ = self._resolve_train_size(len(y))

        for test_idx in self._iter_test_indices(y, X_actual):
            # The training window is the ``train_size_`` rows that end
            # right where the test window begins.
            window_end = test_idx[0]
            window_start = window_end - self.train_size_
            yield np.arange(window_start, window_end, dtype=np.intp), test_idx

    def _iter_test_indices(
        self,
        y: pl.DataFrame,
        X_actual: pl.DataFrame | None = None,
    ) -> Iterator[np.ndarray[Any, np.dtype[np.intp]]]:
        """Yield the test-window indices of each fold, in order.

        Parameters
        ----------
        y : pl.DataFrame
            Target time series.
        X_actual : pl.DataFrame or None, default=None
            Actual features. Not used for splitting but accepted for
            API consistency.

        Yields
        ------
        test : ndarray
            Test set indices for this split.

        Raises
        ------
        ValueError
            If one training window plus one test window does not fit in
            the data, or if ``_resolve_train_size`` rejects the
            configuration.

        Warns
        -----
        UserWarning
            If ``test_size`` is not a multiple of the effective stride.

        """
        n_samples = len(y)
        train_size = self._resolve_train_size(n_samples)
        test_size = self.test_size
        step = test_size if self.stride is None else self.stride

        required = train_size + test_size
        if required > n_samples:
            raise ValueError(
                f"train_size ({train_size}) + test_size ({test_size}) = "
                f"{required} is greater than n_samples ({n_samples})."
            )

        if test_size % step != 0:
            message = (
                f"test_size={test_size} is not a multiple of "
                f"stride={step}. The last vintage in each fold will "
                f"have fewer in-test predictions, causing uneven step "
                f"representation in stepwise scoring."
            )
            warnings.warn(message, UserWarning, stacklevel=3)

        # The first test window starts right after the first training
        # window; each later one is shifted by ``step``.  At most
        # ``n_splits`` windows are produced.
        for fold in range(self.n_splits):
            start = train_size + fold * step
            stop = start + test_size
            if stop > n_samples:
                return
            yield np.arange(start, stop, dtype=np.intp)

    def _resolve_train_size(self, n_samples: int) -> int:
        """Return the training window size to use for ``n_samples`` rows.

        An explicit ``train_size`` is checked against the data length and
        ``n_splits``; a missing one is derived so that exactly
        ``n_splits`` folds fit.

        Parameters
        ----------
        n_samples : int
            Number of samples in the dataset.

        Returns
        -------
        int
            Resolved training window size.

        Raises
        ------
        ValueError
            If the computed train_size is less than 1 or if an explicit
            train_size is inconsistent with n_splits and the data length.

        """
        test_size = self.test_size
        step = test_size if self.stride is None else self.stride
        n_splits = self.n_splits
        explicit = self.train_size

        if explicit is not None:
            # Explicit size: one train + test pair must fit ...
            first_fold_span = explicit + test_size
            if first_fold_span > n_samples:
                raise ValueError(
                    f"train_size ({explicit}) + test_size ({test_size}) = "
                    f"{first_fold_span} is greater than n_samples ({n_samples})."
                )

            # ... and the rows left over must allow the requested number
            # of forward shifts.
            leftover = n_samples - first_fold_span
            max_splits = leftover // step + 1
            if n_splits > max_splits:
                raise ValueError(
                    f"Inconsistent parameters: n_splits={n_splits} but the data "
                    f"(n_samples={n_samples}) with train_size={explicit}, "
                    f"test_size={test_size}, stride={step} supports "
                    f"at most {max_splits} splits. Set train_size=None to "
                    f"auto-compute or reduce n_splits to at most {max_splits}."
                )
            return explicit

        # Automatic size: whatever remains once the last test window and
        # the (n_splits - 1) shifts have been accounted for.
        derived = n_samples - test_size - (n_splits - 1) * step
        if derived < 1:
            raise ValueError(
                f"Not enough data for n_splits={n_splits} with "
                f"test_size={test_size}, stride={step}: "
                f"computed train_size={derived} (must be >= 1). "
                f"Reduce n_splits or provide more data."
            )
        return derived

    def get_n_splits(
        self,
        y: pl.DataFrame | None = None,
        X_actual: pl.DataFrame | None = None,
    ) -> int:
        """Report how many folds this splitter is configured to produce.

        Parameters
        ----------
        y : pl.DataFrame or None, default=None
            Not used.  Accepted for API consistency.
        X_actual : pl.DataFrame or None, default=None
            Not used.  Accepted for API consistency.

        Returns
        -------
        int
            The configured ``n_splits`` value.

        """
        return self.n_splits

Methods

split(y, X_actual=None)

Generate indices to split time series data with sliding windows.

Parameters
Name Type Description Default
y DataFrame

Target time series used to generate train/test split indices. Must have a "time" column.

required
X_actual DataFrame or None

Actual features. Not used for splitting but accepted for API consistency.

None

Yields:

Name Type Description
train ndarray

Training set row indices for that split.

test ndarray

Test set row indices for that split.

Source Code
Show/Hide source
def split(
    self,
    y: pl.DataFrame,
    X_actual: pl.DataFrame | None = None,
) -> Iterator[tuple[np.ndarray[Any, np.dtype[np.intp]], np.ndarray[Any, np.dtype[np.intp]]]]:
    """Yield ``(train, test)`` index pairs for each sliding-window fold.

    This is a generator: validation, the resolution of
    ``train_size_`` and any resulting error happen when iteration
    starts, not when the method is called.

    Parameters
    ----------
    y : pl.DataFrame
        Target time series used to generate train/test split indices.
        Must have a ``"time"`` column.
    X_actual : pl.DataFrame or None, default=None
        Actual features.  Not used for splitting but accepted for
        API consistency.

    Yields
    ------
    train : ndarray
        Training set row indices for that split.
    test : ndarray
        Test set row indices for that split.

    """
    y, X_actual = validate_splitter_data(self, y=y, X_actual=X_actual)

    # Store the resolved window length as a fitted attribute; it is
    # either the explicit ``train_size`` or derived from ``n_splits``.
    self.train_size_ = self._resolve_train_size(len(y))

    for test_idx in self._iter_test_indices(y, X_actual):
        # The training window is the ``train_size_`` rows that end
        # right where the test window begins.
        window_end = test_idx[0]
        window_start = window_end - self.train_size_
        yield np.arange(window_start, window_end, dtype=np.intp), test_idx

get_n_splits(y=None, X_actual=None)

Return the number of cross-validation folds.

Parameters
Name Type Description Default
y DataFrame or None

Not used. Accepted for API consistency.

None
X_actual DataFrame or None

Not used. Accepted for API consistency.

None
Returns
Type Description
int

The number of cross-validation folds.

Source Code
Show/Hide source
def get_n_splits(
    self,
    y: pl.DataFrame | None = None,
    X_actual: pl.DataFrame | None = None,
) -> int:
    """Report how many folds this splitter is configured to produce.

    Parameters
    ----------
    y : pl.DataFrame or None, default=None
        Not used.  Accepted for API consistency.
    X_actual : pl.DataFrame or None, default=None
        Not used.  Accepted for API consistency.

    Returns
    -------
    int
        The configured ``n_splits`` value.

    """
    return self.n_splits