Skip to content

parse_tsf

yohou.datasets._tsf_parser.parse_tsf(source, *, value_column_name='value', n_series=None)

Parse a Monash .tsf file into a wide polars DataFrame.

Parameters

Name Type Description Default
source str or file-like

Path to a .tsf file, or an open file-like object (binary mode).

required
value_column_name str

Name for the value column(s). For panel data, column names become "{series_name}__{value_column_name}".

'value'
n_series int or None

Maximum number of series to parse. None parses all series. Use this to limit memory consumption for large datasets.

None

Returns

Type Description
tuple of (pl.DataFrame, dict)

A tuple of (dataframe, metadata) where dataframe has a "time" column (Datetime) and one column per series, and metadata is a dict with keys "frequency", "horizon", "missing", "equallength", "relation", "n_series".

Source Code

Show/Hide source
def parse_tsf(
    source: str | IO[bytes],
    *,
    value_column_name: str = "value",
    n_series: int | None = None,
) -> tuple[pl.DataFrame, dict]:
    """Read a Monash ``.tsf`` file and return it as a wide polars DataFrame.

    The header is parsed first, then the data lines are read from the same
    line iterator, and finally the collected series are assembled into a
    single frame.

    Parameters
    ----------
    source : str or file-like
        Path to a ``.tsf`` file, or an open file-like object (binary mode).
    value_column_name : str
        Name for the value column(s). For panel data, column names become
        ``"{series_name}__{value_column_name}"``.
    n_series : int or None
        Maximum number of series to parse. ``None`` parses all series.
        Use this to limit memory consumption for large datasets.

    Returns
    -------
    tuple of (pl.DataFrame, dict)
        A tuple of ``(dataframe, metadata)`` where *dataframe* has a
        ``"time"`` column (Datetime) and one column per series, and
        *metadata* is a dict with keys ``"frequency"``, ``"horizon"``,
        ``"missing"``, ``"equallength"``, ``"relation"``, ``"n_series"``.
        ``"frequency"`` holds the ``TSF_FREQUENCY_MAP`` translation of the
        header's raw frequency (or the raw value itself when no mapping
        exists), and ``"n_series"`` is the number of series actually parsed.

    """
    # Header and data share one iterator: the header parser consumes the
    # leading lines, and the data parser continues from where it stopped.
    lines = _iter_text_lines(source)
    attributes, header = _parse_header(lines)
    parsed_series = _parse_data_lines(lines, attributes, n_series=n_series)

    # Fall back to the raw frequency string when no translation is known.
    frequency = TSF_FREQUENCY_MAP.get(header["frequency_raw"], header["frequency_raw"])

    # Check whether the header declares a ``start_timestamp`` attribute.
    declares_start = any(
        attr_name == "start_timestamp" for attr_name, _ in attributes
    )

    frame = _build_dataframe(
        parsed_series,
        polars_freq=frequency,
        has_timestamp=declares_start,
        value_column_name=value_column_name,
    )

    # Assemble metadata in a fixed key order: frequency, the header fields
    # copied through verbatim, then the parsed-series count.
    metadata = {"frequency": frequency}
    metadata.update(
        (key, header[key])
        for key in ("horizon", "missing", "equallength", "relation")
    )
    metadata["n_series"] = len(parsed_series)

    return frame, metadata