Source code for fusionlab.datasets.make

# -*- coding: utf-8 -*-
# Author: LKouadio <etanoyau@gmail.com>
# License: BSD-3-Clause 
# -------------------------------------------------------------------
# Provides functions for generating synthetic datasets suitable for
# demonstrating and testing fusionlab models.
# -------------------------------------------------------------------
"""
Synthetic Dataset Generation Utilities.

This module provides functions to create synthetic datasets tailored
for demonstrating and testing the various models and utilities within
the `fusionlab` package, particularly those expecting static, dynamic,
and future features (like TFT and XTFT).
"""
from __future__ import annotations

import textwrap
import warnings
import numpy as np
import pandas as pd
from typing import Optional, List, Union, Tuple

from ..api.bunch import XBunch


__all__ = [
    "make_multi_feature_time_series", 
    "make_quantile_prediction_data",
    "make_anomaly_data", 
    "make_trend_seasonal_data",
    "make_multivariate_target_data"
    ]


[docs]
def make_multi_feature_time_series(
    n_series: int = 3,
    n_timesteps: int = 100,
    freq: str = 'D', 
    static_noise_level: float = 0.1,
    trend_base: float = 10,
    trend_factor: float = 0.1,
    seasonality_period: float = 7, 
    seasonality_amplitude: float = 5,
    dynamic_cov_amplitude: float = 2,
    future_cov_amplitude: float = 1,
    noise_level: float = 1.0,
    as_frame: bool = False,
    seed: Optional[int] = None,
) -> Union[XBunch, pd.DataFrame]:
    r"""Generate multi-variate time series with static, dynamic, and future features.

    Creates a synthetic dataset suitable for models like TFT/XTFT. It
    simulates data for multiple independent series (e.g., items, locations)
    over a specified number of time steps.

    Each series includes:
    - Static features (unique ID, a noisy base value).
    - Dynamic features (time index features like month/dayofweek,
      a simulated covariate like temperature, lagged target).
    - Known future features (time index features, a simulated binary
      event like promotion).
    - A target variable generated from trend, seasonality, covariates,
      static base, and noise.

    Parameters
    ----------
    n_series : int, default=3
        Number of independent time series (e.g., items, sensors)
        to generate.
    n_timesteps : int, default=100
        Number of time steps (rows) per series.
    freq : str, default='D'
        Pandas frequency string for generating the datetime index
        (e.g., 'D' for daily, 'MS' for month start, 'H' for hourly).
    static_noise_level : float, default=0.1
        Amount of noise added to the static 'base_level' feature.
    trend_base : float, default=10
        Base value for the linear trend component.
    trend_factor : float, default=0.1
        Slope factor for the linear trend component.
    seasonality_period : float, default=7
        Periodicity for the main seasonal component (e.g., 7 for weekly
        pattern with daily data, 12 for yearly pattern with monthly data).
    seasonality_amplitude : float, default=5
        Amplitude of the main seasonal sinusoidal component.
    dynamic_cov_amplitude : float, default=2
        Amplitude of the simulated dynamic covariate (e.g., temperature).
    future_cov_amplitude : float, default=1
        Magnitude of the effect of the simulated future binary event.
    noise_level : float, default=1.0
        Standard deviation of the Gaussian noise added to the final
        target signal.
    as_frame : bool, default=False
        Determines the return type:
        - If ``False`` (default): Returns a Bunch object containing the
          DataFrame and metadata (column names grouped by type).
        - If ``True``: Returns only the pandas DataFrame.
    seed : int, optional
        Seed for NumPy's random number generator for reproducibility.
        Default is None.

    Returns
    -------
    data : :class:`~fusionlab.api.bunch.Bunch` or pandas.DataFrame
        If ``as_frame=False`` (default):
        A Bunch object with attributes like ``frame`` (DataFrame),
        ``static_features`` (list of col names), ``dynamic_features``,
        ``future_features``, ``target_col``, ``dt_col``, ``spatial_id_col``,
        and ``DESCR``.
        If ``as_frame=True``:
        The generated data solely as a pandas DataFrame.

    Examples
    --------
    >>> from fusionlab.datasets.make import make_multi_feature_time_series
    >>> # Generate daily data for 5 series
    >>> data_bunch = make_multi_feature_time_series(n_series=5, n_timesteps=100,
    ...                                           freq='D', seasonality_period=7,
    ...                                           seed=42)
    >>> print(data_bunch.frame.head())
    >>> print("Static Features:", data_bunch.static_features)
    >>> print("Dynamic Features:", data_bunch.dynamic_features)
    >>> print("Future Features:", data_bunch.future_features)

    >>> # Generate monthly data as DataFrame
    >>> df_monthly = make_multi_feature_time_series(n_series=2, n_timesteps=36,
    ...                                           freq='MS', seasonality_period=12,
    ...                                           as_frame=True, seed=123)
    >>> print(df_monthly.info())
    """
    # if seed is not None:
    rng = np.random.RandomState(seed)


    all_series_df = []
    start_date = '2020-01-01' # Arbitrary start date

    for i in range(n_series):
        # --- Time Index ---
        date_rng = pd.date_range(
            start=start_date, periods=n_timesteps, freq=freq)
        time_idx = np.arange(n_timesteps)

        # --- Static Features ---
        series_id = i
        # Each series gets a slightly different noisy base level
        base_level = 50 + i * 20 + rng.normal(0, static_noise_level)

        # --- Dynamic Features ---
        month = date_rng.month
        dayofweek = date_rng.dayofweek
        # Simulated dynamic covariate (e.g., temperature-like)
        dynamic_cov = dynamic_cov_amplitude * np.sin(
            2 * np.pi * time_idx / (seasonality_period * 2) + i * np.pi / 3 # Phase shift per series
            ) + rng.normal(0, noise_level * 0.5, n_timesteps)

        # --- Future Features ---
        # Known future event (e.g., promotion flag)
        future_event = rng.randint(0, 2, n_timesteps)
        # Time features known in advance
        future_month = month
        future_dayofweek = dayofweek

        # --- Target Variable ---
        trend = trend_base + trend_factor * time_idx * (1 + i * 0.1)
        seasonality = seasonality_amplitude * np.sin(
            2 * np.pi * time_idx / seasonality_period + i * np.pi / 4 # Phase shift
            )
        event_effect = future_event * future_cov_amplitude * (5 + i) # Event impact
        noise = rng.normal(0, noise_level, n_timesteps)

        target = base_level + trend + seasonality + event_effect + \
                 0.5 * dynamic_cov + noise # Combine components

        # --- Lagged Target (as Dynamic Input) ---
        # Create after calculating target
        lagged_target = pd.Series(target).shift(1).fillna(method='bfill') # Backfill first NaN

        # --- Assemble DataFrame for this series ---
        series_df = pd.DataFrame({
            'date': date_rng,
            'series_id': series_id, # Static identifier
            'base_level': base_level, # Static numerical
            'month': future_month, # Dynamic and Future: month
            'dayofweek': future_dayofweek, # Dynamic and Future:dayofweek
            'dynamic_cov': dynamic_cov, # Dynamic only
            'target_lag1': lagged_target, # Dynamic only
            'future_event': future_event, # Future only
            'target': target # Target variable
        })
        all_series_df.append(series_df)

    # --- Combine all series ---
    df = pd.concat(all_series_df).reset_index(drop=True)

    # --- Define Column Roles ---
    dt_col = 'date'
    target_col = 'target'
    spatial_id_col = 'series_id'
    static_features = ['series_id', 'base_level']
    dynamic_features = ['month', 'dayofweek', 'dynamic_cov', 'target_lag1']
    future_features = ['month', 'dayofweek', 'future_event']
    # Exclude target and ID from features list passed to Bunch
    dynamic_and_future_features = list(set (dynamic_features + future_features))
    feature_names = static_features[1:] + dynamic_and_future_features 
    # dynamic_features + future_features

    # --- Return based on as_frame ---
    if as_frame:
        # Return DataFrame with logical column order
        ordered_cols = (
            [dt_col, spatial_id_col] + static_features[1:] +
            dynamic_and_future_features + [target_col]
            )
        # Ensure columns exist before ordering
        ordered_cols = [c for c in ordered_cols if c in df.columns]
        return df[ordered_cols]
    else:
        # Create Bunch object
        descr = textwrap.dedent(f"""\
        Synthetic Multi-Feature Time Series Data

        **Description:**
        Simulates data for {n_series} independent series over {n_timesteps}
        time steps with frequency '{freq}'. Includes static, dynamic, and
        known future features suitable for TFT/XTFT models.

        **Generation Parameters:** (Approximate)
        - n_series: {n_series}
        - n_timesteps: {n_timesteps}
        - freq: '{freq}'
        - seasonality_period: {seasonality_period}
        - noise_level: {noise_level:.2f}
        - trend/seasonality/covariates included.

        **Data Structure (Bunch object):**
        - frame            : Complete pandas DataFrame.
        - static_features  : List of static column names.
        - dynamic_features : List of dynamic column names.
        - future_features  : List of future column names.
        - target_col       : Name of the target column ('{target_col}').
        - dt_col           : Name of the datetime column ('{dt_col}').
        - spatial_id_col   : Name of the series identifier column ('{spatial_id_col}').
        - feature_names    : Combined list of static (excl. ID), dynamic, future features.
        - DESCR            : This description.
        """)

        # Order frame columns for Bunch frame attribute
        frame_cols = (
             [dt_col, spatial_id_col] + static_features[1:] +
             dynamic_and_future_features + [target_col]
             )
        frame_cols = [c for c in frame_cols if c in df.columns]

        return XBunch(
            frame=df[frame_cols],
            static_features=static_features,
            dynamic_features=dynamic_features,
            future_features=future_features,
            target_col=target_col,
            dt_col=dt_col,
            spatial_id_col=spatial_id_col,
            feature_names=feature_names, # Combined list
            DESCR=descr
        )




[docs]
def make_quantile_prediction_data(
    n_samples: int = 100,
    n_horizons: int = 6,
    quantiles: List[float] = [0.1, 0.5, 0.9],
    target_mean: float = 50.0,
    target_stddev: float = 10.0,
    pred_bias: float = 1.0,
    pred_spread_factor: float = 1.5,
    add_coords: bool = True,
    coord_scale: float = 10.0,
    as_frame: bool = False,
    seed: Optional[int] = None,
) -> Union[XBunch, pd.DataFrame]:
    r"""Generate synthetic actuals and corresponding quantile predictions.

    Creates a dataset simulating the output of a multi-horizon quantile
    forecasting model. It includes actual target values and predicted
    values for specified quantiles across multiple forecast horizons for
    a set of samples (e.g., locations).

    This data is useful for demonstrating and testing functions that evaluate
    or visualize probabilistic forecasts, such as those comparing prediction
    intervals to actual outcomes.

    Parameters
    ----------
    n_samples : int, default=100
        Number of independent samples (e.g., locations) to generate.
    n_horizons : int, default=6
        Number of future time steps (forecast horizon) per sample.
    quantiles : list of float, default=[0.1, 0.5, 0.9]
        List of quantile levels (between 0 and 1) for which to generate
        predictions.
    target_mean : float, default=50.0
        Mean value around which the 'actual' target values are generated.
    target_stddev : float, default=10.0
        Standard deviation for generating the 'actual' target values
        (using a normal distribution).
    pred_bias : float, default=1.0
        Systematic bias added to the median (0.5 quantile) prediction
        relative to the generated actual value.
    pred_spread_factor : float, default=1.5
        Factor controlling the width of the prediction intervals. A higher
        value creates wider intervals between quantiles. Specifically, it
        scales the offsets added/subtracted from the biased median.
    add_coords : bool, default=True
        If ``True``, add 'longitude' and 'latitude' columns with random
        coordinates.
    coord_scale : float, default=10.0
        Scaling factor for the random coordinates if `add_coords` is True.
    as_frame : bool, default=False
        Determines the return type:
        - ``False`` (default): Returns a Bunch object.
        - ``True``: Returns only the pandas DataFrame.
    seed : int, optional
        Seed for NumPy's random number generator for reproducibility.
        Default is None.

    Returns
    -------
    data : :class:`~fusionlab.api.bunch.Bunch` or pandas.DataFrame
        If ``as_frame=False`` (default):
        A Bunch object with attributes like ``frame`` (DataFrame),
        ``quantiles`` (list), ``horizons`` (list), ``target_cols``,
        ``prediction_cols`` (nested dict), `longitude`, `latitude`
        (if generated), and ``DESCR``.
        If ``as_frame=True``:
        The generated data solely as a pandas DataFrame in wide format
        (e.g., columns 'target_h1', 'pred_q10_h1', 'pred_q50_h1', ...).

    Examples
    --------
    >>> from fusionlab.datasets import make_quantile_prediction_data
    >>> # Generate data as Bunch
    >>> pred_bunch = make_quantile_prediction_data(n_samples=5, n_horizons=3, seed=1)
    >>> print(pred_bunch.frame.head())
    >>> print("Quantile columns for q=0.1:", pred_bunch.prediction_cols['q0.1'])

    >>> # Generate data as DataFrame
    >>> pred_df = make_quantile_prediction_data(as_frame=True, seed=2)
    >>> print(pred_df.info())
    """
    if seed is not None:
        rng = np.random.default_rng(seed)
    else:
        rng = np.random.default_rng()

    if not quantiles or not isinstance(quantiles, list):
        raise ValueError("'quantiles' must be a non-empty list of floats.")

    # Generate base actuals and coordinates
    actuals = rng.normal(
        target_mean, target_stddev, size=(n_samples, n_horizons)
    )
    data_dict = {}
    if add_coords:
        # Simulate coordinates (e.g., centered around 0)
        longitude = rng.uniform(-coord_scale, coord_scale, n_samples)
        latitude = rng.uniform(-coord_scale/2, coord_scale/2, n_samples)
        data_dict['longitude'] = longitude
        data_dict['latitude'] = latitude

    target_cols = []
    prediction_cols = {f"q{q:.1f}".replace("0.", ""): [] for q in quantiles}
    all_pred_cols_flat = []

    # Generate predictions for each horizon step and quantile
    for h in range(n_horizons):
        step = h + 1
        # Add actual column for this step
        target_col_name = f"target_h{step}"
        data_dict[target_col_name] = actuals[:, h]
        target_cols.append(target_col_name)

        # Generate biased median prediction for this step
        median_pred = actuals[:, h] + pred_bias + rng.normal(
            0, target_stddev * 0.5, n_samples) # Add some noise to median

        # Generate other quantiles around the biased median
        for q in quantiles:
            # Calculate offset based on quantile distance from median
            # Scaled by spread factor and target stddev
            quantile_offset = (q - 0.5) * pred_spread_factor * target_stddev
            # Add noise specific to this quantile/step
            q_noise = rng.normal(0, target_stddev * 0.2, n_samples)
            pred_val = median_pred + quantile_offset + q_noise

            # Add prediction column
            q_key = f"q{q:.1f}".replace("0.", "") # e.g., q0.1 -> q1
            pred_col_name = f"pred_{q_key}_h{step}"
            data_dict[pred_col_name] = pred_val
            prediction_cols[q_key].append(pred_col_name)
            all_pred_cols_flat.append(pred_col_name)

    # Create DataFrame
    df = pd.DataFrame(data_dict)

    # Define column categories for Bunch
    feature_names = [c for c in df.columns if c in ['longitude', 'latitude']]
    target_names = target_cols

    if as_frame:
        # Order columns logically
        ordered_cols = feature_names + target_names + sorted(all_pred_cols_flat)
        return df[[c for c in ordered_cols if c in df.columns]]
    else:
        # Create Bunch description
        descr = textwrap.dedent(f"""\
        Synthetic Quantile Prediction Data

        **Description:**
        Simulates {n_samples} samples (e.g., locations) with actual
        target values and corresponding quantile predictions for
        {n_horizons} future horizons. Target values are drawn from a
        normal distribution. Predictions are generated around a biased
        median, with spread controlled by `pred_spread_factor`.

        **Generation Parameters:**
        - n_samples: {n_samples}
        - n_horizons: {n_horizons}
        - quantiles: {quantiles}
        - target_mean: {target_mean:.2f}
        - target_stddev: {target_stddev:.2f}
        - pred_bias: {pred_bias:.2f}
        - pred_spread_factor: {pred_spread_factor:.2f}
        - seed: {seed}

        **Data Structure (Bunch object):**
        - frame           : Complete pandas DataFrame in wide format.
        - quantiles       : List of quantiles generated.
        - horizons        : List of horizon steps [1, ..., {n_horizons}].
        - feature_names   : List of coordinate columns (if generated).
        - target_cols     : List of target column names ('target_hX').
        - prediction_cols : Dict mapping quantile keys ('qX') to lists
                            of corresponding prediction column names.
        - longitude       : NumPy array of longitude values (if generated).
        - latitude        : NumPy array of latitude values (if generated).
        - DESCR           : This description.
        """)

        bunch_dict = {
            "frame": df,
            "quantiles": quantiles,
            "horizons": list(range(1, n_horizons + 1)),
            "feature_names": feature_names,
            "target_cols": target_names,
            "prediction_cols": prediction_cols,
            "DESCR": descr,
        }
        if add_coords:
            if 'longitude' in df: bunch_dict['longitude'] = df['longitude'].values
            if 'latitude' in df: bunch_dict['latitude'] = df['latitude'].values

        return XBunch(**bunch_dict)




[docs]
def make_anomaly_data(
    n_sequences: int = 200,
    sequence_length: int = 50,
    n_features: int = 1,
    anomaly_fraction: float = 0.1,
    anomaly_type: str = 'spike', # 'spike' or 'level_shift'
    anomaly_magnitude: float = 5.0,
    noise_level: float = 0.2,
    as_frame: bool = False, 
    seed: Optional[int] = None,
) -> Union[Tuple[np.ndarray, np.ndarray], XBunch, pd.DataFrame]:
    r"""Generate sequence data with injected anomalies.

    Creates a dataset of time series sequences, where a specified
    fraction contains synthetically generated anomalies (spikes or
    level shifts). It returns the sequences and corresponding binary
    labels (0 for normal, 1 for anomaly).

    This data is useful for testing and evaluating anomaly detection
    algorithms like :class:`~fusionlab.nn.anomaly_detection.LSTMAutoencoderAnomaly`
    or anomaly-aware training strategies.

    Parameters
    ----------
    n_sequences : int, default=200
        Total number of sequences to generate.
    sequence_length : int, default=50
        Number of time steps in each sequence.
    n_features : int, default=1
        Number of features for each time step. Currently supports 1.
    anomaly_fraction : float, default=0.1
        Fraction of sequences that should contain anomalies (between 0 and 1).
    anomaly_type : {'spike', 'level_shift'}, default='spike'
        Type of anomaly to inject:
        - ``'spike'``: Adds/subtracts `anomaly_magnitude` at a random single point.
        - ``'level_shift'``: Adds/subtracts `anomaly_magnitude` to all points
          after a random point in the sequence.
    anomaly_magnitude : float, default=5.0
        The magnitude (absolute value) of the injected anomaly. The sign
        (add or subtract) is chosen randomly.
    noise_level : float, default=0.2
        Standard deviation of Gaussian noise added to the base signal.
    as_frame : bool, default=False
        Determines return type:
        - If ``False`` (default): Returns a tuple `(sequences, labels)`
          where `sequences` is a NumPy array `(N, T, F)` and `labels`
          is `(N,)`.
        - If ``True``: Attempts to create a DataFrame and returns a Bunch
          object (less standard for sequence data).
    seed : int, optional
        Seed for NumPy's random number generator for reproducibility.
        Default is None.

    Returns
    -------
    data : tuple or :class:`~fusionlab.api.bunch.Bunch` or pandas.DataFrame
        If ``as_frame=False`` (default):
        Tuple `(sequences, labels)`:
            - sequences : ndarray of shape (n_sequences, sequence_length, n_features)
            - labels : ndarray of shape (n_sequences,) with 0 (normal) or 1 (anomaly).
        If ``as_frame=True``:
        A Bunch object containing a DataFrame (`frame` - potentially very wide
        if sequences flattened), `labels`, `feature_names`, etc. Or just the
        DataFrame if preferred (structure TBD). *Note: Returning sequences
        as a DataFrame can be awkward.*

    Raises
    ------
    ValueError
        If `n_features` is not 1 (currently only supports univariate).
        If `anomaly_fraction` is not between 0 and 1.
        If `anomaly_type` is invalid.

    Examples
    --------
    >>> from fusionlab.datasets import make_anomaly_data
    >>> # Generate sequences and labels as NumPy arrays
    >>> sequences, labels = make_anomaly_data(n_sequences=50, anomaly_fraction=0.2, seed=42)
    >>> print(f"Generated sequences shape: {sequences.shape}")
    >>> print(f"Generated labels shape: {labels.shape}")
    >>> print(f"Number of anomalies: {np.sum(labels)}")

    """
    if seed is not None:
        rng = np.random.default_rng(seed)
    else:
        rng = np.random.default_rng()

    if n_features != 1:
        # TODO: Extend to multivariate sequences if needed
        raise ValueError("Currently only supports n_features=1")
    if not 0 <= anomaly_fraction <= 1:
        raise ValueError("'anomaly_fraction' must be between 0 and 1.")
    if anomaly_type not in ['spike', 'level_shift']:
        raise ValueError("anomaly_type must be 'spike' or 'level_shift'")

    n_anomalies = int(n_sequences * anomaly_fraction)
    n_normal = n_sequences - n_anomalies

    sequences = []
    labels = []

    # Generate Normal Sequences (e.g., sine wave + noise)
    for _ in range(n_normal):
        time = np.arange(sequence_length)
        signal = np.sin(time * 0.2 + rng.uniform(0, np.pi)) # Add random phase
        noise = rng.normal(0, noise_level, sequence_length)
        sequences.append((signal + noise).reshape(sequence_length, 1))
        labels.append(0)

    # Generate Anomalous Sequences
    for _ in range(n_anomalies):
        time = np.arange(sequence_length)
        signal = np.sin(time * 0.2 + rng.uniform(0, np.pi))
        noise = rng.normal(0, noise_level, sequence_length)
        sequence = signal + noise

        # Inject anomaly
        anomaly_point = rng.integers(1, sequence_length - 1) # Avoid edges
        direction = rng.choice([-1, 1])
        magnitude = anomaly_magnitude * direction

        if anomaly_type == 'spike':
            sequence[anomaly_point] += magnitude
        elif anomaly_type == 'level_shift':
            sequence[anomaly_point:] += magnitude

        sequences.append(sequence.reshape(sequence_length, 1))
        labels.append(1)

    # Shuffle sequences and labels together
    sequences = np.array(sequences).astype(np.float32)
    labels = np.array(labels).astype(int)
    indices = np.arange(n_sequences)
    rng.shuffle(indices)
    sequences = sequences[indices]
    labels = labels[indices]

    if as_frame:
        # Create DataFrame (less standard for sequences, might flatten)
        # Example: Flattening each sequence - creates many columns
        warnings.warn("Returning sequence data as a DataFrame can lead"
                      " to a very wide table. Tuple (sequences, labels)"
                      " is generally preferred.")
        seq_flat = sequences.reshape(n_sequences, -1)
        col_names = [f"t_{i}" for i in range(sequence_length * n_features)]
        df = pd.DataFrame(seq_flat, columns=col_names)
        df['label'] = labels
        df['sequence_id'] = np.arange(n_sequences)

        descr = textwrap.dedent(f"""\
        Synthetic Anomaly Sequence Data (DataFrame Format)

        **Description:**
        Contains {n_sequences} sequences, each of length {sequence_length}
        with {n_features} feature(s). {n_anomalies} sequences contain
        '{anomaly_type}' anomalies of magnitude ~{anomaly_magnitude}.
        Data is flattened in the 'frame'.

        **Data Structure (Bunch object):**
        - frame         : Flattened sequences + label pandas DataFrame.
        - labels        : NumPy array of labels (0=normal, 1=anomaly).
        - feature_names : List of time step column names.
        - target_names  : ['label'].
        - DESCR         : This description.
        """)
        return XBunch(frame=df, labels=labels, feature_names=col_names,
                     target_names=['label'], DESCR=descr)
    else:
        # Return standard NumPy arrays
        return sequences, labels




[docs]
def make_trend_seasonal_data(
    n_timesteps: int = 365 * 2, # Default 2 years of daily data
    freq: str = 'D',
    trend_order: int = 1, # 0: constant, 1: linear, 2: quadratic
    trend_coeffs: Optional[List[float]] = None, # Specify if order > 0
    seasonal_periods: List[float] = [7, 365.25], # Weekly, Yearly
    seasonal_amplitudes: List[float] = [5, 15], # Amplitudes for each period
    noise_level: float = 1.0,
    base_level: float = 50.0,
    as_frame: bool = False,
    seed: Optional[int] = None,
) -> Union[XBunch, pd.DataFrame]:
    r"""Generate synthetic time series with specified trend and seasonality.

    Creates a univariate time series containing a configurable polynomial
    trend, multiple sinusoidal seasonal components, and Gaussian noise.

    This is useful for testing decomposition methods or how well models
    capture specific trend and seasonal patterns.

    Parameters
    ----------
    n_timesteps : int, default=730
        Number of time steps (rows) to generate.
    freq : str, default='D'
        Pandas frequency string for generating the datetime index.
    trend_order : int, default=1
        Order of the polynomial trend (0=constant, 1=linear, 2=quadratic).
    trend_coeffs : list of float, optional
        Coefficients for the polynomial trend, starting with the constant term.
        Length should be `trend_order + 1`. If None, default coefficients
        are used (e.g., [base_level, 0.1] for order 1). Default is None.
    seasonal_periods : list of float, default=[7, 365.25]
        List of periods for the sinusoidal seasonal components (in number
        of time steps).
    seasonal_amplitudes : list of float, default=[5, 15]
        List of amplitudes corresponding to each period in
        `seasonal_periods`. Length must match `seasonal_periods`.
    noise_level : float, default=1.0
        Standard deviation of the Gaussian noise added to the signal.
    base_level : float, default=50.0
        The constant term (offset) if `trend_order` is 0, or the intercept
        used in default trend coefficients.
    as_frame : bool, default=False
        Return type: ``False`` for Bunch, ``True`` for DataFrame.
    seed : int, optional
        Seed for NumPy's random number generator for reproducibility.

    Returns
    -------
    data : :class:`~fusionlab.api.bunch.Bunch` or pandas.DataFrame
        If ``as_frame=False`` (default):
        A Bunch object with ``frame``, ``data`` (values), ``target_names``
        (['value']), ``target`` (values array), ``dt_col`` ('date'),
        and ``DESCR``.
        If ``as_frame=True``:
        The generated data as a pandas DataFrame with 'date' and 'value'.

    Raises
    ------
    ValueError
        If lengths of `seasonal_periods` and `seasonal_amplitudes` mismatch,
        or if `trend_coeffs` length doesn't match `trend_order`.

    Examples
    --------
    >>> from fusionlab.datasets import make_trend_seasonal_data
    >>> # Generate data with linear trend and two seasonalities
    >>> data_bunch = make_trend_seasonal_data(n_timesteps=100, freq='D', seed=1)
    >>> print(data_bunch.frame.head())
    >>> data_bunch.frame.plot(x='date', y='value', figsize=(10, 3)) # Quick plot
    """
    if seed is not None:
        rng = np.random.default_rng(seed)
    else:
        rng = np.random.default_rng()

    if len(seasonal_periods) != len(seasonal_amplitudes):
        raise ValueError("Lengths of 'seasonal_periods' and "
                         "'seasonal_amplitudes' must match.")

    # --- Time Index ---
    date_rng = pd.date_range(
        start='2020-01-01', periods=n_timesteps, freq=freq)
    time_idx = np.arange(n_timesteps) # Simple index for trend calc

    # --- Trend Component ---
    if trend_order < 0:
        raise ValueError("'trend_order' must be >= 0.")
    if trend_coeffs is None:
        # Create default coefficients
        if trend_order == 0: trend_coeffs = [base_level]
        elif trend_order == 1: trend_coeffs = [base_level, 0.1] # Slope 0.1
        elif trend_order == 2: trend_coeffs = [base_level, 0.1, 0.01] # Quadratic term
        else: trend_coeffs = [base_level] + [0.01] * trend_order # Small higher orders
    elif len(trend_coeffs) != trend_order + 1:
        raise ValueError(f"Length of 'trend_coeffs' ({len(trend_coeffs)}) must be "
                         f"'trend_order' + 1 ({trend_order + 1}).")

    # Calculate polynomial trend
    trend_component = np.polynomial.polynomial.polyval(time_idx, trend_coeffs)

    # --- Seasonal Component ---
    seasonal_component = np.zeros(n_timesteps)
    for period, amplitude in zip(seasonal_periods, seasonal_amplitudes):
        if period <= 0: continue # Skip invalid periods
        omega = 2 * np.pi / period
        # Add phase shift to make multiple components distinct
        phase_shift = rng.uniform(0, np.pi / 2)
        seasonal_component += amplitude * np.sin(omega * time_idx + phase_shift)

    # --- Noise Component ---
    noise_component = rng.normal(0, noise_level, n_timesteps)

    # --- Combine Components ---
    value = trend_component + seasonal_component + noise_component

    # --- Create DataFrame ---
    df = pd.DataFrame({'date': date_rng, 'value': value})
    target_col = 'value'
    dt_col = 'date'

    if as_frame:
        return df
    else:
        descr = textwrap.dedent(f"""\
        Synthetic Time Series with Trend and Seasonality

        **Description:**
        A univariate time series generated with {n_timesteps} steps
        (frequency '{freq}'). Includes a polynomial trend of order
        {trend_order}, {len(seasonal_periods)} seasonal component(s) with
        periods {seasonal_periods}, and Gaussian noise with standard
        deviation {noise_level:.2f}.

        **Generation Parameters:**
        - trend_coeffs: {trend_coeffs}
        - seasonal_periods: {seasonal_periods}
        - seasonal_amplitudes: {seasonal_amplitudes}
        - noise_level: {noise_level:.2f}
        - seed: {seed}

        **Data Structure (Bunch object):**
        - frame         : pandas DataFrame with 'date' and 'value'.
        - data          : NumPy array of 'value'.
        - target_names  : ['value'].
        - target        : NumPy array of 'value'.
        - dt_col        : 'date'.
        - DESCR         : This description.
        """)

        return XBunch(
            frame=df,
            data=df[target_col].values,
            target_names=[target_col],
            target=df[target_col].values,
            dt_col=dt_col,
            DESCR=descr
        )



[docs]
def make_multivariate_target_data(
    n_series: int = 2,
    n_timesteps: int = 100,
    n_targets: int = 2, # Number of target variables
    freq: str = 'D',
    trend_factor: float = 0.1,
    seasonality_period: float = 7,
    seasonality_amplitude: float = 5,
    noise_level: float = 0.5,
    # Control relationship between targets
    cross_target_lag: int = 1,
    cross_target_factor: float = 0.3,
    as_frame: bool = False,
    seed: Optional[int] = None,
) -> Union[XBunch, pd.DataFrame]:
    r"""Generate multi-series data with multiple related target variables.

    Creates a dataset suitable for demonstrating multivariate forecasting.
    It simulates data for multiple independent series (e.g., items) where
    each series has several features (static, dynamic, future) and
    multiple target variables.

    The target variables are generated with some interdependence (e.g.,
    target 2 depends on the lagged value of target 1).

    Parameters
    ----------
    n_series : int, default=2
        Number of independent time series (e.g., items).
    n_timesteps : int, default=100
        Number of time steps (rows) per series.
    n_targets : int, default=2
        Number of related target variables to generate (e.g., 'target_1',
        'target_2', ...).
    freq : str, default='D'
        Pandas frequency string for the datetime index.
    trend_factor : float, default=0.1
        Slope factor for the linear trend component in targets.
    seasonality_period : float, default=7
        Periodicity for the main seasonal component in targets.
    seasonality_amplitude : float, default=5
        Amplitude of the main seasonal component in targets.
    noise_level : float, default=0.5
        Standard deviation of Gaussian noise added to each target.
    cross_target_lag : int, default=1
        Lag used for the dependency between targets (target N depends on
        target N-1 lagged by this amount).
    cross_target_factor : float, default=0.3
        Coefficient determining the strength of dependence between lagged
        targets.
    as_frame : bool, default=False
        Return type: ``False`` for Bunch, ``True`` for DataFrame.
    seed : int, optional
        Seed for NumPy's random number generator.

    Returns
    -------
    data : :class:`~fusionlab.api.bunch.Bunch` or pandas.DataFrame
        If ``as_frame=False`` (default):
        A Bunch object including ``frame`` (DataFrame), lists of
        ``static_features``, ``dynamic_features``, ``future_features``,
        ``target_names`` (list of target columns), ``target`` (NumPy
        array of shape (N_rows, n_targets)), and ``DESCR``.
        If ``as_frame=True``:
        The generated data solely as a pandas DataFrame.

    Examples
    --------
    >>> from fusionlab.datasets import make_multivariate_target_data
    >>> # Generate data with 3 targets for 4 series
    >>> data_bunch = make_multivariate_target_data(n_series=4, n_targets=3, seed=1)
    >>> print(data_bunch.frame.head())
    >>> print("Target names:", data_bunch.target_names)
    >>> print("Target array shape:", data_bunch.target.shape)

    """
    if seed is not None:
        rng = np.random.default_rng(seed)
    else:
        rng = np.random.default_rng()

    if n_targets <= 0:
        raise ValueError("'n_targets' must be >= 1.")

    all_series_df = []
    start_date = '2021-01-01'

    for i in range(n_series):
        # --- Time Index ---
        date_rng = pd.date_range(
            start=start_date, periods=n_timesteps, freq=freq)
        time_idx = np.arange(n_timesteps)

        # --- Static Features ---
        series_id = i
        base_level_factor = 1 + rng.uniform(-0.2, 0.2) # Static variation

        # --- Shared Components for Targets ---
        trend = (50 + i * 10) + trend_factor * time_idx
        seasonality = seasonality_amplitude * np.sin(
            2 * np.pi * time_idx / seasonality_period + rng.uniform(0, np.pi)
            )
        base_signal = trend + seasonality

        # --- Generate Multiple Targets ---
        targets = {}
        target_names_list = [f"target_{j+1}" for j in range(n_targets)]
        previous_target_lagged = None

        for j in range(n_targets):
            target_name = target_names_list[j]
            # Base target value
            target_j = base_signal * (base_level_factor + j * 0.1)
            # Add dependency on previous target's lag (if not the first target)
            if j > 0 and previous_target_lagged is not None:
                target_j += cross_target_factor * previous_target_lagged
            # Add noise
            target_j += rng.normal(0, noise_level * (1 + j*0.1), n_timesteps)
            targets[target_name] = target_j
            # Prepare lagged version for the *next* target's calculation
            previous_target_lagged = pd.Series(target_j).shift(
                cross_target_lag).fillna(method='bfill')

        # --- Other Features (Dynamic/Future) ---
        month = date_rng.month
        dayofweek = date_rng.dayofweek
        # Dynamic covariate (example)
        dynamic_cov = rng.normal(5, 1, n_timesteps)
        # Future covariate (example)
        future_event = rng.choice([0, 0, 0, 1], n_timesteps) # Sparse event

        # --- Assemble DataFrame ---
        series_df = pd.DataFrame({
            'date': date_rng,
            'series_id': series_id,
            'base_level_factor': base_level_factor, # Static
            'month': month, # Dynamic/Future
            'dayofweek': dayofweek, # Dynamic/Future
            'dynamic_cov': dynamic_cov, # Dynamic
            'future_event': future_event, # Future
            **targets # Add all target columns
        })
        all_series_df.append(series_df)

    # --- Combine and Define Roles ---
    df = pd.concat(all_series_df).reset_index(drop=True)
    dt_col = 'date'
    target_names = target_names_list # List of generated target names
    spatial_id_col = 'series_id'
    static_features = ['series_id', 'base_level_factor']
    dynamic_features = ['month', 'dayofweek', 'dynamic_cov']
    future_features = ['month', 'dayofweek', 'future_event']
    # Combined feature list for Bunch.feature_names
    feature_names = static_features[1:] + dynamic_features + future_features

    # --- Return ---
    if as_frame:
        ordered_cols = (
             [dt_col, spatial_id_col] + static_features[1:] +
             dynamic_features + future_features + target_names
             )
        ordered_cols = [c for c in ordered_cols if c in df.columns]
        return df[ordered_cols]
    else:
        descr = textwrap.dedent(f"""\
        Synthetic Multi-Series, Multi-Target Data

        **Description:**
        Simulates data for {n_series} independent series over {n_timesteps}
        time steps (frequency '{freq}'). Each series has static, dynamic,
        and future features, along with {n_targets} related target variables.
        Targets exhibit trend, seasonality, noise, and lagged cross-target
        dependencies. Suitable for multivariate forecasting.

        **Generation Parameters:** (Approximate)
        - n_series: {n_series}
        - n_timesteps: {n_timesteps}
        - n_targets: {n_targets}
        - freq: '{freq}'
        - seed: {seed}

        **Data Structure (Bunch object):**
        - frame            : Complete pandas DataFrame.
        - static_features  : List of static column names.
        - dynamic_features : List of dynamic column names.
        - future_features  : List of future column names.
        - target_names     : List of target column names {target_names}.
        - target           : NumPy array of target values shape (N_rows, {n_targets}).
        - dt_col           : Name of datetime column ('{dt_col}').
        - spatial_id_col   : Name of series identifier column ('{spatial_id_col}').
        - feature_names    : Combined list of non-ID/non-target features.
        - DESCR            : This description.
        """)

        target_array = df[target_names].values
        # Extract numerical features for Bunch.data
        try:
            data_cols = [c for c in feature_names if c != spatial_id_col]
            data_array = df[data_cols].select_dtypes(include=np.number).values
        except:
             data_array = None

        return XBunch(
            frame=df,
            static_features=static_features,
            dynamic_features=dynamic_features,
            future_features=future_features,
            target_names=target_names,
            target=target_array,
            dt_col=dt_col,
            spatial_id_col=spatial_id_col,
            feature_names=feature_names,
            data=data_array,
            DESCR=descr
        )