# Source code for fusionlab.datasets.make

# -*- coding: utf-8 -*-
# Author: LKouadio <etanoyau@gmail.com>
# License: BSD-3-Clause 
# -------------------------------------------------------------------
# Provides functions for generating synthetic datasets suitable for
# demonstrating and testing fusionlab models.
# -------------------------------------------------------------------
"""
Synthetic Dataset Generation Utilities.

This module provides functions to create synthetic datasets tailored
for demonstrating and testing the various models and utilities within
the `fusionlab` package, particularly those expecting static, dynamic,
and future features (like TFT and XTFT).
"""
from __future__ import annotations

import textwrap
import warnings
import numpy as np
import pandas as pd
from typing import Optional, List, Union, Tuple

from ..api.bunch import XBunch


# Public API of this module: the dataset generators exported by
# ``from <module> import *``.
__all__ = [
    "make_multi_feature_time_series", 
    "make_quantile_prediction_data",
    "make_anomaly_data", 
    "make_trend_seasonal_data",
    "make_multivariate_target_data"
    ]

def make_multi_feature_time_series(
    n_series: int = 3,
    n_timesteps: int = 100,
    freq: str = 'D',
    static_noise_level: float = 0.1,
    trend_base: float = 10,
    trend_factor: float = 0.1,
    seasonality_period: float = 7,
    seasonality_amplitude: float = 5,
    dynamic_cov_amplitude: float = 2,
    future_cov_amplitude: float = 1,
    noise_level: float = 1.0,
    as_frame: bool = False,
    seed: Optional[int] = None,
) -> Union[XBunch, pd.DataFrame]:
    r"""Generate multi-variate time series with static, dynamic, and future features.

    Creates a synthetic dataset suitable for models like TFT/XTFT. It
    simulates data for multiple independent series (e.g., items,
    locations) over a specified number of time steps. Each series
    includes:

    - Static features (unique ID, a noisy base value).
    - Dynamic features (time index features like month/dayofweek, a
      simulated covariate like temperature, lagged target).
    - Known future features (time index features, a simulated binary
      event like promotion).
    - A target variable generated from trend, seasonality, covariates,
      static base, and noise.

    Parameters
    ----------
    n_series : int, default=3
        Number of independent time series (e.g., items, sensors) to
        generate. Must be at least 1.
    n_timesteps : int, default=100
        Number of time steps (rows) per series.
    freq : str, default='D'
        Pandas frequency string for generating the datetime index
        (e.g., 'D' for daily, 'MS' for month start).
    static_noise_level : float, default=0.1
        Amount of noise added to the static 'base_level' feature.
    trend_base : float, default=10
        Base value for the linear trend component.
    trend_factor : float, default=0.1
        Slope factor for the linear trend component.
    seasonality_period : float, default=7
        Periodicity for the main seasonal component (e.g., 7 for a
        weekly pattern with daily data, 12 for a yearly pattern with
        monthly data).
    seasonality_amplitude : float, default=5
        Amplitude of the main seasonal sinusoidal component.
    dynamic_cov_amplitude : float, default=2
        Amplitude of the simulated dynamic covariate.
    future_cov_amplitude : float, default=1
        Magnitude of the effect of the simulated future binary event.
    noise_level : float, default=1.0
        Standard deviation of the Gaussian noise added to the final
        target signal.
    as_frame : bool, default=False
        Determines the return type:

        - If ``False`` (default): returns an XBunch object containing
          the DataFrame and metadata (column names grouped by type).
        - If ``True``: returns only the pandas DataFrame.
    seed : int, optional
        Seed for NumPy's random number generator for reproducibility.
        Default is None (non-reproducible draws).

    Returns
    -------
    data : :class:`~fusionlab.api.bunch.XBunch` or pandas.DataFrame
        If ``as_frame=False`` (default): an XBunch object with
        attributes ``frame`` (DataFrame), ``static_features``,
        ``dynamic_features``, ``future_features``, ``target_col``,
        ``dt_col``, ``spatial_id_col``, ``feature_names`` and ``DESCR``.
        If ``as_frame=True``: the generated data solely as a pandas
        DataFrame.

        In both cases the columns are ordered as: date, series id,
        static features, dynamic/future features (each once, in
        declaration order), target.

    Raises
    ------
    ValueError
        If ``n_series`` is smaller than 1.

    Examples
    --------
    >>> from fusionlab.datasets.make import make_multi_feature_time_series
    >>> # Generate daily data for 5 series
    >>> data_bunch = make_multi_feature_time_series(n_series=5, n_timesteps=100,
    ...                                             freq='D', seasonality_period=7,
    ...                                             seed=42)
    >>> print(data_bunch.frame.head())
    >>> print("Static Features:", data_bunch.static_features)
    >>> print("Dynamic Features:", data_bunch.dynamic_features)
    >>> print("Future Features:", data_bunch.future_features)

    >>> # Generate monthly data as DataFrame
    >>> df_monthly = make_multi_feature_time_series(n_series=2, n_timesteps=36,
    ...                                             freq='MS', seasonality_period=12,
    ...                                             as_frame=True, seed=123)
    >>> print(df_monthly.info())
    """
    if n_series < 1:
        # Fail early with a clear message instead of pandas'
        # "No objects to concatenate" further down.
        raise ValueError("'n_series' must be >= 1.")

    # RandomState(None) seeds itself, so no branching on `seed` is needed.
    rng = np.random.RandomState(seed)

    # The time axis is identical for every series: build it once.
    start_date = '2020-01-01'  # Arbitrary start date
    date_rng = pd.date_range(
        start=start_date, periods=n_timesteps, freq=freq)
    time_idx = np.arange(n_timesteps)
    month = date_rng.month
    dayofweek = date_rng.dayofweek

    all_series_df = []
    for i in range(n_series):
        # --- Static feature: slightly different noisy base level ---
        base_level = 50 + i * 20 + rng.normal(0, static_noise_level)

        # --- Dynamic covariate (temperature-like), phase-shifted per series ---
        dynamic_cov = dynamic_cov_amplitude * np.sin(
            2 * np.pi * time_idx / (seasonality_period * 2)
            + i * np.pi / 3
        ) + rng.normal(0, noise_level * 0.5, n_timesteps)

        # --- Known future binary event (e.g., promotion flag) ---
        future_event = rng.randint(0, 2, n_timesteps)

        # --- Target variable ---
        trend = trend_base + trend_factor * time_idx * (1 + i * 0.1)
        seasonality = seasonality_amplitude * np.sin(
            2 * np.pi * time_idx / seasonality_period
            + i * np.pi / 4
        )
        event_effect = future_event * future_cov_amplitude * (5 + i)
        noise = rng.normal(0, noise_level, n_timesteps)
        target = (base_level + trend + seasonality + event_effect
                  + 0.5 * dynamic_cov + noise)

        # --- Lag-1 target, first value back-filled ---
        # Equivalent to shift(1) followed by a backward fill, but also
        # well defined (no NaN) when the series has a single time step.
        lagged_target = np.concatenate([target[:1], target[:-1]])

        all_series_df.append(pd.DataFrame({
            'date': date_rng,
            'series_id': i,                 # Static identifier
            'base_level': base_level,       # Static numerical
            'month': month,                 # Dynamic and future
            'dayofweek': dayofweek,         # Dynamic and future
            'dynamic_cov': dynamic_cov,     # Dynamic only
            'target_lag1': lagged_target,   # Dynamic only
            'future_event': future_event,   # Future only
            'target': target,               # Target variable
        }))

    df = pd.concat(all_series_df).reset_index(drop=True)

    # --- Define column roles ---
    dt_col = 'date'
    target_col = 'target'
    spatial_id_col = 'series_id'
    static_features = ['series_id', 'base_level']
    dynamic_features = ['month', 'dayofweek', 'dynamic_cov', 'target_lag1']
    future_features = ['month', 'dayofweek', 'future_event']

    # De-duplicate while keeping declaration order. A plain set() would
    # make the column order depend on string hash randomisation and thus
    # differ between interpreter runs.
    dynamic_and_future_features = list(
        dict.fromkeys(dynamic_features + future_features))
    # The ID (first static entry) and the target are not features.
    feature_names = static_features[1:] + dynamic_and_future_features

    ordered_cols = (
        [dt_col, spatial_id_col]
        + feature_names
        + [target_col]
    )
    ordered_cols = [c for c in ordered_cols if c in df.columns]
    frame = df[ordered_cols]

    if as_frame:
        return frame

    descr = textwrap.dedent(f"""\
    Synthetic Multi-Feature Time Series Data

    **Description:**
    Simulates data for {n_series} independent series over {n_timesteps}
    time steps with frequency '{freq}'. Includes static, dynamic, and
    known future features suitable for TFT/XTFT models.

    **Generation Parameters:** (Approximate)
    - n_series: {n_series}
    - n_timesteps: {n_timesteps}
    - freq: '{freq}'
    - seasonality_period: {seasonality_period}
    - noise_level: {noise_level:.2f}
    - trend/seasonality/covariates included.

    **Data Structure (Bunch object):**
    - frame           : Complete pandas DataFrame.
    - static_features : List of static column names.
    - dynamic_features: List of dynamic column names.
    - future_features : List of future column names.
    - target_col      : Name of the target column ('{target_col}').
    - dt_col          : Name of the datetime column ('{dt_col}').
    - spatial_id_col  : Name of the series identifier column ('{spatial_id_col}').
    - feature_names   : Combined list of static (excl. ID), dynamic, future features.
    - DESCR           : This description.
    """)

    return XBunch(
        frame=frame,
        static_features=static_features,
        dynamic_features=dynamic_features,
        future_features=future_features,
        target_col=target_col,
        dt_col=dt_col,
        spatial_id_col=spatial_id_col,
        feature_names=feature_names,  # Combined list
        DESCR=descr,
    )
def make_quantile_prediction_data(
    n_samples: int = 100,
    n_horizons: int = 6,
    quantiles: List[float] = [0.1, 0.5, 0.9],
    target_mean: float = 50.0,
    target_stddev: float = 10.0,
    pred_bias: float = 1.0,
    pred_spread_factor: float = 1.5,
    add_coords: bool = True,
    coord_scale: float = 10.0,
    as_frame: bool = False,
    seed: Optional[int] = None,
) -> Union[XBunch, pd.DataFrame]:
    r"""Generate synthetic actuals and corresponding quantile predictions.

    Creates a dataset simulating the output of a multi-horizon quantile
    forecasting model. It includes actual target values and predicted
    values for specified quantiles across multiple forecast horizons
    for a set of samples (e.g., locations).

    This data is useful for demonstrating and testing functions that
    evaluate or visualize probabilistic forecasts, such as those
    comparing prediction intervals to actual outcomes.

    Parameters
    ----------
    n_samples : int, default=100
        Number of independent samples (e.g., locations) to generate.
    n_horizons : int, default=6
        Number of future time steps (forecast horizon) per sample.
    quantiles : list of float, default=[0.1, 0.5, 0.9]
        Quantile levels (between 0 and 1) for which to generate
        predictions. A tuple is accepted as well. Column names are
        built from each level rounded to one decimal (0.1 -> ``'q1'``,
        0.5 -> ``'q5'``), so every level must map to a distinct key.
    target_mean : float, default=50.0
        Mean value around which the 'actual' target values are
        generated.
    target_stddev : float, default=10.0
        Standard deviation for generating the 'actual' target values
        (using a normal distribution).
    pred_bias : float, default=1.0
        Systematic bias added to the median prediction relative to the
        generated actual value.
    pred_spread_factor : float, default=1.5
        Factor controlling the width of the prediction intervals. It
        scales the offsets added/subtracted from the biased median.
    add_coords : bool, default=True
        If ``True``, add 'longitude' and 'latitude' columns with random
        coordinates.
    coord_scale : float, default=10.0
        Scaling factor for the random coordinates if `add_coords` is
        True.
    as_frame : bool, default=False
        Determines the return type:

        - ``False`` (default): returns an XBunch object.
        - ``True``: returns only the pandas DataFrame.
    seed : int, optional
        Seed for NumPy's random number generator for reproducibility.
        Default is None.

    Returns
    -------
    data : :class:`~fusionlab.api.bunch.XBunch` or pandas.DataFrame
        If ``as_frame=False`` (default): an XBunch object with
        attributes ``frame`` (DataFrame), ``quantiles`` (list),
        ``horizons`` (list), ``feature_names``, ``target_cols``,
        ``prediction_cols`` (dict mapping quantile key to column
        names), ``longitude``/``latitude`` (if generated) and
        ``DESCR``.
        If ``as_frame=True``: the generated data solely as a pandas
        DataFrame in wide format (e.g., columns 'target_h1',
        'pred_q1_h1', 'pred_q5_h1', ...), with prediction columns
        sorted by name.

    Raises
    ------
    ValueError
        If `quantiles` is empty or not a list/tuple, if a level lies
        outside [0, 1], or if two levels map to the same column key.

    Examples
    --------
    >>> from fusionlab.datasets import make_quantile_prediction_data
    >>> # Generate data as Bunch
    >>> pred_bunch = make_quantile_prediction_data(n_samples=5, n_horizons=3, seed=1)
    >>> print(pred_bunch.frame.head())
    >>> print("Quantile columns for q=0.1:", pred_bunch.prediction_cols['q1'])

    >>> # Generate data as DataFrame
    >>> pred_df = make_quantile_prediction_data(as_frame=True, seed=2)
    >>> print(pred_df.info())
    """
    if not isinstance(quantiles, (list, tuple)) or len(quantiles) == 0:
        raise ValueError("'quantiles' must be a non-empty list of floats.")
    # Work on a private copy so the shared default list (and the
    # caller's list) can never be altered through the returned Bunch.
    quantiles = list(quantiles)
    if any(not 0 <= q <= 1 for q in quantiles):
        raise ValueError("All 'quantiles' must lie between 0 and 1.")

    # Column key per quantile, e.g. 0.1 -> "q0.1" -> "q1".
    q_keys = [f"q{q:.1f}".replace("0.", "") for q in quantiles]
    if len(set(q_keys)) != len(q_keys):
        # Colliding keys would silently overwrite prediction columns.
        raise ValueError(
            "'quantiles' must map to distinct column keys once rounded "
            f"to one decimal; got keys {q_keys} for {quantiles}."
        )

    # default_rng(None) draws fresh entropy, so no branch on `seed`.
    rng = np.random.default_rng(seed)

    # Generate base actuals and coordinates
    actuals = rng.normal(
        target_mean, target_stddev, size=(n_samples, n_horizons)
    )
    data_dict = {}
    if add_coords:
        # Simulated coordinates centred around 0
        data_dict['longitude'] = rng.uniform(
            -coord_scale, coord_scale, n_samples)
        data_dict['latitude'] = rng.uniform(
            -coord_scale / 2, coord_scale / 2, n_samples)

    target_cols = []
    prediction_cols = {key: [] for key in q_keys}
    all_pred_cols_flat = []

    # Generate predictions for each horizon step and quantile
    for h in range(n_horizons):
        step = h + 1
        target_col_name = f"target_h{step}"
        data_dict[target_col_name] = actuals[:, h]
        target_cols.append(target_col_name)

        # Biased, noisy median prediction for this step
        median_pred = actuals[:, h] + pred_bias + rng.normal(
            0, target_stddev * 0.5, n_samples)

        for q, q_key in zip(quantiles, q_keys):
            # Offset grows with the quantile's distance from the median
            quantile_offset = (q - 0.5) * pred_spread_factor * target_stddev
            # Noise specific to this quantile/step
            q_noise = rng.normal(0, target_stddev * 0.2, n_samples)

            pred_col_name = f"pred_{q_key}_h{step}"
            data_dict[pred_col_name] = median_pred + quantile_offset + q_noise
            prediction_cols[q_key].append(pred_col_name)
            all_pred_cols_flat.append(pred_col_name)

    df = pd.DataFrame(data_dict)

    # Coordinate columns are the only "features" of this dataset
    feature_names = [c for c in df.columns if c in ('longitude', 'latitude')]
    target_names = target_cols

    if as_frame:
        # Coordinates, then targets, then predictions sorted by name
        ordered_cols = feature_names + target_names + sorted(all_pred_cols_flat)
        return df[ordered_cols]

    descr = textwrap.dedent(f"""\
    Synthetic Quantile Prediction Data

    **Description:**
    Simulates {n_samples} samples (e.g., locations) with actual target
    values and corresponding quantile predictions for {n_horizons}
    future horizons. Target values are drawn from a normal
    distribution. Predictions are generated around a biased median,
    with spread controlled by `pred_spread_factor`.

    **Generation Parameters:**
    - n_samples: {n_samples}
    - n_horizons: {n_horizons}
    - quantiles: {quantiles}
    - target_mean: {target_mean:.2f}
    - target_stddev: {target_stddev:.2f}
    - pred_bias: {pred_bias:.2f}
    - pred_spread_factor: {pred_spread_factor:.2f}
    - seed: {seed}

    **Data Structure (Bunch object):**
    - frame           : Complete pandas DataFrame in wide format.
    - quantiles       : List of quantiles generated.
    - horizons        : List of horizon steps [1, ..., {n_horizons}].
    - feature_names   : List of coordinate columns (if generated).
    - target_cols     : List of target column names ('target_hX').
    - prediction_cols : Dict mapping quantile keys ('qX') to lists of
                        corresponding prediction column names.
    - longitude       : NumPy array of longitude values (if generated).
    - latitude        : NumPy array of latitude values (if generated).
    - DESCR           : This description.
    """)

    bunch_dict = {
        "frame": df,
        "quantiles": quantiles,
        "horizons": list(range(1, n_horizons + 1)),
        "feature_names": feature_names,
        "target_cols": target_names,
        "prediction_cols": prediction_cols,
        "DESCR": descr,
    }
    if add_coords:
        bunch_dict['longitude'] = df['longitude'].values
        bunch_dict['latitude'] = df['latitude'].values

    return XBunch(**bunch_dict)
def make_anomaly_data(
    n_sequences: int = 200,
    sequence_length: int = 50,
    n_features: int = 1,
    anomaly_fraction: float = 0.1,
    anomaly_type: str = 'spike',  # 'spike' or 'level_shift'
    anomaly_magnitude: float = 5.0,
    noise_level: float = 0.2,
    as_frame: bool = False,
    seed: Optional[int] = None,
) -> Union[Tuple[np.ndarray, np.ndarray], XBunch, pd.DataFrame]:
    r"""Generate sequence data with injected anomalies.

    Creates a dataset of noisy sine-wave sequences, where a specified
    fraction contains synthetically generated anomalies (spikes or
    level shifts). It returns the sequences and corresponding binary
    labels (0 for normal, 1 for anomaly), shuffled together.

    Parameters
    ----------
    n_sequences : int, default=200
        Total number of sequences to generate.
    sequence_length : int, default=50
        Number of time steps in each sequence. Must be at least 3 when
        any anomaly is injected, because the anomaly position is drawn
        away from both edges.
    n_features : int, default=1
        Number of features for each time step. Currently supports 1.
    anomaly_fraction : float, default=0.1
        Fraction of sequences that should contain anomalies (between 0
        and 1). The anomaly count is ``int(n_sequences * fraction)``.
    anomaly_type : {'spike', 'level_shift'}, default='spike'
        Type of anomaly to inject:

        - ``'spike'``: adds/subtracts `anomaly_magnitude` at a random
          single point.
        - ``'level_shift'``: adds/subtracts `anomaly_magnitude` to all
          points from a random point onwards.
    anomaly_magnitude : float, default=5.0
        The magnitude (absolute value) of the injected anomaly. The
        sign (add or subtract) is chosen randomly.
    noise_level : float, default=0.2
        Standard deviation of Gaussian noise added to the base signal.
    as_frame : bool, default=False
        Determines return type:

        - If ``False`` (default): returns a tuple `(sequences, labels)`.
        - If ``True``: emits a warning and returns an XBunch whose
          ``frame`` holds one flattened sequence per row.
    seed : int, optional
        Seed for NumPy's random number generator for reproducibility.
        Default is None.

    Returns
    -------
    data : tuple or :class:`~fusionlab.api.bunch.XBunch`
        If ``as_frame=False`` (default): tuple `(sequences, labels)`:

        - sequences : float32 ndarray of shape
          (n_sequences, sequence_length, n_features)
        - labels : int ndarray of shape (n_sequences,) with 0 (normal)
          or 1 (anomaly).

        If ``as_frame=True``: an XBunch with ``frame`` (columns
        ``t_0 ... t_{T-1}``, ``label``, ``sequence_id``), ``labels``,
        ``feature_names``, ``target_names`` and ``DESCR``.

    Raises
    ------
    ValueError
        If `n_features` is not 1, if `anomaly_fraction` is not between
        0 and 1, if `anomaly_type` is invalid, if `n_sequences` or
        `sequence_length` is negative, or if anomalies are requested
        with `sequence_length` below 3.

    Examples
    --------
    >>> from fusionlab.datasets import make_anomaly_data
    >>> # Generate sequences and labels as NumPy arrays
    >>> sequences, labels = make_anomaly_data(n_sequences=50, anomaly_fraction=0.2, seed=42)
    >>> print(f"Generated sequences shape: {sequences.shape}")
    >>> print(f"Generated labels shape: {labels.shape}")
    >>> print(f"Number of anomalies: {np.sum(labels)}")
    """
    # default_rng(None) draws fresh entropy, so no branch on `seed`.
    rng = np.random.default_rng(seed)

    if n_features != 1:
        # TODO: Extend to multivariate sequences if needed
        raise ValueError("Currently only supports n_features=1")
    if not 0 <= anomaly_fraction <= 1:
        raise ValueError("'anomaly_fraction' must be between 0 and 1.")
    if anomaly_type not in ('spike', 'level_shift'):
        raise ValueError("anomaly_type must be 'spike' or 'level_shift'")
    if n_sequences < 0 or sequence_length < 0:
        raise ValueError(
            "'n_sequences' and 'sequence_length' must be non-negative.")

    n_anomalies = int(n_sequences * anomaly_fraction)
    n_normal = n_sequences - n_anomalies
    if n_anomalies > 0 and sequence_length < 3:
        # The anomaly index is drawn from [1, sequence_length - 1),
        # which is empty for shorter sequences.
        raise ValueError(
            "'sequence_length' must be >= 3 to inject an anomaly away "
            "from the sequence edges.")

    # Pre-allocating fixes the output shape even when no sequence is
    # generated (n_sequences == 0).
    sequences = np.empty((n_sequences, sequence_length, 1), dtype=np.float32)
    labels = np.concatenate(
        [np.zeros(n_normal, dtype=int), np.ones(n_anomalies, dtype=int)])

    time = np.arange(sequence_length)
    # Normal sequences come first, anomalous ones last; the random draws
    # happen in that order so seeded output stays reproducible.
    for idx in range(n_sequences):
        signal = np.sin(time * 0.2 + rng.uniform(0, np.pi))  # Random phase
        noise = rng.normal(0, noise_level, sequence_length)
        sequence = signal + noise

        if idx >= n_normal:
            anomaly_point = rng.integers(1, sequence_length - 1)  # Avoid edges
            direction = rng.choice([-1, 1])
            magnitude = anomaly_magnitude * direction
            if anomaly_type == 'spike':
                sequence[anomaly_point] += magnitude
            else:  # 'level_shift'
                sequence[anomaly_point:] += magnitude

        sequences[idx, :, 0] = sequence

    # Shuffle sequences and labels together
    indices = np.arange(n_sequences)
    rng.shuffle(indices)
    sequences = sequences[indices]
    labels = labels[indices]

    if not as_frame:
        return sequences, labels

    # Flattening each sequence creates one column per time step.
    warnings.warn("Returning sequence data as a DataFrame can lead"
                  " to a very wide table. Tuple (sequences, labels)"
                  " is generally preferred.",
                  stacklevel=2)
    n_cols = sequence_length * n_features
    # An explicit column count keeps the reshape valid for empty input.
    seq_flat = sequences.reshape(n_sequences, n_cols)
    col_names = [f"t_{i}" for i in range(n_cols)]
    df = pd.DataFrame(seq_flat, columns=col_names)
    df['label'] = labels
    df['sequence_id'] = np.arange(n_sequences)

    descr = textwrap.dedent(f"""\
    Synthetic Anomaly Sequence Data (DataFrame Format)

    **Description:**
    Contains {n_sequences} sequences, each of length {sequence_length}
    with {n_features} feature(s). {n_anomalies} sequences contain
    '{anomaly_type}' anomalies of magnitude ~{anomaly_magnitude}.
    Data is flattened in the 'frame'.

    **Data Structure (Bunch object):**
    - frame         : Flattened sequences + label pandas DataFrame.
    - labels        : NumPy array of labels (0=normal, 1=anomaly).
    - feature_names : List of time step column names.
    - target_names  : ['label'].
    - DESCR         : This description.
    """)
    return XBunch(frame=df, labels=labels, feature_names=col_names,
                  target_names=['label'], DESCR=descr)
def make_trend_seasonal_data(
    n_timesteps: int = 365 * 2,  # Default 2 years of daily data
    freq: str = 'D',
    trend_order: int = 1,  # 0: constant, 1: linear, 2: quadratic
    trend_coeffs: Optional[List[float]] = None,  # Specify if order > 0
    seasonal_periods: List[float] = [7, 365.25],  # Weekly, Yearly
    seasonal_amplitudes: List[float] = [5, 15],  # Amplitudes for each period
    noise_level: float = 1.0,
    base_level: float = 50.0,
    as_frame: bool = False,
    seed: Optional[int] = None,
) -> Union[XBunch, pd.DataFrame]:
    r"""Build a univariate series as trend + seasonality + noise.

    The series is the sum of a polynomial trend, one sinusoid per
    entry of `seasonal_periods` (each with a random phase), and
    Gaussian noise. It is handy for exercising decomposition methods
    or checking how well a model captures trend and seasonal patterns.

    Parameters
    ----------
    n_timesteps : int, default=730
        Number of time steps (rows) to generate.
    freq : str, default='D'
        Pandas frequency string for the datetime index, which starts
        on 2020-01-01.
    trend_order : int, default=1
        Order of the polynomial trend (0=constant, 1=linear,
        2=quadratic).
    trend_coeffs : list of float, optional
        Polynomial coefficients, constant term first. Length must be
        `trend_order + 1`. When None, defaults are used:
        ``[base_level]``, ``[base_level, 0.1]``,
        ``[base_level, 0.1, 0.01]`` for orders 0, 1, 2 and
        ``[base_level] + [0.01] * trend_order`` above that.
    seasonal_periods : list of float, default=[7, 365.25]
        Periods of the sinusoidal components, in time steps. Entries
        that are not strictly positive are skipped.
    seasonal_amplitudes : list of float, default=[5, 15]
        Amplitude for each entry of `seasonal_periods`; the two lists
        must have the same length.
    noise_level : float, default=1.0
        Standard deviation of the Gaussian noise added to the signal.
    base_level : float, default=50.0
        Constant term used in the default trend coefficients.
    as_frame : bool, default=False
        Return type: ``False`` for Bunch, ``True`` for DataFrame.
    seed : int, optional
        Seed for NumPy's random number generator for reproducibility.

    Returns
    -------
    data : :class:`~fusionlab.api.bunch.XBunch` or pandas.DataFrame
        If ``as_frame=False`` (default): an XBunch with ``frame``,
        ``data`` (values), ``target_names`` (['value']), ``target``
        (values array), ``dt_col`` ('date'), and ``DESCR``.
        If ``as_frame=True``: a pandas DataFrame with columns 'date'
        and 'value'.

    Raises
    ------
    ValueError
        If `seasonal_periods` and `seasonal_amplitudes` differ in
        length, if `trend_order` is negative, or if the length of
        `trend_coeffs` is not `trend_order + 1`.

    Examples
    --------
    >>> from fusionlab.datasets import make_trend_seasonal_data
    >>> # Generate data with linear trend and two seasonalities
    >>> data_bunch = make_trend_seasonal_data(n_timesteps=100, freq='D', seed=1)
    >>> print(data_bunch.frame.head())
    >>> data_bunch.frame.plot(x='date', y='value', figsize=(10, 3)) # Quick plot
    """
    # A seed of None makes default_rng pull fresh entropy.
    generator = np.random.default_rng(seed)

    n_periods = len(seasonal_periods)
    n_amplitudes = len(seasonal_amplitudes)
    if n_periods != n_amplitudes:
        raise ValueError(
            "Lengths of 'seasonal_periods' and 'seasonal_amplitudes' must match."
        )

    # --- Time axis ---
    dates = pd.date_range(start='2020-01-01', periods=n_timesteps, freq=freq)
    steps = np.arange(n_timesteps)  # integer index used by trend/season

    # --- Trend ---
    if trend_order < 0:
        raise ValueError("'trend_order' must be >= 0.")
    if trend_coeffs is not None:
        n_coeffs = len(trend_coeffs)
        if n_coeffs != trend_order + 1:
            raise ValueError(
                f"Length of 'trend_coeffs' ({n_coeffs}) must be "
                f"'trend_order' + 1 ({trend_order + 1})."
            )
    else:
        presets = {
            0: [base_level],
            1: [base_level, 0.1],         # slope 0.1
            2: [base_level, 0.1, 0.01],   # plus a quadratic term
        }
        trend_coeffs = presets.get(trend_order)
        if trend_coeffs is None:
            # Small coefficients for every higher-order term
            trend_coeffs = [base_level] + [0.01] * trend_order
    trend_part = np.polynomial.polynomial.polyval(steps, trend_coeffs)

    # --- Seasonality: one randomly phased sinusoid per valid period ---
    seasonal_part = np.zeros(n_timesteps)
    for cycle_len, amp in zip(seasonal_periods, seasonal_amplitudes):
        if cycle_len > 0:
            angular_freq = 2 * np.pi / cycle_len
            phase = generator.uniform(0, np.pi / 2)
            seasonal_part += amp * np.sin(angular_freq * steps + phase)

    # --- Noise ---
    noise_part = generator.normal(0, noise_level, n_timesteps)

    # --- Assemble ---
    dt_col, target_col = 'date', 'value'
    df = pd.DataFrame({
        dt_col: dates,
        target_col: trend_part + seasonal_part + noise_part,
    })
    if as_frame:
        return df

    descr = textwrap.dedent(f"""\
    Synthetic Time Series with Trend and Seasonality

    **Description:**
    A univariate time series generated with {n_timesteps} steps
    (frequency '{freq}'). Includes a polynomial trend of order
    {trend_order}, {len(seasonal_periods)} seasonal component(s) with
    periods {seasonal_periods}, and Gaussian noise with standard
    deviation {noise_level:.2f}.

    **Generation Parameters:**
    - trend_coeffs: {trend_coeffs}
    - seasonal_periods: {seasonal_periods}
    - seasonal_amplitudes: {seasonal_amplitudes}
    - noise_level: {noise_level:.2f}
    - seed: {seed}

    **Data Structure (Bunch object):**
    - frame        : pandas DataFrame with 'date' and 'value'.
    - data         : NumPy array of 'value'.
    - target_names : ['value'].
    - target       : NumPy array of 'value'.
    - dt_col       : 'date'.
    - DESCR        : This description.
    """)
    return XBunch(
        frame=df,
        data=df[target_col].values,
        target_names=[target_col],
        target=df[target_col].values,
        dt_col=dt_col,
        DESCR=descr,
    )
def make_multivariate_target_data(
    n_series: int = 2,
    n_timesteps: int = 100,
    n_targets: int = 2,  # Number of target variables
    freq: str = 'D',
    trend_factor: float = 0.1,
    seasonality_period: float = 7,
    seasonality_amplitude: float = 5,
    noise_level: float = 0.5,
    # Control relationship between targets
    cross_target_lag: int = 1,
    cross_target_factor: float = 0.3,
    as_frame: bool = False,
    seed: Optional[int] = None,
) -> Union[XBunch, pd.DataFrame]:
    r"""Generate multi-series data with multiple related target variables.

    Creates a dataset suitable for demonstrating multivariate
    forecasting. It simulates data for multiple independent series
    (e.g., items) where each series has several features (static,
    dynamic, future) and multiple target variables. The targets are
    interdependent: target N adds a scaled, lagged copy of target N-1.

    Parameters
    ----------
    n_series : int, default=2
        Number of independent time series (e.g., items). Must be at
        least 1.
    n_timesteps : int, default=100
        Number of time steps (rows) per series.
    n_targets : int, default=2
        Number of related target variables to generate ('target_1',
        'target_2', ...). Must be at least 1.
    freq : str, default='D'
        Pandas frequency string for the datetime index.
    trend_factor : float, default=0.1
        Slope factor for the linear trend component in targets.
    seasonality_period : float, default=7
        Periodicity for the main seasonal component in targets.
    seasonality_amplitude : float, default=5
        Amplitude of the main seasonal component in targets.
    noise_level : float, default=0.5
        Standard deviation of Gaussian noise added to each target
        (scaled up by 10% per additional target).
    cross_target_lag : int, default=1
        Lag used for the dependency between targets (target N depends
        on target N-1 lagged by this amount). The leading gap created
        by the lag is back-filled.
    cross_target_factor : float, default=0.3
        Coefficient determining the strength of dependence between
        lagged targets.
    as_frame : bool, default=False
        Return type: ``False`` for Bunch, ``True`` for DataFrame.
    seed : int, optional
        Seed for NumPy's random number generator.

    Returns
    -------
    data : :class:`~fusionlab.api.bunch.XBunch` or pandas.DataFrame
        If ``as_frame=False`` (default): an XBunch including ``frame``
        (DataFrame), ``static_features``, ``dynamic_features``,
        ``future_features``, ``target_names``, ``target`` (NumPy array
        of shape (N_rows, n_targets)), ``feature_names`` (each
        non-ID feature once), ``data`` and ``DESCR``.
        If ``as_frame=True``: the generated data solely as a pandas
        DataFrame whose columns are unique and ordered as date, series
        id, features, targets.

    Raises
    ------
    ValueError
        If `n_targets` or `n_series` is smaller than 1.

    Examples
    --------
    >>> from fusionlab.datasets import make_multivariate_target_data
    >>> # Generate data with 3 targets for 4 series
    >>> data_bunch = make_multivariate_target_data(n_series=4, n_targets=3, seed=1)
    >>> print(data_bunch.frame.head())
    >>> print("Target names:", data_bunch.target_names)
    >>> print("Target array shape:", data_bunch.target.shape)
    """
    # default_rng(None) draws fresh entropy, so no branch on `seed`.
    rng = np.random.default_rng(seed)

    if n_targets <= 0:
        raise ValueError("'n_targets' must be >= 1.")
    if n_series < 1:
        # Fail early with a clear message instead of pandas'
        # "No objects to concatenate" further down.
        raise ValueError("'n_series' must be >= 1.")

    # Names and time axis are identical for every series.
    target_names = [f"target_{j + 1}" for j in range(n_targets)]
    start_date = '2021-01-01'
    date_rng = pd.date_range(
        start=start_date, periods=n_timesteps, freq=freq)
    time_idx = np.arange(n_timesteps)
    month = date_rng.month
    dayofweek = date_rng.dayofweek

    all_series_df = []
    for i in range(n_series):
        # --- Static feature: per-series scaling of the base signal ---
        base_level_factor = 1 + rng.uniform(-0.2, 0.2)

        # --- Components shared by all targets of this series ---
        trend = (50 + i * 10) + trend_factor * time_idx
        seasonality = seasonality_amplitude * np.sin(
            2 * np.pi * time_idx / seasonality_period
            + rng.uniform(0, np.pi)
        )
        base_signal = trend + seasonality

        # --- Targets, each depending on the previous one's lag ---
        targets = {}
        previous_target_lagged = None
        for j, target_name in enumerate(target_names):
            target_j = base_signal * (base_level_factor + j * 0.1)
            if previous_target_lagged is not None:
                target_j += cross_target_factor * previous_target_lagged
            target_j += rng.normal(
                0, noise_level * (1 + j * 0.1), n_timesteps)
            targets[target_name] = target_j

            # Lagged copy feeding the *next* target; `.bfill()` replaces
            # the deprecated `fillna(method='bfill')`.
            previous_target_lagged = (
                pd.Series(target_j).shift(cross_target_lag).bfill().to_numpy()
            )

        # --- Other features ---
        dynamic_cov = rng.normal(5, 1, n_timesteps)
        future_event = rng.choice([0, 0, 0, 1], n_timesteps)  # Sparse event

        all_series_df.append(pd.DataFrame({
            'date': date_rng,
            'series_id': i,
            'base_level_factor': base_level_factor,  # Static
            'month': month,                          # Dynamic/Future
            'dayofweek': dayofweek,                  # Dynamic/Future
            'dynamic_cov': dynamic_cov,              # Dynamic
            'future_event': future_event,            # Future
            **targets,                               # All target columns
        }))

    # --- Combine and define roles ---
    df = pd.concat(all_series_df).reset_index(drop=True)
    dt_col = 'date'
    spatial_id_col = 'series_id'
    static_features = ['series_id', 'base_level_factor']
    dynamic_features = ['month', 'dayofweek', 'dynamic_cov']
    future_features = ['month', 'dayofweek', 'future_event']

    # 'month' and 'dayofweek' are both dynamic and future features.
    # De-duplicate (keeping order) so selecting these columns does not
    # return them twice.
    feature_names = list(dict.fromkeys(
        static_features[1:] + dynamic_features + future_features))

    if as_frame:
        ordered_cols = [dt_col, spatial_id_col] + feature_names + target_names
        ordered_cols = [c for c in ordered_cols if c in df.columns]
        return df[ordered_cols]

    descr = textwrap.dedent(f"""\
    Synthetic Multi-Series, Multi-Target Data

    **Description:**
    Simulates data for {n_series} independent series over {n_timesteps}
    time steps (frequency '{freq}'). Each series has static, dynamic,
    and future features, along with {n_targets} related target
    variables. Targets exhibit trend, seasonality, noise, and lagged
    cross-target dependencies. Suitable for multivariate forecasting.

    **Generation Parameters:** (Approximate)
    - n_series: {n_series}
    - n_timesteps: {n_timesteps}
    - n_targets: {n_targets}
    - freq: '{freq}'
    - seed: {seed}

    **Data Structure (Bunch object):**
    - frame           : Complete pandas DataFrame.
    - static_features : List of static column names.
    - dynamic_features: List of dynamic column names.
    - future_features : List of future column names.
    - target_names    : List of target column names {target_names}.
    - target          : NumPy array of target values shape (N_rows, {n_targets}).
    - dt_col          : Name of datetime column ('{dt_col}').
    - spatial_id_col  : Name of series identifier column ('{spatial_id_col}').
    - feature_names   : Combined list of non-ID/non-target features.
    - DESCR           : This description.
    """)

    target_array = df[target_names].values
    # Numeric feature matrix for Bunch.data; stays best-effort, but only
    # the failures a column selection can raise are swallowed.
    try:
        data_cols = [c for c in feature_names if c != spatial_id_col]
        data_array = df[data_cols].select_dtypes(include=np.number).values
    except (KeyError, TypeError):
        data_array = None

    return XBunch(
        frame=df,
        static_features=static_features,
        dynamic_features=dynamic_features,
        future_features=future_features,
        target_names=target_names,
        target=target_array,
        dt_col=dt_col,
        spatial_id_col=spatial_id_col,
        feature_names=feature_names,
        data=data_array,
        DESCR=descr,
    )