Example: Tuning PIHALNet with PIHALTuner¶
This page provides practical examples of how to use the
PIHALTuner to perform
hyperparameter tuning for the PIHALNet
model. We’ll cover the typical workflow, from data preparation to
retrieving the best model, first with synthetic data for a clear
demonstration, and then outlining the steps for a real application.
Prerequisites¶
Ensure you have fusionlab installed with its dependencies, including
tensorflow and keras-tuner. The examples below also import numpy, pandas,
scikit-learn, and joblib.
# Common imports for the examples
import logging
import os
from typing import List  # Used in the helper-function signatures below

import joblib  # For saving/loading scalers/encoders
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split  # For splitting data
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder  # Encoding/scaling

from fusionlab.nn.pinn.models import PIHALNet  # PIHALTuner needs to build this
from fusionlab.nn.pinn.tuning import PIHALTuner
from fusionlab.nn.pinn.utils import prepare_pinn_data_sequences
# from fusionlab.datasets.load import load_subsidence_pinn_data # For real data
# from fusionlab.nn.losses import combined_quantile_loss # If using custom quantile loss

# Basic configuration for logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
)
# Logger referenced as `logger` by the snippets on this page.
logger = logging.getLogger(__name__)
Section 1: Tuning with Synthetic Data¶
This section walks through the process using synthetically generated
data. This allows you to understand the mechanics of PIHALTuner
without needing a specific large dataset immediately. Each step includes
code to run and a placeholder for its expected output.
Step 1.1: Configuration for Synthetic Data¶
First, we define configurations for our synthetic data generation and the tuning process. These are simplified for demonstration.
# --- Configuration for Synthetic Data Example ---
SYNTHETIC_CITY_NAME = "SyntheticCity"
SYNTHETIC_RUN_OUTPUT_PATH = f"{SYNTHETIC_CITY_NAME}_tuning_synthetic_output"
SYNTHETIC_MODEL_NAME_TUNED = f"PIHALNet_{SYNTHETIC_CITY_NAME}_TunedSynthetic"

# Synthetic Data Parameters
SYNTHETIC_N_SAMPLES_TOTAL = 1000  # Total raw data points to generate
SYNTHETIC_N_LOCATIONS = 10  # Number of unique spatial locations
# Rows generated per location (one row per year).
SYNTHETIC_YEARS_PER_LOCATION = SYNTHETIC_N_SAMPLES_TOTAL // SYNTHETIC_N_LOCATIONS

# Column names for synthetic data
SYNTHETIC_TIME_COL = "year"
SYNTHETIC_DT_COL_NAME_TEMP = "datetime_temp_synth"
SYNTHETIC_LON_COL = "longitude"
SYNTHETIC_LAT_COL = "latitude"
SYNTHETIC_SUBSIDENCE_COL = "subsidence"
SYNTHETIC_GWL_COL = "gwl"
SYNTHETIC_CAT_COL = ['geology_s']  # Using a different name to avoid global conflicts
SYNTHETIC_NUM_DRIVER_COLS = ['rainfall_s', 'pumping_s']

# Sequence Parameters for PIHALNet (can be small for synthetic example)
SYNTHETIC_TIME_STEPS = 5
SYNTHETIC_FORECAST_HORIZON = 5
SYNTHETIC_OUTPUT_S_DIM = 1
SYNTHETIC_OUTPUT_G_DIM = 1

# Data Splitting & Batching for Tuner
SYNTHETIC_TRAIN_VAL_END_YEAR = 2020  # Not strictly needed if all data is used
SYNTHETIC_VAL_SPLIT = 0.25
SYNTHETIC_BATCH_SIZE = 16  # Smaller batch size for smaller data

# Tuner Configuration (minimal for quick example)
SYNTHETIC_TUNER_OBJECTIVE = 'val_total_loss'
SYNTHETIC_MAX_TRIALS = 3  # Very few trials for a quick run
SYNTHETIC_EPOCHS_PER_TRIAL = 2  # Very few epochs
SYNTHETIC_TUNER_TYPE = 'randomsearch'  # Faster than hyperband for few trials
SYNTHETIC_TUNER_SEED = 123

print("Synthetic data configurations set.")
# Ensure output directory exists
os.makedirs(SYNTHETIC_RUN_OUTPUT_PATH, exist_ok=True)
Expected Output:
Synthetic data configurations set.
Step 1.2: Synthetic Data Generation and Preprocessing¶
Now, we generate a simple synthetic dataset that mimics the structure
needed by PIHALNet. This includes time series for multiple locations,
categorical features, and numerical features. We also perform basic
preprocessing like encoding and scaling.
def generate_synthetic_city_data(
    n_locations: int,
    years_per_location: int,
    time_col: str,
    dt_col_name: str,
    lon_col: str,
    lat_col: str,
    subs_col: str,
    gwl_col: str,
    cat_col_names: List[str],
    num_driver_col_names: List[str],
    output_path: str,
    city_name: str,
) -> pd.DataFrame:
    """Generate, encode, and scale a synthetic multi-location yearly dataset.

    One row is produced per (location, year) pair, with years starting at
    2000. Categorical columns are one-hot encoded, numerical driver columns
    are min-max scaled, and a fractional-year numeric time column is added.

    Parameters
    ----------
    n_locations : number of distinct (lon, lat) points to simulate.
    years_per_location : number of consecutive yearly rows per location.
    time_col : name of the integer year column.
    dt_col_name : name of the datetime column derived from ``time_col``.
    lon_col, lat_col : names of the coordinate columns.
    subs_col, gwl_col : names of the two target columns.
    cat_col_names : categorical columns to create and one-hot encode.
    num_driver_col_names : numerical columns to create and scale.
    output_path : directory where the fitted encoder/scaler are dumped.
    city_name : prefix for the dumped ``.joblib`` file names.

    Returns
    -------
    The processed DataFrame.

    Side effects
    ------------
    Sets the module-level globals ``synthetic_encoded_feature_names`` (list of
    one-hot column names) and ``SYNTHETIC_TIME_COL_NUMERIC_PINN`` (name of the
    numeric time column), and writes joblib files into ``output_path``.
    Values are drawn from the unseeded global ``np.random`` state, so output
    differs between runs.
    """
    # Later snippets read these names at module level, so they stay global.
    global synthetic_encoded_feature_names
    global SYNTHETIC_TIME_COL_NUMERIC_PINN

    logger.info(f"Generating synthetic data for {n_locations} locations, "
                f"{years_per_location} years each.")

    # Make the function usable on its own: the dumps below need the directory.
    os.makedirs(output_path, exist_ok=True)

    all_rows = []
    start_year = 2000
    for i in range(n_locations):
        # Each location is offset by 0.01 degrees from the previous one.
        loc_lon = 113.0 + i * 0.01
        loc_lat = 22.0 + i * 0.01
        for year_offset in range(years_per_location):
            current_year = start_year + year_offset
            row = {
                time_col: current_year,
                lon_col: loc_lon,
                lat_col: loc_lat,
                # Linear trends in location index and time, plus Gaussian noise.
                subs_col: -10 - i * 0.5 - year_offset * 0.2 + np.random.randn() * 2,
                gwl_col: 5 - i * 0.1 + year_offset * 0.1 + np.random.randn() * 0.5,
            }
            for cat_c in cat_col_names:
                row[cat_c] = f"Type{np.random.choice(['A', 'B'])}"
            for num_c in num_driver_col_names:
                row[num_c] = np.random.rand() * 100
            all_rows.append(row)

    df = pd.DataFrame(all_rows)
    df[dt_col_name] = pd.to_datetime(df[time_col], format='%Y')

    # Encode Categorical
    synthetic_encoded_feature_names = []
    cats_to_encode = [c for c in cat_col_names if c in df.columns]
    if cats_to_encode:
        encoder = OneHotEncoder(
            sparse_output=False, handle_unknown='ignore', dtype=np.float32
        )
        encoded_data = encoder.fit_transform(df[cats_to_encode])
        ohe_cols = encoder.get_feature_names_out(cats_to_encode)
        synthetic_encoded_feature_names.extend(ohe_cols)
        enc_df = pd.DataFrame(encoded_data, columns=ohe_cols, index=df.index)
        # Replace the raw categorical columns with their one-hot columns.
        df = pd.concat([df.drop(columns=cats_to_encode), enc_df], axis=1)
        joblib.dump(
            encoder, os.path.join(output_path, f"{city_name}_synth_ohe.joblib")
        )

    # Scale Numerical Drivers
    num_to_scale = [c for c in num_driver_col_names if c in df.columns]
    if num_to_scale:
        scaler = MinMaxScaler()
        df[num_to_scale] = scaler.fit_transform(df[num_to_scale])
        joblib.dump(
            scaler, os.path.join(output_path, f"{city_name}_synth_scaler.joblib")
        )

    # Fractional year: year + (day-of-year - 1) / days-in-that-year.
    SYNTHETIC_TIME_COL_NUMERIC_PINN = f"{time_col}_numeric_pinn_synth"
    df[SYNTHETIC_TIME_COL_NUMERIC_PINN] = (
        df[dt_col_name].dt.year +
        (df[dt_col_name].dt.dayofyear - 1) /
        (365 + df[dt_col_name].dt.is_leap_year.astype(int))
    )
    logger.info(f"Synthetic data generated and preprocessed. Shape: {df.shape}")
    return df

# Build the processed synthetic frame from the configuration constants.
# Keyword arguments keep the 12-argument call readable and order-proof.
df_synthetic_processed = generate_synthetic_city_data(
    n_locations=SYNTHETIC_N_LOCATIONS,
    years_per_location=SYNTHETIC_YEARS_PER_LOCATION,
    time_col=SYNTHETIC_TIME_COL,
    dt_col_name=SYNTHETIC_DT_COL_NAME_TEMP,
    lon_col=SYNTHETIC_LON_COL,
    lat_col=SYNTHETIC_LAT_COL,
    subs_col=SYNTHETIC_SUBSIDENCE_COL,
    gwl_col=SYNTHETIC_GWL_COL,
    cat_col_names=SYNTHETIC_CAT_COL,
    num_driver_col_names=SYNTHETIC_NUM_DRIVER_COLS,
    output_path=SYNTHETIC_RUN_OUTPUT_PATH,
    city_name=SYNTHETIC_CITY_NAME,
)
print(df_synthetic_processed.head())
Expected Output (will vary due to randomness):
Synthetic data generated and preprocessed. Shape: (1000, 11)
year longitude ... geology_s_TypeB year_numeric_pinn_synth
0 2000 113.0 ... 1.0 2000.0
1 2001 113.0 ... 0.0 2001.0
2 2002 113.0 ... 1.0 2002.0
3 2003 113.0 ... 1.0 2003.0
4 2004 113.0 ... 1.0 2004.0
[5 rows x 11 columns]
Step 1.3: Prepare Synthetic Data for Tuner¶
We split the synthetic data and use prepare_pinn_data_sequences
to create the input/target dictionaries for PIHALNet.
# Split synthetic data (can use all for train/val in this simple case or split by location)
# Splitting on unique (lon, lat) pairs keeps every location entirely in one set.
synth_unique_locs = df_synthetic_processed[
    [SYNTHETIC_LON_COL, SYNTHETIC_LAT_COL]
].drop_duplicates()
synth_train_locs, synth_val_locs = train_test_split(
    synth_unique_locs,
    test_size=SYNTHETIC_VAL_SPLIT,
    random_state=SYNTHETIC_TUNER_SEED,
)
df_synth_tuner_train = df_synthetic_processed.merge(
    synth_train_locs, on=[SYNTHETIC_LON_COL, SYNTHETIC_LAT_COL], how='inner'
)
df_synth_tuner_val = df_synthetic_processed.merge(
    synth_val_locs, on=[SYNTHETIC_LON_COL, SYNTHETIC_LAT_COL], how='inner'
)

logger.info(f"Synthetic tuner training data part shape: {df_synth_tuner_train.shape}")
logger.info(f"Synthetic tuner validation data part shape: {df_synth_tuner_val.shape}")

# Define feature lists for synthetic data
synth_static_features_list = list(synthetic_encoded_feature_names)
synth_dynamic_features_list = [SYNTHETIC_GWL_COL] + [
    c for c in SYNTHETIC_NUM_DRIVER_COLS if c in df_synth_tuner_train.columns
]
synth_future_features_list = [  # Example: use one of the drivers as a "known future"
    c for c in SYNTHETIC_NUM_DRIVER_COLS[:1] if c in df_synth_tuner_train.columns
]

# Arguments shared by the training and validation calls; only `df` differs.
synth_sequence_kwargs = dict(
    time_col=SYNTHETIC_TIME_COL_NUMERIC_PINN,
    lon_col=SYNTHETIC_LON_COL,
    lat_col=SYNTHETIC_LAT_COL,
    subsidence_col=SYNTHETIC_SUBSIDENCE_COL,
    gwl_col=SYNTHETIC_GWL_COL,
    dynamic_cols=synth_dynamic_features_list,
    static_cols=synth_static_features_list,
    future_cols=synth_future_features_list,
    group_id_cols=[SYNTHETIC_LON_COL, SYNTHETIC_LAT_COL],
    time_steps=SYNTHETIC_TIME_STEPS,
    forecast_horizon=SYNTHETIC_FORECAST_HORIZON,
    output_subsidence_dim=SYNTHETIC_OUTPUT_S_DIM,
    output_gwl_dim=SYNTHETIC_OUTPUT_G_DIM,
    normalize_coords=True,
    return_coord_scaler=False,
    verbose=0,
)

# Prepare training sequences
inputs_train_np_s, targets_train_np_s = prepare_pinn_data_sequences(
    df=df_synth_tuner_train, **synth_sequence_kwargs
)

# Prepare validation sequences
inputs_val_np_s, targets_val_np_s = prepare_pinn_data_sequences(
    df=df_synth_tuner_val, **synth_sequence_kwargs
)

n_train_sequences = inputs_train_np_s['coords'].shape[0]
n_val_sequences = inputs_val_np_s['coords'].shape[0]
print(f"Num training sequences: {n_train_sequences}")
print(f"Num validation sequences: {n_val_sequences}")
if n_train_sequences == 0 or n_val_sequences == 0:
    print("WARNING: Empty train or val sequences for synthetic data. Adjust generation params.")
Expected Output (will vary based on sequence generation success):
Num training sequences: 658
Num validation sequences: 282
Step 1.4: Configure and Run PIHALTuner with Synthetic Data¶
We set up PIHALTuner with fixed parameters derived from our synthetic
data and a simplified hyperparameter search space.
# Define fixed parameters for PIHALTuner using synthetic data shapes
# Input dimensions are read from the last axis of the prepared arrays; a
# missing optional input falls back to an empty array, i.e. dimension 0.
fixed_params_synth = {
    "static_input_dim": inputs_train_np_s.get(
        'static_features', np.zeros((0, 0))
    ).shape[-1],
    "dynamic_input_dim": inputs_train_np_s['dynamic_features'].shape[-1],
    "future_input_dim": inputs_train_np_s.get(
        'future_features', np.zeros((0, 0, 0))
    ).shape[-1],
    "output_subsidence_dim": SYNTHETIC_OUTPUT_S_DIM,
    "output_gwl_dim": SYNTHETIC_OUTPUT_G_DIM,
    "forecast_horizon": SYNTHETIC_FORECAST_HORIZON,
    "quantiles": None,  # Point predictions for simpler synthetic example
    "max_window_size": SYNTHETIC_TIME_STEPS,
    "pde_mode": "none",  # No PDE for simple synthetic example
    "pinn_coefficient_C": None,
    "loss_weights": {'subs_pred': 1.0, 'gwl_pred': 1.0},
    "use_vsn": False,  # Simpler model without VSN for quick test
    "scales": [1],  # Single scale LSTM
    "memory_size": 10,  # Small memory
}

# Simplified hyperparameter space for synthetic example
param_space_synth = {
    'embed_dim': {'min_value': 8, 'max_value': 16, 'step': 8},
    'hidden_units': {'min_value': 16, 'max_value': 32, 'step': 16},
    'lstm_units': {'min_value': 16, 'max_value': 32, 'step': 16},
    'attention_units': {'min_value': 8, 'max_value': 16, 'step': 8},
    'num_heads': [1, 2],
    'dropout_rate': [0.0, 0.1],
    'learning_rate': [1e-3, 5e-3],
    # 'lambda_pde': [0.0] # Not tuning if pde_mode is none
}

logger.info("Instantiating PIHALTuner for synthetic data...")
synthetic_tuner = PIHALTuner(
    fixed_model_params=fixed_params_synth,
    param_space=param_space_synth,
    objective=SYNTHETIC_TUNER_OBJECTIVE,
    max_trials=SYNTHETIC_MAX_TRIALS,
    project_name=SYNTHETIC_MODEL_NAME_TUNED,
    directory=os.path.join(SYNTHETIC_RUN_OUTPUT_PATH, "tuner_synth_results"),
    executions_per_trial=1,
    tuner_type=SYNTHETIC_TUNER_TYPE,
    seed=SYNTHETIC_TUNER_SEED,
    overwrite_tuner=True,
)

# Callbacks
synthetic_early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor=SYNTHETIC_TUNER_OBJECTIVE,
    patience=2,
    restore_best_weights=True,
    verbose=0,
)

logger.info(f"Starting synthetic data hyperparameter search ({SYNTHETIC_TUNER_TYPE})...")

# Only search when both splits actually produced sequences.
if inputs_train_np_s['coords'].shape[0] > 0 and inputs_val_np_s['coords'].shape[0] > 0:
    synthetic_tuner.run(
        inputs=inputs_train_np_s,
        y=targets_train_np_s,
        validation_data=(inputs_val_np_s, targets_val_np_s),
        epochs=SYNTHETIC_EPOCHS_PER_TRIAL,
        batch_size=SYNTHETIC_BATCH_SIZE,
        callbacks=[synthetic_early_stopping],
        verbose=1,
    )
    logger.info("Synthetic data hyperparameter search completed.")
else:
    logger.error("Cannot start tuner search: training or validation sequences are empty for synthetic data.")
Expected Output (will be verbose from Keras Tuner):
...(logger messages for instantiation)...
Starting synthetic data hyperparameter search (randomsearch)...
Trial 1 Complete [...]
Best val_total_loss So Far: ...
...(more trials)...
Synthetic data hyperparameter search completed.
Step 1.5: Retrieve and Interpret Results (Synthetic Data)¶
After the search, we inspect the best hyperparameters found for our synthetic data tuning run.
# Report the best hyperparameters and confirm the best model can be rebuilt.
# Failures are logged rather than raised so the final summary line always runs.
try:
    best_hps_list_s = synthetic_tuner.get_best_hyperparameters(num_trials=1)
    if not best_hps_list_s:
        logger.error("Synthetic tuner could not retrieve best hyperparameters.")
    else:
        best_hps_s = best_hps_list_s[0]
        logger.info("\n--- Best Hyperparameters (Synthetic Data) ---")
        for param_name, value in best_hps_s.values.items():
            logger.info(f" {param_name}: {value}")

        best_models_s = synthetic_tuner.get_best_models(num_models=1)
        if best_models_s and best_models_s[0] is not None:
            logger.info("Best model from synthetic tuning retrieved.")
            # best_models_s[0].summary() # Optional: print summary
        else:
            logger.error("Synthetic tuner could not retrieve the best model instance.")
except Exception as e_results_s:
    logger.error(f"Error during synthetic result retrieval: {e_results_s}")

logger.info(
    f"Synthetic tuning finished. Check results in: "
    f"{os.path.join(SYNTHETIC_RUN_OUTPUT_PATH, 'tuner_synth_results')}"
)
Expected Output:
...(logger messages)...
--- Best Hyperparameters (Synthetic Data) ---
embed_dim: ...
hidden_units: ...
...(other hyperparameters and their values)...
Best model from synthetic tuning retrieved.
Synthetic tuning finished. Check results in: SyntheticCity_tuning_synthetic_output/tuner_synth_results
This completes the synthetic data example. It demonstrates the full pipeline,
allowing users to test PIHALTuner and understand its operation with
controllable data.
Section 2: Real Application Case - Example Workflow¶
For tuning PIHALNet on a real-world dataset like Zhongshan or Nansha,
the workflow follows the same fundamental steps as the synthetic data example,
but with careful attention to actual data characteristics, more extensive
preprocessing, and a more thorough hyperparameter search.
The detailed steps, shown with code in Steps 2.1 to 2.5 below, involve:
1. Configuration: Setting up paths, city-specific parameters, feature definitions, sequence parameters, and tuner settings appropriate for the real dataset.
2. Data Loading and Preprocessing: Loading the actual city data (e.g., from a CSV file), performing robust cleaning, handling missing values, encoding categorical features (e.g., ‘geology’), and scaling numerical driver features. This step is crucial and dataset-specific.
3. Prepare Data for Tuner: Splitting the processed data into training and validation sets suitable for the tuner (e.g., using a time-based or location-aware split to prevent data leakage) and then using prepare_pinn_data_sequences to generate the sequence dictionaries.
4. Configure and Run PIHALTuner: Defining the fixed_model_params (with dimensions inferred from the real prepared data) and a comprehensive param_space_config for the hyperparameters. Instantiating and running PIHALTuner for a significant number of trials and epochs.
5. Retrieve, Analyze, and Use Results: Extracting the best hyperparameters, retrieving the best model, saving these artifacts, and potentially retraining the best model on a larger portion of the data before deployment or further evaluation.
Steps 2.1 to 2.5 below provide a detailed code structure for a real application. The key is to adapt the data loading, preprocessing, feature engineering, and fixed/hyperparameter configurations to the specifics of your chosen real-world dataset.
Step 2.1: Configuration¶
Define essential configurations for your tuning run.
# --- Configuration Constants ---
DATA_FILE_PATH = "path/to/your/city_data.csv"  # IMPORTANT: Update this!
CITY_NAME = "your_city"  # e.g., "zhongshan" or "nansha"
RUN_OUTPUT_PATH = f"{CITY_NAME}_tuning_example_output"
MODEL_NAME_TUNED = f"PIHALNet_{CITY_NAME}_TunedExample"

# Data Parameters (adapt these to your dataset)
TIME_COL = "year"
DT_COL_NAME_TEMP = "datetime_temp"  # Temporary column for datetime objects
LON_COL = "longitude"
LAT_COL = "latitude"
SUBSIDENCE_COL = "subsidence"  # Your primary target
GWL_COL = "GWL"  # Your secondary target

# Example feature columns (customize for your dataset)
CATEGORICAL_COLS = ['geology']
NUMERICAL_DRIVER_COLS = [  # Features to scale and use as drivers
    'rainfall_mm', 'pumping_rate', 'river_level'
    # Add other relevant numerical features for your city
    # Exclude LON_COL, LAT_COL, TIME_COL (handled by prepare_pinn_data_sequences)
    # Exclude target columns (SUBSIDENCE_COL, GWL_COL)
]

# Sequence Parameters for PIHALNet
TIME_STEPS = 12  # Lookback window
FORECAST_HORIZON = 3  # Prediction horizon
OUTPUT_SUBSIDENCE_DIM = 1
OUTPUT_GWL_DIM = 1

# Data Splitting for Tuner
# Data up to this year for training/validation by the tuner
TRAIN_VAL_END_YEAR_TUNER = 2018  # Example
# Proportion of the above data to use for tuner's internal validation
VALIDATION_SPLIT_TUNER = 0.2
BATCH_SIZE_TUNER = 32

# Tuner Configuration
TUNER_OBJECTIVE = 'val_total_loss'  # Metric PIHALNet reports
MAX_TRIALS_TUNER = 10  # Keep low for example, increase for real tuning
EXECUTIONS_PER_TRIAL = 1  # Passed to PIHALTuner as executions_per_trial in Step 2.4
EPOCHS_PER_TRIAL_TUNER = 25  # Max epochs per trial
TUNER_TYPE = 'hyperband'  # 'randomsearch', 'bayesianoptimization', or 'hyperband'
TUNER_SEED = 42
Step 2.2: Data Loading and Preprocessing¶
Load your dataset and perform necessary preprocessing steps like cleaning,
encoding categorical features, and scaling numerical features. The
load_subsidence_pinn_data function (if you’re using it from
fusionlab.datasets) can handle some of this. Here, we show a
manual example.
def load_and_preprocess_city_data(
    file_path: str,
    time_col: str,
    dt_col_name: str,
    categorical_cols: List[str],
    numerical_cols: List[str],
    run_output_path: str,
    city_name: str,
) -> pd.DataFrame:
    """Load a city CSV and clean, encode, and scale it for sequence preparation.

    Parameters
    ----------
    file_path : path of the CSV file to read.
    time_col : name of the time column (numeric year or parseable date).
    dt_col_name : name of the datetime column derived from ``time_col``.
    categorical_cols : columns to one-hot encode (missing ones are skipped).
    numerical_cols : columns to min-max scale (missing ones are skipped).
    run_output_path : directory (created if needed) for encoder/scaler dumps.
    city_name : prefix for the dumped ``.joblib`` file names.

    Returns
    -------
    The processed DataFrame.

    Raises
    ------
    FileNotFoundError
        If ``file_path`` does not exist.
    Exception
        Any error from the datetime conversion is logged and re-raised.

    Notes
    -----
    Besides its parameters, this function reads the module-level constants
    ``LON_COL``, ``LAT_COL``, ``SUBSIDENCE_COL`` and ``GWL_COL``, and it sets
    the module-level globals ``encoded_feature_names_list`` and
    ``TIME_COL_NUMERIC_PINN`` that later snippets rely on.
    """
    # Later snippets read these names at module level, so they stay global.
    global encoded_feature_names_list  # Make accessible for feature definition
    global TIME_COL_NUMERIC_PINN  # Make accessible

    logger.info(f"Loading data from: {file_path}")
    if not os.path.exists(file_path):
        # For this example, we'll raise an error.
        # In your script, you might generate dummy data as a fallback.
        raise FileNotFoundError(
            f"Data file NOT FOUND: {file_path}. Please update."
        )
    df = pd.read_csv(file_path)
    logger.info(f"Original data shape: {df.shape}")

    # Basic Cleaning (adapt to your data)
    # Rows missing time, coordinates, or either target are dropped.
    essential_cols = [time_col, LON_COL, LAT_COL, SUBSIDENCE_COL, GWL_COL]
    df = df.dropna(subset=essential_cols).copy()

    # Convert time column to datetime
    try:
        if pd.api.types.is_numeric_dtype(df[time_col]):
            # Numeric values are interpreted as calendar years.
            df[dt_col_name] = pd.to_datetime(df[time_col], format='%Y')
        else:
            df[dt_col_name] = pd.to_datetime(df[time_col])
    except Exception as e:
        logger.error(f"Error converting time column '{time_col}': {e}")
        raise
    df = df.dropna(subset=[dt_col_name])

    os.makedirs(run_output_path, exist_ok=True)

    # Encode Categorical Features
    encoded_feature_names_list = []
    cats_to_encode = [c for c in categorical_cols if c in df.columns]
    if cats_to_encode:
        encoder = OneHotEncoder(
            sparse_output=False, handle_unknown='ignore', dtype=np.float32
        )
        encoded_data = encoder.fit_transform(df[cats_to_encode])
        ohe_cols = encoder.get_feature_names_out(cats_to_encode)
        encoded_feature_names_list.extend(ohe_cols)
        enc_df = pd.DataFrame(encoded_data, columns=ohe_cols, index=df.index)
        # Replace the raw categorical columns with their one-hot columns.
        df = pd.concat([df.drop(columns=cats_to_encode), enc_df], axis=1)
        joblib.dump(encoder, os.path.join(run_output_path, f"{city_name}_ohe.joblib"))
        logger.info(f"Categorical features encoded: {cats_to_encode}")

    # Scale Numerical Driver Features
    num_to_scale = [c for c in numerical_cols if c in df.columns]
    if num_to_scale:
        scaler = MinMaxScaler()
        df[num_to_scale] = scaler.fit_transform(df[num_to_scale])
        joblib.dump(scaler, os.path.join(run_output_path, f"{city_name}_scaler.joblib"))
        logger.info(f"Numerical driver features scaled: {num_to_scale}")

    # Create the numerical time column for PINN sequences
    # Ensure this happens after all row manipulations (like dropna)
    # Fractional year: year + (day-of-year - 1) / days-in-that-year.
    TIME_COL_NUMERIC_PINN = f"{time_col}_numeric_pinn"
    df[TIME_COL_NUMERIC_PINN] = (
        df[dt_col_name].dt.year +
        (df[dt_col_name].dt.dayofyear - 1) /
        (365 + df[dt_col_name].dt.is_leap_year.astype(int))
    )
    logger.info(f"Processed data shape: {df.shape}")
    return df

# Load and preprocess your data
df_processed = load_and_preprocess_city_data(
    file_path=DATA_FILE_PATH,
    time_col=TIME_COL,
    dt_col_name=DT_COL_NAME_TEMP,
    categorical_cols=CATEGORICAL_COLS,
    numerical_cols=NUMERICAL_DRIVER_COLS,
    run_output_path=RUN_OUTPUT_PATH,
    city_name=CITY_NAME,
)
Step 2.3: Prepare Data for Tuner¶
Split the processed data into training and validation sets for the tuner.
Then, use prepare_pinn_data_sequences to format this data into the
structure required by PIHALNet.
# Split data for tuner
df_for_tuner_train_val = df_processed[
    df_processed[DT_COL_NAME_TEMP].dt.year <= TRAIN_VAL_END_YEAR_TUNER
].copy()

if df_for_tuner_train_val.empty:
    raise ValueError(f"No data available up to year {TRAIN_VAL_END_YEAR_TUNER}")

# Split unique locations to avoid data leakage between train and val
unique_locs = df_for_tuner_train_val[[LON_COL, LAT_COL]].drop_duplicates()
if len(unique_locs) < 2:
    # A location-based split needs at least two locations; fall back to rows.
    logger.warning("Very few unique locations for robust train/val split. Using random split on all data.")
    df_tuner_train, df_tuner_val = train_test_split(
        df_for_tuner_train_val,
        test_size=VALIDATION_SPLIT_TUNER,
        random_state=TUNER_SEED,
    )
else:
    train_locations, val_locations = train_test_split(
        unique_locs, test_size=VALIDATION_SPLIT_TUNER, random_state=TUNER_SEED
    )
    df_tuner_train = df_for_tuner_train_val.merge(
        train_locations, on=[LON_COL, LAT_COL], how='inner'
    )
    df_tuner_val = df_for_tuner_train_val.merge(
        val_locations, on=[LON_COL, LAT_COL], how='inner'
    )

if df_tuner_train.empty or df_tuner_val.empty:
    raise ValueError("Tuner train or validation set is empty after location-based split.")

logger.info(f"Tuner training data part shape: {df_tuner_train.shape}")
logger.info(f"Tuner validation data part shape: {df_tuner_val.shape}")

# Define feature sets for prepare_pinn_data_sequences
# Static features often include one-hot encoded categoricals
static_features_list = list(encoded_feature_names_list)
# Dynamic features are typically scaled numerical drivers and targets (like GWL)
dynamic_features_list = [GWL_COL] + [
    c for c in NUMERICAL_DRIVER_COLS if c in df_tuner_train.columns
]
# Future features might include forecasted drivers like rainfall
future_features_list = ['rainfall_mm']  # Example, if available and scaled
future_features_list = [c for c in future_features_list if c in df_tuner_train.columns]

# Arguments shared by the training and validation calls.
sequence_kwargs = dict(
    time_col=TIME_COL_NUMERIC_PINN,  # Use the generated numeric time
    lon_col=LON_COL,
    lat_col=LAT_COL,
    subsidence_col=SUBSIDENCE_COL,
    gwl_col=GWL_COL,
    dynamic_cols=dynamic_features_list,
    static_cols=static_features_list,
    future_cols=future_features_list,
    group_id_cols=[LON_COL, LAT_COL],
    time_steps=TIME_STEPS,
    forecast_horizon=FORECAST_HORIZON,
    output_subsidence_dim=OUTPUT_SUBSIDENCE_DIM,
    output_gwl_dim=OUTPUT_GWL_DIM,
    normalize_coords=True,  # Let sequence prep handle coordinate normalization
    # cols_to_scale ='auto', # scale numeric data except one hot encoding
)

# Prepare training sequences for the tuner
logger.info("Preparing training sequences for tuner...")
inputs_train_np, targets_train_np, coord_scaler = prepare_pinn_data_sequences(
    df=df_tuner_train,
    return_coord_scaler=True,  # We might need this scaler later
    verbose=7,
    **sequence_kwargs,
)

# Prepare validation sequences for the tuner
logger.info("Preparing validation sequences for tuner...")
# With return_coord_scaler=False only (inputs, targets) are returned, so
# unpack two values; unpacking three here would raise ValueError.
inputs_val_np, targets_val_np = prepare_pinn_data_sequences(
    df=df_tuner_val,
    return_coord_scaler=False,  # Scaler from training data is usually sufficient
    verbose=1,
    **sequence_kwargs,
)

if inputs_train_np['coords'].shape[0] == 0 or inputs_val_np['coords'].shape[0] == 0:
    raise ValueError("Sequence preparation resulted in empty training or validation data for the tuner.")
Step 2.4: Configure and Run PIHALTuner¶
Define the fixed parameters for PIHALNet (many are inferred from data)
and the hyperparameter search space. Then, instantiate and run PIHALTuner.
# Define fixed parameters for PIHALTuner
# These are derived from data or set as constants for this tuning run
# Input dimensions are read from the last axis of the prepared arrays; a
# missing optional input falls back to an empty array, i.e. dimension 0.
fixed_model_params_for_tuner = {
    "static_input_dim": inputs_train_np.get(
        'static_features', np.zeros((0, 0))
    ).shape[-1],
    "dynamic_input_dim": inputs_train_np['dynamic_features'].shape[-1],
    "future_input_dim": inputs_train_np.get(
        'future_features', np.zeros((0, 0, 0))
    ).shape[-1],
    "output_subsidence_dim": OUTPUT_SUBSIDENCE_DIM,
    "output_gwl_dim": OUTPUT_GWL_DIM,
    "forecast_horizon": FORECAST_HORIZON,
    "quantiles": [0.1, 0.5, 0.9],  # Example, or None
    "max_window_size": TIME_STEPS,
    "pde_mode": "consolidation",  # Example fixed PDE mode
    "pinn_coefficient_C": "learnable",
    "loss_weights": {'subs_pred': 1.0, 'gwl_pred': 0.8},
    # Add other PIHALNet parameters that should be fixed during tuning
    "scales": [1, 2],  # Example fixed scales
    "memory_size": 50,
    "use_vsn": True,
}

# Define the hyperparameter search space
param_space_config = {
    'embed_dim': {'min_value': 32, 'max_value': 64, 'step': 16},
    'hidden_units': {'min_value': 32, 'max_value': 128, 'step': 32},
    'lstm_units': {'min_value': 32, 'max_value': 128, 'step': 32},
    'attention_units': {'min_value': 16, 'max_value': 64, 'step': 16},
    'num_heads': [2, 4],
    'dropout_rate': {'min_value': 0.0, 'max_value': 0.2, 'step': 0.1},
    'vsn_units': {'min_value': 16, 'max_value': 32, 'step': 16},
    'activation': ['relu', 'gelu'],
    'learning_rate': [1e-4, 5e-4, 1e-3],
    'lambda_pde': {'min_value': 0.01, 'max_value': 0.5, 'sampling': 'linear'},
    # pinn_coefficient_C_type can also be tuned if 'pinn_coefficient_C' is not fixed
}

logger.info("Instantiating PIHALTuner...")
tuner = PIHALTuner(
    fixed_model_params=fixed_model_params_for_tuner,
    param_space=param_space_config,
    objective=TUNER_OBJECTIVE,
    max_trials=MAX_TRIALS_TUNER,
    project_name=MODEL_NAME_TUNED,
    directory=os.path.join(RUN_OUTPUT_PATH, "tuner_results"),
    executions_per_trial=EXECUTIONS_PER_TRIAL,
    tuner_type=TUNER_TYPE,
    seed=TUNER_SEED,
    overwrite_tuner=True,  # Set to False to resume previous tuning
)

# Callbacks for the search
early_stopping_cb = tf.keras.callbacks.EarlyStopping(
    monitor=TUNER_OBJECTIVE,
    patience=5,  # Shorter patience for faster example
    restore_best_weights=True,
    verbose=1,
)

logger.info(f"Starting hyperparameter search ({TUNER_TYPE})...")

# PIHALTuner's `run` method expects NumPy dicts and handles tf.data.Dataset creation
tuner.run(  # Or use tuner.search if PINNTunerBase directly defines it
    inputs=inputs_train_np,
    y=targets_train_np,  # Ensure keys are "subs_pred", "gwl_pred" or will be renamed
    validation_data=(inputs_val_np, targets_val_np),
    epochs=EPOCHS_PER_TRIAL_TUNER,
    batch_size=BATCH_SIZE_TUNER,
    callbacks=[early_stopping_cb],
    verbose=1,
)
logger.info("Hyperparameter search completed.")
Step 2.5: Retrieve and Use Results¶
After the search, you can get the best hyperparameters and the best model instance.
# Report and persist the best hyperparameters and the best model.
# Failures are logged rather than raised so the final summary line always runs.
try:
    best_hps_list = tuner.get_best_hyperparameters(num_trials=1)
    if not best_hps_list:
        logger.error("Tuner could not retrieve best hyperparameters.")
    else:
        best_hps = best_hps_list[0]
        logger.info("\n--- Best Hyperparameters Found ---")
        for param_name, value in best_hps.values.items():
            logger.info(f" {param_name}: {value}")

        # Save best HPs
        best_hps_path = os.path.join(
            RUN_OUTPUT_PATH, f"{MODEL_NAME_TUNED}_best_hps.txt"
        )
        # Explicit encoding keeps the file identical across platforms.
        with open(best_hps_path, 'w', encoding='utf-8') as f:
            for param, val in best_hps.values.items():
                f.write(f"{param}: {val}\n")
        logger.info(f"Best hyperparameters saved to: {best_hps_path}")

        # Get the best model
        best_models = tuner.get_best_models(num_models=1)
        if best_models and best_models[0] is not None:
            best_pihalnet_model = best_models[0]
            logger.info("\n--- Best Model Summary ---")
            best_pihalnet_model.summary(line_length=110)

            # Save the best model
            best_model_path = os.path.join(
                RUN_OUTPUT_PATH, f"{MODEL_NAME_TUNED}_best_model.keras"
            )
            best_pihalnet_model.save(best_model_path)
            logger.info(f"Best PIHALNet model saved to: {best_model_path}")

            # Optionally, retrain the best model on more data or for more epochs
            # logger.info("Retraining best model on full train_val data...")
            # ... (prepare full train_val dataset) ...
            # best_pihalnet_model.fit(full_train_val_dataset, epochs=50, ...)

        else:
            logger.error("Tuner could not retrieve the best model instance.")

except Exception as e_results:
    logger.error(f"Error during result retrieval or saving: {e_results}")

logger.info(
    f"Tuning process finished. Check results in: "
    f"{os.path.join(RUN_OUTPUT_PATH, 'tuner_results')}"
)
This example provides a template. You’ll need to:

- Update `DATA_FILE_PATH` and other path/name configurations.
- Customize `load_and_preprocess_city_data` for your specific dataset’s cleaning and feature engineering needs.
- Adjust feature lists (CATEGORICAL_COLS, NUMERICAL_DRIVER_COLS, static_features_list, etc.) to match your data.
- Refine `fixed_model_params_for_tuner` and `param_space_config` to suit the aspects of PIHALNet you want to fix versus tune.
- Increase `MAX_TRIALS_TUNER` and `EPOCHS_PER_TRIAL_TUNER` for a more thorough search in a real application.