Source code for clustpy.deep.neural_networks.stacked_autoencoder

"""
@authors:
Collin Leiber
"""

import torch
from clustpy.deep.neural_networks.feedforward_autoencoder import FeedforwardAutoencoder
from clustpy.deep._utils import get_device_from_module
from clustpy.deep._data_utils import get_dataloader
import numpy as np
import tqdm
from collections.abc import Callable
from clustpy.deep._utils import set_torch_seed


class StackedAutoencoder(FeedforwardAutoencoder):
    """
    A stacked autoencoder.
    Regarding its architecture, it corresponds to a standard FeedforwardAutoencoder but uses a different training strategy.
    First, each layer is trained separately in a greedy manner, referred to as layer-wise training.
    Afterward, all layers are trained at once to finetune the autoencoder.

    Parameters
    ----------
    layers : list
        list of the different layer sizes from input to embedding, e.g. an example architecture for MNIST [784, 512, 256, 10], where 784 is the input dimension and 10 the embedding dimension.
        Note that in case of a StackedAutoencoder the decoder requires the reversed structure of the encoder.
    batch_norm : bool
        Set True if you want to use torch.nn.BatchNorm1d (default: False)
    dropout : float
        Set the amount of dropout you want to use (default: None)
    activation_fn : torch.nn.Module
        activation function from torch.nn, set the activation function for the hidden layers, if None then it will be linear (default: torch.nn.LeakyReLU)
    bias : bool
        set False if you do not want to use a bias term in the linear layers (default: True)
    decoder_output_fn : torch.nn.Module
        activation function from torch.nn, set the activation function for the decoder output layer, if None then it will be linear.
        E.g. set to torch.nn.Sigmoid if you want to scale the decoder output between 0 and 1 (default: None)
    work_on_copy : bool
        If set to true, deep clustering algorithms will optimize a copy of the autoencoder and not the autoencoder itself.
        Ensures that the same autoencoder can be used by multiple deep clustering algorithms.
        As copies of this object are created, the memory requirement increases (default: True)
    random_state : np.random.RandomState | int
        use a fixed random state to get a repeatable solution. Can also be of type int (default: None)

    Attributes
    ----------
    encoder : FullyConnectedBlock
        encoder part of the autoencoder, responsible for embedding data points (class is FullyConnectedBlock)
    decoder : FullyConnectedBlock
        decoder part of the autoencoder, responsible for reconstructing data points from the embedding (class is FullyConnectedBlock)
    fitted : bool
        indicates whether the autoencoder is already fitted
    work_on_copy : bool
        indicates whether deep clustering algorithms should work on a copy of the original autoencoder

    References
    ----------
    E.g.:
    Bengio, Yoshua, et al. "Greedy layer-wise training of deep networks."
    Advances in neural information processing systems 19 (2006).

    or

    Vincent, Pascal, et al. "Stacked denoising autoencoders: Learning useful representations in a deep network with a local denoising criterion."
    Journal of machine learning research 11.12 (2010).
    """

    def __init__(self, layers: list, batch_norm: bool = False, dropout: float = None,
                 activation_fn: torch.nn.Module = torch.nn.LeakyReLU, bias: bool = True,
                 decoder_output_fn: torch.nn.Module = None, work_on_copy: bool = True,
                 random_state: np.random.RandomState | int = None):
        # The sixth positional argument of the parent constructor is fixed to None.
        # NOTE(review): presumably this is the explicit decoder layer specification, left empty so that the
        # decoder mirrors the encoder - confirm against FeedforwardAutoencoder.__init__.
        super().__init__(layers, batch_norm, dropout, activation_fn, bias, None, decoder_output_fn, work_on_copy,
                         random_state)

    def layerwise_training(self, n_epochs_per_layer: int = 20, optimizer_params: dict = None, batch_size: int = 128,
                           data: np.ndarray | torch.Tensor = None, dataloader: torch.utils.data.DataLoader = None,
                           optimizer_class: torch.optim.Optimizer = torch.optim.Adam,
                           ssl_loss_fn: torch.nn.modules.loss._Loss = torch.nn.MSELoss(),
                           corruption_fn: Callable = None) -> 'StackedAutoencoder':
        """
        Trains the autoencoder in a greedy layer-wise fashion.
        For each linear encoder layer, the batch is first passed (without gradients) through all preceding entries
        of the encoder block. Then the current encoder layer and its mirrored linear decoder layer are trained to
        reconstruct that intermediate representation.

        Parameters
        ----------
        n_epochs_per_layer : int
            number of epochs for training each layer separately (default: 20)
        optimizer_params : dict
            parameters of the optimizer, includes the learning rate (default: {"lr": 1e-3})
        batch_size : int
            size of the data batches (default: 128)
        data : np.ndarray | torch.Tensor
            train data set. If data is passed then dataloader can remain empty (default: None)
        dataloader : torch.utils.data.DataLoader
            dataloader to be used for training (default: None)
        optimizer_class : torch.optim.Optimizer
            optimizer to be used (default: torch.optim.Adam)
        ssl_loss_fn : torch.nn.modules.loss._Loss
            self-supervised learning (ssl) loss function for training the network, e.g. reconstruction loss (default: torch.nn.MSELoss())
        corruption_fn : Callable
            Can be used to corrupt the input data, e.g., when using a denoising autoencoder.
            Note that the function must match the data and the data loaders.
            For example, if the data is normalized, this may have to be taken into account in the corruption function - e.g. in case of salt and pepper noise (default: None)

        Returns
        -------
        self : StackedAutoencoder
            this instance of the autoencoder

        Raises
        ------
        ValueError: data cannot be None if dataloader is None
        """
        if dataloader is None:
            if data is None:
                raise ValueError("data must be specified if dataloader is None")
            dataloader = get_dataloader(data, batch_size, True)
        optimizer_params = {"lr": 1e-3} if optimizer_params is None else optimizer_params
        optimizer = optimizer_class(params=self.parameters(), **optimizer_params)
        device = get_device_from_module(self)
        # Positions of the linear layers inside the encoder / decoder blocks
        encoder_linear_layer_ids = self.encoder.layer_positions
        decoder_linear_layer_ids = self.decoder.layer_positions
        assert len(encoder_linear_layer_ids) == len(
            decoder_linear_layer_ids), "The decoder must be a reversed version of the encoder"
        # Start training
        # The progress bar is used as a context manager so that it is closed even if training raises
        with tqdm.tqdm(total=n_epochs_per_layer * len(encoder_linear_layer_ids), desc="Stacked AE training") as tbar:
            for layer_nr, encoder_layer_to_train in enumerate(encoder_linear_layer_ids):
                # The i-th encoder layer is paired with the i-th linear decoder layer counted from the end
                decoder_layer_to_train = decoder_linear_layer_ids[-(layer_nr + 1)]
                # Train this specific layer for a certain amount of epochs
                for _ in range(n_epochs_per_layer):
                    total_loss = 0
                    for batch in dataloader:
                        # NOTE(review): assumes position 1 of a batch holds the samples (as produced by
                        # get_dataloader) - confirm for custom dataloaders
                        input_data = batch[1].to(device)
                        with torch.no_grad():
                            # encode batch using already trained layers including non-linearity functions etc
                            for encode_layer in range(encoder_layer_to_train):
                                input_data = self.encoder.block[encode_layer](input_data)
                        # Calculate loss regarding current layer
                        # Only the input is corrupted, the reconstruction target stays uncorrupted
                        input_data_adj = input_data if corruption_fn is None else corruption_fn(input_data)
                        encoded = self.encoder.block[encoder_layer_to_train](input_data_adj)
                        decoded = self.decoder.block[decoder_layer_to_train](encoded)
                        loss = ssl_loss_fn(decoded, input_data)
                        # Update network
                        total_loss += loss.item()
                        optimizer.zero_grad()
                        loss.backward()
                        optimizer.step()
                    # One progress step per finished epoch, reporting the loss summed over all batches
                    postfix_str = {"Loss": total_loss, "LayerID": layer_nr}
                    tbar.set_postfix(postfix_str)
                    tbar.update()
        return self

    def fit(self, n_epochs_per_layer: int = 20, n_epochs: int = 100, optimizer_params: dict = None,
            batch_size: int = 128, data: np.ndarray | torch.Tensor = None,
            data_eval: np.ndarray | torch.Tensor = None, dataloader: torch.utils.data.DataLoader = None,
            evalloader: torch.utils.data.DataLoader = None,
            optimizer_class: torch.optim.Optimizer = torch.optim.Adam,
            ssl_loss_fn: torch.nn.modules.loss._Loss = torch.nn.MSELoss(), patience: int = 5,
            scheduler: torch.optim.lr_scheduler = None, scheduler_params: dict = None,
            corruption_fn: Callable = None, model_path: str = None) -> 'StackedAutoencoder':
        """
        Trains the autoencoder in place.
        First, a greedy layer-wise training is performed. Afterward, the weights are finetuned by training all layers simultaneously.

        Parameters
        ----------
        n_epochs_per_layer : int
            number of epochs for training each layer separately (default: 20)
        n_epochs : int
            number of epochs for the final finetuning (default: 100)
        optimizer_params : dict
            parameters of the optimizer, includes the learning rate (default: {"lr": 1e-3})
        batch_size : int
            size of the data batches (default: 128)
        data : np.ndarray | torch.Tensor
            train data set. If data is passed then dataloader can remain empty (default: None)
        data_eval : np.ndarray | torch.Tensor
            evaluation data set. If data_eval is passed then evalloader can remain empty. Only used for finetuning (default: None)
        dataloader : torch.utils.data.DataLoader
            dataloader to be used for training (default: None)
        evalloader : torch.utils.data.DataLoader
            dataloader to be used for evaluation, early stopping and learning rate scheduling if scheduler=torch.optim.lr_scheduler.ReduceLROnPlateau. Only used for finetuning (default: None)
        optimizer_class : torch.optim.Optimizer
            optimizer to be used (default: torch.optim.Adam)
        ssl_loss_fn : torch.nn.modules.loss._Loss
            self-supervised learning (ssl) loss function for training the network, e.g. reconstruction loss (default: torch.nn.MSELoss())
        patience : int
            patience parameter for EarlyStopping. Only used for finetuning (default: 5)
        scheduler : torch.optim.lr_scheduler
            learning rate scheduler that should be used.
            If torch.optim.lr_scheduler.ReduceLROnPlateau is used then the behaviour is matched by providing the validation_loss calculated based on samples from evalloader. Only used for finetuning (default: None)
        scheduler_params : dict
            dictionary of the parameters of the scheduler object. Only used for finetuning (default: None, which is treated as an empty dictionary)
        corruption_fn : Callable
            Can be used to corrupt the input data, e.g., when using a denoising autoencoder.
            Note that the function must match the data and the data loaders.
            For example, if the data is normalized, this may have to be taken into account in the corruption function - e.g. in case of salt and pepper noise (default: None)
        model_path : str
            if specified will save the trained model to the location. If evalloader is used, then only the best model w.r.t. evaluation loss is saved (default: None)

        Returns
        -------
        self : StackedAutoencoder
            this instance of the autoencoder

        Raises
        ------
        ValueError: data cannot be None if dataloader is None
        ValueError: evalloader cannot be None if scheduler=torch.optim.lr_scheduler.ReduceLROnPlateau
        """
        # Use a fresh dictionary per call instead of a mutable default shared between all calls
        scheduler_params = {} if scheduler_params is None else scheduler_params
        set_torch_seed(self.random_state)
        # Step 1: greedy pretraining of the individual layers
        self.layerwise_training(n_epochs_per_layer, optimizer_params, batch_size, data, dataloader, optimizer_class,
                                ssl_loss_fn, corruption_fn)
        # Step 2: finetune all layers at once using the regular training routine of the parent class
        super().fit(n_epochs, optimizer_params, batch_size, data, data_eval, dataloader, evalloader, optimizer_class,
                    ssl_loss_fn, patience, scheduler, scheduler_params, corruption_fn, model_path)
        return self