# Source code for clustpy.deep._train_utils

from clustpy.deep.autoencoders import FeedforwardAutoencoder
import torch
import copy


def _get_default_layers(input_dim: int, embedding_size: int) -> list:
    """
    Get the default layers for an autoencoder.
    Default layers are [input_dim, 500, 500, 2000, embedding_size]

    Parameters
    ----------
    input_dim : int
        size of the first layer
    embedding_size : int
        size of the last layer

    Returns
    -------
    layers : list
        list containing the layers
    """
    layers = [input_dim, 500, 500, 2000, embedding_size]
    return layers


def get_trained_autoencoder(trainloader: torch.utils.data.DataLoader, optimizer_params: dict, n_epochs: int,
                            device: torch.device, optimizer_class: torch.optim.Optimizer,
                            loss_fn: torch.nn.modules.loss._Loss, embedding_size: int,
                            autoencoder: torch.nn.Module = None,
                            autoencoder_class: torch.nn.Module = FeedforwardAutoencoder) -> torch.nn.Module:
    """
    Return a trained autoencoder.
    The following cases are considered:

    - If the autoencoder is initialized and trained (autoencoder.fitted==True), it is not trained again.
    - If the autoencoder is initialized and not trained (autoencoder.fitted==False), its fit() method is called
      with the n_epochs, optimizer_params, trainloader, device, optimizer_class and loss_fn given to this function.
    - If the autoencoder is None, a new autoencoder is created using autoencoder_class with the default layers
      [input_dim, 500, 500, 2000, embedding_size], and it is fitted as described above.

    In every case the autoencoder is moved to the given device. If autoencoder.reusable is truthy, a deep copy of the
    autoencoder is returned instead of the object itself.

    Beware: the input autoencoder_class or autoencoder object needs a fit() function as well as the attributes
    'fitted' and 'reusable'. Only the presence of 'fitted' is checked explicitly; a missing 'reusable' attribute
    results in an AttributeError.
    See clustpy.deep.autoencoders.FeedforwardAutoencoder for an example.

    Parameters
    ----------
    trainloader : torch.utils.data.DataLoader
        dataloader used to train autoencoder
    optimizer_params : dict
        parameters of the optimizer for the autoencoder training, includes the learning rate
    n_epochs : int
        number of training epochs
    device : torch.device
        device to be trained on
    optimizer_class : torch.optim.Optimizer
        optimizer for training.
    loss_fn : torch.nn.modules.loss._Loss
        loss function for the reconstruction.
    embedding_size : int
        dimension of the innermost layer of the autoencoder.
        Only used if a new autoencoder has to be created (autoencoder is None)
    autoencoder : torch.nn.Module
        autoencoder object to be trained (optional) (default: None)
    autoencoder_class : torch.nn.Module
        The autoencoder class that should be used if autoencoder is None (default: FeedforwardAutoencoder)

    Returns
    -------
    autoencoder : torch.nn.Module
        The fitted autoencoder (a deep copy of it if autoencoder.reusable is truthy)

    Raises
    ------
    AssertionError
        If the autoencoder has no attribute 'fitted'
    """
    if autoencoder is None:
        # Get input dimensions from first batch: number of elements of the first entry of batch[1].
        # NOTE(review): assumes element 1 of each batch holds the data samples - confirm against the dataloader.
        input_dim = torch.numel(next(iter(trainloader))[1][0])
        if embedding_size > input_dim:
            print(
                "WARNING: embedding_size is larger than the dimensionality of the input dataset. embedding_size: {0} / input dimensionality: {1}".format(
                    embedding_size, input_dim))
        # Init Autoencoder parameters
        layers = _get_default_layers(input_dim, embedding_size)
        autoencoder = autoencoder_class(layers=layers)
    # The check applies to user-supplied objects as well as to freshly created ones
    assert hasattr(autoencoder, "fitted"), "Autoencoder has no attribute 'fitted' and is therefore not compatible. Check documentation of fitted clustpy.deep.autoencoders._abstract_autoencoder._AbstractAutoencoder"
    # Save autoencoder to device
    autoencoder.to(device)
    if not autoencoder.fitted:
        print("Autoencoder is not fitted yet, will be pretrained.")
        # Pretrain Autoencoder
        autoencoder.fit(n_epochs=n_epochs, optimizer_params=optimizer_params, dataloader=trainloader,
                        device=device, optimizer_class=optimizer_class, loss_fn=loss_fn)
    if autoencoder.reusable:
        # If autoencoder is used by multiple deep clustering algorithms, create a deep copy of the object
        autoencoder = copy.deepcopy(autoencoder)
    return autoencoder