# Source code for clustpy.deep._train_utils

from clustpy.deep.neural_networks import FeedforwardAutoencoder
import torch
import copy
import numpy as np
from sklearn.base import ClusterMixin
from clustpy.deep._data_utils import get_dataloader, get_train_and_test_dataloader, get_data_dim_from_dataloader
from clustpy.deep._utils import run_initial_clustering, detect_device, encode_batchwise


def _get_default_layers(input_dim: int, embedding_size: int) -> list:
    """
    Get the default layers for a feedforward autoencoder.
    Default layers are [input_dim, 500, 500, 2000, embedding_size]

    Parameters
    ----------
    input_dim : int
        size of the first layer
    embedding_size : int
        size of the last layer

    Returns
    -------
    layers : list
        list containing the layers
    """
    layers = [input_dim, 500, 500, 2000, embedding_size]
    return layers


def _get_neural_network(input_dim: int, embedding_size: int = 10, neural_network: torch.nn.Module | tuple = None,
                        neural_network_class: torch.nn.Module = FeedforwardAutoencoder,
                        neural_network_params: dict = None, neural_network_weights: str = None,
                        random_state: np.random.RandomState | int = None) -> torch.nn.Module:
    """Return a neural network object, creating a new one if necessary.
    - If neural_network is not None, it is used as is (no new object is created).
    - If neural_network is None, a new neural network is created by calling neural_network_class with the entries of neural_network_params as keyword arguments.
      Missing entries "layers" and "random_state" are filled in with the default layers and the given random_state.
    In both cases the network must expose an attribute 'fitted'.
    Optionally, the weights referenced by neural_network_weights are loaded via neural_network.load_parameters().

    The dictionary passed as neural_network_params is never modified; defaults are added to an internal copy.

    Parameters
    ----------
    input_dim : int
        The input number of features
    embedding_size : int
        dimension of the innermost layer of the neural network (default: 10)
    neural_network : torch.nn.Module | tuple
        the neural network used for the computations.
        Can also be None. In this case a new neural network will be created using neural_network_class and neural_network_params (default: None)
    neural_network_class : torch.nn.Module
        The neural network class that should be used (default: FeedforwardAutoencoder)
    neural_network_params : dict
        Parameters to be used when creating a new neural network using the neural_network_class (default: None)
    neural_network_weights : str
        Path to a file containing the state_dict of the neural_network (default: None)
    random_state : np.random.RandomState | int
        use a fixed random state to get a repeatable solution. Can also be of type int (default: None)

    Returns
    -------
    neural_network : torch.nn.Module
        The given or newly created neural network

    Raises
    ------
    AssertionError
        If the neural network has no attribute 'fitted'
    """
    if neural_network is None:
        if embedding_size > input_dim:
            print(
                "WARNING: embedding_size is larger than the dimensionality of the input dataset. embedding_size: {0} / input dimensionality: {1}".format(
                    embedding_size, input_dim))
        # Work on a shallow copy: the defaults added below depend on this call's input_dim / embedding_size
        # and must not leak into the caller's dictionary, which may be reused for another data set
        neural_network_params = {} if neural_network_params is None else dict(neural_network_params)
        if "layers" not in neural_network_params:
            neural_network_params["layers"] = _get_default_layers(input_dim, embedding_size)
        if "random_state" not in neural_network_params:
            neural_network_params["random_state"] = random_state
        # User-specified layers take precedence over embedding_size; only inform about the mismatch
        if neural_network_params["layers"][-1] != embedding_size:
            print(
                "WARNING: embedding_size ({0}) in _get_neural_network does not correspond to the layers used to create the neural network. In the following an embedding size of {1} as specified in the layers will be used".format(
                    embedding_size, neural_network_params["layers"][-1]))
        neural_network = neural_network_class(**neural_network_params)
    assert hasattr(neural_network,
                   "fitted"), "Neural network has no attribute 'fitted' and is therefore not compatible. Check documentation of fitted, e.g., at clustpy.deep.neural_networks._abstract_autoencoder._AbstractAutoencoder"
    if neural_network_weights is not None:
        neural_network.load_parameters(neural_network_weights)
    return neural_network


def get_trained_network(trainloader: torch.utils.data.DataLoader = None, data: np.ndarray = None,
                        n_epochs: int = 100, batch_size: int = 128, optimizer_params: dict = None,
                        optimizer_class: torch.optim.Optimizer = torch.optim.Adam, device=None,
                        ssl_loss_fn: torch.nn.modules.loss._Loss = torch.nn.MSELoss(), embedding_size: int = 10,
                        neural_network: torch.nn.Module | tuple = None,
                        neural_network_class: torch.nn.Module = FeedforwardAutoencoder,
                        neural_network_params: dict = None, neural_network_weights: str = None,
                        random_state: np.random.RandomState | int = None) -> torch.nn.Module:
    """This function returns a trained neural network. The following cases are considered
       - If the neural network is initialized and trained (neural_network.fitted==True), it is returned without training it again.
       - If the neural network is initialized and not trained (neural_network.fitted==False), its fit() function is called using n_epochs, optimizer_params, optimizer_class, ssl_loss_fn and the trainloader.
       - If the neural network is None or a tuple (class, parameters), a new neural network is created first, and it will be fitted as described above.
       Beware the input neural_network_class or neural_network object needs both a fit() function and the fitted attribute. See clustpy.deep.feedforward_autoencoder.FeedforwardAutoencoder for an example.
       If the neural network additionally has a truthy attribute work_on_copy, a deep copy of the network is returned instead of the object itself.

    Parameters
    ----------
    trainloader : torch.utils.data.DataLoader
        dataloader used to train neural_network (default: None)
    data : np.ndarray
        train data set. If data is passed then trainloader can remain empty (default: None)
    n_epochs : int
        number of training epochs (default: 100)
    batch_size : int
        size of the data batches. Only used if the trainloader has to be created from data (default: 128)
    optimizer_params : dict
        parameters of the optimizer for the neural network training, includes the learning rate.
        If None, {"lr": 1e-3} is used (default: None)
    optimizer_class : torch.optim.Optimizer
        optimizer for training (default: torch.optim.Adam)
    device : torch.device
        The device on which to perform the computations.
        If device is None, the choice is delegated to detect_device (default: None)
    ssl_loss_fn : torch.nn.modules.loss._Loss
        self-supervised learning (ssl) loss function for training the network, e.g. reconstruction loss for autoencoders (default: torch.nn.MSELoss())
    embedding_size : int
        dimension of the innermost layer of the neural network (default: 10)
    neural_network : torch.nn.Module | tuple
        neural network object to be trained (optional)
        Can also be a tuple consisting of the neural network class (torch.nn.Module) and the initialization parameters (dict) (default: None)
    neural_network_class : torch.nn.Module
        The neural network class that should be used (default: FeedforwardAutoencoder)
    neural_network_params : dict
        Parameters to be used when creating a new neural network using the neural_network_class (default: None)
    neural_network_weights : str
        Path to a file containing the state_dict of the neural_network (default: None)
    random_state : np.random.RandomState | int
        use a fixed random state to get a repeatable solution. Can also be of type int (default: None)

    Returns
    -------
    neural_network : torch.nn.Module
        The fitted neural network

    Raises
    ------
    ValueError
        If both trainloader and data are None
    AssertionError
        If neural_network is a tuple that does not contain exactly two entries
    """
    if trainloader is None:
        if data is None:
            raise ValueError("data must be specified if trainloader is None")
        # NOTE(review): the third positional argument presumably enables shuffling - confirm against get_dataloader
        trainloader = get_dataloader(data, batch_size, True)
    # Get neural network object
    input_dim = get_data_dim_from_dataloader(trainloader)
    if isinstance(neural_network, tuple):
        assert len(
            neural_network) == 2, "If neural_network is a tuple, it has to contain two entries: the neural network class (torch.nn.Module) and the initialization parameters (dict)"
        # The tuple overrides neural_network_class and neural_network_params; a new network is created below
        neural_network_class, neural_network_params = neural_network
        neural_network = None
    neural_network = _get_neural_network(input_dim, embedding_size, neural_network, neural_network_class,
                                         neural_network_params, neural_network_weights, random_state)
    # Move neural network to device
    device = detect_device(device)
    neural_network.to(device)
    if not neural_network.fitted:
        print("Neural network is not fitted yet, will be pretrained.")
        # Pretrain neural network
        optimizer_params = {"lr": 1e-3} if optimizer_params is None else optimizer_params
        neural_network.fit(n_epochs=n_epochs, optimizer_params=optimizer_params, dataloader=trainloader,
                           optimizer_class=optimizer_class, ssl_loss_fn=ssl_loss_fn)
    # Only 'fit' and 'fitted' are required from custom networks, so a missing work_on_copy is treated as False
    if getattr(neural_network, "work_on_copy", False):
        # If neural network is used by multiple deep clustering algorithms, create a deep copy of the object
        neural_network = copy.deepcopy(neural_network)
    return neural_network


def get_default_deep_clustering_initialization(X: np.ndarray | torch.Tensor, n_clusters: int, batch_size: int,
                                               pretrain_optimizer_params: dict, pretrain_epochs: int,
                                               optimizer_class: torch.optim.Optimizer,
                                               ssl_loss_fn: torch.nn.modules.loss._Loss,
                                               neural_network: torch.nn.Module | tuple, embedding_size: int,
                                               custom_dataloaders: tuple, initial_clustering_class: ClusterMixin,
                                               initial_clustering_params: dict, device: torch.device,
                                               random_state: np.random.RandomState,
                                               neural_network_class: torch.nn.Module = FeedforwardAutoencoder,
                                               neural_network_params: dict = None,
                                               neural_network_weights: str = None) -> (
        torch.device, torch.utils.data.DataLoader, torch.utils.data.DataLoader, int, torch.nn.Module, np.ndarray, int,
        np.ndarray, np.ndarray, ClusterMixin):
    """
    Prepare the common starting point of most deep clustering algorithms.
    The steps are: resolve the device, obtain the train- and testloader, obtain a trained neural network,
    embed the data of the testloader and run an initial clustering on that embedding.

    Parameters
    ----------
    X : np.ndarray | torch.Tensor
        the given data set. Can be a np.ndarray or a torch.Tensor
    n_clusters : int
        number of clusters. Can be None if a corresponding initial_clustering_class is given, e.g. DBSCAN
    batch_size : int
        size of the data batches
    pretrain_optimizer_params : dict
        optimizer parameters used while pretraining the neural network, includes the learning rate
    pretrain_epochs : int
        number of pretraining epochs
    optimizer_class : torch.optim.Optimizer
        the optimizer
    ssl_loss_fn : torch.nn.modules.loss._Loss
        self-supervised learning (ssl) loss function for training the network, e.g. reconstruction loss for autoencoders
    neural_network : torch.nn.Module | tuple
        the input neural network. If None, a new network of type neural_network_class will be created.
        Can also be a tuple consisting of the neural network class (torch.nn.Module) and the initialization parameters (dict)
    embedding_size : int
        size of the embedding within the neural network
    custom_dataloaders : tuple
        tuple with a trainloader (random order) at the first and a testloader (non-random order) at the second position.
        Can also be a tuple of strings, where the first entry is the path to a saved trainloader and the second entry the path to a saved testloader.
        In this case the dataloaders will be loaded by torch.load(PATH).
        If None, the default dataloaders will be used
    initial_clustering_class : ClusterMixin
        clustering class used to obtain the initial cluster labels after the pretraining.
        If it is None, random labels will be chosen
    initial_clustering_params : dict
        parameters for the initial clustering class
    device : torch.device
        The device on which to perform the computations
    random_state : np.random.RandomState
        use a fixed random state to get a repeatable solution
    neural_network_class : torch.nn.Module
        The neural network class that should be used (default: FeedforwardAutoencoder)
    neural_network_params : dict
        Parameters to be used when creating a new neural network using the neural_network_class (default: None)
    neural_network_weights : str
        Path to a file containing the state_dict of the neural_network (default: None)

    Returns
    -------
    tuple : (torch.device, torch.utils.data.DataLoader, torch.utils.data.DataLoader, int, torch.nn.Module, np.ndarray, int, np.ndarray, np.ndarray, ClusterMixin)
        The device,
        The trainloader,
        The testloader,
        The batch size (can be different from input if another value is used within custom_dataloader),
        The pretrained neural network,
        The embedded data,
        The number of clusters (can change if e.g. DBSCAN is used),
        The initial cluster labels,
        The initial cluster centers,
        The clustering object
    """
    device = detect_device(device)
    trainloader, testloader, batch_size = get_train_and_test_dataloader(X, batch_size, custom_dataloaders)
    # Gather everything that controls the creation and pretraining of the network in one place
    pretraining_options = dict(
        n_epochs=pretrain_epochs,
        optimizer_params=pretrain_optimizer_params,
        optimizer_class=optimizer_class,
        device=device,
        ssl_loss_fn=ssl_loss_fn,
        embedding_size=embedding_size,
        neural_network=neural_network,
        neural_network_class=neural_network_class,
        neural_network_params=neural_network_params,
        neural_network_weights=neural_network_weights,
        random_state=random_state,
    )
    neural_network = get_trained_network(trainloader, **pretraining_options)
    # The initial clustering is computed on the embedding of the testloader data
    embedded_data = encode_batchwise(testloader, neural_network)
    initial_clustering = run_initial_clustering(embedded_data, n_clusters, initial_clustering_class,
                                                initial_clustering_params, random_state)
    n_clusters, init_labels, init_centers, init_cluster_obj = initial_clustering
    return (device, trainloader, testloader, batch_size, neural_network, embedded_data, n_clusters, init_labels,
            init_centers, init_cluster_obj)