from clustpy.deep.neural_networks import FeedforwardAutoencoder
import torch
import copy
import numpy as np
from sklearn.base import ClusterMixin
from clustpy.deep._data_utils import get_dataloader, get_train_and_test_dataloader, get_data_dim_from_dataloader
from clustpy.deep._utils import run_initial_clustering, detect_device, encode_batchwise
def _get_default_layers(input_dim: int, embedding_size: int) -> list:
    """
    Build the default layer sizes of a feedforward autoencoder.

    The hidden sizes are fixed to 500, 500 and 2000, so the result is
    [input_dim, 500, 500, 2000, embedding_size].

    Parameters
    ----------
    input_dim : int
        size of the first layer
    embedding_size : int
        size of the last layer

    Returns
    -------
    layers : list
        list containing the layer sizes, ordered from the first to the last layer
    """
    hidden_sizes = (500, 500, 2000)
    return [input_dim, *hidden_sizes, embedding_size]
def _get_neural_network(input_dim: int, embedding_size: int = 10, neural_network: torch.nn.Module | tuple = None,
                        neural_network_class: torch.nn.Module = FeedforwardAutoencoder,
                        neural_network_params: dict = None, neural_network_weights: str = None,
                        random_state: np.random.RandomState | int = None) -> torch.nn.Module:
    """
    Return a neural network object, creating one if none was passed.

    - If neural_network is not None, it is returned unchanged.
    - If neural_network is None, a new neural network is created by calling neural_network_class with the entries of neural_network_params as keyword arguments.
      Missing entries "layers" (default: [input_dim, 500, 500, 2000, embedding_size]) and "random_state" are filled in automatically.
      Optionally, the weights contained in the state_dict file referenced by neural_network_weights are loaded afterwards.

    The dictionary passed as neural_network_params is never modified; the defaults are added to a copy.

    Parameters
    ----------
    input_dim : int
        The input number of features
    embedding_size : int
        dimension of the innermost layer of the neural network (default: 10)
    neural_network : torch.nn.Module | tuple
        the neural network used for the computations.
        Can also be None. In this case a new neural network will be created using neural_network_class and neural_network_params (default: None)
    neural_network_class : torch.nn.Module
        The neural network class that should be used (default: FeedforwardAutoencoder)
    neural_network_params : dict
        Parameters to be used when creating a new neural network using the neural_network_class (default: None)
    neural_network_weights : str
        Path to a file containing the state_dict of the neural_network (default: None)
    random_state : np.random.RandomState | int
        use a fixed random state to get a repeatable solution. Can also be of type int (default: None)

    Returns
    -------
    neural_network : torch.nn.Module
        The created (or passed-through) neural network

    Raises
    ------
    AssertionError
        If the newly created neural network has no attribute 'fitted'
    """
    if neural_network is not None:
        # A network handed in by the caller is used as is
        return neural_network
    if embedding_size > input_dim:
        print(
            "WARNING: embedding_size is larger than the dimensionality of the input dataset. embedding_size: {0} / input dimensionality: {1}".format(
                embedding_size, input_dim))
    # Init neural network parameters.
    # Work on a shallow copy so that the defaults added below do not leak into the caller's dictionary
    # (otherwise a reused dictionary would keep the layers derived from an earlier data set)
    neural_network_params = {} if neural_network_params is None else dict(neural_network_params)
    if "layers" not in neural_network_params:
        neural_network_params["layers"] = _get_default_layers(input_dim, embedding_size)
    if "random_state" not in neural_network_params:
        neural_network_params["random_state"] = random_state
    if neural_network_params["layers"][-1] != embedding_size:
        # User-specified layers take precedence over embedding_size
        print(
            "WARNING: embedding_size ({0}) in _get_neural_network does not correspond to the layers used to create the neural network. In the following an embedding size of {1} as specified in the layers will be used".format(
                embedding_size, neural_network_params["layers"][-1]))
    neural_network = neural_network_class(**neural_network_params)
    assert hasattr(neural_network,
                   "fitted"), "Neural network has no attribute 'fitted' and is therefore not compatible. Check documentation of fitted, e.g., at clustpy.deep.neural_networks._abstract_autoencoder._AbstractAutoencoder"
    if neural_network_weights is not None:
        neural_network.load_parameters(neural_network_weights)
    return neural_network
def get_trained_network(trainloader: torch.utils.data.DataLoader = None, data: np.ndarray = None,
                        n_epochs: int = 100, batch_size: int = 128, optimizer_params: dict = None,
                        optimizer_class: torch.optim.Optimizer = torch.optim.Adam, device=None,
                        ssl_loss_fn: torch.nn.modules.loss._Loss = torch.nn.MSELoss(), embedding_size: int = 10,
                        neural_network: torch.nn.Module | tuple = None,
                        neural_network_class: torch.nn.Module = FeedforwardAutoencoder,
                        neural_network_params: dict = None, neural_network_weights: str = None,
                        random_state: np.random.RandomState | int = None) -> torch.nn.Module:
    """
    Return a trained neural network. The following cases are considered

    - If the neural network is initialized and trained (neural_network.fitted==True), then return input neural network without training it again.
    - If the neural network is initialized and not trained (neural_network.fitted==False), it will be fitted using its fit() method.
    - If the neural network is None or a tuple (class, parameters), a new neural network is created and handled as described above.

    If neural_network.work_on_copy is True, a deep copy of the (trained) neural network is returned instead of the object itself.
    Beware the input neural_network_class or neural_network object needs a fit() function as well as the attributes fitted and work_on_copy.
    See clustpy.deep.neural_networks.FeedforwardAutoencoder for an example.

    Parameters
    ----------
    trainloader : torch.utils.data.DataLoader
        dataloader used to train neural_network (default: None)
    data : np.ndarray
        train data set. If data is passed then trainloader can remain empty (default: None)
    n_epochs : int
        number of training epochs (default: 100)
    batch_size : int
        size of the data batches. Only used if the trainloader has to be created from data (default: 128)
    optimizer_params : dict
        parameters of the optimizer for the neural network training, includes the learning rate.
        If None, {"lr": 1e-3} is used (default: None)
    optimizer_class : torch.optim.Optimizer
        optimizer for training (default: torch.optim.Adam)
    device : torch.device
        The device on which to perform the computations.
        If device is None then it will be automatically chosen: if a gpu is available the gpu with the highest amount of free memory will be chosen (default: None)
    ssl_loss_fn : torch.nn.modules.loss._Loss
        self-supervised learning (ssl) loss function for training the network, e.g. reconstruction loss for autoencoders (default: torch.nn.MSELoss())
    embedding_size : int
        dimension of the innermost layer of the neural network (default: 10)
    neural_network : torch.nn.Module | tuple
        neural network object to be trained (optional)
        Can also be a tuple consisting of the neural network class (torch.nn.Module) and the initialization parameters (dict).
        In this case the tuple entries replace neural_network_class and neural_network_params (default: None)
    neural_network_class : torch.nn.Module
        The neural network class that should be used (default: FeedforwardAutoencoder)
    neural_network_params : dict
        Parameters to be used when creating a new neural network using the neural_network_class (default: None)
    neural_network_weights : str
        Path to a file containing the state_dict of the neural_network (default: None)
    random_state : np.random.RandomState | int
        use a fixed random state to get a repeatable solution. Can also be of type int (default: None)

    Returns
    -------
    neural_network : torch.nn.Module
        The fitted neural network

    Raises
    ------
    ValueError
        If both trainloader and data are None
    AssertionError
        If neural_network is a tuple that does not contain exactly two entries
    """
    if trainloader is None:
        if data is None:
            raise ValueError("data must be specified if trainloader is None")
        # NOTE(review): the third argument is presumably the shuffle flag of get_dataloader - confirm
        trainloader = get_dataloader(data, batch_size, True)
    # Get neural network object
    input_dim = get_data_dim_from_dataloader(trainloader)
    if isinstance(neural_network, tuple):
        assert len(
            neural_network) == 2, "If neural_network is a tuple, it has to contain two entries: the neural network class (torch.nn.Module) and the initialization parameters (dict)"
        # The tuple overrides the separately passed class and parameters
        neural_network_class, neural_network_params = neural_network
        neural_network = None
    neural_network = _get_neural_network(input_dim, embedding_size, neural_network, neural_network_class,
                                         neural_network_params, neural_network_weights, random_state)
    # Move neural network to device
    device = detect_device(device)
    neural_network.to(device)
    if not neural_network.fitted:
        print("Neural network is not fitted yet, will be pretrained.")
        # Pretrain neural network
        optimizer_params = {"lr": 1e-3} if optimizer_params is None else optimizer_params
        neural_network.fit(n_epochs=n_epochs, optimizer_params=optimizer_params, dataloader=trainloader,
                           optimizer_class=optimizer_class, ssl_loss_fn=ssl_loss_fn)
    if neural_network.work_on_copy:
        # If neural network is used by multiple deep clustering algorithms, create a deep copy of the object
        neural_network = copy.deepcopy(neural_network)
    return neural_network
def get_default_deep_clustering_initialization(X: np.ndarray | torch.Tensor, n_clusters: int, batch_size: int,
                                               pretrain_optimizer_params: dict, pretrain_epochs: int,
                                               optimizer_class: torch.optim.Optimizer,
                                               ssl_loss_fn: torch.nn.modules.loss._Loss,
                                               neural_network: torch.nn.Module | tuple, embedding_size: int,
                                               custom_dataloaders: tuple, initial_clustering_class: ClusterMixin,
                                               initial_clustering_params: dict, device: torch.device,
                                               random_state: np.random.RandomState,
                                               neural_network_class: torch.nn.Module = FeedforwardAutoencoder,
                                               neural_network_params: dict = None,
                                               neural_network_weights: str = None) -> (
        torch.device, torch.utils.data.DataLoader, torch.utils.data.DataLoader, int, torch.nn.Module, np.ndarray, int,
        np.ndarray, np.ndarray, ClusterMixin):
    """
    Prepare the common starting point of most deep clustering algorithms.

    The steps are: pick the device, build the train and test dataloaders, obtain a pretrained neural network,
    embed the data of the testloader with it and run an initial clustering on that embedding.

    Parameters
    ----------
    X : np.ndarray | torch.Tensor
        the given data set. Can be a np.ndarray or a torch.Tensor
    n_clusters : int
        number of clusters. Can be None if a corresponding initial_clustering_class is given, e.g. DBSCAN
    batch_size : int
        size of the data batches
    pretrain_optimizer_params : dict
        parameters of the optimizer for the pretraining of the neural network, includes the learning rate
    pretrain_epochs : int
        number of epochs for the pretraining of the neural network
    optimizer_class : torch.optim.Optimizer
        the optimizer
    ssl_loss_fn : torch.nn.modules.loss._Loss
        self-supervised learning (ssl) loss function for training the network, e.g. reconstruction loss for autoencoders
    neural_network : torch.nn.Module | tuple
        the input neural network. If None, a new neural network will be created using neural_network_class.
        Can also be a tuple consisting of the neural network class (torch.nn.Module) and the initialization parameters (dict)
    embedding_size : int
        size of the embedding within the neural network
    custom_dataloaders : tuple
        tuple consisting of a trainloader (random order) at the first and a test loader (non-random order) at the second position.
        Can also be a tuple of strings, where the first entry is the path to a saved trainloader and the second entry the path to a saved testloader.
        In this case the dataloaders will be loaded by torch.load(PATH).
        If None, the default dataloaders will be used
    initial_clustering_class : ClusterMixin
        clustering class to obtain the initial cluster labels after the pretraining.
        If it is None, random labels will be chosen
    initial_clustering_params : dict
        parameters for the initial clustering class
    device : torch.device
        The device on which to perform the computations
    random_state : np.random.RandomState
        use a fixed random state to get a repeatable solution
    neural_network_class : torch.nn.Module
        The neural network class that should be used (default: FeedforwardAutoencoder)
    neural_network_params : dict
        Parameters to be used when creating a new neural network using the neural_network_class (default: None)
    neural_network_weights : str
        Path to a file containing the state_dict of the neural_network (default: None)

    Returns
    -------
    tuple : (torch.device, torch.utils.data.DataLoader, torch.utils.data.DataLoader, int, torch.nn.Module, np.ndarray, int, np.ndarray, np.ndarray, ClusterMixin)
        The device,
        The trainloader,
        The testloader,
        The batch size (can be different from input if another value is used within custom_dataloader),
        The pretrained neural network,
        The embedded data,
        The number of clusters (can change if e.g. DBSCAN is used),
        The initial cluster labels,
        The initial cluster centers,
        The clustering object
    """
    device = detect_device(device)
    loaders_and_batch_size = get_train_and_test_dataloader(X, batch_size, custom_dataloaders)
    trainloader, testloader, batch_size = loaders_and_batch_size
    # Collect everything the pretraining needs besides the trainloader
    pretraining_kwargs = {
        "n_epochs": pretrain_epochs,
        "optimizer_params": pretrain_optimizer_params,
        "optimizer_class": optimizer_class,
        "device": device,
        "ssl_loss_fn": ssl_loss_fn,
        "embedding_size": embedding_size,
        "neural_network": neural_network,
        "neural_network_class": neural_network_class,
        "neural_network_params": neural_network_params,
        "neural_network_weights": neural_network_weights,
        "random_state": random_state,
    }
    neural_network = get_trained_network(trainloader, **pretraining_kwargs)
    # The initial clustering is executed in the embedded space of the testloader data
    embedded_data = encode_batchwise(testloader, neural_network)
    clustering_result = run_initial_clustering(embedded_data, n_clusters, initial_clustering_class,
                                               initial_clustering_params, random_state)
    n_clusters, init_labels, init_centers, init_cluster_obj = clustering_result
    return (device, trainloader, testloader, batch_size, neural_network, embedded_data, n_clusters, init_labels,
            init_centers, init_cluster_obj)