Source code for clustpy.deep.vade

"""
@authors:
Donatella Novakovic,
Lukas Miklautz,
Collin Leiber
"""

import torch
from clustpy.deep._utils import encode_batchwise, get_device_from_module
from clustpy.deep._train_utils import get_default_deep_clustering_initialization
from clustpy.deep.neural_networks.variational_autoencoder import VariationalAutoencoder, _vae_sampling
from clustpy.deep._abstract_deep_clustering_algo import _AbstractDeepClusteringAlgo
import numpy as np
from sklearn.mixture import GaussianMixture
from sklearn.base import ClusterMixin
import tqdm
from collections.abc import Callable


def _vade(X: np.ndarray, n_clusters: int, batch_size: int, pretrain_optimizer_params: dict,
          clustering_optimizer_params: dict, pretrain_epochs: int, clustering_epochs: int,
          optimizer_class: torch.optim.Optimizer, ssl_loss_fn: torch.nn.modules.loss._Loss,
          neural_network: torch.nn.Module | tuple, neural_network_weights: str,
          embedding_size: int, clustering_loss_weight: float, ssl_loss_weight: float,
          custom_dataloaders: tuple, initial_clustering_class: ClusterMixin, initial_clustering_params: dict,
          device: torch.device, random_state: np.random.RandomState) -> (
        np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, torch.nn.Module):
    """
    Start the actual VaDE clustering procedure on the input data set.

    Parameters
    ----------
    X : np.ndarray / torch.Tensor
        the given data set. Can be a np.ndarray or a torch.Tensor
    n_clusters : int
        number of clusters. Can be None if a corresponding initial_clustering_class is given, that can determine the number of clusters, e.g. DBSCAN
    batch_size : int
        size of the data batches
    pretrain_optimizer_params : dict
        parameters of the optimizer for the pretraining of the neural network, includes the learning rate
    clustering_optimizer_params : dict
        parameters of the optimizer for the actual clustering procedure, includes the learning rate
    pretrain_epochs : int
        number of epochs for the pretraining of the neural network
    clustering_epochs : int
        number of epochs for the actual clustering procedure
    optimizer_class : torch.optim.Optimizer
        the optimizer class
    ssl_loss_fn : torch.nn.modules.loss._Loss
         self-supervised learning (ssl) loss function for training the network, e.g. reconstruction loss for autoencoders
    neural_network : torch.nn.Module | tuple
        the input neural network.
        Can also be a tuple consisting of the neural network class (torch.nn.Module) and the initialization parameters (dict)
    neural_network_weights : str
        Path to a file containing the state_dict of the neural_network.
    embedding_size : int
        size of the embedding within the neural network (central layer with mean and variance)
    clustering_loss_weight : float
        weight of the clustering loss
    ssl_loss_weight : float
        weight of the self-supervised learning (ssl) loss
    custom_dataloaders : tuple
        tuple consisting of a trainloader (random order) at the first and a test loader (non-random order) at the second position.
        Can also be a tuple of strings, where the first entry is the path to a saved trainloader and the second entry the path to a saved testloader.
        In this case the dataloaders will be loaded by torch.load(PATH).
        If None, the default dataloaders will be used
    initial_clustering_class : ClusterMixin
        clustering class to obtain the initial cluster labels after the pretraining
    initial_clustering_params : dict
        parameters for the initial clustering class
    device : torch.device
        The device on which to perform the computations
    random_state : np.random.RandomState
        use a fixed random state to get a repeatable solution

    Returns
    -------
    tuple : (np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, torch.nn.Module)
        The labels as identified by a final Gaussian Mixture Model,
        The cluster centers as identified by a final Gaussian Mixture Model,
        The covariance matrices as identified by a final Gaussian Mixture Model,
        The labels as identified by VaDE after the training terminated,
        The cluster centers as identified by VaDE after the training terminated,
        The covariance matrices as identified by VaDE after the training terminated,
        The final neural network
    """
    # Get initial setting (device, dataloaders, pretrained AE and initial clustering result)
    device, trainloader, testloader, _, neural_network, _, n_clusters, init_labels, init_means, init_clustering_algo = get_default_deep_clustering_initialization(
        X, n_clusters, batch_size, pretrain_optimizer_params, pretrain_epochs, optimizer_class, ssl_loss_fn,
        neural_network, embedding_size, custom_dataloaders, initial_clustering_class, initial_clustering_params, device,
        random_state, _VaDE_VAE, neural_network_weights=neural_network_weights)
    # Get parameters from initial clustering algorithm
    init_weights = None if not hasattr(init_clustering_algo, "weights_") else init_clustering_algo.weights_
    init_covs = None if not hasattr(init_clustering_algo, "covariances_") else init_clustering_algo.covariances_
    # Initialize VaDE
    vade_module = _VaDE_Module(n_clusters=n_clusters, embedding_size=embedding_size, weights=init_weights,
                               means=init_means, variances=init_covs).to(device)
    # Use vade learning_rate (usually pretrain_optimizer_params reduced by a magnitude of 10)
    optimizer = optimizer_class(list(neural_network.parameters()) + list(vade_module.parameters()),
                                **clustering_optimizer_params)
    # Vade Training loop
    vade_module.fit(neural_network, testloader, trainloader, clustering_epochs, device, optimizer, ssl_loss_fn,
                    clustering_loss_weight, ssl_loss_weight)
    # Get labels
    vade_labels = _vade_predict_batchwise(testloader, neural_network, vade_module)

    vade_centers = vade_module.p_mean.detach().cpu().numpy()
    vade_covariances = vade_module.p_log_var.detach().cpu().numpy()
    # Do reclustering with GMM
    embedded_data = encode_batchwise(testloader, neural_network)
    gmm = GaussianMixture(n_components=n_clusters, covariance_type='full', n_init=1, random_state=random_state,
                          means_init=vade_centers)
    gmm_labels = gmm.fit_predict(embedded_data).astype(np.int32)

    # Return results
    return gmm_labels, gmm.means_, gmm.covariances_, gmm.weights_, vade_labels, vade_centers, vade_covariances, neural_network


class _VaDE_VAE(VariationalAutoencoder):
    """
    A special variational autoencoder used for VaDE.
    Has a slightly different forward function while pretraining the autoencoder.
    Further, Loss function is more similar to the FeedforwardAutoencoder while pretraining.

    Attributes
    ----------
    encoder : FullyConnectedBlock
        encoder part of the autoencoder, responsible for embedding data points
    decoder : FullyConnectedBlock
        decoder part of the autoencoder, responsible for reconstructing data points from the embedding
    mean : torch.nn.Linear
        mean value of the central layer
    log_variance : torch.nn.Linear
        logarithmic variance of the central layer (use logarithm of variance - numerical purposes)
    fitted : bool
        indicating whether the autoencoder is already fitted
    """

    def forward(self, x: torch.Tensor) -> (torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor):
        """
        Applies both the encode and decode function.
        The forward function is automatically called if we call self(x).
        Matches forward behavior from FeedforwardAutoencoder for pretraining and from VariationalAutoencoder afterwards.
        Overwrites function from VariationalAutoencoder.

        Parameters
        ----------
        x : torch.Tensor
            input data point, can also be a mini-batch of embedded points

        Returns
        -------
        tuple : (torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor)
            sampling using q_mean and q_logvar if self.fitted=True, else None
            Mean value of the central VAE layer if self.fitted=True, else None
            Logarithmic variance value of the central VAE layer if self.fitted=True, else None
            The reconstruction of the data point
        """
        if not self.fitted:
            # While pretraining a forward method similar to a regular autoencoder (FeedforwardAutoencoder) should be used
            mean, _ = self.encode(x)
            reconstruction = self.decode(mean)
            z, q_mean, q_logvar = None, None, None
        else:
            # After pretraining the usual forward of a VAE should be used. Super() uses function from VariationalAutoencoder
            z, q_mean, q_logvar, reconstruction = super().forward(x)
        return z, q_mean, q_logvar, reconstruction

    def loss(self, batch: list, ssl_loss_fn: torch.nn.modules.loss._Loss, device: torch.device,
             corruption_fn: Callable = None, beta: float = 1) -> (torch.Tensor, torch.Tensor, torch.Tensor):
        """
        Calculate the loss of a single batch of data.
        Matches loss calculation from FeedforwardAutoencoder for pretraining and from VariationalAutoencoder afterwards.
        Overwrites function from VariationalAutoencoder.

        Parameters
        ----------
        batch: list
            the different parts of a dataloader (id, samples, ...)
        ssl_loss_fn : torch.nn.modules.loss._Loss
            self-supervised learning (ssl) loss function for training the network, e.g. reconstruction loss for autoencoders
        device : torch.device
            device to be trained on
        corruption_fn : Callable
            Can be used to corrupt the input data, e.g., when using a denoising autoencoder.
            Note that the function must match the data and the data loaders.
            For example, if the data is normalized, this may have to be taken into account in the corruption function - e.g. in case of salt and pepper noise (default: None)
        beta : float
            weighting of the KL loss. Not used at the moment (default: 1)

        Returns
        -------
        loss: (torch.Tensor, torch.Tensor, torch.Tensor)
            the ssl loss of the input sample,
            the sampling,
            the reconstruction of the data point
        """
        assert type(batch) is list, "batch must come from a dataloader and therefore be of type list"
        if not self.fitted:
            # While pretraining a loss similar to a regular autoencoder (FeedforwardAutoencoder) should be used
            batch_data = batch[1].to(device)
            batch_data_adj = batch_data if corruption_fn is None else corruption_fn(batch_data)
            z, _, _, reconstruction = self.forward(batch_data_adj)
            loss = ssl_loss_fn(reconstruction, batch_data)
        else:
            # After pretraining the usual loss of a VAE should be used. Super() uses function from VariationalAutoencoder
            loss = super().loss(batch, ssl_loss_fn, device, corruption_fn, beta)
        return loss, z, reconstruction


class _VaDE_Module(torch.nn.Module):
    """
    The _VaDE_Module. Contains most of the algorithm specific procedures.

    Parameters
    ----------
    n_clusters : int
        number of clusters
    embedding_size : int
        size of the central layer within the VAE
    weights : torch.Tensor
        the initial soft cluster assignments (default: None)
    means : torch.Tensor
        the initial means of the VAE (default: None)
    variances : torch.Tensor
        the initial variances of the VAE (default: None)

    Attributes
    ----------
    pi : torch.nn.Parameter
        the soft assignments
    p_mean : torch.nn.Parameter
        the cluster centers
    p_log_var : torch.nn.Parameter
        the variances of the clusters
    normalize_prob : torch.nn.Softmax
        torch.nn.Softmax function for the prediction method
    """

    def __init__(self, n_clusters: int, embedding_size: int, weights: torch.Tensor = None, means: torch.Tensor = None,
                 variances: torch.Tensor = None):
        super(_VaDE_Module, self).__init__()
        if weights is None:
            # if not initialized then use uniform distribution
            weights = torch.ones(n_clusters) / n_clusters
        self.pi = torch.nn.Parameter(torch.tensor(weights), requires_grad=True)
        if means is None:
            # if not initialized then use torch.randn
            means = torch.randn(n_clusters, embedding_size)
        self.p_mean = torch.nn.Parameter(torch.tensor(means), requires_grad=True)
        if variances is None:
            variances = torch.ones(n_clusters, embedding_size)
        assert variances.shape == (n_clusters,
                                   embedding_size), "Shape of the initial variances for the Vade_Module must be (n_clusters, embedding_size)"
        self.p_log_var = torch.nn.Parameter(torch.log(torch.tensor(variances)), requires_grad=True)
        self.normalize_prob = torch.nn.Softmax(dim=0)

    def predict(self, q_mean: torch.Tensor, q_logvar: torch.Tensor) -> torch.Tensor:
        """
        Predict the labels given the specific means and variances of given samples.
        Uses argmax to return a hard cluster label.

        Parameters
        ----------
        q_mean : torch.Tensor
            mean values of the central layer of the VAE
        q_logvar : torch.Tensor
            logarithmic variances of the central layer of the VAE (use logarithm of variance - numerical purposes)

        Returns
        -------
        pred: torch.Tensor
            The predicted label
        """
        z = _vae_sampling(q_mean, q_logvar)
        pi_normalized = self.normalize_prob(self.pi)
        p_c_z = _get_gamma(pi_normalized, self.p_mean, self.p_log_var, z)
        pred = torch.argmax(p_c_z, dim=1)
        return pred

    def vade_loss(self, neural_network: VariationalAutoencoder, batch_data: torch.Tensor,
                  ssl_loss_fn: torch.nn.modules.loss._Loss, clustering_loss_weight: float,
                  ssl_loss_weight: float) -> torch.Tensor:
        """
        Calculate the VaDE loss of given samples.

        Parameters
        ----------
        neural_network : VariationalAutoencoder
            the VariationalAutoencoder
        batch_data : torch.Tensor
            the samples
        ssl_loss_fn : torch.nn.modules.loss._Loss
            self-supervised learning (ssl) loss function for training the network, e.g. reconstruction loss for autoencoders
        clustering_loss_weight : float
            weight of the clustering loss
        ssl_loss_weight : float
            weight of the self-supervised learning (ssl) loss

        Returns
        -------
        loss : torch.Tensor
            returns the loss of the input samples
        """
        z, q_mean, q_logvar, reconstruction = neural_network.forward(batch_data)
        pi_normalized = self.normalize_prob(self.pi)
        p_c_z = _get_gamma(pi_normalized, self.p_mean, self.p_log_var, z)
        loss = _compute_vade_loss(pi_normalized, self.p_mean, self.p_log_var, q_mean, q_logvar, batch_data, p_c_z,
                                  reconstruction, ssl_loss_fn, clustering_loss_weight, ssl_loss_weight)
        return loss

    def fit(self, neural_network: VariationalAutoencoder, testloader: torch.utils.data.DataLoader,
            trainloader: torch.utils.data.DataLoader, n_epochs: int, device: torch.device,
            optimizer: torch.optim.Optimizer, ssl_loss_fn: torch.nn.modules.loss._Loss, clustering_loss_weight: float,
            ssl_loss_weight: float) -> '_VaDE_Module':
        """
        Trains the _VaDE_Module in place.

        Parameters
        ----------
        neural_network : VariationalAutoencoder
            The VariationalAutoencoder
        testloader : torch.utils.data.DataLoader
            dataloader used for testing (not used at the moment)
        trainloader : torch.utils.data.DataLoader
            dataloader to be used for training
        n_epochs : int
            number of epochs for the clustering procedure
        device : torch.device
            device to be trained on
        optimizer : torch.optim.Optimizer
            the optimizer
        ssl_loss_fn : torch.nn.modules.loss._Loss
            self-supervised learning (ssl) loss function for training the network, e.g. reconstruction loss for autoencoders
        clustering_loss_weight : float
            weight of the clustering loss
        ssl_loss_weight : float
            weight of the self-supervised learning (ssl) loss

        Returns
        -------
        self : _VaDE_Module
            this instance of the _VaDE_Module
        """
        # lr_decrease = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.9)
        # training loop
        tbar = tqdm.trange(n_epochs, desc="VaDE training")
        for _ in tbar:
            self.train()
            total_loss = 0
            for batch in trainloader:
                # load batch on device
                batch_data = batch[1].to(device)
                loss = self.vade_loss(neural_network, batch_data, ssl_loss_fn, clustering_loss_weight,
                                      ssl_loss_weight)
                total_loss += loss.item()
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            postfix_str = {"Loss": total_loss}
            tbar.set_postfix(postfix_str)
        return self


def _vade_predict_batchwise(dataloader: torch.utils.data.DataLoader, neural_network: VariationalAutoencoder,
                            vade_module: _VaDE_Module) -> np.ndarray:
    """
    Utility function for predicting the cluster labels over the whole data set in a mini-batch fashion

    Parameters
    ----------
    dataloader : torch.utils.data.DataLoader
        dataloader to be used
    neural_network : VariationalAutoencoder
        the VariationalAutoencoder
    vade_module : _VaDE_Module
        the _VaDE_Module

    Returns
    -------
    predictions_numpy : np.ndarray
        The cluster labels of the data set
    """
    device = get_device_from_module(neural_network)
    predictions = []
    for batch in dataloader:
        batch_data = batch[1].to(device)
        q_mean, q_logvar = neural_network.encode(batch_data)
        prediction = vade_module.predict(q_mean, q_logvar).detach().cpu()
        predictions.append(prediction)
    predictions_numpy = torch.cat(predictions, dim=0).numpy()
    return predictions_numpy


def _get_gamma(pi: torch.Tensor, p_mean: torch.Tensor, p_log_var: torch.Tensor, z: torch.Tensor) -> torch.Tensor:
    """
    Calculate the gamma of samples created by the VAE.

    Parameters
    ----------
    pi : torch.Tensor
        softmax version of the soft cluster assignments in the _VaDE_Module
    p_mean : torch.Tensor
        cluster centers of the _VaDE_Module
    p_log_var : torch.Tensor
        log variances of the _VaDE_Module
    z : torch.Tensor
        the created samples

    Returns
    -------
    p_c_z : torch.Tensor
        The gamma values
    """
    z = z.unsqueeze(1)
    p_log_var = p_log_var.unsqueeze(0)
    pi = pi.unsqueeze(0)

    p_z_c = -torch.sum(0.5 * (np.log(2 * np.pi)) + p_log_var + ((z - p_mean).pow(2) / (2. * torch.exp(p_log_var))),
                       dim=2)

    p_c_z_c = torch.exp(torch.log(pi) + p_z_c) + 1e-10
    p_c_z = p_c_z_c / torch.sum(p_c_z_c, dim=1, keepdim=True)

    return p_c_z


def _compute_vade_loss(pi: torch.Tensor, p_mean: torch.Tensor, p_log_var: torch.Tensor, q_mean: torch.Tensor,
                       q_log_var: torch.Tensor, batch_data: torch.Tensor, p_c_z: torch.Tensor,
                       reconstruction: torch.Tensor, ssl_loss_fn: torch.nn.modules.loss._Loss,
                       clustering_loss_weight: float, ssl_loss_weight: float) -> torch.Tensor:
    """
    Calculate the final loss of the input samples for the VaDE algorithm.

    Parameters
    ----------
    pi : torch.Tensor
        softmax version of the soft cluster assignments in the _VaDE_Module
    p_mean : torch.Tensor
        cluster centers of the _VaDE_Module
    p_log_var : torch.Tensor
        log variances of the _VaDE_Module
    q_mean : torch.Tensor
        mean value of the central layer of the VAE
    q_log_var : torch.Tensor
        logarithmic variance of the central layer of the VAE
    batch_data : torch.Tensor
        the samples
    p_c_z : torch.Tensor
        result of the _get_gamma function
    reconstruction : torch.Tensor
        the reconstructed version of the input samples
    ssl_loss_fn : torch.nn.modules.loss._Loss
        self-supervised learning (ssl) loss function for training the network, e.g. reconstruction loss for autoencoders
    clustering_loss_weight : float
        weight of the clustering loss
    ssl_loss_weight : float
        weight of the self-supervised learning (ssl) loss

    Returns
    -------
    loss: torch.Tensor
        Tha VaDE loss
    """
    q_mean = q_mean.unsqueeze(1)
    p_log_var = p_log_var.unsqueeze(0)

    p_x_z = ssl_loss_fn(reconstruction, batch_data)

    p_z_c = torch.sum(p_c_z * (0.5 * np.log(2 * np.pi) + 0.5 * (
            torch.sum(p_log_var, dim=2) + torch.sum(torch.exp(q_log_var.unsqueeze(1)) / torch.exp(p_log_var),
                                                    dim=2) + torch.sum((q_mean - p_mean).pow(2) / torch.exp(p_log_var),
                                                                       dim=2))))
    p_c = torch.sum(p_c_z * torch.log(pi))
    q_z_x = 0.5 * (np.log(2 * np.pi)) + 0.5 * torch.sum(1 + q_log_var)
    q_c_x = torch.sum(p_c_z * torch.log(p_c_z))

    loss = p_z_c - p_c - q_z_x + q_c_x
    loss /= batch_data.size(0)
    loss = clustering_loss_weight * loss + ssl_loss_weight * p_x_z  # Beware that we do not divide two times by number of samples
    return loss


[docs]class VaDE(_AbstractDeepClusteringAlgo):
    """
    The Variational Deep Embedding (VaDE) algorithm.
    First, an variational autoencoder (VAE) will be trained (will be skipped if input neural network is given).
    Afterward, a GMM will be fit to identify the initial clustering structures.
    Last, the VAE will be optimized using the VaDE loss function.

    Parameters
    ----------
    n_clusters : int
        number of clusters. Can be None if a corresponding initial_clustering_class is given, that can determine the number of clusters, e.g. DBSCAN
    batch_size : int
        size of the data batches (default: 256)
    pretrain_optimizer_params : dict
        parameters of the optimizer for the pretraining of the neural network, includes the learning rate (default: {"lr": 1e-3})
    clustering_optimizer_params : dict
        parameters of the optimizer for the actual clustering procedure, includes the learning rate (default: {"lr": 1e-4})
    pretrain_epochs : int
        number of epochs for the pretraining of the neural network (default: 10)
    clustering_epochs : int
        number of epochs for the actual clustering procedure (default: 150)
    optimizer_class : torch.optim.Optimizer
        the optimizer class (default: torch.optim.Adam)
    ssl_loss_fn : torch.nn.modules.loss._Loss
         self-supervised learning (ssl) loss function for training the network, e.g. reconstruction loss for autoencoders (default: torch.nn.BCELoss(reduction='sum'))
    clustering_loss_weight : float
        weight of the clustering loss (default: 1.0)
    ssl_loss_weight : float
        weight of the self-supervised learning (ssl) loss (default: 1.0)
    neural_network : torch.nn.Module | tuple
        the input neural network. If None, a new VariationalAutoencoder will be created.
        Can also be a tuple consisting of the neural network class (torch.nn.Module) and the initialization parameters (dict) (default: None)
    neural_network_weights : str
        Path to a file containing the state_dict of the neural_network (default: None)
    embedding_size : int
        size of the embedding within the neural network (central layer with mean and variance) (default: 10)
    custom_dataloaders : tuple
        tuple consisting of a trainloader (random order) at the first and a test loader (non-random order) at the second position.
        Can also be a tuple of strings, where the first entry is the path to a saved trainloader and the second entry the path to a saved testloader.
        In this case the dataloaders will be loaded by torch.load(PATH).
        If None, the default dataloaders will be used (default: None)
    initial_clustering_class : ClusterMixin
        clustering class to obtain the initial cluster labels after the pretraining (default: GaussianMixture)
    initial_clustering_params : dict
        parameters for the initial clustering class (default: {"n_init": 10, "covariance_type": "diag"})
    device : torch.device
        The device on which to perform the computations.
        If device is None then it will be automatically chosen: if a gpu is available the gpu with the highest amount of free memory will be chosen (default: None)
    random_state : np.random.RandomState | int
        use a fixed random state to get a repeatable solution. Can also be of type int (default: None)

    Attributes
    ----------
    labels_ : np.ndarray
        The labels as identified by a final Gaussian Mixture Model
    cluster_centers_ : np.ndarray
        The cluster centers as identified by a final Gaussian Mixture Model
    covariances_ : np.ndarray
        The covariance matrices as identified by a final Gaussian Mixture Model
    weights_ : np.ndarray
        The weights as identified by a final Gaussian Mixture Model
    vade_labels_ : np.ndarray
        The labels as identified by VaDE after the training terminated
    vade_cluster_centers_ : np.ndarray
        The cluster centers as identified by VaDE after the training terminated
    vade_covariances_ : np.ndarray
        The covariance matrices as identified by VaDE after the training terminated
    neural_network : torch.nn.Module
        The final neural network

    Examples
    ----------
    >>> from clustpy.data import create_subspace_data
    >>> data, labels = create_subspace_data(1500, subspace_features=(3, 50), random_state=1)
    >>> data = (data - np.mean(data)) / np.std(data)
    >>> vade = VaDE(n_clusters=3, pretrain_epochs=3, clustering_epochs=3)
    >>> vade.fit(data)

    References
    ----------
    Jiang, Zhuxi, et al. "Variational Deep Embedding: An Unsupervised and Generative Approach to Clustering." IJCAI. 2017.
    """

    def __init__(self, n_clusters: int, batch_size: int = 256, pretrain_optimizer_params: dict = None,
                 clustering_optimizer_params: dict = None, pretrain_epochs: int = 10,
                 clustering_epochs: int = 150, optimizer_class: torch.optim.Optimizer = torch.optim.Adam,
                 ssl_loss_fn: torch.nn.modules.loss._Loss = torch.nn.BCELoss(reduction='sum'),
                 clustering_loss_weight: float = 1.0, ssl_loss_weight: float = 1.0,
                 neural_network: torch.nn.Module | tuple = None, neural_network_weights: str = None,
                 embedding_size: int = 10, custom_dataloaders: tuple = None,
                 initial_clustering_class: ClusterMixin = GaussianMixture, initial_clustering_params: dict = None,
                 device: torch.device = None, random_state: np.random.RandomState | int = None):
        super().__init__(batch_size, neural_network, neural_network_weights, embedding_size, device, random_state)
        self.n_clusters = n_clusters
        self.pretrain_optimizer_params = {
            "lr": 1e-3} if pretrain_optimizer_params is None else pretrain_optimizer_params
        self.clustering_optimizer_params = {
            "lr": 1e-4} if clustering_optimizer_params is None else clustering_optimizer_params
        self.pretrain_epochs = pretrain_epochs
        self.clustering_epochs = clustering_epochs
        self.optimizer_class = optimizer_class
        self.ssl_loss_fn = ssl_loss_fn
        self.clustering_loss_weight = clustering_loss_weight
        self.ssl_loss_weight = ssl_loss_weight
        self.custom_dataloaders = custom_dataloaders
        self.initial_clustering_class = initial_clustering_class
        self.initial_clustering_params = {"n_init": 10,
                                          "covariance_type": "diag"} if initial_clustering_params is None else initial_clustering_params

[docs]    def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'VaDE':
        """
        Initiate the actual clustering process on the input data set.
        The resulting cluster labels will be stored in the labels_ attribute.

        Parameters
        ----------
        X : np.ndarray
            the given data set
        y : np.ndarray
            the labels (can be ignored)

        Returns
        -------
        self : VaDE
            this instance of the VaDE algorithm
        """
        assert type(self.ssl_loss_fn) != torch.nn.modules.loss.BCELoss or (np.min(X) >= 0 and np.max(
            X) <= 1), "Your dataset contains values that are not in the value range [0, 1]. Therefore, BCE is not a valid loss function, an alternative might be a MSE loss function."
        super().fit(X, y)
        gmm_labels, gmm_means, gmm_covariances, gmm_weights, vade_labels, vade_centers, vade_covariances, neural_network = _vade(
            X,
            self.n_clusters,
            self.batch_size,
            self.pretrain_optimizer_params,
            self.clustering_optimizer_params,
            self.pretrain_epochs,
            self.clustering_epochs,
            self.optimizer_class,
            self.ssl_loss_fn,
            self.neural_network,
            self.neural_network_weights,
            self.embedding_size,
            self.clustering_loss_weight,
            self.ssl_loss_weight,
            self.custom_dataloaders,
            self.initial_clustering_class,
            self.initial_clustering_params,
            self.device,
            self.random_state)
        self.labels_ = gmm_labels
        self.cluster_centers_ = gmm_means
        self.covariances_ = gmm_covariances
        self.weights_ = gmm_weights
        self.vade_labels_ = vade_labels
        self.vade_cluster_centers_ = vade_centers
        self.vade_covariances_ = vade_covariances
        self.neural_network = neural_network
        return self

[docs]    def predict(self, X: np.ndarray) -> np.ndarray:
        """
        Predicts the labels of the input data.

        Parameters
        ----------
        X : np.ndarray
            input data

        Returns
        -------
        predicted_labels : np.ndarray
            The predicted labels
        """
        embedded_data = self.transform(X)
        gmm = GaussianMixture(n_components=self.n_clusters, covariance_type='full')
        gmm.means_ = self.cluster_centers_
        gmm.covariances_ = self.covariances_
        gmm.weights_ = self.weights_
        gmm.precisions_cholesky_ = np.linalg.cholesky(np.linalg.inv(self.covariances_))
        predicted_labels = gmm.predict(embedded_data).astype(np.int32)
        return predicted_labels