Source code for clustpy.partition.ldakmeans

"""
authors:
Collin Leiber
"""

from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.cluster import KMeans
from sklearn.metrics import normalized_mutual_info_score as nmi
from sklearn.decomposition import PCA
import numpy as np
from scipy.linalg import eigh
from clustpy.alternative.nrkmeans import _update_centers_and_scatter_matrix
from sklearn.utils import check_random_state


def _lda_kmeans(X: np.ndarray, n_clusters: int, n_dims: int, max_iter: int, kmeans_repetitions: int,
                random_state: np.random.RandomState) -> (np.ndarray, np.ndarray, np.ndarray, float):
    """
    Start the actual LDA-Kmeans clustering procedure on the input data set.

    Parameters
    ----------
    X : np.ndarray
        the given data set
    n_clusters : int
        the number of clusters
    n_dims : int
        The number of features in the resulting subspace
    max_iter : int
        the maximum number of iterations
    kmeans_repetitions : int
            Number of repetitions when executing KMeans. For more information see sklearn.cluster.KMeans (default: 10)
    random_state : np.random.RandomState
        use a fixed random state to get a repeatable solution

    Returns
    -------
    tuple : (np.ndarray, np.ndarray, np.ndarray, float)
        The labels as identified by LDAKmeans,
        The final rotation matrix,
        The cluster centers in the subspace,
        The final error
    """
    assert n_clusters > 1, "n_clusters must be larger than 1"
    assert max_iter > 0, "max_iter must be larger than 0"
    if n_dims >= X.shape[1]:
        km = KMeans(n_clusters, n_init=kmeans_repetitions, random_state=random_state)
        km.fit(X)
        return km.labels_, np.identity(X.shape[1]), km.cluster_centers_, km.inertia_
    # Check if labels stay the same (break condition)
    old_labels = None
    # Global parameters
    global_mean = np.mean(X, axis=0)
    centered_points = X - global_mean
    St = np.matmul(centered_points.T, centered_points) / (X.shape[0] - 1)
    # Get initial rotation
    pca = PCA(n_dims)
    pca.fit(X)
    rotation = pca.components_.T
    # Repeat actions until convergence or max_iter
    for iteration in range(max_iter):
        # Update labels
        X_subspace = np.matmul(X, rotation)
        km = KMeans(n_clusters, n_init=kmeans_repetitions, random_state=random_state)
        km.fit(X_subspace)
        # Check if labels have not changed
        if old_labels is not None and nmi(km.labels_, old_labels) == 1:
            break
        else:
            old_labels = km.labels_.copy()
        # Update subspace
        _, scatter = _update_centers_and_scatter_matrix(X, n_clusters, km.labels_)
        Sw = scatter / (X.shape[0] - 1)
        Sb = St - Sw
        try:
            _, eigen_vectors = eigh(Sb, Sw)
            # Take the eigenvectors with largest eigenvalues
            rotation = eigen_vectors[:, ::-1][:, :n_dims]
        except:
            # In case errors occur during eigenvalue decomposition keep algorithm running
            pass
    return km.labels_, rotation, km.cluster_centers_, km.inertia_


[docs]class LDAKmeans(BaseEstimator, ClusterMixin): """ Execute the LDA-Kmeans clustering procedure. The initial rotation are normally the (n_clusters-1) components of a PCA. Afterward, Kmeans and LDA are executed one after the other until the labels do not change anymore. Kmeans always takes place in the rotated subspace. Parameters ---------- n_clusters : int the number of clusters n_dims : int The number of features in the resulting subspace. If None this will be equal to n_clusters - 1 (default: None) max_iter : int the maximum number of iterations (default: 300) n_init : int number of times LDAKmeans is executed using different seeds. The final result will be the one with lowest costs (default: 1) kmeans_repetitions : int Number of repetitions when executing KMeans. For more information see sklearn.cluster.KMeans (default: 10) random_state : np.random.RandomState | int use a fixed random state to get a repeatable solution. Can also be of type int (default: None) Attributes ---------- labels_ : np.ndarray The final labels rotation_ : np.ndarray The final rotation matrix cluster_centers_ = np.ndarray The cluster centers in the subspace error_ : float The final error (KMeans error in the subspace) References ------- Ding, Chris, and Tao Li. "Adaptive dimension reduction using discriminant analysis and k-means clustering." Proceedings of the 24th international conference on Machine learning. 2007. """ def __init__(self, n_clusters: int, n_dims: int = None, max_iter: int = 300, n_init: int = 1, kmeans_repetitions: int = 10, random_state: np.random.RandomState | int = None): self.n_clusters = n_clusters self.n_dims = n_clusters - 1 if n_dims is None else n_dims self.max_iter = max_iter self.n_init = n_init self.kmeans_repetitions = kmeans_repetitions self.random_state = check_random_state(random_state)
[docs] def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'LDAKmeans': """ Initiate the actual clustering process on the input data set. The resulting cluster labels are contained in the labels_ attribute. Parameters ---------- X : np.ndarray the given data set y : np.ndarray the labels (can be ignored) Returns ------- self : LDAKmeans this instance of the LDAKmeans algorithm """ all_random_states = self.random_state.choice(10000, self.n_init, replace=False) # Get best result best_costs = np.inf for i in range(self.n_init): local_random_state = check_random_state(all_random_states[i]) labels, rotation, centers, error = _lda_kmeans(X, self.n_clusters, self.n_dims, self.max_iter, self.kmeans_repetitions, local_random_state) if error < best_costs: best_costs = error # Update class variables self.labels_ = labels self.rotation_ = rotation self.cluster_centers_ = centers self.error_ = error return self
[docs] def transform_subspace(self, X: np.ndarray) -> np.ndarray: """ Transform the input dataset with the rotation matrix identified by the fit function. Parameters ---------- X : np.ndarray the given data set Returns ------- rotated_data : np.ndarray The rotated data set """ assert hasattr(self, "labels_"), "The LDA-Kmeans algorithm has not run yet. Use the fit() function first." rotated_data = np.matmul(X, self.rotation_) return rotated_data