Source code for clustpy.metrics.internal_clustering_metrics

from sklearn.neighbors import NearestNeighbors
from scipy.spatial.distance import pdist
import numpy as np
from clustpy.metrics._metrics_utils import _check_length_data_and_labels


def cvnn_score(X: np.ndarray, labels: np.ndarray | list | tuple, n_neighbors: int = 5, metric: str = "euclidean") -> float | np.ndarray:
    """
    Evaluate the quality of predicted labels by computing the clustering validation index based on nearest neighbors (CVNN).
    The score is calculated by adding a nearest-neighbor-based cluster separation value to a cluster compactness value based on inner-cluster distances.
    Usually, it is used with a list of label arrays, i.e., labels is of type list or tuple.
    In this case, the separation and the compactness values are each divided by their maximum over all labels arrays,
    so every resulting score lies within [0, 2].
    A component whose maximum is 0 (i.e., it is 0 for every labels array) contributes 0 to each score.
    If labels is a single array (of type np.ndarray) a single score is returned that is not normalized.
    In both cases, a lower value indicates a better clustering result (fewer neighbors in separate clusters and lower inner-cluster distances).

    Parameters
    ----------
    X : np.ndarray
        The data set
    labels : np.ndarray | list | tuple
        The labels as predicted by a clustering algorithm. If labels is a list/tuple it should contain multiple labels arrays of type np.ndarray
    n_neighbors : int
        The amount of neighbors to consider when calculating the cluster separation score. An object is not considered its own neighbor (default: 5)
    metric : str
        The metric used to identify the neighbors and to calculate the inner-cluster distance.
        See scipy.spatial.distance.pdist for more information (default: 'euclidean')

    Returns
    -------
    cvnn : float | np.ndarray
        The cvnn score of type float if labels contains a single labels array, i.e., labels is of type np.ndarray.
        Alternatively, a np.ndarray containing one normalized cvnn score per labels array.

    Raises
    ------
    ValueError
        If labels is neither a list/tuple nor a np.ndarray

    References
    ----------
    Liu, Yanchi, et al. "Understanding and enhancement of internal clustering validation measures."
    IEEE transactions on cybernetics 43.3 (2013): 982-994.
    """
    def _internal_cvnn_score(X: np.ndarray, labels: np.ndarray, nrbs_indices: np.ndarray, metric: str) -> tuple[float, float]:
        """
        The real calculation method of the CVNN score.

        Parameters
        ----------
        X : np.ndarray
            The data set
        labels : np.ndarray
            The given labels
        nrbs_indices : np.ndarray
            The indices of the nearest neighbors for each point. Has shape n_samples x n_neighbors
        metric : str
            The metric used to calculate the inner-cluster distance.

        Returns
        -------
        tuple : (float, float)
            The cluster separation and cluster compactness value
        """
        X, labels = _check_length_data_and_labels(X, labels)
        assert isinstance(labels, np.ndarray), "labels must be of type np.ndarray. Your input has type {0}".format(type(labels))
        unique_clusters = np.unique(labels)
        # Neighbor weights: for each point, the fraction of its neighbors that carry a different label
        n_neighbors_not_in_cluster = (labels.reshape((-1, 1)) != labels[nrbs_indices]).mean(1)
        cluster_separation_scores = np.zeros(unique_clusters.shape[0])
        cluster_compactness_scores = np.zeros(unique_clusters.shape[0])
        # Do per-cluster calculations
        for i, c in enumerate(unique_clusters):
            in_cluster = (labels == c)
            # Calculate separation (mean of neighbor weights in cluster)
            cluster_separation_scores[i] = n_neighbors_not_in_cluster[in_cluster].mean()
            # Calculate compactness (mean of pair-wise distances in cluster).
            # A cluster with a single point has no pairs, so its compactness stays 0
            X_in_cluster = X[in_cluster]
            if X_in_cluster.shape[0] > 1:
                cluster_compactness_scores[i] = pdist(X_in_cluster, metric=metric).mean()
        # Separation is the worst (largest) per-cluster value, compactness the sum over all clusters
        return cluster_separation_scores.max(), cluster_compactness_scores.sum()

    def _normalize_by_max(values: np.ndarray) -> np.ndarray:
        """
        Divide values by their maximum. If the maximum is 0, an array of zeros is returned instead of dividing by zero.
        """
        max_value = values.max()
        if max_value == 0:
            return np.zeros_like(values)
        return values / max_value

    # Reject unsupported label containers before doing the (expensive) neighbor search
    if not isinstance(labels, (list, tuple, np.ndarray)):
        raise ValueError("The labels must be of type list/tuple (indicating a list of different labels) or np.ndarray (indicating a single labels array). Your input is {0}".format(type(labels)))
    # Compute nearest neighbors once; they only depend on X and are shared by all labels arrays.
    # kneighbors() is called without a query set so that a point is not returned as its own neighbor
    nbrs = NearestNeighbors(n_neighbors=n_neighbors, metric=metric).fit(X)
    _, nrbs_indices = nbrs.kneighbors()
    # Get CVNN
    if isinstance(labels, np.ndarray):
        # Single labels array => do not normalize scores
        cluster_separation, cluster_compactness = _internal_cvnn_score(X, labels, nrbs_indices, metric)
        return cluster_separation + cluster_compactness
    # Calculate cluster separation and cluster compactness for each labels array in the list
    n_labels = len(labels)
    cluster_separations = np.zeros(n_labels)
    cluster_compactnesses = np.zeros(n_labels)
    for i, l in enumerate(labels):
        cluster_separations[i], cluster_compactnesses[i] = _internal_cvnn_score(X, l, nrbs_indices, metric)
    # Normalize both components independently, so that a component that is 0 everywhere
    # does not wipe out the information contained in the other component
    cvnn = _normalize_by_max(cluster_separations) + _normalize_by_max(cluster_compactnesses)
    return cvnn