Source code for clustpy.metrics.internal_clustering_metrics

from sklearn.neighbors import NearestNeighbors
from scipy.spatial.distance import pdist
import numpy as np
from clustpy.metrics._metrics_utils import _check_length_data_and_labels


def cvnn_score(X: np.ndarray, labels: np.ndarray | list | tuple, n_neighbors: int = 5,
               metric: str = "euclidean") -> float | np.ndarray:
    """
    Evaluate the quality of predicted labels by computing the clustering validation index based on
    nearest neighbors (CVNN).

    The score is the sum of a nearest-neighbor-based cluster separation value and a cluster
    compactness value based on inner-cluster distances.
    Usually, it is used with a list of label arrays, i.e., labels is of type list or tuple.
    In this case, each of the two components is divided by its maximum over all label arrays, so
    every returned score lies within [0, 2]. A component whose maximum is zero (it is zero for every
    label array) contributes zero to all scores.
    If labels is a single array (of type np.ndarray) a single score is returned that is not normalized.
    In both cases, a lower value indicates a better clustering result (fewer neighbors in separate
    clusters and lower inner-cluster distances).

    Parameters
    ----------
    X : np.ndarray
        The data set
    labels : np.ndarray | list | tuple
        The labels as predicted by a clustering algorithm.
        If labels is a list/tuple it should contain multiple labels arrays of type np.ndarray
    n_neighbors : int
        The amount of neighbors to consider when calculating the cluster separation score.
        An object is not considered its own neighbor (default: 5)
    metric : str
        The metric used to identify the neighbors and to calculate the inner-cluster distance.
        See scipy.spatial.distance.pdist for more information (default: 'euclidean')

    Returns
    -------
    cvnn : float | np.ndarray
        The cvnn score of type float if labels contains a single labels array, i.e., labels is of type np.ndarray.
        Alternatively, a np.ndarray containing one normalized cvnn score per labels array.

    Raises
    ------
    ValueError
        If labels is neither a list/tuple nor a np.ndarray

    References
    ----------
    Liu, Yanchi, et al. "Understanding and enhancement of internal clustering validation measures."
    IEEE transactions on cybernetics 43.3 (2013): 982-994.
    """

    def _internal_cvnn_score(X: np.ndarray, labels: np.ndarray, nrbs_indices: np.ndarray,
                             metric: str) -> tuple[float, float]:
        """
        The real calculation method of the CVNN score.

        Parameters
        ----------
        X : np.ndarray
            The data set
        labels : np.ndarray
            The given labels
        nrbs_indices : np.ndarray
            The indices of the nearest neighbors for each point. Has shape n_samples x n_neighbors
        metric : str
            The metric used to calculate the inner-cluster distance.

        Returns
        -------
        tuple : (float, float)
            The cluster separation and cluster compactness value
        """
        X, labels = _check_length_data_and_labels(X, labels)
        assert isinstance(labels, np.ndarray), "labels must be of type np.nddary. Your input has type {0}".format(type(labels))
        unique_clusters = np.unique(labels)
        # Neighbor weights: for each point, the fraction of its neighbors with a different label
        n_neighbors_not_in_cluster = (labels.reshape((-1, 1)) != labels[nrbs_indices]).mean(1)
        cluster_separation_scores = np.zeros(unique_clusters.shape[0])
        cluster_compactness_scores = np.zeros(unique_clusters.shape[0])
        # Do per-cluster calculations
        for i, c in enumerate(unique_clusters):
            in_cluster = (labels == c)
            # Separation: mean of the neighbor weights of the points in the cluster
            cluster_separation_scores[i] = n_neighbors_not_in_cluster[in_cluster].mean()
            # Compactness: mean of the pair-wise distances in the cluster.
            # A cluster with a single point has no pairs and keeps its initial value 0
            X_in_cluster = X[in_cluster]
            if X_in_cluster.shape[0] > 1:
                cluster_compactness_scores[i] = pdist(X_in_cluster, metric=metric).mean()
        # Separation is the worst (largest) cluster value, compactness the sum over all clusters
        return cluster_separation_scores.max(), cluster_compactness_scores.sum()

    # Compute nearest neighbors once; they depend only on X and are shared by all label arrays.
    # kneighbors() without a query excludes each point from its own neighbor list
    nbrs = NearestNeighbors(n_neighbors=n_neighbors, metric=metric).fit(X)
    _, nrbs_indices = nbrs.kneighbors()
    # Get CVNN
    if isinstance(labels, (list, tuple)):
        # Calculate cluster separation and cluster compactness for each labels array in the list
        n_labels = len(labels)
        cluster_separations = np.zeros(n_labels)
        cluster_compactnesses = np.zeros(n_labels)
        for i, l in enumerate(labels):
            cluster_separations[i], cluster_compactnesses[i] = _internal_cvnn_score(X, l, nrbs_indices, metric)
        # Normalize each component by its maximum. A maximum of zero means the component is zero
        # for every labels array, so it is skipped instead of dividing by zero. The result is
        # always an array with one entry per labels array
        max_cluster_separations = cluster_separations.max()
        max_cluster_compactnesses = cluster_compactnesses.max()
        cvnn = np.zeros(n_labels)
        if max_cluster_separations != 0:
            cvnn += cluster_separations / max_cluster_separations
        if max_cluster_compactnesses != 0:
            cvnn += cluster_compactnesses / max_cluster_compactnesses
    elif isinstance(labels, np.ndarray):
        # Do not normalize scores
        cluster_separation, cluster_compactness = _internal_cvnn_score(X, labels, nrbs_indices, metric)
        cvnn = cluster_separation + cluster_compactness
    else:
        raise ValueError("The labels must be of type list/tuple (indicating a list of different labels) or np.ndarray (indicating a single labels array). Your input is {0}".format(type(labels)))
    return cvnn