Source code for clustpy.metrics.clustering_metrics
import numpy as np
from scipy.optimize import linear_sum_assignment
from clustpy.metrics.confusion_matrix import ConfusionMatrix
from scipy.special import comb
def _check_number_of_points(labels_true: np.ndarray, labels_pred: np.ndarray) -> bool:
"""
Check if the length of the ground truth labels and the prediction labels match.
If they do not match throw an exception.
Parameters
----------
labels_true : np.ndarray
The ground truth labels of the data set
labels_pred : np.ndarray
The labels as predicted by a clustering algorithm
Returns
-------
boolean : bool
True if execution was successful
"""
if labels_pred.shape[0] != labels_true.shape[0]:
raise Exception(
"Number of objects of the prediction and ground truth are not equal.\nNumber of prediction objects: " + str(
labels_pred.shape[0]) + "\nNumber of ground truth objects: " + str(labels_true.shape[0]))
return True
[docs]def variation_of_information(labels_true: np.ndarray, labels_pred: np.ndarray) -> float:
"""
Calculate the variation of information between the ground truth labels and the predicted labels.
Returns a minimum value of 0.0 which corresponds to a perfect match.
Implemented as defined in https://en.wikipedia.org/wiki/Variation_of_information
Parameters
----------
labels_true : np.ndarray
The ground truth labels of the data set
labels_pred : np.ndarray
The labels as predicted by a clustering algorithm
Returns
-------
vi : float
The variation of information
References
-------
Meilă, Marina. "Comparing clusterings by the variation of information."
Learning theory and kernel machines. Springer, Berlin, Heidelberg, 2003. 173-187.
"""
_check_number_of_points(labels_true, labels_pred)
n = len(labels_true)
cluster_ids_true = np.unique(labels_true)
cluster_ids_pred = np.unique(labels_pred)
result = 0.0
for id_true in cluster_ids_true:
points_in_cluster_gt = np.argwhere(labels_true == id_true)[:, 0]
p = len(points_in_cluster_gt) / n
for id_pred in cluster_ids_pred:
points_in_cluster_pred = np.argwhere(labels_pred == id_pred)[:, 0]
q = len(points_in_cluster_pred) / n
r = len([point for point in points_in_cluster_gt if point in points_in_cluster_pred]) / n
if r != 0:
result += r * (np.log(r / p) + np.log(r / q))
vi = -1 * result
return vi
[docs]def unsupervised_clustering_accuracy(labels_true: np.ndarray, labels_pred: np.ndarray) -> float:
"""
Evaluate the quality of predicted labels by comparing it to the ground truth labels using the
clustering accuracy.
Returns a value between 1.0 (perfect match) and 0.0 (arbitrary result).
Since the id of a cluster is not fixed in a clustering setting, the clustering accuracy evaluates each possible
combination with the ground truth labels.
Parameters
----------
labels_true : np.ndarray
The ground truth labels of the data set
labels_pred : np.ndarray
The labels as predicted by a clustering algorithm
Returns
-------
acc : float
The accuracy between the two input label sets.
References
-------
Yang, Yi, et al. "Image clustering using local discriminant models and global integration."
IEEE Transactions on Image Processing 19.10 (2010): 2761-2773.
"""
_check_number_of_points(labels_true, labels_pred)
max_label = int(max(labels_pred.max(), labels_true.max()) + 1)
match_matrix = np.zeros((max_label, max_label), dtype=np.int64)
for i in range(labels_true.shape[0]):
match_matrix[int(labels_true[i]), int(labels_pred[i])] -= 1
indices = linear_sum_assignment(match_matrix)
acc = -np.sum(match_matrix[indices]) / labels_pred.size
return acc
[docs]def information_theoretic_external_cluster_validity_measure(labels_true: np.ndarray, labels_pred: np.ndarray,
scale: bool = True) -> float:
"""
Evaluate the quality of predicted labels by comparing it to the ground truth labels using the
Information-Theoretic External Cluster-Validity Measure. Often simply called DOM.
A lower value indicates a better clustering result.
If the result is scaled, this method will return a value between 1.0 (perfect match) and 0.0 (arbitrary result).
An advantage of this metric is that it also works with a differing number of clusters.
Parameters
----------
labels_true : np.ndarray
The ground truth labels of the data set
labels_pred : np.ndarray
The labels as predicted by a clustering algorithm
scale : bool
Scale the result to (0, 1], where 1 indicates a perfect match and 0 indicates an arbitrary result (default: True)
Returns
-------
dom : float
The validity between the two input label sets.
References
-------
Byron E. Dom. 2002. "An information-theoretic external cluster-validity measure."
In Proceedings of the Eighteenth conference on Uncertainty in artificial intelligence (UAI'02).
"""
_check_number_of_points(labels_true, labels_pred)
# Build confusion matrix
cm = ConfusionMatrix(labels_true, labels_pred)
n_points = labels_true.shape[0]
n_classes = cm.confusion_matrix.shape[0]
# Get number of objects per predicted label
hks = np.sum(cm.confusion_matrix, axis=0)
# Calculate Q_0
empirical_conditional_entropy = cm.confusion_matrix / n_points * np.log(cm.confusion_matrix / hks)
empirical_conditional_entropy = - np.sum(empirical_conditional_entropy[~np.isnan(empirical_conditional_entropy)])
sum_binom_coefficient = np.sum([np.log(comb(hk + n_classes - 1, n_classes - 1)) for hk in hks])
Q_0 = empirical_conditional_entropy + sum_binom_coefficient / n_points
if scale:
# --- Scale Q_0 to (0, 1] ---
# Get number of objects per ground truth label
hcs = np.sum(cm.confusion_matrix, axis=1)
# Calculate Q_2
min_Q_0 = np.sum([np.log(comb(hc + n_classes - 1, n_classes - 1)) for hc in hcs]) / n_points
entropy_H_C = -np.sum([hc / n_points * np.log(hc / n_points) for hc in hcs])
max_Q_0 = entropy_H_C + np.log(n_classes)
assert Q_0 >= min_Q_0 and Q_0 <= max_Q_0, "Q_0 must be between min_Q_0 and max_Q_0"
# Scale Q_0 to receive final value
Q_0 = (max_Q_0 - Q_0) / (max_Q_0 - min_Q_0)
return Q_0