# Source code for clustpy.metrics.hierarchical_metrics

from clustpy.hierarchical._cluster_tree import BinaryClusterTree
from clustpy.metrics import purity
import numpy as np


def leaf_purity(labels_true: np.ndarray, labels_pred: np.ndarray, tree: BinaryClusterTree) -> float:
    """
    Compute the purity of the flat clustering induced by the leaf nodes of a cluster tree.

    Each sample is re-labeled with the index of the leaf node whose labels contain the sample's predicted label.
    Samples whose predicted label is not contained in any leaf node keep the label -1.
    The re-labeled clustering is then scored with clustpy.metrics.purity.

    Parameters
    ----------
    labels_true : np.ndarray
        The ground truth labels of the data set
    labels_pred : np.ndarray
        The labels as predicted by a clustering algorithm
    tree : BinaryClusterTree
        The clustering tree

    Returns
    -------
    leaf_purity : float
        The leaf purity

    References
    ----------
    Mautz, Dominik, Claudia Plant, and Christian Böhm. "Deepect: The deep embedded cluster tree."
    Data Science and Engineering 5 (2020): 419-432.
    """
    leaves, _split_nodes = tree.get_leaf_and_split_nodes()
    # Start with every sample unassigned (-1); the array is float-valued, as in -np.ones(...)
    leaf_assignment = np.full(labels_pred.shape[0], -1.0)
    for leaf_index, leaf in enumerate(leaves):
        samples_in_leaf = np.isin(labels_pred, leaf.labels)
        leaf_assignment[samples_in_leaf] = leaf_index
    return purity(labels_true, leaf_assignment)


def dendrogram_purity(labels_true: np.ndarray, labels_pred: np.ndarray, tree: BinaryClusterTree) -> float:
    """
    Calculates the dendrogram purity of the tree.

    For every pair of samples that share a ground truth label, the least common ancestor of their two
    predicted cluster labels is looked up in the tree. The purity of that ancestor is the fraction of the
    samples assigned to it (i.e., samples whose predicted label is contained in the ancestor's labels) that
    belong to the ground truth cluster of the pair. The dendrogram purity is the mean of these purities over
    all same-class pairs. Pairs are counted per combination of predicted labels instead of per sample pair.

    Parameters
    ----------
    labels_true : np.ndarray
        The ground truth labels of the data set. Array-likes (e.g., lists) are converted to arrays
    labels_pred : np.ndarray
        The labels as predicted by a clustering algorithm. Array-likes (e.g., lists) are converted to arrays
    tree : BinaryClusterTree
        The clustering tree

    Returns
    -------
    dendrogram_purity : float
        The dendrogram purity.
        NaN if no ground truth cluster contains at least two samples, because no same-class pair exists

    References
    ----------
    Heller, Katherine A., and Zoubin Ghahramani. "Bayesian hierarchical clustering."
    Proceedings of the 22nd international conference on Machine learning. 2005.

    or

    Kobren, Ari, et al. "A hierarchical algorithm for extreme clustering."
    Proceedings of the 23rd ACM SIGKDD international conference on knowledge discovery and data mining. 2017.
    """
    # Element-wise comparison and boolean masking below only work on arrays, not on plain lists
    labels_true = np.asarray(labels_true)
    labels_pred = np.asarray(labels_pred)
    cluster_ids_true, cluster_sizes_true = np.unique(labels_true, return_counts=True)
    # Number of sample pairs that share a ground truth label: sum over clusters of n * (n - 1) / 2
    total_per_label_pairs_count = np.sum(cluster_sizes_true * (cluster_sizes_true - 1)) / 2
    if total_per_label_pairs_count == 0:
        # The mean over an empty set of pairs is undefined; return NaN explicitly instead of dividing 0 by 0
        return float("nan")
    purity_sum = 0.0
    for id_true in cluster_ids_true:
        points_in_true_cluster = (labels_true == id_true)
        pred_labels_in_cluster, pred_labels_counts_in_cluster = np.unique(labels_pred[points_in_true_cluster],
                                                                          return_counts=True)
        for i, id_pred_1 in enumerate(pred_labels_in_cluster):
            for j in range(i, len(pred_labels_in_cluster)):
                id_pred_2 = pred_labels_in_cluster[j]
                if i == j:
                    # Get all pairs with same cluster label
                    occurrences_of_pair = pred_labels_counts_in_cluster[i] * (pred_labels_counts_in_cluster[i] - 1) / 2
                else:
                    # Get all pairs with different cluster label
                    occurrences_of_pair = pred_labels_counts_in_cluster[i] * pred_labels_counts_in_cluster[j]
                if occurrences_of_pair == 0:
                    # A single sample forms no pair, so the tree lookup and the mask computation can be skipped
                    continue
                ancestor_labels = tree.get_least_common_ancestor(id_pred_1, id_pred_2).labels
                contained_in_ancestor_labels = np.isin(labels_pred, ancestor_labels)
                intersection_size = np.sum(contained_in_ancestor_labels & points_in_true_cluster)
                purity_sum += occurrences_of_pair * (intersection_size / np.sum(contained_in_ancestor_labels))
    return float(purity_sum / total_per_label_pairs_count)