Source code for clustpy.metrics.hierarchical_metrics

from clustpy.hierarchical._cluster_tree import BinaryClusterTree, _ClusterTreeNode
from clustpy.metrics.confusion_matrix import ConfusionMatrix
from clustpy.metrics._metrics_utils import _check_labels_arrays
import numpy as np


[docs]def node_purity(node: _ClusterTreeNode, labels_true: np.ndarray, labels_pred: np.ndarray) -> float:
    """
    Calculate the purity of this node within a Cluster Tree.
    A leaf with no assigned points receives a purity score of 0.

    Parameters
    ----------
    node: _ClusterTreeNode
        The node of a clustering tree
    labels_true : np.ndarray
        The ground truth labels of the data set
    labels_pred : np.ndarray
        The labels as predicted by a clustering algorithm

    Returns
    -------
    node_purity : float
        The node purity
    """
    labels_true, labels_pred = _check_labels_arrays(labels_true, labels_pred)
    samples_in_leaf = np.isin(labels_pred, node.labels)
    if np.any(samples_in_leaf):
        sizes_gt_matches = np.unique(labels_true[samples_in_leaf], return_counts=True)[1]
        node_purity = sizes_gt_matches.max() / samples_in_leaf.sum()
    else:
        node_purity = 0.
    return node_purity


[docs]def leaf_purity(
    tree: BinaryClusterTree, labels_true: np.ndarray, labels_pred: np.ndarray
) -> float:
    """
    Calculates the leaf purity of the tree.
    The leaf purity is equal to a weighted average of the maximum class purity across all leaves.
    Uses labels from leafs in the tree to identify the most frequent ground truth class and weights the score by the size of the leaf.
    If each label contains a single label, this is equal to the standard purity metric.

    Parameters
    ----------
    tree : BinaryClusterTree
        The clustering tree
    labels_true : np.ndarray
        The ground truth labels of the data set
    labels_pred : np.ndarray
        The labels as predicted by a clustering algorithm

    Returns
    -------
    leaf_purity : float
        The leaf purity

    References
    -------
    Mautz, Dominik, Claudia Plant, and Christian Böhm. "Deepect: The deep embedded cluster tree."
    Data Science and Engineering 5 (2020): 419-432.
    """
    cm = ConfusionMatrix(labels_true, labels_pred)
    leaf_nodes, _ = tree.get_leaf_and_split_nodes()
    leaf_purity = 0
    for leaf_node in leaf_nodes:
        relevant_columns = np.isin(cm.pred_clusters, leaf_node.labels)
        column_sum = cm.confusion_matrix[:, relevant_columns].sum(1)
        leaf_purity += column_sum.max()
    leaf_purity = leaf_purity / len(labels_true)
    return leaf_purity


[docs]def dendrogram_purity(
    dendrogram: np.ndarray | BinaryClusterTree, labels_true: np.ndarray, labels_pred: np.ndarray | None = None
) -> float:
    """
    Calculates the dendrogram purity of the tree.

    Parameters
    ----------
    dendrogram: np.ndarray | BinaryClusterTree
        BinaryClusterTree or sklearn/scipy-style dendrogram (e.g. AgglomerativeClustering -> children_)
    labels_true : np.ndarray
        The ground truth labels of the data set
    labels_pred : np.ndarray | None
        The labels as predicted by a clustering algorithm.
        If None, we assume a full tree with each leaf node defining its own cluster containing a single point (default: None)

    Returns
    -------
    dendrogram_purity : float
        The dendrogram purity

    References
    -------
    Heller, Katherine A., and Zoubin Ghahramani. "Bayesian hierarchical clustering."
    Proceedings of the 22nd international conference on Machine learning. 2005.

    or

    Kobren, Ari, et al. "A hierarchical algorithm for extreme clustering."
    Proceedings of the 23rd ACM SIGKDD international conference on knowledge discovery and data mining. 2017.
    """
    if labels_pred is None:
        labels_pred = np.arange(labels_true.shape[0])
    labels_true, labels_pred = _check_labels_arrays(labels_true, labels_pred)
    if type(dendrogram) is BinaryClusterTree:
        # Transform ClusterTree to sklearn dendrogram
        dendrogram = dendrogram.export_sklearn_dendrogram()
    num_true_classes = labels_true.max() + 1
    num_base_clusters =  dendrogram.shape[0] + 1
    parent_matrix = _get_parent_matrix(dendrogram=dendrogram)
    # counts_per_class contains for every true class label the number of datapoints
    # with this label for each base class. For a full hierarchy this is either 0 or 1.
    # For an overclustering, larger values are possible.
    counts_per_class = ConfusionMatrix(labels_true, labels_pred, (num_true_classes, num_base_clusters)).confusion_matrix

    # parent_matrix[i,:] contains a 1 at position j if j is a descendant of i.
    # the matrix multiplication thus sums up how many original nodes per GT class are at each
    # node in the tree.
    node_counts = counts_per_class @ parent_matrix.T

    purity = 0.0
    total_pairs = 0

    for true_label_index in range(num_true_classes):
        # within-leaf purities (always 0 for complete hierarchies)
        for base_cluster_index in range(num_base_clusters):
            class_count = counts_per_class[true_label_index, base_cluster_index]
            sum_counts_per_class = np.sum(counts_per_class[:, base_cluster_index])
            if sum_counts_per_class > 0:
                leaf_purity = class_count / sum_counts_per_class
                weight = class_count * (class_count - 1) / 2
                purity += leaf_purity * weight
                total_pairs += weight
        # purity where inner nodes of the dendrogram serve as LCA
        for index, (child1, child2) in enumerate(dendrogram[:, :2]):
            counts_child1 = node_counts[true_label_index, int(child1)]
            counts_child2 = node_counts[true_label_index, int(child2)]
            weight = counts_child1 * counts_child2

            class_count = node_counts[true_label_index, index + num_base_clusters]
            sum_node_counts = np.sum(node_counts[:, index + num_base_clusters])
            if sum_node_counts > 0:
                node_purity = class_count / sum_node_counts

                purity += node_purity * weight
                total_pairs += weight

    purity /= total_pairs
    return purity


def _get_parent_matrix(dendrogram: np.ndarray) -> np.ndarray:
    """
    The parent matrix stores for each cluster which of the basic elements/clusters form it.

    Parameters
    ----------
    dendrogram: np.ndarray
        sklearn/scipy-style dendrogram

    Returns
    -------
    parent_matrix : np.ndarray
        The parent matrix
    """
    num_base_clusters = dendrogram.shape[0] + 1
    # each line of the dendrogram corresponds to a cluster, plus we need the initial clusters
    parent_matrix = np.zeros(
        (dendrogram.shape[0] + num_base_clusters, num_base_clusters), dtype=np.int8
    )
    # each initial cluster consists just of itself
    parent_matrix[:num_base_clusters] = np.eye(num_base_clusters)
    for i, (child1, child2) in enumerate(dendrogram[:, :2]):
        parent_matrix[i + num_base_clusters] = parent_matrix[int(child1)] + parent_matrix[int(child2)]
    assert np.max(parent_matrix) == 1, "input dendrogram does not correspond to a tree"
    assert np.all(parent_matrix[-1] == 1), "input dendrogram does not correspond to a tree"
    return parent_matrix