Source code for clustpy.metrics.confusion_matrix

import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import linear_sum_assignment
from clustpy.metrics._metrics_utils import _check_labels_arrays


def _rearrange(confusion_matrix: np.ndarray) -> (np.ndarray, np.ndarray):
    """
    Rearrange the confusion matrix in such a way that the sum of the diagonal is maximized.
    Thereby, the best matching combination of labels will be shown.
    Uses the Hungarian Method to identify the best match.
    If parameter inplace is set to True, this method will change the original confusion matrix.
    Else the rearranged matrix will only be returned.

    Parameters
    ----------
    confusion_matrix : np.ndarray
        The original confusion matrix

    Returns
    -------
    rearranged_confusion_matrix : np.ndarray
        The rearranged confusion matrix
        (If number of ground truth labels is larger than the number of predicted labels, the resulting confusion matrix will be quadradic with multiple 0 columns),
        The indices regarding the rearrangement
    """
    # Change order using the Hungarian Method
    max_number_labels = max(confusion_matrix.shape)
    rearranged_confusion_matrix = np.zeros((max_number_labels, max_number_labels), dtype=confusion_matrix.dtype)
    # Linear sum assignment tries to minimize the diagonal sum -> use negative confusion_matrix
    rearranged_confusion_matrix[:confusion_matrix.shape[0], :confusion_matrix.shape[1]] = confusion_matrix
    indices = linear_sum_assignment(-rearranged_confusion_matrix)
    # Change order of the columns
    rearranged_order = indices[1]
    rearranged_confusion_matrix = rearranged_confusion_matrix[:, rearranged_order]
    rearranged_confusion_matrix = rearranged_confusion_matrix[:confusion_matrix.shape[0], :]
    # If there are more columns than rows sort remaining columns by highest value
    if confusion_matrix.shape[1] > confusion_matrix.shape[0]:
        missing_columns = np.arange(confusion_matrix.shape[0], confusion_matrix.shape[1])
        missing_columns_order = np.argsort(np.max(rearranged_confusion_matrix[:, missing_columns], axis=0))[::-1]
        rearranged_confusion_matrix[:, missing_columns] = rearranged_confusion_matrix[:, missing_columns[missing_columns_order]]
        rearranged_order[missing_columns] = rearranged_order[missing_columns[missing_columns_order]]
    return rearranged_confusion_matrix, rearranged_order


def _plot_confusion_matrix(confusion_matrix: np.ndarray, show_text: bool, row_names : list, column_names : list, figsize: tuple, cmap: str, textcolor: str,
                           vmin: float, vmax: float) -> None:
    """
    Plot the confusion matrix.

    Parameters
    ----------
    confusion_matrix : np.ndarray
        The confusion matrix to plot
    show_text : bool
        Show the value in each cell as text
    row_names : list
        List of containing the names of the rows
    column_names : list
        List of containing the names of the columns
    figsize : tuple
        Tuple indicating the height and width of the plot
    cmap : str
        Colormap used for the plot
    textcolor : str
        Color of the text. Only relevant if show_text is True
    vmin : float
        Minimum possible value within a cell of the confusion matrix.
        If None, it will be set as the minimum value within the confusion matrix.
        Used to choose the color from the colormap
    vmax : float
        Maximum possible value within a cell of the confusion matrix.
        If None, it will be set as the maximum value within the confusion matrix.
        Used to choose the color from the colormap
    """
    if len(row_names) != confusion_matrix.shape[0]:
        raise ValueError("Length of the row names list must match the number of rows (ground turth clusters) in the confusion matrix. Length is {0} and number of rows is {1}".format(len(row_names), confusion_matrix.shape[0]))
    if len(column_names) != confusion_matrix.shape[1]:
        raise ValueError("Length of the column names list must match the number of columns (predicted clusters) in the confusion matrix. Length is {0} and number of columns is {1}".format(len(column_names), confusion_matrix.shape[1]))
    fig, ax = plt.subplots(figsize=figsize)
    # Plot confusion matrix using colors
    ax.imshow(confusion_matrix, cmap=cmap, vmin=vmin, vmax=vmax)
    ax.set_xticks(np.arange(confusion_matrix.shape[1]))
    ax.set_xticklabels(column_names)
    ax.set_yticks(np.arange(confusion_matrix.shape[0]))
    ax.set_yticklabels(row_names)
    # Optional: Add text to the color cells
    if show_text:
        for i in range(confusion_matrix.shape[0]):
            for j in range(confusion_matrix.shape[1]):
                ax.text(j, i, confusion_matrix[i, j],
                        ha="center", va="center", color=textcolor)
    plt.show()


[docs]class ConfusionMatrix(): """ Create a Confusion Matrix of predicted and ground truth labels. Each row corresponds to a ground truth label and each column to a predicted label. The number in each cell (i, j) indicates how many objects with ground truth label i have been predicted label j. Parameters ---------- labels_true : np.ndarray The ground truth labels of the data set labels_pred : np.ndarray The labels as predicted by a clustering algorithm shape : tuple | str | None The desired shape of the confusion matrix. Can be "square" to encforce a squared confusion matrix (default: None) Attributes ---------- confusion_matrix : np.ndarray The confusion matrix """ def __init__(self, labels_true: np.ndarray, labels_pred: np.ndarray, shape: tuple | str | None=None): labels_true, labels_pred = _check_labels_arrays(labels_true, labels_pred) true_clusters, true_clusters_idx = np.unique(labels_true, return_inverse=True) pred_clusters, pred_clusters_idx = np.unique(labels_pred, return_inverse=True) self.true_clusters = true_clusters self.pred_clusters = pred_clusters if shape is None: shape = (len(true_clusters), len(pred_clusters)) else: if shape == "square": max_labels = max(len(true_clusters), len(pred_clusters)) shape = (max_labels, max_labels) else: assert len(shape) == 2 and shape[0] >= len(true_clusters) and shape[1] >= len(pred_clusters), f"Shape must be 'square' or a tuple containing two values such that shape[0] >= len(np.unique(labels_true)) and shape[1] >= len(np.unique(labels_pred)). Your values: shape = {shape}, len(np.unique(labels_true)) = {len(np.unique(labels_true))}, len(np.unique(labels_pred)) = {len(np.unique(labels_pred))}" # Fill unique label information (self.true_clusters and self.pred_clusters) with -2 placeholders if shape[0] > len(true_clusters): self.true_clusters = np.append(self.true_clusters, [-2] * (shape[0] - len(true_clusters))) if shape[1] > len(pred_clusters): self.pred_clusters = np.append(self.pred_clusters, [-2] * (shape[1] - len(pred_clusters))) conf_matrix = np.zeros(shape, dtype=int) np.add.at(conf_matrix, (true_clusters_idx, pred_clusters_idx), 1) self.confusion_matrix = conf_matrix def __str__(self): """ Print the confusion matrix. Returns ------- str_confusion_matrix : str The confusion matrix as a string """ str_confusion_matrix = str(self.confusion_matrix) return str_confusion_matrix
[docs] def rearrange(self, inplace: bool = True) -> np.ndarray: """ Rearrange the confusion matrix in such a way that the sum of the diagonal is maximized. Thereby, the best matching combination of labels will be shown. Uses the Hungarian Method to identify the best match. If parameter inplace is set to True, this method will change the original confusion matrix. Else the rearranged matrix will only be returned. Parameters ---------- inplace : bool Should the new confusion matrix overwrite the original one (default: True) Returns ------- rearranged_confusion_matrix : np.ndarray The rearranged confusion matrix If number of ground truth labels is larer than the number of predicted labels, the resulting confusion matrix will be quadradic with multiple 0 columns. """ rearranged_confusion_matrix, rearranged_order = _rearrange(self.confusion_matrix) if inplace: self.confusion_matrix = rearranged_confusion_matrix self.pred_clusters = self.pred_clusters[rearranged_order[:len(self.pred_clusters)]] return rearranged_confusion_matrix
[docs] def plot(self, show_text: bool = True, ground_truth_names: list | None = None, figsize: tuple = (10, 10), cmap: str = "YlGn", textcolor: str = "black", vmin: int = 0, vmax: int = None) -> None: """ Plot the confusion matrix. Parameters ---------- show_text : bool Show the value in each cell as text (default: True) ground_truth_names : list | None List of containing the names of the ground truth clusters figsize : tuple Tuple indicating the height and width of the plot (default: (10, 10)) cmap : str Colormap used for the plot (default: "YlGn") textcolor : str Color of the text. Only relevant if show_text is True (default: "black") vmin : int Minimum possible value within a cell of the confusion matrix. If None, it will be set as the minimum value within the confusion matrix. Used to choose the color from the colormap (default: 0) vmax : int Maximum possible value within a cell of the confusion matrix. If None, it will be set as the maximum value within the confusion matrix. Used to choose the color from the colormap (default: None) """ if ground_truth_names is None: ground_truth_names = self.true_clusters _plot_confusion_matrix(self.confusion_matrix, show_text, ground_truth_names, self.pred_clusters, figsize, cmap, textcolor, vmin, vmax)