Source code for clustpy.utils.plots

import matplotlib.pyplot as plt
import matplotlib
from matplotlib.colors import Colormap
from matplotlib.colors import Normalize
from sklearn.decomposition import PCA
import numpy as np
from scipy import stats
import matplotlib.patches as mpatches
from sklearn.base import TransformerMixin

"""
Constants
"""
# Marker symbols that are cycled through to distinguish the ground truth labels in the scatter plots:
# Circle, Square, Diamond, Plus, X, Triangle down, Star, Pentagon, Triangle up, Triangle right, Triangle left, Hexagon
_MARKERS = ("o", "s", "D", "P", "X", "v", "*", "p", "^", ">", "<", "h")
# Minimum number of objects a group must contain before a kernel density estimate is drawn for it
_MIN_OBJECTS_FOR_DENS_PLOT = 3


def plot_with_transformation(X: np.ndarray, labels: np.ndarray = None, centers: np.ndarray = None,
                             true_labels: np.ndarray = None, plot_dimensionality: int = 2,
                             transformation_class: TransformerMixin = PCA, show_legend: bool = True,
                             scattersize: float = 10, equal_axis: bool = False, show_plot: bool = True) -> None:
    """
    In Data Science, it is common to work with high-dimensional data.
    These cannot be visualized without further ado. Therefore, a dimensionality reduction technique is often applied
    before a plot is created. Examples for such techniques are PCA, ICA, t-SNE, UMAP, ...
    Note that the chosen technique must accept an 'n_components' argument and provide a 'fit_transform' method
    (and a 'transform' method if centers are given).
    This method automatically executes the aforementioned pipeline:
    first it reduces the dimensionality, then it creates a plot adjusted to the number of features.
    Up to three dimensions are visualized with the help of scatter plots. Then a scatter matrix plot is used.

    Parameters
    ----------
    X : np.ndarray
        the given data set
    labels : np.ndarray
        The cluster labels. Specifies the color of the plotted objects. Can be None (default: None)
    centers : np.ndarray
        The cluster centers. Will be plotted as red markers labeled by the corresponding cluster id.
        Can be None (default: None)
    true_labels : np.ndarray
        The ground truth labels. Specifies the symbol of the plotted objects. Can be None (default: None)
    plot_dimensionality : int
        The dimensionality of the feature space after the dimensionality reduction technique has been applied
        (default: 2)
    transformation_class : TransformerMixin
        The transformation class / dimensionality reduction technique (default: sklearn.decomposition.PCA)
    show_legend : bool
        Defines whether a legend should be shown (default: True)
    scattersize : float
        The size of the scatters (default: 10)
    equal_axis : bool
        Defines whether the axes are to be scaled to the same value range (default: False)
    show_plot : bool
        Defines whether the plot should directly be plotted (default: True)
    """
    assert plot_dimensionality > 0, "Plot dimensionality must be > 0"
    if X.ndim == 1:
        # A flat array can only be shown as a 1d plot
        plot_dimensionality = 1
    elif plot_dimensionality > X.shape[1]:
        print(
            "[WARNING] plot_dimensionality ({0}) is higher than the dimensionality of the input dataset ({1}). "
            "plot_dimensionality will therefore be set to {1}.".format(
                plot_dimensionality, X.shape[1]))
        plot_dimensionality = X.shape[1]
    # Check if transformation dimensionality is smaller than number of features
    elif plot_dimensionality < X.shape[1]:
        # Transform data (and the centers, so that they live in the same reduced space)
        trans = transformation_class(n_components=plot_dimensionality)
        X = trans.fit_transform(X)
        if centers is not None:
            centers = trans.transform(centers)
    # Create plot; the helpers never show the figure themselves, this is done once at the end
    if plot_dimensionality == 1:
        # 1d Plot
        plot_1d_data(X, labels=labels, centers=centers, true_labels=true_labels, show_legend=show_legend,
                     show_plot=False)
    elif plot_dimensionality == 2:
        # 2d Plot
        plot_2d_data(X, labels=labels, centers=centers, true_labels=true_labels, show_legend=show_legend,
                     scattersize=scattersize, equal_axis=equal_axis, show_plot=False)
    elif plot_dimensionality == 3:
        # 3d Plot
        plot_3d_data(X, labels=labels, centers=centers, true_labels=true_labels, show_legend=show_legend,
                     scattersize=scattersize, show_plot=False)
    else:
        # More than 3 features
        plot_scatter_matrix(X, labels=labels, centers=centers, true_labels=true_labels, scattersize=scattersize,
                            show_legend=show_legend, equal_axis=equal_axis, max_dimensions=plot_dimensionality,
                            show_plot=False)
    if show_plot:
        plt.show()
def plot_1d_data(X: np.ndarray, labels: np.ndarray = None, centers: np.ndarray = None,
                 true_labels: np.ndarray = None, show_legend: bool = True, show_plot: bool = True) -> None:
    """
    Draw a one-dimensional data set as tick marks along a horizontal line.

    Parameters
    ----------
    X : np.ndarray
        the given data set (either a flat array or a matrix with a single column)
    labels : np.ndarray
        The cluster labels. Specifies the color of the plotted objects. Can be None (default: None)
    centers : np.ndarray
        The cluster centers. Will be plotted as red crosses labeled by the corresponding cluster id.
        Can be None (default: None)
    true_labels : np.ndarray
        The ground truth labels. If given, the objects are drawn a second time on a line slightly above the first
        one, colored by these labels. Can be None (default: None)
    show_legend : bool
        Defines whether a legend should be shown (default: True)
    show_plot : bool
        Defines whether the plot should directly be plotted (default: True)
    """
    assert X.ndim == 1 or X.shape[1] == 1, "Data must be 1-dimensional"
    assert centers is None or centers.ndim == 1 or centers.shape[1] == 1, "Centers must be 1-dimensional"
    # Work on a flat view of the data
    values = X[:, 0] if X.ndim == 2 else X
    n_objects = len(values)
    lower = np.min(values)
    upper = np.max(values)
    # Base line with one tick per object
    plt.hlines(1, lower, upper)
    plt.scatter(values, np.ones(n_objects), marker='|', s=500, c=labels)
    if centers is not None:
        center_values = centers[:, 0] if centers.ndim == 2 else centers
        plt.scatter(center_values, np.ones(len(center_values)), s=300, color="red", marker="x")
        # Rank of each center along the axis; neighbouring centers get their id alternately above / below the line
        ranks = np.argsort(np.argsort(center_values))
        for center_id, (position, rank) in enumerate(zip(center_values, ranks)):
            if rank % 2 == 0:
                text_height = 1.0005
            else:
                text_height = 0.9994
            plt.text(position, text_height, str(center_id), weight="bold")
    if true_labels is not None:
        # Second line slightly above the first one showing the ground truth
        plt.hlines(1.001, lower, upper)
        plt.scatter(values, np.ones(n_objects) * 1.001, marker='|', s=500, c=true_labels)
    if labels is not None and show_legend:
        legend_labels, colormap, normalizer = _get_cmap_and_norm(labels)
        _add_legend(plt, legend_labels, colormap, normalizer)
    if show_plot:
        plt.show()
def plot_2d_data(X: np.ndarray, labels: np.ndarray = None, centers: np.ndarray = None,
                 true_labels: np.ndarray = None, cluster_ids_font_size: float = None,
                 centers_ids_font_size: float = 10, show_legend: bool = True, title: str = None,
                 scattersize: float = 10, centers_scattersize: float = 15, equal_axis: bool = False,
                 container: plt.Axes = plt, show_plot: bool = True) -> None:
    """
    Plot a two-dimensional data set.

    Parameters
    ----------
    X : np.ndarray
        the given data set
    labels : np.ndarray
        The cluster labels. Specifies the color of the plotted objects. Can be None (default: None)
    centers : np.ndarray
        The cluster centers. Will be plotted as red squares labeled by the corresponding cluster id.
        Can be None (default: None)
    true_labels : np.ndarray
        The ground truth labels. Specifies the symbol of the plotted objects. Can be None (default: None)
    cluster_ids_font_size : float
        The font size of the id of a predicted cluster, which is shown as text in the center of that cluster.
        Only relevant if labels is not None. Can be None if no id should be shown (default: None)
    centers_ids_font_size: float
        The font size of the id that is shown next to the red marker of a cluster center.
        Only relevant if centers is not None. Can be None if no id should be shown (default: 10)
    show_legend : bool
        Defines whether a legend should be shown (default: True)
    title : str
        Title of the plot (default: None)
    scattersize : float
        The size of the scatters (default: 10)
    centers_scattersize : float
        The size of the red scatters of the cluster centers (default: 15)
    equal_axis : bool
        Defines whether the axes are to be scaled to the same value range (default: False)
    container : plt.Axes
        The container to which the scatter plot is added.
        If another container is defined, show_plot should usually be False (default: matplotlib.pyplot)
    show_plot : bool
        Defines whether the plot should directly be plotted (default: True)
    """
    assert X.ndim == 2 and X.shape[1] == 2, "Data must be 2-dimensional"
    if true_labels is None:
        container.scatter(X[:, 0], X[:, 1], c=labels, s=scattersize)
    else:
        # All partial scatters must share the same color range, otherwise each call would normalize its colors
        # separately. Without labels there is nothing to normalize.
        if labels is None:
            vmin = vmax = None
        else:
            vmin, vmax = np.min(labels), np.max(labels)
        # Change marker for true labels
        for lab_index, true_lab in enumerate(np.unique(true_labels)):
            marker = _MARKERS[lab_index % len(_MARKERS)]
            mask = true_labels == true_lab
            container.scatter(X[mask, 0], X[mask, 1], s=scattersize,
                              c=None if labels is None else labels[mask],
                              marker=marker, vmin=vmin, vmax=vmax)
    if cluster_ids_font_size is not None and labels is not None:
        # Write the id of each predicted cluster at the mean position of its objects (into the given container)
        for pred_lab in np.unique(labels):
            mean_position = np.mean(X[labels == pred_lab], axis=0)
            container.text(mean_position[0], mean_position[1], str(pred_lab), fontsize=cluster_ids_font_size)
    if centers is not None:
        container.scatter(centers[:, 0], centers[:, 1], s=centers_scattersize, color="red", marker="s")
        if centers_ids_font_size is not None:
            for j, center in enumerate(centers):
                container.text(center[0], center[1], str(j), weight="bold", fontsize=centers_ids_font_size)
    if equal_axis:
        container.axis("equal")
    if show_legend and labels is not None:
        unique_labels, cmap, norm = _get_cmap_and_norm(labels)
        _add_legend(container, unique_labels, cmap, norm)
    if title is not None:
        # An Axes object sets its title via set_title(), the pyplot module via title()
        if hasattr(container, "set_title"):
            container.set_title(title)
        else:
            container.title(title)
    if show_plot:
        # Only the pyplot module offers show(); an Axes container does not
        plt.show()
def plot_3d_data(X: np.ndarray, labels: np.ndarray = None, centers: np.ndarray = None,
                 true_labels: np.ndarray = None, show_legend: bool = True, scattersize: float = 10,
                 show_plot: bool = True) -> None:
    """
    Plot a three-dimensional data set. A new figure with a 3d axis is created for the plot.

    Parameters
    ----------
    X : np.ndarray
        the given data set
    labels : np.ndarray
        The cluster labels. Specifies the color of the plotted objects. Can be None (default: None)
    centers : np.ndarray
        The cluster centers. Will be plotted as red squares labeled by the corresponding cluster id.
        Can be None (default: None)
    true_labels : np.ndarray
        The ground truth labels. Specifies the symbol of the plotted objects. Can be None (default: None)
    show_legend : bool
        Defines whether a legend should be shown (default: True)
    scattersize : float
        The size of the scatters (default: 10)
    show_plot : bool
        Defines whether the plot should directly be plotted (default: True)
    """
    # Both conditions must hold: a matrix with exactly three columns
    assert X.ndim == 2 and X.shape[1] == 3, "Data must be 3-dimensional"
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    if true_labels is None:
        ax.scatter(X[:, 0], X[:, 1], zs=X[:, 2], zdir='z', s=scattersize, c=labels, alpha=0.8)
    else:
        # All partial scatters must share the same color range. Without labels there is nothing to normalize.
        if labels is None:
            vmin = vmax = None
        else:
            vmin, vmax = np.min(labels), np.max(labels)
        # Change marker for true labels
        for lab_index, true_lab in enumerate(np.unique(true_labels)):
            marker = _MARKERS[lab_index % len(_MARKERS)]
            mask = true_labels == true_lab
            ax.scatter(X[mask, 0], X[mask, 1], zs=X[mask, 2], zdir='z', s=scattersize,
                       c=None if labels is None else labels[mask],
                       marker=marker, vmin=vmin, vmax=vmax, alpha=0.8)
    if centers is not None:
        ax.scatter(centers[:, 0], centers[:, 1], zs=centers[:, 2], zdir='z', s=scattersize * 1.5, color="red",
                   marker="s")
        for j, center in enumerate(centers):
            ax.text(center[0], center[1], center[2], str(j), weight="bold")
    if show_legend and labels is not None:
        unique_labels, cmap, norm = _get_cmap_and_norm(labels)
        _add_legend(fig, unique_labels, cmap, norm)
    if show_plot:
        plt.show()
def plot_image(img_data: np.ndarray, black_and_white: bool = False, image_shape: tuple = None,
               is_color_channel_last: bool = False, max_value: float = None, min_value: float = None,
               show_plot: bool = True) -> None:
    """
    Plot an image.
    Color image should occur in the HWC representation (height, width, color channels) if is_color_channel_last is
    True and in the CHW if is_color_channel_last is False.

    Parameters
    ----------
    img_data : np.ndarray
        The image data
    black_and_white : bool
        Specifies whether the image should be plotted in grayscale colors.
        Only relevant for images without color channels (default: False)
    image_shape : tuple
        (height, width) for grayscale images or HWC (height, width, color channels) / CHW for color images.
        Must be specified if img_data is a flattened (1-dimensional) array (default: None)
    is_color_channel_last : bool
        if true, the color channels should be in the last dimension, known as HWC representation.
        Alternatively the color channel can be at the first position, known as CHW representation.
        Only relevant for color images (default: False)
    max_value : float
        maximum pixel value, used for min-max normalization.
        Is often 255, if None the maximum value in the data set will be used (default: None)
    min_value : float
        minimum pixel value, used for min-max normalization.
        Is often 0, if None the minimum value in the data set will be used (default: None)
    show_plot : bool
        Defines whether the plot should directly be plotted (default: True)

    Raises
    ------
    ValueError
        If img_data is 1-dimensional and no image_shape is given

    Examples
    ----------
    >>> from clustpy.data import load_nrletters, load_optdigits
    >>> X = load_nrletters().data
    >>> plot_image(X[0], False, (9, 7, 3), True, 255, 0, show_plot=True)
    >>> X = load_optdigits().data
    >>> plot_image(X[0], True, (8, 8), None, 255, 0, show_plot=True)
    """
    assert img_data.ndim <= 3, "Image data can not have more than 3 dimensions."
    if img_data.ndim == 1 and image_shape is None:
        # A flattened image can not be displayed without knowing its original shape
        raise ValueError("image_shape must be specified if img_data is a flattened (1-dimensional) array")
    # Data range must match float between [0..1] or int between [0..255] -> use min-max transform
    if max_value is None:
        max_value = np.max(img_data)
    if min_value is None:
        min_value = np.min(img_data)
    value_range = max_value - min_value
    if value_range == 0:
        # Constant image: min-max scaling would divide by zero, so map everything to 0
        img_data = np.zeros(img_data.shape, dtype=float)
    else:
        # Scale image to [0, 1]
        img_data = (img_data - min_value) / value_range
    # Reshape array data
    if img_data.ndim == 1:
        img_data = img_data.reshape(image_shape)
    if img_data.ndim == 3 and not is_color_channel_last:
        # Reshape image to HWC representation
        img_data = np.transpose(img_data, (1, 2, 0))
    # Plot original image or a black-and-white version
    if black_and_white:
        plt.imshow(img_data, cmap="Greys")
    else:
        plt.imshow(img_data)
    plt.axis('off')
    if show_plot:
        plt.show()
def plot_histogram(X: np.ndarray, labels: np.ndarray = None, density: bool = True, n_bins: int = 100,
                   show_legend: bool = True, container: plt.Axes = plt, show_plot: bool = True) -> None:
    """
    Plot a histogram.

    Parameters
    ----------
    X : np.ndarray
        the given data set
    labels : np.ndarray
        The cluster labels. Specifies the color of the plotted objects. Can be None (default: None)
    density : bool
        Defines whether a kernel density should be added to the histogram.
        The density of a group is skipped if the group contains too few objects or if all of its values are
        identical (default: True)
    n_bins : int
        Number of bins (default: 100)
    show_legend : bool
        Defines whether the legend of the histogram should be shown (default: True)
    container : plt.Axes
        The container to which the histogram is added.
        If another container is defined, show_plot should usually be False (default: matplotlib.pyplot)
    show_plot : bool
        Defines whether the plot should directly be plotted (default: True)
    """
    assert X.ndim == 1, "Data must be 1-dimensional"
    # All histograms share the same value range so that their bins are aligned
    value_range = (np.min(X), np.max(X))
    # Collect the groups of objects together with the color they are drawn in (None = default color)
    if labels is not None:
        unique_labels, cmap, norm = _get_cmap_and_norm(labels)
        # Get common label colors for histogram and density
        groups = [(X[labels == lab], cmap(norm(lab))) for lab in unique_labels]
    else:
        groups = [(X, None)]
    # Plot histogram
    for group_objects, group_color in groups:
        color_kwargs = {} if group_color is None else {"color": group_color}
        container.hist(group_objects, alpha=0.5, bins=n_bins, range=value_range, **color_kwargs)
    # Plot densities
    if density:
        # Histogram and density should share same x-axis
        twin_axis = container.twinx()
        twin_axis.yaxis.set_visible(False)
        for group_objects, group_color in groups:
            if group_objects.shape[0] < _MIN_OBJECTS_FOR_DENS_PLOT:
                continue
            group_min, group_max = np.min(group_objects), np.max(group_objects)
            if group_min == group_max:
                # gaussian_kde can not handle data without variance (singular covariance matrix)
                continue
            kde = stats.gaussian_kde(group_objects)
            steps = np.linspace(group_min, group_max, 1000)
            color_kwargs = {} if group_color is None else {"color": group_color}
            twin_axis.plot(steps, kde(steps), **color_kwargs)
    if show_legend and labels is not None:
        _add_legend(container, unique_labels, cmap, norm)
    if show_plot:
        plt.show()
def plot_scatter_matrix(X: np.ndarray, labels: np.ndarray = None, centers: np.ndarray = None,
                        true_labels: np.ndarray = None, density: bool = True, n_bins: int = 100,
                        show_legend: bool = True, scattersize: float = 10, equal_axis: bool = False,
                        max_dimensions: int = 10, show_plot: bool = True) -> plt.Axes:
    """
    Create a scatter matrix plot.
    Visualizes a 2d scatter plot for each combination of features.
    The center axis shows a histogram of each single feature.

    Parameters
    ----------
    X : np.ndarray
        the given data set
    labels : np.ndarray
        The cluster labels. Specifies the color of the plotted objects. Can be None (default: None)
    centers : np.ndarray
        The cluster centers. Will be plotted as red markers labeled by the corresponding cluster id.
        Can be None (default: None)
    true_labels : np.ndarray
        The ground truth labels. Specifies the symbol of the plotted objects. Can be None (default: None)
    density : bool
        Defines whether a kernel density should be added to the histogram (default: True)
    n_bins : int
        Number of bins used for the histogram (default: 100)
    show_legend : bool
        Defines whether a legend should be shown (default: True)
    scattersize : float
        The size of the scatters (default: 10)
    equal_axis : bool
        Defines whether the axes are to be scaled to the same value range (default: False)
    max_dimensions : int
        Maximum Number of dimensions that should be plotted.
        This value is intended to prevent the creation of overly complex plots that are very confusing and take a
        long time to create. If the data set has more features, a warning is printed and nothing is plotted
        (default: 10)
    show_plot : bool
        Defines whether the plot should directly be plotted (default: True)

    Returns
    -------
    axes : plt.Axes
        None if show_plot is True or if the creation was aborted, otherwise the used matplotlib axes
    """
    if X.ndim == 1:
        # Treat a flat array as a data set with a single feature
        X = X.reshape(-1, 1)
    n_features = X.shape[1]
    if n_features > max_dimensions:
        print(
            "[WARNING] Dimensionality of the dataset ({0}) is larger than {1}. "
            "Creation of scatter matrix plot will be aborted.".format(n_features, max_dimensions))
        return None
    # For single dimension only plot histogram
    if n_features == 1:
        plot_histogram(X[:, 0], labels, density, n_bins, show_legend, show_plot=show_plot)
        if not show_plot:
            return plt.gca()
        return None
    # Get unique labels and the colors used for the common legend
    if labels is not None:
        unique_labels, cmap, norm = _get_cmap_and_norm(labels)
    # Create subplots
    if equal_axis:
        fig, axes = plt.subplots(nrows=n_features, ncols=n_features, sharey="all", sharex="all")
    else:
        fig, axes = plt.subplots(nrows=n_features, ncols=n_features, sharey="row", sharex="col")
    fig.subplots_adjust(hspace=0.05, wspace=0.05)
    for i in range(n_features):
        for j in range(n_features):
            ax = axes[i, j]
            if i == j:
                # Histogram plot
                if i != 0:
                    ax.yaxis.set_visible(False)
                if i != n_features - 1:
                    ax.xaxis.set_visible(False)
                # Second plot for actual histogram (use container), so that the histogram counts do not
                # influence the y-axis that is shared with the scatter plots
                twin_axis = ax.twinx()
                twin_axis.yaxis.set_visible(False)
                plot_histogram(X[:, i], labels, density, n_bins, show_legend=False, container=twin_axis,
                               show_plot=False)
            else:
                # Scatter plot (use container); feature j is on the x-axis and feature i on the y-axis
                local_centers = None if centers is None else centers[:, [j, i]]
                plot_2d_data(X[:, [j, i]], labels, local_centers, true_labels, show_legend=False,
                             scattersize=scattersize, equal_axis=False, container=ax, show_plot=False)
    if show_legend and labels is not None:
        _add_legend(fig, unique_labels, cmap, norm)
    if show_plot:
        plt.show()
        return None
    return axes
def _add_legend(container: plt.Axes, unique_labels: np.ndarray, cmap: Colormap, norm: Normalize) -> None:
    """
    Helper function that attaches a legend with one colored patch per label to a container.

    Parameters
    ----------
    container : plt.Axes
        The container to which the legend is added.
    unique_labels : np.ndarray
        The unique labels that should be displayed in the legend
    cmap : Colormap
        the colormap
    norm : Normalize
        The Normalize object to pick the correct color
    """
    legend_handles = []
    for lab in unique_labels:
        # Same color lookup as used for the plotted objects
        lab_color = cmap(norm(lab))
        legend_handles.append(mpatches.Patch(color=lab_color, label=lab))
    container.legend(handles=legend_handles, loc="center right")


def _get_cmap_and_norm(labels: np.ndarray, min_max: tuple = None) -> (np.ndarray, Colormap, Normalize):
    """
    Helper function that derives the unique labels, the colormap and the matching Normalize object.

    Parameters
    ----------
    labels : np.ndarray
        The cluster labels
    min_max : tuple
        Tuple containing the minimum and maximum cluster label for coloring the plot.
        If None, the smallest and largest label are used (default: None)

    Returns
    -------
    tuple : (np.ndarray, Colormap, Normalize)
        The unique labels ids,
        The colormap,
        The Normalize object to pick the correct color
    """
    unique_labels = np.unique(labels)
    if min_max is None:
        # np.unique returns sorted values, so the first / last entries are the extremes
        lower, upper = unique_labels[0], unique_labels[-1]
    else:
        lower, upper = min_max[0], min_max[1]
    assert lower <= upper, "First value in min_max must be smaller or equal to second value"
    # Manage colormap
    colormap = matplotlib.colormaps['viridis']
    normalizer = Normalize(vmin=lower, vmax=upper)
    return unique_labels, colormap, normalizer