Source code for clustpy.utils.plots

import matplotlib.pyplot as plt
import matplotlib.cm as cm
from matplotlib.colors import Colormap
from matplotlib.colors import Normalize
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
from scipy import stats
import matplotlib.patches as mpatches
from sklearn.base import TransformerMixin

"""
Constants
"""
# Matplotlib marker codes, in order:
# Circle, Square, Diamond, Plus (filled), X (filled), Triangle down, Star, Pentagon, Triangle up, Triangle right,
# Triangle left, Hexagon.
# plot_2d_data and plot_3d_data assign one marker per unique ground truth label and cycle through this tuple
# (index modulo its length) when there are more labels than markers.
_MARKERS = ("o", "s", "D", "P", "X", "v", "*", "p", "^", ">", "<", "h")
# plot_histogram only draws a kernel density curve for a group that contains at least this many objects.
_MIN_OBJECTS_FOR_DENS_PLOT = 3


def plot_with_transformation(X: np.ndarray, labels: np.ndarray = None, centers: np.ndarray = None,
                             true_labels: np.ndarray = None, plot_dimensionality: int = 2,
                             transformation_class: TransformerMixin = PCA, show_legend: bool = True,
                             scattersize: int = 10, equal_axis: bool = False, show_plot: bool = True) -> None:
    """
    Reduce the dimensionality of a data set and plot the result.

    High-dimensional data cannot be visualized directly, so a dimensionality reduction technique
    (e.g. PCA, ICA, t-SNE, UMAP, ...) is applied first. The chosen technique must accept an 'n_components'
    argument and offer a 'fit_transform' method (and a 'transform' method if centers are given).
    Afterwards the plot matching the resulting number of features is created:
    one, two and three features use the dedicated 1d / 2d / 3d plots, more features use a scatter matrix plot.

    Parameters
    ----------
    X : np.ndarray
        the given data set
    labels : np.ndarray
        The cluster labels. Specifies the color of the plotted objects. Can be None (default: None)
    centers : np.ndarray
        The cluster centers. Will be plotted as red dots labeled by the corresponding cluster id.
        Can be None (default: None)
    true_labels : np.ndarray
        The ground truth labels. Specifies the symbol of the plotted objects. Can be None (default: None)
    plot_dimensionality : int
        The dimensionality of the feature space after the dimensionality reduction technique has been applied
        (default: 2)
    transformation_class : TransformerMixin
        The transformation class / dimensionality reduction technique (default: sklearn.decomposition.PCA)
    show_legend : bool
        Defines whether a legend should be shown (default: True)
    scattersize : float
        The size of the scatters (default: 10)
    equal_axis : bool
        Defines whether the axes are to be scaled to the same value range (default: False)
    show_plot : bool
        Defines whether the plot should directly be plotted (default: True)
    """
    assert plot_dimensionality > 0, "Plot dimensionality must be > 0"
    if X.ndim == 1:
        # A flat array can only be shown as a 1d plot
        plot_dimensionality = 1
    else:
        n_features = X.shape[1]
        if plot_dimensionality > n_features:
            # Cannot plot more dimensions than the data set offers
            print(
                "[WARNING] plot_dimensionality ({0}) is higher than the dimensionaliyty of the input dataset ({1}). "
                "plot_dimensionality will therefore be set to {1}.".format(
                    plot_dimensionality, n_features))
            plot_dimensionality = n_features
        elif plot_dimensionality < n_features:
            # Project data (and centers) into the lower-dimensional space
            reducer = transformation_class(n_components=plot_dimensionality)
            X = reducer.fit_transform(X)
            if centers is not None:
                centers = reducer.transform(centers)
    # Arguments shared by all plot types; the figure is shown once at the very end
    shared_kwargs = dict(labels=labels, centers=centers, true_labels=true_labels, show_legend=show_legend,
                         show_plot=False)
    if plot_dimensionality == 1:
        plot_1d_data(X, **shared_kwargs)
    elif plot_dimensionality == 2:
        plot_2d_data(X, scattersize=scattersize, equal_axis=equal_axis, **shared_kwargs)
    elif plot_dimensionality == 3:
        plot_3d_data(X, scattersize=scattersize, **shared_kwargs)
    else:
        # More than 3 features
        plot_scatter_matrix(X, scattersize=scattersize, equal_axis=equal_axis,
                            max_dimensions=plot_dimensionality, **shared_kwargs)
    if show_plot:
        plt.show()


def plot_1d_data(X: np.ndarray, labels: np.ndarray = None, centers: np.ndarray = None,
                 true_labels: np.ndarray = None, show_legend: bool = True, show_plot: bool = True) -> None:
    """
    Plot a one-dimensional data set as tick marks on a horizontal line.

    If ground truth labels are given, a second line is drawn slightly above the first one, colored by those labels.

    Parameters
    ----------
    X : np.ndarray
        the given data set
    labels : np.ndarray
        The cluster labels. Specifies the color of the plotted objects. Can be None (default: None)
    centers : np.ndarray
        The cluster centers. Will be plotted as red crosses labeled by the corresponding cluster id.
        Can be None (default: None)
    true_labels : np.ndarray
        The ground truth labels. Specifies the color of the objects on the second line. Can be None (default: None)
    show_legend : bool
        Defines whether a legend should be shown (default: True)
    show_plot : bool
        Defines whether the plot should directly be plotted (default: True)
    """
    assert X.ndim == 1 or X.shape[1] == 1, "Data must be 1-dimensional"
    assert centers is None or centers.ndim == 1 or centers.shape[1] == 1, "Centers must be 1-dimensional"
    # Work on a flat array
    if X.ndim == 2:
        X = X[:, 0]
    n_objects = len(X)
    lower_bound = np.min(X)
    upper_bound = np.max(X)
    # Base line with one tick per object
    plt.hlines(1, lower_bound, upper_bound)
    plt.scatter(X, np.ones(n_objects), marker='|', s=500, c=labels)
    if centers is not None:
        if centers.ndim == 2:
            centers = centers[:, 0]
        plt.scatter(centers, np.ones(len(centers)), s=300, color="red", marker="x")
        # Rank of each center along the axis; alternate the text position (above / below the line)
        # between neighboring centers
        ranks = np.argsort(np.argsort(centers))
        for center_id, (position, rank) in enumerate(zip(centers, ranks)):
            text_height = 1.0005 if rank % 2 == 0 else 0.9994
            plt.text(position, text_height, str(center_id), weight="bold")
    if true_labels is not None:
        # Second line for the ground truth
        plt.hlines(1.001, lower_bound, upper_bound)
        plt.scatter(X, np.ones(n_objects) * 1.001, marker='|', s=500, c=true_labels)
    if show_legend and labels is not None:
        unique_labels, cmap, norm = _get_cmap_and_norm(labels)
        _add_legend(plt, unique_labels, cmap, norm)
    if show_plot:
        plt.show()


def plot_2d_data(X: np.ndarray, labels: np.ndarray = None, centers: np.ndarray = None,
                 true_labels: np.ndarray = None, show_legend: bool = True, scattersize: int = 10,
                 equal_axis: bool = False, container: plt.Axes = plt, show_plot: bool = True) -> None:
    """
    Plot a two-dimensional data set.

    Parameters
    ----------
    X : np.ndarray
        the given data set. Must be a 2d array with exactly two columns
    labels : np.ndarray
        The cluster labels. Specifies the color of the plotted objects. Can be None (default: None)
    centers : np.ndarray
        The cluster centers. Will be plotted as red squares labeled by the corresponding cluster id.
        Can be None (default: None)
    true_labels : np.ndarray
        The ground truth labels. Specifies the symbol of the plotted objects. Can be None (default: None)
    show_legend : bool
        Defines whether a legend should be shown (default: True)
    scattersize : float
        The size of the scatters (default: 10)
    equal_axis : bool
        Defines whether the axes are to be scaled to the same value range (default: False)
    container : plt.Axes
        The container to which the scatter plot is added.
        If another container is defined, show_plot should usually be False (default: matplotlib.pyplot)
    show_plot : bool
        Defines whether the plot should directly be plotted (default: True)
    """
    # Both conditions must hold: a 2d array with exactly two feature columns
    assert X.ndim == 2 and X.shape[1] == 2, "Data must be 2-dimensional"
    if true_labels is None:
        container.scatter(X[:, 0], X[:, 1], c=labels, s=scattersize)
    else:
        # All per-marker scatter calls must share one color range, else equal labels get different colors.
        # Without labels there is no range to fix.
        if labels is None:
            vmin = vmax = None
        else:
            vmin, vmax = np.min(labels), np.max(labels)
        # Change marker for true labels
        for lab_index, true_lab in enumerate(np.unique(true_labels)):
            mask = true_labels == true_lab
            container.scatter(X[mask, 0], X[mask, 1], s=scattersize,
                              c=None if labels is None else labels[mask],
                              marker=_MARKERS[lab_index % len(_MARKERS)], vmin=vmin, vmax=vmax)
    if centers is not None:
        container.scatter(centers[:, 0], centers[:, 1], s=scattersize * 1.5, color="red", marker="s")
        for center_id, center in enumerate(centers):
            container.text(center[0], center[1], str(center_id), weight="bold")
    if equal_axis:
        container.axis("equal")
    if show_legend and labels is not None:
        unique_labels, cmap, norm = _get_cmap_and_norm(labels)
        _add_legend(container, unique_labels, cmap, norm)
    if show_plot:
        # Always go through pyplot: an Axes container has no show() method
        plt.show()


def plot_3d_data(X: np.ndarray, labels: np.ndarray = None, centers: np.ndarray = None,
                 true_labels: np.ndarray = None, show_legend: bool = True, scattersize: int = 10,
                 show_plot: bool = True) -> None:
    """
    Plot a three-dimensional data set.

    The plot is drawn into a newly created figure.

    Parameters
    ----------
    X : np.ndarray
        the given data set. Must be a 2d array with exactly three columns
    labels : np.ndarray
        The cluster labels. Specifies the color of the plotted objects. Can be None (default: None)
    centers : np.ndarray
        The cluster centers. Will be plotted as red squares labeled by the corresponding cluster id.
        Can be None (default: None)
    true_labels : np.ndarray
        The ground truth labels. Specifies the symbol of the plotted objects. Can be None (default: None)
    show_legend : bool
        Defines whether a legend should be shown (default: True)
    scattersize : float
        The size of the scatters (default: 10)
    show_plot : bool
        Defines whether the plot should directly be plotted (default: True)
    """
    # Both conditions must hold: a 2d array with exactly three feature columns
    assert X.ndim == 2 and X.shape[1] == 3, "Data must be 3-dimensional"
    fig = plt.figure()
    # Create the 3d axes through the figure so that they are registered with it.
    # Instantiating Axes3D(fig) directly does not add the axes to the figure in recent matplotlib versions.
    ax = fig.add_subplot(111, projection='3d')
    if true_labels is None:
        ax.scatter(X[:, 0], X[:, 1], zs=X[:, 2], zdir='z', s=scattersize, c=labels, alpha=0.8)
    else:
        # All per-marker scatter calls must share one color range, else equal labels get different colors.
        # Without labels there is no range to fix.
        if labels is None:
            vmin = vmax = None
        else:
            vmin, vmax = np.min(labels), np.max(labels)
        # Change marker for true labels
        for lab_index, true_lab in enumerate(np.unique(true_labels)):
            mask = true_labels == true_lab
            ax.scatter(X[mask, 0], X[mask, 1], zs=X[mask, 2], zdir='z', s=scattersize,
                       c=None if labels is None else labels[mask],
                       marker=_MARKERS[lab_index % len(_MARKERS)], vmin=vmin, vmax=vmax, alpha=0.8)
    if centers is not None:
        ax.scatter(centers[:, 0], centers[:, 1], zs=centers[:, 2], zdir='z', s=scattersize * 1.5, color="red",
                   marker="s")
        for center_id, center in enumerate(centers):
            ax.text(center[0], center[1], center[2], str(center_id), weight="bold")
    if show_legend and labels is not None:
        unique_labels, cmap, norm = _get_cmap_and_norm(labels)
        _add_legend(fig, unique_labels, cmap, norm)
    if show_plot:
        plt.show()
        # Create new figure for future plots so that later pyplot calls do not draw onto the 3d axes
        plt.figure()


def plot_image(img_data: np.ndarray, black_and_white: bool = False, image_shape: tuple = None,
               max_value: float = None, min_value: float = None, show_plot: bool = True) -> None:
    """
    Plot an image.
    Expects a color image to occur in the HWC representation (height, width, color channels).
    The pixel values are min-max normalized to [0, 1] before plotting.

    Parameters
    ----------
    img_data : np.ndarray
        The image data. A one-dimensional array is reshaped to image_shape before plotting
    black_and_white : bool
        Specifies whether the image should be plotted in grayscale colors.
        Only relevant for images without color channels (default: False)
    image_shape : tuple
        (height, width) for grayscale images or (height, width, number of channels) for color images.
        Only used if img_data is one-dimensional (default: None)
    max_value : float
        maximum pixel value, used for min-max normalization.
        Is often 255, if None the maximum value in the data set will be used (default: None)
    min_value : float
        minimum pixel value, used for min-max normalization.
        Is often 0, if None the minimum value in the data set will be used (default: None)
    show_plot : bool
        Defines whether the plot should directly be plotted (default: True)

    Examples
    ----------
    from clustpy.data import load_nrletters, load_optdigits
    X, _ = load_nrletters()
    plot_image(X[0], False, (9, 7, 3), 255, 0, show_plot=True)
    X, _ = load_optdigits()
    plot_image(X[0], True, (8, 8), 255, 0, show_plot=True)
    """
    assert img_data.ndim <= 3, "Image data can not have more than 3 dimensions."
    # Data range must match float between [0..1] or int between [0..255] -> use min-max transform
    if max_value is None:
        max_value = np.max(img_data)
    if min_value is None:
        min_value = np.min(img_data)
    value_range = max_value - min_value
    if value_range == 0:
        # Constant image: avoid a division by zero (which would produce NaN pixels)
        img_data = np.zeros(img_data.shape, dtype=float)
    else:
        img_data = (img_data - min_value) / value_range
    # Reshape array data
    if img_data.ndim == 1:
        img_data = img_data.reshape(image_shape)
    # Plot original image or a black-and-white version
    if black_and_white:
        plt.imshow(img_data, cmap="Greys")
    else:
        plt.imshow(img_data)
    plt.axis('off')
    if show_plot:
        plt.show()


def plot_histogram(X: np.ndarray, labels: np.ndarray = None, density: bool = True, n_bins: int = 100,
                   show_legend: bool = True, container: plt.Axes = plt, show_plot: bool = True) -> None:
    """
    Plot a histogram of a single feature, optionally overlaid with kernel density estimates.

    If labels are given, one histogram (and one density curve) is drawn per label, using a common color.

    Parameters
    ----------
    X : np.ndarray
        the given data set. Must be one-dimensional
    labels : np.ndarray
        The cluster labels. Specifies the color of the plotted objects. Can be None (default: None)
    density : bool
        Defines whether a kernel density should be added to the histogram (default: True)
    n_bins : int
        Number of bins (default: 100)
    show_legend : bool
        Defines whether the legend of the histogram should be shown (default: True)
    container : plt.Axes
        The container to which the histogram is added.
        If another container is defined, show_plot should usually be False (default: matplotlib.pyplot)
    show_plot : bool
        Defines whether the plot should directly be plotted (default: True)
    """
    assert X.ndim == 1, "Data must be 1-dimensional"
    has_labels = labels is not None
    # Histograms
    if has_labels:
        unique_labels, cmap, norm = _get_cmap_and_norm(labels)
        # Histogram and density of a label share the same color
        label_colors = [(lab, cmap(norm(lab))) for lab in unique_labels]
        for lab, color in label_colors:
            container.hist(X[labels == lab], alpha=0.5, bins=n_bins, color=color, range=(np.min(X), np.max(X)))
    else:
        container.hist(X, alpha=0.5, bins=n_bins, range=(np.min(X), np.max(X)))
    # Densities
    if density:
        # Densities get their own (hidden) y-axis but share the x-axis with the histogram
        twin_axis = container.twinx()
        twin_axis.yaxis.set_visible(False)

        def _draw_density(values, **line_kwargs):
            # Skip groups that are too small for a density estimate
            if values.shape[0] < _MIN_OBJECTS_FOR_DENS_PLOT:
                return
            kde = stats.gaussian_kde(values)
            grid = np.linspace(np.min(values), np.max(values), 1000)
            twin_axis.plot(grid, kde(grid), **line_kwargs)

        if has_labels:
            for lab, color in label_colors:
                _draw_density(X[labels == lab], color=color)
        else:
            _draw_density(X)
    if show_legend and has_labels:
        _add_legend(container, unique_labels, cmap, norm)
    if show_plot:
        plt.show()


def plot_scatter_matrix(X: np.ndarray, labels: np.ndarray = None, centers: np.ndarray = None,
                        true_labels: np.ndarray = None, density: bool = True, n_bins: int = 100,
                        show_legend: bool = True, scattersize: int = 10, equal_axis: bool = False,
                        max_dimensions: int = 10, show_plot: bool = True) -> plt.Axes:
    """
    Create a scatter matrix plot.
    Visualizes a 2d scatter plot for each combination of features.
    The center axis shows a histogram of each single feature.

    Parameters
    ----------
    X : np.ndarray
        the given data set
    labels : np.ndarray
        The cluster labels. Specifies the color of the plotted objects. Can be None (default: None)
    centers : np.ndarray
        The cluster centers. Will be plotted as red squares labeled by the corresponding cluster id.
        Can be None (default: None)
    true_labels : np.ndarray
        The ground truth labels. Specifies the symbol of the plotted objects. Can be None (default: None)
    density : bool
        Defines whether a kernel density should be added to the histogram (default: True)
    n_bins : int
        Number of bins used for the histogram (default: 100)
    show_legend : bool
        Defines whether a legend should be shown (default: True)
    scattersize : float
        The size of the scatters (default: 10)
    equal_axis : bool
        Defines whether the axes are to be scaled to the same value range (default: False)
    max_dimensions : int
        Maximum Number of dimensions that should be plotted.
        This value is intended to prevent the creation of overly complex plots that are very confusing and take
        a long time to create (default: 10)
    show_plot : bool
        Defines whether the plot should directly be plotted (default: True)

    Returns
    -------
    axes : plt.Axes
        The used matplotlib axes.
        None if the data set has more than max_dimensions features and the plot was therefore not created
    """
    n_features = X.shape[1]
    if n_features > max_dimensions:
        # Actually abort (as announced) and report the limit that is really in use
        print(
            "[WARNING] Dimensionality of the dataset is larger than {0}. "
            "Creation of scatter matrix plot will be aborted.".format(max_dimensions))
        return None
    # For single dimension only plot histogram
    if n_features == 1:
        plot_histogram(X[:, 0], labels, density, n_bins, show_legend, show_plot=show_plot)
        return plt.gca()
    # Get unique labels (only needed for the legend)
    if labels is not None:
        unique_labels, cmap, norm = _get_cmap_and_norm(labels)
    # Create subplots; either all axes share one value range or only those of the same row / column
    if equal_axis:
        fig, axes = plt.subplots(nrows=n_features, ncols=n_features, sharey="all", sharex="all")
    else:
        fig, axes = plt.subplots(nrows=n_features, ncols=n_features, sharey="row", sharex="col")
    fig.subplots_adjust(hspace=0.05, wspace=0.05)
    for i in range(n_features):
        for j in range(n_features):
            ax = axes[i, j]
            if i == j:
                # Histogram plot on the diagonal; only the outer cells keep their axis ticks
                if i != 0:
                    ax.yaxis.set_visible(False)
                if i != n_features - 1:
                    ax.xaxis.set_visible(False)
                # Second plot for actual histogram (use container), so that the histogram counts do not
                # affect the y-axis that is shared with the scatter plots
                twin_axis = ax.twinx()
                twin_axis.yaxis.set_visible(False)
                plot_histogram(X[:, i], labels, density, n_bins, show_legend=False, container=twin_axis,
                               show_plot=False)
            else:
                # Scatter plot (use container): feature j on the x-axis, feature i on the y-axis
                local_centers = None if centers is None else centers[:, [j, i]]
                plot_2d_data(X[:, [j, i]], labels, local_centers, true_labels, show_legend=False,
                             scattersize=scattersize, equal_axis=False, container=ax, show_plot=False)
    if show_legend and labels is not None:
        _add_legend(fig, unique_labels, cmap, norm)
    if show_plot:
        plt.show()
    return axes


def _add_legend(container: plt.Axes, unique_labels: np.ndarray, cmap: Colormap, norm: Normalize) -> None:
    """
    Helper function to add a legend with one colored patch per label to a plot.

    Parameters
    ----------
    container : plt.Axes
        The container to which the legend is added. Must offer a 'legend' method
    unique_labels : np.ndarray
        The unique labels that should be displayed in the legend
    cmap : Colormap
        the colormap
    norm : Normalize
        The Normalize object to pick the correct color
    """
    patchlist = [mpatches.Patch(color=cmap(norm(lab)), label=lab) for lab in unique_labels]
    container.legend(handles=patchlist, loc="center right")


def _get_cmap_and_norm(labels: np.ndarray) -> (np.ndarray, Colormap, Normalize):
    """
    Helper function to get colormap and Normalization object.

    Parameters
    ----------
    labels : np.ndarray
        The cluster labels

    Returns
    -------
    tuple : (np.ndarray, Colormap, Normalize)
        The unique labels ids,
        The colormap ('viridis' with 12 colors),
        The Normalize object to pick the correct color (spans the smallest to the largest label)
    """
    unique_labels = np.unique(labels)
    # Manage colormap.
    # pyplot.get_cmap is used because matplotlib.cm.get_cmap has been removed from recent matplotlib versions
    cmap = plt.get_cmap('viridis', 12)
    # np.unique returns sorted values, so first / last entry are the minimum / maximum label
    norm = Normalize(vmin=unique_labels[0], vmax=unique_labels[-1])
    return unique_labels, cmap, norm