# Source code for clustpy.data.preprocessing

from sklearn.base import TransformerMixin, BaseEstimator
import numpy as np


class ZNormalizer(TransformerMixin, BaseEstimator):
    """
    Normalize a data set by calculating (data - mean) / std.
    In general, two strategies are sensible to normalize a data set.
    Either use all features simultaneously for the normalization or normalize each feature separately.
    In the case of image data, a feature-wise transformation usually corresponds to a channel-wise transformation.
    If this normalizer should be applied to RGB image data, the color channels should be in the first dimension, known as CHW representation.

    A standard deviation of exactly 0 (constant data or a constant feature/channel) is replaced by 1 when dividing in transform.
    The affected values are therefore mapped to 0 instead of NaN/inf, and inverse_transform maps them back to the mean.

    Parameters
    ----------
    feature_or_channel_wise : bool
        Specifies if all data should be used for the normalization or if a feature-/channel-wise normalization should be applied (default: False)

    Attributes
    ----------
    shape : list
        Shape of the data set with which this normalizer has been fitted. The first entry (number of samples) is replaced by -1
    mean : np.ndarray or float
        Mean value(s) of the data set. An array with one entry per feature/channel if the statistics are computed feature-/channel-wise
    std : np.ndarray or float
        Standard deviation value(s) of the data set. An array with one entry per feature/channel if the statistics are computed feature-/channel-wise
    """

    def __init__(self, feature_or_channel_wise: bool = False):
        self.feature_or_channel_wise = feature_or_channel_wise

    def _normalize_per_feature(self, shape: list) -> bool:
        """
        Decide whether the statistics are applied per feature/channel or to the data set as a whole.
        This is the single place that encodes the rule, so that fit, transform and inverse_transform always agree.

        Parameters
        ----------
        shape : list
            Shape of the data set where the first entry (number of samples) has been replaced by -1

        Returns
        -------
        per_feature : bool
            True if one mean/std pair per entry of the second dimension is used, False if a single global pair is used

        Raises
        ------
        ValueError
            If feature_or_channel_wise is True and the shape is neither tabular (2 dimensions), nor grayscale (no dimension of size 3),
            nor an RGB image in CHW representation (4 or 5 dimensions with shape[1] == 3)
        """
        n_dims = len(shape)
        if not self.feature_or_channel_wise or (n_dims > 2 and 3 not in shape):
            # In case of not feature_or_channel_wise or grayscale images (2d or 3d)
            return False
        if n_dims == 2 or (n_dims in (4, 5) and shape[1] == 3):
            # In case of tabular data or RGB 2D or 3D images
            return True
        # ValueError is a subclass of Exception, so handlers written for the former generic Exception still work
        raise ValueError(
            "Your combination of feature_or_channel_wise={0} and X.ndim={1} is not working for the transformation".format(
                self.feature_or_channel_wise, n_dims))

    def _check_shape(self, X: np.ndarray) -> None:
        """
        Verify that all dimensions of X except the first one match the fitted shape.

        Parameters
        ----------
        X : np.ndarray
            the given data set

        Raises
        ------
        AssertionError
            If the shape of X (ignoring the number of samples) differs from the fitted shape
        """
        if list(X.shape)[1:] != self.shape[1:]:
            # Raised explicitly instead of using an assert statement, so that the check is not removed by `python -O`.
            # The exception type is kept for backward compatibility.
            raise AssertionError(
                "The shape of the input data does not match the fitted transformation. Shape must be {0}".format(
                    self.shape))

    def _safe_std(self) -> np.ndarray:
        """
        Return the fitted standard deviation(s) with every exact 0 replaced by 1.

        Returns
        -------
        std : np.ndarray
            Divisor that can be used without producing NaN/inf for constant data
        """
        std = np.asarray(self.std, dtype=float)
        return np.where(std == 0, 1.0, std)

    @staticmethod
    def _broadcast_shape(n_dims: int) -> list:
        """
        Return the shape that aligns per-feature statistics with the second dimension of the data.

        Parameters
        ----------
        n_dims : int
            Number of dimensions of the data set

        Returns
        -------
        shape : list
            [1, -1, 1, ..., 1] with n_dims entries
        """
        return [1, -1] + [1] * (n_dims - 2)

    def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'ZNormalizer':
        """
        Compute the mean and std values regarding the input data set.

        Parameters
        ----------
        X : np.ndarray
            the given data set
        y : np.ndarray
            the labels (can be ignored)

        Returns
        -------
        self : ZNormalizer
            this instance of the ZNormalizer

        Raises
        ------
        ValueError
            If feature_or_channel_wise is True and the shape of X is not supported
        """
        shape = list(X.shape)
        shape[0] = -1  # The number of samples may differ between fit and transform
        if self._normalize_per_feature(shape):
            n_features = shape[1]
            std = np.array([np.std(X[:, j]) for j in range(n_features)])
            mean = np.array([np.mean(X[:, j]) for j in range(n_features)])
        else:
            std = np.std(X)
            mean = np.mean(X)
        # Only update the state after the input has been validated, so a failed fit can not leave
        # a new shape next to the statistics of a previous fit
        self.shape = shape
        self.std = std
        self.mean = mean
        return self

    def transform(self, X: np.ndarray) -> np.ndarray:
        """
        Transform the given data set using the fitted mean and std values.
        Standard deviations that are exactly 0 are treated as 1 to avoid a division by zero.

        Parameters
        ----------
        X : np.ndarray
            the given data set

        Returns
        -------
        X_out : np.ndarray
            The transformed data set

        Raises
        ------
        AssertionError
            If the shape of X (ignoring the number of samples) differs from the fitted shape
        ValueError
            If feature_or_channel_wise is True and the shape of X is not supported
        """
        self._check_shape(X)
        X_out = X.astype(float)
        if not self._normalize_per_feature(self.shape):
            return (X_out - self.mean) / self._safe_std()
        # Align the per-feature statistics with the second dimension and let numpy broadcast over all others
        stats_shape = self._broadcast_shape(X_out.ndim)
        mean = np.reshape(self.mean, stats_shape)
        std = np.reshape(self._safe_std(), stats_shape)
        return (X_out - mean) / std

    def inverse_transform(self, X: np.ndarray) -> np.ndarray:
        """
        Invert the transformation by applying (data * std) + mean.
        The fitted std is used unchanged, so data with a std of 0 is mapped back to its mean.

        Parameters
        ----------
        X : np.ndarray
            the given data set

        Returns
        -------
        X_out : np.ndarray
            The transformed data set

        Raises
        ------
        AssertionError
            If the shape of X (ignoring the number of samples) differs from the fitted shape
        ValueError
            If feature_or_channel_wise is True and the shape of X is not supported
        """
        self._check_shape(X)
        X_out = X.astype(float)
        if not self._normalize_per_feature(self.shape):
            return X_out * self.std + self.mean
        # Align the per-feature statistics with the second dimension and let numpy broadcast over all others
        stats_shape = self._broadcast_shape(X_out.ndim)
        mean = np.reshape(self.mean, stats_shape)
        std = np.reshape(self.std, stats_shape)
        return X_out * std + mean


def z_normalization(X: np.ndarray, feature_or_channel_wise: bool = False) -> np.ndarray:
    """
    Convenience function that fits a ZNormalizer on a data set and directly returns the normalized data.
    It is equivalent to: ZNormalizer(feature_or_channel_wise).fit_transform(X)

    Parameters
    ----------
    X : np.ndarray
        the given data set
    feature_or_channel_wise : bool
        Specifies if all data should be used for the normalization or if a feature-/channel-wise normalization should be applied (default: False)

    Returns
    -------
    X_transform : np.ndarray
        The transformed data set
    """
    normalizer = ZNormalizer(feature_or_channel_wise=feature_or_channel_wise)
    return normalizer.fit_transform(X)