Source code for clustpy.data.preprocessing

from sklearn.base import TransformerMixin, BaseEstimator
import numpy as np


class ZNormalizer(TransformerMixin, BaseEstimator):
    """
    Normalize a data set by calculating (data - mean) / std.

    In general, two strategies are sensible to normalize a data set.
    Either use all features simultaneously for the normalization or normalize each feature separately.
    In the case of image data, a feature-wise transformation usually corresponds to a channel-wise transformation.
    If this normalizer should be applied to RGB image data, the color channels should be in the first dimension
    after the sample axis (i.e., X.shape[1] == 3), known as CHW representation.

    If a standard deviation is exactly 0 (constant feature, channel or data set), the corresponding values are
    only centered: they are divided by 1 instead of 0, so the output contains no NaN or inf entries.

    Parameters
    ----------
    feature_or_channel_wise : bool
        Specifies if all data should be used for the normalization or if a feature-/channel-wise normalization should be applied (default: False)

    Attributes
    ----------
    shape : list
        Shape of the data set with which this normalizer has been fitted. The first entry (number of samples) is set to -1
    mean : np.ndarray or float
        Mean value(s) of the data set
    std : np.ndarray or float
        Standard deviation value(s) of the data set
    """

    def __init__(self, feature_or_channel_wise: bool = False):
        self.feature_or_channel_wise = feature_or_channel_wise

    def _uses_global_statistics(self, shape: list) -> bool:
        """
        Check if a single mean/std pair is used for the complete data set.

        This is the case if feature_or_channel_wise is False or if the data has more than two dimensions
        and no dimension (ignoring the sample axis, which is stored as -1) has size 3, i.e., grayscale images.

        Parameters
        ----------
        shape : list
            Shape of the data set where the first entry has been replaced by -1

        Returns
        -------
        use_global : bool
            True if the statistics are computed over all entries of the data set
        """
        return not self.feature_or_channel_wise or (len(shape) > 2 and 3 not in shape)

    @staticmethod
    def _safe_divisor(std) -> np.ndarray:
        """
        Return the standard deviation(s) with zeros replaced by 1 to avoid a division by zero.

        Parameters
        ----------
        std : np.ndarray or float
            The standard deviation value(s)

        Returns
        -------
        divisor : np.ndarray
            Array with the same shape as std that contains no zeros
        """
        std = np.asarray(std, dtype=float)
        return np.where(std == 0, 1.0, std)

    def _check_shape(self, X: np.ndarray) -> None:
        """
        Verify that all non-sample dimensions of X match the data set used for fitting.

        Parameters
        ----------
        X : np.ndarray
            the given data set

        Raises
        ------
        AssertionError
            If the shape of X (ignoring the first axis) differs from the fitted shape
        """
        if list(X.shape)[1:] != self.shape[1:]:
            # Raised explicitly (instead of using an assert statement) so that the check is not removed
            # when Python runs with -O. The exception type is kept for backward compatibility.
            raise AssertionError(
                "The shape of the input data does not match the fitted transformation. Shape must be {0}".format(
                    self.shape))

    def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'ZNormalizer':
        """
        Compute the mean and std values regarding the input data set.

        Parameters
        ----------
        X : np.ndarray
            the given data set
        y : np.ndarray
            the labels (can be ignored)

        Returns
        -------
        self : ZNormalizer
            this instance of the ZNormalizer

        Raises
        ------
        ValueError
            If feature_or_channel_wise is True and X is neither tabular data (2 dimensions),
            an RGB image data set (4 or 5 dimensions with X.shape[1] == 3) nor a grayscale image data set
        """
        shape = list(X.shape)
        shape[0] = -1  # The number of samples may differ between fit and transform
        if self._uses_global_statistics(shape):
            # In case of not feature_or_channel_wise or grayscale images (2d or 3d)
            std = np.std(X)
            mean = np.mean(X)
        elif X.ndim == 2 or (X.ndim in [4, 5] and X.shape[1] == 3):
            # In case of tabular data or RGB 2D or 3D images
            std = np.array([np.std(X[:, j]) for j in range(shape[1])])
            mean = np.array([np.mean(X[:, j]) for j in range(shape[1])])
        else:
            raise ValueError(
                "Your combination of feature_or_channel_wise={0} and X.ndim={1} is not working for the transformation".format(
                    self.feature_or_channel_wise, X.ndim))
        # Only store the fitted state once everything succeeded, so a failed fit leaves no partial state
        self.shape = shape
        self.std = std
        self.mean = mean
        return self

    def transform(self, X: np.ndarray) -> np.ndarray:
        """
        Transform the given data set using the fitted mean and std values.

        Parameters
        ----------
        X : np.ndarray
            the given data set

        Returns
        -------
        X_out : np.ndarray
            The transformed data set (a float copy, X itself is not modified)

        Raises
        ------
        AssertionError
            If the shape of X (ignoring the first axis) differs from the fitted shape
        ValueError
            If feature_or_channel_wise is True and the number of dimensions of X is not supported
        """
        self._check_shape(X)
        X_out = X.astype(float)
        if self._uses_global_statistics(self.shape):
            # In case of not feature_or_channel_wise or grayscale images if feature_or_channel_wise
            return (X_out - self.mean) / self._safe_divisor(self.std)
        if X.ndim in [2, 4, 5]:
            # In case of tabular data or RGB 2D or 3D images:
            # align the per-feature statistics with axis 1 so that they broadcast over all other axes
            stat_shape = [1, -1] + [1] * (X.ndim - 2)
            mean = np.reshape(self.mean, stat_shape)
            divisor = np.reshape(self._safe_divisor(self.std), stat_shape)
            return (X_out - mean) / divisor
        raise ValueError(
            "Your combination of feature_or_channel_wise={0} and X.ndim={1} is not working for the transformation".format(
                self.feature_or_channel_wise, X.ndim))

    def inverse_transform(self, X: np.ndarray) -> np.ndarray:
        """
        Invert the transformation by applying (data * std) + mean.

        Parameters
        ----------
        X : np.ndarray
            the given (normalized) data set

        Returns
        -------
        X_out : np.ndarray
            The data set mapped back to the original scale (a float copy, X itself is not modified)

        Raises
        ------
        AssertionError
            If the shape of X (ignoring the first axis) differs from the fitted shape
        ValueError
            If feature_or_channel_wise is True and the number of dimensions of X is not supported
        """
        self._check_shape(X)
        X_out = X.astype(float)
        if self._uses_global_statistics(self.shape):
            # In case of not feature_or_channel_wise or grayscale images if feature_or_channel_wise
            return X_out * self.std + self.mean
        if X.ndim in [2, 4, 5]:
            # In case of tabular data or RGB 2D or 3D images:
            # align the per-feature statistics with axis 1 so that they broadcast over all other axes
            stat_shape = [1, -1] + [1] * (X.ndim - 2)
            # The stored std is used here (not the zero-safe divisor): multiplying by 0 restores constant features
            return X_out * np.reshape(self.std, stat_shape) + np.reshape(self.mean, stat_shape)
        raise ValueError(
            "Your combination of feature_or_channel_wise={0} and X.ndim={1} is not working for the transformation".format(
                self.feature_or_channel_wise, X.ndim))
def z_normalization(X: np.ndarray, feature_or_channel_wise: bool = False) -> np.ndarray:
    """
    Z-normalize a data set in a single call.

    Convenience wrapper around the ZNormalizer that creates a new normalizer, fits it to X
    and directly returns the transformed data. It is equivalent to:
    X_transform = ZNormalizer(feature_or_channel_wise).fit_transform(X)

    Parameters
    ----------
    X : np.ndarray
        the given data set
    feature_or_channel_wise : bool
        Specifies if all data should be used for the normalization or if a feature-/channel-wise normalization should be applied (default: False)

    Returns
    -------
    X_transform : np.ndarray
        The transformed data set
    """
    normalizer = ZNormalizer(feature_or_channel_wise=feature_or_channel_wise)
    return normalizer.fit_transform(X)