Source code for clustpy.utils.checks

from sklearn.utils.estimator_checks import estimator_checks_generator
from sklearn.base import BaseEstimator
import numpy as np
from sklearn.utils import check_X_y, check_array, check_random_state


def check_clustpy_estimator(estimator_obj: BaseEstimator,
                            checks_to_ignore: tuple | list = ("check_complex_data",)):
    """
    Run the sklearn estimator checks on an estimator, skipping selected checks.

    Every check yielded by sklearn's estimator_checks_generator is executed
    unless its function name is listed in checks_to_ignore. By default only the
    check for complex data is skipped.
    For more information, check:
    https://github.com/scikit-learn/scikit-learn/blob/main/sklearn/utils/estimator_checks.py

    Parameters
    ----------
    estimator_obj : BaseEstimator
        Initialization of the tested BaseEstimator
    checks_to_ignore : tuple | list
        Names of the checks to ignore (default: ("check_complex_data",)).
        A single name passed as a plain string is treated as a one-element tuple.

    Raises
    ------
    Exception
        Whatever exception a failing check raised; the name of the failing
        check is printed before the exception is re-raised.
    """
    # A bare string would turn the membership test below into a substring
    # search (e.g. "check" in "check_complex_data"), so wrap it in a tuple.
    if isinstance(checks_to_ignore, str):
        checks_to_ignore = (checks_to_ignore,)

    for estimator, check in estimator_checks_generator(estimator_obj):
        # The generator yields partial objects; the wrapped function carries the name.
        check_name = check.func.__name__
        if check_name in checks_to_ignore:
            print("Skip check:", check_name)
            continue
        try:
            check(estimator)
        except Exception:
            print("Check", check_name, "failed.")
            # Bare raise keeps the original traceback intact.
            raise
def check_parameters(X: np.ndarray, *, y: np.ndarray | None = None,
                     random_state: np.random.RandomState | int | None = None,
                     allow_nd: bool = False, allow_size_1: bool = False,
                     estimator_obj: BaseEstimator | None = None
                     ) -> tuple[np.ndarray, np.ndarray | None, np.random.RandomState]:
    """
    Check if parameters for X, y and random_state are defined in accordance with the sklearn standard.

    Parameters
    ----------
    X : np.ndarray
        the given data set
    y : np.ndarray
        the labels (can usually be ignored) (default: None)
    random_state : np.random.RandomState | int
        the random state (default: None)
    allow_nd : bool
        allow n-dimensional arrays instead of only allowing 2d arrays (default: False)
    allow_size_1 : bool
        allow a dataset with a single sample (default: False)
    estimator_obj : BaseEstimator
        Initialization of the tested BaseEstimator. If given, its n_features_in_
        attribute is compared against the number of features in X (default: None)

    Returns
    -------
    tuple : (np.ndarray, np.ndarray, np.random.RandomState)
        the checked data set,
        the checked labels (None if no labels were given),
        the checked random_state

    Raises
    ------
    ValueError
        If X is a 1d array, if X contains a single sample while allow_size_1 is
        False, or if the number of features in X differs from
        estimator_obj.n_features_in_.
    AssertionError
        If the labels in y are not exactly the integers 0, ..., n_classes - 1
        (after shifting labels given as 1, ..., n_classes).
    """
    ensure_2d = not allow_nd
    if y is None:
        X = check_array(X, accept_sparse=False, allow_nd=allow_nd, ensure_2d=ensure_2d)
    else:
        X, y = check_X_y(X, y, accept_sparse=False, allow_nd=allow_nd, ensure_2d=ensure_2d)
        class_labels = np.unique(y)
        if np.min(class_labels) == 1 and np.max(class_labels) == len(class_labels):
            # Build shifted copies instead of subtracting in place: the validated y
            # may still share memory with the array handed in by the caller.
            y = y - 1
            class_labels = class_labels - 1
            print("WARNING: labels in y were within [1, {0}], changed to be within [0, {1}] instead".format(
                len(class_labels), len(class_labels) - 1))
        if not np.array_equal(class_labels, np.arange(len(class_labels))):
            # Raised explicitly (rather than via an assert statement) so that the
            # validation is not stripped when Python runs with -O.
            raise AssertionError(
                "y is not defined as expected. Should only contain labels within [0, n_classes - 1]. "
                "Labels in y: {0}".format(class_labels))
    if X.ndim == 1:
        raise ValueError("Data can not be a 1d array.")
    if not allow_size_1 and X.shape[0] == 1:
        # Single formatted message; passing the shape as a second argument would
        # make the exception text render as a tuple.
        raise ValueError("Model cannot be fitted if n_samples = 1. X shape = {0}".format(X.shape))
    if estimator_obj is not None and estimator_obj.n_features_in_ != X.shape[1]:
        raise ValueError("X has {0} features, but {1} is expecting {2} features as input.".format(
            X.shape[1], estimator_obj.__class__.__name__, estimator_obj.n_features_in_))
    random_state = check_random_state(random_state)
    return X, y, random_state