Source code for clustpy.data.real_uci_data

from clustpy.data._utils import _download_file, _get_download_dir, _decompress_z_file, _load_data_file, flatten_images, _transform_text_data, _load_image_data
import numpy as np
import zipfile
import tarfile
from sklearn.preprocessing import LabelEncoder
import pandas as pd
from sklearn.datasets._base import Bunch
from pathlib import Path


def load_banknotes(return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
    """
    Load the banknote authentication data set, which contains 1372 samples of genuine and forged banknotes.
    N=1372, d=4, k=2.

    Parameters
    ----------
    return_X_y : bool
        If True, a (data, target) tuple is returned instead of a Bunch object (default: False)
    downloads_path : str | Path
        path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)

    Returns
    -------
    bunch : Bunch
        A Bunch object holding the data in its 'data' attribute and the labels in its 'target' attribute.
        If return_X_y is True, two arrays are returned instead:
        the data numpy array (1372 x 4), the labels numpy array (1372)

    References
    ----------
    https://archive.ics.uci.edu/ml/datasets/banknote+authentication
    """
    local_file = _get_download_dir(downloads_path) / "data_banknote_authentication.txt"
    source_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00267/data_banknote_authentication.txt"
    # The helper receives both the local path and the remote location of the file
    X, y = _load_data_file(local_file, source_url)
    if not return_X_y:
        return Bunch(dataset_name="Banknotes", data=X, target=y)
    return X, y


def load_spambase(return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
    """
    Load the spambase data set, which contains 4601 mails labeled as spam or non-spam.
    N=4601, d=57, k=2.

    Parameters
    ----------
    return_X_y : bool
        If True, a (data, target) tuple is returned instead of a Bunch object (default: False)
    downloads_path : str | Path
        path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)

    Returns
    -------
    bunch : Bunch
        A Bunch object holding the data in its 'data' attribute and the labels in its 'target' attribute.
        If return_X_y is True, two arrays are returned instead:
        the data numpy array (4601 x 57), the labels numpy array (4601)

    References
    ----------
    https://archive.ics.uci.edu/ml/datasets/spambase
    """
    local_file = _get_download_dir(downloads_path) / "spambase.data"
    source_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data"
    X, y = _load_data_file(local_file, source_url)
    # Either hand back the plain arrays or wrap them in a Bunch
    result = (X, y) if return_X_y else Bunch(dataset_name="Spambase", data=X, target=y)
    return result


def load_seeds(return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
    """
    Load the seeds data set, which contains 210 samples, each belonging to one of three varieties of wheat.
    N=210, d=7, k=3.

    Parameters
    ----------
    return_X_y : bool
        If True, a (data, target) tuple is returned instead of a Bunch object (default: False)
    downloads_path : str | Path
        path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)

    Returns
    -------
    bunch : Bunch
        A Bunch object holding the data in its 'data' attribute and the labels in its 'target' attribute.
        If return_X_y is True, two arrays are returned instead:
        the data numpy array (210 x 7), the labels numpy array (210)

    References
    ----------
    https://archive.ics.uci.edu/ml/datasets/seeds
    """
    local_file = _get_download_dir(downloads_path) / "seeds_dataset.txt"
    source_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00236/seeds_dataset.txt"
    X, y = _load_data_file(local_file, source_url, delimiter=None)
    # The file numbers its classes starting at 1; shift them in place so that they start at 0
    y -= 1
    if not return_X_y:
        return Bunch(dataset_name="Seeds", data=X, target=y)
    return X, y


def load_skin(return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
    """
    Load the Skin Segmentation data set. It contains 245057 skin and non-skin samples described by their
    B, G, R color information.
    N=245057, d=3, k=2.

    Parameters
    ----------
    return_X_y : bool
        If True, a (data, target) tuple is returned instead of a Bunch object (default: False)
    downloads_path : str | Path
        path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)

    Returns
    -------
    bunch : Bunch
        A Bunch object holding the data in its 'data' attribute and the labels in its 'target' attribute.
        If return_X_y is True, two arrays are returned instead:
        the data numpy array (245057 x 3), the labels numpy array (245057)

    References
    ----------
    https://archive.ics.uci.edu/ml/datasets/skin+segmentation
    """
    local_file = _get_download_dir(downloads_path) / "Skin_NonSkin.txt"
    source_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00229/Skin_NonSkin.txt"
    X, y = _load_data_file(local_file, source_url, delimiter=None)
    # The file numbers its classes starting at 1; shift them in place so that they start at 0
    y -= 1
    result = (X, y) if return_X_y else Bunch(dataset_name="SkinSegmentation", data=X, target=y)
    return result


def load_soybean_small(return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
    """
    Load the small soybean data set, a small subset of the original soybean data set.
    It contains 47 samples, each belonging to one of 4 classes.
    N=47, d=35, k=4.

    Parameters
    ----------
    return_X_y : bool
        If True, a (data, target) tuple is returned instead of a Bunch object (default: False)
    downloads_path : str | Path
        path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)

    Returns
    -------
    bunch : Bunch
        A Bunch object holding the data in its 'data' attribute and the labels in its 'target' attribute.
        If return_X_y is True, two arrays are returned instead:
        the data numpy array (47 x 35), the labels numpy array (47)

    References
    ----------
    https://archive.ics.uci.edu/ml/datasets/soybean+(small)
    """
    local_file = _get_download_dir(downloads_path) / "soybean-small.data"
    # Only fetch the file if it is not already present locally
    if not local_file.is_file():
        _download_file(
            "https://archive.ics.uci.edu/ml/machine-learning-databases/soybean/soybean-small.data",
            local_file)
    frame = pd.read_csv(local_file, delimiter=",", header=None)
    # The last column holds the class name, all preceding columns are features
    X = frame.iloc[:, :-1].values
    y = LabelEncoder().fit_transform(frame.iloc[:, -1])
    if not return_X_y:
        return Bunch(dataset_name="SoybeanSmall", data=X, target=y)
    return X, y


def load_soybean_large(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
    """
    Load the large soybean data set. It contains 562 samples, each belonging to one of 15 classes.
    The original files hold more samples and 19 classes, but every sample with at least one '?' attribute value
    is dropped.
    The data set is composed of 266 training and 296 test samples.
    N=562, d=35, k=15.

    Parameters
    ----------
    subset : str
        can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all')
    return_X_y : bool
        If True, a (data, target) tuple is returned instead of a Bunch object (default: False)
    downloads_path : str | Path
        path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)

    Returns
    -------
    bunch : Bunch
        A Bunch object holding the data in its 'data' attribute and the labels in its 'target' attribute.
        If return_X_y is True, two arrays are returned instead:
        the data numpy array (562 x 35), the labels numpy array (562)

    References
    ----------
    https://archive.ics.uci.edu/ml/datasets/soybean+(Large)
    """

    def _read_cleaned(file_name, url):
        # Download the file if necessary, drop rows containing '?' and split off the class column (column 0)
        path = _get_download_dir(downloads_path) / file_name
        if not path.is_file():
            _download_file(url, path)
        frame = pd.read_csv(path, delimiter=",", header=None)
        frame = frame[(frame != '?').all(axis=1)]
        classes = frame.pop(0)
        return frame.values, classes

    subset = subset.lower()
    assert subset in ["all", "train",
                      "test"], "subset must match 'all', 'train' or 'test'. Your input {0}".format(subset)
    use_train = subset in ("all", "train")
    use_test = subset in ("all", "test")
    if use_train:
        data, labels_raw = _read_cleaned(
            "soybean-large.data",
            "https://archive.ics.uci.edu/ml/machine-learning-databases/soybean/soybean-large.data")
    if use_test:
        test_values, test_classes = _read_cleaned(
            "soybean-large.test",
            "https://archive.ics.uci.edu/ml/machine-learning-databases/soybean/soybean-large.test")
        if use_train:
            # 'all': training samples first, test samples appended
            data = np.r_[data, test_values]
            labels_raw = np.r_[labels_raw, test_classes]
        else:
            data, labels_raw = test_values, test_classes
    # After the '?' rows are gone, all remaining attribute values can be cast to integers
    data = np.array(data, dtype=int)
    labels = LabelEncoder().fit_transform(labels_raw)
    if not return_X_y:
        return Bunch(dataset_name="SoybeanLarge", data=data, target=labels)
    return data, labels


def load_pendigits(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
    """
    Load the pendigits data set. It contains 10992 vectors of length 16, each representing 8 coordinates that were
    recorded while digits (0 to 9) were written on a tablet.
    The data set is composed of 7494 training and 3498 test samples.
    N=10992, d=16, k=10.

    Parameters
    ----------
    subset : str
        can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all')
    return_X_y : bool
        If True, a (data, target) tuple is returned instead of a Bunch object (default: False)
    downloads_path : str | Path
        path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)

    Returns
    -------
    bunch : Bunch
        A Bunch object holding the data in its 'data' attribute and the labels in its 'target' attribute.
        If return_X_y is True, two arrays are returned instead:
        the data numpy array (10992 x 16), the labels numpy array (10992)

    References
    ----------
    http://archive.ics.uci.edu/ml/datasets/pen-based+recognition+of+handwritten+digits
    """
    subset = subset.lower()
    assert subset in ["all", "train",
                      "test"], "subset must match 'all', 'train' or 'test'. Your input {0}".format(subset)
    use_train = subset in ("all", "train")
    use_test = subset in ("all", "test")
    if use_train:
        train_file = _get_download_dir(downloads_path) / "pendigits.tra"
        data, labels = _load_data_file(
            train_file,
            "https://archive.ics.uci.edu/ml/machine-learning-databases/pendigits/pendigits.tra")
    if use_test:
        test_file = _get_download_dir(downloads_path) / "pendigits.tes"
        test_data, test_labels = _load_data_file(
            test_file,
            "https://archive.ics.uci.edu/ml/machine-learning-databases/pendigits/pendigits.tes")
        if use_train:
            # 'all': training samples first, test samples appended
            data = np.concatenate((data, test_data))
            labels = np.concatenate((labels, test_labels))
        else:
            data, labels = test_data, test_labels
    if not return_X_y:
        return Bunch(dataset_name="Pendigits", data=data, target=labels)
    return data, labels


def load_ecoli(ignore_small_clusters: bool = False, return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
    """
    Load the ecoli data set. It consists of 336 samples belonging to one of 8 classes.
    N=336, d=7, k=8.

    Parameters
    ----------
    ignore_small_clusters : bool
        specify if the three small clusters with size 2, 2 and 5 should be ignored (default: False)
    return_X_y : bool
        If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
    downloads_path : str | Path
        path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)

    Returns
    -------
    bunch : Bunch
        A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute.
        Alternatively, if return_X_y is True two arrays will be returned:
        the data numpy array (336 x 7), the labels numpy array (336)

    References
    ----------
    https://archive.ics.uci.edu/ml/datasets/ecoli
    """
    filename = _get_download_dir(downloads_path) / "ecoli.data"
    if not filename.is_file():
        _download_file(
            "https://archive.ics.uci.edu/ml/machine-learning-databases/ecoli/ecoli.data",
            filename)
    # Collect the rows dynamically instead of writing into a pre-sized (336 x 7) buffer, so that
    # blank/trailing lines in the file do not raise an IndexError
    feature_rows = []
    labels_raw = []
    with open(filename, "r") as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Skip empty lines
                continue
            # First field is the sequence name, last field is the class name, everything between are features
            feature_rows.append(fields[1:-1])
            labels_raw.append(fields[-1])
    data = np.array(feature_rows, dtype=np.float64)
    if ignore_small_clusters:
        # Optional: Remove the three small clusters consisting of only 2, 2 and 5 samples
        small_clusters = ("imL", "imS", "omL")
        keep_mask = np.array([l not in small_clusters for l in labels_raw], dtype=bool)
        data = data[keep_mask]
        labels_raw = [l for l, keep in zip(labels_raw, keep_mask) if keep]
    LE = LabelEncoder()
    labels = LE.fit_transform(labels_raw)
    # Convert labels to int32 format
    labels = labels.astype(np.int32)
    # Return values
    if return_X_y:
        return data, labels
    else:
        return Bunch(dataset_name="Ecoli", data=data, target=labels)


def load_htru2(return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
    """
    Load the HTRU2 data set. It contains 17898 samples belonging to either the pulsar or the non-pulsar class.
    A special property is that more than 90% of the data belongs to class 0.
    N=17898, d=8, k=2.

    Parameters
    ----------
    return_X_y : bool
        If True, a (data, target) tuple is returned instead of a Bunch object (default: False)
    downloads_path : str | Path
        path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)

    Returns
    -------
    bunch : Bunch
        A Bunch object holding the data in its 'data' attribute and the labels in its 'target' attribute.
        If return_X_y is True, two arrays are returned instead:
        the data numpy array (17898 x 8), the labels numpy array (17898)

    References
    ----------
    https://archive.ics.uci.edu/ml/datasets/HTRU2
    """
    target_dir = _get_download_dir(downloads_path) / "htru2"
    archive = target_dir / "HTRU2.zip"
    # Download and extraction only happen when the archive is not yet available locally
    if not archive.is_file():
        target_dir.mkdir(parents=False, exist_ok=True)
        _download_file("https://archive.ics.uci.edu/ml/machine-learning-databases/00372/HTRU2.zip",
                       archive)
        with zipfile.ZipFile(archive, 'r') as zip_handle:
            zip_handle.extractall(target_dir)
    table = np.genfromtxt(target_dir / "HTRU_2.csv", delimiter=",")
    # The last column holds the class, all preceding columns are features
    X = table[:, :-1]
    y = table[:, -1].astype(np.int32)
    if not return_X_y:
        return Bunch(dataset_name="HTRU2", data=X, target=y)
    return X, y


def load_letterrecognition(return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
    """
    Load the Letter Recognition data set. It contains 20000 samples, each representing one of the 26 capital
    letters of the English alphabet by 16 numerical stimuli.
    N=20000, d=16, k=26.

    Parameters
    ----------
    return_X_y : bool
        If True, a (data, target) tuple is returned instead of a Bunch object (default: False)
    downloads_path : str | Path
        path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)

    Returns
    -------
    bunch : Bunch
        A Bunch object holding the data in its 'data' attribute and the labels in its 'target' attribute.
        If return_X_y is True, two arrays are returned instead:
        the data numpy array (20000 x 16), the labels numpy array (20000)

    References
    ----------
    https://archive.ics.uci.edu/ml/datasets/letter+recognition
    """
    local_file = _get_download_dir(downloads_path) / "letter-recognition.data"
    if not local_file.is_file():
        _download_file(
            "https://archive.ics.uci.edu/ml/machine-learning-databases/letter-recognition/letter-recognition.data",
            local_file)
    # Character substitutions: 'A' -> '0', ..., 'Z' -> '25'; line breaks become separators so that the
    # whole file turns into one long comma-separated list of numbers
    substitutions = {chr(ord("A") + index): str(index) for index in range(26)}
    substitutions["\n"] = ","
    with open(local_file, "r") as f:
        numeric_text = f.read().translate(str.maketrans(substitutions))
    # Each record consists of the letter id followed by its 16 features
    table = np.fromstring(numeric_text, sep=",").reshape(-1, 17)
    X = table[:, 1:]
    y = table[:, 0].astype(np.int32)
    if not return_X_y:
        return Bunch(dataset_name="Letterrecognition", data=X, target=y)
    return X, y


def load_har(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
    """
    Load the Human Activity Recognition data set. It consists of 10299 samples each representing sensor data of a person
    performing an activity. The six activities are walking, walking_upstairs, walking_downstairs, sitting, standing and
    laying.
    The data set is composed of 7352 training and 2947 test samples.
    N=10299, d=561, k=6.

    Parameters
    ----------
    subset : str
        can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all')
    return_X_y : bool
        If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
    downloads_path : str | Path
        path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)

    Returns
    -------
    bunch : Bunch
        A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute.
        Alternatively, if return_X_y is True two arrays will be returned:
        the data numpy array (10299 x 561), the labels numpy array (10299)

    References
    ----------
    https://archive.ics.uci.edu/ml/datasets/human+activity+recognition+using+smartphones
    """
    subset = subset.lower()
    assert subset in ["all", "train",
                      "test"], "subset must match 'all', 'train' or 'test'. Your input {0}".format(subset)
    directory = _get_download_dir(downloads_path) / "har"
    filename = directory / "UCI HAR Dataset.zip"
    # The archive is only downloaded and unpacked if it is not already present
    if not filename.is_file():
        directory.mkdir(parents=False, exist_ok=True)
        _download_file("https://archive.ics.uci.edu/ml/machine-learning-databases/00240/UCI%20HAR%20Dataset.zip",
                       filename)
        # Unpack zipfile
        with zipfile.ZipFile(filename, 'r') as zipf:
            zipf.extractall(directory)
    # Load data and labels
    if subset == "all" or subset == "train":
        data = np.genfromtxt(directory / "UCI HAR Dataset/train/X_train.txt")
        labels = np.genfromtxt(directory / "UCI HAR Dataset/train/y_train.txt")
    if subset == "all" or subset == "test":
        test_data = np.genfromtxt(directory / "UCI HAR Dataset/test/X_test.txt")
        test_labels = np.genfromtxt(directory / "UCI HAR Dataset/test/y_test.txt")
        if subset == "all":
            # Training samples first, test samples appended
            data = np.r_[data, test_data]
            labels = np.r_[labels, test_labels]
        else:
            data = test_data
            labels = test_labels
    # Convert labels to int32 format
    labels = labels.astype(np.int32)
    # Convert labels from 1,... to 0,...
    labels = labels - 1
    # Return values
    if return_X_y:
        return data, labels
    else:
        return Bunch(dataset_name="HAR", data=data, target=labels)


def load_statlog_shuttle(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
    """
    Load the statlog shuttle data set. It consists of 58000 samples belonging to one of 7 classes. A special property is
    that about 80% of the data belongs to class 0.
    The data set is composed of 43500 training and 14500 test samples.
    N=58000, d=9, k=7.

    Parameters
    ----------
    subset : str
        can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all')
    return_X_y : bool
        If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
    downloads_path : str | Path
        path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)

    Returns
    -------
    bunch : Bunch
        A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute.
        Alternatively, if return_X_y is True two arrays will be returned:
        the data numpy array (58000 x 9), the labels numpy array (58000).
        If the downloaded training archive can not be decompressed, the archive is deleted again and None is
        returned (or (None, None) if return_X_y is True)

    References
    ----------
    https://archive.ics.uci.edu/ml/datasets/Statlog+(Shuttle)
    """
    subset = subset.lower()
    assert subset in ["all", "train",
                      "test"], "subset must match 'all', 'train' or 'test'. Your input {0}".format(subset)
    directory = _get_download_dir(downloads_path) / "shuttle"
    if subset == "all" or subset == "train":
        filename = directory / "shuttle.trn.Z"
        if not filename.is_file():
            directory.mkdir(parents=False, exist_ok=True)
            _download_file("https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/shuttle/shuttle.trn.Z",
                           filename)
            # Unpack z-file
            success = _decompress_z_file(filename, directory)
            if not success:
                # Remove the archive so that a later call retries download and decompression
                filename.unlink()
                return (None, None) if return_X_y else None
        # Load data and labels
        dataset = np.genfromtxt(directory / "shuttle.trn")
        data = dataset[:, :-1]
        labels = dataset[:, -1]
    if subset == "all" or subset == "test":
        filename = directory / "shuttle.tst"
        if not filename.is_file():
            # With subset='test' the train branch above is skipped, so the target directory may not exist yet
            directory.mkdir(parents=False, exist_ok=True)
            _download_file(
                "https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/shuttle/shuttle.tst",
                filename)
        test_dataset = np.genfromtxt(directory / "shuttle.tst")
        test_data = test_dataset[:, :-1]
        test_labels = test_dataset[:, -1]
        if subset == "all":
            # Training samples first, test samples appended
            data = np.r_[data, test_data]
            labels = np.r_[labels, test_labels]
        else:
            data = test_data
            labels = test_labels
    # Convert labels to int32 format
    labels = labels.astype(np.int32)
    # Convert labels from 1,... to 0,...
    labels -= 1
    # Return values
    if return_X_y:
        return data, labels
    else:
        return Bunch(dataset_name="StatlogShuttle", data=data, target=labels)


def load_mice_protein(return_additional_labels: bool = False, return_X_y: bool = False,
                      downloads_path: str | Path = None) -> Bunch:
    """
    Load the Mice Protein Expression data set. It contains 1077 samples, each belonging to one of 8 classes.
    Every feature is the expression level of one of 77 proteins.
    Samples with 43 or more NaN values (3 cases) are removed. Afterwards, every column that still contains NaN
    values is removed, which reduces the number of features from 77 to 68.
    With return_additional_labels the classes can be subdivided further by the additional label columns
    mouseID, behavior, treatment type and genotype.
    N=1077, d=68, k=8.

    Parameters
    ----------
    return_additional_labels : bool
        return additional labels (default: False)
    return_X_y : bool
        If True, a (data, target) tuple is returned instead of a Bunch object (default: False)
    downloads_path : str | Path
        path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)

    Returns
    -------
    bunch : Bunch
        A Bunch object holding the data in its 'data' attribute and the labels in its 'target' attribute.
        If return_X_y is True, two arrays are returned instead:
        the data numpy array (1077 x 68), the labels numpy array (1077)

    References
    ----------
    https://archive.ics.uci.edu/ml/datasets/Mice+Protein+Expression
    """
    xls_path = _get_download_dir(downloads_path) / "Data_Cortex_Nuclear.xls"
    if not xls_path.is_file():
        _download_file("https://archive.ics.uci.edu/ml/machine-learning-databases/00342/Data_Cortex_Nuclear.xls",
                       xls_path)
    # Only the first sheet of the workbook is used
    sheet = pd.ExcelFile(xls_path).parse(0)
    # Detach all non-feature columns so that only the protein measurements remain in the sheet
    special = {name: sheet.pop(name) for name in ("class", "MouseID", "Behavior", "Treatment", "Genotype")}
    measurements = sheet.values
    # Keep only rows with fewer than 43 NaN entries (drops 3 samples)
    keep_rows = np.sum(np.isnan(measurements), axis=1) < 43
    data = measurements[keep_rows]
    # Of the remaining data, keep only columns without any NaN entry (drops 9 columns)
    nan_free_columns = np.sum(np.isnan(data), axis=0) == 0
    data = data[:, nan_free_columns]
    # Labels are encoded on all rows first and filtered afterwards
    labels = LabelEncoder().fit_transform(special["class"])
    if return_additional_labels:
        # The mouse id is the part of the MouseID entry before the first underscore
        mouse_ids = [entry.split("_")[0] for entry in special["MouseID"]]
        additional_sources = (mouse_ids, special["Behavior"], special["Treatment"], special["Genotype"])
        additional = [LabelEncoder().fit_transform(values) for values in additional_sources]
        labels = np.column_stack([labels] + additional)
    labels = labels.astype(np.int32)
    # Apply the same row filter that was used for the data
    labels = labels[keep_rows]
    if not return_X_y:
        return Bunch(dataset_name="MiceProtein", data=data, target=labels)
    return data, labels


def load_user_knowledge(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
    """
    Load the user knowledge data set. It contains 403 samples, each belonging to one of 4 classes.
    The 4 classes are the knowledge levels 'very low', 'low', 'middle' and 'high'.
    The data set is composed of 258 training and 145 test samples.
    N=403, d=5, k=4.

    Parameters
    ----------
    subset : str
        can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all')
    return_X_y : bool
        If True, a (data, target) tuple is returned instead of a Bunch object (default: False)
    downloads_path : str | Path
        path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)

    Returns
    -------
    bunch : Bunch
        A Bunch object holding the data in its 'data' attribute and the labels in its 'target' attribute.
        If return_X_y is True, two arrays are returned instead:
        the data numpy array (403 x 5), the labels numpy array (403)

    References
    ----------
    https://archive.ics.uci.edu/ml/datasets/User+Knowledge+Modeling
    """
    subset = subset.lower()
    assert subset in ["all", "train",
                      "test"], "subset must match 'all', 'train' or 'test'. Your input {0}".format(subset)
    xls_path = _get_download_dir(downloads_path) / "Data_User_Modeling_Dataset_Hamdi Tolga KAHRAMAN.xls"
    if not xls_path.is_file():
        _download_file(
            "https://archive.ics.uci.edu/ml/machine-learning-databases/00257/Data_User_Modeling_Dataset_Hamdi%20Tolga%20KAHRAMAN.xls",
            xls_path)
    workbook = pd.ExcelFile(xls_path)
    use_train = subset in ("all", "train")
    use_test = subset in ("all", "test")
    if use_train:
        # Training samples are stored on the second sheet
        train_sheet = workbook.parse(1)
        labels_raw = train_sheet.pop(" UNS")
        data = train_sheet.values[:, :5]
    if use_test:
        # Test samples are stored on the third sheet
        test_sheet = workbook.parse(2)
        test_data = test_sheet.values[:, :5]
        # The test sheet spells one class 'Very Low'; rename it to 'very_low' (as in the train sheet)
        test_classes = [name.replace("Very Low", "very_low") for name in test_sheet.pop(" UNS")]
        if use_train:
            data = np.r_[data, test_data]
            labels_raw = np.r_[labels_raw, test_classes]
        else:
            data, labels_raw = test_data, test_classes
    # Encode the class names as int32 ids and make sure the features are float64
    labels = LabelEncoder().fit_transform(labels_raw).astype(np.int32)
    data = np.array(data, dtype=np.float64)
    if not return_X_y:
        return Bunch(dataset_name="UserKnowledge", data=data, target=labels)
    return data, labels


def load_breast_tissue(return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
    """
    Load the breast tissue data set. It contains 106 samples, each belonging to one of 6 classes.
    N=106, d=9, k=6.

    Parameters
    ----------
    return_X_y : bool
        If True, a (data, target) tuple is returned instead of a Bunch object (default: False)
    downloads_path : str | Path
        path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)

    Returns
    -------
    bunch : Bunch
        A Bunch object holding the data in its 'data' attribute and the labels in its 'target' attribute.
        If return_X_y is True, two arrays are returned instead:
        the data numpy array (106 x 9), the labels numpy array (106)

    References
    ----------
    http://archive.ics.uci.edu/ml/datasets/breast+tissue
    """
    xls_path = _get_download_dir(downloads_path) / "BreastTissue.xls"
    if not xls_path.is_file():
        _download_file("http://archive.ics.uci.edu/ml/machine-learning-databases/00192/BreastTissue.xls",
                       xls_path)
    # The samples are stored on the second sheet of the workbook
    sheet = pd.ExcelFile(xls_path).parse(1)
    classes = sheet.pop("Class")
    # With the class column removed, the first remaining column is skipped as well
    X = sheet.values[:, 1:]
    y = LabelEncoder().fit_transform(classes)
    if not return_X_y:
        return Bunch(dataset_name="BreastTissue", data=X, target=y)
    return X, y


def load_forest_types(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
    """
    Load the forest type mapping data set. It consists of 523 samples belonging to one of 4 classes.
    The data set is composed of 198 training and 325 test samples.
    N=523, d=27, k=4.

    The zip archive is downloaded and unpacked on first use; afterwards the extracted csv files are read.

    Parameters
    ----------
    subset : str
        can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all')
    return_X_y : bool
        If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
    downloads_path : str | Path
        path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)

    Returns
    -------
    bunch : Bunch
        A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute.
        Alternatively, if return_X_y is True two arrays will be returned:
        the data numpy array (523 x 27), the labels numpy array (523)

    References
    ----------
    https://archive.ics.uci.edu/ml/datasets/Forest+type+mapping
    """
    subset = subset.lower()
    assert subset in ["all", "train", "test"], \
        "subset must match 'all', 'train' or 'test'. Your input {0}".format(subset)
    target_dir = _get_download_dir(downloads_path) / "ForestTypes"
    archive = target_dir / "ForestTypes.zip"
    if not archive.is_file():
        # Archive not cached yet: fetch it and unpack the csv files next to it
        target_dir.mkdir(parents=False, exist_ok=True)
        _download_file("https://archive.ics.uci.edu/ml/machine-learning-databases/00333/ForestTypes.zip",
                       archive)
        with zipfile.ZipFile(archive, 'r') as zipf:
            zipf.extractall(target_dir)
    use_train = subset in ("all", "train")
    use_test = subset in ("all", "test")
    if use_train:
        train_frame = pd.read_csv(target_dir / "training.csv", delimiter=",")
        # Removing the class column leaves only the features in the frame
        labels_raw = train_frame.pop("class")
        data = train_frame.values
    if use_test:
        test_frame = pd.read_csv(target_dir / "testing.csv", delimiter=",")
        test_classes = test_frame.pop("class")
        if use_train:
            # 'all': training samples first, test samples appended
            data = np.r_[data, test_frame.values]
            labels_raw = np.r_[labels_raw, test_classes]
        else:
            data = test_frame.values
            labels_raw = test_classes
    # Encode the class names as integers
    labels = LabelEncoder().fit_transform(labels_raw)
    if return_X_y:
        return data, labels
    return Bunch(dataset_name="ForestTypes", data=data, target=labels)


def load_dermatology(return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
    """
    Load the dermatology data set. It consists of 366 samples belonging to one of 6 classes.
    8 samples contain '?' values and are therefore removed.
    N=358, d=34, k=6.

    Parameters
    ----------
    return_X_y : bool
        If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
    downloads_path : str | Path
        path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)

    Returns
    -------
    bunch : Bunch
        A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute.
        Alternatively, if return_X_y is True two arrays will be returned:
        the data numpy array (358 x 34), the labels numpy array (358)

    References
    ----------
    https://archive.ics.uci.edu/ml/datasets/dermatology
    """
    local_file = _get_download_dir(downloads_path) / "dermatology.data"
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/dermatology/dermatology.data"
    data, labels = _load_data_file(local_file, url, delimiter=",")
    # Keep only the samples without any missing (nan) feature value
    complete_rows = ~np.isnan(data).any(axis=1)
    data, labels = data[complete_rows], labels[complete_rows]
    # Labels in the file start at 1; shift them so that they start at 0
    labels -= 1
    if return_X_y:
        return data, labels
    return Bunch(dataset_name="Dermatology", data=data, target=labels)


def load_multiple_features(return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
    """
    Load the multiple features data set. It consists of 2000 samples belonging to one of 10 classes.
    Each class corresponds to handwritten numerals (0-9) extracted from a collection of Dutch utility maps.
    N=2000, d=649, k=10.

    The features are distributed over six files, which are downloaded individually (if not cached yet) and
    concatenated column-wise in a fixed order.

    Parameters
    ----------
    return_X_y : bool
        If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
    downloads_path : str | Path
        path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)

    Returns
    -------
    bunch : Bunch
        A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute.
        Alternatively, if return_X_y is True two arrays will be returned:
        the data numpy array (2000 x 649), the labels numpy array (2000)

    References
    ----------
    https://archive.ics.uci.edu/ml/datasets/Multiple+Features
    """
    directory = _get_download_dir(downloads_path) / "MultipleFeatures"
    directory.mkdir(parents=False, exist_ok=True)
    feature_sets = ["mfeat-fac", "mfeat-fou", "mfeat-kar", "mfeat-mor", "mfeat-pix", "mfeat-zer"]
    # Start with an empty (2000 x 0) block so the stacked result always has 2000 rows
    blocks = [np.zeros((2000, 0))]
    for feature_set in feature_sets:
        local_file = directory / (feature_set + ".xls")
        if not local_file.is_file():
            _download_file("https://archive.ics.uci.edu/ml/machine-learning-databases/mfeat/" + feature_set,
                           local_file)
        # delimiter=None -> columns are separated by arbitrary whitespace
        blocks.append(np.genfromtxt(local_file, delimiter=None))
    # Put the feature sets side by side
    data = np.column_stack(blocks)
    # First 200 entries correspond to '0', next 200 to '1' and so on
    labels = np.repeat(np.arange(10), 200)
    if return_X_y:
        return data, labels
    return Bunch(dataset_name="MultipleFeatures", data=data, target=labels)


def load_statlog_australian_credit_approval(return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
    """
    Load the statlog Australian Credit Approval data set. It consists of 690 samples belonging to one of 2 classes.
    N=690, d=14, k=2.

    Parameters
    ----------
    return_X_y : bool
        If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
    downloads_path : str | Path
        path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)

    Returns
    -------
    bunch : Bunch
        A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute.
        Alternatively, if return_X_y is True two arrays will be returned:
        the data numpy array (690 x 14), the labels numpy array (690)

    References
    ----------
    https://archive.ics.uci.edu/ml/datasets/statlog+(australian+credit+approval)
    """
    local_file = _get_download_dir(downloads_path) / "australian.dat"
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/australian/australian.dat"
    # The delimiter is explicitly passed as None for this file
    data, labels = _load_data_file(local_file, url, delimiter=None)
    if return_X_y:
        return data, labels
    return Bunch(dataset_name="StatlogAustralianCreditApproval", data=data, target=labels)


def load_breast_cancer_wisconsin_original(return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
    """
    Load the original breast cancer Wisconsin data set. It consists of 699 samples belonging to one of 2 classes.
    16 samples contain '?' values and will be removed.
    N=683, d=9, k=2.

    Parameters
    ----------
    return_X_y : bool
        If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
    downloads_path : str | Path
        path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)

    Returns
    -------
    bunch : Bunch
        A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute.
        Alternatively, if return_X_y is True two arrays will be returned:
        the data numpy array (683 x 9), the labels numpy array (683)

    References
    ----------
    https://archive.ics.uci.edu/ml/datasets/breast+cancer+wisconsin+%28original%29
    """
    local_file = _get_download_dir(downloads_path) / "breast-cancer-wisconsin.data"
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data"
    raw_data, raw_labels = _load_data_file(local_file, url, delimiter=",")
    # The first column holds unique sample ids and is not a feature
    features = raw_data[:, 1:]
    # Only samples without missing (nan) values are kept
    complete_rows = ~np.isnan(features).any(axis=1)
    data = features[complete_rows]
    # Labels are 2 or 4 in the file; map them to 0 or 1 and store them as int32
    labels = (raw_labels[complete_rows] / 2 - 1).astype(np.int32)
    if return_X_y:
        return data, labels
    return Bunch(dataset_name="BreastCancerWisconsin", data=data, target=labels)


def load_optdigits(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
    """
    Load the optdigits data set. It consists of 5620 8x8 grayscale images, each representing a digit (0 to 9).
    Each pixel depicts the number of marked pixel within a 4x4 block of the original 32x32 bitmaps.
    The data set is composed of 3823 training and 1797 test samples.
    N=5620, d=64, k=10.

    Parameters
    ----------
    subset : str
        can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all')
    return_X_y : bool
        If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
    downloads_path : str | Path
        path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)

    Returns
    -------
    bunch : Bunch
        A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute.
        Furthermore, the original images are contained in the 'images' attribute.
        Alternatively, if return_X_y is True two arrays will be returned:
        the data numpy array (5620 x 64), the labels numpy array (5620)

    References
    ----------
    http://archive.ics.uci.edu/ml/datasets/optical+recognition+of+handwritten+digits
    """
    subset = subset.lower()
    assert subset in ["all", "train", "test"], \
        "subset must match 'all', 'train' or 'test'. Your input {0}".format(subset)
    use_train = subset in ("all", "train")
    use_test = subset in ("all", "test")
    if use_train:
        train_file = _get_download_dir(downloads_path) / "optdigits.tra"
        data, labels = _load_data_file(
            train_file, "https://archive.ics.uci.edu/ml/machine-learning-databases/optdigits/optdigits.tra")
    if use_test:
        test_file = _get_download_dir(downloads_path) / "optdigits.tes"
        test_data, test_labels = _load_data_file(
            test_file, "https://archive.ics.uci.edu/ml/machine-learning-databases/optdigits/optdigits.tes")
        if use_train:
            # 'all': training samples first, test samples appended
            data = np.r_[data, test_data]
            labels = np.r_[labels, test_labels]
        else:
            data, labels = test_data, test_labels
    if return_X_y:
        return data, labels
    # Each sample of 64 values is additionally provided as an 8x8 image
    images = data.reshape((-1, 8, 8))
    return Bunch(dataset_name="Optdigits", data=data, target=labels, images=images, image_format="HW")


def load_semeion(return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
    """
    Load the semeion data set. It consists of 1593 samples belonging to one of 10 classes.
    Each sample corresponds to a grayscale 16x16 scan of handwritten digits originating from about 80 different persons.
    Further, each pixel was converted to a boolean value using a fixed threshold.
    N=1593, d=256, k=10.

    Parameters
    ----------
    return_X_y : bool
        If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
    downloads_path : str | Path
        path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)

    Returns
    -------
    bunch : Bunch
        A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute.
        Furthermore, the original images are contained in the 'images' attribute.
        Alternatively, if return_X_y is True two arrays will be returned:
        the data numpy array (1593 x 256), the labels numpy array (1593)

    References
    ----------
    https://archive.ics.uci.edu/ml/datasets/semeion+handwritten+digit
    """
    local_file = _get_download_dir(downloads_path) / "semeion.data"
    if not local_file.is_file():
        _download_file("https://archive.ics.uci.edu/ml/machine-learning-databases/semeion/semeion.data",
                       local_file)
    raw = np.genfromtxt(local_file)
    # The last ten columns are a one-hot encoding of the digit, everything before are the pixels
    data = raw[:, :-10]
    one_hot = raw[:, -10:]
    # Labels start as 0, so only the indicator columns of the digits 1 to 9 have to be checked
    labels = np.zeros(data.shape[0], dtype=np.int32)
    for digit in range(1, 10):
        labels[one_hot[:, digit] == 1] = digit
    if return_X_y:
        return data, labels
    # Each sample of 256 values is additionally provided as a 16x16 image
    images = data.reshape((-1, 16, 16))
    return Bunch(dataset_name="Semeion", data=data, target=labels, images=images, image_format="HW")


def load_cmu_faces(return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
    """
    Load the CMU Face Images data set. It consists of 640 30x32 grayscale images showing 20 persons in different poses
    (up, straight, left, right) and with different expressions (neutral, happy, sad, angry). Additionally, the persons
    can wear sunglasses or not.
    16 images show glitches which is why the final data set only contains 624 images.
    N=624, d=960, k=[20,4,4,2].

    The samples are ordered by person (in the order of the 'classes' entry) and, within each person, by file name,
    so the returned order does not depend on the file system.

    Parameters
    ----------
    return_X_y : bool
        If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
    downloads_path : str | Path
        path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)

    Returns
    -------
    bunch : Bunch
        A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute.
        Furthermore, the original images are contained in the 'images' attribute.
        The label columns are (person, pose, expression, eyes); the matching names are stored in the 'classes' attribute.
        Alternatively, if return_X_y is True two arrays will be returned:
        the data numpy array (624 x 960), the labels numpy array (624 x 4)

    References
    ----------
    http://archive.ics.uci.edu/ml/datasets/cmu+face+images
    """
    directory = _get_download_dir(downloads_path) / "cmufaces"
    filename = directory / "faces_4.tar.gz"
    if not filename.is_file():
        directory.mkdir(parents=False, exist_ok=True)
        _download_file("http://archive.ics.uci.edu/ml/machine-learning-databases/faces-mld/faces_4.tar.gz",
                       filename)
        # Unpack tar archive
        with tarfile.open(filename, "r:gz") as tar:
            # The archive is downloaded content: use the 'data' extraction filter where the interpreter
            # supports it, so members can not be written outside of the target directory
            if hasattr(tarfile, "data_filter"):
                tar.extractall(directory, filter="data")
            else:
                tar.extractall(directory)
    names = np.array(
        ["an2i", "at33", "boland", "bpm", "ch4f", "cheyer", "choon", "danieln", "glickman", "karyadi", "kawamura",
         "kk49", "megak", "mitchell", "night", "phoebe", "saavik", "steffi", "sz24", "tammo"])
    positions = np.array(["straight", "left", "right", "up"])
    expressions = np.array(["neutral", "happy", "sad", "angry"])
    eyes = np.array(["open", "sunglasses"])
    data_list = []
    label_list = []
    for name in names:
        path_images = directory / "faces_4" / name
        # iterdir() yields entries in arbitrary, file system dependent order -> sort for a reproducible data set
        for image in sorted(path_images.iterdir()):
            image_str = image.name
            # Only files ending with '_4.pgm' are used; everything else in the directory is skipped
            if not image_str.endswith("_4.pgm"):
                continue
            # get image data
            image_array = _load_image_data(image, None, False)
            # Get labels. The file name encodes <person>_<pose>_<expression>_<eyes>_4.pgm
            name_parts = image_str.split("_")
            user_id = np.argwhere(names == name_parts[0])[0][0]
            position = np.argwhere(positions == name_parts[1])[0][0]
            expression = np.argwhere(expressions == name_parts[2])[0][0]
            eye = np.argwhere(eyes == name_parts[3])[0][0]
            label_data = np.array([user_id, position, expression, eye])
            # Save data and labels
            data_list.append(image_array)
            label_list.append(label_data)
    labels = np.array(label_list, dtype=np.int32)
    data_image = np.array(data_list)
    # Flatten data
    data_flatten = flatten_images(data_image, "HW")
    # Return values
    if return_X_y:
        return data_flatten, labels
    else:
        return Bunch(dataset_name="CMUFace", data=data_flatten, target=labels, images=data_image, image_format="HW",
                     classes=(names, positions, expressions, eyes))


def load_gene_expression_cancer_rna_seq(return_X_y: bool = False, downloads_path: str | Path = None):
    """
    Load the Gene Expression Cancer RNA-SEQ data set. It consists of 801 samples belonging to one of 5 classes.
    N=801, d=20531, k=5.

    The data is shipped as a zip file that contains a tar.gz archive. Both are unpacked on first use.

    Parameters
    ----------
    return_X_y : bool
        If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
    downloads_path : str | Path
        path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)

    Returns
    -------
    bunch : Bunch
        A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute.
        Alternatively, if return_X_y is True two arrays will be returned:
        the data numpy array (801 x 20531), the labels numpy array (801)

    References
    ----------
    https://archive.ics.uci.edu/dataset/401/gene+expression+cancer+rna+seq
    """
    directory = _get_download_dir(downloads_path) / "GeneExpressionRNASEQ"
    filename = directory / "gene+expression+cancer+rna+seq.zip"
    if not filename.is_file():
        directory.mkdir(parents=False, exist_ok=True)
        _download_file("https://archive.ics.uci.edu/static/public/401/gene+expression+cancer+rna+seq.zip",
                       filename)
        # Unpack zipfile
        with zipfile.ZipFile(filename, 'r') as zipf:
            zipf.extractall(directory)
        # Unpack the tar archive that is contained in the zipfile
        with tarfile.open(directory / "TCGA-PANCAN-HiSeq-801x20531.tar.gz", "r:gz") as tar:
            # The archive is downloaded content: use the 'data' extraction filter where the interpreter
            # supports it, so members can not be written outside of the target directory
            if hasattr(tarfile, "data_filter"):
                tar.extractall(directory, filter="data")
            else:
                tar.extractall(directory)
    extracted_directory = directory / "TCGA-PANCAN-HiSeq-801x20531"
    # Load data and labels. The first row (header) and the first column of both files are cut off
    data = np.genfromtxt(extracted_directory / "data.csv", delimiter=",")[1:, 1:]
    labels_raw = np.genfromtxt(extracted_directory / "labels.csv", delimiter=",", dtype=str)[1:, 1]
    # Transform the class names into integer labels
    LE = LabelEncoder()
    labels = LE.fit_transform(labels_raw)
    # Return values
    if return_X_y:
        return data, labels
    else:
        return Bunch(dataset_name="GeneExpressionCancerRNA-SEQ", data=data, target=labels)


def load_sport_articles(return_X_y: bool = False, downloads_path: str | Path = None):
    """
    Load the Sport Articles data set. It consists of 1000 samples belonging to one of 2 classes (objective or subjective).
    We only consider features that correspond to specific frequencies and, therefore, ignore the attributes
    totalWordsCount, sentence1st, sentencelast and txtcomplexity.
    N=1000, d=55, k=2.

    The features file is scanned line by line as text, because it can not be read by Pandas.

    Parameters
    ----------
    return_X_y : bool
        If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
    downloads_path : str | Path
        path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)

    Returns
    -------
    bunch : Bunch
        A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute
        (0 = objective, 1 = subjective).
        Alternatively, if return_X_y is True two arrays will be returned:
        the data numpy array (1000 x 55), the labels numpy array (1000)

    References
    ----------
    https://archive.ics.uci.edu/dataset/450/sports+articles+for+objectivity+analysis
    """
    directory = _get_download_dir(downloads_path) / "SportArticles"
    archive = directory / "sports+articles+for+objectivity+analysis.zip"
    if not archive.is_file():
        directory.mkdir(parents=False, exist_ok=True)
        _download_file("https://archive.ics.uci.edu/static/public/450/sports+articles+for+objectivity+analysis.zip",
                       archive)
        # Unpack zipfile
        with zipfile.ZipFile(archive, 'r') as zipf:
            zipf.extractall(directory)
    data = np.zeros((1000, 55), dtype=int)
    labels = np.zeros(1000, dtype=np.int32)
    # Starts at -2 because the first row is the header: the header row gets index -1, the first sample index 0
    row = -2
    column = 0
    with open(directory / "features.xls", "r") as f:
        for line in f:
            if "</Table>" in line:
                # Everything after the first table is not relevant for the data
                break
            if "<Row ss" in line:
                # A new row starts -> cells are counted from zero again
                row += 1
                column = 0
            if row < 0 or "<Cell>" not in line:
                # Either still in front of the first sample or not a cell line
                continue
            if column == 2:
                # Third cell of a row contains the class
                assert "objective" in line or "subjective" in line
                labels[row] = 0 if "objective" in line else 1
            if 3 < column < 59:
                # Cells 4 to 58 hold the 55 numeric features
                data[row, column - 4] = int(line.split('"Number">')[1].split('</Data>')[0])
            column += 1
    if return_X_y:
        return data, labels
    return Bunch(dataset_name="SportArticles", data=data, target=labels)


def load_wholesale_customers(return_X_y: bool = False, downloads_path: str | Path = None):
    """
    Load the Wholesale Customers data set. It consists of 440 samples and can be grouped in two different ways:
    Either two classes based on the channel (Horeca or Retail) or three classes based on the region (Lisbon, Oporto or Other region).
    N=440, d=6, k=[2, 3].

    Parameters
    ----------
    return_X_y : bool
        If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
    downloads_path : str | Path
        path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)

    Returns
    -------
    bunch : Bunch
        A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute.
        Alternatively, if return_X_y is True two arrays will be returned:
        the data numpy array (440 x 6), the labels numpy array (440 x 2)

    References
    ----------
    https://archive.ics.uci.edu/dataset/292/wholesale+customers
    """
    directory = _get_download_dir(downloads_path) / "WholeCustomers"
    archive = directory / "wholesale+customers.zip"
    if not archive.is_file():
        directory.mkdir(parents=False, exist_ok=True)
        _download_file("https://archive.ics.uci.edu/static/public/292/wholesale+customers.zip",
                       archive)
        # Unpack zipfile
        with zipfile.ZipFile(archive, 'r') as zipf:
            zipf.extractall(directory)
    # The header line of the csv file is skipped
    table = np.genfromtxt(directory / "Wholesale customers data.csv", delimiter=",", skip_header=1)
    # The first two columns are the two label sets (shifted by one to start at 0 and stored as int32),
    # the remaining columns are the features
    labels = (table[:, :2] - 1).astype(np.int32)
    data = table[:, 2:]
    if return_X_y:
        return data, labels
    return Bunch(dataset_name="WholesaleCustomers", data=data, target=labels)


[docs]def load_reuters21578(subset: str = "all", categories: tuple = ("grain", "money-fx", "earn", "acq", "crude"),
               use_tfidf: bool = True, use_stemming: bool = True, use_stop_words: bool = True, max_df: float | int = 1., 
               min_df: float | int = 1, max_features: int = 2000, min_variance : float = 0., 
               sublinear_tf: bool = False, return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
    """
    Load the Reuters21578 data set. It consists of 21578 Reuters newswire artices divided into different categories.
    When loading the artices, the title will be included in the text.
    The data is preprocessed by only considering articles with a single category.
    Furthermore, the documents are usually converted into feature vectors using tf-idf.
    Note that two different train-test splits are available: Lewis and cgi. The default is Lewis.
    For the Lewis split, the data set is composed of 5791 training and 2300 instances (default settings).
    For the cgi split, the data set is composed of 8091 training and 276 instances.
    N=8367, d=2000, k=5 using the default settings.

    Parameters
    ----------
    subset : str
        can be 'all', 'test', 'train', 'test-cgi' or 'train-cgi'. 'all' combines test and train data (default: 'all')
    categories : tuple
        specify the categories. Can be None if all categories should be used (default: ("grain", "money-fx", "earn", "acq", "crude"))
    use_tfidf : bool
        If true, tf-idf will be applied as the last step of the pipeline (default: True)
    use_stemming : bool
        If true, the SnowballStemmer from nltk will be used when creating the count matrix (default: True)
    use_stop_words : bool
        If true, the list of English stopwords from sklearn CountVectorizer will be used (default: True)
    max_df : float | int
        Ignore words that have a document frequency strictly higher than max_df. 
        If float, the parameter represents a proportion of documents, integer corresponds to absolute counts (see sklearn CountVectorizer) (default: 1.0)
    min_df : float | int
        Ignore words that have a document frequency strictly lower than min_df.
        If float, the parameter represents a proportion of documents, integer corresponds to absolute counts (see sklearn CountVectorizer) (default: 1)
    max_features : int
        If not None, the resulting count matric will ony contain the top max_features ordered by term frequency across the corpus (see sklearn CountVectorizer).
        Note that this value could be further reduced if min_variance is smaller than one (default: 2000)
    min_variance : float
        Features with a variance lower than min_variance will be removed (see sklearn VarianceThreshold). 
        The default is to keep all features with non-zero variance, i.e. remove only the features that have the same value in all samples (default: 0.)
    sublinear_tf : bool
        Apply sublinear term frequency scaling, i.e. replace tf with 1 + log(tf) (see sklearn TfidfTransformer) (default: False)
    return_X_y : bool
        If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
    downloads_path : str | Path
        path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)

    Returns
    -------
    bunch : Bunch
        A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute.
        Alternatively, if return_X_y is True two arrays will be returned:
        the data numpy array (8367 x 2000 - using the default settings), the labels numpy array (8367 - using the default settings)

    References
    -------
    https://archive.ics.uci.edu/dataset/137/reuters+21578+text+categorization+collection
    """
    subset = subset.lower()
    assert subset in ["all", "train",
                      "test", "test-cgi", "train-cgi"], "subset must match 'all', 'train', 'test', 'train-cgi' or 'test-cgi'. Your input {0}".format(subset)
    # Check if data is already downloaded
    directory = _get_download_dir(downloads_path) / "Reuters21578"
    filename = directory / "reuters+21578+text+categorization+collection.zip"
    if not filename.is_file():
        directory.mkdir(parents=False, exist_ok=True)
        _download_file("https://archive.ics.uci.edu/static/public/137/reuters+21578+text+categorization+collection.zip",
                       filename)
        # Unpack zipfile
        with zipfile.ZipFile(filename, 'r') as zipf:
            zipf.extractall(directory)
        with tarfile.open(directory / "reuters21578.tar.gz", "r:gz") as tar:
            tar.extractall(directory)
    # Load actual articles into arrays
    all_topics = []
    all_bodies = []
    all_lewis_splits = []
    all_cgi_splits = []
    for file in directory.iterdir():
        if file.suffix == ".sgm":
            in_body = False
            with open(file, "rb") as f:
                for line in f.readlines():
                    # Needed so that reut2-017.sgm is not crashing due to encoding
                    line = line.decode('utf-8','ignore')
                    # New entry starts
                    if line.startswith("<REUTERS"):
                        in_body = False
                        body = ""
                        topics = []
                        lewis_split = line.split("LEWISSPLIT=\"")[1].split("\" CGISPLIT=")[0]
                        cgi_split = line.split("CGISPLIT=\"")[1].split("\" OLDID=")[0]
                        if "CSECS" in cgi_split:
                            # 4 entries have an additional CSECS tag that should be removed
                            cgi_split = cgi_split.split("\" CSECS=")[0]
                        text_id = line.split("NEWID=\"")[1].split("\">")[0]
                    # Extract topics
                    if line.startswith("<TOPICS>"):
                        topics_remaining = line
                        while "<D>" in topics_remaining:
                            topics_remaining = topics_remaining.split("<D>", 1)[1]
                            topics.append(topics_remaining.split("</D>")[0])
                    if line.startswith("<TEXT TYPE=\"UNPROC\""):
                        in_body = True
                    # Add title to text/body
                    if "<TITLE>" in line:
                        body += line.split("<TITLE>")[1].split("</TITLE>")[0] + ". "
                    # New body starts. All following lines are part of the body
                    if "<BODY>" in line:
                        body += line.split("<BODY>")[1].replace("\n", " ")
                        in_body = True
                    elif in_body and line != " Reuter\n":
                        # Check if body part is ending
                        if "</BODY>" in line or "</TEXT>" in line:
                            in_body = False
                        else:
                            body += line.replace("\n", " ")
                    # Entry ends
                    if line.startswith("</REUTERS>"):
                        assert len(body) > 1, f"body empty in {file}, text id = {text_id}"
                        assert len(lewis_split) > 1, f"lewis split empty in {file}, text id = {text_id}"
                        assert len(cgi_split) > 1, f"cgi split empty in {file}, text id = {text_id}"
                        all_bodies.append(body)
                        all_lewis_splits.append(lewis_split)
                        all_cgi_splits.append(cgi_split)
                        all_topics.append(topics)
    assert len(all_bodies) == 21578, "number of articles is not correct. Should be 21578 but is {0}".format(len(all_bodies))
    # Filter documents to receive only articles with a single relevant category
    for i in range(len(all_topics)-1, -1, -1):
        hits = 0
        new_topic = None
        for t in all_topics[i]:
            if categories is None or t in categories:
                hits += 1
                new_topic = t
        if hits != 1:
            del all_bodies[i]
            del all_lewis_splits[i]
            del all_cgi_splits[i]
            del all_topics[i]
        else:
            all_topics[i] = new_topic
    # Transform raw data
    data, vocabulary = _transform_text_data(all_bodies, use_tfidf, use_stemming, use_stop_words, max_df, min_df, max_features, min_variance, 
                                sublinear_tf)
    # Get labels
    LE = LabelEncoder()
    labels = LE.fit_transform(all_topics)
    # Select subset
    if subset != "all":
        if subset == "train":
            relevant = np.array(all_lewis_splits) == "TRAIN"
        elif subset == "test":
            relevant = np.array(all_lewis_splits) == "TEST"
        elif subset == "train-cgi":
            relevant = np.array(all_cgi_splits) == "TRAINING-SET"
        elif subset == "test-cgi":
            relevant = np.array(all_cgi_splits) == "PUBLISHED-TESTSET"
        data = data[relevant]
        labels = labels[relevant]
    # Return values
    if return_X_y:
        return data, labels
    else:
        return Bunch(dataset_name="Reuters21578", data=data, target=labels, classes=categories, columns=vocabulary)