Source code for clustpy.data.real_uci_data

from clustpy.data._utils import _download_file, _get_download_dir, _decompress_z_file, _load_data_file, flatten_images, _transform_text_data, _load_image_data
import numpy as np
import zipfile
import tarfile
from sklearn.preprocessing import LabelEncoder
import pandas as pd
from sklearn.datasets._base import Bunch
from pathlib import Path


def load_banknotes(return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
    """
    Load the banknote authentication data set.

    The data set contains 1372 samples of genuine and forged banknotes.
    N=1372, d=4, k=2.

    Parameters
    ----------
    return_X_y : bool
        If True, a (data, target) tuple is returned instead of a Bunch object (default: False)
    downloads_path : str | Path
        path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)

    Returns
    -------
    bunch : Bunch
        A Bunch object holding the data in the 'data' attribute and the labels in the 'target' attribute.
        If return_X_y is True, two arrays are returned instead:
        the data numpy array (1372 x 4), the labels numpy array (1372)

    References
    ----------
    https://archive.ics.uci.edu/ml/datasets/banknote+authentication
    """
    local_file = _get_download_dir(downloads_path) / "data_banknote_authentication.txt"
    source_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00267/data_banknote_authentication.txt"
    X, y = _load_data_file(local_file, source_url)
    if return_X_y:
        return X, y
    return Bunch(dataset_name="Banknotes", data=X, target=y)
def load_spambase(return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
    """
    Load the spambase data set.

    The data set contains 4601 mails that are either spam or non-spam.
    N=4601, d=57, k=2.

    Parameters
    ----------
    return_X_y : bool
        If True, a (data, target) tuple is returned instead of a Bunch object (default: False)
    downloads_path : str | Path
        path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)

    Returns
    -------
    bunch : Bunch
        A Bunch object holding the data in the 'data' attribute and the labels in the 'target' attribute.
        If return_X_y is True, two arrays are returned instead:
        the data numpy array (4601 x 57), the labels numpy array (4601)

    References
    ----------
    https://archive.ics.uci.edu/ml/datasets/spambase
    """
    local_file = _get_download_dir(downloads_path) / "spambase.data"
    source_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data"
    X, y = _load_data_file(local_file, source_url)
    if return_X_y:
        return X, y
    return Bunch(dataset_name="Spambase", data=X, target=y)
def load_seeds(return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
    """
    Load the seeds data set.

    The data set contains 210 samples, each belonging to one of three varieties of wheat.
    N=210, d=7, k=3.

    Parameters
    ----------
    return_X_y : bool
        If True, a (data, target) tuple is returned instead of a Bunch object (default: False)
    downloads_path : str | Path
        path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)

    Returns
    -------
    bunch : Bunch
        A Bunch object holding the data in the 'data' attribute and the labels in the 'target' attribute.
        If return_X_y is True, two arrays are returned instead:
        the data numpy array (210 x 7), the labels numpy array (210)

    References
    ----------
    https://archive.ics.uci.edu/ml/datasets/seeds
    """
    local_file = _get_download_dir(downloads_path) / "seeds_dataset.txt"
    source_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00236/seeds_dataset.txt"
    X, y = _load_data_file(local_file, source_url, delimiter=None)
    # The file numbers its classes from 1; shift them so that they start at 0
    y -= 1
    if return_X_y:
        return X, y
    return Bunch(dataset_name="Seeds", data=X, target=y)
def load_skin(return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
    """
    Load the Skin Segmentation data set.

    The data set contains 245057 skin and non-skin samples described by their B, G, R color information.
    N=245057, d=3, k=2.

    Parameters
    ----------
    return_X_y : bool
        If True, a (data, target) tuple is returned instead of a Bunch object (default: False)
    downloads_path : str | Path
        path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)

    Returns
    -------
    bunch : Bunch
        A Bunch object holding the data in the 'data' attribute and the labels in the 'target' attribute.
        If return_X_y is True, two arrays are returned instead:
        the data numpy array (245057 x 3), the labels numpy array (245057)

    References
    ----------
    https://archive.ics.uci.edu/ml/datasets/skin+segmentation
    """
    local_file = _get_download_dir(downloads_path) / "Skin_NonSkin.txt"
    source_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00229/Skin_NonSkin.txt"
    X, y = _load_data_file(local_file, source_url, delimiter=None)
    # The file numbers its classes from 1; shift them so that they start at 0
    y -= 1
    if return_X_y:
        return X, y
    return Bunch(dataset_name="SkinSegmentation", data=X, target=y)
def load_soybean_small(return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
    """
    Load the small version of the soybean data set.

    It is a small subset of the original soybean data set and contains 47 samples belonging to one of 4 classes.
    N=47, d=35, k=4.

    Parameters
    ----------
    return_X_y : bool
        If True, a (data, target) tuple is returned instead of a Bunch object (default: False)
    downloads_path : str | Path
        path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)

    Returns
    -------
    bunch : Bunch
        A Bunch object holding the data in the 'data' attribute and the labels in the 'target' attribute.
        If return_X_y is True, two arrays are returned instead:
        the data numpy array (47 x 35), the labels numpy array (47)

    References
    ----------
    https://archive.ics.uci.edu/ml/datasets/soybean+(small)
    """
    local_file = _get_download_dir(downloads_path) / "soybean-small.data"
    if not local_file.is_file():
        _download_file("https://archive.ics.uci.edu/ml/machine-learning-databases/soybean/soybean-small.data",
                       local_file)
    # The last column holds the class name, all preceding columns are features
    frame = pd.read_csv(local_file, delimiter=",", header=None)
    class_names = frame.iloc[:, -1]
    X = frame.iloc[:, :-1].values
    y = LabelEncoder().fit_transform(class_names)
    if return_X_y:
        return X, y
    return Bunch(dataset_name="SoybeanSmall", data=X, target=y)
def load_soybean_large(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
    """
    Load the large version of the soybean data set.

    After cleaning, it contains 562 samples belonging to one of 15 classes. The raw files describe 19 classes,
    but every sample with a '?' attribute value is dropped.
    The data set is composed of 266 training and 296 test samples.
    N=562, d=35, k=15.

    Parameters
    ----------
    subset : str
        can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all')
    return_X_y : bool
        If True, a (data, target) tuple is returned instead of a Bunch object (default: False)
    downloads_path : str | Path
        path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)

    Returns
    -------
    bunch : Bunch
        A Bunch object holding the data in the 'data' attribute and the labels in the 'target' attribute.
        If return_X_y is True, two arrays are returned instead:
        the data numpy array (562 x 35), the labels numpy array (562)

    References
    ----------
    https://archive.ics.uci.edu/ml/datasets/soybean+(Large)
    """
    subset = subset.lower()
    assert subset in ("all", "train", "test"), \
        "subset must match 'all', 'train' or 'test'. Your input {0}".format(subset)

    def _read_part(file_name, url):
        # Download the file if necessary, drop all rows containing '?' and split off the class column (column 0)
        part_file = _get_download_dir(downloads_path) / file_name
        if not part_file.is_file():
            _download_file(url, part_file)
        frame = pd.read_csv(part_file, delimiter=",", header=None)
        frame = frame[(frame != '?').all(axis=1)]
        part_labels = frame.pop(0)
        return frame.values, part_labels

    train_args = ("soybean-large.data",
                  "https://archive.ics.uci.edu/ml/machine-learning-databases/soybean/soybean-large.data")
    test_args = ("soybean-large.test",
                 "https://archive.ics.uci.edu/ml/machine-learning-databases/soybean/soybean-large.test")
    if subset == "train":
        X, class_names = _read_part(*train_args)
    elif subset == "test":
        X, class_names = _read_part(*test_args)
    else:
        X_train, names_train = _read_part(*train_args)
        X_test, names_test = _read_part(*test_args)
        X = np.r_[X_train, X_test]
        class_names = np.r_[names_train, names_test]
    # The remaining attribute values are numeric strings / objects -> convert to an integer array
    X = np.array(X, dtype=int)
    y = LabelEncoder().fit_transform(class_names)
    if return_X_y:
        return X, y
    return Bunch(dataset_name="SoybeanLarge", data=X, target=y)
def load_pendigits(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
    """
    Load the pendigits data set.

    It contains 10992 vectors of length 16, representing 8 coordinates. The coordinates were recorded while
    digits (0 to 9) were written on a tablet.
    The data set is composed of 7494 training and 3498 test samples.
    N=10992, d=16, k=10.

    Parameters
    ----------
    subset : str
        can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all')
    return_X_y : bool
        If True, a (data, target) tuple is returned instead of a Bunch object (default: False)
    downloads_path : str | Path
        path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)

    Returns
    -------
    bunch : Bunch
        A Bunch object holding the data in the 'data' attribute and the labels in the 'target' attribute.
        If return_X_y is True, two arrays are returned instead:
        the data numpy array (10992 x 16), the labels numpy array (10992)

    References
    ----------
    http://archive.ics.uci.edu/ml/datasets/pen-based+recognition+of+handwritten+digits
    """
    subset = subset.lower()
    assert subset in ("all", "train", "test"), \
        "subset must match 'all', 'train' or 'test'. Your input {0}".format(subset)

    def _train_part():
        train_file = _get_download_dir(downloads_path) / "pendigits.tra"
        return _load_data_file(train_file,
                               "https://archive.ics.uci.edu/ml/machine-learning-databases/pendigits/pendigits.tra")

    def _test_part():
        test_file = _get_download_dir(downloads_path) / "pendigits.tes"
        return _load_data_file(test_file,
                               "https://archive.ics.uci.edu/ml/machine-learning-databases/pendigits/pendigits.tes")

    if subset == "train":
        X, y = _train_part()
    elif subset == "test":
        X, y = _test_part()
    else:
        # 'all': training samples first, test samples appended afterwards
        X_train, y_train = _train_part()
        X_test, y_test = _test_part()
        X = np.r_[X_train, X_test]
        y = np.r_[y_train, y_test]
    if return_X_y:
        return X, y
    return Bunch(dataset_name="Pendigits", data=X, target=y)
def load_ecoli(ignore_small_clusters: bool = False, return_X_y: bool = False,
               downloads_path: str | Path = None) -> Bunch:
    """
    Load the ecoli data set.

    It consists of 336 samples belonging to one of 8 classes.
    N=336, d=7, k=8.

    Parameters
    ----------
    ignore_small_clusters : bool
        specify if the three small clusters with size 2, 2 and 5 should be ignored (default: False)
    return_X_y : bool
        If True, returns (data, target) instead of a Bunch object (default: False)
    downloads_path : str | Path
        path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)

    Returns
    -------
    bunch : Bunch
        A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute.
        Alternatively, if return_X_y is True two arrays will be returned:
        the data numpy array (336 x 7), the labels numpy array (336)

    References
    ----------
    https://archive.ics.uci.edu/ml/datasets/ecoli
    """
    filename = _get_download_dir(downloads_path) / "ecoli.data"
    if not filename.is_file():
        _download_file("https://archive.ics.uci.edu/ml/machine-learning-databases/ecoli/ecoli.data", filename)
    # Each line is: <sequence name> <feature values ...> <class name>.
    # Rows are collected in lists so that the loader does not depend on a hard-coded number of samples and
    # blank lines (e.g. a trailing newline at the end of the file) are skipped instead of raising an error.
    rows = []
    labels_raw = []
    with open(filename, "r") as f:
        for line in f:
            fields = line.split()
            if not fields:
                continue
            rows.append([float(value) for value in fields[1:-1]])
            labels_raw.append(fields[-1])
    data = np.array(rows, dtype=np.float64)
    if ignore_small_clusters:
        # Optional: Remove the three small clusters consisting of only 2, 2 and 5 samples
        keep = np.array([label not in ("imL", "imS", "omL") for label in labels_raw], dtype=bool)
        data = data[keep]
        labels_raw = [label for label, keep_label in zip(labels_raw, keep) if keep_label]
    labels = LabelEncoder().fit_transform(labels_raw)
    # Convert labels to int32 format
    labels = labels.astype(np.int32)
    # Return values
    if return_X_y:
        return data, labels
    return Bunch(dataset_name="Ecoli", data=data, target=labels)
def load_htru2(return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
    """
    Load the HTRU2 data set.

    It consists of 17898 samples belonging to the pulsar or non-pulsar class.
    A special property is that more than 90% of the data belongs to class 0.
    N=17898, d=8, k=2.

    Parameters
    ----------
    return_X_y : bool
        If True, returns (data, target) instead of a Bunch object (default: False)
    downloads_path : str | Path
        path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)

    Returns
    -------
    bunch : Bunch
        A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute.
        Alternatively, if return_X_y is True two arrays will be returned:
        the data numpy array (17898 x 8), the labels numpy array (17898)

    References
    ----------
    https://archive.ics.uci.edu/ml/datasets/HTRU2
    """
    directory = _get_download_dir(downloads_path) / "htru2"
    filename = directory / "HTRU2.zip"
    csv_file = directory / "HTRU_2.csv"
    freshly_downloaded = False
    if not filename.is_file():
        directory.mkdir(parents=False, exist_ok=True)
        _download_file("https://archive.ics.uci.edu/ml/machine-learning-databases/00372/HTRU2.zip", filename)
        freshly_downloaded = True
    # Unpack zipfile. Also done if the archive is already present but the extracted csv file is missing
    # (e.g. because an earlier extraction was interrupted), so that the loader can recover on its own.
    if freshly_downloaded or not csv_file.is_file():
        with zipfile.ZipFile(filename, 'r') as zipf:
            zipf.extractall(directory)
    # Load data and labels (last column contains the class)
    dataset = np.genfromtxt(csv_file, delimiter=",")
    data = dataset[:, :-1]
    # Convert labels to int32 format
    labels = dataset[:, -1].astype(np.int32)
    # Return values
    if return_X_y:
        return data, labels
    return Bunch(dataset_name="HTRU2", data=data, target=labels)
def load_letterrecognition(return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
    """
    Load the Letter Recognition data set.

    It contains 20000 samples, each representing one of the 26 capital letters of the English alphabet.
    Every sample is composed of 16 numerical stimuli describing the respective letter.
    N=20000, d=16, k=26.

    Parameters
    ----------
    return_X_y : bool
        If True, a (data, target) tuple is returned instead of a Bunch object (default: False)
    downloads_path : str | Path
        path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)

    Returns
    -------
    bunch : Bunch
        A Bunch object holding the data in the 'data' attribute and the labels in the 'target' attribute.
        If return_X_y is True, two arrays are returned instead:
        the data numpy array (20000 x 16), the labels numpy array (20000)

    References
    ----------
    https://archive.ics.uci.edu/ml/datasets/letter+recognition
    """
    local_file = _get_download_dir(downloads_path) / "letter-recognition.data"
    if not local_file.is_file():
        _download_file(
            "https://archive.ics.uci.edu/ml/machine-learning-databases/letter-recognition/letter-recognition.data",
            local_file)
    # Translation table: 'A' -> "0", 'B' -> "1", ..., 'Z' -> "25"
    letter_to_index = {ord(letter): str(index) for index, letter in enumerate("ABCDEFGHIJKLMNOPQRSTUVWXYZ")}
    with open(local_file, "r") as f:
        raw_text = f.read()
    # Join all lines into one comma-separated sequence and replace each letter by its class index
    numeric_text = raw_text.replace("\n", ",").translate(letter_to_index)
    # Every record consists of 17 values: the class followed by 16 features
    table = np.fromstring(numeric_text, sep=",").reshape(-1, 17)
    X = table[:, 1:]
    y = table[:, 0].astype(np.int32)
    if return_X_y:
        return X, y
    return Bunch(dataset_name="Letterrecognition", data=X, target=y)
def load_har(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
    """
    Load the Human Activity Recognition data set.

    It consists of 10299 samples each representing sensor data of a person performing an activity.
    The six activities are walking, walking_upstairs, walking_downstairs, sitting, standing and laying.
    The data set is composed of 7352 training and 2947 test samples.
    N=10299, d=561, k=6.

    Parameters
    ----------
    subset : str
        can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all')
    return_X_y : bool
        If True, returns (data, target) instead of a Bunch object (default: False)
    downloads_path : str | Path
        path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)

    Returns
    -------
    bunch : Bunch
        A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute.
        Alternatively, if return_X_y is True two arrays will be returned:
        the data numpy array (10299 x 561), the labels numpy array (10299)

    References
    ----------
    https://archive.ics.uci.edu/ml/datasets/human+activity+recognition+using+smartphones
    """
    subset = subset.lower()
    assert subset in ["all", "train", "test"], \
        "subset must match 'all', 'train' or 'test'. Your input {0}".format(subset)
    directory = _get_download_dir(downloads_path) / "har"
    filename = directory / "UCI HAR Dataset.zip"
    train_files = (directory / "UCI HAR Dataset/train/X_train.txt", directory / "UCI HAR Dataset/train/y_train.txt")
    test_files = (directory / "UCI HAR Dataset/test/X_test.txt", directory / "UCI HAR Dataset/test/y_test.txt")
    freshly_downloaded = False
    if not filename.is_file():
        directory.mkdir(parents=False, exist_ok=True)
        _download_file("https://archive.ics.uci.edu/ml/machine-learning-databases/00240/UCI%20HAR%20Dataset.zip",
                       filename)
        freshly_downloaded = True
    # Unpack zipfile. Also done if the archive is already present but extracted files are missing
    # (e.g. because an earlier extraction was interrupted), so that the loader can recover on its own.
    if freshly_downloaded or not all(path.is_file() for path in train_files + test_files):
        with zipfile.ZipFile(filename, 'r') as zipf:
            zipf.extractall(directory)
    # Load data and labels
    if subset == "all" or subset == "train":
        data = np.genfromtxt(train_files[0])
        labels = np.genfromtxt(train_files[1])
    if subset == "all" or subset == "test":
        test_data = np.genfromtxt(test_files[0])
        test_labels = np.genfromtxt(test_files[1])
        if subset == "all":
            # Training samples first, test samples appended afterwards
            data = np.r_[data, test_data]
            labels = np.r_[labels, test_labels]
        else:
            data = test_data
            labels = test_labels
    # Convert labels to int32 format
    labels = labels.astype(np.int32)
    # Convert labels from 1,... to 0,...
    labels = labels - 1
    # Return values
    if return_X_y:
        return data, labels
    return Bunch(dataset_name="HAR", data=data, target=labels)
def load_statlog_shuttle(subset: str = "all", return_X_y: bool = False,
                         downloads_path: str | Path = None) -> Bunch:
    """
    Load the statlog shuttle data set.

    It consists of 58000 samples belonging to one of 7 classes.
    A special property is that about 80% of the data belongs to class 0.
    The data set is composed of 43500 training and 14500 test samples.
    N=58000, d=9, k=7.

    Parameters
    ----------
    subset : str
        can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all')
    return_X_y : bool
        If True, returns (data, target) instead of a Bunch object (default: False)
    downloads_path : str | Path
        path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)

    Returns
    -------
    bunch : Bunch
        A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute.
        Alternatively, if return_X_y is True two arrays will be returned:
        the data numpy array (58000 x 9), the labels numpy array (58000).
        If the compressed training file cannot be decompressed, None (or (None, None) if return_X_y is True)
        is returned instead.

    References
    ----------
    https://archive.ics.uci.edu/ml/datasets/Statlog+(Shuttle)
    """
    subset = subset.lower()
    assert subset in ["all", "train", "test"], \
        "subset must match 'all', 'train' or 'test'. Your input {0}".format(subset)
    directory = _get_download_dir(downloads_path) / "shuttle"
    # Create the target directory up front. Previously it was only created in the training branch, so
    # requesting subset='test' first tried to download into a directory that did not exist yet.
    directory.mkdir(parents=False, exist_ok=True)
    if subset == "all" or subset == "train":
        filename = directory / "shuttle.trn.Z"
        if not filename.is_file():
            _download_file(
                "https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/shuttle/shuttle.trn.Z",
                filename)
            # Unpack z-file
            success = _decompress_z_file(filename, directory)
            if not success:
                # Remove the downloaded archive so that a later call starts from scratch
                filename.unlink()
                return (None, None) if return_X_y else None
        # Load data and labels (last column contains the class)
        dataset = np.genfromtxt(directory / "shuttle.trn")
        data = dataset[:, :-1]
        labels = dataset[:, -1]
    if subset == "all" or subset == "test":
        filename = directory / "shuttle.tst"
        if not filename.is_file():
            _download_file(
                "https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/shuttle/shuttle.tst",
                filename)
        test_dataset = np.genfromtxt(directory / "shuttle.tst")
        test_data = test_dataset[:, :-1]
        test_labels = test_dataset[:, -1]
        if subset == "all":
            # Training samples first, test samples appended afterwards
            data = np.r_[data, test_data]
            labels = np.r_[labels, test_labels]
        else:
            data = test_data
            labels = test_labels
    # Convert labels to int32 format
    labels = labels.astype(np.int32)
    # Convert labels from 1,... to 0,...
    labels -= 1
    # Return values
    if return_X_y:
        return data, labels
    return Bunch(dataset_name="StatlogShuttle", data=data, target=labels)
def load_mice_protein(return_additional_labels: bool = False, return_X_y: bool = False,
                      downloads_path: str | Path = None) -> Bunch:
    """
    Load the Mice Protein Expression data set.

    It consists of 1077 samples belonging to one of 8 classes.
    Each feature represents the expression level of one of 77 proteins. Samples containing 43 or more NaN
    values (3 cases) will be removed. Afterwards, all columns containing NaN values will be removed. This reduces
    the number of features from 77 to 68.
    The classes can be further subdivided by using the return_additional_labels parameter. This gives the additional
    information mouseID, behavior, treatment type and genotype.
    N=1077, d=68, k=8.

    Parameters
    ----------
    return_additional_labels : bool
        return additional labels. If True, the labels array has the columns
        class, mouse id, behavior, treatment and genotype (default: False)
    return_X_y : bool
        If True, returns (data, target) instead of a Bunch object (default: False)
    downloads_path : str | Path
        path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)

    Returns
    -------
    bunch : Bunch
        A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute.
        Alternatively, if return_X_y is True two arrays will be returned:
        the data numpy array (1077 x 68), the labels numpy array (1077)

    References
    ----------
    https://archive.ics.uci.edu/ml/datasets/Mice+Protein+Expression
    """
    filename = _get_download_dir(downloads_path) / "Data_Cortex_Nuclear.xls"
    if not filename.is_file():
        _download_file("https://archive.ics.uci.edu/ml/machine-learning-databases/00342/Data_Cortex_Nuclear.xls",
                       filename)
    # Load first page. The context manager makes sure that the Excel file handle is closed again
    with pd.ExcelFile(filename) as xls:
        sheet = xls.parse(0)
    # Remove special columns
    classes_raw = sheet.pop("class")
    ids_raw = sheet.pop("MouseID")
    behaviors_raw = sheet.pop("Behavior")
    treatments_raw = sheet.pop("Treatment")
    genotypes_raw = sheet.pop("Genotype")
    original_data = sheet.values
    # Keep only rows containing fewer than 43 NaN values (removes 3 cases)
    keep_rows = np.sum(np.isnan(original_data), axis=1) < 43
    data = original_data[keep_rows]
    # Remove columns containing NaN values (removes 9 columns)
    n_of_nans_per_columns = np.sum(np.isnan(data), axis=0)
    data = data[:, n_of_nans_per_columns == 0]
    # Get labels
    labels = LabelEncoder().fit_transform(classes_raw)
    if return_additional_labels:
        # The mouse id is the part of the MouseID entry in front of the first underscore
        ids = [entry.split("_")[0] for entry in ids_raw]
        id_labels = LabelEncoder().fit_transform(ids)
        behaviors_labels = LabelEncoder().fit_transform(behaviors_raw)
        treatment_labels = LabelEncoder().fit_transform(treatments_raw)
        genotype_labels = LabelEncoder().fit_transform(genotypes_raw)
        labels = np.c_[labels, id_labels, behaviors_labels, treatment_labels, genotype_labels]
    # Convert labels to int32 format
    labels = labels.astype(np.int32)
    # Remove the same rows from the labels (3 cases)
    labels = labels[keep_rows]
    # Return values
    if return_X_y:
        return data, labels
    return Bunch(dataset_name="MiceProtein", data=data, target=labels)
def load_user_knowledge(subset: str = "all", return_X_y: bool = False,
                        downloads_path: str | Path = None) -> Bunch:
    """
    Load the user knowledge data set.

    It consists of 403 samples belonging to one of 4 classes.
    The 4 classes are the knowledge levels 'very low', 'low', 'middle' and 'high'.
    The data set is composed of 258 training and 145 test samples.
    N=403, d=5, k=4.

    Parameters
    ----------
    subset : str
        can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all')
    return_X_y : bool
        If True, returns (data, target) instead of a Bunch object (default: False)
    downloads_path : str | Path
        path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)

    Returns
    -------
    bunch : Bunch
        A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute.
        Alternatively, if return_X_y is True two arrays will be returned:
        the data numpy array (403 x 5), the labels numpy array (403)

    References
    ----------
    https://archive.ics.uci.edu/ml/datasets/User+Knowledge+Modeling
    """
    subset = subset.lower()
    assert subset in ["all", "train", "test"], \
        "subset must match 'all', 'train' or 'test'. Your input {0}".format(subset)
    filename = _get_download_dir(downloads_path) / "Data_User_Modeling_Dataset_Hamdi Tolga KAHRAMAN.xls"
    if not filename.is_file():
        _download_file(
            "https://archive.ics.uci.edu/ml/machine-learning-databases/00257/Data_User_Modeling_Dataset_Hamdi%20Tolga%20KAHRAMAN.xls",
            filename)
    # Second page contains the training data, third page the test data.
    # The context manager makes sure that the Excel file handle is closed again
    with pd.ExcelFile(filename) as xls:
        sheet_train = xls.parse(1) if subset in ("all", "train") else None
        sheet_test = xls.parse(2) if subset in ("all", "test") else None
    if sheet_train is not None:
        # Get data and label columns
        labels_raw = sheet_train.pop(" UNS")
        data = sheet_train.values[:, :5]
    if sheet_test is not None:
        # Get data and label columns
        test_data = sheet_test.values[:, :5]
        uns_test = sheet_test.pop(" UNS")
        # Fix label string 'Very Low' to 'very_low' (as in train file)
        uns_test = [label.replace("Very Low", "very_low") for label in uns_test]
        if subset == "all":
            # Training samples first, test samples appended afterwards
            data = np.r_[data, test_data]
            labels_raw = np.r_[labels_raw, uns_test]
        else:
            data = test_data
            labels_raw = uns_test
    # Transform labels
    labels = LabelEncoder().fit_transform(labels_raw)
    # Convert labels to int32 format
    labels = labels.astype(np.int32)
    data = np.array(data, dtype=np.float64)
    # Return values
    if return_X_y:
        return data, labels
    return Bunch(dataset_name="UserKnowledge", data=data, target=labels)
def load_breast_tissue(return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
    """
    Load the breast tissue data set.

    It consists of 106 samples belonging to one of 6 classes.
    N=106, d=9, k=6.

    Parameters
    ----------
    return_X_y : bool
        If True, returns (data, target) instead of a Bunch object (default: False)
    downloads_path : str | Path
        path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)

    Returns
    -------
    bunch : Bunch
        A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute.
        Alternatively, if return_X_y is True two arrays will be returned:
        the data numpy array (106 x 9), the labels numpy array (106)

    References
    ----------
    http://archive.ics.uci.edu/ml/datasets/breast+tissue
    """
    filename = _get_download_dir(downloads_path) / "BreastTissue.xls"
    if not filename.is_file():
        _download_file("http://archive.ics.uci.edu/ml/machine-learning-databases/00192/BreastTissue.xls",
                       filename)
    # Load second page. The context manager makes sure that the Excel file handle is closed again
    with pd.ExcelFile(filename) as xls:
        sheet = xls.parse(1)
    # Get data and label columns. After removing the class column, the first remaining column is skipped as well
    class_column = sheet.pop("Class")
    data = sheet.values[:, 1:]
    # Transform labels
    labels = LabelEncoder().fit_transform(class_column)
    # Return values
    if return_X_y:
        return data, labels
    return Bunch(dataset_name="BreastTissue", data=data, target=labels)
def load_forest_types(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
    """
    Load the forest type mapping data set.

    It consists of 523 samples belonging to one of 4 classes.
    The data set is composed of 198 training and 325 test samples.
    N=523, d=27, k=4.

    Parameters
    ----------
    subset : str
        can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all')
    return_X_y : bool
        If True, returns (data, target) instead of a Bunch object (default: False)
    downloads_path : str | Path
        path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)

    Returns
    -------
    bunch : Bunch
        A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute.
        Alternatively, if return_X_y is True two arrays will be returned:
        the data numpy array (523 x 27), the labels numpy array (523)

    References
    ----------
    https://archive.ics.uci.edu/ml/datasets/Forest+type+mapping
    """
    subset = subset.lower()
    assert subset in ["all", "train", "test"], \
        "subset must match 'all', 'train' or 'test'. Your input {0}".format(subset)
    directory = _get_download_dir(downloads_path) / "ForestTypes"
    filename = directory / "ForestTypes.zip"
    train_csv = directory / "training.csv"
    test_csv = directory / "testing.csv"
    freshly_downloaded = False
    if not filename.is_file():
        directory.mkdir(parents=False, exist_ok=True)
        _download_file("https://archive.ics.uci.edu/ml/machine-learning-databases/00333/ForestTypes.zip",
                       filename)
        freshly_downloaded = True
    # Unpack zipfile. Also done if the archive is already present but extracted files are missing
    # (e.g. because an earlier extraction was interrupted), so that the loader can recover on its own.
    if freshly_downloaded or not (train_csv.is_file() and test_csv.is_file()):
        with zipfile.ZipFile(filename, 'r') as zipf:
            zipf.extractall(directory)
    # Load data and labels
    if subset == "all" or subset == "train":
        df_train = pd.read_csv(train_csv, delimiter=",")
        labels_raw = df_train.pop("class")
        data = df_train.values
    if subset == "all" or subset == "test":
        df_test = pd.read_csv(test_csv, delimiter=",")
        labels_test = df_test.pop("class")
        if subset == "all":
            # Training samples first, test samples appended afterwards
            data = np.r_[data, df_test.values]
            labels_raw = np.r_[labels_raw, labels_test]
        else:
            data = df_test.values
            labels_raw = labels_test
    # Transform labels
    labels = LabelEncoder().fit_transform(labels_raw)
    # Return values
    if return_X_y:
        return data, labels
    return Bunch(dataset_name="ForestTypes", data=data, target=labels)
def load_dermatology(return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
    """
    Load the dermatology data set.

    The raw file contains 366 samples belonging to one of 6 classes. 8 samples contain '?' values and are
    therefore removed.
    N=358, d=34, k=6.

    Parameters
    ----------
    return_X_y : bool
        If True, a (data, target) tuple is returned instead of a Bunch object (default: False)
    downloads_path : str | Path
        path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)

    Returns
    -------
    bunch : Bunch
        A Bunch object holding the data in the 'data' attribute and the labels in the 'target' attribute.
        If return_X_y is True, two arrays are returned instead:
        the data numpy array (358 x 34), the labels numpy array (358)

    References
    ----------
    https://archive.ics.uci.edu/ml/datasets/dermatology
    """
    local_file = _get_download_dir(downloads_path) / "dermatology.data"
    source_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/dermatology/dermatology.data"
    X, y = _load_data_file(local_file, source_url, delimiter=",")
    # Keep only the samples without any NaN entry
    complete_rows = ~np.isnan(X).any(axis=1)
    X = X[complete_rows]
    y = y[complete_rows]
    # The file numbers its classes from 1; shift them so that they start at 0
    y -= 1
    if return_X_y:
        return X, y
    return Bunch(dataset_name="Dermatology", data=X, target=y)
def load_multiple_features(return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
    """
    Load the multiple features data set.
    It contains 2000 samples from 10 classes, where each class is a handwritten numeral (0-9) extracted from a
    collection of Dutch utility maps. The features of six separate files are concatenated.
    N=2000, d=649, k=10.

    Parameters
    ----------
    return_X_y : bool
        If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
    downloads_path : str | Path
        path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)

    Returns
    -------
    bunch : Bunch
        A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute.
        Alternatively, if return_X_y is True two arrays will be returned:
        the data numpy array (2000 x 649), the labels numpy array (2000)

    References
    ----------
    https://archive.ics.uci.edu/ml/datasets/Multiple+Features
    """
    target_dir = _get_download_dir(downloads_path) / "MultipleFeatures"
    target_dir.mkdir(parents=False, exist_ok=True)
    feature_sets = ["mfeat-fac", "mfeat-fou", "mfeat-kar", "mfeat-mor", "mfeat-pix", "mfeat-zer"]
    # Start with an empty (2000 x 0) matrix and append the columns of each feature file
    data = np.zeros((2000, 0))
    for feature_set in feature_sets:
        local_file = target_dir / (feature_set + ".xls")
        if not local_file.is_file():
            _download_file("https://archive.ics.uci.edu/ml/machine-learning-databases/mfeat/" + feature_set, local_file)
        # delimiter=None -> columns are separated by arbitrary whitespace
        features = np.genfromtxt(local_file, delimiter=None)
        data = np.c_[data, features]
    # Samples are ordered by class: 200 x '0', followed by 200 x '1', ...
    labels = np.repeat(range(10), 200)
    # Return values
    if return_X_y:
        return data, labels
    return Bunch(dataset_name="MultipleFeatures", data=data, target=labels)
def load_statlog_australian_credit_approval(return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
    """
    Load the statlog Australian Credit Approval data set.
    It contains 690 samples that each belong to one of 2 classes.
    N=690, d=14, k=2.

    Parameters
    ----------
    return_X_y : bool
        If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
    downloads_path : str | Path
        path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)

    Returns
    -------
    bunch : Bunch
        A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute.
        Alternatively, if return_X_y is True two arrays will be returned:
        the data numpy array (690 x 14), the labels numpy array (690)

    References
    ----------
    https://archive.ics.uci.edu/ml/datasets/statlog+(australian+credit+approval)
    """
    local_file = _get_download_dir(downloads_path) / "australian.dat"
    # delimiter=None is handed to the loader explicitly, as the file is not comma separated
    data, labels = _load_data_file(local_file,
                                   "https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/australian/australian.dat",
                                   delimiter=None)
    # Return values
    if return_X_y:
        return data, labels
    return Bunch(dataset_name="StatlogAustralianCreditApproval", data=data, target=labels)
def load_breast_cancer_wisconsin_original(return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
    """
    Load the original breast cancer Wisconsin data set.
    The raw file contains 699 samples from 2 classes. The 16 samples with '?' entries are dropped.
    N=683, d=9, k=2.

    Parameters
    ----------
    return_X_y : bool
        If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
    downloads_path : str | Path
        path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)

    Returns
    -------
    bunch : Bunch
        A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute.
        Alternatively, if return_X_y is True two arrays will be returned:
        the data numpy array (683 x 9), the labels numpy array (683)

    References
    ----------
    https://archive.ics.uci.edu/ml/datasets/breast+cancer+wisconsin+%28original%29
    """
    local_file = _get_download_dir(downloads_path) / "breast-cancer-wisconsin.data"
    raw_data, raw_labels = _load_data_file(local_file,
                                           "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data",
                                           delimiter=",")
    # Drop the first column, which only holds the unique sample ids
    features = raw_data[:, 1:]
    # Boolean mask that is True for every sample without a nan entry
    complete_rows = ~np.isnan(features).any(axis=1)
    data = features[complete_rows]
    # Raw labels are 2 or 4 -> map them to 0 or 1 and store them as int32
    labels = (raw_labels[complete_rows] / 2 - 1).astype(np.int32)
    # Return values
    if return_X_y:
        return data, labels
    return Bunch(dataset_name="BreastCancerWisconsin", data=data, target=labels)
def load_optdigits(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
    """
    Load the optdigits data set.
    It contains 5620 8x8 grayscale images of the digits 0 to 9. Each pixel counts the marked pixels within a 4x4 block
    of the original 32x32 bitmaps.
    The data set is split into 3823 training and 1797 test samples.
    N=5620, d=64, k=10.

    Parameters
    ----------
    subset : str
        can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all')
    return_X_y : bool
        If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
    downloads_path : str | Path
        path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)

    Returns
    -------
    bunch : Bunch
        A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute.
        Furthermore, the original images are contained in the 'images' attribute.
        Alternatively, if return_X_y is True two arrays will be returned:
        the data numpy array (5620 x 64), the labels numpy array (5620)

    References
    ----------
    http://archive.ics.uci.edu/ml/datasets/optical+recognition+of+handwritten+digits
    """
    subset = subset.lower()
    assert subset in ["all", "train", "test"], "subset must match 'all', 'train' or 'test'. Your input {0}".format(subset)
    if subset in ("all", "train"):
        train_file = _get_download_dir(downloads_path) / "optdigits.tra"
        data, labels = _load_data_file(train_file,
                                       "https://archive.ics.uci.edu/ml/machine-learning-databases/optdigits/optdigits.tra")
    if subset in ("all", "test"):
        test_file = _get_download_dir(downloads_path) / "optdigits.tes"
        test_data, test_labels = _load_data_file(test_file,
                                                 "https://archive.ics.uci.edu/ml/machine-learning-databases/optdigits/optdigits.tes")
        if subset == "test":
            data, labels = test_data, test_labels
        else:
            # 'all': the test samples are appended below the training samples
            data = np.r_[data, test_data]
            labels = np.r_[labels, test_labels]
    # Return values
    if return_X_y:
        return data, labels
    # Each row of 64 values is one 8x8 image
    images = data.reshape((-1, 8, 8))
    return Bunch(dataset_name="Optdigits", data=data, target=labels, images=images, image_format="HW")
def load_semeion(return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
    """
    Load the semeion data set.
    It contains 1593 samples from 10 classes. Each sample is a grayscale 16x16 scan of a handwritten digit, written by
    about 80 different persons. Each pixel was converted to a boolean value using a fixed threshold.
    N=1593, d=256, k=10.

    Parameters
    ----------
    return_X_y : bool
        If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
    downloads_path : str | Path
        path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)

    Returns
    -------
    bunch : Bunch
        A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute.
        Furthermore, the original images are contained in the 'images' attribute.
        Alternatively, if return_X_y is True two arrays will be returned:
        the data numpy array (1593 x 256), the labels numpy array (1593)

    References
    ----------
    https://archive.ics.uci.edu/ml/datasets/semeion+handwritten+digit
    """
    local_file = _get_download_dir(downloads_path) / "semeion.data"
    if not local_file.is_file():
        _download_file("https://archive.ics.uci.edu/ml/machine-learning-databases/semeion/semeion.data", local_file)
    raw = np.genfromtxt(local_file)
    # The trailing ten columns are the one-hot encoded digit, everything in front of them are the pixels
    data = raw[:, :-10]
    one_hot = raw[:, -10:]
    # Labels start as 0, so only the columns of the digits 1 to 9 have to be checked
    labels = np.zeros(data.shape[0], dtype=np.int32)
    for digit in range(1, 10):
        labels[one_hot[:, digit] == 1] = digit
    # Return values
    if return_X_y:
        return data, labels
    # Each row of 256 values is one 16x16 image
    images = data.reshape((-1, 16, 16))
    return Bunch(dataset_name="Semeion", data=data, target=labels, images=images, image_format="HW")
def load_cmu_faces(return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
    """
    Load the CMU Face Images data set.
    It consists of 640 30x32 grayscale images showing 20 persons in different poses (up, straight, left, right) and
    with different expressions (neutral, happy, sad, angry). Additionally, the persons can wear sunglasses or not.
    16 images show glitches which is why the final data set only contains 624 images.
    The samples are ordered by person (see 'names' below) and, within a person, by file name.
    N=624, d=960, k=[20,4,4,2].

    Parameters
    ----------
    return_X_y : bool
        If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
    downloads_path : str | Path
        path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)

    Returns
    -------
    bunch : Bunch
        A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute.
        Furthermore, the original images are contained in the 'images' attribute.
        The 'classes' attribute contains the arrays (names, positions, expressions, eyes) that map the label ids of
        the four label columns to their names.
        Alternatively, if return_X_y is True two arrays will be returned:
        the data numpy array (624 x 960), the labels numpy array (624 x 4)

    References
    ----------
    http://archive.ics.uci.edu/ml/datasets/cmu+face+images
    """
    directory = _get_download_dir(downloads_path) / "cmufaces"
    filename = directory / "faces_4.tar.gz"
    if not filename.is_file():
        directory.mkdir(parents=False, exist_ok=True)
        _download_file("http://archive.ics.uci.edu/ml/machine-learning-databases/faces-mld/faces_4.tar.gz", filename)
        # Unpack archive
        with tarfile.open(filename, "r:gz") as tar:
            tar.extractall(directory)
    names = np.array(
        ["an2i", "at33", "boland", "bpm", "ch4f", "cheyer", "choon", "danieln", "glickman", "karyadi", "kawamura",
         "kk49", "megak", "mitchell", "night", "phoebe", "saavik", "steffi", "sz24", "tammo"])
    positions = np.array(["straight", "left", "right", "up"])
    expressions = np.array(["neutral", "happy", "sad", "angry"])
    eyes = np.array(["open", "sunglasses"])
    data_list = []
    label_list = []
    for name in names:
        path_images = directory / "faces_4" / name
        # iterdir() yields files in arbitrary order -> sort to obtain the same sample order on every system
        for image in sorted(path_images.iterdir()):
            image_str = image.name
            # Only files ending with '_4.pgm' are used
            if not image_str.endswith("_4.pgm"):
                continue
            # get image data
            image_array = _load_image_data(image, None, False)
            # Get labels. File names look like [name]_[position]_[expression]_[eyes]_4.pgm
            name_parts = image_str.split("_")
            user_id = np.argwhere(names == name_parts[0])[0][0]
            position = np.argwhere(positions == name_parts[1])[0][0]
            expression = np.argwhere(expressions == name_parts[2])[0][0]
            eye = np.argwhere(eyes == name_parts[3])[0][0]
            label_data = np.array([user_id, position, expression, eye])
            # Save data and labels
            data_list.append(image_array)
            label_list.append(label_data)
    labels = np.array(label_list, dtype=np.int32)
    data_image = np.array(data_list)
    # Flatten data
    data_flatten = flatten_images(data_image, "HW")
    # Return values
    if return_X_y:
        return data_flatten, labels
    else:
        return Bunch(dataset_name="CMUFace", data=data_flatten, target=labels, images=data_image, image_format="HW",
                     classes=(names, positions, expressions, eyes))
def load_gene_expression_cancer_rna_seq(return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
    """
    Load the Gene Expression Cancer RNA-SEQ data set.
    It consists of 801 samples belonging to one of 5 classes.
    N=801, d=20531, k=5.

    Parameters
    ----------
    return_X_y : bool
        If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
    downloads_path : str | Path
        path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)

    Returns
    -------
    bunch : Bunch
        A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute.
        Alternatively, if return_X_y is True two arrays will be returned:
        the data numpy array (801 x 20531), the labels numpy array (801)

    References
    ----------
    https://archive.ics.uci.edu/dataset/401/gene+expression+cancer+rna+seq
    """
    directory = _get_download_dir(downloads_path) / "GeneExpressionRNASEQ"
    filename = directory / "gene+expression+cancer+rna+seq.zip"
    if not filename.is_file():
        directory.mkdir(parents=False, exist_ok=True)
        _download_file("https://archive.ics.uci.edu/static/public/401/gene+expression+cancer+rna+seq.zip", filename)
        # Unpack zipfile. It contains a tar.gz archive that has to be unpacked as well
        with zipfile.ZipFile(filename, 'r') as zipf:
            zipf.extractall(directory)
        with tarfile.open(directory / "TCGA-PANCAN-HiSeq-801x20531.tar.gz", "r:gz") as tar:
            tar.extractall(directory)
    # Load data and labels. The first row and the first column(s) of the csv files are not part of the actual values
    extracted_dir = directory / "TCGA-PANCAN-HiSeq-801x20531"
    data = np.genfromtxt(extracted_dir / "data.csv", delimiter=",")[1:, 1:]
    labels_raw = np.genfromtxt(extracted_dir / "labels.csv", delimiter=",", dtype=str)[1:, 1]
    # Transform the string labels to integer ids
    LE = LabelEncoder()
    labels = LE.fit_transform(labels_raw)
    # Return values
    if return_X_y:
        return data, labels
    else:
        return Bunch(dataset_name="GeneExpressionCancerRNA-SEQ", data=data, target=labels)
def load_sport_articles(return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
    """
    Load the Sport Articles data set.
    It consists of 1000 samples belonging to one of 2 classes (objective or subjective).
    We only consider features that correspond to specific frequencies and, therefore, ignore the attributes
    totalWordsCount, sentence1st, sentencelast and txtcomplexity.
    N=1000, d=55, k=2.

    Parameters
    ----------
    return_X_y : bool
        If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
    downloads_path : str | Path
        path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)

    Returns
    -------
    bunch : Bunch
        A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute.
        Labels are 0 for objective and 1 for subjective articles.
        Alternatively, if return_X_y is True two arrays will be returned:
        the data numpy array (1000 x 55), the labels numpy array (1000)

    References
    ----------
    https://archive.ics.uci.edu/dataset/450/sports+articles+for+objectivity+analysis
    """
    directory = _get_download_dir(downloads_path) / "SportArticles"
    filename = directory / "sports+articles+for+objectivity+analysis.zip"
    if not filename.is_file():
        directory.mkdir(parents=False, exist_ok=True)
        _download_file("https://archive.ics.uci.edu/static/public/450/sports+articles+for+objectivity+analysis.zip",
                       filename)
        # Unpack zipfile
        with zipfile.ZipFile(filename, 'r') as zipf:
            zipf.extractall(directory)
    # Parse excel file (can not be read by Pandas)
    data = np.zeros((1000, 55), dtype=int)
    labels = np.zeros(1000, dtype=np.int32)
    row = -2  # first row is the header and should be skipped
    column = 0
    with open(directory / "features.xls", "r") as f:
        # Iterate the file line by line instead of loading all lines into memory first
        for line in f:
            if "</Table>" in line:
                # Next table is not relevant for the data
                break
            if "<Row ss" in line:
                # Next row starts
                column = 0
                row += 1
            if row >= 0 and "<Cell>" in line:
                if column == 2:
                    # Third cell of a row contains the label
                    assert "objective" in line or "subjective" in line
                    labels[row] = 0 if "objective" in line else 1
                if 3 < column < 59:
                    # Cells 4 to 58 contain the 55 numerical features that are used
                    data[row, column - 4] = int(line.split('"Number">')[1].split('</Data>')[0])
                column += 1
    # Return values
    if return_X_y:
        return data, labels
    else:
        return Bunch(dataset_name="SportArticles", data=data, target=labels)
def load_wholesale_customers(return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
    """
    Load the Wholesale Customers data set.
    It consists of 440 samples and can be grouped in two different ways:
    Either two classes based on the channel (Horeca or Retail) or three classes based on the region (Lisbon, Oporto or Other region).
    N=440, d=6, k=[2, 3].

    Parameters
    ----------
    return_X_y : bool
        If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
    downloads_path : str | Path
        path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)

    Returns
    -------
    bunch : Bunch
        A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute.
        Alternatively, if return_X_y is True two arrays will be returned:
        the data numpy array (440 x 6), the labels numpy array (440 x 2)

    References
    ----------
    https://archive.ics.uci.edu/dataset/292/wholesale+customers
    """
    directory = _get_download_dir(downloads_path) / "WholeCustomers"
    filename = directory / "wholesale+customers.zip"
    if not filename.is_file():
        directory.mkdir(parents=False, exist_ok=True)
        _download_file("https://archive.ics.uci.edu/static/public/292/wholesale+customers.zip", filename)
        # Unpack zipfile
        with zipfile.ZipFile(filename, 'r') as zipf:
            zipf.extractall(directory)
    # skip_header is the number of lines to skip -> skip the single header line
    wholesale = np.genfromtxt(directory / "Wholesale customers data.csv", delimiter=",", skip_header=1)
    # The first two columns contain the two label sets (starting at 1), the remaining columns are the features
    labels = wholesale[:, :2] - 1
    data = wholesale[:, 2:]
    # Convert labels to int32 format
    labels = labels.astype(np.int32)
    # Return values
    if return_X_y:
        return data, labels
    else:
        return Bunch(dataset_name="WholesaleCustomers", data=data, target=labels)
def load_reuters21578(subset: str = "all", categories: tuple = ("grain", "money-fx", "earn", "acq", "crude"),
                      use_tfidf: bool = True, use_stemming: bool = True, use_stop_words: bool = True,
                      max_df: float | int = 1., min_df: float | int = 1, max_features: int = 2000,
                      min_variance : float = 0., sublinear_tf: bool = False, return_X_y: bool = False,
                      downloads_path: str | Path = None) -> Bunch:
    """
    Load the Reuters21578 data set.
    It consists of 21578 Reuters newswire articles divided into different categories.
    When loading the articles, the title will be included in the text.
    The data is preprocessed by only considering articles with a single category.
    Furthermore, the documents are usually converted into feature vectors using tf-idf.
    Note that two different train-test splits are available: Lewis and cgi. The default is Lewis.
    For the Lewis split, the data set is composed of 5791 training and 2300 test instances (default settings).
    For the cgi split, the data set is composed of 8091 training and 276 test instances.
    The .sgm files are processed in sorted order, so the order of the samples is the same on every system.
    N=8367, d=2000, k=5 using the default settings.

    Parameters
    ----------
    subset : str
        can be 'all', 'test', 'train', 'test-cgi' or 'train-cgi'. 'all' combines test and train data (default: 'all')
    categories : tuple
        specify the categories. Can be None if all categories should be used (default: ("grain", "money-fx", "earn", "acq", "crude"))
    use_tfidf : bool
        If true, tf-idf will be applied as the last step of the pipeline (default: True)
    use_stemming : bool
        If true, the SnowballStemmer from nltk will be used when creating the count matrix (default: True)
    use_stop_words : bool
        If true, the list of English stopwords from sklearn CountVectorizer will be used (default: True)
    max_df : float | int
        Ignore words that have a document frequency strictly higher than max_df.
        If float, the parameter represents a proportion of documents, integer corresponds to absolute counts (see sklearn CountVectorizer) (default: 1.0)
    min_df : float | int
        Ignore words that have a document frequency strictly lower than min_df.
        If float, the parameter represents a proportion of documents, integer corresponds to absolute counts (see sklearn CountVectorizer) (default: 1)
    max_features : int
        If not None, the resulting count matrix will only contain the top max_features ordered by term frequency across the corpus (see sklearn CountVectorizer).
        Note that the number of features can be further reduced by min_variance (default: 2000)
    min_variance : float
        Features with a variance lower than min_variance will be removed (see sklearn VarianceThreshold).
        The default is to keep all features with non-zero variance, i.e. remove only the features that have the same value in all samples (default: 0.)
    sublinear_tf : bool
        Apply sublinear term frequency scaling, i.e. replace tf with 1 + log(tf) (see sklearn TfidfTransformer) (default: False)
    return_X_y : bool
        If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
    downloads_path : str | Path
        path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)

    Returns
    -------
    bunch : Bunch
        A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute.
        The 'classes' attribute contains the category names ordered by their label id, i.e. classes[i] is the name of label i.
        The 'columns' attribute contains the vocabulary returned by the text transformation.
        Alternatively, if return_X_y is True two arrays will be returned:
        the data numpy array (8367 x 2000 - using the default settings), the labels numpy array (8367 - using the default settings)

    References
    ----------
    https://archive.ics.uci.edu/dataset/137/reuters+21578+text+categorization+collection
    """
    subset = subset.lower()
    assert subset in ["all", "train", "test", "test-cgi", "train-cgi"], "subset must match 'all', 'train', 'test', 'train-cgi' or 'test-cgi'. Your input {0}".format(subset)
    # Check if data is already downloaded
    directory = _get_download_dir(downloads_path) / "Reuters21578"
    filename = directory / "reuters+21578+text+categorization+collection.zip"
    if not filename.is_file():
        directory.mkdir(parents=False, exist_ok=True)
        _download_file("https://archive.ics.uci.edu/static/public/137/reuters+21578+text+categorization+collection.zip",
                       filename)
        # Unpack zipfile. It contains a tar.gz archive that has to be unpacked as well
        with zipfile.ZipFile(filename, 'r') as zipf:
            zipf.extractall(directory)
        with tarfile.open(directory / "reuters21578.tar.gz", "r:gz") as tar:
            tar.extractall(directory)
    # Load actual articles into arrays
    all_topics = []
    all_bodies = []
    all_lewis_splits = []
    all_cgi_splits = []
    # iterdir() yields files in arbitrary order -> sort to obtain the same document order on every system
    for file in sorted(directory.iterdir()):
        if file.suffix == ".sgm":
            in_body = False
            with open(file, "rb") as f:
                for line in f:
                    # Needed so that reut2-017.sgm is not crashing due to encoding
                    line = line.decode('utf-8', 'ignore')
                    # New entry starts
                    if line.startswith("<REUTERS"):
                        in_body = False
                        body = ""
                        topics = []
                        lewis_split = line.split("LEWISSPLIT=\"")[1].split("\" CGISPLIT=")[0]
                        cgi_split = line.split("CGISPLIT=\"")[1].split("\" OLDID=")[0]
                        if "CSECS" in cgi_split:
                            # 4 entries have an additional CSECS tag that should be removed
                            cgi_split = cgi_split.split("\" CSECS=")[0]
                        # Id is only used for the messages of the sanity checks below
                        text_id = line.split("NEWID=\"")[1].split("\">")[0]
                    # Extract topics. Each topic is wrapped in <D>...</D>
                    if line.startswith("<TOPICS>"):
                        topics_remaining = line
                        while "<D>" in topics_remaining:
                            topics_remaining = topics_remaining.split("<D>", 1)[1]
                            topics.append(topics_remaining.split("</D>")[0])
                    # Unprocessed texts do not contain a <BODY> tag, so the body starts right after this line
                    if line.startswith("<TEXT TYPE=\"UNPROC\""):
                        in_body = True
                    # Add title to text/body
                    if "<TITLE>" in line:
                        body += line.split("<TITLE>")[1].split("</TITLE>")[0] + ". "
                    # New body starts. All following lines are part of the body
                    if "<BODY>" in line:
                        body += line.split("<BODY>")[1].replace("\n", " ")
                        in_body = True
                    elif in_body and line != " Reuter\n":
                        # Check if body part is ending
                        if "</BODY>" in line or "</TEXT>" in line:
                            in_body = False
                        else:
                            body += line.replace("\n", " ")
                    # Entry ends
                    if line.startswith("</REUTERS>"):
                        assert len(body) > 1, f"body empty in {file}, text id = {text_id}"
                        assert len(lewis_split) > 1, f"lewis split empty in {file}, text id = {text_id}"
                        assert len(cgi_split) > 1, f"cgi split empty in {file}, text id = {text_id}"
                        all_bodies.append(body)
                        all_lewis_splits.append(lewis_split)
                        all_cgi_splits.append(cgi_split)
                        all_topics.append(topics)
    assert len(all_bodies) == 21578, "number of articles is not correct. Should be 21578 but is {0}".format(len(all_bodies))
    # Filter documents to receive only articles with a single relevant category
    # (iterate backwards so that deleting entries does not shift the indices that still have to be visited)
    for i in range(len(all_topics) - 1, -1, -1):
        hits = 0
        new_topic = None
        for t in all_topics[i]:
            if categories is None or t in categories:
                hits += 1
                new_topic = t
        if hits != 1:
            del all_bodies[i]
            del all_lewis_splits[i]
            del all_cgi_splits[i]
            del all_topics[i]
        else:
            all_topics[i] = new_topic
    # Transform raw data
    data, vocabulary = _transform_text_data(all_bodies, use_tfidf, use_stemming, use_stop_words, max_df, min_df,
                                            max_features, min_variance, sublinear_tf)
    # Get labels
    LE = LabelEncoder()
    labels = LE.fit_transform(all_topics)
    # Select subset
    if subset != "all":
        if subset == "train":
            relevant = np.array(all_lewis_splits) == "TRAIN"
        elif subset == "test":
            relevant = np.array(all_lewis_splits) == "TEST"
        elif subset == "train-cgi":
            relevant = np.array(all_cgi_splits) == "TRAINING-SET"
        elif subset == "test-cgi":
            relevant = np.array(all_cgi_splits) == "PUBLISHED-TESTSET"
        data = data[relevant]
        labels = labels[relevant]
    # Return values
    if return_X_y:
        return data, labels
    else:
        # LabelEncoder sorts the category names, so the order of 'categories' does not match the label ids.
        # Use the encoder's classes instead (this also provides the names if categories is None)
        return Bunch(dataset_name="Reuters21578", data=data, target=labels, classes=LE.classes_,
                     columns=vocabulary)