try:
from PIL import Image
except:
print(
"[WARNING] Could not import PIL in clustpy.data.real_world_data. Please install PIL by 'pip install Pillow' if necessary")
from clustpy.data._utils import _download_file, _get_download_dir, _decompress_z_file, _load_data_file, flatten_images
import os
import numpy as np
import zipfile
import tarfile
from sklearn.preprocessing import LabelEncoder
import pandas as pd
from sklearn.datasets._base import Bunch
[docs]def load_banknotes(return_X_y: bool = False, downloads_path: str = None) -> Bunch:
"""
Load the banknote authentication data set. It consists of 1372 genuine and forged banknote samples.
N=1372, d=4, k=2.
Parameters
----------
return_X_y : bool
If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
downloads_path : str
path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
Returns
-------
bunch : Bunch
A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute.
Alternatively, if return_X_y is True two arrays will be returned:
the data numpy array (1372 x 4), the labels numpy array (1372)
References
-------
https://archive.ics.uci.edu/ml/datasets/banknote+authentication
"""
filename = _get_download_dir(downloads_path) + "/data_banknote_authentication.txt"
data, labels = _load_data_file(filename,
"https://archive.ics.uci.edu/ml/machine-learning-databases/00267/data_banknote_authentication.txt")
# Return values
if return_X_y:
return data, labels
else:
return Bunch(dataset_name="Banknotes", data=data, target=labels)
[docs]def load_spambase(return_X_y: bool = False, downloads_path: str = None) -> Bunch:
"""
Load the spambase data set. It consists of 4601 spam and non-spam mails.
N=4601, d=57, k=2.
Parameters
----------
return_X_y : bool
If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
downloads_path : str
path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
Returns
-------
bunch : Bunch
A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute.
Alternatively, if return_X_y is True two arrays will be returned:
the data numpy array (4601 x 57), the labels numpy array (4601)
References
-------
https://archive.ics.uci.edu/ml/datasets/spambase
"""
filename = _get_download_dir(downloads_path) + "/spambase.data"
data, labels = _load_data_file(filename,
"https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data")
# Return values
if return_X_y:
return data, labels
else:
return Bunch(dataset_name="Spambase", data=data, target=labels)
[docs]def load_seeds(return_X_y: bool = False, downloads_path: str = None) -> Bunch:
"""
Load the seeds data set. It consists of 210 samples belonging to one of three varieties of wheat.
N=210, d=7, k=3.
Parameters
----------
return_X_y : bool
If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
downloads_path : str
path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
Returns
-------
bunch : Bunch
A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute.
Alternatively, if return_X_y is True two arrays will be returned:
the data numpy array (210 x 7), the labels numpy array (210)
References
-------
https://archive.ics.uci.edu/ml/datasets/seeds
"""
filename = _get_download_dir(downloads_path) + "/seeds_dataset.txt"
data, labels = _load_data_file(filename,
"https://archive.ics.uci.edu/ml/machine-learning-databases/00236/seeds_dataset.txt",
delimiter=None)
# Convert labels from 1,... to 0,...
labels -= 1
# Return values
if return_X_y:
return data, labels
else:
return Bunch(dataset_name="Seeds", data=data, target=labels)
[docs]def load_skin(return_X_y: bool = False, downloads_path: str = None) -> Bunch:
"""
Load the Skin Segmentation data set. It consists of 245057 skin- and non-skin samples with their B, G, R color
information.
N=245057, d=3, k=2.
Parameters
----------
return_X_y : bool
If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
downloads_path : str
path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
Returns
-------
bunch : Bunch
A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute.
Alternatively, if return_X_y is True two arrays will be returned:
the data numpy array (245057 x 3), the labels numpy array (245057)
References
-------
https://archive.ics.uci.edu/ml/datasets/skin+segmentation
"""
filename = _get_download_dir(downloads_path) + "/Skin_NonSkin.txt"
data, labels = _load_data_file(filename,
"https://archive.ics.uci.edu/ml/machine-learning-databases/00229/Skin_NonSkin.txt",
delimiter=None)
# Convert labels from 1,... to 0,...
labels -= 1
# Return values
if return_X_y:
return data, labels
else:
return Bunch(dataset_name="SkinSegmentation", data=data, target=labels)
[docs]def load_soybean_small(return_X_y: bool = False, downloads_path: str = None) -> Bunch:
"""
Load the small version of the soybean data set. It is a small subset of the original soybean data set.
It consists of 47 samples belonging to one of 4 classes.
N=47, d=35, k=4.
Parameters
----------
return_X_y : bool
If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
downloads_path : str
path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
Returns
-------
bunch : Bunch
A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute.
Alternatively, if return_X_y is True two arrays will be returned:
the data numpy array (47 x 35), the labels numpy array (47)
References
-------
https://archive.ics.uci.edu/ml/datasets/soybean+(small)
"""
filename = _get_download_dir(downloads_path) + "/soybean-small.data"
if not os.path.isfile(filename):
_download_file(
"https://archive.ics.uci.edu/ml/machine-learning-databases/soybean/soybean-small.data",
filename)
# Load data and labels
df = pd.read_csv(filename, delimiter=",", header=None)
labels_raw = df.iloc[:, -1]
data = df.iloc[:, :-1].values
LE = LabelEncoder()
labels = LE.fit_transform(labels_raw)
# Return values
if return_X_y:
return data, labels
else:
return Bunch(dataset_name="SoybeanSmall", data=data, target=labels)
[docs]def load_soybean_large(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch:
"""
Load the large version of the soybean data set. It consists of 562 samples belonging to one of 15 classes.
Originally, the data set would have samples and 19 classes but some samples have attributes showing '?' values. Those
will be ignored.
The data set is composed of 266 training and 296 test samples.
N=562, d=35, k=15.
Parameters
----------
subset : str
can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all')
return_X_y : bool
If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
downloads_path : str
path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
Returns
-------
bunch : Bunch
A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute.
Alternatively, if return_X_y is True two arrays will be returned:
the data numpy array (562 x 35), the labels numpy array (562)
References
-------
https://archive.ics.uci.edu/ml/datasets/soybean+(Large)
"""
subset = subset.lower()
assert subset in ["all", "train",
"test"], "subset must match 'all', 'train' or 'test'. Your input {0}".format(subset)
if subset == "all" or subset == "train":
filename = _get_download_dir(downloads_path) + "/soybean-large.data"
if not os.path.isfile(filename):
_download_file(
"https://archive.ics.uci.edu/ml/machine-learning-databases/soybean/soybean-large.data",
filename)
# Load data and labels
df_train = pd.read_csv(filename, delimiter=",", header=None)
df_train = df_train[(df_train != '?').all(axis=1)]
labels_raw = df_train.pop(0)
data = df_train.values
if subset == "all" or subset == "test":
filename = _get_download_dir(downloads_path) + "/soybean-large.test"
if not os.path.isfile(filename):
_download_file(
"https://archive.ics.uci.edu/ml/machine-learning-databases/soybean/soybean-large.test",
filename)
df_test = pd.read_csv(filename, delimiter=",", header=None)
df_test = df_test[(df_test != '?').all(axis=1)]
labels_test = df_test.pop(0)
if subset == "all":
data = np.r_[data, df_test.values]
labels_raw = np.r_[labels_raw, labels_test]
else:
data = df_test.values
labels_raw = labels_test
# Transform data to numerical array
data = np.array(data, dtype=int)
LE = LabelEncoder()
labels = LE.fit_transform(labels_raw)
# Return values
if return_X_y:
return data, labels
else:
return Bunch(dataset_name="SoybeanLarge", data=data, target=labels)
[docs]def load_pendigits(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch:
"""
Load the pendigits data set. It consists of 10992 vectors of length 16, representing 8 coordinates. The coordinates
were taken from the task of writing digits (0 to 9) on a tablet.
The data set is composed of 7494 training and 3498 test samples.
N=10992, d=16, k=10.
Parameters
----------
subset : str
can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all')
return_X_y : bool
If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
downloads_path : str
path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
Returns
-------
bunch : Bunch
A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute.
Alternatively, if return_X_y is True two arrays will be returned:
the data numpy array (10992 x 16), the labels numpy array (10992)
References
-------
http://archive.ics.uci.edu/ml/datasets/pen-based+recognition+of+handwritten+digits
"""
subset = subset.lower()
assert subset in ["all", "train",
"test"], "subset must match 'all', 'train' or 'test'. Your input {0}".format(subset)
if subset == "all" or subset == "train":
filename = _get_download_dir(downloads_path) + "/pendigits.tra"
data, labels = _load_data_file(filename,
"https://archive.ics.uci.edu/ml/machine-learning-databases/pendigits/pendigits.tra")
if subset == "all" or subset == "test":
filename = _get_download_dir(downloads_path) + "/pendigits.tes"
test_data, test_labels = _load_data_file(filename,
"https://archive.ics.uci.edu/ml/machine-learning-databases/pendigits/pendigits.tes")
if subset == "all":
data = np.r_[data, test_data]
labels = np.r_[labels, test_labels]
else:
data = test_data
labels = test_labels
# Return values
if return_X_y:
return data, labels
else:
return Bunch(dataset_name="Pendigits", data=data, target=labels)
[docs]def load_ecoli(ignore_small_clusters: bool = False, return_X_y: bool = False, downloads_path: str = None) -> Bunch:
"""
Load the ecoli data set. It consists of 336 samples belonging to one of 8 classes.
N=336, d=7, k=8.
Parameters
----------
ignore_small_clusters : bool
specify if the three small clusters with size 2, 2 and 5 should be ignored (default: False)
return_X_y : bool
If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
downloads_path : str
path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
Returns
-------
bunch : Bunch
A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute.
Alternatively, if return_X_y is True two arrays will be returned:
the data numpy array (336 x 7), the labels numpy array (336)
References
-------
https://archive.ics.uci.edu/ml/datasets/ecoli
"""
filename = _get_download_dir(downloads_path) + "/ecoli.data"
if not os.path.isfile(filename):
_download_file(
"https://archive.ics.uci.edu/ml/machine-learning-databases/ecoli/ecoli.data",
filename)
data = np.zeros((336, 7))
labels_raw = []
with open(filename, "r") as f:
for i, line in enumerate(f.readlines()):
splited = line.split()
data[i] = splited[1:-1]
labels_raw.append(splited[-1])
if ignore_small_clusters:
# Optional: Remove the three small clusters consisting of only 2, 2 and 5 samples
keep_labels = [l not in ["imL", "imS", "omL"] for l in labels_raw]
data = data[keep_labels]
labels_raw = [l for i, l in enumerate(labels_raw) if keep_labels[i]]
LE = LabelEncoder()
labels = LE.fit_transform(labels_raw)
# Convert labels to int32 format
labels = labels.astype(np.int32)
# Return values
if return_X_y:
return data, labels
else:
return Bunch(dataset_name="Ecoli", data=data, target=labels)
[docs]def load_htru2(return_X_y: bool = False, downloads_path: str = None) -> Bunch:
"""
Load the HTRU2 data set. It consists of 17898 samples belonging to the pulsar or non-pulsar class.
A special property is that more than 90% of the data belongs to class 0.
N=17898, d=8, k=2.
Parameters
----------
return_X_y : bool
If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
downloads_path : str
path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
Returns
-------
bunch : Bunch
A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute.
Alternatively, if return_X_y is True two arrays will be returned:
the data numpy array (17898 x 8), the labels numpy array (17898)
References
-------
https://archive.ics.uci.edu/ml/datasets/HTRU2
"""
directory = _get_download_dir(downloads_path) + "/htru2/"
filename = directory + "HTRU2.zip"
if not os.path.isfile(filename):
if not os.path.isdir(directory):
os.mkdir(directory)
_download_file("https://archive.ics.uci.edu/ml/machine-learning-databases/00372/HTRU2.zip",
filename)
# Unpack zipfile
with zipfile.ZipFile(filename, 'r') as zipf:
zipf.extractall(directory)
# Load data and labels
dataset = np.genfromtxt(directory + "HTRU_2.csv", delimiter=",")
data = dataset[:, :-1]
labels = dataset[:, -1]
# Convert labels to int32 format
labels = labels.astype(np.int32)
# Return values
if return_X_y:
return data, labels
else:
return Bunch(dataset_name="HTRU2", data=data, target=labels)
[docs]def load_letterrecognition(return_X_y: bool = False, downloads_path: str = None) -> Bunch:
"""
Load the Letter Recognition data set. It consists of 20000 samples where each sample represents one of the 26 capital
letters in the English alphabet. All samples are composed of 16 numerical stimuli describing the respective letter.
N=20000, d=16, k=26.
Parameters
----------
return_X_y : bool
If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
downloads_path : str
path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
Returns
-------
bunch : Bunch
A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute.
Alternatively, if return_X_y is True two arrays will be returned:
the data numpy array (20000 x 16), the labels numpy array (20000)
References
-------
https://archive.ics.uci.edu/ml/datasets/letter+recognition
"""
filename = _get_download_dir(downloads_path) + "/letter-recognition.data"
if not os.path.isfile(filename):
_download_file(
"https://archive.ics.uci.edu/ml/machine-learning-databases/letter-recognition/letter-recognition.data",
filename)
# Transform letters to integers
letter_mappings = {"A": "0", "B": "1", "C": "2", "D": "3", "E": "4", "F": "5", "G": "6", "H": "7", "I": "8",
"J": "9", "K": "10", "L": "11", "M": "12", "N": "13", "O": "14", "P": "15", "Q": "16",
"R": "17", "S": "18", "T": "19", "U": "20", "V": "21", "W": "22", "X": "23", "Y": "24",
"Z": "25"}
with open(filename, "r") as f:
file_text = f.read()
file_text = file_text.replace("\n", ",")
for k in letter_mappings.keys():
file_text = file_text.replace(k, letter_mappings[k])
# Create numpy array
datafile = np.fromstring(file_text, sep=",").reshape(-1, 17)
data = datafile[:, 1:]
labels = datafile[:, 0]
# Convert labels to int32 format
labels = labels.astype(np.int32)
# Return values
if return_X_y:
return data, labels
else:
return Bunch(dataset_name="Letterrecognition", data=data, target=labels)
[docs]def load_har(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch:
"""
Load the Human Activity Recognition data set. It consists of 10299 samples each representing sensor data of a person
performing an activity. The six activities are walking, walking_upstairs, walking_downstairs, sitting, standing and
laying.
The data set is composed of 7352 training and 2947 test samples.
N=10992, d=561, k=6.
Parameters
----------
subset : str
can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all')
return_X_y : bool
If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
downloads_path : str
path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
Returns
-------
bunch : Bunch
A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute.
Alternatively, if return_X_y is True two arrays will be returned:
the data numpy array (10992 x 561), the labels numpy array (10992)
References
-------
https://archive.ics.uci.edu/ml/datasets/human+activity+recognition+using+smartphones
"""
subset = subset.lower()
assert subset in ["all", "train",
"test"], "subset must match 'all', 'train' or 'test'. Your input {0}".format(subset)
directory = _get_download_dir(downloads_path) + "/har/"
filename = directory + "UCI HAR Dataset.zip"
if not os.path.isfile(filename):
if not os.path.isdir(directory):
os.mkdir(directory)
_download_file("https://archive.ics.uci.edu/ml/machine-learning-databases/00240/UCI%20HAR%20Dataset.zip",
filename)
# Unpack zipfile
with zipfile.ZipFile(filename, 'r') as zipf:
zipf.extractall(directory)
# Load data and labels
if subset == "all" or subset == "train":
data = np.genfromtxt(directory + "UCI HAR Dataset/train/X_train.txt")
labels = np.genfromtxt(directory + "UCI HAR Dataset/train/y_train.txt")
if subset == "all" or subset == "test":
test_data = np.genfromtxt(directory + "UCI HAR Dataset/test/X_test.txt")
test_labels = np.genfromtxt(directory + "UCI HAR Dataset/test/y_test.txt")
if subset == "all":
data = np.r_[data, test_data]
labels = np.r_[labels, test_labels]
else:
data = test_data
labels = test_labels
# Convert labels to int32 format
labels = labels.astype(np.int32)
# Convert labels from 1,... to 0,...
labels = labels - 1
# Return values
if return_X_y:
return data, labels
else:
return Bunch(dataset_name="HAR", data=data, target=labels)
[docs]def load_statlog_shuttle(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch:
"""
Load the statlog shuttle data set. It consists of 58000 samples belonging to one of 7 classes. A special property is
that about 80% of the data belongs to class 0.
The data set is composed of 43500 training and 14500 test samples.
N=58000, d=9, k=7.
Parameters
----------
subset : str
can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all')
return_X_y : bool
If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
downloads_path : str
path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
Returns
-------
bunch : Bunch
A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute.
Alternatively, if return_X_y is True two arrays will be returned:
the data numpy array (58000 x 9), the labels numpy array (58000)
References
-------
https://archive.ics.uci.edu/ml/datasets/Statlog+(Shuttle)
"""
subset = subset.lower()
assert subset in ["all", "train",
"test"], "subset must match 'all', 'train' or 'test'. Your input {0}".format(subset)
directory = _get_download_dir(downloads_path) + "/shuttle/"
if subset == "all" or subset == "train":
filename = directory + "shuttle.trn.Z"
if not os.path.isfile(filename):
if not os.path.isdir(directory):
os.mkdir(directory)
_download_file("https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/shuttle/shuttle.trn.Z",
filename)
# Unpack z-file
success = _decompress_z_file(filename, directory)
if not success:
os.remove(filename)
return (None, None) if return_X_y else None
# Load data and labels
dataset = np.genfromtxt(directory + "shuttle.trn")
data = dataset[:, :-1]
labels = dataset[:, -1]
if subset == "all" or subset == "test":
filename = directory + "shuttle.tst"
if not os.path.isfile(filename):
_download_file(
"https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/shuttle/shuttle.tst",
filename)
test_dataset = np.genfromtxt(directory + "shuttle.tst")
test_data = test_dataset[:, :-1]
test_labels = test_dataset[:, -1]
if subset == "all":
data = np.r_[data, test_data]
labels = np.r_[labels, test_labels]
else:
data = test_data
labels = test_labels
# Convert labels to int32 format
labels = labels.astype(np.int32)
# Convert labels from 1,... to 0,...
labels -= 1
# Return values
if return_X_y:
return data, labels
else:
return Bunch(dataset_name="StatlogShuttle", data=data, target=labels)
[docs]def load_mice_protein(return_additional_labels: bool = False, return_X_y: bool = False,
downloads_path: str = None) -> Bunch:
"""
Load the Mice Protein Expression data set. It consists of 1077 samples belonging to one of 8 classes.
Each feature represents the expression level of one of 77 proteins.
Samples containing more than 43 NaN values (3 cases) will be removed. Afterwards, all columns containing NaN values
will be removed. This reduces the number of features from 77 to 68.
The classes can be further subdivided by using the return_additional_labels parameter. This gives the additional
information mouseID, behavior, treatment type and genotype.
N=1077, d=68, k=8.
Parameters
----------
return_additional_labels : bool
return additional labels (default: False)
return_X_y : bool
If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
downloads_path : str
path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
Returns
-------
bunch : Bunch
A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute.
Alternatively, if return_X_y is True two arrays will be returned:
the data numpy array (1077 x 68), the labels numpy array (1077)
References
-------
https://archive.ics.uci.edu/ml/datasets/Mice+Protein+Expression
"""
filename = _get_download_dir(downloads_path) + "/Data_Cortex_Nuclear.xls"
if not os.path.isfile(filename):
_download_file("https://archive.ics.uci.edu/ml/machine-learning-databases/00342/Data_Cortex_Nuclear.xls",
filename)
xls = pd.ExcelFile(filename)
# Load first page
sheet = xls.parse(0)
# Remove special columns
classes_raw = sheet.pop("class")
ids_raw = sheet.pop("MouseID")
bahaviors_raw = sheet.pop("Behavior")
treatments_raw = sheet.pop("Treatment")
genotypes_raw = sheet.pop("Genotype")
original_data = sheet.values
# Remove rows containing 43 NaN values (3 cases)
n_of_nans_per_row = np.sum(np.isnan(original_data), axis=1)
data = original_data[n_of_nans_per_row < 43]
# Remove columns containing NaN values (removes 9 columns)
n_of_nans_per_columns = np.sum(np.isnan(data), axis=0)
data = data[:, n_of_nans_per_columns == 0]
# Get labels
LE = LabelEncoder()
labels = LE.fit_transform(classes_raw)
if return_additional_labels:
ids = [entry.split("_")[0] for entry in ids_raw]
LE = LabelEncoder()
id_labels = LE.fit_transform(ids)
LE = LabelEncoder()
bahaviors_labels = LE.fit_transform(bahaviors_raw)
LE = LabelEncoder()
treatment_labels = LE.fit_transform(treatments_raw)
LE = LabelEncoder()
genotype_labels = LE.fit_transform(genotypes_raw)
labels = np.c_[labels, id_labels, bahaviors_labels, treatment_labels, genotype_labels]
# Convert labels to int32 format
labels = labels.astype(np.int32)
# Remove rows also from labels (3 cases)
labels = labels[n_of_nans_per_row < 43]
# Return values
if return_X_y:
return data, labels
else:
return Bunch(dataset_name="MiceProtein", data=data, target=labels)
[docs]def load_user_knowledge(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch:
"""
Load the user knowledge data set. It consists of 403 samples belonging to one of 4 classes.
The 4 classes are the knowledge levels 'very low', 'low', 'middle' and 'high'.
The data set is composed of 258 training and 145 test samples.
N=403, d=5, k=4.
Parameters
----------
subset : str
can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all')
return_X_y : bool
If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
downloads_path : str
path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
Returns
-------
bunch : Bunch
A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute.
Alternatively, if return_X_y is True two arrays will be returned:
the data numpy array (403 x 5), the labels numpy array (403)
References
-------
https://archive.ics.uci.edu/ml/datasets/User+Knowledge+Modeling
"""
subset = subset.lower()
assert subset in ["all", "train",
"test"], "subset must match 'all', 'train' or 'test'. Your input {0}".format(subset)
filename = _get_download_dir(downloads_path) + "/Data_User_Modeling_Dataset_Hamdi Tolga KAHRAMAN.xls"
if not os.path.isfile(filename):
_download_file(
"https://archive.ics.uci.edu/ml/machine-learning-databases/00257/Data_User_Modeling_Dataset_Hamdi%20Tolga%20KAHRAMAN.xls",
filename)
xls = pd.ExcelFile(filename)
if subset == "all" or subset == "train":
# Load second page
sheet_train = xls.parse(1)
# Get data and label columns
labels_raw = sheet_train.pop(" UNS")
data = sheet_train.values[:, :5]
if subset == "all" or subset == "test":
# Load third page
sheet_test = xls.parse(2)
# Get data and label columns
test_data = sheet_test.values[:, :5]
uns_test = sheet_test.pop(" UNS")
# Fix label string 'Very Low' to 'very_low' (as in train file)
uns_test = [l.replace("Very Low", "very_low") for l in uns_test]
if subset == "all":
data = np.r_[data, test_data]
labels_raw = np.r_[labels_raw, uns_test]
else:
data = test_data
labels_raw = uns_test
# Transform labels
LE = LabelEncoder()
labels = LE.fit_transform(labels_raw)
# Convert labels to int32 format
labels = labels.astype(np.int32)
data = np.array(data, dtype=np.float64)
# Return values
if return_X_y:
return data, labels
else:
return Bunch(dataset_name="UserKnowledge", data=data, target=labels)
[docs]def load_breast_tissue(return_X_y: bool = False, downloads_path: str = None) -> Bunch:
"""
Load the breast tissue data set. It consists of 106 samples belonging to one of 6 classes.
N=106, d=9, k=6.
Parameters
----------
return_X_y : bool
If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
downloads_path : str
path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
Returns
-------
bunch : Bunch
A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute.
Alternatively, if return_X_y is True two arrays will be returned:
the data numpy array (106 x 9), the labels numpy array (106)
References
-------
http://archive.ics.uci.edu/ml/datasets/breast+tissue
"""
filename = _get_download_dir(downloads_path) + "/BreastTissue.xls"
if not os.path.isfile(filename):
_download_file("http://archive.ics.uci.edu/ml/machine-learning-databases/00192/BreastTissue.xls",
filename)
xls = pd.ExcelFile(filename)
# Load second page
sheet = xls.parse(1)
# Get data and label columns
class_column = sheet.pop("Class")
data = sheet.values[:, 1:]
# Transform labels
LE = LabelEncoder()
labels = LE.fit_transform(class_column)
# Return values
if return_X_y:
return data, labels
else:
return Bunch(dataset_name="BreastTissue", data=data, target=labels)
[docs]def load_forest_types(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch:
"""
Load the forest type mapping data set. It consists of 523 samples belonging to one of 4 classes.
The data set is composed of 198 training and 325 test samples.
N=523, d=27, k=4.
Parameters
----------
subset : str
can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all')
return_X_y : bool
If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
downloads_path : str
path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
Returns
-------
bunch : Bunch
A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute.
Alternatively, if return_X_y is True two arrays will be returned:
the data numpy array (523 x 27), the labels numpy array (523)
References
-------
https://archive.ics.uci.edu/ml/datasets/Forest+type+mapping
"""
subset = subset.lower()
assert subset in ["all", "train",
"test"], "subset must match 'all', 'train' or 'test'. Your input {0}".format(subset)
directory = _get_download_dir(downloads_path) + "/ForestTypes/"
filename = directory + "ForestTypes.zip"
if not os.path.isfile(filename):
if not os.path.isdir(directory):
os.mkdir(directory)
_download_file("https://archive.ics.uci.edu/ml/machine-learning-databases/00333/ForestTypes.zip",
filename)
# Unpack zipfile
with zipfile.ZipFile(filename, 'r') as zipf:
zipf.extractall(directory)
# Load data and labels
if subset == "all" or subset == "train":
df_train = pd.read_csv(directory + "/training.csv", delimiter=",")
labels_raw = df_train.pop("class")
data = df_train.values
if subset == "all" or subset == "test":
df_test = pd.read_csv(directory + "/testing.csv", delimiter=",")
labels_test = df_test.pop("class")
if subset == "all":
data = np.r_[data, df_test.values]
labels_raw = np.r_[labels_raw, labels_test]
else:
data = df_test.values
labels_raw = labels_test
# Transform labels
LE = LabelEncoder()
labels = LE.fit_transform(labels_raw)
# Return values
if return_X_y:
return data, labels
else:
return Bunch(dataset_name="ForestTypes", data=data, target=labels)
[docs]def load_dermatology(return_X_y: bool = False, downloads_path: str = None) -> Bunch:
"""
Load the dermatology data set. It consists of 366 samples belonging to one of 6 classes.
8 samples contain '?' values and are therefore removed.
N=358, d=34, k=6.
Parameters
----------
return_X_y : bool
If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
downloads_path : str
path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
Returns
-------
bunch : Bunch
A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute.
Alternatively, if return_X_y is True two arrays will be returned:
the data numpy array (358 x 34), the labels numpy array (358)
References
-------
https://archive.ics.uci.edu/ml/datasets/dermatology
"""
filename = _get_download_dir(downloads_path) + "/dermatology.data"
data, labels = _load_data_file(filename,
"https://archive.ics.uci.edu/ml/machine-learning-databases/dermatology/dermatology.data",
delimiter=",")
# Remove rows with nan
rows_with_nan = ~np.isnan(data).any(axis=1)
data = data[rows_with_nan]
labels = labels[rows_with_nan]
# Convert labels from 1,... to 0,...
labels -= 1
# Return values
if return_X_y:
return data, labels
else:
return Bunch(dataset_name="Dermatology", data=data, target=labels)
[docs]def load_multiple_features(return_X_y: bool = False, downloads_path: str = None) -> Bunch:
"""
Load the multiple features data set. It consists of 2000 samples belonging to one of 10 classes.
Each class corresponds to handwritten numerals (0-9) extracted from a collection of Dutch utility maps.
N=2000, d=649, k=10.
Parameters
----------
return_X_y : bool
If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
downloads_path : str
path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
Returns
-------
bunch : Bunch
A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute.
Alternatively, if return_X_y is True two arrays will be returned:
the data numpy array (2000 x 649), the labels numpy array (2000)
References
-------
https://archive.ics.uci.edu/ml/datasets/Multiple+Features
"""
directory = _get_download_dir(downloads_path) + "/MultipleFeatures/"
if not os.path.isdir(directory):
os.mkdir(directory)
data = np.zeros((2000, 0))
# Dataset consists of multiple .xls files
for file in ["mfeat-fac", "mfeat-fou", "mfeat-kar", "mfeat-mor", "mfeat-pix", "mfeat-zer"]:
filename = directory + file + ".xls"
if not os.path.isfile(filename):
_download_file("https://archive.ics.uci.edu/ml/machine-learning-databases/mfeat/" + file,
filename)
data_tmp = np.genfromtxt(filename, delimiter=None)
data = np.c_[data, data_tmp]
# First 200 entries correspond to '0', next 200 to '1' and so on
labels = np.repeat(range(10), 200)
# Return values
if return_X_y:
return data, labels
else:
return Bunch(dataset_name="MultipleFeatures", data=data, target=labels)
[docs]def load_statlog_australian_credit_approval(return_X_y: bool = False, downloads_path: str = None) -> Bunch:
"""
Load the statlog Australian Credit Approval data set. It consists of 690 samples belonging to one of 2 classes.
N=690, d=14, k=2.
Parameters
----------
return_X_y : bool
If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
downloads_path : str
path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
Returns
-------
bunch : Bunch
A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute.
Alternatively, if return_X_y is True two arrays will be returned:
the data numpy array (690 x 14), the labels numpy array (690)
References
-------
https://archive.ics.uci.edu/ml/datasets/statlog+(australian+credit+approval)
"""
filename = _get_download_dir(downloads_path) + "/australian.dat"
data, labels = _load_data_file(filename,
"https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/australian/australian.dat",
delimiter=None)
# Return values
if return_X_y:
return data, labels
else:
return Bunch(dataset_name="StatlogAustralianCreditApproval", data=data, target=labels)
[docs]def load_breast_cancer_wisconsin_original(return_X_y: bool = False, downloads_path: str = None) -> Bunch:
"""
Load the original breast cancer Wisconsin data set. It consists of 699 samples belonging to one of 2 classes.
16 samples contain '?' values and will be removed.
N=683, d=9, k=2.
Parameters
----------
return_X_y : bool
If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
downloads_path : str
path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
Returns
-------
bunch : Bunch
A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute.
Alternatively, if return_X_y is True two arrays will be returned:
the data numpy array (683 x 9), the labels numpy array (683)
References
-------
https://archive.ics.uci.edu/ml/datasets/breast+cancer+wisconsin+%28original%29
"""
filename = _get_download_dir(downloads_path) + "/breast-cancer-wisconsin.data"
data, labels = _load_data_file(filename,
"https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data",
delimiter=",")
# First column contains unique ids
data = data[:, 1:]
# Remove rows with nan
rows_with_nan = ~np.isnan(data).any(axis=1)
data = data[rows_with_nan]
labels = labels[rows_with_nan]
# labels are 2 or 4. Convert to 0 or 1
labels = labels / 2 - 1
# Convert labels to int32 format
labels = labels.astype(np.int32)
# Return values
if return_X_y:
return data, labels
else:
return Bunch(dataset_name="BreastCancerWisconsin", data=data, target=labels)
[docs]def load_optdigits(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch:
"""
Load the optdigits data set. It consists of 5620 8x8 grayscale images, each representing a digit (0 to 9).
Each pixel depicts the number of marked pixel within a 4x4 block of the original 32x32 bitmaps.
The data set is composed of 3823 training and 1797 test samples.
N=5620, d=64, k=10.
Parameters
----------
subset : str
can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all')
return_X_y : bool
If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
downloads_path : str
path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
Returns
-------
bunch : Bunch
A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute.
Furthermore, the original images are contained in the 'images' attribute.
Alternatively, if return_X_y is True two arrays will be returned:
the data numpy array (5620 x 64), the labels numpy array (5620)
References
-------
http://archive.ics.uci.edu/ml/datasets/optical+recognition+of+handwritten+digits
"""
subset = subset.lower()
assert subset in ["all", "train",
"test"], "subset must match 'all', 'train' or 'test'. Your input {0}".format(subset)
if subset == "all" or subset == "train":
filename = _get_download_dir(downloads_path) + "/optdigits.tra"
data, labels = _load_data_file(filename,
"https://archive.ics.uci.edu/ml/machine-learning-databases/optdigits/optdigits.tra")
if subset == "all" or subset == "test":
filename = _get_download_dir(downloads_path) + "/optdigits.tes"
test_data, test_labels = _load_data_file(filename,
"https://archive.ics.uci.edu/ml/machine-learning-databases/optdigits/optdigits.tes")
if subset == "all":
data = np.r_[data, test_data]
labels = np.r_[labels, test_labels]
else:
data = test_data
labels = test_labels
# Return values
if return_X_y:
return data, labels
else:
data_image = data.reshape((-1, 8, 8))
return Bunch(dataset_name="Optdigits", data=data, target=labels, images=data_image, image_format="HW")
[docs]def load_semeion(return_X_y: bool = False, downloads_path: str = None) -> Bunch:
"""
Load the semeion data set. It consists of 1593 samples belonging to one of 10 classes.
Each sample corresponds to a grayscale 16x16 scan of handwritten digits originating from about 80 different persons.
Further, each pixel was converted to a boolean value using a fixed threshold.
N=1593, d=256, k=10.
Parameters
----------
return_X_y : bool
If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
downloads_path : str
path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
Returns
-------
bunch : Bunch
A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute.
Furthermore, the original images are contained in the 'images' attribute.
Alternatively, if return_X_y is True two arrays will be returned:
the data numpy array (1593 x 256), the labels numpy array (1593)
References
-------
https://archive.ics.uci.edu/ml/datasets/semeion+handwritten+digit
"""
filename = _get_download_dir(downloads_path) + "/semeion.data"
if not os.path.isfile(filename):
_download_file("https://archive.ics.uci.edu/ml/machine-learning-databases/semeion/semeion.data",
filename)
datafile = np.genfromtxt(filename)
# Last columns each correspond to one label (one-hot encoding)
data = datafile[:, :-10]
labels = np.zeros(data.shape[0], dtype=np.int32)
for i in range(1, 10):
labels[datafile[:, -10 + i] == 1] = i
# Return values
if return_X_y:
return data, labels
else:
data_image = data.reshape((-1, 16, 16))
return Bunch(dataset_name="Semeion", data=data, target=labels, images=data_image, image_format="HW")
[docs]def load_cmu_faces(return_X_y: bool = False, downloads_path: str = None) -> Bunch:
"""
Load the CMU Face Images data set. It consists of 640 30x32 grayscale images showing 20 persons in different poses
(up, straight, left, right) and with different expressions (neutral, happy, sad, angry). Additionally, the persons
can wear sunglasses or not.
16 images show glitches which is why the final data set only contains 624 images.
N=624, d=400, k=[20,4,4,2].
Parameters
-------
return_X_y : bool
If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
downloads_path : str
path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
Returns
-------
bunch : Bunch
A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute.
Furthermore, the original images are contained in the 'images' attribute.
Alternatively, if return_X_y is True two arrays will be returned:
the data numpy array (624 x 400), the labels numpy array (624 x 4)
References
-------
http://archive.ics.uci.edu/ml/datasets/cmu+face+images
"""
directory = _get_download_dir(downloads_path) + "/cmufaces/"
filename = directory + "faces_4.tar.gz"
if not os.path.isfile(filename):
if not os.path.isdir(directory):
os.mkdir(directory)
_download_file("http://archive.ics.uci.edu/ml/machine-learning-databases/faces-mld/faces_4.tar.gz",
filename)
# Unpack zipfile
with tarfile.open(filename, "r:gz") as tar:
tar.extractall(directory)
names = np.array(
["an2i", "at33", "boland", "bpm", "ch4f", "cheyer", "choon", "danieln", "glickman", "karyadi", "kawamura",
"kk49", "megak", "mitchell", "night", "phoebe", "saavik", "steffi", "sz24", "tammo"])
positions = np.array(["straight", "left", "right", "up"])
expressions = np.array(["neutral", "happy", "sad", "angry"])
eyes = np.array(["open", "sunglasses"])
data_list = []
label_list = []
for name in names:
path_images = directory + "/faces_4/" + name
for image in os.listdir(path_images):
if not image.endswith("_4.pgm"):
continue
# get image data
image_data = Image.open(path_images + "/" + image)
image_array = np.array(image_data)
# Get labels
name_parts = image.split("_")
user_id = np.argwhere(names == name_parts[0])[0][0]
position = np.argwhere(positions == name_parts[1])[0][0]
expression = np.argwhere(expressions == name_parts[2])[0][0]
eye = np.argwhere(eyes == name_parts[3])[0][0]
label_data = np.array([user_id, position, expression, eye])
# Save data and labels
data_list.append(image_array)
label_list.append(label_data)
labels = np.array(label_list, dtype=np.int32)
data_image = np.array(data_list)
# Flatten data
data_flatten = flatten_images(data_image, "HW")
# Return values
if return_X_y:
return data_flatten, labels
else:
return Bunch(dataset_name="CMUFace", data=data_flatten, target=labels, images=data_image, image_format="HW",
classes=[names, positions, expressions, eyes])