# Source code for clustpy.data.real_world_data

try:
    from nltk.stem import SnowballStemmer
except:
    print(
        "[WARNING] Could not import nltk in clustpy.data.real_world_data. Please install nltk by 'pip install nltk' if necessary")
try:
    from PIL import Image
except:
    print(
        "[WARNING] Could not import PIL in clustpy.data.real_world_data. Please install PIL by 'pip install Pillow' if necessary")
from clustpy.data._utils import _download_file, _get_download_dir, _download_file_from_google_drive, _load_image_data, \
    flatten_images
import os
import numpy as np
import zipfile
import tarfile
from sklearn.preprocessing import LabelEncoder
from sklearn.datasets import fetch_20newsgroups, fetch_rcv1, load_iris as sk_load_iris, load_wine as sk_load_wine, \
    load_breast_cancer as sk_load_breast_cancer, fetch_olivetti_faces
from scipy.io import loadmat
import re
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import VarianceThreshold
from sklearn.datasets._base import Bunch

# More datasets https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass.html#usps


"""
Load Sklearn datasets
"""


def load_iris(return_X_y: bool = False) -> Bunch:
    """
    Load the iris data set. It contains the sepal and petal length and width of flowers from three different types
    of irises (Setosa, Versicolour, Virginica).
    N=150, d=4, k=3.

    Parameters
    ----------
    return_X_y : bool
        If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)

    Returns
    -------
    bunch : Bunch
        A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute.
        Alternatively, if return_X_y is True two arrays will be returned:
        the data numpy array (150 x 4), the labels numpy array (150)

    References
    ----------
    https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_iris.html
    https://archive.ics.uci.edu/ml/datasets/iris
    """
    iris = sk_load_iris(return_X_y=return_X_y)
    if return_X_y:
        # sklearn already hands back the plain (data, target) pair in this mode
        return iris
    # Only the Bunch variant can carry the additional name attribute
    iris.dataset_name = "Iris"
    return iris


def load_wine(return_X_y: bool = False) -> Bunch:
    """
    Load the wine data set. Each sample describes a wine by 13 different properties; the samples belong to three
    different types of wine.
    N=178, d=13, k=3.

    Parameters
    ----------
    return_X_y : bool
        If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)

    Returns
    -------
    bunch : Bunch
        A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute.
        Alternatively, if return_X_y is True two arrays will be returned:
        the data numpy array (178 x 13), the labels numpy array (178)

    References
    ----------
    https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_wine.html
    https://archive.ics.uci.edu/ml/datasets/wine
    """
    wine = sk_load_wine(return_X_y=return_X_y)
    if return_X_y:
        # sklearn already hands back the plain (data, target) pair in this mode
        return wine
    # Only the Bunch variant can carry the additional name attribute
    wine.dataset_name = "Wine"
    return wine


def load_breast_cancer(return_X_y: bool = False) -> Bunch:
    """
    Load the breast cancer wisconsin data set. It consists of 30 features computed from digitized images of fine
    needle aspirate of breast mass. The classes are the result of a diagnosis (malignant or benign).
    N=569, d=30, k=2.

    Parameters
    ----------
    return_X_y : bool
        If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)

    Returns
    -------
    bunch : Bunch
        A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute.
        Alternatively, if return_X_y is True two arrays will be returned:
        the data numpy array (569 x 30), the labels numpy array (569)

    References
    ----------
    https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_breast_cancer.html#sklearn.datasets.load_breast_cancer
    https://archive.ics.uci.edu/ml/datasets/breast+cancer+wisconsin+(diagnostic)
    """
    breast_cancer = sk_load_breast_cancer(return_X_y=return_X_y)
    if return_X_y:
        # sklearn already hands back the plain (data, target) pair in this mode
        return breast_cancer
    # Only the Bunch variant can carry the additional name attribute
    breast_cancer.dataset_name = "BreastCancer"
    return breast_cancer


def load_olivetti_faces(return_X_y: bool = False) -> Bunch:
    """
    Load the olivetti faces data set. It consists of 400 grayscale images of size 64x64 showing the faces of
    40 different persons.
    N=400, d=4096, k=40.

    Parameters
    ----------
    return_X_y : bool
        If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)

    Returns
    -------
    bunch : Bunch
        A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute.
        Furthermore, the original images are contained in the 'images' attribute.
        Alternatively, if return_X_y is True two arrays will be returned:
        the data numpy array (400 x 4096), the labels numpy array (400)

    References
    ----------
    https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_olivetti_faces.html
    """
    faces = fetch_olivetti_faces()
    if not return_X_y:
        # Enrich the sklearn Bunch with the image layout and a dataset name
        faces.image_format = "HW"
        faces.dataset_name = "OlivettiFaces"
        return faces
    return faces.data, faces.target


def load_newsgroups(subset: str = "all", n_features: int = 2000, return_X_y: bool = False) -> Bunch:
    """
    Load the 20 newsgroups data set. It is a collection of 18846 newsgroup documents that are (nearly) evenly
    distributed across 20 different newsgroups. Headers, footers and quotes are stripped and the documents are
    converted into feature vectors using TF-IDF.
    The data set is composed of 11314 training and 7532 test documents.
    N=18846, d=2000, k=20 using the default settings.

    Parameters
    ----------
    subset : str
        can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all')
    n_features : int
        number of features used by TF-IDF (default: 2000)
    return_X_y : bool
        If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)

    Returns
    -------
    bunch : Bunch
        A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute.
        Alternatively, if return_X_y is True two arrays will be returned:
        the data numpy array (18846 x 2000 - using the default settings), the labels numpy array (18846)

    References
    ----------
    https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_20newsgroups.html#sklearn.datasets.fetch_20newsgroups
    http://qwone.com/~jason/20Newsgroups/
    """
    raw_documents = fetch_20newsgroups(subset=subset, remove=('headers', 'footers', 'quotes'))
    # TF-IDF restricted to the n_features terms with the highest term frequency
    tfidf = TfidfVectorizer(max_features=n_features, dtype=np.float64, sublinear_tf=True)
    features = np.asarray(tfidf.fit_transform(raw_documents.data).todense())
    target = raw_documents.target
    if not return_X_y:
        return Bunch(dataset_name="20Newsgroups", data=features, target=target)
    return features, target


def load_reuters(subset: str = "all", n_features: int = 2000, categories: tuple = ("CCAT", "GCAT", "MCAT", "ECAT"),
                 return_X_y: bool = False) -> Bunch:
    """
    Load the Reuters data set. It consists of over 800000 manually categorized newswire stories made available by Reuters,
    Ltd. Usually only a subset of the categories is used. Those categories are defined by the attribute 'categories'.
    We use only those articles that belong to a single category. Further, we only use the n_features most frequent
    features.
    The data set is composed of 19806 training and 665265 test documents using the default settings.
    N=685071, d=2000, k=4 using the default settings.

    Parameters
    ----------
    subset : str
        can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all')
    n_features : int
        number of features used (default: 2000)
    categories : tuple
        the categories that should be contained (default: ("CCAT", "GCAT", "MCAT", "ECAT"))
    return_X_y : bool
        If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)

    Returns
    -------
    bunch : Bunch
        A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute.
        The label ids range from 0 to k-1 and follow the order in which the selected categories occur in the
        'target_names' of the underlying sklearn data set (not the order of the 'categories' argument).
        Alternatively, if return_X_y is True two arrays will be returned:
        the data numpy array (685071 x 2000 - using the default settings), the labels numpy array (685071 - using the default settings)

    References
    ----------
    https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_rcv1.html#sklearn.datasets.fetch_rcv1

    and

    Lewis, David D., et al. "Rcv1: A new benchmark collection for text categorization research." Journal of machine
    learning research 5.Apr (2004): 361-397.
    """
    reuters = fetch_rcv1(subset=subset)
    # Column indices (within the full target matrix) of the requested categories
    relevant_cats = [i for i, tn in enumerate(reuters.target_names) if tn in categories]
    filtered_labels = reuters.target[:, relevant_cats]
    # Only keep documents that are assigned to exactly one of the requested categories
    sum_of_labelings = np.sum(filtered_labels, axis=1)
    single_doc_ids = np.where(sum_of_labelings == 1)[0]
    # The position of the single non-zero entry within the filtered columns is the label.
    # It already lies in [0, k-1], so no further re-mapping is required. (A former re-mapping step compared
    # these positions with the global column indices and could merge classes for categories with a small
    # global index.)
    labels = np.argmax(filtered_labels[single_doc_ids], axis=1)
    labels = np.asarray(labels)[:, 0]
    # Get most frequent columns
    reuters_data = reuters.data[single_doc_ids]
    frequencies = np.asarray(np.sum(reuters_data, axis=0))[0]
    sorted_frequencies = np.argsort(frequencies)[::-1]
    selected_features = sorted_frequencies[:n_features]
    data = np.asarray(reuters_data[:, selected_features].todense())
    if return_X_y:
        return data, labels
    else:
        return Bunch(dataset_name="Reuters", data=data, target=labels)


"""
Other datasets
"""


def load_imagenet_dog(subset: str = "all",
                      image_size: tuple = (224, 224),
                      breeds: list = ["n02085936-Maltese_dog", "n02086646-Blenheim_spaniel", "n02088238-basset",
                                      "n02091467-Norwegian_elkhound", "n02097209-standard_schnauzer",
                                      "n02099601-golden_retriever", "n02101388-Brittany_spaniel", "n02101556-clumber",
                                      "n02102177-Welsh_springer_spaniel", "n02105056-groenendael", "n02105412-kelpie",
                                      "n02105855-Shetland_sheepdog", "n02107142-Doberman", "n02110958-pug",
                                      "n02112137-chow"],
                      return_X_y: bool = False, downloads_path: str = None) -> Bunch:
    """
    Load the ImageNet Dog data set. It consists of 20580 color images of different sizes showing 120 breeds of dogs.
    The data set is composed of 12000 training and 8580 test images.
    Usually, a subset of 15 dog breeds is used (Maltese_dog, Blenheim_spaniel, Basset, Norwegian_elkhound,
    Standard_schnauzer, Golden_retriever, Brittany_spaniel, Clumber, Welsh_springer_spaniel, Groenendael, Kelpie,
    Shetland_sheepdog, Doberman, Pug, Chow), resulting in 2574 images for the "all" subset.
    N=20580, d=image_size[0]*image_size[1]*3, k=120.

    Parameters
    ----------
    subset : str
        can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all')
    image_size : tuple
        the images of various sizes must be converted into a coherent size.
        The tuple equals (width, height) of the images (default: (224, 224))
    breeds : list
        list containing all the identifiers of the dog breeds that should be extracted. All entries must be of type str.
        If None, all breeds will be extracted.
        Usually, a subset consisting of 15 breeds is extracted (default: list with 15 dog breeds)
    return_X_y : bool
        If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
    downloads_path : str
        path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)

    Returns
    -------
    bunch : Bunch
        A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute.
        Furthermore, the original images are contained in the 'images' attribute and the used breeds in the
        'classes' attribute.
        Note that the data within 'data' is in HWC format and within 'images' in the CHW format.
        Alternatively, if return_X_y is True two arrays will be returned:
        the data numpy array (20580 x image_size[0]*image_size[1]*3), the labels numpy array (20580)

    References
    ----------
    http://vision.stanford.edu/aditya86/ImageNetDogs/main.html

    and

    Khosla, Aditya, et al. "Novel dataset for fine-grained image categorization: Stanford dogs."
    Proc. CVPR workshop on fine-grained visual categorization (FGVC). Vol. 2. No. 1. Citeseer, 2011.
    """
    assert len(image_size) == 2, "image_size format must match (width, height)"
    subset = subset.lower()
    assert subset in ["all", "train",
                      "test"], "subset must match 'all', 'train' or 'test'. Your input {0}".format(subset)
    directory = _get_download_dir(downloads_path) + "/ImageNetDog/"
    filename = directory + "images.tar"
    if not os.path.isfile(filename):
        # Also creates missing parent directories and tolerates an already existing directory
        os.makedirs(directory, exist_ok=True)
        _download_file("http://vision.stanford.edu/aditya86/ImageNetDogs/images.tar",
                       filename)
        # Unpack tar archive
        with tarfile.open(filename, "r") as tar:
            tar.extractall(directory)
    # Get files for test/train split
    train_test_filename = directory + "lists.tar"
    if not os.path.isfile(train_test_filename):
        _download_file("http://vision.stanford.edu/aditya86/ImageNetDogs/lists.tar",
                       train_test_filename)
        # Unpack tar archive
        with tarfile.open(train_test_filename, "r") as tar:
            tar.extractall(directory)
    # Check breeds list
    if breeds is None:
        # os.listdir returns the entries in arbitrary order -> sort to obtain a deterministic 'classes' attribute
        breeds = sorted(os.listdir(directory + "/Images"))
    else:
        # Work on a copy: the list ends up in the returned Bunch and the caller must not be able to modify the
        # shared default argument (or its own list) through it
        breeds = list(breeds)
    # Load data lists
    if subset == "train":
        object_list = loadmat(directory + "/train_list.mat")
    elif subset == "test":
        object_list = loadmat(directory + "/test_list.mat")
    else:
        object_list = loadmat(directory + "/file_list.mat")
    labels = object_list["labels"]
    file_list = object_list["file_list"]
    # get image data
    breed_lookup = set(breeds)
    data_list = []
    use_image = np.ones(labels.shape[0], dtype=bool)
    for i, file in enumerate(file_list):
        file = file[0][0]
        # The first path component of each entry is the breed directory
        if file.split("/")[0] in breed_lookup:
            image_data = _load_image_data(directory + "/Images/" + file, image_size, True)
            data_list.append(image_data)
        else:
            use_image[i] = False
    data_image = np.array(data_list)
    # Flatten data
    data_flatten = flatten_images(data_image, "HWC")
    # Labels in the list files start at 1 -> shift to 0
    labels = labels[use_image, 0].astype(np.int32) - 1
    # Map the remaining label ids to consecutive ids starting at 0
    labels = LabelEncoder().fit_transform(labels)
    # Return values
    if return_X_y:
        return data_flatten, labels
    else:
        data_image = np.transpose(data_image, [0, 3, 1, 2])
        image_format = "CHW"
        return Bunch(dataset_name="ImagenetDog", data=data_flatten, target=labels,
                     images=data_image, image_format=image_format, classes=breeds)


def load_imagenet10(use_224_size: bool = True, return_X_y: bool = False, downloads_path: str = None) -> Bunch:
    """
    Load the ImageNet-10 data set. This is a subset of the well-known ImageNet data set with only 10 classes.
    It consists of 13000 224x224 (or 96x96) color images showing different objects.
    N=13000, d=150528, k=10.

    Parameters
    ----------
    use_224_size : bool
        defines whether the images should be loaded in the size (224 x 224) or (96 x 96) (default: True)
    return_X_y : bool
        If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
    downloads_path : str
        path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)

    Returns
    -------
    bunch : Bunch
        A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute.
        Furthermore, the original images are contained in the 'images' attribute.
        Note that the data within 'data' is in HWC format and within 'images' in the CHW format.
        Alternatively, if return_X_y is True two arrays will be returned:
        the data numpy array (13000 x 150528), the labels numpy array (13000)

    References
    ----------
    https://www.image-net.org/

    and

    Russakovsky, Olga, et al. "Imagenet large scale visual recognition challenge."
    International journal of computer vision 115 (2015): 211-252.
    """
    directory = _get_download_dir(downloads_path) + "/ImageNet10"
    if not os.path.isdir(directory):
        os.mkdir(directory)
    # Source: https://drive.google.com/drive/folders/1XL0Nohi4vO2f1I4znf388n2pMP8PiKFd
    # (local file suffix, Google Drive id) for the data file followed by the labels file
    if use_224_size:
        remote_files = (("/data_224.npy", "1sLfA0U9s9Q5Cf8o32GxYoyiyrzZN1K_6"),
                        ("/labels_224.npy", "1OjAQwaGnAfJBW66HFkR7yODLFxnTZWWI"))
    else:
        remote_files = (("/data_96.npy", "13VbP1qYz6bSeibnoR-w0J_jL9bQf6tGX"),
                        ("/labels_96.npy", "1uiuYUdjyCITLURc5eo8ByP9b51MK_Uk6"))
    local_paths = []
    for suffix, drive_id in remote_files:
        local_path = directory + suffix
        # Download only those files that are not available locally
        if not os.path.isfile(local_path):
            _download_file_from_google_drive(drive_id, local_path)
        local_paths.append(local_path)
    filename_data, filename_labels = local_paths
    # Load data and labels
    images = np.load(filename_data)
    targets = np.load(filename_labels)
    # Flatten data
    flat_images = flatten_images(images, "HWC")
    # Convert labels to int32 format
    targets = targets.astype(np.int32)
    # Return values
    if return_X_y:
        return flat_images, targets
    # The Bunch stores the images channel-first
    images = np.transpose(images, [0, 3, 1, 2])
    return Bunch(dataset_name="Imagenet10", data=flat_images, target=targets,
                 images=images, image_format="CHW")


def load_coil20(return_X_y: bool = False, downloads_path: str = None) -> Bunch:
    """
    Load the COIL-20 data set.
    It consists of 1440 128x128 gray-scale images of 20 objects photographed from 72 different angles.
    N=1440, d=16384, k=20.

    Parameters
    ----------
    return_X_y : bool
        If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
    downloads_path : str
        path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)

    Returns
    -------
    bunch : Bunch
        A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute.
        Furthermore, the original images are contained in the 'images' attribute.
        Alternatively, if return_X_y is True two arrays will be returned:
        the data numpy array (1440 x 16384), the labels numpy array (1440)

    References
    ----------
    https://www.cs.columbia.edu/CAVE/software/softlib/coil-20.php
    """
    n_objects = 20
    n_views = 72
    directory = _get_download_dir(downloads_path) + "/COIL20/"
    filename = directory + "coil-20-proc.zip"
    if not os.path.isfile(filename):
        # Also creates missing parent directories and tolerates an already existing directory
        os.makedirs(directory, exist_ok=True)
        _download_file("http://www.cs.columbia.edu/CAVE/databases/SLAM_coil-20_coil-100/coil-20/coil-20-proc.zip",
                       filename)
        # Unpack zipfile
        with zipfile.ZipFile(filename, 'r') as zipf:
            zipf.extractall(directory)
    # get image data
    data_list = []
    for i in range(n_objects):
        for j in range(n_views):
            # Objects are numbered starting at 1, views starting at 0
            image_name = "obj{0}__{1}.png".format(i + 1, j)
            image_data = _load_image_data(directory + "coil-20-proc/" + image_name, None, False)
            assert image_data.shape == (128, 128), \
                "Shape of image {0} is not correct. Must be (128, 128) but is {1}".format(image_name,
                                                                                          image_data.shape)
            data_list.append(image_data)
    # The images are loaded object by object -> each label is repeated once per view
    labels = np.repeat(np.arange(n_objects, dtype=np.int32), n_views)
    # Convert data to numpy
    data_image = np.array(data_list)
    # Flatten data
    data_flatten = flatten_images(data_image, "HW")
    # Return values
    if return_X_y:
        return data_flatten, labels
    else:
        return Bunch(dataset_name="COIL20", data=data_flatten, target=labels, images=data_image, image_format="HW")

def load_coil100(return_X_y: bool = False, downloads_path: str = None) -> Bunch:
    """
    Load the COIL-100 data set.
    It consists of 7200 128x128 color images of 100 objects photographed from 72 different angles.
    N=7200, d=49152, k=100.

    Parameters
    ----------
    return_X_y : bool
        If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
    downloads_path : str
        path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)

    Returns
    -------
    bunch : Bunch
        A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute.
        Furthermore, the original images are contained in the 'images' attribute.
        Note that the data within 'data' is in HWC format and within 'images' in the CHW format.
        Alternatively, if return_X_y is True two arrays will be returned:
        the data numpy array (7200 x 49152), the labels numpy array (7200)

    References
    ----------
    https://www.cs.columbia.edu/CAVE/software/softlib/coil-100.php
    """
    n_objects = 100
    n_views = 72
    directory = _get_download_dir(downloads_path) + "/COIL100/"
    filename = directory + "coil-100.zip"
    if not os.path.isfile(filename):
        # Also creates missing parent directories and tolerates an already existing directory
        os.makedirs(directory, exist_ok=True)
        _download_file("http://www.cs.columbia.edu/CAVE/databases/SLAM_coil-20_coil-100/coil-100/coil-100.zip",
                       filename)
        # Unpack zipfile
        with zipfile.ZipFile(filename, 'r') as zipf:
            zipf.extractall(directory)
    # get image data
    data_list = []
    for i in range(n_objects):
        for j in range(n_views):
            # Objects are numbered starting at 1; the file names carry the view index multiplied by 5
            image_name = "obj{0}__{1}.png".format(i + 1, j * 5)
            image_data = _load_image_data(directory + "coil-100/" + image_name, None, True)
            # Report the name of the file that was actually loaded
            assert image_data.shape == (128, 128, 3), \
                "Shape of image {0} is not correct. Must be (128, 128, 3) but is {1}".format(image_name,
                                                                                             image_data.shape)
            data_list.append(image_data)
    # The images are loaded object by object -> each label is repeated once per view
    labels = np.repeat(np.arange(n_objects, dtype=np.int32), n_views)
    # Convert data to numpy
    data_image = np.array(data_list)
    # Flatten data
    data_flatten = flatten_images(data_image, "HWC")
    # Return values
    if return_X_y:
        return data_flatten, labels
    else:
        # The Bunch stores the images channel-first
        data_image = np.transpose(data_image, [0, 3, 1, 2])
        image_format = "CHW"
        return Bunch(dataset_name="COIL100", data=data_flatten, target=labels, images=data_image,
                     image_format=image_format)

"""
Load WebKB
"""


def load_webkb(use_universities: tuple = ("cornell", "texas", "washington", "wisconsin"),
               use_categories: tuple = ("course", "faculty", "project", "student"), remove_headers: bool = True,
               min_doc_frequency: float = 0.01, min_variance: float = 0.25, return_X_y: bool = False,
               downloads_path: str = None) -> Bunch:
    """
    Load the WebKB data set. It consists of 1041 Html documents from different universities (default: "cornell", "texas",
    "washington" and "wisconsin"). These web pages have a specified category (default: "course", "faculty", "project",
    "student"). For more information see the references website.
    The data is preprocessed by using stemming and removing stop words. Furthermore, words with a document frequency
    smaller than min_doc_frequency or with a variance smaller than min_variance will be removed.
    N=1041, d=323, k=[4,4] using the default settings.

    Parameters
    ----------
    use_universities : tuple
        specify the universities (default: ("cornell", "texas", "washington", "wisconsin"))
    use_categories : tuple
        specify the categories (default: ("course", "faculty", "project", "student"))
    remove_headers : bool
        should the headers of the Html files be removed? (default: True)
    min_doc_frequency : float
        minimum document frequency of the words (default: 0.01)
    min_variance : float
        minimum variance of the words (default: 0.25)
    return_X_y : bool
        If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
    downloads_path : str
        path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)

    Returns
    -------
    bunch : Bunch
        A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute.
        The first label column contains the index of the category, the second one the index of the university.
        Alternatively, if return_X_y is True two arrays will be returned:
        the data numpy array (1041 x 323 - using the default settings), the labels numpy array (1041 x 2 - using the default settings)

    References
    ----------
    http://www.cs.cmu.edu/~webkb/
    """
    directory = _get_download_dir(downloads_path) + "/WebKB/"
    filename = directory + "webkb-data.gtar.gz"
    if not os.path.isfile(filename):
        # Also creates missing parent directories and tolerates an already existing directory
        os.makedirs(directory, exist_ok=True)
        _download_file("http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/webkb-data.gtar.gz",
                       filename)
        # Unpack tar archive
        with tarfile.open(filename, "r:gz") as tar:
            for obj in tar.getmembers():
                if obj.isdir():
                    # Create Directory
                    tar.extract(obj, directory)
                    continue
                # Get file content
                f = tar.extractfile(obj)
                if f is None:
                    # Member is neither a regular file nor a link -> there is no content to copy
                    continue
                # Can not handle filenames with special characters. Therefore, rename files
                new_name = obj.name.replace("~", "_").replace(".", "_").replace("^", "_").replace(":", "_").replace(
                    "\r", "")
                # Write file
                with f, open(directory + new_name, "wb") as output:
                    output.write(f.read())
    html_tags = re.compile(r'<[^>]+>')
    head_tags = re.compile(r'MIME-Version:[:,./\-\w\s]+<html>')
    # One or more digits; a pattern that also matches the empty string would only add useless empty replacements
    number_tags = re.compile(r'\d+')
    texts = []
    label_rows = []
    # Read files
    for i, category in enumerate(use_categories):
        for j, university in enumerate(use_universities):
            inner_directory = "{0}webkb/{1}/{2}/".format(directory, category, university)
            for file in os.listdir(inner_directory):
                with open(inner_directory + file, "r", encoding='latin-1') as f:
                    text = f.read()
                if remove_headers:
                    # Remove header
                    text = head_tags.sub('', text)
                # Remove HTML tags and numbers
                text = html_tags.sub('', text)
                text = number_tags.sub('', text)
                texts.append(text)
                label_rows.append([i, j])
    # Build the (N x 2) label array with a single concatenation instead of copying the array once per document
    labels = np.empty((0, 2), dtype=np.int32)
    if label_rows:
        labels = np.r_[labels, label_rows]
    # Count the stemmed words (without stop-words), drop low-variance words and finally apply TF-IDF
    vectorizer = _StemmedCountVectorizer(dtype=np.float64, stop_words="english", min_df=min_doc_frequency)
    data_sparse = vectorizer.fit_transform(texts)
    selector = VarianceThreshold(min_variance)
    data_sparse = selector.fit_transform(data_sparse)
    tfidf = TfidfTransformer(sublinear_tf=True)
    data_sparse = tfidf.fit_transform(data_sparse)
    data = np.asarray(data_sparse.todense())
    # Return values
    if return_X_y:
        return data, labels
    else:
        return Bunch(dataset_name="WebKB", data=data, target=labels, classes=[use_categories, use_universities])


class _StemmedCountVectorizer(CountVectorizer):
    """
    Helper class for load_webkb(). A CountVectorizer whose analyzer additionally stems every token with the
    SnowballStemmer.
    See: https://stackoverflow.com/questions/36182502/add-stemming-support-to-countvectorizer-sklearn
    """

    def build_analyzer(self):
        """
        Custom build_analyzer method. Wraps the analyzer built by the CountVectorizer parent class so that
        SnowballStemmer('english') is applied to each word it produces.

        Returns
        -------
        stemmed_words : Callable
            function that receives a document and returns a generator over the stemmed words in the document
        """
        english_stemmer = SnowballStemmer('english')
        base_analyzer = super().build_analyzer()

        def stemmed_words(doc):
            # The parent analyzer runs right away, the stemming happens lazily while iterating
            return (english_stemmer.stem(token) for token in base_analyzer(doc))

        return stemmed_words