# Source code for clustpy.data.real_nr_data

import numpy as np
import os
from clustpy.data._utils import _get_download_dir, _download_file
import tarfile
import re
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.feature_selection import VarianceThreshold
from nltk.stem import SnowballStemmer
from PIL import Image


def _load_nr_data(file_name: str, n_labels: int) -> (np.ndarray, np.ndarray):
    """
    Helper function to load a non-redundant data set from ClustPy's internal data sets directory.
    The file is parsed as comma-separated values. The first n_labels columns are returned as labels,
    all remaining columns are returned as data.

    Parameters
    ----------
    file_name : str
        Name of the data set file inside the "datasets" directory that lies next to this module
    n_labels : int
        Number of label sets, i.e., the number of leading columns that contain labels

    Returns
    -------
    data, labels : (np.ndarray, np.ndarray)
        the data numpy array, the labels numpy array (converted to int32)
    """
    # os.path.join also works if dirname(__file__) is empty (plain concatenation would yield "/datasets/...")
    path = os.path.join(os.path.dirname(__file__), "datasets", file_name)
    dataset = np.genfromtxt(path, delimiter=",")
    if dataset.ndim == 1 and dataset.size > 0:
        # A file containing a single row is parsed as a 1-d array. Treat it as one sample so that the
        # column slicing below works
        dataset = dataset.reshape(1, -1)
    data = dataset[:, n_labels:]
    # Labels are parsed as floats -> convert them to int32 format
    labels = dataset[:, :n_labels].astype(np.int32)
    return data, labels


def load_aloi_small() -> (np.ndarray, np.ndarray):
    """
    Load a subset of the Amsterdam Library of Object Images (ALOI) consisting of 288 images of the objects red ball,
    red cylinder, green ball and green cylinder. The two label sets are cylinder/ball and red/green.
    N=288, d=611, k=[2,2].

    Returns
    -------
    data, labels : (np.ndarray, np.ndarray)
        the data numpy array (288 x 611), the labels numpy array (288 x 2)

    References
    ----------
    https://aloi.science.uva.nl/

    and

    Ye, Wei, et al. "Generalized independent subspace clustering." 2016 IEEE 16th International Conference on Data
    Mining (ICDM). IEEE, 2016.
    """
    # The first two columns of the file contain the two label sets
    return _load_nr_data("aloi_small.data", 2)


def load_fruit() -> (np.ndarray, np.ndarray):
    """
    Load the fruits data set. It consists of 105 preprocessed images of apples, bananas and grapes in red, green and
    yellow.
    N=105, d=6, k=[3,3].

    Returns
    -------
    data, labels : (np.ndarray, np.ndarray)
        the data numpy array (105 x 6), the labels numpy array (105 x 2)

    References
    ----------
    Hu, Juhua, et al. "Finding multiple stable clusterings." Knowledge and Information Systems 51.3 (2017): 991-1021.
    """
    # The first two columns of the file contain the two label sets
    return _load_nr_data("fruit.data", 2)


def load_nrletters() -> (np.ndarray, np.ndarray):
    """
    Load the NRLetters data set. It consists of 10000 9x7 images of the letters A, B, C, X, Y and Z in pink, cyan and
    yellow. Additionally, each image highlights one corner in color.
    N=10000, d=189, k=[6,3,4].

    Returns
    -------
    data, labels : (np.ndarray, np.ndarray)
        the data numpy array (10000 x 189), the labels numpy array (10000 x 3)

    References
    ----------
    Leiber, Collin, et al. "Automatic Parameter Selection for Non-Redundant Clustering." Proceedings of the 2022 SIAM
    International Conference on Data Mining (SDM). Society for Industrial and Applied Mathematics, 2022.
    """
    # The first three columns of the file contain the three label sets
    return _load_nr_data("nrLetters.data", 3)


def load_stickfigures() -> (np.ndarray, np.ndarray):
    """
    Load the Dancing Stick Figures data set. It consists of 900 20x20 grayscale images of stick figures in different
    poses. The poses can be divided into three upper-body and three lower-body motions.
    N=900, d=400, k=[3,3].

    Returns
    -------
    data, labels : (np.ndarray, np.ndarray)
        the data numpy array (900 x 400), the labels numpy array (900 x 2)

    References
    ----------
    Günnemann, Stephan, et al. "Smvc: semi-supervised multi-view clustering in subspace projections." Proceedings of
    the 20th ACM SIGKDD international conference on Knowledge discovery and data mining. 2014.
    """
    # The first two columns of the file contain the two label sets
    return _load_nr_data("stickfigures.data", 2)


"""
Load UCI data sets
"""


def load_cmu_faces(downloads_path: str = None) -> (np.ndarray, np.ndarray):
    """
    Load the CMU Face Images data set. It consists of 640 30x32 grayscale images showing 20 persons in different poses
    (up, straight, left, right) and with different expressions (neutral, happy, sad, angry). Additionally, the persons
    can wear sunglasses or not.
    16 images show glitches which is why the final data set only contains 624 images.
    N=624, d=960, k=[20,4,4,2].

    The archive is downloaded to the download directory if it is not already present. It is unpacked after a download
    or if the extracted image folder is missing.

    Parameters
    ----------
    downloads_path : str
        path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)

    Returns
    -------
    data, labels : (np.ndarray, np.ndarray)
        the data numpy array (624 x 960), the labels numpy array (624 x 4).
        The label columns are: person, pose, expression, eyes (open / sunglasses)

    References
    ----------
    http://archive.ics.uci.edu/ml/datasets/cmu+face+images
    """
    directory = _get_download_dir(downloads_path) + "/cmufaces/"
    filename = directory + "faces_4.tar.gz"
    images_root = directory + "faces_4/"
    downloaded = False
    if not os.path.isfile(filename):
        # makedirs also creates missing parent directories and does not fail if the directory already exists
        os.makedirs(directory, exist_ok=True)
        _download_file("http://archive.ics.uci.edu/ml/machine-learning-databases/faces-mld/faces_4.tar.gz",
                       filename)
        downloaded = True
    # Also unpack if the archive is present but the images are not (e.g., a previous run was interrupted)
    if downloaded or not os.path.isdir(images_root):
        # Unpack tarball
        with tarfile.open(filename, "r:gz") as tar:
            if hasattr(tarfile, "data_filter"):
                # The archive comes from the network -> use the safe extraction filter where available
                tar.extractall(directory, filter="data")
            else:
                tar.extractall(directory)
    names = np.array(
        ["an2i", "at33", "boland", "bpm", "ch4f", "cheyer", "choon", "danieln", "glickman", "karyadi", "kawamura",
         "kk49", "megak", "mitchell", "night", "phoebe", "saavik", "steffi", "sz24", "tammo"])
    positions = np.array(["straight", "left", "right", "up"])
    expressions = np.array(["neutral", "happy", "sad", "angry"])
    eyes = np.array(["open", "sunglasses"])
    data_list = []
    label_list = []
    for name in names:
        path_images = images_root + name
        for image in os.listdir(path_images):
            # Only files ending with "_4.pgm" are used, all other files in the folder are skipped
            if not image.endswith("_4.pgm"):
                continue
            # Get image data; the context manager closes the underlying file handle again
            with Image.open(path_images + "/" + image) as image_data:
                image_data_vector = np.array(image_data).reshape(image_data.size[0] * image_data.size[1])
            # Get labels; the file name has the form [person]_[pose]_[expression]_[eyes]_4.pgm
            name_parts = image.split("_")
            user_id = np.argwhere(names == name_parts[0])[0][0]
            position = np.argwhere(positions == name_parts[1])[0][0]
            expression = np.argwhere(expressions == name_parts[2])[0][0]
            eye = np.argwhere(eyes == name_parts[3])[0][0]
            label_data = np.array([user_id, position, expression, eye])
            # Save data and labels
            data_list.append(image_data_vector)
            label_list.append(label_data)
    labels = np.array(label_list, dtype=np.int32)
    data = np.array(data_list)
    return data, labels


"""
Load WebKB
"""


def load_webkb(use_universities: tuple = ("cornell", "texas", "washington", "wisconsin"),
               use_categories: tuple = ("course", "faculty", "project", "student"), remove_headers: bool = True,
               min_doc_frequency: float = 0.01, min_variance: float = 0.25, downloads_path: str = None) -> (
        np.ndarray, np.ndarray):
    """
    Load the WebKB data set. It consists of 1041 HTML documents from different universities (default: "cornell",
    "texas", "washington" and "wisconsin"). These web pages have a specified category (default: "course", "faculty",
    "project", "student"). For more information see the references website.
    The data is preprocessed by using stemming and removing stop words. Furthermore, words with a document frequency
    smaller than min_doc_frequency or with a variance smaller than min_variance will be removed.
    N=1041, d=323, k=[4,4] using the default settings.

    The archive is downloaded to the download directory if it is not already present. It is unpacked after a download
    or if the extracted "webkb" folder is missing.

    Parameters
    ----------
    use_universities : tuple
        specify the universities (default: ("cornell", "texas", "washington", "wisconsin"))
    use_categories : tuple
        specify the categories (default: ("course", "faculty", "project", "student"))
    remove_headers : bool
        should the headers of the HTML files be removed? (default: True)
    min_doc_frequency : float
        minimum document frequency of the words (default: 0.01)
    min_variance : float
        minimum variance of the words (default: 0.25)
    downloads_path : str
        path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)

    Returns
    -------
    data, labels : (np.ndarray, np.ndarray)
        the data numpy array (1041 x 323 - using the default settings), the labels numpy array (1041 x 2 - using the
        default settings). The first label column is the index of the category within use_categories, the second
        label column is the index of the university within use_universities

    References
    ----------
    http://www.cs.cmu.edu/~webkb/
    """
    directory = _get_download_dir(downloads_path) + "/WebKB/"
    filename = directory + "webkb-data.gtar.gz"
    downloaded = False
    if not os.path.isfile(filename):
        # makedirs also creates missing parent directories and does not fail if the directory already exists
        os.makedirs(directory, exist_ok=True)
        _download_file("http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/webkb-data.gtar.gz",
                       filename)
        downloaded = True
    # Also unpack if the archive is present but the documents are not (e.g., a previous run was interrupted)
    if downloaded or not os.path.isdir(directory + "webkb"):
        # The archive comes from the network -> use the safe extraction filter where available
        extract_kwargs = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}
        # Unpack tarball
        with tarfile.open(filename, "r:gz") as tar:
            for obj in tar.getmembers():
                if obj.isdir():
                    # Create Directory
                    tar.extract(obj, directory, **extract_kwargs)
                    continue
                # Get file content. extractfile returns None for members that are not regular files
                content = tar.extractfile(obj)
                if content is None:
                    continue
                # Can not handle filenames with special characters. Therefore, rename files
                new_name = obj.name.replace("~", "_").replace(".", "_").replace("^", "_").replace(":", "_").replace(
                    "\r", "")
                target = directory + new_name
                # Make sure the target folder exists even if the archive lists the file before its directory
                os.makedirs(os.path.dirname(target), exist_ok=True)
                # Write file
                with content, open(target, "wb") as output:
                    output.write(content.read())
    texts = []
    label_list = []
    html_tags = re.compile(r'<[^>]+>')
    head_tags = re.compile(r'MIME-Version:[:,./\-\w\s]+<html>')
    # '\d+' removes the same characters as '\d*' but does not produce empty matches
    number_tags = re.compile(r'\d+')
    # Read files
    for i, category in enumerate(use_categories):
        for j, university in enumerate(use_universities):
            inner_directory = "{0}webkb/{1}/{2}/".format(directory, category, university)
            for file in os.listdir(inner_directory):
                with open(inner_directory + file, "r", encoding='latin-1') as f:
                    text = f.read()
                if remove_headers:
                    # Remove header
                    text = head_tags.sub('', text)
                # Remove HTML tags
                text = html_tags.sub('', text)
                # Remove numbers
                text = number_tags.sub('', text)
                texts.append(text)
                label_list.append((i, j))
    # Build the label array once; reshape keeps the (n, 2) shape even if no document was found
    labels = np.array(label_list, dtype=np.int32).reshape(-1, 2)
    # Count the stemmed words (without stop-words), remove low-variance words and apply TF-IDF afterwards
    vectorizer = _StemmedCountVectorizer(dtype=np.float64, stop_words="english", min_df=min_doc_frequency)
    data_sparse = vectorizer.fit_transform(texts)
    selector = VarianceThreshold(min_variance)
    data_sparse = selector.fit_transform(data_sparse)
    tfidf = TfidfTransformer(sublinear_tf=True)
    data_sparse = tfidf.fit_transform(data_sparse)
    # Convert the sparse matrix to a dense numpy array
    data = data_sparse.toarray()
    return data, labels


class _StemmedCountVectorizer(CountVectorizer):
    """
    Helper class for load_webkb(). A CountVectorizer whose analyzer additionally stems every token with the
    SnowballStemmer.
    See: https://stackoverflow.com/questions/36182502/add-stemming-support-to-countvectorizer-sklearn
    """

    def build_analyzer(self):
        """
        Custom build_analyzer method. Wraps the analyzer of the CountVectorizer parent class so that each word it
        produces is passed through SnowballStemmer('english').

        Returns
        -------
        stemmed_words : Callable
            function that takes a document and returns a generator over the stemmed words in the document
        """
        base_analyzer = super().build_analyzer()
        snowball = SnowballStemmer('english')

        def stemmed_words(doc):
            # Lazily stem the tokens produced by the parent analyzer
            return (snowball.stem(token) for token in base_analyzer(doc))

        return stemmed_words