# Source code for clustpy.data.real_timeseries_data

import numpy as np
from clustpy.data._utils import _get_download_dir, _download_file
from sklearn.datasets._base import Bunch
import os
import zipfile


def _load_timeseries_classification_data(dataset_name: str, subset: str, labels_minus_one: bool, file_type: str,
                                         last_column_are_labels: bool, return_X_y: bool, downloads_path: str) -> Bunch:
    """
    Helper function to load timeseries data from www.timeseriesclassification.com.

    The zip archive of the data set is downloaded and unpacked into a data set specific sub-directory of the
    download directory if the archive is not already present. Afterwards the requested split(s) are parsed.

    Parameters
    ----------
    dataset_name : str
        name of the data set
    subset : str
        can be 'all', 'test' or 'train'. 'all' combines test and train data (train samples first)
    labels_minus_one : bool
        Convert labels from 1,... to 0,...
    file_type : str
        file type within the zip file. Currently supported are "txt" and "ts". Is usually "txt"
    last_column_are_labels : bool
        specifies if the last column contains the labels. If false labels should be contained in the first column
    return_X_y : bool
        If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object
    downloads_path : str
        path to the directory where the data is stored. If input was None this will be equal to
        '[USER]/Downloads/clustpy_datafiles'

    Returns
    -------
    bunch : Bunch
        A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute.
        Alternatively, if return_X_y is True two arrays will be returned:
        the data numpy array, the labels numpy array

    Raises
    ------
    AssertionError
        if subset is not one of 'all', 'train' or 'test'
    ValueError
        if file_type is neither "txt" nor "ts"
    """
    subset = subset.lower()
    assert subset in ["all", "train",
                      "test"], "subset must match 'all', 'train' or 'test'. Your input {0}".format(subset)
    # Fail early with a clear message instead of running into an unbound variable after the download
    if file_type not in ("txt", "ts"):
        raise ValueError("file_type must be 'txt' or 'ts'. Your input {0}".format(file_type))
    directory = _get_download_dir(downloads_path) + "/" + dataset_name + "/"
    filename = directory + dataset_name + ".zip"
    if not os.path.isfile(filename):
        # makedirs also creates missing parent directories and tolerates an already existing directory
        os.makedirs(directory, exist_ok=True)
        _download_file("http://www.timeseriesclassification.com/aeon-toolkit/" + dataset_name + ".zip",
                       filename)
        # Unpack zipfile
        with zipfile.ZipFile(filename, 'r') as zipf:
            zipf.extractall(directory)
    # Decide which splits have to be loaded; the train split always comes first
    split_names = []
    if subset in ("all", "train"):
        split_names.append("TRAIN")
    if subset in ("all", "test"):
        split_names.append("TEST")
    # Load data and labels of each requested split
    parts = [_read_timeseries_split(directory, dataset_name, split_name, file_type, last_column_are_labels)
             for split_name in split_names]
    data = np.concatenate([split_data for split_data, _ in parts], axis=0)
    labels = np.concatenate([split_labels for _, split_labels in parts], axis=0)
    # Convert labels to int32 format
    labels = labels.astype(np.int32)
    if labels_minus_one:
        # Convert labels from 1,... to 0,...
        labels -= 1
    # Return values
    if return_X_y:
        return data, labels
    else:
        return Bunch(dataset_name=dataset_name, data=data, target=labels)


def _read_timeseries_split(directory: str, dataset_name: str, split_name: str, file_type: str,
                           last_column_are_labels: bool) -> tuple:
    """
    Parse a single split file of an unpacked data set and separate the label column from the features.

    Parameters
    ----------
    directory : str
        directory (with trailing separator) that contains the unpacked files
    dataset_name : str
        name of the data set; used as prefix of the file name
    split_name : str
        "TRAIN" or "TEST"; used as suffix of the file name
    file_type : str
        file extension, either "txt" or "ts"
    last_column_are_labels : bool
        if True the labels are taken from the last column, otherwise from the first column

    Returns
    -------
    tuple : (np.ndarray, np.ndarray)
        the data numpy array, the (not yet converted) labels numpy array
    """
    path = directory + dataset_name + "_" + split_name + "." + file_type
    if file_type == "txt":
        # Normally we have txt files
        dataset = np.genfromtxt(path)
    else:
        # Ts files must be changed first: ':' becomes the ',' delimiter and '@' lines are turned into comments
        with open(path, "rb") as f:
            clean_lines = (line.replace(b":", b",").replace(b"@", b"#") for line in f)
            dataset = np.genfromtxt(clean_lines, delimiter=",", comments="#")
    # Are labels in first or last column?
    if last_column_are_labels:
        return dataset[:, :-1], dataset[:, -1]
    return dataset[:, 1:], dataset[:, 0]


def load_motestrain(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch:
    """
    Load the motestrain data set. It consists of 1272 samples belonging to one of 2 classes.
    The data set is composed of 20 training and 1252 test samples.
    N=1272, d=84, k=2.

    The labels are read from the first column of the txt files and shifted so that they start at 0.

    Parameters
    ----------
    subset : str
        can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all')
    return_X_y : bool
        If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
    downloads_path : str
        path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)

    Returns
    -------
    bunch : Bunch
        A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute.
        Alternatively, if return_X_y is True two arrays will be returned:
        the data numpy array (1272 x 84), the labels numpy array (1272)

    References
    ----------
    http://www.timeseriesclassification.com/description.php?Dataset=MoteStrain
    """
    return _load_timeseries_classification_data(dataset_name="MoteStrain", subset=subset, labels_minus_one=True,
                                                file_type="txt", last_column_are_labels=False,
                                                return_X_y=return_X_y, downloads_path=downloads_path)


def load_proximal_phalanx_outline(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch:
    """
    Load the proximal phalanx outline data set. It consists of 891 samples belonging to one of 2 classes.
    The data set is composed of 600 training and 291 test samples.
    N=891, d=80, k=2.

    The labels are read from the first column of the txt files and are used as they are (no shift by one).

    Parameters
    ----------
    subset : str
        can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all')
    return_X_y : bool
        If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
    downloads_path : str
        path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)

    Returns
    -------
    bunch : Bunch
        A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute.
        Alternatively, if return_X_y is True two arrays will be returned:
        the data numpy array (891 x 80), the labels numpy array (891)

    References
    ----------
    http://www.timeseriesclassification.com/description.php?Dataset=ProximalPhalanxOutlineCorrect
    """
    # Load the archive that matches the function name and the reference above (previously the distal variant
    # 'DistalPhalanxOutlineCorrect' was requested here)
    return _load_timeseries_classification_data(dataset_name="ProximalPhalanxOutlineCorrect", subset=subset,
                                                labels_minus_one=False, file_type="txt",
                                                last_column_are_labels=False, return_X_y=return_X_y,
                                                downloads_path=downloads_path)


def load_diatom_size_reduction(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch:
    """
    Load the diatom size reduction data set. It consists of 322 samples belonging to one of 4 classes.
    The data set is composed of 16 training and 306 test samples.
    N=322, d=345, k=4.

    The labels are read from the first column of the txt files and shifted so that they start at 0.

    Parameters
    ----------
    subset : str
        can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all')
    return_X_y : bool
        If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
    downloads_path : str
        path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)

    Returns
    -------
    bunch : Bunch
        A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute.
        Alternatively, if return_X_y is True two arrays will be returned:
        the data numpy array (322 x 345), the labels numpy array (322)

    References
    ----------
    http://www.timeseriesclassification.com/description.php?Dataset=DiatomSizeReduction
    """
    return _load_timeseries_classification_data(dataset_name="DiatomSizeReduction", subset=subset,
                                                labels_minus_one=True, file_type="txt",
                                                last_column_are_labels=False, return_X_y=return_X_y,
                                                downloads_path=downloads_path)


def load_symbols(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch:
    """
    Load the symbols data set. It consists of 1020 samples belonging to one of 6 classes.
    The data set is composed of 25 training and 995 test samples.
    N=1020, d=398, k=6.

    The labels are read from the first column of the txt files and shifted so that they start at 0.

    Parameters
    ----------
    subset : str
        can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all')
    return_X_y : bool
        If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
    downloads_path : str
        path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)

    Returns
    -------
    bunch : Bunch
        A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute.
        Alternatively, if return_X_y is True two arrays will be returned:
        the data numpy array (1020 x 398), the labels numpy array (1020)

    References
    ----------
    http://www.timeseriesclassification.com/description.php?Dataset=Symbols
    """
    return _load_timeseries_classification_data(dataset_name="Symbols", subset=subset, labels_minus_one=True,
                                                file_type="txt", last_column_are_labels=False,
                                                return_X_y=return_X_y, downloads_path=downloads_path)


def load_olive_oil(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch:
    """
    Load the OliveOil data set. It consists of 60 samples belonging to one of 4 classes.
    The data set is composed of 30 training and 30 test samples.
    N=60, d=570, k=4.

    The labels are read from the first column of the txt files and shifted so that they start at 0.

    Parameters
    ----------
    subset : str
        can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all')
    return_X_y : bool
        If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
    downloads_path : str
        path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)

    Returns
    -------
    bunch : Bunch
        A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute.
        Alternatively, if return_X_y is True two arrays will be returned:
        the data numpy array (60 x 570), the labels numpy array (60)

    References
    ----------
    http://www.timeseriesclassification.com/description.php?Dataset=OliveOil
    """
    return _load_timeseries_classification_data(dataset_name="OliveOil", subset=subset, labels_minus_one=True,
                                                file_type="txt", last_column_are_labels=False,
                                                return_X_y=return_X_y, downloads_path=downloads_path)


def load_plane(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch:
    """
    Load the plane data set. It consists of 210 samples belonging to one of 7 classes.
    The data set is composed of 105 training and 105 test samples.
    N=210, d=144, k=7.

    The labels are read from the first column of the txt files and shifted so that they start at 0.

    Parameters
    ----------
    subset : str
        can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all')
    return_X_y : bool
        If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
    downloads_path : str
        path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)

    Returns
    -------
    bunch : Bunch
        A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute.
        Alternatively, if return_X_y is True two arrays will be returned:
        the data numpy array (210 x 144), the labels numpy array (210)

    References
    ----------
    http://www.timeseriesclassification.com/description.php?Dataset=Plane
    """
    return _load_timeseries_classification_data(dataset_name="Plane", subset=subset, labels_minus_one=True,
                                                file_type="txt", last_column_are_labels=False,
                                                return_X_y=return_X_y, downloads_path=downloads_path)


def load_sony_aibo_robot_surface(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch:
    """
    Load the Sony AIBO Robot Surface 1 data set. It consists of 621 samples belonging to one of 2 classes.
    The data set is composed of 20 training and 601 test samples.
    N=621, d=70, k=2.

    The labels are read from the first column of the txt files and shifted so that they start at 0.

    Parameters
    ----------
    subset : str
        can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all')
    return_X_y : bool
        If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
    downloads_path : str
        path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)

    Returns
    -------
    bunch : Bunch
        A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute.
        Alternatively, if return_X_y is True two arrays will be returned:
        the data numpy array (621 x 70), the labels numpy array (621)

    References
    ----------
    http://www.timeseriesclassification.com/description.php?Dataset=SonyAIBORobotSurface1
    """
    return _load_timeseries_classification_data(dataset_name="SonyAIBORobotSurface1", subset=subset,
                                                labels_minus_one=True, file_type="txt",
                                                last_column_are_labels=False, return_X_y=return_X_y,
                                                downloads_path=downloads_path)


def load_two_patterns(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch:
    """
    Load the two patterns data set. It consists of 5000 samples belonging to one of 4 classes.
    The data set is composed of 1000 training and 4000 test samples.
    N=5000, d=128, k=4.

    The labels are read from the first column of the txt files and shifted so that they start at 0.

    Parameters
    ----------
    subset : str
        can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all')
    return_X_y : bool
        If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
    downloads_path : str
        path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)

    Returns
    -------
    bunch : Bunch
        A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute.
        Alternatively, if return_X_y is True two arrays will be returned:
        the data numpy array (5000 x 128), the labels numpy array (5000)

    References
    ----------
    http://www.timeseriesclassification.com/description.php?Dataset=TwoPatterns
    """
    return _load_timeseries_classification_data(dataset_name="TwoPatterns", subset=subset, labels_minus_one=True,
                                                file_type="txt", last_column_are_labels=False,
                                                return_X_y=return_X_y, downloads_path=downloads_path)


def load_lsst(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch:
    """
    Load the LSST data set. It consists of 4925 samples belonging to one of 14 classes.
    The data set is composed of 2459 training and 2466 test samples.
    N=4925, d=216, k=14.

    The data is parsed from the ts files with the labels in the last column. As the original labels are not
    consecutive, they are replaced by their rank among the distinct labels of the loaded subset (0, 1, 2, ...).

    Parameters
    ----------
    subset : str
        can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all')
    return_X_y : bool
        If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
    downloads_path : str
        path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)

    Returns
    -------
    bunch : Bunch
        A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute.
        Alternatively, if return_X_y is True two arrays will be returned:
        the data numpy array (4925 x 216), the labels numpy array (4925)

    References
    ----------
    http://www.timeseriesclassification.com/description.php?Dataset=LSST
    """
    # Always request the Bunch here; the return format is decided after the labels have been remapped
    dataset = _load_timeseries_classification_data(dataset_name="LSST", subset=subset, labels_minus_one=True,
                                                   file_type="ts", last_column_are_labels=True, return_X_y=False,
                                                   downloads_path=downloads_path)
    data = dataset.data
    original_labels = dataset.target
    # Current labels are: 5, 14, 15, 41, 51, 52, ... -> change to: 0, 1, 2, 3, 4, ...
    # The inverse indices of the sorted unique values are exactly the consecutive labels
    _, inverse = np.unique(original_labels, return_inverse=True)
    labels = inverse.reshape(original_labels.shape).astype(original_labels.dtype)
    if return_X_y:
        return data, labels
    else:
        return Bunch(dataset_name="LSST", data=data, target=labels)