"""
@authors:
Collin Leiber
"""
from sklearn.cluster import KMeans
import numpy as np
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.utils import check_random_state
from clustpy.partition.xmeans import _initial_kmeans_clusters, _execute_two_means
from scipy.stats import anderson
def _gmeans(X: np.ndarray, significance: float, n_clusters_init: int, max_n_clusters: int, n_split_trials: int,
random_state: np.random.RandomState) -> (int, np.ndarray, np.ndarray):
"""
Start the actual GMeans clustering procedure on the input data set.
Parameters
----------
X : np.ndarray
the given data set
significance : float
Threshold to decide if the result of the Anderson Darling Test indicates a Gaussian distribution
n_clusters_init : int
The initial number of clusters. Can also by of type np.ndarray if initial cluster centers are specified
max_n_clusters : int
Maximum number of clusters. Must be larger than n_clusters_init
n_split_trials : int
Number tries to split a cluster. For each try 2-KMeans is executed with different cluster centers
random_state : np.random.RandomState
use a fixed random state to get a repeatable solution
Returns
-------
tuple : (int, np.ndarray, np.ndarray)
The final number of clusters,
The labels as identified by GMeans,
The cluster centers as identified by GMeans
"""
assert max_n_clusters >= n_clusters_init, "max_n_clusters can not be smaller than n_clusters_init"
assert significance >= 0 and significance <= 1, "significance must be a value in the range [0, 1]"
# Initialize parameters
n_clusters, labels, centers, _ = _initial_kmeans_clusters(X, n_clusters_init, random_state)
while n_clusters < max_n_clusters:
n_clusters_old = n_clusters
for c in range(n_clusters_old):
ids_in_cluster = np.where(labels == c)[0]
if ids_in_cluster.shape[0] < 2:
continue
# Split cluster into two
labels_split, centers_split, _ = _execute_two_means(X[ids_in_cluster], [np.arange(ids_in_cluster.shape[0])], 0,
np.array([centers[c]]), n_split_trials, random_state)
# Project data form cluster onto resulting connection axis
projected_data = np.dot(X[ids_in_cluster], centers_split[0] - centers_split[1])
# Use Anderson Darling to test if data is Gaussian
ad_result = anderson(projected_data, "norm")
p_value = _anderson_darling_statistic_to_prob(ad_result.statistic, len(ids_in_cluster))
if p_value < significance:
# If data is not Gaussian, keep the newly created cluster centers
centers[c] = centers_split[0]
centers = np.r_[centers, [centers_split[1]]]
labels[ids_in_cluster[labels_split == 1]] = n_clusters
n_clusters += 1
# If maximum number of clusters is reached, stop iterating over the current clusters
if n_clusters == max_n_clusters:
break
# If no cluster changed, GMeans terminates
if n_clusters == n_clusters_old:
break
else:
# Prepare the cluster for the next iteration
kmeans = KMeans(n_clusters=n_clusters, init=centers, n_init=1, random_state=random_state)
kmeans.fit(X)
centers = kmeans.cluster_centers_
labels = kmeans.labels_
return n_clusters, labels, centers
def _anderson_darling_statistic_to_prob(statistic: float, n_points: int) -> float:
"""
Transform the statistic returned by the Anderson Darling test into a p_value.
First the adjusted statistic will be calculated.
Afterwards, the actual p-value can be obtained.
Parameters
----------
statistic : float
The original statistic from the Anderson Darling test.
n_points : int
The number of samples
Returns
-------
p_value : float
The p-value
References
----------
D'Agostino, Ralph B., and Michael A. Stephens. "Goodness-of-fit techniques."
Statistics: Textbooks and Monographs (1986).
"""
adjusted_stat = statistic * (1 + (.75 / n_points) + 2.25 / (n_points ** 2))
if adjusted_stat < 0.2:
# is log q => therefore add 1 - ...
p_value = 1 - np.exp(-13.436 + 101.14 * adjusted_stat - 223.73 * (adjusted_stat ** 2))
elif adjusted_stat < 0.34:
# is log q => therefore add 1 - ...
p_value = 1 - np.exp(-8.318 + 42.796 * adjusted_stat - 59.938 * (adjusted_stat ** 2))
elif adjusted_stat < 0.6:
p_value = np.exp(0.9177 - 4.279 * adjusted_stat - 1.38 * (adjusted_stat ** 2))
else:
p_value = np.exp(1.2937 - 5.709 * adjusted_stat - 0.0186 * (adjusted_stat ** 2))
return p_value
[docs]class GMeans(BaseEstimator, ClusterMixin):
"""
Execute the GMeans clustering procedure.
Determines the number of clusters by repeatedly trying to split a cluster into two clusters.
Therefore, the data is projected onto the axis connecting the two resulting centers.
If the Anderson Darling test does not assume a Gaussian distribution for this projection, the new clusters are retained.
Otherwise, the cluster remains as it was originally. This is repeated until no cluster changes.
Parameters
----------
significance : float
Threshold to decide if the result of the Anderson Darling Test indicates a Gaussian distribution (default: 0.001)
n_clusters_init : int
The initial number of clusters. Can also by of type np.ndarray if initial cluster centers are specified (default: 1)
max_n_clusters : int
Maximum number of clusters. Must be larger than n_clusters_init (default: np.inf)
n_split_trials : int
Number tries to split a cluster. For each try 2-KMeans is executed with different cluster centers (default: 10)
random_state : np.random.RandomState | int
use a fixed random state to get a repeatable solution. Can also be of type int (default: None)
Attributes
----------
n_clusters_ : int
The final number of clusters
labels_ : np.ndarray
The final labels
cluster_centers_ : np.ndarray
The final cluster centers
References
----------
Hamerly, Greg, and Charles Elkan. "Learning the k in k-means."
Advances in neural information processing systems. 2004.
and
D'Agostino, Ralph B., and Michael A. Stephens. "Goodness-of-fit techniques."
Statistics: Textbooks and Monographs (1986).
"""
def __init__(self, significance: float = 0.0001, n_clusters_init: int = 1, max_n_clusters: int = np.inf,
n_split_trials: int = 10, random_state: np.random.RandomState | int = None):
self.significance = significance
self.n_clusters_init = n_clusters_init
self.max_n_clusters = max_n_clusters
self.n_split_trials = n_split_trials
self.random_state = check_random_state(random_state)
[docs] def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'GMeans':
"""
Initiate the actual clustering process on the input data set.
The resulting cluster labels will be stored in the labels_ attribute.
Parameters
----------
X : np.ndarray
the given data set
y : np.ndarray
the labels (can be ignored)
Returns
-------
self : GMeans
this instance of the GMeans algorithm
"""
n_clusters, labels, centers = _gmeans(X, self.significance, self.n_clusters_init, self.max_n_clusters,
self.n_split_trials, self.random_state)
self.n_clusters_ = n_clusters
self.labels_ = labels
self.cluster_centers_ = centers
return self