try:
from clustpy.utils.dipModule import c_diptest # noqa - Import from C file (could be marked as unresolved)
except:
print("[WARNING] Could not import c_diptest in clustpy.utils.dipModule")
import numpy as np
import matplotlib.pyplot as plt
from clustpy.utils.plots import plot_histogram
from sklearn.utils import check_random_state
[docs]def dip_test(X: np.ndarray, just_dip: bool = True, is_data_sorted: bool = False, return_gcm_lcm_mn_mj: bool = False,
use_c: bool = True, debug: bool = False) -> (
float, tuple, tuple, np.ndarray, np.ndarray, np.ndarray, np.ndarray):
"""
Calculate the Dip-value. This can either be done using the C implementation or the python version.
In addition to the Dip-value additional values can be returned.
These are e.g. the modal interval (indices of the beginning and end of the steepest slop of the ECDF) and the modal interval (used to calculate the gradient of the Dip-value) if just_dip is False.
Further, the indices of the Greatest Convex Minorant (gcm), Least Concave Majorant (lcm), minorant and majorant values can be returned by setting return_gcm_lcm_mn_mj to True.
Note that modal_triangle can be (-1,-1,-1) if the triangle could not be determined correctly.
Parameters
----------
X : np.ndarray
the given univariate data set
just_dip : bool
Defines whether only the Dip-value should be returned or also the modal interval and modal triangle (default: True)
is_data_sorted : bool
Should be True if the data set is already sorted (default: False)
return_gcm_lcm_mn_mj : bool
Defines whether the gcm, lcm, mn and mj arrays should be returned. In this case just_dip must be False (default: False)
use_c : bool
Defines whether the C implementation should be used (defualt: True)
debug : bool
If true, additional information will be printed to the console (default: False)
Returns
-------
tuple: (float, tuple, tuple, np.ndarray, np.ndarray, np.ndarray, np.ndarray)
The resulting Dip-value,
The indices of the modal_interval - corresponds to the steepest slope in the ECDF (if just_dip is False),
The indices of the modal triangle (if just_dip is False),
The indices of points that are part of the Greatest Convex Minorant (gcm) (if just_dip is False and return_gcm_lcm_mn_mj is True),
The indices of points that are part of the Least Concave Majorant (lcm) (if just_dip is False and return_gcm_lcm_mn_mj is True),
The minorant values (if just_dip is False and return_gcm_lcm_mn_mj is True),
The majorant values (if just_dip is False and return_gcm_lcm_mn_mj is True)
References
----------
Hartigan, John A., and Pamela M. Hartigan.
"The dip test of unimodality." The annals of Statistics (1985): 70-84.
and
Hartigan, P. M. "Computation of the dip statistic to test for unimodality: Algorithm as 217."
Applied Statistics 34.3 (1985): 320-5.
"""
assert X.ndim == 1, "Data must be 1-dimensional for the dip-test. Your shape:{0}".format(X.shape)
assert just_dip or is_data_sorted == True, "Data must be sorted if modal interval and/or modal triangle should be returned (else indices will not match)"
assert not return_gcm_lcm_mn_mj or not just_dip, "If GCM, LCM, mn and mj should be returned, 'just_dip' must be False"
if not is_data_sorted:
X = np.sort(X)
# Obtain results
if X.shape[0] < 4 or X[0] == X[-1]:
dip_value = 0.0
modal_interval = (0, X.shape[0] - 1)
modal_triangle = (-1, -1, -1)
mn, mj = None, None
elif use_c:
try:
dip_value, modal_interval, modal_triangle, _, _, mn, mj = _dip_c_impl(X, debug)
except Exception as e:
print("[WARNING] C implementation can not be used for dip calculation.")
print(e)
dip_value, modal_interval, modal_triangle, _, _, mn, mj = _dip_python_impl(X, debug)
else:
dip_value, modal_interval, modal_triangle, _, _, mn, mj = _dip_python_impl(X, debug)
# Return results
if just_dip:
return dip_value
elif return_gcm_lcm_mn_mj:
if mn is not None and mj is not None:
gcm, lcm = _get_complete_gcm_lcm(mn, mj, modal_interval)
else:
gcm, lcm = None, None
return dip_value, modal_interval, modal_triangle, gcm, lcm, mn, mj
else:
return dip_value, modal_interval, modal_triangle
def _dip_c_impl(X: np.ndarray, debug: bool) -> (float, tuple, tuple, np.ndarray, np.ndarray, np.ndarray, np.ndarray):
"""
Calls the Dip C implementation by Martin Maechler.
Parameters
----------
X : np.ndarray
the given univariate data set
debug : bool
If true, additional information will be printed to the console
Returns
-------
tuple : (float, tuple, tuple, np.ndarray, np.ndarray, np.ndarray, np.ndarray)
The resulting Dip-value,
The indices of the modal_interval - corresponds to the steepest slope in the ECDF,
The indices of the modal triangle
The indices of points that are part of the Greatest Convex Minorant (gcm),
The indices of points that are part of the Least Concave Majorant (lcm),
The minorant values,
The majorant values
"""
# Create reference numpy arrays
modal_interval = np.zeros(2, dtype=np.int32)
modal_triangle = -np.ones(3, dtype=np.int32)
gcm = np.zeros(X.shape, dtype=np.int32)
lcm = np.zeros(X.shape, dtype=np.int32)
mj = np.zeros(X.shape, dtype=np.int32)
mn = np.zeros(X.shape, dtype=np.int32)
# Execute C function
dip_value = c_diptest(X.astype(np.float64), modal_interval, modal_triangle, gcm, lcm, mn, mj, X.shape[0],
1 if debug else 0)
return dip_value, (modal_interval[0], modal_interval[1]), (
modal_triangle[0], modal_triangle[1], modal_triangle[2]), gcm, lcm, mn, mj
def _dip_python_impl(X: np.ndarray, debug: bool) -> (
float, tuple, tuple, np.ndarray, np.ndarray, np.ndarray, np.ndarray):
"""
A python version of the Dip C implementation by Martin Maechler.
Parameters
----------
X : np.ndarray
the given univariate data set
debug : bool
If true, additional information will be printed to the console
Returns
-------
tuple : (float, tuple, tuple, np.ndarray, np.ndarray, np.ndarray, np.ndarray)
The resulting Dip-value,
The indices of the modal_interval - corresponds to the steepest slope in the ECDF,
The indices of the modal triangle
The indices of points that are part of the Greatest Convex Minorant (gcm),
The indices of points that are part of the Least Concave Majorant (lcm),
The minorant values,
The majorant values
"""
assert X.ndim == 1, "Data must be 1-dimensional for the dip-test. Your shape:{0}".format(X.shape)
N = len(X)
if N < 4 or X[0] == X[-1]:
d = 0.0
return d, (0, 0), (-1, -1, -1), None, None, None, None
low = 0
high = N - 1
dip_value = 0.0
# Create modal triangle
modaltriangle_i1 = -1
modaltriangle_i2 = -1
modaltriangle_i3 = -1
# Establish the indices mn[0..n-1] over which combination is necessary for the convex MINORANT (GCM) fit.
mn = np.zeros(N, dtype=np.int32)
mn[0] = 0
for j in range(1, N):
mn[j] = j - 1
while True:
mnj = mn[j]
mnmnj = mn[mnj]
if mnj == 0 or (X[j] - X[mnj]) * (mnj - mnmnj) < (X[mnj] - X[mnmnj]) * (j - mnj):
break
mn[j] = mnmnj
# Establish the indices mj[0..n-1] over which combination is necessary for the concave MAJORANT (LCM) fit.
mj = np.zeros(N, dtype=np.int32)
mj[N - 1] = N - 1
for k in range(N - 2, -1, -1):
mj[k] = k + 1
while True:
mjk = mj[k]
mjmjk = mj[mjk]
if mjk == N - 1 or (X[k] - X[mjk]) * (mjk - mjmjk) < (X[mjk] - X[mjmjk]) * (k - mjk):
break
mj[k] = mjmjk
gcm = np.zeros(N, dtype=int) # np.arange(N)
lcm = np.zeros(N, dtype=int) # np.arange(N, -1, -1)
while True:
# GCM
gcm[0] = high
i = 0
while gcm[i] > low:
gcm[i + 1] = mn[gcm[i]]
i += 1
ig = i
l_gcm = i
ix = ig - 1
# LCM
lcm[0] = low
i = 0
while lcm[i] < high:
lcm[i + 1] = mj[lcm[i]]
i += 1
ih = i
l_lcm = i
iv = 1
if debug:
print("'dip': LOOP-BEGIN: 2n*D= {0} [low,high] = [{1},{2}]:".format(dip_value, low, high))
print("gcm[0:{0}] = {1}".format(l_gcm, gcm[:l_gcm + 1]))
print("lcm[0:{0}] = {1}".format(l_lcm, lcm[:l_lcm + 1]))
d = 0.0
if l_gcm != 1 or l_lcm != 1:
if debug:
print(" while(gcm[ix] != lcm[iv])")
while True:
gcmix = gcm[ix]
lcmiv = lcm[iv]
if gcmix > lcmiv:
gcmil = gcm[ix + 1]
dx = (lcmiv - gcmil + 1) - (X[lcmiv] - X[gcmil]) * (gcmix - gcmil) / (X[gcmix] - X[gcmil])
iv += 1
if dx >= d:
d = dx
ig = ix + 1
ih = iv - 1
if debug:
print("L({0},{1})".format(ig, ih))
else:
lcmivl = lcm[iv - 1]
dx = (X[gcmix] - X[lcmivl]) * (lcmiv - lcmivl) / (X[lcmiv] - X[lcmivl]) - (gcmix - lcmivl - 1)
ix -= 1
if dx >= d:
d = dx
ig = ix + 1
ih = iv
if debug:
print("G({0},{1})".format(ig, ih))
if ix < 0:
ix = 0
if iv > l_lcm:
iv = l_lcm
if debug:
print(" --> ix = {0}, iv = {1}".format(ix, iv))
if gcm[ix] == lcm[iv]:
break
else:
d = 0.0
if debug:
print(" ** (l_lcm,l_gcm) = ({0},{1}) ==> d := {2}".format(l_lcm, l_gcm, d))
if d < dip_value:
break
if debug:
print(" calculating dip ..")
j_l = -1
j_u = -1
lcm_modalTriangle_i1 = -1
lcm_modalTriangle_i3 = -1
gcm_modalTriangle_i1 = -1
gcm_modalTriangle_i3 = -1
# The DIP for the convex minorant
dip_l = 0
for j in range(ig, l_gcm):
max_t = 1
j_ = -1
jb = gcm[j + 1]
je = gcm[j]
if je - jb > 1 and X[je] != X[jb]:
C = (je - jb) / (X[je] - X[jb])
for jj in range(jb, je + 1):
t = (jj - jb + 1) - (X[jj] - X[jb]) * C
if max_t < t:
max_t = t
j_ = jj
if dip_l < max_t:
dip_l = max_t
j_l = j_
gcm_modalTriangle_i1 = jb
gcm_modalTriangle_i3 = je
# The DIP for the concave majorant
dip_u = 0
for j in range(ih, l_lcm):
max_t = 1
j_ = -1
jb = lcm[j]
je = lcm[j + 1]
if je - jb > 1 and X[je] != X[jb]:
C = (je - jb) / (X[je] - X[jb])
for jj in range(jb, je + 1):
t = (X[jj] - X[jb]) * C - (jj - jb - 1)
if max_t < t:
max_t = t
j_ = jj
if dip_u < max_t:
dip_u = max_t
j_u = j_
lcm_modalTriangle_i1 = jb
lcm_modalTriangle_i3 = je
if debug:
print(" (dip_l, dip_u) = ({0}, {1})".format(dip_l, dip_u))
if dip_u > dip_l:
dip_new = dip_u
j_best = j_u
if debug:
print(" -> new larger dip {0} (j_best = {1}) gcm-centred triple ({2},{3},{4})".format(dip_new, j_best,
lcm_modalTriangle_i1,
j_best,
lcm_modalTriangle_i3))
else:
dip_new = dip_l
j_best = j_l
if debug:
print(" -> new larger dip {0} (j_best = {1}) lcm-centred triple ({2},{3},{4})".format(dip_new, j_best,
gcm_modalTriangle_i1,
j_best,
gcm_modalTriangle_i3))
if dip_value < dip_new:
dip_value = dip_new
if dip_u > dip_l:
modaltriangle_i1 = lcm_modalTriangle_i1
modaltriangle_i2 = j_best
modaltriangle_i3 = lcm_modalTriangle_i3
else:
modaltriangle_i1 = gcm_modalTriangle_i1
modaltriangle_i2 = j_best
modaltriangle_i3 = gcm_modalTriangle_i3
if low == gcm[ig] and high == lcm[ih]:
if debug:
print("No improvement in low = {0} nor high = {1} --> END".format(low, high))
break
low = gcm[ig]
high = lcm[ih]
dip_value /= (2 * N)
return dip_value, (low, high), (modaltriangle_i1, modaltriangle_i2, modaltriangle_i3), gcm, lcm, mn, mj
[docs]def dip_pval(dip_value: float, n_points: int, pval_strategy: str = "table", n_boots: int = 1000,
random_state: np.random.RandomState | int = None) -> float:
"""
Get the p-value of a corresponding Dip-value.
P-values depend on the input Dip-value and the sample size.
There are several strategies to calculate the p-value. These are:
'table' (most common), 'function' (available for all sample sizes) and 'bootstrap' (slow for large sample sizes)
Parameters
----------
dip_value : flaat
The Dip-value
n_points : int
The number of samples
pval_strategy : str
Specifies the strategy that should be used to calculate the p-value (default: 'table')
n_boots : int
Number of random data sets that should be created to calculate Dip-values. Only relevant if pval_strategy is 'bootstrap' (default: 1000)
random_state : np.random.RandomState | int
use a fixed random state to get a repeatable solution. Can also be of type int. Only relevant if pval_strategy is 'bootstrap' (default: None)
Returns
-------
pval : float
The resulting p-value
References
----------
Hartigan, John A., and Pamela M. Hartigan.
"The dip test of unimodality." The annals of Statistics (1985): 70-84.
and
Bauer, Lena, et al. "Extension of the Dip-test Repertoire - Efficient and Differentiable p-value Calculation for Clustering."
Proceedings of the 2023 SIAM International Conference on Data Mining (SDM). Society for Industrial and Applied Mathematics, 2023.
"""
assert type(pval_strategy) is str, "pval_stratgegy must be of type string"
pval_strategy = pval_strategy.lower()
assert pval_strategy in ["bootstrap", "table",
"function"], "pval_strategy must match 'bootstrap', 'table' or 'function'. " \
"Your input: {0}".format(pval_strategy)
if n_points < 4:
pval = 1.0
elif pval_strategy == "bootstrap":
boot_dips = dip_boot_samples(int(n_points), n_boots, random_state)
pval = np.mean(dip_value <= boot_dips)
elif pval_strategy == "table":
pval = _dip_pval_table(dip_value, n_points)
elif pval_strategy == "function":
pval = _dip_pval_function(dip_value, n_points)
else:
raise Exception(
"pval_strategy must match 'bootstrap', 'table' or 'function. Your input: {0}".format(pval_strategy))
return pval
[docs]def dip_boot_samples(n_points: int, n_boots: int = 1000,
random_state: np.random.RandomState | int = None) -> np.ndarray:
"""
Sample random data sets and calculate corresponding Dip-values.
E.g. used to determine p-values.
Parameters
----------
n_points : int
The number of samples
n_boots : int
Number of random data sets that should be created to calculate Dip-values (default: 1000)
random_state : np.random.RandomState | int
use a fixed random state to get a repeatable solution. Can also be of type int (default: None)
Returns
-------
boot_dips : np.ndarray
Array of Dip-values
"""
# random uniform vectors
random_state = check_random_state(random_state)
boot_samples = random_state.rand(n_boots, n_points)
boot_dips = np.array([dip_test(boot_s, just_dip=True, is_data_sorted=False) for boot_s in boot_samples])
return boot_dips
def _get_complete_gcm_lcm(mn: np.ndarray, mj: np.ndarray, modal_interval: tuple) -> (np.ndarray, np.ndarray):
"""
Complete the GCM and LCM returned by the Dip-test.
Adapted from: https://github.com/samhelmholtz/skinny-dip/blob/master/code/skinny-dip/RPackageDipTestCustom/diptest/R/dip.R
Parameters
----------
mn : np.ndarray
The minorant values
mj : np.ndarray
The majorant values
modal_interval : tuple
Indices of the modal interval - corresponds to the steepest slope in the ECDF (default: None)
Returns
-------
tuple : (np.ndarray, np.ndarray)
The adjusted indices of points that are part of the Greatest Convex Minorant,
The adjusted indices of points that are part of the Least Concave Majorant
"""
low = modal_interval[0]
high = modal_interval[1]
gcm = np.zeros(mn.shape[0], dtype=np.int32)
lcm = np.zeros(mj.shape[0], dtype=np.int32)
# Collect the gcm points from 0 to the upper end of the modal interval
i = 0
gcm[0] = high
while gcm[i] > 0:
gcm[i + 1] = mn[gcm[i]]
i += 1
gcm = gcm[:i + 1]
# Collect the lcm points from the lower end of the modal interval to the last point
i = 0
lcm[i] = low
while lcm[i] < mj.shape[0] - 1:
lcm[i + 1] = mj[lcm[i]]
i += 1
lcm = lcm[:i + 1]
return gcm, lcm
"""
Dip Gradient
"""
[docs]def dip_gradient(X: np.ndarray, X_proj: np.ndarray, sorted_indices: np.ndarray, modal_triangle: tuple) -> np.ndarray:
"""
Calculate the gradient of the Dip-value regarding the projection axis.
Parameters
----------
X : np.ndarray
the given data set
X_proj : np.ndarray
The univariate projected data set
sorted_indices : np.ndarray
The indices of the sorted univariate data set
modal_triangle : tuple
Indices of the modal triangle
Returns
-------
gradient : np.ndarray
The gradient of the Dip-value regarding the projection axis
References
----------
Krause, Andreas, and Volkmar Liebscher.
"Multimodal projection pursuit using the dip statistic." (2005).
"""
if modal_triangle is None or modal_triangle[0] == -1:
return np.zeros(X.shape[1])
data_index_i1 = sorted_indices[modal_triangle[0]]
data_index_i2 = sorted_indices[modal_triangle[1]]
data_index_i3 = sorted_indices[modal_triangle[2]]
# Get A and c
A = modal_triangle[0] - modal_triangle[1] + \
(modal_triangle[2] - modal_triangle[0]) * (X_proj[data_index_i2] - X_proj[data_index_i1]) / (
X_proj[data_index_i3] - X_proj[data_index_i1])
constant = (modal_triangle[2] - modal_triangle[0]) / (2 * X.shape[0])
# Check A
if A < 0:
constant = -constant
# Calculate gradient
quotient = (X_proj[data_index_i3] - X_proj[data_index_i1])
gradient = (X[data_index_i2] - X[data_index_i1]) / quotient - \
(X[data_index_i3] - X[data_index_i1]) * (
X_proj[data_index_i2] - X_proj[data_index_i1]) / quotient ** 2
gradient = gradient * constant
return gradient
[docs]def dip_pval_gradient(X: np.ndarray, X_proj: np.ndarray, sorted_indices: np.ndarray, modal_triangle: tuple,
dip_value: float) -> np.ndarray:
"""
Calculate the gradient of the Dip p-value function regarding the projection axis.
Parameters
----------
X : np.ndarray
the given data set
X_proj : np.ndarray
The univariate projected data set
sorted_indices : np.ndarray
The indices of the sorted univariate data set
modal_triangle : tuple
Indices of the modal triangle
dip_value : float
The Dip-value
Returns
-------
pval_grad : np.ndarray
The gradient of the Dip p-value function regarding the projection axis
References
----------
Bauer, Lena, et al. "Extension of the Dip-test Repertoire - Efficient and Differentiable p-value Calculation for Clustering."
Proceedings of the 2023 SIAM International Conference on Data Mining (SDM). Society for Industrial and Applied Mathematics, 2023.
"""
if modal_triangle is None or modal_triangle[0] == -1:
return np.zeros(X.shape[1])
# Calculate gradient of dip-value
dip_grad = dip_gradient(X, X_proj, sorted_indices, modal_triangle)
# Get factor for that gradient from p-value gradeint
b = _dip_pval_function_get_b(X.shape[0])
exponent = np.exp(-b * dip_value + 6.5)
quotient = (0.6 * (1 + 1.6 * exponent) ** (0.625) + 0.4 * (1 + 0.2 * exponent) ** 5) ** 2
grad_factor = (0.6 * (1 + 1.6 * exponent) ** (-0.375) + 0.4 * (1 + 0.2 * exponent) ** 4) / quotient
grad_factor *= exponent * -b
# Combine values
pval_grad = grad_factor * dip_grad
return pval_grad
"""
Plot
"""
[docs]def plot_dip(X: np.ndarray, is_data_sorted: bool = False, dip_value: float = None, modal_interval: tuple = None,
modal_triangle: tuple = None, gcm: np.ndarray = None, lcm: np.ndarray = None, linewidth_ecdf: float = 1,
linewidth_extra: float = 2, show_legend: bool = True, add_histogram: bool = True,
histogram_labels: np.ndarray = None, histogram_show_legend: bool = True, histogram_density: bool = True,
histogram_n_bins: int = 100, height_ratio: tuple = (1, 2), show_plot: bool = True) -> None:
"""
Plot a visual representation of the computational process of the Dip.
Upper part shows an optional histogram of the data and the lower part shows the corresponding ECDF.
Parameters
----------
X : np.ndarray
the given data set
is_data_sorted : bool
Should be True if the data set is already sorted (default: False)
dip_value : float
The Dip-value (default: None)
modal_interval : tuple
Indices of the modal interval - corresponds to the steepest slope in the ECDF (default: None)
modal_triangle : tuple
Indices of the modal triangle (default: None)
gcm : np.ndarray
The indices of points that are part of the Greatest Convex Minorant (gcm) (default: None)
lcm : np.ndarray
The indices of points that are part of the Least Concave Majorant (lcm) (default None)
linewidth_ecdf : flaot
The linewidth for the eCDF (default: 1)
linewidth_extra : float
The linewidth for the visualization of the dip, modal interval, modal triangle, gcm and lcm (default: 2)
show_legend : bool
Defines whether the legend of the ECDF plot should be added (default: True)
add_histogram : bool
Defines whether the histogram should be shown above the ECDF plot (default: True)
histogram_labels : np.ndarray
Labels used to color parts of the histogram (default: None)
histogram_show_legend : bool
Defines whether the legend of the histogram should be added (default: True)
histogram_density : bool
Defines whether a kernel density should be added to the histogram plot (default: True)
histogram_n_bins : int
Number of bins used for the histogram (default: 100)
height_ratio : tuple
Defines the height ratio between histogram and ECDF plot. Only relevant if add_histogram is True.
First value in the tuple defines the height of the histogram and the second value the height of the ECDF plot (default: (1, 2))
show_plot : bool
Defines whether the plot should directly be plotted (default: True)
"""
assert X.ndim == 1, "Data must be 1-dimensional for the dip-test. Your shape:{0}".format(X.shape)
N = len(X)
if not is_data_sorted:
argsorted_X = np.argsort(X)
X = X[argsorted_X]
if histogram_labels is not None:
histogram_labels = histogram_labels[argsorted_X]
if add_histogram:
# Add histogram at the top of the plot (uses plot_histogram from clustpy.utils.plots)
fig, (ax1, ax2) = plt.subplots(2, sharex=True, gridspec_kw={'height_ratios': height_ratio})
plot_histogram(X, histogram_labels, density=histogram_density, n_bins=histogram_n_bins,
show_legend=histogram_show_legend,
container=ax1, show_plot=False)
if modal_interval is not None:
y_axis_limit = ax1.get_ylim()
ax1.plot([X[modal_interval[0]], X[modal_interval[0]]], y_axis_limit, "g--", linewidth=linewidth_extra)
ax1.plot([X[modal_interval[1]], X[modal_interval[1]]], y_axis_limit, "g--", linewidth=linewidth_extra)
dip_container = ax2
# Remove spacing between the two plots
fig.subplots_adjust(hspace=0)
else:
dip_container = plt
# Plot ECDF
dip_container.plot(X, np.arange(N) / N, "b", label="eCDF", linewidth=linewidth_ecdf)
if dip_value is not None:
# Add Dip range around ECDF
dip_container.plot(X, np.arange(N) / N - dip_value * 2, "k:", alpha=0.7, label="2x dip",
linewidth=linewidth_extra)
dip_container.plot(X, np.arange(N) / N + dip_value * 2, "k:", alpha=0.7, linewidth=linewidth_extra)
if modal_interval is not None:
# Add modal interval in green
dip_container.plot([X[modal_interval[0]], X[modal_interval[1]]], [modal_interval[0] / N, modal_interval[1] / N],
linewidth=linewidth_extra, c="g", label="modal interval")
if modal_triangle is not None:
# Add modal triangle in red
dip_container.plot([X[modal_triangle[0]], X[modal_triangle[1]], X[modal_triangle[2]], X[modal_triangle[0]]],
[modal_triangle[0] / N, modal_triangle[1] / N, modal_triangle[2] / N, modal_triangle[0] / N],
linewidth=linewidth_extra, c="r", label="modal triangle")
# Add process of gcm and lcm curves
if gcm is not None:
dip_container.plot(X[gcm], gcm / N, "y--", linewidth=linewidth_extra, label="gcm")
if lcm is not None:
dip_container.plot(X[lcm], lcm / N, "c--", linewidth=linewidth_extra, label="lcm")
if show_legend:
dip_container.legend(loc="lower right")
if show_plot:
plt.show()
"""
Dip p-value methods
"""
def _dip_pval_table(dip_value: float, n_points: int) -> float:
"""
Get the p-value of a corresponding Dip-value using a lookup table.
Depends on the number of samples.
Source: https://github.com/alimuldal/diptest
Parameters
----------
dip_value : flaat
The Dip-value
n_points : int
The number of samples
Returns
-------
pval : float
The resulting p-value
References
----------
Hartigan, John A., and Pamela M. Hartigan.
"The dip test of unimodality." The annals of Statistics (1985): 70-84.
"""
N, SIG, CV = _get_dip_table_values()
if n_points > max(N):
print(
"[dip_pval_table] WARNING: The number of samples is too large for pval_strategy 'table' (max n_points is 72000), 'function' will be used instead.")
return _dip_pval_function(dip_value, n_points)
i1 = N.searchsorted(n_points, side='left')
i0 = i1 - 1
# if n falls outside the range of tabulated sample sizes, use the
# critical values for the nearest tabulated n (i.e. treat them as
# 'asymptotic')
i0 = max(0, i0)
i1 = min(N.shape[0] - 1, i1)
# interpolate on sqrt(n)
n0, n1 = N[[i0, i1]]
if i0 != i1:
fn = float(n_points - n0) / (n1 - n0)
else:
fn = 0
y0 = np.sqrt(n0) * CV[i0]
y1 = np.sqrt(n1) * CV[i1]
sD = np.sqrt(n_points) * dip_value
pval = 1. - np.interp(sD, y0 + fn * (y1 - y0), SIG)
return pval
def _dip_pval_function(dip_value: float, n_points: int) -> float:
"""
Get the p-value of a corresponding Dip-value using the sigmoid function as described by Bauer et al..
Depends on the number of samples.
Parameters
----------
dip_value : flaat
The Dip-value
n_points : int
The number of samples
Returns
-------
pval : float
The resulting p-value
References
----------
Bauer, Lena, et al. "Extension of the Dip-test Repertoire - Efficient and Differentiable p-value Calculation for Clustering."
Proceedings of the 2023 SIAM International Conference on Data Mining (SDM). Society for Industrial and Applied Mathematics, 2023.
"""
b = _dip_pval_function_get_b(n_points)
exponent = np.exp(-b * dip_value + 6.5)
pval = 1 - 1 / (0.6 * (1 + 1.6 * exponent) ** (0.625) + 0.4 * (1 + 0.2 * exponent) ** 5)
return pval
def _dip_pval_function_get_b(n_points: int) -> float:
"""
Helper function for the Dip-p-value calculation using the sigmoid function by Bauer et al..
Parameters
----------
n_points : int
The number of samples
Returns
-------
b : float
The b value
References
----------
Bauer, Lena, et al. "Extension of the Dip-test Repertoire - Efficient and Differentiable p-value Calculation for Clustering."
Proceedings of the 2023 SIAM International Conference on Data Mining (SDM). Society for Industrial and Applied Mathematics, 2023.
"""
b = 17.30784022 * np.sqrt(n_points) + 12.04917889
return b
def _get_dip_table_values() -> (np.ndarray, np.ndarray, np.ndarray):
"""
Get the values of the Dip-p-value lookup table.
Returns
-------
tuple : (np.ndarray, np.ndarray, np.ndarray)
The sample sizes (21 values),
The probabilities (26 values),
The Dip-values (21 x 26 values)
"""
N = np.array([4, 5, 6, 7, 8, 9, 10, 15, 20,
30, 50, 100, 200, 500, 1000, 2000, 5000, 10000,
20000, 40000, 72000])
SIG = np.array([0., 0.01, 0.02, 0.05, 0.1, 0.2,
0.3, 0.4, 0.5, 0.6, 0.7, 0.8,
0.9, 0.95, 0.98, 0.99, 0.995, 0.998,
0.999, 0.9995, 0.9998, 0.9999, 0.99995, 0.99998,
0.99999, 1.])
# [len(N), len(SIG)] table of critical values
CV = np.array([[0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.132559548782689,
0.157497369040235, 0.187401878807559, 0.20726978858736, 0.223755804629222, 0.231796258864192,
0.237263743826779, 0.241992892688593, 0.244369839049632, 0.245966625504691, 0.247439597233262,
0.248230659656638, 0.248754269146416, 0.249302039974259, 0.249459652323225, 0.24974836247845],
[0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.108720593576329, 0.121563798026414, 0.134318918697053,
0.147298798976252, 0.161085025702604, 0.176811998476076, 0.186391796027944, 0.19361253363045,
0.196509139798845, 0.198159967287576, 0.199244279362433, 0.199617527406166, 0.199800941499028,
0.199917081834271, 0.199959029093075, 0.199978395376082, 0.199993151405815, 0.199995525025673,
0.199999835639211],
[0.0833333333333333, 0.0833333333333333, 0.0833333333333333, 0.0833333333333333,
0.0833333333333333, 0.0924514470941933, 0.103913431059949, 0.113885220640212, 0.123071187137781,
0.13186973390253, 0.140564796497941, 0.14941924112913, 0.159137064572627, 0.164769608513302,
0.179176547392782, 0.191862827995563, 0.202101971042968, 0.213015781111186, 0.219518627282415,
0.224339047394446, 0.229449332154241, 0.232714530449602, 0.236548128358969, 0.2390887911995,
0.240103566436295, 0.244672883617768],
[0.0714285714285714, 0.0714285714285714, 0.0714285714285714, 0.0725717816250742,
0.0817315478539489, 0.09405901819225269, 0.103244490800871, 0.110964599995697,
0.117807846504335, 0.124216086833531, 0.130409013968317, 0.136639642123068, 0.144240669035124,
0.159903395678336, 0.175196553271223, 0.184118659121501, 0.191014396174306, 0.198216795232182,
0.202341010748261, 0.205377566346832, 0.208306562526874, 0.209866047852379, 0.210967576933451,
0.212233348558702, 0.212661038312506, 0.21353618608817],
[0.0625, 0.0625, 0.06569119945032829, 0.07386511360717619, 0.0820045917762512,
0.0922700601131892, 0.09967371895993631, 0.105733531802737, 0.111035129847705,
0.115920055749988, 0.120561479262465, 0.125558759034845, 0.141841067033899, 0.153978303998561,
0.16597856724751, 0.172988528276759, 0.179010413496374, 0.186504388711178, 0.19448404115794,
0.200864297005026, 0.208849997050229, 0.212556040406219, 0.217149174137299, 0.221700076404503,
0.225000835357532, 0.233772919687683],
[0.0555555555555556, 0.0613018090298924, 0.0658615858179315, 0.0732651142535317,
0.0803941629593475, 0.0890432420913848, 0.0950811420297928, 0.09993808978110461,
0.104153560075868, 0.108007802361932, 0.112512617124951, 0.122915033480817, 0.136412639387084,
0.146603784954019, 0.157084065653166, 0.164164643657217, 0.172821674582338, 0.182555283567818,
0.188658833121906, 0.194089120768246, 0.19915700809389, 0.202881598436558, 0.205979795735129,
0.21054115498898, 0.21180033095039, 0.215379914317625],
[0.05, 0.0610132555623269, 0.0651627333214016, 0.0718321619656165, 0.077966212182459,
0.08528353598345639, 0.09032041737070989, 0.0943334983745117, 0.0977817630384725,
0.102180866696628, 0.109960948142951, 0.118844767211587, 0.130462149644819, 0.139611395137099,
0.150961728882481, 0.159684158858235, 0.16719524735674, 0.175419540856082, 0.180611195797351,
0.185286416050396, 0.191203083905044, 0.195805159339184, 0.20029398089673, 0.205651089646219,
0.209682048785853, 0.221530282182963],
[0.0341378172277919, 0.0546284208048975, 0.0572191260231815, 0.0610087367689692,
0.06426571373304441, 0.06922341079895911, 0.0745462114365167, 0.07920308789817621,
0.083621033469191, 0.08811984822029049, 0.093124666680253, 0.0996694393390689,
0.110087496900906, 0.118760769203664, 0.128890475210055, 0.13598356863636, 0.142452483681277,
0.150172816530742, 0.155456133696328, 0.160896499106958, 0.166979407946248, 0.17111793515551,
0.175900505704432, 0.181856676013166, 0.185743454151004, 0.192240563330562],
[0.033718563622065, 0.0474333740698401, 0.0490891387627092, 0.052719998201553,
0.0567795509056742, 0.0620134674468181, 0.06601638720690479, 0.06965060750664009,
0.07334377405927139, 0.07764606628802539, 0.0824558407118372, 0.08834462700173699,
0.09723460181229029, 0.105130218270636, 0.114309704281253, 0.120624043335821, 0.126552378036739,
0.13360135382395, 0.138569903791767, 0.14336916123968, 0.148940116394883, 0.152832538183622,
0.156010163618971, 0.161319225839345, 0.165568255916749, 0.175834459522789],
[0.0262674485075642, 0.0395871890405749, 0.0414574606741673, 0.0444462614069956,
0.0473998525042686, 0.0516677370374349, 0.0551037519001622, 0.058265005347493,
0.0614510857304343, 0.0649164408053978, 0.0689178762425442, 0.0739249074078291,
0.08147913793901269, 0.0881689143126666, 0.0960564383013644, 0.101478558893837,
0.10650487144103, 0.112724636524262, 0.117164140184417, 0.121425859908987, 0.126733051889401,
0.131198578897542, 0.133691739483444, 0.137831637950694, 0.141557509624351, 0.163833046059817],
[0.0218544781364545, 0.0314400501999916, 0.0329008160470834, 0.0353023819040016,
0.0377279973102482, 0.0410699984399582, 0.0437704598622665, 0.0462925642671299,
0.048851155289608, 0.0516145897865757, 0.0548121932066019, 0.0588230482851366,
0.06491363240467669, 0.0702737877191269, 0.07670958860791791, 0.0811998415355918,
0.0852854646662134, 0.09048478274902939, 0.0940930106666244, 0.0974904344916743,
0.102284204283997, 0.104680624334611, 0.107496694235039, 0.11140887547015, 0.113536607717411,
0.117886716865312],
[0.0164852597438403, 0.022831985803043, 0.0238917486442849, 0.0256559537977579,
0.0273987414570948, 0.0298109370830153, 0.0317771496530253, 0.0336073821590387,
0.0354621760592113, 0.0374805844550272, 0.0398046179116599, 0.0427283846799166,
0.047152783315718, 0.0511279442868827, 0.0558022052195208, 0.059024132304226,
0.0620425065165146, 0.06580160114660991, 0.0684479731118028, 0.0709169443994193,
0.0741183486081263, 0.0762579402903838, 0.0785735967934979, 0.08134583568891331,
0.0832963013755522, 0.09267804230967371],
[0.0111236388849688, 0.0165017735429825, 0.0172594157992489, 0.0185259426032926,
0.0197917612637521, 0.0215233745778454, 0.0229259769870428, 0.024243848341112,
0.025584358256487, 0.0270252129816288, 0.0286920262150517, 0.0308006766341406,
0.0339967814293504, 0.0368418413878307, 0.0402729850316397, 0.0426864799777448,
0.044958959158761, 0.0477643873749449, 0.0497198001867437, 0.0516114611801451,
0.0540543978864652, 0.0558704526182638, 0.0573877056330228, 0.0593365901653878,
0.0607646310473911, 0.0705309107882395],
[0.00755488597576196, 0.0106403461127515, 0.0111255573208294, 0.0119353655328931,
0.0127411306411808, 0.0138524542751814, 0.0147536004288476, 0.0155963185751048,
0.0164519238025286, 0.017383057902553, 0.0184503949887735, 0.0198162679782071,
0.0218781313182203, 0.0237294742633411, 0.025919578977657, 0.0274518022761997,
0.0288986369564301, 0.0306813505050163, 0.0320170996823189, 0.0332452747332959,
0.0348335698576168, 0.0359832389317461, 0.0369051995840645, 0.0387221159256424,
0.03993025905765, 0.0431448163617178],
[0.00541658127872122, 0.00760286745300187, 0.007949878346447991, 0.008521651834359399,
0.00909775605533253, 0.009889245210140779, 0.0105309297090482, 0.0111322726797384,
0.0117439009052552, 0.012405033293814, 0.0131684179320803, 0.0141377942603047,
0.0156148055023058, 0.0169343970067564, 0.018513067368104, 0.0196080260483234,
0.0206489568587364, 0.0219285176765082, 0.0228689168972669, 0.023738710122235,
0.0248334158891432, 0.0256126573433596, 0.0265491336936829, 0.027578430100536, 0.0284430733108,
0.0313640941982108],
[0.00390439997450557, 0.00541664181796583, 0.00566171386252323, 0.00607120971135229,
0.0064762535755248, 0.00703573098590029, 0.00749421254589299, 0.007920878896017331,
0.008355737247680061, 0.00882439333812351, 0.00936785820717061, 0.01005604603884,
0.0111019116837591, 0.0120380990328341, 0.0131721010552576, 0.0139655122281969,
0.0146889122204488, 0.0156076779647454, 0.0162685615996248, 0.0168874937789415,
0.0176505093388153, 0.0181944265400504, 0.0186226037818523, 0.0193001796565433,
0.0196241518040617, 0.0213081254074584],
[0.00245657785440433, 0.00344809282233326, 0.00360473943713036, 0.00386326548010849,
0.00412089506752692, 0.00447640050137479, 0.00476555693102276, 0.00503704029750072,
0.00531239247408213, 0.00560929919359959, 0.00595352728377949, 0.00639092280563517,
0.00705566126234625, 0.0076506368153935, 0.00836821687047215, 0.008863578928549141,
0.00934162787186159, 0.009932186363240289, 0.0103498795291629, 0.0107780907076862,
0.0113184316868283, 0.0117329446468571, 0.0119995948968375, 0.0124410052027886,
0.0129467396733128, 0.014396063834027],
[0.00174954269199566, 0.00244595133885302, 0.00255710802275612, 0.00273990955227265,
0.0029225480567908, 0.00317374638422465, 0.00338072258533527, 0.00357243876535982,
0.00376734715752209, 0.00397885007249132, 0.00422430013176233, 0.00453437508148542,
0.00500178808402368, 0.00542372242836395, 0.00592656681022859, 0.00628034732880374,
0.00661030641550873, 0.00702254699967648, 0.00731822628156458, 0.0076065423418208,
0.00795640367207482, 0.008227052458435399, 0.00852240989786251, 0.00892863905540303,
0.009138539330002131, 0.009522345795667729],
[0.00119458814106091, 0.00173435346896287, 0.00181194434584681, 0.00194259470485893,
0.00207173719623868, 0.00224993202086955, 0.00239520831473419, 0.00253036792824665,
0.00266863168718114, 0.0028181999035216, 0.00299137548142077, 0.00321024899920135,
0.00354362220314155, 0.00384330190244679, 0.00420258799378253, 0.00445774902155711,
0.00469461513212743, 0.00499416069129168, 0.00520917757743218, 0.00540396235924372,
0.00564540201704594, 0.00580460792299214, 0.00599774739593151, 0.00633099254378114,
0.00656987109386762, 0.00685829448046227],
[0.000852415648011777, 0.00122883479310665, 0.00128469304457018, 0.00137617650525553,
0.00146751502006323, 0.00159376453672466, 0.00169668445506151, 0.00179253418337906,
0.00189061261635977, 0.00199645471886179, 0.00211929748381704, 0.00227457698703581,
0.00250999080890397, 0.00272375073486223, 0.00298072958568387, 0.00315942194040388,
0.0033273652798148, 0.00353988965698579, 0.00369400045486625, 0.00383345715372182,
0.00400793469634696, 0.00414892737222885, 0.0042839159079761, 0.00441870104432879,
0.00450818604569179, 0.00513477467565583],
[0.000644400053256997, 0.000916872204484283, 0.000957932946765532, 0.00102641863872347,
0.00109495154218002, 0.00118904090369415, 0.00126575197699874, 0.00133750966361506,
0.00141049709228472, 0.00148936709298802, 0.00158027541945626, 0.00169651643860074,
0.00187306184725826, 0.00203178401610555, 0.00222356097506054, 0.00235782814777627,
0.00248343580127067, 0.00264210826339498, 0.0027524322157581, 0.0028608570740143,
0.00298695044508003, 0.00309340092038059, 0.00319932767198801, 0.00332688234611187,
0.00339316094477355, 0.00376331697005859]])
return N, SIG, CV