Source code for clustpy.utils.diptest

try:
    from clustpy.utils.dipModule import c_diptest  # noqa - Import from C file (could be marked as unresolved)
except:
    print("[WARNING] Could not import c_diptest in clustpy.utils.dipModule")
import numpy as np
import matplotlib.pyplot as plt
from clustpy.utils.plots import plot_histogram
from sklearn.utils import check_random_state


[docs]def dip_test(X: np.ndarray, just_dip: bool = True, is_data_sorted: bool = False, return_gcm_lcm_mn_mj: bool = False,
             use_c: bool = True, debug: bool = False) -> (
        float, tuple, tuple, np.ndarray, np.ndarray, np.ndarray, np.ndarray):
    """
    Calculate the Dip-value. This can either be done using the C implementation or the python version.
    In addition to the Dip-value additional values can be returned.
    These are e.g. the modal interval (indices of the beginning and end of the steepest slop of the ECDF) and the modal interval (used to calculate the gradient of the Dip-value) if just_dip is False.
    Further, the indices of the Greatest Convex Minorant (gcm), Least Concave Majorant (lcm), minorant and majorant values can be returned by setting return_gcm_lcm_mn_mj to True.
    Note that modal_triangle can be (-1,-1,-1) if the triangle could not be determined correctly.

    Parameters
    ----------
    X : np.ndarray
        the given univariate data set
    just_dip : bool
        Defines whether only the Dip-value should be returned or also the modal interval and modal triangle (default: True)
    is_data_sorted : bool
        Should be True if the data set is already sorted (default: False)
    return_gcm_lcm_mn_mj : bool
        Defines whether the gcm, lcm, mn and mj arrays should be returned. In this case just_dip must be False (default: False)
    use_c : bool
        Defines whether the C implementation should be used (defualt: True)
    debug : bool
        If true, additional information will be printed to the console (default: False)

    Returns
    -------
    tuple: (float, tuple, tuple, np.ndarray, np.ndarray, np.ndarray, np.ndarray)
        The resulting Dip-value,
        The indices of the modal_interval - corresponds to the steepest slope in the ECDF (if just_dip is False),
        The indices of the modal triangle (if just_dip is False),
        The indices of points that are part of the Greatest Convex Minorant (gcm) (if just_dip is False and return_gcm_lcm_mn_mj is True),
        The indices of points that are part of the Least Concave Majorant (lcm) (if just_dip is False and return_gcm_lcm_mn_mj is True),
        The minorant values (if just_dip is False and return_gcm_lcm_mn_mj is True),
        The majorant values (if just_dip is False and return_gcm_lcm_mn_mj is True)

    References
    ----------
    Hartigan, John A., and Pamela M. Hartigan.
    "The dip test of unimodality." The annals of Statistics (1985): 70-84.

    and

    Hartigan, P. M. "Computation of the dip statistic to test for unimodality: Algorithm as 217."
    Applied Statistics 34.3 (1985): 320-5.
    """
    assert X.ndim == 1, "Data must be 1-dimensional for the dip-test. Your shape:{0}".format(X.shape)
    assert just_dip or is_data_sorted == True, "Data must be sorted if modal interval and/or modal triangle should be returned (else indices will not match)"
    assert not return_gcm_lcm_mn_mj or not just_dip, "If GCM, LCM, mn and mj should be returned, 'just_dip' must be False"
    if not is_data_sorted:
        X = np.sort(X)
    # Obtain results
    if X.shape[0] < 4 or X[0] == X[-1]:
        dip_value = 0.0
        modal_interval = (0, X.shape[0] - 1)
        modal_triangle = (-1, -1, -1)
        mn, mj = None, None
    elif use_c:
        try:
            dip_value, modal_interval, modal_triangle, _, _, mn, mj = _dip_c_impl(X, debug)
        except Exception as e:
            print("[WARNING] C implementation can not be used for dip calculation.")
            print(e)
            dip_value, modal_interval, modal_triangle, _, _, mn, mj = _dip_python_impl(X, debug)
    else:
        dip_value, modal_interval, modal_triangle, _, _, mn, mj = _dip_python_impl(X, debug)
    # Return results
    if just_dip:
        return dip_value
    elif return_gcm_lcm_mn_mj:
        if mn is not None and mj is not None:
            gcm, lcm = _get_complete_gcm_lcm(mn, mj, modal_interval)
        else:
            gcm, lcm = None, None
        return dip_value, modal_interval, modal_triangle, gcm, lcm, mn, mj
    else:
        return dip_value, modal_interval, modal_triangle


def _dip_c_impl(X: np.ndarray, debug: bool) -> (float, tuple, tuple, np.ndarray, np.ndarray, np.ndarray, np.ndarray):
    """
    Calls the Dip C implementation by Martin Maechler.

    Parameters
    ----------
    X : np.ndarray
        the given univariate data set
    debug : bool
        If true, additional information will be printed to the console

    Returns
    -------
    tuple : (float, tuple, tuple, np.ndarray, np.ndarray, np.ndarray, np.ndarray)
        The resulting Dip-value,
        The indices of the modal_interval - corresponds to the steepest slope in the ECDF,
        The indices of the modal triangle
        The indices of points that are part of the Greatest Convex Minorant (gcm),
        The indices of points that are part of the Least Concave Majorant (lcm),
        The minorant values,
        The majorant values
    """
    # Create reference numpy arrays
    modal_interval = np.zeros(2, dtype=np.int32)
    modal_triangle = -np.ones(3, dtype=np.int32)
    gcm = np.zeros(X.shape, dtype=np.int32)
    lcm = np.zeros(X.shape, dtype=np.int32)
    mj = np.zeros(X.shape, dtype=np.int32)
    mn = np.zeros(X.shape, dtype=np.int32)
    # Execute C function
    dip_value = c_diptest(X.astype(np.float64), modal_interval, modal_triangle, gcm, lcm, mn, mj, X.shape[0],
                          1 if debug else 0)
    return dip_value, (modal_interval[0], modal_interval[1]), (
        modal_triangle[0], modal_triangle[1], modal_triangle[2]), gcm, lcm, mn, mj


def _dip_python_impl(X: np.ndarray, debug: bool) -> (
        float, tuple, tuple, np.ndarray, np.ndarray, np.ndarray, np.ndarray):
    """
    A python version of the Dip C implementation by Martin Maechler.

    Parameters
    ----------
    X : np.ndarray
        the given univariate data set
    debug : bool
        If true, additional information will be printed to the console

    Returns
    -------
    tuple : (float, tuple, tuple, np.ndarray, np.ndarray, np.ndarray, np.ndarray)
        The resulting Dip-value,
        The indices of the modal_interval - corresponds to the steepest slope in the ECDF,
        The indices of the modal triangle
        The indices of points that are part of the Greatest Convex Minorant (gcm),
        The indices of points that are part of the Least Concave Majorant (lcm),
        The minorant values,
        The majorant values
    """
    assert X.ndim == 1, "Data must be 1-dimensional for the dip-test. Your shape:{0}".format(X.shape)
    N = len(X)
    if N < 4 or X[0] == X[-1]:
        d = 0.0
        return d, (0, 0), (-1, -1, -1), None, None, None, None
    low = 0
    high = N - 1
    dip_value = 0.0
    # Create modal triangle
    modaltriangle_i1 = -1
    modaltriangle_i2 = -1
    modaltriangle_i3 = -1
    # Establish the indices mn[0..n-1] over which combination is necessary for the convex MINORANT (GCM) fit.
    mn = np.zeros(N, dtype=np.int32)
    mn[0] = 0
    for j in range(1, N):
        mn[j] = j - 1
        while True:
            mnj = mn[j]
            mnmnj = mn[mnj]
            if mnj == 0 or (X[j] - X[mnj]) * (mnj - mnmnj) < (X[mnj] - X[mnmnj]) * (j - mnj):
                break
            mn[j] = mnmnj
    # Establish the indices   mj[0..n-1]  over which combination is necessary for the concave MAJORANT (LCM) fit.
    mj = np.zeros(N, dtype=np.int32)
    mj[N - 1] = N - 1
    for k in range(N - 2, -1, -1):
        mj[k] = k + 1
        while True:
            mjk = mj[k]
            mjmjk = mj[mjk]
            if mjk == N - 1 or (X[k] - X[mjk]) * (mjk - mjmjk) < (X[mjk] - X[mjmjk]) * (k - mjk):
                break
            mj[k] = mjmjk
    gcm = np.zeros(N, dtype=int)  # np.arange(N)
    lcm = np.zeros(N, dtype=int)  # np.arange(N, -1, -1)
    while True:
        # GCM
        gcm[0] = high
        i = 0
        while gcm[i] > low:
            gcm[i + 1] = mn[gcm[i]]
            i += 1
        ig = i
        l_gcm = i
        ix = ig - 1
        # LCM
        lcm[0] = low
        i = 0
        while lcm[i] < high:
            lcm[i + 1] = mj[lcm[i]]
            i += 1
        ih = i
        l_lcm = i
        iv = 1
        if debug:
            print("'dip': LOOP-BEGIN: 2n*D= {0}  [low,high] = [{1},{2}]:".format(dip_value, low, high))
            print("gcm[0:{0}] = {1}".format(l_gcm, gcm[:l_gcm + 1]))
            print("lcm[0:{0}] = {1}".format(l_lcm, lcm[:l_lcm + 1]))
        d = 0.0
        if l_gcm != 1 or l_lcm != 1:
            if debug:
                print("  while(gcm[ix] != lcm[iv])")
            while True:
                gcmix = gcm[ix]
                lcmiv = lcm[iv]
                if gcmix > lcmiv:
                    gcmil = gcm[ix + 1]
                    dx = (lcmiv - gcmil + 1) - (X[lcmiv] - X[gcmil]) * (gcmix - gcmil) / (X[gcmix] - X[gcmil])
                    iv += 1
                    if dx >= d:
                        d = dx
                        ig = ix + 1
                        ih = iv - 1
                        if debug:
                            print("L({0},{1})".format(ig, ih))
                else:
                    lcmivl = lcm[iv - 1]
                    dx = (X[gcmix] - X[lcmivl]) * (lcmiv - lcmivl) / (X[lcmiv] - X[lcmivl]) - (gcmix - lcmivl - 1)
                    ix -= 1
                    if dx >= d:
                        d = dx
                        ig = ix + 1
                        ih = iv
                        if debug:
                            print("G({0},{1})".format(ig, ih))
                if ix < 0:
                    ix = 0
                if iv > l_lcm:
                    iv = l_lcm
                if debug:
                    print("  --> ix = {0}, iv = {1}".format(ix, iv))
                if gcm[ix] == lcm[iv]:
                    break
        else:
            d = 0.0
            if debug:
                print("  ** (l_lcm,l_gcm) = ({0},{1}) ==> d := {2}".format(l_lcm, l_gcm, d))
        if d < dip_value:
            break
        if debug:
            print("  calculating dip ..")
        j_l = -1
        j_u = -1
        lcm_modalTriangle_i1 = -1
        lcm_modalTriangle_i3 = -1
        gcm_modalTriangle_i1 = -1
        gcm_modalTriangle_i3 = -1
        # The DIP for the convex minorant
        dip_l = 0
        for j in range(ig, l_gcm):
            max_t = 1
            j_ = -1
            jb = gcm[j + 1]
            je = gcm[j]
            if je - jb > 1 and X[je] != X[jb]:
                C = (je - jb) / (X[je] - X[jb])
                for jj in range(jb, je + 1):
                    t = (jj - jb + 1) - (X[jj] - X[jb]) * C
                    if max_t < t:
                        max_t = t
                        j_ = jj
            if dip_l < max_t:
                dip_l = max_t
                j_l = j_
                gcm_modalTriangle_i1 = jb
                gcm_modalTriangle_i3 = je
        # The DIP for the concave majorant
        dip_u = 0
        for j in range(ih, l_lcm):
            max_t = 1
            j_ = -1
            jb = lcm[j]
            je = lcm[j + 1]
            if je - jb > 1 and X[je] != X[jb]:
                C = (je - jb) / (X[je] - X[jb])
                for jj in range(jb, je + 1):
                    t = (X[jj] - X[jb]) * C - (jj - jb - 1)
                    if max_t < t:
                        max_t = t
                        j_ = jj
            if dip_u < max_t:
                dip_u = max_t
                j_u = j_
                lcm_modalTriangle_i1 = jb
                lcm_modalTriangle_i3 = je
        if debug:
            print(" (dip_l, dip_u) = ({0}, {1})".format(dip_l, dip_u))
        if dip_u > dip_l:
            dip_new = dip_u
            j_best = j_u
            if debug:
                print(" -> new larger dip {0} (j_best = {1}) gcm-centred triple ({2},{3},{4})".format(dip_new, j_best,
                                                                                                      lcm_modalTriangle_i1,
                                                                                                      j_best,
                                                                                                      lcm_modalTriangle_i3))
        else:
            dip_new = dip_l
            j_best = j_l
            if debug:
                print(" -> new larger dip {0} (j_best = {1}) lcm-centred triple ({2},{3},{4})".format(dip_new, j_best,
                                                                                                      gcm_modalTriangle_i1,
                                                                                                      j_best,
                                                                                                      gcm_modalTriangle_i3))
        if dip_value < dip_new:
            dip_value = dip_new
            if dip_u > dip_l:
                modaltriangle_i1 = lcm_modalTriangle_i1
                modaltriangle_i2 = j_best
                modaltriangle_i3 = lcm_modalTriangle_i3
            else:
                modaltriangle_i1 = gcm_modalTriangle_i1
                modaltriangle_i2 = j_best
                modaltriangle_i3 = gcm_modalTriangle_i3
        if low == gcm[ig] and high == lcm[ih]:
            if debug:
                print("No improvement in  low = {0}  nor  high = {1} --> END".format(low, high))
            break
        low = gcm[ig]
        high = lcm[ih]
    dip_value /= (2 * N)
    return dip_value, (low, high), (modaltriangle_i1, modaltriangle_i2, modaltriangle_i3), gcm, lcm, mn, mj


[docs]def dip_pval(dip_value: float, n_points: int, pval_strategy: str = "table", n_boots: int = 1000,
             random_state: np.random.RandomState | int = None) -> float:
    """
    Get the p-value of a corresponding Dip-value.
    P-values depend on the input Dip-value and the sample size.
    There are several strategies to calculate the p-value. These are:
    'table' (most common), 'function' (available for all sample sizes) and 'bootstrap' (slow for large sample sizes)

    Parameters
    ----------
    dip_value : flaat
        The Dip-value
    n_points : int
        The number of samples
    pval_strategy : str
        Specifies the strategy that should be used to calculate the p-value (default: 'table')
    n_boots : int
        Number of random data sets that should be created to calculate Dip-values. Only relevant if pval_strategy is 'bootstrap' (default: 1000)
    random_state : np.random.RandomState | int
        use a fixed random state to get a repeatable solution. Can also be of type int. Only relevant if pval_strategy is 'bootstrap' (default: None)

    Returns
    -------
    pval : float
        The resulting p-value

    References
    ----------
    Hartigan, John A., and Pamela M. Hartigan.
    "The dip test of unimodality." The annals of Statistics (1985): 70-84.

    and

    Bauer, Lena, et al. "Extension of the Dip-test Repertoire - Efficient and Differentiable p-value Calculation for Clustering."
    Proceedings of the 2023 SIAM International Conference on Data Mining (SDM). Society for Industrial and Applied Mathematics, 2023.
    """
    assert type(pval_strategy) is str, "pval_stratgegy must be of type string"
    pval_strategy = pval_strategy.lower()
    assert pval_strategy in ["bootstrap", "table",
                             "function"], "pval_strategy must match 'bootstrap', 'table' or 'function'. " \
                                          "Your input: {0}".format(pval_strategy)
    if n_points < 4:
        pval = 1.0
    elif pval_strategy == "bootstrap":
        boot_dips = dip_boot_samples(int(n_points), n_boots, random_state)
        pval = np.mean(dip_value <= boot_dips)
    elif pval_strategy == "table":
        pval = _dip_pval_table(dip_value, n_points)
    elif pval_strategy == "function":
        pval = _dip_pval_function(dip_value, n_points)
    else:
        raise Exception(
            "pval_strategy must match 'bootstrap', 'table' or 'function. Your input: {0}".format(pval_strategy))
    return pval


[docs]def dip_boot_samples(n_points: int, n_boots: int = 1000,
                     random_state: np.random.RandomState | int = None) -> np.ndarray:
    """
    Sample random data sets and calculate corresponding Dip-values.
    E.g. used to determine p-values.

    Parameters
    ----------
    n_points : int
        The number of samples
    n_boots : int
        Number of random data sets that should be created to calculate Dip-values (default: 1000)
    random_state : np.random.RandomState | int
        use a fixed random state to get a repeatable solution. Can also be of type int (default: None)

    Returns
    -------
    boot_dips : np.ndarray
        Array of Dip-values
    """
    # random uniform vectors
    random_state = check_random_state(random_state)
    boot_samples = random_state.rand(n_boots, n_points)
    boot_dips = np.array([dip_test(boot_s, just_dip=True, is_data_sorted=False) for boot_s in boot_samples])
    return boot_dips


def _get_complete_gcm_lcm(mn: np.ndarray, mj: np.ndarray, modal_interval: tuple) -> (np.ndarray, np.ndarray):
    """
    Complete the GCM and LCM returned by the Dip-test.
    Adapted from: https://github.com/samhelmholtz/skinny-dip/blob/master/code/skinny-dip/RPackageDipTestCustom/diptest/R/dip.R

    Parameters
    ----------
    mn : np.ndarray
        The minorant values
    mj : np.ndarray
        The majorant values
    modal_interval : tuple
        Indices of the modal interval - corresponds to the steepest slope in the ECDF (default: None)

    Returns
    -------
    tuple : (np.ndarray, np.ndarray)
        The adjusted indices of points that are part of the Greatest Convex Minorant,
        The adjusted indices of points that are part of the Least Concave Majorant
    """
    low = modal_interval[0]
    high = modal_interval[1]
    gcm = np.zeros(mn.shape[0], dtype=np.int32)
    lcm = np.zeros(mj.shape[0], dtype=np.int32)
    # Collect the gcm points from 0 to the upper end of the modal interval
    i = 0
    gcm[0] = high
    while gcm[i] > 0:
        gcm[i + 1] = mn[gcm[i]]
        i += 1
    gcm = gcm[:i + 1]
    # Collect the lcm points from the lower end of the modal interval to the last point
    i = 0
    lcm[i] = low
    while lcm[i] < mj.shape[0] - 1:
        lcm[i + 1] = mj[lcm[i]]
        i += 1
    lcm = lcm[:i + 1]
    return gcm, lcm


"""
Dip Gradient
"""


[docs]def dip_gradient(X: np.ndarray, X_proj: np.ndarray, sorted_indices: np.ndarray, modal_triangle: tuple) -> np.ndarray:
    """
    Calculate the gradient of the Dip-value regarding the projection axis.

    Parameters
    ----------
    X : np.ndarray
        the given data set
    X_proj : np.ndarray
        The univariate projected data set
    sorted_indices : np.ndarray
        The indices of the sorted univariate data set
    modal_triangle : tuple
        Indices of the modal triangle

    Returns
    -------
    gradient : np.ndarray
        The gradient of the Dip-value regarding the projection axis

    References
    ----------
    Krause, Andreas, and Volkmar Liebscher.
    "Multimodal projection pursuit using the dip statistic." (2005).
    """
    if modal_triangle is None or modal_triangle[0] == -1:
        return np.zeros(X.shape[1])
    data_index_i1 = sorted_indices[modal_triangle[0]]
    data_index_i2 = sorted_indices[modal_triangle[1]]
    data_index_i3 = sorted_indices[modal_triangle[2]]
    # Get A and c
    A = modal_triangle[0] - modal_triangle[1] + \
        (modal_triangle[2] - modal_triangle[0]) * (X_proj[data_index_i2] - X_proj[data_index_i1]) / (
                X_proj[data_index_i3] - X_proj[data_index_i1])
    constant = (modal_triangle[2] - modal_triangle[0]) / (2 * X.shape[0])
    # Check A
    if A < 0:
        constant = -constant
    # Calculate gradient
    quotient = (X_proj[data_index_i3] - X_proj[data_index_i1])
    gradient = (X[data_index_i2] - X[data_index_i1]) / quotient - \
               (X[data_index_i3] - X[data_index_i1]) * (
                       X_proj[data_index_i2] - X_proj[data_index_i1]) / quotient ** 2
    gradient = gradient * constant
    return gradient


[docs]def dip_pval_gradient(X: np.ndarray, X_proj: np.ndarray, sorted_indices: np.ndarray, modal_triangle: tuple,
                      dip_value: float) -> np.ndarray:
    """
    Calculate the gradient of the Dip p-value function regarding the projection axis.

    Parameters
    ----------
    X : np.ndarray
        the given data set
    X_proj : np.ndarray
        The univariate projected data set
    sorted_indices : np.ndarray
        The indices of the sorted univariate data set
    modal_triangle : tuple
        Indices of the modal triangle
    dip_value : float
        The Dip-value

    Returns
    -------
    pval_grad : np.ndarray
        The gradient of the Dip p-value function regarding the projection axis

    References
    ----------
    Bauer, Lena, et al. "Extension of the Dip-test Repertoire - Efficient and Differentiable p-value Calculation for Clustering."
    Proceedings of the 2023 SIAM International Conference on Data Mining (SDM). Society for Industrial and Applied Mathematics, 2023.
    """
    if modal_triangle is None or modal_triangle[0] == -1:
        return np.zeros(X.shape[1])
    # Calculate gradient of dip-value
    dip_grad = dip_gradient(X, X_proj, sorted_indices, modal_triangle)
    # Get factor for that gradient from p-value gradeint
    b = _dip_pval_function_get_b(X.shape[0])
    exponent = np.exp(-b * dip_value + 6.5)
    quotient = (0.6 * (1 + 1.6 * exponent) ** (0.625) + 0.4 * (1 + 0.2 * exponent) ** 5) ** 2
    grad_factor = (0.6 * (1 + 1.6 * exponent) ** (-0.375) + 0.4 * (1 + 0.2 * exponent) ** 4) / quotient
    grad_factor *= exponent * -b
    # Combine values
    pval_grad = grad_factor * dip_grad
    return pval_grad


"""
Plot
"""


[docs]def plot_dip(X: np.ndarray, is_data_sorted: bool = False, dip_value: float = None, modal_interval: tuple = None,
             modal_triangle: tuple = None, gcm: np.ndarray = None, lcm: np.ndarray = None, linewidth_ecdf: float = 1,
             linewidth_extra: float = 2, show_legend: bool = True, add_histogram: bool = True,
             histogram_labels: np.ndarray = None, histogram_show_legend: bool = True, histogram_density: bool = True,
             histogram_n_bins: int = 100, height_ratio: tuple = (1, 2), show_plot: bool = True) -> None:
    """
    Plot a visual representation of the computational process of the Dip.
    Upper part shows an optional histogram of the data and the lower part shows the corresponding ECDF.

    Parameters
    ----------
    X : np.ndarray
        the given data set
    is_data_sorted : bool
        Should be True if the data set is already sorted (default: False)
    dip_value : float
        The Dip-value (default: None)
    modal_interval : tuple
        Indices of the modal interval - corresponds to the steepest slope in the ECDF (default: None)
    modal_triangle : tuple
        Indices of the modal triangle (default: None)
    gcm : np.ndarray
        The indices of points that are part of the Greatest Convex Minorant (gcm) (default: None)
    lcm : np.ndarray
        The indices of points that are part of the Least Concave Majorant (lcm) (default None)
    linewidth_ecdf : flaot
        The linewidth for the eCDF (default: 1)
    linewidth_extra : float
        The linewidth for the visualization of the dip, modal interval, modal triangle, gcm and lcm (default: 2)
    show_legend : bool
        Defines whether the legend of the ECDF plot should be added (default: True)
    add_histogram : bool
        Defines whether the histogram should be shown above the ECDF plot (default: True)
    histogram_labels : np.ndarray
        Labels used to color parts of the histogram (default: None)
    histogram_show_legend : bool
        Defines whether the legend of the histogram should be added (default: True)
    histogram_density : bool
        Defines whether a kernel density should be added to the histogram plot (default: True)
    histogram_n_bins : int
        Number of bins used for the histogram (default: 100)
    height_ratio : tuple
        Defines the height ratio between histogram and ECDF plot. Only relevant if add_histogram is True.
        First value in the tuple defines the height of the histogram and the second value the height of the ECDF plot (default: (1, 2))
    show_plot : bool
        Defines whether the plot should directly be plotted (default: True)
    """
    assert X.ndim == 1, "Data must be 1-dimensional for the dip-test. Your shape:{0}".format(X.shape)
    N = len(X)
    if not is_data_sorted:
        argsorted_X = np.argsort(X)
        X = X[argsorted_X]
        if histogram_labels is not None:
            histogram_labels = histogram_labels[argsorted_X]
    if add_histogram:
        # Add histogram at the top of the plot (uses plot_histogram from clustpy.utils.plots)
        fig, (ax1, ax2) = plt.subplots(2, sharex=True, gridspec_kw={'height_ratios': height_ratio})
        plot_histogram(X, histogram_labels, density=histogram_density, n_bins=histogram_n_bins,
                       show_legend=histogram_show_legend,
                       container=ax1, show_plot=False)
        if modal_interval is not None:
            y_axis_limit = ax1.get_ylim()
            ax1.plot([X[modal_interval[0]], X[modal_interval[0]]], y_axis_limit, "g--", linewidth=linewidth_extra)
            ax1.plot([X[modal_interval[1]], X[modal_interval[1]]], y_axis_limit, "g--", linewidth=linewidth_extra)
        dip_container = ax2
        # Remove spacing between the two plots
        fig.subplots_adjust(hspace=0)
    else:
        dip_container = plt
    # Plot ECDF
    dip_container.plot(X, np.arange(N) / N, "b", label="eCDF", linewidth=linewidth_ecdf)
    if dip_value is not None:
        # Add Dip range around ECDF
        dip_container.plot(X, np.arange(N) / N - dip_value * 2, "k:", alpha=0.7, label="2x dip",
                           linewidth=linewidth_extra)
        dip_container.plot(X, np.arange(N) / N + dip_value * 2, "k:", alpha=0.7, linewidth=linewidth_extra)
    if modal_interval is not None:
        # Add modal interval in green
        dip_container.plot([X[modal_interval[0]], X[modal_interval[1]]], [modal_interval[0] / N, modal_interval[1] / N],
                           linewidth=linewidth_extra, c="g", label="modal interval")
    if modal_triangle is not None:
        # Add modal triangle in red
        dip_container.plot([X[modal_triangle[0]], X[modal_triangle[1]], X[modal_triangle[2]], X[modal_triangle[0]]],
                           [modal_triangle[0] / N, modal_triangle[1] / N, modal_triangle[2] / N, modal_triangle[0] / N],
                           linewidth=linewidth_extra, c="r", label="modal triangle")
    # Add process of gcm and lcm curves
    if gcm is not None:
        dip_container.plot(X[gcm], gcm / N, "y--", linewidth=linewidth_extra, label="gcm")
    if lcm is not None:
        dip_container.plot(X[lcm], lcm / N, "c--", linewidth=linewidth_extra, label="lcm")
    if show_legend:
        dip_container.legend(loc="lower right")
    if show_plot:
        plt.show()


"""
Dip p-value methods
"""


def _dip_pval_table(dip_value: float, n_points: int) -> float:
    """
    Get the p-value of a corresponding Dip-value using a lookup table.
    Depends on the number of samples.
    Source: https://github.com/alimuldal/diptest

    Parameters
    ----------
    dip_value : flaat
        The Dip-value
    n_points : int
        The number of samples

    Returns
    -------
    pval : float
        The resulting p-value

    References
    ----------
    Hartigan, John A., and Pamela M. Hartigan.
    "The dip test of unimodality." The annals of Statistics (1985): 70-84.
    """
    N, SIG, CV = _get_dip_table_values()
    if n_points > max(N):
        print(
            "[dip_pval_table] WARNING: The number of samples is too large for pval_strategy 'table' (max n_points is 72000), 'function' will be used instead.")
        return _dip_pval_function(dip_value, n_points)
    i1 = N.searchsorted(n_points, side='left')
    i0 = i1 - 1
    # if n falls outside the range of tabulated sample sizes, use the
    # critical values for the nearest tabulated n (i.e. treat them as
    # 'asymptotic')
    i0 = max(0, i0)
    i1 = min(N.shape[0] - 1, i1)
    # interpolate on sqrt(n)
    n0, n1 = N[[i0, i1]]
    if i0 != i1:
        fn = float(n_points - n0) / (n1 - n0)
    else:
        fn = 0
    y0 = np.sqrt(n0) * CV[i0]
    y1 = np.sqrt(n1) * CV[i1]
    sD = np.sqrt(n_points) * dip_value
    pval = 1. - np.interp(sD, y0 + fn * (y1 - y0), SIG)
    return pval


def _dip_pval_function(dip_value: float, n_points: int) -> float:
    """
    Get the p-value of a corresponding Dip-value using the sigmoid function as described by Bauer et al..
    Depends on the number of samples.

    Parameters
    ----------
    dip_value : flaat
        The Dip-value
    n_points : int
        The number of samples

    Returns
    -------
    pval : float
        The resulting p-value

    References
    ----------
    Bauer, Lena, et al. "Extension of the Dip-test Repertoire - Efficient and Differentiable p-value Calculation for Clustering."
    Proceedings of the 2023 SIAM International Conference on Data Mining (SDM). Society for Industrial and Applied Mathematics, 2023.
    """
    b = _dip_pval_function_get_b(n_points)
    exponent = np.exp(-b * dip_value + 6.5)
    pval = 1 - 1 / (0.6 * (1 + 1.6 * exponent) ** (0.625) + 0.4 * (1 + 0.2 * exponent) ** 5)
    return pval


def _dip_pval_function_get_b(n_points: int) -> float:
    """
    Helper function for the Dip-p-value calculation using the sigmoid function by Bauer et al..

    Parameters
    ----------
    n_points : int
        The number of samples

    Returns
    -------
    b : float
        The b value

    References
    ----------
    Bauer, Lena, et al. "Extension of the Dip-test Repertoire - Efficient and Differentiable p-value Calculation for Clustering."
    Proceedings of the 2023 SIAM International Conference on Data Mining (SDM). Society for Industrial and Applied Mathematics, 2023.
    """
    b = 17.30784022 * np.sqrt(n_points) + 12.04917889
    return b


def _get_dip_table_values() -> (np.ndarray, np.ndarray, np.ndarray):
    """
    Get the values of the Dip-p-value lookup table.

    Returns
    -------
    tuple : (np.ndarray, np.ndarray, np.ndarray)
        The sample sizes (21 values),
        The probabilities (26 values),
        The Dip-values (21 x 26 values)
    """
    N = np.array([4, 5, 6, 7, 8, 9, 10, 15, 20,
                  30, 50, 100, 200, 500, 1000, 2000, 5000, 10000,
                  20000, 40000, 72000])
    SIG = np.array([0., 0.01, 0.02, 0.05, 0.1, 0.2,
                    0.3, 0.4, 0.5, 0.6, 0.7, 0.8,
                    0.9, 0.95, 0.98, 0.99, 0.995, 0.998,
                    0.999, 0.9995, 0.9998, 0.9999, 0.99995, 0.99998,
                    0.99999, 1.])
    # [len(N), len(SIG)] table of critical values
    CV = np.array([[0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.132559548782689,
                    0.157497369040235, 0.187401878807559, 0.20726978858736, 0.223755804629222, 0.231796258864192,
                    0.237263743826779, 0.241992892688593, 0.244369839049632, 0.245966625504691, 0.247439597233262,
                    0.248230659656638, 0.248754269146416, 0.249302039974259, 0.249459652323225, 0.24974836247845],
                   [0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.108720593576329, 0.121563798026414, 0.134318918697053,
                    0.147298798976252, 0.161085025702604, 0.176811998476076, 0.186391796027944, 0.19361253363045,
                    0.196509139798845, 0.198159967287576, 0.199244279362433, 0.199617527406166, 0.199800941499028,
                    0.199917081834271, 0.199959029093075, 0.199978395376082, 0.199993151405815, 0.199995525025673,
                    0.199999835639211],
                   [0.0833333333333333, 0.0833333333333333, 0.0833333333333333, 0.0833333333333333,
                    0.0833333333333333, 0.0924514470941933, 0.103913431059949, 0.113885220640212, 0.123071187137781,
                    0.13186973390253, 0.140564796497941, 0.14941924112913, 0.159137064572627, 0.164769608513302,
                    0.179176547392782, 0.191862827995563, 0.202101971042968, 0.213015781111186, 0.219518627282415,
                    0.224339047394446, 0.229449332154241, 0.232714530449602, 0.236548128358969, 0.2390887911995,
                    0.240103566436295, 0.244672883617768],
                   [0.0714285714285714, 0.0714285714285714, 0.0714285714285714, 0.0725717816250742,
                    0.0817315478539489, 0.09405901819225269, 0.103244490800871, 0.110964599995697,
                    0.117807846504335, 0.124216086833531, 0.130409013968317, 0.136639642123068, 0.144240669035124,
                    0.159903395678336, 0.175196553271223, 0.184118659121501, 0.191014396174306, 0.198216795232182,
                    0.202341010748261, 0.205377566346832, 0.208306562526874, 0.209866047852379, 0.210967576933451,
                    0.212233348558702, 0.212661038312506, 0.21353618608817],
                   [0.0625, 0.0625, 0.06569119945032829, 0.07386511360717619, 0.0820045917762512,
                    0.0922700601131892, 0.09967371895993631, 0.105733531802737, 0.111035129847705,
                    0.115920055749988, 0.120561479262465, 0.125558759034845, 0.141841067033899, 0.153978303998561,
                    0.16597856724751, 0.172988528276759, 0.179010413496374, 0.186504388711178, 0.19448404115794,
                    0.200864297005026, 0.208849997050229, 0.212556040406219, 0.217149174137299, 0.221700076404503,
                    0.225000835357532, 0.233772919687683],
                   [0.0555555555555556, 0.0613018090298924, 0.0658615858179315, 0.0732651142535317,
                    0.0803941629593475, 0.0890432420913848, 0.0950811420297928, 0.09993808978110461,
                    0.104153560075868, 0.108007802361932, 0.112512617124951, 0.122915033480817, 0.136412639387084,
                    0.146603784954019, 0.157084065653166, 0.164164643657217, 0.172821674582338, 0.182555283567818,
                    0.188658833121906, 0.194089120768246, 0.19915700809389, 0.202881598436558, 0.205979795735129,
                    0.21054115498898, 0.21180033095039, 0.215379914317625],
                   [0.05, 0.0610132555623269, 0.0651627333214016, 0.0718321619656165, 0.077966212182459,
                    0.08528353598345639, 0.09032041737070989, 0.0943334983745117, 0.0977817630384725,
                    0.102180866696628, 0.109960948142951, 0.118844767211587, 0.130462149644819, 0.139611395137099,
                    0.150961728882481, 0.159684158858235, 0.16719524735674, 0.175419540856082, 0.180611195797351,
                    0.185286416050396, 0.191203083905044, 0.195805159339184, 0.20029398089673, 0.205651089646219,
                    0.209682048785853, 0.221530282182963],
                   [0.0341378172277919, 0.0546284208048975, 0.0572191260231815, 0.0610087367689692,
                    0.06426571373304441, 0.06922341079895911, 0.0745462114365167, 0.07920308789817621,
                    0.083621033469191, 0.08811984822029049, 0.093124666680253, 0.0996694393390689,
                    0.110087496900906, 0.118760769203664, 0.128890475210055, 0.13598356863636, 0.142452483681277,
                    0.150172816530742, 0.155456133696328, 0.160896499106958, 0.166979407946248, 0.17111793515551,
                    0.175900505704432, 0.181856676013166, 0.185743454151004, 0.192240563330562],
                   [0.033718563622065, 0.0474333740698401, 0.0490891387627092, 0.052719998201553,
                    0.0567795509056742, 0.0620134674468181, 0.06601638720690479, 0.06965060750664009,
                    0.07334377405927139, 0.07764606628802539, 0.0824558407118372, 0.08834462700173699,
                    0.09723460181229029, 0.105130218270636, 0.114309704281253, 0.120624043335821, 0.126552378036739,
                    0.13360135382395, 0.138569903791767, 0.14336916123968, 0.148940116394883, 0.152832538183622,
                    0.156010163618971, 0.161319225839345, 0.165568255916749, 0.175834459522789],
                   [0.0262674485075642, 0.0395871890405749, 0.0414574606741673, 0.0444462614069956,
                    0.0473998525042686, 0.0516677370374349, 0.0551037519001622, 0.058265005347493,
                    0.0614510857304343, 0.0649164408053978, 0.0689178762425442, 0.0739249074078291,
                    0.08147913793901269, 0.0881689143126666, 0.0960564383013644, 0.101478558893837,
                    0.10650487144103, 0.112724636524262, 0.117164140184417, 0.121425859908987, 0.126733051889401,
                    0.131198578897542, 0.133691739483444, 0.137831637950694, 0.141557509624351, 0.163833046059817],
                   [0.0218544781364545, 0.0314400501999916, 0.0329008160470834, 0.0353023819040016,
                    0.0377279973102482, 0.0410699984399582, 0.0437704598622665, 0.0462925642671299,
                    0.048851155289608, 0.0516145897865757, 0.0548121932066019, 0.0588230482851366,
                    0.06491363240467669, 0.0702737877191269, 0.07670958860791791, 0.0811998415355918,
                    0.0852854646662134, 0.09048478274902939, 0.0940930106666244, 0.0974904344916743,
                    0.102284204283997, 0.104680624334611, 0.107496694235039, 0.11140887547015, 0.113536607717411,
                    0.117886716865312],
                   [0.0164852597438403, 0.022831985803043, 0.0238917486442849, 0.0256559537977579,
                    0.0273987414570948, 0.0298109370830153, 0.0317771496530253, 0.0336073821590387,
                    0.0354621760592113, 0.0374805844550272, 0.0398046179116599, 0.0427283846799166,
                    0.047152783315718, 0.0511279442868827, 0.0558022052195208, 0.059024132304226,
                    0.0620425065165146, 0.06580160114660991, 0.0684479731118028, 0.0709169443994193,
                    0.0741183486081263, 0.0762579402903838, 0.0785735967934979, 0.08134583568891331,
                    0.0832963013755522, 0.09267804230967371],
                   [0.0111236388849688, 0.0165017735429825, 0.0172594157992489, 0.0185259426032926,
                    0.0197917612637521, 0.0215233745778454, 0.0229259769870428, 0.024243848341112,
                    0.025584358256487, 0.0270252129816288, 0.0286920262150517, 0.0308006766341406,
                    0.0339967814293504, 0.0368418413878307, 0.0402729850316397, 0.0426864799777448,
                    0.044958959158761, 0.0477643873749449, 0.0497198001867437, 0.0516114611801451,
                    0.0540543978864652, 0.0558704526182638, 0.0573877056330228, 0.0593365901653878,
                    0.0607646310473911, 0.0705309107882395],
                   [0.00755488597576196, 0.0106403461127515, 0.0111255573208294, 0.0119353655328931,
                    0.0127411306411808, 0.0138524542751814, 0.0147536004288476, 0.0155963185751048,
                    0.0164519238025286, 0.017383057902553, 0.0184503949887735, 0.0198162679782071,
                    0.0218781313182203, 0.0237294742633411, 0.025919578977657, 0.0274518022761997,
                    0.0288986369564301, 0.0306813505050163, 0.0320170996823189, 0.0332452747332959,
                    0.0348335698576168, 0.0359832389317461, 0.0369051995840645, 0.0387221159256424,
                    0.03993025905765, 0.0431448163617178],
                   [0.00541658127872122, 0.00760286745300187, 0.007949878346447991, 0.008521651834359399,
                    0.00909775605533253, 0.009889245210140779, 0.0105309297090482, 0.0111322726797384,
                    0.0117439009052552, 0.012405033293814, 0.0131684179320803, 0.0141377942603047,
                    0.0156148055023058, 0.0169343970067564, 0.018513067368104, 0.0196080260483234,
                    0.0206489568587364, 0.0219285176765082, 0.0228689168972669, 0.023738710122235,
                    0.0248334158891432, 0.0256126573433596, 0.0265491336936829, 0.027578430100536, 0.0284430733108,
                    0.0313640941982108],
                   [0.00390439997450557, 0.00541664181796583, 0.00566171386252323, 0.00607120971135229,
                    0.0064762535755248, 0.00703573098590029, 0.00749421254589299, 0.007920878896017331,
                    0.008355737247680061, 0.00882439333812351, 0.00936785820717061, 0.01005604603884,
                    0.0111019116837591, 0.0120380990328341, 0.0131721010552576, 0.0139655122281969,
                    0.0146889122204488, 0.0156076779647454, 0.0162685615996248, 0.0168874937789415,
                    0.0176505093388153, 0.0181944265400504, 0.0186226037818523, 0.0193001796565433,
                    0.0196241518040617, 0.0213081254074584],
                   [0.00245657785440433, 0.00344809282233326, 0.00360473943713036, 0.00386326548010849,
                    0.00412089506752692, 0.00447640050137479, 0.00476555693102276, 0.00503704029750072,
                    0.00531239247408213, 0.00560929919359959, 0.00595352728377949, 0.00639092280563517,
                    0.00705566126234625, 0.0076506368153935, 0.00836821687047215, 0.008863578928549141,
                    0.00934162787186159, 0.009932186363240289, 0.0103498795291629, 0.0107780907076862,
                    0.0113184316868283, 0.0117329446468571, 0.0119995948968375, 0.0124410052027886,
                    0.0129467396733128, 0.014396063834027],
                   [0.00174954269199566, 0.00244595133885302, 0.00255710802275612, 0.00273990955227265,
                    0.0029225480567908, 0.00317374638422465, 0.00338072258533527, 0.00357243876535982,
                    0.00376734715752209, 0.00397885007249132, 0.00422430013176233, 0.00453437508148542,
                    0.00500178808402368, 0.00542372242836395, 0.00592656681022859, 0.00628034732880374,
                    0.00661030641550873, 0.00702254699967648, 0.00731822628156458, 0.0076065423418208,
                    0.00795640367207482, 0.008227052458435399, 0.00852240989786251, 0.00892863905540303,
                    0.009138539330002131, 0.009522345795667729],
                   [0.00119458814106091, 0.00173435346896287, 0.00181194434584681, 0.00194259470485893,
                    0.00207173719623868, 0.00224993202086955, 0.00239520831473419, 0.00253036792824665,
                    0.00266863168718114, 0.0028181999035216, 0.00299137548142077, 0.00321024899920135,
                    0.00354362220314155, 0.00384330190244679, 0.00420258799378253, 0.00445774902155711,
                    0.00469461513212743, 0.00499416069129168, 0.00520917757743218, 0.00540396235924372,
                    0.00564540201704594, 0.00580460792299214, 0.00599774739593151, 0.00633099254378114,
                    0.00656987109386762, 0.00685829448046227],
                   [0.000852415648011777, 0.00122883479310665, 0.00128469304457018, 0.00137617650525553,
                    0.00146751502006323, 0.00159376453672466, 0.00169668445506151, 0.00179253418337906,
                    0.00189061261635977, 0.00199645471886179, 0.00211929748381704, 0.00227457698703581,
                    0.00250999080890397, 0.00272375073486223, 0.00298072958568387, 0.00315942194040388,
                    0.0033273652798148, 0.00353988965698579, 0.00369400045486625, 0.00383345715372182,
                    0.00400793469634696, 0.00414892737222885, 0.0042839159079761, 0.00441870104432879,
                    0.00450818604569179, 0.00513477467565583],
                   [0.000644400053256997, 0.000916872204484283, 0.000957932946765532, 0.00102641863872347,
                    0.00109495154218002, 0.00118904090369415, 0.00126575197699874, 0.00133750966361506,
                    0.00141049709228472, 0.00148936709298802, 0.00158027541945626, 0.00169651643860074,
                    0.00187306184725826, 0.00203178401610555, 0.00222356097506054, 0.00235782814777627,
                    0.00248343580127067, 0.00264210826339498, 0.0027524322157581, 0.0028608570740143,
                    0.00298695044508003, 0.00309340092038059, 0.00319932767198801, 0.00332688234611187,
                    0.00339316094477355, 0.00376331697005859]])
    return N, SIG, CV