import warnings
import numpy as np
from scipy.stats import norm
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
def GaMRed_hist(x, y, K, draw, SW):
    """
    Estimate noise components using a Gaussian Mixture Model (GMM) fitted to histogram data.

    Fits a K-component Gaussian mixture to histogrammed data with the
    Expectation-Maximization (EM) algorithm, computes the Bayesian
    Information Criterion (BIC) for the fit, and determines a threshold
    separating the mixture components along the x-axis.

    Parameters
    ----------
    x : ndarray of shape (M,)
        Bin centers of the histogram (can be unsorted; sorted internally).
    y : ndarray of shape (M,)
        Bin counts (frequency of observations for each bin).
    K : int
        Number of Gaussian components to fit in the mixture model.
    draw : bool
        Whether to print diagnostic information or draw plots.
    SW : object
        Setting passed through to :func:`EM_iter_hist`, where it acts as the
        minimum allowed standard deviation.

    Returns
    -------
    thr : float
        Threshold value separating mixture components along the x-axis.
    bic : float
        Bayesian Information Criterion (BIC) value for the fitted model.
    stats : dict
        Dictionary containing model statistics and parameters:

        - ``"thr"`` : float, threshold value
        - ``"alpha"`` : ndarray, mixture component weights
        - ``"mu"`` : ndarray, component means
        - ``"K"`` : int, number of fitted components
        - ``"sigma"`` : ndarray, component standard deviations
        - ``"logL"`` : float, log-likelihood of the fitted model

    Raises
    ------
    ValueError
        If the EM algorithm fails (invalid log-likelihood leading to an
        infinite or zero BIC), prompting a rerun with different
        initialization.

    Notes
    -----
    - For ``K == 1``, no separation is possible and the threshold is set
      just below the minimum x value.
    - For ``K == 2``, the threshold is computed directly between the two
      Gaussian components.
    - For ``K > 2``, components are grouped into two clusters using
      :class:`sklearn.cluster.KMeans` before determining the threshold.

    References
    ----------
    MATLAB implementation : Michal Marczyk (Michal.Marczyk@polsl.pl)
    Algorithm is described in \\ :footcite:p:`Marczyk2018GaMRed`

    Examples
    --------
    >>> thr, bic, stats = GaMRed_hist(x, y, K=2, draw=False, SW=my_settings)
    >>> print(f"Threshold: {thr:.3f}, BIC: {bic:.2f}")
    >>> print("Component means:", stats["mu"])
    """
    ind = np.argsort(x)
    x = x[ind]
    y = y[ind]
    N = np.sum(y)  # total number of measurements
    bic = np.inf
    # remove no-signal bins at the beginning and the end
    y[0] = 0
    y[-1] = 0
    # initial conditions
    if K == 1:
        # BUGFIX: use 1-element arrays instead of scalars — EM_iter_hist
        # calls len() and indexes its parameters, so scalar inits crashed.
        alpha_init = np.array([1.0])
        mi_init = np.array([np.mean(x)])
        sigma_init = np.array([np.std(x)])
    else:
        alpha_init, mi_init, sigma_init = gmm_init_dp_hist(x, y, K)
    if draw:
        print("Starting values")
        print(f"alpha_init: {alpha_init}, mi_init: {mi_init}, sigma_init: {sigma_init}")
    # NOTE: on an invalid BIC the ValueError below fires before the loop can
    # repeat — the caller is expected to rerun with a new initialization.
    while bic == np.inf or bic == 0:
        # EM algorithm
        alpha, mi, sigma, logL = EM_iter_hist(x, y, alpha_init, mi_init, sigma_init, SW)
        # BIC with 3K-1 free parameters (K weights summing to 1, K means, K sigmas)
        bic = -2 * logL + (3 * K - 1) * np.log(N)
        if bic == np.inf or bic == 0:
            raise ValueError("EM crash. Repeat calculations.")
    # order components by increasing mean
    ind = np.argsort(mi)
    mi = mi[ind]
    alpha = alpha[ind]
    sigma = sigma[ind]
    if draw:
        print("Final values")
        print(f"alpha: {alpha}, mi: {mi}, sigma: {sigma}")
    # find threshold between components
    if K == 1:
        # no separation possible: put the threshold just below the data range
        thr = np.min(x) - 1e-10
    elif K == 2:
        thr = find_thr(x, alpha, mi, sigma, np.array([0, 1]), draw)
    else:
        # group the K components into two clusters in (alpha, mu, sigma) space
        temp = np.column_stack((alpha, mi, sigma))
        kmeans = KMeans(n_clusters=2, n_init=50, random_state=0).fit(temp)
        idx = kmeans.labels_
        # NOTE(review): labels_ are 0/1, so idx-1 yields -1/0, which
        # find_thr casts to bool True/False — confirm this matches the
        # intended cluster assignment (ported from MATLAB's 1-based labels).
        thr = find_thr(x, alpha, mi, sigma, idx - 1, draw)
    stats = {
        "thr": thr,
        "alpha": alpha,
        "mu": mi,
        "K": K,
        "sigma": sigma,
        "logL": logL,
    }
    return thr, bic, stats
def EM_iter_hist(x, y, alpha, mu, sig, SW):
    """
    Run the Expectation-Maximization (EM) algorithm for a GMM on binned data.

    Iteratively updates the mixture weights, means, and standard deviations
    of a Gaussian mixture fitted to histogram data until convergence or a
    maximum number of iterations is reached. Returns the updated parameters
    and the log-likelihood of the fitted model.

    Parameters
    ----------
    x : ndarray of shape (N,)
        Bin centers of the histogram.
    y : ndarray of shape (N,)
        Counts or frequencies for each bin.
    alpha : array_like of shape (K,) or scalar
        Initial weights of the Gaussian components (sum to 1). Inputs are
        copied; the caller's arrays are never modified.
    mu : array_like of shape (K,) or scalar
        Initial means of the Gaussian components.
    sig : array_like of shape (K,) or scalar
        Initial standard deviations of the Gaussian components.
    SW : float
        Minimum allowed standard deviation (safeguard against collapsing
        variances).

    Returns
    -------
    alpha : ndarray of shape (K,)
        Updated component weights, ordered according to the sorted means.
    mu : ndarray of shape (K,)
        Updated component means, sorted in ascending order.
    sig : ndarray of shape (K,)
        Updated component standard deviations, ordered according to `mu`.
    logL : float
        Log-likelihood of the histogram data under the fitted GMM.

    Notes
    -----
    - Stops when the combined change in weights and relative change in
      variances drops below 1e-6, or after 10,000 iterations.

    Examples
    --------
    >>> alpha, mu, sig, logL = EM_iter_hist(x, y, alpha_init, mu_init, sig_init, SW=0.01)
    >>> print("Updated means:", mu)
    >>> print("Log-likelihood:", logL)
    """
    # BUGFIX: work on float copies. The previous version updated the
    # caller's arrays in place (corrupting the initial conditions for any
    # rerun) and crashed on scalar inputs; atleast_1d also accepts K == 1
    # scalar initial conditions.
    alpha = np.atleast_1d(np.asarray(alpha, dtype=float)).copy()
    mu = np.atleast_1d(np.asarray(mu, dtype=float)).copy()
    sig = np.atleast_1d(np.asarray(sig, dtype=float)).copy()
    N = len(y)
    n = np.sum(y)
    sig2 = np.maximum(sig ** 2, SW ** 2)
    change = np.inf
    count = 1
    SW2 = SW ** 2  # minimum allowed variance
    eps_change = 1e-6
    KS = len(alpha)
    while change > eps_change and count < 10000:
        old_alpha = alpha.copy()
        old_sig2 = sig2.copy()
        # E-step: per-component densities at the bin centers
        f = np.zeros((KS, N))
        sig = np.sqrt(sig2)
        for a in range(KS):
            f[a, :] = norm.pdf(x, mu[a], sig[a])
        px = alpha @ f  # mixture density per bin
        # guard against log(0)/division by zero in empty bins: replace with
        # the smallest positive subnormal double
        px[np.isnan(px) | (px == 0)] = 5e-324
        # M-step: update each component from its count-weighted responsibilities
        for a in range(KS):
            pk = ((alpha[a] * f[a, :]) * y) / px
            denom = np.sum(pk)
            mu[a] = (pk @ x) / denom
            sig2[a] = np.maximum(SW2, (pk @ ((x - mu[a]) ** 2)) / denom)
            alpha[a] = denom / n
        change = np.sum(np.abs(alpha - old_alpha)) + np.sum(np.abs(sig2 - old_sig2) / sig2) / len(alpha)
        count += 1
    # return results sorted by component mean
    logL = np.sum(np.log(px) * y)
    ind = np.argsort(mu)
    return alpha[ind], mu[ind], np.sqrt(sig2[ind]), logL
def gmm_init_dp_hist(x, y, K):
    """
    Compute initial GMM parameters via dynamic programming on binned data.

    Estimates starting values for the mixture weights, means, and standard
    deviations of a GMM by partitioning the histogram (binned data) into K
    contiguous segments that minimize a penalized dispersion criterion.

    Parameters
    ----------
    x : ndarray of shape (N,)
        Bin centers of the histogram (assumed sorted and equally spaced).
    y : ndarray of shape (N,)
        Counts (frequencies) of each bin.
    K : int
        Number of Gaussian components (partitions) to initialize.

    Returns
    -------
    alpha : ndarray of shape (K,)
        Initial weights, proportional to the sum of counts in each partition.
    mu : ndarray of shape (K,)
        Initial means, the count-weighted averages of bin centers within
        each partition.
    sigma : ndarray of shape (K,)
        Initial standard deviations, corrected for binning with Sheppard's
        correction.

    Notes
    -----
    - Sheppard's correction ``s_corr = ((x[1] - x[0]) ** 2) / 12``
      compensates for variance inflation due to binning; it assumes
      equally spaced bins.
    - NOTE(review): if a final partition's weighted variance is smaller
      than ``s_corr``, the closing ``np.sqrt`` yields NaN for that
      component — confirm upstream data makes this impossible.

    Examples
    --------
    >>> alpha_init, mu_init, sigma_init = gmm_init_dp_hist(x, y, K=3)
    >>> print("Initial weights:", alpha_init)
    >>> print("Initial means:", mu_init)
    >>> print("Initial standard deviations:", sigma_init)
    """
    # parameters
    par1 = 0.1  # additive robustness term in the cost (fine for data in range 0-20)
    par2 = 10  # min number of points in a signal fragment
    # initialize
    s_corr = ((x[1] - x[0]) ** 2) / 12  # Sheppard's correction for binned data
    K = K - 1  # number of partition boundaries to place (K segments need K-1 splits)
    N = len(x)
    # p_opt_idx[a]: optimal cost of partitioning the tail bins a..N-1
    p_opt_idx = np.zeros(N)
    p_aux = np.zeros(N)
    # opt_pals[k, a]: best first split point for a segment starting at a with k splits placed
    opt_pals = np.zeros((K, N))
    # base case: cost of treating the tail x[a:] as a single segment
    for a in range(N):
        invec = x[a:N]
        yinvec = y[a:N]
        if np.sum(yinvec) <= par2:
            # too few observations: forbid this fragment
            p_opt_idx[a] = np.inf
        else:
            wwec = yinvec / (np.sum(yinvec))
            # count-weighted variance of bin centers within the fragment
            var_bin = np.sum(((invec - np.sum(invec * wwec)) ** 2) * wwec)
            if var_bin > s_corr:
                # penalized spread, normalized by the fragment's x-range
                p_opt_idx[a] = (par1 + np.sqrt(var_bin - s_corr)) / (np.max(invec) - np.min(invec))
            else:
                p_opt_idx[a] = np.inf
    # aux_mx[a, b]: cost of the fragment x[a:b] taken as a single segment
    aux_mx = np.zeros((N, N))
    for a in range(N - 1):
        for b in range(a + 1, N):
            invec = x[a:b]
            yinvec = y[a:b]
            if np.sum(yinvec) <= par2:
                aux_mx[a, b] = np.inf
            else:
                wwec = yinvec / (np.sum(yinvec))
                var_bin = np.sum(((invec - np.sum(invec * wwec)) ** 2) * wwec)
                if var_bin > s_corr:
                    aux_mx[a, b] = (par1 + np.sqrt(var_bin - s_corr)) / (np.max(invec) - np.min(invec))
                else:
                    aux_mx[a, b] = np.inf
    # dynamic-programming recursion: each outer iteration adds one split point
    for kster in range(K):
        for a in range(N - kster - 1):
            for b in range(a + 1, N - kster):
                # cost of segment [a, b) plus optimal cost of the remainder
                p_aux[b] = aux_mx[a, b] + p_opt_idx[b]
            mm = np.min(p_aux[a + 1:N - kster])
            ix = np.argmin(p_aux[a + 1:N - kster])
            p_opt_idx[a] = mm
            opt_pals[kster, a] = a + ix + 1
    # restore optimal decisions (backtrack the chain of split points)
    opt_part = np.zeros(K)
    opt_part[0] = opt_pals[K - 1, 0]
    for kster in range(K - 2, -1, -1):
        opt_part[K - kster - 1] = opt_pals[kster, int(opt_part[K - kster - 2])]
    # derive initial conditions from the K+1 resulting partitions
    opt_part = np.concatenate(([0], opt_part, [N]))
    alpha = np.zeros(K + 1)
    mu = np.zeros(K + 1)
    sigma = np.zeros(K + 1)
    for a in range(K + 1):
        invec = x[int(opt_part[a]):int(opt_part[a + 1])]
        yinvec = y[int(opt_part[a]):int(opt_part[a + 1])]
        wwec = yinvec / (np.sum(yinvec))
        alpha[a] = np.sum(yinvec) / np.sum(y)
        mu[a] = np.sum(invec * wwec)
        sigma[a] = np.sqrt(np.sum(((invec - np.sum(invec * wwec)) ** 2) * wwec) - s_corr)
    return alpha, mu, sigma
def norm_pdf(x, mu, sigma):
    """
    Evaluate the normal (Gaussian) probability density function.

    Thin wrapper around :func:`scipy.stats.norm.pdf` that returns the
    densities as a single-row 2-D array.

    Parameters
    ----------
    x : array_like
        Points at which the density is evaluated.
    mu : float
        Mean of the normal distribution.
    sigma : float
        Standard deviation of the normal distribution.

    Returns
    -------
    y : ndarray of shape (1, N)
        Density values at `x` as a 2-D array with one row.

    Examples
    --------
    >>> y = norm_pdf(np.array([0, 1, 2]), mu=0, sigma=1)
    >>> print(y)
    """
    density = norm.pdf(x, loc=mu, scale=sigma)
    return np.reshape(density, (1, -1))
def find_thr(data, alpha, mi, sigma, idx, draw):
    """
    Determine the threshold separating informative and non-informative GMM components.

    Evaluates the mixture on a fine grid, builds the combined densities of
    the two component groups, and returns the x position between the two
    group peaks where the absolute difference of the densities is smallest.
    Optionally plots the group densities.

    Parameters
    ----------
    data : ndarray
        Data points defining the x-range over which the threshold is searched.
    alpha : ndarray of shape (K,)
        Weights of the GMM components.
    mi : ndarray of shape (K,)
        Means of the GMM components.
    sigma : ndarray of shape (K,)
        Standard deviations of the GMM components.
    idx : array_like of shape (K,)
        Cast to bool: True marks components treated as informative (group
        f2), False as non-informative (group f1). Note any nonzero value
        (including -1) becomes True.
    draw : bool
        If True, plots the group densities, their difference, and the threshold.

    Returns
    -------
    thr : float
        Threshold value along the data axis that best separates informative
        from non-informative components.

    Raises
    ------
    ValueError
        If no grid index lying between the two group peaks can be found.

    Notes
    -----
    - Uses a high-resolution grid (1e7 points) for precise threshold
      determination; `f_temp` occupies roughly 8e7 * K bytes.

    Examples
    --------
    >>> idx = np.array([False, True])
    >>> thr = find_thr(data, alpha, mu, sigma, idx, draw=True)
    >>> print("Threshold value:", thr)
    """
    idx = idx.astype(bool)
    # generate data with better precision
    K = len(mi)
    f_temp = np.zeros((int(1e7), K))
    x_temp = np.linspace(np.min(data), np.max(data), int(1e7))
    for k in range(K):
        # weighted density of component k on the fine grid
        f_temp[:, k] = alpha[k] * norm_pdf(x_temp, mi[k], sigma[k])
    # find GMM for informative and non-informative components
    f1 = np.sum(f_temp[:, ~idx], axis=1)
    f2 = np.sum(f_temp[:, idx], axis=1)
    # calculate difference of f1 and f2 and find its global minimum
    f_diff = np.abs(f1 - f2)
    ind1 = np.argmax(f1)  # peak of the non-informative group
    ind2 = np.argmax(f2)  # peak of the informative group
    ind = np.argsort(f_diff)
    # NOTE(review): this filter keeps indices with ind2 < ind < ind1, i.e. it
    # only matches when the informative peak lies LEFT of the non-informative
    # one; in the usual ordering (ind1 < ind2) it is empty and the fallback
    # below performs the actual search — confirm against the MATLAB original.
    ind = ind[(ind < ind1) & (ind > ind2)]
    if len(ind) == 0:
        # fallback: walk the candidates in order of increasing |f1 - f2|
        # until one lies between the peaks (ind1 <= thr_ind <= ind2)
        ind = np.argsort(f_diff)
        a = 0
        thr_ind = ind[a]
        while thr_ind < ind1 or thr_ind > ind2:
            a += 1
            if a >= len(ind):
                raise ValueError("Missing index")
            thr_ind = ind[a]
    else:
        thr_ind = ind[0]
    thr = x_temp[thr_ind]
    if draw:
        # diagnostic plot: the two group densities and their difference
        fig, axes = plt.subplots(2, 1, figsize=(8, 6))
        axes[0].plot(x_temp, f1, 'g', linewidth=2, label='f1')
        axes[0].plot(x_temp, f2, 'r', linewidth=2, label='f2')
        axes[0].set_xlabel("Variable")
        axes[0].set_ylabel("Model", fontsize=14)
        axes[0].legend()
        axes[1].plot(x_temp, f_diff, 'r', linewidth=2, label='f_diff')
        axes[1].set_xlabel("Variable")
        axes[1].set_ylabel("Models difference", fontsize=14)
        axes[1].set_title(f"Threshold: {thr:.4f}")
        plt.tight_layout()
        plt.show()
    return thr
def get_pixel_distribution(img):
    """
    Compute the pixel value distribution for each RGB channel of an image.

    Builds a 256-bin intensity histogram for each of the Red, Green, and
    Blue channels. Counts in the near-white bins (254 and 255) are zeroed
    to remove artificial or background artifacts.

    Parameters
    ----------
    img : ndarray of shape (M, N, 3)
        Input RGB image with values in the range [0, 255].

    Returns
    -------
    R : ndarray of shape (256,)
        Red-channel histogram (bins 254 and 255 zeroed).
    G : ndarray of shape (256,)
        Green-channel histogram (bins 254 and 255 zeroed).
    B : ndarray of shape (256,)
        Blue-channel histogram (bins 254 and 255 zeroed).

    Examples
    --------
    >>> R, G, B = get_pixel_distribution(image)
    >>> print("Red channel counts:", R)
    """
    # bin edges centered on the integer intensity values 0..255
    edges = np.arange(-0.5, 256.5, 1)
    histograms = []
    for channel in range(3):
        counts, _ = np.histogram(img[:, :, channel].ravel(), bins=edges)
        # remove counts from artificial white pixels
        counts[254:] = 0
        histograms.append(counts)
    return tuple(histograms)
def get_thr_image(img, thr_min=0.7 * 255, verbose=False):
    """
    Compute per-channel thresholds for an RGB image using the GaMRed algorithm.

    For each color channel (Red, Green, Blue), a threshold is estimated with
    :func:`GaMRed_hist` on the channel's intensity histogram. Whenever a
    GaMRed threshold falls below `thr_min`, the channel falls back to the
    two-step Otsu method. The per-channel histograms are returned as well.

    Parameters
    ----------
    img : ndarray of shape (M, N, 3)
        Input RGB image with pixel values in the range [0, 255].
    thr_min : float, optional
        Minimum allowable threshold; below it :func:`two_step_otsu` is used
        instead. Default is 0.7*255.
    verbose : bool, optional
        If True, prints a message whenever the Otsu fallback is taken.
        Default is False.

    Returns
    -------
    thr : dict
        Thresholds keyed by channel name: ``"R"``, ``"G"``, ``"B"``.
    R : ndarray of shape (256,)
        Histogram of Red channel pixel values.
    G : ndarray of shape (256,)
        Histogram of Green channel pixel values.
    B : ndarray of shape (256,)
        Histogram of Blue channel pixel values.

    Notes
    -----
    - ``K=2`` and ``SW=5`` are fixed parameters of the GaMRed call.

    Examples
    --------
    >>> thr, R, G, B = get_thr_image(image, thr_min=180, verbose=True)
    >>> print("Thresholds:", thr)
    """
    bin_centers = np.arange(256)
    n_components = 2  # fixed K for GaMRed
    min_sigma = 5  # fixed SW for GaMRed
    R, G, B = get_pixel_distribution(img)
    hist = {"R": R, "G": G, "B": B}
    # GaMRed threshold per channel (first element of the returned tuple)
    thr = {}
    for channel in ("R", "G", "B"):
        thr[channel] = GaMRed_hist(bin_centers, hist[channel], n_components, False, min_sigma)[0]
    # fall back to two-step Otsu where GaMRed gives a too-low threshold
    for channel, value in thr.items():
        if value < thr_min:
            thr[channel] = two_step_otsu(hist=hist[channel])
            if verbose:
                print(f"Too low threshold for {channel} channel, use Otsu instead.")
    return thr, R, G, B
def two_step_otsu(hist):
    """
    Compute a threshold using a two-step Otsu algorithm.

    The first Otsu pass thresholds the full histogram; the second pass is
    applied only to the upper segment of the histogram (above the first
    threshold) to refine the separation. Useful for images with uneven
    lighting or bimodal intensity distributions.

    Parameters
    ----------
    hist : ndarray of shape (256,)
        Histogram of pixel intensities (counts per bin).

    Returns
    -------
    thr : float
        Computed threshold value in the range [0, 255].

    Notes
    -----
    - Relies on :func:`otsuthresh` for the standard Otsu computation.
    - The second (normalized) threshold is mapped back onto the remaining
      upper intensity range and rounded to the nearest integer.
    """
    tmp, _ = otsuthresh(hist)
    tmp = int(tmp * 255)
    # BUGFIX: guard against tmp == 0 — hist[tmp - 1:] would wrap to
    # hist[-1:] and run the second Otsu pass on the last bin only
    start = max(tmp - 1, 0)
    tmp2, _ = otsuthresh(hist[start:])
    thr = np.round(tmp + (255 - tmp) * tmp2)
    return thr
def otsuthresh(counts):
    """
    Compute Otsu's threshold and effectiveness metric for a histogram.

    Python port of MATLAB's Otsu implementation: finds the threshold that
    maximizes the between-class variance of the histogram and reports an
    effectiveness metric quantifying the quality of the separation.

    Parameters
    ----------
    counts : array_like
        Histogram of pixel intensities (counts per bin).

    Returns
    -------
    t : float
        Normalized threshold value in the range [0, 1].
    em : float
        Effectiveness metric (ratio of between-class variance to total
        variance). Higher values indicate better separation.

    Notes
    -----
    - Counts are converted to probabilities; cumulative sums yield the
      between-class variance for every candidate threshold.
    - NaNs arising from division by zero are replaced with `-inf` so they
      never win the maximization.
    - If the histogram yields ties, the mean of all maximizing bins is used,
      matching MATLAB behavior.

    Examples
    --------
    >>> t, em = otsuthresh(hist)
    >>> print("Normalized Otsu threshold:", t)
    >>> print("Effectiveness metric:", em)

    References
    ----------
    Algorithm is described in \\ :footcite:p:`Otsu`
    .. footbibliography::
    """
    hist = np.asarray(counts, dtype=np.float64).ravel()
    nbins = hist.size
    bin_ids = np.arange(1, nbins + 1)
    # probabilities and their cumulative statistics
    prob = hist / hist.sum()
    omega = np.cumsum(prob)  # class-0 probability per candidate threshold
    mu = np.cumsum(prob * bin_ids)  # class-0 cumulative mean (unnormalized)
    mu_t = mu[-1]  # total mean
    with warnings.catch_warnings():
        # Divisions by zero produce NaNs that are handled right below
        warnings.simplefilter("ignore", category=RuntimeWarning)
        between_var = (mu_t * omega - mu) ** 2 / (omega * (1.0 - omega))
    # -inf keeps degenerate thresholds out of the maximization
    between_var = np.nan_to_num(between_var, nan=-np.inf)
    peak = between_var.max()
    if np.isfinite(peak) and peak > 0:
        # MATLAB-style tie handling: average all maximizing bin indices
        idx = np.mean(np.where(between_var == peak)[0]) + 1
        t = (idx - 1) / (nbins - 1)
        em = peak / (np.sum(prob * bin_ids ** 2) - mu_t ** 2)
    else:
        t = 0.0
        em = 0.0
    return t, em