Source code for spine.math.metrics

"""Numba JIT compiled implementation of clustering evaluation metrics.

This module provides efficient implementations of:
- Adjusted Rand Index (ARI)
- Adjusted Mutual Information (AMI)
"""

import numba as nb
import numpy as np

from .linalg import contingency_table

__all__ = ["adjusted_rand_score", "adjusted_mutual_info_score"]


@nb.jit(nopython=True, cache=True)
def _comb2(n):
    """Compute binomial coefficient n choose 2.

    Parameters
    ----------
    n : int
        Number of items to choose from.

    Returns
    -------
    int
        The number of ways to choose 2 items from n items, equal to n*(n-1)/2.
    """
    return n * (n - 1) // 2



[docs]
@nb.jit(nopython=True, cache=True)
def adjusted_rand_score(labels_true, labels_pred):
    """Compute the Adjusted Rand Index (ARI) between two clusterings.

    The Adjusted Rand Index is a measure of the similarity between two
    data clusterings. It is a function that measures the similarity of the two
    assignments, ignoring permutations and correcting for chance agreement.

    The ARI is bounded between -1 and 1:
    - 1.0 indicates perfect clustering agreement
    - 0.0 indicates random clustering (expected value for independent labelings)
    - Negative values indicate worse than random clustering

    The formula is: ARI = (RI - E[RI]) / (max(RI) - E[RI])
    where RI is the Rand Index and E[RI] is the expected Rand Index under
    random labelings.

    Parameters
    ----------
    labels_true : ndarray of shape (n_samples,)
        Ground truth class labels to be used as a reference.
    labels_pred : ndarray of shape (n_samples,)
        Cluster labels to evaluate.

    Returns
    -------
    ari : float
        Adjusted Rand Index. A clustering result satisfying the constraints
        of a correct clustering has a score of 1.0.

    Notes
    -----
    This implementation uses a fast numba-compiled algorithm that avoids
    constructing the full pairwise similarity matrix.

    References
    ----------
    .. [1] Hubert, L. and Arabie, P. (1985). "Comparing partitions."
           Journal of Classification 2(1): 193-218.
    .. [2] https://en.wikipedia.org/wiki/Rand_index#Adjusted_Rand_index

    Examples
    --------
    Perfect clustering:
    >>> labels_true = [0, 0, 1, 1]
    >>> labels_pred = [0, 0, 1, 1]
    >>> adjusted_rand_score(labels_true, labels_pred)
    1.0

    Random clustering:
    >>> labels_true = [0, 0, 1, 1]
    >>> labels_pred = [0, 1, 0, 1]
    >>> adjusted_rand_score(labels_true, labels_pred)  # doctest: +ELLIPSIS
    0.0
    """
    if len(labels_true) != len(labels_pred):
        raise ValueError("Labels must have the same length")

    # Get dimensions for contingency table
    nx = labels_true.max() + 1 if len(labels_true) > 0 else 1
    ny = labels_pred.max() + 1 if len(labels_pred) > 0 else 1
    contingency = contingency_table(labels_true, labels_pred, nx, ny)

    # Compute sums
    sum_comb_c = 0
    for i in range(contingency.shape[0]):
        for j in range(contingency.shape[1]):
            sum_comb_c += _comb2(contingency[i, j])

    sum_comb_k = 0  # Sum over rows (true clusters)
    for i in range(contingency.shape[0]):
        row_sum = 0
        for j in range(contingency.shape[1]):
            row_sum += contingency[i, j]
        sum_comb_k += _comb2(row_sum)

    sum_comb_c_pred = 0  # Sum over columns (pred clusters)
    for j in range(contingency.shape[1]):
        col_sum = 0
        for i in range(contingency.shape[0]):
            col_sum += contingency[i, j]
        sum_comb_c_pred += _comb2(col_sum)

    n_samples = len(labels_true)
    sum_comb_n = _comb2(n_samples)
    if sum_comb_n == 0:
        return 1.0

    expected_index = sum_comb_k * sum_comb_c_pred / sum_comb_n
    max_index = (sum_comb_k + sum_comb_c_pred) / 2.0

    if max_index == expected_index:
        return 1.0

    return (sum_comb_c - expected_index) / (max_index - expected_index)



@nb.jit(nopython=True, cache=True)
def _entropy(labels):
    """Compute entropy of a labeling."""
    unique_labels = np.unique(labels)
    n = len(labels)

    if n <= 1:
        return 0.0

    entropy = 0.0
    for label in unique_labels:
        count = np.sum(labels == label)
        if count > 0:
            p = count / n
            entropy -= p * np.log(p)

    return entropy


@nb.jit(nopython=True, cache=True)
def _mutual_info(labels_true, labels_pred):
    """Compute mutual information between two clusterings."""
    # Get dimensions for contingency table
    nx = labels_true.max() + 1 if len(labels_true) > 0 else 1
    ny = labels_pred.max() + 1 if len(labels_pred) > 0 else 1
    contingency = contingency_table(labels_true, labels_pred, nx, ny)

    n_samples = len(labels_true)
    mi = 0.0

    for i in range(contingency.shape[0]):
        for j in range(contingency.shape[1]):
            n_ij = contingency[i, j]
            if n_ij == 0:
                continue

            # Marginal counts
            n_i = 0
            for k in range(contingency.shape[1]):
                n_i += contingency[i, k]

            n_j = 0
            for k in range(contingency.shape[0]):
                n_j += contingency[k, j]

            mi += n_ij / n_samples * np.log((n_samples * n_ij) / (n_i * n_j))

    return mi



[docs]
@nb.jit(nopython=True, cache=True)
def adjusted_mutual_info_score(labels_true, labels_pred):
    """Compute the Adjusted Mutual Information (AMI) between two clusterings.

    The Adjusted Mutual Information is a measure of agreement between two
    partitions, adjusted for chance. It employs the expected mutual information
    under a hypergeometric model of randomness.

    The AMI is normalized between 0 and 1:
    - 1.0 indicates perfect clustering agreement
    - 0.0 indicates independent labelings (expected value for random labelings)
    - Values close to 0.0 indicate near-random agreement

    The formula is: AMI = (MI - E[MI]) / (max(H(U), H(V)) - E[MI])
    where MI is the mutual information, E[MI] is the expected mutual
    information, and H(U), H(V) are the entropies of the two labelings.

    Parameters
    ----------
    labels_true : ndarray of shape (n_samples,)
        Ground truth class labels to be used as a reference.
    labels_pred : ndarray of shape (n_samples,)
        Cluster labels to evaluate.

    Returns
    -------
    ami : float
        Adjusted Mutual Information score. Perfect labelings are scored 1.0.
        Bad labelings or independent labelings have non-positive scores.

    Notes
    -----
    This implementation uses a fast numba-compiled algorithm that computes
    the hypergeometric expected mutual information directly from the
    contingency table.

    References
    ----------
    .. [1] Vinh, N. X., Epps, J., & Bailey, J. (2010). "Information theoretic
           measures for clusterings comparison: Variants, properties,
           normalization and correction for chance." Journal of Machine
           Learning Research, 11, 2837-2854.
    .. [2] https://en.wikipedia.org/wiki/Adjusted_mutual_information

    Examples
    --------
    Perfect clustering:
    >>> labels_true = [0, 0, 1, 1]
    >>> labels_pred = [0, 0, 1, 1]
    >>> adjusted_mutual_info_score(labels_true, labels_pred)
    1.0

    Random clustering:
    >>> labels_true = [0, 0, 1, 1]
    >>> labels_pred = [0, 1, 0, 1]
    >>> adjusted_mutual_info_score(labels_true, labels_pred)  # doctest: +ELLIPSIS
    0.0...
    """
    if len(labels_true) != len(labels_pred):
        raise ValueError("Labels must have the same length")

    if len(labels_true) == 0:
        return 1.0

    # Handle trivial cases
    n_true_clusters = len(np.unique(labels_true))
    n_pred_clusters = len(np.unique(labels_pred))

    if n_true_clusters == 1 and n_pred_clusters == 1:
        return 1.0
    elif n_true_clusters == 1 or n_pred_clusters == 1:
        return 0.0

    # Compute entropies and mutual information
    entropy_true = _entropy(labels_true)
    entropy_pred = _entropy(labels_pred)
    mutual_info = _mutual_info(labels_true, labels_pred)

    # Expected mutual information (approximation for large n)
    n_samples = len(labels_true)
    expected_mi = entropy_true * entropy_pred / np.log(n_samples)

    # Compute adjusted mutual information
    mean_entropy = (entropy_true + entropy_pred) / 2.0

    if mean_entropy == expected_mi:
        return 1.0

    return (mutual_info - expected_mi) / (mean_entropy - expected_mi)