Source code for spine.math.cluster

"""Numba JIT compiled implementation of clustering routines."""

import numba as nb
import numpy as np

from .distance import METRICS, get_metric_id
from .graph import connected_components, radius_graph

__all__ = ["DBSCAN", "dbscan"]

DBSCAN_DTYPE = (
    ("eps", nb.float32),
    ("min_samples", nb.int64),
    ("metric_id", nb.int64),
    ("p", nb.float32),
)


[docs] @nb.experimental.jitclass(spec=DBSCAN_DTYPE) # type: ignore[call-arg] class DBSCAN: """Class-version of the Numba-accelerate :func:`dbscan` function. Attributes ---------- eps : float Distance scale (determines neighborhood) min_samples : int Minimum number of neighbors (including oneself) to be considered a core point metric : str Distance metric to be used to establish neighborhood """ def __init__( self, eps: float, min_samples: int = 1, metric: str = "euclidean", p: float = 2.0, ) -> None: """Initialize the DBSCAN parameters. Parameters ---------- eps : float Distance scale (determines neighborhood) min_samples : int Minimum number of neighbors (including oneself) to be considered a core point metric : str Distance metric to be used to establish neighborhood p : float, default 2. p-norm factor for the Minkowski metric, if used """ if eps < 0.0: raise ValueError("Epsilon must be non-negative.") if min_samples <= 0: raise ValueError("Minimum number of samples must be positive.") # For Euclidean, save time by using squared Euclidean if metric == "euclidean": metric = "sqeuclidean" eps = eps * eps # Store parameters self.eps = eps self.min_samples = min_samples self.metric_id = get_metric_id(metric, p) self.p = p def fit_predict(self, x): """Runs DBSCAN on 3D points and returns the group assignments. Parameters ---------- x : np.ndarray (N, 3) array of point coordinates eps : float Distance below which two points are considered neighbors min_samples : int Minimum number of neighbors for a point to be a core point metric : str, default 'euclidean' Distance metric used to compute pdist Returns ------- np.ndarray (N,) Group assignments """ # Produce a radius graph edge_index = radius_graph(x, self.eps, self.metric_id, self.p) # Build groups return connected_components( edge_index, len(x), self.min_samples, directed=False )
[docs] @nb.njit(cache=True) def dbscan( x: np.ndarray, eps: float, min_samples: int = 1, metric_id: int = METRICS["euclidean"], p: float = 2.0, ) -> np.ndarray: """Runs DBSCAN on 3D points and returns the group assignments. Parameters ---------- x : np.ndarray (N, 3) array of point coordinates eps : float Distance below which two points are considered neighbors min_samples : int Minimum number of neighbors for a point to be a core point metric : str, default 'euclidean' Distance metric used to compute pdist p : float, default 2. p-norm factor for the Minkowski metric, if used Returns ------- np.ndarray (N,) Group assignments """ if eps < 0.0: raise ValueError("Epsilon must be non-negative.") if min_samples <= 0: raise ValueError("Minimum number of samples must be positive.") # Produce a radius graph edge_index = radius_graph(x, eps, metric_id, p) # Build groups return connected_components(edge_index, len(x), min_samples, directed=False)