"""Numba JIT compiled implementation of clustering routines."""
import numba as nb
import numpy as np
from .distance import METRICS, get_metric_id
from .graph import connected_components, radius_graph
__all__ = ["DBSCAN", "dbscan"]
DBSCAN_DTYPE = (
("eps", nb.float32),
("min_samples", nb.int64),
("metric_id", nb.int64),
("p", nb.float32),
)
[docs]
@nb.experimental.jitclass(spec=DBSCAN_DTYPE) # type: ignore[call-arg]
class DBSCAN:
"""Class-version of the Numba-accelerate :func:`dbscan` function.
Attributes
----------
eps : float
Distance scale (determines neighborhood)
min_samples : int
Minimum number of neighbors (including oneself) to be considered
a core point
metric : str
Distance metric to be used to establish neighborhood
"""
def __init__(
self,
eps: float,
min_samples: int = 1,
metric: str = "euclidean",
p: float = 2.0,
) -> None:
"""Initialize the DBSCAN parameters.
Parameters
----------
eps : float
Distance scale (determines neighborhood)
min_samples : int
Minimum number of neighbors (including oneself) to be considered
a core point
metric : str
Distance metric to be used to establish neighborhood
p : float, default 2.
p-norm factor for the Minkowski metric, if used
"""
if eps < 0.0:
raise ValueError("Epsilon must be non-negative.")
if min_samples <= 0:
raise ValueError("Minimum number of samples must be positive.")
# For Euclidean, save time by using squared Euclidean
if metric == "euclidean":
metric = "sqeuclidean"
eps = eps * eps
# Store parameters
self.eps = eps
self.min_samples = min_samples
self.metric_id = get_metric_id(metric, p)
self.p = p
def fit_predict(self, x):
"""Runs DBSCAN on 3D points and returns the group assignments.
Parameters
----------
x : np.ndarray
(N, 3) array of point coordinates
eps : float
Distance below which two points are considered neighbors
min_samples : int
Minimum number of neighbors for a point to be a core point
metric : str, default 'euclidean'
Distance metric used to compute pdist
Returns
-------
np.ndarray
(N,) Group assignments
"""
# Produce a radius graph
edge_index = radius_graph(x, self.eps, self.metric_id, self.p)
# Build groups
return connected_components(
edge_index, len(x), self.min_samples, directed=False
)
[docs]
@nb.njit(cache=True)
def dbscan(
x: np.ndarray,
eps: float,
min_samples: int = 1,
metric_id: int = METRICS["euclidean"],
p: float = 2.0,
) -> np.ndarray:
"""Runs DBSCAN on 3D points and returns the group assignments.
Parameters
----------
x : np.ndarray
(N, 3) array of point coordinates
eps : float
Distance below which two points are considered neighbors
min_samples : int
Minimum number of neighbors for a point to be a core point
metric : str, default 'euclidean'
Distance metric used to compute pdist
p : float, default 2.
p-norm factor for the Minkowski metric, if used
Returns
-------
np.ndarray
(N,) Group assignments
"""
if eps < 0.0:
raise ValueError("Epsilon must be non-negative.")
if min_samples <= 0:
raise ValueError("Minimum number of samples must be positive.")
# Produce a radius graph
edge_index = radius_graph(x, eps, metric_id, p)
# Build groups
return connected_components(edge_index, len(x), min_samples, directed=False)