# -*- coding: utf-8 -*-
"""
HDBSCAN: Hierarchical Density-Based Spatial Clustering
of Applications with Noise
"""
import numpy as np
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.metrics import pairwise_distances
from scipy.sparse import issparse
from sklearn.neighbors import KDTree, BallTree
from joblib import Memory
import six
from warnings import warn
from sklearn.utils import check_array
from joblib.parallel import cpu_count
from scipy.sparse import csgraph
from ._hdbscan_linkage import (single_linkage,
mst_linkage_core,
mst_linkage_core_vector,
label)
from ._hdbscan_tree import (condense_tree,
compute_stability,
get_clusters,
outlier_scores)
from ._hdbscan_reachability import (mutual_reachability,
sparse_mutual_reachability)
from ._hdbscan_boruvka import KDTreeBoruvkaAlgorithm, BallTreeBoruvkaAlgorithm
from .dist_metrics import DistanceMetric
from .plots import CondensedTree, SingleLinkageTree, MinimumSpanningTree
from .prediction import PredictionData
FAST_METRICS = (KDTree.valid_metrics + BallTree.valid_metrics +
['cosine', 'arccos'])
# Author: Leland McInnes <leland.mcinnes@gmail.com>
# Steve Astels <sastels@gmail.com>
# John Healy <jchealy@gmail.com>
#
# License: BSD 3 clause
from numpy import isclose
def _tree_to_labels(X, single_linkage_tree, min_cluster_size=10,
cluster_selection_method='eom',
allow_single_cluster=False,
match_reference_implementation=False,
cluster_selection_epsilon=0.0):
"""Converts a pretrained tree and cluster size into a
set of labels and probabilities.
"""
condensed_tree = condense_tree(single_linkage_tree,
min_cluster_size)
stability_dict = compute_stability(condensed_tree)
labels, probabilities, stabilities = get_clusters(condensed_tree,
stability_dict,
cluster_selection_method,
allow_single_cluster,
match_reference_implementation,
cluster_selection_epsilon)
return (labels, probabilities, stabilities, condensed_tree,
single_linkage_tree)
def _hdbscan_generic(X, min_samples=5, alpha=1.0, metric='minkowski', p=2,
leaf_size=None, gen_min_span_tree=False, **kwargs):
if metric == 'minkowski':
distance_matrix = pairwise_distances(X, metric=metric, p=p)
elif metric == 'arccos':
distance_matrix = pairwise_distances(X, metric='cosine', **kwargs)
elif metric == 'precomputed':
# Treating this case explicitly, instead of letting
# sklearn.metrics.pairwise_distances handle it,
# enables the usage of numpy.inf in the distance
# matrix to indicate missing distance information.
# TODO: Check if copying is necessary
distance_matrix = X.copy()
else:
distance_matrix = pairwise_distances(X, metric=metric, **kwargs)
if issparse(distance_matrix):
# raise TypeError('Sparse distance matrices not yet supported')
return _hdbscan_sparse_distance_matrix(distance_matrix, min_samples,
alpha, metric, p,
leaf_size, gen_min_span_tree,
**kwargs)
mutual_reachability_ = mutual_reachability(distance_matrix,
min_samples, alpha)
min_spanning_tree = mst_linkage_core(mutual_reachability_)
# Warn if the MST couldn't be constructed around the missing distances
if np.isinf(min_spanning_tree.T[2]).any():
warn('The minimum spanning tree contains edge weights with value '
'infinity. Potentially, you are missing too many distances '
'in the initial distance matrix for the given neighborhood '
'size.', UserWarning)
# mst_linkage_core does not generate a full minimal spanning tree
# If a tree is required then we must build the edges from the information
# returned by mst_linkage_core (i.e. just the order of points to be merged)
if gen_min_span_tree:
result_min_span_tree = min_spanning_tree.copy()
for index, row in enumerate(result_min_span_tree[1:], 1):
candidates = np.where(isclose(mutual_reachability_[int(row[1])],
row[2]))[0]
candidates = np.intersect1d(candidates,
min_spanning_tree[:index, :2].astype(
int))
candidates = candidates[candidates != row[1]]
assert len(candidates) > 0
row[0] = candidates[0]
else:
result_min_span_tree = None
# Sort edges of the min_spanning_tree by weight
min_spanning_tree = min_spanning_tree[np.argsort(min_spanning_tree.T[2]),
:]
# Convert edge list into standard hierarchical clustering format
single_linkage_tree = label(min_spanning_tree)
return single_linkage_tree, result_min_span_tree
def _hdbscan_sparse_distance_matrix(X, min_samples=5, alpha=1.0,
metric='minkowski', p=2, leaf_size=40,
gen_min_span_tree=False, **kwargs):
assert issparse(X)
# Check for connected component on X
if csgraph.connected_components(X, directed=False, return_labels=False) > 1:
raise ValueError('Sparse distance matrix has multiple connected '
'components!\nThat is, there exist groups of points '
'that are completely disjoint -- there are no distance '
'relations connecting them\n'
'Run hdbscan on each component.')
lil_matrix = X.tolil()
# Compute sparse mutual reachability graph
# if max_dist > 0, max distance to use when the reachability is infinite
max_dist = kwargs.get("max_dist", 0.)
mutual_reachability_ = sparse_mutual_reachability(lil_matrix,
min_points=min_samples,
max_dist=max_dist,
alpha=alpha)
# Check connected component on mutual reachability
# If more than one component, it means that even if the distance matrix X
# has one component, there exists with less than `min_samples` neighbors
if csgraph.connected_components(mutual_reachability_, directed=False,
return_labels=False) > 1:
raise ValueError(('There exists points with less than %s neighbors. '
'Ensure your distance matrix has non zeros values for '
'at least `min_sample`=%s neighbors for each points (i.e. K-nn graph), '
'or specify a `max_dist` to use when distances are missing.')
% (min_samples, min_samples))
# Compute the minimum spanning tree for the sparse graph
sparse_min_spanning_tree = csgraph.minimum_spanning_tree(
mutual_reachability_)
# Convert the graph to scipy cluster array format
nonzeros = sparse_min_spanning_tree.nonzero()
nonzero_vals = sparse_min_spanning_tree[nonzeros]
min_spanning_tree = np.vstack(nonzeros + (nonzero_vals,)).T
# Sort edges of the min_spanning_tree by weight
min_spanning_tree = min_spanning_tree[np.argsort(min_spanning_tree.T[2]),
:][0]
# Convert edge list into standard hierarchical clustering format
single_linkage_tree = label(min_spanning_tree)
if gen_min_span_tree:
return single_linkage_tree, min_spanning_tree
else:
return single_linkage_tree, None
def _hdbscan_prims_kdtree(X, min_samples=5, alpha=1.0,
metric='minkowski', p=2, leaf_size=40,
gen_min_span_tree=False, **kwargs):
if X.dtype != np.float64:
X = X.astype(np.float64)
# The Cython routines used require contiguous arrays
if not X.flags['C_CONTIGUOUS']:
X = np.array(X, dtype=np.double, order='C')
tree = KDTree(X, metric=metric, leaf_size=leaf_size, **kwargs)
# TO DO: Deal with p for minkowski appropriately
dist_metric = DistanceMetric.get_metric(metric, **kwargs)
# Get distance to kth nearest neighbour
core_distances = tree.query(X, k=min_samples,
dualtree=True,
breadth_first=True)[0][:, -1].copy(order='C')
# Mutual reachability distance is implicit in mst_linkage_core_vector
min_spanning_tree = mst_linkage_core_vector(X, core_distances, dist_metric,
alpha)
# Sort edges of the min_spanning_tree by weight
min_spanning_tree = min_spanning_tree[np.argsort(min_spanning_tree.T[2]),
:]
# Convert edge list into standard hierarchical clustering format
single_linkage_tree = label(min_spanning_tree)
if gen_min_span_tree:
warn('Cannot generate Minimum Spanning Tree; '
'the implemented Prim\'s does not produce '
'the full minimum spanning tree ', UserWarning)
return single_linkage_tree, None
def _hdbscan_prims_balltree(X, min_samples=5, alpha=1.0,
metric='minkowski', p=2, leaf_size=40,
gen_min_span_tree=False, **kwargs):
if X.dtype != np.float64:
X = X.astype(np.float64)
# The Cython routines used require contiguous arrays
if not X.flags['C_CONTIGUOUS']:
X = np.array(X, dtype=np.double, order='C')
tree = BallTree(X, metric=metric, leaf_size=leaf_size, **kwargs)
dist_metric = DistanceMetric.get_metric(metric, **kwargs)
# Get distance to kth nearest neighbour
core_distances = tree.query(X, k=min_samples,
dualtree=True,
breadth_first=True)[0][:, -1].copy(order='C')
# Mutual reachability distance is implicit in mst_linkage_core_vector
min_spanning_tree = mst_linkage_core_vector(X, core_distances, dist_metric,
alpha)
# Sort edges of the min_spanning_tree by weight
min_spanning_tree = min_spanning_tree[np.argsort(min_spanning_tree.T[2]),
:]
# Convert edge list into standard hierarchical clustering format
single_linkage_tree = label(min_spanning_tree)
if gen_min_span_tree:
warn('Cannot generate Minimum Spanning Tree; '
'the implemented Prim\'s does not produce '
'the full minimum spanning tree ', UserWarning)
return single_linkage_tree, None
def _hdbscan_boruvka_kdtree(X, min_samples=5, alpha=1.0,
metric='minkowski', p=2, leaf_size=40,
approx_min_span_tree=True,
gen_min_span_tree=False,
core_dist_n_jobs=4, **kwargs):
if leaf_size < 3:
leaf_size = 3
if core_dist_n_jobs < 1:
core_dist_n_jobs = max(cpu_count() + 1 + core_dist_n_jobs, 1)
if X.dtype != np.float64:
X = X.astype(np.float64)
tree = KDTree(X, metric=metric, leaf_size=leaf_size, **kwargs)
alg = KDTreeBoruvkaAlgorithm(tree, min_samples, metric=metric,
leaf_size=leaf_size // 3,
approx_min_span_tree=approx_min_span_tree,
n_jobs=core_dist_n_jobs, **kwargs)
min_spanning_tree = alg.spanning_tree()
# Sort edges of the min_spanning_tree by weight
row_order = np.argsort(min_spanning_tree.T[2])
min_spanning_tree = min_spanning_tree[row_order, :]
# Convert edge list into standard hierarchical clustering format
single_linkage_tree = label(min_spanning_tree)
if gen_min_span_tree:
return single_linkage_tree, min_spanning_tree
else:
return single_linkage_tree, None
def _hdbscan_boruvka_balltree(X, min_samples=5, alpha=1.0,
metric='minkowski', p=2, leaf_size=40,
approx_min_span_tree=True,
gen_min_span_tree=False,
core_dist_n_jobs=4, **kwargs):
if leaf_size < 3:
leaf_size = 3
if core_dist_n_jobs < 1:
core_dist_n_jobs = max(cpu_count() + 1 + core_dist_n_jobs, 1)
if X.dtype != np.float64:
X = X.astype(np.float64)
tree = BallTree(X, metric=metric, leaf_size=leaf_size, **kwargs)
alg = BallTreeBoruvkaAlgorithm(tree, min_samples, metric=metric,
leaf_size=leaf_size // 3,
approx_min_span_tree=approx_min_span_tree,
n_jobs=core_dist_n_jobs, **kwargs)
min_spanning_tree = alg.spanning_tree()
# Sort edges of the min_spanning_tree by weight
min_spanning_tree = min_spanning_tree[np.argsort(min_spanning_tree.T[2]),
:]
# Convert edge list into standard hierarchical clustering format
single_linkage_tree = label(min_spanning_tree)
if gen_min_span_tree:
return single_linkage_tree, min_spanning_tree
else:
return single_linkage_tree, None
def check_precomputed_distance_matrix(X):
"""Perform check_array(X) after removing infinite values (numpy.inf) from the given distance matrix.
"""
tmp = X.copy()
tmp[np.isinf(tmp)] = 1
check_array(tmp)
def hdbscan(X, min_cluster_size=5, min_samples=None, alpha=1.0, cluster_selection_epsilon=0.0,
metric='minkowski', p=2, leaf_size=40,
algorithm='best', memory=Memory(cachedir=None, verbose=0),
approx_min_span_tree=True, gen_min_span_tree=False,
core_dist_n_jobs=4,
cluster_selection_method='eom', allow_single_cluster=False,
match_reference_implementation=False, **kwargs):
"""Perform HDBSCAN clustering from a vector array or distance matrix.
Parameters
----------
X : array or sparse (CSR) matrix of shape (n_samples, n_features), or \
array of shape (n_samples, n_samples)
A feature array, or array of distances between samples if
``metric='precomputed'``.
min_cluster_size : int, optional (default=5)
The minimum number of samples in a group for that group to be
considered a cluster; groupings smaller than this size will be left
as noise.
min_samples : int, optional (default=None)
The number of samples in a neighborhood for a point
to be considered as a core point. This includes the point itself.
defaults to the min_cluster_size.
cluster_selection_epsilon: float, optional (default=0.0)
A distance threshold. Clusters below this value will be merged.
See [3]_ for more information.
alpha : float, optional (default=1.0)
A distance scaling parameter as used in robust single linkage.
See [2]_ for more information.
metric : string or callable, optional (default='minkowski')
The metric to use when calculating distance between instances in a
feature array. If metric is a string or callable, it must be one of
the options allowed by metrics.pairwise.pairwise_distances for its
metric parameter.
If metric is "precomputed", X is assumed to be a distance matrix and
must be square.
p : int, optional (default=2)
p value to use if using the minkowski metric.
leaf_size : int, optional (default=40)
Leaf size for trees responsible for fast nearest
neighbour queries.
algorithm : string, optional (default='best')
Exactly which algorithm to use; hdbscan has variants specialised
for different characteristics of the data. By default this is set
to ``best`` which chooses the "best" algorithm given the nature of
the data. You can force other options if you believe you know
better. Options are:
* ``best``
* ``generic``
* ``prims_kdtree``
* ``prims_balltree``
* ``boruvka_kdtree``
* ``boruvka_balltree``
memory : instance of joblib.Memory or string, optional
Used to cache the output of the computation of the tree.
By default, no caching is done. If a string is given, it is the
path to the caching directory.
approx_min_span_tree : bool, optional (default=True)
Whether to accept an only approximate minimum spanning tree.
For some algorithms this can provide a significant speedup, but
the resulting clustering may be of marginally lower quality.
If you are willing to sacrifice speed for correctness you may want
to explore this; in general this should be left at the default True.
gen_min_span_tree : bool, optional (default=False)
Whether to generate the minimum spanning tree for later analysis.
core_dist_n_jobs : int, optional (default=4)
Number of parallel jobs to run in core distance computations (if
supported by the specific algorithm). For ``core_dist_n_jobs``
below -1, (n_cpus + 1 + core_dist_n_jobs) are used.
cluster_selection_method : string, optional (default='eom')
The method used to select clusters from the condensed tree. The
standard approach for HDBSCAN* is to use an Excess of Mass algorithm
to find the most persistent clusters. Alternatively you can instead
select the clusters at the leaves of the tree -- this provides the
most fine grained and homogeneous clusters. Options are:
* ``eom``
* ``leaf``
allow_single_cluster : bool, optional (default=False)
By default HDBSCAN* will not produce a single cluster, setting this
to t=True will override this and allow single cluster results in
the case that you feel this is a valid result for your dataset.
(default False)
match_reference_implementation : bool, optional (default=False)
There exist some interpretational differences between this
HDBSCAN* implementation and the original authors reference
implementation in Java. This can result in very minor differences
in clustering results. Setting this flag to True will, at a some
performance cost, ensure that the clustering results match the
reference implementation.
**kwargs : optional
Arguments passed to the distance metric
Returns
-------
labels : ndarray, shape (n_samples, )
Cluster labels for each point. Noisy samples are given the label -1.
probabilities : ndarray, shape (n_samples, )
Cluster membership strengths for each point. Noisy samples are assigned
0.
cluster_persistence : array, shape (n_clusters, )
A score of how persistent each cluster is. A score of 1.0 represents
a perfectly stable cluster that persists over all distance scales,
while a score of 0.0 represents a perfectly ephemeral cluster. These
scores can be guage the relative coherence of the clusters output
by the algorithm.
condensed_tree : record array
The condensed cluster hierarchy used to generate clusters.
single_linkage_tree : ndarray, shape (n_samples - 1, 4)
The single linkage tree produced during clustering in scipy
hierarchical clustering format
(see http://docs.scipy.org/doc/scipy/reference/cluster.hierarchy.html).
min_spanning_tree : ndarray, shape (n_samples - 1, 3)
The minimum spanning as an edgelist. If gen_min_span_tree was False
this will be None.
References
----------
.. [1] Campello, R. J., Moulavi, D., & Sander, J. (2013, April).
Density-based clustering based on hierarchical density estimates.
In Pacific-Asia Conference on Knowledge Discovery and Data Mining
(pp. 160-172). Springer Berlin Heidelberg.
.. [2] Chaudhuri, K., & Dasgupta, S. (2010). Rates of convergence for the
cluster tree. In Advances in Neural Information Processing Systems
(pp. 343-351).
.. [3] Malzer, C., & Baum, M. (2019). HDBSCAN(ε^): An Alternative Cluster
Extraction Method for HDBSCAN. arxiv preprint 1911.02282.
"""
if min_samples is None:
min_samples = min_cluster_size
if type(min_samples) is not int or type(min_cluster_size) is not int:
raise ValueError('Min samples and min cluster size must be integers!')
if min_samples <= 0 or min_cluster_size <= 0:
raise ValueError('Min samples and Min cluster size must be positive'
' integers')
if min_cluster_size == 1:
raise ValueError('Min cluster size must be greater than one')
if type(cluster_selection_epsilon) is int:
cluster_selection_epsilon = float(cluster_selection_epsilon)
if type(cluster_selection_epsilon) is not float or cluster_selection_epsilon < 0.0:
raise ValueError('Epsilon must be a float value greater than or equal to 0!')
if not isinstance(alpha, float) or alpha <= 0.0:
raise ValueError('Alpha must be a positive float value greater than'
' 0!')
if leaf_size < 1:
raise ValueError('Leaf size must be greater than 0!')
if metric == 'minkowski':
if p is None:
raise TypeError('Minkowski metric given but no p value supplied!')
if p < 0:
raise ValueError('Minkowski metric with negative p value is not'
' defined!')
if match_reference_implementation:
min_samples = min_samples - 1
min_cluster_size = min_cluster_size + 1
approx_min_span_tree = False
if cluster_selection_method not in ('eom', 'leaf'):
raise ValueError('Invalid Cluster Selection Method: %s\n'
'Should be one of: "eom", "leaf"\n')
# Checks input and converts to an nd-array where possible
if metric != 'precomputed' or issparse(X):
X = check_array(X, accept_sparse='csr')
else:
# Only non-sparse, precomputed distance matrices are handled here
# and thereby allowed to contain numpy.inf for missing distances
check_precomputed_distance_matrix(X)
# Python 2 and 3 compliant string_type checking
if isinstance(memory, six.string_types):
memory = Memory(cachedir=memory, verbose=0)
size = X.shape[0]
min_samples = min(size - 1, min_samples)
if min_samples == 0:
min_samples = 1
if algorithm != 'best':
if metric != 'precomputed' and issparse(X) and metric != 'generic':
raise ValueError("Sparse data matrices only support algorithm 'generic'.")
if algorithm == 'generic':
(single_linkage_tree,
result_min_span_tree) = memory.cache(
_hdbscan_generic)(X, min_samples, alpha, metric,
p, leaf_size, gen_min_span_tree, **kwargs)
elif algorithm == 'prims_kdtree':
if metric not in KDTree.valid_metrics:
raise ValueError("Cannot use Prim's with KDTree for this"
" metric!")
(single_linkage_tree, result_min_span_tree) = memory.cache(
_hdbscan_prims_kdtree)(X, min_samples, alpha,
metric, p, leaf_size,
gen_min_span_tree, **kwargs)
elif algorithm == 'prims_balltree':
if metric not in BallTree.valid_metrics:
raise ValueError("Cannot use Prim's with BallTree for this"
" metric!")
(single_linkage_tree, result_min_span_tree) = memory.cache(
_hdbscan_prims_balltree)(X, min_samples, alpha,
metric, p, leaf_size,
gen_min_span_tree, **kwargs)
elif algorithm == 'boruvka_kdtree':
if metric not in BallTree.valid_metrics:
raise ValueError("Cannot use Boruvka with KDTree for this"
" metric!")
(single_linkage_tree, result_min_span_tree) = memory.cache(
_hdbscan_boruvka_kdtree)(X, min_samples, alpha,
metric, p, leaf_size,
approx_min_span_tree,
gen_min_span_tree,
core_dist_n_jobs, **kwargs)
elif algorithm == 'boruvka_balltree':
if metric not in BallTree.valid_metrics:
raise ValueError("Cannot use Boruvka with BallTree for this"
" metric!")
(single_linkage_tree, result_min_span_tree) = memory.cache(
_hdbscan_boruvka_balltree)(X, min_samples, alpha,
metric, p, leaf_size,
approx_min_span_tree,
gen_min_span_tree,
core_dist_n_jobs, **kwargs)
else:
raise TypeError('Unknown algorithm type %s specified' % algorithm)
else:
if issparse(X) or metric not in FAST_METRICS:
# We can't do much with sparse matrices ...
(single_linkage_tree, result_min_span_tree) = memory.cache(
_hdbscan_generic)(X, min_samples,
alpha, metric, p, leaf_size,
gen_min_span_tree, **kwargs)
elif metric in KDTree.valid_metrics:
# TO DO: Need heuristic to decide when to go to boruvka;
# still debugging for now
if X.shape[1] > 60:
(single_linkage_tree, result_min_span_tree) = memory.cache(
_hdbscan_prims_kdtree)(X, min_samples, alpha,
metric, p, leaf_size,
gen_min_span_tree, **kwargs)
else:
(single_linkage_tree, result_min_span_tree) = memory.cache(
_hdbscan_boruvka_kdtree)(X, min_samples, alpha,
metric, p, leaf_size,
approx_min_span_tree,
gen_min_span_tree,
core_dist_n_jobs, **kwargs)
else: # Metric is a valid BallTree metric
# TO DO: Need heuristic to decide when to go to boruvka;
# still debugging for now
if X.shape[1] > 60:
(single_linkage_tree, result_min_span_tree) = memory.cache(
_hdbscan_prims_balltree)(X, min_samples, alpha,
metric, p, leaf_size,
gen_min_span_tree, **kwargs)
else:
(single_linkage_tree, result_min_span_tree) = memory.cache(
_hdbscan_boruvka_balltree)(X, min_samples, alpha,
metric, p, leaf_size,
approx_min_span_tree,
gen_min_span_tree,
core_dist_n_jobs, **kwargs)
return _tree_to_labels(X,
single_linkage_tree,
min_cluster_size,
cluster_selection_method,
allow_single_cluster,
match_reference_implementation,
cluster_selection_epsilon) + \
(result_min_span_tree,)
# Inherits from sklearn
[docs]class HDBSCAN(BaseEstimator, ClusterMixin):
"""Perform HDBSCAN clustering from vector array or distance matrix.
HDBSCAN - Hierarchical Density-Based Spatial Clustering of Applications
with Noise. Performs DBSCAN over varying epsilon values and integrates
the result to find a clustering that gives the best stability over epsilon.
This allows HDBSCAN to find clusters of varying densities (unlike DBSCAN),
and be more robust to parameter selection.
Parameters
----------
min_cluster_size : int, optional (default=5)
The minimum size of clusters; single linkage splits that contain
fewer points than this will be considered points "falling out" of a
cluster rather than a cluster splitting into two new clusters.
min_samples : int, optional (default=None)
The number of samples in a neighbourhood for a point to be
considered a core point.
metric : string, or callable, optional (default='euclidean')
The metric to use when calculating distance between instances in a
feature array. If metric is a string or callable, it must be one of
the options allowed by metrics.pairwise.pairwise_distances for its
metric parameter.
If metric is "precomputed", X is assumed to be a distance matrix and
must be square.
p : int, optional (default=None)
p value to use if using the minkowski metric.
alpha : float, optional (default=1.0)
A distance scaling parameter as used in robust single linkage.
See [3]_ for more information.
cluster_selection_epsilon: float, optional (default=0.0)
A distance threshold. Clusters below this value will be merged.
See [5]_ for more information.
algorithm : string, optional (default='best')
Exactly which algorithm to use; hdbscan has variants specialised
for different characteristics of the data. By default this is set
to ``best`` which chooses the "best" algorithm given the nature of
the data. You can force other options if you believe you know
better. Options are:
* ``best``
* ``generic``
* ``prims_kdtree``
* ``prims_balltree``
* ``boruvka_kdtree``
* ``boruvka_balltree``
leaf_size: int, optional (default=40)
If using a space tree algorithm (kdtree, or balltree) the number
of points ina leaf node of the tree. This does not alter the
resulting clustering, but may have an effect on the runtime
of the algorithm.
memory : Instance of joblib.Memory or string (optional)
Used to cache the output of the computation of the tree.
By default, no caching is done. If a string is given, it is the
path to the caching directory.
approx_min_span_tree : bool, optional (default=True)
Whether to accept an only approximate minimum spanning tree.
For some algorithms this can provide a significant speedup, but
the resulting clustering may be of marginally lower quality.
If you are willing to sacrifice speed for correctness you may want
to explore this; in general this should be left at the default True.
gen_min_span_tree: bool, optional (default=False)
Whether to generate the minimum spanning tree with regard
to mutual reachability distance for later analysis.
core_dist_n_jobs : int, optional (default=4)
Number of parallel jobs to run in core distance computations (if
supported by the specific algorithm). For ``core_dist_n_jobs``
below -1, (n_cpus + 1 + core_dist_n_jobs) are used.
cluster_selection_method : string, optional (default='eom')
The method used to select clusters from the condensed tree. The
standard approach for HDBSCAN* is to use an Excess of Mass algorithm
to find the most persistent clusters. Alternatively you can instead
select the clusters at the leaves of the tree -- this provides the
most fine grained and homogeneous clusters. Options are:
* ``eom``
* ``leaf``
allow_single_cluster : bool, optional (default=False)
By default HDBSCAN* will not produce a single cluster, setting this
to True will override this and allow single cluster results in
the case that you feel this is a valid result for your dataset.
prediction_data : boolean, optional
Whether to generate extra cached data for predicting labels or
membership vectors few new unseen points later. If you wish to
persist the clustering object for later re-use you probably want
to set this to True.
(default False)
match_reference_implementation : bool, optional (default=False)
There exist some interpretational differences between this
HDBSCAN* implementation and the original authors reference
implementation in Java. This can result in very minor differences
in clustering results. Setting this flag to True will, at a some
performance cost, ensure that the clustering results match the
reference implementation.
**kwargs : optional
Arguments passed to the distance metric
Attributes
----------
labels_ : ndarray, shape (n_samples, )
Cluster labels for each point in the dataset given to fit().
Noisy samples are given the label -1.
probabilities_ : ndarray, shape (n_samples, )
The strength with which each sample is a member of its assigned
cluster. Noise points have probability zero; points in clusters
have values assigned proportional to the degree that they
persist as part of the cluster.
cluster_persistence_ : ndarray, shape (n_clusters, )
A score of how persistent each cluster is. A score of 1.0 represents
a perfectly stable cluster that persists over all distance scales,
while a score of 0.0 represents a perfectly ephemeral cluster. These
scores can be guage the relative coherence of the clusters output
by the algorithm.
condensed_tree_ : CondensedTree object
The condensed tree produced by HDBSCAN. The object has methods
for converting to pandas, networkx, and plotting.
single_linkage_tree_ : SingleLinkageTree object
The single linkage tree produced by HDBSCAN. The object has methods
for converting to pandas, networkx, and plotting.
minimum_spanning_tree_ : MinimumSpanningTree object
The minimum spanning tree of the mutual reachability graph generated
by HDBSCAN. Note that this is not generated by default and will only
be available if `gen_min_span_tree` was set to True on object creation.
Even then in some optimized cases a tre may not be generated.
outlier_scores_ : ndarray, shape (n_samples, )
Outlier scores for clustered points; the larger the score the more
outlier-like the point. Useful as an outlier detection technique.
Based on the GLOSH algorithm by Campello, Moulavi, Zimek and Sander.
prediction_data_ : PredictionData object
Cached data used for predicting the cluster labels of new or
unseen points. Necessary only if you are using functions from
``hdbscan.prediction`` (see
:func:`~hdbscan.prediction.approximate_predict`,
:func:`~hdbscan.prediction.membership_vector`,
and :func:`~hdbscan.prediction.all_points_membership_vectors`).
exemplars_ : list
A list of exemplar points for clusters. Since HDBSCAN supports
arbitrary shapes for clusters we cannot provide a single cluster
exemplar per cluster. Instead a list is returned with each element
of the list being a numpy array of exemplar points for a cluster --
these points are the "most representative" points of the cluster.
relative_validity_ : float
A fast approximation of the Density Based Cluster Validity (DBCV)
score [4]. The only differece, and the speed, comes from the fact
that this relative_validity_ is computed using the mutual-
reachability minimum spanning tree, i.e. minimum_spanning_tree_,
instead of the all-points minimum spanning tree used in the
reference. This score might not be an objective measure of the
goodness of clusterering. It may only be used to compare results
across different choices of hyper-parameters, therefore is only a
relative score.
References
----------
.. [1] Campello, R. J., Moulavi, D., & Sander, J. (2013, April).
Density-based clustering based on hierarchical density estimates.
In Pacific-Asia Conference on Knowledge Discovery and Data Mining
(pp. 160-172). Springer Berlin Heidelberg.
.. [2] Campello, R. J., Moulavi, D., Zimek, A., & Sander, J. (2015).
Hierarchical density estimates for data clustering, visualization,
and outlier detection. ACM Transactions on Knowledge Discovery
from Data (TKDD), 10(1), 5.
.. [3] Chaudhuri, K., & Dasgupta, S. (2010). Rates of convergence for the
cluster tree. In Advances in Neural Information Processing Systems
(pp. 343-351).
.. [4] Moulavi, D., Jaskowiak, P.A., Campello, R.J., Zimek, A. and
Sander, J., 2014. Density-Based Clustering Validation. In SDM
(pp. 839-847).
.. [5] Malzer, C., & Baum, M. (2019). HDBSCAN(ε^): An Alternative Cluster
Extraction Method for HDBSCAN. arxiv preprint 1911.02282.
"""
def __init__(self, min_cluster_size=5, min_samples=None, cluster_selection_epsilon=0.0,
metric='euclidean', alpha=1.0, p=None,
algorithm='best', leaf_size=40,
memory=Memory(cachedir=None, verbose=0),
approx_min_span_tree=True,
gen_min_span_tree=False,
core_dist_n_jobs=4,
cluster_selection_method='eom',
allow_single_cluster=False,
prediction_data=False,
match_reference_implementation=False, **kwargs):
self.min_cluster_size = min_cluster_size
self.min_samples = min_samples
self.alpha = alpha
self.cluster_selection_epsilon = cluster_selection_epsilon
self.metric = metric
self.p = p
self.algorithm = algorithm
self.leaf_size = leaf_size
self.memory = memory
self.approx_min_span_tree = approx_min_span_tree
self.gen_min_span_tree = gen_min_span_tree
self.core_dist_n_jobs = core_dist_n_jobs
self.cluster_selection_method = cluster_selection_method
self.allow_single_cluster = allow_single_cluster
self.match_reference_implementation = match_reference_implementation
self.prediction_data = prediction_data
self._metric_kwargs = kwargs
self._condensed_tree = None
self._single_linkage_tree = None
self._min_spanning_tree = None
self._raw_data = None
self._outlier_scores = None
self._prediction_data = None
self._relative_validity = None
def fit(self, X, y=None):
"""Perform HDBSCAN clustering from features or distance matrix.
Parameters
----------
X : array or sparse (CSR) matrix of shape (n_samples, n_features), or \
array of shape (n_samples, n_samples)
A feature array, or array of distances between samples if
``metric='precomputed'``.
Returns
-------
self : object
Returns self
"""
if self.metric != 'precomputed':
X = check_array(X, accept_sparse='csr')
self._raw_data = X
elif issparse(X):
# Handle sparse precomputed distance matrices separately
X = check_array(X, accept_sparse='csr')
else:
# Only non-sparse, precomputed distance matrices are allowed
# to have numpy.inf values indicating missing distances
check_precomputed_distance_matrix(X)
kwargs = self.get_params()
# prediction data only applies to the persistent model, so remove
# it from the keyword args we pass on the the function
kwargs.pop('prediction_data', None)
kwargs.update(self._metric_kwargs)
(self.labels_,
self.probabilities_,
self.cluster_persistence_,
self._condensed_tree,
self._single_linkage_tree,
self._min_spanning_tree) = hdbscan(X, **kwargs)
if self.prediction_data:
self.generate_prediction_data()
return self
def fit_predict(self, X, y=None):
"""Performs clustering on X and returns cluster labels.
Parameters
----------
X : array or sparse (CSR) matrix of shape (n_samples, n_features), or \
array of shape (n_samples, n_samples)
A feature array, or array of distances between samples if
``metric='precomputed'``.
Returns
-------
y : ndarray, shape (n_samples, )
cluster labels
"""
self.fit(X)
return self.labels_
def generate_prediction_data(self):
"""
Create data that caches intermediate results used for predicting
the label of new/unseen points. This data is only useful if
you are intending to use functions from ``hdbscan.prediction``.
"""
if self.metric in FAST_METRICS:
min_samples = self.min_samples or self.min_cluster_size
if self.metric in KDTree.valid_metrics:
tree_type = 'kdtree'
elif self.metric in BallTree.valid_metrics:
tree_type = 'balltree'
else:
warn('Metric {} not supported for prediction data!'.format(self.metric))
return
self._prediction_data = PredictionData(
self._raw_data, self.condensed_tree_, min_samples,
tree_type=tree_type, metric=self.metric,
**self._metric_kwargs
)
else:
warn('Cannot generate prediction data for non-vector'
'space inputs -- access to the source data rather'
'than mere distances is required!')
@property
def prediction_data_(self):
if self._prediction_data is None:
raise AttributeError('No prediction data was generated')
else:
return self._prediction_data
@property
def outlier_scores_(self):
if self._outlier_scores is not None:
return self._outlier_scores
else:
if self._condensed_tree is not None:
self._outlier_scores = outlier_scores(self._condensed_tree)
return self._outlier_scores
else:
raise AttributeError('No condensed tree was generated; try running fit first.')
@property
def condensed_tree_(self):
if self._condensed_tree is not None:
return CondensedTree(self._condensed_tree,
self.cluster_selection_method,
self.allow_single_cluster)
else:
raise AttributeError('No condensed tree was generated; try running fit first.')
@property
def single_linkage_tree_(self):
if self._single_linkage_tree is not None:
return SingleLinkageTree(self._single_linkage_tree)
else:
raise AttributeError('No single linkage tree was generated; try running fit'
' first.')
@property
def minimum_spanning_tree_(self):
if self._min_spanning_tree is not None:
if self._raw_data is not None:
return MinimumSpanningTree(self._min_spanning_tree,
self._raw_data)
else:
warn('No raw data is available; this may be due to using'
' a precomputed metric matrix. No minimum spanning'
' tree will be provided without raw data.')
return None
else:
raise AttributeError('No minimum spanning tree was generated.'
'This may be due to optimized algorithm variations that skip'
' explicit generation of the spanning tree.')
@property
def exemplars_(self):
if self._prediction_data is not None:
return self._prediction_data.exemplars
elif self.metric in FAST_METRICS:
self.generate_prediction_data()
return self._prediction_data.exemplars
else:
raise AttributeError('Currently exemplars require the use of vector input data'
'with a suitable metric. This will likely change in the '
'future, but for now no exemplars can be provided')
@property
def relative_validity_(self):
if self._relative_validity is not None:
return self._relative_validity
if not self.gen_min_span_tree:
raise AttributeError("Minimum spanning tree not present. " +
"Either HDBSCAN object was created with " +
"gen_min_span_tree=False or the tree was " +
"not generated in spite of it owing to " +
"internal optimization criteria.")
return
labels = self.labels_
sizes = np.bincount(labels + 1)
noise_size = sizes[0]
cluster_size = sizes[1:]
total = noise_size + np.sum(cluster_size)
num_clusters = len(cluster_size)
DSC = np.zeros(num_clusters)
min_outlier_sep = np.inf # only required if num_clusters = 1
correction_const = 2 # only required if num_clusters = 1
# Unltimately, for each Ci, we only require the
# minimum of DSPC(Ci, Cj) over all Cj != Ci.
# So let's call this value DSPC_wrt(Ci), i.e.
# density separation 'with respect to' Ci.
DSPC_wrt = np.ones(num_clusters) * np.inf
max_distance = 0
mst_df = self.minimum_spanning_tree_.to_pandas()
for edge in mst_df.iterrows():
label1 = labels[int(edge[1]['from'])]
label2 = labels[int(edge[1]['to'])]
length = edge[1]['distance']
max_distance = max(max_distance, length)
if label1 == -1 and label2 == -1:
continue
elif label1 == -1 or label2 == -1:
# If exactly one of the points is noise
min_outlier_sep = min(min_outlier_sep, length)
continue
if label1 == label2:
# Set the density sparseness of the cluster
# to the sparsest value seen so far.
DSC[label1] = max(length, DSC[label1])
else:
# Check whether density separations with
# respect to each of these clusters can
# be reduced.
DSPC_wrt[label1] = min(length, DSPC_wrt[label1])
DSPC_wrt[label2] = min(length, DSPC_wrt[label2])
# In case min_outlier_sep is still np.inf, we assign a new value to it.
# This only makes sense if num_clusters = 1 since it has turned out
# that the MR-MST has no edges between a noise point and a core point.
min_outlier_sep = max_distance if min_outlier_sep == np.inf else min_outlier_sep
# DSPC_wrt[Ci] might be infinite if the connected component for Ci is
# an "island" in the MR-MST. Whereas for other clusters Cj and Ck, the
# MR-MST might contain an edge with one point in Cj and ther other one
# in Ck. Here, we replace the infinite density separation of Ci by
# another large enough value.
#
# TODO: Think of a better yet efficient way to handle this.
correction = correction_const * (max_distance if num_clusters > 1 else min_outlier_sep)
DSPC_wrt[np.where(DSPC_wrt == np.inf)] = correction
V_index = [(DSPC_wrt[i] - DSC[i]) / max(DSPC_wrt[i], DSC[i]) for i in range(num_clusters)]
score = np.sum([(cluster_size[i] * V_index[i]) / total for i in range(num_clusters)])
self._relative_validity = score
return self._relative_validity