o
    ?HhK                  	   @   sD  d Z ddlZddlZddlmZ ddlmZ ddlm	Z	 ddl
mZmZmZ ddlmZ ddlmZ dd	lmZ dd
lmZmZ ddlmZmZ ddlmZ ddlmZ ddlmZm Z  ddl!m"Z"m#Z# eddd\Z$Z%ee$e%dd\Z$Z%e &e$Z$g dZ'dhdd e( D B Z)doddZ*ej+,dedd Z-dd  Z.ej+,d!g e#e"d"d# Z/d$d% Z0ej+,d&e'ej+,d'ed(d) Z1d*d+ Z2ej+,d,d-d.d/ Z3d0d1 Z4d2d3 Z5d4d5 Z6d6d7 Z7ej+,d8d9d:gd;d< Z8ej+,d=e#d>d? Z9ej+,d@e'dAdB Z:dCdD Z;dEdF Z<ej+,dGd'dHie=dIej>gej>dIggfd'dHidIdJgdJdIggfi dIdJgdKdLggfgdMdN Z?ej+,d=e#dOdP Z@ej+,d=e#dQdR ZAdSdT ZBdUdV ZCdWdX ZDej+,dYdZd[gej+,d\dd]gd^d_ ZEd`da ZFej+,dbdcddgdedf ZGej+,dgdhdigdjdk ZHej+,dld9d:gdmdn ZIdS )pzF
Tests for HDBSCAN clustering algorithm
Based on the DBSCAN test code
    N)stats)distance)HDBSCAN)CONDENSED_dtype_condense_tree_do_labelling)_OUTLIER_ENCODING)
make_blobs)fowlkes_mallows_score)_VALID_METRICSeuclidean_distances)BallTreeKDTree)StandardScaler)shuffle)assert_allcloseassert_array_equal)CSC_CONTAINERSCSR_CONTAINERS   
   )	n_samplesrandom_state   )r   )kd_tree	ball_treebruteautoc                 C   s   h | ]\}}|d  qS )label ).0_outr    r    b/home/air/sanwanet/gpt-api/venv/lib/python3.10/site-packages/sklearn/cluster/tests/test_hdbscan.py	<setcomp>&       r%   Gz?c                 C   s2   t t| t }|dksJ t| t|ksJ d S )N   )lensetOUTLIER_SETr
   y)labels	threshold
n_clustersr    r    r$   check_label_quality)   s   r0   outlier_typec                 C   s   t jt jd|  }dd dd d|  }t|  d }t|  d }t }|dg|d< ||g|d	< t |}|j|k	 \}t
|dd	g ||j|	 \}t
|dd	g ttdd	ttd
d }	t ||	 }
t
|
j|j|	  dS )O
    Tests if np.inf and np.nan data are each treated as special outliers.
    )infinitemissingc                 S   s   | |kS Nr    xr,   r    r    r$   <lambda>9   s    z#test_outlier_data.<locals>.<lambda>c                 S   s
   t | S r5   )npisnanr6   r    r    r$   r8   :   s   
 r   prob   r         r   N)r9   infnanr   Xcopyr   fitlabels_nonzeror   probabilities_listrange)r1   outlier
prob_checkr   r;   	X_outliermodelmissing_labels_idxmissing_probs_idxclean_indicesclean_modelr    r    r$   test_outlier_data/   s.   rQ   c                  C   s   t t} |  }tddd| }t| | t| d}tjt	|d tdddt W d   n1 s7w   Y  d}d| d	< d
| d< tjt	|d tdd|  W d   dS 1 saw   Y  dS )zy
    Tests that HDBSCAN works with precomputed distance matrices, and throws the
    appropriate errors when needed.
    precomputedT)metricrB   z*The precomputed distance matrix.*has shapematchNz'The precomputed distance matrix.*valuesr   )r   r<   r<   )r<   r   rS   )
r   rA   rB   r   fit_predictr   r0   pytestraises
ValueError)D
D_originalr-   msgr    r    r$   test_hdbscan_distance_matrixO   s   
"r^   sparse_constructorc                 C   sf   t t t}|t| }t| d}d|||k< | |}|	  t
dd|}t| dS )zA
    Tests that HDBSCAN works with sparse distance matrices.
    2           rR   rV   N)r   
squareformpdistrA   r9   maxr   scoreatpercentileflatteneliminate_zerosr   rW   r0   )r_   r[   r.   r-   r    r    r$   #test_hdbscan_sparse_distance_matrixg   s   rh   c                  C   s   t  t} t|  dS )z
    Tests that HDBSCAN works with feature array, including an arbitrary
    goodness of fit check. Note that the check is a simple heuristic.
    N)r   rW   rA   r0   r-   r    r    r$   test_hdbscan_feature_arrayy   s   rj   algorS   c                 C   s   t | dt}t| | dv rdS ttd}dttjd idt	tjd iddidt	tjd d	d

|d}t | ||d}|||  jvrhtt |t W d   dS 1 saw   Y  dS |dkrtt |t W d   dS 1 sw   Y  dS |t dS )z
    Tests that HDBSCAN works with the expected combinations of algorithms and
    metrics, or raises the expected errors.
    )	algorithm)r   r   N)r   r   Vr<   p   )rn   w)mahalanobis
seuclidean	minkowski
wminkowski)rl   rS   metric_paramsrt   )r   rW   rA   r0   r   r   r9   eyeshapeonesgetvalid_metricsrX   rY   rZ   rC   warnsFutureWarning)rk   rS   r-   ALGOS_TREESru   hdbr    r    r$   test_hdbscan_algorithms   s8   ""r   c                  C   s&   t  t} | d}t|dd dS )z
    Tests that HDBSCAN can generate a sufficiently accurate dbscan clustering.
    This test is more of a sanity check than a rigorous evaluation.
    333333?gq=
ףp?)r.   N)r   rC   rA   dbscan_clusteringr0   )	clustererr-   r    r    r$   test_dbscan_clustering   s   
r   cut_distance)皙?      ?r<   c                 C   s   t d d }t d d }t }tjdg|d< dtjg|d< tjtjg|d< t |}|j| d}t	||k}t
|ddg t	||k}t
|dg tttd	t||  }t || }	|	j| d}
t
|
||  d
S )r2   r4   r   r3   r<   r   ro   r=   )r   r   N)r   rA   rB   r9   r?   r@   r   rC   r   flatnonzeror   rG   r*   rH   )r   missing_labelinfinite_labelrK   rL   r-   rM   infinite_labels_idx	clean_idxrP   clean_labelsr    r    r$   #test_dbscan_clustering_outlier_data   s    r   c                  C   s.   t ddttjd idt} t|  dS )z4
    Tests that HDBSCAN using `BallTree` works.
    rr   rm   r<   )rS   ru   N)r   r9   rx   rA   rw   rW   r0   ri   r    r    r$   !test_hdbscan_best_balltree_metric   s   r   c                  C   s.   t ttd dt} t| tsJ dS )z
    Tests that HDBSCAN correctly does not generate a valid cluster when the
    `min_cluster_size` is too large for the data.
    r<   min_cluster_sizeN)r   r)   rA   rW   r*   issubsetr+   ri   r    r    r$   test_hdbscan_no_clusters   s   r   c                  C   s\   t dttdD ]#} t| dt}dd |D }t|dkr+tt|| ks+J qdS )zb
    Test that the smallest non-noise cluster has at least `min_cluster_size`
    many points
    ro   r<   r   c                 S   s   g | ]}|d kr|qS )r   r    )r!   r   r    r    r$   
<listcomp>   r&   z1test_hdbscan_min_cluster_size.<locals>.<listcomp>r   N)rH   r)   rA   r   rW   r9   minbincount)r   r-   true_labelsr    r    r$   test_hdbscan_min_cluster_size   s   r   c                  C   s"   t j} t| dt}t| dS )zA
    Tests that HDBSCAN works when passed a callable metric.
    rV   N)r   	euclideanr   rW   rA   r0   )rS   r-   r    r    r$   test_hdbscan_callable_metric   s   r   treer   r   c                 C   sN   t d| d}d}tjt|d |t W d   dS 1 s w   Y  dS )z
    Tests that HDBSCAN correctly raises an error when passing precomputed data
    while requesting a tree-based algorithm.
    rR   rS   rl   z%precomputed is not a valid metric forrT   N)r   rX   rY   rZ   rC   rA   )r   r~   r]   r    r    r$   "test_hdbscan_precomputed_non_brute   s
   "r   csr_containerc           	      C   s  t  tj}t| | t}| }t  |j}t|| tjdftj	dffD ]7\}}t }||d< t  |j}t| |d t
| d ksLJ | }||d< t  |j}t|| q)d}tjt|d t dd	d
| W d   dS 1 sw   Y  dS )z
    Tests that HDBSCAN works correctly when passing sparse feature data.
    Evaluates correctness by comparing against the same data passed as a dense
    array.
    r3   r4   r   r   r   r   z4Sparse data matrices only support algorithm `brute`.rT   r   r   r   N)r   rC   rA   rD   r0   rB   r   r9   r?   r@   r   rX   rY   rZ   )	r   dense_labels	_X_sparseX_sparsesparse_labelsoutlier_valr1   X_denser]   r    r    r$   test_hdbscan_sparse
  s(   
"r   rl   c                 C   s   ddg}t dd|dd\}}tdd|}t||j|jD ]\}}}t||d	d
d t||d	d
d qt| dtjd dt}|jjd dksKJ |jjd dksUJ dS )zj
    Tests that HDBSCAN centers are calculated and stored properly, and are
    accurate to the data.
    )ra   ra   )      @r   i  r   r   )r   r   centerscluster_stdboth)store_centersr<   g?)rtolatol)rl   r   r   N)	r	   r   rC   zip
centroids_medoids_r   rA   rw   )rl   r   Hr"   r~   centercentroidmedoidr    r    r$   test_hdbscan_centers-  s   r   c                  C   s   t jd} | dd}tddddd|}t j|dd	\}}t|dks(J ||d
k dks2J tdddddd|}t j|dd	\}}t|dksOJ ||d
k dksYJ dS )zS
    Tests that HDBSCAN single-cluster selection with epsilon works correctly.
    r      ro   r=   ra   eomT)r   cluster_selection_epsiloncluster_selection_methodallow_single_cluster)return_countsr      g
ףp=
?r   )r   r   r   r   rl   N)r9   randomRandomStaterandr   rW   uniquer)   )rngno_structurer-   unique_labelscountsr    r    r$   .test_hdbscan_allow_single_cluster_with_epsilonC  s2   r   c                  C   sx   ddgddgddgddgg} t d| g ddd\}}t |j}tt|td	|v  }|d
ks3J t||dk dS )z
    Validate that HDBSCAN can properly cluster this difficult synthetic
    dataset. Note that DBSCAN fails on this (see HDBSCAN plotting
    example)
    g333333g333333?r(   i  )皙?gffffff?皙?r   r   )r   r   r   r   r      r'   N)r	   r   rC   rD   r)   r*   intr
   )r   rA   r,   r-   r/   r    r    r$   test_hdbscan_better_than_dbscand  s   
r   z	kwargs, XrR   r<   ro   r(   r   c                 C   s   t dddi||  dS )zo
    Tests that HDBSCAN works correctly for array-likes and precomputed inputs
    with non-finite points.
    min_samplesr<   Nr    )r   rC   )rA   kwargsr    r    r$   test_hdbscan_usable_inputsx  s   r   c                 C   sV   | t d}d}tjt|d tdd| W d   dS 1 s$w   Y  dS )zd
    Tests that HDBSCAN raises the correct error when there are too few
    non-zero distances.
    )r   r   z#There exists points with fewer thanrT   rR   rV   N)r9   zerosrX   rY   rZ   r   rC   r   rA   r]   r    r    r$   -test_hdbscan_sparse_distances_too_few_nonzero  s
   "r   c                 C   s   t d}d|ddddf< d|ddddf< ||j }| |}d}tjt|d tdd	| W d   dS 1 s?w   Y  dS )
zu
    Tests that HDBSCAN raises the correct error when the distance matrix
    has multiple connected components.
    )   r   r<   Nr=      z2HDBSCAN cannot be perfomed on a disconnected graphrT   rR   rV   )r9   r   TrX   rY   rZ   r   rC   r   r    r    r$   0test_hdbscan_sparse_distances_disconnected_graph  s   

"r   c                  C   s   dd } d}t jt|d td| dt W d   n1 s!w   Y  t jt|d td| dt W d   n1 sAw   Y  tttj	tt
j	 }t|d	kr}t jt|d td|d	 dt W d   dS 1 svw   Y  dS dS )
zR
    Tests that HDBSCAN correctly raises an error for invalid metric choices.
    c                 S   s   | S r5   r    )r7   r    r    r$   r8     s    z2test_hdbscan_tree_invalid_metric.<locals>.<lambda>zV.* is not a valid metric for a .*-based algorithm\. Please select a different metric\.rT   r   )rl   rS   Nr   r   )rX   rY   rZ   r   rC   rA   rG   r*   r   rz   r   r)   )metric_callabler]   metrics_not_kdr    r    r$    test_hdbscan_tree_invalid_metric  s   "r   c                  C   sT   t ttd d} d}tjt|d | t W d   dS 1 s#w   Y  dS )zx
    Tests that HDBSCAN correctly raises an error when setting `min_samples`
    larger than the number of samples.
    r<   )r   z min_samples (.*) must be at mostrT   N)r   r)   rA   rX   rY   rZ   rC   )r~   r]   r    r    r$   !test_hdbscan_too_many_min_samples  s
   "r   c                  C   s^   t  } tj| d< d}tdd}tjt|d ||  W d   dS 1 s(w   Y  dS )zu
    Tests that HDBSCAN correctly raises an error when providing precomputed
    distances with `np.nan` values.
    r   z(np.nan values found in precomputed-denserR   rV   rT   N)	rA   rB   r9   r@   r   rX   rY   rZ   rC   )X_nanr]   r~   r    r    r$   "test_hdbscan_precomputed_dense_nan  s   

"r   r   TFepsilonr   c                    s   d}t || ddgddgddggd\}t |}t|j|jd}|d |d |d h}|d d|d d	|d di}t|||||d
fddttD   fddttD }	t	
|	j}
t|
 dS )zR
    Tests that the `_do_labelling` helper function correctly assigns labels.
    0   r   r   )r   r   r   ro   r(   r   r<   condensed_treeclusterscluster_label_mapr   r   c                    s$   i | ]}|t  |kd  d  qS )r   )r9   wherer!   _y)r,   r    r$   
<dictcomp>  s   $ z+test_labelling_distinct.<locals>.<dictcomp>c                    s   i | ]	}| |  qS r    r    r   )first_with_labelr-   r    r$   r     s    N)r	   r   rC   r   _single_linkage_tree_r   r   rG   r*   r9   	vectorizery   r   )global_random_seedr   r   r   rA   estr   r   r   y_to_labelsaligned_targetr    )r   r-   r,   r$   test_labelling_distinct  s4   
r   c                  C   s   d} d}t jdd|dfddd|dfddgtd	}t|| h| d| d did
dd}|d dk }t|t|dkks:J t|| h| d| d did
dd}|d |k }t|t|dkks\J dS )z
    Tests that the `_do_labelling` helper function correctly thresholds the
    incoming lambda values given various `cluster_selection_epsilon` values.
    r=   g      ?ro   r<   )r=   r<   r   r<   r   )r=   r(   r   r<   )r=   r   r   r<   )dtypeTr   valuer   N)r9   arrayr   r   sum)r   
MAX_LAMBDAr   r-   	num_noiser    r    r$   test_labelling_thresholding  s:   



r   r   r   r   c                 C   sh   t jd}|d}t|}d}tjt|d td| d| W d   dS 1 s-w   Y  dS )zCheck that we raise an error if the centers are requested together with
    a precomputed input matrix.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/27893
    r   )d   ro   z>Cannot store centers when using a precomputed distance matrix.rT   rR   )rS   r   N)	r9   r   r   r   rX   rY   rZ   r   rC   )r   r   rA   X_disterr_msgr    r    r$   0test_hdbscan_error_precomputed_and_store_centers%  s   
"r   
valid_algor   r   c                 C   s   t d| dt dS )zTest that HDBSCAN works with the "cosine" metric when the algorithm is set
    to "brute" or "auto".

    Non-regression test for issue #28631
    cosiner   N)r   rW   rA   )r   r    r    r$   *test_hdbscan_cosine_metric_valid_algorithm5  s   r   invalid_algoc                 C   sJ   t d| d}tjtdd |t W d   dS 1 sw   Y  dS )zTest that HDBSCAN raises an informative error is raised when an unsupported
    algorithm is used with the "cosine" metric.
    r   r   zcosine is not a valid metricrT   N)r   rX   rY   rZ   rW   rA   )r   hdbscanr    r    r$   ,test_hdbscan_cosine_metric_invalid_algorithm?  s   "r   )r'   )J__doc__numpyr9   rX   scipyr   scipy.spatialr   sklearn.clusterr   sklearn.cluster._hdbscan._treer   r   r    sklearn.cluster._hdbscan.hdbscanr   sklearn.datasetsr	   sklearn.metricsr
   sklearn.metrics.pairwiser   r   sklearn.neighborsr   r   sklearn.preprocessingr   sklearn.utilsr   sklearn.utils._testingr   r   sklearn.utils.fixesr   r   rA   r,   fit_transform
ALGORITHMSitemsr+   r0   markparametrizerQ   r^   rh   rj   r   r   r   r   r   r   r   r   r   r   r   r   r   r?   r   r   r   r   r   r   r   r   r   r   r   r    r    r    r$   <module>   s    


'

		

"
! 


$)

	