o
    `^h6                    @   s  d dl Z d dlZd dlZd dlZd dlmZ d dlmZ d dl	m
Z
mZ d dlmZ d dlmZmZmZ d dlmZ dd	 Zejd
g ddd Zejd
g ddd Zejdejejejgejdejejejgdd Zejdejejejgdd Zdd Zdd Zdd Z dd Z!dd Z"ejjd g d!g d"ge#g d#g d$gej#g d%g d&ge$d'ej#g d(d)ej%d*gge$d'ej#g d(d)e&d+d*gge$d'ej#g d,g d-ge$d'ej#g d.d)ej%dgge$d'ej#g d.d)e&d+dgge$d'gg d/d0d1d2 Z'ejd
g dejd3d4d5gejd6dd7gd8d9 Z(ejd3d4d5gejd:d;d<gd=d<gd;d<ggg d>g d?g d>gfd@d)gdAd)gdBdCgdAd)ggg dDg dEg dFgfgdGdH Z)dIdJ Z*ejd6g dKejdLg dKdMdN Z+ejdOdPdQgejd d=d;ge#dRdSggdTdU Z,ejdOdPdQgdVdW Z-ejjdXdYd<gdZd<ggdYdZgd<ggej.fe#d=d;gd[d;ggd=d[gd;ggej/fej#d\d*gd]d*gge$d'd\d]gd*ggej.fe#d\d*gd]d*ggd\d]gd*ggej0fe#d=d;gej%d;ggd=ej%gd;ggejfej#d\ej%gdej%gge$d'd\dgej%ggej.fej#d\e&d+gde&d+gge$d'd\dge&d+ggej.fgg d^d0d_d` Z1ejd
g dejjdaej#d)dCgge$d'j2ej#d)dbgge$d'j2g dcgej.fej#d=d;ggddd'j2ej#d=deggddd'j2g dfgej3fej#d)dCgge$d'j2ej#d)dbgge$d'j2e#g dcgej.fej#dd)gge$d'j2ej#ddCgge$d'j2g dgge$fej#d)dCgge$d'j2ej#d)ej%gge$d'j2g dhge$fej#d)dgge$d'j2ej#d)ej%gge$d'j2g dige$fgg djd0dkdl Z4dmdn Z5ejdoe
egdpdq Z6drds Z7dtdu Z8ejjdvd7dwdxgfdyg dzfg d{d|d}gfgg d~d0dd Z9dd Z:ejjd g d"g d!ge#g dg dgej#g d&g d%ge$d'gg dd0dd Z;ejjdaej#d)dCgge$d'j2ej#d)dbgge$d'j2g dcgej.fej#d=d;ggddd'j2ej#d=deggddd'j2g dfgej3fej#d)dCgge$d'j2ej#d)dbgge$d'j2e#g dcgej.fgg dd0dd Z<dd Z=dd Z>ejde&e?gdd Z@dd ZAdd ZBdd ZCdd ZDdd ZEdd ZFejd6dyd7gdd ZGejdej%de&d+gdd ZHejd6dYd[gg dgdd ZIejjdd5d4gddgd0ejjd6d7g dgd7dgd0dd ZJejdoe
egdd ZKejddd;iddiddid;dddeddgejddg dggdd ZLejd6dyd7dCggdd ZMejd6d)gdbggdd ZNejddd[iddiddiddiddid[dddeddgddń ZOejd6d7dCggddǄ ZPejd6d)gdbggddɄ ZQdd˄ ZRejdd[d=dddeigdd̈́ ZSddτ ZTddф ZUddӄ ZVddՄ ZWddׄ ZXejddd=dٜgddۄ ZYejdd;d[dٜgdd݄ ZZejdg dߢejdg ddd Z[dd Z\ejdej%dgdd Z]dd Z^ejd
g dejdddgdd Z_ejd
g ddd Z`ejd
g ddd Zaejd
g ddd Zbdd Zcdd Zdejdej%dgdd Zeejdddgejdej%dgdd Zfejjdaej#d)ej%gge$d'j2ej#d)dCgge$d'j2ej#d)dbej%ge$d'gej.fej#d)ej%gge$d'j2ej#d)dCgge$d'j2ej#d)dbej%ge$d'gej.fej#dej%ggejd'j2ej#dRggejd'j2e#ddSej%ggejfgg d d0dd Zgejdoe
egdd Zhejde#dej%dRggj2e#dej%dggj2e#dSggfe#g dgj2e#g d	gj2e#ej%ggfej#d
ej%dCgge$d'j2e#dej%dggj2ej#dbgge$d'fej#g dge$d'j2e#g dgj2ej#ej%gge$d'fgdd Ziejdedd Zjdd Zkejddd]ggej#dd]ggdd'ej#dd]ggdd'gejdd\d]ggej#d\d]ggdd'ej#d\d]ggdd'gdd Zldd Zmdd Zndd  Zoejd!d5d4gd"d# Zpejd$ej#d)gd%gge$d'd gej%gej%ggejqd%gdgdgge$d'fej#ej%gd%gd)gge$d'd gej%gej%ggejqd%gej%gej%gge$d'fgd&d' Zrd(d) Zsd*d+ Ztd,d- Zud.d/ Zvd0d1 Zwejddd[iddiddiddiddid[dddeddgd2d3 Zxd4d5 Zyd6d7 Zzd8d9 Z{d:d; Z|ejdddidd;igd<d= Z}ejddd=idd>igd?d@ Z~dAdB ZdCdD Zejdoe
egdEdF ZdS (G      N)sparse)NotFittedError)OneHotEncoderOrdinalEncoder)is_scalar_nan)_convert_containerassert_allcloseassert_array_equal)CSR_CONTAINERSc                  C   s   t g dg dg} t }tdd}|| }|| }|jdks$J |jdks+J t|s2J t|r9J t| g dg dg t| | d S )N         r   r   r   Fsparse_outputr      )              ?r   r   r   )r   r   r   r   r   )	nparrayr   fit_transformshaper   issparser	   toarray)X
enc_sparse	enc_denseX_trans_sparseX_trans_dense r!   g/home/air/shanriGPT/back/venv/lib/python3.10/site-packages/sklearn/preprocessing/tests/test_encoders.py!test_one_hot_encoder_sparse_dense   s   


r#   handle_unknown)ignoreinfrequent_if_existwarnc                 C   s   t g dg dg dg}t g dg}tdd}|| tjtdd || W d    n1 s7w   Y  t| d}|| | }t	||
 t g d	g t|| d S )
N)r   r   r   )r   r   r   )r   r   r   )   r   r   errorr$   Found unknown categoriesmatch)r   r   r   r   r   r   r   )r   r   r   fitpytestraises
ValueError	transformcopyr	   r   r   r$   r   X2oh	X2_passedr!   r!   r"   #test_one_hot_encoder_handle_unknown*   s   



r8   c                 C   sx   t g dd}t ddgd}t| d}|| | }t|| t g dg dg t|| d S )N)11111111223334444)r   55555r:   r*   )r   r   r   r   r   r   r   r   )	r   r   reshaper   r.   r3   r	   r2   r   r4   r!   r!   r"   +test_one_hot_encoder_handle_unknown_stringsB   s   

rA   output_dtypeinput_dtypec                 C   s   t jddgg| dj}t jddgddgg|d}td|d}t|| | t||| | td|dd}t||| t|||| d S )Nr   r   dtypeauto)
categoriesrE   F)rG   rE   r   )	r   asarrayTr   r	   r   r   r.   r2   )rC   rB   r   
X_expectedr6   r!   r!   r"   test_one_hot_encoder_dtypeU   s   rK   c                 C   s   t d}|ddgddgd}tjg dg dg| d	}t| d	}t|| | t|	|
| | t| d
d}t||| t|	|
|| d S )Npandasabr   r   ABr   r   r   r   r   r   r   r   rD   F)rE   r   )r/   importorskip	DataFramer   r   r   r	   r   r   r.   r2   )rB   pdX_dfrJ   r6   r!   r!   r"   !test_one_hot_encoder_dtype_pandasd   s   

rX   c                  C   s   t  } g dg dg dg dg}| | |  }tg d| | g d}tg d| tjtdd	 | d
dg W d    d S 1 sIw   Y  d S )N)Maler   girlr   r   )Female)   rZ   r   
   )rY   3   boy   r   )rY   [   rZ         )	x0_Femalex0_Malex1_1x1_41x1_51x1_91x2_boyx2_girlx3_1x3_2x3_12x3_21x4_3x4_10x4_30)onetwothreefourfive)
one_Femaleone_Maletwo_1two_41two_51two_91	three_boy
three_girlfour_1four_2four_12four_21five_3five_10five_30z!input_features should have lengthr,   rs   rt   )r   r.   get_feature_names_outr	   r/   r0   r1   )encr   feature_namesfeature_names2r!   r!   r"   "test_one_hot_encoder_feature_namest   s(   
"r   c                  C   s\   t  } tjddggtdj}| | |  }tddg| | jdgd}tdd	g| d S )
Nu   c❤t1dat2rD   u	   x0_c❤t1x0_dat2u   n👍meinput_featuresu   n👍me_c❤t1u   n👍me_dat2)r   r   r   objectrI   r.   r   r	   )r   r   r   r!   r!   r"   *test_one_hot_encoder_feature_names_unicode   s   
r   c                  C   s   dd } t | d}tjddggtdj}|| | }tddg| |jd	gd
}tddg| dd }t |d|}d}tj	t
|d |  W d   dS 1 sWw   Y  dS )z=Check the behaviour of `feature_name_combiner` as a callable.c                 S   s   | d t | S )N_)reprfeaturecategoryr!   r!   r"   name_combiner   s   zHtest_one_hot_encoder_custom_feature_name_combiner.<locals>.name_combiner)feature_name_combinerNoneNrD   z	x0_'None'x0_NonerM   r   za_'None'a_Nonec                 S   s   dS )Nr   r!   r   r!   r!   r"   wrong_combiner   s   zItest_one_hot_encoder_custom_feature_name_combiner.<locals>.wrong_combinerzMWhen `feature_name_combiner` is a callable, it should return a Python string.r,   )r   r   r   r   rI   r.   r   r	   r/   r0   	TypeError)r   r   r   r   r   err_msgr!   r!   r"   1test_one_hot_encoder_custom_feature_name_combiner   s   


"r   c                  C   s   t ddggj} t }|jg dgd | d g dgks"J ||  jdks.J |jg dgd ||  jdksCJ d S )	Nr   r   )r   r   r   r   rG   rG   )r   r(   )r   r   r   r   r(   r   )	r   r   rI   r   
set_params
get_paramsr   r   r   )r   r6   r!   r!   r"   test_one_hot_encoder_set_params   s   r   c                 C   sX   t dd}|| }t ddd}|| }t| | t|r&|jdks(J | S )NrF   r   FrG   r   csr)r   r   r   r   r   r   format)r   r   Xtr1Xtr2r!   r!   r"   check_categorical_onehot   s   


r   r   defr   7   abcr   r   )r]   r   r   )r   r   r   )rN   rP   cat)rM   rQ   r   rD   )rN   r   r   rM   r   nan)Nr   r   )rM   r   r   )Nr   N)mixednumericr   z	mixed-nanzmixed-float-nanz
mixed-Nonezmixed-None-nanzmixed-None-float-nan)idsc                 C   s   t t| d d dgf }t|ddgddgg t t| d d ddgf }t|g dg dg tdd| }t| g dg dg d S )	Nr   r   )r   r   r   r   r   r   r   r   rF   r   )r   r   r   r   r   )r   r   r   r   r   )r   r   r   r   r   r   r   )r   Xtrr!   r!   r"   test_one_hot_encoder   s   r   sparse_FTdropfirstc                 C   s  g dg dg dg}t ||d}||}tj|td}t||| ddgddgd	dgg}t |d
|d}||}t|}t||| |d u rg dg dg dg}t || ddgddgg dgd}||}tj|td}d |d< t||| ddgddgd	dgg}t |ddgddgg| d}||}tj|td}d |d< d |d d df< t||| tg dg dg}td}t	j
t|d || W d    d S 1 sw   Y  d S )Nr   r   )r   r   r   r   r   rD   r   r   r   r   rF   )r   rG   r   r   r   )6   r   8   )r   r$   rG   )r   r   r   r   )r   rG   r$   r   r   r   r   r   r   )Shape of the passed X data is not correctr,   )r   r   r   r   r   r	   inverse_transformreescaper/   r0   r1   )r$   r   r   r   r   X_trexpmsgr!   r!   r"   test_one_hot_encoder_inverse  sJ   





"r   z
X, X_transr   r   r   r   r   r   r   rs   rt   ru   rN   r   r   r   r   r   )r   r   r   r   r   )r   r   r   r   r   c                 C   s`   t |d| }d}|rt|d}tjt|d || W d   dS 1 s)w   Y  dS )zCheck that `inverse_transform` raise an error with unknown samples, no
    dropped feature, and `handle_unknow="error`.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/14934
    r   zqSamples \[(\d )*\d\] can not be inverted when drop=None and handle_unknown='error' because they contain all zerosr   r,   N)r   r.   r   r/   r0   r1   r   )r   X_transr   r   r   r!   r!   r"   ?test_one_hot_encoder_inverse_transform_raise_error_with_unknownA  s   
"r   c                  C   sJ   t jddgddgddggtd} tddd	}|| }t|||  d S )
NrY   r   r[   r   r   rD   	if_binaryFr   r   )r   r   r   r   r   r	   r   )r   oher   r!   r!   r"   &test_one_hot_encoder_inverse_if_binarya  s    
r   )r   r   N
reset_dropc                 C   s   t jddgddgddggtd}t| dd}|| ||}| }|j|d	 t|	|| t
||| t| | d S )
NrY   r   r[   r   r   rD   Fr   r   )r   r   r   r   r.   r2   r   r   r	   r   r   )r   r   r   r   r   r   r!   r!   r"   test_one_hot_encoder_drop_reseth  s    

r   methodr.   r         @      @c                 C   sL   t  }d}tjt|d t|||  W d    d S 1 sw   Y  d S )Nz'Expected 2D array, got 1D array insteadr,   )r   r/   r0   r1   getattr)r   r   r6   r   r!   r!   r"   test_X_is_not_1Dw  s
   "r   c                 C   sp   t d}|g d}t }dt| d}t jt|d t|| | W d    d S 1 s1w   Y  d S )NrL   )   r   r(   r   z+Expected a 2-dimensional container but got z	 instead.r,   )r/   rT   Seriesr   typer0   r1   r   )r   rV   r   r6   r   r!   r!   r"   test_X_is_not_1D_pandas  s   
"r   zX, cat_exp, cat_dtyper   r   r   rP   rQ   )r   r   r   stringzmissing-floatzmissing-np.nan-objectzmissing-float-nan-objectc                 C   s   | | d d d fD ]Q}t dd}|| t|jtsJ t|j|D ]6\}}| }t|d rHt|d s9J |d d |d d ksGJ n| |ksPJ t	|j
|sYJ q#q	d S )Nr=   rF   r   )r   r.   
isinstancecategories_listziptolistr   r   
issubdtyperE   )r   cat_exp	cat_dtypeXir   resr   res_listr!   r!   r"   test_one_hot_encoder_categories  s   #

r   zX, X2, cats, cat_dtypedrM   rN   cint64r(   r   r   r   )NrM   z)rM   rN   r   )rM   Nr   )r   r   zobject-stringzobject-string-nonezobject-string-nanzobject-None-and-nanc                 C   s  t |d}tg dg dg}t||  | t|jd t|d ks)J |jd 	 t|d ks8J |jd j
|ksBJ t |d}tjtdd || W d    n1 s^w   Y  t ||d}tg dg dg}t||| | d S )	Nr   r   r   r   r   r   r   r   r+   r,   rG   r$   )r   r   r   )r   r   r   r	   r   r   r   rG   r   r   rE   r/   r0   r1   r.   r2   )r   r5   catsr   r$   r   r   r!   r!   r"   )test_one_hot_encoder_specified_categories  s   
3
r   c                  C   s  t jddggtdj} tg dgd}t g dg dg}t|| |  | t|	|  | |j
d  g dksBJ t |j
d jt jsOJ t d	d
ggj} tg dgd}d}tjt|d |	|  W d    d S 1 szw   Y  d S )NrM   rN   rD   )rN   rM   r   r   r   r   r   r   r   )r   r   r   z%Unsorted categories are not supportedr,   )r   r   r   rI   r   r	   r.   r2   r   r   r   r   r   rE   object_r/   r0   r1   )r   r   r   r   r!   r!   r"   (test_one_hot_encoder_unsorted_categories  s   "r   Encoderc                 C   sr   t dt jdgg}| |d}t jddggtdj}tjtdd || W d   dS 1 s2w   Y  dS )zTest encoder for specified categories that nan is at the end.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/27088
    r   r   r   rD   zNan should be the last elementr,   N)	r   r   r   r   rI   r/   r0   r1   r.   r   r   r   r   r!   r!   r"   ,test_encoder_nan_ending_specified_categories  s   
"r   c                  C   s   t jddgddggtdj} tg dg dgd}t g d	g d
g}t||  | |jd 	 g dks;J t 
|jd jt jsHJ |jd 	 g dksUJ t 
|jd jt jsbJ d S )NrM   rN   r   r   rD   r   )r   r   r   r   )r   r   r   r   r   r   )r   r   r   r   r   r   r   )r   r   r   rI   r   r	   r   r   r   r   r   rE   r   r   r   r   r!   r!   r"   7test_one_hot_encoder_specified_categories_mixed_columns$  s   r   c                  C   sD   t d} | ddgddgd}t|}t|g dg dg d S )	NrL   rM   rN   r   r   rO   rR   rS   )r/   rT   rU   r   r   )rV   rW   r   r!   r!   r"   test_one_hot_encoder_pandas1  s   
r   zdrop, expected_namesx0_cx2_br   )r   x1_2r   )r   r   rN   x0_bx2_a)r   binarymanualc                 C   s:   g dg dg}t | d}|| | }t|| d S )N)r   r   rM   )rN   r   rN   r   )r   r.   r   r	   )r   expected_namesr   r   r   r!   r!   r"   'test_one_hot_encoder_feature_names_drop:  s
   


r  c                  C   s   ddgddgddgg} t g dg dg dg}t d d	g}td
dd}|| }t|j| t|| ddgddgddgg} t ddgddgddgg}t d	d g}td
dd}|| }t|j| t|| d S )Nr]   yes   norc   )r   r   r   r   r?   )r   r   r   r   r   r   Fr   truerM   falser   r   )r   r   r   r   r	   	drop_idx_r   )r   expectedexpected_drop_idxr   resultr!   r!   r"   *test_one_hot_encoder_drop_equals_if_binaryL  s    


r  )r]   r   r   )r  r   r   )r   r   r   c                 C   sT   t  }tjg dg dgdd}t|| |d t dd}t|| | d S )Nr   r   r   r   r   r   r   rD   float64)r   r   r   r	   r   astyper   r!   r!   r"   test_ordinal_encoderd  s
   

r  )r   r   zobject-string-catc                 C   s   t |d}tdgdgg}t|| | t|jd t|d ks%J |jd  t|d ks4J |jd j	|ks>J t |d}t
jtdd || W d    d S 1 s[w   Y  d S )Nr   r   r   r   r+   r,   )r   r   r   r	   r   r   rG   r   r   rE   r/   r0   r1   r.   )r   r5   r   r   r   r   r!   r!   r"   )test_ordinal_encoder_specified_categoriesu  s   

"r  c                  C   s   g dg dg} t  }|| }tj| td}t||| tg dg dg}td}t	j
t|d || W d    d S 1 sGw   Y  d S )Nr   r   rD   )r   r   r   r   rR   r   r,   )r   r   r   r   r   r	   r   r   r   r/   r0   r1   )r   r   r   r   r   r!   r!   r"   test_ordinal_encoder_inverse  s   

"r  c                  C   s   t ddd} tjddgddgdd	ggtd
}tjddgddgddggtd
}| | | |}tjddgddgddggdd
}t|| | |}tjdd gd dgddggtd
}t|| d S )Nuse_encoded_valuer$   unknown_valuerM   xrN   yr   r   rD   xyblar   r   r   r   )r   r   r   r   r.   r2   r	   r   )r   X_fitr   X_trans_encr   X_trans_invinv_expr!   r!   r"   +test_ordinal_encoder_handle_unknowns_string  s     

 

 r!  rE   c                 C   s   t ddd}tjddgddgdd	gg| d
}tjddgddgddgg| d
}|| ||}tjddgddgddggdd
}t|| ||}tjdd gd dgddggtd
}t|| d S )Nr  r  r      r      r   	   rD   r`      r   r   )r   r   r   r.   r2   r	   r   r   )rE   r   r  r   r  r   r  r   r!   r!   r"   ,test_ordinal_encoder_handle_unknowns_numeric  s     

 

 r'  c                  C   s`   t dtjd} tdgdgdgg}| | | dgdgdgg}t|dgdgtjgg d S )Nr  r  r   r   r   r(   r   )r   r   r   r   r.   r2   r	   )r   r  r   r!   r!   r"   (test_ordinal_encoder_handle_unknowns_nan  s
   
r(  c                  C   sd   t dtjtd} tdgdgdgg}tjtdd | | W d    d S 1 s+w   Y  d S )Nr  )r$   r  rE   r   r   r   z'dtype parameter should be a float dtyper,   )	r   r   r   intr   r/   r0   r1   r.   )r   r  r!   r!   r"   8test_ordinal_encoder_handle_unknowns_nan_non_float_dtype  s   "r*  c                  C   sj   t jg dgtdj} g d}t|d}d}tjt|d ||  W d    d S 1 s.w   Y  d S )N)LowMediumHighr,  r+  rD   )r+  r,  r-  r   z*Shape mismatch: if categories is an array,r,   )	r   r   r   rI   r   r/   r0   r1   r.   )r   r   r   r   r!   r!   r"   +test_ordinal_encoder_raise_categories_shape  s   
"r.  c                     sx  t ddtjg dg dgdd} tjddgd	d
ggddtjddgd	d
ggddtddgddggtddgddggtjddgd	dggddfD ]!   t fddtdD scJ t  |  qLddgd	d
gg   tfddtdD sJ t  |  ddgd	dgg   tfddtdD sJ t  |  d S )NrF   r   )r   r   r   r   )r   r   r   r   r  rD   r   r   r   r(   r   rM   rN   r   r      a   b   c   dr   c                    s   g | ]}j | j jkqS r!   r   rE   .0ir   r   r!   r"   
<listcomp>  s    z'test_encoder_dtypes.<locals>.<listcomp>c                    s"   g | ]}t  j| jt jqS r!   )r   r   r   rE   integerr4  r   r!   r"   r8    s   " c                       g | ]
} j | jd kqS )r   r3  r4  r:  r!   r"   r8        )	r   r   r   r.   allranger	   r2   r   )r   r!   r7  r"   test_encoder_dtypes  s&   

 

r?  c                     s  t d} tddtjg dg dgdd}| jdd	gd
dgddgddd}| tfddtd	D s<J t	
| | | dd	gddgddgd}|d j|d j|d jg | t fddtd
D sxJ t	
| | d S )NrL   rF   r   )r   r   r   r   r   r   )r   r   r   r   r   r   r  rD   r   r   r   r(   r   r   rP   rQ   Cr   c                    r;  )r   r3  r4  r:  r!   r"   r8    r<  z.test_encoder_dtypes_pandas.<locals>.<listcomp>rM   rN   r   r   rP   rQ   rA  c                    s    g | ]}j | j | kqS r!   r3  r4  X_typer   r!   r"   r8    s     )r/   rT   r   r   r   rU   r.   r=  r>  r	   r2   r   rE   )rV   r   r   r!   rB  r"   test_encoder_dtypes_pandas  s   

"

 rD  c                  C   sX   t  } ddgddgg}t  td | | W d    d S 1 s%w   Y  d S )NrY   r   r[   r   r)   )r   warningscatch_warningssimplefilterr   )r   r   r!   r!   r"   test_one_hot_encoder_warning  s   

"rH  c                 C   s   ddgddgddgg}t | ddddgddggd}|| d	dgg}tddgg}d
}tjt|d ||}W d   n1 sDw   Y  t|| dS )z,Check handle_unknown='warn' works correctly.rM   r   rN   r   r   Fr'   r   r   r$   rG   r   qFound unknown categories in columns \[0\] during transform. These unknown categories will be encoded as all zerosr,   N	r   r.   r   r   r/   warnsUserWarningr2   r   )r   r   r   X_testrJ   warn_msgr   r!   r!   r"   test_ohe_handle_unknown_warn%  s    

rP  missing_valuec           	      C   sn  dddd| g}t |d}g dg ddddd| gg}|| }g dg d	g d
g}t|| |j|u s8J dd t|j|jD }||}t	j
|td}t|d rt|d d |d d  t|d skJ t|d ssJ t|d d d df |d d d df  t|dd df |dd df  t|d sJ t|d sJ d S t|| t|| d S )Nr   r`   r   r   r   )r   r`   r   r   rM   )r   r`   r   r   rM   )r   r   r   r   r   )r   r   r   r   r   r   c                 S   s   g | ]\}}|| qS r!   r!   )r5  r   r   r!   r!   r"   r8  M  s    z4test_one_hot_encoder_drop_manual.<locals>.<listcomp>rD   r=   )r=   r=   )r   r   r   r	   r   r   r   r	  r   r   r   r   r   )	rQ  cats_to_dropr   r   transr   dropped_catsX_inv_transX_arrayr!   r!   r"    test_one_hot_encoder_drop_manual?  s2   


*"
rW  )r   r   r\   rM   c                 C   s^   t | d}d}tjt|d |g dg dg dg W d    d S 1 s(w   Y  d S )Nr   z-`drop` should have length equal to the numberr,   r   r   )r   r   ;   )r   r/   r0   r1   r.   )r   r   r   r!   r!   r"   test_invalid_drop_lengthd  s
   
"rY  densityr   denserM   r   rN   r  c                 C   s   t | d}t | |d}g dg dg}|| || t|j|j |dkr/t|jd nt||j|jD ]\}}}|t| |ksFJ q7t|jtj	sPJ |jj
tksXJ d S )Nr   r   )r   r   rM   r\  r   r   )r   r.   r	   r   r	  r   r)  r   r   ndarrayrE   r   )rZ  r   ohe_baseohe_testr   drop_catdrop_idxcat_listr!   r!   r"   test_categoriesl  s   



rc  c                 C   s   |    jjs	J d S )N)__sklearn_tags__
input_tagscategorical)r   r!   r!   r"   "test_encoders_has_categorical_tags  s   rg  kwargsmax_categoriesmin_frequency   g(\?r   )ri  rj  r`   rG   rF   rM   rN   r   r   c           
      C   s   t dgd dgd  dgd  dgd  gj}td|d	d
d| |}t|jg dg dgdgdgdgdgg}t ddgddgddgddgddgg}||}t|| dd dgdgd  D }|	|}t|| |
 }	tddg|	 dS )zpTest that different parameters for combine 'a', 'c', and 'd' into
    the infrequent category works as expected.rM   r   rN   r  r   r]   r   r   r&   F)rG   r$   r   rM   r   r   er   r   c                 S      g | ]}|gqS r!   r!   r5  colr!   r!   r"   r8        z2test_ohe_infrequent_two_levels.<locals>.<listcomp>infrequent_sklearnr(   r   x0_infrequent_sklearnNr!   r   r   rI   r   r.   r	   infrequent_categories_r2   r   r   r   )
rh  rG   X_trainr   rN  r
  r   expected_invX_invr   r!   r!   r"   test_ohe_infrequent_two_levels  s(   2(



rz  c                 C   s   t dgd dgd  dgd  dgd  gj}td	d
d| d|}|jd |jd  dks2J t dgdgg}||}tdgdgg| |	 }t
dg| ||}t
dgdgg| dS )z3Test two levels and dropping the frequent category.rM   r   rN   r  r   r]   r   r   r&   Fr   r$   r   ri  r   r   r   rt  rs  N)r   r   rI   r   r.   r   r	  r2   r   r   r	   r   )r   rw  r   rN  r   r   	X_inverser!   r!   r"   ,test_ohe_infrequent_two_levels_drop_frequent  s"   2

r}  c                 C   s   t dgd dgd  dgd  dgd  gj}td	d
d| d}d| d d}tjt|d || W d   dS 1 sAw   Y  dS )z_Test two levels and dropping any infrequent category removes the
    whole infrequent category.rM   r   rN   r  r   r]   r   r   r&   Fr   r{  Unable to drop category r   ( from feature 0 because it is infrequentr,   Nr   r   rI   r   r/   r0   r1   r.   r   rw  r   r   r!   r!   r"   5test_ohe_infrequent_two_levels_drop_infrequent_errors  s   2"r  r%  gQ?g{Gz?r$  c           	      C   s   t dgd dgd  dgd  dgd  gj}tdd	d
d| |}t|jddgg dgdgdgdgdgg}t g dg dg dg dg dg}||}t|| dgdgdgdgdgg}|	|}t|| |
 }tg d| dS )zkTest that different parameters for combing 'a', and 'd' into
    the infrequent category works as expected.rM   r   rN   r  r   r]   r   r   r&   Fr$   r   rn  r  r   r   r   r  rs  )r   r   rt  Nr!   ru  )	rh  rw  r   rN  r
  r   rx  ry  r   r!   r!   r"    test_ohe_infrequent_three_levels  s.   2(



r  c                 C   s   t dgd dgd  dgd  dgd  gj}td	d
d| d|}t dgdgdgg}tddgddgddgg|| |jdd| d}tj	t
|d |dgdgg}W d   n1 sfw   Y  tddgddgg| dS )z5Test three levels and dropping the frequent category.rM   r   rN   r  r   r]   r   r   r&   Fr{  r   r   r%   r*   r+   r,   rn  N)r   r   rI   r   r.   r   r2   r   r/   rL  rM  )r   rw  r   rN  r   r   r!   r!   r"   .test_ohe_infrequent_three_levels_drop_frequent  s"   2"r  c                 C   s   t dgd dgd  dgd  dgd  gj}td	d
d| d}d| d d}tjt|d || W d   dS 1 sAw   Y  dS )z7Test three levels and dropping the infrequent category.rM   r   rN   r  r   r]   r   r   r&   Fr{  r~  r   r  r,   Nr  r  r!   r!   r"   7test_ohe_infrequent_three_levels_drop_infrequent_errors  s   2"r  c                  C   s   t dgd dgd  dgd  dgd  gj} td	d
dd| }t|jddgg dgdgdgdgg}t g dg dg dg dg}||}t|| dgg}d}t	j
t|d || W d   dS 1 sow   Y  dS )zmTest that different parameters for combining 'a', and 'd' into
    the infrequent category works as expected.rM   r   rN   r  r   r]   r   r   r)   F)r$   r   ri  r  r  r  badz.Found unknown categories \['bad'\] in column 0r,   N)r   r   rI   r   r.   r	   rv  r2   r   r/   r0   r1   )rw  r   rN  r
  r   r   r!   r!   r"   (test_ohe_infrequent_handle_unknown_error'  s    2"

"r  c                 C   s   t jdgd dgd  gtdj}tdg dgddd	| |}dgd
gdgdgdgg}t ddgddgddgddgddgg}||}t|| dddgg}dgdgg}|D ]}|j|d| tdgdgg|| qZdS )zG'a' is the only frequent category, all other categories are infrequent.rM   r   rn  rc   rD   r   r   rM   rN   Fr&   rG   r   r$   rN   r   r   r   r   r   r   r   Nr!   )	r   r   r   rI   r   r.   r2   r   r   )rh  rw  r   rN  r
  r   dropsr   r!   r!   r"   5test_ohe_infrequent_two_levels_user_cats_one_frequent?  s(   "(

r  c                  C   s   t jdgd dgd  dgd  dgd  gtd	j} tg d
gdddd| }t|jg dg dgdgdgdgdgg}t ddgddgddgddgddgg}||}t	|| dd dgdgd  D }|
|}t|| dS )zFTest that the order of the categories provided by a user is respected.rM   r   rN   r  r   r]   r   r   rD   r  Fr&   r   rG   r   r$   ri  )r   r   rM   rn  r   r   c                 S   ro  r!   r!   rp  r!   r!   r"   r8  q  rr  z<test_ohe_infrequent_two_levels_user_cats.<locals>.<listcomp>rs  r(   Nr   r   r   rI   r   r.   r	   rv  r2   r   r   rw  r   rN  r
  r   rx  ry  r!   r!   r"   (test_ohe_infrequent_two_levels_user_cats[  s*   *(


r  c                  C   s   t jdgd dgd  dgd  dgd  gtd	j} tg d
gdddd| }t|jddgg dgdgdgdgdgg}t g dg dg dg dg dg}||}t	|| dgdgdgdgdgg}|
|}t|| dS )zTest that the order of the categories provided by a user is respected.
    In this case 'c' is encoded as the first category and 'b' is encoded
    as the second one.rM   r   rN   r  r   r]   r   r   rD   r   r   rN   rM   Fr&   r  rn  r  r  r  rs  Nr  r  r!   r!   r"   *test_ohe_infrequent_three_levels_user_catsv  s4   *(


r  c                  C   sb   t jg dg df } tdddd}||  ddgddgg}||}t|g d	g d
g dS )zaTest infrequent categories where feature 0 has infrequent categories,
    and feature 1 does not.	r   r   r   r   r   r   r   r   r   	r   r   r   r   r   r   r   r   r   r   r   F)ri  r   r   r   r   r   r   r   r   )r   r   r   r   N)r   c_r   r.   r2   r   )r   r   rN  r   r!   r!   r"   test_ohe_infrequent_mixed  s   

r  c            	   
   C   s  t jg dg dg df } tdddd}||  }t|jd d	d
g t|jd	 d	dg t|jd
 d | }tg d| g dg dg dg dg dg dg dg dg dg	}t|| g dg dg}|	|}g dg dg}t||  |
|}t jg dg dgtd}t|| tdddd| }tjtdd |	| W d   n1 sw   Y  g d g d!g}|	|}g d"g dg}t||  |
|}t jg d#g d$gtd}t|| dS )%z?Test infrequent categories with feature matrix with 3 features.r  )	r   r   r   r   r   r]   r   r   r   )	r   r   r   r   r   r   r   r   r   rF   r   r&   rG   ri  r$   r   r   r   r]   N)x0_0x0_3rt  x1_0x1_5x1_infrequent_sklearnx2_0x2_1)r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   )r   r   r   )r(   r   r   )r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   )r   rs  N)rs  r   NrD   r)   r+   r,   )r   r   r   )r   r]   r   )r   r   r   r   r   r   r   r   )rs  rs  r   )r   rs  r   )r   r  r   r   r   r	   rv  r   r   r2   r   r   r   r.   r/   r0   r1   )	r   r   r   r   r
  rN  X_test_transry  rx  r!   r!   r"   'test_ohe_infrequent_multiple_categories  sp   






r  c            	   
   C   s  t d} | jg dg ddddgd}tdd	d
d}|| }t|jd ddg t|jd g d g dg dg dg dg dg dg dg dg dg	}t|| | jddgddgdddgd}g dg dg}|	|}t||  |
|}tjddgddggtd}t|| | jddgddgdddgd}|	| }g dg dg}t|| |
|}tjddgddggtd}t|| dS )zHTest infrequent categories with a pandas dataframe with multiple dtypes.rL   	rM   fr   r  r  rM   r   rN   rN   	r   r   r   r]   r]   r`   r   r   r   )strr)  r  r)  columnsrF   r   r&   r  r   rM   rN   r   r   r   r`   )r   r   r   r   r   r   )r   r   r   r   r   r   )r   r   r   r   r   r   )r   r   r   r   r   r   )r   r   r   r   r   r   r     r`   rs  rD   r   r   N)r/   rT   rU   r   r   r   r	   rv  r   r2   r   r   r   r   )	rV   r   r   r   r
  rN  r  ry  rx  r!   r!   r"   .test_ohe_infrequent_multiple_categories_dtypes  sV   
	
 


 

r  rb   )rj  ri  c                 C   sp   t dgd dgd  dgd  dgd  gj}tdd	d
d| }|| |dgg}t|dgg dS ),All user provided categories are infrequent.rM   r   rN   r  r   r]   r   r   r&   Fr  r   Nr!   )r   r   rI   r   r.   r2   r   rh  rw  r   r   r!   r!   r"   $test_ohe_infrequent_one_level_errorsH  s   2
r  c                 C   sb   t jdgd gtdj}tdg dgddd| |}|dgdgg}t|d	gd	gg d
S )r  rn  r   rD   r  Fr&   r  rM   r   Nr!   )r   r   r   rI   r   r.   r2   r   r  r!   r!   r"   5test_ohe_infrequent_user_cats_unknown_training_errorsV  s   r  zinput_dtype, category_dtype)OOOUUOUUSOSUSS
array_type)r   r   	dataframec           
      C   s   t jdgdgg| d}t jddg|dg}t|dd|}tdgdgdgdgg|| d}||}t ddgddgddgddgg}t|| t|d|}	|	|}t dgdgdgdgg}t|| d	S )
a"  Check that encoding work with object, unicode, and byte string dtypes.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/15616
    https://github.com/scikit-learn/scikit-learn/issues/15726
    https://github.com/scikit-learn/scikit-learn/issues/19677
    rN   rM   rD   Fr   r   r   r   N)	r   r   r   r.   r   r2   r   r   r	   )
rC   category_dtyper  r   rG   r   rN  r   r
  oer!   r!   r"   test_encoders_string_categoriesg  s   
"

r  c                  C   s~   t jdgdggdd} t jddgddg}t|dd}td}tjt|d	 ||  W d
   d
S 1 s8w   Y  d
S )zCheck that this mixture of predefined categories and X raises an error.

    Categories defined as bytes can not easily be compared to data that is
    a string.
    rN   rM   UrD   SFr   zjIn column 0, the predefined categories have type 'bytes' which is incompatible with values of type 'str_'.r,   N)	r   r   r   r   r   r/   r0   r1   r.   )r   rG   r   r   r!   r!   r"   $test_mixed_string_bytes_categoricals  s   "r  c                 C   sP   t jdd| d| ggtdj}tddd|}| }t|ddd	|  g d S )
NrM   rN   rD   Fr%   r   r$   x0_ar   x0_)r   r   r   rI   r   r.   r   r	   )rQ  r   r   namesr!   r!   r"   )test_ohe_missing_values_get_feature_names  s   r  c                  C   sr   t d} | jg dtjdddtjgtdddd	gd
}tg dg dg dg dg}t|}t|| d S )NrL   )dogr   Nr   r   r   r(   rD   )col1col2r  r  r  )r   r   r   r   r   r   r   )r   r   r   r   r   r   r   )r   r   r   r   r   r   r   )r   r   r   r   r   r   r   )	r/   rT   rU   r   r   r   floatr   r   )rV   dfexpected_df_transr   r!   r!   r"   %test_ohe_missing_value_support_pandas  s    
	r  pd_nan_typepd.NAznp.nanc              	   C   s   t d}| dkr|jntj}|d|jdd|ddgddi}tg d	g d
g dg dg d
g}td|d}|	|}t
|| t|jdksMJ t|jd d d g d t|jd d sgJ d S )NrL   r  r  r   rM   rN   r   rD   )r   r   r   r   )r   r   r   r   )r   r   r   r   r  Fr  r   r   r=   r   )r/   rT   NAr   r   rU   r   r   r   r   r   lenr   r	   isnan)r  r$   rV   pd_missing_valuer  r  r   df_transr!   r!   r"   1test_ohe_missing_value_support_pandas_categorical  s(   



r  c                 C   s   ddgddgddgg}t dd| d}||}tg d	g d
g dg}t|| ddgg}tg d	g}d}tjt|d ||}W d   n1 sPw   Y  t|| |	|}t
|tjddggtd dS )zZCheck drop='first' and handle_unknown='ignore'/'infrequent_if_exist'
    during transform.rM   r   rN   r   r   r   Fr   r   r$   r   r   )r   r   r   r   r   tFound unknown categories in columns \[0, 1\] during transform. These unknown categories will be encoded as all zerosr,   NrD   r   r   r   r   r   r/   rL  rM  r2   r   r	   r   r$   r   r   r   rJ   rN  rO  ry  r!   r!   r"   /test_ohe_drop_first_handle_unknown_ignore_warns  s,   




r  c                 C   s   ddgddgddgg}t dd| d}||}tg d	g d
g dg}t|| ddgg}tg dg}d}tjt|d ||}W d   n1 sPw   Y  t|| |	|}t
|tjddggtd dS )zDCheck drop='if_binary' and handle_unknown='ignore' during transform.rM   r   rN   r   r   r   Fr  r  r   rR   r   r   )r   r   r   r   r  r,   NrD   r  r  r!   r!   r"   3test_ohe_drop_if_binary_handle_unknown_ignore_warns  s,   




r  c                 C   s   ddgddgddgg}t dd| ddgddggd}|| d	dgg}tddgg}d
}tjt|d ||}W d   n1 sDw   Y  t|| dS )znCheck drop='first' and handle_unknown='ignore'/'infrequent_if_exist'
    during fit with categories passed in.rM   r   rN   r   r   r   FrI  r   rJ  r,   NrK  )r$   r   r   rN  rJ   rO  r   r!   r!   r"   'test_ohe_drop_first_explicit_categories&  s    

r  c                  C   s   t d} | jg dg ddddgd}tdd	}|jdd
 d}t jt|d || W d   n1 s9w   Y  || t jt|d |	| W d   dS 1 s[w   Y  dS )zJRaise informative error message when pandas output and sparse_output=True.rL   r   )r   rN   rN   )rM   rN   rM   rN   r  Tr   r2   zxPandas output does not support sparse data. Set sparse_output=False to output pandas dataframes or disable Pandas outputr,   N)
r/   rT   rU   r   
set_outputr0   r1   r   r.   r2   )rV   r  r   r   r!   r!   r"   'test_ohe_more_informative_error_messageA  s   
 

"r  c                  C   sn   t t jdddggj} tt jd}dt j }tjt|d |	|  W d   dS 1 s0w   Y  dS )zDTest ordinal encoder with nan passthrough fails when dtype=np.int32.r   r   rD   zdThere are missing values in features \[0\]. For OrdinalEncoder to encode missing values with dtype: r,   N)
r   r   r   rI   r   int32r/   r0   r1   r.   )r   r  r   r!   r!   r"   Btest_ordinal_encoder_passthrough_missing_values_float_errors_dtypeU  s   "r  encoded_missing_valuer  c                 C   s   t jt jdddggt jdj}t| d|}t|jdks J t	|jd ddt jg |
|}t	|| gdgdgdgg ||}t	|| dS )	z.Test ordinal encoder with nan on float dtypes.r   r   rD   r  r   r   r   N)r   r   r   r  rI   r   r.   r  r   r   r2   r   )r  r   r  r   r|  r!   r!   r"   5test_ordinal_encoder_passthrough_missing_values_floatc  s   

r  c              	   C   s   t d}| dkr|jntj}|d|jdd|ddgddi}t|d	|}t	|j
d
ks1J t|j
d dd g d t|j
d d sKJ ||}t|dgdg|gdgdgg ||}|jdkskJ t|dddf ddg t|dddf ddg t|d sJ dS )z0Check ordinal encoder is compatible with pandas.rL   r  r  r   rM   rN   r   rD   r  r   r   Nr   r   r=          @r   r   )r   r   r   r   )r/   rT   r  r   r   rU   r   r   r.   r  r   r	   r  r2   r   r   r   )r  r  rV   r  r  r  r  r|  r!   r!   r"   =test_ordinal_encoder_missing_value_support_pandas_categoricalu  s"   


r  r  )zobject-None-missing-valuezobject-nan-missing_valueznumeric-missing-valuec                 C   s   t |d}tdgtjgg}t|| | |jd j|ks!J t |d}tj	t
dd || W d   dS 1 s>w   Y  dS )z.Test ordinal encoder for specified categories.r   r   r   r+   r,   N)r   r   r   r   r	   r   r   rE   r/   r0   r1   r.   )r   r5   r   r   r  r   r!   r!   r"   =test_ordinal_encoder_specified_categories_missing_passthrough  s   
&
"r  c                 C   sr   t jg dtdg}| |d}t jddggtdj}tjtdd || W d   dS 1 s2w   Y  dS )	zTest encoder for specified categories have duplicate values.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/27088
    )rM   rN   rM   rD   r   rM   rN   z5the predefined categories contain duplicate elements.r,   N)r   r   r   rI   r/   r0   r1   r.   r   r!   r!   r"   +test_encoder_duplicate_specified_categories  s   
"r  zX, expected_X_trans, X_testr   r   )r   r   r   )r   r  r   r   )r   rM   rN   )r  r   r   c                 C   s8   t ddd}|| }t|| t||dgg dS )z>Test the interaction between missing values and handle_unknownr  r=   r  g      N)r   r   r   r2   )r   expected_X_transrN  r  r   r!   r!   r"   /test_ordinal_encoder_handle_missing_and_unknown  s   

r  csr_containerc                 C   s   t g dg dg}| |}t }d}tjt|d || W d   n1 s+w   Y  tjt|d || W d   n1 sGw   Y  ||}| |}tjt|d || W d   dS 1 smw   Y  dS )zCheck that we raise proper error with sparse input in OrdinalEncoder.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/19878
    r   r   z2Sparse data was passed, but dense data is requiredr,   N)	r   r   r   r/   r0   r   r.   r   r   )r  r   X_sparseencoderr   r   r   r!   r!   r"   test_ordinal_encoder_sparse  s   
"r  c                  C   s   t g dddt jf } tg dgddd}||  tg dgdd}tjtd	d
 ||  W d   dS 1 s>w   Y  dS )zCheck OrdinalEncoder.fit works with unseen category when
    `handle_unknown="use_encoded_value"`.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/19872
    )r   r   r   r   r   r   N)r=   r   r   r  r"  )rG   r$   r  r)   r   r+   r,   )r   r   newaxisr   r.   r/   r0   r1   )r   r  r!   r!   r"   -test_ordinal_encoder_fit_with_unseen_category  s   
"r  rw  AAOr  rN  c                 C   s4   t ddd}||  ||}t|ddgg dS )zChecks that `OrdinalEncoder` transforms string dtypes.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/19872
    r  ir  r   N)r   r.   r2   r   )rw  rN  r   r   r!   r!   r"   1test_ordinal_encoder_handle_unknown_string_dtypes&  s   

r  c                  C   sb   t g ddd} t | }t|jt j| ddj |	| }t|dgdgdgdgg dS )	zCheck that `OrdinalEncoder` accepts Python integers that are potentially
    larger than 64 bits.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/20721
    )l   	HP
1& l   	H]viel   	 :?i}Ga l   IRK2e6kr=   r   r   )axisr   r   N)
r   r   r@   r   r.   r	   r   sortrI   r2   )r   r  r   r!   r!   r"   #test_ordinal_encoder_python_integerB  s   
r  c                  C   sH   t d} g d}| jg dg|d}t |}| }t|| dS )z-Check feature names out is same as the input.rL   )rN   r   rM   r   r  N)r/   rT   rU   r   r.   r   r	   )rV   r  r   r   feature_names_outr!   r!   r"   .test_ordinal_encoder_features_names_out_pandasV  s   
r  c                  C   s   t jdgdgt jggtd} tdt jdd| }|| }t|dgdgdgg t jd	gt jggtd}||}t|t jgdgg ||}|d d d
u sSJ t 	|d d s^J d
S )zECheck interactions between encode_unknown and missing value encoding.rM   rN   rD   r  r$   r  r  r   r   r   N)
r   r   r   r   r   r.   r2   r   r   r  )r   r  r   rN  r  X_roundtripr!   r!   r"   0test_ordinal_encoder_unknown_missing_interactionb  s    


r  with_pandasc                 C   s   t jddgddgdt jggtd}d}| r(td}|j|d	d
gd}|d }n|d }tdd}tjt	|d |
| W d   dS 1 sIw   Y  dS )zXCheck OrdinalEncoder errors when encoded_missing_value is used by
    an known category.rM   r  rN   r   r   rD   zTencoded_missing_value \(1\) is already used to encode a known category in features: rL   letterpetr  z	\['pet'\]z\[1\]r   r  r,   N)r   r   r   r   r/   rT   rU   r   r0   r1   r.   )r  r   	error_msgrV   r  r!   r!   r"   0test_ordinal_encoder_encoded_missing_value_error  s   "


"r  z4X_train, X_test_trans_expected, X_roundtrip_expected1c                 C   s   t dtjtjd| }tdgtjgdgg}||}t|| ||}|jd }t	|D ]+}||df }	||df }
|	du rI|
du sHJ q0t
|	rUt|
sTJ q0|
|	ks[J q0dS )znCheck transform when unknown_value and encoded_missing_value is nan.

    Non-regression test for #24082.
    r  r  r  rN   r   N)r   r   r   r.   r   r2   r   r   r   r>  r   r  )rw  X_test_trans_expectedX_roundtrip_expectedr  rN  r  r  	n_samplesr6  expected_valvalr!   r!   r"   9test_ordinal_encoder_unknown_missing_interaction_both_nan  s*   



r  c                  C   s   t d} | ddgddgd}t }|jdd d}t jt|d	 || W d
   n1 s3w   Y  tddjdd}tddjdd}||}||}t|	 | t
| |j d
S )z*Check OneHotEncoder works with set_output.rL   rM   rN   r   r   rO   r  zCPandas output does not support sparse data. Set sparse_output=Falser,   NFr   default)r/   rT   rU   r   r  r0   r1   r   r   to_numpyr	   r   r  )rV   rW   r   r-   ohe_default
ohe_pandas	X_defaultX_pandasr!   r!   r"   test_one_hot_encoder_set_output  s   


r  c                  C   st   t d} | ddgddgd}t jdd}t jdd}||}||}t| | t|	 |j
 d	S )
z+Check OrdinalEncoder works with set_output.rL   rM   rN   r   r   rO   r  r  N)r/   rT   rU   r   r  r   r   r  r	   r   r  )rV   rW   ord_default
ord_pandasr
  r  r!   r!   r"   test_ordinal_set_output  s   


r  c                  C   st   g dddgg} t | d}|ddgg t| t|jks J t|jD ]\}}|jtks0J t| | | q%dS )zjCheck that the categories_ dtype is `object` for string categories

    Regression test for gh-25171.
    )asmmaseasrasacsr  2r   r  N)r   r.   r  r   	enumeraterE   r   r	   )rG   r   nr   r!   r!   r"    test_predefined_categories_dtype  s   
r  c                  C   s~   t jdgdgt jggtd} tdd| }t|dgdgdgg tddd	| }t d
gg}||}t|dgg dS )zBCheck missing value or unknown encoding can equal the cardinality.r  r   rD   r   r  r   r   r  r  snakeN)	r   r   r   r   r   r   r   r.   r2   )r   r   r   rN  r!   r!   r"   1test_ordinal_encoder_missing_unknown_encoding_max  s   
r  c                  C   s  t jdgd dgd  dgd  dgd  dgd  gtdj} tdd	d
d| }t| g d |jd |j	d  dksAJ t jdgd dgd  dgd  gtdj} tdd	dd| }t| dg |jd |j	d  dkswJ t jdgd dgd  dgd  dgd  dgd  gtdj} tdd	dgd| }t| g d |jd |j	d  dksJ tdd	dd| }t| g d |j	du sJ dS )zkCheck drop_idx is defined correctly with infrequent categories.

    Non-regression test for gh-25550.
    rM   r   rN   r(   r   r   rn  rD   Fr   )rj  r   r   )r   x0_dx0_ert  r   r]   r   rt  )r   r   r  rt  N)r   r   r  r  rt  )
r   r   r   rI   r   r.   r	   r   r   r	  )r   r   r!   r!   r"   #test_drop_idx_infrequent_categories  s<   4,4r  c                 C   s   t dgd dgd  dgd  dgd  gj}tdd	d
d| |}t|jg dg t|jddgg dgdgdgdgdgg}dgdgdgdgd
gg}||}t	|| |
|}dgdgdgdgdgg}t|| dS )zGTest parameters for grouping 'a', and 'd' into the infrequent category.rM   r   rN   r  r   r]   r   r   r  r=   r  rl  r   r   r   r   rs  Nr!   )r   r   rI   r   r.   r	   r   rv  r2   r   r   )rh  rw  ordinalrN  expected_transr   r|  expected_inverser!   r!   r"   ,test_ordinal_encoder_infrequent_three_levels6  s,   2


r!  c                  C   s   t jdgd dgd  dgd  dgd  gtd	j} tg d
gdddd| }t|jg d
g t|jddgg dgdgdgdgdgg}dgdgdgdgdgg}|	|}t
|| ||}dgdgdgdgdgg}t|| dS )zTest that the order of the categories provided by a user is respected.

    In this case 'c' is encoded as the first category and 'b' is encoded
    as the second one.
    rM   r   rN   r  r   r]   r   r   rD   r  r  r=   )rG   ri  r$   r  r   r   r   r   rs  N)r   r   r   rI   r   r.   r	   r   rv  r2   r   r   )rw  r  rN  r  r   r|  r   r!   r!   r"   6test_ordinal_encoder_infrequent_three_levels_user_cats]  s6   *


r"  c                  C   s   t g dg df} tdd| }t|jd ddg |jd du s&J ddgddgg}ddgddgg}||}t|| ||}t j	ddgd	dggt
d
}t|| dS )zETest when feature 0 has infrequent categories and feature 1 does not.r  r  r   ri  r   r   r   Nrs  rD   )r   column_stackr   r.   r	   rv  r2   r   r   r   r   )r   r  rN  r  r   r|  r   r!   r!   r"   %test_ordinal_encoder_infrequent_mixed  s   


r%  c                  C   s   t d} | g d}| jg dg d| jdgd dgd  d	g d
g |ddg dd}tdd|}t|jd ddg t|jd g d t|jd d
d	g | jg dg d| jdgd	g d
g dg |ddg dd}g dg dg dg dg}|	|}t
|| dS )zHTest infrequent categories with a pandas DataFrame with multiple dtypes.rL   )birdr   r  r  r  r  r  r(   r   r   r  r&  rD   )r  r)  rf  r  r#  r   rM   rN   r   r  r   )rM   rN   r  r   )r`   r   r]   r   )r   r   r   )r   r   r   )r   r   r   r  N)r/   rT   CategoricalDtyperU   r   r   r.   r	   rv  r2   r   )rV   categorical_dtyper   r  rN  r  r   r!   r!   r"   :test_ordinal_encoder_infrequent_multiple_categories_dtypes  s:   


r)  c                  C   s   t jdgd dgd  dgd  dgd  t jg gtd	j} td
dddd| }t|jg dg t jdgdgdgdgdgt jggtd	}dgdgdgdgdgdgg}|	|}t
|| dS )zJCheck behavior of unknown_value and encoded_missing_value with infrequent.rM   r   rN   r  r   r]   r   r   rD   r  r   )r$   r  ri  r  rm  rn  r   r   N)r   r   r   r   rI   r   r.   r	   rv  r2   r   )rw  r  rN  r  r   r!   r!   r"   .test_ordinal_encoder_infrequent_custom_mapping  s$   2(
r*  c                 C   s   t jdgd dgd  dgd  dgd  gtd	j}tdi | d
dd|}td
dd|}dgdgdgdgdgg}t|||| dS )zMAll categories are considered frequent have same encoding as default encoder.rM   r   rN   r  r   r]   r   r   rD   r  r=   r  rn  Nr!   r   r   r   rI   r   r.   r   r2   )rh  rw  adjusted_encoderdefault_encoderrN  r!   r!   r"   !test_ordinal_encoder_all_frequent  s*   	*r.  d   c                 C   s   t jdgd dgd  dgd  dgd  gtd	j}tdi | d
dd|}dgdgdgdgdgg}t||dgdgdgdgdgg dS )zAWhen all categories are infrequent, they are all encoded as zero.rM   r   rN   r  r   r]   r   r   rD   r  r=   r  rn  r   Nr!   r+  )rh  rw  r  rN  r!   r!   r"   #test_ordinal_encoder_all_infrequent  s   	*(r0  c                  C   s   t jt jgd dgd  dgd  dg dg gtdj} td	d
| }t jdddt jggtdj}||}t|dgdgdgt jgg dS )z5Check behavior when missing value appears frequently.r  r  r]   r   r   r  deerrD   r   r#  r   r   r   N	r   r   r   r   rI   r   r.   r2   r   r   r  rN  r   r!   r!   r"   -test_ordinal_encoder_missing_appears_frequent
	  s   ,
 r4  c                  C   s   t jt jgdgd  dgd  dg dg dgd d	gd  gtd
j} tdd| }t jddgdd	gt jd	gdd	gddggtd
}||}t|ddgddgt jdgddgddgg dS )z7Check behavior when missing value appears infrequently.r  r]   r   r   r  r1  redr%  greenrD   r(   )rj  r   r   r   Nr2  r3  r!   r!   r"   /test_ordinal_encoder_missing_appears_infrequent	  s(   &

.r7  c                 C   sd   t jdgdgdggtd}| g dgd}tt || W d   dS 1 s+w   Y  dS )a!  Check that we raise a `NotFittedError` by calling transform before fit with
    the encoders.

    One could expect that the passing the `categories` argument to the encoder
    would make it stateless. However, `fit` is making a couple of check, such as the
    position of `np.nan`.
    rP   rQ   rA  rD   r@  r   N)r   r   r   r/   r0   r   r2   )r   r   r  r!   r!   r"   test_encoder_not_fitted3	  s
   	"r8  )r   rE  numpyr   r/   scipyr   sklearn.exceptionsr   sklearn.preprocessingr   r   sklearn.utils._missingr   sklearn.utils._testingr   r   r	   sklearn.utils.fixesr
   r#   markparametrizer8   rA   r  float32r  rK   rX   r   r   r   r   r   r   r   r   r  r   r   r   r   r   r   r   r   r9  str_r   rI   r   r   r   r   r   r   r  r  r  r  r  r!  r)  r'  r(  r*  r.  r?  rD  rH  rP  rW  rY  rc  rg  rz  r}  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  rH   r  r  r  r  r  r  r!  r"  r%  r)  r*  r.  r0  r4  r7  r8  r!   r!   r!   r"   <module>   s   


<


/*


 &&* 
!&1
	


		
	




$








$[A



%
$

		"

!$$0