o
    `^hJ                     @   s   d Z ddlZddlZddlmZ ddlmZ ddlmZ ddl	Z
ddlZddlmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZ dededefddZdedede
jfddZdd Z	dddZ	dddZ		dddZdS )z9Implementation of ARFF parsers: via LIAC-ARFF and pandas.    N)OrderedDict)	Generator)List   )_arff)ArffSparseDataType)chunk_generatorget_chunk_n_rows)check_pandas_support)	pd_fillna	arff_datainclude_columnsreturnc                 C   s   t  t  t  f}dd t|D }t| d | d | d D ] \}}}||v r=|d | |d | |d ||  q|S )a  Obtains several columns from sparse ARFF representation. Additionally,
    the column indices are re-labelled, given the columns that are not
    included. (e.g., when including [1, 2, 3], the columns will be relabelled
    to [0, 1, 2]).

    Parameters
    ----------
    arff_data : tuple
        A tuple of three lists of equal size; first list indicating the value,
        second the x coordinate and the third the y coordinate.

    include_columns : list
        A list of columns to include.

    Returns
    -------
    arff_data_new : tuple
        Subset of arff data with only the include columns indicated by the
        include_columns argument.
    c                 S      i | ]\}}||qS  r   .0	array_idx
column_idxr   r   [/home/air/shanriGPT/back/venv/lib/python3.10/site-packages/sklearn/datasets/_arff_parser.py
<dictcomp>.       
z)_split_sparse_columns.<locals>.<dictcomp>r      r   )list	enumeratezipappend)r   r   arff_data_newreindexed_columnsvalrow_idxcol_idxr   r   r   _split_sparse_columns   s   "r"   c           	      C   s~   t | d d }|t|f}dd t|D }tj|tjd}t| d | d | d D ]\}}}||v r<||||| f< q+|S )Nr   c                 S   r   r   r   r   r   r   r   r   @   r   z)_sparse_data_to_array.<locals>.<dictcomp>dtyper   r   )maxlenr   npemptyfloat64r   )	r   r   num_obsy_shaper   yr   r    r!   r   r   r   _sparse_data_to_array9   s   "r-   c                 C   sP   | | }t |dkr| | }||fS t |dkr"| |d  }||fS d}||fS )a  Post process a dataframe to select the desired columns in `X` and `y`.

    Parameters
    ----------
    frame : dataframe
        The dataframe to split into `X` and `y`.

    feature_names : list of str
        The list of feature names to populate `X`.

    target_names : list of str
        The list of target names to populate `y`.

    Returns
    -------
    X : dataframe
        The dataframe containing the features.

    y : {series, dataframe} or None
        The series or dataframe containing the target.
    r   r   r   N)r&   )framefeature_namestarget_namesXr,   r   r   r   _post_process_frameK   s   r2   c           "         s  dd }|| }|dkrt jnt j}|dk }	t j|||	d}
|| fdd|
d D  |dkrtd	}t|
d }t| }t|
d
 }|j	|g|dd}|j
dd }t|}fdd|D }|| g}t|
d
 |D ]}||j	||dd|  qrt|dkr|d |d j|d< |j|dd}t||}~~i }|jD ]%}| d }| dkrd||< q| dkrd||< q|j| ||< q||}t|||\}n|
d
 }fdd|D }fdd|D }t|tr3|du rtd|d dkr	d}n|d |d  }tjtj|d|d }|j| }|dd|f }|dd|f n@t|t rjt!||}t"|d d }|t|f} t#j$j%|d |d |d ff| tj&d!}|' }t(||n	td"t)|  fd#d$|D }!|!snt*|!rt+ fd%dt,|D n	t-|!rtd&j.d dkrd'n
j.d dkrd|dkr||dfS |d fS )(a  ARFF parser using the LIAC-ARFF library coded purely in Python.

    This parser is quite slow but consumes a generator. Currently it is needed
    to parse sparse datasets. For dense datasets, it is recommended to instead
    use the pandas-based parser, although it does not always handles the
    dtypes exactly the same.

    Parameters
    ----------
    gzip_file : GzipFile instance
        The file compressed to be read.

    output_arrays_type : {"numpy", "sparse", "pandas"}
        The type of the arrays that will be returned. The possibilities ara:

        - `"numpy"`: both `X` and `y` will be NumPy arrays;
        - `"sparse"`: `X` will be sparse matrix and `y` will be a NumPy array;
        - `"pandas"`: `X` will be a pandas DataFrame and `y` will be either a
          pandas Series or DataFrame.

    columns_info : dict
        The information provided by OpenML regarding the columns of the ARFF
        file.

    feature_names_to_select : list of str
        A list of the feature names to be selected.

    target_names_to_select : list of str
        A list of the target names to be selected.

    Returns
    -------
    X : {ndarray, sparse matrix, dataframe}
        The data matrix.

    y : {ndarray, dataframe, series}
        The target.

    frame : dataframe or None
        A dataframe containing both `X` and `y`. `None` if
        `output_array_type != "pandas"`.

    categories : list of str or None
        The names of the features that are categorical. `None` if
        `output_array_type == "pandas"`.
    c                 s   s    | D ]}| dV  qd S )Nutf-8)decode)	gzip_fileliner   r   r   _io_to_generator   s   z+_liac_arff_parser.<locals>._io_to_generatorsparsepandas)return_typeencode_nominalc                    s(   i | ]\}}t |tr| v r||qS r   )
isinstancer   )r   namecatcolumns_to_selectr   r   r      s    z%_liac_arff_parser.<locals>.<dictcomp>
attributeszfetch_openml with as_frame=TruedataF)columnscopyT)deepc                       g | ]}| v r|qS r   r   r   colr?   r   r   
<listcomp>       z%_liac_arff_parser.<locals>.<listcomp>r   r   r   )ignore_index	data_typeintegerInt64nominalcategoryc                       g | ]
}t  | d  qS indexintr   col_nameopenml_columns_infor   r   rI          c                    rQ   rR   rT   rV   rX   r   r   rI      rZ   Nz6shape must be provided when arr['data'] is a Generatorr)   )r$   count)shaper$   z-Unexpected type for data obtained from arff: c                    s   h | ]}| v qS r   r   rV   )
categoriesr   r   	<setcomp>  s    z$_liac_arff_parser.<locals>.<setcomp>c              
      sJ   g | ]!\}}t t j |d ddd||d f jtddqS )Or#   Nr   F)rD   )r'   takeasarraypopastyperU   )r   irW   )r^   r,   r   r   rI     s     zAMix of nominal and non-nominal targets is not currently supported)r[   )/r   COO	DENSE_GENloadr
   r   r   keysnext	DataFramememory_usagesumr	   r   r   r&   rd   dtypesconcatr   rC   lowerr2   r<   r   
ValueErrorr'   fromiter	itertoolschainfrom_iterablereshapetupler"   r%   spr8   
coo_matrixr)   tocsrr-   typeallhstackr   anyr]   )"r5   output_arrays_typerY   feature_names_to_selecttarget_names_to_selectr]   r7   streamr:   r;   arff_containerpdcolumns_infocolumns_names	first_rowfirst_df	row_bytes	chunksizecolumns_to_keepdfsrB   r.   rn   r=   column_dtyper1   r   feature_indices_to_selecttarget_indices_to_selectr\   arff_data_Xr*   X_shapeis_classificationr   )r^   r@   rY   r,   r   _liac_arff_parserk   s   7
















	
r   c              
      s  ddl | D ]}|d dr nqi |D ]}|| d }| dkr,d|< q| dkr6d	|< qfd
dt|D }	dddgddddd|	d	}
i |
|pUi }j| fi |}z
dd |D |_W n ty} } zj	d|d}~ww ||   fdd|jD }|| }t
dfdd}fdd|j D }|D ]}|| j|||< qt|||\}}|dkr|||dfS | | }}fdd|j D }||d|fS )a^  ARFF parser using `pandas.read_csv`.

    This parser uses the metadata fetched directly from OpenML and skips the metadata
    headers of ARFF file itself. The data is loaded as a CSV file.

    Parameters
    ----------
    gzip_file : GzipFile instance
        The GZip compressed file with the ARFF formatted payload.

    output_arrays_type : {"numpy", "sparse", "pandas"}
        The type of the arrays that will be returned. The possibilities are:

        - `"numpy"`: both `X` and `y` will be NumPy arrays;
        - `"sparse"`: `X` will be sparse matrix and `y` will be a NumPy array;
        - `"pandas"`: `X` will be a pandas DataFrame and `y` will be either a
          pandas Series or DataFrame.

    openml_columns_info : dict
        The information provided by OpenML regarding the columns of the ARFF
        file.

    feature_names_to_select : list of str
        A list of the feature names to be selected to build `X`.

    target_names_to_select : list of str
        A list of the target names to be selected to build `y`.

    read_csv_kwargs : dict, default=None
        Keyword arguments to pass to `pandas.read_csv`. It allows to overwrite
        the default options.

    Returns
    -------
    X : {ndarray, sparse matrix, dataframe}
        The data matrix.

    y : {ndarray, dataframe, series}
        The target.

    frame : dataframe or None
        A dataframe containing both `X` and `y`. `None` if
        `output_array_type != "pandas"`.

    categories : list of str or None
        The names of the features that are categorical. `None` if
        `output_array_type == "pandas"`.
    r   Nr3   z@datarL   rM   rN   rO   rP   c                    s"   i | ]\}}| v r| | qS r   r   )r   r!   r=   )rn   r   r   r     s
    z'_pandas_arff_parser.<locals>.<dictcomp>F?%"T\)	header	index_col	na_valueskeep_default_nacomment	quotecharskipinitialspace
escapecharr$   c                 S   s   g | ]}|qS r   r   )r   r=   r   r   r   rI     s    z'_pandas_arff_parser.<locals>.<listcomp>zwThe number of columns provided by OpenML does not match the number of columns inferred by pandas when reading the file.c                    rF   r   r   rG   r?   r   r   rI     rJ   z^'(?P<contents>.*)'$c                    s"   t  | }|d u r| S |dS )Ncontents)researchgroup)input_stringmatch)single_quote_patternr   r   strip_single_quotes  s   
z0_pandas_arff_parser.<locals>.strip_single_quotesc                    s    g | ]\}}t | jr|qS r   )r<   CategoricalDtyper   r=   r$   r   r   r   rI     s    
r9   c                    s(   i | ]\}}t | jr||j qS r   )r<   r   r^   tolistr   r   r   r   r     s    

)r9   r4   rp   
startswithr   read_csvrC   rq   errorsParserErrorr   compilern   itemsr>   rename_categoriesr2   to_numpy)r5   r   rY   r   r   read_csv_kwargsr6   r=   r   dtypes_positionaldefault_read_csv_kwargsr.   excr   r   categorical_columnsrH   r1   r,   r^   r   )r@   rn   r   r   r   _pandas_arff_parser7  sp   8




r   c                 C   sD   |dkrt | |||||S |dkrt| |||||S td| d)a6  Load a compressed ARFF file using a given parser.

    Parameters
    ----------
    gzip_file : GzipFile instance
        The file compressed to be read.

    parser : {"pandas", "liac-arff"}
        The parser used to parse the ARFF file. "pandas" is recommended
        but only supports loading dense datasets.

    output_type : {"numpy", "sparse", "pandas"}
        The type of the arrays that will be returned. The possibilities ara:

        - `"numpy"`: both `X` and `y` will be NumPy arrays;
        - `"sparse"`: `X` will be sparse matrix and `y` will be a NumPy array;
        - `"pandas"`: `X` will be a pandas DataFrame and `y` will be either a
          pandas Series or DataFrame.

    openml_columns_info : dict
        The information provided by OpenML regarding the columns of the ARFF
        file.

    feature_names_to_select : list of str
        A list of the feature names to be selected.

    target_names_to_select : list of str
        A list of the target names to be selected.

    read_csv_kwargs : dict, default=None
        Keyword arguments to pass to `pandas.read_csv`. It allows to overwrite
        the default options.

    Returns
    -------
    X : {ndarray, sparse matrix, dataframe}
        The data matrix.

    y : {ndarray, dataframe, series}
        The target.

    frame : dataframe or None
        A dataframe containing both `X` and `y`. `None` if
        `output_array_type != "pandas"`.

    categories : list of str or None
        The names of the features that are categorical. `None` if
        `output_array_type == "pandas"`.
    z	liac-arffr9   zUnknown parser: 'z%'. Should be 'liac-arff' or 'pandas'.)r   r   rq   )r5   parseroutput_typerY   r   r   r]   r   r   r   r   load_arff_from_gzip_file  s*   ;	
r   )N)NN)__doc__rs   r   collectionsr   collections.abcr   typingr   numpyr'   scipyrx   	externalsr   externals._arffr   utils._chunkingr   r	   utils._optional_dependenciesr
   utils.fixesr   r"   ndarrayr-   r2   r   r   r   r   r   r   r   <module>   sJ    
#
&
 S
  