from __future__ import annotations

import argparse
import copy
import importlib
import logging
import os

import numpy as np
import numpy.typing as npt
import onnx
from onnx.onnx_pb import GraphProto, ModelProto, NodeProto, TensorProto
from packaging import version

from onnxruntime.capi._pybind_state import quantize_matmul_4bits, quantize_matmul_8bits, quantize_qdq_matmul_4bits

from .calibrate import CalibrationDataReader
from .onnx_model import ONNXModel
from .quant_utils import QuantFormat, attribute_to_kwarg

logging.basicConfig(format="%(asctime)s %(name)s [%(levelname)s] - %(message)s", level=logging.INFO)
logger = logging.getLogger(__name__)


class WeightOnlyQuantConfig:
    def __init__(
        self,
        algorithm: str,
        quant_format: QuantFormat,
        op_types_to_quantize: tuple[str, ...] | None = None,
        quant_axes: tuple[tuple[str, int], ...] | None = None,
        customized_weight_config: dict | None = None,
    ):
        """This is the Base class for Weight Only blockwise quantization Configuration.

        Args:
            algorithm:
                weight only quantization algorithm name.
            quant_format: QuantFormat{QOperator, QDQ}.
                QOperator format quantizes the model with quantized operators directly.
                QDQ format quantizes the model by inserting QuantizeLinear/DeQuantizeLinear on the tensor.
            op_types_to_quantize (optional):
                set of operator types to quantize. Default {MatMul}
            quant_axes (dict[str, int], optional):
                op:axis, which axis to quantize for an op. Default {MatMul: 0, Gather: 1}
            customized_weight_config:
                customized weight config for nodes if needed. It is dictionary with node name as key,
                and the value is a dict of customized config.
        """
        self.algorithm = algorithm
        self.quant_format = quant_format
        self.op_types_to_quantize = set(op_types_to_quantize) if op_types_to_quantize else {"MatMul"}
        self.quant_axes = dict(quant_axes) if quant_axes else {"MatMul": 0, "Gather": 1}
        self.customized_weight_config = customized_weight_config
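

# Editorial note: each algorithm-specific config below derives from this base class;
# MatMulNBitsQuantizer dispatches on config.algorithm ("DEFAULT", "HQQ", "RTN", "GPTQ",
# "nvidia_awq") to pick the matching quantizer implementation.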


class RTNWeightOnlyQuantConfig(WeightOnlyQuantConfig):
    def __init__(
        self,
        ratios=None,
        quant_format=QuantFormat.QOperator,
        op_types_to_quantize: tuple[str, ...] | None = None,
        customized_weight_config: dict | None = None,
    ):
        """
        This is a class for round-to-nearest (RTN) algorithm Weight Only Quant Configuration.
        RTN is the most straightforward way to quantize weight using scale maps.

        Args:
            ratios:
                percentile of clip. Defaults to {}.
            quant_format (QuantFormat{QOperator, QDQ}, optional):
                QOperator format quantizes the model with quantized operators directly.
                QDQ format quantizes the model by inserting QuantizeLinear/DeQuantizeLinear on the tensor.
                Defaults to QuantFormat.QOperator.
            op_types_to_quantize (optional):
                set of operator types to quantize.
            customized_weight_config:
                customized weight config for nodes if needed. It is dictionary with node name as key,
                and the value is a dict of customized config.
        """
        assert quant_format == QuantFormat.QOperator, "RTN only supports QOperator format"
        if ratios is None:
            ratios = {}
        super().__init__(
            algorithm="RTN",
            quant_format=quant_format,
            op_types_to_quantize=op_types_to_quantize,
            customized_weight_config=customized_weight_config,
        )
        self.ratios = ratios


class GPTQWeightOnlyQuantConfig(WeightOnlyQuantConfig):
    def __init__(
        self,
        calibration_data_reader: CalibrationDataReader | None = None,
        percdamp=0.01,
        block_size=128,
        actorder=False,
        mse=False,
        perchannel=True,
        quant_format=QuantFormat.QOperator,
        op_types_to_quantize: tuple[str, ...] | None = None,
    ):
        """
        This is a class for GPTQ algorithm Weight Only Quant Configuration.
        GPTQ algorithm provides more accurate quantization but requires more computational resources.

        Args:
            calibration_data_reader:
                a calibration data reader. It enumerates calibration data and generates inputs for the original model.
            percdamp:
                percent of the average Hessian diagonal to use for dampening.
            block_size (int, optional):
                channel number in one block to execute a GPTQ quantization iteration.
            actorder (bool, optional):
                whether to rearrange the Hessian matrix considering the diag's value.
            mse (bool, optional):
                whether to get scale and zero point with mse error.
            perchannel (bool, optional):
                whether to quantize weight per-channel.
            quant_format (QuantFormat{QOperator, QDQ}, optional):
                QOperator format quantizes the model with quantized operators directly.
                QDQ format quantizes the model by inserting QuantizeLinear/DeQuantizeLinear on the tensor.
                Defaults to QuantFormat.QOperator.
            op_types_to_quantize (optional):
                set of operator types to quantize.
        """
        assert quant_format == QuantFormat.QOperator, "GPTQ only supports QOperator format"
        super().__init__(algorithm="GPTQ", quant_format=quant_format, op_types_to_quantize=op_types_to_quantize)
        self.calibration_data_reader = calibration_data_reader
        self.percdamp = percdamp
        self.block_size = block_size
        self.actorder = actorder
        self.mse = mse
        self.perchannel = perchannel


class HQQWeightOnlyQuantConfig(WeightOnlyQuantConfig):
    def __init__(
        self,
        block_size=128,
        bits=4,
        axis=1,
        quant_format=QuantFormat.QOperator,
        op_types_to_quantize: tuple[str, ...] | None = None,
        quant_axes: tuple[tuple[str, int], ...] | None = None,
    ):
        """
        This is a class for HQQ algorithm Weight Only Quant Configuration.
        HQQ algorithm quantizes weight without needing calibration data.

        Args:
            block_size (int, optional):
                channel number in one block to execute a HQQ quantization iteration.
            bits (int, optional):
                how many bits to represent weight.
            axis (int, optional):
                0 or 1, the axis to quantize. See https://arxiv.org/pdf/2309.15531.pdf
            quant_format (QuantFormat{QOperator, QDQ}, optional):
                QOperator format quantizes the model with quantized operators directly.
                QDQ format quantizes the model by inserting QuantizeLinear/DeQuantizeLinear on the tensor.
                Defaults to QuantFormat.QOperator.
            op_types_to_quantize (optional):
                set of operator types to quantize.
            quant_axes (dict[str, int], optional):
                op:axis, which axis to quantize for an op. Default {MatMul: 0, Gather: 1}
        """
        assert quant_format == QuantFormat.QOperator, "HQQ only supports QOperator format"
        super().__init__(
            algorithm="HQQ",
            quant_format=quant_format,
            op_types_to_quantize=op_types_to_quantize,
            quant_axes=quant_axes,
        )
        self.block_size = block_size
        self.bits = bits
        self.axis = axis


class DefaultWeightOnlyQuantConfig(WeightOnlyQuantConfig):
    def __init__(
        self,
        block_size: int = 128,
        is_symmetric: bool = False,
        accuracy_level: int | None = None,
        quant_format=QuantFormat.QOperator,
        op_types_to_quantize: tuple[str, ...] | None = None,
        quant_axes: tuple[tuple[str, int], ...] | None = None,
        bits: int = 4,
    ):
        """
        This is a class for weight only affine quantization configuration.

        Args:
            block_size (int, optional):
                channel number in one block to execute an affine quantization iteration.
            is_symmetric (bool, optional):
                whether quantize weight symmetrically.
            accuracy_level (int, optional):
                Accuracy level of the 4-bit quantized MatMul computation.
                Refer to the MatMulNBits contrib op's 'accuracy_level' attribute for details.
                (https://github.com/microsoft/onnxruntime/blob/main/docs/ContribOperators.md#commicrosoftmatmulnbits)
            quant_format (QuantFormat{QOperator, QDQ}, optional):
                QOperator format quantizes the model with quantized operators directly.
                QDQ format quantizes the model by inserting QuantizeLinear/DeQuantizeLinear on the tensor.
                Defaults to QuantFormat.QOperator.
            op_types_to_quantize (optional):
                set of operator types to quantize.
            quant_axes (dict[str, int], optional):
                op:axis, which axis to quantize for an op. Default {MatMul: 0, Gather: 1}
            bits (int, optional):
                number of bits per element after quantization. Default 4.
        """
        super().__init__(
            algorithm="DEFAULT",
            quant_format=quant_format,
            op_types_to_quantize=op_types_to_quantize,
            quant_axes=quant_axes,
        )
        self.block_size = block_size
        self.is_symmetric = is_symmetric
        self.bits = bits
        self.accuracy_level = accuracy_level
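

# Editorial note: a typical programmatic setup (illustrative, not from the original
# source) would be DefaultWeightOnlyQuantConfig(block_size=32, is_symmetric=True),
# mirroring the defaults wired up by the CLI entry point at the bottom of this file.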
 fdd	Zdd Zdd	 Z  ZS )NVAWQWeightOnlyQuantConfigcnn./cacheawq_litec                   s$  zddl }ddlm} || _ || _W n ty"   td tddw zddlm} || _W n ty>   td tddw zdd	lm}m	}	 || _|	| _	W n ty_   td
 tddw t
 jdtjddd | j | j j rvdnd}
| j|||ddd|
dddddd}|| _|| _dS )a=  
        Configuration for the nvidia_awq quantization method.

        Args:
            tokenizer_dir (str): path of the tokenizer dir.
            dataset_name (str): Name of the dataset.
            cache_dir (str): Directory for caching.
            calibration_method (str): calibration method for nvidia_awq.
        """
        try:
            import torch
            from torch.utils.data import DataLoader

            self.torch = torch
            self.DataLoader = DataLoader
        except ImportError:
            print("Error: The 'torch' library is required but not installed. Please install it using 'pip install torch'.")
            raise ImportError("torch is not installed. Exiting.") from None

        try:
            from datasets import load_dataset

            self.load_dataset = load_dataset
        except ImportError:
            print("Error: The 'datasets' library is required but not installed. Please install it using 'pip install datasets'.")
            raise ImportError("datasets is not installed. Exiting.") from None

        try:
            from transformers import AutoConfig, AutoTokenizer

            self.AutoConfig = AutoConfig
            self.AutoTokenizer = AutoTokenizer
        except ImportError:
            print(
                "Error: The 'transformers' library is required but not installed. "
                "Please install it using 'pip install transformers'."
            )
            raise ImportError("transformers is not installed. Exiting.") from None

        super().__init__(algorithm="nvidia_awq", quant_format=QuantFormat.QDQ)

        device = "cuda" if self.torch.cuda.is_available() else "cpu"
        calib_inputs = self.get_calib_inputs(
            dataset_name=dataset_name,
            model_name=tokenizer_dir,
            cache_dir=cache_dir,
            calib_size=32,
            batch_size=1,
            block_size=512,
            device=device,
            use_fp16=True,
            use_buffer_share=False,
            add_past_kv_inputs=True,
            max_calib_rows_to_load=128,
            add_position_ids=True,
        )
        self.calibration_data_reader = calib_inputs
        self.calibration_method = calibration_method

    def make_model_input(
        self,
        config,
        input_ids_arg,
        attention_mask_arg,
        add_past_kv_inputs,
        device,
        use_fp16,
        use_buffer_share,
        add_position_ids,
    ):
        torch = self.torch

        input_ids = input_ids_arg
        attention_mask = attention_mask_arg
        if isinstance(input_ids_arg, list):
            input_ids = torch.tensor(input_ids_arg, device=device, dtype=torch.int64)
            attention_mask = torch.tensor(attention_mask_arg, device=device, dtype=torch.int64)

        inputs = {"input_ids": input_ids.contiguous(), "attention_mask": attention_mask.contiguous()}

        if add_position_ids:
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids.masked_fill_(attention_mask == 0, 1)
            inputs["position_ids"] = position_ids.contiguous()

        if add_past_kv_inputs:
            torch_dtype = torch.float16 if use_fp16 else torch.float32
            batch_size, sequence_length = input_ids.shape
            max_sequence_length = config.max_position_embeddings
            num_heads, head_size = config.num_key_value_heads, config.hidden_size // config.num_attention_heads
            for i in range(config.num_hidden_layers):
                past_key = torch.zeros(
                    batch_size,
                    num_heads,
                    max_sequence_length if use_buffer_share else 0,
                    head_size,
                    device=device,
                    dtype=torch_dtype,
                )
                past_value = torch.zeros(
                    batch_size,
                    num_heads,
                    max_sequence_length if use_buffer_share else 0,
                    head_size,
                    device=device,
                    dtype=torch_dtype,
                )
                inputs.update(
                    {
                        f"past_key_values.{i}.key": past_key.contiguous(),
                        f"past_key_values.{i}.value": past_value.contiguous(),
                    }
                )

        return inputs
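
    # Editorial note: when use_buffer_share is False, the past_key_values.*.key/value
    # tensors above are created with a zero-length sequence axis, the usual ONNX Runtime
    # convention for an initially empty KV cache.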
    def get_calib_inputs(
        self,
        dataset_name,
        model_name,
        cache_dir,
        calib_size,
        batch_size,
        block_size,
        device,
        use_fp16,
        use_buffer_share,
        add_past_kv_inputs,
        max_calib_rows_to_load,
        add_position_ids,
    ):
        auto_config = self.AutoConfig
        auto_tokenizer = self.AutoTokenizer
        load_dataset = self.load_dataset

        config = auto_config.from_pretrained(model_name, use_auth_token=True, cache_dir=cache_dir, trust_remote_code=True)
        tokenizer = auto_tokenizer.from_pretrained(model_name, use_auth_token=True, cache_dir=cache_dir, trust_remote_code=True)
        tokenizer.add_special_tokens({"pad_token": "[PAD]"})
        tokenizer.pad_token = tokenizer.eos_token

        assert calib_size <= max_calib_rows_to_load, "calib size should be no more than max_calib_rows_to_load"

        if "cnn" in dataset_name:
            dataset2 = load_dataset("cnn_dailymail", name="3.0.0", split="train").select(range(max_calib_rows_to_load))
            column = "article"
        elif "pile" in dataset_name:
            dataset2 = load_dataset("mit-han-lab/pile-val-backup", split="validation")
            column = "text"
        else:
            raise ValueError(f'dataset "{dataset_name}" not supported')

        dataset2 = dataset2[column][:calib_size]
        batch_encoded = tokenizer.batch_encode_plus(
            dataset2, return_tensors="pt", padding=True, truncation=True, max_length=block_size
        ).to(device)
        batch_encoded_input_ids = batch_encoded["input_ids"]
        batch_encoded_attention_mask = batch_encoded["attention_mask"]

        data_loader = self.DataLoader
        calib_dataloader_input_ids = data_loader(batch_encoded_input_ids, batch_size=batch_size, shuffle=False)
        calib_dataloader_attention_mask = data_loader(batch_encoded_attention_mask, batch_size=batch_size, shuffle=False)

        assert len(calib_dataloader_input_ids.dataset) == len(calib_dataloader_attention_mask.dataset)
        assert len(calib_dataloader_input_ids) == len(calib_dataloader_attention_mask)

        number_of_batched_samples = calib_size // batch_size

        batched_input_ids = []
        for idx, data in enumerate(calib_dataloader_input_ids):
            batched_input_ids.append(data)
            if idx == (number_of_batched_samples - 1):
                break

        batched_attention_mask = []
        for idx, data in enumerate(calib_dataloader_attention_mask):
            batched_attention_mask.append(data)
            if idx == (number_of_batched_samples - 1):
                break

        print(
            f"\n--Quantize-Script-- number_of_batched_samples={number_of_batched_samples}, "
            f"batch-input-ids-list-len={len(batched_input_ids)}, batched_attention_mask={len(batched_attention_mask)}\n"
        )

        batched_inputs_list = []
        for i in range(number_of_batched_samples):
            input_ids = batched_input_ids[i]
            attention_mask = batched_attention_mask[i]

            inputs = self.make_model_input(
                config, input_ids, attention_mask, add_past_kv_inputs, device, use_fp16, use_buffer_share, add_position_ids
            )
            inputs = {input_name: torch_tensor.cpu().numpy() for input_name, torch_tensor in inputs.items()}
            batched_inputs_list.append(inputs)

        print(f"\n--Quantize-Script-- number of batched inputs = {len(batched_inputs_list)}\n")
        return batched_inputs_list


def is_divisible(val1, val2):
    return int(val2 * np.ceil(val1 / val2)) == val1


class HQQWeightOnlyQuantizer:
    def __init__(self, config: HQQWeightOnlyQuantConfig):
        self.config = config

    # Proximal solver: minimizes ||weight - dequantize(quantize(weight))||_p^p
    @staticmethod
    def optimize_weights(tensor, scale, zero, min_max: list[int], axis: int = 0, opt_params: dict | None = None, verbose=False):
        import torch

        opt_params = {"lp_norm": 0.7, "beta": 1e1, "kappa": 1.01, "iters": 20} if opt_params is None else opt_params
        lp_norm, beta, kappa, iters = opt_params["lp_norm"], opt_params["beta"], opt_params["kappa"], opt_params["iters"]

        dtype = torch.float16 if tensor.is_cuda else torch.float32
        w_f = tensor.to(dtype)
        scale = scale.to(dtype)
        zero = zero.to(dtype)

        def shrink_op(x, beta, p=lp_norm):
            if p == 1:
                return torch.sign(x) * torch.nn.functional.relu(torch.abs(x) - 1.0 / beta)
            return torch.sign(x) * torch.nn.functional.relu(torch.abs(x) - (1.0 / beta) * torch.pow(torch.abs(x) + 1e-8, p - 1))

        best_error = 1e4
        for i in range(iters):
            w_q = torch.round(w_f * scale + zero).clamp(min_max[0], min_max[1])
            w_r = (w_q - zero) / scale
            w_e = shrink_op(w_f - w_r, beta)
            zero = torch.mean(w_q - (w_f - w_e) * scale, axis=axis, keepdim=True)
            beta *= kappa

            current_error = float(torch.abs(w_f - w_r).mean())
            if verbose:
                print(i, np.round(current_error, 6))
            if current_error < best_error:
                best_error = current_error
            else:
                break

        del w_f, w_q, w_r, w_e
        return scale, zero

    @staticmethod
    def pack_on_row_fast_248bit(pack_tensor, ori_int_tensor, bits):
        if pack_tensor.shape[0] == ori_int_tensor.shape[0]:
            ori_int_tensor = ori_int_tensor.T
            pack_tensor = pack_tensor.T
        if bits in [2, 4, 8]:
            compress_ratio = pack_tensor.element_size() * 8 // bits
            for j in range(compress_ratio):
                pack_tensor[0:] |= ori_int_tensor[j::compress_ratio] << (bits * j)
        else:
            raise NotImplementedError("Only 2,4,8 bits are supported.")

    def quantize_internal(self, tensor, bits=4, channel_wise=True, group_size=64, optimize=True, round_zero=True, axis=1):
        import torch

        weight = tensor.float()
        ori_shape = weight.shape

        pad_len = (group_size - ori_shape[axis] % group_size) % group_size
        if axis == 1:
            weight = torch.nn.functional.pad(weight, (0, pad_len), "constant", 0)
        else:
            weight = torch.nn.functional.pad(weight, (0, 0, 0, pad_len), "constant", 0)
        shape = weight.shape

        # Reshape for grouping along the requested axis
        if (group_size is not None) and channel_wise:
            weight = weight.reshape([-1, group_size]) if (axis == 1) else weight.reshape([group_size, -1])

        # Get min/max values
        if not channel_wise:
            _min, _max = weight.min(), weight.max()
            optimize = False
        else:
            _min = weight.min(axis=axis, keepdim=True)[0]
            _max = weight.max(axis=axis, keepdim=True)[0]

        max_v = round(2**bits - 1)
        min_v = 0
        min_max = [min_v, max_v]

        # Clamp to avoid half-precision problems
        scale = (max_v / (_max - _min)).clamp(max=2e4)
        min_max_axis = _max - _min
        if (min_max_axis == 0).sum().item() > 0:
            min_max_axis[min_max_axis == 0] = max_v
            scale = (max_v / min_max_axis).clamp(max=2e4)
        zero = -_min * scale

        if round_zero:
            zero = torch.round(zero)

        # Fine-tune scale and zero point
        if optimize:
            scale, zero = self.optimize_weights(tensor=weight, scale=scale, zero=zero, min_max=min_max, axis=axis)

        # Quantize
        w_q = torch.round(weight * scale + zero).clamp(min_max[0], min_max[1])
        w_q = w_q.reshape(shape).int()

        scale = 1.0 / scale
        if axis == 1:
            scale = scale.reshape(shape[0], -1)
            zero = zero.reshape(shape[0], -1)
        else:
            scale = scale.reshape(-1, shape[-1])
            zero = zero.reshape(-1, shape[-1])
        del weight, _min, _max
        return w_q, scale.to(tensor.dtype), zero.to(tensor.dtype)

    def quantize(self, node: NodeProto, graph_stack: list[GraphProto]) -> list[NodeProto]:
        """
        Target node:        QOperator node:            QDQ nodes:
        MatMul              MatMulNBits                DeQuantizeLinear -> MatMul
        Gather              GatherBlockQuantized       Gather, Gather, Gather (optional) -> DequantizeLinear
        If the node is a target node with fp32 or fp16 const weight, quantize the weight to int4 and
        return the new nodes.
        If QOperator format, return the corresponding QOperator nodes.
        If QDQ format, return the corresponding QDQ nodes.
        Gather (quantized data) + Gather (scales) + Gather (optional, zero points) -> DequantizeLinear is
        not supported yet because Gather does not support int4 data.
        r   z/Gather quantization is not supported yet in HQQr   Nstart to quantize  ...r   2MatMul doesn't have const weight. Skip to quantizer   )MatMul weight is not 2D. Skip to quantize)r>   r  r   )rj   rX   rm   _Q_scales_zero_pointsKNr>   r6   MatMulNBits com.microsoftr   outputsr   domaincomplete quantization of r!  )&op_typer   r^   loggerinfor   inputget_initializeronnxnumpy_helperto_arrayr   ry   
from_numpyrP   re   r   r>   r  r   r6   rs   r   uint8rX   r   rQ   r   r  
from_arrayr   removeinitializerextendr   helper	make_nodeoutput)r    r  r  r^   input_bb_pbbs_graphb_arrayb_array_torchr>   quant_weight_torchscales_torchzero_points_torchpacked_sizepacked_torchscaleszero_pointsrowscolsr6   	blob_sizek_blocksb_quantr,  scales_tensorinput_names	zp_tensorkwargsmatmul_q_noder!   r!   r"   quantizev  s   












	zHQQWeightOnlyQuantizer.quantize)r   r:   )r   NF)r   r   r?   rA   r   r   )r;   Tr   TTr   r  r   r  r  r  r  )	r$   r%   r&   r#   staticmethodr   r   r  rP  r!   r!   r!   r"   r     s    
4

Ar   
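

# Editorial notes on the HQQ path above: optimize_weights is a proximal solver that
# alternates a shrinkage step on the dequantization error with a closed-form zero-point
# update, scaling beta by kappa per iteration until the mean |w - w_r| error stops
# improving (https://arxiv.org/pdf/2309.15531.pdf). pack_on_row_fast_248bit stores
# 8 // bits values per uint8, low bits first: for bits=4, a row [1, 2] packs to 0x21
# (1 | (2 << 4)). The final K x N weight becomes a (N, k_blocks, blob_size) uint8
# tensor with k_blocks = ceil(K / block_size) and blob_size = block_size * bits // 8,
# matching the MatMulNBits contrib op.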
def get_initializer(name, graph_path: list[GraphProto]) -> tuple[TensorProto, GraphProto]:
    for gid in range(len(graph_path) - 1, -1, -1):
        graph = graph_path[gid]
        for tensor in graph.initializer:
            if tensor.name == name:
                return tensor, graph
    return None, None


class DefaultWeightOnlyQuantizer:
    def __init__(self, config: DefaultWeightOnlyQuantConfig):
        self.config = config
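
    # Editorial note: qbits_block_quant below delegates the inner quantization loops to
    # the native onnxruntime kernels (quantize_matmul_4bits, quantize_matmul_8bits,
    # quantize_qdq_matmul_4bits) imported at the top of this file.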
    def qbits_block_quant(self, fp32weight: npt.ArrayLike) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
        """4b/8b quantize fp32 weight to int4/int8 using the C++ kernels."""
        qbits = self.config.bits
        kpack = 8 // qbits

        if len(fp32weight.shape) != 2:
            raise ValueError("Current int4 block quantization only supports 2D tensors!")
        rows, cols = fp32weight.shape

        block_size = self.config.block_size
        k_blocks = (rows + block_size - 1) // block_size

        if self.config.quant_format == QuantFormat.QOperator:
            blob_size = block_size // kpack
            padded_rows = k_blocks * block_size
            pad_len = padded_rows - rows
            if pad_len > 0:
                fp32weight = np.pad(fp32weight, ((0, pad_len), (0, 0)), "constant")

            # block wise quantization; each block comes from a single column
            packed = np.zeros((cols, k_blocks, blob_size), dtype="uint8")
            zero_point = np.zeros(cols * ((k_blocks + kpack - 1) // kpack), dtype="uint8")
            scales = np.zeros((cols * k_blocks), dtype=fp32weight.dtype)
            if qbits == 8:
                quantize_matmul_8bits(packed, fp32weight, scales, zero_point, block_size, cols, rows, self.config.is_symmetric)
            else:
                quantize_matmul_4bits(packed, fp32weight, scales, zero_point, block_size, cols, rows, self.config.is_symmetric)
        else:
            assert qbits == 4, "QDQ format only support 4 bits quantization"
            packed = np.zeros((rows * cols + 1) // 2, dtype="uint8")
            zero_point = np.zeros((cols * k_blocks + 1) // 2, dtype="uint8")
            scales = np.zeros((k_blocks, cols), dtype=fp32weight.dtype)
            quantize_qdq_matmul_4bits(packed, fp32weight, scales, zero_point, block_size, cols, rows, self.config.is_symmetric)

        return (packed, scales, zero_point)

    def quantize_matmul(self, node: NodeProto, graph_stack: list[GraphProto]) -> list[NodeProto]:
        """
        Quantize weight B of MatMul node to int4 or int8.
        Currently only supports 2D constant matrices and axis-0 blockwise quantization.
        """
        qbits = self.config.bits
        if self.config.quant_format == QuantFormat.QOperator:
            qtype = TensorProto.INT8 if self.config.is_symmetric else TensorProto.UINT8
        else:
            qtype = TensorProto.INT4 if self.config.is_symmetric else TensorProto.UINT4

        input_b = node.input[1]
        b_tensor, b_graph = get_initializer(input_b, graph_stack)
        if b_tensor is None:
            logger.info("MatMul doesn't have const weight. Skip to quantize")
            return [node]  # only care about constant weight

        b_ndarray = onnx.numpy_helper.to_array(b_tensor)
        if len(b_ndarray.shape) != 2:
            logger.info("MatMul weight is not 2D. Skip to quantize")
            return [node]  # can only process 2-D matrix

        packed, scales, zero_points = self.qbits_block_quant(b_ndarray)

        if self.config.quant_format == QuantFormat.QOperator:
            b_quant = onnx.numpy_helper.from_array(packed, b_tensor.name + f"_Q{qbits}")
            scales_tensor = onnx.numpy_helper.from_array(scales, b_tensor.name + "_scales")
        else:
            b_quant = onnx.helper.make_tensor(b_tensor.name + f"_DQ_Q{qbits}", qtype, b_ndarray.shape, packed.tobytes(), True)
            scales_tensor = onnx.numpy_helper.from_array(scales, b_tensor.name + "_DQ_scales")

        for input in b_graph.input:
            if input.name == input_b:
                b_graph.input.remove(input)
                break
        b_graph.initializer.extend([b_quant, scales_tensor])

        output_nodes = []
        if self.config.quant_format == QuantFormat.QOperator:
            input_names = [node.input[0], b_quant.name, scales_tensor.name]
            if not self.config.is_symmetric:
                zp_tensor = onnx.numpy_helper.from_array(zero_points, b_tensor.name + "_zero_points")
                input_names.append(zp_tensor.name)
                b_graph.initializer.extend([zp_tensor])
            kwargs = {}
            rows, cols = b_ndarray.shape
            kwargs["K"] = rows
            kwargs["N"] = cols
            kwargs["block_size"] = self.config.block_size
            kwargs["bits"] = qbits
            if self.config.accuracy_level is not None:
                kwargs["accuracy_level"] = self.config.accuracy_level

            matmul_qbit_node = onnx.helper.make_node(
                "MatMulNBits",
                inputs=input_names,
                outputs=[node.output[0]],
                name=node.name + f"_Q{qbits}" if node.name else "",
                domain="com.microsoft",
                **kwargs,
            )
            output_nodes.append(matmul_qbit_node)
        else:
            dq_input_names = [b_quant.name, scales_tensor.name]
            dq_output_names = [b_quant.name + "_output"]
            matmul_input_names = [node.input[0], dq_output_names[0]]
            matmul_output_names = [node.output[0]]
            if not self.config.is_symmetric:
                zp_tensor = onnx.helper.make_tensor(
                    b_tensor.name + "_DQ_zero_points", qtype, scales.shape, zero_points.tobytes(), True
                )
                dq_input_names.append(zp_tensor.name)
                b_graph.initializer.extend([zp_tensor])
            dq_kwargs = {"axis": 0, "block_size": self.config.block_size}
            dq_node = onnx.helper.make_node(
                "DequantizeLinear",
                inputs=dq_input_names,
                outputs=dq_output_names,
                name=node.name + f"_DQ_Q{qbits}" if node.name else "",
                **dq_kwargs,
            )
            matmul_node = onnx.helper.make_node(
                "MatMul",
                inputs=matmul_input_names,
                outputs=matmul_output_names,
                name=node.name + f"_matmul_Q{qbits}" if node.name else "",
            )
            output_nodes.extend([dq_node, matmul_node])

        return output_nodes

    @staticmethod
    def quant_slice_symmetric(data: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
        max_val = np.max(data, axis=1, keepdims=True)
        min_val = np.min(data, axis=1, keepdims=True)
        abs_max = np.where(np.abs(max_val) > np.abs(min_val), max_val, min_val)

        scale = abs_max / -8.0  # if max == min, max may be clipped
        quantized_slice = np.where(scale == 0, 0, data / scale).round().clip(-8, 7).astype(np.int8)
        return quantized_slice, scale

    @staticmethod
    def quant_slice_asymmetric(data: np.ndarray) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
        min_val = np.minimum(data.min(axis=1, keepdims=True), 0)
        max_val = np.maximum(data.max(axis=1, keepdims=True), 0)

        scale = (max_val - min_val) / 15.0
        zero_point = np.where(scale == 0, 8, -min_val / scale).round().clip(0, 15).astype(np.uint8)
        quantized_slice = np.where(scale == 0, 8, data / scale + zero_point).round().clip(0, 15).astype(np.uint8)
        return quantized_slice, scale, zero_point
    @staticmethod
    def pack_int8_to_int4(data: np.ndarray) -> np.ndarray:
        """Pack int8 data to int4 and store in uint8 ndarray."""
        data_flat = data.reshape(-1)
        if len(data_flat) % 2 != 0:
            data_flat = np.append(data_flat, 0)
        quant_data_int4 = (data_flat[0::2] & 0xF) | ((data_flat[1::2] & 0xF) << 4)
        return quant_data_int4.astype("uint8")
    @staticmethod
    def quantize_ndarray(
        data: np.ndarray,
        quantize_axis: int,
        block_size: int,
        is_symmetric: bool,
    ) -> tuple[np.ndarray, np.ndarray, np.ndarray | None]:
        """Quantize ndarray data to int4 using numpy, and return (quantized data, scales, zero points)."""
        # View the tensor as (m, k, n) with k the quantize axis
        m, k, n = 1, data.shape[quantize_axis], 1
        for i, dim in enumerate(data.shape):
            if i < quantize_axis:
                m *= dim
            elif i > quantize_axis:
                n *= dim

        k_blocks = (k + block_size - 1) // block_size
        scales_shape = list(data.shape)
        scales_shape[quantize_axis] = k_blocks

        data_reshape = data.reshape((m, k, n))
        scales = np.zeros((m, k_blocks, n), dtype=data.dtype)
        if is_symmetric:
            quant_data_int8 = np.zeros((m, k, n), dtype="int8")
        else:
            quant_data_int8 = np.zeros((m, k, n), dtype="uint8")
            zero_point_int8 = np.zeros((m, k_blocks, n), dtype="uint8")

        # slice and quantize
        for i in range(0, k, block_size):
            end_idx = min(i + block_size, k)
            slice = data_reshape[:, i:end_idx, :]

            if is_symmetric:
                quantized_slice_int8, scale_slice = DefaultWeightOnlyQuantizer.quant_slice_symmetric(slice)
            else:
                quantized_slice_int8, scale_slice, zero_point_slice_int8 = DefaultWeightOnlyQuantizer.quant_slice_asymmetric(
                    slice
                )

            quant_data_int8[:, i:end_idx, :] = quantized_slice_int8
            j = i // block_size
            scales[:, j : j + 1, :] = scale_slice
            if not is_symmetric:
                zero_point_int8[:, j : j + 1, :] = zero_point_slice_int8

        # pack int8 to int4
        quant_data_int4 = DefaultWeightOnlyQuantizer.pack_int8_to_int4(quant_data_int8)
        zero_point_int4 = None
        if not is_symmetric:
            zero_point_int4 = DefaultWeightOnlyQuantizer.pack_int8_to_int4(zero_point_int8)
        scales = scales.reshape(scales_shape)
        return quant_data_int4, scales, zero_point_int4

    def quantize_gather(self, node: NodeProto, graph_stack: list[GraphProto]) -> list[NodeProto]:
        """Quantize weight data of Gather node to int4."""
        assert self.config.quant_format == QuantFormat.QOperator, "Gather only supports QOperator format currently."

        qtype = TensorProto.INT4 if self.config.is_symmetric else TensorProto.UINT4
        data_arg = node.input[0]
        data_tensorproto, data_graphproto = get_initializer(data_arg, graph_stack)
        if data_tensorproto is None:
            logger.info("Gather doesn't have const weight. Skip quantization.")
            return [node]  # only care about constant weight

        data_ndarray = onnx.numpy_helper.to_array(data_tensorproto)
        data_rank = len(data_ndarray.shape)
        quantize_axis = self.config.quant_axes.get("Gather", 1)
        block_size = self.config.block_size

        assert -data_rank <= quantize_axis < data_rank, "Invalid quantize axis for Gather node."
        assert block_size >= 16 and ((block_size - 1) & block_size) == 0, "Invalid block size for Gather node."

        quantize_axis = (quantize_axis + data_rank) % data_rank
        quantized_data, scales, zero_points = self.quantize_ndarray(
            data_ndarray, quantize_axis, block_size, self.config.is_symmetric
        )

        for input in data_graphproto.input:
            if input.name == data_arg:
                data_graphproto.input.remove(input)
                break

        quantized_data_tensorproto = onnx.helper.make_tensor(
            data_tensorproto.name + "_Q4", qtype, data_ndarray.shape, quantized_data.tobytes(), True
        )
        scales_tensorproto = onnx.numpy_helper.from_array(scales, data_tensorproto.name + "_scales")
        input_names = [quantized_data_tensorproto.name, node.input[1], scales_tensorproto.name]
        data_graphproto.initializer.extend([quantized_data_tensorproto, scales_tensorproto])
        if not self.config.is_symmetric:
            zp_tensorproto = onnx.helper.make_tensor(
                data_tensorproto.name + "_zero_points", qtype, scales.shape, zero_points.tobytes(), True
            )
            input_names.append(zp_tensorproto.name)
            data_graphproto.initializer.extend([zp_tensorproto])

        try:
            gather_axis = onnx.helper.get_node_attr_value(node, "axis")
        except ValueError:
            gather_axis = 0

        kwargs = {"gather_axis": gather_axis, "quantize_axis": quantize_axis, "block_size": block_size}
        gather_q4_node = onnx.helper.make_node(
            "GatherBlockQuantized",
            inputs=input_names,
            outputs=[node.output[0]],
            name=node.name + "_Q4" if node.name else "",
            domain="com.microsoft",
            **kwargs,
        )

        return [gather_q4_node]

    def quantize(self, node: NodeProto, graph_stack: list[GraphProto]) -> list[NodeProto]:
        """
        Target node:        QOperator node:            QDQ nodes:
        MatMul              MatMulNBits                DeQuantizeLinear -> MatMul
        Gather              GatherBlockQuantized       Gather, Gather, Gather (optional) -> DequantizeLinear
        If the node is a target node with fp32 or fp16 const weight, quantize the weight to int4 and
        return the new nodes.
        If QOperator format, return the corresponding QOperator nodes.
        If QDQ format, return the corresponding QDQ nodes.
        Gather (quantized data) + Gather (scales) + Gather (optional, zero points) -> DequantizeLinear is
        not supported yet because Gather does not support int4 data.
        """
        logger.info(f"start to quantize {node.name} ...")
        bits = self.config.bits
        if node.op_type == "MatMul":
            if bits == 8 and self.config.quant_format == QuantFormat.QDQ:
                logger.error("MatMul only supports QOperator format for 8 bits quantization.")
                return [node]
            results = self.quantize_matmul(node, graph_stack)
        elif node.op_type == "Gather":
            if bits != 4:
                logger.error("Gather only supports 4 bits quantization.")
                return [node]
            results = self.quantize_gather(node, graph_stack)
        else:
            logger.error(f"Unsupported operator {node.op_type} for weight only quantization. Skip quantization.")
            return [node]
        logger.info(f"complete quantization of {node.name} with {bits} bits ...")
        return results


class NVAWQWeightOnlyQuantizer:
    def __init__(self, config: NVAWQWeightOnlyQuantConfig):
        self.config = config

    def quantize_awq(self, model: ModelProto | str) -> ModelProto:
        """
        Perform nvidia_awq quantization using ModelOpt's int4 quantize function.

        Args:
            model (ModelProto): The ONNX model to quantize.

        Returns:
            ModelProto: The quantized ONNX model.
        """
        try:
            from modelopt.onnx.quantization.int4 import quantize as quantize_int4
        except ImportError:
            print("Please ensure that the 'modelopt' package is installed. Please install it using pip install nvidia_modelopt.")
            raise ImportError("modelopt is not installed. Please install it using pip install nvidia_modelopt. Exiting.") from None

        logger.info("Starting nvidia_awq quantization...")

        # Prepare calibration inputs
        calib_inputs = self.config.calibration_data_reader

        # Perform quantization using ModelOpt's int4 quantize function
        quantized_model = quantize_int4(
            model,
            calibration_method=self.config.calibration_method,
            calibration_data_reader=calib_inputs,
        )

        logger.info("Completed nvidia_awq quantization.")
        return quantized_model


class MatMulNBitsQuantizer:
    """
    Target node:        QOperator node:            QDQ nodes:
    MatMul              MatMulNBits                DeQuantizeLinear -> MatMul
    Gather              GatherBlockQuantized       Gather, Gather, Gather (optional) -> DequantizeLinear

    Perform 4/8 bits quantization of constant weights for target nodes.
    If algo_config.quant_format is QOperator:
      - nodes are replaced by the corresponding QOperator nodes.
      - quantized weights are stored in the contrib ops.
    If algo_config.quant_format is QDQ:
      - the quantized weight is stored in a standard onnx node. For MatMul, it is DequantizeLinear. For Gather,
        it is the three Gathers, one for quantized data, one for scales and one for optional zero points.
      - The nodes are replaced by the corresponding QDQ nodes.
      - currently Gather is not supported in QDQ because Gather does not support int4 yet.
    Note:
      - for quantized gather, the memory usage of "DequantizeLinear + Gather" is the same as the original Gather
        during runtime. Therefore it is not recommended.
      - when a node is in nodes_to_exclude, the node configuration in algo_config.customized_weight_config will be ignored.
    """

    def __init__(
        self,
        model: ModelProto | str,
        block_size: int = 128,
        is_symmetric: bool = False,
        accuracy_level: int | None = None,
        nodes_to_exclude: list[str] | None = None,
        nodes_to_include: list[str] | None = None,
        quant_format=QuantFormat.QOperator,
        op_types_to_quantize: tuple[str, ...] | None = None,
        quant_axes: tuple[tuple[str, int], ...] | None = None,
        algo_config: WeightOnlyQuantConfig | None = None,
    ):
        if nodes_to_exclude is None:
            nodes_to_exclude = []
        self.model = ONNXModel(onnx.load(model)) if isinstance(model, str) else ONNXModel(model)
        self.model_path = model if isinstance(model, str) else None
        self.block_size = block_size
        self.is_symmetric = is_symmetric
        self.accuracy_level = accuracy_level
        self.nodes_to_exclude = set(nodes_to_exclude)
        self.nodes_to_include = set(nodes_to_include) if nodes_to_include else None
        self.node_quantizer = None
        if algo_config is None:
            algo_config = DefaultWeightOnlyQuantConfig(
                block_size=block_size,
                is_symmetric=is_symmetric,
                accuracy_level=accuracy_level,
                quant_format=quant_format,
                op_types_to_quantize=op_types_to_quantize,
                quant_axes=quant_axes,
                bits=4,
            )
        self.algo_config = algo_config
        if hasattr(self.algo_config, "bits"):
            assert self.algo_config.bits in [4, 8], "Only support 4 or 8 bits quantization"
        if algo_config.algorithm == "HQQ":
            self.node_quantizer = HQQWeightOnlyQuantizer(self.algo_config)
        elif algo_config.algorithm == "DEFAULT":
            self.node_quantizer = DefaultWeightOnlyQuantizer(self.algo_config)
        elif algo_config.algorithm == "nvidia_awq":
            self.node_quantizer = NVAWQWeightOnlyQuantizer(self.algo_config)

    def _process_subgraph(self, graph_stack: list[GraphProto]):
        new_nodes = []
        graph = graph_stack[-1]

        for node in graph.node:
            graph_attrs = [
                attr for attr in node.attribute if attr.type in (onnx.AttributeProto.GRAPH, onnx.AttributeProto.GRAPHS)
            ]
            if len(graph_attrs):
                kwargs = {}
                for attr in node.attribute:
                    if attr.type == onnx.AttributeProto.GRAPH:
                        # recurse into the subgraph
                        graph_stack.append(attr.g)
                        kv = {attr.name: self._process_subgraph(graph_stack)}
                    elif attr.type == onnx.AttributeProto.GRAPHS:
                        value = []
                        for subgraph in attr.graphs:
                            graph_stack.append(subgraph)
                            value.extend([self._process_subgraph(graph_stack)])
                        kv = {attr.name: value}
                    else:
                        kv = attribute_to_kwarg(attr)
                    kwargs.update(kv)
                node = onnx.helper.make_node(node.op_type, node.input, node.output, name=node.name, **kwargs)

            if node.name in self.nodes_to_exclude:
                logger.info(f"exclude to quantize {node.name} as specified by nodes_to_exclude...")
                out_nodes = [node]
            elif (self.nodes_to_include and node.name in self.nodes_to_include) or (
                node.op_type in self.algo_config.op_types_to_quantize
            ):
                out_nodes = self.node_quantizer.quantize(node, graph_stack)
            else:
                logger.info(f"skip to quantize {node.name} ...")
                out_nodes = [node]
            new_nodes.extend(out_nodes)

        graph.ClearField("node")
        graph.node.extend(new_nodes)
        graph_stack.pop()
        return graph

    def _generate_q4_node_config(self):
        """Generate weight only quant configuration for nodes."""
        q4_node_config = {}
        for node in self.model.model.graph.node:
            if node.op_type in ["MatMul"] and not all(self.model.get_initializer(i) is None for i in node.input):
                template_config_q4 = {
                    "bits": 4,
                    "group_size": self.block_size,
                    "scheme": "sym" if self.is_symmetric else "asym",
                }
                if self.algo_config.customized_weight_config and node.name in self.algo_config.customized_weight_config:
                    for key, value in self.algo_config.customized_weight_config[node.name].items():
                        if key in template_config_q4:
                            template_config_q4[key] = value
                q4_node_config[node.name] = template_config_q4
        return q4_node_config

    def int4_quant_algo(self):
        """4b quantize a model with RTN or GPTQ algorithm. Please refer to
        https://github.com/intel/neural-compressor/blob/master/docs/source/quantization_weight_only.md
        for more details on weight only quantization using Intel® Neural Compressor.
        """

        def inc_dataloader():
            data_reader = copy.deepcopy(self.algo_config.calibration_data_reader)
            for data in data_reader:
                yield data, None

        kwargs = {}
        if self.accuracy_level is not None:
            kwargs["accuracy_level"] = self.accuracy_level
        weight_only_node_config = self._generate_q4_node_config()

        algorithm = self.algo_config.algorithm
        logger.info(f"start to quantize model with {algorithm} algorithm...")
        if algorithm == "RTN":
            from neural_compressor.adaptor.ox_utils.weight_only import rtn_quantize

            kwargs["ratios"] = self.algo_config.ratios
            self.model = rtn_quantize(
                model=self.model_path if self.model_path is not None else self.model.model,
                weight_config=weight_only_node_config,
                **kwargs,
            )
        elif algorithm == "GPTQ":
            from neural_compressor.adaptor.ox_utils.weight_only import gptq_quantize

            kwargs["percdamp"] = self.algo_config.percdamp
            kwargs["blocksize"] = self.algo_config.block_size
            kwargs["actorder"] = self.algo_config.actorder
            kwargs["mse"] = self.algo_config.mse
            kwargs["perchannel"] = self.algo_config.perchannel
            kwargs["n_samples"] = -1
            dataloader = inc_dataloader()

            self.model = gptq_quantize(
                model=self.model_path if self.model_path is not None else self.model.model,
                weight_config=weight_only_node_config,
                dataloader=dataloader,
                **kwargs,
            )
        logger.info(f"complete quantization of model with {algorithm} algorithm.")

    def process(self):
        if self.algo_config.algorithm in ["HQQ", "DEFAULT"]:
            # use a stack to keep track of sub-graphs
            graph_stack = [self.model.graph()]

            # update the domain opset
            if self.algo_config.quant_format == QuantFormat.QOperator:
                self.model.set_opset_import("com.microsoft", 1)

            if self.algo_config.quant_format == QuantFormat.QDQ or "Gather" in self.algo_config.op_types_to_quantize:
                opset_import = self.model.opset_import()
                for opset in opset_import:
                    if opset.domain in [None, "ai.onnx", ""] and opset.version < 21:
                        logger.warning(
                            "The opset of the input model is under 21 and doesn't support int4 data type. "
                            "Force to update it to opset 21, but the generated model may not be a valid model."
                        )
                        self.model.set_opset_import(opset.domain, 21)
            self._process_subgraph(graph_stack)
            self.model.clean_initializers()
        elif self.algo_config.algorithm == "nvidia_awq":
            logger.info("Processing nvidia_awq quantization...")
            self.model = self.node_quantizer.quantize_awq(self.model.model if self.model_path is None else self.model_path)
            logger.info("Completed nvidia_awq quantization.")
            self.model = ONNXModel(self.model)
            self.model.clean_initializers()
        else:
            # use Intel® Neural Compressor for the RTN or GPTQ weight-only quantize algorithm
            try:
                importlib.import_module("neural_compressor")
            except Exception as e:
                logging.error(f"{e}.")
                raise RuntimeError("neural-compressor is not correctly installed. Please check your environment.") from e

            import neural_compressor

            assert version.parse(neural_compressor.__version__) >= version.parse("2.3.2"), (
                "Require neural-compressor >= 2.3.2 to support weight only quantization!"
            )
            self.int4_quant_algo()


def ort_convert_str_to_bool(value):
    return value.lower() in ("true", "1")


def parse_key_value_pair(s):
    key, value = s.split(":")
    return key, int(value)


def parse_args():
    parser = argparse.ArgumentParser(
        description="""Blockwise int4 quantization for MatMul 2D weight matrices.

A weight matrix is partitioned into blocks, where each block is a
contiguous subset inside each column. Each block is quantized into a
set of 4b integers with a scaling factor and an optional offset.
"""
    )

    parser.add_argument("--input_model", required=True, help="Path to the input model file")
    parser.add_argument("--output_model", required=True, help="Path to the output model file")
    parser.add_argument("--block_size", required=False, default=32, type=int, help="Block size for quantization")
    parser.add_argument(
        "--quant_method",
        default="default",
        type=str,
        choices=["default", "hqq", "rtn", "gptq", "nvidia_awq"],
        help="the algorithm used to quantize weight, rtn and gptq leverage Intel® Neural Compressor",
    )
    parser.add_argument("--bits", default=4, type=int, help="the target bits to represent weight")
    parser.add_argument(
        "--symmetric",
        required=False,
        default=True,
        const=True,
        nargs="?",
        type=ort_convert_str_to_bool,
        choices=[True, False],
        help="Indicate whether to quantize the model symmetrically, symmetric is not supported by hqq",
    )
    parser.add_argument(
        "--accuracy_level",
        required=False,
        type=int,
        help="Accuracy level of the 4-bit quantized MatMul computation. "
        "Refer to the MatMulNBits contrib op's 'accuracy_level' attribute for details "
        "(https://github.com/microsoft/onnxruntime/blob/main/docs/ContribOperators.md#commicrosoftmatmulnbits).",
    )
    parser.add_argument("-v", "--verbose", required=False, action="store_true")
    parser.set_defaults(verbose=False)
    parser.add_argument(
        "--nodes_to_exclude",
        nargs="+",
        type=str,
        required=False,
        default=[],
        help="Specify the nodes to be excluded from quantization with node names",
    )
    parser.add_argument(
        "--nodes_to_include",
        nargs="+",
        type=str,
        required=False,
        help="Specify the specific nodes to be included from quantization with node names",
    )
    parser.add_argument(
        "--quant_format",
        default="QOperator",
        type=str,
        choices=["QOperator", "QDQ"],
        help="QuantFormat {QOperator, QDQ}. "
        "QOperator format quantizes the model with quantized operators directly. "
        "QDQ format quantizes the model by inserting DeQuantizeLinear before the MatMul.",
    )
    parser.add_argument(
        "--op_types_to_quantize",
        default="MatMul",
        type=str,
        nargs="+",
        choices=["MatMul", "Gather"],
        help="op_types_to_quantize {MatMul, Gather}. Operators to quantize. Default is MatMul.",
    )
    parser.add_argument(
        "--quant_axes",
        type=parse_key_value_pair,
        nargs="+",
        required=False,
        help="Key-value pairs in op_type:axis_to_quantize separated by space. "
        "Specify the axis to quantize for an op. Default {MatMul:0, Gather:1}. "
        "Example: --quant_axes MatMul:0 Gather:1",
    )

    nv_awq_config = parser.add_argument_group("nvidia_awq", "Arguments specific to nvidia_awq quantization")
    nv_awq_config.add_argument(
        "--calib_dataset_name", type=str, default="cnn", help="Name of the calibration dataset for nvidia_awq."
    )
    nv_awq_config.add_argument("--tokenizer_dir", type=str, required=False, help="Path of the tokenizer dir.")
    nv_awq_config.add_argument(
        "--calibration_method",
        type=str,
        required=False,
        choices=["awq", "awq_clip"],
        help="Support two options, awq implementation and weight clipping.",
    )
    nv_awq_config.add_argument("--cache_dir", type=str, default="./cache", help="Cache directory for calibration data.")

    return parser.parse_args()


if __name__ == "__main__":
    args = parse_args()
    if args.verbose:
        logger.setLevel(logging.DEBUG)

    input_model_path = args.input_model
    output_model_path = args.output_model
    quant_format = QuantFormat[args.quant_format]
    op_types_to_quantize = tuple(args.op_types_to_quantize) if args.op_types_to_quantize else None
    quant_axes = tuple(args.quant_axes) if args.quant_axes else None

    if os.path.exists(output_model_path):
        logger.error(f"file {output_model_path} already exists")
        raise Exception(f"file {output_model_path} already exists")

    if args.symmetric and args.quant_method == "hqq":
        logger.warning("Symmetric is not supported by hqq, will force to symmetric=False")
        args.symmetric = False

    model = onnx.load(input_model_path)
    if args.quant_method == "hqq":
        quant_config = HQQWeightOnlyQuantConfig(
            block_size=args.block_size, bits=args.bits, op_types_to_quantize=op_types_to_quantize, quant_axes=quant_axes
        )
    elif args.quant_method == "default":
        quant_config = DefaultWeightOnlyQuantConfig(
            block_size=args.block_size,
            is_symmetric=args.symmetric,
            accuracy_level=args.accuracy_level,
            quant_format=quant_format,
            op_types_to_quantize=op_types_to_quantize,
            quant_axes=quant_axes,
            bits=args.bits,
        )
    elif args.quant_method == "rtn":
        quant_config = RTNWeightOnlyQuantConfig(op_types_to_quantize=op_types_to_quantize)
    elif args.quant_method == "gptq":
        quant_config = GPTQWeightOnlyQuantConfig(block_size=args.block_size, op_types_to_quantize=op_types_to_quantize)
    elif args.quant_method == "nvidia_awq":
        if quant_format == QuantFormat.QOperator:
            logger.warning("QOperator is not applicable to nvidia_awq. overriding the value to QDQ")
            quant_format = QuantFormat.QDQ

        model = input_model_path
        if args.calibration_method is not None:
            calibration_method = "awq_lite" if args.calibration_method == "awq" else "awq_clip"
        else:
            calibration_method = "awq_lite"

        quant_config = NVAWQWeightOnlyQuantConfig(
            dataset_name=args.calib_dataset_name,
            tokenizer_dir=args.tokenizer_dir,
            cache_dir=args.cache_dir,
            calibration_method=calibration_method,
        )
    else:
        raise ValueError(f"Unsupported quantization method: {args.quant_method}")

    quant = MatMulNBitsQuantizer(
        model,
        accuracy_level=args.accuracy_level,
        nodes_to_exclude=args.nodes_to_exclude,
        nodes_to_include=args.nodes_to_include,
        algo_config=quant_config,
    )
    quant.process()
    quant.model.save_model_to_file(output_model_path, True)