o
    3Ih3+                     @   s   d dl Z d dlZd dlZd dlZd dlZd dlZd dlmZm	Z	m
Z
mZ d dlmZmZ G dd deZdd Zdd	 Zd
d ZedkrIe  dS dS )    N)QuantFormat	QuantTypeStaticQuantConfigquantize)CalibrationDataReaderCalibrationMethodc                   @   s*   e Zd Zdd ZdefddZdd ZdS )	OnnxModelCalibrationDataReaderc           
         s   t j|_fddt jD }t| }g }|D ]1 i } fddtt  D }fdd|D }t	||ddD ]	\}}	|	||j
< qA|| qt|t|ks[J t|d t|ksgJ t|_d S )Nc                    s&   g | ]}| d rtj j|qS )test_data_set_)
startswithospathjoin	model_dir.0aself o/home/air/sanwanet/gpt-api/venv/lib/python3.10/site-packages/onnxruntime/quantization/static_quantize_runner.py
<listcomp>   s
    
z;OnnxModelCalibrationDataReader.__init__.<locals>.<listcomp>c                    s   g | ]	}t j |qS r   )r   r   r   r   )data_dirr   r   r      s    c                    s   g | ]}  |qS r   )read_onnx_pb_data)r   	data_pathr   r   r   r      s    F)strictr   )r   r   dirnamer   listdironnxruntimeInferenceSession
get_inputssortedzipnameappendlenitercalibration_data)
r   
model_path	data_dirsmodel_inputsname2tensorsname2tensor
data_pathsdata_ndarraysmodel_inputdata_ndarrayr   )r   r   r   __init__   s    

z'OnnxModelCalibrationDataReader.__init__returnc                 C   s   t | jdS )z9generate the input data dict for ONNXinferenceSession runN)nextr&   r   r   r   r   get_next!   s   z'OnnxModelCalibrationDataReader.get_nextc                 C   sP   t  }t|d}||  W d    n1 sw   Y  t j|}|S )Nrb)onnxTensorProtoopenParseFromStringreadnumpy_helperto_array)r   file_pbtensorfretr   r   r   r   %   s   z0OnnxModelCalibrationDataReader.read_onnx_pb_dataN)__name__
__module____qualname__r0   dictr3   r   r   r   r   r   r      s    r   c                  C   s  t jdd} | jddddd | jdd	dd
d | jdg dddd | jdg dddd | jdddd | jdddd | jdddd | jdddd | jdddd | jdd g d!d" | jd#d$g d%d&d' | jd(d)d)d*gd+d' | jd,dd-d | jd.dd/d | jd0dd1d | jd2dd3d | jd4td5d6d7 | jd8dd9d | jd:dd;d | jd<dd=d | jd>d d d?d" | jd@d d dAd" | jdBdCdDdEg dFdG | jdHdIdJ |  S )KNz%The arguments for static quantization)descriptionz-iz--input_model_pathTzPath to the input onnx model)requiredhelpz-oz--output_quantized_model_pathz'Path to the output quantized onnx modelz--activation_typeqint8quint8qint16quint16qint4quint4qfloat8e4m3fnrI   z!Activation quantization type used)choicesdefaultrF   z--weight_typerH   zWeight quantization type usedz--enable_subgraph
store_truez#If set, subgraph will be quantized.)actionrF   z--force_quantize_no_input_checka   By default, some latent operators like maxpool, transpose, do not quantize if their input is not quantized already. Setting to True to force such operator always quantize input and so generate quantized output. Also the True behavior could be disabled per node using the nodes_to_exclude.z--matmul_const_b_onlyz3If set, only MatMul with const B will be quantized.z--add_qdq_pair_to_weightzjIf set, it remains floating-point weight and inserts both QuantizeLinear/DeQuantizeLinear nodes to weight.z--dedicated_qdq_pairzFIf set, it will create identical and dedicated QDQ pair for each node.z)--op_types_to_exclude_output_quantization+z]If any op type is specified, it won't quantize the output of ops with this specific op types.)nargsrP   rF   z--calibration_methodminmaxrU   entropy
percentiledistributionzCalibration method used)rP   rO   rF   z--quant_formatqdq	qoperatorzQuantization format usedz--calib_tensor_range_symmetriczoIf enabled, the final range of tensor during calibration will be explicitly set to symmetric to central point 0z--calib_moving_averagezIf enabled, the moving average of the minimum and maximum values will be computed when the calibration method selected is MinMax.z--disable_quantize_biaszWhether to quantize floating-point biases by solely inserting a DeQuantizeLinear node If not set, it remains floating-point bias and does not insert any quantization nodes associated with biases.z--use_qdq_contrib_opszIf set, the inserted QuantizeLinear and DequantizeLinear ops will have the com.microsoft domain, which forces use of ONNX Runtime's QuantizeLinear and DequantizeLinear contrib op implementations.z--minimum_real_rangeg-C6?a  If set to a floating-point value, the calculation of the quantization parameters (i.e., scale and zero point) will enforce a minimum range between rmin and rmax. If (rmax-rmin) is less than the specified minimum range, rmax will be set to rmin + MinimumRealRange. This is necessary for EPs like QNN that require a minimum floating-point range when determining  quantization parameters.)typerP   rF   z --qdq_keep_removable_activationsz|If set, removable activations (e.g., Clip or Relu) will not be removed, and will be explicitly represented in the QDQ model.z*--qdq_disable_weight_adjust_for_int32_biaszIf set, QDQ quantizer will not adjust the weight's scale when the bias has a scale (input_scale * weight_scale) that is too small.z--per_channelz&Whether using per-channel quantizationz--nodes_to_quantizezfList of nodes names to quantize. When this list is not None only the nodes in this list are quantized.z--nodes_to_excludeznList of nodes names to exclude. The nodes in this list will be excluded from quantization when it is not None.z--op_per_channel_axis   r#   )OP_TYPEPER_CHANNEL_AXISa8  Set channel axis for specific op type, for example: --op_per_channel_axis MatMul 1, and it's effective only when per channel quantization is supported and per_channel is True. If specific op type supports per channel quantization but not explicitly specified with channel axis, default channel axis will be used.)rT   rR   metavarrP   rF   z--tensor_quant_overridesz4Set the json file for tensor quantization overrides.)rF   )argparseArgumentParseradd_argumentfloat
parse_args)parserr   r   r   parse_arguments-   s   	

rg   c                 C   s   | si S t | }t|}W d    n1 sw   Y  |D ]}|| D ]}tj|d tjd|d< t|d |d< q%q|S )Nscale)dtype
zero_point)r7   jsonloadnparrayfloat32)filer>   quant_override_dictr=   enc_dictr   r   r   get_tensor_quant_overrides   s   
rs   c            
      C   s  t  } t| jd}tjtjtjtjtjtj	tj
d}|| j }|| j }t| j}| j| j| j| j| j| j|| j| j| j | j| j| j| jt| jd}tjtj tj!tj"d}t#j$t#j%d}t&||| j' || j( ||d | j)| j*| j+ddd |d}	t,| j| j-|	d d S )	N)r'   rG   )EnableSubgraphForceQuantizeNoInputCheckMatMulConstBOnlyAddQDQPairToWeight"OpTypesToExcludeOutputQuantizationDedicatedQDQPair QDQOpTypePerChannelSupportToAxisCalibTensorRangeSymmetricCalibMovingAverageQuantizeBiasUseQDQContribOpsMinimumRealRangeQDQKeepRemovableActivations"QDQDisableWeightAdjustForInt32BiasTensorQuantOverridesrV   )rZ   r[   F)calibration_data_readercalibrate_methodquant_formatactivation_typeweight_typeop_types_to_quantizenodes_to_quantizenodes_to_excludeper_channelreduce_rangeuse_external_data_formatcalibration_providersextra_options)r.   model_outputquant_config).rg   r   input_model_pathr   QInt8QUInt8QInt16QUInt16QInt4QUInt4QFLOAT8E4M3FNr   r   rC   op_per_channel_axisenable_subgraphforce_quantize_no_input_checkmatmul_const_b_onlyadd_qdq_pair_to_weight'op_types_to_exclude_output_quantizationdedicated_qdq_paircalib_tensor_range_symmetriccalib_moving_averagedisable_quantize_biasuse_qdq_contrib_opsminimum_real_rangeqdq_keep_removable_activations(qdq_disable_weight_adjust_for_int32_biasrs   tensor_quant_overridesr   MinMaxEntropy
PercentileDistributionr   QDQ	QOperatorr   calibration_methodr   r   r   r   r   output_quantized_model_path)
argsdata_readerarg2quant_typer   r   'qdq_op_type_per_channel_support_to_axisr   arg2calib_methodarg2quant_formatsqcr   r   r   main   sj   
	

r   __main__)ra   rk   r   numpyrm   r5   r   onnxruntime.quantizationr   r   r   r   "onnxruntime.quantization.calibrater   r   r   rg   rs   r   r@   r   r   r   r   <module>   s       	=
