import logging

import numpy as np
import onnx
import onnx.numpy_helper
from onnx import onnx_pb as onnx_proto

from .base_quantizer import BaseQuantizer, QuantizationParams
from .calibrate import TensorData
from .onnx_model import ONNXModel
from .quant_utils import (
    TENSOR_NAME_QUANT_SUFFIX,
    QuantizationMode,
    QuantizedValue,
    QuantizedValueType,
    __producer__,
    __version__,
    add_infer_metadata,
    attribute_to_kwarg,
    compute_scale_zp,
    compute_scale_zp_float8,
    find_by_name,
    get_qmin_qmax_for_qType,
    get_qrange_for_qType,
    ms_domain,
    save_and_reload_model_with_shape_infer,
    tensor_proto_to_array,
)
from .registry import CreateOpQuantizer
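
# A minimal driver sketch (an illustrative assumption, not part of this module):
# ONNXQuantizer is normally constructed by the public helpers
# onnxruntime.quantization.quantize_static / quantize_dynamic rather than
# directly. Something along these lines drives dynamic quantization:
#
#     quantizer = ONNXQuantizer(
#         model,                                   # onnx.ModelProto
#         per_channel=False,
#         reduce_range=False,
#         mode=QuantizationMode.IntegerOps,
#         static=False,                            # dynamic: scales computed at run time
#         weight_qType=onnx_proto.TensorProto.INT8,
#         activation_qType=onnx_proto.TensorProto.UINT8,
#         tensors_range=None,                      # only needed for static quantization
#         nodes_to_quantize=[],
#         nodes_to_exclude=[],
#         op_types_to_quantize=["MatMul", "Gemm"],
#         extra_options={},
#     )
#     quantized_model = quantizer.quantize_model()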
  e Zd Z	d:ddZdd Zdd Zdd	 Zd
d Zdd Zdd Z	dd Z
d;ddZdd Zdd Zdd Zdd Zd<ddZ	d=dd Zd!d" Zd>d$d%Zd&d' Zd;d(d)Z			*	d?d+d,Z	-			*	d@d.d/ZdAd0d1Z	-	dBd2d3Zd4d5 Zd6d7 Zd8d9 ZdS )CONNXQuantizerNc                 C   sz  t | |||||||	|
|| |sE| j  t| jj}dd |jjD | _| jdd |jj	D  | jdd |jj
D  t|| _|| _|| _| jdk| _d| jv oZ| jd | _g | _d| _i | _| jdd |jj	D  | jd	d |jj
D  | jjjjD ]}| jd
d |j	D  q| jtvrtd| j |  | _d| _d| _d| _d| _i | _| j | _ d S )Nc                 S      i | ]}|j |qS  name).0vir   r   h/home/air/segue/gemini/back/venv/lib/python3.10/site-packages/onnxruntime/quantization/onnx_quantizer.py
<dictcomp>G       z*ONNXQuantizer.__init__.<locals>.<dictcomp>c                 S   r   r   r   r   otr   r   r    r!   H   r"   c                 S   r   r   r   r   itr   r   r    r!   I   r"   
   MatMulConstBOnly/c                 S      i | ]}|j d qS r   r   r#   r   r   r    r!   U   r"   c                 S   r*   r+   r   r%   r   r   r    r!   V   r"   c                 S   s   i | ]}|d qS r+   r   )r   output_namer   r   r    r!   X   s    zunsupported quantization mode fixed_quantization_range_uint8fixed_quantization_range_int8
fixed_zerofixed_zero_zp)!r   __init__modelreplace_gemm_with_matmulr   graph
value_infovalue_infosupdateoutputinputr   modestaticopset_versionfuse_dynamic_quantextra_optionsq_matmul_const_b_only	new_nodesgraph_scopetensor_namesnoder	   
ValueErrorcalculate_quantization_paramsquantization_paramsfixed_qrange_uint8_namefixed_qrange_int8_namefixed_zero_namefixed_zero_zp_namequantized_value_mapget_non_initializer_inputsgenerated_value_names)selfr2   per_channelreduce_ranger:   r;   weight_qTypeactivation_qTypetensors_rangenodes_to_quantizenodes_to_excludeop_types_to_quantizer>   rC   r   r   r    r1   &   sP   



zONNXQuantizer.__init__c                 C   s~   t jj|d| jjjd}t| t|| j| j| j	| j
| j| j| j| j| j| j| j}| |_| j | d|_|  |jjjS )z
        generate submodel for the subgraph, so that we re-utilize current quantization implementation.
        quantize the submodel
        update subgraph and set it back to node
        onnx-quantizer)producer_nameopset_importsr)   )onnxhelper
make_modelr2   opset_importr   r   rO   rP   r:   r;   rQ   rR   rS   rT   rU   rV   r>   parentrA   quantize_modelr4   )rN   subgraph	graph_keywarped_modelsub_quantizerr   r   r    quantize_subgrapho   s0   
    def quantize_node_with_sub_graph(self, node):
        """
        Check subgraph, if any, quantize it and replace it.
        Return new_nodes added for quantizing the subgraph.
        """
        graph_attrs = [
            attr
            for attr in node.attribute
            if attr.type == onnx.AttributeProto.GRAPH or attr.type == onnx.AttributeProto.GRAPHS
        ]
        if len(graph_attrs) == 0:
            return node
        node_name = node.name if node.name else f"{node.op_type}_node_count_{len(self.new_nodes)}"
        kwargs = {}
        for attr in node.attribute:
            kv = {}
            if attr.type == onnx.AttributeProto.GRAPH:
                kv = {attr.name: self.quantize_subgraph(attr.g, f"{node_name}:{attr.name}")}
            elif attr.type == onnx.AttributeProto.GRAPHS:
                value = []
                for subgraph in attr.graphs:
                    value.extend([self.quantize_subgraph(subgraph, f"{node_name}:{attr.name}:{len(value)}")])
                kv = {attr.name: value}
            else:
                kv = attribute_to_kwarg(attr)
            kwargs.update(kv)
        return onnx.helper.make_node(node.op_type, node.input, node.output, name=node.name, **kwargs)

    def has_QDQ_nodes(self):
        """
        Detect if the model already has QuantizeLinear or DequantizeLinear nodes.
        """
        return any(
            node.op_type == "QuantizeLinear" or node.op_type == "DequantizeLinear" for node in self.model.nodes()
        )

    def find_initializer_in_path(self, initializer_name):
        if find_by_name(initializer_name, self.model.initializer()) is not None:
            return True
        if self.parent is not None:
            return self.parent.find_initializer_in_path(initializer_name)
        return False
    def add_new_nodes(self, nodes):
        self.new_nodes.extend(nodes)
        for node in nodes:
            for output_name in node.output:
                self.generated_value_names.add(output_name)
    def quantize_model(self):
        if self.has_QDQ_nodes():
            logging.warning(
                "Please check if the model is already quantized. "
                "Note you don't need to quantize a QAT model. OnnxRuntime support to run QAT model directly."
            )

        for node in self.model.nodes():
            # Quantize subgraphs if there are any.
            if self.enable_subgraph_quantization:
                node = self.quantize_node_with_sub_graph(node)

            number_of_existing_new_nodes = len(self.new_nodes)
            op_quantizer = CreateOpQuantizer(self, node)
            op_quantizer.quantize()
            for i in range(number_of_existing_new_nodes, len(self.new_nodes)):
                for output_name in self.new_nodes[i].output:
                    self.generated_value_names.add(output_name)

        self._dequantize_outputs()

        # extend is used to append to the list for protobuf fields.
        self.model.graph().ClearField("node")
        self.model.graph().node.extend(self.new_nodes)

        # Remove unused initializers, starting from the top-level graph.
        if self.parent is None:
            _, initializers_not_found = self.model.clean_initializers()
            if len(initializers_not_found) > 0:
                raise RuntimeError("Invalid model with unknown initializers/tensors." + str(initializers_not_found))

        self.model.model.producer_name = __producer__
        self.model.model.producer_version = __version__

        # Add the com.microsoft opset import if any quantized node requires it.
        ms_opset = [opset for opset in self.model.model.opset_import if opset.domain == ms_domain]
        if not ms_opset:
            ms_nodes = [node for node in self.new_nodes if node.domain == "com.microsoft"]
            if ms_nodes:
                opset = self.model.model.opset_import.add()
                opset.version = 1
                opset.domain = ms_domain

        return self.model.model
    def _get_default_tensor_type(self, tensor_name):
        if "DefaultTensorType" in self.extra_options:
            logging.info(
                "get_tensor_type returns DefaultTensorType for tensor name %r, use %d",
                tensor_name,
                self.extra_options["DefaultTensorType"],
            )
            return self.extra_options["DefaultTensorType"]
        raise RuntimeError(
            f"Unable to find data type for weight_name={tensor_name!r}. "
            f"shape_inference failed to return a type: probably this node is from a different domain "
            f"or uses an input produced by such an operator. This may happen if you quantize a model "
            f"that is already quantized. You may use extra_options `DefaultTensorType` to indicate the "
            f"default weight type, usually `onnx.TensorProto.FLOAT`."
        )
    def get_tensor_type(self, tensor_name, mandatory=False):
        weight = find_by_name(tensor_name, self.model.initializer())
        if weight is not None:
            return weight.data_type
        if tensor_name in self.value_infos:
            vi = self.value_infos[tensor_name]
            if vi.type.HasField("tensor_type"):
                if mandatory and vi.type.tensor_type.elem_type == 0:
                    return self._get_default_tensor_type(tensor_name)
                return vi.type.tensor_type.elem_type
        if (not self.enable_subgraph_quantization) or (self.parent is None):
            return self._get_default_tensor_type(tensor_name) if mandatory else None
        otype = self.parent.is_valid_quantize_weight(tensor_name)
        if otype is not None:
            return otype
        if self.enable_subgraph_quantization and self.parent:
            res = self.parent.get_tensor_type(tensor_name)
            if res is not None:
                return res
        return self._get_default_tensor_type(tensor_name) if mandatory else None
    def is_float_tensor(self, tensor_name):
        if self.is_input_a_initializer(tensor_name):
            return self.is_valid_quantize_weight(tensor_name)

        if tensor_name in self.value_infos:
            vi = self.value_infos[tensor_name]
            if vi.type.HasField("tensor_type") and vi.type.tensor_type.elem_type in (
                onnx_proto.TensorProto.FLOAT,
                onnx_proto.TensorProto.FLOAT16,
            ):
                return True
            logging.warning(
                f"Inference failed or unsupported type to quantize for tensor {tensor_name!r}, type is {vi.type}."
            )
            return False

        if self.enable_subgraph_quantization and self.parent:
            return self.parent.is_float_tensor(tensor_name)

        logging.warning(
            f"Failed to infer data type of tensor: {tensor_name!r}. "
            f"Please add data type info for this tensor if your model has customized operators."
        )
        return False
    def _get_dynamic_input_quantization_params(self, input_name, nodes_list, qType, initial_type):
        """
        Create nodes for dynamic quantization of input and add them to nodes_list.
            parameter input_name: Name of the input.
            parameter nodes_list: new nodes are appended to this list.
            parameter qType: type to quantize to.
            parameter initial_type: type to quantize from.
            return: scale_name, zero_point_name, scale_shape, zero_point_shape.
        """
        if qType == onnx_proto.TensorProto.INT8:
            return self._get_dynamic_input_quantization_params_int8(input_name, nodes_list, initial_type)
        if qType == onnx_proto.TensorProto.UINT8:
            return self._get_dynamic_input_quantization_params_uint8(input_name, nodes_list, initial_type)
        raise ValueError(f"Unexpected value for qType={qType}.")
    def _get_dynamic_input_quantization_params_int8(self, input_name, nodes_list, initial_type):
        """
        Create nodes for dynamic quantization of input to int8 and add them to nodes_list
            parameter input_name: Name of the input.
            parameter nodes_list: new nodes are appended to this list.
            parameter initial_type: initial weight type (FLOAT or FLOAT16)
            return: scale_name, zero_point_name, scale_shape, zero_point_shape.
        """
        qType = onnx_proto.TensorProto.INT8

        input_scale_name = input_name + "_scale"

        reduce_min_name = input_name + "_ReduceMin"
        reduce_min_node = onnx.helper.make_node(
            "ReduceMin",
            [input_name],
            [reduce_min_name + ":0"],
            reduce_min_name,
            keepdims=0,
        )
        nodes_list.append(reduce_min_node)

        reduce_max_name = input_name + "_ReduceMax"
        reduce_max_node = onnx.helper.make_node(
            "ReduceMax",
            [input_name],
            [reduce_max_name + ":0"],
            reduce_max_name,
            keepdims=0,
        )
        nodes_list.append(reduce_max_node)

        # Compute scale: take abs(rmin) and abs(rmax) ...
        reduce_min_abs_name = reduce_min_name + "_Abs"
        reduce_min_abs_node = onnx.helper.make_node(
            "Abs",
            [reduce_min_node.output[0]],
            [reduce_min_abs_name + ":0"],
            reduce_min_abs_name,
        )
        nodes_list.append(reduce_min_abs_node)

        reduce_max_abs_name = reduce_max_name + "_Abs"
        reduce_max_abs_node = onnx.helper.make_node(
            "Abs",
            [reduce_max_node.output[0]],
            [reduce_max_abs_name + ":0"],
            reduce_max_abs_name,
        )
        nodes_list.append(reduce_max_abs_node)

        # ... take the max of both ...
        abs_max_name = input_name + "_Abs_Max"
        abs_max_node = onnx.helper.make_node(
            "Max",
            [reduce_min_abs_node.output[0], reduce_max_abs_node.output[0]],
            [abs_max_name + ":0"],
            abs_max_name,
        )
        nodes_list.append(abs_max_node)

        # ... and divide by (quantize_range / 2.0), which equals max(...) * 2.0 / quantize_range.
        initializer_div = onnx.helper.make_tensor(
            self.fixed_qrange_int8_name,
            initial_type,
            [],
            [get_qrange_for_qType(qType) / 2.0],
        )
        self.model.add_initializer(initializer_div)
        scale_div_name = input_name + "scale_Div"
        scale_div_node = onnx.helper.make_node(
            "Div",
            [abs_max_node.output[0], self.fixed_qrange_int8_name],
            [input_scale_name],
            scale_div_name,
        )
        nodes_list.append(scale_div_node)

        # Zero point is always 0 for int8.
        initializer_zp = onnx.helper.make_tensor(self.fixed_zero_zp_name, qType, [], [0])
        self.model.add_initializer(initializer_zp)

        return input_scale_name, self.fixed_zero_zp_name, [], []
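
    # The int8 path above assembles, out of ONNX operators, the symmetric
    # dynamic-quantization formula
    #     scale = max(|rmin|, |rmax|) / (qrange / 2),   zero_point = 0
    # with qrange = get_qrange_for_qType(INT8). A NumPy sketch of the same
    # arithmetic (illustrative only; at run time the graph computes this):
    #
    #     scale = max(abs(x.min()), abs(x.max())) / (get_qrange_for_qType(qType) / 2.0)
    #     zero_point = np.int8(0)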
    def _get_dynamic_input_quantization_params_uint8(self, input_name, nodes_list, initial_type):
        """
        Create nodes for dynamic quantization of input to uint8 and add them to nodes_list
            parameter input_name: Name of the input.
            parameter nodes_list: new nodes are appended to this list.
            parameter initial_type: initial weight type (FLOAT or FLOAT16)
            return: scale_name, zero_point_name, scale_shape, zero_point_shape.
        """
        qType = onnx_proto.TensorProto.UINT8

        input_scale_name = input_name + "_scale"
        input_zp_name = input_name + "_zero_point"

        reduce_min_name = input_name + "_ReduceMin"
        reduce_min_node = onnx.helper.make_node(
            "ReduceMin",
            [input_name],
            [reduce_min_name + ":0"],
            reduce_min_name,
            keepdims=0,
        )
        nodes_list.append(reduce_min_node)

        reduce_max_name = input_name + "_ReduceMax"
        reduce_max_node = onnx.helper.make_node(
            "ReduceMax",
            [input_name],
            [reduce_max_name + ":0"],
            reduce_max_name,
            keepdims=0,
        )
        nodes_list.append(reduce_max_node)

        # Add tensors for the quantize range and the zero value.
        initializer_qrange = onnx.helper.make_tensor(
            self.fixed_qrange_uint8_name,
            initial_type,
            [],
            [get_qrange_for_qType(qType)],
        )
        self.model.add_initializer(initializer_qrange)
        initializer_qvalue = onnx.helper.make_tensor(self.fixed_zero_name, initial_type, [], [0.0])
        self.model.add_initializer(initializer_qvalue)

        # Compute scale: subtract rmin from rmax ...
        scale_sub_name = input_name + "_scale_Sub"
        scale_sub_node = onnx.helper.make_node(
            "Sub",
            [reduce_max_node.output[0], reduce_min_node.output[0]],
            [scale_sub_name + ":0"],
            scale_sub_name,
        )
        nodes_list.append(scale_sub_node)
        # ... and divide by the quantize range.
        scale_div_name = input_name + "_scale_Div"
        scale_div_node = onnx.helper.make_node(
            "Div",
            [scale_sub_node.output[0], self.fixed_qrange_uint8_name],
            [input_scale_name],
            scale_div_name,
        )
        nodes_list.append(scale_div_node)

        # Compute zero point: subtract rmin from zero ...
        zp_sub_name = input_name + "_zero_point_Sub"
        zp_sub_node = onnx.helper.make_node(
            "Sub",
            [self.fixed_zero_name, reduce_min_node.output[0]],
            [zp_sub_name + ":0"],
            zp_sub_name,
        )
        nodes_list.append(zp_sub_node)
        # ... divide by the scale ...
        zp_div_name = input_name + "_zero_point_Div"
        zp_div_node = onnx.helper.make_node(
            "Div",
            [zp_sub_node.output[0], input_scale_name],
            [zp_div_name + ":0"],
            zp_div_name,
        )
        nodes_list.append(zp_div_node)
        # ... floor ...
        zp_floor_name = input_name + "_zero_point_Floor"
        zp_floor_node = onnx.helper.make_node("Floor", zp_div_node.output, [zp_floor_name + ":0"], zp_floor_name)
        nodes_list.append(zp_floor_node)
        # ... and cast to the integer type.
        zp_cast_name = input_name + "_zero_point_Cast"
        zp_cast_node = onnx.helper.make_node("Cast", zp_floor_node.output, [input_zp_name], zp_cast_name, to=qType)
        nodes_list.append(zp_cast_node)

        return input_scale_name, input_zp_name, [], []
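
    # The uint8 path above encodes the asymmetric dynamic-quantization formula
    #     scale = (rmax - rmin) / qrange
    #     zero_point = floor((0 - rmin) / scale), cast to uint8
    # Equivalent NumPy sketch (illustrative only):
    #
    #     rmin, rmax = x.min(), x.max()
    #     scale = (rmax - rmin) / get_qrange_for_qType(qType)
    #     zero_point = np.uint8(np.floor((0.0 - rmin) / scale))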
    def _get_quantization_params(self, param_name, use_scale=None, use_zeropoint=None):
        """
        Create initializers and inputs in the graph for zero point and scale of output.
        Zero point and scale values are obtained from self.quantization_params if specified.
            parameter param_name: Name of the quantization parameter.
            return: result, scale_name, zero_point_name, scale_shape, zero_point_shape.
        """
        zero_point_type = self.activation_qType

        if use_scale is None or use_zeropoint is None:
            if self.quantization_params is None or param_name not in self.quantization_params:
                logging.info(f'Quantization parameters for tensor:"{param_name}" not specified')
                return False, "", "", "", ""

            params = self.quantization_params[param_name]
            if not isinstance(params, QuantizationParams):
                raise TypeError(f"Unexpected type {type(params)} for {param_name!r}.")
            if params is None or len(params) != 3:
                raise ValueError(
                    f"Quantization parameters should contain zero point, scale, quant type. "
                    f"Specified values for output {param_name}: {params}"
                )

            zero_point_values = np.array([params["zero_point"]])
            if not hasattr(params["scale"], "dtype") or params["scale"].dtype not in (np.float32, np.float16):
                raise ValueError(f"Unexpected type {type(params['scale'])} and param_name={param_name!r}")
            scale_values = np.array([params["scale"]])
            assert scale_values.dtype != np.float64
            zero_point_type = params["quant_type"]
        else:
            zero_point_values = np.array([use_zeropoint])
            scale_values = np.array([use_scale])
            params = self.quantization_params[param_name]
            if "scale" in params:
                dtype = params["scale"].dtype
                scale_values = scale_values.astype(dtype)
            assert scale_values.dtype != np.float64

        zero_point_shape = []
        zero_point_name = param_name + "_zero_point"
        scale_shape = []
        scale_name = param_name + "_scale"

        # Add initializers to the graph.
        init_zp = onnx.helper.make_tensor(
            zero_point_name, zero_point_type, zero_point_shape, zero_point_values.ravel().tolist()
        )
        self.model.add_initializer(init_zp)
        if scale_values.dtype == np.float32:
            scale_type = onnx_proto.TensorProto.FLOAT
        elif scale_values.dtype == np.float16:
            scale_type = onnx_proto.TensorProto.FLOAT16
        else:
            raise ValueError(f"Unexpected dtype={scale_values.dtype} for param_name={param_name!r}")
        init_scale = onnx.helper.make_tensor(scale_name, scale_type, scale_shape, scale_values.reshape((-1,)).tolist())
        self.model.add_initializer(init_scale)

        return True, scale_name, zero_point_name, scale_shape, zero_point_shape
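
    # For example (hypothetical tensor name and values, for illustration): a
    # tensor "relu1_out" with params {"zero_point": 128, "scale": 0.0235,
    # "quant_type": UINT8} yields two scalar initializers that later
    # QuantizeLinear / DequantizeLinear nodes reference by name:
    #
    #     make_tensor("relu1_out_zero_point", onnx_proto.TensorProto.UINT8, [], [128])
    #     make_tensor("relu1_out_scale", onnx_proto.TensorProto.FLOAT, [], [0.0235])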
    def _get_quantize_input_nodes(self, node, input_index, qType, given_scale_name=None, given_zp_name=None, initial_type=None):
        """
        Given an input for a node (which is not an initializer), this function

        - adds nodes to compute zero point and scale for this input if they don't exist.
        - adds a new QuantizeLinear node to quantize the input.

        :param node: node being quantized in NodeProto format.
        :param input_index: index of input in node.input.
        :param qType: type to quantize to.
        :param given_scale_name: if given, the inputs are quantized using this scale tensor.
        :param given_zp_name: if given, the inputs are quantized using this zero point tensor.
        :param initial_type: type of the weight to quantize
        :return: List of newly created nodes in NodeProto format.
        """
        input_name = node.input[input_index]
        assert input_name != "", "Cannot access undefined variable in graph."
        output_name = input_name + TENSOR_NAME_QUANT_SUFFIX
        ql_node_name = input_name + "_QuantizeLinear"

        if (given_scale_name is not None) and (given_zp_name is not None):
            data_found, scale_name, zp_name = True, given_scale_name, given_zp_name
        else:
            data_found, scale_name, zp_name, _, _ = self._get_quantization_params(input_name)

        nodes = []
        if data_found:
            qlinear_node = onnx.helper.make_node(
                "QuantizeLinear",
                [input_name, scale_name, zp_name],
                [output_name],
                ql_node_name,
            )
        else:
            if self.static:
                return None
            # Dynamic mode: scale and zero point are not available for this
            # input, so add nodes to compute them at run time.
            if self.fuse_dynamic_quant and qType == onnx_proto.TensorProto.UINT8:
                scale_name = input_name + "_scale"
                zp_name = input_name + "_zero_point"
                qlinear_node = onnx.helper.make_node(
                    "DynamicQuantizeLinear",
                    [input_name],
                    [output_name, scale_name, zp_name],
                    ql_node_name,
                )
            else:
                assert initial_type is not None, (
                    f"Cannot quantize input without knowing the initial type, "
                    f"input_name={input_name!r}, input_index={input_index}, qType={qType}, node={node}"
                )
                scale_name, zp_name, scale_shape, zp_shape = self._get_dynamic_input_quantization_params(
                    input_name, nodes, qType, initial_type
                )
                qlinear_node = onnx.helper.make_node(
                    "QuantizeLinear",
                    [input_name, scale_name, zp_name],
                    [output_name],
                    ql_node_name,
                )

        self.quantized_value_map[input_name] = QuantizedValue(input_name, output_name, scale_name, zp_name, qType)
        return [*nodes, qlinear_node]
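
    # On a fuse-capable opset (opset > 10) with uint8 activations, the method
    # above emits a single fused node; TENSOR_NAME_QUANT_SUFFIX is defined in
    # quant_utils as "_quantized", so for an input "x" the emitted pattern is:
    #
    #     x_quantized, x_scale, x_zero_point = DynamicQuantizeLinear(x)
    #
    # whereas the static path emits QuantizeLinear(x, x_scale, x_zero_point).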
    def find_quantized_value(self, input_name):
        if input_name in self.quantized_value_map:
            return self.quantized_value_map[input_name]
        if self.parent is not None:
            return self.parent.find_quantized_value(input_name)
        return None
z"ONNXQuantizer.find_quantized_value      ?c              
   C   s   || j v r| j | jS | j | j}t|| j }t|}|| j v r)| j | j}n|| jv r9| |\}	}}	}	}	nt	d| dt|| j }
t|
}| 
||||\}}}}}}|| j vsbJ t||||tj|jdkrpdnd||d}|| j |< |S )z]
        Quantized the bias. Zero Point == 0 and Scale == Input_Scale * Weight_Scale
        z	Expected z5 to be in quantized value map for static quantizationr   r   N)	node_type
node_qtype)rK   q_namer  r   r2   r   r   rF   r  rD   quantize_bias_static_implr
   r   Initializersize)rN   	bias_namer   weight_namebetaweight_scale_nameweight_initializerweight_scaler   r   inputscale_initializerinput_scalequantized_bias_namequantized_bias_scale_namequantized_bias_zp_namebias_scale_datar&  r'  quantized_valuer   r   r    quantize_bias_static~  sB   


	

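
    # Bias quantization follows the QLinear convention implemented in
    # quantize_bias_static_impl: the effective scale is
    #     bias_scale = input_scale * weight_scale * beta
    # with zero_point = 0, and the stored integer bias is (NumPy sketch):
    #
    #     q_bias = np.round(bias / bias_scale).astype(np.int32)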
z"ONNXQuantizer.quantize_bias_staticc                 C   s   || j v p|| jv p|| jv S )zq
        only check for value info and newly generated tensor names, initializers are checked separately
        )r6   rB   rM   r   r   r   r    contains_tensor  s
   
zONNXQuantizer.contains_tensorc              	   C   s   | j ||dddd|dS )NFr   rC   indicesinitializer_use_weight_qTyperP   op_level_per_channelaxisfrom_subgraph_ONNXQuantizer__quantize_inputs)rN   rC   r<  r@  r   r   r    quantize_activation  s   z!ONNXQuantizer.quantize_activationr   c              	   C   s   | j ||d||||dS )NTr;  rA  )rN   rC   r<  rP   r>  r?  r@  r   r   r    quantize_weight  s   	zONNXQuantizer.quantize_weightTc              
   C   s  g }g }	g }
g }|D ]e}|j | }|| jv r/| j| }||j |	|j |
|j q
|sA|
d |d |	d q
t|| j }|dur| j	re|re| 
|j|r[| jn| j||\}}}n| ||rm| jn| j|\}}}|
| |	| || q
| |r8| j|d | j| j }|du r|j | }|| jv r| j| }|dsJ d| d|jdsJ d| d|jjj}n|| jv sJ d	|d
| j| }| j||| j|d}|du r dS |r| | n|| |d }|jdkr|
|j ||j d  |	|j d  q
|
|jd  ||jd  |	|jd  q
| jdurf| jj||g||||dd\}}}}|
|d  ||d  |	|d  q
t d| d| j! |
|	||fS )a  
        Given a node, this function quantizes the inputs as follows:
            - If input is an initializer, quantize the initializer data, replace old initializer
              with new initializer
            - Else, add QuantizeLinear nodes to perform quantization
            parameter node: node being quantized in NodeProto format.
            parameter indices: input indices to quantize.
            return: (List of quantized input names,
                     List of zero point names used for input quantization,
                     List of scale names used for input quantization,
                     List of new QuantizeLinear nodes created)
        r   Nr  re   zvalue_info=z has no type.r   z is not a tensor.zshape inference failed for zF and attribute 'tensor_names' does not have any value for this tensor.r  )NNNNr   rz   r      r   T)r=  rP   r>  r?  r@  z!Invalid tensor name to quantize: z @graph scope)"r9   rK   r   r  r   r(  r   r2   r   rO   quantize_weight_per_channelr   rQ   rR   quantize_initializerr:  find_node_by_namer@   r4   r6   r   re   r   r   rB   r#  r   rr   ro   r8   r^   rB  rD   rA   )rN   rC   r<  r=  rP   r>  r?  r@  scale_nameszero_point_namesquantized_input_namesr   r  
node_inputr8  r   q_weight_namer   r  r!  r   r5   r   quantize_input_nodesparent_quantized_input_namesparent_zero_point_namesparent_scale_namesr   r   r   r    __quantize_inputs  s   



















    def quantize_initializer(self, weight, qType, reduce_range=False, keep_float_weight=False):
        """
        :param weight: TensorProto initializer
        :param qType: type to quantize to
        :param keep_float_weight: Whether to quantize the weight. In some cases, we only want to quantize
                                  scale and zero point. If keep_float_weight is False, quantize the weight;
                                  otherwise, keep it in float.
        :return: quantized weight name, zero point name, scale name
        """
        # Find if this input is already quantized.
        if weight.name in self.quantized_value_map:
            quantized_value = self.quantized_value_map[weight.name]
            return (
                quantized_value.q_name,
                quantized_value.zp_name,
                quantized_value.scale_name,
            )

        q_weight_name, zp_name, scale_name = self.quantize_initializer_impl(
            weight, qType, reduce_range, keep_float_weight
        )

        # Log entry for this quantized weight.
        quantized_value = QuantizedValue(
            weight.name,
            q_weight_name,
            scale_name,
            zp_name,
            QuantizedValueType.Initializer,
            None,
        )
        self.quantized_value_map[weight.name] = quantized_value
        return q_weight_name, zp_name, scale_name

    def quantize_weight_per_channel(
        self,
        weight_name,
        weight_qType,
        channel_axis,
        reduce_range=True,
        keep_float_weight=False,
    ):
        # Find if this input is already quantized.
        if weight_name in self.quantized_value_map:
            quantized_value = self.quantized_value_map[weight_name]
            return (
                quantized_value.q_name,
                quantized_value.zp_name,
                quantized_value.scale_name,
            )

        q_weight_name, zp_name, scale_name = self.quantize_weight_per_channel_impl(
            weight_name, weight_qType, channel_axis, reduce_range, keep_float_weight
        )
        quantized_value = QuantizedValue(
            weight_name,
            q_weight_name,
            scale_name,
            zp_name,
            QuantizedValueType.Initializer,
            None,
        )
        self.quantized_value_map[weight_name] = quantized_value

        return q_weight_name, zp_name, scale_name
dks9J |d }| j|| j| j }|du ra|j|j|jg}tjd||g|}|S ||jd ksjJ dS )a  
        Given a value (input/output) which is quantized, add a DequantizeLinear node to dequantize
        it back to float32 or float16
            parameter value_name: value to dequantize
            return: None if there is already a DequantizeLinear node that dequantizes it
                    A DequantizeLinear node otherwise
        """
        if (value_name in self.quantized_value_map) and (value_name not in self.generated_value_names):
            quantized_value = self.quantized_value_map[value_name]
            # Add a DequantizeLinear node for this value.
            scale_init = find_by_name(quantized_value.scale_name, self.model.initializer())

            # When working with subgraphs, quantize_subgraph sets the graph
            # producer_name to "onnx-quantizer"; the scale initializer may then
            # live in the top-level graph, so the scalar check is skipped.
            if self.model.model.producer_name != "onnx-quantizer" or (
                self.model.model.producer_name == "onnx-quantizer" and scale_init is not None
            ):
                # axis is not specified, so scale_init must be a scalar.
                assert scale_init is None or onnx.numpy_helper.to_array(scale_init).size == 1

            dqlinear_name = value_name + "_DequantizeLinear"
            dqlinear_node = self.model.find_node_by_name(dqlinear_name, self.new_nodes, self.model.graph())
            if dqlinear_node is None:
                dqlinear_inputs = [
                    quantized_value.q_name,
                    quantized_value.scale_name,
                    quantized_value.zp_name,
                ]
                dequantize_node = onnx.helper.make_node(
                    "DequantizeLinear", dqlinear_inputs, [value_name], dqlinear_name
                )
                return dequantize_node
            else:
                # A DequantizeLinear node already exists; its output must match this value.
                assert value_name == dqlinear_node.output[0]
        return None

    def _dequantize_outputs(self):
        """
        Dequantize the graph outputs if they were quantized; newly created
        DequantizeLinear nodes are appended to self.new_nodes.
        """
        for output in self.model.graph().output:
            dequantize_node = self._dequantize_value(output.name)
            if dequantize_node is not None:
                self.new_nodes.append(dequantize_node)

    def calculate_quantization_params(self):
        if self.tensors_range is None:
            return None

        self.adjust_tensor_ranges()

        quantization_params = {}
        for tensor_name in self.tensors_range:
            td = self.tensors_range[tensor_name]
            if not isinstance(td, TensorData):
                raise TypeError(f"Unexpected type {type(td)} for {tensor_name!r}.")

            quant_overrides = self.tensor_quant_overrides.get_per_tensor_overrides(tensor_name, default_val={})

            quant_type = self.activation_qType
            if "quant_type" in quant_overrides:
                quant_type = quant_overrides["quant_type"].tensor_type

            if "scale" in quant_overrides and "zero_point" in quant_overrides:
                zero, scale = quant_overrides["zero_point"], quant_overrides["scale"]
            elif quant_type == onnx.TensorProto.FLOAT8E4M3FN:
                zero, scale = compute_scale_zp_float8(quant_type, td.avg_std[1])
            else:
                rmin = quant_overrides.get("rmin", td.range_value[0])
                rmax = quant_overrides.get("rmax", td.range_value[1])
                symmetric = quant_overrides.get("symmetric", self.is_activation_symmetric)
                reduce_range = quant_overrides.get("reduce_range", False)
                qmin, qmax = get_qmin_qmax_for_qType(quant_type, reduce_range=reduce_range, symmetric=symmetric)
                zero, scale = compute_scale_zp(rmin, rmax, qmin, qmax, symmetric, self.min_real_range)

            quantization_params[tensor_name] = QuantizationParams(zero_point=zero, scale=scale, quant_type=quant_type)

        return quantization_params
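
    # A worked example of the affine mapping produced by compute_scale_zp
    # (illustrative numbers): for rmin=-1.0, rmax=3.0 quantized to uint8
    # (qmin=0, qmax=255) without symmetry,
    #     scale = (rmax - rmin) / (qmax - qmin) = 4.0 / 255 ~ 0.0157
    #     zero_point = round(qmin - rmin / scale) = round(63.75) = 64
    # so a real value r is represented as q = round(r / scale) + zero_point.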
I"-
T
^<
F
1



 
&
 'r   )"r   numpyr   rZ   onnx.numpy_helperr   r   base_quantizerr   r   	calibrater   