import argparse
import logging
import os

import numpy as np
import numpy.typing as npt
import onnx
from onnx.onnx_pb import GraphProto, ModelProto, NodeProto, TensorProto

from onnxruntime.capi._pybind_state import quantize_matmul_bnb4

from .onnx_model import ONNXModel
from .quant_utils import attribute_to_kwarg

logger = logging.getLogger(__name__)


class MatMulBnb4Quantizer:
    """Perform 4b quantization of constant MatMul weights using FP4 or NF4 data type"""

    # quantization data type ids; must match the values expected by quantize_matmul_bnb4
    FP4 = 0
    NF4 = 1

    def __init__(self, model: ModelProto, quant_type: int, block_size: int, nodes_to_exclude=None):
        nodes_to_exclude = nodes_to_exclude or []
        assert quant_type in [MatMulBnb4Quantizer.FP4, MatMulBnb4Quantizer.NF4]
        self.model = ONNXModel(model)
        self.quant_type = quant_type
        self.block_size = block_size
        self.nodes_to_exclude = set(nodes_to_exclude)

    @staticmethod
    def __get_initializer(name, graph_path: list[GraphProto]) -> tuple[TensorProto, GraphProto]:
        # search from the innermost graph outwards so the closest initializer wins
        for gid in range(len(graph_path) - 1, -1, -1):
            graph = graph_path[gid]
            for tensor in graph.initializer:
                if tensor.name == name:
                    return tensor, graph
        return None, None

    def bnb4_block_quant(self, fpweight: npt.ArrayLike) -> np.ndarray:
        """4b quantize fp32/fp16 weight"""
        if len(fpweight.shape) != 2:
            raise ValueError("Current bnb4 block quantization only supports 2D tensors!")
        # quantize the transposed weight; copy so the data is contiguous in the new layout
        fpweight_t = fpweight.transpose().copy()

        rows, cols = fpweight.shape
        numel = rows * cols
        block_size = self.block_size
        num_blocks = (numel + block_size - 1) // block_size
        quantized_numel = (numel + 1) // 2

        packed = np.zeros(quantized_numel, dtype="uint8")
        absmax = np.zeros(num_blocks, dtype=fpweight.dtype)
        # blockwise quantization: the flattened transposed weight is split into block_size
        # chunks, each packed into 4b codes scaled by its per-block absolute maximum
        quantize_matmul_bnb4(packed, fpweight_t, absmax, block_size, self.quant_type, cols, rows)

        return (packed, absmax)

    def _bnb4_matmul_node_weight(self, node: NodeProto, graph_stack: list[GraphProto]) -> NodeProto:
        """If the node is MatMul with fp32 const weight, quantize the weight with int4, and return the new node"""
        if node.op_type != "MatMul":
            return node  # only care about MatMul for now

        logger.debug(f"start to quantize {node.name} ...")
        if node.name in self.nodes_to_exclude:
            logger.debug(f"exclude to quantize {node.name} as specified by nodes_to_exclude...")
            return node

        inputB = node.input[1]
        B, Bs_graph = MatMulBnb4Quantizer.__get_initializer(inputB, graph_stack)
        if B is None:
            logger.debug("MatMul doesn't have const weight. Skip to quantize")
            return node  # only care about constant weight

        B_array = onnx.numpy_helper.to_array(B)
        if len(B_array.shape) != 2:
            logger.debug("MatMul weight is not 2D. Skip to quantize")
            return node  # can only process 2-D matrix

        packed, absmax = self.bnb4_block_quant(B_array)

        B_quant = onnx.numpy_helper.from_array(packed)
        B_quant.name = B.name + "_Bnb4"
        for graph_input in Bs_graph.input:
            if graph_input.name == inputB:
                Bs_graph.input.remove(graph_input)
                break

        absmax_tensor = onnx.numpy_helper.from_array(absmax)
        absmax_tensor.name = B.name + "_absmax"

        Bs_graph.initializer.extend([B_quant, absmax_tensor])

        kwargs = {}
        rows, cols = B_array.shape
        kwargs["K"] = rows
        kwargs["N"] = cols
        kwargs["block_size"] = self.block_size
        kwargs["quant_type"] = self.quant_type

        matmul_bnb4_node = onnx.helper.make_node(
            "MatMulBnb4",
            inputs=[node.input[0], B_quant.name, absmax_tensor.name],
            outputs=[node.output[0]],
            name=node.name + "_Bnb4" if node.name else "",
            domain="com.microsoft",
            **kwargs,
        )

        logger.debug(f"complete quantization of {node.name} ...")

        return matmul_bnb4_node

    def _process_subgraph(self, graph_stack: list[GraphProto]):
        new_nodes = []
        graph = graph_stack[-1]

        for node in graph.node:
            graph_attrs = [
                attr
                for attr in node.attribute
                if attr.type == onnx.AttributeProto.GRAPH or attr.type == onnx.AttributeProto.GRAPHS
            ]
            if len(graph_attrs):
                kwargs = {}
                for attr in node.attribute:
                    if attr.type == onnx.AttributeProto.GRAPH:
                        # recurse into the sub-graph carried by this attribute
                        graph_stack.append(attr.g)
                        kv = {attr.name: self._process_subgraph(graph_stack)}
                    elif attr.type == onnx.AttributeProto.GRAPHS:
                        value = []
                        for subgraph in attr.graphs:
                            # recurse into each sub-graph of the list attribute
                            graph_stack.append(subgraph)
                            value.extend([self._process_subgraph(graph_stack)])
                        kv = {attr.name: value}
                    else:
                        kv = attribute_to_kwarg(attr)
                    kwargs.update(kv)
                node = onnx.helper.make_node(node.op_type, node.input, node.output, name=node.name, **kwargs)

            new_nodes.append(self._bnb4_matmul_node_weight(node, graph_stack))

        graph.ClearField("node")
        graph.node.extend(new_nodes)
        graph_stack.pop()
        return graph

    def process(self):
        # use a stack to keep track of sub-graphs
        graph_stack = [self.model.graph()]
        opset_import = self.model.opset_import()

        has_ms_domain = False
        for opset in opset_import:
            if opset.domain == "com.microsoft":
                has_ms_domain = True
        if not has_ms_domain:
            opset_import.extend([onnx.helper.make_opsetid("com.microsoft", 1)])

        self._process_subgraph(graph_stack)
        self.model.clean_initializers()


def parse_args():
    parser = argparse.ArgumentParser(
        description="""Blockwise FP4/NF4 quantization for MatMul 2D weight matrices.

A weight matrix is partitioned into blocks, where each block is a contiguous
subset inside the flattened transposed weight matrix. Each block is quantized
into a set of 4b integers with an absolute value scaling factor.
"""
    )
    parser.add_argument("--input_model", required=True, help="Path to the input model file")
    parser.add_argument("--output_model", required=True, help="Path to the output model file")
    parser.add_argument(
        "--quant_type",
        required=False,
        type=int,
        default=1,
        choices=[MatMulBnb4Quantizer.FP4, MatMulBnb4Quantizer.NF4],
        help="Quantization data type. 0: FP4, 1: NF4",
    )
    parser.add_argument(
        "--block_size",
        required=False,
        type=int,
        default=64,
        help="Block size for blockwise quantization. Note: bnb.nn.Linear4bit only uses block_size=64",
    )
    parser.add_argument("-v", "--verbose", required=False, action="store_true")
    parser.set_defaults(verbose=False)
    parser.add_argument(
        "--nodes_to_exclude",
        nargs="+",
        type=str,
        required=False,
        default=[],
        help="Specify the nodes to be excluded from quantization with node names",
    )
    return parser.parse_args()


if __name__ == "__main__":
    args = parse_args()
    if args.verbose:
        logger.setLevel(logging.DEBUG)

    input_model_path = args.input_model
    output_model_path = args.output_model

    if os.path.exists(output_model_path):
        logger.error(f"file {output_model_path} already exists")
        raise Exception(f"file {output_model_path} already exists")

    model = onnx.load(input_model_path)
    quant = MatMulBnb4Quantizer(model, args.quant_type, args.block_size, nodes_to_exclude=args.nodes_to_exclude)
    quant.process()
    # save with external data format enabled so large quantized models can be written
    quant.model.save_model_to_file(output_model_path, True)
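
# Example usage (a minimal sketch; the file names below are hypothetical placeholders,
# and the flags simply mirror parse_args() above):
#
#   python -m onnxruntime.quantization.matmul_bnb4_quantizer \
#       --input_model model_fp32.onnx --output_model model_bnb4.onnx \
#       --quant_type 1 --block_size 64
#
# Programmatic use follows the same steps as the __main__ block:
#
#   model = onnx.load("model_fp32.onnx")
#   quantizer = MatMulBnb4Quantizer(model, MatMulBnb4Quantizer.NF4, 64)
#   quantizer.process()
#   quantizer.model.save_model_to_file("model_bnb4.onnx")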