o
    3IhW                     @   s  d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlmZm	Z	 d dl
mZ d dlmZ d dlmZ d dlmZ d dlmZ d dlZd dlZd dlZd dlZd dlmZ d dlZeeZG d	d
 d
eZG dd deZG dd dZdejiZ ddddddi fddZ!d8ddZ"d9ddZ#dd Z$dd Z%dd Z&dd  Z'd:d!d"Z(ej)d fd#d$Z*d%d& Z+d;d(d)Z,d*e-e.e/ef  dB fd+d,Z0G d-d. d.eZ1G d/d0 d0e1Z2G d1d2 d2e1Z3d<d4d5Z4d6d7 Z5dS )=    N)ABCabstractmethod)ThreadPoolExecutor)datetime)Enum)sleep)Any)versionc                   @   s$   e Zd ZdZdZdZdZdd ZdS )	Precisionfp32fp16int8int4c                 C      | j S Nvalueself r   i/home/air/sanwanet/gpt-api/venv/lib/python3.10/site-packages/onnxruntime/transformers/benchmark_helper.py__str__&      zPrecision.__str__N)__name__
__module____qualname__FLOAT32FLOAT16INT8INT4r   r   r   r   r   r
       s    r
   c                   @   s    e Zd ZdZdZdZdd ZdS )OptimizerInfono_optby_ort	by_scriptc                 C   r   r   r   r   r   r   r   r   1   r   zOptimizerInfo.__str__N)r   r   r   NOOPTBYORTBYSCRIPTr   r   r   r   r   r    *   s
    r    c                   @   s$   e Zd Zdd Zdd Zdd ZdS )ConfigModifierc                 C   
   || _ d S r   
num_layers)r   r*   r   r   r   __init__6      
zConfigModifier.__init__c                 C   s   | j d u rd S t|dr| j |_td| j   t|dr+| j |_td| j   t|dr?| j |_td| j   d S d S )Nnum_hidden_layersz6Modifying pytorch model's number of hidden layers to: encoder_layersz7Modifying pytorch model's number of encoder layers to: zdecoder_layers z7Modifying pytorch model's number of decoder layers to: )r*   hasattrr-   loggerinfor.   decoder_layers)r   configr   r   r   modify9   s   



zConfigModifier.modifyc                 C   r   r   r)   r   r   r   r   get_layer_numF   r   zConfigModifier.get_layer_numN)r   r   r   r+   r4   r5   r   r   r   r   r'   5   s    r'   float32TFc	              	      sX  t  }	|rt jj|	_nt jj|	_|rd|	_|dkr&||	_t	d|	j  |r,d|	_
nd|	_
|t  v r9|g}
n=|rs|dkrDddg}
n2|dkrMd	dg}
n)|d
krVg d}
n |dks^|d u rcddg}
n|dkrlg d}
n
td| dg}
 r fdd|
D }
|r|	dd d }zt j| |	|
d}W |S  ty   td|  d|
  Y |S w )NTr   z%Session option: intra_op_num_threads=   dmlDmlExecutionProviderCPUExecutionProviderrocmROCMExecutionProvidermigraphx)MIGraphXExecutionProviderr=   r;   cudaCUDAExecutionProvidertensorrt)TensorrtExecutionProviderrA   r;   z)The execution provider is not supported: c                    s$   g | ]}| v r| | fn|qS r   r   ).0nameprovider_optionsr   r   
<listcomp>   s   $ z.create_onnxruntime_session.<locals>.<listcomp>z(mlas.enable_gemm_fastmath_arm64_bfloat161)	providerszFailed to create session for z with providers=)onnxruntimeSessionOptionsGraphOptimizationLevelORT_ENABLE_ALLgraph_optimization_levelORT_ENABLE_BASICenable_profilingintra_op_num_threadsr0   debuglog_severity_levelget_available_providersRuntimeErroradd_session_config_entryInferenceSession	Exception	exception)onnx_model_pathuse_gpuproviderenable_all_optimizationnum_threadsrQ   verbose(enable_mlas_gemm_fastmath_arm64_bfloat16rG   sess_optionsrJ   sessionr   rF   r   create_onnxruntime_sessionP   sN   





rd   c                 C   s8   | rt jddd d S t jdd tdtj d S )NDEBUGz8[%(filename)s:%(lineno)s - %(funcName)20s()] %(message)s)levelfmtz%(message)s)rg   transformers)coloredlogsinstalllogging	getLoggersetLevelWARNING)r`   r   r   r   setup_logger   s   
ro   c                 C   s   | rt j| st |  |rt j|st | |r:|dkr+dt v s*J dntt g dr:J dt	dt
j  t	dtj  t	dtj  tt
jtd	ksbJ ttjtd
ksoJ ttjtd	ks|J d S )Nr9   r:   zBPlease install onnxruntime-directml package to test GPU inference.)rA   r=   r?   zWPlease install onnxruntime-gpu package, or install ROCm support, to test GPU inference.zPyTorch Version:zTransformers Version:zOnnxRuntime Version:z1.10.0z4.12.0)ospathexistsmakedirsrK   rU   set
isdisjointr0   r1   torch__version__rh   r	   parse)	cache_dir
output_dirr\   r]   r   r   r   prepare_environment   s(   

r{   c                 C   s   t | tt|  d }tj| tjdd }|d|  }t| |dt| dd dt| dd dt| dd d|d|ddS )Ng     @@)dtypez.2fZ   _   c   )
test_timeslatency_variancelatency_90_percentilelatency_95_percentilelatency_99_percentileaverage_latency_msQPS)sumfloatlennumpyvarfloat64
percentile)latency_list
batch_size
latency_msr   
throughputr   r   r   get_latency_result   s   r   c                 C   sv   t |dddd!}g d}tj||d}|  | D ]}|| qW d    n1 s,w   Y  td|  d S )Na asciimodenewlineencoding)enginer	   rJ   device	precision	optimizer
io_binding
model_nameinputsthreadsr   sequence_lengthcustom_layer_numr   r   r   r   r   r   r   r   
fieldnamesz&Detail results are saved to csv file: )opencsv
DictWriterwriteheaderwriterowr0   r1   )resultscsv_filenamecsv_filecolumn_names
csv_writerresultr   r   r   output_details   s   r   c                    s  t |dddd}g d g }|jD ]"}|jdgkr#|d|  q|jD ]}|d| d|  q&qtj| | d}|  |jD ]}d	D ]}	|jD ]}
d
D ]}|j	D ]z}i }| D ]l}|d |kr|d |	kr|d |
kr|d |kr|d |krɇ fdd|
 D }|s|| |dd |D  n D ]}|| || ksJ q|d }|d }|r|d |d| d| < q]|d |d| < q]|r|| qWqRqNqIqEW d    n1 sw   Y  td|  d S )Nr   r   r   r   )r   r   r   r   r	   rJ   r   r   r   r   r   b_sr   )         )TFr   r   r   r   r   r   c                    s   i | ]\}}| v r||qS r   r   )rD   kvheader_namesr   r   
<dictcomp>  s    z"output_summary.<locals>.<dictcomp>c                 S   s   i | ]}|d qS )r   r   )rD   r   r   r   r   r     s    r   r   r   z'Summary results are saved to csv file: )r   batch_sizessequence_lengthsappendr   r   r   modelsenginesr_   itemsupdater   r0   r1   )r   r   argsr   
data_namesr   r   r   r   input_countengine_namer   r   rowr   headersr   r   sr   r   r   output_summary   sZ   






6r   c                 C   s   t |ddddO}ddddgttt|   }tj||d	}|  | D ]'}t	t
 | | d< tj| | d< tj| | d< || | d< || |  q(W d    n1 sZw   Y  td
|  d S )Nr   r   r   r   model_filenamer   rh   rv   r   z(Fusion statistics is saved to csv file: )r   listnextitervalueskeysr   r   r   strr   nowrh   rw   rv   r   r0   r1   )model_fusion_statisticsr   r   r   r   keyr   r   r   output_fusion_statistics*  s&   r   c                    sd   i }t j fddd|d t j fddd|d}|| |ddi |t|| |S )Nc                          d  S r   runr   
ort_inputsort_sessionr   r   <lambda>@      zinference_ort.<locals>.<lambda>r   numberrepeatc                      r   r   r   r   r   r   r   r   A  r   r   F)timeitr   r   r   )r   r   result_templaterepeat_timesr   warm_up_repeatr   r   r   r   r   inference_ort>  s   
r   c              
      s  i }   |D ]&}t|| |	}tt|| j|
} ||j	j
d||j|  qt|dkr;t|||	 t|D ]\}} ||| j	j
dtj|| j||   q?tj fddd|d tj fddd|d}|| |ddi |t|| |S )	Nr   c                      
     S r   run_with_iobindingr   r   r   r   r   r   u     
 z/inference_ort_with_io_binding.<locals>.<lambda>r   r   c                      r   r   r   r   r   r   r   r   {  r   r   T)r   rv   
from_numpytoIO_BINDING_DATA_TYPE_MAPgetr   r|   
bind_inputr   typeshapedata_ptrr   allocateOutputBuffers	enumeratebind_outputr   r6   r   r   r   r   )r   r   r   r   ort_output_namesort_outputsoutput_buffersoutput_buffer_max_sizesr   r   	data_typer   r   rE   np_input
input_typeiort_output_namer   r   r   r   inference_ort_with_io_bindingH  sL   	

	
r   c                 C   s&   |D ]}|  tj|tj|d qd S )N)r|   r   )r   rv   emptyr6   )r   r   r   r   r   r   r   r     s   r   {   c                 C   s<   t |  tj |  t|  tj|  tj|  dS )z5Set random seed manually to get deterministic resultsN)randomseedr   rv   manual_seedr@   manual_seed_all)r  r   r   r   set_random_seed  s
   

r  returnc               
   C   s   ddl m} m}m}m}m}m}m} z>|  g }| }t|t	s#W d S t
|D ]#}	|||	}
t|
tr8 W d S ||	|||	|
j|
j|
jd q'|  |W S  | yh } ztd| W Y d }~d S d }~ww )Nr   	NVMLErrornvmlDeviceGetCountnvmlDeviceGetHandleByIndexnvmlDeviceGetMemoryInfonvmlDeviceGetNamenvmlInitnvmlShutdown)idrE   totalfreeused-Error fetching GPU information using nvml: %s)py3nvml.py3nvmlr	  r
  r  r  r  r  r  
isinstanceintranger   r   r  r  r  print)r	  r
  r  r  r  r  r  r   device_countr   r1   errorr   r   r   get_gpu_info  s4   $



	
r  c                   @   s@   e Zd Zd
ddZdd Zedeeee	f  dB fdd	Z
dS )MemoryMonitorTc                 C   r(   r   )keep_measuringr   r  r   r   r   r+     r,   zMemoryMonitor.__init__c                 C   sB   dd l }d}	 t||t  jd }td | js 	 |S q)Nr   T   {Gzt?)	psutilmaxProcessrp   getpidmemory_inforssr   r  )r   r"  	max_usager   r   r   measure_cpu_usage  s   zMemoryMonitor.measure_cpu_usager  Nc                 C   s   t  r   )NotImplementedErrorr   r   r   r   measure_gpu_usage  s   zMemoryMonitor.measure_gpu_usageT)r   r   r   r+   r)  r   r   dictr   r   r+  r   r   r   r   r    s
    
$r  c                       s<   e Zd Zd fdd	Zdeeeef  dB fddZ  Z	S )	CudaMemoryMonitorTc                    s   t  | d S r   )superr+   r  	__class__r   r   r+     s   zCudaMemoryMonitor.__init__r  Nc           
   
      s>  ddl m}m}mm}mm}m} g g  zo|  | }t|t	s-t
d|  W d S dd t|D fddt|D  	 t|D ]%}||}t|tr`t
d|   W d S t| |jd	 |< qGtd
 | jsunqC|   fddt|D W S  |y }	 zt
d|	 W Y d }	~	d S d }	~	ww )Nr   r  z*nvmlDeviceGetCount result is not integer: c                 S      g | ]}d qS r   r   rD   r   r   r   r   rH         z7CudaMemoryMonitor.measure_gpu_usage.<locals>.<listcomp>c                    s   g | ]} |qS r   r   r4  )r  r  r   r   rH     s    Tz%nvmlDeviceGetMemoryInfo returns str: r   r!  c                        g | ]}| | | d qS )	device_idrE   max_used_MBr   r4  gpu_namemax_gpu_usager   r   rH         r  )r  r	  r
  r  r  r  r  r  r  r  r0   r  r  r   r#  r  r   r  )
r   r	  r
  r  r  r  r  r   r1   r  r   )r;  r<  r  r  r   r+    s>   $



z#CudaMemoryMonitor.measure_gpu_usager,  )
r   r   r   r+   r   r-  r   r   r+  __classcell__r   r   r0  r   r.    s    &r.  c                       s.   e Zd Zd fdd	Zdd Zdd Z  ZS )	RocmMemoryMonitorTc                    sl   t  | d}tj|r|tjvrtj| zdd l}|| _| j  W d S  t	y5   d | _Y d S w )Nz/opt/rocm/libexec/rocm_smir   )
r/  r+   rp   rq   rr   sysr   rocm_smiinitializeRsmiImportError)r   r  rocm_smi_pathrA  r0  r   r   r+     s   
zRocmMemoryMonitor.__init__c                 C   s(   | j d u rdS | j |dd d d S )Nr7   VRAMr   i   )rA  
getMemInfo)r   devr   r   r   get_used_memory  s   
z!RocmMemoryMonitor.get_used_memoryc                    s   | j d u rd S | j d urt| j  nd}dd t|D dd t|D  	 t|D ]}t| | ||< q,td | jsDnq( fddt|D S )Nr   c                 S   r2  r3  r   r4  r   r   r   rH     r5  z7RocmMemoryMonitor.measure_gpu_usage.<locals>.<listcomp>c                 S   s   g | ]}d | qS )GPUr   r4  r   r   r   rH     s    Tr!  c                    r6  r7  r   r4  r:  r   r   rH   &  r=  )	rA  r   listDevicesr  r#  rH  timer   r  )r   r  r   r   r:  r   r+    s   

z#RocmMemoryMonitor.measure_gpu_usager,  )r   r   r   r+   rH  r+  r>  r   r   r0  r   r?    s    r?  r@   c              	   C   sD  d }|dkr	t }nt}|d}| r|d ur|}n| }|d u r"d S |d u r(|S t }| }||j}z||}	|	 }
W d|_| }nd|_| }w |d u r]	 W d    d S td| d|  t	|dkrt	|dkrt	|t	|krd}t
|D ]\}}|d }|| d }|| }t||}q|W  d    S W d    d S W d    d S W d    d S 1 sw   Y  d S |d ur|}n| }|d u r|S t >}| }||j}z||}	|	 }
W d|_| }nd|_| }w td|d	d
|d	d || W  d    S 1 sw   Y  d S )Nr<   FzGPU memory usage: before=z  peak=r   r   r9  zCPU memory usage: before=z.1fz
 MB, peak=z MB)r?  r.  r+  r   submitr   r  r0   r1   r   r   r#  r)  )is_gpufuncmonitor_typestart_memorymemory_monitor_typemonitormemory_before_testexecutor
mem_thread	fn_thread_r(  max_usedr   memory_beforebeforeafterr  r   r   r   measure_memory0  s   



(






&r\  c                  C   sL   g d} d}| D ]}t |}|d u rq|r|d7 }|| d| 7 }q|S )N)ORT_DISABLE_FUSED_ATTENTION!ORT_ENABLE_FUSED_CAUSAL_ATTENTION!ORT_DISABLE_FUSED_CROSS_ATTENTIONORT_DISABLE_TRT_FLASH_ATTENTION&ORT_DISABLE_MEMORY_EFFICIENT_ATTENTIONORT_TRANSFORMER_OPTIONSORT_CUDA_GEMM_OPTIONSr   ,=)rp   getenv)	env_namesenvrE   r   r   r   r   get_ort_environment_variablest  s   	
ri  r,  r   r3  )r  )r@   N)6r   rk   rp   r  r@  rK  r   abcr   r   concurrent.futuresr   r   enumr   r   typingr   ri   r   rv   rh   	packagingr	   rK   rl   r   r0   r
   r    r'   r6   r   rd   ro   r{   r   r   r   r   r   longlongr   r   r  r   r-  r   r  r  r.  r?  r\  ri  r   r   r   r   <module>   sf   

	

G
":

=
&2
+D