o
    3Ih                     @   sZ  d Z ddlZddlZddlZddlZddlmZ ddlZddlZddlm	Z	m
Z
mZmZmZmZmZmZmZmZmZ ddlmZ ddlmZmZ ddlmZmZmZmZ ddlmZ dd	l m!Z! e"d
Z#ej$ddZ$dej%vrwe&e$ej%d< ddl'Z'ddl(m)Z)m*Z*m+Z+ dd Z,dd Z-de.de.fddZ/dd Z0dd Z1dd Z2e3dkre2  dS dS )a   Benchmarking the inference of pretrained transformer models.
PyTorch/TorchScript benchmark is based on https://github.com/huggingface/transformers/blob/master/examples/benchmarks.py.
One difference is that random input_ids is generated in this benchmark.

For onnxruntime, this script will convert a pretrained model to ONNX, and optimize it when -o parameter is used.

Example commands:
    Export all models to ONNX, optimize and validate them:
        python benchmark.py -b 0 -o -v -i 1 2 3
    Run OnnxRuntime on GPU for all models:
        python benchmark.py -g
    Run OnnxRuntime on GPU for all models with fp32 optimization:
        python benchmark.py -g -o
    Run OnnxRuntime on GPU with fp16 optimization:
        python benchmark.py -g -o -p "fp16"
    Run TorchScript on GPU for all models:
        python benchmark.py -e torchscript -g
    Run TorchScript on GPU for all models with fp16:
        python benchmark.py -e torchscript -g -p "fp16"
    Run ONNXRuntime and TorchScript on CPU for all models with quantization:
        python benchmark.py -e torchscript onnxruntime -p "int8" -o
    Run OnnxRuntime with the ROCM provider and graph optimization script:
        python benchmark.py -g -m bert-base-cased --provider rocm --optimizer_info by_script --disable_embed_layer_norm
    Run OnnxRuntime with bfloat16 fastmath mode kernels on aarch64 platforms with bfloat16 support:
        python benchmark.py --enable_arm64_bfloat16_fastmath_mlas_gemm

It is recommended to use run_benchmark.sh to launch benchmark.
    N)datetime)ConfigModifierOptimizerInfo	Precisioncreate_onnxruntime_sessionget_latency_resultinference_ortinference_ort_with_io_bindingoutput_detailsoutput_fusion_statisticsoutput_summarysetup_logger)FusionOptions)MODEL_CLASSESMODELS)create_onnxruntime_inputexport_onnx_model_from_ptexport_onnx_model_from_tfload_pretrained_model)version)QuantizeHelper F)logicalOMP_NUM_THREADS)
AutoConfigAutoTokenizerLxmertConfigc           4      C   s  dd l }g }| r'd| vr'd| vr'd| vr'd| vr'td |S d}|dkr?tj}d}d	| vr?td
 |S |tjkrMtd| d |D ]}t| d }|
D ]}|t|krf n|d | }t| d |_	t
|}d|v rt , t|t| d t| d t| d |||||| |||||||\}} }!}"W d    n1 sw   Y  d|v rt|t| d t| d t| d |||||| |||||||\}} }!}"| sqZt|| |d|||d}#|#d u rqZdd |# D }$g }%| rdnd}&tj||d}'tt|t|t|!|'jg}(tt||'jg})|D ]}*|*dkr,q#|D ]}+|"d ur<|+|"kr<q.d|v rDtjntj},t|!|*|+||'|,}-d|j||&||| ||||*|+| tt d}.|'j	dv rt d| d|*d|'j!|'j!g  nt d| d|*|+g  |rt"|#|-|.|	|*|}/nG|##|$|-}0|(g}1t$t|0D ]}2|2dkrt| d dkr|1%|) q|1%|( qd|v rtj&ntj'}3t(|#|-|.|	|$|0|%|1|*|&|3|}/t |/ |%|/ q.q#qZqO|S )Nr   CUDAExecutionProviderMIGraphXExecutionProviderROCMExecutionProviderDmlExecutionProviderzPlease install onnxruntime-gpu or onnxruntime-directml package instead of onnxruntime, and use a machine with GPU for testing gpu performance.tensorrt   TensorrtExecutionProviderzhPlease install onnxruntime-gpu-tensorrt package, and use a machine with GPU for testing gpu performance.zOptimizerInfo is set to zA, graph optimizations specified in FusionOptions are not applied.   pt      tfT)enable_all_optimizationnum_threadsverbose(enable_mlas_gemm_fastmath_arm64_bfloat16c                 S   s   g | ]}|j qS  )name).0node_argr-   r-   b/home/air/sanwanet/gpt-api/venv/lib/python3.10/site-packages/onnxruntime/transformers/benchmark.py
<listcomp>   s    z#run_onnxruntime.<locals>.<listcomp>cudacpu	cache_dironnxruntimeenginer   	providersdevice	optimizer	precision
io_binding
model_nameinputsthreads
batch_sizesequence_lengthcustom_layer_numr   vitswinzRun onnxruntime on  with input shape gpt))r7   get_available_providersloggererrorr   NOOPTwarningr   len
model_typer   parsetorchno_gradr   r   r   get_outputsr   from_pretrainednumpyprodmaxhidden_sizeint64int32r   __version__get_layer_numstrr   nowinfo
image_sizer   runrangeappendlonglongintcr	   )4use_gpuprovidermodel_namesmodel_classconfig_modifierr=   r*   batch_sizessequence_lengthsrepeat_timesinput_countsoptimizer_infovalidate_onnxr6   onnx_dirr+   	overwritedisable_ort_io_bindinguse_raw_attention_maskmodel_fusion_statisticsmodel_source(enable_arm64_bfloat16_fastmath_mlas_gemmargsr7   resultswarm_up_repeatr?   all_input_names
num_inputsinput_namesfusion_optionsonnx_model_fileis_valid_onnx_model
vocab_sizemax_sequence_lengthort_sessionort_output_namesoutput_buffersr;   configmax_last_state_sizemax_pooler_sizerB   rC   input_value_type
ort_inputsresult_templateresultort_outputsoutput_buffer_max_sizesi	data_typer-   r-   r1   run_onnxruntimeY   sh  











	

	


Nr   c                    s  g }| rt j std |S t d |D ]=}tj||	|d}|| t	||||d}|j
dv r:|d g}n
tj||d}|j}td|  td	|   |tjkr_|  t | red
nd}|| |tjkrwt|}|D ]}|dkrqy|D ]}|j
dv rtd| d|d|j|jg  t j|d|j|jf|tjkrt jnt j|dn&|d ur||krqtd| d||g  t jd|jd ||ft j|dz^|	rt j |n|
rt !|n|   t"j# fdd|dd}|	rdn|
rdndt j$d| rdndd|d|d||||% t&t'( d}|)t*|| t| |+| W q t,yS } zt-| t j.  W Y d }~qd }~ww qyq|S )NzYPlease install PyTorch with Cuda, and use a machine with GPU for testing gpu performance.F)torchscriptr6   )r   r6   custom_model_classrE   r   r5   zModel zNumber of parameters zcuda:0r4   zRun PyTorch on rH   r$   )sizedtyper;   r&   )lowhighr   r   r;   c                      s    S Nr-   r-   	inference	input_idsr-   r1   <lambda>  s    zrun_pytorch.<locals>.<lambda>repeatnumberr   torch2rR   NAr3   r   r8   )/rR   r3   is_availablerK   rL   set_grad_enabledr   rU   modifyr   rP   r   model_max_lengthdebugnum_parametersr   FLOAT16halfr;   toINT8r   quantize_torch_modelr`   ra   randnfloat16float32randintr   longjittracecompiletimeitr   r\   r]   r^   r   r_   updater   rd   RuntimeError	exceptionempty_cache)rg   ri   rj   rk   r=   r*   rl   rm   rn   r   r   r6   r+   rz   r?   r   model	tokenizermax_input_sizer;   rB   rC   runtimesr   er-   r   r1   run_pytorch:  s   









"


9r   do_eager_modeuse_xlac                    s*   ddl m dd l fdd}|S )Nr   )wrapsc                    sT     fdd} j d fdd}du r(du s&J d|S |S )	Nc                         | i |S r   r-   ry   kwargsfuncr-   r1   run_in_eager_mode     zFrun_with_tf_optimizations.<locals>.run_func.<locals>.run_in_eager_mode)experimental_compilec                     r   r   r-   r   r   r-   r1   run_in_graph_mode  s   zFrun_with_tf_optimizations.<locals>.run_func.<locals>.run_in_graph_modeTFzcCannot run model in XLA, if `args.eager_mode` is set to `True`. Please set `args.eager_mode=False`.)function)r   r   r   r   r(   r   r   r   r1   run_func  s   

z+run_with_tf_optimizations.<locals>.run_func)	functoolsr   
tensorflow)r   r   r   r-   r   r1   run_with_tf_optimizations  s   r   c                    s  g }dd l jj| | sjg d | r$j s$td |S | r`j	d}zj|d d jj
|d d jjdd W n ty_ } zt| W Y d }~nd }~ww |tjksj|tjkrntd|D ]}tj||	d |  t| |	|dd	tj||	d}|j}|D ]}|dkrq|D ]}|d ur||krqtd
| d||g  dd l}|  fddt|| D }j|||fjdzzt dddfdd}t dddfdd}t ddd fdd}| j!r	|nt" t#r|  t$j%fdd|dd}dj&d| r)dndd|d|d||||' t(t)* d }|+t,|| t| |-| W q tyv } zt| dd!l.m/} |0 }|1  W Y d }~qd }~ww qqp|S )"Nr   GPUzVPlease install Tensorflow-gpu, and use a machine with GPU for testing gpu performance.Tz/gpu:0)r;   z+Mixed precision is currently not supported.r5   )r   r6   r   is_tf_modelzRun Tensorflow on rH   c                    s   g | ]} d  jd qS )r   r&   )r   r   )r/   r   )r   rngr-   r1   r2     s    z"run_tensorflow.<locals>.<listcomp>)shaper   F)r   r   c                      s    ddS )NF)trainingr-   r-   r   r   r-   r1   encoder_forward  s   z'run_tensorflow.<locals>.encoder_forwardc                      s     ddS )NF)decoder_input_idsr   r-   r-   r   r-   r1   encoder_decoder_forward  r   z/run_tensorflow.<locals>.encoder_decoder_forwardc                     s8   j dd jg} j dd jg}| |ddS )Nr&   F)visual_feats
visual_posr   )randomnormalvisual_feat_dimvisual_pos_dim)featspos)r   r   r   r(   r-   r1   lxmert_forward  s   z&run_tensorflow.<locals>.lxmert_forwardc                      s     S r   r-   r-   )r   r-   r1   r   '  s    z run_tensorflow.<locals>.<lambda>r&   r   r   r   r3   r4   r   r8   )r3   )2r   r   	threading set_intra_op_parallelism_threadsset_visible_devicestestis_built_with_cudarK   rL   list_physical_devicesexperimentalset_memory_growth
distributeOneDeviceStrategyr   r   r   r   r   NotImplementedErrorr   rU   r   r   r   r   r`   r   Randomrc   constantr[   r   is_encoder_decoder
isinstancer   r   r   r\   r]   r^   r   r_   r   r   rd   numbar3   get_current_devicereset)rg   ri   rj   rk   r=   r*   rl   rm   rn   r6   r+   rz   physical_devicesr   r?   r   r   rB   rC   r   valuesr   r   r   r   r   r3   r;   r-   )r   r   r   r   r   r(   r1   run_tensorflow  s   









Ir   c                  C   s  t  } | jddddtg dtt ddt  d | jd	dd
tdddgdd | jddtd ttddt d | jddddtdgg ddd | jdddtt	j
dddd | jddtt	j
dddd | jdd dd!d"d# | jd$dtd d%d | jd&d'ttjttd(d) | jd*dd!d+d# | jd,dd!d-d# | jd.d/ttjttd0d) | jd1d2dd!d3d# | jd4d5dd d6d7 | jd8d9dd d:d7 | jd;d<dd d=d7 | jd>d?ddd
gtg d@dAdB | jdCdDddEtdFdG | jdHdIdtd
gdJ | jdKdLdtg dMdJ | jdNdd!dOd# | jddP | jdQdRddtdSgdTdU | jdVdtd dWd | jdXdd!dYd# | jddZ t|  |  }|S )[Nz-mz--modelsF+)zbert-base-casedzroberta-basegpt2z Pre-trained models in the list: z, )requirednargstypedefaultchoiceshelpz--model_sourcer&   r%   r(   zExport onnx from pt or tfz--model_classz!Model type selected in the list: )r   r   r   r   r  z-ez	--enginesr7   )r7   rR   r   r   r   zEngines to benchmarkz-cz--cache_dir.cache_modelsz%Directory to cache pre-trained models)r   r   r   r  z
--onnx_dironnx_modelszDirectory to store onnx modelsz-gz	--use_gpu
store_truezRun on gpu device)r   actionr  z
--providerzExecution provider to usez-pz--precisionzfPrecision of model to run. fp32 for full precision, fp16 for half precision, and int8 for quantization)r   r   r   r  z	--verbosezPrint more informationz--overwritezOverwrite existing modelsz-oz--optimizer_infozjOptimizer info: Use optimizer.py to optimize onnx model as default. Can also choose from by_ort and no_optz-vz--validate_onnxzValidate ONNX modelz-fz--fusion_csvz:CSV file for saving summary results of graph optimization.)r   r   r  z-dz--detail_csvz#CSV file for saving detail results.z-rz--result_csvz$CSV file for saving summary results.z-iz--input_counts)r&   r'   r$   zXNumber of ONNX model inputs. Please use 1 for fair comparison with Torch or TorchScript.)r   r   r   r   r   r  z-tz--test_timesd   z8Number of repeat times to get average inference latency.)r   r   r   r  z-bz--batch_sizes)r   r   r   z-sz--sequence_lengths)             @         z--disable_ort_io_bindingz=Disable running ONNX Runtime with binded inputs and outputs. )rt   z-nz--num_threadsr   zThreads to use)r   r   r   r   r  z--force_num_layersz%Manually set the model's layer numberz*--enable_arm64_bfloat16_fastmath_mlas_gemmzHEnable bfloat16 mlas gemm kernels on aarch64. Supported only for CPU EP )rx   )argparseArgumentParseradd_argumentr^   listr   keysjoinr   ospathr   FLOAT32r   BYSCRIPTintset_defaultsr   add_arguments
parse_args)parserry   r-   r-   r1   parse_argumentsF  sV  

					

r  c                  C   s  t  } t| j | jtjkr| jstd d S | jtj	kr-| jr-| j
dvr-td d S t| jdkrCt| jd  d dv rCdg| _td	d
 | jD | _td|   tj| jsvzt| j W n tyu   td| j Y nw d| jv }d| jv }d| jv }d| jv }d| jv }|rttjtdk rtdtj  d S t| j}g }| jD ]}t| t tj!"  |s|s|r+| j#dgkrt$d |r|t%| j| j| j&|| j|| j'| j| j(dd| j| j7 }|r|t%| j| j| j&|| j|| j'| j| j(dd| j| j7 }|r+|t%| j| j| j&|| j|| j'| j| j(dd| j| j7 }|rG|t)| j| j| j&|| j|| j'| j| j(| j| j7 }i }	|rz4| j* }
|t+| j| j
| j| j&|| j|| j'| j| j(| j#| j,| j-| j| j.| j| j/| j0|
|	| j1| j2| 7 }W q t3y   t4d Y qw qt56 7d}|	r| j8pd| d}t9|	| t|dkr| j'dgkrt$d d S | j:pd| d}t;|| | j<pd| d}t=|||  d S )Nzfp16 is for GPU only)migraphxrocmzint8 is for CPU onlyr&   r   r$   )rF   swimr   c                 S   s   h | ]
}|d kr
t n|qS )r   )	cpu_count)r/   xr-   r-   r1   	<setcomp>  s    zmain.<locals>.<setcomp>zArguments: z#Creation of the directory %s failedrR   r   r   r7   r   z2.0.0z2PyTorch version must be >=2.0.0 and you are using zB--input_counts is not implemented for torch or torchscript engine.TF	Exceptionz%Y%m%d-%H%M%Sbenchmark_fusion_z.csvzNo any result available.benchmark_detail_benchmark_summary_)>r  r   r+   r=   r   r   rg   rK   rL   r   rh   rO   modelsr   rm   sortedr*   r`   r  r  existsr6   mkdirOSErrorenginesr   rQ   rR   r\   r   force_num_layersset_num_threadsr   
__config__parallel_inforo   rN   r   rj   rl   
test_timesr   use_mask_indexr   rp   rq   rr   rs   rt   rw   rx   r%  r   r   r_   strftime
fusion_csvr   
detail_csvr
   
result_csvr   )ry   enable_torchenable_torch2enable_torchscriptenable_onnxruntimeenable_tensorflowrk   rz   r*   rv   ru   
time_stampcsv_filenamer-   r-   r1   main  s  


$












r@  __main__)4__doc__r  loggingr  r   r   rV   psutilbenchmark_helperr   r   r   r   r   r   r	   r
   r   r   r   r   r   huggingface_modelsr   r   onnx_exporterr   r   r   r   	packagingr   quantize_helperr   	getLoggerrK   r"  environr^   rR   transformersr   r   r   r   r   boolr   r   r  r@  __name__r-   r-   r-   r1   <module>   sB   4

 bp  I #
