o
    3Ih                    @   sf  d Z ddlZddlZddlZddlZddlZddlmZ ddlm	Z	 ddl
mZ ddlZddlZddlZddlmZmZ ddlmZ ddlmZmZmZ dd	lmZ dd
lmZmZmZmZmZm Z m!Z!m"Z" ddl#m$Z$m%Z%m&Z&m'Z' ddl(m)Z* ddl+m,Z, ddl-m.Z/ ddl0m1Z1m2Z2 e3dZ4G dd deZ5dde6e7 dB dej8fddZ9dej8fddZ:dej8fddZ;dde7de<fdd Z=dde7de<de<fd!d"Z>d#e7d$e<d%e<de%fd&d'Z?d(ejd)efd*d+Z@d(ejd)efd,d-ZAd(ejd)efd.d/ZB	0	1		dd2ed3ed4e7d5eCd6eDdB d7eDdB fd8d9ZEd:ed;efd<d=ZF	1dd(ed5eCde6e fd>d?ZGd@dA ZHdBdC ZIdDdE ZJdFefdGdHZKdFedIe<dJe<de<fdKdLZLdFefdMdNZMdOedPe7fdQdRZNdg fdOedSeCdTe6eC fdUdVZOdOefdWdXZPdOedPe7fdYdZZQ		[	\ddOed]e7d^eCd_eCd`eCf
dadbZRdFefdcddZSdFefdedfZTdgefdhdiZUddje7de<fdkdlZV	ddje7dme7de<de<fdndoZWdpdq ZXe5jYfdej8dre5fdsdtZZdej8dOee!B duej[dvej[dweCdxeCdye6e6eC  deDe7ef fdzd{Z\d|d} Z]		~ddej8de6e7 dB de<fddZ^ddej8de6e7 dB fddZ_dde6e7 dB de6e7 dB fddZ)e`dkr1e)  dS dS )a  
This converts GPT2 or T5 model to onnx with beam search operator.

Example 1: convert gpt2 model with beam search:
    python convert_generation.py -m gpt2 --output gpt2_beam_search.onnx

Example 2: convert gpt2 model with beam search containing specific cuda optimizations:
    python convert_generation.py -m gpt2 --output gpt2_beam_search.onnx --use_gpu                       --past_present_share_buffer --use_decoder_masked_attention

Example 3: convert gpt2 model with beam search with mixed precision and enable SkipLayerNorm strict mode:
    python convert_generation.py -m gpt2 --output gpt2_beam_search.onnx --use_gpu -p fp16 --use_sln_strict_mode

Example 4: convert T5 model with beam search in two steps:
    python -m models.t5.convert_to_onnx -m t5-small
    python convert_generation.py -m t5-small --model_type t5                     --decoder_onnx ./onnx_models/t5-small_decoder.onnx                       --encoder_decoder_init_onnx ./onnx_models/t5-small_encoder.onnx          --output ./onnx_models/t5_small_beam_search.onnx

Example 5: convert T5 model with beam search. All in one step:
    python convert_generation.py -m t5-small --model_type t5 --output t5_small_beam_search.onnx

Example 6: convert T5 model with beam search containing specific cuda optimizations. All in one step:
    python convert_generation.py -m t5-small --model_type t5 --output t5_small_beam_search.onnx           --use_gpu --past_present_share_buffer --use_decoder_masked_attention

Example 7: convert MT5 model with external data file like mt5-base-beamsearch.onnx.data in below example.
    python convert_generation.py -m google/mt5-base --model_type mt5 --output mt5-base-beamsearch.onnx -e

Example 8: convert gpt2 model with greedy search:
    python convert_generation.py -m gpt2 --output gpt2_greedy_search.onnx --num_beams 1 --num_return_sequences 1

Example 9: convert gpt2 model with sampling:
    python convert_generation.py -m gpt2 --output gpt2_sampling.onnx --num_beams 1 --num_return_sequences 1 --top_p 0.6
    N)Enum)Path)Any)	Precisionsetup_logger)NumpyHelper)
GraphProto
ModelProtoTensorProto)	OnnxModel)
GPT2ConfigGPT2LMHeadModelGPT2Tokenizer	MT5ConfigMT5ForConditionalGenerationT5ConfigT5ForConditionalGenerationT5Tokenizer)GraphOptimizationLevelInferenceSessionSessionOptionsget_available_providers)main)PRETRAINED_GPT2_MODELS)export_onnx_models)PRETRAINED_MT5_MODELSPRETRAINED_T5_MODELS c                   @   s    e Zd ZdZdZdZdd ZdS )GenerationTypebeam_searchgreedy_searchsamplingc                 C   s   | j S N)value)self r%   k/home/air/sanwanet/gpt-api/venv/lib/python3.10/site-packages/onnxruntime/transformers/convert_generation.py__str___   s   zGenerationType.__str__N)__name__
__module____qualname__
BEAMSEARCHGREEDYSEARCHSAMPLINGr'   r%   r%   r%   r&   r   Z   s
    r   argvreturnc                 C   sB  t  }|d}|jdddtddtt t  d |jdd	td
g dddg d d |jdd	tt	j
dddd |jdd	tddd |jdd	tddd |jdd	ddd |jd	d |d}|jddtdd |jd d!d	ttjjtjjtjjgd"d |jd#d$d	d%d&gd'd( |jd)d*d	dd+d |jd	d, |jd-d.d	dd/d |jd	d0 |jd1d2d	dd3d |jd	d4 |jd5d6d	dd7d |jd	d8 |jd9d:d	dd;d |jd	d< |jd=d	dd>d |jd	d? |d@}|jdAd	ddBd |jd	dC |jdDd	ddEd |jd	dF |jdGd	ddH |jd	dI |jdJtd	dKdLdM |jdNd	ddOd |jd	dP |jdQd	ddRd |jd	dS |jdTd	ddUd |jd	dV |jdWd	ddXd |jd	dY |jdZd	dd[d |jd	d\ |jd]d	dd^d |jd	d_ |jd`d	ddad |jd	db |dc}|jddtd	dedfdM |jdgtd	dhdidM |jdjtd	dkdldM |jdmtd	dedndM |jdotd	dedpdM |jdqtd	dedrdM |jdstd	dtdudM |jdvtd	dtdwdM |jdxtd	tdy dzdM |jd{td	ded|dM |jd}td	d~ddM |jdtd	dKddM |jdtd	dddM |jdtd	dddM |jdtd	dddM |d}|jdd	ddd |jd	d |jdd	ddd |jd	d |jdd	ddd |jd	d |jdd	ddd |jd	d |jdd	ddd |jd	d |jdd	tdedd |jdd	ddd |jd	d || }|S )zParse arguments

    Args:
        argv (Optional[List[str]], optional): _description_. Defaults to None.

    Returns:
        argparse.Namespace: Parsed arguments.
    zInput optionsz-m--model_name_or_pathTzEPytorch model checkpoint path, or pretrained model name in the list: , )requiredtypehelpz--model_typeFgpt2)r5   t5mt5z*Model type (default is gpt2) in the list: )r2   r3   defaultchoicesr4   --cache_dir.cache_modelsz%Directory to cache pre-trained models)r2   r3   r8   r4   z--decoder_onnxr   zLPath of onnx model for decoder. Specify it when you have exported the model.z--encoder_decoder_init_onnxzgPath of ONNX model for encoder and decoder initialization. Specify it when you have exported the model.z	--verbose
store_truezPrint more information)r2   actionr4   )verbosezOutput options--outputz,Output path for onnx model with beam search.z-p--precisionzTPrecision of model to run. fp32 for full precision, fp16 for half or mixed precisionz-b--op_block_list*autozDisable certain onnx operators when exporting model to onnx format. When using defaultvalue for gpt2 type of model fp16 precision, it will be set to ["Add", "LayerNormalization", "SkipLayerNormalization", "FastGelu"]. Other situation, it will be set to [])r2   nargsr8   r4   z-e--use_external_data_formatz!save external data for model > 2G)use_external_data_formatz-sz--run_shape_inferencezrun shape inference)run_shape_inferencez-dpvsz--disable_pad_vocab_sizezDo not pad logits MatMul weight to be a multiple of 8 along the dimension where dim value is the vocab size. The logits MatMul may hence be of poor performance for fp16 precision.)disable_pad_vocab_sizez-dsgdz,--disable_separate_gpt2_decoder_for_init_runzDo not create separate decoder subgraphs for initial and remaining runs. This does not allow for optimizations based on sequence lengths in each subgraph)*disable_separate_gpt2_decoder_for_init_runz-iz--disable_shared_initializerszdo not share initializers in encoder and decoder for T5 or in the init decoder and decoder for GPT2. It will increase memory usage of t5/mt5/gpt2 models.)disable_shared_initializersz--encoder_decoder_initzbAdd decoder initialization to encoder for T5 model. This is legacy format that will be deprecated.)encoder_decoder_initz6Beam search parameters that stored in the output modelz--output_sequences_scoreszoutput sequences scores)output_sequences_scoresz--output_token_scoreszoutput token scores)output_token_scoresz--early_stopping)r2   r>   )early_stoppingz--no_repeat_ngram_sizer   zNo repeat ngram size)r3   r2   r8   r4   z--vocab_maskz\Enable vocab_mask. This mask applies only to every generated token to filter some bad words.)
vocab_maskz--past_present_share_bufferzWUse shared buffer for past and present, currently work for gpt2 greedy/sampling search.)past_present_share_bufferz--use_decoder_masked_attentionzUses `DecoderMaskedSelfAttention` or `DecoderMaskedMultiHeadAttention` to optimize the decoding Attention computation. Must be used with `past_present_share_buffer`. Currently, only Attention head sizes of 32, 64 and 128 are supported.)use_decoder_masked_attentionz--prefix_vocab_maskzeEnable prefix_vocab_mask. This mask can be used to filter bad words in the first generated token only)prefix_vocab_maskz--custom_attention_maskz]Enable custom_attention_mask. This mask can be used to replace default encoder attention mask)custom_attention_maskz--presence_maskz!Presence mask for custom sampling)presence_maskz--seedzRandom seed for sampling op)seedzYBeam search parameters not stored in the output model, for testing parity and performancez--min_length   zMin sequence lengthz--max_length2   zMax sequence lengthz--num_beams   z	Beam sizez--num_return_sequencesz&Number of return sequence <= num_beamsz--length_penaltyz<Positive. >1 to penalize and <1 to encourage short sentence.z--repetition_penaltyz-Positive. >1 to penalize and <1 to encourage.z--temperature      ?z6The value used to module the next token probabilities.z--top_pzTop P for samplingz--filter_valueInfzFilter value for Top P samplingz--min_tokens_to_keepzAMinimum number of tokens we keep per batch example in the output.z--presence_penalty        z%presence penalty for custom sampling.z--customz&If 1 customized top P logic is appliedz--vocab_sizezIVocab_size of the underlying model used to decide the shape of vocab maskz--eos_token_idzKcustom eos_token_id for generating model with existing onnx encoder/decoderz--pad_token_idzKcustom pad_token_id for generating model with existing onnx encoder/decoderz0Other options for testing parity and performancez--use_sln_strict_modez_Enable strict mode for SLN in CUDA provider. This ensures a better accuracy but will be slower.)use_sln_strict_mode	--use_gpuz)use GPU for inference. Required for fp16.)use_gpuz--disable_parityzdo not run parity test)disable_parityz--disable_perf_testzdo not run perf test)disable_perf_testz--torch_performanceztest PyTorch performance)torch_performancez--total_runsz4Number of times of inference for latency measurementz--save_test_dataz-save test data for onnxruntime_perf_test tool)save_test_data)argparseArgumentParseradd_argument_groupadd_argumentstrjoinr   r   r   ospathset_defaultsr   FLOAT32r#   FLOAT16intfloat
parse_args)r.   parserinput_groupoutput_groupmodel_groupbeam_parameters_group
test_groupargsr%   r%   r&   parse_argumentsc   s  	
		




rz   ry   c                 C   s   | j }d|d| jdd| jddddd	g}| jr|d
| jg | jr&|d | jr.|d t| j	r?|dg || j	 | jt
jjkrM| jsMJ d| jrXtd|  t|d dS )zqConvert GPT-2 model to onnx

    Args:
        args (argparse.Namespace): arguments parsed from command line
    r0   r@   z--optimize_onnxrA   z--test_runs1z--test_cases10z--overwriter:   r_   rF   rB   zEfp16 or mixed precision model cannot run in CPU. Please add --use_gpuzarguments for convert_to_onnx:)r.   N)model_name_or_pathdecoder_onnx	precision	cache_dirextendr`   appendrG   lenop_block_listr   ro   r#   r?   loggerinfoconvert_gpt2_to_onnx)ry   
model_name	argumentsr%   r%   r&   gpt2_to_onnx  s8   


r   c                 C   s   t | j| jt| jj| j| j| jt	j
jk| jddddd| j| j| jt	j
jkd}td|d   td|d   |d | _|d | _dS )	znConvert T5 model to onnx

    Args:
        args (argparse.Namespace): arguments parsed from command line
    FT)r}   r   
output_dirr`   rG   optimize_onnxr   r?   use_decoder_start_token	overwritedisable_auto_mixed_precisionuse_int32_inputs
model_typerL   force_fp16_iozonnx model for encoder: r   zonnx model for decoder: rW   N)export_t5_onnx_modelsr}   r   r   outputparentr`   rG   r   r   ro   r#   r   rL   r   debugencoder_decoder_init_onnxr~   )ry   pathsr%   r%   r&   
t5_to_onnx$  s*   

r   T	onnx_pathrG   c                 C   sP   ddl m} tj| dd}|j|ddd}|r!tj|| |d d	S td d	S )
zShape inference on an onnx file, which will be overwritten.

    Args:
        onnx_path (str): Path of onnx model
        use_external_data_format(bool): output tensors to external data or not.
    r   )SymbolicShapeInferenceTload_external_dataF)
auto_mergeguess_output_ranksave_as_external_dataz4Failed to run symbolic shape inference on the model.N)	&onnxruntime.tools.symbolic_shape_inferr   onnx
load_modelinfer_shapesr   saver   warning)r   rG   r   modeloutr%   r%   r&   shape_inferenceB  s   r   c                 C   s  t j| dd}|jjd j}t|}| }||v sJ || }|jdkr'dS d}||j	d }|du rR|
|dd}	|	du rBdS ||	j	d }|du rPdS d}|jtjjkr[dS t|jd	krddS |jd }
|
d
 dkrqdS t|
d
 d
 }||
 }|jr|rtj|jd |ftjd}tjt||fdd}||jd< ntj||jd ftjd}tjt||fdd}||jd< | |_ndS tj|| |d dS )zPad the logits MatMul weight in the provided decoder model, which will be overwritten.

    Args:
        onnx_path (str): Path of onnx model
        use_external_data_format(bool): output tensors to external data or not.
    Tr   r   MatMulFrW   N	Transpose      dtypeaxisr   )r   r   graphr   namer   output_name_to_nodeop_typeget_initializerinputmatch_parent	data_typer
   DataTypero   r   dimsmathceilraw_datanpzerosfloat16concatenater   to_arraytobytesr   )r   rG   decoder_model_protologits_output_namedecoder_modelr   matmul_nodepad_along_axis_1logits_weighttranspose_before_matmulactual_vocab_sizepadded_vocab_sizepaddingpadding_dataweight_with_paddingr%   r%   r&   pad_weights_of_logits_matmulT  sN   


r   
model_pathr`   r^   c                    sx   t  }tj|_|rddgndg}|r3dt vrtdtd |r3ddi}d|i  fdd|D }t| ||d	}|S )
a  Create OnnxRuntime session.

    Args:
        model_path (str): onnx model path
        use_gpu (bool): use GPU or not
        use_sln_strict_mode (bool): use strict mode for skip layer normalization or not

    Raises:
        RuntimeError: CUDAExecutionProvider is not available when --use_gpu is specified.

    Returns:
        onnxruntime.InferenceSession: The created session.
    CUDAExecutionProviderCPUExecutionProviderz5CUDAExecutionProvider is not available for --use_gpu!zuse CUDAExecutionProvider"enable_skip_layer_norm_strict_modeTc                    s$   g | ]}| v r| | fn|qS r%   r%   ).0r   provider_optionsr%   r&   
<listcomp>  s    z&create_ort_session.<locals>.<listcomp>)	providers)	r   r   ORT_DISABLE_ALLgraph_optimization_levelr   RuntimeErrorr   r   r   )r   r`   r^   sess_optionsexecution_providerscuda_provider_optionsort_sessionr%   r   r&   create_ort_session  s   


r   r   r   c              	   C   s  |t jjk}t| j}|d }|dksJ g ddd t|D  }t| jt|kr:tdt| dt| j t|D ]E\}}| j| j|kr[td| d	| d| j| j t	j
}|dkrj|rgt	jnt	j}| j| jjj}	|	|krtd| d
| d|	 q>td dgdd t|D  }
t| jt|
krtdt|
 dt| j t|
D ]>\}}| j| j|krtd| d	| d| j| j |rt	jnt	j}| j| jjj}||krtd| d
| d| qtd dS )a  Verify GPT-2 subgraph

    Args:
        graph (onnx.GraphProto): onnx graph of GPT-2
        precision (Precision): Precision (FLOAT16 or FLOAT32) of the model.

    Raises:
        ValueError: Number of inputs not expected.
        ValueError: Input name is not expected.
        ValueError: Input data type is not expected.
        ValueError: Number of outputs not expected.
        ValueError: Output name is not expected.
        ValueError: Output data type is not expected.
       rW   )	input_idsposition_idsattention_maskc                 S      g | ]}d | qS )past_r%   r   ir%   r%   r&   r         z(verify_gpt2_subgraph.<locals>.<listcomp> Number of inputs expected to be . Got Input  is expected to be $ is expected to have onnx data type z:Verifying GPT-2 graph inputs: name and data type are good.logitsc                 S   r   )present_r%   r   r%   r%   r&   r     r   !Number of outputs expected to be Output z;Verifying GPT-2 graph outputs: name and data type are good.N)r   ro   r#   r   r   range
ValueError	enumerater   r
   INT32FLOATr3   tensor_type	elem_typer   r   r   )r   r   
is_float16input_countlayer_countexpected_inputsr   expected_inputexpected_type
input_typeexpected_outputsexpected_outputoutput_typer%   r%   r&   verify_gpt2_subgraph  s>   
"
"
r  c              	   C   s<  |t jjk}|rtjntj}t| j}|d d }|dksJ ddg}t|D ]}|d|  |d|  q't|D ]}|d|  |d	|  q>t| jt|krit	d
t| dt| j t
|D ]?\}}| j| j|krt	d| d| d| j| j |dk rtjn|}	| j| jjj}
|
|	krt	d| d|	 d|
 qmdg}t|D ]}|d|  |d|  qt| jt|krt	dt| dt| j t
|D ]8\}}| j| j|krt	d| d| d| j| j | j| jjj}||krt	d| d| d| qdS )  Verify T5 decoder subgraph

    Args:
        graph (onnx.GraphProto): onnx graph of T5 decoder
        precision (Precision): Precision (FLOAT16 or FLOAT32) of the model.

    Raises:
        ValueError: Number of inputs not expected.
        ValueError: Input name is not expected.
        ValueError: Input data type is not expected.
        ValueError: Number of outputs not expected.
        ValueError: Output name is not expected.
        ValueError: Output data type is not expected.
    r   rY   rW   r   encoder_attention_maskpast_key_self_past_value_self_past_key_cross_past_value_cross_r   r   r   r   r   r   present_key_self_present_value_self_r   r   N)r   ro   r#   r
   r   r   r   r   r   r   r   r   r   r3   r   r   r   )r   r   r   
float_typer   r   r   r   r  r  r  r  r  r  r%   r%   r&   verify_t5_decoder_subgraph  sH   
""
r  c              	   C   s  |t jjk}d| jd jv }g d}|r|dd }t| jt|kr2tdt| dt| j t|D ]9\}}| j| j|krStd| d	| d| j| j t	j
}| j| jjj}||krotd| d
| d| q6|rt| jd dks}J t| jd }	|	dksJ g }
t|	D ]}|
d|  |
d|  qnStd t| jd d dksJ t| jd d }	|	dksJ ddg}
t|	D ]}|
d|  |
d|  qt|	D ]}|
d|  |
d|  qt| jt|
krtdt|
 dt| j t|
D ]B\}}| j| j|kr2td| d	| d| j| j |r8t	jnt	j}| j| jjj}||krUtd| d
| d| qtd dS )r  crossr   )encoder_input_idsr	  decoder_input_idsNr   r   r   r   r   r   rW   present_key_cross_present_value_cross_zZThis format is deprecated. Please export T5 encoder in new format with only cross outputs.rY   r   encoder_hidden_statesr  r  r   r   zMT5 encoder graph verified: name and data type of inputs and outputs are good.)r   ro   r#   r   r   r   r   r   r   r
   r   r3   r   r   r   r   r   r   r   r   )r   r   r   
new_formatr   r   r  r  r  r   r  r  r  r%   r%   r&   'verify_t5_encoder_decoder_init_subgraphG  s\   "
"
r  shared_   graph1graph2shared_prefixmin_elementssignature_cache1signature_cache2c                 C   s  i }i }g }g }	g }
| j D ]N}|jrt|j|ksq|j D ]=}|jr)t|j|ks*qt||||rZ||j ||j< || |j|vrX||j }|||j< |	| |
|  nqqtd|
  | j	D ]}t
t|jD ]}|j| |
v rtd|j|  qpqg|j	D ]}t
t|jD ]}|j| |
v rtd|j|  qq|	D ]}|j | q|jD ]}|j|v r||j |_q|j	D ]4}t
t|jD ]*}|j| |v r||j|  }td|j d| d|j|  d|  ||j|< qq|D ]}| j | q| jD ]}|j|v r||j |_q| j	D ]7}t
t|jD ],}|j| |v rO||j|  }td|j d| d|j|  d|  ||j|< q$q|	D ]	}||j |_qU|	D ] }tj|j}tj|j|j|}| j| |j| qa|	S )	a  Remove initializers with same value from two graphs.

    Args:
        graph1 (GraphProto): the first graph to process
        graph2 (GraphProto): the second graph to process
        shared_prefix (str): add prefix to the shared initializers among two graphs
        min_elements (int, optional): minimal number of elements for initializers to be considered. Defaults to 1024.
        signature_cache1 (dict): Optional dictionary to store data signatures of tensors in graph1 in order to speed up comparison
        signature_cache2 (dict): Optional dictionary to store data signatures of tensors in graph2 in order to speed up comparison
    zshared initializers:zname is found in graph 1: zname is found in graph 2: zgraph 2 rename node z input z from z to zgraph 1 rename node )initializerr   sumr   has_same_valuer   r   r   r   noder   r   r   r   remove
value_infor   numpy_helperr   shapehelpermake_tensor_value_infor   )r  r  r  r  r   r!  mapping_initializers_1mapping_initializers_2shared_initializers_1shared_initializers_2shared_initializers_namesinitializer1initializer2shared_namer%  jr"  r'  new_namer)  r%   r%   r&   remove_shared_initializers  s   












*


*
r6  encoder_modelr   c                 C   s`   t | }t |}|d |d i i }}|| || t|jj|jjd||d}|S )Ne_d_s_)r  r   r!  )r   add_prefix_to_namesremove_duplicated_initializerr6  r   r   )r7  r   encoderdecoderr   r!  initializersr%   r%   r&   get_shared_initializers  s   




r@  c                 C   s   g }| j D ]}|jrt|j|ksq|| q|D ]}| j | q|D ]}tj|j}tj	
|j|j|}| j| q%|S )a^  Remove initializers of a graph, when they have number of elements larger than a threshold.

    Args:
        graph (GraphProto): the graph.
        min_elements (int, optional): minimal number of elements for initializers to be considered. Defaults to 1024.

    Returns:
        List[TensorProto]: initializers that are removed from the graph.
    )r"  r   r#  r   r&  r   r(  r   r)  r*  r+  r   r   r'  )r   r  moved_initializerstensorr"  r)  r'  r%   r%   r&   move_initializers  s   
rC  c                 C   s   | j dkrtd| j d| j dkr| j}n^| j dkr | j}nU| j dkr)| j}nL| j dkr2| j}nC| j dkr;| j}n:| j d	krD| j}n1| j d
krM| j	}n(| j dkrV| j
}n| j dkr_| j}n| j dkrh| j}ntd| j d| j  d| j|fS )z
    Convert attribute to kwarg format for use with onnx.helper.make_node.
        :parameter attribute: attribute in AttributeProto format.
        :return: attribute in {key: value} format.
    r   z
attribute z does not have type specified.rW   r   r   rY            r   	   
   z has unsupported type r;   )r3   r   r   fr   stgfloatsintsstringstensorsgraphs)	attributer#   r%   r%   r&   _attribute_to_pair>  s0   











rS  c                 C   sD   i }| j D ]}t|\}}|||i q| jr |d| ji |S )Ndomain)rR  rS  updaterT  )r%  kwargsattrkeyr#   r%   r%   r&   	kwargs_ofc  s   
rY  c                 C   s   t dd | jjjjD S )Nc                 S   s   g | ]}|j r
|j n|jqS r%   )	dim_param	dim_value)r   dr%   r%   r&   r   n  s    zshape_of.<locals>.<listcomp>)tupler3   r   r)  dim)vir%   r%   r&   shape_ofm  s   r`  subgc              
   C   s  d}d}g }t | jD ],\}}||kr1t|}tjj|j|jjj	|d |d |d d|d gd}|
|g q|
tjjdtjjdgd	g | d
 | j
| g }t | jD ],\}}||krt|}tjj|j|jjj	|d |d |d d|d gd}|
|g qZ| d | j
| g }| jD ]P}	|	}
|	jdkrt|	}|ddi g }|
|	j t|dk r|
dg t|dk st|dk r|
dg tjjd||	jfd|	ji|}
|
|
g q| d | j
| | S )Nr   rW   r   r   max_seq_lenrY   r   r)  past_sequence_lengthr)  r   r   	AttentionrQ   rE  r   rF  r   r%  )r   r   r`  r   r*  r+  r   r3   r   r   r   r
   r   
ClearFieldr   r%  r   rY  rU  r   	make_node)ra  input_past_0output_past_0
new_inputsr   r_  r)  new_outputs	new_nodesr%  new_noderV  nisr%   r%   r&   1update_decoder_subgraph_past_present_share_bufferq  sZ    



 
rp  is_beam_searchswitch_attentionc                 C   s  |r@g }t | jD ]
\}}||g q	|tjjdtjjdgdg |tjjdtjjg ddg | d | j| |rg d}g }| j	D ]x}|j
dkrt|}	|	 D ]}
|
d	krd  d
S |
|vrx|
dkrutd|
 d |	|
= qZg }||j |rt|dk r|dg t|dk st|dk r|dg t|dk r|dg tjjd||jfd|ji|	}||g qK| d | j	| dS )aS  Update the Attention nodes to DecoderMaskedSelfAttention.

    Args:
        subg (GraphProto): GraphProto of the decoder subgraph
        is_beam_search (bool): Boolean specifying if the sampling algo is BeamSearch
        switch_attention (bool): Boolean specifying if `Attention` is to be switched with `DecoderMaskedSelfAttention`
    
beam_widthrW   re  cache_indirection
batch_sizers  rb  r   rQ   	num_headsscalemask_filter_valuerT  rf  qkv_hidden_sizesFunidirectionalzRemoving attribute: zB from Attention node while switching to DecoderMaskedSelfAttentionrF  r   r   rG  DecoderMaskedSelfAttentionr   r%  T)r   r   r   r   r*  r+  r
   r   rg  r%  r   rY  copyr   r   r   rh  r   r   )ra  rq  rr  rk  _ir_  'decoder_masked_attention_supported_attrrm  r%  rV  kro  r%   r%   r&   4update_decoder_subgraph_use_decoder_masked_attention  sl   
 
	



r  c                 C   s  t  }g }dd t| jD }i }i }| jD ]'}|jD ]}|r0||vr)|g||< q|| | q|jD ]}|r<|||< q4q| jD ],}|jdkrn|jd rT|jd sUqA|jd |jd }	}
d}d|
v r| jD ]}|jdkr|jd |
kr|jd j} nqin| j	D ]}|j
|
kr|} nq|du rqAtj|}|jdkrn| d	v rn|jd |v rn||	 }|jd
kr|jd sqA|jd |v r|jd ds|jd dr| dkr||jd  || t||jd  dkr|| qA|jd |vrqA||jd  }|jdkr|jd sqA||jd  }|jdkr*|jd s+qA|jd |v rn|jd dsE|jd drn| dkrn||jd  ||||g t||jd  dkrn|| qAqA||fS )az  Correct graph which originally use dim of past_seq_len from input_ids's shape which is fixed to max_seq_len after
       shared past/present buffer

    Args:
        subg (GraphProto): GraphProto of the decoder subgraph
    return:
        tensor_names_to_rename : set of tensor names which is equal to past_sequence_length
        nodes_to_remove : list of node to remove
    c                 S   s   i | ]\}}|j |qS r%   r   )r   indexinpr%   r%   r&   
<dictcomp>      z+find_past_seq_len_usage.<locals>.<dictcomp>GatherrW   r   N	Constant_Constant>   rW   r   Shaper
  r  r   Reshaper   )setr   r   r%  r   r   r   rR  rK  r"  r   r   r(  r   sizeitem
startswithaddr   r   )ra  tensor_names_to_renamenodes_to_removegraph_input_namesinput_name_to_nodesr   r%  
input_nameoutput_nameshape_tensor_nameshape_index_nameini_gather_indices
const_noderB  gather_indices_arr
shape_nodereshape_nodetranspose_noder%   r%   r&   find_past_seq_len_usage  s   









r  r   past_seq_len_namec                 C   s   d}t tdd | jjj}|D ]"}t|jdk r&|jd t|jdk s|j| |j| q| jjjtj	j
|tjg dd |   | S )Nrt  c                 S   
   | j dkS NMultiHeadAttentionr   r%  r%   r%   r&   <lambda>d     
 z.add_cache_indirection_to_mha.<locals>.<lambda>r   r   rv  rs  max_sequence_lengthre  )listfilterr   r   r%  r   r   r   r   r*  r+  r
   r   topological_sort)r   r  cache_indirection_name	mha_nodesr%  r%   r%   r&   add_cache_indirection_to_mhaa  s   
r  r   skip_node_idxsc              
   C   sR  d}g }t tdd | jjj}t|D ]\}}||v rqd}|jD ]}	|	jdkr.|	j} nq"|}
|
dkrJ| jjj	D ]}|j|j
d krI|j}
 nq:d}| jjj
D ]}|j|j
d krf|jjjjd	 j} nqQt|jdk r{|jd
 t|jdk sn| d|d	  }|j| |tjj||
d|d|gd q| jjj| |   | S )Noutput_cross_qkc                 S   r  r  r  r  r%   r%   r&   r  z  r  z&add_output_qk_to_mha.<locals>.<lambda>r   rx  r   target_sequence_lengthrW   r   r   _rv  sequence_lengthre  )r  r  r   r   r%  r   rR  r   r   r"  r   r   r3   r   r)  r^  r[  r   r   r   r   r*  r+  r   r  )r   r   r  output_qk_basename
output_qksr  idxr%  rx  attoutput_qk_dtyper   r  output_qk_namer%   r%   r&   add_output_qk_to_mhav  sP   


r  c                    s*  t tdd | jjjd }| |ddgddg}|d u rd S | |d g dg d	 | |d g d
g d}| |d g dg d} d u sZ|d u sZ dd  |dd  kr\d S t t fdd| jjjd }| jjj d  | jjj d  | jjj| |d ur| jjj|d  | jjj|d  d}| jjjt	j
j|tjdgd d}d}t	j
jd|g|g| dd}	t	j
j|tjg d}
t	j
jd|g|g| dtjd}t	j
j|tjg d}| jjj|
|g | d jd< ||d jd< | jjj|	|g |   | |fS )Nc                 S   r  )NLayerNormalizationr  nr%   r%   r&   r    r  z*fix_past_sequence_length.<locals>.<lambda>r   AddSlicerW   r]   )	Unsqueezer  r  r  )r   r   r   r   )r  r  r  rW   r   r   )r  r  r  r  r   )rW   r   r   r   r   c                    s   | j d  d jd kS )Nr   r  rW   )r   r   r  	left_pathr%   r&   r    r  rd  re  past_seq_len_int32past_seq_len_int64Squeezeinputsoutputsr   Castr  r  r   to)r  r  r   r   r%  match_parent_pathr&  r   r   r   r*  r+  r
   r   rh  create_node_nameINT64r'  r   r  )r   r%  	base_path
right_pathlong_right_pathconstant_noder  r  r  squeeze_nodesqueeze_output	cast_nodecast_outputr%   r  r&   fix_past_sequence_length  sv   !( 
r  c                 C   s  d}d}| j jjtjj|tjdgdtjj|tjg ddg t	t
dd | j jj}t|D ]\}}d}|jD ]}|jd	krF|j} nq:d
|d  }	tjj|	tjd|ddgd}
|d dkrj| j jj|
 tjjd|jd |jd |jd ddt|jdkr|jd ndt|jdkr|jd nd||||jd g|jd t|jdkr|jd ndt|jdkr|jd nd|d dkr|	ndg|jddd||d dd}|d dkr|jd | j jj| | j jj|g q1|   | S )Nrs  rt  rW   re  r  c                 S   r  r  r  r  r%   r%   r&   r  (  r  z(replace_mha_with_dmmha.<locals>.<lambda>r   rx  output_cross_qk_r   rv  zencode_sequence_length / 2DecoderMaskedMultiHeadAttentionr   rY   rE  rF  r   r  com.microsoft)r  r  r   rT  rx  	output_qkrQ   )r   r   r   r   r   r*  r+  r
   r   r  r  r%  r   rR  r   r   r   r   r   rh  r   replacer&  r  )r   r  rs  rt  r  r  r%  rx  r  qk_output_name	qk_output
dmmha_noder%   r%   r&   replace_mha_with_dmmha  sl   



r  rW   r]   	attn_maskkv_num_heads
world_sizewindow_sizec           1      C   sP  |  tjjdtjdgdgd tjjd|dg|d g| dd}tjjd|d dgdg| dd}tjjd	dgd
g| d	tjd}tjjd|g|d g| dd}tjjd|d dgdg| ddd}	tjjd	dgdg| d	tjd}
| j	j
j|||||	|
g ttdd | j	j
j}t|D ]\}}| |g dg d}| |ddgddg}d\}}}|d ur|\}}}n|d ur|\}}| |g dg d}| |ddgddg}d\}}}|d ur|\}}}n|d ur|\}}| |ddgddg}| |dgdg}d\}}|d ur|\}}n	|d ur"|d }d}|d ur>|d ur>|jD ]}|jdkr<|j}q1d}|jD ]}|jdkrN|j}qC|jd |jd kod|jd |jd k}|d uor|d uor|d u} |d u o|d u o|d u }!d \}"}#}$|r| s|!rt| |jd }%t| |jd }&t| |jd }'|%jd! }(tj|%|&|'fdd"|(d#|( })tjj|)d$| d%})|  |) tjjd|jd |)jg|)j d&g| dd}*| j	j
j|*g | j	j
j| | j	j
j| | j	j
j| |*jd }"| rt| |jd }+t| |jd },t| |jd }-|+jd! }(tj|+|,|-fdd"d#|( }.tjj|.d'| d%}.|  |. tjjd|*jd |.jg|.j d&gd(}/| j	j
j|/g | j	j
j| | j	j
j| | j	j
j| |/jd }"n|jd }"|jd }#|jd }$tjjd)|"|#|$|jd* |jd+ |jd |
jd |d ur|jd nd,|d ur|jd# nd,g	|j|j d-d)d.|| |dkr|| n|| |t!|d uo|d u|d/
}0| j	j
j| | j	j
j|0g |d ur| j	j
j| |d ur%| j	j
j| q| S )0NonerW   r   r   r   vals	ReduceSum	_row_sumsr  Subseqlens_k_int64r  	seqlens_kr  r  _shaper  total_seq_len_int64r   )r  r  r   r   total_seq_lenc                 S   r  r  r  r  r%   r%   r&   r    r  z&replace_mha_with_gqa.<locals>.<lambda>)RotaryEmbeddingr  r   )r   r   r   r  r   )NNNr  r  r   NNinterleavedrx  )r   r   r   r]   r   r   QKV_Weight_r  _output	QKV_Bias_)r  r  GroupQueryAttentionrE  rF  r   r  r  )	r  r  r   rT  rx  r  local_window_size	do_rotaryrotary_interleaved)"add_initializerr   r*  make_tensorr
   r  rh  r  r   r   r   r%  r   r  r  r   r  rR  r   r   r   r   r   r   r)  r   stackreshaper(  
from_arrayr&  r   r  rp   )1r   r  r  r  r  reduce_sum_nodesub_nodeseqlen_k_cast_noder  gather_nodetotal_seqlen_cast_noder  r  r%  q_path_1q_path_2q_rotaryq_addq_matmulk_path_1k_path_2k_rotaryk_addk_matmulv_path_1v_path_2v_addv_matmulr  r  rx  root_input_is_sameall_paths_have_biasall_paths_have_no_biasq_input_to_attentionk_input_to_attentionv_input_to_attentionqwkwvwr^  
qkv_weightpacked_matmul_nodeqbkbvbqkv_biaspacked_add_nodegqa_noder%   r%   r&   replace_mha_with_gqa`  sH  


$






*

 









r'  c              	      s  d}dd j D }|dk r$|| ds$|d7 }|dk r$|| drd}tj| d }d| |   fddt|D }td	|  tj   }td
|  |d }|d }|d }	d}
jD ]_}|jdkr|j d |v rtd|j	 d|j  |
d7 }
||j d  }d| }dgdt|j  }|
| |j| |jtjddg tj|tj||d|	g}j|g qg|
|krtd| d|
 d S )NrW   c                 S      g | ]}|j qS r%   r  r   gir%   r%   r&   r   Q      zBupdate_decoder_subgraph_output_cross_attention.<locals>.<listcomp>r   pastr   c                    s"   i | ]}j |d     j|qS )r   )r   r   )r   layerinput_cross_past_0ra  r%   r&   r  X  s   " zBupdate_decoder_subgraph_output_cross_attention.<locals>.<dictcomp>z    -- past_key_cross_inputs = zpast_key_cross_0_shape is r   r  z'    -- add cross QK output from: node: z with output: r  r   r  z#Did not add cross QK for all layersz vs )r   r  r   r   r   printr`  r%  r   r   r   r   rR  r   r*  make_attributer+  r
   r   r   )ra  input_self_past_0r  output_self_present_0
num_layerspast_key_cross_inputsinput_past_key_cross_0_shapebatch_size_dimnum_heads_dimcross_seq_len_dimnum_layer_output_qkr%  r-  cross_attention_out_nameappended_namescross_attentionr%   r.  r&   .update_decoder_subgraph_output_cross_attentionN  sH   



r>  c              	   C   s$  d}dd | j D }|dk r$|| ds$|d7 }|dk r$|| drd}tt| j | d }d| | }g }g }| jD ]}|jdkrK||g q>t||k rTd	S d }	| jD ]}|jd
krd|}	 nqYg d}
d}t| \}}t|dkr|D ]}td| d| d qy|D ]}td|j d|j	  qt
jjddgdgdd}t
jjddg|gdtjd}|||g | jD ]}t|jdkr|	d ur|jd |	j d krt
jjddgdgdtjd}|jd |j d< ||g |jdkrt|}| D ]
}||
vr||= q|j d |j d |j d g}|t|j dkr%|j d ndg |t|j dkr8|j d ndg |t|j dkrK|j d ndg |t|j d kr^|j d  ndg |dg |d!g |d"g |t|j dkr|j d ndg d|d#< t
jjd$||jfd%|j	i|}||vrt|j D ]\}}||v r||j |< q||g q| d& | j| d'd | j D }g }t| j D ]0\}}||kr||k rt|}t
jj|j	|jjj|d |d d(|d gd)}||g qd|vr|t
jjdt
jjdgd*g d!|vr1|t
jjd!t
jjdgd*g d"|vrG|t
jjd"t
jjg d+d*g | d, | j | g }t| jD ]+\}}||kr}t|}t
jj|j	|jjj|d |d d(|d gd)}||g qY| d- | j| d.S )/NrW   c                 S   r(  r%   r  r)  r%   r%   r&   r   z  r+  zSupdate_decoder_subgraph_share_buffer_and_use_decoder_masked_mha.<locals>.<listcomp>r   r,  rY   r   r  FRelativePositionBiasrw  #past_sequence_length_squeezed_int64r   zFound tensor name `z` to be renamed to ``zFound node to remove: type = z	, name = r  rd  past_sequence_length_squeezed!node_past_sequence_length_squeezer  r  &node_past_sequence_length_squeeze_cast)r   r  past_sequence_length_int64past_sequence_length_castr   rD  rE  rF  rs  rt  rQ   r  r   r%  c                 S   r(  r%   r  )r   r  r%   r%   r&   r     r+  rb  rc  re  ru  r   r   T)r   r  rp   r   r%  r   r   r  r0  r   r   r*  rh  r
   r  r   rY  r~  r   rg  r`  r+  r3   r   r   r   )ra  r2  r  output_self_past_0r4  r/  rm  	old_nodesr%  rel_pos_bias_noder  target_squeezed_past_seq_namer  r  name_to_renamenrr  r  rV  r  ro  r  r   orig_input_namesrk  r   r_  r)  rl  r%   r%   r&   ?update_decoder_subgraph_share_buffer_and_use_decoder_masked_mhaw  s
  




*
&&&&&





 

	

rN  model_protoc                 C   s  t | }| }g }g }| D ]}|jdkrd|jd v r&d|jd v r&q||jd  }||jd  }||jd  }||jd }	||jd }
||jd }|	rY|
rY|s\ dS t|	}t|
}t|}tj	|||gdd}|j
d	d
d}tjj|d |	jdkrtjntj|jd |jd g|  d}| jj|g tjjd	|jd |d g|d g|d}|jd |jd< d|jd< d|jd< ||g ||||g q|| || |  |  dS )Nr  past_key_crossrW   past_value_crossr   r   Fr   r   
MatMul_QKV)name_prefix_weightr  _outr  r   T)r   r   nodesr   r   r   r   r   r   r   r  r   r*  r  r   r
   r   ro   r)  flattentolistr   r"  r   rh  r   	add_nodesremove_nodesupdate_graphr  )rO  
onnx_modelr   nodes_to_addr  r%  r  r  r  q_weightk_weightv_weightr  r  r  r  matmul_node_nameweightr   r%   r%   r&   pack_qkv_for_decoder_masked_mha  sZ   








rc  decoder_onnx_pathc                 C   s   t j| dd}tt|jjD ],}|jj| jdks#|jj| jdkr;|jj| jjj	j
d }|dr8|  d|_qtj|| |d dS )aQ  Update the input shapes for the inputs "input_ids" and "position_ids" and make the sequence length dim value 1 for each of them.
       The decoder model will be over-written.

    Args:
        decoder_onnx_path (str): Path of GPT-2 decoder onnx model
        use_external_data_format(bool): output tensors to external data or not.
    Tr   r   r   rW   rZ  r   )r   r   r   r   r   r   r   r3   r   r)  r^  HasFieldClearr[  r   r   )rd  rG   r   r   shape_dim_protor%   r%   r&   *update_input_shapes_for_gpt2_decoder_modelQ  s   	
rh  init_decoder_onnx_pathc                 C   s  t j| dd}|jjd j}t|}| }||v sJ || }|jdkr'dS ||g dg d}|du rA||g d	g d
}|du r_||g dg d}|du r_||g dg d}|du redS |d }	|	jdk}
|
sd}||	g d|dddg}|du rd}||	g d|dddg}|du rd}||	g d|ddg}|du rd}||	g d|ddg}nBd}||	g d|ddg}|du rd}||	g d|ddg}|du rd}||	ddg|dg}|du rd}||	ddg|dg}|du rdS |dkrdnd}|
s|	|	d|}n|	|	d|}|du r!dS |d }|d }t j
jdtjdgdgd}t j
jdtjdgdgd}t j
jdtjdgdgd}t j
jdtjdgdgd}|| || || || d|jd  }t j
jd|jd ddddg|g|ddd }|
s|jd n|jd! }d|jd  }t j
jd|ddddg|g|dd"d }|| || |||jd | ||	|| |  tj|||d# dS )$a  Generates the initial decoder GPT2 subgraph and saves it for downstream use.
       The initial decoder model will be saved to init_decoder_onnx_path.

    Args:
        decoder_onnx_path (str): Path of GPT-2 decoder onnx model
        init_decoder_onnx_path (str): Path of GPT-2 init decoder onnx model
        use_external_data_format(bool): output tensors to external data or not.
    Tr   r   r   F)r  r  r  r  r  r   r  FastGelur  r   r  r  r  )r   r   r   rW   r   r   r   r   r   r   r   r   r   N)
r  SkipLayerNormalizationr  r   r  rj  r  r   r  rk  )
r   r   rW   r   r   r   r   r   r   r   )r  r  r  r   rj  r   r  r  )r   r   rW   r   r   r   r   r   )rk  r   rj  r   rk  )r   rW   r   r   r   r]   rk  )r  r  r   rf  rW   )r  r   rf  )r  r   rf  rf  r  r  SliceLastTokenStartsr  SliceLastTokenEndsSliceLastTokenAxesSliceLastTokenStepsedge_modified_r  GatherLastToken_0_r  r   GatherLastToken_1_r   )r   r   r   r   r   r   r   r   r  r   r*  r  r
   r   r  rh  r  add_nodereplace_node_inputr  r   )rd  ri  rG   init_decoder_model_protor   gpt2_init_decoder_modelr   logits_matmul_node"logits_matmul_to_residual_add_pathresidual_add_nodeis_skiplayernorm_path&residual_add_to_attention_parent_indexresidual_add_to_attention_path residual_add_to_add_parent_indexadd_before_residual_add	attentionmatmul_after_attentionslice_starts
slice_ends
slice_axesslice_stepsslice_0_output_nameslice_node_0add_before_residual_add_outputslice_1_output_nameslice_node_1r%   r%   r&   generate_gpt2_init_decoderq  sX  













r  c           	      C   s   t d}t |j}t |j}t |j}| jjD ]%}|jjjj	D ]}|
dr;|j||||fv r;t|j}|  ||_qq| jjD ]%}|jjjj	D ]}|
dre|j||||fv ret|j}|  ||_qIqAdS )zoMake dim_proto numeric.

    Args:
        model: T5 encoder and decoder model.
        config: T5 config.
    rW   rZ  N)ri   rx  d_modeld_kvr   r   r3   r   r)  r^  re  rZ  rp   rf  r[  r   )	r   configr  rx  hidden_size	head_sizerB  	dim_protor[  r%   r%   r&   make_dim_proto_numeric_t5	  s>   




r  generation_typec           -      C   s  | j dk}|tjk}|tjk}|tjk}| j}td|  t| j	dkrM| j	d dkrM|rJ| j
tjjkrJg d| _	td| j	  td ng | _	|sQ|re|sWtd	| jr^td
| jretd|rp|rp| jsptd| jry|sytd| jr| jstd|r| jrtj| jrtd| j  nN| js| j d| j
 d}tt| jj| | _td| j d| j d t|  n"| jr| jrtd| j d| j  ntd| j d t|  d}| j s| j
tjjkr|r|s|s|rtd| j d t!| j| j"}|st#d d}	d}
| j$sg|rg|s,|s,|rgtd| j d d | j
 d}tt| jj| }
t%| j|
| j"}	|	sXt#d! |	rgt&| j| j"sgtd"|sq| j'sq|	rtd#| j d t(| j| j" |	rtd#|
 d t(|
| j" |rt)j*| j| j+d$}n| j d%krt,j*| j| j+d$}n	t-j*| j| j+d$}| j.rtd&|  |j/}|r|j/n|j0}|j1}| j1d'kr| j1}| j/d'kr| j/}| j0d'kr| j0}t2j3| jd(d)}| j  d*|j4_5d}| j dkr)t6|j4| j
 |	r(t2j3|
d(d)}| j  d+|j4_5t6|j4| j
 nt7|j4| j
 d}|r:g d,}n
|s@|rDg d-}| j8rN|9d. n|9d/ | j:r]|9d0 n|9d/ | j;rl|9d1 n|9d/ |r| j<r| j=r|9d2 n|9d/ | j>r|9d3 d4g}| jr|9d5 | jr| jsJ d6|9d7 d}|rt2j?j@d8||d9| j  d:}n#|rt2j?j@d;||d<| j  d:}n|rt2j?j@d=||d>| j  d:}d?|_Ad}|rt2j?Bd@|t2j?BdA|t2j?BdB| jCt2j?BdC| jDr	dndt2j?BdD| j dkrdndg}nw|rCt2j?Bd@|t2j?BdA|t2j?BdD| j dkr7dndt2j?BdB| jCg}nP|rt2j?Bd@|t2j?BdA|t2j?BdD| j dkr^dndt2j?BdB| jCt2j?BdE| jEt2j?BdF| jFt2j?BdG| jGt2j?BdH| jHt2j?BdI| j<t2j?BdJ| jIg
}|r|Jt2j?BdK|g |jKJ| g }| j dLv rf| j'rtdM| j d t(| j| j" t2j3| jd(d)}t|j4jLdNkrdOndP}| j  dQ| |j4_5tM|j4| j
 tN|| tN|| |r%| jstdRtdS tO|j4rtdT ntdU tP|r tdV ntdW | jQs@tR||}tt| dXdYdZ |D  d[ |jSdksJJ d\|jKJt2j?BdO|j4t2j?Bd]|j4t2j?Bd^|jSg n|	r| jQstR||}tt| dXd_dZ |D  d` |rtda tT|j4 | jrtU|j4|dstdb|jK9t2j?Bdc|j4 ntV|j4}tt| dd |rtde tT|j4 | jrtU|j4|d(stdf|jK9t2j?Bd]|j4 t2j?WdgtXjYdhdig}t2j?WdjtXjYdg}t2j?WdktXjYdg}t2j?WdltXjYdg}t2j?WdmtXjYdg}t2j?WdntXjZdg}t2j?WdotXjZdg}d} |r<|||||||g} n|sB|rH||||g} | j8r[t2j?Wd.tXjY|g}!| 9|! | j:rot2j?Wd0tXjYdh|g}"| 9|" | j;rt2j?Wd1tXjYdhdig}#| 9|# | j<r| j=rt2j?Wd2tXjYdh|g}$| 9|$ |r| j>rt2j?Wd3tXjYdg}%| 9|% d}&|rt2j?Wd4tXjYg dp}&n|s|rt2j?Wd4tXjYdhdjg}&|&g}'| jrt2j?Wd5tXjZdhdmg}(|'9|( | jr t2j?Wd7tXjZdqdhdl|g})|'9|) t2j?[|g|s| j  drn| j  ds| |'|}*t2j?j\|*dt|j]du}+| j"rHddvl^m_}, |,`t2ja|,`dwk r=t#dx tbjc|+| jd(d(dy nt2c|+| j tdz| j  dS ){zConvert model according to command line arguments.

    Args:
        args (argparse.Namespace): arguments parsed from command line
    r5   z**** past_present_share_buffer=rW   r   rD   )r  r  rk  rj  z**** Setting op_block_list to zI**** use --op_block_list if you want to override the block operator list.z<Currently only gpt2 with greedy search/sampling is supportedzLoutput_sequences_scores currently is not supported in greedy search/samplingzHoutput_token_scores currently is not supported in greedy search/samplingzi`use_decoder_masked_attention` MUST be turned on to use `past_present_share_buffer` in case of BeamSearchzS`past_present_share_buffer` MUST be turned on to use `use_decoder_masked_attention`z?`use_decoder_masked_attention` option is only supported on GPUsz)skip convert_to_onnx since path existed: _past_z.onnxzConvert GPT model z	 to onnx z ...z,skip convert_to_onnx since paths specified: z and zConvert model z to onnx ...Fz=Pad logits MatMul weights for optimal MatMul perf in fp16 on z. The file will be overwritten.z]Tried and failed to pad logits MatMul weights. Performance may be sub-optimal for this MatMulNz*Creating an initial run GPT2 decoder from z. gpt2_init_past_zuTried and failed to generate the init decoder GPT2 model. Performance may be sub-optimal for the initial decoding runzGCould not update the input shapes for the non-initial decoder subgraph.z Run symbolic shape inference on r   r6   zConfig=r]   Tr   z decoderz init decoderr   
max_length
min_length	num_beamsnum_return_sequenceslength_penaltyrepetition_penaltyr   r  r  r  rP   r   rS   r   rU   rV   	sequencessequences_scoresz8--output_token_scores requires --output_sequences_scoresscores
BeamSearchBeamSearch_r  GreedySearchGreedySearch_Sampling	Sampling_r  eos_token_idpad_token_idno_repeat_ngram_sizerO   r   temperaturetop_pfilter_valuemin_tokens_to_keepcustompresence_penalty
vocab_sizer6   r7   zSymbolic shape inference on r   r=  zencoder and decoder init zMpast_present_share_buffer is only supported with use_decoder_masked_attentionzl*****update t5 decoder subgraph to share past/present buffer and use decoder_masked_multihead_attention*****z4*****update t5 decoder subgraph successfully!!!*****zF*****DecoderMaskedMultiHeadAttention is not applied to T5 decoder*****z9*****pack qkv for decoder masked mha successfully!!!*****z3*****pack qkv for decoder masked mha failed!!!*****z shared initializers (c                 S   r(  r%   r  r   r%   r%   r&   r     r+  z,convert_generation_model.<locals>.<listcomp>z>) in encoder and decoder subgraphs are moved to the main graphz%decoder_start_token_id should be >= 0r>  decoder_start_token_idc                 S   r(  r%   r  r   r%   r%   r&   r   !  r+  zC) in decoder and init decoder subgraphs are moved to the main graphzY*****update init decoder subgraph to make past and present share buffer******************zLCould not update the init decoder subgraph to use DecoderMaskedSelfAttentioninit_decoderz: initializers from the decoder are moved to the main graphzT*****update decoder subgraph to make past and present share buffer******************zGCould not update the decoder subgraph to use DecoderMaskedSelfAttentionr   rv  r  r  r  r  r  r  r  )rv  r  r  zmax_length - sequence_lengthz beam searchz greedy searchzonnxruntime.transformers)producer_nameopset_imports)versionz1.12.0z0Require onnx >= 1.12 to save large (>2GB) model!)r   all_tensors_to_one_filezmodel save to )dr   r   r+   r,   r-   rQ   r   r   r   r   r   r   ro   r#   NotImplementedErrorrM   rN   rR   r   r`   r~   rk   rl   existsr}   r   r   r   as_posixr   r   r   rI   r   rG   r   rJ   r  rh  rH   r   r   from_pretrainedr   r   r   r?   r  r  r  r   r   r   r   r  r  rP   r   rS   rT   r  rU   rV   r*  rh  rT  r1  r  rO   r  r  r  r  r  r   rR  r   r  r  rN  rc  rK   r@  r  rp  r  rC  r+  r
   r   r   
make_graph
make_modelopset_import	packagingr  parse__version__r   r   )-ry   r  is_gpt2is_beamsearchis_greedysearchis_samplingrQ   onnx_filenamelogits_matmul_weight_paddedgpt2_init_decoder_generatedgpt2_init_decoder_onnx_pathgpt2_init_decoder_onnx_filenamer  r  r  r  r   rv  r  r  r%  attr_to_extendr?  r7  suffixr   r  r  r  r  r  r  graph_inputsrP   rS   r   rU   rV   r  graph_outputsr  r  	new_graph	new_modelr  r%   r%   r&   convert_generation_model	  s  
	






	























	







	
r  r   r   r  r  bad_words_idsc                 C   s   | j rtj std| jtjjkr|	  t
| j rdnd}|| td ||}||}g }t| jD ]3}	t }
|j||| j| j| j| j| j||| j| j| j|r\|ndd| jpc| jd}	|t |
  q<|jd }dd	lm} |||S )
a  Test PyTorch performance of text generation.

    Args:
        args (argparse.Namespace): arguments parsed from command line
        model (Union[GPT2LMHeadModel, T5ForConditionalGeneration]): PyTorch model
        input_ids (torch.Tensor): input_ids
        attention_mask (torch.Tensor): Attention mask
        eos_token_id (int): EOS token ID
        pad_token_id (int): Padding token ID
        bad_words_ids (List[List[int]]): Words shall not be generated.

    Raises:
        RuntimeError: PyTorch with CUDA is not available for --use_gpu

    Returns:
        Dict[str, Any]: A dictionary with string with metric name, and value can be integer or string.
    z=Please install PyTorch with Cuda for testing gpu performance.zcuda:0cpuFNTr   r   r  r  r  rO   r  r  r  r  r  r  r  return_dict_in_generateoutput_scoresr   get_latency_result)r`   torchcudais_availabler   r   r   ro   r#   halfdevicer  set_grad_enabledr   
total_runstimegenerater  r  r  rO   r  r  r  r  rM   rN   r   r)  benchmark_helperr  )ry   r   r   r   r  r  r  r  torch_latencyr  startrv  r  r%   r%   r&   test_torch_performance  sB   







r  c                 C   sp   t j| jt jd}t| jd D ]%}d}t| jd D ]}| | | |kr0|dkr0d|| |< q|d7 }qq|S )Nr   r   rW   )r   onesr)  int32r   )r   r  r   r   abs_posr4  r%   r%   r&   create_attention_mask  s   
r  F	sentences	is_greedyc           +      C   s  | j dksJ tj| j| jd}d|_|j|_tj| j| j|j	d}|du r*g d}||ddd	}|d
 }|d }d}|j
|dd}	dd |	D }	| jrStd|	 ng }	|j}
|
j	}|
j	}|
j}g }d}| jstd td |j||| j| j| j| j| j||| j| j| j|	r|	ndd| jp| jd}td
| td td|j | jrtd|j | jrtd|j t |jD ]\}}|j!|dd}|"| t| d|  qtd td |r|# $ %t&j't&j(| jgt&j'dt&j(| jgt&j'dt&j(| jgt&j)dd}nB|# $ %t&j't&j(| jgt&j'dt&j(| jgt&j'dt&j(| jgt&j'dt&j(| jgt&j'dt&j(| jgt&j)dt&j(| jgt&j)dd}| jrgt&j*|t&j'd}| jrc|	D ]}d||< q[||d< | j+rrt,|||d< |j-d }| j.rt/d  t&j*||ft&j'd}||d!< | j0rt1| j2j34 }td"| dd#l5m6} t/d$| d% |g}t |D ]\}}t7j89|d&t:| }||| qtd'| | j;rdS td( t<| j2| j=| j>}td) |?d|}g }t@| jAD ]}tBB }|?d|}|"tBB |  qdd*lCmD}  |j-d }| ||}!td+ |d }"td|" | jr7td|d,  | jrBtd|d-  |rm|"j-\}}#g }$t@|D ]}|j!|"| dd}|$"| td.| d/|  qPn5|"j-\}}%}#g }$t@|D ](}t@|%D ] }&|j!|"| |& dd}|$"| td.| d0|& d|  qqy|r|jE|| jd1}'tFG|"}(td td2 t|' t| td td3 t|( t|$ td ||$k})td4|)rd5nd6 |)|!d7< | jHrtI| ||||||	}*td8|* td9|! |!S ):a9  Test GPT-2 model

    Args:
        args (argparse.Namespace): arguments parsed from command line
        sentences (Optional[List[str]], optional): input text. Defaults to None.

    Returns:
        Union[Dict[str, Any], None]: A dictionary with string with metric name, and value can be integer or string.
    r5   r  left)r   r  N)zThe product is releasedzI enjoy walking in the parkzTest best way to investptTreturn_tensorsr   r   r   walk in park)add_prefix_spacec                 S      g | ]}|gqS r%   r%   r   word_idr%   r%   r&   r   3  r+  z"test_gpt_model.<locals>.<listcomp>r  2--------------------------------------------------CTest PyTorch model and beam search with huggingface transformers...r  !huggingface transformers outputs:r  r  r  skip_special_tokens: 'Testing beam search with onnxruntime...r   r  r  r   rP   zYUse prefix vocab mask with all ones in ORT, but no corresponding setting for Torch model.rS   test_data_diroutput_test_datazSaving test_data to z/test_data_set_* ...test_data_set_
ORT inputszCreating ort session......zRun ort session......r  ORT outputs:rW   r   batch z sequence: 
 sequence r]   Torch Sequences:ORT Sequences:zTorch and ORT result issame	differentparityTorch LatencyORT)Jr   r   r  r}   r   padding_side	eos_token	pad_tokenr   r  encoderP   r   r   r  r  ra   r0  r  r  r  r  rO   r  r  r  r  rM   rN   r  r  r  r   decoder   r  numpyastyper   r  arrayfloat32r  rT   r  r)  rS   r   rd   r   r   r   r  bert_test_datar  rk   rl   rj   ri   rb   r   r`   r^   runr   r  r  r  r  r  r  
LongTensorrc   r  )+ry   r  r  	tokenizerr   r  r   r   	bad_wordsr  r  r  r  r  torch_decoded_sequencesbeam_outputsr   sequencedecoded_sequencerP   bad_word_idrv  rS   r  r  
all_inputsdirr   resultlatencyr  r  r  r   r  r  ort_decoded_sequencesnum_sequencesr4  torch_sequencesort_sequencesis_sametorch_latency_outputr%   r%   r&   test_gpt_model  sB  
















	
r)  c           )      C   s(  | j dv sJ | jrtd dS tj| j| jd}d|_| j dkr,t	j| j| jd}n	t
j| j| jd}|du r=ddg}||d	d
d}|d }|d }d}||dd }dd |D }| jrhtd| ng }|j}	|	j}
|	j}|	j}td|
 d| d|  g }| jstd td |j||| j| j| j| j| j|
|| j| j| j|r|ndd
| jp| jd}td| td td|j | jrtd|j | jrtd|j  t!|jD ]\}}|j"|d
d}|#| t| d|  qtd td t$j%|t$j&d }| jr|D ]}d!||< q|' ( )t$j&t$j*| jgt$j&d t$j*| jgt$j&d t$j*| jgt$j&d t$j*| jgt$j&d t$j*| jgt$j+d t$j*| jgt$j+d d"}| jr]||d#< | j,rht-|||d< | j.rt/| j0j12 }td$| d!d%l3m4} |g}t!|D ]\}}t5j67|d&t8| }||| qtd'| t9| j0| j:| j;}g }t<| j=D ]}t>> }|?d|}|#t>> |  q|j@d! }d!d(lAmB} |||}td) |d! } td|  | jrtd|d*  | jrtd|d+  | j@\}}!}"g }#t<|D ](}t<|!D ] }$|j"| | |$ d
d}|##| td,| d-|$ d|  qq	| jsz|jC|| jd}%tDE| }&td td. t|% t| td td/ t|& t|# td ||#k}'td0|'rsd1nd2 |'|d3< | jFrtG| ||||
||}(td4|( td5| |S )6a=  Test T5 or MT5 model

    Args:
        args (argparse.Namespace): arguments parsed from command line
        sentences (Optional[List[str]], optional): input text. Defaults to None.

    Returns:
        Union[Dict[str, Any], None]: A dictionary with string with metric name, and value can be integer or string.
    r  zLSkipping parity test as prefix vocab mask is not implemented by Hugging FaceNr  r  r6   z4translate English to French: The product is releasedzsummarize: research continues to show that pets bring real health benefits to their owners. Having a dog around can lead to lower levels of stress for both adults and kids.r  Tr  r   r   r  r]   c                 S   r  r%   r%   r  r%   r%   r&   r     r+  z!test_t5_model.<locals>.<listcomp>r  zeos_token_id:z, pad_token_id:z, vocab_size:r  r  r  r  r  r  r  r  r  r  r   r   r  rP   r  r  r   r  r  r  rW   r   r  r  r  r  zTorch and ORT result is r  r  r	  r
  r  )Hr   rS   r   r   r   r  r}   r   r  r   r   r  rP   r  r  r  r  ra   r0  r  r  r  r  rO   r  r  r  r  rM   rN   r  r  r  r   r  r   r   r  r  r  r  r  r  r  rT   r  rd   r   r   r   r  r  r  rk   rl   rj   ri   r   r`   r^   r   r  r  r  r)  r  r  r  r  r  rc   r  ))ry   r  r  r   r  r   r   r  r  r  r  r  r  r  r  r   r  r  rP   r  r  r  r  r   r   r"  r  r  r!  rv  r  r   r  r$  r  r#  r4  r%  r&  r'  r(  r%   r%   r&   test_t5_model  s   













	
r*  c                 C   sr  t | }t|j |jdv rB|jr tj|js td|j |j	r2tj|j	s2td|j	 |jr8|j	r>|j	rB|jsBtd|j
dkoK|jdk}|jdkr}|r}|jdkrv|jdk rvt|tj td	 |jd
kss|jss|jrudS nt|tj nt| td |jdv rt||d}nt|||d}|r|jrtd|j d|j d |S td|j  |S )a/  Main entry function

    Args:
        argv (Optional[List[str]], optional): _description_. Defaults to None.
        sentences (Optional[List[str]], optional): input text. Defaults to None.

    Raises:
        ValueError: Path does not exist: --encoder_decoder_init_onnx
        ValueError: Path does not exist: --decoder_onnx
        ValueError: --decoder_onnx and --encoder_decoder_init_onnx are not used together for T5

    Returns:
        Union[Dict[str, Any], None]: A dictionary with string with metric name, and value can be integer or string.
    r  z1Path does not exist: --encoder_decoder_init_onnx z$Path does not exist: --decoder_onnx zB--decoder_onnx shall use together with --encoder_decoder_init_onnxrW   r5   r\   rZ   zThe test for gpt2_sampling onnx model is limited to non-custom model with small top_p(e.g <=0.01) value. The result should be the same as gpt2 greedy search.g{Gz?Nzstart testing model...)r  )r  r  zOutput files: r1   z.datazOutput file: )rz   r   r?   r   r   rk   rl   r  r   r~   r  r  r  r  r   r-   r   r   r  rV   r,   r*  r)  rG   r   )r.   r  ry   r  r!  r%   r%   r&   r     sF   



r   __main__r"   )T)r  r  NN)r  )r   rW   r]   )NFr  )a__doc__re   loggingr   rk   r  enumr   pathlibr   typingr   r  r   r   r  r  r   r   fusion_utilsr   r   r	   r
   r\  r   transformersr   r   r   r   r   r   r   r   onnxruntimer   r   r   r   4onnxruntime.transformers.models.gpt2.convert_to_onnxr   r   0onnxruntime.transformers.models.gpt2.gpt2_helperr   2onnxruntime.transformers.models.t5.convert_to_onnxr   r   ,onnxruntime.transformers.models.t5.t5_helperr   r   	getLoggerr   r   r  ri   	Namespacerz   r   r   boolr   r   r   r  r  r  rp   dictr6  r@  rC  rS  rY  r`  rp  r  r  r  r  r  r  r'  r>  rN  rc  rh  r  r  r+   r  Tensorr  r  r)  r*  r(   r%   r%   r%   r&   <module>   s8  %(
	   -N!8L_
j
%
1
Vi 4oJ
 o)  ;#
  -'
   


B

 X $>
;
