o
    3Ih                     @   s   d dl mZ d dlZd dlmZ d dlmZ d dlm	Z	m
Z
 d dlmZmZmZmZ d dlmZ eeZG dd	 d	ZG d
d deZdS )    )	getLoggerN)Fusion)AttentionMaskFormat)FusionUtilsNumpyHelper)	NodeProtoTensorProtohelpernumpy_helper)	OnnxModelc                   @   sR   e Zd ZdZdefddZdefddZdd	 Zd
d Z	de
de
dB fddZdS )AttentionMask:
    Fuse Attention subgraph into one Attention node.
    modelc                 C   s2   || _ i | _i | _t|| _tj| _| | _	d S N)
r   mask_indicemask_castedr   utilsr   MaskIndexEndmask_formatget_opset_versionopset_version)selfr    r   i/home/air/sanwanet/gpt-api/venv/lib/python3.10/site-packages/onnxruntime/transformers/fusion_attention.py__init__   s   
zAttentionMask.__init__r   c                 C   s
   || _ d S r   )r   )r   r   r   r   r   set_mask_format    s   
zAttentionMask.set_mask_formatc                 C   s*   || j v r|| j | ksJ || j |< d S r   )r   )r   mask
mask_indexr   r   r   set_mask_indice#   s   
zAttentionMask.set_mask_indicec                 C   s    t | jdks	J tt| jS Nr   )lenr   nextiter)r   r   r   r   get_first_mask(   s   zAttentionMask.get_first_maskmask_2dreturnNc              	   C   s`  | j tjkrd S || jv r| j| S | j|r!| j|\}}n
| j|\}}d}|r2|| j	|< | j tj
kr?|| j|< |S | jd}| jdk rltjd|g|g| jddd}|jtddgtd	d
g n7d}| j|d u r| jtj|tjdgdgdd tjd||g|g| jddd}|jtd	d
g | j| || j|< |S )NTr      	ReduceSumMaskReduceSuminputsoutputsnameaxes   keepdimsr   ort_const_1_reduce_sum_axesFr,   	data_typedimsvalsraw)r   r   NoMaskr   r   find_graph_inputr   cast_graph_input_to_int32cast_input_to_int32r   r   create_node_namer   r	   	make_node	attributeextendmake_attributeget_initializeradd_initializermake_tensorr   INT64add_node)r   r$   casted
input_name
_cast_nodeoutput_namemask_index_node	axes_namer   r   r   process_mask,   sV   




$	
zAttentionMask.process_mask)__name__
__module____qualname____doc__r   r   r   r   r   r#   strrJ   r   r   r   r   r      s    
r   c                (       s,  e Zd ZdZdddddgfdededed	edB d
ededee	 f fddZ
dedeeef fddZdedeeef fddZdefddZde	fddZde	de	de	fddZde	d e	d!e	fd"d#Zd$ed%edB d&edB d'e	dedB f
d(d)Zd*ed+ed,ed$ed%edB d&edB deeeef fd-d.Z	/	/		/	/	/	/	dAd*ed+ee	B dB d,ee	B dB d$ed%edB d&edB deded0e	d1e	de	d2ede	de	d3e	d4e	d5ededB f$d6d7Z	/	/	/	/	/		dBd8e	dB d*ed+ed,ed$ed%ed&ededed9e	d0e	d:e	de	de	d3e	d4e	d;edB d<ededB f&d=d>Zd?d@ Z  ZS )CFusionAttentionr   NFSkipLayerNormalizationLayerNormalizationr   hidden_size	num_headsattention_maskuse_multi_head_attention!disable_multi_head_attention_biassearch_op_typesc           	         sh   |rdnd}t  ||| || _|| _|r|nt|| _|| _|| _d | _d| _	d| _
d | _d| _d S )NMultiHeadAttention	AttentionT)superr   rS   rT   r   rU   rV   rW   mask_filter_valuenum_heads_warninghidden_size_warningshape_infershape_infer_done)	r   r   rS   rT   rU   rV   rW   rX   attention_op_name	__class__r   r   r   l   s   

zFusionAttention.__init__concatr%   c                 C   s   t |jdkr;| j|jd }| j|jd }t|tjr;|jdkr;t|tjr;|jdkr;|d |d |d  fS | j| j	fS )aU  
        Detect num_heads and hidden_size from Concat node in the following subgraph:

        SkipLayerNormalization or EmbedLayerNormalization
                        /        |
                     MatMul    Shape
                        |        |
                       Add     Gather(indices=0)
                        |        |
                        |      Unsqueeze
                        |        |
                        |     Concat (*, -1, 12, 64)
                        |     /
                       Reshape
                          |
                       Transpose
                 r.   r   )
r    inputr   get_constant_value
isinstancenpndarraysizerT   rS   )r   rd   rT   	head_sizer   r   r   )get_num_heads_and_hidden_size_from_concat   s   



z9FusionAttention.get_num_heads_and_hidden_size_from_concat	reshape_qc                 C   s*  | j |jd }|du r1| j |d}|dur"|jdkr"| |S td|jd  | j| j	fS t
|tjrIt|dksI|d dksI|d dkrUtd	| | j| j	fS |d }|d }|| }| jdkry|| jkry| jrytd
| j| d| _| j	dkr|| j	kr| jrtd| j	| d| _||fS )zDetect num_heads and hidden_size from a reshape node.

        Args:
            reshape_q (NodeProto): reshape node for Q

        Returns:
            Tuple[int, int]: num_heads and hidden_size
        r.   NConcatz%s is not initializer.re   rf   r   rg   zGq_shape_value=%s. Expected value are like [0, 0, num_heads, head_size].z>--num_heads is %d. Detected value is %d. Using detected value.Fz@--hidden_size is %d. Detected value is %d. Using detected value.)r   ri   rh   
get_parentop_typero   loggerdebugrT   rS   rj   rk   rl   r    r]   warningr^   )r   rp   q_shape_valuerd   rT   rn   rS   r   r   r   get_num_heads_and_hidden_size   s:   


z-FusionAttention.get_num_heads_and_hidden_sizeadd_qkc                 C   s   | j s| jjdd| _d| _ | jd u rd S | j|jd }| j|jd }|d u s/|d u r7td| d S ||krCtd| d S |jd S )NT)updater   r.   zone of the inputs of %s is Nonez)the shape of two inputs of %s is not same)r`   r   infer_runtime_shaper_   get_edge_shaperh   rt   ru   )r   ry   input_0_shapeinput_1_shaper   r   r   get_add_qk_str   s   

zFusionAttention.get_add_qk_strc                    s    d t tfdd| j}t|dkrS t|dks J | jd}tjd fddt| j	D g|dd	}| j
| | j| j|< S )
N_maskc                    s   | j d  kS r   )output)node)mask_output_namer   r   <lambda>   s    z0FusionAttention.reshape_add_qk.<locals>.<lambda>r.   r   rq   c                    s   g | ]} qS r   r   ).0_)ry   r   r   
<listcomp>   s    z2FusionAttention.reshape_add_qk.<locals>.<listcomp>r*   r+   r,   axis)listfilternodes_to_addr    r   r:   r	   r;   rangerT   appendthis_graph_namenode_name_to_graph_name)r   ry   concat_nodeconcat_node_nameconcat_add_qk_fp32r   )ry   r   r   reshape_add_qk   s    zFusionAttention.reshape_add_qkpast_kpast_vc                 C   s   | j d}| j d}|d dd}|d dd}tjd|g|g|dgd}tjd|g|g|dgd}| j| | j| | j| j|< | j| j|< | j d}	|dd	ddd
d}
tjd||g|
g|	dd}| j| | j| j|	< |
S )zConcatenate past_k and past_v inputs to create past_kv input.

        Args:
            past_k (str): name of past K value
            past_v (str): name of past V value

        Returns:
            kv_output_name (str): name of past KV value
        	Unsqueeze_5d.r   r   )r*   r+   r,   r-   rq   z.valuez.kv_value_kvr   )	r   r:   replacer	   r;   r   r   r   r   )r   r   r   unsqueeze_k_nameunsqueeze_v_name	k_5d_name	v_5d_namek_5dv_5dr   kv_output_name	concat_kvr   r   r   r     sD   		zFusionAttention.concat_kvpresent_k_namepresent_v_namekv_nodec                 C   s   d\}}| j |}| j |}|du r(tjtjddd|d}| j || j |du r@tjtjddd|d}| j || j | j d}| j d}	t	j
d||g|g|dd	}
t	j
d||g|g|	dd	}| j|
 | j| | j| j|< | j| j|	< dS )
a?  Split kv_node containing present KV values into separate present K and present V values.

        Args:
            present_k_name (str): name of output to store present K value in
            present_v_name (str): name of output to store present V value in
            kv_node (str): name of present KV values
        )index_0index_1Nr   int64)dtyper,   r.   Gatherr   )r   r?   r
   
from_arrayrk   arrayr@   r   r:   r	   r;   r   r   r   )r   r   r   r   k_indexv_indexk_dimv_dimgather_k_namegather_v_name	present_k	present_vr   r   r   split_kv8  s:   	zFusionAttention.split_kvq_addk_addv_addname_prefixc                 C   s   | j |jd p| j |jd }t|}t|}t|}|d ur<| j |jd p6| j |jd }	t|	}|d urW| j |jd pQ| j |jd }
t|
}tj|||fdd}dt|j	 }|d }| j
||j|g|d |S )Nr.   r   r   rg   	_qkv_biasr,   r2   r3   r4   )r   r?   rh   r   to_arrayrk   
zeros_likestackprodshaper@   r2   )r   r   r   r   r   q_biasqbkbvbk_biasv_biasqkv_biasqkv_bias_dim	bias_namer   r   r   create_combined_qkv_biase  s(   $


$
$
z(FusionAttention.create_combined_qkv_biasq_matmulk_matmulv_matmulc           #      C   s"  | j d}|jd |jd kr|jd |jd ksJ | j |jd }| j |jd }	| j |jd }
t|}t|	}t|
}|j|jkrR|j|jksTJ |jd }tj|||fdd	|d| f}|d }| j
||j|jd |jd g|d |d }tjd|jd |g|g|d	}| j| j|< |g}|d
 }| j
|tjdgdgdd |d }| j
|tjdg|gdd |d }| j
|tjdgd| gdd |d }| j
|tjdgd| gdd |d }| j
|tjdgdgdd |d }tjd||||g|g| j dd	}| j| j|j< |d }tjd||||g|g| j dd	}| j| j|j< |d }tjd||||g|g| j dd	}| j| j|j< |}|} |}!||||g | jr|dur| j |jd rodnd}"tt| j |j|" r||jd|" < |}|| | j| j|j< |dur| j |jd rdnd}"tt| j |j|" r||jd|" < |} || | j| j|j< |dur| j |jd rdnd}"tt| j |j|" r||jd|" < |}!|| | j| j|j< | j| || |!fS )a  Create packed QKV MatMul node before MultiHeadAttention node.
           This is for the scenario where an Attention node should be created but cannot be created
           because past_key and past_value are separate inputs and not one concatenated input.

        Args:
            q_matmul (NodeProto): name of MatMul from Q path - (batch_size, sequence_length, hidden_size)
            k_matmul (NodeProto): name of MatMul from K path - (batch_size, sequence_length, hidden_size)
            v_matmul (NodeProto): name of MatMul from V path - (batch_size, sequence_length, hidden_size)
            q_add (NodeProto): name of Add from Q path
            k_add (NodeProto): name of Add from K path
            v_add (NodeProto): name of Add from V path

        Returns:
             q_output (NodeProto): Slice node for Q
             k_output (NodeProto): Slice node for K
             v_output (NodeProto): Slice node for V
        MatMulr   r.   r   rg   _qkv_weightr   _qkv_outr)   _q_start_indexFr1   _k_start_index_v_start_indexrf   _end_of_qkv_index_qkv_last_axis_q_outSlice_k_out_v_outN)r   r:   rh   r?   r   r   r   rk   r   reshaper@   r2   r	   r;   r   r   r   rB   r,   r=   rW   anyr   r   )#r   r   r   r   r   r   r   matmul_node_nameq_weightk_weightv_weightqwkwvwd
qkv_weightqkv_weight_nameqkv_matmul_output
qkv_matmul	qkv_nodesq_slice_namek_slice_namev_slice_nameend_of_qkv_nameqkv_last_axis_nameq_slice_outputq_slicek_slice_outputk_slicev_slice_outputv_sliceq_outputk_outputv_outputinitializer_inputr   r   r   create_packed_qkv_matmul_node  s   ,



"






 

 

 

z-FusionAttention.create_packed_qkv_matmul_node r   key_padding_maskunidirectionalr   r   
packed_qkvc                 C   s,  |dksJ |dkr|| dkrt d|| dS dd | j jD }| jd}g }|rM| ||||||\}}}||jd |jd |jd g n`t	|t
r~t	|t
r~| jrl||jd |jd |jd g nA||jd |jd |jd g n/t	|trt	|tr||v r||v r| jr||jd ||g n||jd ||g ndS | js| ||||}|| n|d |r|r||
|||g n|
s|r||
|g |	g}|r|r|||g tjd|||d	}d
|_|jtd| |r|jtdt| | d |S )a  Create a MultiHeadAttention node.

        Args:
            q_matmul (NodeProto): name of MatMul from Q path - (batch_size, sequence_length, hidden_size)
            k_matmul (NodeProto): name of MatMul from K path - (batch_size, sequence_length, hidden_size) or (batch_size, num_heads, past_sequence_length, head_size)
            v_matmul (NodeProto): name of MatMul from V path - (batch_size, sequence_length, hidden_size) or (batch_size, num_heads, past_sequence_length, head_size)
            q_add (NodeProto): name of Add from Q path
            k_add (NodeProto): name of Add from K path
            v_add (NodeProto): name of Add from V path
            num_heads (int): number of attention heads. If a model is pruned, it is the number of heads after pruning.
            hidden_size (int): hidden dimension. If a model is pruned, it is the hidden dimension after pruning.
            output (str): output name of MHA
            key_padding_mask (str): name of key padding mask
            add_qk (str): name of add after Q x K'
            unidirectional (bool): whether to apply causal attention mask automatically or not
            past_k (str): name of past K value - (batch_size, num_heads, past_sequence_length, head_size)
            past_v (str): name of past V value - (batch_size, num_heads, past_sequence_length, head_size)
            present_k (str): name of present K value - (batch_size, num_heads, sequence_length, head_size)
            present_v (str): name of present V value - (batch_size, num_heads, sequence_length, head_size)
            packed_qkv (bool): whether to combine MatMuls from Q, K, V paths
                               Note: This is for the scenario where an Attention node should be created but cannot be created
                               because past_key and past_value are separate inputs and not one concatenated input.

        Returns:
            Union[NodeProto, None]: the node created or None if failed.
        r   9input hidden size %d is not a multiple of num of heads %dNc                 S   s   h | ]}|j qS r   r   )r   r   r   r   r   	<setcomp>H      zBFusionAttention.create_multihead_attention_node.<locals>.<setcomp>rZ   r   rY   r)   com.microsoftrT   r   )rt   ru   r   graphrh   r:   r   r=   r   rj   r   rW   rO   r   r   r	   r;   domainr<   r>   intincrease_counter)r   r   r   r   r   r   r   rT   rS   r   r   ry   r   r   r   r   r   r  graph_input_namesmha_node_name
mha_inputsr   r   r   r   mha_outputsmha_noder   r   r   create_multihead_attention_node  sl   /
$$$

z/FusionAttention.create_multihead_attention_noder   first_input
add_qk_strscalecausalc           5      C   sj  |dksJ |	dkr|	| dkrt d|	| dS d}|du r)|du r)|du r)d}| j|jd }| j|jd }| j|jd }d\}}}|r| j|jd p\| j|jd }| j|jd pn| j|jd }| j|jd p| j|jd }|r|r|r|sdS |du rt|jd  d dS t|}t|}t|}|j|jksJ |jd }|jd }|jd }||  kr|ksJ  J |	dkr|	|krt 	d	|	| d} |j|jkrd} t
|jdd }!t
|jdd }"t
|jdd }#d}$| rt
j|||fdd
}%|!|" |# }$nt
j|||fdd
}%d|! }$d}&d}'|rt|}(t|})t|}*t
|(j}+t
|)j},t
|*j}-|+|,  kr_|!ksbJ  J |-|#ksiJ | r}t
j|(|)|*fdd
}'|+|, |- }&nt
j|(|)|*fdd
}'d|+ }&| jd}.| js| j|.d |j|t|$g|%d |r| j|.d |jt|&g|'d | jr|rt d dS |jd |jd |jd |.d g}/|dur|/| tjd|/|g|.d}0| d nr|
|.d |r|.d ndg}/|dur	|/| n|/d |o|}1|1r!| ||}2|/|2 |r1|1s,|/d |/| |g}3|rT|rT|dddddd}4|3|4 | |||4 tjd|/|3|.d}0| d d|0_|0jtd|g |r~|0jtddg |dur|0jtd|g | r|0jtd|!|"|#gg | jdur|0jtdt | jg |0S )a>  Create an Attention node.

        Args:
            mask_index (str | None): mask input
            q_matmul (NodeProto): MatMul node in fully connection for Q
            k_matmul (NodeProto): MatMul node in fully connection for K
            v_matmul (NodeProto): MatMul node in fully connection for V
            q_add (NodeProto): Add bias node in fully connection for Q
            k_add (NodeProto): Add bias node in fully connection for K
            v_add (NodeProto): Add bias node in fully connection for V
            num_heads (int): number of attention heads. If a model is pruned, it is the number of heads after pruning.
            hidden_size (int): hidden dimension. If a model is pruned, it is the hidden dimension after pruning.
            first_input (str): first input name
            output (str): output name
            add_qk_str (str): name of Add node after Q x K'
            past_k (str): name of input for past K value
            past_v (str): name of input for past V value
            present_k (str): name of output to store present K value
            present_v (str): name of output to store present V value
            scale: scale before softmax
            causal: whether it is uni-directional mask.

        Returns:
            Union[NodeProto, None]: the node created or None if failed.
        r   r  NTFr.   )NNNzl is not an initializer. Please set do_constant_folding=True in torch.onnx.export to unblock attention fusionzInput hidden size (%d) is not same as weight matrix dimension of q,k,v (%d). Please provide a correct input hidden size or pass in 0r   rg   rZ   r   r   r   zVMultiHeadAttention does not support relative_position_bias: cannot fuse the attention.rY   r)   r   z.key_keyr   r   r  rT   r   r  qkv_hidden_sizesr\   )!rt   ru   r   r?   rh   printr   r   r   rv   rk   r   concatenater   r:   rV   r@   r2   r  r   r   r	   r;   r	  r   r   r   r  r<   r=   r>   r\   float)5r   r   r   r   r   r   r   r   rT   rS   r  r   r  r   r   r   r   r  r  has_biasr   r   r   r   r   r   r   r   r   
qw_in_size
kw_in_size
vw_in_sizeis_qkv_diff_dimsqw_out_sizekw_out_sizevw_out_sizeqkv_weight_dimr   r   r   r   r   r   q_bias_shapek_bias_shapev_bias_shapeattention_node_nameattention_inputsattention_nodepast_existspast_kvattention_outputs
present_kvr   r   r   create_attention_node  s  .
$$$








 












z%FusionAttention.create_attention_nodec           ;      C   sV  |}|}|j dkr| j|dd}|d ur|}nd S | j|g dg d}d }|d ur4|\}	}	}
}}n| j|g dg d}|d urK|\}	}}}nd S g }t|jD ]\}}||vr]qT||d jd krgqT|| qTt|dkrud S |d }| j|d	d}|d ur||jd  }|d urt|d
kr|d }|j dkr|jd }n,d S |d urt|dkr|jd }nd S |j dkr|| }|D ]}|j dkr|jd }q|| }|j dkrt|jdkr|jd }|| }dd |D }|	ddkrd S | j|g dg d}|d u rt
d d S |\}	}	}}d}d}d}d}g dg dfg dg dfg dg dfg dg dfg dg dfg dg d fd!}d } | D ]6\}!}"| j||"d |"d } | d u rjqS|!d"krrd#}n|!d$krzd#}n|!d%krd#}n|!d&krd#} | d u rt
d' d S d }#d }$d }%d }&|r| \}	}%}$}	n$|r| \}	}#}%}$n|r| \}	}	}$n|r| \}	}#}$}&}	n| \}	}#}	}$|&p|$}&| j|&g dg d(}'|'d u r| j|&g d)g d*}'|'d u rt
d+ d S |'d, }(|'d- })|'d. }*|$}+|r'| j|$d	d/gdd g},|,d u r#t
d0 d S |,\}+}	| j|+g d|r3dndddd g}-|-d u rW| j|$g d1g d2}-|-d u rWt
d3 d S |-d- }.|-d. }/d }0d4}1|r| j|%g d5g dfg d6g dfg d7g d8fg|\}	}0}	n_|r| j|%g d9g d8fg d6g dfg|\}	}0}	|#d ur| |#}1|1d u rt
d:|# d S n+|rn'| j|#g d;g d<fg d=g d>fg d?g d@fg dAg dBfg|\}	}0}	|s|0d u rt
dC d S |s0t|0dkr0| j|0d \}	}2|2d u s"t|2tjr"|2jdkr"t|2dkr$d S t|2dDkr0t|2| _|jd |kr%|*jd |kr'|/jd |kr)|sV| j|0d. jd nd }3|d u r_|
n|}4| |(\}5}6|5dksr|6dkryt
dE d S | j|3|*|/||)|.||5|6||4jd |1dF}7|7d u rd S | j|7 | j| j|7j< |d ur|jd }8dG|8 }9| jdH|8 t j!dgdd|5t"|6|5 gddI}:| j#t$%dJ|4jd |:jg|9gdK|8 | j |9|jd< | j&'|4||g | j&'|  | j&'| j(s|'n|'d d.  | j&'| j(s	|-n|-d d.  | j&'| j(s|n|d d.  d#| _)d S d S d S d S )LNrR   Addr   )r-  r   Reshape	Transposer   )NNr   r   r   )r-  Einsumr/  r   )r.   Nr   r   r.   Mulrf      rQ   re   c                 S   s   g | ]}|j qS r   )rs   )r   childr   r   r   r     r  z(FusionAttention.fuse.<locals>.<listcomp>r   rg   )r/  r.  r-  r   )r.   r   r   Nz&fuse_attention: failed to match v pathF)Softmaxr-  Divr   )r   r   Nr   )r4  r-  r1  r   )r4  Wherer   r5  )r   r   rf   r   )r4  r-  r6  r   )r   r   r   rf   )r4  r5  r   )r   r   r   )r4  r-  r   r1  Sqrt)r   r   Nr   r.   )path1path2path3path4path5sdpar:  Tr;  r<  r=  z'fuse_attention: failed to match qk path)r   r   r   N)r5  r/  r.  r-  r   )r   r   r   r   Nz&fuse_attention: failed to match q pathr   r7  z/fuse_attention: failed to match mul sqrt q path)r/  r/  r.  r-  r   )r.   r   r   r   Nz&fuse_attention: failed to match k pathr   )Expandr.  Equal)rA  r   r   )Castr@  r.  rA  )r   r   r   r   )rB  rA  r   r   z6fuse_attention: failed to verify shape inference of %s)r1  SubrB  r   r   )Nr   r.   r   r   )r1  rC  r   r   )Nr   r.   r   )r6  rB  rC  r@  r   r   )Nr   r   r.   r   r   )r6  rB  rC  rB  r@  r   r   )Nr   r   r.   r   r   r   z)fuse_attention: failed to match mask pathizmFailed to detect num_heads and hidden_size for Attention fusion. Please specify those parameters in argument.)r   r   r   r   r   r   r   rT   rS   r  r   r  edge_modified_shape_modified_tensorr1   r.  reshape_modified_)*rs   r   match_parentmatch_parent_path	enumeraterh   r   r   r    countrt   ru   itemsmatch_parent_pathsr   get_constant_inputrj   rk   rl   rm   r  r\   rU   rJ   rx   rv   r,  r   r   r   r,   r@   r   rB   r  rC   r	   r;   nodes_to_remover=   rV   prune_graph);r   r   input_name_to_nodesoutput_name_to_nodenormalize_node
start_nodeadd_before_layernormr   einsum_noder   reshape_qkvtranspose_qkv
matmul_qkvother_inputs_i
node_input
root_inputmul_before_layernormmul_childrenlayernorm_nodechildrenr3  parent_nodechildren_typesv_nodesadd_vmatmul_v
is_distillis_distill_addis_no_mask_attentionis_sdpaqk_pathsqk_nodeskvry   	matmul_qkwhere_qkafter_qq_nodesrp   add_qmatmul_qafter_kmul_k_nodesk_nodesadd_kmatmul_k
mask_nodesr  mul_valr   attention_last_nodeq_num_headsq_hidden_sizenew_nodeunique_indexnew_edgeshape_tensorr   r   r   fusev  s  







	















	








0 



	   
zFusionAttention.fuse)r   r   Fr   r   r   r   F)r   r   r   r   r   NF)rK   rL   rM   rN   r   r  r   boolr   rO   r   r   tuplero   rx   r   r   r   r   r   r   r  r  r,  r  __classcell__r   r   rb   r   rP   g   s(   	.7-

 

	

 	

 krP   )loggingr   numpyrk   fusion_baser   fusion_optionsr   fusion_utilsr   r   onnxr   r   r	   r
   
onnx_modelr   rK   rt   r   rP   r   r   r   r   <module>   s   V