o
    3Ih                     @   sp   d dl mZ d dlmZ d dlmZ d dlmZmZm	Z	 d dl
mZ eeZG dd deZG dd	 d	eZd
S )    )	getLogger)Fusion)FusionUtils)	NodeProtoTensorProtohelper)	OnnxModelc                       s  e Zd ZdZd2dedef fddZdedd	eeef B fd
dZ	dede
eee f dedefddZdd Zdd Zdd Zdd Zdd Zdedeed	eB f fddZ					d3deded ed!ed"d	eB d#ed	B fd$d%Zd&d' Zd(d) Z		d4d*d+Zd,d- Zd.d/ Zd0d1 Z  ZS )5FusionEmbedLayerNoMaskz
    Fuse embedding layer into one node (EmbedLayerNormalization).
    It supports the following model types: BERT, DistilBert, ALBert.
    no maskmodeldescriptionc                    s<   t  |dddg| t|| _d | _d| _d | _d | _d S )NEmbedLayerNormalizationLayerNormalizationSkipLayerNormalizationF)super__init__r   utilsshape_infershape_infer_done	attention
embed_node)selfr   r   	__class__ j/home/air/sanwanet/gpt-api/venv/lib/python3.10/site-packages/onnxruntime/transformers/fusion_embedlayer.pyr      s   

zFusionEmbedLayerNoMask.__init__addreturnNc                 C   sP   | j |dgdg}|d u rd S | j |dgdg}|d u r d S |d |d fS )NGatherr      )r   match_parent_path)r   r   gather_0_pathgather_1_pathr   r   r   match_two_gather%   s   z'FusionEmbedLayerNoMask.match_two_gather	layernorminput_name_to_nodesis_distil_bertc           
      C   s  | j j|d|dd| _| jdurdS |jd |vrdS ||jd  }tdd |D }|g d	kr_|D ]+}|jd
kr^| j |g dg d}|dur^|d jd |jd kr^|d | _ dS q3t	|dkr|d jdkr|d jd |v r||d jd  }t	|dkr|d jdkr|d jd |v r||d jd  }	|	D ]}|jdkr|| _ dS qtdd |	D }|r|g dkr|g dkr|g dkrt
d dS dS |g dkr|g d	krt
d dS dS )a  Check that LayerNormalization has a child of Attention node or subgraph like Attention.

        Args:
            layernorm (NodeProto): LayerNormalization node
            input_name_to_nodes (Dict[str, List[NodeProto]]): map from input name to nodes
            is_distil_bert (bool): whether it is DistilBert or not

        Returns:
            bool: whether there is Attention node or subgraph like Attention
        	AttentionF)	recursiveNTr   c                 S      g | ]}|j qS r   op_type.0childr   r   r   
<listcomp>J       zCFusionEmbedLayerNoMask.check_attention_subgraph.<locals>.<listcomp>)MatMulr1   r1   r   r   )Addr1   MultiHeadAttentionr1   )NNr   r      r   r1   r2   c                 S   r)   r   r*   r,   r   r   r   r/   g   r0   )r1   r1   r1   Shaper   )r2   r1   r1   r1   r6   r6   )r2   r1   r1   r1   r6   z<No Attention like subgraph in children of LayerNormalization)r2   r1   r1   r1   )r   find_first_child_by_typer   outputsortedr+   r    inputcross_attentionlenloggerdebug)
r   r$   r%   r&   childrenchildren_typesnodepath1grandchildrennodesr   r   r   check_attention_subgraph0   sZ   

 
,


z/FusionEmbedLayerNoMask.check_attention_subgraphc                 C   s  | j |ddgddg}|du r"| j |g dg d}|du r"dS |d |d	 }}|jd |kr4dS | j |g d
g dfg dg dfg|\}}}|du rSdS |d }	| j|	ddrg| j|	ddsidS |d }
| j|
ddswdS |d	 }|jd |krdS dS )az    Match position embedding path from input_ids to Gather for DistilBert.

        Pattern is like the following:
                 (input_ids)
                      |
                     Shape
                       |                          |    Gather (indices=1)
                       |       |
                       |      Cast (optional)
                       |       |
                       |      Range (start=0, end=*, delta=1)
                       |       |
                       |    Unsqueeze
                       |    /
                      Expand
                        |
                      Gather
        Expandr6   r   N)rF   WhereReshaper6   )r   r   r5   r   Fr   r4   )	UnsqueezeRangeCastr   r6   )r   r   r   r   r   )rI   rJ   r   r6   )r   r   r   r   r5   T)r   r    r:   match_parent_pathsr   check_node_input_value)r   position_embedding_gather	input_idsoutput_name_to_noderB   expandshape_path2
range_nodegather_node
shape_noder   r   r   #match_position_embedding_distilbert   sD   
z:FusionEmbedLayerNoMask.match_position_embedding_distilbertc                 C   s   dS )aY  Match position embedding path from input_ids to Gather for Roberta.

        Roberta Embedding Layer Pattern (* is optional since it might be removed by ORT, ? is the padding word id):
          (input_ids) --> Equal(B=?) -- Not -- Cast(to=6) -- CumSum(axis=1) -- Mul -- Cast(to=7) -- Add(B=1) -- Cast(to=7)* --> Gather
                                                |                              ^
                                                V                              |
                                                +------------------------------+

        Roberta new pattern from transformers v4.9:
           (input_ids) --> Equal(B=?) -- Not -- Cast(to=6) -- CumSum(axis=1) -- Add(B=0) -- Mul -- Cast(to=7) -- Add(B=1) --> Gather
                                                |                                           ^
                                                V                                           |
                                                +-------------------------------------------+

        start_node = position_embedding_gather
        start_index = 1

        # match optional Cast node.
        parent = self.model.get_parent(start_node, start_index, output_name_to_node)
        if parent is None:
            return
        if parent.op_type == "Cast":
            if OnnxModel.get_node_attribute(parent, "to") != 7:
                return
            start_node = parent
            start_index = 0

        i, path, return_indices = self.model.match_parent_paths(
            start_node,
            [ (['Add', 'Cast', 'Mul', 'CumSum', 'Cast', 'Not', 'Equal'], [start_index, 0, 0, 0, 0, 0, 0]),
              (['Add', 'Cast', 'Mul', 'Add', 'CumSum', 'Cast', 'Not', 'Equal'], [start_index, 0, 0, 0, 0, 0, 0, 0])],
            output_name_to_node)

        if path is not None:
            # constant input of Add shall be 1.
            i, value = self.model.get_constant_input(path[0])
            if value != 1:
                return False

            _, self.padding_word_id = self.model.get_constant_input(path[-1])

            return input_ids == path[-1].input[0]
        Fr   r   rO   rP   rQ   r   r   r    match_position_embedding_roberta   s   -z7FusionEmbedLayerNoMask.match_position_embedding_robertac                 C   s  | j |ddgddg|}|du rdS |\}}| j |jd }|durTt|jdkrT|jd dkrT| j|ddgrT| j|ddgrTt|jd	ksV| j|d	dgsVdS | j  }|d
k rjt	
|ddgsidS n| j|ddgsudS | j |d|}	|	du rdS |	jdkr| j|	ddsdS | j |	d|}
n|	}
|
du s|
jdkrdS | j|
ddsdS | j |
d|}|du s|jdkrdS ||jd kS )a	    Match position embedding path from input_ids to Gather for BERT.

        BERT Embedding Layer Pattern:
                                    (input_ids)
                                   /                                          /          Shape
                                /              |
                              /              Gather (indices=1)
                             /                  |
                            /                  Add (optional, B=0)
                           /                    |
                        Gather (segment_ids) Unsqueeze (axes=0)
                           \        |           |
                            \     Gather      Slice (data[1,512], starts=0, ends=*, axes=1, steps=1)
                              \    /            |
                                Add          Gather
                                   \       /
                                      Add
                                       |
                                LayerNormalization
        SlicerI   r   r5   NFr            axesr2   r   r6   )r   r    get_constant_valuer:   r<   rS   r   rN   get_opset_versionr   check_node_attribute
get_parentr+   )r   rO   rP   rQ   pathslice	unsqueezeslice_weightopset_versionrA   gatherrS   r   r   r   match_position_embedding_bert   sT    

z4FusionEmbedLayerNoMask.match_position_embedding_bertc                 C   s(   |  |||r	dS | |||rdS dS )NTF)rk   rY   rZ   r   r   r   match_position_embedding9  s
   z/FusionEmbedLayerNoMask.match_position_embeddingc                 C   s  |j d }|r|j d nd}|j d }| js!| jjdd| _d| _| jdurs| j|}| j|}|r6|s8J t|dkrLt|dkrL|d |d ksYtd| d|  dS |rs| j	||sstd	| d
| j|  dS | j
|j d }	|	du st|	jdkrtd dS | j
|j d }
|
du st|
jdks|	jd |
jd krtd dS |r| j
|j d }|du st|jdks|	jd |jd krtd dS |	jd |
jd krtd|j d  d|	jd  d|j d  d|
jd   |rU|	jd |jd kr-td|j d  d|	jd  d|j d  d|jd   |
jd |jd krUtd|j d  d|
jd  d|j d  d|jd   dS )zXSanity check of embedding weights, and match hidden_size of weights and shape of inputs.r   NT)updater5   z^Cannot fuse EmbedLayerNormalization: input_ids and position_ids not matched in 2nd dimension: z vs FzYCannot fuse EmbedLayerNormalization: input_ids and segment_ids does not have same shape: z != r   zICannot fuse EmbedLayerNormalization: word embedding table is not expectedzMCannot fuse EmbedLayerNormalization: position embedding table is not expectedzLCannot fuse EmbedLayerNormalization: segment embedding table is not expectedzword_embedding_table (z) size z <= position_embedding_table (z <= segment_embedding_table (zposition_embedding_table ()r:   r   r   infer_runtime_shaper   get_edge_shaper<   r=   infocompare_shapera   rS   warning)r   word_embedding_gathersegment_embedding_gatherrO   rP   segment_idsposition_idsinput_ids_shapeposition_ids_shapeword_embedding_tableposition_embedding_tablesegment_embedding_tabler   r   r   check_embeddingG  sj   





222z&FusionEmbedLayerNoMask.check_embedding
input_namec                 C   sd   d}| j |}|dur&|jjjtjkr | j|\}}||fS |}||fS | j|\}}||fS )a  Cast a graph input or node input to int32.

        Args:
            input_name (str): name of graph input or node input

        Returns:
            A tuple of casted input name and the cast node.
            int32_output (str): If input is int32, it is the input name, Otherwise it is output name of Cast node.
            input_cast_node (Union[None, NodeProto]): Cast node. It could be None if input is int32.
        N)	r   find_graph_inputtypetensor_type	elem_typer   INT32r   cast_input_to_int32)r   r}   input_cast_nodegraph_inputint32_outputr   r   r   cast_to_int32  s   z$FusionEmbedLayerNoMask.cast_to_int32FrP   rs   rO   rt   rv   c	                 C   s  g }	|  |\}}
| jd}|jdkr|jd }|jd }n
|jd }|jd }d}|durL|  |jd \}}
|||jd |jd |jd ||g}n|d|jd |jd d||g}|durp|d |  |\}}
|| |d	 |d
 g}|r|dur|n|d }|| tjd|||d}d|_|j	D ]}|j
dkr|j	|g qt|j	dkr|j	tddg |	| |	D ]	}| j| j|j
< q| j|	 || _|S )ag  Create an EmbedLayerNormalization node. Note that segment embedding is optional.

        Args:
            input_ids (str): input_ids for word embeddings
            layernorm (NodeProto): LayerNormalization or SkipLayerNormalization node.
            word_embedding_gather (NodeProto): the Gather node for word embedding
            position_embedding_gather (NodeProto): the Gather node for position embedding
            segment_embedding_gather (Union[None, NodeProto]): the Gather node for segment embedding, or None.

        Returns:
            NodeProto: the EmbedLayerNormalization node created.
        r   r   r   r5   r]   Nr    _output_dummy_mask_index_embedding_sum)outputsnamezcom.microsoftepsilong-q=)r   r   create_node_namer+   r:   appendr   	make_nodedomain	attributer   extendr<   make_attributethis_graph_namenode_name_to_graph_namenodes_to_addr   )r   rP   r$   rs   rO   rt   rv   embedding_sum_outputembedding_sum_namer   rT   	node_namegammabetaembed_node_inputsru   embed_node_outputsr   r   attrA   r   r   r   create_fused_node  sl   










z(FusionEmbedLayerNoMask.create_fused_nodec                 C   s$   | j |jd |jd  d| _d S )Nr   T)r   replace_input_of_all_nodesr8   prune_graph)r   r$   r   r   r   r   finish_fusion
  s   
z$FusionEmbedLayerNoMask.finish_fusionc                 C   s*   |j dkot|jdkot|jd dkS )Nr   r]   r   )r+   r<   r8   )r   rA   r   r   r   "is_skip_layer_norm_with_sum_output  s   *z9FusionEmbedLayerNoMask.is_skip_layer_norm_with_sum_outputc              
   C   sx  |  |}|d u rdS |\}}|jd }	|jd }
| j||dds#dS | |d |s,dS |jdkrP| |}d}|}|rA|jd nd }|d uoN| j|d u}n@|}|jdkrYdnd}t	|j|krg|j| nd }|d uot| j|d u}|o||v ot	|| dk}|d uo|jdkp|p|}| j
|	|||||
||r|nd d}|rd	|j|< |s| j||jd
  | || dS )NFr   r&   r   r]   r2   r   )r   r   _no_use__to_be_removed_r5   T)r#   r:   rE   r|   r+   r   r8   r   find_graph_outputr<   r   r   r   )r   r$   add_before_layernormr%   rQ   optional_segment_gather
two_gatherrs   rO   rP   rv   need_embedding_sum_outputsum_output_indexnode_with_sum_output
sum_outputis_sum_graph_outputis_sum_used_by_multiple_nodesr   r   r   r   	fuse_gpt2  sX   







z FusionEmbedLayerNoMask.fuse_gpt2c           
      C   s   |  |}|du rdS |\}}|jd }| j||ddsdS | |||s'dS | |d|s0dS | ||||d}	| ||	 dS )a  Fuse embedding layer for DistilBert
        Args:
            layernorm (NodeProto): node of LayerNormalization or SkipLayerNormalization
            add_before_layernorm (NodeProto): the Add node before LayerNormalization, or the SkipLayerNormalization itself
            input_name_to_nodes (Dict[str, List[NodeProto]]): map from input name to nodes
            output_name_to_node (Dict[str, List[NodeProto]]): map from output name to nodes
        NFr   Tr   )r#   r:   rE   rl   r|   r   r   )
r   r$   r   r%   rQ   r   rs   rO   rP   r   r   r   r   fuse_distilbertc  s    


z&FusionEmbedLayerNoMask.fuse_distilbertc                 C   s   | j |dgdg}|du rdS | |d }|du rdS |\}}|jd }	| j||dds0dS | j |dgdg}
|
du r@dS |
d }| ||	|sZ| ||	|sTdS |}|}|}| |||scdS | |	||||}| || dS )	a  Fuse embedding layer for Bert
        Args:
            layernorm (NodeProto): node of LayerNormalization or SkipLayerNormalization
            add_before_layernorm (NodeProto): the Add node before LayerNormalization, or the SkipLayerNormalization itself
            input_name_to_nodes (Dict[str, List[NodeProto]]): map from input name to nodes
            output_name_to_node (Dict[str, List[NodeProto]]): map from output name to nodes
        r2   r   NFr   r   r   T)	r   r    r#   r:   rE   rl   r|   r   r   )r   r$   r   r%   rQ   add_2_gatherr   rs   rt   rP   position_embedding_pathrO   tempr   r   r   r   	fuse_bert  s>   	
z FusionEmbedLayerNoMask.fuse_bertc           	      C   s  | j |dgdg}|jdkr|d u rd S |d }d }nP| j |dgdg}| j |dgdg}|d u rG|d urG|d u r>d S |d }|d }n%|d urh|d u rh| j |dgdg}|d u r_d S |d }|d }n|}d }| |||||rwd S | ||||rd S | ||||rd S d S )Nr2   r   r   r   r   )r   r    r+   r   r   r   )	r   rA   r%   rQ   first_add_pathr   r   r!   r"   r   r   r   fuse  s<   



zFusionEmbedLayerNoMask.fuse)r
   )NFN)N)__name__
__module____qualname____doc__r   strr   r   tupler#   dictlistboolrE   rY   r[   rk   rl   r|   r   r   r   r   r   r   r   r   __classcell__r   r   r   r   r	      sT    
T>/HJ
b
Q)2r	   c                       s8   e Zd Zd	def fddZdd Z fddZ  ZS )
FusionEmbedLayerNormalizationFr   c                    s   t  |d || _d S )Nz	with mask)r   r   use_mask_index)r   r   r   r   r   r   r     s   
z&FusionEmbedLayerNormalization.__init__c                 C   s   | j }t|jdkr|j| td|j n"t|jdkr1|jd s1||jd< td|j n	td|j d S |D ]$}td|j |jdkrS|jd |jd< q<|jd	kr`|jd |jd
< q<d S )N   zappend mask to %szreplace mask in %szskip mask in %szupdate mask_index in %sr'   r   r]   r3   r^   )	r   r<   r:   r   r=   r>   r   r+   r8   )r   
mask_int32attention_nodesr   attention_noder   r   r   replace_mask  s"   


z*FusionEmbedLayerNormalization.replace_maskc                    sf  d | _ d | _d | _t ||| | jd u rd S | js'td | d d S | j d u r=| jd u r=td | d d S | j rG| j j	d }n| jj	d }|| }| j
|rkdd |D }| || | d d S ||vr|td	| | d d S || }|jd
v rdd |D }|jdkr|j	d }t|t|kr| j| | || | d d S d S )NzG--use_mask_index is not set: EmbedLayerNormalization will not have maskz EmbedLayerNormalization(no mask)zLEmbedLayerNormalization will not have mask since attention node is not foundr]   r^   c                 S      g | ]	}|j d v r|qS )r'   r3   r*   r-   rA   r   r   r   r/         z6FusionEmbedLayerNormalization.fuse.<locals>.<listcomp>z"EmbedLayerNormalization(with mask)zHEmbedLayerNormalization will not have mask since %s is not a node output)	ReduceSumrK   c                 S   r   r   r*   r   r   r   r   r/   $  r   r   r   )r   r;   r   r   r   r   r=   r>   increase_counterr:   r   r~   r   r+   r<   nodes_to_remover   )r   rA   r%   rQ   r   children_nodesr   r   r   r   r     sJ   









z"FusionEmbedLayerNormalization.fuse)F)r   r   r   r   r   r   r   r   r   r   r   r   r     s    r   N)loggingr   fusion_baser   fusion_utilsr   onnxr   r   r   
onnx_modelr   r   r=   r	   r   r   r   r   r   <module>   s        X