o
     Xxi                     @   sT   d Z ddlmZmZmZmZ ddlmZ ddlm	Z	 ddl
mZ G dd deZd	S )
u  
The analyzer module supplies Analyzer framework for pre-processing and post-processing for morphological analysis.

Added in *version 0.3.4*

**NOTE** This is experimental. The class/method interfaces can be modified in the future releases.

Usage:

>>> from janome.tokenizer import Tokenizer
>>> from janome.analyzer import Analyzer
>>> from janome.charfilter import *
>>> from janome.tokenfilter import *
>>> text = '蛇の目はPure Ｐｙｔｈｏｎな形態素解析器です。'
>>> char_filters = [UnicodeNormalizeCharFilter(), RegexReplaceCharFilter('蛇の目', 'janome')]
>>> tokenizer = Tokenizer()
>>> token_filters = [CompoundNounFilter(), POSStopFilter(['記号','助詞']), LowerCaseFilter()]
>>> a = Analyzer(char_filters=char_filters, tokenizer=tokenizer, token_filters=token_filters)
>>> for token in a.analyze(text):
...     print(token)
...
janome	名詞,固有名詞,組織,*,*,*,*,*,*
pure	名詞,固有名詞,組織,*,*,*,*,*,*
python	名詞,一般,*,*,*,*,*,*,*
な	助動詞,*,*,*,特殊・ダ,体言接続,だ,ナ,ナ
形態素解析器	名詞,複合,*,*,*,*,形態素解析器,ケイタイソカイセキキ,ケイタイソカイセキキ
です	助動詞,*,*,*,特殊・デス,基本形,です,デス,デス

Usage (word count with TokenCountFilter):

>>> from janome.tokenizer import Tokenizer
>>> from janome.analyzer import Analyzer
>>> from janome.tokenfilter import *
>>> text = 'すもももももももものうち'
>>> token_filters = [POSKeepFilter(['名詞']), TokenCountFilter()]
>>> a = Analyzer(token_filters=token_filters)
>>> for k, v in a.analyze(text):
...   print('%s: %d' % (k, v))
...
もも: 2
すもも: 1
うち: 1
    )ListIteratorAnyOptional   )	Tokenizer)
CharFilter)TokenFilterc                   @   sR   e Zd ZdZg dg ddee dee dee fddZ	d	e
d
ee fddZdS )Analyzerz
    An Analyzer analyzes Japanese texts with customized :class:`.CharFilter` chain,
    :class:`.Tokenizer` and :class:`.TokenFilter` chain.

    Added in *version 0.3.4*
    N)char_filters	tokenizertoken_filtersr   r   r   c                C   s2   |st  | _n
|jrtd|| _|| _|| _dS )a  
        Initialize Analyzer object with CharFilters, a Tokenizer and TokenFilters.

        :param char_filters: (Optional) CharFilters list. CharFilters are applied to the input text
                             in the list order. default is the empty list.
        :param tokenizer: (Optional) A Tokenizer object. Tokenizer tokenizes the text modified by
                          *char_filters*. default is Tokenizer initialized with no extra options.
                          **WARNING:** A Tokenizer initialized with *wakati=True* option is not accepted.
        :param token_filters: (Optional) TokenFilters list. TokenFilters are applied to the Tokenizer's
                              output in the list order. default is the empty list.
        zFInvalid argument: A Tokenizer with wakati=True option is not accepted.N)r   r   wakati	Exceptionr   r   )selfr   r   r    r   N/home/air/biblejyuku/back/venv/lib/python3.10/site-packages/janome/analyzer.py__init__J   s   

zAnalyzer.__init__textreturnc                 C   s<   | j D ]}||}q| jj|dd}| jD ]}||}q|S )a8  
        Analyze the input text with custom CharFilters, Tokenizer and TokenFilters.

        :param text: unicode string to be tokenized

        :return: token generator. emitted element type depends on the output of the last TokenFilter.
                 (e.g., ExtractAttributeFilter emits strings.)
        F)r   )r   r   tokenizer   )r   r   cfiltertokenstfilterr   r   r   analyzeb   s   
	


zAnalyzer.analyze)__name__
__module____qualname____doc__r   r   r   r   r	   r   strr   r   r   r   r   r   r   r
   B   s    
r
   N)r   typingr   r   r   r   r   r   
charfilterr   tokenfilterr	   objectr
   r   r   r   r   <module>   s   ,