o
    
zi`                  #   @   s  d Z ddlZddlZddlmZmZ ddlmZ ddlm	Z	m
Z
mZ ddlmZmZmZmZmZ ddlmZ ddlmZmZ dd	lmZmZ dd
lmZ ddlmZmZ ddlm Z  ddl!m"Z"m#Z#m$Z$ 													d.de
de"de%de%dedB de&dee& dB de%de'de&de%de%dB d e(d!e(d"e(d#e	d$df"d%d&Z)				'		d/d(e#de%dee& dB de&d)e(de%dedB d$e%fd*d+Z*				'	d0d(e#de%dee& dB de&d)e(dedB d$ee fd,d-Z+dS )1zIFunctions that can be used for the most common use-cases for pdfminer.six    N)	ContainerIterator)StringIO)AnyBinaryIOcast)HOCRConverterHTMLConverterPDFPageAggregatorTextConverterXMLConverter)ImageWriter)LAParamsLTPage)	PDFDeviceTagExtractor)PDFValueError)PDFPageInterpreterPDFResourceManager)PDFPage)AnyIO
FileOrNameopen_filenametextutf-8       ?normalFinfoutfpoutput_typecodeclaparamsmaxpagespage_numberspasswordscalerotation
layoutmode
output_dirstrip_controldebugdisable_cachingkwargsreturnc              	   K   sL  |r
t  t j d}|rt|}t| d}d}|dkr'|tjkr'tjj}|dkr5t	|||||d}nE|dkrDt
||||||d}n6|dkrTt|||||
||d}n&|d	krbt|||||d
}n|dkrqt|tt||d}n	d| }t||dusJ t||}tj| |||| dD ]}|j|	 d |_|| q|  dS )ak  Parses text from inf-file and writes to outfp file-like object.

    Takes loads of optional arguments but the defaults are somewhat sane.
    Beware laparams: Including an empty LAParams is not the same as passing
    None!

    :param inf: a file-like object to read PDF structure from, such as a
        file handler (using the builtin `open()` function) or a `BytesIO`.
    :param outfp: a file-like object to write the text to.
    :param output_type: May be 'text', 'xml', 'html', 'hocr', 'tag'.
        Only 'text' works properly.
    :param codec: Text decoding codec
    :param laparams: An LAParams object from pdfminer.layout. Default is None
        but may not layout correctly.
    :param maxpages: How many pages to stop parsing after
    :param page_numbers: zero-indexed page numbers to operate on.
    :param password: For encrypted PDFs, the password to decrypt.
    :param scale: Scale factor
    :param rotation: Rotation factor
    :param layoutmode: Default is 'normal', see
        pdfminer.converter.HTMLConverter
    :param output_dir: If given, creates an ImageWriter for extracted images.
    :param strip_control: Does what it says on the tin
    :param debug: Output more logging data
    :param disable_caching: Does what it says on the tin
    :param other:
    :return: nothing, acting as it does on two streams. Use StringIO to get
        strings.
    Ncachingr   )r!   r"   imagewriterxml)r!   r"   r1   stripcontrolhtml)r!   r&   r(   r"   r1   hocr)r!   r"   r3   tag)r!   z1Output type can be text, html, xml or tag but is r#   r%   r0   ih  )logging	getLoggersetLevelDEBUGr   r   sysstdoutbufferr   r   r	   r   r   r   r   r   r   r   	get_pagesrotateprocess_pageclose)r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r1   rsrcmgrdevicemsginterpreterpage rH   R/home/air/biblejyuku/back/venv/lib/python3.10/site-packages/pdfminer/high_level.pyextract_text_to_fp   sx   /	



rJ   Tpdf_filer0   c              	   C   s   |du rt  }t| dP}t <}tt|}t|d}	t|	|||d}
t|	|
}tj	|||||dD ]}|
| q2| W  d   W  d   S 1 sOw   Y  W d   dS 1 s_w   Y  dS )aw  Parse and return the text contained in a PDF file.

    :param pdf_file: Either a file path or a file-like object for the PDF file
        to be worked on.
    :param password: For encrypted PDFs, the password to decrypt.
    :param page_numbers: List of zero-indexed page numbers to extract.
    :param maxpages: The maximum number of pages to parse
    :param caching: If resources should be cached
    :param codec: Text decoding codec
    :param laparams: An LAParams object from pdfminer.layout. If None, uses
        some default settings that often work well.
    :return: a string containing all of the text extracted.
    Nrbr/   )r!   r"   r7   )r   r   r   r   r   r   r   r   r   r?   rA   getvalue)rK   r%   r$   r#   r0   r!   r"   fpoutput_stringrC   rD   rF   rG   rH   rH   rI   extract_text   s"   



RrP   c                 c   s    |du rt  }t| d7}tt|}t|d}t||d}t||}	tj|||||dD ]}
|		|
 |
 }|V  q-W d   dS 1 sGw   Y  dS )a  Extract and yield LTPage objects

    :param pdf_file: Either a file path or a file-like object for the PDF file
        to be worked on.
    :param password: For encrypted PDFs, the password to decrypt.
    :param page_numbers: List of zero-indexed page numbers to extract.
    :param maxpages: The maximum number of pages to parse
    :param caching: If resources should be cached
    :param laparams: An LAParams object from pdfminer.layout. If None, uses
        some default settings that often work well.
    :return: LTPage objects
    NrL   r/   )r"   r7   )r   r   r   r   r   r
   r   r   r?   rA   
get_result)rK   r%   r$   r#   r0   r"   rN   resource_managerrD   rF   rG   layoutrH   rH   rI   extract_pages   s(   




"rT   )r   r   Nr   Nr   r   r   r   NFFF)r   Nr   Tr   N)r   Nr   TN),__doc__r8   r<   collections.abcr   r   ior   typingr   r   r   pdfminer.converterr   r	   r
   r   r   pdfminer.imager   pdfminer.layoutr   r   pdfminer.pdfdevicer   r   pdfminer.pdfexceptionsr   pdfminer.pdfinterpr   r   pdfminer.pdfpager   pdfminer.utilsr   r   r   strintfloatboolrJ   rP   rT   rH   rH   rH   rI   <module>   s    
	

|

-
