U
    b\$                     @   s,  zd dl ZW n ek
r(   d dlZY nX d dlZd dlZd dlmZ d dlmZ d dl	m
Z
 d dlmZ d dlmZmZmZmZmZ d dlmZ dd	lmZmZmZmZmZmZ e
ed
eedddZe
ed
eedddZe
ed
eee dddZ e
ed
eedddZ!eedddZ"e
ed
eedddZ#e
ed
eedddZ$e
ed
eedddZ%e
ed
eedddZ&e
ed
eedddZ'eedd d!Z(e
ed
eedd"d#Z)e
ed
eedd$d%Z*e
ed
eedd&d'Z+e
ed
eedd(d)Z,e
ed
eedd*d+Z-e
e.ed
eed,d-d.Z/dNe0e1ee d0d1d2Z2e
d3d
eed4d5d6Z3e0eee e0f d7d8d9Z4eed:d;d<Z5dOeeed>d?d@Z6eee dAdBdCZ7eee8dDdEdFZ9eeedDdGdHZ:dIej;dJfee1eddKdLdMZ<dS )P    N)IncrementalDecoder)aliases)	lru_cache)findall)ListOptionalSetTupleUnion)MultibyteIncrementalDecoder   )ENCODING_MARKSIANA_SUPPORTED_SIMILARRE_POSSIBLE_ENCODING_INDICATIONUNICODE_RANGES_COMBINEDUNICODE_SECONDARY_RANGE_KEYWORDUTF8_MAXIMAL_ALLOCATION)maxsize)	characterreturnc                 C   sV   zt | }W n tk
r$   Y dS X d|kpTd|kpTd|kpTd|kpTd|kpTd|kS )NFz
WITH GRAVEz
WITH ACUTEzWITH CEDILLAzWITH DIAERESISzWITH CIRCUMFLEXz
WITH TILDEunicodedataname
ValueErrorr   description r   </tmp/pip-unpacked-wheel-2ta4nrol/charset_normalizer/utils.pyis_accentuated   s    r   c                 C   s.   t | }|s| S |d}tt|d dS )N r      )r   decompositionsplitchrint)r   Z
decomposedcodesr   r   r   remove_accent*   s
    

r&   c                 C   s.   t | }t D ]\}}||kr|  S qdS )zK
    Retrieve the Unicode range official name from a single character.
    N)ordr   items)r   Zcharacter_ord
range_nameZ	ord_ranger   r   r   unicode_range5   s
    
r*   c                 C   s.   zt | }W n tk
r$   Y dS X d|kS )NFZLATINr   r   r   r   r   is_latinC   s
    r+   c                 C   s*   z|  d W n tk
r$   Y dS X dS )NasciiFT)encodeUnicodeEncodeErrorr   r   r   r   is_asciiL   s
    r0   c                 C   s2   t | }d|krdS t| }|d kr*dS d|kS )NPTFZPunctuationr   categoryr*   r   character_categorycharacter_ranger   r   r   is_punctuationT   s    
r7   c                 C   s:   t | }d|ksd|krdS t| }|d kr2dS d|kS )NSNTFZFormsr2   r4   r   r   r   	is_symbolc   s    
r:   c                 C   s   t | }|d krdS d|kS )NFZ	Emoticons)r*   )r   r6   r   r   r   is_emoticonr   s    r;   c                 C   s&   |   s| dkrdS t| }d|kS )N>   ;>+<   ｜,TZ)isspacer   r3   r   r5   r   r   r   is_separator|   s    
rE   c                 C   s   |   |  kS N)islowerisupperr/   r   r   r   is_case_variable   s    rI   c                 C   s   t | }|dkS )NCo)r   r3   rD   r   r   r   is_private_use_only   s    
rK   c                 C   s.   zt | }W n tk
r$   Y dS X d|kS )NFCJKr   r   Zcharacter_namer   r   r   is_cjk   s
    rN   c                 C   s.   zt | }W n tk
r$   Y dS X d|kS )NFZHIRAGANAr   rM   r   r   r   is_hiragana   s
    rO   c                 C   s.   zt | }W n tk
r$   Y dS X d|kS )NFZKATAKANAr   rM   r   r   r   is_katakana   s
    rP   c                 C   s.   zt | }W n tk
r$   Y dS X d|kS )NFZHANGULr   rM   r   r   r   	is_hangul   s
    rQ   c                 C   s.   zt | }W n tk
r$   Y dS X d|kS )NFZTHAIr   rM   r   r   r   is_thai   s
    rR   )r)   r   c                    s   t  fddtD S )Nc                 3   s   | ]}| kV  qd S rF   r   ).0keywordr)   r   r   	<genexpr>   s     z-is_unicode_range_secondary.<locals>.<genexpr>)anyr   rU   r   rU   r   is_unicode_range_secondary   s    rX      )sequencesearch_zoner   c                 C   s   t | tstt| }tt| dt|| jddd}t|dkrHdS |D ]N}| 	dd}t
 D ]0\}}||kr|    S ||krh|    S qhqLdS )zW
    Extract using ASCII-only decoder any specified encoding in the first n-bytes.
    Nr,   ignoreerrorsr   -_)
isinstancebytes	TypeErrorlenr   r   mindecodelowerreplacer   r(   )rZ   r[   Zseq_lenresultsZspecified_encodingencoding_aliasencoding_ianar   r   r   any_specified_encoding   s"    
rl      )r   r   c                 C   s    | dkpt td| jtS )zQ
    Verify is a specific encoding is a multi byte one based on it IANA name
    >	   utf_16utf_7	utf_16_be	utf_16_le	utf_8_sigutf_8	utf_32_be	utf_32_leutf_32encodings.{})
issubclass	importlibimport_moduleformatr   r   )r   r   r   r   is_multi_byte_encoding   s    
r|   )rZ   r   c                 C   sJ   t D ]@}t | }t|tr |g}|D ]}| |r$||f    S q$qdS )z9
    Identify and extract SIG/BOM in given sequence.
    )N    )r   ra   rb   
startswith)rZ   iana_encodingZmarksmarkr   r   r   identify_sig_or_bom   s    

r   )r   r   c                 C   s   | dkS )N>   rv   rn   r   )r   r   r   r   should_strip_sig_or_bom  s    r   T)cp_namestrictr   c                 C   sL   |   dd} t D ]\}}| ||fkr|  S q|rHtd| | S )Nr_   r`   z Unable to retrieve IANA for '{}')rg   rh   r   r(   r   r{   )r   r   rj   rk   r   r   r   	iana_name  s    
r   )decoded_sequencer   c                 C   s4   t  }| D ] }t|}|d kr q
|| q
t|S rF   )setr*   addlist)r   rangesr   r6   r   r   r   
range_scan  s    r   )iana_name_aiana_name_br   c           	      C   s   t | st |rdS td| j}td|j}|dd}|dd}d}tdD ]*}t|g}||||krX|d7 }qX|d S )	Ng        rw   r\   r]   r      r      )r|   ry   rz   r{   r   rangerb   rf   )	r   r   Z	decoder_aZ	decoder_bZid_aZid_bZcharacter_match_countiZto_be_decodedr   r   r   cp_similarity+  s    



r   c                 C   s   | t ko|t |  kS )z
    Determine if two code page are at least 80% similar. IANA_SUPPORTED_SIMILAR dict was generated using
    the function cp_similarity.
    )r   )r   r   r   r   r   is_cp_similar@  s    
r   Zcharset_normalizerz)%(asctime)s | %(levelname)s | %(message)s)r   levelformat_stringr   c                 C   s:   t | }|| t  }|t | || d S rF   )logging	getLoggersetLevelStreamHandlersetFormatter	Formatter
addHandler)r   r   r   loggerhandlerr   r   r   set_logging_handlerK  s
    

r   )rY   )T)=Zunicodedata2r   ImportErrorry   r   codecsr   Zencodings.aliasesr   	functoolsr   rer   typingr   r   r   r	   r
   Z_multibytecodecr   Zconstantr   r   r   r   r   r   strboolr   r&   r*   r+   r0   r7   r:   r;   rE   rI   rK   rN   rO   rP   rQ   rR   rd   rX   rb   r$   rl   r|   r   r   r   r   floatr   r   INFOr   r   r   r   r   <module>   s|    

							