U
    bG                     @   s<  d dl mZ d dlmZmZ ddlmZmZ ddlm	Z	m
Z
mZmZmZmZmZmZmZmZmZmZmZmZmZ G dd dZG dd	 d	eZG d
d deZG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZ ee! ee! e"dddZ#eddd#e!e$e"e$dd d!Z%d"S )$    )	lru_cache)ListOptional   )COMMON_SAFE_ASCII_CHARACTERSUNICODE_SECONDARY_RANGE_KEYWORD)is_accentuatedis_asciiis_case_variableis_cjkis_emoticon	is_hangulis_hiraganais_katakanais_latinis_punctuationis_separator	is_symbolis_thairemove_accentunicode_rangec                   @   sP   e Zd ZdZeedddZeddddZddd	d
Ze	e
dddZdS )MessDetectorPluginzy
    Base abstract class used for mess detection plugins.
    All detectors MUST extend and implement given methods.
    	characterreturnc                 C   s   t dS )z@
        Determine if given character should be fed in.
        NNotImplementedErrorselfr    r   9/tmp/pip-unpacked-wheel-2ta4nrol/charset_normalizer/md.pyeligible   s    zMessDetectorPlugin.eligibleNc                 C   s   t dS )z
        The main routine to be executed upon character.
        Insert the logic in witch the text would be considered chaotic.
        Nr   r   r   r   r    feed$   s    zMessDetectorPlugin.feedr   c                 C   s   t dS )zB
        Permit to reset the plugin to the initial state.
        Nr   r   r   r   r    reset+   s    zMessDetectorPlugin.resetc                 C   s   t dS )z
        Compute the chaos ratio based on what your feed() has seen.
        Must NOT be lower than 0.; No restriction gt 0.
        Nr   r$   r   r   r    ratio1   s    zMessDetectorPlugin.ratio)__name__
__module____qualname____doc__strboolr!   r"   r%   propertyfloatr&   r   r   r   r    r      s   r   c                   @   sZ   e Zd ZddddZeedddZedddd	Zddd
dZe	e
dddZdS ) TooManySymbolOrPunctuationPluginNr#   c                 C   s"   d| _ d| _d| _d | _d| _d S )Nr   F)_punctuation_count_symbol_count_character_count_last_printable_charZ_frenzy_symbol_in_wordr$   r   r   r    __init__;   s
    z)TooManySymbolOrPunctuationPlugin.__init__r   c                 C   s   |  S Nisprintabler   r   r   r    r!   C   s    z)TooManySymbolOrPunctuationPlugin.eligiblec                 C   sp   |  j d7  _ || jkrf|tkrft|r8|  jd7  _n.| dkrft|rft|dkrf|  jd7  _|| _d S )Nr   F   )	r2   r3   r   r   r0   isdigitr   r   r1   r   r   r   r    r"   F   s    

z%TooManySymbolOrPunctuationPlugin.feedc                 C   s   d| _ d| _d| _d S Nr   )r0   r2   r1   r$   r   r   r    r%   X   s    z&TooManySymbolOrPunctuationPlugin.resetc                 C   s0   | j dkrdS | j| j | j  }|dkr,|S dS )Nr           g333333?)r2   r0   r1   )r   Zratio_of_punctuationr   r   r    r&   ]   s    

z&TooManySymbolOrPunctuationPlugin.ratior'   r(   r)   r4   r+   r,   r!   r"   r%   r-   r.   r&   r   r   r   r    r/   :   s   r/   c                   @   sZ   e Zd ZddddZeedddZedddd	Zddd
dZe	e
dddZdS )TooManyAccentuatedPluginNr#   c                 C   s   d| _ d| _d S r:   r2   _accentuated_countr$   r   r   r    r4   j   s    z!TooManyAccentuatedPlugin.__init__r   c                 C   s   |  S r5   )isalphar   r   r   r    r!   n   s    z!TooManyAccentuatedPlugin.eligiblec                 C   s(   |  j d7  _ t|r$|  jd7  _d S Nr   )r2   r   r?   r   r   r   r    r"   q   s    zTooManyAccentuatedPlugin.feedc                 C   s   d| _ d| _d S r:   r>   r$   r   r   r    r%   w   s    zTooManyAccentuatedPlugin.resetc                 C   s*   | j dkrdS | j| j  }|dkr&|S dS )Nr   r;   gffffff?r>   )r   Zratio_of_accentuationr   r   r    r&   {   s
    

zTooManyAccentuatedPlugin.ratior<   r   r   r   r    r=   i   s   r=   c                   @   sZ   e Zd ZddddZeedddZedddd	Zddd
dZe	e
dddZdS )UnprintablePluginNr#   c                 C   s   d| _ d| _d S r:   )_unprintable_countr2   r$   r   r   r    r4      s    zUnprintablePlugin.__init__r   c                 C   s   dS NTr   r   r   r   r    r!      s    zUnprintablePlugin.eligiblec                 C   s@   |  dkr.| dkr.|dkr.|  jd7  _|  jd7  _d S )NFr   )isspacer7   rC   r2   r   r   r   r    r"      s    

zUnprintablePlugin.feedc                 C   s
   d| _ d S r:   )rC   r$   r   r   r    r%      s    zUnprintablePlugin.resetc                 C   s   | j dkrdS | jd | j  S )Nr   r;      )r2   rC   r$   r   r   r    r&      s    
zUnprintablePlugin.ratior<   r   r   r   r    rB      s   	rB   c                   @   sZ   e Zd ZddddZeedddZedddd	Zddd
dZe	e
dddZdS )SuspiciousDuplicateAccentPluginNr#   c                 C   s   d| _ d| _d | _d S r:   _successive_countr2   _last_latin_characterr$   r   r   r    r4      s    z(SuspiciousDuplicateAccentPlugin.__init__r   c                 C   s   |  ot|S r5   )r@   r   r   r   r   r    r!      s    z(SuspiciousDuplicateAccentPlugin.eligiblec                 C   st   |  j d7  _ | jd k	rjt|rjt| jrj| rJ| j rJ|  jd7  _t|t| jkrj|  jd7  _|| _d S rA   )r2   rK   r   isupperrJ   r   r   r   r   r    r"      s    z$SuspiciousDuplicateAccentPlugin.feedc                 C   s   d| _ d| _d | _d S r:   rI   r$   r   r   r    r%      s    z%SuspiciousDuplicateAccentPlugin.resetc                 C   s   | j dkrdS | jd | j  S )Nr   r;   r8   )r2   rJ   r$   r   r   r    r&      s    
z%SuspiciousDuplicateAccentPlugin.ratior<   r   r   r   r    rH      s   rH   c                   @   sZ   e Zd ZddddZeedddZedddd	Zddd
dZe	e
dddZdS )SuspiciousRangeNr#   c                 C   s   d| _ d| _d | _d S r:   )"_suspicious_successive_range_countr2   _last_printable_seenr$   r   r   r    r4      s    zSuspiciousRange.__init__r   c                 C   s   |  S r5   r6   r   r   r   r    r!      s    zSuspiciousRange.eligiblec                 C   sx   |  j d7  _ | s&t|s&|tkr0d | _d S | jd krD|| _d S t| j}t|}t||rn|  jd7  _|| _d S rA   )r2   rF   r   r   rO   r    is_suspiciously_successive_rangerN   )r   r   unicode_range_aunicode_range_br   r   r    r"      s&    

zSuspiciousRange.feedc                 C   s   d| _ d| _d | _d S r:   )r2   rN   rO   r$   r   r   r    r%      s    zSuspiciousRange.resetc                 C   s.   | j dkrdS | jd | j  }|dk r*dS |S )Nr   r;   r8   g?)r2   rN   )r   Zratio_of_suspicious_range_usager   r   r    r&      s    
zSuspiciousRange.ratior<   r   r   r   r    rM      s   rM   c                   @   sZ   e Zd ZddddZeedddZedddd	Zddd
dZe	e
dddZdS )SuperWeirdWordPluginNr#   c                 C   s:   d| _ d| _d| _d| _d| _d| _d| _d| _d| _d S )Nr   F )	_word_count_bad_word_count_foreign_long_count_is_current_word_bad_foreign_long_watchr2   _bad_character_count_buffer_buffer_accent_countr$   r   r   r    r4      s    zSuperWeirdWordPlugin.__init__r   c                 C   s   dS rD   r   r   r   r   r    r!     s    zSuperWeirdWordPlugin.eligiblec                 C   s  |  rd| j|g| _t|r0|  jd7  _| jdkrt|dksNt|rt|dkrt|dkrt	|dkrt
|dkrt|dkrd| _d S | jsd S | st|st|r| jr|  jd7  _t| j}|  j|7  _|dkr:| j| dkrd| _t| jd r:| jd  r:|  jd7  _d| _|dkr`| jr`|  jd7  _d| _| jr|  jd7  _|  jt| j7  _d| _d| _d| _d	| _n6|d
kr| dkrt|rd| _|  j|7  _d S )NrT   r   FT   g(\?   r   >   _=><|~-)r@   joinr[   r   r\   rY   r   r   r   r   r   r   rF   r   r   rU   lenr2   rX   rL   rW   rV   rZ   r9   r   )r   r   Zbuffer_lengthr   r   r    r"     sv    





	

 
zSuperWeirdWordPlugin.feedc                 C   s4   d| _ d| _d| _d| _d| _d| _d| _d| _d S )NrT   Fr   )r[   rX   rY   rV   rU   r2   rZ   rW   r$   r   r   r    r%   D  s    zSuperWeirdWordPlugin.resetc                 C   s$   | j dkr| jdkrdS | j| j S )N
   r   r;   )rU   rW   rZ   r2   r$   r   r   r    r&   N  s    zSuperWeirdWordPlugin.ratior<   r   r   r   r    rS      s   6
rS   c                   @   s^   e Zd ZdZddddZeedddZeddd	d
ZddddZ	e
edddZdS )CjkInvalidStopPluginu   
    GB(Chinese) based encoding often render the stop incorrectly when the content does not fit and
    can be easily detected. Searching for the overuse of '丅' and '丄'.
    Nr#   c                 C   s   d| _ d| _d S r:   _wrong_stop_count_cjk_character_countr$   r   r   r    r4   \  s    zCjkInvalidStopPlugin.__init__r   c                 C   s   dS rD   r   r   r   r   r    r!   `  s    zCjkInvalidStopPlugin.eligiblec                 C   s4   |dkr|  j d7  _ d S t|r0|  jd7  _d S )N>      丄   丅r   )rl   r   rm   r   r   r   r    r"   c  s
    zCjkInvalidStopPlugin.feedc                 C   s   d| _ d| _d S r:   rk   r$   r   r   r    r%   j  s    zCjkInvalidStopPlugin.resetc                 C   s   | j dk rdS | j| j  S )N   r;   )rm   rl   r$   r   r   r    r&   n  s    
zCjkInvalidStopPlugin.ratio)r'   r(   r)   r*   r4   r+   r,   r!   r"   r%   r-   r.   r&   r   r   r   r    rj   V  s   rj   c                   @   sZ   e Zd ZddddZeedddZedddd	Zddd
dZe	e
dddZdS )ArchaicUpperLowerPluginNr#   c                 C   s.   d| _ d| _d| _d| _d| _d | _d| _d S )NFr   T)_buf_character_count_since_last_sep_successive_upper_lower_count#_successive_upper_lower_count_finalr2   _last_alpha_seen_current_ascii_onlyr$   r   r   r    r4   v  s    z ArchaicUpperLowerPlugin.__init__r   c                 C   s   dS rD   r   r   r   r   r    r!     s    z ArchaicUpperLowerPlugin.eligiblec                 C   s$  |  ot|}|dk}|r| jdkr| jdkrV| dkrV| jdkrV|  j| j7  _d| _d| _d | _d| _|  j	d7  _	d| _d S | jdkrt
|dkrd| _| jd k	r| r| j s| r| j r| jdkr|  jd7  _d| _qd| _nd| _|  j	d7  _	|  jd7  _|| _d S )NFr   @   r   Tr8   )r@   r
   rs   r9   rw   ru   rt   rv   rr   r2   r	   rL   islower)r   r   Zis_concernedZ	chunk_sepr   r   r    r"     sF    


zArchaicUpperLowerPlugin.feedc                 C   s.   d| _ d| _d| _d| _d | _d| _d| _d S )Nr   FT)r2   rs   rt   ru   rv   rr   rw   r$   r   r   r    r%     s    zArchaicUpperLowerPlugin.resetc                 C   s   | j dkrdS | j| j  S )Nr   r;   )r2   ru   r$   r   r   r    r&     s    
zArchaicUpperLowerPlugin.ratior<   r   r   r   r    rq   u  s   *	rq   )rQ   rR   r   c                 C   s|  | dks|dkrdS | |kr dS d| kr4d|kr4dS d| ksDd|krHdS d| ksXd|krld| kshd|krldS |  d| d }}|D ]}|tkrq||kr dS q| dk|dk }}|s|rd	| ksd	|krdS |r|rdS d
| ksd
|kr d	| ksd	|krdS | dks|dkr dS d	| ksHd	|ksH| dkrx|dkrxd| ks\d|kr`dS d| kstd|krxdS dS )za
    Determine if two Unicode range seen next to each other can be considered as suspicious.
    NTFZLatinZ	EmoticonsZ	Combining )HiraganaKatakanaCJKZHangulzBasic Latin)r|   r{   ZPunctuationZForms)splitr   )rQ   rR   Zkeywords_range_aZkeywords_range_belZrange_a_jp_charsZrange_b_jp_charsr   r   r    rP     sh    rP   i   )maxsize皙?F)decoded_sequencemaximum_thresholddebugr   c                 C   s   dd t  D }t| d }d}|dk r0d}n|dkr>d}nd	}t| d
 t|D ]d\}}|D ]}	|	|r`|	| q`|dkr|| dks||d krTtdd |D }||krT qqT|r|D ]}
t|
j	|
j
 qt|dS )zw
    Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.
    c                 S   s   g | ]
}| qS r   r   ).0Zmd_classr   r   r    
<listcomp>  s    zmess_ratio.<locals>.<listcomp>r   r;   i       i   rx      
r   c                 s   s   | ]}|j V  qd S r5   )r&   )r   dtr   r   r    	<genexpr>&  s     zmess_ratio.<locals>.<genexpr>   )r   __subclasses__rh   zipranger!   r"   sumprint	__class__r&   round)r   r   r   Z	detectorslengthZmean_mess_ratioZ!intermediary_mean_mess_ratio_calcr   indexdetectorr   r   r   r    
mess_ratio  s6    


r   N)r   F)&	functoolsr   typingr   r   Zconstantr   r   utilsr   r	   r
   r   r   r   r   r   r   r   r   r   r   r   r   r   r/   r=   rB   rH   rM   rS   rj   rq   r+   r,   rP   r.   r   r   r   r   r    <module>   s2   D"/%6ZM F     