o
     cD                  
   @   sV  d dl mZ d dlmZmZ ddlmZmZ ddlm	Z	m
Z
mZmZmZmZmZmZmZmZmZmZmZmZmZmZ G dd dZG dd	 d	eZG d
d deZG dd deZG dd deZG dd deZG dd deZG dd deZ G dd deZ!edddee" dee" de#fddZ$edd	!d(d"e"d#e%d$e#de%fd%d&Z&d'S ))    )	lru_cache)ListOptional   )COMMON_SAFE_ASCII_CHARACTERSUNICODE_SECONDARY_RANGE_KEYWORD)is_accentuatedis_asciiis_case_variableis_cjkis_emoticon	is_hangulis_hiraganais_katakanais_latinis_punctuationis_separator	is_symbolis_thaiis_unprintableremove_accentunicode_rangec                   @   sP   e Zd ZdZdedefddZdeddfddZdd	d
Ze	de
fddZdS )MessDetectorPluginzy
    Base abstract class used for mess detection plugins.
    All detectors MUST extend and implement given methods.
    	characterreturnc                 C      t )z@
        Determine if given character should be fed in.
        NotImplementedErrorselfr    r    9usr/lib/python3.10/site-packages/charset_normalizer/md.pyeligible      zMessDetectorPlugin.eligibleNc                 C   r   )z
        The main routine to be executed upon character.
        Insert the logic in witch the text would be considered chaotic.
        r   r   r    r    r!   feed%   s   zMessDetectorPlugin.feedc                 C   r   )zB
        Permit to reset the plugin to the initial state.
        r   r   r    r    r!   reset,   r#   zMessDetectorPlugin.resetc                 C   r   )z
        Compute the chaos ratio based on what your feed() has seen.
        Must NOT be lower than 0.; No restriction gt 0.
        r   r%   r    r    r!   ratio2   s   zMessDetectorPlugin.ratior   N)__name__
__module____qualname____doc__strboolr"   r$   r&   propertyfloatr'   r    r    r    r!   r      s    
r   c                   @   V   e Zd ZdddZdedefddZdeddfdd	Zdd
dZe	de
fddZdS ) TooManySymbolOrPunctuationPluginr   Nc                 C   s"   d| _ d| _d| _d | _d| _d S )Nr   F)_punctuation_count_symbol_count_character_count_last_printable_charZ_frenzy_symbol_in_wordr%   r    r    r!   __init__<   s
   
z)TooManySymbolOrPunctuationPlugin.__init__r   c                 C      |  S Nisprintabler   r    r    r!   r"   D      z)TooManySymbolOrPunctuationPlugin.eligiblec                 C   sp   |  j d7  _ || jkr3|tvr3t|r|  jd7  _n| du r3t|r3t|du r3|  jd7  _|| _d S )Nr   F   )	r5   r6   r   r   r3   isdigitr   r   r4   r   r    r    r!   r$   G   s   

z%TooManySymbolOrPunctuationPlugin.feedc                 C   s   d| _ d| _d| _d S Nr   )r3   r5   r4   r%   r    r    r!   r&   Y      
z&TooManySymbolOrPunctuationPlugin.resetc                 C   s0   | j dkrdS | j| j | j  }|dkr|S dS )Nr           g333333?)r5   r3   r4   )r   Zratio_of_punctuationr    r    r!   r'   ^   s   

z&TooManySymbolOrPunctuationPlugin.ratior(   r)   r*   r+   r7   r-   r.   r"   r$   r&   r/   r0   r'   r    r    r    r!   r2   ;   s    

r2   c                   @   r1   )TooManyAccentuatedPluginr   Nc                 C      d| _ d| _d S r?   r5   _accentuated_countr%   r    r    r!   r7   k      
z!TooManyAccentuatedPlugin.__init__r   c                 C   r8   r9   )isalphar   r    r    r!   r"   o   r<   z!TooManyAccentuatedPlugin.eligiblec                 C   s,   |  j d7  _ t|r|  jd7  _d S d S Nr   )r5   r   rF   r   r    r    r!   r$   r   s   zTooManyAccentuatedPlugin.feedc                 C   rD   r?   rE   r%   r    r    r!   r&   x   rG   zTooManyAccentuatedPlugin.resetc                 C   s*   | j dkrdS | j| j  }|dkr|S dS )Nr   rA   gffffff?rE   )r   Zratio_of_accentuationr    r    r!   r'   |   s   
zTooManyAccentuatedPlugin.ratior(   rB   r    r    r    r!   rC   j   s    

rC   c                   @   r1   )UnprintablePluginr   Nc                 C   rD   r?   )_unprintable_countr5   r%   r    r    r!   r7      rG   zUnprintablePlugin.__init__r   c                 C      dS NTr    r   r    r    r!   r"         zUnprintablePlugin.eligiblec                 C   s(   t |r|  jd7  _|  jd7  _d S rI   )r   rK   r5   r   r    r    r!   r$      s   zUnprintablePlugin.feedc                 C   s
   d| _ d S r?   )rK   r%   r    r    r!   r&      s   
zUnprintablePlugin.resetc                 C      | j dkrdS | jd | j  S )Nr   rA      )r5   rK   r%   r    r    r!   r'         
zUnprintablePlugin.ratior(   rB   r    r    r    r!   rJ      s    

rJ   c                   @   r1   )SuspiciousDuplicateAccentPluginr   Nc                 C      d| _ d| _d | _d S r?   _successive_countr5   _last_latin_characterr%   r    r    r!   r7      s   
z(SuspiciousDuplicateAccentPlugin.__init__r   c                 C   s   |  ot|S r9   )rH   r   r   r    r    r!   r"      s   z(SuspiciousDuplicateAccentPlugin.eligiblec                 C   st   |  j d7  _ | jd ur5t|r5t| jr5| r%| j r%|  jd7  _t|t| jkr5|  jd7  _|| _d S rI   )r5   rV   r   isupperrU   r   r   r    r    r!   r$      s   

z$SuspiciousDuplicateAccentPlugin.feedc                 C   rS   r?   rT   r%   r    r    r!   r&      r@   z%SuspiciousDuplicateAccentPlugin.resetc                 C   rO   )Nr   rA   r=   )r5   rU   r%   r    r    r!   r'      rQ   z%SuspiciousDuplicateAccentPlugin.ratior(   rB   r    r    r    r!   rR      s    

rR   c                   @   r1   )SuspiciousRanger   Nc                 C   rS   r?   )"_suspicious_successive_range_countr5   _last_printable_seenr%   r    r    r!   r7      r@   zSuspiciousRange.__init__r   c                 C   r8   r9   r:   r   r    r    r!   r"      r<   zSuspiciousRange.eligiblec                 C   sx   |  j d7  _ | st|s|tv rd | _d S | jd u r"|| _d S t| j}t|}t||r7|  jd7  _|| _d S rI   )r5   isspacer   r   rZ   r    is_suspiciously_successive_rangerY   )r   r   unicode_range_aunicode_range_br    r    r!   r$      s    



zSuspiciousRange.feedc                 C   rS   r?   )r5   rY   rZ   r%   r    r    r!   r&      r@   zSuspiciousRange.resetc                 C   s.   | j dkrdS | jd | j  }|dk rdS |S )Nr   rA   r=   g?)r5   rY   )r   Zratio_of_suspicious_range_usager    r    r!   r'      s   
zSuspiciousRange.ratior(   rB   r    r    r    r!   rX      s    

rX   c                   @   r1   )SuperWeirdWordPluginr   Nc                 C   s:   d| _ d| _d| _d| _d| _d| _d| _d| _d| _d S )Nr   F )	_word_count_bad_word_count_foreign_long_count_is_current_word_bad_foreign_long_watchr5   _bad_character_count_buffer_buffer_accent_countr%   r    r    r!   r7      s   
zSuperWeirdWordPlugin.__init__r   c                 C   rL   rM   r    r   r    r    r!   r"     rN   zSuperWeirdWordPlugin.eligiblec                 C   s  |  rH|  j|7  _t|r|  jd7  _| jdu rFt|du s%t|rFt|du rFt|du rFt|du rFt	|du rFt
|du rFd| _d S | jsMd S | sYt|sYt|r| jr|  jd7  _t| j}|  j|7  _|dkr| j| dkr}d| _t| jd r| jd  r|  jd7  _d| _|dkr| jr|  jd7  _d| _| jr|  jd7  _|  jt| j7  _d| _d| _d| _d	| _d S |d
vr| du rt|rd| _|  j|7  _d S d S d S d S )Nr   FT   g(\?   r`   r   >   |~=-<_>)rH   rg   r   rh   re   r   r   r   r   r   r   r[   r   r   ra   lenr5   rd   rW   rc   rb   rf   r>   r   )r   r   Zbuffer_lengthr    r    r!   r$     sd   


zSuperWeirdWordPlugin.feedc                 C   s4   d| _ d| _d| _d| _d| _d| _d| _d| _d S )Nr`   Fr   )rg   rd   re   rb   ra   r5   rf   rc   r%   r    r    r!   r&   =  s   
zSuperWeirdWordPlugin.resetc                 C   s$   | j dkr| jdkrdS | j| j S )N
   r   rA   )ra   rc   rf   r5   r%   r    r    r!   r'   G  s   zSuperWeirdWordPlugin.ratior(   rB   r    r    r    r!   r_      s    

6
r_   c                   @   sZ   e Zd ZdZdddZdedefddZdeddfd	d
ZdddZ	e
defddZdS )CjkInvalidStopPluginu   
    GB(Chinese) based encoding often render the stop incorrectly when the content does not fit and
    can be easily detected. Searching for the overuse of '丅' and '丄'.
    r   Nc                 C   rD   r?   _wrong_stop_count_cjk_character_countr%   r    r    r!   r7   U  rG   zCjkInvalidStopPlugin.__init__r   c                 C   rL   rM   r    r   r    r    r!   r"   Y  rN   zCjkInvalidStopPlugin.eligiblec                 C   s8   |dv r|  j d7  _ d S t|r|  jd7  _d S d S )N>   u   丅u   丄r   )rw   r   rx   r   r    r    r!   r$   \  s   zCjkInvalidStopPlugin.feedc                 C   rD   r?   rv   r%   r    r    r!   r&   c  rG   zCjkInvalidStopPlugin.resetc                 C   s   | j dk rdS | j| j  S )N   rA   )rx   rw   r%   r    r    r!   r'   g  s   
zCjkInvalidStopPlugin.ratior(   )r)   r*   r+   r,   r7   r-   r.   r"   r$   r&   r/   r0   r'   r    r    r    r!   ru   O  s    

ru   c                   @   r1   )ArchaicUpperLowerPluginr   Nc                 C   s.   d| _ d| _d| _d| _d| _d | _d| _d S )NFr   T)_buf_character_count_since_last_sep_successive_upper_lower_count#_successive_upper_lower_count_finalr5   _last_alpha_seen_current_ascii_onlyr%   r    r    r!   r7   o  s   
z ArchaicUpperLowerPlugin.__init__r   c                 C   rL   rM   r    r   r    r    r!   r"   |  rN   z ArchaicUpperLowerPlugin.eligiblec                 C   s$  |  ot|}|du }|rC| jdkrC| jdkr+| du r+| jdu r+|  j| j7  _d| _d| _d | _d| _|  j	d7  _	d| _d S | jdu rQt
|du rQd| _| jd ur| r_| j sh| r|| j r|| jdu rx|  jd7  _d| _nd| _nd| _|  j	d7  _	|  jd7  _|| _d S )NFr   @   r   Tr=   )rH   r
   r|   r>   r   r~   r}   r   r{   r5   r	   rW   islower)r   r   Zis_concernedZ	chunk_sepr    r    r!   r$     s@   




zArchaicUpperLowerPlugin.feedc                 C   s.   d| _ d| _d| _d| _d | _d| _d| _d S )Nr   FT)r5   r|   r}   r~   r   r{   r   r%   r    r    r!   r&     s   
zArchaicUpperLowerPlugin.resetc                 C   s   | j dkrdS | j| j  S )Nr   rA   )r5   r~   r%   r    r    r!   r'     s   
zArchaicUpperLowerPlugin.ratior(   rB   r    r    r    r!   rz   n  s    

*	rz      )maxsizer]   r^   r   c                 C   sb  | du s|du r
dS | |krdS d| v rd|v rdS d| v s"d|v r$dS d| v s,d|v r6d| v s4d|v r6dS |  d| d}}|D ]}|tv rJqC||v rQ dS qC| dv |dv }}|s_|rid	| v sgd	|v ridS |ro|rodS d
| v swd
|v rd	| v sd	|v rdS | dks|dkrdS d	| v sd	|v s| dv r|dv rd| v sd|v rdS d| v sd|v rdS dS )za
    Determine if two Unicode range seen next to each other can be considered as suspicious.
    NTFZLatinZ	EmoticonsZ	Combining )HiraganaKatakanaCJKHangulzBasic Latin)r   r   ZPunctuationZForms)splitr   )r]   r^   Zkeywords_range_aZkeywords_range_belZrange_a_jp_charsZrange_b_jp_charsr    r    r!   r\     sZ   r\   i   皙?Fdecoded_sequencemaximum_thresholddebugc                 C   s   dd t  D }t| d }d}|dk rd}n	|dkrd}nd	}t| d
 t|D ]2\}}|D ]}	|	|r<|	| q0|dkrG|| dksM||d kr\tdd |D }||kr\ nq*|rk|D ]	}
t|
j	|
j
 qat|dS )zw
    Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.
    c                 S   s   g | ]}| qS r    r    ).0Zmd_classr    r    r!   
<listcomp>	  s    zmess_ratio.<locals>.<listcomp>r   rA   i       r   r      
r   c                 s   s    | ]}|j V  qd S r9   )r'   )r   dtr    r    r!   	<genexpr>   s    zmess_ratio.<locals>.<genexpr>   )r   __subclasses__rs   zipranger"   r$   sumprint	__class__r'   round)r   r   r   Z	detectorslengthZmean_mess_ratioZ!intermediary_mean_mess_ratio_calcr   indexZdetectorr   r    r    r!   
mess_ratio  s2   


r   N)r   F)'	functoolsr   typingr   r   Zconstantr   r   utilsr   r	   r
   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r2   rC   rJ   rR   rX   r_   ru   rz   r-   r.   r\   r0   r   r    r    r    r!   <module>   s@    H"/%4ZLF