o
     c-                     @   s  zd dl ZW n ey   d dlZY nw d dlZd dlZd dlmZ d dlmZ d dl	m
Z
 d dlmZ d dlmZmZmZmZmZmZ d dlmZ dd	lmZmZmZmZmZmZ e
ed
dedefddZe
ed
dedefddZ e
ed
dedee fddZ!e
ed
dedefddZ"e
ed
dedefddZ#e
ed
dedefddZ$e
ed
dedefddZ%e
ed
dedefddZ&e
ed
dedefddZ'e
ed
dedefdd Z(dedefd!d"Z)e
ed
dedefd#d$Z*e
ed
dedefd%d&Z+e
ed
dedefd'd(Z,e
ed
dedefd)d*Z-e
ed
dedefd+d,Z.e
e/ed
d-edefd.d/Z0e
ed
dedefd0d1Z1d_d3e2d4e3dee fd5d6Z4e
d7d
d8edefd9d:Z5d3e2deee e2f fd;d<Z6d=edefd>d?Z7d`dAedBedefdCdDZ8dEedee fdFdGZ9dHedIede:fdJdKZ;dHedIedefdLdMZ<dNej=dOfd8edPe3dQeddfdRdSZ>	dadTe2dUedVe?dWe3dXedYedZe2d[ed\ee deeddf fd]d^Z@dS )b    N)IncrementalDecoder)aliases)	lru_cache)findall)	GeneratorListOptionalSetTupleUnion)MultibyteIncrementalDecoder   )ENCODING_MARKSIANA_SUPPORTED_SIMILARRE_POSSIBLE_ENCODING_INDICATIONUNICODE_RANGES_COMBINEDUNICODE_SECONDARY_RANGE_KEYWORDUTF8_MAXIMAL_ALLOCATION)maxsize	characterreturnc                 C   sT   zt | }W n
 ty   Y dS w d|v p)d|v p)d|v p)d|v p)d|v p)d|v S )NFz
WITH GRAVEz
WITH ACUTEzWITH CEDILLAzWITH DIAERESISzWITH CIRCUMFLEXz
WITH TILDEunicodedataname
ValueErrorr   description r   <usr/lib/python3.10/site-packages/charset_normalizer/utils.pyis_accentuated   s    r   c                 C   s.   t | }|s	| S |d}tt|d dS )N r      )r   decompositionsplitchrint)r   Z
decomposedcodesr   r   r   remove_accent,   s
   

r'   c                 C   s.   t | }t D ]\}}||v r|  S qdS )zK
    Retrieve the Unicode range official name from a single character.
    N)ordr   items)r   Zcharacter_ord
range_nameZ	ord_ranger   r   r   unicode_range7   s   r+   c                 C   *   z
t | }W d|v S  ty   Y dS w )NFZLATINr   r   r   r   r   is_latinE   s   r-   c                 C   s&   z|  d W dS  ty   Y dS w )NasciiFT)encodeUnicodeEncodeErrorr   r   r   r   is_asciiN   s   r2   c                 C   s2   t | }d|v rdS t| }|d u rdS d|v S )NPTFZPunctuationr   categoryr+   r   character_categorycharacter_ranger   r   r   is_punctuationW   s   
r9   c                 C   s:   t | }d|v sd|v rdS t| }|d u rdS d|v S )NSNTFZFormsr4   r6   r   r   r   	is_symbolf   s   
r<   c                 C   s   t | }|d u r
dS d|v S )NFZ	Emoticons)r+   )r   r8   r   r   r   is_emoticonu   s   r=   c                 C   s&   |   s| dv r
dS t| }d|v S )N>   u   ｜;,<+>TZ)isspacer   r5   r   r7   r   r   r   is_separator   s   
rF   c                 C   s   |   |  kS N)islowerisupperr1   r   r   r   is_case_variable   s   rJ   c                 C   s   t | }|dkS )NZCo)r   r5   rE   r   r   r   is_private_use_only   s   
rK   c                 C   r,   )NFCJKr   r   Zcharacter_namer   r   r   is_cjk      rN   c                 C   r,   )NFZHIRAGANAr   rM   r   r   r   is_hiragana   rO   rP   c                 C   r,   )NFZKATAKANAr   rM   r   r   r   is_katakana   rO   rQ   c                 C   r,   )NFZHANGULr   rM   r   r   r   	is_hangul   rO   rR   c                 C   r,   )NFZTHAIr   rM   r   r   r   is_thai   rO   rS   r*   c                    s   t  fddtD S )Nc                 3   s    | ]}| v V  qd S rG   r   ).0keywordr*   r   r   	<genexpr>   s    z-is_unicode_range_secondary.<locals>.<genexpr>)anyr   rV   r   rV   r   is_unicode_range_secondary   s   rY   c                 C   s(   |   du o|  du o| dko| dkS )NFu   ﻿)rD   isprintabler1   r   r   r   is_unprintable   s   
r\      sequencesearch_zonec                 C   s   t | tstt| }tt| dt|| jddd}t|dkr$dS |D ]'}| 	dd}t
 D ]\}}||krB|    S ||krL|    S q4q&dS )zW
    Extract using ASCII-only decoder any specified encoding in the first n-bytes.
    Nr.   ignoreerrorsr   -_)
isinstancebytes	TypeErrorlenr   r   mindecodelowerreplacer   r)   )r^   r_   Zseq_lenresultsZspecified_encodingencoding_aliasencoding_ianar   r   r   any_specified_encoding   s&   
rp      r   c                 C   s    | dv pt td| jtS )zQ
    Verify is a specific encoding is a multi byte one based on it IANA name
    >	   Z	utf_8_sigutf_8utf_7	utf_16_le	utf_16_beutf_32	utf_32_le	utf_32_beutf_16encodings.{})
issubclass	importlibimport_moduleformatr   r   )r   r   r   r   is_multi_byte_encoding   s   
r   c                 C   sJ   t D ] }t | }t|tr|g}|D ]}| |r!||f    S qqdS )z9
    Identify and extract SIG/BOM in given sequence.
    )N    )r   re   rf   
startswith)r^   iana_encodingZmarksZmarkr   r   r   identify_sig_or_bom  s   

r   r   c                 C   s   | dvS )N>   rv   ry   r   )r   r   r   r   should_strip_sig_or_bom  s   r   Tcp_namestrictc                 C   sL   |   dd} t D ]\}}| ||fv r|  S q|r$td| | S )Nrc   rd   z Unable to retrieve IANA for '{}')rk   rl   r   r)   r   r~   )r   r   rn   ro   r   r   r   	iana_name!  s   r   decoded_sequencec                 C   s4   t  }| D ]}t|}|d u rq|| qt|S rG   )setr+   addlist)r   Zrangesr   r8   r   r   r   
range_scan1  s   r   iana_name_aiana_name_bc           	      C   s   t | st |r
dS td| j}td|j}|dd}|dd}d}tdD ]}t|g}||||krA|d7 }q,|d S )	Ng        rz   r`   ra   r      r      )r   r|   r}   r~   r   rangerf   rj   )	r   r   Z	decoder_aZ	decoder_bZid_aZid_bZcharacter_match_countiZto_be_decodedr   r   r   cp_similarity?  s*   


r   c                 C   s   | t v o	|t |  v S )z
    Determine if two code page are at least 80% similar. IANA_SUPPORTED_SIMILAR dict was generated using
    the function cp_similarity.
    )r   )r   r   r   r   r   is_cp_similarX  s   
r   charset_normalizerz)%(asctime)s | %(levelname)s | %(message)slevelformat_stringc                 C   s:   t | }|| t  }|t | || d S rG   )logging	getLoggersetLevelStreamHandlersetFormatter	Formatter
addHandler)r   r   r   loggerhandlerr   r   r   set_logging_handlerc  s
   

r   	sequencesro   offsets
chunk_sizebom_or_sig_availablestrip_sig_or_bomsig_payloadis_multi_byte_decoderdecoded_payloadc	                 c   s2   |r|du r|D ]}	||	|	|  }
|
s d S |
V  q	d S |D ]v}	|	| }|t | d kr/q | |	|	|  }|rA|du rA|| }|j||rHdndd}
|r|	dkr| |	 dkrt|d}|r|
d | |vrt|	|	d	 d
D ]#}| || }|r|du r|| }|j|dd}
|
d | |v r nqo|
V  q d S )NF   r`   r   ra   r   rq   r!      )rh   rj   ri   r   )r   ro   r   r   r   r   r   r   r   r   chunkZ	chunk_endZcut_sequenceZchunk_partial_size_chkjr   r   r   cut_sequence_chunksq  sD   

r   )r]   )TrG   )Aunicodedata2r   ImportErrorr|   r   codecsr   Zencodings.aliasesr   	functoolsr   rer   typingr   r   r   r	   r
   r   Z_multibytecodecr   Zconstantr   r   r   r   r   r   strboolr   r'   r+   r-   r2   r9   r<   r=   rF   rJ   rK   rN   rP   rQ   rR   rS   rh   rY   r\   rf   r%   rp   r   r   r   r   r   floatr   r   INFOr   r   r   r   r   r   r   <module>   s      

							
 
	
