
    is                       U d Z ddlmZ ddlZddlZddlmZ ddlmZm	Z	m
Z
mZmZmZmZ ddlmZmZmZ ddlmZ ddlmZ d	Z ej0                           G d
 de      Z G d de      Zej6                  ej8                  ej:                  ej<                  ej>                  ej@                  ejB                  ejD                  ejF                  ejH                  ejJ                  ejL                  dZ'de(d<    G d de      Z)	 	 	 	 	 	 d dZ*dZ+	 	 	 	 	 	 	 	 	 	 d!dZ,d"d#dZ-	 d"	 	 	 	 	 	 	 d$dZ.	 d"	 	 	 	 	 	 	 d$dZ/	 	 	 	 	 	 d%dZ0	 d"	 	 	 	 	 	 	 d#dZ1e-Z	 d"	 	 	 	 	 	 	 d#dZ2	 	 d&	 	 	 	 	 	 	 	 	 d'dZ3d(dZ4d)dZ5d*dZ6y)+z{
ftfy: fixes text for you

This is a module for making text less broken. See the `fix_text` function
for more information.
    )annotationsN)Iterator)AnyBinaryIOCallableLiteral
NamedTupleTextIOcast)
bad_codecschardatafixes)is_bad)display_ljustz6.3.1c                  .    e Zd ZU dZded<   ded<   ddZy)ExplanationStepa  
    A step in an ExplainedText, explaining how to decode text.

    The possible actions are:

    - "encode": take in a string and encode it as bytes, with the given encoding
    - "decode": take in bytes and decode them as a string, with the given encoding
    - "transcode": convert bytes to bytes with a particular named function
    - "apply": convert str to str with a particular named function

    The `parameter` is the name of the encoding or function to use. If it's a
    function, it must appear in the FIXERS dictionary.
    straction	parameterc                *    t        t        |             S )z
        Get the string representation of an ExplanationStep. We output the
        representation of the equivalent tuple, for simplicity.
        )reprtuple)selfs    i/home/developers/rajanand/mypropertyqr-fmb-refixing-v2/venv/lib/python3.12/site-packages/ftfy/__init__.py__repr__zExplanationStep.__repr__5   s    
 E$K      N)returnr   )__name__
__module____qualname____doc____annotations__r    r   r   r   r   #   s     KN!r   r   c                  &    e Zd ZU dZded<   ded<   y)ExplainedTexta  
    The return type from ftfy's functions that provide an "explanation" of which
    steps it applied to fix the text, such as :func:`fix_and_explain()`.

    When the 'explain' option is disabled, these functions return the same
    type, but the `explanation` will be None.
    r   textlist[ExplanationStep] | NoneexplanationN)r   r   r    r!   r"   r#   r   r   r%   r%   =   s     I--r   r%   )unescape_htmlremove_terminal_escapesrestore_byte_a0replace_lossy_sequencesdecode_inconsistent_utf8fix_c1_controlsfix_latin_ligaturesfix_character_widthuncurl_quotesfix_line_breaksfix_surrogatesremove_control_charszdict[str, Callable]FIXERSc                      e Zd ZU dZdZded<   dZded<   dZded<   dZded	<   dZ	ded
<   dZ
ded<   dZded<   dZded<   dZded<   dZded<   dZded<   dZded<   dZded<   dZded<   dZded<   dZded<   y)TextFixerConfigu[  
    A TextFixerConfig object stores configuration options for ftfy.

    It's implemented as a namedtuple with defaults, so you can instantiate
    it by providing the values to change from their defaults as keyword arguments.
    For example, to disable 'unescape_html' and keep the rest of the defaults::

        TextFixerConfig(unescape_html=False)

    Here are the options and their default values:

    - `unescape_html`: "auto"

      Configures whether to replace HTML entities such as &amp; with the character
      they represent. "auto" says to do this by default, but disable it when a
      literal < character appears, indicating that the input is actual HTML and
      entities should be preserved. The value can be True, to always enable this
      fixer, or False, to always disable it.

    - `remove_terminal_escapes`: True

      Removes "ANSI" terminal escapes, such as for changing the color of text in a
      terminal window.

    - `fix_encoding`: True

      Detect mojibake and attempt to fix it by decoding the text in a different
      encoding standard.

      The following four options affect `fix_encoding` works, and do nothing if
      `fix_encoding` is False:

      - `restore_byte_a0`: True

        Allow a literal space (U+20) to be interpreted as a non-breaking space
        (U+A0) when that would make it part of a fixable mojibake string.

        Because spaces are very common characters, this could lead to false
        positives, but we try to apply it only when there's strong evidence for
        mojibake. Disabling `restore_byte_a0` is safer from false positives,
        but creates false negatives.

      - `replace_lossy_sequences`: True

        Detect mojibake that has been partially replaced by the characters
        '�' or '?'. If the mojibake could be decoded otherwise, replace the
        detected sequence with '�'.

      - `decode_inconsistent_utf8`: True

        When we see sequences that distinctly look like UTF-8 mojibake, but
        there's no consistent way to reinterpret the string in a new encoding,
        replace the mojibake with the appropriate UTF-8 characters anyway.

        This helps to decode strings that are concatenated from different
        encodings.

      - `fix_c1_controls`: True

        Replace C1 control characters (the useless characters U+80 - U+9B that
        come from Latin-1) with their Windows-1252 equivalents, like HTML5 does,
        even if the whole string doesn't decode as Latin-1.

    - `fix_latin_ligatures`: True

      Replace common Latin-alphabet ligatures, such as ``ﬁ``, with the
      letters they're made of.

    - `fix_character_width`: True

      Replace fullwidth Latin characters and halfwidth Katakana with
      their more standard widths.

    - `uncurl_quotes`: True

      Replace curly quotes with straight quotes.

    - `fix_line_breaks`: True

      Replace various forms of line breaks with the standard Unix line
      break, ``\n``.

    - `fix_surrogates`: True

      Replace sequences of UTF-16 surrogate codepoints with the character
      they were meant to encode. This fixes text that was decoded with the
      obsolete UCS-2 standard, and allows it to support high-numbered
      codepoints such as emoji.

    - `remove_control_chars`: True

      Remove certain control characters that have no displayed effect on text.

    - `normalization`: "NFC"

      Choose what kind of Unicode normalization is applied. Usually, we apply
      NFC normalization, so that letters followed by combining characters become
      single combined characters.

      Changing this to "NFKC" applies more compatibility conversions, such as
      replacing the 'micro sign' with a standard Greek lowercase mu, which looks
      identical. However, some NFKC normalizations change the meaning of text,
      such as converting "10³" to "103".

    `normalization` can be None, to apply no normalization.

    - `max_decode_length`: 1_000_000

      The maximum size of "segment" that ftfy will try to fix all at once.

    - `explain`: True

      Whether to compute 'explanations', lists describing what ftfy changed.
      When this is False, the explanation will be None, and the code that
      builds the explanation will be skipped, possibly saving time.

      Functions that accept TextFixerConfig and don't return an explanation
      will automatically set `explain` to False.
    autoz
str | boolr)   Tboolr*   fix_encodingr+   r,   r-   r.   r/   r0   r1   r2   r3   r4   NFCz,Literal['NFC', 'NFD', 'NFKC', 'NFKD'] | Nonenormalizationi@B intmax_decode_lengthexplainN)r   r   r    r!   r)   r"   r*   r:   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r<   r>   r?   r#   r   r   r7   r7   [   s    vp !'M:&$(T(L$ OT $(T(%)d) OT  $$ $$M4 OT ND!%$%BGM?G$s$GTr   r7   c                    d|v r7t        j                  dt        d       |j                         }|d   |d<   |d=  | j                  di |} | S )z
    Handle parameters provided as keyword arguments to ftfy's top-level
    functions, converting them into a TextFixerConfig.
    fix_entitiesz2`fix_entities` has been renamed to `unescape_html`   )
stacklevelr)   r#   )warningswarnDeprecationWarningcopy_replace)configkwargss     r   _config_from_kwargsrK      s]     @	

 "("8>"V__&v&FMr   a  Hey wait, this isn't Unicode.

ftfy is designed to fix problems with text. Treating bytes like they're
interchangeable with Unicode text is usually something that introduces
problems with text.

You should first decode these bytes from the encoding you think they're in.
If you're not sure what encoding they're in:

- First, try to find out. 'utf-8' is a good assumption.
- If the encoding is simply unknowable, try running your bytes through
  ftfy.guess_bytes. As the name implies, this may not always be accurate.

For more information on the distinction between bytes and text, read the
Python Unicode HOWTO:

    http://docs.python.org/3/howto/unicode.html
c                    t        ||       rCt        |    } ||      }| ||k7  r|j                  t        d|              t	        t
        |      S |S )z
    A helper function used across several 'fixer' steps, deciding whether to
    apply the fix and whether to record the fix in `steps`.
    apply)getattrr5   appendr   r   r   )
fixer_namer&   rI   stepsfixerfixeds         r   _try_fixrT     sR     vz"z"d$LL*=>CKr   c                   |t        d      }t        ||      }t        | t              rt	        t
              g }d}|t        |       k  r| j                  d|      dz   }|dk(  rt        |       }||z
  |j                  kD  r||j                  z   }| || }|j                  dk(  rd|v r|j                  d      }t        ||      \  }}|j                  |       |}|t        |       k  rd	j                  |      S )
u  
    Given Unicode text as input, fix inconsistencies and glitches in it,
    such as mojibake (text that was decoded in the wrong encoding).

    Let's start with some examples:

        >>> fix_text('âœ” No problems')
        '✔ No problems'

        >>> print(fix_text("&macr;\\_(ã\x83\x84)_/&macr;"))
        ¯\_(ツ)_/¯

        >>> fix_text('Broken text&hellip; it&#x2019;s ﬂubberiﬁc!')
        "Broken text... it's flubberific!"

        >>> fix_text('ＬＯＵＤ　ＮＯＩＳＥＳ')
        'LOUD NOISES'

    ftfy applies a number of different fixes to the text, and can accept
    configuration to select which fixes to apply.

    The configuration takes the form of a :class:`TextFixerConfig` object,
    and you can see a description of the options in that class's docstring
    or in the full documentation at ftfy.readthedocs.org.

    For convenience and backward compatibility, the configuration can also
    take the form of keyword arguments, which will set the equivalently-named
    fields of the TextFixerConfig object.

    For example, here are two ways to fix text but skip the "uncurl_quotes"
    step::

        fix_text(text, TextFixerConfig(uncurl_quotes=False))
        fix_text(text, uncurl_quotes=False)

    This function fixes text in independent segments, which are usually lines
    of text, or arbitrarily broken up every 1 million codepoints (configurable
    with `config.max_decode_length`) if there aren't enough line breaks. The
    bound on segment lengths helps to avoid unbounded slowdowns.

    ftfy can also provide an 'explanation', a list of transformations it applied
    to the text that would fix more text like it. This function doesn't provide
    explanations (because there may be different fixes for different segments
    of text).

    To get an explanation, use the :func:`fix_and_explain()` function, which
    fixes the string in one segment and explains what it fixed.
    Fr?   r   
   r8   <r)    )r7   rK   
isinstancebytesUnicodeErrorBYTES_ERROR_TEXTlenfindr>   r)   rH   fix_and_explainrO   join)	r&   rI   rJ   outpos	textbreaksegmentfixed_segment_s	            r   fix_textrj   "  s   d ~ / 0F$+,,
C
C
D	/IIdC(1,	>D	IOv777f666Is9%6)cWn__5_9F*7F;q

=! D	/ 773<r   c                x   |
t               }t        | t              rt        t              t        ||      }|j                  dk(  rd| v r|j                  d      }|j                  rg }nd}	 | }t        d| ||      } |j                  r0|t        |       } n"t        | |      \  } }||j                  |       dD ]  }t        || ||      }  |j                  Nt        j                  |j                  |       }|*|| k7  r%|j!                  t#        d|j                               |} | |k(  rt%        | |      S )	z
    Fix text as a single segment, returning the fixed text and an explanation
    of what was fixed.

    The explanation is a list of steps that can be applied with
    :func:`apply_plan`, or if config.explain is False, it will be None.
    Nr8   rY   FrZ   r)   )r.   r/   r0   r1   r2   r3   r*   r4   	normalize)r7   r\   r]   r^   r_   rK   r)   rH   r?   rT   r:   fix_encoding_and_explainextendr<   unicodedatarl   rO   r   r%   )r&   rI   rJ   rQ   origtextencoding_stepsrR   rS   s           r   rb   rb   l  sK    ~ "$+,, 0Fv%#+u5~~.0 
vu=}#D)'?f'M$n!-LL0	
 
	8E E47D
	8 +))&*>*>EE Ud]_[&:N:NOPD8 u--A r   c                   |
t               }t        | t              rt        t              t        ||      }|j                  st        | g       S g }	 | }t        | |      \  } }||j                  |       | |k(  rt        | |      S 6)u  
    Apply the steps of ftfy that detect mojibake and fix it. Returns the fixed
    text and a list explaining what was fixed.

    This includes fixing text by encoding and decoding it in different encodings,
    as well as the subordinate fixes `restore_byte_a0`, `replace_lossy_sequences`,
    `decode_inconsistent_utf8`, and `fix_c1_controls`.

    Examples::

        >>> fix_encoding_and_explain("sÃ³")
        ExplainedText(text='só', explanation=[('encode', 'latin-1'), ('decode', 'utf-8')])

        >>> result = fix_encoding_and_explain("voilÃ le travail")
        >>> result.text
        'voilà le travail'
        >>> result.explanation
        [('encode', 'latin-1'), ('transcode', 'restore_byte_a0'), ('decode', 'utf-8')]

    )
r7   r\   r]   r^   r_   rK   r:   r%   "_fix_encoding_one_step_and_explainrn   )r&   rI   rJ   plan_so_farprevtextplans         r   rm   rm     s    . ~ "$+,, 0F T2&&)+K
7fE
dt$8 {33 r   c                   |
t               }t        |       dk(  rt        | g       S t        j                  | d      st        |       st        | g       S g }t        j                  D ]E  }t        j                  | |      s|j                  |       | j                  |      }t        d|      }g }	 d}|j                  r[|dk7  rVt        j                  j                  |      r7t        j                  |      }||k7  r|j                  t        dd             |}|j                  rH|j                  d      r7t        j                  |      }||k7  r|j                  t        dd	             |}d
|v sd|v rd}t        d|      }	|g|z   |	gz   }
|j!                  |      }t        ||
      c S  |j$                  rRt        j&                  j                  |       r3t        dd      g}
t        j$                  |       }|| k7  rt        ||
      S d|v r[d|v rt        | g       S 	 | j                  d      j!                  d      }|| k7  r$t        dd      t        dd      g}
t        ||
      S 	 |j(                  rMt        j*                  j                  |       r.t        dd      g}
t        j(                  |       }t        ||
      S t        | g       S # t"        $ r Y ww xY w# t"        $ r Y w xY w)z:
    Perform one step of fixing the encoding of text.
    r   asciiencodeutf-8macroman	transcoder+   sloppyr,         utf-8-variantsdecoderM   r-   zlatin-1zwindows-1252r.   )r7   r`   r%   r   possible_encodingr   CHARMAP_ENCODINGSrO   ry   r   r+   ALTERED_UTF8_REsearchr   r,   
startswithr   UnicodeDecodeErrorr-   UTF8_DETECTOR_REr.   C1_CONTROL_RE)r&   rI   possible_1byte_encodingsencodingencoded_bytesencode_steptranscode_stepsdecodingreplaced_bytesdecode_steprQ   rS   s               r   rs   rs     s     ~ "
4yA~T2&& !!$0tT2&&
  "
 .. .%%dH5$++H5 KK1M)(H=K O%" ** J. 0077F%*%:%:=%IN%6'..+K9JK )7 11h6I6I(6S%*%B%B=%QN%6'..+K9RS )7=(DM,A/H-hA$7;-G%,,X6$UE22W.b &&8+D+D+K+KD+Q *DEF..t4D= ..
 ,,55 !r**
	I.55nED=')<'.AE )66 ! ("8"8"?"?"E .?@A%%d+UE** r""[ & < & s&   2C9K"AK! 	KK!	K-,K-c                X    |t        d      }t        ||      }t        | |      \  }}|S )u   
    Apply just the encoding-fixing steps of ftfy to this text. Returns the
    fixed text, discarding the explanation.

        >>> fix_encoding("Ã³")
        'ó'
        >>> fix_encoding("&ATILDE;&SUP3;")
        '&ATILDE;&SUP3;'
    FrV   )r7   rK   rm   r&   rI   rJ   rS   _explans        r   r:   r:   J  s4     ~ / 0F-dF;NE7Lr   c                X    |t        d      }t        ||      }t        | |      \  }}|S )z
    Fix text as a single segment, with a consistent sequence of steps that
    are applied to fix the text. Discard the explanation.
    FrV   )r7   rK   rb   r   s        r   fix_text_segmentr   a  s4     ~ / 0F$T62NE7Lr   c              +    K   |
t               }t        ||      }| D ]l  }t        |t              r"|t	        |      \  }}n|j                  |      }|j                  dk(  rd|v r|j                  d      }t        ||      \  }}| n yw)a  
    Fix text that is found in a file.

    If the file is being read as Unicode text, use that. If it's being read as
    bytes, then we hope an encoding was supplied. If not, unfortunately, we
    have to guess what encoding it is. We'll try a few common encodings, but we
    make no promises. See the `guess_bytes` function for how this is done.

    The output is a stream of fixed lines of text.
    Nr8   rY   FrZ   )	r7   rK   r\   r]   guess_bytesr   r)   rH   rb   )
input_filer   rI   rJ   line
fixed_liner   s          r   fix_filer   o  s       ~ " 0F 
dE"!,T!2h{{8,6)cTk__5_9F-dF;
G
s   BBc                   t        | t              rt        d      | j                  d      s| j                  d      r| j	                  d      dfS t        |       }	 d|v sd|v r| j	                  d      dfS | j	                  d      dfS # t        $ r Y nw xY wd	|v rd
|vr| j	                  d      dfS | j	                  d      dfS )a?  
    NOTE: Using `guess_bytes` is not the recommended way of using ftfy. ftfy
    is not designed to be an encoding detector.

    In the unfortunate situation that you have some bytes in an unknown
    encoding, ftfy can guess a reasonable strategy for decoding them, by trying
    a few common encodings that can be distinguished from each other.

    Unlike the rest of ftfy, this may not be accurate, and it may *create*
    Unicode problems instead of solving them!

    The encodings we try here are:

    - UTF-16 with a byte order mark, because a UTF-16 byte order mark looks
      like nothing else
    - UTF-8, because it's the global standard, which has been used by a
      majority of the Web since 2008
    - "utf-8-variants", or buggy implementations of UTF-8
    - MacRoman, because Microsoft Office thinks it's still a thing, and it
      can be distinguished by its line breaks. (If there are no line breaks in
      the string, though, you're out of luck.)
    - "sloppy-windows-1252", the Latin-1-like encoding that is the most common
      single-byte encoding.
    z^This string was already decoded as Unicode. You should pass bytes to guess_bytes, not Unicode.s   s   zutf-16r~   r   r   rz      
   r{   zsloppy-windows-1252)r\   r   r^   r   r   setr   )bstringbytesets     r   r   r     s    2 '31
 	

 +&'*<*<[*I~~h'11'lG7?dgo, >>"235EEE>>'*G33  w4w.~~j):55>>/02GGGs   B 8B 	BBc                    | }|D ]i  \  }}|dk(  r|j                  |      }|dk(  r|j                  |      }4|dv r%|t        v rt        |   |      }Ot        d|       t        d|        |S )u(  
    Apply a plan for fixing the encoding of text.

    The plan is a list of tuples of the form (operation, arg).

    `operation` is one of:

    - `'encode'`: convert a string to bytes, using `arg` as the encoding
    - `'decode'`: convert bytes to a string, using `arg` as the encoding
    - `'transcode'`: convert bytes to bytes, using the function named `arg`
    - `'apply'`: convert a string to a string, using the function named `arg`

    The functions that can be applied by 'transcode' and 'apply' are
    specifically those that appear in the dictionary named `FIXERS`. They
    can also can be imported from the `ftfy.fixes` module.

    Example::

        >>> mojibake = "schÃ¶n"
        >>> text, plan = fix_and_explain(mojibake)
        >>> apply_plan(mojibake, plan)
        'schön'
    ry   r   )r|   rM   zUnknown function to apply: zUnknown plan step: )ry   r   r5   
ValueError)r&   rv   obj	operationr   s        r   
apply_planr     s    0 C# @	8 **X&C("**X&C006!X&s+ #>xj!IJJ29+>??@ Jr   c                (   | D ]  }|j                         r|}n |j                  d      j                  d      }t        dj	                  t        |d      t        |      t        j                  |      t        j                  |d                    y)u  
    A utility method that's useful for debugging mysterious Unicode.

    It breaks down a string, showing you for each codepoint its number in
    hexadecimal, its glyph, its category in the Unicode standard, and its name
    in the Unicode standard.

        >>> explain_unicode('(╯°□°)╯︵ ┻━┻')
        U+0028  (       [Ps] LEFT PARENTHESIS
        U+256F  ╯       [So] BOX DRAWINGS LIGHT ARC UP AND LEFT
        U+00B0  °       [So] DEGREE SIGN
        U+25A1  □       [So] WHITE SQUARE
        U+00B0  °       [So] DEGREE SIGN
        U+0029  )       [Pe] RIGHT PARENTHESIS
        U+256F  ╯       [So] BOX DRAWINGS LIGHT ARC UP AND LEFT
        U+FE35  ︵      [Ps] PRESENTATION FORM FOR VERTICAL LEFT PARENTHESIS
        U+0020          [Zs] SPACE
        U+253B  ┻       [So] BOX DRAWINGS HEAVY UP AND HORIZONTAL
        U+2501  ━       [So] BOX DRAWINGS HEAVY HORIZONTAL
        U+253B  ┻       [So] BOX DRAWINGS HEAVY UP AND HORIZONTAL
    zunicode-escaperx   z+U+{code:04X}  {display} [{category}] {name}   z	<unknown>)displaycodecategorynameN)
isprintablery   r   printformatr   ordro   r   r   )r&   charr   s      r   explain_unicoder      s    ,  
Gkk"23::7CG9@@%gq1Y$--d3 %%dK8	 A 	

r   )rI   r7   rJ   zdict[str, Any]r   r7   )
rP   r   r&   r   rI   r7   rQ   r'   r   r   )N)r&   r   rI   TextFixerConfig | NonerJ   r   r   r   )r&   r   rI   r   rJ   r   r   r%   )r&   r   rI   r7   r   r%   )NN)
r   zTextIO | BinaryIOr   z
str | NonerI   r   rJ   r   r   zIterator[str])r   r]   r   ztuple[str, str])r&   r   rv   zlist[tuple[str, str]]r   r   )r&   r   r   None)7r!   
__future__r   ro   rD   collections.abcr   typingr   r   r   r   r	   r
   r   ftfyr   r   r   ftfy.badnessr   ftfy.formattingr   __version__okr   r%   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r"   r7   rK   r_   rT   rj   rb   rm   rs   r:   r   r   r   r   r   r#   r   r   <module>r      sl   #   $   - ,  )
 
!j !4
.J 
. (($<<,,$<< % > >,, 44 44((,,**!66  Hj HV%3( (
  (	
 	(GV 159.
9.-9.@C9.9.z 15)4
)4-)4@C)4)4Xs#
s#&s#s#n 15
-@C(  15
-@C   %)! # 	
 BDHN&R"
r   