GGSheng committed on
Commit
80b5c51
·
verified ·
1 Parent(s): 1f1827b

fix: 强制推送更新 backup.py 修复逻辑

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +1 -0
  2. .gitignore +1 -0
  3. .venv/bin/hf +1 -1
  4. .venv/bin/httpx +1 -1
  5. .venv/bin/huggingface-cli +10 -0
  6. .venv/bin/markdown-it +1 -1
  7. .venv/bin/normalizer +10 -0
  8. .venv/bin/pygmentize +1 -1
  9. .venv/bin/tiny-agents +1 -1
  10. .venv/bin/tqdm +1 -1
  11. .venv/bin/typer +1 -1
  12. .venv/lib/python3.14/site-packages/81d243bd2c585b0f4821__mypyc.cpython-314-x86_64-linux-gnu.so +3 -0
  13. .venv/lib/python3.14/site-packages/charset_normalizer-3.4.7.dist-info/INSTALLER +1 -0
  14. .venv/lib/python3.14/site-packages/charset_normalizer-3.4.7.dist-info/METADATA +808 -0
  15. .venv/lib/python3.14/site-packages/charset_normalizer-3.4.7.dist-info/RECORD +25 -0
  16. .venv/lib/python3.14/site-packages/charset_normalizer-3.4.7.dist-info/REQUESTED +0 -0
  17. .venv/lib/python3.14/site-packages/charset_normalizer-3.4.7.dist-info/WHEEL +7 -0
  18. .venv/lib/python3.14/site-packages/charset_normalizer-3.4.7.dist-info/entry_points.txt +2 -0
  19. .venv/lib/python3.14/site-packages/charset_normalizer-3.4.7.dist-info/licenses/LICENSE +21 -0
  20. .venv/lib/python3.14/site-packages/charset_normalizer-3.4.7.dist-info/top_level.txt +2 -0
  21. .venv/lib/python3.14/site-packages/charset_normalizer/__init__.py +48 -0
  22. .venv/lib/python3.14/site-packages/charset_normalizer/__main__.py +6 -0
  23. .venv/lib/python3.14/site-packages/charset_normalizer/api.py +988 -0
  24. .venv/lib/python3.14/site-packages/charset_normalizer/cd.cpython-314-x86_64-linux-gnu.so +0 -0
  25. .venv/lib/python3.14/site-packages/charset_normalizer/cd.py +454 -0
  26. .venv/lib/python3.14/site-packages/charset_normalizer/cli/__init__.py +8 -0
  27. .venv/lib/python3.14/site-packages/charset_normalizer/cli/__main__.py +362 -0
  28. .venv/lib/python3.14/site-packages/charset_normalizer/constant.py +2050 -0
  29. .venv/lib/python3.14/site-packages/charset_normalizer/legacy.py +79 -0
  30. .venv/lib/python3.14/site-packages/charset_normalizer/md.cpython-314-x86_64-linux-gnu.so +0 -0
  31. .venv/lib/python3.14/site-packages/charset_normalizer/md.py +936 -0
  32. .venv/lib/python3.14/site-packages/charset_normalizer/models.py +369 -0
  33. .venv/lib/python3.14/site-packages/charset_normalizer/py.typed +0 -0
  34. .venv/lib/python3.14/site-packages/charset_normalizer/utils.py +422 -0
  35. .venv/lib/python3.14/site-packages/charset_normalizer/version.py +8 -0
  36. .venv/lib/python3.14/site-packages/httpx-0.28.1.dist-info/RECORD +1 -1
  37. .venv/lib/python3.14/site-packages/huggingface_hub-1.14.0.dist-info/INSTALLER +1 -0
  38. .venv/lib/python3.14/site-packages/huggingface_hub-1.14.0.dist-info/METADATA +324 -0
  39. .venv/lib/python3.14/site-packages/huggingface_hub-1.14.0.dist-info/RECORD +189 -0
  40. .venv/lib/python3.14/site-packages/huggingface_hub-1.14.0.dist-info/REQUESTED +0 -0
  41. .venv/lib/python3.14/site-packages/huggingface_hub-1.14.0.dist-info/WHEEL +5 -0
  42. .venv/lib/python3.14/site-packages/huggingface_hub-1.14.0.dist-info/entry_points.txt +7 -0
  43. .venv/lib/python3.14/site-packages/huggingface_hub-1.14.0.dist-info/licenses/LICENSE +201 -0
  44. .venv/lib/python3.14/site-packages/huggingface_hub-1.14.0.dist-info/top_level.txt +1 -0
  45. .venv/lib/python3.14/site-packages/huggingface_hub/__init__.py +34 -1
  46. .venv/lib/python3.14/site-packages/huggingface_hub/_buckets.py +55 -8
  47. .venv/lib/python3.14/site-packages/huggingface_hub/_hot_reload/client.py +29 -3
  48. .venv/lib/python3.14/site-packages/huggingface_hub/_hot_reload/types.py +6 -0
  49. .venv/lib/python3.14/site-packages/huggingface_hub/_snapshot_download.py +18 -14
  50. .venv/lib/python3.14/site-packages/huggingface_hub/_space_api.py +102 -1
.gitattributes CHANGED
@@ -63,3 +63,4 @@ bt-source/panel/class/btdockerModel/config/docker_hub_repos.db filter=lfs diff=l
63
  bt-source/panel/class/projectModel/wordpress.db filter=lfs diff=lfs merge=lfs -text
64
  bt-source/panel/class/safeModel/tpl.docx filter=lfs diff=lfs merge=lfs -text
65
  bt-source/panel/config/GeoLite2-City.mmdb filter=lfs diff=lfs merge=lfs -text
 
 
63
  bt-source/panel/class/projectModel/wordpress.db filter=lfs diff=lfs merge=lfs -text
64
  bt-source/panel/class/safeModel/tpl.docx filter=lfs diff=lfs merge=lfs -text
65
  bt-source/panel/config/GeoLite2-City.mmdb filter=lfs diff=lfs merge=lfs -text
66
+ .venv/lib/python3.14/site-packages/81d243bd2c585b0f4821__mypyc.cpython-314-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
.gitignore CHANGED
@@ -10,4 +10,5 @@ __pycache__/
10
  .idea/
11
  .vscode/
12
  .claude/
 
13
 
 
10
  .idea/
11
  .vscode/
12
  .claude/
13
+ tmp/
14
 
.venv/bin/hf CHANGED
@@ -1,4 +1,4 @@
1
- #!/workspace/huggingface/hi-man/.venv/bin/python3
2
  # -*- coding: utf-8 -*-
3
  import sys
4
  from huggingface_hub.cli.hf import main
 
1
+ #!/workspace/huggingface/hi-main/.venv/bin/python
2
  # -*- coding: utf-8 -*-
3
  import sys
4
  from huggingface_hub.cli.hf import main
.venv/bin/httpx CHANGED
@@ -1,4 +1,4 @@
1
- #!/workspace/huggingface/hi-man/.venv/bin/python3
2
  # -*- coding: utf-8 -*-
3
  import sys
4
  from httpx import main
 
1
+ #!/workspace/huggingface/hi-main/.venv/bin/python3
2
  # -*- coding: utf-8 -*-
3
  import sys
4
  from httpx import main
.venv/bin/huggingface-cli ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/workspace/huggingface/hi-main/.venv/bin/python
2
+ # -*- coding: utf-8 -*-
3
+ import sys
4
+ from huggingface_hub.cli.deprecated_cli import main
5
+ if __name__ == "__main__":
6
+ if sys.argv[0].endswith("-script.pyw"):
7
+ sys.argv[0] = sys.argv[0][:-11]
8
+ elif sys.argv[0].endswith(".exe"):
9
+ sys.argv[0] = sys.argv[0][:-4]
10
+ sys.exit(main())
.venv/bin/markdown-it CHANGED
@@ -1,4 +1,4 @@
1
- #!/workspace/huggingface/hi-man/.venv/bin/python3
2
  # -*- coding: utf-8 -*-
3
  import sys
4
  from markdown_it.cli.parse import main
 
1
+ #!/workspace/huggingface/hi-main/.venv/bin/python3
2
  # -*- coding: utf-8 -*-
3
  import sys
4
  from markdown_it.cli.parse import main
.venv/bin/normalizer ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/workspace/huggingface/hi-main/.venv/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ import sys
4
+ from charset_normalizer.cli import cli_detect
5
+ if __name__ == "__main__":
6
+ if sys.argv[0].endswith("-script.pyw"):
7
+ sys.argv[0] = sys.argv[0][:-11]
8
+ elif sys.argv[0].endswith(".exe"):
9
+ sys.argv[0] = sys.argv[0][:-4]
10
+ sys.exit(cli_detect())
.venv/bin/pygmentize CHANGED
@@ -1,4 +1,4 @@
1
- #!/workspace/huggingface/hi-man/.venv/bin/python3
2
  # -*- coding: utf-8 -*-
3
  import sys
4
  from pygments.cmdline import main
 
1
+ #!/workspace/huggingface/hi-main/.venv/bin/python3
2
  # -*- coding: utf-8 -*-
3
  import sys
4
  from pygments.cmdline import main
.venv/bin/tiny-agents CHANGED
@@ -1,4 +1,4 @@
1
- #!/workspace/huggingface/hi-man/.venv/bin/python3
2
  # -*- coding: utf-8 -*-
3
  import sys
4
  from huggingface_hub.inference._mcp.cli import app
 
1
+ #!/workspace/huggingface/hi-main/.venv/bin/python
2
  # -*- coding: utf-8 -*-
3
  import sys
4
  from huggingface_hub.inference._mcp.cli import app
.venv/bin/tqdm CHANGED
@@ -1,4 +1,4 @@
1
- #!/workspace/huggingface/hi-man/.venv/bin/python3
2
  # -*- coding: utf-8 -*-
3
  import sys
4
  from tqdm.cli import main
 
1
+ #!/workspace/huggingface/hi-main/.venv/bin/python3
2
  # -*- coding: utf-8 -*-
3
  import sys
4
  from tqdm.cli import main
.venv/bin/typer CHANGED
@@ -1,4 +1,4 @@
1
- #!/workspace/huggingface/hi-man/.venv/bin/python3
2
  # -*- coding: utf-8 -*-
3
  import sys
4
  from typer.cli import main
 
1
+ #!/workspace/huggingface/hi-main/.venv/bin/python3
2
  # -*- coding: utf-8 -*-
3
  import sys
4
  from typer.cli import main
.venv/lib/python3.14/site-packages/81d243bd2c585b0f4821__mypyc.cpython-314-x86_64-linux-gnu.so ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:128abe84767022efa02b02c588bd2ec1955c5aaa22f6fdc655ae690f9592dec1
3
+ size 433360
.venv/lib/python3.14/site-packages/charset_normalizer-3.4.7.dist-info/INSTALLER ADDED
@@ -0,0 +1 @@
 
 
1
+ uv
.venv/lib/python3.14/site-packages/charset_normalizer-3.4.7.dist-info/METADATA ADDED
@@ -0,0 +1,808 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Metadata-Version: 2.4
2
+ Name: charset-normalizer
3
+ Version: 3.4.7
4
+ Summary: The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet.
5
+ Author-email: "Ahmed R. TAHRI" <tahri.ahmed@proton.me>
6
+ Maintainer-email: "Ahmed R. TAHRI" <tahri.ahmed@proton.me>
7
+ License: MIT
8
+ Project-URL: Changelog, https://github.com/jawah/charset_normalizer/blob/master/CHANGELOG.md
9
+ Project-URL: Documentation, https://charset-normalizer.readthedocs.io/
10
+ Project-URL: Code, https://github.com/jawah/charset_normalizer
11
+ Project-URL: Issue tracker, https://github.com/jawah/charset_normalizer/issues
12
+ Keywords: encoding,charset,charset-detector,detector,normalization,unicode,chardet,detect
13
+ Classifier: Development Status :: 5 - Production/Stable
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: Operating System :: OS Independent
16
+ Classifier: Programming Language :: Python
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3.7
19
+ Classifier: Programming Language :: Python :: 3.8
20
+ Classifier: Programming Language :: Python :: 3.9
21
+ Classifier: Programming Language :: Python :: 3.10
22
+ Classifier: Programming Language :: Python :: 3.11
23
+ Classifier: Programming Language :: Python :: 3.12
24
+ Classifier: Programming Language :: Python :: 3.13
25
+ Classifier: Programming Language :: Python :: 3.14
26
+ Classifier: Programming Language :: Python :: 3 :: Only
27
+ Classifier: Programming Language :: Python :: Implementation :: CPython
28
+ Classifier: Programming Language :: Python :: Implementation :: PyPy
29
+ Classifier: Programming Language :: Python :: Free Threading :: 4 - Resilient
30
+ Classifier: Topic :: Text Processing :: Linguistic
31
+ Classifier: Topic :: Utilities
32
+ Classifier: Typing :: Typed
33
+ Requires-Python: >=3.7
34
+ Description-Content-Type: text/markdown
35
+ License-File: LICENSE
36
+ Provides-Extra: unicode-backport
37
+ Dynamic: license-file
38
+
39
+ <h1 align="center">Charset Detection, for Everyone 👋</h1>
40
+
41
+ <p align="center">
42
+ <sup>The Real First Universal Charset Detector</sup><br>
43
+ <a href="https://pypi.org/project/charset-normalizer">
44
+ <img src="https://img.shields.io/pypi/pyversions/charset_normalizer.svg?orange=blue" />
45
+ </a>
46
+ <a href="https://pepy.tech/project/charset-normalizer/">
47
+ <img alt="Download Count Total" src="https://static.pepy.tech/badge/charset-normalizer/month" />
48
+ </a>
49
+ <a href="https://bestpractices.coreinfrastructure.org/projects/7297">
50
+ <img src="https://bestpractices.coreinfrastructure.org/projects/7297/badge">
51
+ </a>
52
+ </p>
53
+ <p align="center">
54
+ <sup><i>Featured Packages</i></sup><br>
55
+ <a href="https://github.com/jawah/niquests">
56
+ <img alt="Static Badge" src="https://img.shields.io/badge/Niquests-Most_Advanced_HTTP_Client-cyan">
57
+ </a>
58
+ <a href="https://github.com/jawah/wassima">
59
+ <img alt="Static Badge" src="https://img.shields.io/badge/Wassima-Certifi_Replacement-cyan">
60
+ </a>
61
+ </p>
62
+ <p align="center">
63
+ <sup><i>In other language (unofficial port - by the community)</i></sup><br>
64
+ <a href="https://github.com/nickspring/charset-normalizer-rs">
65
+ <img alt="Static Badge" src="https://img.shields.io/badge/Rust-red">
66
+ </a>
67
+ </p>
68
+
69
+ > A library that helps you read text from an unknown charset encoding.<br /> Motivated by `chardet`,
70
+ > I'm trying to resolve the issue by taking a new approach.
71
+ > All IANA character set names for which the Python core library provides codecs are supported.
72
+ > You can also register your own set of codecs, and yes, it would work as-is.
73
+
74
+ <p align="center">
75
+ >>>>> <a href="https://charsetnormalizerweb.ousret.now.sh" target="_blank">👉 Try Me Online Now, Then Adopt Me 👈 </a> <<<<<
76
+ </p>
77
+
78
+ This project offers you an alternative to **Universal Charset Encoding Detector**, also known as **Chardet**.
79
+
80
+ | Feature | [Chardet](https://github.com/chardet/chardet) | Charset Normalizer | [cChardet](https://github.com/PyYoshi/cChardet) |
81
+ |--------------------------------------------------|:---------------------------------------------:|:-----------------------------------------------------------------------------------------------:|:-----------------------------------------------:|
82
+ | `Fast` | ✅ | ✅ | ✅ |
83
+ | `Universal`[^1] | ❌ | ✅ | ❌ |
84
+ | `Reliable` **without** distinguishable standards | ✅ | ✅ | ✅ |
85
+ | `Reliable` **with** distinguishable standards | ✅ | ✅ | ✅ |
86
+ | `License` | _Disputed_[^2]<br>_restrictive_ | MIT | MPL-1.1<br>_restrictive_ |
87
+ | `Native Python` | ✅ | ✅ | ❌ |
88
+ | `Detect spoken language` | ✅ | ✅ | N/A |
89
+ | `UnicodeDecodeError Safety` | ✅ | ✅ | ❌ |
90
+ | `Whl Size (min)` | 500 kB | 150 kB | ~200 kB |
91
+ | `Supported Encoding` | 99 | [99](https://charset-normalizer.readthedocs.io/en/latest/user/support.html#supported-encodings) | 40 |
92
+ | `Can register custom encoding` | ❌ | ✅ | ❌ |
93
+
94
+ <p align="center">
95
+ <img src="https://i.imgflip.com/373iay.gif" alt="Reading Normalized Text" width="226"/><img src="https://media.tenor.com/images/c0180f70732a18b4965448d33adba3d0/tenor.gif" alt="Cat Reading Text" width="200"/>
96
+ </p>
97
+
98
+ [^1]: They are clearly using specific code for a specific encoding even if covering most of used one.
99
+ [^2]: Chardet 7.0+ was relicensed from LGPL-2.1 to MIT following an AI-assisted rewrite. This relicensing is disputed on two independent grounds: **(a)** the original author [contests](https://github.com/chardet/chardet/issues/327) that the maintainer had the right to relicense, arguing the rewrite is a derivative work of the LGPL-licensed codebase since it was not a clean room implementation; **(b)** the copyright claim itself is [questionable](https://github.com/chardet/chardet/issues/334) given the code was primarily generated by an LLM, and AI-generated output may not be copyrightable under most jurisdictions. Either issue alone could undermine the MIT license. Beyond licensing, the rewrite raises questions about responsible use of AI in open source: key architectural ideas pioneered by charset-normalizer - notably decode-first validity filtering (our foundational approach since v1) and encoding pairwise similarity with the same algorithm and threshold — surfaced in chardet 7 without acknowledgment. The project also imported test files from charset-normalizer to train and benchmark against it, then claimed superior accuracy on those very files. Charset-normalizer has always been MIT-licensed, encoding-agnostic by design, and built on a verifiable human-authored history.
100
+
101
+ ## ⚡ Performance
102
+
103
+ This package offers better performance (at the 99th and 95th percentiles) than Chardet. Here are some numbers.
104
+
105
+ | Package | Accuracy | Mean per file (ms) | File per sec (est) |
106
+ |---------------------------------------------------|:--------:|:------------------:|:------------------:|
107
+ | [chardet 7.1](https://github.com/chardet/chardet) | 89 % | 3 ms | 333 file/sec |
108
+ | charset-normalizer | **97 %** | 3 ms | 333 file/sec |
109
+
110
+ | Package | 99th percentile | 95th percentile | 50th percentile |
111
+ |---------------------------------------------------|:---------------:|:---------------:|:---------------:|
112
+ | [chardet 7.1](https://github.com/chardet/chardet) | 32 ms | 17 ms | < 1 ms |
113
+ | charset-normalizer | 16 ms | 10 ms | 1 ms |
114
+
115
+ _updated as of March 2026 using CPython 3.12, Charset-Normalizer 3.4.6, and Chardet 7.1.0_
116
+
117
+ ~Chardet's performance on larger file (1MB+) are very poor. Expect huge difference on large payload.~ No longer the case since Chardet 7.0+
118
+
119
+ > Stats are generated using 400+ files using default parameters. More details on used files, see GHA workflows.
120
+ > And yes, these results might change at any time. The dataset can be updated to include more files.
121
+ > The actual delays heavily depend on your CPU capabilities. The factors should remain the same.
122
+ > Chardet claims in its documentation to have greater accuracy than us, based on the dataset they trained Chardet on(...)
123
+ > Well, it's normal, the opposite would have been worrying. Whereas charset-normalizer doesn't train on anything, our solution
124
+ > is based on a completely different algorithm, still heuristic though, and it does not need weights for every encoding table.
125
+
126
+ ## ✨ Installation
127
+
128
+ Using pip:
129
+
130
+ ```sh
131
+ pip install charset-normalizer -U
132
+ ```
133
+
134
+ ## 🚀 Basic Usage
135
+
136
+ ### CLI
137
+ This package comes with a CLI.
138
+
139
+ ```
140
+ usage: normalizer [-h] [-v] [-a] [-n] [-m] [-r] [-f] [-t THRESHOLD]
141
+ file [file ...]
142
+
143
+ The Real First Universal Charset Detector. Discover originating encoding used
144
+ on text file. Normalize text to unicode.
145
+
146
+ positional arguments:
147
+ files File(s) to be analysed
148
+
149
+ optional arguments:
150
+ -h, --help show this help message and exit
151
+ -v, --verbose Display complementary information about file if any.
152
+ Stdout will contain logs about the detection process.
153
+ -a, --with-alternative
154
+ Output complementary possibilities if any. Top-level
155
+ JSON WILL be a list.
156
+ -n, --normalize Permit to normalize input file. If not set, program
157
+ does not write anything.
158
+ -m, --minimal Only output the charset detected to STDOUT. Disabling
159
+ JSON output.
160
+ -r, --replace Replace file when trying to normalize it instead of
161
+ creating a new one.
162
+ -f, --force Replace file without asking if you are sure, use this
163
+ flag with caution.
164
+ -t THRESHOLD, --threshold THRESHOLD
165
+ Define a custom maximum amount of chaos allowed in
166
+ decoded content. 0. <= chaos <= 1.
167
+ --version Show version information and exit.
168
+ ```
169
+
170
+ ```bash
171
+ normalizer ./data/sample.1.fr.srt
172
+ ```
173
+
174
+ or
175
+
176
+ ```bash
177
+ python -m charset_normalizer ./data/sample.1.fr.srt
178
+ ```
179
+
180
+ 🎉 Since version 1.4.0, the CLI produces an easily usable result on stdout in JSON format.
181
+
182
+ ```json
183
+ {
184
+ "path": "/home/default/projects/charset_normalizer/data/sample.1.fr.srt",
185
+ "encoding": "cp1252",
186
+ "encoding_aliases": [
187
+ "1252",
188
+ "windows_1252"
189
+ ],
190
+ "alternative_encodings": [
191
+ "cp1254",
192
+ "cp1256",
193
+ "cp1258",
194
+ "iso8859_14",
195
+ "iso8859_15",
196
+ "iso8859_16",
197
+ "iso8859_3",
198
+ "iso8859_9",
199
+ "latin_1",
200
+ "mbcs"
201
+ ],
202
+ "language": "French",
203
+ "alphabets": [
204
+ "Basic Latin",
205
+ "Latin-1 Supplement"
206
+ ],
207
+ "has_sig_or_bom": false,
208
+ "chaos": 0.149,
209
+ "coherence": 97.152,
210
+ "unicode_path": null,
211
+ "is_preferred": true
212
+ }
213
+ ```
214
+
215
+ ### Python
216
+ *Just print out normalized text*
217
+ ```python
218
+ from charset_normalizer import from_path
219
+
220
+ results = from_path('./my_subtitle.srt')
221
+
222
+ print(str(results.best()))
223
+ ```
224
+
225
+ *Upgrade your code without effort*
226
+ ```python
227
+ from charset_normalizer import detect
228
+ ```
229
+
230
+ The above code will behave the same as **chardet**. We ensure that we offer the best (reasonable) BC result possible.
231
+
232
+ See the docs for advanced usage : [readthedocs.io](https://charset-normalizer.readthedocs.io/en/latest/)
233
+
234
+ ## 😇 Why
235
+
236
+ When I started using Chardet, I noticed that it was not suited to my expectations, and I wanted to propose a
237
+ reliable alternative using a completely different method. Also! I never back down on a good challenge!
238
+
239
+ I **don't care** about the **originating charset** encoding, because **two different tables** can
240
+ produce **two identical rendered strings.**
241
+ What I want is to get readable text, the best I can.
242
+
243
+ In a way, **I'm brute forcing text decoding.** How cool is that ? 😎
244
+
245
+ Don't confuse the **ftfy** package with charset-normalizer or chardet. ftfy's goal is to repair Unicode strings, whereas charset-normalizer's goal is to convert raw files in an unknown encoding to Unicode.
246
+
247
+ ## 🍰 How
248
+
249
+ - Discard all charset encoding tables that could not fit the binary content.
250
+ - Measure noise, or the mess once opened (by chunks) with a corresponding charset encoding.
251
+ - Extract matches with the lowest mess detected.
252
+ - Additionally, we measure coherence / probe for a language.
253
+
254
+ **Wait a minute**, what is noise/mess and coherence according to **YOU ?**
255
+
256
+ *Noise :* I opened hundreds of text files, **written by humans**, with the wrong encoding table. **I observed**, then
257
+ **I established** some ground rules about **what is obvious** when **it seems like** a mess (aka. defining noise in rendered text).
258
+ I know that my interpretation of what is noise is probably incomplete, feel free to contribute in order to
259
+ improve or rewrite it.
260
+
261
+ *Coherence :* For each language there is on earth, we have computed ranked letter appearance occurrences (the best we can). So I thought
262
+ that intel is worth something here. So I use those records against decoded text to check if I can detect intelligent design.
263
+
264
+ ## ⚡ Known limitations
265
+
266
+ - Language detection is unreliable when text contains two or more languages sharing identical letters. (eg. HTML (english tags) + Turkish content (Sharing Latin characters))
267
+ - Every charset detector heavily depends on sufficient content. In common cases, do not bother running detection on very tiny content.
268
+
269
+ ## ⚠️ About Python EOLs
270
+
271
+ **If you are running:**
272
+
273
+ - Python >=2.7,<3.5: Unsupported
274
+ - Python 3.5: charset-normalizer < 2.1
275
+ - Python 3.6: charset-normalizer < 3.1
276
+
277
+ Upgrade your Python interpreter as soon as possible.
278
+
279
+ ## 👤 Contributing
280
+
281
+ Contributions, issues and feature requests are very much welcome.<br />
282
+ Feel free to check [issues page](https://github.com/ousret/charset_normalizer/issues) if you want to contribute.
283
+
284
+ ## 📝 License
285
+
286
+ Copyright © [Ahmed TAHRI @Ousret](https://github.com/Ousret).<br />
287
+ This project is [MIT](https://github.com/Ousret/charset_normalizer/blob/master/LICENSE) licensed.
288
+
289
+ Characters frequencies used in this project © 2012 [Denny Vrandečić](http://simia.net/letters/)
290
+
291
+ ## 💼 For Enterprise
292
+
293
+ Professional support for charset-normalizer is available as part of the [Tidelift
294
+ Subscription][1]. Tidelift gives software development teams a single source for
295
+ purchasing and maintaining their software, with professional grade assurances
296
+ from the experts who know it best, while seamlessly integrating with existing
297
+ tools.
298
+
299
+ [1]: https://tidelift.com/subscription/pkg/pypi-charset-normalizer?utm_source=pypi-charset-normalizer&utm_medium=readme
300
+
301
+ [![OpenSSF Best Practices](https://www.bestpractices.dev/projects/7297/badge)](https://www.bestpractices.dev/projects/7297)
302
+
303
+ # Changelog
304
+ All notable changes to charset-normalizer will be documented in this file. This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
305
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
306
+
307
+ ## [3.4.7](https://github.com/Ousret/charset_normalizer/compare/3.4.6...3.4.7) (2026-04-02)
308
+
309
+ ### Changed
310
+ - Pre-built optimized version using mypy[c] v1.20.
311
+ - Relax `setuptools` constraint to `setuptools>=68,<82.1`.
312
+
313
+ ### Fixed
314
+ - Correctly remove SIG remnant in utf-7 decoded string. (#718) (#716)
315
+
316
+ ## [3.4.6](https://github.com/Ousret/charset_normalizer/compare/3.4.5...3.4.6) (2026-03-15)
317
+
318
+ ### Changed
319
+ - Flattened the logic in `charset_normalizer.md` for higher performance. Removed `eligible(..)` and `feed(...)`
320
+ in favor of `feed_info(...)`.
321
+ - Raised upper bound for mypy[c] to 1.20, for our optimized version.
322
+ - Updated `UNICODE_RANGES_COMBINED` using Unicode blocks v17.
323
+
324
+ ### Fixed
325
+ - Edge case where noise difference between two candidates can be almost insignificant. (#672)
326
+ - CLI `--normalize` writing to wrong path when passing multiple files in. (#702)
327
+
328
+ ### Misc
329
+ - Freethreaded pre-built wheels now shipped in PyPI starting with 3.14t. (#616)
330
+
331
+ ## [3.4.5](https://github.com/Ousret/charset_normalizer/compare/3.4.4...3.4.5) (2026-03-06)
332
+
333
+ ### Changed
334
+ - Update `setuptools` constraint to `setuptools>=68,<=82`.
335
+ - Raised upper bound of mypyc for the optional pre-built extension to v1.19.1
336
+
337
+ ### Fixed
338
+ - Add explicit link to lib math in our optimized build. (#692)
339
+ - Logger level not restored correctly for empty byte sequences. (#701)
340
+ - TypeError when passing bytearray to from_bytes. (#703)
341
+
342
+ ### Misc
343
+ - Applied safe micro-optimizations in both our noise detector and language detector.
344
+ - Rewrote the `query_yes_no` function (inside CLI) to avoid using ambiguous licensed code.
345
+ - Added `cd.py` submodule into mypyc optional compilation to reduce further the performance impact.
346
+
347
+ ## [3.4.4](https://github.com/Ousret/charset_normalizer/compare/3.4.2...3.4.4) (2025-10-13)
348
+
349
+ ### Changed
350
+ - Bound `setuptools` to a specific constraint `setuptools>=68,<=81`.
351
+ - Raised upper bound of mypyc for the optional pre-built extension to v1.18.2
352
+
353
+ ### Removed
354
+ - `setuptools-scm` as a build dependency.
355
+
356
+ ### Misc
357
+ - Enforced hashes in `dev-requirements.txt` and created `ci-requirements.txt` for security purposes.
358
+ - Additional pre-built wheels for riscv64, s390x, and armv7l architectures.
359
+ - Restore `multiple.intoto.jsonl` in GitHub releases in addition to individual attestation file per wheel.
360
+
361
+ ## [3.4.3](https://github.com/Ousret/charset_normalizer/compare/3.4.2...3.4.3) (2025-08-09)
362
+
363
+ ### Changed
364
+ - mypy(c) is no longer a required dependency at build time if `CHARSET_NORMALIZER_USE_MYPYC` isn't set to `1`. (#595) (#583)
365
+ - automatically lower confidence on small bytes samples that are not Unicode in `detect` output legacy function. (#391)
366
+
367
+ ### Added
368
+ - Custom build backend to overcome inability to mark mypy as an optional dependency in the build phase.
369
+ - Support for Python 3.14
370
+
371
+ ### Fixed
372
+ - sdist archive contained useless directories.
373
+ - automatically fallback on valid UTF-16 or UTF-32 even if the md says it's noisy. (#633)
374
+
375
+ ### Misc
376
+ - SBOM are automatically published to the relevant GitHub release to comply with regulatory changes.
377
+ Each published wheel comes with its SBOM. We choose CycloneDX as the format.
378
+ - Prebuilt optimized wheels are no longer distributed by default for CPython 3.7 due to a change in cibuildwheel.
379
+
380
+ ## [3.4.2](https://github.com/Ousret/charset_normalizer/compare/3.4.1...3.4.2) (2025-05-02)
381
+
382
+ ### Fixed
383
+ - Addressed the DeprecationWarning in our CLI regarding `argparse.FileType` by backporting the target class into the package. (#591)
384
+ - Improved the overall reliability of the detector with CJK Ideographs. (#605) (#587)
385
+
386
+ ### Changed
387
+ - Optional mypyc compilation upgraded to version 1.15 for Python >= 3.8
388
+
389
+ ## [3.4.1](https://github.com/Ousret/charset_normalizer/compare/3.4.0...3.4.1) (2024-12-24)
390
+
391
+ ### Changed
392
+ - Project metadata are now stored using `pyproject.toml` instead of `setup.cfg` using setuptools as the build backend.
393
+ - Enforce annotation delayed loading for a simpler and consistent types in the project.
394
+ - Optional mypyc compilation upgraded to version 1.14 for Python >= 3.8
395
+
396
+ ### Added
397
+ - pre-commit configuration.
398
+ - noxfile.
399
+
400
+ ### Removed
401
+ - `build-requirements.txt` as per using `pyproject.toml` native build configuration.
402
+ - `bin/integration.py` and `bin/serve.py` in favor of downstream integration test (see noxfile).
403
+ - `setup.cfg` in favor of `pyproject.toml` metadata configuration.
404
+ - Unused `utils.range_scan` function.
405
+
406
+ ### Fixed
407
+ - Converting content to Unicode bytes may insert `utf_8` instead of preferred `utf-8`. (#572)
408
+ - Deprecation warning "'count' is passed as positional argument" when converting to Unicode bytes on Python 3.13+
409
+
410
+ ## [3.4.0](https://github.com/Ousret/charset_normalizer/compare/3.3.2...3.4.0) (2024-10-08)
411
+
412
+ ### Added
413
+ - Argument `--no-preemptive` in the CLI to prevent the detector to search for hints.
414
+ - Support for Python 3.13 (#512)
415
+
416
+ ### Fixed
417
+ - Relax the TypeError exception thrown when trying to compare a CharsetMatch with anything else than a CharsetMatch.
418
+ - Improved the general reliability of the detector based on user feedbacks. (#520) (#509) (#498) (#407) (#537)
419
+ - Declared charset in content (preemptive detection) not changed when converting to utf-8 bytes. (#381)
420
+
421
+ ## [3.3.2](https://github.com/Ousret/charset_normalizer/compare/3.3.1...3.3.2) (2023-10-31)
422
+
423
+ ### Fixed
424
+ - Unintentional memory usage regression when using large payload that match several encoding (#376)
425
+ - Regression on some detection case showcased in the documentation (#371)
426
+
427
+ ### Added
428
+ - Noise (md) probe that identifies malformed Arabic representation due to the presence of letters in isolated form (credit to my wife)
429
+
430
+ ## [3.3.1](https://github.com/Ousret/charset_normalizer/compare/3.3.0...3.3.1) (2023-10-22)
431
+
432
+ ### Changed
433
+ - Optional mypyc compilation upgraded to version 1.6.1 for Python >= 3.8
434
+ - Improved the general detection reliability based on reports from the community
435
+
436
+ ## [3.3.0](https://github.com/Ousret/charset_normalizer/compare/3.2.0...3.3.0) (2023-09-30)
437
+
438
+ ### Added
439
+ - Allow to execute the CLI (e.g. normalizer) through `python -m charset_normalizer.cli` or `python -m charset_normalizer`
440
+ - Support for 9 forgotten encodings that are supported by Python but unlisted in `encoding.aliases` as they have no alias (#323)
441
+
442
+ ### Removed
443
+ - (internal) Redundant utils.is_ascii function and unused function is_private_use_only
444
+ - (internal) charset_normalizer.assets is moved inside charset_normalizer.constant
445
+
446
+ ### Changed
447
+ - (internal) Unicode code blocks in constants are updated using the latest v15.0.0 definition to improve detection
448
+ - Optional mypyc compilation upgraded to version 1.5.1 for Python >= 3.8
449
+
450
+ ### Fixed
451
+ - Unable to properly sort CharsetMatch when both chaos/noise and coherence were close due to an unreachable condition in \_\_lt\_\_ (#350)
452
+
453
+ ## [3.2.0](https://github.com/Ousret/charset_normalizer/compare/3.1.0...3.2.0) (2023-06-07)
454
+
455
+ ### Changed
456
+ - Typehint for function `from_path` no longer enforces `PathLike` as its first argument
457
+ - Minor improvement over the global detection reliability
458
+
459
+ ### Added
460
+ - Introduce function `is_binary` that relies on main capabilities, and optimized to detect binaries
461
+ - Propagate `enable_fallback` argument throughout `from_bytes`, `from_path`, and `from_fp` that allow a deeper control over the detection (default True)
462
+ - Explicit support for Python 3.12
463
+
464
+ ### Fixed
465
+ - Edge case detection failure where a file would contain 'very-long' camel cased word (Issue #289)
466
+
467
+ ## [3.1.0](https://github.com/Ousret/charset_normalizer/compare/3.0.1...3.1.0) (2023-03-06)
468
+
469
+ ### Added
470
+ - Argument `should_rename_legacy` for legacy function `detect` and disregard any new arguments without errors (PR #262)
471
+
472
+ ### Removed
473
+ - Support for Python 3.6 (PR #260)
474
+
475
+ ### Changed
476
+ - Optional speedup provided by mypy/c 1.0.1
477
+
478
+ ## [3.0.1](https://github.com/Ousret/charset_normalizer/compare/3.0.0...3.0.1) (2022-11-18)
479
+
480
+ ### Fixed
481
+ - Multi-bytes cutter/chunk generator did not always cut correctly (PR #233)
482
+
483
+ ### Changed
484
+ - Speedup provided by mypy/c 0.990 on Python >= 3.7
485
+
486
+ ## [3.0.0](https://github.com/Ousret/charset_normalizer/compare/2.1.1...3.0.0) (2022-10-20)
487
+
488
+ ### Added
489
+ - Extend the capability of explain=True when cp_isolation contains at most two entries (min one), will log in details of the Mess-detector results
490
+ - Support for alternative language frequency set in charset_normalizer.assets.FREQUENCIES
491
+ - Add parameter `language_threshold` in `from_bytes`, `from_path` and `from_fp` to adjust the minimum expected coherence ratio
492
+ - `normalizer --version` now specify if current version provide extra speedup (meaning mypyc compilation whl)
493
+
494
+ ### Changed
495
+ - Build with static metadata using 'build' frontend
496
+ - Make the language detection stricter
497
+ - Optional: Module `md.py` can be compiled using Mypyc to provide an extra speedup up to 4x faster than v2.1
498
+
499
+ ### Fixed
500
+ - CLI with opt --normalize fail when using full path for files
501
+ - TooManyAccentuatedPlugin induce false positive on the mess detection when too few alpha character have been fed to it
502
+ - Sphinx warnings when generating the documentation
503
+
504
+ ### Removed
505
+ - Coherence detector no longer return 'Simple English' instead return 'English'
506
+ - Coherence detector no longer return 'Classical Chinese' instead return 'Chinese'
507
+ - Breaking: Method `first()` and `best()` from CharsetMatch
508
+ - UTF-7 will no longer appear as "detected" without a recognized SIG/mark (is unreliable/conflict with ASCII)
509
+ - Breaking: Class aliases CharsetDetector, CharsetDoctor, CharsetNormalizerMatch and CharsetNormalizerMatches
510
+ - Breaking: Top-level function `normalize`
511
+ - Breaking: Properties `chaos_secondary_pass`, `coherence_non_latin` and `w_counter` from CharsetMatch
512
+ - Support for the backport `unicodedata2`
513
+
514
+ ## [3.0.0rc1](https://github.com/Ousret/charset_normalizer/compare/3.0.0b2...3.0.0rc1) (2022-10-18)
515
+
516
+ ### Added
517
+ - Extend the capability of explain=True when cp_isolation contains at most two entries (min one), will log in details of the Mess-detector results
518
+ - Support for alternative language frequency set in charset_normalizer.assets.FREQUENCIES
519
+ - Add parameter `language_threshold` in `from_bytes`, `from_path` and `from_fp` to adjust the minimum expected coherence ratio
520
+
521
+ ### Changed
522
+ - Build with static metadata using 'build' frontend
523
+ - Make the language detection stricter
524
+
525
+ ### Fixed
526
+ - CLI with opt --normalize fail when using full path for files
527
+ - TooManyAccentuatedPlugin induce false positive on the mess detection when too few alpha character have been fed to it
528
+
529
+ ### Removed
530
+ - Coherence detector no longer return 'Simple English' instead return 'English'
531
+ - Coherence detector no longer return 'Classical Chinese' instead return 'Chinese'
532
+
533
+ ## [3.0.0b2](https://github.com/Ousret/charset_normalizer/compare/3.0.0b1...3.0.0b2) (2022-08-21)
534
+
535
+ ### Added
536
+ - `normalizer --version` now specify if current version provide extra speedup (meaning mypyc compilation whl)
537
+
538
+ ### Removed
539
+ - Breaking: Method `first()` and `best()` from CharsetMatch
540
+ - UTF-7 will no longer appear as "detected" without a recognized SIG/mark (is unreliable/conflict with ASCII)
541
+
542
+ ### Fixed
543
+ - Sphinx warnings when generating the documentation
544
+
545
+ ## [3.0.0b1](https://github.com/Ousret/charset_normalizer/compare/2.1.0...3.0.0b1) (2022-08-15)
546
+
547
+ ### Changed
548
+ - Optional: Module `md.py` can be compiled using Mypyc to provide an extra speedup up to 4x faster than v2.1
549
+
550
+ ### Removed
551
+ - Breaking: Class aliases CharsetDetector, CharsetDoctor, CharsetNormalizerMatch and CharsetNormalizerMatches
552
+ - Breaking: Top-level function `normalize`
553
+ - Breaking: Properties `chaos_secondary_pass`, `coherence_non_latin` and `w_counter` from CharsetMatch
554
+ - Support for the backport `unicodedata2`
555
+
556
+ ## [2.1.1](https://github.com/Ousret/charset_normalizer/compare/2.1.0...2.1.1) (2022-08-19)
557
+
558
+ ### Deprecated
559
+ - Function `normalize` scheduled for removal in 3.0
560
+
561
+ ### Changed
562
+ - Removed useless call to decode in fn is_unprintable (#206)
563
+
564
+ ### Fixed
565
+ - Third-party library (i18n xgettext) crashing not recognizing utf_8 (PEP 263) with underscore from [@aleksandernovikov](https://github.com/aleksandernovikov) (#204)
566
+
567
+ ## [2.1.0](https://github.com/Ousret/charset_normalizer/compare/2.0.12...2.1.0) (2022-06-19)
568
+
569
+ ### Added
570
+ - Output the Unicode table version when running the CLI with `--version` (PR #194)
571
+
572
+ ### Changed
573
+ - Re-use decoded buffer for single byte character sets from [@nijel](https://github.com/nijel) (PR #175)
574
+ - Fixing some performance bottlenecks from [@deedy5](https://github.com/deedy5) (PR #183)
575
+
576
+ ### Fixed
577
+ - Workaround potential bug in cpython with Zero Width No-Break Space located in Arabic Presentation Forms-B, Unicode 1.1 not acknowledged as space (PR #175)
578
+ - CLI default threshold aligned with the API threshold from [@oleksandr-kuzmenko](https://github.com/oleksandr-kuzmenko) (PR #181)
579
+
580
+ ### Removed
581
+ - Support for Python 3.5 (PR #192)
582
+
583
+ ### Deprecated
584
+ - Use of backport unicodedata from `unicodedata2` as Python is quickly catching up, scheduled for removal in 3.0 (PR #194)
585
+
586
+ ## [2.0.12](https://github.com/Ousret/charset_normalizer/compare/2.0.11...2.0.12) (2022-02-12)
587
+
588
+ ### Fixed
589
+ - ASCII miss-detection on rare cases (PR #170)
590
+
591
+ ## [2.0.11](https://github.com/Ousret/charset_normalizer/compare/2.0.10...2.0.11) (2022-01-30)
592
+
593
+ ### Added
594
+ - Explicit support for Python 3.11 (PR #164)
595
+
596
+ ### Changed
597
+ - The logging behavior has been completely reviewed, now using only TRACE and DEBUG levels (PR #163 #165)
598
+
599
+ ## [2.0.10](https://github.com/Ousret/charset_normalizer/compare/2.0.9...2.0.10) (2022-01-04)
600
+
601
+ ### Fixed
602
+ - Fallback match entries might lead to UnicodeDecodeError for large bytes sequence (PR #154)
603
+
604
+ ### Changed
605
+ - Skipping the language-detection (CD) on ASCII (PR #155)
606
+
607
+ ## [2.0.9](https://github.com/Ousret/charset_normalizer/compare/2.0.8...2.0.9) (2021-12-03)
608
+
609
+ ### Changed
610
+ - Moderating the logging impact (since 2.0.8) for specific environments (PR #147)
611
+
612
+ ### Fixed
613
+ - Wrong logging level applied when setting kwarg `explain` to True (PR #146)
614
+
615
+ ## [2.0.8](https://github.com/Ousret/charset_normalizer/compare/2.0.7...2.0.8) (2021-11-24)
616
+ ### Changed
617
+ - Improvement over Vietnamese detection (PR #126)
618
+ - MD improvement on trailing data and long foreign (non-pure latin) data (PR #124)
619
+ - Efficiency improvements in cd/alphabet_languages from [@adbar](https://github.com/adbar) (PR #122)
620
+ - call sum() without an intermediary list following PEP 289 recommendations from [@adbar](https://github.com/adbar) (PR #129)
621
+ - Code style as refactored by Sourcery-AI (PR #131)
622
+ - Minor adjustment on the MD around european words (PR #133)
623
+ - Remove and replace SRTs from assets / tests (PR #139)
624
+ - Initialize the library logger with a `NullHandler` by default from [@nmaynes](https://github.com/nmaynes) (PR #135)
625
+ - Setting kwarg `explain` to True will add provisionally (bounded to function lifespan) a specific stream handler (PR #135)
626
+
627
+ ### Fixed
628
+ - Fix large (misleading) sequence giving UnicodeDecodeError (PR #137)
629
+ - Avoid using too insignificant chunk (PR #137)
630
+
631
+ ### Added
632
+ - Add and expose function `set_logging_handler` to configure a specific StreamHandler from [@nmaynes](https://github.com/nmaynes) (PR #135)
633
+ - Add `CHANGELOG.md` entries, format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) (PR #141)
634
+
635
+ ## [2.0.7](https://github.com/Ousret/charset_normalizer/compare/2.0.6...2.0.7) (2021-10-11)
636
+ ### Added
637
+ - Add support for Kazakh (Cyrillic) language detection (PR #109)
638
+
639
+ ### Changed
640
+ - Further, improve inferring the language from a given single-byte code page (PR #112)
641
+ - Vainly trying to leverage PEP263 when PEP3120 is not supported (PR #116)
642
+ - Refactoring for potential performance improvements in loops from [@adbar](https://github.com/adbar) (PR #113)
643
+ - Various detection improvement (MD+CD) (PR #117)
644
+
645
+ ### Removed
646
+ - Remove redundant logging entry about detected language(s) (PR #115)
647
+
648
+ ### Fixed
649
+ - Fix a minor inconsistency between Python 3.5 and other versions regarding language detection (PR #117 #102)
650
+
651
+ ## [2.0.6](https://github.com/Ousret/charset_normalizer/compare/2.0.5...2.0.6) (2021-09-18)
652
+ ### Fixed
653
+ - Unforeseen regression with the loss of the backward-compatibility with some older minor of Python 3.5.x (PR #100)
654
+ - Fix CLI crash when using --minimal output in certain cases (PR #103)
655
+
656
+ ### Changed
657
+ - Minor improvement to the detection efficiency (less than 1%) (PR #106 #101)
658
+
659
+ ## [2.0.5](https://github.com/Ousret/charset_normalizer/compare/2.0.4...2.0.5) (2021-09-14)
660
+ ### Changed
661
+ - The project now complies with: flake8, mypy, isort and black to ensure a better overall quality (PR #81)
662
+ - The BC-support with v1.x was improved, the old staticmethods are restored (PR #82)
663
+ - The Unicode detection is slightly improved (PR #93)
664
+ - Add syntax sugar \_\_bool\_\_ for results CharsetMatches list-container (PR #91)
665
+
666
+ ### Removed
667
+ - The project no longer raise warning on tiny content given for detection, will be simply logged as warning instead (PR #92)
668
+
669
+ ### Fixed
670
+ - In some rare case, the chunks extractor could cut in the middle of a multi-byte character and could mislead the mess detection (PR #95)
671
+ - Some rare 'space' characters could trip up the UnprintablePlugin/Mess detection (PR #96)
672
+ - The MANIFEST.in was not exhaustive (PR #78)
673
+
674
+ ## [2.0.4](https://github.com/Ousret/charset_normalizer/compare/2.0.3...2.0.4) (2021-07-30)
675
+ ### Fixed
676
+ - The CLI no longer raises an unexpected exception when no encoding has been found (PR #70)
677
+ - Fix accessing the 'alphabets' property when the payload contains surrogate characters (PR #68)
678
+ - The logger could mislead (explain=True) on detected languages and the impact of one MBCS match (PR #72)
679
+ - Submatch factoring could be wrong in rare edge cases (PR #72)
680
+ - Multiple files given to the CLI were ignored when publishing results to STDOUT. (After the first path) (PR #72)
681
+ - Fix line endings from CRLF to LF for certain project files (PR #67)
682
+
683
+ ### Changed
684
+ - Adjust the MD to lower the sensitivity, thus improving the global detection reliability (PR #69 #76)
685
+ - Allow fallback on specified encoding if any (PR #71)
686
+
687
+ ## [2.0.3](https://github.com/Ousret/charset_normalizer/compare/2.0.2...2.0.3) (2021-07-16)
688
+ ### Changed
689
+ - Part of the detection mechanism has been improved to be less sensitive, resulting in more accurate detection results. Especially ASCII. (PR #63)
690
+ - According to the community wishes, the detection will fall back on ASCII or UTF-8 in a last-resort case. (PR #64)
691
+
692
+ ## [2.0.2](https://github.com/Ousret/charset_normalizer/compare/2.0.1...2.0.2) (2021-07-15)
693
+ ### Fixed
694
+ - Empty/Too small JSON payload miss-detection fixed. Report from [@tseaver](https://github.com/tseaver) (PR #59)
695
+
696
+ ### Changed
697
+ - Don't inject unicodedata2 into sys.modules from [@akx](https://github.com/akx) (PR #57)
698
+
699
+ ## [2.0.1](https://github.com/Ousret/charset_normalizer/compare/2.0.0...2.0.1) (2021-07-13)
700
+ ### Fixed
701
+ - Make it work where there isn't a filesystem available, dropping assets frequencies.json. Report from [@sethmlarson](https://github.com/sethmlarson). (PR #55)
702
+ - Using explain=False permanently disable the verbose output in the current runtime (PR #47)
703
+ - One log entry (language target preemptive) was not shown in logs when using explain=True (PR #47)
704
+ - Fix undesired exception (ValueError) on getitem of instance CharsetMatches (PR #52)
705
+
706
+ ### Changed
707
+ - Public function normalize default args values were not aligned with from_bytes (PR #53)
708
+
709
+ ### Added
710
+ - You may now use charset aliases in cp_isolation and cp_exclusion arguments (PR #47)
711
+
712
+ ## [2.0.0](https://github.com/Ousret/charset_normalizer/compare/1.4.1...2.0.0) (2021-07-02)
713
+ ### Changed
714
+ - 4x to 5 times faster than the previous 1.4.0 release. At least 2x faster than Chardet.
715
+ - Emphasis has been put on UTF-8 detection, which should now be nearly instantaneous.
716
+ - The backward compatibility with Chardet has been greatly improved. The legacy detect function returns an identical charset name whenever possible.
717
+ - The detection mechanism has been slightly improved, now Turkish content is detected correctly (most of the time)
718
+ - The program has been rewritten to ease the readability and maintainability. (+Using static typing)
719
+ - utf_7 detection has been reinstated.
720
+
721
+ ### Removed
722
+ - This package no longer requires anything when used with Python 3.5 (Dropped cached_property)
723
+ - Removed support for these languages: Catalan, Esperanto, Kazakh, Basque, Volapük, Azeri, Galician, Nynorsk, Macedonian, and Serbocroatian.
724
+ - The exception hook on UnicodeDecodeError has been removed.
725
+
726
+ ### Deprecated
727
+ - Methods coherence_non_latin, w_counter, chaos_secondary_pass of the class CharsetMatch are now deprecated and scheduled for removal in v3.0
728
+
729
+ ### Fixed
730
+ - The CLI output used the relative path of the file(s). Should be absolute.
731
+
732
+ ## [1.4.1](https://github.com/Ousret/charset_normalizer/compare/1.4.0...1.4.1) (2021-05-28)
733
+ ### Fixed
734
+ - Logger configuration/usage no longer conflict with others (PR #44)
735
+
736
+ ## [1.4.0](https://github.com/Ousret/charset_normalizer/compare/1.3.9...1.4.0) (2021-05-21)
737
+ ### Removed
738
+ - Using standard logging instead of using the package loguru.
739
+ - Dropping nose test framework in favor of the maintained pytest.
740
+ - Choose to not use dragonmapper package to help with gibberish Chinese/CJK text.
741
+ - Require cached_property only for Python 3.5 due to constraint. Dropping for every other interpreter version.
742
+ - Stop support for UTF-7 that does not contain a SIG.
743
+ - Dropping PrettyTable, replaced with pure JSON output in CLI.
744
+
745
+ ### Fixed
746
+ - BOM marker in a CharsetNormalizerMatch instance could be False in rare cases even if obviously present. Due to the sub-match factoring process.
747
+ - Not searching properly for the BOM when trying utf32/16 parent codec.
748
+
749
+ ### Changed
750
+ - Improving the package final size by compressing frequencies.json.
751
+ - Huge improvement on the largest payloads.
752
+
753
+ ### Added
754
+ - CLI now produces JSON consumable output.
755
+ - Return ASCII if given sequences fit. Given reasonable confidence.
756
+
757
+ ## [1.3.9](https://github.com/Ousret/charset_normalizer/compare/1.3.8...1.3.9) (2021-05-13)
758
+
759
+ ### Fixed
760
+ - In some very rare cases, you may end up getting encode/decode errors due to a bad bytes payload (PR #40)
761
+
762
+ ## [1.3.8](https://github.com/Ousret/charset_normalizer/compare/1.3.7...1.3.8) (2021-05-12)
763
+
764
+ ### Fixed
765
+ - Empty given payload for detection may cause an exception if trying to access the `alphabets` property. (PR #39)
766
+
767
+ ## [1.3.7](https://github.com/Ousret/charset_normalizer/compare/1.3.6...1.3.7) (2021-05-12)
768
+
769
+ ### Fixed
770
+ - The legacy detect function should return UTF-8-SIG if sig is present in the payload. (PR #38)
771
+
772
+ ## [1.3.6](https://github.com/Ousret/charset_normalizer/compare/1.3.5...1.3.6) (2021-02-09)
773
+
774
+ ### Changed
775
+ - Amend the previous release to allow prettytable 2.0 (PR #35)
776
+
777
+ ## [1.3.5](https://github.com/Ousret/charset_normalizer/compare/1.3.4...1.3.5) (2021-02-08)
778
+
779
+ ### Fixed
780
+ - Fix error while using the package with a python pre-release interpreter (PR #33)
781
+
782
+ ### Changed
783
+ - Dependencies refactoring, constraints revised.
784
+
785
+ ### Added
786
+ - Add python 3.9 and 3.10 to the supported interpreters
787
+
788
+ MIT License
789
+
790
+ Copyright (c) 2025 TAHRI Ahmed R.
791
+
792
+ Permission is hereby granted, free of charge, to any person obtaining a copy
793
+ of this software and associated documentation files (the "Software"), to deal
794
+ in the Software without restriction, including without limitation the rights
795
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
796
+ copies of the Software, and to permit persons to whom the Software is
797
+ furnished to do so, subject to the following conditions:
798
+
799
+ The above copyright notice and this permission notice shall be included in all
800
+ copies or substantial portions of the Software.
801
+
802
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
803
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
804
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
805
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
806
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
807
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
808
+ SOFTWARE.
.venv/lib/python3.14/site-packages/charset_normalizer-3.4.7.dist-info/RECORD ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ../../../bin/normalizer,sha256=qrJTFfLbrSCiRRy7rn5gzE3orxcp81VjJuDiGvughdY,348
2
+ 81d243bd2c585b0f4821__mypyc.cpython-314-x86_64-linux-gnu.so,sha256=Eoq-hHZwIu-gKwLFiL0uwZVcWqoi9v3GVa5pD5WS3sE,433360
3
+ charset_normalizer-3.4.7.dist-info/INSTALLER,sha256=5hhM4Q4mYTT9z6QB6PGpUAW81PGNFrYrdXMj4oM_6ak,2
4
+ charset_normalizer-3.4.7.dist-info/METADATA,sha256=K8lK8L8LaZ1YmKvWLt3zEkpIxiCOC58xNhzFQrfQJxQ,40931
5
+ charset_normalizer-3.4.7.dist-info/RECORD,,
6
+ charset_normalizer-3.4.7.dist-info/REQUESTED,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
+ charset_normalizer-3.4.7.dist-info/WHEEL,sha256=a6EB0SZmvnUxj7CP1sYpBVvB6TTNx-Vzw2sye5KrYcM,190
8
+ charset_normalizer-3.4.7.dist-info/entry_points.txt,sha256=ADSTKrkXZ3hhdOVFi6DcUEHQRS0xfxDIE_pEz4wLIXA,65
9
+ charset_normalizer-3.4.7.dist-info/licenses/LICENSE,sha256=bQ1Bv-FwrGx9wkjJpj4lTQ-0WmDVCoJX0K-SxuJJuIc,1071
10
+ charset_normalizer-3.4.7.dist-info/top_level.txt,sha256=c_vZbitqecT2GfK3zdxSTLCn8C-6pGnHQY5o_5Y32M0,47
11
+ charset_normalizer/__init__.py,sha256=OKRxRv2Zhnqk00tqkN0c1BtJjm165fWXLydE52IKuHc,1590
12
+ charset_normalizer/__main__.py,sha256=yzYxMR-IhKRHYwcSlavEv8oGdwxsR89mr2X09qXGdps,109
13
+ charset_normalizer/api.py,sha256=387F3n23MlMu-xfSbFULW2DLGsBmVrZVGhnkiGXeKBo,38844
14
+ charset_normalizer/cd.cpython-314-x86_64-linux-gnu.so,sha256=-T9Bunt3lkMVS1l8kZ4yh236yozvPpBlw7WuRUEg-Xg,15912
15
+ charset_normalizer/cd.py,sha256=v0iPJweGsRegXywrM1LzUgqW9bJ1KFvIblQHP1jm5FQ,15174
16
+ charset_normalizer/cli/__init__.py,sha256=D8I86lFk2-py45JvqxniTirSj_sFyE6sjaY_0-G1shc,136
17
+ charset_normalizer/cli/__main__.py,sha256=E9FFSV1E2iOE_B2B1tJHQT9ExJqc60Ks_c-08sNawh8,11940
18
+ charset_normalizer/constant.py,sha256=yvLAWDrdSC743Cu4amhwHLIO-FGuRTOTZouCzZKGikc,44431
19
+ charset_normalizer/legacy.py,sha256=yBIFMNABNPE5JkdKOWyVo36fZtV9nm8bf37LrDWulz8,2661
20
+ charset_normalizer/md.cpython-314-x86_64-linux-gnu.so,sha256=h3N9skDXMMXSiDNC3DzjTBixkgUrvILRHKaFZLYDL_Y,15912
21
+ charset_normalizer/md.py,sha256=AYCdfDX79FrgoId3zXqmbCuDcbGr1NRuGqgJN94Rx9Q,30441
22
+ charset_normalizer/models.py,sha256=FbaQnI6ECmVmyHRSvVM5fHNeMAQ3KSGdwLjGcQqWDws,12821
23
+ charset_normalizer/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
24
+ charset_normalizer/utils.py,sha256=9cpi-_0-vC9pGDfuoarhC6VlF_Jxwx5Jsa_8I4w2D8k,12282
25
+ charset_normalizer/version.py,sha256=2LxFuGp3BBuIwt95cp64y7v8bCNHcMAi08IfXt_47Co,115
.venv/lib/python3.14/site-packages/charset_normalizer-3.4.7.dist-info/REQUESTED ADDED
File without changes
.venv/lib/python3.14/site-packages/charset_normalizer-3.4.7.dist-info/WHEEL ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: false
4
+ Tag: cp314-cp314-manylinux_2_17_x86_64
5
+ Tag: cp314-cp314-manylinux2014_x86_64
6
+ Tag: cp314-cp314-manylinux_2_28_x86_64
7
+
.venv/lib/python3.14/site-packages/charset_normalizer-3.4.7.dist-info/entry_points.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ [console_scripts]
2
+ normalizer = charset_normalizer.cli:cli_detect
.venv/lib/python3.14/site-packages/charset_normalizer-3.4.7.dist-info/licenses/LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2025 TAHRI Ahmed R.
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
.venv/lib/python3.14/site-packages/charset_normalizer-3.4.7.dist-info/top_level.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ 81d243bd2c585b0f4821__mypyc
2
+ charset_normalizer
.venv/lib/python3.14/site-packages/charset_normalizer/__init__.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Charset-Normalizer
3
+ ~~~~~~~~~~~~~~
4
+ The Real First Universal Charset Detector.
5
+ A library that helps you read text from an unknown charset encoding.
6
+ Motivated by chardet, This package is trying to resolve the issue by taking a new approach.
7
+ All IANA character set names for which the Python core library provides codecs are supported.
8
+
9
+ Basic usage:
10
+ >>> from charset_normalizer import from_bytes
11
+ >>> results = from_bytes('Bсеки човек има право на образование. Oбразованието!'.encode('utf_8'))
12
+ >>> best_guess = results.best()
13
+ >>> str(best_guess)
14
+ 'Bсеки човек има право на образование. Oбразованието!'
15
+
16
+ Others methods and usages are available - see the full documentation
17
+ at <https://github.com/Ousret/charset_normalizer>.
18
+ :copyright: (c) 2021 by Ahmed TAHRI
19
+ :license: MIT, see LICENSE for more details.
20
+ """
21
+
22
+ from __future__ import annotations
23
+
24
+ import logging
25
+
26
+ from .api import from_bytes, from_fp, from_path, is_binary
27
+ from .legacy import detect
28
+ from .models import CharsetMatch, CharsetMatches
29
+ from .utils import set_logging_handler
30
+ from .version import VERSION, __version__
31
+
32
+ __all__ = (
33
+ "from_fp",
34
+ "from_path",
35
+ "from_bytes",
36
+ "is_binary",
37
+ "detect",
38
+ "CharsetMatch",
39
+ "CharsetMatches",
40
+ "__version__",
41
+ "VERSION",
42
+ "set_logging_handler",
43
+ )
44
+
45
+ # Attach a NullHandler to the top level logger by default
46
+ # https://docs.python.org/3.3/howto/logging.html#configuring-logging-for-a-library
47
+
48
+ logging.getLogger("charset_normalizer").addHandler(logging.NullHandler())
.venv/lib/python3.14/site-packages/charset_normalizer/__main__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from .cli import cli_detect
4
+
5
+ if __name__ == "__main__":
6
+ cli_detect()
.venv/lib/python3.14/site-packages/charset_normalizer/api.py ADDED
@@ -0,0 +1,988 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ from os import PathLike
5
+ from typing import BinaryIO
6
+
7
+ from .cd import (
8
+ coherence_ratio,
9
+ encoding_languages,
10
+ mb_encoding_languages,
11
+ merge_coherence_ratios,
12
+ )
13
+ from .constant import (
14
+ IANA_SUPPORTED,
15
+ IANA_SUPPORTED_SIMILAR,
16
+ TOO_BIG_SEQUENCE,
17
+ TOO_SMALL_SEQUENCE,
18
+ TRACE,
19
+ )
20
+ from .md import mess_ratio
21
+ from .models import CharsetMatch, CharsetMatches
22
+ from .utils import (
23
+ any_specified_encoding,
24
+ cut_sequence_chunks,
25
+ iana_name,
26
+ identify_sig_or_bom,
27
+ is_multi_byte_encoding,
28
+ should_strip_sig_or_bom,
29
+ )
30
+
31
+ logger = logging.getLogger("charset_normalizer")
32
+ explain_handler = logging.StreamHandler()
33
+ explain_handler.setFormatter(
34
+ logging.Formatter("%(asctime)s | %(levelname)s | %(message)s")
35
+ )
36
+
37
+ # Pre-compute a reordered encoding list: multibyte first, then single-byte.
38
+ # This allows the mb_definitive_match optimization to fire earlier, skipping
39
+ # all single-byte encodings for genuine CJK content. Multibyte codecs
40
+ # hard-fail (UnicodeDecodeError) on single-byte data almost instantly, so
41
+ # testing them first costs negligible time for non-CJK files.
42
+ _mb_supported: list[str] = []
43
+ _sb_supported: list[str] = []
44
+
45
+ for _supported_enc in IANA_SUPPORTED:
46
+ try:
47
+ if is_multi_byte_encoding(_supported_enc):
48
+ _mb_supported.append(_supported_enc)
49
+ else:
50
+ _sb_supported.append(_supported_enc)
51
+ except ImportError:
52
+ _sb_supported.append(_supported_enc)
53
+
54
+ IANA_SUPPORTED_MB_FIRST: list[str] = _mb_supported + _sb_supported
55
+
56
+
57
def from_bytes(
    sequences: bytes | bytearray,
    steps: int = 5,
    chunk_size: int = 512,
    threshold: float = 0.2,
    cp_isolation: list[str] | None = None,
    cp_exclusion: list[str] | None = None,
    preemptive_behaviour: bool = True,
    explain: bool = False,
    language_threshold: float = 0.1,
    enable_fallback: bool = True,
) -> CharsetMatches:
    """
    Given a raw bytes sequence, return the best possible charsets usable to render str objects.
    If there are no results, it is a strong indicator that the source is binary/not text.
    By default, the process extracts 5 blocks of 512 bytes each to assess the mess and coherence of a given sequence,
    and gives up on a particular code page once 20% of mess is measured. Those criteria are customizable at will.

    The preemptive behaviour DOES NOT replace the traditional detection workflow: it prioritizes a particular code
    page but never takes it for granted. It can improve the performance.

    You may want to focus your attention on some code pages and/or ignore others; use cp_isolation and cp_exclusion
    for that purpose.

    This function will strip the SIG in the payload/sequence every time except on UTF-16, UTF-32.
    By default the library does not set up any handler other than the NullHandler. If you set the 'explain'
    toggle to True, the logger configuration is altered to add a StreamHandler that is suitable for debugging.
    Custom logging format and handler can be set manually.

    Args:
        sequences: The payload to analyse.
        steps: Number of chunks sampled from the payload. Forced to 1 when the payload
            is not larger than ``chunk_size * steps``.
        chunk_size: Size of each sampled chunk. Forced to the payload length in the
            same situation as above.
        threshold: A candidate is rejected when its mean mess ratio is greater than or
            equal to this value, or when too many individual chunks reach it.
        cp_isolation: When given (and non-empty), only these encodings are tested.
        cp_exclusion: When given (and non-empty), these encodings are never tested.
        preemptive_behaviour: When True, an encoding declared inside the payload (as found by
            ``any_specified_encoding``) is tested first.
        explain: When True, ``explain_handler`` is attached to the module logger and its
            level is set to TRACE for the duration of the call; both are undone before
            each ``return`` of this function.
        language_threshold: Forwarded to ``coherence_ratio``.
        enable_fallback: When True, the ascii / utf_8 / declared candidates that decoded
            but failed chaos probing are remembered and one of them is returned if
            nothing else matched.

    Returns:
        The retained matches. An empty payload yields a single ``utf_8`` match; several
        shortcuts return a collection holding exactly one match.

    Raises:
        TypeError: If ``sequences`` is neither ``bytes`` nor ``bytearray``.
    """

    if not isinstance(sequences, (bytearray, bytes)):
        raise TypeError(
            "Expected object of type bytes or bytearray, got: {}".format(
                type(sequences)
            )
        )

    # NOTE(review): the handler/level are restored manually before each return below;
    # there is no try/finally, so they stay altered if an exception propagates.
    if explain:
        previous_logger_level: int = logger.level
        logger.addHandler(explain_handler)
        logger.setLevel(TRACE)

    length: int = len(sequences)

    if length == 0:
        logger.debug("Encoding detection on empty bytes, assuming utf_8 intention.")
        if explain:  # Defensive: ensure exit path clean handler
            logger.removeHandler(explain_handler)
            logger.setLevel(previous_logger_level)
        return CharsetMatches([CharsetMatch(sequences, "utf_8", 0.0, False, [], "")])

    # Normalise the user supplied allow-list through iana_name(); None becomes an empty list.
    if cp_isolation is not None:
        logger.log(
            TRACE,
            "cp_isolation is set. use this flag for debugging purpose. "
            "limited list of encoding allowed : %s.",
            ", ".join(cp_isolation),
        )
        cp_isolation = [iana_name(cp, False) for cp in cp_isolation]
    else:
        cp_isolation = []

    # Same normalisation for the deny-list.
    if cp_exclusion is not None:
        logger.log(
            TRACE,
            "cp_exclusion is set. use this flag for debugging purpose. "
            "limited list of encoding excluded : %s.",
            ", ".join(cp_exclusion),
        )
        cp_exclusion = [iana_name(cp, False) for cp in cp_exclusion]
    else:
        cp_exclusion = []

    # The payload is not larger than the requested sampling window:
    # analyse it as one single chunk covering everything.
    if length <= (chunk_size * steps):
        logger.log(
            TRACE,
            "override steps (%i) and chunk_size (%i) as content does not fit (%i byte(s) given) parameters.",
            steps,
            chunk_size,
            length,
        )
        steps = 1
        chunk_size = length

    # NOTE(review): after the branch above either steps == 1 or length > chunk_size * steps,
    # so this condition looks unreachable for positive inputs — confirm before relying on it.
    if steps > 1 and length / steps < chunk_size:
        chunk_size = int(length / steps)

    is_too_small_sequence: bool = len(sequences) < TOO_SMALL_SEQUENCE
    is_too_large_sequence: bool = len(sequences) >= TOO_BIG_SEQUENCE

    if is_too_small_sequence:
        logger.log(
            TRACE,
            "Trying to detect encoding from a tiny portion of ({}) byte(s).".format(
                length
            ),
        )
    elif is_too_large_sequence:
        logger.log(
            TRACE,
            "Using lazy str decoding because the payload is quite large, ({}) byte(s).".format(
                length
            ),
        )

    # Encodings tested before the generic IANA_SUPPORTED_MB_FIRST list, in this order:
    # declared encoding (if any), SIG/BOM encoding (if any), ascii, utf_8.
    prioritized_encodings: list[str] = []

    specified_encoding: str | None = (
        any_specified_encoding(sequences) if preemptive_behaviour else None
    )

    if specified_encoding is not None:
        prioritized_encodings.append(specified_encoding)
        logger.log(
            TRACE,
            "Detected declarative mark in sequence. Priority +1 given for %s.",
            specified_encoding,
        )

    # Bookkeeping shared by every iteration of the main loop.
    tested: set[str] = set()
    tested_but_hard_failure: list[str] = []
    tested_but_soft_failure: list[str] = []
    soft_failure_skip: set[str] = set()
    success_fast_tracked: set[str] = set()

    # Cache for decoded payload deduplication: hash(decoded_payload) -> (mean_mess_ratio, cd_ratios_merged, passed)
    # When multiple encodings decode to the exact same string, we can skip the expensive
    # mess_ratio and coherence_ratio analysis and reuse the results from the first encoding.
    # Only single-byte decoders read from / write to this cache, and the key is the hash
    # alone (the decoded strings themselves are not compared).
    payload_result_cache: dict[int, tuple[float, list[tuple[str, float]], bool]] = {}

    # "Definitive mode": once a single-byte candidate passes chaos probing with a best
    # coherence >= 0.5, and both ascii and utf_8 have already been tested, encodings
    # whose target languages do not intersect the definitive match's languages are
    # skipped entirely (e.g., Cyrillic when the definitive match is Latin).
    # The chaos value itself is not part of the trigger condition.
    # NOTE(review): an earlier version of this comment mentioned reusing the definitive
    # match's coherence ratios for same-family encodings; no such reuse is visible in
    # this function (only identical decoded payloads reuse cached results).
    definitive_match_found: bool = False
    definitive_target_languages: set[str] = set()
    # After the definitive match fires, we cap the number of additional same-family
    # single-byte encodings that pass chaos probing. Once we've accumulated enough
    # good candidates (N), further same-family SB encodings are unlikely to produce
    # a better best() result and just waste mess_ratio + coherence_ratio time.
    # The first encoding to trigger the definitive match is NOT counted (it's already in).
    post_definitive_sb_success_count: int = 0
    POST_DEFINITIVE_SB_CAP: int = 7

    # When a non-UTF multibyte encoding passes chaos probing with significant multibyte
    # content (decoded length < 98% of raw length), skip all remaining single-byte encodings.
    # Rationale: multi-byte decoders (CJK) have strict byte-sequence validation — if they
    # decode without error AND pass chaos probing with substantial multibyte content, the
    # data is genuinely multibyte encoded. Single-byte encodings will always decode (every
    # byte maps to something) but waste time on mess_ratio before failing.
    # The 98% threshold prevents false triggers on files that happen to have a few valid
    # multibyte pairs (e.g., cp424/_ude_1.txt where big5 decodes with 99% ratio).
    mb_definitive_match_found: bool = False

    # Candidates that decoded but failed chaos probing; only used when nothing else matched.
    fallback_ascii: CharsetMatch | None = None
    fallback_u8: CharsetMatch | None = None
    fallback_specified: CharsetMatch | None = None

    results: CharsetMatches = CharsetMatches()

    # Matches from the prioritized trio (declared / ascii / utf_8) with mess < 10%;
    # used to return early once all three have been tested.
    early_stop_results: CharsetMatches = CharsetMatches()

    sig_encoding, sig_payload = identify_sig_or_bom(sequences)

    if sig_encoding is not None:
        prioritized_encodings.append(sig_encoding)
        logger.log(
            TRACE,
            "Detected a SIG or BOM mark on first %i byte(s). Priority +1 given for %s.",
            len(sig_payload),
            sig_encoding,
        )

    prioritized_encodings.append("ascii")

    if "utf_8" not in prioritized_encodings:
        prioritized_encodings.append("utf_8")

    for encoding_iana in prioritized_encodings + IANA_SUPPORTED_MB_FIRST:
        # --- Cheap filters: user allow/deny lists and duplicates. ---
        if cp_isolation and encoding_iana not in cp_isolation:
            continue

        if cp_exclusion and encoding_iana in cp_exclusion:
            continue

        if encoding_iana in tested:
            continue

        tested.add(encoding_iana)

        decoded_payload: str | None = None
        # True only for the encoding whose SIG/BOM was found at the start of the payload.
        bom_or_sig_available: bool = sig_encoding == encoding_iana
        strip_sig_or_bom: bool = bom_or_sig_available and should_strip_sig_or_bom(
            encoding_iana
        )

        if encoding_iana in {"utf_16", "utf_32"} and not bom_or_sig_available:
            logger.log(
                TRACE,
                "Encoding %s won't be tested as-is because it require a BOM. Will try some sub-encoder LE/BE.",
                encoding_iana,
            )
            continue
        if encoding_iana in {"utf_7"} and not bom_or_sig_available:
            logger.log(
                TRACE,
                "Encoding %s won't be tested as-is because detection is unreliable without BOM/SIG.",
                encoding_iana,
            )
            continue

        # Skip encodings similar to ones that already soft-failed (high mess ratio).
        # Checked BEFORE the expensive decode attempt.
        if encoding_iana in soft_failure_skip:
            logger.log(
                TRACE,
                "%s is deemed too similar to a code page that was already considered unsuited. Continuing!",
                encoding_iana,
            )
            continue

        # Skip encodings that were already fast-tracked from a similar successful encoding.
        if encoding_iana in success_fast_tracked:
            logger.log(
                TRACE,
                "Skipping %s: already fast-tracked from a similar successful encoding.",
                encoding_iana,
            )
            continue

        try:
            is_multi_byte_decoder: bool = is_multi_byte_encoding(encoding_iana)
        except (ModuleNotFoundError, ImportError):  # Defensive:
            logger.log(
                TRACE,
                "Encoding %s does not provide an IncrementalDecoder",
                encoding_iana,
            )
            continue

        # When we've already found a definitive match (see the trigger near the end of
        # the loop body), skip encodings that target completely different language
        # families. This avoids running expensive mess_ratio + coherence_ratio on
        # clearly unrelated candidates (e.g., Cyrillic when the definitive match is
        # Latin-based).
        if definitive_match_found:
            if not is_multi_byte_decoder:
                enc_languages = set(encoding_languages(encoding_iana))
            else:
                enc_languages = set(mb_encoding_languages(encoding_iana))
            if not enc_languages.intersection(definitive_target_languages):
                logger.log(
                    TRACE,
                    "Skipping %s: definitive match already found, this encoding targets different languages (%s vs %s).",
                    encoding_iana,
                    enc_languages,
                    definitive_target_languages,
                )
                continue

        # After the definitive match, cap the number of additional same-family
        # single-byte encodings that pass chaos probing. This avoids testing the
        # tail of rare, low-value same-family encodings (mac_iceland, cp860, etc.)
        # that almost never change best() but each cost ~1-2ms of mess_ratio + coherence.
        if (
            definitive_match_found
            and not is_multi_byte_decoder
            and post_definitive_sb_success_count >= POST_DEFINITIVE_SB_CAP
        ):
            logger.log(
                TRACE,
                "Skipping %s: already accumulated %d same-family results after definitive match (cap=%d).",
                encoding_iana,
                post_definitive_sb_success_count,
                POST_DEFINITIVE_SB_CAP,
            )
            continue

        # When a multibyte encoding with significant multibyte content has already
        # passed chaos probing, skip all single-byte encodings. They will either fail
        # chaos probing (wasting mess_ratio time) or produce inferior results.
        if mb_definitive_match_found and not is_multi_byte_decoder:
            logger.log(
                TRACE,
                "Skipping single-byte %s: multi-byte definitive match already found.",
                encoding_iana,
            )
            continue

        # --- Hard check: can the payload be decoded at all with this encoding? ---
        try:
            if is_too_large_sequence and is_multi_byte_decoder is False:
                # Large payload + single-byte decoder: only validate the head of the
                # payload; the result is discarded and decoded_payload stays None.
                str(
                    (
                        sequences[: int(50e4)]
                        if strip_sig_or_bom is False
                        else sequences[len(sig_payload) : int(50e4)]
                    ),
                    encoding=encoding_iana,
                )
            else:
                # UTF-7 BOM is encoded in modified Base64 whose byte boundary
                # can overlap with the next character. Stripping raw SIG bytes
                # before decoding may leave stray bytes that decode as garbage.
                # Decode the full sequence and remove the leading BOM char instead.
                # see https://github.com/jawah/charset_normalizer/issues/718
                # and https://github.com/jawah/charset_normalizer/issues/716
                if encoding_iana == "utf_7" and bom_or_sig_available:
                    decoded_payload = str(
                        sequences,
                        encoding=encoding_iana,
                    )
                    if decoded_payload and decoded_payload[0] == "\ufeff":
                        decoded_payload = decoded_payload[1:]
                else:
                    decoded_payload = str(
                        (
                            sequences
                            if strip_sig_or_bom is False
                            else sequences[len(sig_payload) :]
                        ),
                        encoding=encoding_iana,
                    )
        except (UnicodeDecodeError, LookupError) as e:
            # A LookupError (unknown codec) is recorded as a hard failure without a log line.
            if not isinstance(e, LookupError):
                logger.log(
                    TRACE,
                    "Code page %s does not fit given bytes sequence at ALL. %s",
                    encoding_iana,
                    str(e),
                )
            tested_but_hard_failure.append(encoding_iana)
            continue

        # Start offsets of the sampled chunks; begins after the SIG/BOM bytes when one
        # was matched for this encoding.
        r_ = range(
            0 if not bom_or_sig_available else len(sig_payload),
            length,
            int(length / steps),
        )

        # A multi-byte decoder produced fewer characters than there are bytes, i.e. at
        # least one character really used several bytes.
        multi_byte_bonus: bool = (
            is_multi_byte_decoder
            and decoded_payload is not None
            and len(decoded_payload) < length
        )

        if multi_byte_bonus:
            logger.log(
                TRACE,
                "Code page %s is a multi byte encoding table and it appear that at least one character "
                "was encoded using n-bytes.",
                encoding_iana,
            )

        # Payload-hash deduplication: if another encoding already decoded to the
        # exact same string, reuse its mess_ratio and coherence results entirely.
        # This is strictly more general than the old IANA_SUPPORTED_SIMILAR approach
        # because it catches ALL identical decoding, not just pre-mapped ones.
        if decoded_payload is not None and not is_multi_byte_decoder:
            payload_hash: int = hash(decoded_payload)
            cached = payload_result_cache.get(payload_hash)
            if cached is not None:
                cached_mess, cached_cd, cached_passed = cached
                if cached_passed:
                    # The previous encoding with identical output passed chaos probing.
                    fast_match = CharsetMatch(
                        sequences,
                        encoding_iana,
                        cached_mess,
                        bom_or_sig_available,
                        cached_cd,
                        (
                            decoded_payload
                            if (
                                is_too_large_sequence is False
                                or encoding_iana
                                in [specified_encoding, "ascii", "utf_8"]
                            )
                            else None
                        ),
                        preemptive_declaration=specified_encoding,
                    )
                    results.append(fast_match)
                    success_fast_tracked.add(encoding_iana)
                    logger.log(
                        TRACE,
                        "%s fast-tracked (identical decoded payload to a prior encoding, chaos=%f %%).",
                        encoding_iana,
                        round(cached_mess * 100, ndigits=3),
                    )

                    # Same early-exit rules as the regular (non-cached) path further down.
                    if (
                        encoding_iana in [specified_encoding, "ascii", "utf_8"]
                        and cached_mess < 0.1
                    ):
                        if cached_mess == 0.0:
                            logger.debug(
                                "Encoding detection: %s is most likely the one.",
                                fast_match.encoding,
                            )
                            if explain:
                                logger.removeHandler(explain_handler)
                                logger.setLevel(previous_logger_level)
                            return CharsetMatches([fast_match])
                        early_stop_results.append(fast_match)

                    if (
                        len(early_stop_results)
                        and (specified_encoding is None or specified_encoding in tested)
                        and "ascii" in tested
                        and "utf_8" in tested
                    ):
                        probable_result: CharsetMatch = early_stop_results.best()  # type: ignore[assignment]
                        logger.debug(
                            "Encoding detection: %s is most likely the one.",
                            probable_result.encoding,
                        )
                        if explain:
                            logger.removeHandler(explain_handler)
                            logger.setLevel(previous_logger_level)
                        return CharsetMatches([probable_result])

                    continue
                else:
                    # The previous encoding with identical output failed chaos probing.
                    tested_but_soft_failure.append(encoding_iana)
                    logger.log(
                        TRACE,
                        "%s fast-skipped (identical decoded payload to a prior encoding that failed chaos probing).",
                        encoding_iana,
                    )
                    # Prepare fallbacks for special encodings even when skipped.
                    if enable_fallback and encoding_iana in [
                        "ascii",
                        "utf_8",
                        specified_encoding,
                        "utf_16",
                        "utf_32",
                    ]:
                        fallback_entry = CharsetMatch(
                            sequences,
                            encoding_iana,
                            threshold,
                            bom_or_sig_available,
                            [],
                            decoded_payload,
                            preemptive_declaration=specified_encoding,
                        )
                        if encoding_iana == specified_encoding:
                            fallback_specified = fallback_entry
                        elif encoding_iana == "ascii":
                            fallback_ascii = fallback_entry
                        else:
                            fallback_u8 = fallback_entry
                    continue

        # --- Chaos (mess) probing over the sampled chunks. ---
        # Give up once a quarter of the chunks (but at least 2) reach the threshold.
        max_chunk_gave_up: int = int(len(r_) / 4)

        max_chunk_gave_up = max(max_chunk_gave_up, 2)
        early_stop_count: int = 0
        lazy_str_hard_failure = False

        md_chunks: list[str] = []
        md_ratios = []

        try:
            for chunk in cut_sequence_chunks(
                sequences,
                encoding_iana,
                r_,
                chunk_size,
                bom_or_sig_available,
                strip_sig_or_bom,
                sig_payload,
                is_multi_byte_decoder,
                decoded_payload,
            ):
                md_chunks.append(chunk)

                # The third argument is True only when explain is on and cp_isolation
                # holds one or two encodings — presumably a debug switch of mess_ratio;
                # confirm against its definition.
                md_ratios.append(
                    mess_ratio(
                        chunk,
                        threshold,
                        explain is True and 1 <= len(cp_isolation) <= 2,
                    )
                )

                if md_ratios[-1] >= threshold:
                    early_stop_count += 1

                # Stop sampling when too many chunks are messy, or after the first chunk
                # when a SIG/BOM matched this encoding but is not stripped.
                if (early_stop_count >= max_chunk_gave_up) or (
                    bom_or_sig_available and strip_sig_or_bom is False
                ):
                    break
        except (
            UnicodeDecodeError
        ) as e:  # Lazy str loading may have missed something there
            logger.log(
                TRACE,
                "LazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %s",
                encoding_iana,
                str(e),
            )
            # Force the soft-failure branch below and forbid using this candidate as a fallback.
            early_stop_count = max_chunk_gave_up
            lazy_str_hard_failure = True

        # We might want to check the sequence again with the whole content
        # Only if initial MD tests passes
        if (
            not lazy_str_hard_failure
            and is_too_large_sequence
            and not is_multi_byte_decoder
        ):
            try:
                # NOTE(review): this starts at 50e3 whereas the first pass validated up to
                # 50e4, so part of the payload is decoded twice — confirm the intended offset.
                sequences[int(50e3) :].decode(encoding_iana, errors="strict")
            except UnicodeDecodeError as e:
                logger.log(
                    TRACE,
                    "LazyStr Loading: After final lookup, code page %s does not fit given bytes sequence at ALL. %s",
                    encoding_iana,
                    str(e),
                )
                tested_but_hard_failure.append(encoding_iana)
                continue

        mean_mess_ratio: float = sum(md_ratios) / len(md_ratios) if md_ratios else 0.0
        if mean_mess_ratio >= threshold or early_stop_count >= max_chunk_gave_up:
            # Soft failure: the payload decodes but looks too messy with this encoding.
            tested_but_soft_failure.append(encoding_iana)
            if encoding_iana in IANA_SUPPORTED_SIMILAR:
                soft_failure_skip.update(IANA_SUPPORTED_SIMILAR[encoding_iana])
            # Cache this soft-failure so identical decoding from other encodings
            # can be skipped immediately.
            if decoded_payload is not None and not is_multi_byte_decoder:
                payload_result_cache.setdefault(
                    hash(decoded_payload), (mean_mess_ratio, [], False)
                )
            logger.log(
                TRACE,
                "%s was excluded because of initial chaos probing. Gave up %i time(s). "
                "Computed mean chaos is %f %%.",
                encoding_iana,
                early_stop_count,
                round(mean_mess_ratio * 100, ndigits=3),
            )
            # Preparing those fallbacks in case we got nothing.
            if (
                enable_fallback
                and encoding_iana
                in ["ascii", "utf_8", specified_encoding, "utf_16", "utf_32"]
                and not lazy_str_hard_failure
            ):
                fallback_entry = CharsetMatch(
                    sequences,
                    encoding_iana,
                    threshold,
                    bom_or_sig_available,
                    [],
                    decoded_payload,
                    preemptive_declaration=specified_encoding,
                )
                if encoding_iana == specified_encoding:
                    fallback_specified = fallback_entry
                elif encoding_iana == "ascii":
                    fallback_ascii = fallback_entry
                else:
                    # NOTE(review): utf_16 / utf_32 entries also land here, not only utf_8.
                    fallback_u8 = fallback_entry
            continue

        logger.log(
            TRACE,
            "%s passed initial chaos probing. Mean measured chaos is %f %%",
            encoding_iana,
            round(mean_mess_ratio * 100, ndigits=3),
        )

        # --- Coherence (language) detection on the chunks collected above. ---
        if not is_multi_byte_decoder:
            target_languages: list[str] = encoding_languages(encoding_iana)
        else:
            target_languages = mb_encoding_languages(encoding_iana)

        if target_languages:
            logger.log(
                TRACE,
                "{} should target any language(s) of {}".format(
                    encoding_iana, str(target_languages)
                ),
            )

        cd_ratios = []

        # Run coherence detection on all chunks. We previously tried limiting to
        # 1-2 chunks for post-definitive encodings to save time, but this caused
        # coverage regressions by producing unrepresentative coherence scores.
        # The SB cap and language-family skip optimizations provide sufficient
        # speedup without sacrificing coherence accuracy.
        if encoding_iana != "ascii":
            # Coherence detection is skipped for ASCII:
            # most of the time it is not relevant to run "language-detection" on it.
            for chunk in md_chunks:
                chunk_languages = coherence_ratio(
                    chunk,
                    language_threshold,
                    ",".join(target_languages) if target_languages else None,
                )

                cd_ratios.append(chunk_languages)
            cd_ratios_merged = merge_coherence_ratios(cd_ratios)
        else:
            # cd_ratios is still empty here.
            cd_ratios_merged = merge_coherence_ratios(cd_ratios)

        if cd_ratios_merged:
            logger.log(
                TRACE,
                "We detected language {} using {}".format(
                    cd_ratios_merged, encoding_iana
                ),
            )

        # For large payloads the decoded text is only attached to the prioritized trio
        # (declared / ascii / utf_8); other matches get None.
        current_match = CharsetMatch(
            sequences,
            encoding_iana,
            mean_mess_ratio,
            bom_or_sig_available,
            cd_ratios_merged,
            (
                decoded_payload
                if (
                    is_too_large_sequence is False
                    or encoding_iana in [specified_encoding, "ascii", "utf_8"]
                )
                else None
            ),
            preemptive_declaration=specified_encoding,
        )

        results.append(current_match)

        # Cache the successful result for payload-hash deduplication.
        if decoded_payload is not None and not is_multi_byte_decoder:
            payload_result_cache.setdefault(
                hash(decoded_payload),
                (mean_mess_ratio, cd_ratios_merged, True),
            )

        # Count post-definitive same-family SB successes for the early termination cap.
        # Only count low-mess encodings (< 2%) toward the cap. High-mess encodings are
        # marginal results that shouldn't prevent better-quality candidates from being
        # tested. For example, iso8859_4 (mess=0%) should not be skipped just because
        # 7 high-mess Latin encodings (cp1252 at 8%, etc.) were tried first.
        if (
            definitive_match_found
            and not is_multi_byte_decoder
            and mean_mess_ratio < 0.02
        ):
            post_definitive_sb_success_count += 1

        if (
            encoding_iana in [specified_encoding, "ascii", "utf_8"]
            and mean_mess_ratio < 0.1
        ):
            # If md says nothing to worry about, then... stop immediately!
            if mean_mess_ratio == 0.0:
                logger.debug(
                    "Encoding detection: %s is most likely the one.",
                    current_match.encoding,
                )
                if explain:  # Defensive: ensure exit path clean handler
                    logger.removeHandler(explain_handler)
                    logger.setLevel(previous_logger_level)
                return CharsetMatches([current_match])

            early_stop_results.append(current_match)

        # Once the whole prioritized trio has been tested and at least one of them was
        # an acceptable match, return the best of those and skip every other encoding.
        if (
            len(early_stop_results)
            and (specified_encoding is None or specified_encoding in tested)
            and "ascii" in tested
            and "utf_8" in tested
        ):
            probable_result = early_stop_results.best()  # type: ignore[assignment]
            logger.debug(
                "Encoding detection: %s is most likely the one.",
                probable_result.encoding,  # type: ignore[union-attr]
            )
            if explain:  # Defensive: ensure exit path clean handler
                logger.removeHandler(explain_handler)
                logger.setLevel(previous_logger_level)

            return CharsetMatches([probable_result])

        # Once we find a result with good coherence (>= 0.5) after testing the
        # prioritized encodings (ascii, utf_8), activate "definitive mode": skip
        # encodings that target completely different language families. This avoids
        # running expensive mess_ratio + coherence_ratio on clearly unrelated
        # candidates (e.g., Cyrillic encodings when the match is Latin-based).
        # We require coherence >= 0.5 to avoid false positives (e.g., cp1251 decoding
        # Hebrew text with 0.0 chaos but wrong language detection at coherence 0.33).
        if not definitive_match_found and not is_multi_byte_decoder:
            best_coherence = (
                max((v for _, v in cd_ratios_merged), default=0.0)
                if cd_ratios_merged
                else 0.0
            )
            if best_coherence >= 0.5 and "ascii" in tested and "utf_8" in tested:
                definitive_match_found = True
                definitive_target_languages.update(target_languages)
                logger.log(
                    TRACE,
                    "Definitive match found: %s (chaos=%.3f, coherence=%.2f). Encodings targeting different language families will be skipped.",
                    encoding_iana,
                    mean_mess_ratio,
                    best_coherence,
                )

        # When a non-UTF multibyte encoding passes chaos probing with significant
        # multibyte content (decoded < 98% of raw), activate mb_definitive_match.
        # This skips all remaining single-byte encodings which would either soft-fail
        # (running expensive mess_ratio for nothing) or produce inferior results.
        if (
            not mb_definitive_match_found
            and is_multi_byte_decoder
            and multi_byte_bonus
            and decoded_payload is not None
            and len(decoded_payload) < length * 0.98
            and encoding_iana
            not in {
                "utf_8",
                "utf_8_sig",
                "utf_16",
                "utf_16_be",
                "utf_16_le",
                "utf_32",
                "utf_32_be",
                "utf_32_le",
                "utf_7",
            }
            and "ascii" in tested
            and "utf_8" in tested
        ):
            mb_definitive_match_found = True
            logger.log(
                TRACE,
                "Multi-byte definitive match: %s (chaos=%.3f, decoded=%d/%d=%.1f%%). Single-byte encodings will be skipped.",
                encoding_iana,
                mean_mess_ratio,
                len(decoded_payload),
                length,
                len(decoded_payload) / length * 100,
            )

        # The encoding announced by the SIG/BOM passed every check: return it alone.
        if encoding_iana == sig_encoding:
            logger.debug(
                "Encoding detection: %s is most likely the one as we detected a BOM or SIG within "
                "the beginning of the sequence.",
                encoding_iana,
            )
            if explain:  # Defensive: ensure exit path clean handler
                logger.removeHandler(explain_handler)
                logger.setLevel(previous_logger_level)
            return CharsetMatches([results[encoding_iana]])

    # Nothing matched: fall back to declared > utf_8 > ascii, when such fallbacks were recorded.
    if len(results) == 0:
        if fallback_u8 or fallback_ascii or fallback_specified:
            logger.log(
                TRACE,
                "Nothing got out of the detection process. Using ASCII/UTF-8/Specified fallback.",
            )

        if fallback_specified:
            logger.debug(
                "Encoding detection: %s will be used as a fallback match",
                fallback_specified.encoding,
            )
            results.append(fallback_specified)
        # NOTE(review): the last clause (fallback_u8 is not None) appears to subsume the
        # first two, so the fingerprint comparison does not influence the outcome — confirm.
        elif (
            (fallback_u8 and fallback_ascii is None)
            or (
                fallback_u8
                and fallback_ascii
                and fallback_u8.fingerprint != fallback_ascii.fingerprint
            )
            or (fallback_u8 is not None)
        ):
            logger.debug("Encoding detection: utf_8 will be used as a fallback match")
            results.append(fallback_u8)
        elif fallback_ascii:
            logger.debug("Encoding detection: ascii will be used as a fallback match")
            results.append(fallback_ascii)

    if results:
        logger.debug(
            "Encoding detection: Found %s as plausible (best-candidate) for content. With %i alternatives.",
            results.best().encoding,  # type: ignore
            len(results) - 1,
        )
    else:
        logger.debug("Encoding detection: Unable to determine any suitable charset.")

    # Undo the logger changes made when explain was requested.
    if explain:
        logger.removeHandler(explain_handler)
        logger.setLevel(previous_logger_level)

    return results
862
+
863
+
864
def from_fp(
    fp: BinaryIO,
    steps: int = 5,
    chunk_size: int = 512,
    threshold: float = 0.20,
    cp_isolation: list[str] | None = None,
    cp_exclusion: list[str] | None = None,
    preemptive_behaviour: bool = True,
    explain: bool = False,
    language_threshold: float = 0.1,
    enable_fallback: bool = True,
) -> CharsetMatches:
    """
    Run from_bytes on everything that can be read from an already opened binary file object.

    The file object is read with a single ``fp.read()`` call and is left open;
    all the other arguments are forwarded unchanged to from_bytes.
    """
    payload = fp.read()
    return from_bytes(
        payload,
        steps=steps,
        chunk_size=chunk_size,
        threshold=threshold,
        cp_isolation=cp_isolation,
        cp_exclusion=cp_exclusion,
        preemptive_behaviour=preemptive_behaviour,
        explain=explain,
        language_threshold=language_threshold,
        enable_fallback=enable_fallback,
    )
892
+
893
+
894
def from_path(
    path: str | bytes | PathLike,  # type: ignore[type-arg]
    steps: int = 5,
    chunk_size: int = 512,
    threshold: float = 0.20,
    cp_isolation: list[str] | None = None,
    cp_exclusion: list[str] | None = None,
    preemptive_behaviour: bool = True,
    explain: bool = False,
    language_threshold: float = 0.1,
    enable_fallback: bool = True,
) -> CharsetMatches:
    """
    Open the file located at ``path`` in binary mode and hand it over to from_fp.

    The file is closed when the function returns. Whatever ``open`` raises
    (e.g. IOError/OSError for a missing or unreadable file) is propagated as is.
    """
    with open(path, "rb") as handle:
        return from_fp(
            handle,
            steps=steps,
            chunk_size=chunk_size,
            threshold=threshold,
            cp_isolation=cp_isolation,
            cp_exclusion=cp_exclusion,
            preemptive_behaviour=preemptive_behaviour,
            explain=explain,
            language_threshold=language_threshold,
            enable_fallback=enable_fallback,
        )
923
+
924
+
925
def is_binary(
    fp_or_path_or_payload: PathLike | str | BinaryIO | bytes,  # type: ignore[type-arg]
    steps: int = 5,
    chunk_size: int = 512,
    threshold: float = 0.20,
    cp_isolation: list[str] | None = None,
    cp_exclusion: list[str] | None = None,
    preemptive_behaviour: bool = True,
    explain: bool = False,
    language_threshold: float = 0.1,
    enable_fallback: bool = False,
) -> bool:
    """
    Tell whether the given input (path, raw payload or opened binary file) looks binary, i.e. is not text.

    The input is dispatched on its type: ``str``/``PathLike`` goes to from_path,
    ``bytes``/``bytearray`` goes to from_bytes, anything else is treated as a file
    object and goes to from_fp. The input is reported as binary when the detection
    yields no match. Same keyword arguments and defaults as the detection functions,
    except that ``enable_fallback`` defaults to False here so that ASCII-compatible
    content that is unlikely to be a string is not rescued by a fallback match.
    """
    # Pick the detection entry point matching the kind of input we were given.
    if isinstance(fp_or_path_or_payload, (str, PathLike)):
        detector = from_path
    elif isinstance(fp_or_path_or_payload, (bytes, bytearray)):
        detector = from_bytes
    else:
        detector = from_fp

    # The three entry points accept the very same tuning options.
    options = {
        "steps": steps,
        "chunk_size": chunk_size,
        "threshold": threshold,
        "cp_isolation": cp_isolation,
        "cp_exclusion": cp_exclusion,
        "preemptive_behaviour": preemptive_behaviour,
        "explain": explain,
        "language_threshold": language_threshold,
        "enable_fallback": enable_fallback,
    }

    guesses = detector(fp_or_path_or_payload, **options)

    return not guesses
.venv/lib/python3.14/site-packages/charset_normalizer/cd.cpython-314-x86_64-linux-gnu.so ADDED
Binary file (15.9 kB). View file
 
.venv/lib/python3.14/site-packages/charset_normalizer/cd.py ADDED
@@ -0,0 +1,454 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import importlib
4
+ from codecs import IncrementalDecoder
5
+ from collections import Counter
6
+ from functools import lru_cache
7
+ from typing import Counter as TypeCounter
8
+
9
+ from .constant import (
10
+ FREQUENCIES,
11
+ KO_NAMES,
12
+ LANGUAGE_SUPPORTED_COUNT,
13
+ TOO_SMALL_SEQUENCE,
14
+ ZH_NAMES,
15
+ _FREQUENCIES_SET,
16
+ _FREQUENCIES_RANK,
17
+ )
18
+ from .md import is_suspiciously_successive_range
19
+ from .models import CoherenceMatches
20
+ from .utils import (
21
+ is_accentuated,
22
+ is_latin,
23
+ is_multi_byte_encoding,
24
+ is_unicode_range_secondary,
25
+ unicode_range,
26
+ )
27
+
28
+
29
def encoding_unicode_range(iana_name: str) -> list[str]:
    """
    List the main unicode ranges produced by a single-byte code page.

    Every byte value from 0x40 up to (but excluding) 0xFF is fed, one at a
    time, to the code page's incremental decoder (decoding errors ignored).
    A range is retained when it represents at least 15% of the decoded
    characters that belong to a non-secondary range.

    :param iana_name: Codec name, resolved as the ``encodings.<iana_name>`` module.
    :return: The retained unicode range names, sorted.
    :raises OSError: When the code page is reported as multi-byte.
    """
    if is_multi_byte_encoding(iana_name):
        raise OSError(  # Defensive:
            "Function not supported on multi-byte code page"
        )

    codec_module = importlib.import_module(f"encodings.{iana_name}")
    byte_decoder: IncrementalDecoder = codec_module.IncrementalDecoder(errors="ignore")

    range_hits: dict[str, int] = {}
    total_hits: int = 0

    for byte_value in range(0x40, 0xFF):
        decoded: str = byte_decoder.decode(bytes([byte_value]))

        # Nothing came out of the decoder for this byte.
        if not decoded:
            continue

        detected_range: str | None = unicode_range(decoded)

        if detected_range is None:
            continue

        if is_unicode_range_secondary(detected_range) is False:
            range_hits[detected_range] = range_hits.get(detected_range, 0) + 1
            total_hits += 1

    retained: list[str] = [
        range_name
        for range_name, hits in range_hits.items()
        if hits / total_hits >= 0.15
    ]
    retained.sort()

    return retained
66
+
67
+
68
def unicode_range_languages(primary_range: str) -> list[str]:
    """
    Return the languages that use the given unicode range.

    A language qualifies as soon as one of its FREQUENCIES characters falls
    in ``primary_range``; languages are reported in FREQUENCIES order.
    """
    return [
        language
        for language, alphabet in FREQUENCIES.items()
        if any(unicode_range(letter) == primary_range for letter in alphabet)
    ]
81
+
82
+
83
@lru_cache()
def encoding_languages(iana_name: str) -> list[str]:
    """
    Associate a single-byte encoding with the language(s) it is tied to.

    The first unicode range of the code page whose name does not contain
    "Latin" drives the lookup; when every range is Latin, the generic
    ``["Latin Based"]`` marker is returned instead.
    """
    primary_range: str | None = next(
        (
            candidate
            for candidate in encoding_unicode_range(iana_name)
            if "Latin" not in candidate
        ),
        None,
    )

    if primary_range is None:
        return ["Latin Based"]

    return unicode_range_languages(primary_range)
101
+
102
+
103
@lru_cache()
def mb_encoding_languages(iana_name: str) -> list[str]:
    """
    Associate a multi-byte encoding with the language it is tied to.

    Returns a one-item list ("Japanese", "Chinese" or "Korean") or an empty
    list when the code page is not linked to any of them.
    """
    japanese_prefixes = ("shift_", "iso2022_jp", "euc_j")

    if iana_name.startswith(japanese_prefixes) or iana_name == "cp932":
        return ["Japanese"]

    if iana_name.startswith("gb") or iana_name in ZH_NAMES:
        return ["Chinese"]

    if iana_name.startswith("iso2022_kr") or iana_name in KO_NAMES:
        return ["Korean"]

    return []
122
+
123
+
124
@lru_cache(maxsize=LANGUAGE_SUPPORTED_COUNT)
def get_target_features(language: str) -> tuple[bool, bool]:
    """
    Summarise a supported language's alphabet.

    :param language: A key of FREQUENCIES.
    :return: ``(has_accents, is_pure_latin)`` — whether at least one character
        is accentuated, and whether no character is reported as non-Latin.
    """
    alphabet = FREQUENCIES[language]

    target_have_accents: bool = any(is_accentuated(letter) for letter in alphabet)
    target_pure_latin: bool = not any(is_latin(letter) is False for letter in alphabet)

    return target_have_accents, target_pure_latin
139
+
140
+
141
def alphabet_languages(
    characters: list[str], ignore_non_latin: bool = False
) -> list[str]:
    """
    Return the languages compatible with the given characters.

    A language is kept when at least 20% of its FREQUENCIES characters are
    present in ``characters``. Languages without accents are discarded when
    the source has some, and non pure-Latin languages are discarded when
    ``ignore_non_latin`` is set. Best coverage comes first.
    """
    observed: frozenset[str] = frozenset(characters)
    source_have_accents = any(is_accentuated(symbol) for symbol in characters)

    scored: list[tuple[str, float]] = []

    for language, alphabet in FREQUENCIES.items():
        target_have_accents, target_pure_latin = get_target_features(language)

        if ignore_non_latin and target_pure_latin is False:
            continue

        if source_have_accents and target_have_accents is False:
            continue

        alphabet_size: int = len(alphabet)
        shared_size: int = len(_FREQUENCIES_SET[language] & observed)
        coverage: float = shared_size / alphabet_size

        if coverage >= 0.2:
            scored.append((language, coverage))

    scored.sort(key=lambda pair: pair[1], reverse=True)

    return [language for language, _ in scored]
173
+
174
+
175
def characters_popularity_compare(
    language: str, ordered_characters: list[str]
) -> float:
    """
    Determine if an ordered characters list (by occurrence, from most frequent to rarest) matches a particular language.
    The result is a ratio between 0. (absolutely no correspondence) and 1. (near perfect fit).
    Beware that this function is not strict on the match in order to ease the detection. (Meaning close match is 1.)

    :param language: A key of FREQUENCIES.
    :param ordered_characters: Characters sorted from most to least frequent.
    :return: Share of ``ordered_characters`` whose rank is deemed plausible for
        the language; 0.0 when ``ordered_characters`` is empty.
    :raises ValueError: When ``language`` is not a key of FREQUENCIES.
    """
    if language not in FREQUENCIES:
        raise ValueError(f"{language} not available")  # Defensive:

    ordered_characters_count: int = len(ordered_characters)

    # Nothing to compare: avoid dividing by zero below and report "no fit".
    if ordered_characters_count == 0:
        return 0.0

    character_approved_count: int = 0
    frequencies_language_set: frozenset[str] = _FREQUENCIES_SET[language]
    lang_rank: dict[str, int] = _FREQUENCIES_RANK[language]

    target_language_characters_count: int = len(FREQUENCIES[language])

    large_alphabet: bool = target_language_characters_count > 26

    expected_projection_ratio: float = (
        target_language_characters_count / ordered_characters_count
    )

    # Pre-built rank dict for ordered_characters (avoids repeated list slicing).
    ordered_rank: dict[str, int] = {
        char: rank for rank, char in enumerate(ordered_characters)
    }

    # Pre-compute characters common to both orderings.
    # Avoids repeated `c in ordered_rank` dict lookups in the inner counts.
    common_chars: list[tuple[int, int]] = [
        (lr, ordered_rank[c]) for c, lr in lang_rank.items() if c in ordered_rank
    ]

    # Pre-extract lr and orr arrays for faster iteration in the inner loop.
    # Plain integer loops with local arrays are much faster under mypyc than
    # generator expression sums over a list of tuples.
    common_count: int = len(common_chars)
    common_lr: list[int] = [p[0] for p in common_chars]
    common_orr: list[int] = [p[1] for p in common_chars]

    for character_rank, character in enumerate(ordered_characters):
        if character not in frequencies_language_set:
            continue

        character_rank_in_language: int = lang_rank[character]
        character_rank_projection: int = int(character_rank * expected_projection_ratio)

        # Small alphabets: reject characters projected too far from their rank.
        if (
            large_alphabet is False
            and abs(character_rank_projection - character_rank_in_language) > 4
        ):
            continue

        # Large alphabets: accept anything within a third of the alphabet size.
        if (
            large_alphabet is True
            and abs(character_rank_projection - character_rank_in_language)
            < target_language_characters_count / 3
        ):
            character_approved_count += 1
            continue

        # Count how many characters appear "before" in both orderings,
        # and how many appear "at or after" in both orderings.
        # Single pass over pre-extracted arrays — much faster under mypyc
        # than two generator expression sums.
        before_match_count: int = 0
        after_match_count: int = 0
        for i in range(common_count):
            lr_i: int = common_lr[i]
            orr_i: int = common_orr[i]
            if lr_i < character_rank_in_language:
                if orr_i < character_rank:
                    before_match_count += 1
            else:
                if orr_i >= character_rank:
                    after_match_count += 1

        after_len: int = target_language_characters_count - character_rank_in_language

        if character_rank_in_language == 0 and before_match_count <= 4:
            character_approved_count += 1
            continue

        if after_len == 0 and after_match_count <= 4:
            character_approved_count += 1
            continue

        if (
            character_rank_in_language > 0
            and before_match_count / character_rank_in_language >= 0.4
        ) or (after_len > 0 and after_match_count / after_len >= 0.4):
            character_approved_count += 1
            continue

    return character_approved_count / ordered_characters_count
274
+
275
+
276
def alpha_unicode_split(decoded_sequence: str) -> list[str]:
    """
    Split a decoded text into one lower-cased string per alphabet / unicode range.

    Only alphabetic characters are considered. Ex. a text containing
    English/Latin with a bit of Hebrew yields two items: one made of the Latin
    letters and the other of the Hebrew ones. Ranges that are not judged
    suspicious when following one another share the same item.
    """
    buckets: dict[str, list[str]] = {}

    # Single-script shortcut: remember the only bucket until a second appears.
    first_bucket: str | None = None
    several_buckets: bool = False

    # Range of the previously stored character and the bucket it landed in,
    # so a run of same-range characters skips the bucket resolution entirely.
    last_range: str | None = None
    last_bucket: str | None = None

    for symbol in decoded_sequence:
        if symbol.isalpha() is False:
            continue

        # Anything below 128 that is alphabetic is a-z / A-Z: "Basic Latin".
        symbol_range: str | None = (
            "Basic Latin" if ord(symbol) < 128 else unicode_range(symbol)
        )

        if symbol_range is None:
            continue

        if symbol_range == last_range:
            if last_bucket is not None:
                buckets[last_bucket].append(symbol)
            continue

        bucket: str | None = None

        if several_buckets:
            for known_range in buckets:
                if (
                    is_suspiciously_successive_range(known_range, symbol_range)
                    is False
                ):
                    bucket = known_range
                    break
        elif first_bucket is not None and (
            is_suspiciously_successive_range(first_bucket, symbol_range) is False
        ):
            bucket = first_bucket

        # No compatible bucket: the character's own range becomes one.
        if bucket is None:
            bucket = symbol_range

        if bucket not in buckets:
            buckets[bucket] = []
            if first_bucket is None:
                first_bucket = bucket
            else:
                several_buckets = True

        buckets[bucket].append(symbol)

        last_range = symbol_range
        last_bucket = bucket

    return ["".join(members).lower() for members in buckets.values()]
348
+
349
+
350
def merge_coherence_ratios(results: list[CoherenceMatches]) -> CoherenceMatches:
    """
    Merge several coherence_ratio results into a single one.

    Each language receives the mean of all its ratios, rounded to 4 decimals;
    the output is sorted from the highest ratio to the lowest and has the same
    shape as a coherence_ratio result.
    """
    ratios_by_language: dict[str, list[float]] = {}

    for matches in results:
        for language, ratio in matches:
            ratios_by_language.setdefault(language, []).append(ratio)

    averaged = [
        (language, round(sum(ratios) / len(ratios), 4))
        for language, ratios in ratios_by_language.items()
    ]
    averaged.sort(key=lambda pair: pair[1], reverse=True)

    return averaged
376
+
377
+
378
def filter_alt_coherence_matches(results: CoherenceMatches) -> CoherenceMatches:
    """
    We shall NOT return "English—" in CoherenceMatches because it is an alternative
    of "English". This function only keeps the best match and removes the em-dash in it.

    The em-dash is stripped from every language name, including when the
    alternative is the only entry for its language. Languages keep the order
    in which they first appear in ``results``.

    :param results: ``(language, ratio)`` pairs, possibly with alternative names.
    :return: One ``(language, best_ratio)`` pair per em-dash-free language name.
    """
    best_ratio_by_language: dict[str, float] = {}

    for language, ratio in results:
        no_em_name: str = language.replace("—", "")
        known_ratio = best_ratio_by_language.get(no_em_name)

        # First sighting, or a better score than any variant seen so far.
        if known_ratio is None or ratio > known_ratio:
            best_ratio_by_language[no_em_name] = ratio

    return list(best_ratio_by_language.items())
403
+
404
+
405
@lru_cache(maxsize=2048)
def coherence_ratio(
    decoded_sequence: str, threshold: float = 0.1, lg_inclusion: str | None = None
) -> CoherenceMatches:
    """
    Detect ANY language that can be identified in the given sequence.

    The sequence is analysed layer by layer, a layer being the characters
    extracted for one alphabet/range. Layers of TOO_SMALL_SEQUENCE characters
    or fewer are ignored.

    :param decoded_sequence: Text to inspect.
    :param threshold: Minimal ratio a language must reach to be reported.
    :param lg_inclusion: Comma separated languages to test instead of guessing
        them from the alphabet; the special "Latin Based" item restricts the
        guess to pure-Latin languages.
    :return: ``(language, ratio)`` pairs, best ratio first.
    """
    matches: list[tuple[str, float]] = []
    strong_match_total: int = 0

    requested_languages = [] if lg_inclusion is None else lg_inclusion.split(",")
    ignore_non_latin: bool = "Latin Based" in requested_languages

    if ignore_non_latin:
        requested_languages.remove("Latin Based")

    for layer in alpha_unicode_split(decoded_sequence):
        if len(layer) <= TOO_SMALL_SEQUENCE:
            continue

        # Characters of the layer, most frequent first.
        ranked_characters: list[str] = [
            symbol for symbol, _ in Counter(layer).most_common()
        ]

        candidates = requested_languages or alphabet_languages(
            ranked_characters, ignore_non_latin
        )

        for language in candidates:
            score: float = characters_popularity_compare(language, ranked_characters)

            if score < threshold:
                continue

            if score >= 0.8:
                strong_match_total += 1

            matches.append((language, round(score, 4)))

            # Enough convincing matches: stop testing languages for this layer.
            if strong_match_total >= 3:
                break

    return sorted(
        filter_alt_coherence_matches(matches), key=lambda pair: pair[1], reverse=True
    )
.venv/lib/python3.14/site-packages/charset_normalizer/cli/__init__.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from .__main__ import cli_detect, query_yes_no
4
+
5
+ __all__ = (
6
+ "cli_detect",
7
+ "query_yes_no",
8
+ )
.venv/lib/python3.14/site-packages/charset_normalizer/cli/__main__.py ADDED
@@ -0,0 +1,362 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import sys
5
+ import typing
6
+ from json import dumps
7
+ from os.path import abspath, basename, dirname, join, realpath
8
+ from platform import python_version
9
+ from unicodedata import unidata_version
10
+
11
+ import charset_normalizer.md as md_module
12
+ from charset_normalizer import from_fp
13
+ from charset_normalizer.models import CliDetectionResult
14
+ from charset_normalizer.version import __version__
15
+
16
+
17
def query_yes_no(question: str, default: str = "yes") -> bool:  # Defensive:
    """Prompt with a yes/no question through input() until a valid answer is given.

    An empty answer selects the default: True when ``default`` is "yes",
    False for any other value.
    """
    default_is_yes = default == "yes"
    suffix = " [Y/n] " if default_is_yes else " [y/N] "

    while True:
        answer = input(question + suffix).strip().lower()

        if answer == "":
            return default_is_yes
        if answer in ("y", "yes"):
            return True
        if answer in ("n", "no"):
            return False

        print("Please respond with 'y' or 'n'.")
30
+
31
+
32
class FileType:
    """Factory for creating file object types

    Instances of FileType are typically passed as type= arguments to the
    ArgumentParser add_argument() method.

    Keyword Arguments:
        - mode -- A string indicating how the file is to be opened. Accepts the
            same values as the builtin open() function.
        - bufsize -- The file's desired buffer size. Accepts the same values as
            the builtin open() function.
        - encoding -- The file's encoding. Accepts the same values as the
            builtin open() function.
        - errors -- A string indicating how encoding and decoding errors are to
            be handled. Accepts the same value as the builtin open() function.

    Backported from CPython 3.12
    """

    def __init__(
        self,
        mode: str = "r",
        bufsize: int = -1,
        encoding: str | None = None,
        errors: str | None = None,
    ):
        self._mode = mode
        self._bufsize = bufsize
        self._encoding = encoding
        self._errors = errors

    def __call__(self, string: str) -> typing.IO:  # type: ignore[type-arg]
        """Open ``string`` with the configured options.

        :raises ValueError: For "-" with a mode that neither reads nor writes.
        :raises argparse.ArgumentTypeError: When the file cannot be opened; the
            originating OSError is attached as the exception cause.
        """
        # the special argument "-" means sys.std{in,out}
        if string == "-":
            if "r" in self._mode:
                return sys.stdin.buffer if "b" in self._mode else sys.stdin
            if any(c in self._mode for c in "wax"):
                return sys.stdout.buffer if "b" in self._mode else sys.stdout
            msg = f'argument "-" with mode {self._mode}'
            raise ValueError(msg)

        # all other arguments are used as file names
        try:
            return open(string, self._mode, self._bufsize, self._encoding, self._errors)
        except OSError as e:
            message = f"can't open '{string}': {e}"
            # Chain explicitly so the original OS error stays the direct cause.
            raise argparse.ArgumentTypeError(message) from e

    def __repr__(self) -> str:
        # Positional arguments left at -1 (the bufsize default) and keyword
        # arguments left at None are omitted from the representation.
        args = self._mode, self._bufsize
        kwargs = [("encoding", self._encoding), ("errors", self._errors)]
        args_str = ", ".join(
            [repr(arg) for arg in args if arg != -1]
            + [f"{kw}={arg!r}" for kw, arg in kwargs if arg is not None]
        )
        return f"{type(self).__name__}({args_str})"
89
+
90
+
91
def _close_files(files: typing.Iterable[typing.IO]) -> None:  # type: ignore[type-arg]
    """Close every file object of ``files`` that is still open."""
    for handle in files:
        if handle.closed is False:
            handle.close()


def _match_to_cli_result(path: str, match: typing.Any, is_preferred: bool) -> CliDetectionResult:
    """Build the CliDetectionResult describing ``match`` for the file at ``path``."""
    return CliDetectionResult(
        path,
        match.encoding,
        match.encoding_aliases,
        [cp for cp in match.could_be_from_charset if cp != match.encoding],
        match.language,
        match.alphabets,
        match.bom,
        match.percent_chaos,
        match.percent_coherence,
        None,
        is_preferred,
    )


def cli_detect(argv: list[str] | None = None) -> int:
    """
    CLI assistant using ARGV and ArgumentParser

    :param argv: Arguments to parse; argparse falls back to sys.argv when None.
    :return: 0 if everything is fine, 1 when the options are inconsistent or
        out of range, 2 when a normalized file could not be written.
    """
    parser = argparse.ArgumentParser(
        description="The Real First Universal Charset Detector. "
        "Discover originating encoding used on text file. "
        "Normalize text to unicode."
    )

    parser.add_argument(
        "files", type=FileType("rb"), nargs="+", help="File(s) to be analysed"
    )
    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        default=False,
        dest="verbose",
        help="Display complementary information about file if any. "
        "Stdout will contain logs about the detection process.",
    )
    parser.add_argument(
        "-a",
        "--with-alternative",
        action="store_true",
        default=False,
        dest="alternatives",
        help="Output complementary possibilities if any. Top-level JSON WILL be a list.",
    )
    parser.add_argument(
        "-n",
        "--normalize",
        action="store_true",
        default=False,
        dest="normalize",
        help="Permit to normalize input file. If not set, program does not write anything.",
    )
    parser.add_argument(
        "-m",
        "--minimal",
        action="store_true",
        default=False,
        dest="minimal",
        help="Only output the charset detected to STDOUT. Disabling JSON output.",
    )
    parser.add_argument(
        "-r",
        "--replace",
        action="store_true",
        default=False,
        dest="replace",
        help="Replace file when trying to normalize it instead of creating a new one.",
    )
    parser.add_argument(
        "-f",
        "--force",
        action="store_true",
        default=False,
        dest="force",
        help="Replace file without asking if you are sure, use this flag with caution.",
    )
    parser.add_argument(
        "-i",
        "--no-preemptive",
        action="store_true",
        default=False,
        dest="no_preemptive",
        help="Disable looking at a charset declaration to hint the detector.",
    )
    parser.add_argument(
        "-t",
        "--threshold",
        action="store",
        default=0.2,
        type=float,
        dest="threshold",
        help="Define a custom maximum amount of noise allowed in decoded content. 0. <= noise <= 1.",
    )
    parser.add_argument(
        "--version",
        action="version",
        version="Charset-Normalizer {} - Python {} - Unicode {} - SpeedUp {}".format(
            __version__,
            python_version(),
            unidata_version,
            "OFF" if md_module.__file__.lower().endswith(".py") else "ON",
        ),
        help="Show version information and exit.",
    )

    args = parser.parse_args(argv)

    # Option sanity checks; the first failing rule wins. argparse already
    # opened every input file, so they are closed before bailing out.
    usage_error: str | None = None

    if args.replace is True and args.normalize is False:
        usage_error = "Use --replace in addition of --normalize only."
    elif args.force is True and args.replace is False:
        usage_error = "Use --force in addition of --replace only."
    elif args.threshold < 0.0 or args.threshold > 1.0:
        usage_error = "--threshold VALUE should be between 0. AND 1."

    if usage_error is not None:
        _close_files(args.files)
        print(usage_error, file=sys.stderr)
        return 1

    x_ = []

    for my_file in args.files:
        matches = from_fp(
            my_file,
            threshold=args.threshold,
            explain=args.verbose,
            preemptive_behaviour=args.no_preemptive is False,
        )

        best_guess = matches.best()

        if best_guess is None:
            print(
                'Unable to identify originating encoding for "{}". {}'.format(
                    my_file.name,
                    (
                        "Maybe try increasing maximum amount of chaos."
                        if args.threshold < 1.0
                        else ""
                    ),
                ),
                file=sys.stderr,
            )
            x_.append(
                CliDetectionResult(
                    abspath(my_file.name),
                    None,
                    [],
                    [],
                    "Unknown",
                    [],
                    False,
                    1.0,
                    0.0,
                    None,
                    True,
                )
            )
        else:
            cli_result = _match_to_cli_result(abspath(my_file.name), best_guess, True)
            x_.append(cli_result)

            if len(matches) > 1 and args.alternatives:
                for el in matches:
                    if el != best_guess:
                        x_.append(
                            _match_to_cli_result(abspath(my_file.name), el, False)
                        )

            if args.normalize is True:
                if best_guess.encoding.startswith("utf") is True:
                    print(
                        '"{}" file does not need to be normalized, as it already came from unicode.'.format(
                            my_file.name
                        ),
                        file=sys.stderr,
                    )
                    if my_file.closed is False:
                        my_file.close()
                    continue

                dir_path = dirname(realpath(my_file.name))
                file_name = basename(realpath(my_file.name))

                o_: list[str] = file_name.split(".")

                if args.replace is False:
                    # New file: the detected encoding is inserted before the
                    # last dot-separated part of the name.
                    o_.insert(-1, best_guess.encoding)
                    if my_file.closed is False:
                        my_file.close()
                elif (
                    args.force is False
                    and query_yes_no(
                        'Are you sure to normalize "{}" by replacing it ?'.format(
                            my_file.name
                        ),
                        "no",
                    )
                    is False
                ):
                    if my_file.closed is False:
                        my_file.close()
                    continue

                try:
                    cli_result.unicode_path = join(dir_path, ".".join(o_))

                    with open(cli_result.unicode_path, "wb") as fp:
                        fp.write(best_guess.output())
                except OSError as e:  # Defensive:
                    print(str(e), file=sys.stderr)
                    # Aborting: release the current file and every input that
                    # was opened by argparse but not processed yet.
                    _close_files(args.files)
                    return 2

        if my_file.closed is False:
            my_file.close()

    if args.minimal is False:
        print(
            dumps(
                [el.__dict__ for el in x_] if len(x_) > 1 else x_[0].__dict__,
                ensure_ascii=True,
                indent=4,
            )
        )
    else:
        for my_file in args.files:
            print(
                ", ".join(
                    [
                        el.encoding or "undefined"
                        for el in x_
                        if el.path == abspath(my_file.name)
                    ]
                )
            )

    return 0
359
+
360
+
361
if __name__ == "__main__":  # Defensive:
    # Propagate cli_detect's status code to the shell instead of dropping it.
    raise SystemExit(cli_detect())
.venv/lib/python3.14/site-packages/charset_normalizer/constant.py ADDED
@@ -0,0 +1,2050 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from codecs import BOM_UTF8, BOM_UTF16_BE, BOM_UTF16_LE, BOM_UTF32_BE, BOM_UTF32_LE
4
+ from encodings.aliases import aliases
5
+ from re import IGNORECASE
6
+ from re import compile as re_compile
7
+
8
# Maps each eligible encoding to its byte signature (SIG/BOM): either a single
# bytes value or a list of accepted alternatives.
ENCODING_MARKS: dict[str, bytes | list[bytes]] = {
    "utf_8": BOM_UTF8,
    # All utf_7 marks share the three-byte prefix 2B 2F 76 and differ only by
    # what follows it.
    "utf_7": [
        b"\x2b\x2f\x76" + suffix
        for suffix in (b"\x38\x2d", b"\x38", b"\x39", b"\x2b", b"\x2f")
    ],
    "gb18030": b"\x84\x31\x95\x33",
    "utf_32": [BOM_UTF32_BE, BOM_UTF32_LE],
    "utf_16": [BOM_UTF16_BE, BOM_UTF16_LE],
}

# Size bounds, expressed as plain integers.
TOO_SMALL_SEQUENCE: int = 32
TOO_BIG_SEQUENCE: int = 10_000_000

UTF8_MAXIMAL_ALLOCATION: int = 1_112_064
27
+
28
+ # Up-to-date Unicode ucd/17.0.0
29
+ UNICODE_RANGES_COMBINED: dict[str, range] = {
30
+ "Control character": range(32),
31
+ "Basic Latin": range(32, 128),
32
+ "Latin-1 Supplement": range(128, 256),
33
+ "Latin Extended-A": range(256, 384),
34
+ "Latin Extended-B": range(384, 592),
35
+ "IPA Extensions": range(592, 688),
36
+ "Spacing Modifier Letters": range(688, 768),
37
+ "Combining Diacritical Marks": range(768, 880),
38
+ "Greek and Coptic": range(880, 1024),
39
+ "Cyrillic": range(1024, 1280),
40
+ "Cyrillic Supplement": range(1280, 1328),
41
+ "Armenian": range(1328, 1424),
42
+ "Hebrew": range(1424, 1536),
43
+ "Arabic": range(1536, 1792),
44
+ "Syriac": range(1792, 1872),
45
+ "Arabic Supplement": range(1872, 1920),
46
+ "Thaana": range(1920, 1984),
47
+ "NKo": range(1984, 2048),
48
+ "Samaritan": range(2048, 2112),
49
+ "Mandaic": range(2112, 2144),
50
+ "Syriac Supplement": range(2144, 2160),
51
+ "Arabic Extended-B": range(2160, 2208),
52
+ "Arabic Extended-A": range(2208, 2304),
53
+ "Devanagari": range(2304, 2432),
54
+ "Bengali": range(2432, 2560),
55
+ "Gurmukhi": range(2560, 2688),
56
+ "Gujarati": range(2688, 2816),
57
+ "Oriya": range(2816, 2944),
58
+ "Tamil": range(2944, 3072),
59
+ "Telugu": range(3072, 3200),
60
+ "Kannada": range(3200, 3328),
61
+ "Malayalam": range(3328, 3456),
62
+ "Sinhala": range(3456, 3584),
63
+ "Thai": range(3584, 3712),
64
+ "Lao": range(3712, 3840),
65
+ "Tibetan": range(3840, 4096),
66
+ "Myanmar": range(4096, 4256),
67
+ "Georgian": range(4256, 4352),
68
+ "Hangul Jamo": range(4352, 4608),
69
+ "Ethiopic": range(4608, 4992),
70
+ "Ethiopic Supplement": range(4992, 5024),
71
+ "Cherokee": range(5024, 5120),
72
+ "Unified Canadian Aboriginal Syllabics": range(5120, 5760),
73
+ "Ogham": range(5760, 5792),
74
+ "Runic": range(5792, 5888),
75
+ "Tagalog": range(5888, 5920),
76
+ "Hanunoo": range(5920, 5952),
77
+ "Buhid": range(5952, 5984),
78
+ "Tagbanwa": range(5984, 6016),
79
+ "Khmer": range(6016, 6144),
80
+ "Mongolian": range(6144, 6320),
81
+ "Unified Canadian Aboriginal Syllabics Extended": range(6320, 6400),
82
+ "Limbu": range(6400, 6480),
83
+ "Tai Le": range(6480, 6528),
84
+ "New Tai Lue": range(6528, 6624),
85
+ "Khmer Symbols": range(6624, 6656),
86
+ "Buginese": range(6656, 6688),
87
+ "Tai Tham": range(6688, 6832),
88
+ "Combining Diacritical Marks Extended": range(6832, 6912),
89
+ "Balinese": range(6912, 7040),
90
+ "Sundanese": range(7040, 7104),
91
+ "Batak": range(7104, 7168),
92
+ "Lepcha": range(7168, 7248),
93
+ "Ol Chiki": range(7248, 7296),
94
+ "Cyrillic Extended-C": range(7296, 7312),
95
+ "Georgian Extended": range(7312, 7360),
96
+ "Sundanese Supplement": range(7360, 7376),
97
+ "Vedic Extensions": range(7376, 7424),
98
+ "Phonetic Extensions": range(7424, 7552),
99
+ "Phonetic Extensions Supplement": range(7552, 7616),
100
+ "Combining Diacritical Marks Supplement": range(7616, 7680),
101
+ "Latin Extended Additional": range(7680, 7936),
102
+ "Greek Extended": range(7936, 8192),
103
+ "General Punctuation": range(8192, 8304),
104
+ "Superscripts and Subscripts": range(8304, 8352),
105
+ "Currency Symbols": range(8352, 8400),
106
+ "Combining Diacritical Marks for Symbols": range(8400, 8448),
107
+ "Letterlike Symbols": range(8448, 8528),
108
+ "Number Forms": range(8528, 8592),
109
+ "Arrows": range(8592, 8704),
110
+ "Mathematical Operators": range(8704, 8960),
111
+ "Miscellaneous Technical": range(8960, 9216),
112
+ "Control Pictures": range(9216, 9280),
113
+ "Optical Character Recognition": range(9280, 9312),
114
+ "Enclosed Alphanumerics": range(9312, 9472),
115
+ "Box Drawing": range(9472, 9600),
116
+ "Block Elements": range(9600, 9632),
117
+ "Geometric Shapes": range(9632, 9728),
118
+ "Miscellaneous Symbols": range(9728, 9984),
119
+ "Dingbats": range(9984, 10176),
120
+ "Miscellaneous Mathematical Symbols-A": range(10176, 10224),
121
+ "Supplemental Arrows-A": range(10224, 10240),
122
+ "Braille Patterns": range(10240, 10496),
123
+ "Supplemental Arrows-B": range(10496, 10624),
124
+ "Miscellaneous Mathematical Symbols-B": range(10624, 10752),
125
+ "Supplemental Mathematical Operators": range(10752, 11008),
126
+ "Miscellaneous Symbols and Arrows": range(11008, 11264),
127
+ "Glagolitic": range(11264, 11360),
128
+ "Latin Extended-C": range(11360, 11392),
129
+ "Coptic": range(11392, 11520),
130
+ "Georgian Supplement": range(11520, 11568),
131
+ "Tifinagh": range(11568, 11648),
132
+ "Ethiopic Extended": range(11648, 11744),
133
+ "Cyrillic Extended-A": range(11744, 11776),
134
+ "Supplemental Punctuation": range(11776, 11904),
135
+ "CJK Radicals Supplement": range(11904, 12032),
136
+ "Kangxi Radicals": range(12032, 12256),
137
+ "Ideographic Description Characters": range(12272, 12288),
138
+ "CJK Symbols and Punctuation": range(12288, 12352),
139
+ "Hiragana": range(12352, 12448),
140
+ "Katakana": range(12448, 12544),
141
+ "Bopomofo": range(12544, 12592),
142
+ "Hangul Compatibility Jamo": range(12592, 12688),
143
+ "Kanbun": range(12688, 12704),
144
+ "Bopomofo Extended": range(12704, 12736),
145
+ "CJK Strokes": range(12736, 12784),
146
+ "Katakana Phonetic Extensions": range(12784, 12800),
147
+ "Enclosed CJK Letters and Months": range(12800, 13056),
148
+ "CJK Compatibility": range(13056, 13312),
149
+ "CJK Unified Ideographs Extension A": range(13312, 19904),
150
+ "Yijing Hexagram Symbols": range(19904, 19968),
151
+ "CJK Unified Ideographs": range(19968, 40960),
152
+ "Yi Syllables": range(40960, 42128),
153
+ "Yi Radicals": range(42128, 42192),
154
+ "Lisu": range(42192, 42240),
155
+ "Vai": range(42240, 42560),
156
+ "Cyrillic Extended-B": range(42560, 42656),
157
+ "Bamum": range(42656, 42752),
158
+ "Modifier Tone Letters": range(42752, 42784),
159
+ "Latin Extended-D": range(42784, 43008),
160
+ "Syloti Nagri": range(43008, 43056),
161
+ "Common Indic Number Forms": range(43056, 43072),
162
+ "Phags-pa": range(43072, 43136),
163
+ "Saurashtra": range(43136, 43232),
164
+ "Devanagari Extended": range(43232, 43264),
165
+ "Kayah Li": range(43264, 43312),
166
+ "Rejang": range(43312, 43360),
167
+ "Hangul Jamo Extended-A": range(43360, 43392),
168
+ "Javanese": range(43392, 43488),
169
+ "Myanmar Extended-B": range(43488, 43520),
170
+ "Cham": range(43520, 43616),
171
+ "Myanmar Extended-A": range(43616, 43648),
172
+ "Tai Viet": range(43648, 43744),
173
+ "Meetei Mayek Extensions": range(43744, 43776),
174
+ "Ethiopic Extended-A": range(43776, 43824),
175
+ "Latin Extended-E": range(43824, 43888),
176
+ "Cherokee Supplement": range(43888, 43968),
177
+ "Meetei Mayek": range(43968, 44032),
178
+ "Hangul Syllables": range(44032, 55216),
179
+ "Hangul Jamo Extended-B": range(55216, 55296),
180
+ "High Surrogates": range(55296, 56192),
181
+ "High Private Use Surrogates": range(56192, 56320),
182
+ "Low Surrogates": range(56320, 57344),
183
+ "Private Use Area": range(57344, 63744),
184
+ "CJK Compatibility Ideographs": range(63744, 64256),
185
+ "Alphabetic Presentation Forms": range(64256, 64336),
186
+ "Arabic Presentation Forms-A": range(64336, 65024),
187
+ "Variation Selectors": range(65024, 65040),
188
+ "Vertical Forms": range(65040, 65056),
189
+ "Combining Half Marks": range(65056, 65072),
190
+ "CJK Compatibility Forms": range(65072, 65104),
191
+ "Small Form Variants": range(65104, 65136),
192
+ "Arabic Presentation Forms-B": range(65136, 65280),
193
+ "Halfwidth and Fullwidth Forms": range(65280, 65520),
194
+ "Specials": range(65520, 65536),
195
+ "Linear B Syllabary": range(65536, 65664),
196
+ "Linear B Ideograms": range(65664, 65792),
197
+ "Aegean Numbers": range(65792, 65856),
198
+ "Ancient Greek Numbers": range(65856, 65936),
199
+ "Ancient Symbols": range(65936, 66000),
200
+ "Phaistos Disc": range(66000, 66048),
201
+ "Lycian": range(66176, 66208),
202
+ "Carian": range(66208, 66272),
203
+ "Coptic Epact Numbers": range(66272, 66304),
204
+ "Old Italic": range(66304, 66352),
205
+ "Gothic": range(66352, 66384),
206
+ "Old Permic": range(66384, 66432),
207
+ "Ugaritic": range(66432, 66464),
208
+ "Old Persian": range(66464, 66528),
209
+ "Deseret": range(66560, 66640),
210
+ "Shavian": range(66640, 66688),
211
+ "Osmanya": range(66688, 66736),
212
+ "Osage": range(66736, 66816),
213
+ "Elbasan": range(66816, 66864),
214
+ "Caucasian Albanian": range(66864, 66928),
215
+ "Vithkuqi": range(66928, 67008),
216
+ "Todhri": range(67008, 67072),
217
+ "Linear A": range(67072, 67456),
218
+ "Latin Extended-F": range(67456, 67520),
219
+ "Cypriot Syllabary": range(67584, 67648),
220
+ "Imperial Aramaic": range(67648, 67680),
221
+ "Palmyrene": range(67680, 67712),
222
+ "Nabataean": range(67712, 67760),
223
+ "Hatran": range(67808, 67840),
224
+ "Phoenician": range(67840, 67872),
225
+ "Lydian": range(67872, 67904),
226
+ "Sidetic": range(67904, 67936),
227
+ "Meroitic Hieroglyphs": range(67968, 68000),
228
+ "Meroitic Cursive": range(68000, 68096),
229
+ "Kharoshthi": range(68096, 68192),
230
+ "Old South Arabian": range(68192, 68224),
231
+ "Old North Arabian": range(68224, 68256),
232
+ "Manichaean": range(68288, 68352),
233
+ "Avestan": range(68352, 68416),
234
+ "Inscriptional Parthian": range(68416, 68448),
235
+ "Inscriptional Pahlavi": range(68448, 68480),
236
+ "Psalter Pahlavi": range(68480, 68528),
237
+ "Old Turkic": range(68608, 68688),
238
+ "Old Hungarian": range(68736, 68864),
239
+ "Hanifi Rohingya": range(68864, 68928),
240
+ "Garay": range(68928, 69008),
241
+ "Rumi Numeral Symbols": range(69216, 69248),
242
+ "Yezidi": range(69248, 69312),
243
+ "Arabic Extended-C": range(69312, 69376),
244
+ "Old Sogdian": range(69376, 69424),
245
+ "Sogdian": range(69424, 69488),
246
+ "Old Uyghur": range(69488, 69552),
247
+ "Chorasmian": range(69552, 69600),
248
+ "Elymaic": range(69600, 69632),
249
+ "Brahmi": range(69632, 69760),
250
+ "Kaithi": range(69760, 69840),
251
+ "Sora Sompeng": range(69840, 69888),
252
+ "Chakma": range(69888, 69968),
253
+ "Mahajani": range(69968, 70016),
254
+ "Sharada": range(70016, 70112),
255
+ "Sinhala Archaic Numbers": range(70112, 70144),
256
+ "Khojki": range(70144, 70224),
257
+ "Multani": range(70272, 70320),
258
+ "Khudawadi": range(70320, 70400),
259
+ "Grantha": range(70400, 70528),
260
+ "Tulu-Tigalari": range(70528, 70656),
261
+ "Newa": range(70656, 70784),
262
+ "Tirhuta": range(70784, 70880),
263
+ "Siddham": range(71040, 71168),
264
+ "Modi": range(71168, 71264),
265
+ "Mongolian Supplement": range(71264, 71296),
266
+ "Takri": range(71296, 71376),
267
+ "Myanmar Extended-C": range(71376, 71424),
268
+ "Ahom": range(71424, 71504),
269
+ "Dogra": range(71680, 71760),
270
+ "Warang Citi": range(71840, 71936),
271
+ "Dives Akuru": range(71936, 72032),
272
+ "Nandinagari": range(72096, 72192),
273
+ "Zanabazar Square": range(72192, 72272),
274
+ "Soyombo": range(72272, 72368),
275
+ "Unified Canadian Aboriginal Syllabics Extended-A": range(72368, 72384),
276
+ "Pau Cin Hau": range(72384, 72448),
277
+ "Devanagari Extended-A": range(72448, 72544),
278
+ "Sharada Supplement": range(72544, 72576),
279
+ "Sunuwar": range(72640, 72704),
280
+ "Bhaiksuki": range(72704, 72816),
281
+ "Marchen": range(72816, 72896),
282
+ "Masaram Gondi": range(72960, 73056),
283
+ "Gunjala Gondi": range(73056, 73136),
284
+ "Tolong Siki": range(73136, 73200),
285
+ "Makasar": range(73440, 73472),
286
+ "Kawi": range(73472, 73568),
287
+ "Lisu Supplement": range(73648, 73664),
288
+ "Tamil Supplement": range(73664, 73728),
289
+ "Cuneiform": range(73728, 74752),
290
+ "Cuneiform Numbers and Punctuation": range(74752, 74880),
291
+ "Early Dynastic Cuneiform": range(74880, 75088),
292
+ "Cypro-Minoan": range(77712, 77824),
293
+ "Egyptian Hieroglyphs": range(77824, 78896),
294
+ "Egyptian Hieroglyph Format Controls": range(78896, 78944),
295
+ "Egyptian Hieroglyphs Extended-A": range(78944, 82944),
296
+ "Anatolian Hieroglyphs": range(82944, 83584),
297
+ "Gurung Khema": range(90368, 90432),
298
+ "Bamum Supplement": range(92160, 92736),
299
+ "Mro": range(92736, 92784),
300
+ "Tangsa": range(92784, 92880),
301
+ "Bassa Vah": range(92880, 92928),
302
+ "Pahawh Hmong": range(92928, 93072),
303
+ "Kirat Rai": range(93504, 93568),
304
+ "Medefaidrin": range(93760, 93856),
305
+ "Beria Erfe": range(93856, 93920),
306
+ "Miao": range(93952, 94112),
307
+ "Ideographic Symbols and Punctuation": range(94176, 94208),
308
+ "Tangut": range(94208, 100352),
309
+ "Tangut Components": range(100352, 101120),
310
+ "Khitan Small Script": range(101120, 101632),
311
+ "Tangut Supplement": range(101632, 101760),
312
+ "Tangut Components Supplement": range(101760, 101888),
313
+ "Kana Extended-B": range(110576, 110592),
314
+ "Kana Supplement": range(110592, 110848),
315
+ "Kana Extended-A": range(110848, 110896),
316
+ "Small Kana Extension": range(110896, 110960),
317
+ "Nushu": range(110960, 111360),
318
+ "Duployan": range(113664, 113824),
319
+ "Shorthand Format Controls": range(113824, 113840),
320
+ "Symbols for Legacy Computing Supplement": range(117760, 118464),
321
+ "Miscellaneous Symbols Supplement": range(118464, 118528),
322
+ "Znamenny Musical Notation": range(118528, 118736),
323
+ "Byzantine Musical Symbols": range(118784, 119040),
324
+ "Musical Symbols": range(119040, 119296),
325
+ "Ancient Greek Musical Notation": range(119296, 119376),
326
+ "Kaktovik Numerals": range(119488, 119520),
327
+ "Mayan Numerals": range(119520, 119552),
328
+ "Tai Xuan Jing Symbols": range(119552, 119648),
329
+ "Counting Rod Numerals": range(119648, 119680),
330
+ "Mathematical Alphanumeric Symbols": range(119808, 120832),
331
+ "Sutton SignWriting": range(120832, 121520),
332
+ "Latin Extended-G": range(122624, 122880),
333
+ "Glagolitic Supplement": range(122880, 122928),
334
+ "Cyrillic Extended-D": range(122928, 123024),
335
+ "Nyiakeng Puachue Hmong": range(123136, 123216),
336
+ "Toto": range(123536, 123584),
337
+ "Wancho": range(123584, 123648),
338
+ "Nag Mundari": range(124112, 124160),
339
+ "Ol Onal": range(124368, 124416),
340
+ "Tai Yo": range(124608, 124672),
341
+ "Ethiopic Extended-B": range(124896, 124928),
342
+ "Mende Kikakui": range(124928, 125152),
343
+ "Adlam": range(125184, 125280),
344
+ "Indic Siyaq Numbers": range(126064, 126144),
345
+ "Ottoman Siyaq Numbers": range(126208, 126288),
346
+ "Arabic Mathematical Alphabetic Symbols": range(126464, 126720),
347
+ "Mahjong Tiles": range(126976, 127024),
348
+ "Domino Tiles": range(127024, 127136),
349
+ "Playing Cards": range(127136, 127232),
350
+ "Enclosed Alphanumeric Supplement": range(127232, 127488),
351
+ "Enclosed Ideographic Supplement": range(127488, 127744),
352
+ "Miscellaneous Symbols and Pictographs": range(127744, 128512),
353
+ "Emoticons": range(128512, 128592),
354
+ "Ornamental Dingbats": range(128592, 128640),
355
+ "Transport and Map Symbols": range(128640, 128768),
356
+ "Alchemical Symbols": range(128768, 128896),
357
+ "Geometric Shapes Extended": range(128896, 129024),
358
+ "Supplemental Arrows-C": range(129024, 129280),
359
+ "Supplemental Symbols and Pictographs": range(129280, 129536),
360
+ "Chess Symbols": range(129536, 129648),
361
+ "Symbols and Pictographs Extended-A": range(129648, 129792),
362
+ "Symbols for Legacy Computing": range(129792, 130048),
363
+ "CJK Unified Ideographs Extension B": range(131072, 173792),
364
+ "CJK Unified Ideographs Extension C": range(173824, 177984),
365
+ "CJK Unified Ideographs Extension D": range(177984, 178208),
366
+ "CJK Unified Ideographs Extension E": range(178208, 183984),
367
+ "CJK Unified Ideographs Extension F": range(183984, 191472),
368
+ "CJK Unified Ideographs Extension I": range(191472, 192096),
369
+ "CJK Compatibility Ideographs Supplement": range(194560, 195104),
370
+ "CJK Unified Ideographs Extension G": range(196608, 201552),
371
+ "CJK Unified Ideographs Extension H": range(201552, 205744),
372
+ "CJK Unified Ideographs Extension J": range(205744, 210048),
373
+ "Tags": range(917504, 917632),
374
+ "Variation Selectors Supplement": range(917760, 918000),
375
+ "Supplementary Private Use Area-A": range(983040, 1048576),
376
+ "Supplementary Private Use Area-B": range(1048576, 1114112),
377
+ }
378
+
379
+
380
+ UNICODE_SECONDARY_RANGE_KEYWORD: list[str] = [
381
+ "Supplement",
382
+ "Extended",
383
+ "Extensions",
384
+ "Modifier",
385
+ "Marks",
386
+ "Punctuation",
387
+ "Symbols",
388
+ "Forms",
389
+ "Operators",
390
+ "Miscellaneous",
391
+ "Drawing",
392
+ "Block",
393
+ "Shapes",
394
+ "Supplemental",
395
+ "Tags",
396
+ ]
397
+
398
+ RE_POSSIBLE_ENCODING_INDICATION = re_compile(
399
+ r"(?:(?:encoding)|(?:charset)|(?:coding))(?:[\:= ]{1,10})(?:[\"\']?)([a-zA-Z0-9\-_]+)(?:[\"\']?)",
400
+ IGNORECASE,
401
+ )
402
+
403
+ IANA_NO_ALIASES = [
404
+ "cp720",
405
+ "cp737",
406
+ "cp856",
407
+ "cp874",
408
+ "cp875",
409
+ "cp1006",
410
+ "koi8_r",
411
+ "koi8_t",
412
+ "koi8_u",
413
+ ]
414
+
415
+ IANA_SUPPORTED: list[str] = sorted(
416
+ filter(
417
+ lambda x: x.endswith("_codec") is False
418
+ and x not in {"rot_13", "tactis", "mbcs"},
419
+ list(set(aliases.values())) + IANA_NO_ALIASES,
420
+ )
421
+ )
422
+
423
+ IANA_SUPPORTED_COUNT: int = len(IANA_SUPPORTED)
424
+
425
+ # pre-computed code page that are similar using the function cp_similarity.
426
+ IANA_SUPPORTED_SIMILAR: dict[str, list[str]] = {
427
+ "cp037": ["cp1026", "cp1140", "cp273", "cp500"],
428
+ "cp1026": ["cp037", "cp1140", "cp273", "cp500"],
429
+ "cp1125": ["cp866"],
430
+ "cp1140": ["cp037", "cp1026", "cp273", "cp500"],
431
+ "cp1250": ["iso8859_2"],
432
+ "cp1251": ["kz1048", "ptcp154"],
433
+ "cp1252": ["iso8859_15", "iso8859_9", "latin_1"],
434
+ "cp1253": ["iso8859_7"],
435
+ "cp1254": ["iso8859_15", "iso8859_9", "latin_1"],
436
+ "cp1257": ["iso8859_13"],
437
+ "cp273": ["cp037", "cp1026", "cp1140", "cp500"],
438
+ "cp437": ["cp850", "cp858", "cp860", "cp861", "cp862", "cp863", "cp865"],
439
+ "cp500": ["cp037", "cp1026", "cp1140", "cp273"],
440
+ "cp850": ["cp437", "cp857", "cp858", "cp865"],
441
+ "cp857": ["cp850", "cp858", "cp865"],
442
+ "cp858": ["cp437", "cp850", "cp857", "cp865"],
443
+ "cp860": ["cp437", "cp861", "cp862", "cp863", "cp865"],
444
+ "cp861": ["cp437", "cp860", "cp862", "cp863", "cp865"],
445
+ "cp862": ["cp437", "cp860", "cp861", "cp863", "cp865"],
446
+ "cp863": ["cp437", "cp860", "cp861", "cp862", "cp865"],
447
+ "cp865": ["cp437", "cp850", "cp857", "cp858", "cp860", "cp861", "cp862", "cp863"],
448
+ "cp866": ["cp1125"],
449
+ "iso8859_10": ["iso8859_14", "iso8859_15", "iso8859_4", "iso8859_9", "latin_1"],
450
+ "iso8859_11": ["tis_620"],
451
+ "iso8859_13": ["cp1257"],
452
+ "iso8859_14": [
453
+ "iso8859_10",
454
+ "iso8859_15",
455
+ "iso8859_16",
456
+ "iso8859_3",
457
+ "iso8859_9",
458
+ "latin_1",
459
+ ],
460
+ "iso8859_15": [
461
+ "cp1252",
462
+ "cp1254",
463
+ "iso8859_10",
464
+ "iso8859_14",
465
+ "iso8859_16",
466
+ "iso8859_3",
467
+ "iso8859_9",
468
+ "latin_1",
469
+ ],
470
+ "iso8859_16": [
471
+ "iso8859_14",
472
+ "iso8859_15",
473
+ "iso8859_2",
474
+ "iso8859_3",
475
+ "iso8859_9",
476
+ "latin_1",
477
+ ],
478
+ "iso8859_2": ["cp1250", "iso8859_16", "iso8859_4"],
479
+ "iso8859_3": ["iso8859_14", "iso8859_15", "iso8859_16", "iso8859_9", "latin_1"],
480
+ "iso8859_4": ["iso8859_10", "iso8859_2", "iso8859_9", "latin_1"],
481
+ "iso8859_7": ["cp1253"],
482
+ "iso8859_9": [
483
+ "cp1252",
484
+ "cp1254",
485
+ "cp1258",
486
+ "iso8859_10",
487
+ "iso8859_14",
488
+ "iso8859_15",
489
+ "iso8859_16",
490
+ "iso8859_3",
491
+ "iso8859_4",
492
+ "latin_1",
493
+ ],
494
+ "kz1048": ["cp1251", "ptcp154"],
495
+ "latin_1": [
496
+ "cp1252",
497
+ "cp1254",
498
+ "cp1258",
499
+ "iso8859_10",
500
+ "iso8859_14",
501
+ "iso8859_15",
502
+ "iso8859_16",
503
+ "iso8859_3",
504
+ "iso8859_4",
505
+ "iso8859_9",
506
+ ],
507
+ "mac_iceland": ["mac_roman", "mac_turkish"],
508
+ "mac_roman": ["mac_iceland", "mac_turkish"],
509
+ "mac_turkish": ["mac_iceland", "mac_roman"],
510
+ "ptcp154": ["cp1251", "kz1048"],
511
+ "tis_620": ["iso8859_11"],
512
+ }
513
+
514
+
515
+ CHARDET_CORRESPONDENCE: dict[str, str] = {
516
+ "iso2022_kr": "ISO-2022-KR",
517
+ "iso2022_jp": "ISO-2022-JP",
518
+ "euc_kr": "EUC-KR",
519
+ "tis_620": "TIS-620",
520
+ "utf_32": "UTF-32",
521
+ "euc_jp": "EUC-JP",
522
+ "koi8_r": "KOI8-R",
523
+ "iso8859_1": "ISO-8859-1",
524
+ "iso8859_2": "ISO-8859-2",
525
+ "iso8859_5": "ISO-8859-5",
526
+ "iso8859_6": "ISO-8859-6",
527
+ "iso8859_7": "ISO-8859-7",
528
+ "iso8859_8": "ISO-8859-8",
529
+ "utf_16": "UTF-16",
530
+ "cp855": "IBM855",
531
+ "mac_cyrillic": "MacCyrillic",
532
+ "gb2312": "GB2312",
533
+ "gb18030": "GB18030",
534
+ "cp932": "CP932",
535
+ "cp866": "IBM866",
536
+ "utf_8": "utf-8",
537
+ "utf_8_sig": "UTF-8-SIG",
538
+ "shift_jis": "SHIFT_JIS",
539
+ "big5": "Big5",
540
+ "cp1250": "windows-1250",
541
+ "cp1251": "windows-1251",
542
+ "cp1252": "Windows-1252",
543
+ "cp1253": "windows-1253",
544
+ "cp1255": "windows-1255",
545
+ "cp1256": "windows-1256",
546
+ "cp1254": "Windows-1254",
547
+ "cp949": "CP949",
548
+ }
549
+
550
+
551
+ COMMON_SAFE_ASCII_CHARACTERS: frozenset[str] = frozenset(
552
+ {
553
+ "<",
554
+ ">",
555
+ "=",
556
+ ":",
557
+ "/",
558
+ "&",
559
+ ";",
560
+ "{",
561
+ "}",
562
+ "[",
563
+ "]",
564
+ ",",
565
+ "|",
566
+ '"',
567
+ "-",
568
+ "(",
569
+ ")",
570
+ }
571
+ )
572
+
573
+ # Sample character sets — replace with full lists if needed
574
+ COMMON_CHINESE_CHARACTERS = "的一是在不了有和人这中大为上个国我以要他时来用们生到作地于出就分对成会可主发年动同工也能下过子说产种面而方后多定行学法所民得经十三之进着等部度家电力里如水化高自二理起小物现实加量都两体制机当使点从业本去把性好应开它合还因由其些然前外天政四日那社义事平形相全表间样与关各重新线内数正心反你明看原又么利比或但质气第向道命此变条只没结解问意建月公无系军很情者最立代想已通并提直题党程展五果料象员革位入常文总次品式活设及管特件长求老头基资边流路级少图山统接知较将组见计别她手角期根论运农指几九区强放决西被干做必战先回则任取据处队南给色光门即保治北造百规热领七海口东导器压志世金增争济阶油思术极交受联什认六共权收证改清己美再采转更单风切打白教速花带安场身车例真务具万每目至达走积示议声报斗完类八离华名确才科张信马节话米整空元况今集温传土许步群广石记需段研界拉林律叫且究观越织装影算低持音众书布复容儿须际商非验连断深难近矿千周委素技备半办青省列习响约支般史感劳便团往酸历市克何除消构府太准精值号率族维划选标写存候毛亲快效斯院查江型眼王按格养易置派层片始却专状育厂京识适属圆包火住调满县局照参红细引听该铁价严龙飞"
575
+
576
+ COMMON_JAPANESE_CHARACTERS = "日一国年大十二本中長出三時行見月分後前生五間上東四今金九入学高円子外八六下来気小七山話女北午百書先名川千水半男西電校語土木聞食車何南万毎白天母火右読友左休父雨"
577
+
578
+ COMMON_KOREAN_CHARACTERS = "一二三四五六七八九十百千萬上下左右中人女子大小山川日月火水木金土父母天地國名年時文校學生"
579
+
580
+ # Combine all into a frozenset
581
+ COMMON_CJK_CHARACTERS = frozenset(
582
+ "".join(
583
+ [
584
+ COMMON_CHINESE_CHARACTERS,
585
+ COMMON_JAPANESE_CHARACTERS,
586
+ COMMON_KOREAN_CHARACTERS,
587
+ ]
588
+ )
589
+ )
590
+
591
+ KO_NAMES: frozenset[str] = frozenset({"johab", "cp949", "euc_kr"})
592
+ ZH_NAMES: frozenset[str] = frozenset({"big5", "cp950", "big5hkscs", "hz"})
593
+
594
+ # Logging LEVEL below DEBUG
595
+ TRACE: int = 5
596
+
597
+
598
+ # Language label that contain the em dash "—"
599
+ # character are to be considered alternative seq to origin
600
+ FREQUENCIES: dict[str, list[str]] = {
601
+ "English": [
602
+ "e",
603
+ "a",
604
+ "t",
605
+ "i",
606
+ "o",
607
+ "n",
608
+ "s",
609
+ "r",
610
+ "h",
611
+ "l",
612
+ "d",
613
+ "c",
614
+ "u",
615
+ "m",
616
+ "f",
617
+ "p",
618
+ "g",
619
+ "w",
620
+ "y",
621
+ "b",
622
+ "v",
623
+ "k",
624
+ "x",
625
+ "j",
626
+ "z",
627
+ "q",
628
+ ],
629
+ "English—": [
630
+ "e",
631
+ "a",
632
+ "t",
633
+ "i",
634
+ "o",
635
+ "n",
636
+ "s",
637
+ "r",
638
+ "h",
639
+ "l",
640
+ "d",
641
+ "c",
642
+ "m",
643
+ "u",
644
+ "f",
645
+ "p",
646
+ "g",
647
+ "w",
648
+ "b",
649
+ "y",
650
+ "v",
651
+ "k",
652
+ "j",
653
+ "x",
654
+ "z",
655
+ "q",
656
+ ],
657
+ "German": [
658
+ "e",
659
+ "n",
660
+ "i",
661
+ "r",
662
+ "s",
663
+ "t",
664
+ "a",
665
+ "d",
666
+ "h",
667
+ "u",
668
+ "l",
669
+ "g",
670
+ "o",
671
+ "c",
672
+ "m",
673
+ "b",
674
+ "f",
675
+ "k",
676
+ "w",
677
+ "z",
678
+ "p",
679
+ "v",
680
+ "ü",
681
+ "ä",
682
+ "ö",
683
+ "j",
684
+ ],
685
+ "French": [
686
+ "e",
687
+ "a",
688
+ "s",
689
+ "n",
690
+ "i",
691
+ "t",
692
+ "r",
693
+ "l",
694
+ "u",
695
+ "o",
696
+ "d",
697
+ "c",
698
+ "p",
699
+ "m",
700
+ "é",
701
+ "v",
702
+ "g",
703
+ "f",
704
+ "b",
705
+ "h",
706
+ "q",
707
+ "à",
708
+ "x",
709
+ "è",
710
+ "y",
711
+ "j",
712
+ ],
713
+ "Dutch": [
714
+ "e",
715
+ "n",
716
+ "a",
717
+ "i",
718
+ "r",
719
+ "t",
720
+ "o",
721
+ "d",
722
+ "s",
723
+ "l",
724
+ "g",
725
+ "h",
726
+ "v",
727
+ "m",
728
+ "u",
729
+ "k",
730
+ "c",
731
+ "p",
732
+ "b",
733
+ "w",
734
+ "j",
735
+ "z",
736
+ "f",
737
+ "y",
738
+ "x",
739
+ "ë",
740
+ ],
741
+ "Italian": [
742
+ "e",
743
+ "i",
744
+ "a",
745
+ "o",
746
+ "n",
747
+ "l",
748
+ "t",
749
+ "r",
750
+ "s",
751
+ "c",
752
+ "d",
753
+ "u",
754
+ "p",
755
+ "m",
756
+ "g",
757
+ "v",
758
+ "f",
759
+ "b",
760
+ "z",
761
+ "h",
762
+ "q",
763
+ "è",
764
+ "à",
765
+ "k",
766
+ "y",
767
+ "ò",
768
+ ],
769
+ "Polish": [
770
+ "a",
771
+ "i",
772
+ "o",
773
+ "e",
774
+ "n",
775
+ "r",
776
+ "z",
777
+ "w",
778
+ "s",
779
+ "c",
780
+ "t",
781
+ "k",
782
+ "y",
783
+ "d",
784
+ "p",
785
+ "m",
786
+ "u",
787
+ "l",
788
+ "j",
789
+ "ł",
790
+ "g",
791
+ "b",
792
+ "h",
793
+ "ą",
794
+ "ę",
795
+ "ó",
796
+ ],
797
+ "Spanish": [
798
+ "e",
799
+ "a",
800
+ "o",
801
+ "n",
802
+ "s",
803
+ "r",
804
+ "i",
805
+ "l",
806
+ "d",
807
+ "t",
808
+ "c",
809
+ "u",
810
+ "m",
811
+ "p",
812
+ "b",
813
+ "g",
814
+ "v",
815
+ "f",
816
+ "y",
817
+ "ó",
818
+ "h",
819
+ "q",
820
+ "í",
821
+ "j",
822
+ "z",
823
+ "á",
824
+ ],
825
+ "Russian": [
826
+ "о",
827
+ "е",
828
+ "а",
829
+ "и",
830
+ "н",
831
+ "т",
832
+ "с",
833
+ "р",
834
+ "в",
835
+ "л",
836
+ "к",
837
+ "м",
838
+ "д",
839
+ "п",
840
+ "у",
841
+ "г",
842
+ "я",
843
+ "ы",
844
+ "з",
845
+ "б",
846
+ "й",
847
+ "ь",
848
+ "ч",
849
+ "х",
850
+ "ж",
851
+ "ц",
852
+ ],
853
+ # Jap-Kanji
854
+ "Japanese": [
855
+ "日",
856
+ "一",
857
+ "人",
858
+ "年",
859
+ "大",
860
+ "十",
861
+ "二",
862
+ "本",
863
+ "中",
864
+ "長",
865
+ "出",
866
+ "三",
867
+ "時",
868
+ "行",
869
+ "見",
870
+ "月",
871
+ "分",
872
+ "後",
873
+ "前",
874
+ "生",
875
+ "五",
876
+ "間",
877
+ "上",
878
+ "東",
879
+ "四",
880
+ "今",
881
+ "金",
882
+ "九",
883
+ "入",
884
+ "学",
885
+ "高",
886
+ "円",
887
+ "子",
888
+ "外",
889
+ "八",
890
+ "六",
891
+ "下",
892
+ "来",
893
+ "気",
894
+ "小",
895
+ "七",
896
+ "山",
897
+ "話",
898
+ "女",
899
+ "北",
900
+ "午",
901
+ "百",
902
+ "書",
903
+ "先",
904
+ "名",
905
+ "川",
906
+ "千",
907
+ "水",
908
+ "半",
909
+ "男",
910
+ "西",
911
+ "電",
912
+ "校",
913
+ "語",
914
+ "土",
915
+ "木",
916
+ "聞",
917
+ "食",
918
+ "車",
919
+ "何",
920
+ "南",
921
+ "万",
922
+ "毎",
923
+ "白",
924
+ "天",
925
+ "母",
926
+ "火",
927
+ "右",
928
+ "読",
929
+ "友",
930
+ "左",
931
+ "休",
932
+ "父",
933
+ "雨",
934
+ ],
935
+ # Jap-Katakana
936
+ "Japanese—": [
937
+ "ー",
938
+ "ン",
939
+ "ス",
940
+ "・",
941
+ "ル",
942
+ "ト",
943
+ "リ",
944
+ "イ",
945
+ "ア",
946
+ "ラ",
947
+ "ッ",
948
+ "ク",
949
+ "ド",
950
+ "シ",
951
+ "レ",
952
+ "ジ",
953
+ "タ",
954
+ "フ",
955
+ "ロ",
956
+ "カ",
957
+ "テ",
958
+ "マ",
959
+ "ィ",
960
+ "グ",
961
+ "バ",
962
+ "ム",
963
+ "プ",
964
+ "オ",
965
+ "コ",
966
+ "デ",
967
+ "ニ",
968
+ "ウ",
969
+ "メ",
970
+ "サ",
971
+ "ビ",
972
+ "ナ",
973
+ "ブ",
974
+ "ャ",
975
+ "エ",
976
+ "ュ",
977
+ "チ",
978
+ "キ",
979
+ "ズ",
980
+ "ダ",
981
+ "パ",
982
+ "ミ",
983
+ "ェ",
984
+ "ョ",
985
+ "ハ",
986
+ "セ",
987
+ "ベ",
988
+ "ガ",
989
+ "モ",
990
+ "ツ",
991
+ "ネ",
992
+ "ボ",
993
+ "ソ",
994
+ "ノ",
995
+ "ァ",
996
+ "ヴ",
997
+ "ワ",
998
+ "ポ",
999
+ "ペ",
1000
+ "ピ",
1001
+ "ケ",
1002
+ "ゴ",
1003
+ "ギ",
1004
+ "ザ",
1005
+ "ホ",
1006
+ "ゲ",
1007
+ "ォ",
1008
+ "ヤ",
1009
+ "ヒ",
1010
+ "ユ",
1011
+ "ヨ",
1012
+ "ヘ",
1013
+ "ゼ",
1014
+ "ヌ",
1015
+ "ゥ",
1016
+ "ゾ",
1017
+ "ヶ",
1018
+ "ヂ",
1019
+ "ヲ",
1020
+ "ヅ",
1021
+ "ヵ",
1022
+ "ヱ",
1023
+ "ヰ",
1024
+ "ヮ",
1025
+ "ヽ",
1026
+ "゠",
1027
+ "ヾ",
1028
+ "ヷ",
1029
+ "ヿ",
1030
+ "ヸ",
1031
+ "ヹ",
1032
+ "ヺ",
1033
+ ],
1034
+ # Jap-Hiragana
1035
+ "Japanese——": [
1036
+ "の",
1037
+ "に",
1038
+ "る",
1039
+ "た",
1040
+ "と",
1041
+ "は",
1042
+ "し",
1043
+ "い",
1044
+ "を",
1045
+ "で",
1046
+ "て",
1047
+ "が",
1048
+ "な",
1049
+ "れ",
1050
+ "か",
1051
+ "ら",
1052
+ "さ",
1053
+ "っ",
1054
+ "り",
1055
+ "す",
1056
+ "あ",
1057
+ "も",
1058
+ "こ",
1059
+ "ま",
1060
+ "う",
1061
+ "く",
1062
+ "よ",
1063
+ "き",
1064
+ "ん",
1065
+ "め",
1066
+ "お",
1067
+ "け",
1068
+ "そ",
1069
+ "つ",
1070
+ "だ",
1071
+ "や",
1072
+ "え",
1073
+ "ど",
1074
+ "わ",
1075
+ "ち",
1076
+ "み",
1077
+ "せ",
1078
+ "じ",
1079
+ "ば",
1080
+ "へ",
1081
+ "び",
1082
+ "ず",
1083
+ "ろ",
1084
+ "ほ",
1085
+ "げ",
1086
+ "む",
1087
+ "べ",
1088
+ "ひ",
1089
+ "ょ",
1090
+ "ゆ",
1091
+ "ぶ",
1092
+ "ご",
1093
+ "ゃ",
1094
+ "ね",
1095
+ "ふ",
1096
+ "ぐ",
1097
+ "ぎ",
1098
+ "ぼ",
1099
+ "ゅ",
1100
+ "づ",
1101
+ "ざ",
1102
+ "ぞ",
1103
+ "ぬ",
1104
+ "ぜ",
1105
+ "ぱ",
1106
+ "ぽ",
1107
+ "ぷ",
1108
+ "ぴ",
1109
+ "ぃ",
1110
+ "ぁ",
1111
+ "ぇ",
1112
+ "ぺ",
1113
+ "ゞ",
1114
+ "ぢ",
1115
+ "ぉ",
1116
+ "ぅ",
1117
+ "ゐ",
1118
+ "ゝ",
1119
+ "ゑ",
1120
+ "゛",
1121
+ "゜",
1122
+ "ゎ",
1123
+ "ゔ",
1124
+ "゚",
1125
+ "ゟ",
1126
+ "゙",
1127
+ "ゕ",
1128
+ "ゖ",
1129
+ ],
1130
+ "Portuguese": [
1131
+ "a",
1132
+ "e",
1133
+ "o",
1134
+ "s",
1135
+ "i",
1136
+ "r",
1137
+ "d",
1138
+ "n",
1139
+ "t",
1140
+ "m",
1141
+ "u",
1142
+ "c",
1143
+ "l",
1144
+ "p",
1145
+ "g",
1146
+ "v",
1147
+ "b",
1148
+ "f",
1149
+ "h",
1150
+ "ã",
1151
+ "q",
1152
+ "é",
1153
+ "ç",
1154
+ "á",
1155
+ "z",
1156
+ "í",
1157
+ ],
1158
+ "Swedish": [
1159
+ "e",
1160
+ "a",
1161
+ "n",
1162
+ "r",
1163
+ "t",
1164
+ "s",
1165
+ "i",
1166
+ "l",
1167
+ "d",
1168
+ "o",
1169
+ "m",
1170
+ "k",
1171
+ "g",
1172
+ "v",
1173
+ "h",
1174
+ "f",
1175
+ "u",
1176
+ "p",
1177
+ "ä",
1178
+ "c",
1179
+ "b",
1180
+ "ö",
1181
+ "å",
1182
+ "y",
1183
+ "j",
1184
+ "x",
1185
+ ],
1186
+ "Chinese": [
1187
+ "的",
1188
+ "一",
1189
+ "是",
1190
+ "不",
1191
+ "了",
1192
+ "在",
1193
+ "人",
1194
+ "有",
1195
+ "我",
1196
+ "他",
1197
+ "这",
1198
+ "个",
1199
+ "们",
1200
+ "中",
1201
+ "来",
1202
+ "上",
1203
+ "大",
1204
+ "为",
1205
+ "和",
1206
+ "国",
1207
+ "地",
1208
+ "到",
1209
+ "以",
1210
+ "说",
1211
+ "时",
1212
+ "要",
1213
+ "就",
1214
+ "出",
1215
+ "会",
1216
+ "可",
1217
+ "也",
1218
+ "你",
1219
+ "对",
1220
+ "生",
1221
+ "能",
1222
+ "而",
1223
+ "子",
1224
+ "那",
1225
+ "得",
1226
+ "于",
1227
+ "着",
1228
+ "下",
1229
+ "自",
1230
+ "之",
1231
+ "年",
1232
+ "过",
1233
+ "发",
1234
+ "后",
1235
+ "作",
1236
+ "里",
1237
+ "用",
1238
+ "道",
1239
+ "行",
1240
+ "所",
1241
+ "然",
1242
+ "家",
1243
+ "种",
1244
+ "事",
1245
+ "成",
1246
+ "方",
1247
+ "多",
1248
+ "经",
1249
+ "么",
1250
+ "去",
1251
+ "法",
1252
+ "学",
1253
+ "如",
1254
+ "都",
1255
+ "同",
1256
+ "现",
1257
+ "当",
1258
+ "没",
1259
+ "动",
1260
+ "面",
1261
+ "起",
1262
+ "看",
1263
+ "定",
1264
+ "天",
1265
+ "分",
1266
+ "还",
1267
+ "进",
1268
+ "好",
1269
+ "小",
1270
+ "部",
1271
+ "其",
1272
+ "些",
1273
+ "主",
1274
+ "样",
1275
+ "理",
1276
+ "心",
1277
+ "她",
1278
+ "本",
1279
+ "前",
1280
+ "开",
1281
+ "但",
1282
+ "因",
1283
+ "只",
1284
+ "从",
1285
+ "想",
1286
+ "实",
1287
+ ],
1288
+ "Ukrainian": [
1289
+ "о",
1290
+ "а",
1291
+ "н",
1292
+ "і",
1293
+ "и",
1294
+ "р",
1295
+ "в",
1296
+ "т",
1297
+ "е",
1298
+ "с",
1299
+ "к",
1300
+ "л",
1301
+ "у",
1302
+ "д",
1303
+ "м",
1304
+ "п",
1305
+ "з",
1306
+ "я",
1307
+ "ь",
1308
+ "б",
1309
+ "г",
1310
+ "й",
1311
+ "ч",
1312
+ "х",
1313
+ "ц",
1314
+ "ї",
1315
+ ],
1316
+ "Norwegian": [
1317
+ "e",
1318
+ "r",
1319
+ "n",
1320
+ "t",
1321
+ "a",
1322
+ "s",
1323
+ "i",
1324
+ "o",
1325
+ "l",
1326
+ "d",
1327
+ "g",
1328
+ "k",
1329
+ "m",
1330
+ "v",
1331
+ "f",
1332
+ "p",
1333
+ "u",
1334
+ "b",
1335
+ "h",
1336
+ "å",
1337
+ "y",
1338
+ "j",
1339
+ "ø",
1340
+ "c",
1341
+ "æ",
1342
+ "w",
1343
+ ],
1344
+ "Finnish": [
1345
+ "a",
1346
+ "i",
1347
+ "n",
1348
+ "t",
1349
+ "e",
1350
+ "s",
1351
+ "l",
1352
+ "o",
1353
+ "u",
1354
+ "k",
1355
+ "ä",
1356
+ "m",
1357
+ "r",
1358
+ "v",
1359
+ "j",
1360
+ "h",
1361
+ "p",
1362
+ "y",
1363
+ "d",
1364
+ "ö",
1365
+ "g",
1366
+ "c",
1367
+ "b",
1368
+ "f",
1369
+ "w",
1370
+ "z",
1371
+ ],
1372
+ "Vietnamese": [
1373
+ "n",
1374
+ "h",
1375
+ "t",
1376
+ "i",
1377
+ "c",
1378
+ "g",
1379
+ "a",
1380
+ "o",
1381
+ "u",
1382
+ "m",
1383
+ "l",
1384
+ "r",
1385
+ "à",
1386
+ "đ",
1387
+ "s",
1388
+ "e",
1389
+ "v",
1390
+ "p",
1391
+ "b",
1392
+ "y",
1393
+ "ư",
1394
+ "d",
1395
+ "á",
1396
+ "k",
1397
+ "ộ",
1398
+ "ế",
1399
+ ],
1400
+ "Czech": [
1401
+ "o",
1402
+ "e",
1403
+ "a",
1404
+ "n",
1405
+ "t",
1406
+ "s",
1407
+ "i",
1408
+ "l",
1409
+ "v",
1410
+ "r",
1411
+ "k",
1412
+ "d",
1413
+ "u",
1414
+ "m",
1415
+ "p",
1416
+ "í",
1417
+ "c",
1418
+ "h",
1419
+ "z",
1420
+ "á",
1421
+ "y",
1422
+ "j",
1423
+ "b",
1424
+ "ě",
1425
+ "é",
1426
+ "ř",
1427
+ ],
1428
+ "Hungarian": [
1429
+ "e",
1430
+ "a",
1431
+ "t",
1432
+ "l",
1433
+ "s",
1434
+ "n",
1435
+ "k",
1436
+ "r",
1437
+ "i",
1438
+ "o",
1439
+ "z",
1440
+ "á",
1441
+ "é",
1442
+ "g",
1443
+ "m",
1444
+ "b",
1445
+ "y",
1446
+ "v",
1447
+ "d",
1448
+ "h",
1449
+ "u",
1450
+ "p",
1451
+ "j",
1452
+ "ö",
1453
+ "f",
1454
+ "c",
1455
+ ],
1456
+ "Korean": [
1457
+ "이",
1458
+ "다",
1459
+ "에",
1460
+ "의",
1461
+ "는",
1462
+ "로",
1463
+ "하",
1464
+ "을",
1465
+ "가",
1466
+ "고",
1467
+ "지",
1468
+ "서",
1469
+ "한",
1470
+ "은",
1471
+ "기",
1472
+ "으",
1473
+ "년",
1474
+ "대",
1475
+ "사",
1476
+ "시",
1477
+ "를",
1478
+ "리",
1479
+ "도",
1480
+ "인",
1481
+ "스",
1482
+ "일",
1483
+ ],
1484
+ "Indonesian": [
1485
+ "a",
1486
+ "n",
1487
+ "e",
1488
+ "i",
1489
+ "r",
1490
+ "t",
1491
+ "u",
1492
+ "s",
1493
+ "d",
1494
+ "k",
1495
+ "m",
1496
+ "l",
1497
+ "g",
1498
+ "p",
1499
+ "b",
1500
+ "o",
1501
+ "h",
1502
+ "y",
1503
+ "j",
1504
+ "c",
1505
+ "w",
1506
+ "f",
1507
+ "v",
1508
+ "z",
1509
+ "x",
1510
+ "q",
1511
+ ],
1512
+ "Turkish": [
1513
+ "a",
1514
+ "e",
1515
+ "i",
1516
+ "n",
1517
+ "r",
1518
+ "l",
1519
+ "ı",
1520
+ "k",
1521
+ "d",
1522
+ "t",
1523
+ "s",
1524
+ "m",
1525
+ "y",
1526
+ "u",
1527
+ "o",
1528
+ "b",
1529
+ "ü",
1530
+ "ş",
1531
+ "v",
1532
+ "g",
1533
+ "z",
1534
+ "h",
1535
+ "c",
1536
+ "p",
1537
+ "ç",
1538
+ "ğ",
1539
+ ],
1540
+ "Romanian": [
1541
+ "e",
1542
+ "i",
1543
+ "a",
1544
+ "r",
1545
+ "n",
1546
+ "t",
1547
+ "u",
1548
+ "l",
1549
+ "o",
1550
+ "c",
1551
+ "s",
1552
+ "d",
1553
+ "p",
1554
+ "m",
1555
+ "ă",
1556
+ "f",
1557
+ "v",
1558
+ "î",
1559
+ "g",
1560
+ "b",
1561
+ "ș",
1562
+ "ț",
1563
+ "z",
1564
+ "h",
1565
+ "â",
1566
+ "j",
1567
+ ],
1568
+ "Farsi": [
1569
+ "ا",
1570
+ "ی",
1571
+ "ر",
1572
+ "د",
1573
+ "ن",
1574
+ "ه",
1575
+ "و",
1576
+ "م",
1577
+ "ت",
1578
+ "ب",
1579
+ "س",
1580
+ "ل",
1581
+ "ک",
1582
+ "ش",
1583
+ "ز",
1584
+ "ف",
1585
+ "گ",
1586
+ "ع",
1587
+ "خ",
1588
+ "ق",
1589
+ "ج",
1590
+ "آ",
1591
+ "پ",
1592
+ "ح",
1593
+ "ط",
1594
+ "ص",
1595
+ ],
1596
+ "Arabic": [
1597
+ "ا",
1598
+ "ل",
1599
+ "ي",
1600
+ "م",
1601
+ "و",
1602
+ "ن",
1603
+ "ر",
1604
+ "ت",
1605
+ "ب",
1606
+ "ة",
1607
+ "ع",
1608
+ "د",
1609
+ "س",
1610
+ "ف",
1611
+ "ه",
1612
+ "ك",
1613
+ "ق",
1614
+ "أ",
1615
+ "ح",
1616
+ "ج",
1617
+ "ش",
1618
+ "ط",
1619
+ "ص",
1620
+ "ى",
1621
+ "خ",
1622
+ "إ",
1623
+ ],
1624
+ "Danish": [
1625
+ "e",
1626
+ "r",
1627
+ "n",
1628
+ "t",
1629
+ "a",
1630
+ "i",
1631
+ "s",
1632
+ "d",
1633
+ "l",
1634
+ "o",
1635
+ "g",
1636
+ "m",
1637
+ "k",
1638
+ "f",
1639
+ "v",
1640
+ "u",
1641
+ "b",
1642
+ "h",
1643
+ "p",
1644
+ "å",
1645
+ "y",
1646
+ "ø",
1647
+ "æ",
1648
+ "c",
1649
+ "j",
1650
+ "w",
1651
+ ],
1652
+ "Serbian": [
1653
+ "а",
1654
+ "и",
1655
+ "о",
1656
+ "е",
1657
+ "н",
1658
+ "р",
1659
+ "с",
1660
+ "у",
1661
+ "т",
1662
+ "к",
1663
+ "ј",
1664
+ "в",
1665
+ "д",
1666
+ "м",
1667
+ "п",
1668
+ "л",
1669
+ "г",
1670
+ "з",
1671
+ "б",
1672
+ "a",
1673
+ "i",
1674
+ "e",
1675
+ "o",
1676
+ "n",
1677
+ "ц",
1678
+ "ш",
1679
+ ],
1680
+ "Lithuanian": [
1681
+ "i",
1682
+ "a",
1683
+ "s",
1684
+ "o",
1685
+ "r",
1686
+ "e",
1687
+ "t",
1688
+ "n",
1689
+ "u",
1690
+ "k",
1691
+ "m",
1692
+ "l",
1693
+ "p",
1694
+ "v",
1695
+ "d",
1696
+ "j",
1697
+ "g",
1698
+ "ė",
1699
+ "b",
1700
+ "y",
1701
+ "ų",
1702
+ "š",
1703
+ "ž",
1704
+ "c",
1705
+ "ą",
1706
+ "į",
1707
+ ],
1708
+ "Slovene": [
1709
+ "e",
1710
+ "a",
1711
+ "i",
1712
+ "o",
1713
+ "n",
1714
+ "r",
1715
+ "s",
1716
+ "l",
1717
+ "t",
1718
+ "j",
1719
+ "v",
1720
+ "k",
1721
+ "d",
1722
+ "p",
1723
+ "m",
1724
+ "u",
1725
+ "z",
1726
+ "b",
1727
+ "g",
1728
+ "h",
1729
+ "č",
1730
+ "c",
1731
+ "š",
1732
+ "ž",
1733
+ "f",
1734
+ "y",
1735
+ ],
1736
+ "Slovak": [
1737
+ "o",
1738
+ "a",
1739
+ "e",
1740
+ "n",
1741
+ "i",
1742
+ "r",
1743
+ "v",
1744
+ "t",
1745
+ "s",
1746
+ "l",
1747
+ "k",
1748
+ "d",
1749
+ "m",
1750
+ "p",
1751
+ "u",
1752
+ "c",
1753
+ "h",
1754
+ "j",
1755
+ "b",
1756
+ "z",
1757
+ "á",
1758
+ "y",
1759
+ "ý",
1760
+ "í",
1761
+ "č",
1762
+ "é",
1763
+ ],
1764
+ "Hebrew": [
1765
+ "י",
1766
+ "ו",
1767
+ "ה",
1768
+ "ל",
1769
+ "ר",
1770
+ "ב",
1771
+ "ת",
1772
+ "מ",
1773
+ "א",
1774
+ "ש",
1775
+ "נ",
1776
+ "ע",
1777
+ "ם",
1778
+ "ד",
1779
+ "ק",
1780
+ "ח",
1781
+ "פ",
1782
+ "ס",
1783
+ "כ",
1784
+ "ג",
1785
+ "ט",
1786
+ "צ",
1787
+ "ן",
1788
+ "ז",
1789
+ "ך",
1790
+ ],
1791
+ "Bulgarian": [
1792
+ "а",
1793
+ "и",
1794
+ "о",
1795
+ "е",
1796
+ "н",
1797
+ "т",
1798
+ "р",
1799
+ "с",
1800
+ "в",
1801
+ "л",
1802
+ "к",
1803
+ "д",
1804
+ "п",
1805
+ "м",
1806
+ "з",
1807
+ "г",
1808
+ "я",
1809
+ "ъ",
1810
+ "у",
1811
+ "б",
1812
+ "ч",
1813
+ "ц",
1814
+ "й",
1815
+ "ж",
1816
+ "щ",
1817
+ "х",
1818
+ ],
1819
+ "Croatian": [
1820
+ "a",
1821
+ "i",
1822
+ "o",
1823
+ "e",
1824
+ "n",
1825
+ "r",
1826
+ "j",
1827
+ "s",
1828
+ "t",
1829
+ "u",
1830
+ "k",
1831
+ "l",
1832
+ "v",
1833
+ "d",
1834
+ "m",
1835
+ "p",
1836
+ "g",
1837
+ "z",
1838
+ "b",
1839
+ "c",
1840
+ "č",
1841
+ "h",
1842
+ "š",
1843
+ "ž",
1844
+ "ć",
1845
+ "f",
1846
+ ],
1847
+ "Hindi": [
1848
+ "क",
1849
+ "र",
1850
+ "स",
1851
+ "न",
1852
+ "त",
1853
+ "म",
1854
+ "ह",
1855
+ "प",
1856
+ "य",
1857
+ "ल",
1858
+ "व",
1859
+ "ज",
1860
+ "द",
1861
+ "ग",
1862
+ "ब",
1863
+ "श",
1864
+ "ट",
1865
+ "अ",
1866
+ "ए",
1867
+ "थ",
1868
+ "भ",
1869
+ "ड",
1870
+ "च",
1871
+ "ध",
1872
+ "ष",
1873
+ "इ",
1874
+ ],
1875
+ "Estonian": [
1876
+ "a",
1877
+ "i",
1878
+ "e",
1879
+ "s",
1880
+ "t",
1881
+ "l",
1882
+ "u",
1883
+ "n",
1884
+ "o",
1885
+ "k",
1886
+ "r",
1887
+ "d",
1888
+ "m",
1889
+ "v",
1890
+ "g",
1891
+ "p",
1892
+ "j",
1893
+ "h",
1894
+ "ä",
1895
+ "b",
1896
+ "õ",
1897
+ "ü",
1898
+ "f",
1899
+ "c",
1900
+ "ö",
1901
+ "y",
1902
+ ],
1903
+ "Thai": [
1904
+ "า",
1905
+ "น",
1906
+ "ร",
1907
+ "อ",
1908
+ "ก",
1909
+ "เ",
1910
+ "ง",
1911
+ "ม",
1912
+ "ย",
1913
+ "ล",
1914
+ "ว",
1915
+ "ด",
1916
+ "ท",
1917
+ "ส",
1918
+ "ต",
1919
+ "ะ",
1920
+ "ป",
1921
+ "บ",
1922
+ "ค",
1923
+ "ห",
1924
+ "แ",
1925
+ "จ",
1926
+ "พ",
1927
+ "ช",
1928
+ "ข",
1929
+ "ใ",
1930
+ ],
1931
+ "Greek": [
1932
+ "α",
1933
+ "τ",
1934
+ "ο",
1935
+ "ι",
1936
+ "ε",
1937
+ "ν",
1938
+ "ρ",
1939
+ "σ",
1940
+ "κ",
1941
+ "η",
1942
+ "π",
1943
+ "ς",
1944
+ "υ",
1945
+ "μ",
1946
+ "λ",
1947
+ "ί",
1948
+ "ό",
1949
+ "ά",
1950
+ "γ",
1951
+ "έ",
1952
+ "δ",
1953
+ "ή",
1954
+ "ω",
1955
+ "χ",
1956
+ "θ",
1957
+ "ύ",
1958
+ ],
1959
+ "Tamil": [
1960
+ "க",
1961
+ "த",
1962
+ "ப",
1963
+ "ட",
1964
+ "ர",
1965
+ "ம",
1966
+ "ல",
1967
+ "ன",
1968
+ "வ",
1969
+ "ற",
1970
+ "ய",
1971
+ "ள",
1972
+ "ச",
1973
+ "ந",
1974
+ "இ",
1975
+ "ண",
1976
+ "அ",
1977
+ "ஆ",
1978
+ "ழ",
1979
+ "ங",
1980
+ "எ",
1981
+ "உ",
1982
+ "ஒ",
1983
+ "ஸ",
1984
+ ],
1985
+ "Kazakh": [
1986
+ "а",
1987
+ "ы",
1988
+ "е",
1989
+ "н",
1990
+ "т",
1991
+ "р",
1992
+ "л",
1993
+ "і",
1994
+ "д",
1995
+ "с",
1996
+ "м",
1997
+ "қ",
1998
+ "к",
1999
+ "о",
2000
+ "б",
2001
+ "и",
2002
+ "у",
2003
+ "ғ",
2004
+ "ж",
2005
+ "ң",
2006
+ "з",
2007
+ "ш",
2008
+ "й",
2009
+ "п",
2010
+ "г",
2011
+ "ө",
2012
+ ],
2013
+ }
2014
+
2015
# Number of languages that have a character-frequency table in FREQUENCIES.
LANGUAGE_SUPPORTED_COUNT: int = len(FREQUENCIES)

# Character-classification bit flags. Every class owns exactly one bit so that
# several classes can be OR-ed into a single integer; one unicodedata.name()
# lookup is then enough to set all the flags that apply to a character.
_LATIN: int = 0b0_0000_0001
_ACCENTUATED: int = 0b0_0000_0010
_CJK: int = 0b0_0000_0100
_HANGUL: int = 0b0_0000_1000
_KATAKANA: int = 0b0_0001_0000
_HIRAGANA: int = 0b0_0010_0000
_THAI: int = 0b0_0100_0000
_ARABIC: int = 0b0_1000_0000
_ARABIC_ISOLATED_FORM: int = 0b1_0000_0000

# Unicode-name fragments listed as accent markers.
_ACCENT_KEYWORDS: tuple[str, ...] = (
    "WITH GRAVE",
    "WITH ACUTE",
    "WITH CEDILLA",
    "WITH DIAERESIS",
    "WITH CIRCUMFLEX",
    "WITH TILDE",
    "WITH MACRON",
    "WITH RING ABOVE",
)

# Lookup tables derived from FREQUENCIES, built a single time at import.

# language -> {character: position in that language's frequency list};
# a dict lookup stands in for repeated list.index() scans.
_FREQUENCIES_RANK: dict[str, dict[str, int]] = {
    lang: dict(zip(chars, range(len(chars)))) for lang, chars in FREQUENCIES.items()
}

# language -> immutable set of its characters, so callers never need to
# rebuild a set() on each call.
_FREQUENCIES_SET: dict[str, frozenset[str]] = dict(
    zip(FREQUENCIES, map(frozenset, FREQUENCIES.values()))
)
.venv/lib/python3.14/site-packages/charset_normalizer/legacy.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING, Any
4
+ from warnings import warn
5
+
6
+ from .api import from_bytes
7
+ from .constant import CHARDET_CORRESPONDENCE, TOO_SMALL_SEQUENCE
8
+
9
+ if TYPE_CHECKING:
10
+ from typing import TypedDict
11
+
12
+ class ResultDict(TypedDict):
13
+ encoding: str | None
14
+ language: str
15
+ confidence: float | None
16
+
17
+
18
def detect(
    byte_str: bytes, should_rename_legacy: bool = False, **kwargs: Any
) -> ResultDict:
    """
    chardet legacy method
    Detect the encoding of the given byte string. It should be mostly backward-compatible.
    Encoding name will match Chardet own writing whenever possible. (Not on encoding name unsupported by it)
    This function is deprecated and should be used to migrate your project easily, consult the documentation for
    further information. Not planned for removal.

    :param byte_str: The byte sequence to examine.
    :param should_rename_legacy: Should we rename legacy encodings
                                 to their more modern equivalents?
    :param kwargs: Accepted for chardet signature compatibility only; any
                   keyword given here is ignored and reported with a warning.
    :return: A dict with the keys ``encoding``, ``language`` and ``confidence``.
             ``encoding`` and ``confidence`` are ``None`` and ``language`` is an
             empty string when no match was found.
    :raises TypeError: If ``byte_str`` is neither ``bytes`` nor ``bytearray``.
    """
    if kwargs:
        # stacklevel=2 attributes the warning to the caller of detect(), which
        # is the code that actually passed the unsupported keyword(s).
        warn(
            f"charset-normalizer disregard arguments '{','.join(kwargs)}' in legacy function detect()",
            stacklevel=2,
        )

    if not isinstance(byte_str, (bytearray, bytes)):
        raise TypeError(  # pragma: nocover
            f"Expected object of type bytes or bytearray, got: {type(byte_str)}"
        )

    if isinstance(byte_str, bytearray):
        byte_str = bytes(byte_str)

    r = from_bytes(byte_str).best()

    if r is None:
        # No match at all: nothing to adjust or rename below.
        return {
            "encoding": None,
            "language": "",
            "confidence": None,
        }

    encoding = r.encoding
    language = r.language if r.language != "Unknown" else ""
    confidence = 1.0 - r.chaos

    # automatically lower confidence
    # on small bytes samples.
    # https://github.com/jawah/charset_normalizer/issues/391
    if (
        confidence >= 0.9
        and encoding not in {"utf_8", "ascii"}
        and r.bom is False
        and len(byte_str) < TOO_SMALL_SEQUENCE
    ):
        confidence -= 0.2

    # Note: CharsetNormalizer does not return 'UTF-8-SIG' as the sig get stripped in the detection/normalization process
    # but chardet does return 'utf-8-sig' and it is a valid codec name.
    if encoding == "utf_8" and r.bom:
        encoding += "_sig"

    # Only translate the name when it has a known chardet counterpart.
    if should_rename_legacy is False and encoding in CHARDET_CORRESPONDENCE:
        encoding = CHARDET_CORRESPONDENCE[encoding]

    return {
        "encoding": encoding,
        "language": language,
        "confidence": confidence,
    }
.venv/lib/python3.14/site-packages/charset_normalizer/md.cpython-314-x86_64-linux-gnu.so ADDED
Binary file (15.9 kB). View file
 
.venv/lib/python3.14/site-packages/charset_normalizer/md.py ADDED
@@ -0,0 +1,936 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import sys
4
+ from functools import lru_cache
5
+ from logging import getLogger
6
+
7
+ if sys.version_info >= (3, 8):
8
+ from typing import final
9
+ else:
10
+ try:
11
+ from typing_extensions import final
12
+ except ImportError:
13
+
14
+ def final(cls): # type: ignore[misc,no-untyped-def]
15
+ return cls
16
+
17
+
18
+ from .constant import (
19
+ COMMON_CJK_CHARACTERS,
20
+ COMMON_SAFE_ASCII_CHARACTERS,
21
+ TRACE,
22
+ UNICODE_SECONDARY_RANGE_KEYWORD,
23
+ _ACCENTUATED,
24
+ _ARABIC,
25
+ _ARABIC_ISOLATED_FORM,
26
+ _CJK,
27
+ _HANGUL,
28
+ _HIRAGANA,
29
+ _KATAKANA,
30
+ _LATIN,
31
+ _THAI,
32
+ )
33
+ from .utils import (
34
+ _character_flags,
35
+ is_emoticon,
36
+ is_punctuation,
37
+ is_separator,
38
+ is_symbol,
39
+ remove_accent,
40
+ unicode_range,
41
+ )
42
+
43
+ # Combined bitmask for CJK/Hangul/Katakana/Hiragana/Thai glyph detection.
44
+ _GLYPH_MASK: int = _CJK | _HANGUL | _KATAKANA | _HIRAGANA | _THAI
45
+
46
+
47
@final
class CharInfo:
    """Scratch record holding the per-character facts the detectors consume.

    A single instance is meant to be created once and refreshed through
    :meth:`update` for each character of the hot loop. Computing the str
    predicates (``isalpha``, ``isupper``, ...) and the cached helpers
    (``_character_flags``, ``is_punctuation``, ...) here, one time, spares
    every plugin from repeating the same calls.
    """

    __slots__ = (
        "character",
        "printable",
        "alpha",
        "upper",
        "lower",
        "space",
        "digit",
        "is_ascii",
        "case_variable",
        "flags",
        "accentuated",
        "latin",
        "is_cjk",
        "is_arabic",
        "is_glyph",
        "punct",
        "sym",
    )

    def __init__(self) -> None:
        # The character currently described and its raw classification bits.
        self.character: str = ""
        self.flags: int = 0

        # Plain str-predicate results.
        self.printable: bool = False
        self.alpha: bool = False
        self.upper: bool = False
        self.lower: bool = False
        self.space: bool = False
        self.digit: bool = False
        self.is_ascii: bool = False
        self.case_variable: bool = False

        # Booleans decoded from ``flags``.
        self.accentuated: bool = False
        self.latin: bool = False
        self.is_cjk: bool = False
        self.is_arabic: bool = False
        self.is_glyph: bool = False

        # Punctuation / symbol classification.
        self.punct: bool = False
        self.sym: bool = False

    def update(self, character: str) -> None:
        """Recompute every attribute for *character* (one call per character)."""
        self.character = character
        codepoint: int = ord(character)

        if codepoint >= 128:
            # General path: ask the str predicates, then decode the flag bits.
            self.is_ascii = False
            self.printable = character.isprintable()
            self.alpha = character.isalpha()
            self.upper = character.isupper()
            self.lower = character.islower()
            self.space = character.isspace()
            self.digit = character.isdigit()
            self.case_variable = self.lower != self.upper

            # Flags are only looked up for alphabetic characters
            # (_character_flags is lru-cached, one unicodedata.name() call).
            bits: int = _character_flags(character) if self.alpha else 0
            self.flags = bits
            self.accentuated = bool(bits & _ACCENTUATED)
            self.latin = bool(bits & _LATIN)
            self.is_cjk = bool(bits & _CJK)
            self.is_arabic = bool(bits & _ARABIC)
            self.is_glyph = bool(bits & _GLYPH_MASK)

            # Stored eagerly as plain attributes: cheaper than property
            # dispatch given how often the hot loop reads them.
            if self.printable:
                self.punct = is_punctuation(character)
                self.sym = is_symbol(character)
            else:
                self.punct = False
                self.sym = False
            return

        # ASCII fast path: everything is derived from the code point and
        # _character_flags() is never called.
        self.is_ascii = True
        self.accentuated = self.is_cjk = self.is_arabic = self.is_glyph = False

        if 65 <= codepoint <= 90:
            # "A".."Z"
            self.alpha = self.upper = self.printable = True
            self.case_variable = self.latin = True
            self.lower = self.space = self.digit = False
            self.punct = self.sym = False
            self.flags = _LATIN
        elif 97 <= codepoint <= 122:
            # "a".."z"
            self.alpha = self.lower = self.printable = True
            self.case_variable = self.latin = True
            self.upper = self.space = self.digit = False
            self.punct = self.sym = False
            self.flags = _LATIN
        elif 48 <= codepoint <= 57:
            # "0".."9"
            self.digit = self.printable = True
            self.alpha = self.upper = self.lower = self.space = False
            self.case_variable = self.latin = False
            self.punct = self.sym = False
            self.flags = 0
        elif codepoint == 32 or 9 <= codepoint <= 13:
            # Blank, tab, line feed and the other 9..13 controls: all count
            # as space, but only the blank itself is flagged printable.
            self.space = True
            self.printable = codepoint == 32
            self.alpha = self.upper = self.lower = self.digit = False
            self.case_variable = self.latin = False
            self.punct = self.sym = False
            self.flags = 0
        else:
            # Remaining ASCII: punctuation, symbols and control characters.
            self.printable = character.isprintable()
            self.alpha = self.upper = self.lower = False
            self.space = self.digit = False
            self.case_variable = self.latin = False
            self.flags = 0
            if self.printable:
                self.punct = is_punctuation(character)
                self.sym = is_symbol(character)
            else:
                self.punct = False
                self.sym = False
204
+
205
+
206
class MessDetectorPlugin:
    """
    Abstract base class for mess detection plugins.

    Every detector MUST subclass it and override ``feed_info``, ``reset`` and
    ``ratio``; the versions defined here only raise ``NotImplementedError``.
    """

    __slots__ = ()

    def feed_info(self, character: str, info: CharInfo) -> None:
        """
        The main routine, executed for each character.
        Put here the logic by which the text would be considered chaotic.

        :param character: The character being examined.
        :param info: Pre-computed properties of ``character``.
        :raises NotImplementedError: Always, unless overridden.
        """
        raise NotImplementedError  # Defensive:

    def reset(self) -> None:  # Defensive:
        """
        Reset the plugin to its initial state.

        :raises NotImplementedError: Always, unless overridden.
        """
        raise NotImplementedError

    @property
    def ratio(self) -> float:
        """
        Compute the chaos ratio based on what feed_info() has seen.
        Must NOT be lower than 0.; there is no upper bound.

        :raises NotImplementedError: Always, unless overridden.
        """
        raise NotImplementedError  # Defensive:
234
+
235
+
236
@final
class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
    """Flag text in which punctuation and symbols make up too large a share.

    A character is only scored when it differs from the character fed just
    before it and is not one of ``COMMON_SAFE_ASCII_CHARACTERS``. Punctuation
    adds 1 to the score; a symbol that is neither a digit nor an emoticon
    adds 2.
    """

    __slots__ = (
        "_punctuation_count",
        "_symbol_count",
        "_character_count",
        "_last_printable_char",
        "_frenzy_symbol_in_word",
    )

    def __init__(self) -> None:
        self._punctuation_count: int = 0
        self._symbol_count: int = 0
        self._character_count: int = 0

        # Previously fed character (updated on every feed_info() call).
        self._last_printable_char: str | None = None
        # Not read anywhere in this class; kept so the slot layout is unchanged.
        self._frenzy_symbol_in_word: bool = False

    def feed_info(self, character: str, info: CharInfo) -> None:
        """Score *character* using its pre-computed properties in *info*."""
        self._character_count += 1

        if (
            character != self._last_printable_char
            and character not in COMMON_SAFE_ASCII_CHARACTERS
        ):
            if info.punct:
                self._punctuation_count += 1
            elif not info.digit and info.sym and not is_emoticon(character):
                self._symbol_count += 2

        self._last_printable_char = character

    def reset(self) -> None:  # Abstract
        """Restore the exact state produced by ``__init__``."""
        self._punctuation_count = 0
        self._character_count = 0
        self._symbol_count = 0
        # Also drop the remembered previous character: otherwise the first
        # character fed after a reset is compared against stale data and may
        # be skipped as a "repeat".
        self._last_printable_char = None
        self._frenzy_symbol_in_word = False

    @property
    def ratio(self) -> float:
        """Return the score-to-character ratio, or 0.0 when it is below 0.3
        or when nothing has been fed yet."""
        if self._character_count == 0:
            return 0.0

        ratio_of_punctuation: float = (
            self._punctuation_count + self._symbol_count
        ) / self._character_count

        return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0
284
+
285
+
286
@final
class TooManyAccentuatedPlugin(MessDetectorPlugin):
    """Report how large a share of the fed characters is accentuated."""

    __slots__ = ("_character_count", "_accentuated_count")

    def __init__(self) -> None:
        self._character_count: int = 0
        self._accentuated_count: int = 0

    def feed_info(self, character: str, info: CharInfo) -> None:
        """Count *character*, and count it as accentuated when *info* says so."""
        if info.accentuated:
            self._accentuated_count += 1

        self._character_count += 1

    def reset(self) -> None:  # Abstract
        """Zero both counters."""
        self._accentuated_count = 0
        self._character_count = 0

    @property
    def ratio(self) -> float:
        """Return the accentuated share when at least 8 characters were fed
        and the share reaches 0.35; otherwise 0.0."""
        seen: int = self._character_count

        if seen >= 8:
            accentuated_share: float = self._accentuated_count / seen
            if accentuated_share >= 0.35:
                return accentuated_share

        return 0.0
312
+
313
+
314
@final
class UnprintablePlugin(MessDetectorPlugin):
    """Penalize characters that are neither whitespace nor printable.

    ``"\\x1a"`` and ``"\\ufeff"`` are explicitly exempt from the penalty.
    """

    __slots__ = ("_unprintable_count", "_character_count")

    def __init__(self) -> None:
        self._unprintable_count: int = 0
        self._character_count: int = 0

    def feed_info(self, character: str, info: CharInfo) -> None:
        """Count *character*; also count it as unprintable when it qualifies."""
        if (
            not info.space
            and not info.printable
            and character != "\x1a"
            and character != "\ufeff"
        ):
            self._unprintable_count += 1
        self._character_count += 1

    def reset(self) -> None:  # Abstract
        """Zero both counters."""
        self._unprintable_count = 0
        # The total must be cleared too; otherwise a reused instance divides
        # its fresh unprintable count by a total inflated by the previous run.
        self._character_count = 0

    @property
    def ratio(self) -> float:
        """Return 8 times the unprintable share (0.0 when nothing was fed)."""
        if self._character_count == 0:  # Defensive:
            return 0.0

        return (self._unprintable_count * 8) / self._character_count
342
+
343
+
344
@final
class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
    """Score pairs of consecutive accentuated characters.

    For such a pair, one point is added when both characters are upper case
    and another one when both reduce to the same character once
    ``remove_accent`` has been applied.
    """

    __slots__ = (
        "_successive_count",
        "_character_count",
        "_last_latin_character",
        "_last_was_accentuated",
    )

    def __init__(self) -> None:
        self._successive_count: int = 0
        self._character_count: int = 0

        self._last_latin_character: str | None = None
        self._last_was_accentuated: bool = False

    def feed_info(self, character: str, info: CharInfo) -> None:
        """Compare *character* with the previously fed one, then remember it."""
        self._character_count += 1

        previous: str | None = self._last_latin_character
        is_accentuated: bool = info.accentuated

        if previous is not None and is_accentuated and self._last_was_accentuated:
            if info.upper and previous.isupper():
                self._successive_count += 1
            if remove_accent(character) == remove_accent(previous):
                self._successive_count += 1

        self._last_latin_character = character
        self._last_was_accentuated = is_accentuated

    def reset(self) -> None:  # Abstract
        """Return to the freshly constructed state."""
        self._last_latin_character = None
        self._last_was_accentuated = False
        self._successive_count = 0
        self._character_count = 0

    @property
    def ratio(self) -> float:
        """Return twice the score divided by the characters fed (0.0 if none)."""
        fed: int = self._character_count
        return (self._successive_count * 2) / fed if fed != 0 else 0.0
387
+
388
+
389
@final
class SuspiciousRange(MessDetectorPlugin):
    """Count adjacent characters whose Unicode ranges look suspicious together.

    Whitespace, punctuation and ``COMMON_SAFE_ASCII_CHARACTERS`` break the
    chain: the character that follows one of them has no predecessor to be
    compared with.
    """

    __slots__ = (
        "_suspicious_successive_range_count",
        "_character_count",
        "_last_printable_seen",
        "_last_printable_range",
    )

    def __init__(self) -> None:
        self._suspicious_successive_range_count: int = 0
        self._character_count: int = 0
        self._last_printable_seen: str | None = None
        self._last_printable_range: str | None = None

    def feed_info(self, character: str, info: CharInfo) -> None:
        """Check *character*'s range against its predecessor's, if there is one."""
        self._character_count += 1

        if info.space or info.punct or character in COMMON_SAFE_ASCII_CHARACTERS:
            # Chain breaker: forget whatever came before.
            self._last_printable_seen = None
            self._last_printable_range = None
            return

        current_range: str | None = unicode_range(character)
        had_predecessor: bool = self._last_printable_seen is not None
        previous_range: str | None = self._last_printable_range

        if had_predecessor and is_suspiciously_successive_range(
            previous_range, current_range
        ):
            self._suspicious_successive_range_count += 1

        # The current character becomes the predecessor of the next one.
        self._last_printable_seen = character
        self._last_printable_range = current_range

    def reset(self) -> None:  # Abstract
        """Return to the freshly constructed state."""
        self._last_printable_seen = None
        self._last_printable_range = None
        self._suspicious_successive_range_count = 0
        self._character_count = 0

    @property
    def ratio(self) -> float:
        """Return twice the suspicious-pair count divided by the characters
        fed, or 0.0 while 13 characters or fewer have been seen."""
        fed: int = self._character_count

        if fed > 13:
            return (self._suspicious_successive_range_count * 2) / fed

        return 0.0
443
+
444
+
445
@final
class SuperWeirdWordPlugin(MessDetectorPlugin):
    """Judge text word by word and report the share of characters that sit in
    "bad" words.

    Alphabetic characters (and some symbols) accumulate into a word buffer
    that is tracked through counters only. The word is evaluated when
    whitespace, punctuation or a separator arrives.
    """

    __slots__ = (
        "_word_count",
        "_bad_word_count",
        "_foreign_long_count",
        "_is_current_word_bad",
        "_foreign_long_watch",
        "_character_count",
        "_bad_character_count",
        "_buffer_length",
        "_buffer_last_char",
        "_buffer_last_char_accentuated",
        "_buffer_accent_count",
        "_buffer_glyph_count",
        "_buffer_upper_count",
    )

    def __init__(self) -> None:
        # Totals over all completed words.
        self._word_count: int = 0
        self._bad_word_count: int = 0
        self._foreign_long_count: int = 0

        # Verdict flags for the word in progress.
        self._is_current_word_bad: bool = False
        self._foreign_long_watch: bool = False

        self._character_count: int = 0
        self._bad_character_count: int = 0

        # Counters describing the word in progress.
        self._buffer_length: int = 0
        self._buffer_last_char: str | None = None
        self._buffer_last_char_accentuated: bool = False
        self._buffer_accent_count: int = 0
        self._buffer_glyph_count: int = 0
        self._buffer_upper_count: int = 0

    def feed_info(self, character: str, info: CharInfo) -> None:
        """Extend the current word with *character*, or close and score it."""
        if info.alpha:
            is_accentuated: bool = info.accentuated

            self._buffer_length += 1
            self._buffer_last_char = character
            self._buffer_last_char_accentuated = is_accentuated

            if info.upper:
                self._buffer_upper_count += 1
            if is_accentuated:
                self._buffer_accent_count += 1

            if info.is_glyph:
                self._buffer_glyph_count += 1
            elif not self._foreign_long_watch and (not info.latin or is_accentuated):
                # A non-latin or accentuated letter that is not a glyph puts
                # the word under watch for the length check done at closing.
                self._foreign_long_watch = True
            return

        if not self._buffer_length:
            # Nothing buffered: there is no word to close or to taint.
            return

        if info.space or info.punct or is_separator(character):
            word_length: int = self._buffer_length
            word_is_bad: bool = self._is_current_word_bad

            self._word_count += 1
            self._character_count += word_length

            if word_length >= 4:
                if self._buffer_accent_count / word_length >= 0.5:
                    word_is_bad = True
                elif (
                    self._buffer_last_char_accentuated
                    and self._buffer_last_char.isupper()  # type: ignore[union-attr]
                    and self._buffer_upper_count != word_length
                ):
                    # Ends with an upper-case accentuated letter although the
                    # word is not entirely upper case.
                    self._foreign_long_count += 1
                    word_is_bad = True
                elif self._buffer_glyph_count == 1:
                    self._foreign_long_count += 1
                    word_is_bad = True

            if word_length >= 24 and self._foreign_long_watch:
                upper_total: int = self._buffer_upper_count
                # Counts as probably camel-cased only when it has some upper
                # case letters and they are at most 30% of the word.
                if upper_total <= 0 or upper_total / word_length > 0.3:
                    self._foreign_long_count += 1
                    word_is_bad = True

            if word_is_bad:
                self._bad_word_count += 1
                self._bad_character_count += word_length

            # Start over for the next word.
            self._is_current_word_bad = False
            self._foreign_long_watch = False
            self._buffer_length = 0
            self._buffer_last_char = None
            self._buffer_last_char_accentuated = False
            self._buffer_accent_count = 0
            self._buffer_glyph_count = 0
            self._buffer_upper_count = 0
            return

        if (
            info.sym
            and not info.digit
            and character not in {"<", ">", "-", "=", "~", "|", "_"}
        ):
            # A symbol inside a word taints the word and is buffered with it.
            self._is_current_word_bad = True
            self._buffer_length += 1
            self._buffer_last_char = character
            self._buffer_last_char_accentuated = False

    def reset(self) -> None:  # Abstract
        """Return to the freshly constructed state."""
        self._word_count = 0
        self._bad_word_count = 0
        self._foreign_long_count = 0
        self._is_current_word_bad = False
        self._foreign_long_watch = False
        self._character_count = 0
        self._bad_character_count = 0
        self._buffer_length = 0
        self._buffer_last_char = None
        self._buffer_last_char_accentuated = False
        self._buffer_accent_count = 0
        self._buffer_glyph_count = 0
        self._buffer_upper_count = 0

    @property
    def ratio(self) -> float:
        """Return bad characters over all word characters; 0.0 while at most
        10 words were closed and no foreign-long hit was recorded."""
        if self._foreign_long_count == 0 and self._word_count <= 10:
            return 0.0

        return self._bad_character_count / self._character_count
577
+
578
+
579
@final
class CjkUncommonPlugin(MessDetectorPlugin):
    """
    Spot messy CJK text that most likely carries no meaning, by counting the
    fed characters that are absent from ``COMMON_CJK_CHARACTERS``.
    """

    __slots__ = ("_character_count", "_uncommon_count")

    def __init__(self) -> None:
        self._character_count: int = 0
        self._uncommon_count: int = 0

    def feed_info(self, character: str, info: CharInfo) -> None:
        """Count *character*, and count it as uncommon when it is not listed."""
        if character not in COMMON_CJK_CHARACTERS:
            self._uncommon_count += 1

        self._character_count += 1

    def reset(self) -> None:  # Abstract
        """Zero both counters."""
        self._uncommon_count = 0
        self._character_count = 0

    @property
    def ratio(self) -> float:
        """Return a tenth of the uncommon share when it exceeds 0.5 and at
        least 8 characters were fed; otherwise 0.0."""
        fed: int = self._character_count

        if fed >= 8:
            uncommon_share: float = self._uncommon_count / fed

            # Garbage is only a safe bet when uncommon characters dominate;
            # a smaller share could simply be traditional chinese, for example.
            if uncommon_share > 0.5:
                return uncommon_share / 10

        return 0.0
612
+
613
+
614
@final
class ArchaicUpperLowerPlugin(MessDetectorPlugin):
    """
    Score upper/lower case alternation inside runs of cased letters.

    `ratio` is the committed alternation score divided by the number of
    characters counted by `feed_info`.
    """

    __slots__ = (
        "_buf",
        "_character_count_since_last_sep",
        "_successive_upper_lower_count",
        "_successive_upper_lower_count_final",
        "_character_count",
        "_last_alpha_seen",
        "_last_alpha_seen_upper",
        "_last_alpha_seen_lower",
        "_current_ascii_only",
    )

    def __init__(self) -> None:
        # True after one case flip; a second consecutive flip scores and clears it.
        self._buf: bool = False

        # Number of characters counted since the last processed separator.
        self._character_count_since_last_sep: int = 0

        # Score of the chunk in progress, and score already committed.
        self._successive_upper_lower_count: int = 0
        self._successive_upper_lower_count_final: int = 0

        # Total characters counted (denominator of `ratio`).
        self._character_count: int = 0

        # Previous character of the current chunk and its case properties.
        self._last_alpha_seen: str | None = None
        self._last_alpha_seen_upper: bool = False
        self._last_alpha_seen_lower: bool = False
        # Stays True while the current chunk has only seen ASCII characters.
        self._current_ascii_only: bool = True

    def feed_info(self, character: str, info: CharInfo) -> None:
        """Optimized feed using pre-computed character info."""
        # Only alphabetic, case-variable characters take part in a chunk;
        # anything else acts as a chunk separator.
        is_concerned: bool = info.alpha and info.case_variable
        chunk_sep: bool = not is_concerned

        if chunk_sep and self._character_count_since_last_sep > 0:
            # Commit the chunk score only for chunks of at most 64 characters
            # that saw a non-ASCII character and are not closed by a digit.
            if (
                self._character_count_since_last_sep <= 64
                and not info.digit
                and not self._current_ascii_only
            ):
                self._successive_upper_lower_count_final += (
                    self._successive_upper_lower_count
                )

            # Start a fresh chunk; the separator itself is still counted.
            self._successive_upper_lower_count = 0
            self._character_count_since_last_sep = 0
            self._last_alpha_seen = None
            self._buf = False
            self._character_count += 1
            self._current_ascii_only = True

            return

        # NOTE: a separator arriving while `_character_count_since_last_sep`
        # is 0 is not handled above and goes through the code below.
        if self._current_ascii_only and not info.is_ascii:
            self._current_ascii_only = False

        if self._last_alpha_seen is not None:
            if (info.upper and self._last_alpha_seen_lower) or (
                info.lower and self._last_alpha_seen_upper
            ):
                # Case flipped relative to the previous character: the first
                # flip arms `_buf`, the next consecutive flip scores 2.
                if self._buf:
                    self._successive_upper_lower_count += 2
                    self._buf = False
                else:
                    self._buf = True
            else:
                self._buf = False

        self._character_count += 1
        self._character_count_since_last_sep += 1
        self._last_alpha_seen = character
        self._last_alpha_seen_upper = info.upper
        self._last_alpha_seen_lower = info.lower

    def reset(self) -> None:  # Abstract
        self._character_count = 0
        self._character_count_since_last_sep = 0
        self._successive_upper_lower_count = 0
        self._successive_upper_lower_count_final = 0
        self._last_alpha_seen = None
        self._last_alpha_seen_upper = False
        self._last_alpha_seen_lower = False
        self._buf = False
        self._current_ascii_only = True

    @property
    def ratio(self) -> float:
        """Return the committed alternation score divided by the characters counted."""
        if self._character_count == 0:  # Defensive:
            return 0.0

        return self._successive_upper_lower_count_final / self._character_count
705
+
706
+
707
@final
class ArabicIsolatedFormPlugin(MessDetectorPlugin):
    """Measure the share of fed characters flagged as Arabic isolated forms."""

    __slots__ = ("_character_count", "_isolated_form_count")

    def __init__(self) -> None:
        self._character_count: int = 0
        self._isolated_form_count: int = 0

    def reset(self) -> None:  # Abstract
        self._isolated_form_count = 0
        self._character_count = 0

    def feed_info(self, character: str, info: CharInfo) -> None:
        """Count the character, and count it as isolated when its flag bit is set."""
        self._character_count += 1

        if not info.flags & _ARABIC_ISOLATED_FORM:
            return

        self._isolated_form_count += 1

    @property
    def ratio(self) -> float:
        """Return isolated / total, or 0.0 while fewer than 8 characters were fed."""
        total: int = self._character_count

        if total < 8:
            return 0.0

        return self._isolated_form_count / total
734
+
735
+
736
@lru_cache(maxsize=1024)
def is_suspiciously_successive_range(
    unicode_range_a: str | None, unicode_range_b: str | None
) -> bool:
    """
    Tell whether seeing these two Unicode ranges side by side looks suspicious.

    An unknown range (None) is always suspicious; identical ranges never are.
    The remaining rules whitelist pairs that legitimately appear together.
    """
    if unicode_range_a is None or unicode_range_b is None:
        return True

    if unicode_range_a == unicode_range_b:
        return False

    pair = (unicode_range_a, unicode_range_b)
    latin_in_a = "Latin" in unicode_range_a
    latin_in_b = "Latin" in unicode_range_b

    if latin_in_a and latin_in_b:
        return False

    if any("Emoticons" in name for name in pair):
        return False

    # Latin characters can be accompanied with a combining diacritical mark
    # eg. Vietnamese.
    if (latin_in_a or latin_in_b) and any("Combining" in name for name in pair):
        return False

    # A shared primary keyword between both range names clears the pair.
    words_of_b = unicode_range_b.split(" ")
    for word in unicode_range_a.split(" "):
        if word not in UNICODE_SECONDARY_RANGE_KEYWORD and word in words_of_b:
            return False

    kana = ("Hiragana", "Katakana")
    a_is_kana = unicode_range_a in kana
    b_is_kana = unicode_range_b in kana
    cjk_involved = any("CJK" in name for name in pair)
    basic_latin_involved = "Basic Latin" in pair

    # Japanese Exception
    if cjk_involved and (a_is_kana or b_is_kana):
        return False
    if a_is_kana and b_is_kana:
        return False

    if any("Hangul" in name for name in pair) and (
        cjk_involved or basic_latin_involved
    ):
        return False

    # Chinese/Japanese use dedicated range for punctuation and/or separators.
    # (Both-kana pairs already returned above, so only the CJK case remains.)
    if cjk_involved and (
        any("Punctuation" in name for name in pair)
        or any("Forms" in name for name in pair)
        or basic_latin_involved
    ):
        return False

    return True
808
+
809
+
810
@lru_cache(maxsize=2048)
def mess_ratio(
    decoded_sequence: str, maximum_threshold: float = 0.2, debug: bool = False
) -> float:
    """
    Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.

    The sequence is scanned in blocks of 32, 64 or 128 characters depending on
    its length. After each block the ratios of the nine detectors are summed
    (despite its name, `mean_mess_ratio` is a sum, not an average) and the scan
    stops as soon as that sum reaches `maximum_threshold`.

    :param decoded_sequence: Text to analyse.
    :param maximum_threshold: Sum at or above which scanning stops early.
    :param debug: When True, log the result and each detector ratio at TRACE level.
    :return: The summed detector ratios, rounded to 3 decimals.
    """

    seq_len: int = len(decoded_sequence)

    # Block size between two intermediate evaluations of the summed ratio.
    if seq_len < 511:
        step: int = 32
    elif seq_len < 1024:
        step = 64
    else:
        step = 128

    # Create each detector as a named local variable (unrolled from the generic loop).
    # This eliminates per-character iteration over the detector list and
    # per-character eligible() virtual dispatch, while keeping every plugin class
    # intact and fully readable.
    d_sp: TooManySymbolOrPunctuationPlugin = TooManySymbolOrPunctuationPlugin()
    d_ta: TooManyAccentuatedPlugin = TooManyAccentuatedPlugin()
    d_up: UnprintablePlugin = UnprintablePlugin()
    d_sda: SuspiciousDuplicateAccentPlugin = SuspiciousDuplicateAccentPlugin()
    d_sr: SuspiciousRange = SuspiciousRange()
    d_sw: SuperWeirdWordPlugin = SuperWeirdWordPlugin()
    d_cu: CjkUncommonPlugin = CjkUncommonPlugin()
    d_au: ArchaicUpperLowerPlugin = ArchaicUpperLowerPlugin()
    d_ai: ArabicIsolatedFormPlugin = ArabicIsolatedFormPlugin()

    # Local references for feed_info methods called in the hot loop.
    d_sp_feed = d_sp.feed_info
    d_ta_feed = d_ta.feed_info
    d_up_feed = d_up.feed_info
    d_sda_feed = d_sda.feed_info
    d_sr_feed = d_sr.feed_info
    d_sw_feed = d_sw.feed_info
    d_cu_feed = d_cu.feed_info
    d_au_feed = d_au.feed_info
    d_ai_feed = d_ai.feed_info

    # Single reusable CharInfo object (avoids per-character allocation).
    info: CharInfo = CharInfo()
    info_update = info.update

    mean_mess_ratio: float

    for block_start in range(0, seq_len, step):
        for character in decoded_sequence[block_start : block_start + step]:
            # Pre-compute all character properties once (shared across all plugins).
            info_update(character)

            # Detectors with eligible() == always True
            d_up_feed(character, info)
            d_sw_feed(character, info)
            d_au_feed(character, info)

            # Detectors with eligible() == isprintable
            if info.printable:
                d_sp_feed(character, info)
                d_sr_feed(character, info)

            # Detectors with eligible() == isalpha
            if info.alpha:
                d_ta_feed(character, info)
                # SuspiciousDuplicateAccent: isalpha() and is_latin()
                if info.latin:
                    d_sda_feed(character, info)
            # NOTE(review): the nesting level of the two checks below was
            # reconstructed from a flattened dump; they are placed outside the
            # isalpha branch because their comments name is_cjk() / is_arabic()
            # as the only condition. Confirm against the upstream file.
            # CjkUncommon: is_cjk()
            if info.is_cjk:
                d_cu_feed(character, info)
            # ArabicIsolatedForm: is_arabic()
            if info.is_arabic:
                d_ai_feed(character, info)

        # Intermediate evaluation once the block has been fed.
        mean_mess_ratio = (
            d_sp.ratio
            + d_ta.ratio
            + d_up.ratio
            + d_sda.ratio
            + d_sr.ratio
            + d_sw.ratio
            + d_cu.ratio
            + d_au.ratio
            + d_ai.ratio
        )

        if mean_mess_ratio >= maximum_threshold:
            break
    else:
        # Reached only when the scan was not interrupted by `break`
        # (including the empty-sequence case, where no block is fed).
        # Flush last word buffer in SuperWeirdWordPlugin via trailing newline.
        info_update("\n")
        d_sw_feed("\n", info)
        d_au_feed("\n", info)
        d_up_feed("\n", info)

        mean_mess_ratio = (
            d_sp.ratio
            + d_ta.ratio
            + d_up.ratio
            + d_sda.ratio
            + d_sr.ratio
            + d_sw.ratio
            + d_cu.ratio
            + d_au.ratio
            + d_ai.ratio
        )

    if debug:  # Defensive:
        logger = getLogger("charset_normalizer")

        logger.log(
            TRACE,
            "Mess-detector extended-analysis start. "
            f"intermediary_mean_mess_ratio_calc={step} mean_mess_ratio={mean_mess_ratio} "
            f"maximum_threshold={maximum_threshold}",
        )

        if seq_len > 16:
            logger.log(TRACE, f"Starting with: {decoded_sequence[:16]}")
            logger.log(TRACE, f"Ending with: {decoded_sequence[-16::]}")

        for dt in [d_sp, d_ta, d_up, d_sda, d_sr, d_sw, d_cu, d_au, d_ai]:
            logger.log(TRACE, f"{dt.__class__}: {dt.ratio}")

    return round(mean_mess_ratio, 3)
.venv/lib/python3.14/site-packages/charset_normalizer/models.py ADDED
@@ -0,0 +1,369 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from encodings.aliases import aliases
4
+ from json import dumps
5
+ from re import sub
6
+ from typing import Any, Iterator, List, Tuple
7
+
8
+ from .constant import RE_POSSIBLE_ENCODING_INDICATION, TOO_BIG_SEQUENCE
9
+ from .utils import iana_name, is_multi_byte_encoding, unicode_range
10
+
11
+
12
class CharsetMatch:
    """
    One candidate decoding of a bytes payload.

    Holds the raw payload, the guessed encoding, the mess ("chaos") ratio, the
    detected languages with their coherence, and lazily decodes the payload on
    first string access.
    """

    def __init__(
        self,
        payload: bytes | bytearray,
        guessed_encoding: str,
        mean_mess_ratio: float,
        has_sig_or_bom: bool,
        languages: CoherenceMatches,
        decoded_payload: str | None = None,
        preemptive_declaration: str | None = None,
    ):
        self._payload: bytes | bytearray = payload

        self._encoding: str = guessed_encoding
        self._mean_mess_ratio: float = mean_mess_ratio
        self._languages: CoherenceMatches = languages
        self._has_sig_or_bom: bool = has_sig_or_bom
        # Cache filled by the `alphabets` property.
        self._unicode_ranges: list[str] | None = None

        # Other matches that decode to the same text (see add_submatch).
        self._leaves: list[CharsetMatch] = []
        self._mean_coherence_ratio: float = 0.0

        # Cache of the last `output()` call.
        self._output_payload: bytes | None = None
        self._output_encoding: str | None = None

        # Decoded text; None until first needed.
        self._string: str | None = decoded_payload

        self._preemptive_declaration: str | None = preemptive_declaration

    def __eq__(self, other: object) -> bool:
        """Compare with another match (encoding + fingerprint) or with an encoding name."""
        if not isinstance(other, CharsetMatch):
            if isinstance(other, str):
                return iana_name(other) == self.encoding
            return False
        return self.encoding == other.encoding and self.fingerprint == other.fingerprint

    def __lt__(self, other: object) -> bool:
        """
        Implemented to make sorted available upon CharsetMatches items.

        :raises ValueError: when `other` is not a CharsetMatch.
        """
        if not isinstance(other, CharsetMatch):
            raise ValueError

        chaos_difference: float = abs(self.chaos - other.chaos)
        coherence_difference: float = abs(self.coherence - other.coherence)

        # Below 0.5% difference --> Use Coherence
        if chaos_difference < 0.005 and coherence_difference > 0.02:
            return self.coherence > other.coherence
        elif chaos_difference < 0.005 and coherence_difference <= 0.02:
            # When having a difficult decision, use the result that decoded as many multi-byte as possible.
            # preserve RAM usage!
            if len(self._payload) >= TOO_BIG_SEQUENCE:
                return self.chaos < other.chaos
            return self.multi_byte_usage > other.multi_byte_usage

        return self.chaos < other.chaos

    @property
    def multi_byte_usage(self) -> float:
        """
        Return `1 - decoded_length / raw_length`.

        An empty payload yields 0.0 instead of raising ZeroDivisionError.
        """
        raw_length: int = len(self.raw)
        if raw_length == 0:
            return 0.0
        return 1.0 - (len(str(self)) / raw_length)

    def __str__(self) -> str:
        # Lazy Str Loading
        if self._string is None:
            self._string = str(self._payload, self._encoding, "strict")
            # UTF-7 BOM is encoded in modified Base64 whose byte boundary
            # can overlap with the next character, so raw-byte stripping
            # is unreliable. Strip the decoded BOM character instead.
            if (
                self._has_sig_or_bom
                and self._encoding == "utf_7"
                and self._string
                and self._string[0] == "\ufeff"
            ):
                self._string = self._string[1:]
        return self._string

    def __repr__(self) -> str:
        return f"<CharsetMatch '{self.encoding}' fp({self.fingerprint})>"

    def add_submatch(self, other: CharsetMatch) -> None:
        """
        Register `other` as a submatch of this match.

        :raises ValueError: when `other` is not a CharsetMatch or equals this match.
        """
        if not isinstance(other, CharsetMatch) or other == self:
            raise ValueError(
                "Unable to add instance <{}> as a submatch of a CharsetMatch".format(
                    other.__class__
                )
            )

        other._string = None  # Unload RAM usage; dirty trick.
        self._leaves.append(other)

    @property
    def encoding(self) -> str:
        return self._encoding

    @property
    def encoding_aliases(self) -> list[str]:
        """
        Encoding name are known by many name, using this could help when searching for IBM855 when it's listed as CP855.
        """
        also_known_as: list[str] = []
        for u, p in aliases.items():
            if self.encoding == u:
                also_known_as.append(p)
            elif self.encoding == p:
                also_known_as.append(u)
        return also_known_as

    @property
    def bom(self) -> bool:
        return self._has_sig_or_bom

    @property
    def byte_order_mark(self) -> bool:
        return self._has_sig_or_bom

    @property
    def languages(self) -> list[str]:
        """
        Return the complete list of possible languages found in decoded sequence.
        Usually not really useful. Returned list may be empty even if 'language' property return something != 'Unknown'.
        """
        return [e[0] for e in self._languages]

    @property
    def language(self) -> str:
        """
        Most probable language found in decoded sequence. If none were detected or inferred, the property will return
        "Unknown".
        """
        if not self._languages:
            # Trying to infer the language based on the given encoding
            # Its either English or we should not pronounce ourselves in certain cases.
            if "ascii" in self.could_be_from_charset:
                return "English"

            # doing it there to avoid circular import
            from charset_normalizer.cd import encoding_languages, mb_encoding_languages

            languages = (
                mb_encoding_languages(self.encoding)
                if is_multi_byte_encoding(self.encoding)
                else encoding_languages(self.encoding)
            )

            if len(languages) == 0 or "Latin Based" in languages:
                return "Unknown"

            return languages[0]

        return self._languages[0][0]

    @property
    def chaos(self) -> float:
        return self._mean_mess_ratio

    @property
    def coherence(self) -> float:
        """Coherence ratio of the first language entry, or 0.0 when there is none."""
        if not self._languages:
            return 0.0
        return self._languages[0][1]

    @property
    def percent_chaos(self) -> float:
        return round(self.chaos * 100, ndigits=3)

    @property
    def percent_coherence(self) -> float:
        return round(self.coherence * 100, ndigits=3)

    @property
    def raw(self) -> bytes | bytearray:
        """
        Original untouched bytes.
        """
        return self._payload

    @property
    def submatch(self) -> list[CharsetMatch]:
        return self._leaves

    @property
    def has_submatch(self) -> bool:
        return len(self._leaves) > 0

    @property
    def alphabets(self) -> list[str]:
        """Sorted, de-duplicated Unicode range names found in the decoded text (cached)."""
        if self._unicode_ranges is not None:
            return self._unicode_ranges
        # list detected ranges
        detected_ranges: list[str | None] = [unicode_range(char) for char in str(self)]
        # filter and sort
        self._unicode_ranges = sorted({r for r in detected_ranges if r})
        return self._unicode_ranges

    @property
    def could_be_from_charset(self) -> list[str]:
        """
        The complete list of encoding that output the exact SAME str result and therefore could be the originating
        encoding.
        This list does include the encoding available in property 'encoding'.
        """
        return [self._encoding] + [m.encoding for m in self._leaves]

    def output(self, encoding: str = "utf_8") -> bytes:
        """
        Method to get re-encoded bytes payload using given target encoding. Default to UTF-8.
        Characters the target encoding cannot represent are substituted by the
        encoder ("replace" error handler).

        When a preemptive declaration other than UTF-8 is known, the first
        encoding indication found in the first 8192 characters is rewritten to
        name the target encoding. The result is cached per target encoding.
        """
        if self._output_encoding is None or self._output_encoding != encoding:
            self._output_encoding = encoding
            decoded_string = str(self)
            if (
                self._preemptive_declaration is not None
                and self._preemptive_declaration.lower()
                not in ["utf-8", "utf8", "utf_8"]
            ):
                patched_header = sub(
                    RE_POSSIBLE_ENCODING_INDICATION,
                    lambda m: m.string[m.span()[0] : m.span()[1]].replace(
                        m.groups()[0],
                        iana_name(self._output_encoding).replace("_", "-"),  # type: ignore[arg-type]
                    ),
                    decoded_string[:8192],
                    count=1,
                )

                decoded_string = patched_header + decoded_string[8192:]

            self._output_payload = decoded_string.encode(encoding, "replace")

        return self._output_payload  # type: ignore

    @property
    def fingerprint(self) -> int:
        """
        Retrieve a hash fingerprint of the decoded payload, used for deduplication.
        """
        return hash(str(self))
252
+
253
+
254
class CharsetMatches:
    """
    Sorted collection of CharsetMatch items, most probable first.
    Behaves like a read-mostly list but only implements a subset of list methods.
    """

    def __init__(self, results: list[CharsetMatch] | None = None):
        self._results: list[CharsetMatch] = sorted(results) if results else []

    def __iter__(self) -> Iterator[CharsetMatch]:
        yield from self._results

    def __getitem__(self, item: int | str) -> CharsetMatch:
        """
        Fetch one match by position or by encoding name (an alias is accepted).
        An out-of-range position raises IndexError; an encoding that is not
        present, or a key of any other type, raises KeyError.
        """
        if isinstance(item, int):
            return self._results[item]

        if not isinstance(item, str):
            raise KeyError

        wanted = iana_name(item, False)
        for candidate in self._results:
            if wanted in candidate.could_be_from_charset:
                return candidate

        raise KeyError

    def __len__(self) -> int:
        return len(self._results)

    def __bool__(self) -> bool:
        return len(self._results) > 0

    def append(self, item: CharsetMatch) -> None:
        """
        Add one match and keep the collection sorted.
        A match whose fingerprint and chaos equal an existing one is attached
        to that one as a submatch instead.
        """
        if not isinstance(item, CharsetMatch):
            raise ValueError(
                "Cannot append instance '{}' to CharsetMatches".format(
                    str(item.__class__)
                )
            )

        # We should disable the submatch factoring when the input file is too heavy (conserve RAM usage)
        factoring_allowed = len(item.raw) < TOO_BIG_SEQUENCE
        if factoring_allowed:
            for existing in self._results:
                same_text = existing.fingerprint == item.fingerprint
                if same_text and existing.chaos == item.chaos:
                    existing.add_submatch(item)
                    return

        self._results.append(item)
        self._results = sorted(self._results)

    def best(self) -> CharsetMatch | None:
        """
        Return the first (most probable) match, or None when empty.
        """
        return self._results[0] if self._results else None

    def first(self) -> CharsetMatch | None:
        """
        Alias of best(), kept for backward compatibility.
        """
        return self.best()
319
+
320
+
321
# One coherence result: (language name, coherence ratio).
CoherenceMatch = Tuple[str, float]
# List of coherence results; CharsetMatch reads its first entry as the main language.
CoherenceMatches = List[CoherenceMatch]
323
+
324
+
325
class CliDetectionResult:
    """Plain record of one detection result, serializable to JSON."""

    def __init__(
        self,
        path: str,
        encoding: str | None,
        encoding_aliases: list[str],
        alternative_encodings: list[str],
        language: str,
        alphabets: list[str],
        has_sig_or_bom: bool,
        chaos: float,
        coherence: float,
        unicode_path: str | None,
        is_preferred: bool,
    ):
        self.path: str = path
        self.unicode_path: str | None = unicode_path
        self.encoding: str | None = encoding
        self.encoding_aliases: list[str] = encoding_aliases
        self.alternative_encodings: list[str] = alternative_encodings
        self.language: str = language
        self.alphabets: list[str] = alphabets
        self.has_sig_or_bom: bool = has_sig_or_bom
        self.chaos: float = chaos
        self.coherence: float = coherence
        self.is_preferred: bool = is_preferred

    @property
    def __dict__(self) -> dict[str, Any]:  # type: ignore
        """Expose the fields as a dict, in the key order used for JSON output."""
        ordered_fields = (
            "path",
            "encoding",
            "encoding_aliases",
            "alternative_encodings",
            "language",
            "alphabets",
            "has_sig_or_bom",
            "chaos",
            "coherence",
            "unicode_path",
            "is_preferred",
        )
        return {field: getattr(self, field) for field in ordered_fields}

    def to_json(self) -> str:
        """Serialize the fields as ASCII-only JSON indented by 4 spaces."""
        payload = self.__dict__
        return dumps(payload, ensure_ascii=True, indent=4)
.venv/lib/python3.14/site-packages/charset_normalizer/py.typed ADDED
File without changes
.venv/lib/python3.14/site-packages/charset_normalizer/utils.py ADDED
@@ -0,0 +1,422 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import importlib
4
+ import logging
5
+ import unicodedata
6
+ from bisect import bisect_right
7
+ from codecs import IncrementalDecoder
8
+ from encodings.aliases import aliases
9
+ from functools import lru_cache
10
+ from re import findall
11
+ from typing import Generator
12
+
13
+ from _multibytecodec import ( # type: ignore[import-not-found,import]
14
+ MultibyteIncrementalDecoder,
15
+ )
16
+
17
+ from .constant import (
18
+ ENCODING_MARKS,
19
+ IANA_SUPPORTED_SIMILAR,
20
+ RE_POSSIBLE_ENCODING_INDICATION,
21
+ UNICODE_RANGES_COMBINED,
22
+ UNICODE_SECONDARY_RANGE_KEYWORD,
23
+ UTF8_MAXIMAL_ALLOCATION,
24
+ COMMON_CJK_CHARACTERS,
25
+ _LATIN,
26
+ _CJK,
27
+ _HANGUL,
28
+ _KATAKANA,
29
+ _HIRAGANA,
30
+ _THAI,
31
+ _ARABIC,
32
+ _ARABIC_ISOLATED_FORM,
33
+ _ACCENT_KEYWORDS,
34
+ _ACCENTUATED,
35
+ )
36
+
37
+
38
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def _character_flags(character: str) -> int:
    """
    Build the classification bit mask of a character from its Unicode name.

    unicodedata.name() is called once; a character without a name yields 0.
    """
    try:
        unicode_name: str = unicodedata.name(character)
    except ValueError:
        return 0

    # Name keyword -> flag bit raised when the keyword occurs in the name.
    keyword_bits = (
        ("LATIN", _LATIN),
        ("CJK", _CJK),
        ("HANGUL", _HANGUL),
        ("KATAKANA", _KATAKANA),
        ("HIRAGANA", _HIRAGANA),
        ("THAI", _THAI),
        ("ARABIC", _ARABIC),
        ("ISOLATED FORM", _ARABIC_ISOLATED_FORM),
    )

    flags: int = 0
    for keyword, bit in keyword_bits:
        if keyword in unicode_name:
            flags |= bit

    # A single accent keyword is enough to mark the character as accentuated.
    if any(marker in unicode_name for marker in _ACCENT_KEYWORDS):
        flags |= _ACCENTUATED

    return flags
71
+
72
+
73
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_accentuated(character: str) -> bool:
    """Tell whether the accentuated bit is set in the character's flags."""
    return (_character_flags(character) & _ACCENTUATED) != 0
76
+
77
+
78
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def remove_accent(character: str) -> str:
    """
    Return the base character of a canonically decomposable character.

    The character is returned unchanged when it has no decomposition, or when
    its decomposition is a compatibility mapping (one that starts with a tag
    such as "<compat>"), since such a tag is not a hexadecimal code point.
    """
    decomposed: str = unicodedata.decomposition(character)
    if not decomposed:
        return character

    codes: list[str] = decomposed.split(" ")

    # e.g. "<compat> 0020 0308": int("<compat>", 16) would raise ValueError.
    if codes[0].startswith("<"):
        return character

    return chr(int(codes[0], 16))
87
+
88
+
89
# Lookup tables used by unicode_range() for an O(log n) binary search.
# _UNICODE_RANGES_SORTED holds (range_start, range_end_exclusive, range_name)
# tuples in ascending order; _UNICODE_RANGE_STARTS holds only the starts.
_UNICODE_RANGES_SORTED: list[tuple[int, int, str]] = sorted(
    [
        (bounds.start, bounds.stop, label)
        for label, bounds in UNICODE_RANGES_COMBINED.items()
    ]
)
_UNICODE_RANGE_STARTS: list[int] = [
    range_start for range_start, _, _ in _UNICODE_RANGES_SORTED
]
96
+
97
+
98
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def unicode_range(character: str) -> str | None:
    """
    Return the name of the Unicode range containing the character, or None.
    """
    code_point: int = ord(character)

    # Binary search: index of the last range starting at or before code_point.
    position = bisect_right(_UNICODE_RANGE_STARTS, code_point) - 1
    if position < 0:
        return None

    _, upper_bound, range_name = _UNICODE_RANGES_SORTED[position]

    # The candidate only matches when the code point is below its exclusive end.
    return range_name if code_point < upper_bound else None
113
+
114
+
115
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_latin(character: str) -> bool:
    """Tell whether the Latin bit is set in the character's flags."""
    return (_character_flags(character) & _LATIN) != 0
118
+
119
+
120
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_punctuation(character: str) -> bool:
    """
    Tell whether the character is punctuation, either by its Unicode category
    (contains "P") or by the name of its Unicode range.
    """
    if "P" in unicodedata.category(character):
        return True

    range_name: str | None = unicode_range(character)

    return range_name is not None and "Punctuation" in range_name
133
+
134
+
135
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_symbol(character: str) -> bool:
    """
    Tell whether the character counts as a symbol: its category contains "S"
    or "N", or it sits in a "Forms" range without being category "Lo".
    """
    category: str = unicodedata.category(character)

    if "S" in category or "N" in category:
        return True

    range_name: str | None = unicode_range(character)

    return range_name is not None and "Forms" in range_name and category != "Lo"
148
+
149
+
150
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_emoticon(character: str) -> bool:
    """Tell whether the character's Unicode range name mentions Emoticons or Pictographs."""
    range_name: str | None = unicode_range(character)

    return range_name is not None and (
        "Emoticons" in range_name or "Pictographs" in range_name
    )
158
+
159
+
160
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_separator(character: str) -> bool:
    """
    Tell whether the character separates words: whitespace, one of | + < >,
    a "Z" category, or category Po, Pd or Pc.
    """
    if character.isspace():
        return True

    if character in {"|", "+", "<", ">"}:
        return True

    category: str = unicodedata.category(character)

    if "Z" in category:
        return True

    return category in {"Po", "Pd", "Pc"}
168
+
169
+
170
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_case_variable(character: str) -> bool:
    """Tell whether exactly one of islower() / isupper() holds for the character."""
    lowercase: bool = character.islower()
    uppercase: bool = character.isupper()
    return lowercase != uppercase
173
+
174
+
175
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_cjk(character: str) -> bool:
    """Tell whether the CJK bit is set in the character's flags."""
    return (_character_flags(character) & _CJK) != 0
178
+
179
+
180
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_hiragana(character: str) -> bool:
    """Tell whether the Hiragana bit is set in the character's flags."""
    return (_character_flags(character) & _HIRAGANA) != 0
183
+
184
+
185
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_katakana(character: str) -> bool:
    """Tell whether the Katakana bit is set in the character's flags."""
    return (_character_flags(character) & _KATAKANA) != 0
188
+
189
+
190
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_hangul(character: str) -> bool:
    """Tell whether the Hangul bit is set in the character's flags."""
    return (_character_flags(character) & _HANGUL) != 0
193
+
194
+
195
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_thai(character: str) -> bool:
    """Tell whether the Thai bit is set in the character's flags."""
    return (_character_flags(character) & _THAI) != 0
198
+
199
+
200
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_arabic(character: str) -> bool:
    """Tell whether the Arabic bit is set in the character's flags."""
    return (_character_flags(character) & _ARABIC) != 0
203
+
204
+
205
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_arabic_isolated_form(character: str) -> bool:
    """Tell whether the isolated-form bit is set in the character's flags."""
    return (_character_flags(character) & _ARABIC_ISOLATED_FORM) != 0
208
+
209
+
210
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_cjk_uncommon(character: str) -> bool:
    """Tell whether the character is absent from COMMON_CJK_CHARACTERS."""
    is_common: bool = character in COMMON_CJK_CHARACTERS
    return not is_common
213
+
214
+
215
@lru_cache(maxsize=len(UNICODE_RANGES_COMBINED))
def is_unicode_range_secondary(range_name: str) -> bool:
    """Tell whether the range name contains any secondary-range keyword."""
    for keyword in UNICODE_SECONDARY_RANGE_KEYWORD:
        if keyword in range_name:
            return True
    return False
218
+
219
+
220
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_unprintable(character: str) -> bool:
    """
    Tell whether the character is neither whitespace nor printable, with two
    explicit exceptions that are never reported as unprintable.
    """
    # includes \n \t \r \v
    if character.isspace():
        return False

    if character.isprintable():
        return False

    # Why? Its the ASCII substitute character.
    if character == "\x1a":
        return False

    # bug discovered in Python,
    # Zero Width No-Break Space located in Arabic Presentation Forms-B, Unicode 1.1 not acknowledged as space.
    return character != "\ufeff"
229
+
230
+
231
def any_specified_encoding(
    sequence: bytes | bytearray, search_zone: int = 8192
) -> str | None:
    """
    Look for an encoding declaration in the first `search_zone` bytes, decoded
    as ASCII with undecodable bytes dropped.

    Returns the Python codec name of the first declaration that matches a
    known alias or codec name, or None when nothing matches.
    Raises TypeError when `sequence` is neither bytes nor bytearray.
    """
    if not isinstance(sequence, (bytes, bytearray)):
        raise TypeError

    zone_end: int = min(len(sequence), search_zone)
    head: str = sequence[:zone_end].decode("ascii", errors="ignore")

    for declared in findall(RE_POSSIBLE_ENCODING_INDICATION, head):
        normalized: str = declared.lower().replace("-", "_")

        # Accept the declaration whether it is an alias or the codec name itself.
        for encoding_alias, encoding_iana in aliases.items():
            if normalized in (encoding_alias, encoding_iana):
                return encoding_iana

    return None
263
+
264
+
265
@lru_cache(maxsize=128)
def is_multi_byte_encoding(name: str) -> bool:
    """
    Tell whether the encoding called *name* is a multi-byte one.

    *name* is expected to be the Python-normalized codec name: it is used
    verbatim to import ``encodings.<name>``. The UTF family is recognized by
    name; any other codec qualifies when its ``IncrementalDecoder`` derives
    from ``MultibyteIncrementalDecoder``.
    """
    unicode_transformation_formats = {
        "utf_8",
        "utf_8_sig",
        "utf_16",
        "utf_16_be",
        "utf_16_le",
        "utf_32",
        "utf_32_le",
        "utf_32_be",
        "utf_7",
    }
    if name in unicode_transformation_formats:
        return True

    decoder_cls = importlib.import_module(f"encodings.{name}").IncrementalDecoder
    return issubclass(decoder_cls, MultibyteIncrementalDecoder)
284
+
285
+
286
def identify_sig_or_bom(sequence: bytes | bytearray) -> tuple[str | None, bytes]:
    """
    Detect a leading SIG/BOM in *sequence*.

    Entries of ``ENCODING_MARKS`` are tried in iteration order; an entry may
    hold a single mark or a list of marks. The first mark that *sequence*
    starts with is returned together with its encoding name. When nothing
    matches, ``(None, b"")`` is returned.
    """
    for encoding_name in ENCODING_MARKS:
        declared = ENCODING_MARKS[encoding_name]
        candidates = [declared] if isinstance(declared, bytes) else declared

        matched = next(
            (mark for mark in candidates if sequence.startswith(mark)), None
        )
        if matched is not None:
            return encoding_name, matched

    return None, b""
302
+
303
+
304
def should_strip_sig_or_bom(iana_encoding: str) -> bool:
    """
    Tell whether the SIG/BOM should be stripped for *iana_encoding*.

    Returns ``False`` only for ``"utf_16"`` and ``"utf_32"``; every other name
    yields ``True``.
    """
    if iana_encoding in {"utf_16", "utf_32"}:
        return False
    return True
306
+
307
+
308
def iana_name(cp_name: str, strict: bool = True) -> str:
    """
    Return the Python-normalized encoding name (not the official IANA name).

    *cp_name* is lower-cased and has ``-`` replaced by ``_`` before being
    compared with the key and the value of each ``aliases`` entry; the value
    of the first matching entry is returned.

    When nothing matches, ``ValueError`` is raised if *strict* is true,
    otherwise the normalized *cp_name* itself is returned.
    """
    cp_name = cp_name.lower().replace("-", "_")

    for alias, python_name in aliases.items():
        if cp_name == alias or cp_name == python_name:
            return python_name

    if not strict:
        return cp_name

    raise ValueError(f"Unable to retrieve IANA for '{cp_name}'")
323
+
324
+
325
def cp_similarity(iana_name_a: str, iana_name_b: str) -> float:
    """
    Measure how alike two single-byte code pages are.

    Every byte value from 0 to 255 is decoded with both codecs (errors
    ignored); the returned ratio is the share of byte values for which both
    decoders produce the same text. ``0.0`` is returned right away when either
    encoding is reported as multi-byte by ``is_multi_byte_encoding``.
    """
    if any(is_multi_byte_encoding(name) for name in (iana_name_a, iana_name_b)):
        return 0.0

    decoder_cls_a = importlib.import_module(
        f"encodings.{iana_name_a}"
    ).IncrementalDecoder
    decoder_cls_b = importlib.import_module(
        f"encodings.{iana_name_b}"
    ).IncrementalDecoder

    left = decoder_cls_a(errors="ignore")
    right = decoder_cls_b(errors="ignore")

    single_bytes = (bytes([value]) for value in range(256))
    # Booleans sum as 0/1, so this counts the byte values decoded identically.
    identical = sum(left.decode(raw) == right.decode(raw) for raw in single_bytes)

    return identical / 256
343
+
344
+
345
def is_cp_similar(iana_name_a: str, iana_name_b: str) -> bool:
    """
    Tell whether *iana_name_b* is listed as similar to *iana_name_a*.

    This is a lookup in ``IANA_SUPPORTED_SIMILAR``. Per the upstream
    description, that table was generated with ``cp_similarity`` and holds
    code pages that are at least 80% similar.
    """
    if iana_name_a not in IANA_SUPPORTED_SIMILAR:
        return False
    return iana_name_b in IANA_SUPPORTED_SIMILAR[iana_name_a]
354
+
355
+
356
def set_logging_handler(
    name: str = "charset_normalizer",
    level: int = logging.INFO,
    format_string: str = "%(asctime)s | %(levelname)s | %(message)s",
) -> None:
    """
    Attach a new stream handler to the logger called *name*.

    The logger level is set to *level* and a fresh ``logging.StreamHandler``
    formatted with *format_string* is added. Each call adds another handler.
    """
    target = logging.getLogger(name)
    target.setLevel(level)

    stream_handler = logging.StreamHandler()
    formatter = logging.Formatter(format_string)
    stream_handler.setFormatter(formatter)

    target.addHandler(stream_handler)
367
+
368
+
369
+ def cut_sequence_chunks(
370
+ sequences: bytes | bytearray,
371
+ encoding_iana: str,
372
+ offsets: range,
373
+ chunk_size: int,
374
+ bom_or_sig_available: bool,
375
+ strip_sig_or_bom: bool,
376
+ sig_payload: bytes,
377
+ is_multi_byte_decoder: bool,
378
+ decoded_payload: str | None = None,
379
+ ) -> Generator[str, None, None]:
380
+ if decoded_payload and is_multi_byte_decoder is False:
381
+ for i in offsets:
382
+ chunk = decoded_payload[i : i + chunk_size]
383
+ if not chunk:
384
+ break
385
+ yield chunk
386
+ else:
387
+ for i in offsets:
388
+ chunk_end = i + chunk_size
389
+ if chunk_end > len(sequences) + 8:
390
+ continue
391
+
392
+ cut_sequence = sequences[i : i + chunk_size]
393
+
394
+ if bom_or_sig_available and strip_sig_or_bom is False:
395
+ cut_sequence = sig_payload + cut_sequence
396
+
397
+ chunk = cut_sequence.decode(
398
+ encoding_iana,
399
+ errors="ignore" if is_multi_byte_decoder else "strict",
400
+ )
401
+
402
+ # multi-byte bad cutting detector and adjustment
403
+ # not the cleanest way to perform that fix but clever enough for now.
404
+ if is_multi_byte_decoder and i > 0:
405
+ chunk_partial_size_chk: int = min(chunk_size, 16)
406
+
407
+ if (
408
+ decoded_payload
409
+ and chunk[:chunk_partial_size_chk] not in decoded_payload
410
+ ):
411
+ for j in range(i, i - 4, -1):
412
+ cut_sequence = sequences[j:chunk_end]
413
+
414
+ if bom_or_sig_available and strip_sig_or_bom is False:
415
+ cut_sequence = sig_payload + cut_sequence
416
+
417
+ chunk = cut_sequence.decode(encoding_iana, errors="ignore")
418
+
419
+ if chunk[:chunk_partial_size_chk] in decoded_payload:
420
+ break
421
+
422
+ yield chunk
.venv/lib/python3.14/site-packages/charset_normalizer/version.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
"""
Expose the package version, both as a string and as its dot-separated parts.
"""

from __future__ import annotations

__version__ = "3.4.7"
# Components of ``__version__`` split on ".", kept as strings.
VERSION = __version__.split(".")
.venv/lib/python3.14/site-packages/httpx-0.28.1.dist-info/RECORD CHANGED
@@ -1,4 +1,4 @@
1
- ../../../bin/httpx,sha256=dBzeG2pwxWznqM8RL6D2txukmubdml37PBQ3IW9pckE,318
2
  httpx-0.28.1.dist-info/INSTALLER,sha256=5hhM4Q4mYTT9z6QB6PGpUAW81PGNFrYrdXMj4oM_6ak,2
3
  httpx-0.28.1.dist-info/METADATA,sha256=_rubD48-gNV8gZnDBPNcQzboWB0dGNeYPJJ2a4J5OyU,7052
4
  httpx-0.28.1.dist-info/RECORD,,
 
1
+ ../../../bin/httpx,sha256=D-7W1mnrCVymlIylxMyzoiH-Fjn-0Uvm1S8L7qmruyc,319
2
  httpx-0.28.1.dist-info/INSTALLER,sha256=5hhM4Q4mYTT9z6QB6PGpUAW81PGNFrYrdXMj4oM_6ak,2
3
  httpx-0.28.1.dist-info/METADATA,sha256=_rubD48-gNV8gZnDBPNcQzboWB0dGNeYPJJ2a4J5OyU,7052
4
  httpx-0.28.1.dist-info/RECORD,,
.venv/lib/python3.14/site-packages/huggingface_hub-1.14.0.dist-info/INSTALLER ADDED
@@ -0,0 +1 @@
 
 
1
+ uv
.venv/lib/python3.14/site-packages/huggingface_hub-1.14.0.dist-info/METADATA ADDED
@@ -0,0 +1,324 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Metadata-Version: 2.4
2
+ Name: huggingface_hub
3
+ Version: 1.14.0
4
+ Summary: Client library to download and publish models, datasets and other repos on the huggingface.co hub
5
+ Home-page: https://github.com/huggingface/huggingface_hub
6
+ Author: Hugging Face, Inc.
7
+ Author-email: julien@huggingface.co
8
+ License: Apache-2.0
9
+ Keywords: model-hub machine-learning models natural-language-processing deep-learning pytorch pretrained-models
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: Intended Audience :: Education
12
+ Classifier: Intended Audience :: Science/Research
13
+ Classifier: License :: OSI Approved :: Apache Software License
14
+ Classifier: Operating System :: OS Independent
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3 :: Only
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Programming Language :: Python :: 3.13
21
+ Classifier: Programming Language :: Python :: 3.14
22
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
23
+ Requires-Python: >=3.10.0
24
+ Description-Content-Type: text/markdown
25
+ License-File: LICENSE
26
+ Requires-Dist: filelock>=3.10.0
27
+ Requires-Dist: fsspec>=2023.5.0
28
+ Requires-Dist: hf-xet<2.0.0,>=1.4.3; platform_machine == "x86_64" or platform_machine == "amd64" or platform_machine == "AMD64" or platform_machine == "arm64" or platform_machine == "aarch64"
29
+ Requires-Dist: httpx<1,>=0.23.0
30
+ Requires-Dist: packaging>=20.9
31
+ Requires-Dist: pyyaml>=5.1
32
+ Requires-Dist: tqdm>=4.42.1
33
+ Requires-Dist: typer>=0.20.0
34
+ Requires-Dist: typing-extensions>=4.1.0
35
+ Provides-Extra: oauth
36
+ Requires-Dist: authlib>=1.3.2; extra == "oauth"
37
+ Requires-Dist: fastapi; extra == "oauth"
38
+ Requires-Dist: httpx; extra == "oauth"
39
+ Requires-Dist: itsdangerous; extra == "oauth"
40
+ Provides-Extra: torch
41
+ Requires-Dist: torch; extra == "torch"
42
+ Requires-Dist: safetensors[torch]; extra == "torch"
43
+ Provides-Extra: fastai
44
+ Requires-Dist: toml; extra == "fastai"
45
+ Requires-Dist: fastai>=2.4; extra == "fastai"
46
+ Requires-Dist: fastcore>=1.3.27; extra == "fastai"
47
+ Provides-Extra: hf-xet
48
+ Requires-Dist: hf-xet<2.0.0,>=1.4.3; extra == "hf-xet"
49
+ Provides-Extra: mcp
50
+ Requires-Dist: mcp>=1.8.0; extra == "mcp"
51
+ Provides-Extra: testing
52
+ Requires-Dist: authlib>=1.3.2; extra == "testing"
53
+ Requires-Dist: fastapi; extra == "testing"
54
+ Requires-Dist: httpx; extra == "testing"
55
+ Requires-Dist: itsdangerous; extra == "testing"
56
+ Requires-Dist: jedi; extra == "testing"
57
+ Requires-Dist: Jinja2; extra == "testing"
58
+ Requires-Dist: pytest>=8.4.2; extra == "testing"
59
+ Requires-Dist: pytest-cov; extra == "testing"
60
+ Requires-Dist: pytest-env; extra == "testing"
61
+ Requires-Dist: pytest-xdist; extra == "testing"
62
+ Requires-Dist: pytest-vcr; extra == "testing"
63
+ Requires-Dist: pytest-asyncio; extra == "testing"
64
+ Requires-Dist: pytest-rerunfailures<16.0; extra == "testing"
65
+ Requires-Dist: pytest-mock; extra == "testing"
66
+ Requires-Dist: urllib3<2.0; extra == "testing"
67
+ Requires-Dist: soundfile; extra == "testing"
68
+ Requires-Dist: Pillow; extra == "testing"
69
+ Requires-Dist: numpy; extra == "testing"
70
+ Requires-Dist: duckdb; extra == "testing"
71
+ Requires-Dist: fastapi; extra == "testing"
72
+ Provides-Extra: gradio
73
+ Requires-Dist: gradio>=5.0.0; extra == "gradio"
74
+ Requires-Dist: requests; extra == "gradio"
75
+ Provides-Extra: typing
76
+ Requires-Dist: typing-extensions>=4.8.0; extra == "typing"
77
+ Requires-Dist: types-PyYAML; extra == "typing"
78
+ Requires-Dist: types-simplejson; extra == "typing"
79
+ Requires-Dist: types-toml; extra == "typing"
80
+ Requires-Dist: types-tqdm; extra == "typing"
81
+ Requires-Dist: types-urllib3; extra == "typing"
82
+ Provides-Extra: quality
83
+ Requires-Dist: ruff>=0.9.0; extra == "quality"
84
+ Requires-Dist: mypy==1.15.0; extra == "quality"
85
+ Requires-Dist: libcst>=1.4.0; extra == "quality"
86
+ Requires-Dist: ty; extra == "quality"
87
+ Provides-Extra: all
88
+ Requires-Dist: authlib>=1.3.2; extra == "all"
89
+ Requires-Dist: fastapi; extra == "all"
90
+ Requires-Dist: httpx; extra == "all"
91
+ Requires-Dist: itsdangerous; extra == "all"
92
+ Requires-Dist: jedi; extra == "all"
93
+ Requires-Dist: Jinja2; extra == "all"
94
+ Requires-Dist: pytest>=8.4.2; extra == "all"
95
+ Requires-Dist: pytest-cov; extra == "all"
96
+ Requires-Dist: pytest-env; extra == "all"
97
+ Requires-Dist: pytest-xdist; extra == "all"
98
+ Requires-Dist: pytest-vcr; extra == "all"
99
+ Requires-Dist: pytest-asyncio; extra == "all"
100
+ Requires-Dist: pytest-rerunfailures<16.0; extra == "all"
101
+ Requires-Dist: pytest-mock; extra == "all"
102
+ Requires-Dist: urllib3<2.0; extra == "all"
103
+ Requires-Dist: soundfile; extra == "all"
104
+ Requires-Dist: Pillow; extra == "all"
105
+ Requires-Dist: numpy; extra == "all"
106
+ Requires-Dist: duckdb; extra == "all"
107
+ Requires-Dist: fastapi; extra == "all"
108
+ Requires-Dist: ruff>=0.9.0; extra == "all"
109
+ Requires-Dist: mypy==1.15.0; extra == "all"
110
+ Requires-Dist: libcst>=1.4.0; extra == "all"
111
+ Requires-Dist: ty; extra == "all"
112
+ Requires-Dist: typing-extensions>=4.8.0; extra == "all"
113
+ Requires-Dist: types-PyYAML; extra == "all"
114
+ Requires-Dist: types-simplejson; extra == "all"
115
+ Requires-Dist: types-toml; extra == "all"
116
+ Requires-Dist: types-tqdm; extra == "all"
117
+ Requires-Dist: types-urllib3; extra == "all"
118
+ Provides-Extra: dev
119
+ Requires-Dist: authlib>=1.3.2; extra == "dev"
120
+ Requires-Dist: fastapi; extra == "dev"
121
+ Requires-Dist: httpx; extra == "dev"
122
+ Requires-Dist: itsdangerous; extra == "dev"
123
+ Requires-Dist: jedi; extra == "dev"
124
+ Requires-Dist: Jinja2; extra == "dev"
125
+ Requires-Dist: pytest>=8.4.2; extra == "dev"
126
+ Requires-Dist: pytest-cov; extra == "dev"
127
+ Requires-Dist: pytest-env; extra == "dev"
128
+ Requires-Dist: pytest-xdist; extra == "dev"
129
+ Requires-Dist: pytest-vcr; extra == "dev"
130
+ Requires-Dist: pytest-asyncio; extra == "dev"
131
+ Requires-Dist: pytest-rerunfailures<16.0; extra == "dev"
132
+ Requires-Dist: pytest-mock; extra == "dev"
133
+ Requires-Dist: urllib3<2.0; extra == "dev"
134
+ Requires-Dist: soundfile; extra == "dev"
135
+ Requires-Dist: Pillow; extra == "dev"
136
+ Requires-Dist: numpy; extra == "dev"
137
+ Requires-Dist: duckdb; extra == "dev"
138
+ Requires-Dist: fastapi; extra == "dev"
139
+ Requires-Dist: ruff>=0.9.0; extra == "dev"
140
+ Requires-Dist: mypy==1.15.0; extra == "dev"
141
+ Requires-Dist: libcst>=1.4.0; extra == "dev"
142
+ Requires-Dist: ty; extra == "dev"
143
+ Requires-Dist: typing-extensions>=4.8.0; extra == "dev"
144
+ Requires-Dist: types-PyYAML; extra == "dev"
145
+ Requires-Dist: types-simplejson; extra == "dev"
146
+ Requires-Dist: types-toml; extra == "dev"
147
+ Requires-Dist: types-tqdm; extra == "dev"
148
+ Requires-Dist: types-urllib3; extra == "dev"
149
+ Dynamic: author
150
+ Dynamic: author-email
151
+ Dynamic: classifier
152
+ Dynamic: description
153
+ Dynamic: description-content-type
154
+ Dynamic: home-page
155
+ Dynamic: keywords
156
+ Dynamic: license
157
+ Dynamic: license-file
158
+ Dynamic: provides-extra
159
+ Dynamic: requires-dist
160
+ Dynamic: requires-python
161
+ Dynamic: summary
162
+
163
+ <p align="center">
164
+ <picture>
165
+ <source media="(prefers-color-scheme: dark)" srcset="https://huggingface.co/datasets/huggingface/documentation-images/raw/main/huggingface_hub-dark.svg">
166
+ <source media="(prefers-color-scheme: light)" srcset="https://huggingface.co/datasets/huggingface/documentation-images/raw/main/huggingface_hub.svg">
167
+ <img alt="huggingface_hub library logo" src="https://huggingface.co/datasets/huggingface/documentation-images/raw/main/huggingface_hub.svg" width="352" height="59" style="max-width: 100%">
168
+ </picture>
169
+ <br/>
170
+ <br/>
171
+ </p>
172
+
173
+ <p align="center">
174
+ <i>The official Python client for the Hugging Face Hub.</i>
175
+ </p>
176
+
177
+ <p align="center">
178
+ <a href="https://huggingface.co/docs/huggingface_hub/en/index"><img alt="Documentation" src="https://img.shields.io/website/http/huggingface.co/docs/huggingface_hub/index.svg?down_color=red&down_message=offline&up_message=online&label=doc"></a>
179
+ <a href="https://github.com/huggingface/huggingface_hub/releases"><img alt="GitHub release" src="https://img.shields.io/github/release/huggingface/huggingface_hub.svg"></a>
180
+ <a href="https://github.com/huggingface/huggingface_hub"><img alt="PyPi version" src="https://img.shields.io/pypi/pyversions/huggingface_hub.svg"></a>
181
+ <a href="https://pypi.org/project/huggingface-hub"><img alt="PyPI - Downloads" src="https://img.shields.io/pypi/dm/huggingface_hub"></a>
182
+ <a href="https://codecov.io/gh/huggingface/huggingface_hub"><img alt="Code coverage" src="https://codecov.io/gh/huggingface/huggingface_hub/branch/main/graph/badge.svg?token=RXP95LE2XL"></a>
183
+ </p>
184
+
185
+ <h4 align="center">
186
+ <p>
187
+ <b>English</b> |
188
+ <a href="https://github.com/huggingface/huggingface_hub/blob/main/i18n/README_de.md">Deutsch</a> |
189
+ <a href="https://github.com/huggingface/huggingface_hub/blob/main/i18n/README_fr.md">Français</a> |
190
+ <a href="https://github.com/huggingface/huggingface_hub/blob/main/i18n/README_hi.md">हिंदी</a> |
191
+ <a href="https://github.com/huggingface/huggingface_hub/blob/main/i18n/README_ko.md">한국어</a> |
192
+ <a href="https://github.com/huggingface/huggingface_hub/blob/main/i18n/README_cn.md">中文 (简体)</a>
193
+ </p>
194
+ </h4>
195
+
196
+ ---
197
+
198
+ **Documentation**: <a href="https://hf.co/docs/huggingface_hub" target="_blank">https://hf.co/docs/huggingface_hub</a>
199
+
200
+ **Source Code**: <a href="https://github.com/huggingface/huggingface_hub" target="_blank">https://github.com/huggingface/huggingface_hub</a>
201
+
202
+ ---
203
+
204
+ ## Welcome to the huggingface_hub library
205
+
206
+ The `huggingface_hub` library allows you to interact with the [Hugging Face Hub](https://huggingface.co/), a platform democratizing open-source Machine Learning for creators and collaborators. Discover pre-trained models and datasets for your projects or play with the thousands of machine learning apps hosted on the Hub. You can also create and share your own models, datasets and demos with the community. The `huggingface_hub` library provides a simple way to do all these things with Python.
207
+
208
+ ## Key features
209
+
210
+ - [Download files](https://huggingface.co/docs/huggingface_hub/en/guides/download) from the Hub.
211
+ - [Upload files](https://huggingface.co/docs/huggingface_hub/en/guides/upload) to the Hub.
212
+ - [Manage your repositories](https://huggingface.co/docs/huggingface_hub/en/guides/repository).
213
+ - [Run Inference](https://huggingface.co/docs/huggingface_hub/en/guides/inference) on deployed models.
214
+ - [Search](https://huggingface.co/docs/huggingface_hub/en/guides/search) for models, datasets and Spaces.
215
+ - [Share Model Cards](https://huggingface.co/docs/huggingface_hub/en/guides/model-cards) to document your models.
216
+ - [Engage with the community](https://huggingface.co/docs/huggingface_hub/en/guides/community) through PRs and comments.
217
+
218
+ ## Installation
219
+
220
+ Install the `huggingface_hub` package with [pip](https://pypi.org/project/huggingface-hub/):
221
+
222
+ ```bash
223
+ pip install huggingface_hub
224
+ ```
225
+
226
+ If you prefer, you can also install it with [conda](https://huggingface.co/docs/huggingface_hub/en/installation#install-with-conda).
227
+
228
+ In order to keep the package minimal by default, `huggingface_hub` comes with optional dependencies useful for some use cases. For example, if you want to use the MCP module, run:
229
+
230
+ ```bash
231
+ pip install "huggingface_hub[mcp]"
232
+ ```
233
+
234
+ To learn more about installation and optional dependencies, check out the [installation guide](https://huggingface.co/docs/huggingface_hub/en/installation).
235
+
236
+ ## Quick start
237
+
238
+ ### Download files
239
+
240
+ Download a single file
241
+
242
+ ```py
243
+ from huggingface_hub import hf_hub_download
244
+
245
+ hf_hub_download(repo_id="tiiuae/falcon-7b-instruct", filename="config.json")
246
+ ```
247
+
248
+ Or an entire repository
249
+
250
+ ```py
251
+ from huggingface_hub import snapshot_download
252
+
253
+ snapshot_download("stabilityai/stable-diffusion-2-1")
254
+ ```
255
+
256
+ Files will be downloaded in a local cache folder. More details in [this guide](https://huggingface.co/docs/huggingface_hub/en/guides/manage-cache).
257
+
258
+ ### Login
259
+
260
+ The Hugging Face Hub uses tokens to authenticate applications (see [docs](https://huggingface.co/docs/hub/security-tokens)). To log in on your machine, run the following CLI command:
261
+
262
+ ```bash
263
+ hf auth login
264
+ # or using an environment variable
265
+ hf auth login --token $HUGGINGFACE_TOKEN
266
+ ```
267
+
268
+ ### Create a repository
269
+
270
+ ```py
271
+ from huggingface_hub import create_repo
272
+
273
+ create_repo(repo_id="super-cool-model")
274
+ ```
275
+
276
+ ### Upload files
277
+
278
+ Upload a single file
279
+
280
+ ```py
281
+ from huggingface_hub import upload_file
282
+
283
+ upload_file(
284
+ path_or_fileobj="/home/lysandre/dummy-test/README.md",
285
+ path_in_repo="README.md",
286
+ repo_id="lysandre/test-model",
287
+ )
288
+ ```
289
+
290
+ Or an entire folder
291
+
292
+ ```py
293
+ from huggingface_hub import upload_folder
294
+
295
+ upload_folder(
296
+ folder_path="/path/to/local/space",
297
+ repo_id="username/my-cool-space",
298
+ repo_type="space",
299
+ )
300
+ ```
301
+
302
+ For details, see the [upload guide](https://huggingface.co/docs/huggingface_hub/en/guides/upload).
303
+
304
+ ## Integrating to the Hub.
305
+
306
+ We're partnering with cool open source ML libraries to provide free model hosting and versioning. You can find the existing integrations [here](https://huggingface.co/docs/hub/libraries).
307
+
308
+ The advantages are:
309
+
310
+ - Free model or dataset hosting for libraries and their users.
311
+ - Built-in file versioning, even with very large files, thanks to a git-based approach.
312
+ - In-browser widgets to play with the uploaded models.
313
+ - Anyone can upload a new model for your library, they just need to add the corresponding tag for the model to be discoverable.
314
+ - Fast downloads! We use Cloudfront (a CDN) to geo-replicate downloads so they're blazing fast from anywhere on the globe.
315
+ - Usage stats and more features to come.
316
+
317
+ If you would like to integrate your library, feel free to open an issue to begin the discussion. We wrote a [step-by-step guide](https://huggingface.co/docs/hub/adding-a-library) with ❤️ showing how to do this integration.
318
+
319
+ ## Contributions (feature requests, bugs, etc.) are super welcome 💙💚💛💜🧡❤️
320
+
321
+ Everyone is welcome to contribute, and we value everybody's contribution. Code is not the only way to help the community.
322
+ Answering questions, helping others, reaching out, and improving the documentation are all immensely valuable to the community.
323
+ We wrote a [contribution guide](https://github.com/huggingface/huggingface_hub/blob/main/CONTRIBUTING.md) to summarize
324
+ how to get started contributing to this repository.
.venv/lib/python3.14/site-packages/huggingface_hub-1.14.0.dist-info/RECORD ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ../../../bin/hf,sha256=kEIWOQ37XiPE_6cjxEXEb_BD52I1tNf2BGws4nh3xLM,335
2
+ ../../../bin/huggingface-cli,sha256=fSCQnSbxN2VynH3UIYZYrZp6jw5kCW69nB-Eygx9Tnk,347
3
+ ../../../bin/tiny-agents,sha256=pg7MkROcreoC_wcKUF4Kka4lbGZzlU9kibHIJBLQu20,345
4
+ huggingface_hub-1.14.0.dist-info/INSTALLER,sha256=5hhM4Q4mYTT9z6QB6PGpUAW81PGNFrYrdXMj4oM_6ak,2
5
+ huggingface_hub-1.14.0.dist-info/METADATA,sha256=NGKGEcwoHsM57iwi1I0nz9hp3WP7M8nTgnQx3W_m0Ik,14025
6
+ huggingface_hub-1.14.0.dist-info/RECORD,,
7
+ huggingface_hub-1.14.0.dist-info/REQUESTED,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
8
+ huggingface_hub-1.14.0.dist-info/WHEEL,sha256=SmOxYU7pzNKBqASvQJ7DjX3XGUF92lrGhMb3R6_iiqI,91
9
+ huggingface_hub-1.14.0.dist-info/entry_points.txt,sha256=zP7F_bBSdircPQFysHQZ9F3Lcn5_dCSOEZxVlGCsG0w,212
10
+ huggingface_hub-1.14.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
11
+ huggingface_hub-1.14.0.dist-info/top_level.txt,sha256=8KzlQJAY4miUvjAssOAJodqKOw3harNzuiwGQ9qLSSk,16
12
+ huggingface_hub/__init__.py,sha256=oI1-0abcGlCY_WJVgCHNQxgObpR0dnVMCpYbyN_ZRPI,58431
13
+ huggingface_hub/_buckets.py,sha256=gqybQRY6R02kwyhMCH6VILQLo3Nd6m5XqhOQnfJMHU8,44899
14
+ huggingface_hub/_commit_api.py,sha256=cEdBcUeL6e2mb2KSTXV9A5bQPuQ48rF1nhJQ4DqC2-c,40992
15
+ huggingface_hub/_commit_scheduler.py,sha256=VG3zjb7xb66NrFMq_pB9dWIsoqSPoF6CntovV7vUiWg,14684
16
+ huggingface_hub/_dataset_viewer.py,sha256=LE9k4s0WJ0JNdriJ8L6qV_-xfbUVC1VEh7CBd0jOjLI,5675
17
+ huggingface_hub/_eval_results.py,sha256=Og7p7bCl2HQG6ZWlSDyir-CRTXjNWPu3S3q65cTGffs,8126
18
+ huggingface_hub/_hot_reload/__init__.py,sha256=nQEuyVZLAS_463Dff1JH5MorTpkYsBVFCyPhYWY3UyI,606
19
+ huggingface_hub/_hot_reload/client.py,sha256=Ax7qT4CUi5wrg3mDzqkAG8ZzDsCwdcIIS1FIXK0SqD8,4269
20
+ huggingface_hub/_hot_reload/sse_client.py,sha256=49kmsB2P9sxgbZ16ed6F4aLDs3rsHrNpbp6iKlvF90c,4948
21
+ huggingface_hub/_hot_reload/types.py,sha256=d3AOJCLUxT41Fhz4jUrFevm9bfpooe_3PQhpBdgNYSs,2725
22
+ huggingface_hub/_inference_endpoints.py,sha256=rwetck_MnbV-MrTLVN7cazIrxUNYi5x2nBkw1devnxY,17643
23
+ huggingface_hub/_jobs_api.py,sha256=XSmWO4MfXalDTfJ9wyrtV6wPW9I_2PiXGEuuMXylAhQ,14479
24
+ huggingface_hub/_local_folder.py,sha256=Fc_hikyMaZRHtXIDZj38hL7MNkq5nZZVxQF-dd-EkMo,18279
25
+ huggingface_hub/_login.py,sha256=Y4n1dlXYBr_6hrTv6X_Sx5iR2SR9_2NgFZ5t8grWmgk,19447
26
+ huggingface_hub/_oauth.py,sha256=S8evEd8JLJHz-JiSjGH4WiIi_A7oDox_xH7E9rN1Jic,18628
27
+ huggingface_hub/_snapshot_download.py,sha256=KqiEBSM8CvVXFAZHUS1lINN94WK05q9-exNWrzYO6kE,20614
28
+ huggingface_hub/_space_api.py,sha256=Sz_v5i8uPl8EUBAzrN47bphQL7ElRd_6B603b5krxDM,12177
29
+ huggingface_hub/_tensorboard_logger.py,sha256=GA-LhoG4Z-0Zq7w5B9vTIDJbWUkAhiG2d8a-ktlwGDY,8377
30
+ huggingface_hub/_upload_large_folder.py,sha256=BWA1zyOK9W_3YUerX9YWQa3wAYtHRe_ypgsO79JEkqo,31235
31
+ huggingface_hub/_webhooks_payload.py,sha256=B9zqljcm20DGeVxJKba570i6RP-cR9CEupMGBuNDHHA,3544
32
+ huggingface_hub/_webhooks_server.py,sha256=A172KmELV-pGjA819D4UgOHcuvHxCFdwOLPlu2kqHa4,15668
33
+ huggingface_hub/cli/__init__.py,sha256=A4zmzuHD2OHjQ5zmdfcnsj0JeCzHVPtpzh-wCjInugA,606
34
+ huggingface_hub/cli/_cli_utils.py,sha256=74tG3KKfBImta8qooCIfdCGwd3jJmqJGsNbyDy_ZHrc,45989
35
+ huggingface_hub/cli/_errors.py,sha256=iVc59bPwNUy7vmE_02vCsqJLbCDli_W7fneHQ-0qVfQ,4494
36
+ huggingface_hub/cli/_file_listing.py,sha256=20SGu3-PRJiGZu3wPEMzf8lKgf1eMTLxuF-q59KJeAE,7950
37
+ huggingface_hub/cli/_output.py,sha256=QmoT4Jngi7XM2zNmumwbZwKmNHzsIvXaZLIVEgooyN4,10635
38
+ huggingface_hub/cli/_skills.py,sha256=MtX4fH3ZhvR4YZ8jw9lzUzBSS7MtOY3T4SfGGYuFx3E,9811
39
+ huggingface_hub/cli/auth.py,sha256=N-u8T1pld-ZrY7X9PKtyFO0d8qcj8c0C9PVoCyTkCSo,5311
40
+ huggingface_hub/cli/buckets.py,sha256=kBhuaAFT6av9WNElmmzuPNeZV4mjfCtR6vrCaMjSL-Y,26668
41
+ huggingface_hub/cli/cache.py,sha256=FHl9lWIA2byWKdjn3eoa8IPoHhFDqmXJDHliAPJiZu4,26198
42
+ huggingface_hub/cli/collections.py,sha256=ttCZBumqb6p3GAqdW36Ux02h2h2ey7m9qCZPZJVelec,10775
43
+ huggingface_hub/cli/datasets.py,sha256=XK_wgrArGuF3AcRIpMJkqv3VwtGppyIiFBE1EwH3ne4,10816
44
+ huggingface_hub/cli/deprecated_cli.py,sha256=vAK_CR4NE9uzNJ0I59SyBnIln5nlypG4CPoohMoPMVg,1090
45
+ huggingface_hub/cli/discussions.py,sha256=mLUZrTW8h40TLt4pIa4qmBkJusVt970z15fcnwo3uSY,12654
46
+ huggingface_hub/cli/download.py,sha256=rXKUBcWayWltonlGQ-vmwFkR-ht036BBE6kHsdf184o,7953
47
+ huggingface_hub/cli/extensions.py,sha256=yQwyQdm-k9OVsezvG99COxl27-YsjoXuB3dR70-Lnqc,22030
48
+ huggingface_hub/cli/hf.py,sha256=71mAgK3VnjV4nRZkVPcSEthnUM-fZgCVvftFAb95kBE,4562
49
+ huggingface_hub/cli/inference_endpoints.py,sha256=0T2FMOo3kIrlJeTjtxgnE1-9BKlelzm0m2g_0jB62UA,14040
50
+ huggingface_hub/cli/jobs.py,sha256=VeshuYeSYos2r4AoIhlenqwvSpJ1PKVXY7A5nW9OyQo,38530
51
+ huggingface_hub/cli/lfs.py,sha256=v2yBjwBNOg0WAr6WtT5wWlTxut8OUVmcQ0ryG99OtCA,5876
52
+ huggingface_hub/cli/models.py,sha256=xdiy8l9ECuY0TSUOmfBdryJAy-hjV7tgwg3T7oAUWms,7747
53
+ huggingface_hub/cli/papers.py,sha256=3MR7hZbvH3BxldCbdbbsNh5o7q85wXSHDQhBn9AK_eY,5747
54
+ huggingface_hub/cli/repo_files.py,sha256=0VdacUlJH_fxCMrVutBt-rtOTjYk_2AhlbW5xlAeKck,2440
55
+ huggingface_hub/cli/repos.py,sha256=77WRElbQ2n4U7fkmAaZg-ryxO4tV5EtvrP0jfYpy-vY,16195
56
+ huggingface_hub/cli/skills.py,sha256=lxJEX3rwoypAzBiKOGMCY5_DlUXJzKtnISmOteoEki0,17909
57
+ huggingface_hub/cli/spaces.py,sha256=GBTcYS7mpNuGUwEIQ0jnNMBBzT-3tgay9YEHLpSHrNY,38098
58
+ huggingface_hub/cli/system.py,sha256=YJmwZcy-ffDKq2MVVXJt1W0_u2Yyua5koPe1Cvma4D4,1723
59
+ huggingface_hub/cli/upload.py,sha256=DXhdx0-Lapiu-HLJg8iP5Mr9xN50sQP2pSNajGH2Lok,10949
60
+ huggingface_hub/cli/upload_large_folder.py,sha256=WDhKyGeH5FqAkesfzCla-nRVQHl6spOVc9VD-iHtcd8,4575
61
+ huggingface_hub/cli/webhooks.py,sha256=uO2t1Kgc4DXy_Ut2nWK3uDlvFrKmqvLUSjb68kbafA0,9268
62
+ huggingface_hub/community.py,sha256=N5OE5Rbc2BMwpNXCFOPQAwn0otqQunsXdEnSE_0J2Xo,12325
63
+ huggingface_hub/constants.py,sha256=zu42eXu8shEy_JGffr7TM83XKWugv_Qca3AK_fn8eNU,11697
64
+ huggingface_hub/dataclasses.py,sha256=LoEOPRQTXgr9V7OPWvieADLuflW47l6THGaduntlqx4,26047
65
+ huggingface_hub/errors.py,sha256=OVdrv_WpfyOHinFP7_RhZncwafVE0CoFNj0tTsHEC8A,14739
66
+ huggingface_hub/fastai_utils.py,sha256=jwKeVXH_VP0zxnOKVNQCVCkF-stH70l-bExbbrYkBKQ,16583
67
+ huggingface_hub/file_download.py,sha256=9_lRpeb7s440EDss4VRlOd2hmwdgQ5oBvtpCpj4W0h8,81520
68
+ huggingface_hub/hf_api.py,sha256=LxRs7YOX4Onz28Kv7qJAwdrRUjMXb3W5Odcezxd8_dc,616527
69
+ huggingface_hub/hf_file_system.py,sha256=hFpVPJgLyfmkOayXEkiYb7yR_hqRS_JYU9o76J2tE6E,61802
70
+ huggingface_hub/hub_mixin.py,sha256=ljKAtm3qQsY-ARtxY-PCFx0koFXLI37JytMrzpqfnQo,36912
71
+ huggingface_hub/inference/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
72
+ huggingface_hub/inference/_client.py,sha256=yu99zMNIkXBEpq7WFdRZ8sy1ZICIXI-DwCDxjr17_as,157777
73
+ huggingface_hub/inference/_common.py,sha256=X-fBP8s6xuNTAuQsGVIethE8dNQaa-mK7Ds54qVVDz8,14999
74
+ huggingface_hub/inference/_generated/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
75
+ huggingface_hub/inference/_generated/_async_client.py,sha256=xST7gThlcdS50Ky6KiTo5RukiS5FaFECPi33uaHNnQ4,160988
76
+ huggingface_hub/inference/_generated/types/__init__.py,sha256=e625ENOmTUM4uUL1eAlsZOK7CovNoGBDH-TxSNQhaC4,6869
77
+ huggingface_hub/inference/_generated/types/audio_classification.py,sha256=DGWEvrRPX7xpXlbYY4LMojGcULdEZFTshhNR81rTNcQ,1567
78
+ huggingface_hub/inference/_generated/types/audio_to_audio.py,sha256=2Ep4WkePL7oJwcp5nRJqApwviumGHbft9HhXE9XLHj4,891
79
+ huggingface_hub/inference/_generated/types/automatic_speech_recognition.py,sha256=X0uCj9qwAcOmlKtV3vWH5cAgLGY0PuYIy2eSH7GE1qs,5439
80
+ huggingface_hub/inference/_generated/types/base.py,sha256=EPWp1qudjeEimEAyQ-ptdxpAACoRM8tW342-Ds7pP3I,6938
81
+ huggingface_hub/inference/_generated/types/chat_completion.py,sha256=3yc06ZI0NCZzsaeoxSMWyIHqGc8Vn_6aExncuXa1llE,11100
82
+ huggingface_hub/inference/_generated/types/depth_estimation.py,sha256=5Kn4xNGJg235Cofr1lzO59eRGCuLTZun8z5t6a2do8Q,910
83
+ huggingface_hub/inference/_generated/types/document_question_answering.py,sha256=aUZcrDGexjw8Fnm2rACm_3my77V15NnQHZxj-sa7bjo,3146
84
+ huggingface_hub/inference/_generated/types/feature_extraction.py,sha256=DiO47PqwhdNta_KPDBsXfCPIL6BWNhyCLymxORlqUA0,1509
85
+ huggingface_hub/inference/_generated/types/fill_mask.py,sha256=NhEskgoiMst7U_0cV_S-wryga4_2T5LgpbyQ23vHMp0,1680
86
+ huggingface_hub/inference/_generated/types/image_classification.py,sha256=xWr4OY65EcIIhd7MI610a7xlaAVHQeydi8Gc7tlaPTU,1579
87
+ huggingface_hub/inference/_generated/types/image_segmentation.py,sha256=lZKzbn_gkSi1sYGLb9ryeRr0aDAVrfYh0H5Iiv5JNsY,1935
88
+ huggingface_hub/inference/_generated/types/image_text_to_image.py,sha256=pRQgYKXJbIpEjRBxuIE6NRDtk_km9DSTQ2s2qxkYJRM,2585
89
+ huggingface_hub/inference/_generated/types/image_text_to_video.py,sha256=MVhP8D4eYjpT3hWYkkpuiME-hWY09o7C4rIn_wmnAmY,2450
90
+ huggingface_hub/inference/_generated/types/image_to_image.py,sha256=P9TeMFSnZI_pa6DyinzDPWL0PNm_INJwsqFUEzpwIcU,2262
91
+ huggingface_hub/inference/_generated/types/image_to_text.py,sha256=DCDS0Nqi-4AuATitvZm4EgrzYQuQEQ5lzBcaHuCmHnY,4740
92
+ huggingface_hub/inference/_generated/types/image_to_video.py,sha256=iMCjiFTUza9AW_MyJ1videhYgvMY9QSzQ59kNdTE9QY,2206
93
+ huggingface_hub/inference/_generated/types/object_detection.py,sha256=zNILlyZJh59rNNfqXM5x9uprDCPfEU-bvBtabeKSMzI,1965
94
+ huggingface_hub/inference/_generated/types/question_answering.py,sha256=lIRf8DjRiAsjzWUBEsGt354SB4bvejLokf-8luHw6_o,2845
95
+ huggingface_hub/inference/_generated/types/sentence_similarity.py,sha256=eNGiO4dR-Lft9yOLE0E97jfVyOT_ExDWZW5GeJGdatU,1027
96
+ huggingface_hub/inference/_generated/types/summarization.py,sha256=5Dpt6LRteT-hO9esa3Ty_-xSL5U-zTtVZzy3iKwZRng,1472
97
+ huggingface_hub/inference/_generated/types/table_question_answering.py,sha256=bwn616XJ-eLa5JKzn3m9rZZ3z63INGEUwRSOCWlEh-4,2269
98
+ huggingface_hub/inference/_generated/types/text2text_generation.py,sha256=fotSlPWtIlp_LG26AQvNRXxU-jCUfda5PWsxCkbFeKM,1591
99
+ huggingface_hub/inference/_generated/types/text_classification.py,sha256=uamBzz4YzWQ9mb6yuriykX8r47ZHf0K3eJpOLzB2qJw,1439
100
+ huggingface_hub/inference/_generated/types/text_generation.py,sha256=FRUljypvCSQ2RaiylhlHeLJfHNy8qZvuty6SCB_zcaA,5813
101
+ huggingface_hub/inference/_generated/types/text_to_audio.py,sha256=CiaTUSBdFtOs3gGTosqyQrjCwrN8mVJLRxDDWUtwZpQ,4677
102
+ huggingface_hub/inference/_generated/types/text_to_image.py,sha256=Ju25WIDHE-1n8jlZ_udG4rpAaPYpmghR7-cDgVfuf48,1869
103
+ huggingface_hub/inference/_generated/types/text_to_speech.py,sha256=WqDzWkhZsDcjapkoFrkyjF903SKs7aHhjypsYrn8oBw,4693
104
+ huggingface_hub/inference/_generated/types/text_to_video.py,sha256=KjcDbtChwrIxIdTH6vcesUaMcjL2xjPf2nWEnHazj4Y,1756
105
+ huggingface_hub/inference/_generated/types/token_classification.py,sha256=8KRBlqgFDszaUnPMvdRkZNQaZBtDfBhKZ6SggQldQMM,1894
106
+ huggingface_hub/inference/_generated/types/translation.py,sha256=-j2PC1-qfoxvGMaKesqf5jXelqj0BDlEpfvZ7dsIEJM,1742
107
+ huggingface_hub/inference/_generated/types/video_classification.py,sha256=vfaCqp8MuRMMtGvYCX0x4-AcwEd71kPHgSYJTNIOxLQ,1668
108
+ huggingface_hub/inference/_generated/types/visual_question_answering.py,sha256=sWIB-SuEpW21oU6O5qQF3tt5PavUFsFOa8Xg4upTtek,1654
109
+ huggingface_hub/inference/_generated/types/zero_shot_classification.py,sha256=gx-7WHad7xsNa89iSKSteKHjaHrKbIyzN_nSdJN3HCY,1697
110
+ huggingface_hub/inference/_generated/types/zero_shot_image_classification.py,sha256=JZTQvcbSGrOQlWcPuJvAMT8YziDpBRn1O2RQSw507uY,1449
111
+ huggingface_hub/inference/_generated/types/zero_shot_object_detection.py,sha256=sjdpVUN5zW9aYBymLVUs6i5HVk2qkUBO9ysEjHmsXVM,1605
112
+ huggingface_hub/inference/_mcp/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
113
+ huggingface_hub/inference/_mcp/_cli_hacks.py,sha256=KX9HZJPa1p8ngY3mtYGGlVUXfg4vYbbBRs-8HLToP04,3284
114
+ huggingface_hub/inference/_mcp/agent.py,sha256=ufIzMGHore5n252hV5GZPM0ouDXIl6tv5Jl_5gHXnbg,4250
115
+ huggingface_hub/inference/_mcp/cli.py,sha256=YsabbtVJzQ4vNN5km86BYcLV0SgkgE3hM2pIt6BpyAA,9801
116
+ huggingface_hub/inference/_mcp/constants.py,sha256=lLRgR6gRuqJ4u7jqngVUgVYeOzactgJpkTxx66t1piE,2463
117
+ huggingface_hub/inference/_mcp/mcp_client.py,sha256=dGp8PhN6aVw4bDnuSySFSiguHUiz-nzhgv89CVdO7pI,17243
118
+ huggingface_hub/inference/_mcp/types.py,sha256=yHNfPsM9MhD06oeKdkbmrBsW-3WhUeqA26fyfRfx_bk,929
119
+ huggingface_hub/inference/_mcp/utils.py,sha256=gxSB_rBjQ6VrkApKFsxk6-UzhijxkDVNuZrsjW5pL8k,4318
120
+ huggingface_hub/inference/_providers/__init__.py,sha256=sOsixpAIWprZYEuTmrNGHxMiZ75P7fj94rrIl99hVpI,10688
121
+ huggingface_hub/inference/_providers/_common.py,sha256=9Ykgz6nT5ok-p7PpvnxkuNURR-yr85uP9B1wHwbSJ58,13815
122
+ huggingface_hub/inference/_providers/black_forest_labs.py,sha256=HO3DaD97Hqy6b823b6VUPqiEqhHLKGeMVtJ4CMjGRPA,2817
123
+ huggingface_hub/inference/_providers/cerebras.py,sha256=QOJ-1U-os7uE7p6eUnn_P_APq-yQhx28be7c3Tq2EuA,210
124
+ huggingface_hub/inference/_providers/clarifai.py,sha256=1cEXQwhGk4DRKiPCQUa5y-L6okTo4781EImQC8yJVOw,380
125
+ huggingface_hub/inference/_providers/cohere.py,sha256=P9kbIuvQ2rXI1yNmgbw5VKFFTE0huLq2k-BCyDkDico,1226
126
+ huggingface_hub/inference/_providers/deepinfra.py,sha256=EaeeEMJCiKKuAdOjZwq0lWGonzBISr4ucErb6KpVCgE,1564
127
+ huggingface_hub/inference/_providers/fal_ai.py,sha256=Y2vo5Dl3e8EoehplwNshu8LbDT1e7V4ZyhAMqs0wXws,11705
128
+ huggingface_hub/inference/_providers/featherless_ai.py,sha256=C8OHdFpoFyA--pawLTekUmUSRq4sw_r0D-cSPekD4kE,1347
129
+ huggingface_hub/inference/_providers/fireworks_ai.py,sha256=go6XPum8u0-g768HJ_r0S4Gb445gZJxnuY_acktz-9c,1188
130
+ huggingface_hub/inference/_providers/groq.py,sha256=JTk2JV4ZOlaohho7zLAFQtk92kGVsPmLJ1hmzcwsqvQ,315
131
+ huggingface_hub/inference/_providers/hf_inference.py,sha256=d2vdCKSi5CtyQqYndHHvH7SY131-7YHUK3yE7653eIc,9467
132
+ huggingface_hub/inference/_providers/hyperbolic.py,sha256=Zk2rw2k-R0JxrfpcaHhAUuB5D6s40XK0tuvuKwglsUE,1950
133
+ huggingface_hub/inference/_providers/nebius.py,sha256=OTPXQIYxQSwZc8FzhGAHUaa6Mkp3h6LphSVoa_1gY7I,3513
134
+ huggingface_hub/inference/_providers/novita.py,sha256=ATEoSdPAPLMfx3JpBc0sOyLh4upJYP6xHBuo4YEDYvg,2470
135
+ huggingface_hub/inference/_providers/nscale.py,sha256=T1L2JLI9LqYT9_YEdW76YEtxfDNtdGkoFhH-p2LmSxg,1767
136
+ huggingface_hub/inference/_providers/nvidia.py,sha256=ocMuhycyIo8qiJf-E50oHGYuPvMcJC0BexSXBOYaOFg,251
137
+ huggingface_hub/inference/_providers/openai.py,sha256=wwjaaQ55xLmDsDnHpZk52xbuLoGZfWzJkFsE8AdFVaI,1054
138
+ huggingface_hub/inference/_providers/ovhcloud.py,sha256=tdmymlkbddMJKV7NRZ-tH2wymbLPFDTqUSXpWJUXyDQ,314
139
+ huggingface_hub/inference/_providers/publicai.py,sha256=1I2W6rORloB5QHSvky4njZO2XKLTwA-kPdNoauoT5rg,210
140
+ huggingface_hub/inference/_providers/replicate.py,sha256=hte0ZB2RtGFwpAuLrFp2Gbgsn3EOlccCnJkWTDTW__A,6027
141
+ huggingface_hub/inference/_providers/sambanova.py,sha256=2dkhLf5nKGZ8hOqF-oWYHIN-Wckcwq0cMVf8UYgmDy4,1999
142
+ huggingface_hub/inference/_providers/scaleway.py,sha256=MfIc7ZND1sPr__rOmNHZVg0VpECQo0bVyksgIm_32xQ,1174
143
+ huggingface_hub/inference/_providers/together.py,sha256=OviPGPE6JvJwxf94DKpmsyWlryLU-W9OrHBcr36JT1w,3384
144
+ huggingface_hub/inference/_providers/wavespeed.py,sha256=MGM7Y7r2nQiH_EH0t5UE1o1fT-o3sS7RaQlicS9WHsg,5028
145
+ huggingface_hub/inference/_providers/zai_org.py,sha256=gLJZOEmCPmUZvdM7VDn2nxm4ac3veHdoMgQNR63UeWE,4739
146
+ huggingface_hub/lfs.py,sha256=sVkRZMD8w-3vxaqSDSrsBCog6-lLyWP98u_xR1dIwMo,14123
147
+ huggingface_hub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
148
+ huggingface_hub/repocard.py,sha256=p2b9GiEzqrMJZW3O41nwiC6AseQhFNbqCbSIpoP_8VE,34829
149
+ huggingface_hub/repocard_data.py,sha256=lI-DEIbmjtlOMvkntqwXA9ED4icTeCvNbAOYc2P2ZMk,33879
150
+ huggingface_hub/serialization/__init__.py,sha256=jCiw_vVQYW52gwVfWiqgocf2Q19kGTQlRGVpf-4SLP8,963
151
+ huggingface_hub/serialization/_base.py,sha256=-bhRW21ZIqza5gE_kWkL8ax-EE6L6xabKkkK35wOX0Y,8176
152
+ huggingface_hub/serialization/_dduf.py,sha256=FmGRg5wkXI5sNCgZgCNJFmqb_nVrrQYOHf4Lqq0d64I,15385
153
+ huggingface_hub/serialization/_torch.py,sha256=BtaCP-G3oYRP6Pj9MhI-ygBsk3JbBPnVDW9huSOgtkk,46897
154
+ huggingface_hub/templates/datasetcard_template.md,sha256=W-EMqR6wndbrnZorkVv56URWPG49l7MATGeI015kTvs,5503
155
+ huggingface_hub/templates/modelcard_template.md,sha256=4AqArS3cqdtbit5Bo-DhjcnDFR-pza5hErLLTPM4Yuc,6870
156
+ huggingface_hub/utils/__init__.py,sha256=RW7Cq8t_YPvVk6DeFaV9cPEKXB5g31JbPQCFjYIoaDc,4050
157
+ huggingface_hub/utils/_auth.py,sha256=DeSmNYy8UiAgrETNsHE5VZtzc-3-IKRqtFjxMxXBRdE,8227
158
+ huggingface_hub/utils/_cache_assets.py,sha256=nnzHRtQAR50dQeIK6qKddsmjjTW9v9HZ7b9bq7PJqss,5691
159
+ huggingface_hub/utils/_cache_manager.py,sha256=K3MfxR2we_uU2j5xLD1bMru6zihzDDlyZZm6mGUOxtM,33023
160
+ huggingface_hub/utils/_chunk_utils.py,sha256=pTjy8Z-KLU4W_6D3OUh3E8lCodWDCd6aJwjNDU0C5U8,2121
161
+ huggingface_hub/utils/_datetime.py,sha256=tbNyI0Dkh27oScPUtLIT_8apqIIkXZYbigjOn9S3aMw,2755
162
+ huggingface_hub/utils/_deprecation.py,sha256=n4kNHbGipquSObJ-gxodcfd6lqoe_8s-VIsTuo3Oruk,4865
163
+ huggingface_hub/utils/_detect_agent.py,sha256=tSfJZsIzrPZWYHV77UJY07KLiNkaoxAXaOmF9MXWkdI,3029
164
+ huggingface_hub/utils/_dotenv.py,sha256=QfL6aWFp5NffhJEYFd6FtKHqly5EBpY8Cb-o9t8Ul04,1980
165
+ huggingface_hub/utils/_experimental.py,sha256=q9vUvc1JybVFRQ0GRREG-BLouZEJC_40MxcwbAlOud0,2464
166
+ huggingface_hub/utils/_fixes.py,sha256=jTK1VLmc0ZC9ROSXjKoL5F6kOZFByzirRWIdXxhBfWU,4124
167
+ huggingface_hub/utils/_git_credential.py,sha256=1BhjvIScCOAToDORLOKrR3Szs-m0E5AYHPmC0SD7Nrs,4548
168
+ huggingface_hub/utils/_headers.py,sha256=fHd8JflV1sX02mdQnwc7k1zll3ZXceu-HmmGz0UOEFo,8090
169
+ huggingface_hub/utils/_hf_uris.py,sha256=i_4QCMtffC7XKn8X7N38OLiUyW8pdWhQmOlIECBUVP0,16464
170
+ huggingface_hub/utils/_http.py,sha256=SCTMeZjKM9agYGfVfRjtT_eXKMNK9u1-RCSzBif_ojs,42652
171
+ huggingface_hub/utils/_lfs.py,sha256=xMU-ROgNAUpDkzbH6yRZsE-eVYUYTNMGrgVb_QdTv-k,3942
172
+ huggingface_hub/utils/_pagination.py,sha256=buIKERSyWOJqVSY9nv8hImDmNErD-PPOnXosXm2O87Y,1832
173
+ huggingface_hub/utils/_parsing.py,sha256=Z07oEU-19L_0Mav7_LRZLz0z0NuS4bLDaAY1RPdeqf0,2982
174
+ huggingface_hub/utils/_paths.py,sha256=ZceF9JnRzNrvrMSvPbgriuRQ3_LpLa3EE8W21bvBxaU,5265
175
+ huggingface_hub/utils/_runtime.py,sha256=GVJ_Dt6y_48TZWl0ui9yPdW8M7rqGblRGyUqKaXxkCo,13493
176
+ huggingface_hub/utils/_safetensors.py,sha256=qFE7OA-vjU8X0zBBtOeNRzLu147KF-PdFoaxkYfWB4M,4426
177
+ huggingface_hub/utils/_subprocess.py,sha256=tFVBBNot_HLVqQ79y873TGb12C4PUTMwub-GhzDTemE,4542
178
+ huggingface_hub/utils/_telemetry.py,sha256=Tpa3YmOLhK_MEnL2i2fRLQy8aqAEZWbj6MKC18Hs8BA,4824
179
+ huggingface_hub/utils/_terminal.py,sha256=7jdWt1xvmx7nOlYG416833EjJyukvDzksTVfRFloAtg,3532
180
+ huggingface_hub/utils/_typing.py,sha256=1LeE785YedppXSR9a1fQDu5rhTxX5v9kBYBaHrP65rA,3542
181
+ huggingface_hub/utils/_validators.py,sha256=tuC1U4yxB-4XKIajZMnKZUSKvZ6qPpydY8LItfa0g40,8419
182
+ huggingface_hub/utils/_verification.py,sha256=ZSilnolSkYmHB3vmBlWuuVr53xz59P5uGnxC3H-Hjc0,5434
183
+ huggingface_hub/utils/_xet.py,sha256=v2VI_iVGIp4-mGT44L9nY9R4fdBeVepjUBzsVIeQS1g,9834
184
+ huggingface_hub/utils/_xet_progress_reporting.py,sha256=IlrrHAgHYKsep4o9fhsCtdMFhKxp5rKoHs785x5q21k,6826
185
+ huggingface_hub/utils/endpoint_helpers.py,sha256=9VtIAlxQ5H_4y30sjCAgbu7XCqAtNLC7aRYxaNn0hLI,2366
186
+ huggingface_hub/utils/insecure_hashlib.py,sha256=z3dVUFvdBZ8kQI_8Vzvvlr3ims-EBiY-SYPdnzIKOkw,1008
187
+ huggingface_hub/utils/logging.py,sha256=WaXk5gRa8Ml_LUIH34QCr8suYj8i_8Wot4IDnYJyWyM,4870
188
+ huggingface_hub/utils/sha.py,sha256=h8wxheZpcv671RhtiIFcFmQTDSrogN3kwQx3ZaNEUHg,2121
189
+ huggingface_hub/utils/tqdm.py,sha256=KbHhkB5g76JgGOGCcjAQOaR-oc-wzd4Sr3wqd41ny7k,13350
.venv/lib/python3.14/site-packages/huggingface_hub-1.14.0.dist-info/REQUESTED ADDED
File without changes
.venv/lib/python3.14/site-packages/huggingface_hub-1.14.0.dist-info/WHEEL ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (79.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
.venv/lib/python3.14/site-packages/huggingface_hub-1.14.0.dist-info/entry_points.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ [console_scripts]
2
+ hf = huggingface_hub.cli.hf:main
3
+ huggingface-cli = huggingface_hub.cli.deprecated_cli:main
4
+ tiny-agents = huggingface_hub.inference._mcp.cli:app
5
+
6
+ [fsspec.specs]
7
+ hf = huggingface_hub.HfFileSystem
.venv/lib/python3.14/site-packages/huggingface_hub-1.14.0.dist-info/licenses/LICENSE ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
.venv/lib/python3.14/site-packages/huggingface_hub-1.14.0.dist-info/top_level.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ huggingface_hub
.venv/lib/python3.14/site-packages/huggingface_hub/__init__.py CHANGED
@@ -46,7 +46,7 @@ import sys
46
  from typing import TYPE_CHECKING
47
 
48
 
49
- __version__ = "1.9.2"
50
 
51
  # Alphabetical order of definitions is ensured in tests
52
  # WARNING: any comment added in this dictionary definition will be lost when
@@ -105,6 +105,7 @@ _SUBMOD_ATTRS = {
105
  "_space_api": [
106
  "SpaceHardware",
107
  "SpaceRuntime",
 
108
  "SpaceStage",
109
  "SpaceStorage",
110
  "SpaceVariable",
@@ -182,12 +183,14 @@ _SUBMOD_ATTRS = {
182
  "GitRefInfo",
183
  "GitRefs",
184
  "HfApi",
 
185
  "ModelInfo",
186
  "Organization",
187
  "RepoFile",
188
  "RepoFolder",
189
  "RepoUrl",
190
  "SpaceInfo",
 
191
  "User",
192
  "UserLikes",
193
  "WebhookInfo",
@@ -203,6 +206,7 @@ _SUBMOD_ATTRS = {
203
  "cancel_job",
204
  "change_discussion_status",
205
  "comment_discussion",
 
206
  "create_branch",
207
  "create_bucket",
208
  "create_collection",
@@ -242,6 +246,7 @@ _SUBMOD_ATTRS = {
242
  "enable_webhook",
243
  "fetch_job_logs",
244
  "fetch_job_metrics",
 
245
  "file_exists",
246
  "get_bucket_file_metadata",
247
  "get_bucket_paths_info",
@@ -258,12 +263,14 @@ _SUBMOD_ATTRS = {
258
  "get_repo_discussions",
259
  "get_safetensors_metadata",
260
  "get_space_runtime",
 
261
  "get_space_variables",
262
  "get_user_overview",
263
  "get_webhook",
264
  "grant_access",
265
  "inspect_job",
266
  "inspect_scheduled_job",
 
267
  "list_accepted_access_requests",
268
  "list_bucket_tree",
269
  "list_buckets",
@@ -289,6 +296,7 @@ _SUBMOD_ATTRS = {
289
  "list_repo_refs",
290
  "list_repo_tree",
291
  "list_spaces",
 
292
  "list_user_followers",
293
  "list_user_following",
294
  "list_webhooks",
@@ -319,6 +327,7 @@ _SUBMOD_ATTRS = {
319
  "run_job",
320
  "run_uv_job",
321
  "scale_to_zero_inference_endpoint",
 
322
  "set_space_sleep_time",
323
  "set_space_volumes",
324
  "space_info",
@@ -576,6 +585,7 @@ _SUBMOD_ATTRS = {
576
  "CorruptedCacheException",
577
  "DeleteCacheStrategy",
578
  "HFCacheInfo",
 
579
  "cached_assets_path",
580
  "close_session",
581
  "dump_environment_info",
@@ -584,6 +594,7 @@ _SUBMOD_ATTRS = {
584
  "get_token",
585
  "hf_raise_for_status",
586
  "logging",
 
587
  "scan_cache_dir",
588
  "set_async_client_factory",
589
  "set_client_factory",
@@ -714,6 +725,7 @@ __all__ = [
714
  "HfFileSystemFile",
715
  "HfFileSystemResolvedPath",
716
  "HfFileSystemStreamFile",
 
717
  "ImageClassificationInput",
718
  "ImageClassificationOutputElement",
719
  "ImageClassificationOutputTransform",
@@ -756,6 +768,7 @@ __all__ = [
756
  "JobOwner",
757
  "JobStage",
758
  "JobStatus",
 
759
  "MCPClient",
760
  "ModelCard",
761
  "ModelCardData",
@@ -790,6 +803,8 @@ __all__ = [
790
  "SpaceHardware",
791
  "SpaceInfo",
792
  "SpaceRuntime",
 
 
793
  "SpaceStage",
794
  "SpaceStorage",
795
  "SpaceVariable",
@@ -903,6 +918,7 @@ __all__ = [
903
  "check_cli_update",
904
  "close_session",
905
  "comment_discussion",
 
906
  "create_branch",
907
  "create_bucket",
908
  "create_collection",
@@ -946,6 +962,7 @@ __all__ = [
946
  "export_folder_as_dduf",
947
  "fetch_job_logs",
948
  "fetch_job_metrics",
 
949
  "file_exists",
950
  "from_pretrained_fastai",
951
  "get_async_session",
@@ -966,6 +983,7 @@ __all__ = [
966
  "get_safetensors_metadata",
967
  "get_session",
968
  "get_space_runtime",
 
969
  "get_space_variables",
970
  "get_token",
971
  "get_torch_storage_id",
@@ -981,6 +999,7 @@ __all__ = [
981
  "inspect_scheduled_job",
982
  "interpreter_login",
983
  "is_offline_mode",
 
984
  "list_accepted_access_requests",
985
  "list_bucket_tree",
986
  "list_buckets",
@@ -1006,6 +1025,7 @@ __all__ = [
1006
  "list_repo_refs",
1007
  "list_repo_tree",
1008
  "list_spaces",
 
1009
  "list_user_followers",
1010
  "list_user_following",
1011
  "list_webhooks",
@@ -1025,6 +1045,7 @@ __all__ = [
1025
  "notebook_login",
1026
  "paper_info",
1027
  "parse_eval_result_entries",
 
1028
  "parse_huggingface_oauth",
1029
  "parse_local_safetensors_file_metadata",
1030
  "parse_safetensors_file_metadata",
@@ -1053,6 +1074,7 @@ __all__ = [
1053
  "save_torch_state_dict",
1054
  "scale_to_zero_inference_endpoint",
1055
  "scan_cache_dir",
 
1056
  "set_async_client_factory",
1057
  "set_client_factory",
1058
  "set_space_sleep_time",
@@ -1231,6 +1253,7 @@ if TYPE_CHECKING: # pragma: no cover
1231
  from ._space_api import (
1232
  SpaceHardware, # noqa: F401
1233
  SpaceRuntime, # noqa: F401
 
1234
  SpaceStage, # noqa: F401
1235
  SpaceStorage, # noqa: F401
1236
  SpaceVariable, # noqa: F401
@@ -1306,12 +1329,14 @@ if TYPE_CHECKING: # pragma: no cover
1306
  GitRefInfo, # noqa: F401
1307
  GitRefs, # noqa: F401
1308
  HfApi, # noqa: F401
 
1309
  ModelInfo, # noqa: F401
1310
  Organization, # noqa: F401
1311
  RepoFile, # noqa: F401
1312
  RepoFolder, # noqa: F401
1313
  RepoUrl, # noqa: F401
1314
  SpaceInfo, # noqa: F401
 
1315
  User, # noqa: F401
1316
  UserLikes, # noqa: F401
1317
  WebhookInfo, # noqa: F401
@@ -1327,6 +1352,7 @@ if TYPE_CHECKING: # pragma: no cover
1327
  cancel_job, # noqa: F401
1328
  change_discussion_status, # noqa: F401
1329
  comment_discussion, # noqa: F401
 
1330
  create_branch, # noqa: F401
1331
  create_bucket, # noqa: F401
1332
  create_collection, # noqa: F401
@@ -1366,6 +1392,7 @@ if TYPE_CHECKING: # pragma: no cover
1366
  enable_webhook, # noqa: F401
1367
  fetch_job_logs, # noqa: F401
1368
  fetch_job_metrics, # noqa: F401
 
1369
  file_exists, # noqa: F401
1370
  get_bucket_file_metadata, # noqa: F401
1371
  get_bucket_paths_info, # noqa: F401
@@ -1382,12 +1409,14 @@ if TYPE_CHECKING: # pragma: no cover
1382
  get_repo_discussions, # noqa: F401
1383
  get_safetensors_metadata, # noqa: F401
1384
  get_space_runtime, # noqa: F401
 
1385
  get_space_variables, # noqa: F401
1386
  get_user_overview, # noqa: F401
1387
  get_webhook, # noqa: F401
1388
  grant_access, # noqa: F401
1389
  inspect_job, # noqa: F401
1390
  inspect_scheduled_job, # noqa: F401
 
1391
  list_accepted_access_requests, # noqa: F401
1392
  list_bucket_tree, # noqa: F401
1393
  list_buckets, # noqa: F401
@@ -1413,6 +1442,7 @@ if TYPE_CHECKING: # pragma: no cover
1413
  list_repo_refs, # noqa: F401
1414
  list_repo_tree, # noqa: F401
1415
  list_spaces, # noqa: F401
 
1416
  list_user_followers, # noqa: F401
1417
  list_user_following, # noqa: F401
1418
  list_webhooks, # noqa: F401
@@ -1443,6 +1473,7 @@ if TYPE_CHECKING: # pragma: no cover
1443
  run_job, # noqa: F401
1444
  run_uv_job, # noqa: F401
1445
  scale_to_zero_inference_endpoint, # noqa: F401
 
1446
  set_space_sleep_time, # noqa: F401
1447
  set_space_volumes, # noqa: F401
1448
  space_info, # noqa: F401
@@ -1694,6 +1725,7 @@ if TYPE_CHECKING: # pragma: no cover
1694
  CorruptedCacheException, # noqa: F401
1695
  DeleteCacheStrategy, # noqa: F401
1696
  HFCacheInfo, # noqa: F401
 
1697
  cached_assets_path, # noqa: F401
1698
  close_session, # noqa: F401
1699
  dump_environment_info, # noqa: F401
@@ -1702,6 +1734,7 @@ if TYPE_CHECKING: # pragma: no cover
1702
  get_token, # noqa: F401
1703
  hf_raise_for_status, # noqa: F401
1704
  logging, # noqa: F401
 
1705
  scan_cache_dir, # noqa: F401
1706
  set_async_client_factory, # noqa: F401
1707
  set_client_factory, # noqa: F401
 
46
  from typing import TYPE_CHECKING
47
 
48
 
49
+ __version__ = "1.14.0"
50
 
51
  # Alphabetical order of definitions is ensured in tests
52
  # WARNING: any comment added in this dictionary definition will be lost when
 
105
  "_space_api": [
106
  "SpaceHardware",
107
  "SpaceRuntime",
108
+ "SpaceSecret",
109
  "SpaceStage",
110
  "SpaceStorage",
111
  "SpaceVariable",
 
183
  "GitRefInfo",
184
  "GitRefs",
185
  "HfApi",
186
+ "KernelInfo",
187
  "ModelInfo",
188
  "Organization",
189
  "RepoFile",
190
  "RepoFolder",
191
  "RepoUrl",
192
  "SpaceInfo",
193
+ "SpaceSearchResult",
194
  "User",
195
  "UserLikes",
196
  "WebhookInfo",
 
206
  "cancel_job",
207
  "change_discussion_status",
208
  "comment_discussion",
209
+ "copy_files",
210
  "create_branch",
211
  "create_bucket",
212
  "create_collection",
 
246
  "enable_webhook",
247
  "fetch_job_logs",
248
  "fetch_job_metrics",
249
+ "fetch_space_logs",
250
  "file_exists",
251
  "get_bucket_file_metadata",
252
  "get_bucket_paths_info",
 
263
  "get_repo_discussions",
264
  "get_safetensors_metadata",
265
  "get_space_runtime",
266
+ "get_space_secrets",
267
  "get_space_variables",
268
  "get_user_overview",
269
  "get_webhook",
270
  "grant_access",
271
  "inspect_job",
272
  "inspect_scheduled_job",
273
+ "kernel_info",
274
  "list_accepted_access_requests",
275
  "list_bucket_tree",
276
  "list_buckets",
 
296
  "list_repo_refs",
297
  "list_repo_tree",
298
  "list_spaces",
299
+ "list_spaces_hardware",
300
  "list_user_followers",
301
  "list_user_following",
302
  "list_webhooks",
 
327
  "run_job",
328
  "run_uv_job",
329
  "scale_to_zero_inference_endpoint",
330
+ "search_spaces",
331
  "set_space_sleep_time",
332
  "set_space_volumes",
333
  "space_info",
 
585
  "CorruptedCacheException",
586
  "DeleteCacheStrategy",
587
  "HFCacheInfo",
588
+ "HfUri",
589
  "cached_assets_path",
590
  "close_session",
591
  "dump_environment_info",
 
594
  "get_token",
595
  "hf_raise_for_status",
596
  "logging",
597
+ "parse_hf_uri",
598
  "scan_cache_dir",
599
  "set_async_client_factory",
600
  "set_client_factory",
 
725
  "HfFileSystemFile",
726
  "HfFileSystemResolvedPath",
727
  "HfFileSystemStreamFile",
728
+ "HfUri",
729
  "ImageClassificationInput",
730
  "ImageClassificationOutputElement",
731
  "ImageClassificationOutputTransform",
 
768
  "JobOwner",
769
  "JobStage",
770
  "JobStatus",
771
+ "KernelInfo",
772
  "MCPClient",
773
  "ModelCard",
774
  "ModelCardData",
 
803
  "SpaceHardware",
804
  "SpaceInfo",
805
  "SpaceRuntime",
806
+ "SpaceSearchResult",
807
+ "SpaceSecret",
808
  "SpaceStage",
809
  "SpaceStorage",
810
  "SpaceVariable",
 
918
  "check_cli_update",
919
  "close_session",
920
  "comment_discussion",
921
+ "copy_files",
922
  "create_branch",
923
  "create_bucket",
924
  "create_collection",
 
962
  "export_folder_as_dduf",
963
  "fetch_job_logs",
964
  "fetch_job_metrics",
965
+ "fetch_space_logs",
966
  "file_exists",
967
  "from_pretrained_fastai",
968
  "get_async_session",
 
983
  "get_safetensors_metadata",
984
  "get_session",
985
  "get_space_runtime",
986
+ "get_space_secrets",
987
  "get_space_variables",
988
  "get_token",
989
  "get_torch_storage_id",
 
999
  "inspect_scheduled_job",
1000
  "interpreter_login",
1001
  "is_offline_mode",
1002
+ "kernel_info",
1003
  "list_accepted_access_requests",
1004
  "list_bucket_tree",
1005
  "list_buckets",
 
1025
  "list_repo_refs",
1026
  "list_repo_tree",
1027
  "list_spaces",
1028
+ "list_spaces_hardware",
1029
  "list_user_followers",
1030
  "list_user_following",
1031
  "list_webhooks",
 
1045
  "notebook_login",
1046
  "paper_info",
1047
  "parse_eval_result_entries",
1048
+ "parse_hf_uri",
1049
  "parse_huggingface_oauth",
1050
  "parse_local_safetensors_file_metadata",
1051
  "parse_safetensors_file_metadata",
 
1074
  "save_torch_state_dict",
1075
  "scale_to_zero_inference_endpoint",
1076
  "scan_cache_dir",
1077
+ "search_spaces",
1078
  "set_async_client_factory",
1079
  "set_client_factory",
1080
  "set_space_sleep_time",
 
1253
  from ._space_api import (
1254
  SpaceHardware, # noqa: F401
1255
  SpaceRuntime, # noqa: F401
1256
+ SpaceSecret, # noqa: F401
1257
  SpaceStage, # noqa: F401
1258
  SpaceStorage, # noqa: F401
1259
  SpaceVariable, # noqa: F401
 
1329
  GitRefInfo, # noqa: F401
1330
  GitRefs, # noqa: F401
1331
  HfApi, # noqa: F401
1332
+ KernelInfo, # noqa: F401
1333
  ModelInfo, # noqa: F401
1334
  Organization, # noqa: F401
1335
  RepoFile, # noqa: F401
1336
  RepoFolder, # noqa: F401
1337
  RepoUrl, # noqa: F401
1338
  SpaceInfo, # noqa: F401
1339
+ SpaceSearchResult, # noqa: F401
1340
  User, # noqa: F401
1341
  UserLikes, # noqa: F401
1342
  WebhookInfo, # noqa: F401
 
1352
  cancel_job, # noqa: F401
1353
  change_discussion_status, # noqa: F401
1354
  comment_discussion, # noqa: F401
1355
+ copy_files, # noqa: F401
1356
  create_branch, # noqa: F401
1357
  create_bucket, # noqa: F401
1358
  create_collection, # noqa: F401
 
1392
  enable_webhook, # noqa: F401
1393
  fetch_job_logs, # noqa: F401
1394
  fetch_job_metrics, # noqa: F401
1395
+ fetch_space_logs, # noqa: F401
1396
  file_exists, # noqa: F401
1397
  get_bucket_file_metadata, # noqa: F401
1398
  get_bucket_paths_info, # noqa: F401
 
1409
  get_repo_discussions, # noqa: F401
1410
  get_safetensors_metadata, # noqa: F401
1411
  get_space_runtime, # noqa: F401
1412
+ get_space_secrets, # noqa: F401
1413
  get_space_variables, # noqa: F401
1414
  get_user_overview, # noqa: F401
1415
  get_webhook, # noqa: F401
1416
  grant_access, # noqa: F401
1417
  inspect_job, # noqa: F401
1418
  inspect_scheduled_job, # noqa: F401
1419
+ kernel_info, # noqa: F401
1420
  list_accepted_access_requests, # noqa: F401
1421
  list_bucket_tree, # noqa: F401
1422
  list_buckets, # noqa: F401
 
1442
  list_repo_refs, # noqa: F401
1443
  list_repo_tree, # noqa: F401
1444
  list_spaces, # noqa: F401
1445
+ list_spaces_hardware, # noqa: F401
1446
  list_user_followers, # noqa: F401
1447
  list_user_following, # noqa: F401
1448
  list_webhooks, # noqa: F401
 
1473
  run_job, # noqa: F401
1474
  run_uv_job, # noqa: F401
1475
  scale_to_zero_inference_endpoint, # noqa: F401
1476
+ search_spaces, # noqa: F401
1477
  set_space_sleep_time, # noqa: F401
1478
  set_space_volumes, # noqa: F401
1479
  space_info, # noqa: F401
 
1725
  CorruptedCacheException, # noqa: F401
1726
  DeleteCacheStrategy, # noqa: F401
1727
  HFCacheInfo, # noqa: F401
1728
+ HfUri, # noqa: F401
1729
  cached_assets_path, # noqa: F401
1730
  close_session, # noqa: F401
1731
  dump_environment_info, # noqa: F401
 
1734
  get_token, # noqa: F401
1735
  hf_raise_for_status, # noqa: F401
1736
  logging, # noqa: F401
1737
+ parse_hf_uri, # noqa: F401
1738
  scan_cache_dir, # noqa: F401
1739
  set_async_client_factory, # noqa: F401
1740
  set_client_factory, # noqa: F401
.venv/lib/python3.14/site-packages/huggingface_hub/_buckets.py CHANGED
@@ -20,6 +20,7 @@ import fnmatch
20
  import json
21
  import mimetypes
22
  import os
 
23
  import sys
24
  import time
25
  from collections.abc import Iterator
@@ -119,6 +120,21 @@ class _BucketAddFile:
119
  )
120
 
121
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
  @dataclass
123
  class _BucketDeleteFile:
124
  path: str
@@ -377,6 +393,21 @@ def _parse_filter_file(filter_file: str) -> list[tuple[str, str]]:
377
  # =============================================================================
378
 
379
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
380
  def _list_local_files(local_path: str) -> Iterator[tuple[str, int, float]]:
381
  """List all files in a local directory.
382
 
@@ -390,12 +421,13 @@ def _list_local_files(local_path: str) -> Iterator[tuple[str, int, float]]:
390
  for root, _, files in os.walk(local_path):
391
  for filename in files:
392
  full_path = os.path.join(root, filename)
 
 
 
393
  rel_path = os.path.relpath(full_path, local_path)
394
  # Normalize to forward slashes for consistency
395
  rel_path = rel_path.replace(os.sep, "/")
396
- size = os.path.getsize(full_path)
397
- mtime_ms = os.path.getmtime(full_path) * 1000
398
- yield rel_path, size, mtime_ms
399
 
400
 
401
  def _list_remote_files(api: "HfApi", bucket_id: str, prefix: str) -> Iterator[tuple[str, int, float, Any]]:
@@ -664,11 +696,26 @@ def _compute_sync_plan(
664
 
665
  local_files = {}
666
  if os.path.isdir(local_path):
667
- for rel_path, size, mtime_ms in _list_local_files(local_path):
668
- if filter_matcher.matches(rel_path):
669
- local_files[rel_path] = (size, mtime_ms)
670
- if status:
671
- status.update(f"Scanning local directory ({len(local_files)} files)")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
672
  if status:
673
  status.done(f"Scanning local directory ({len(local_files)} files)")
674
 
 
20
  import json
21
  import mimetypes
22
  import os
23
+ import stat
24
  import sys
25
  import time
26
  from collections.abc import Iterator
 
120
  )
121
 
122
 
123
+ @dataclass
124
+ class _BucketCopyFile:
125
+ destination: str
126
+ xet_hash: str
127
+ source_repo_type: str # "model", "dataset", "space", "bucket"
128
+ source_repo_id: str
129
+ size: int | None = field(default=None)
130
+ mtime: int = field(init=False)
131
+ content_type: str | None = field(init=False)
132
+
133
+ def __post_init__(self) -> None:
134
+ self.content_type = mimetypes.guess_type(self.destination)[0]
135
+ self.mtime = int(time.time() * 1000)
136
+
137
+
138
  @dataclass
139
  class _BucketDeleteFile:
140
  path: str
 
393
  # =============================================================================
394
 
395
 
396
+ def _stat_local(path: str) -> tuple[int, float] | None:
397
+ """Stat a local file and return (size, mtime_ms).
398
+
399
+ Returns None if the path is missing or is a directory. Uses a single
400
+ ``os.stat`` call so callers don't pay for multiple syscalls per file.
401
+ """
402
+ try:
403
+ st = os.stat(path)
404
+ except OSError:
405
+ return None
406
+ if stat.S_ISDIR(st.st_mode):
407
+ return None
408
+ return st.st_size, st.st_mtime * 1000
409
+
410
+
411
  def _list_local_files(local_path: str) -> Iterator[tuple[str, int, float]]:
412
  """List all files in a local directory.
413
 
 
421
  for root, _, files in os.walk(local_path):
422
  for filename in files:
423
  full_path = os.path.join(root, filename)
424
+ stat_info = _stat_local(full_path)
425
+ if stat_info is None:
426
+ continue
427
  rel_path = os.path.relpath(full_path, local_path)
428
  # Normalize to forward slashes for consistency
429
  rel_path = rel_path.replace(os.sep, "/")
430
+ yield rel_path, stat_info[0], stat_info[1]
 
 
431
 
432
 
433
  def _list_remote_files(api: "HfApi", bucket_id: str, prefix: str) -> Iterator[tuple[str, int, float, Any]]:
 
696
 
697
  local_files = {}
698
  if os.path.isdir(local_path):
699
+ if delete:
700
+ # Full walk needed to discover local-only files for deletion.
701
+ for rel_path, size, mtime_ms in _list_local_files(local_path):
702
+ if filter_matcher.matches(rel_path):
703
+ local_files[rel_path] = (size, mtime_ms)
704
+ if status:
705
+ status.update(f"Scanning local directory ({len(local_files)} files)")
706
+ else:
707
+ # Without --delete, the plan only depends on paths that exist
708
+ # remotely. Stat just those instead of walking the whole tree,
709
+ # which can take minutes when dest sits in a large directory
710
+ # like ~/.cache/huggingface/.
711
+ for rel_path in remote_files:
712
+ local_file = os.path.join(local_path, rel_path)
713
+ stat_info = _stat_local(local_file)
714
+ if stat_info is None:
715
+ continue
716
+ local_files[rel_path] = stat_info
717
+ if status:
718
+ status.update(f"Scanning local directory ({len(local_files)} files)")
719
  if status:
720
  status.done(f"Scanning local directory ({len(local_files)} files)")
721
 
.venv/lib/python3.14/site-packages/huggingface_hub/_hot_reload/client.py CHANGED
@@ -14,6 +14,7 @@
14
 
15
 
16
  import json
 
17
  from collections import deque
18
  from collections.abc import Iterator
19
  from typing import Literal, TypedDict
@@ -27,6 +28,12 @@ from .types import ApiGetReloadEventSourceData, ApiGetReloadRequest
27
 
28
 
29
  HOT_RELOADING_PORT = 7887
 
 
 
 
 
 
30
 
31
 
32
  class MultiReplicaStreamEvent(TypedDict):
@@ -57,15 +64,19 @@ class ReloadClient:
57
  self.client = httpx.Client(
58
  base_url=f"{base_host}/--replicas/+{replica_hash}",
59
  headers=build_hf_headers(token=token),
 
60
  )
61
 
62
- def get_reload(self, reload_id: str) -> Iterator[ApiGetReloadEventSourceData]:
63
  req = ApiGetReloadRequest(reloadId=reload_id)
64
  with self.client.stream("POST", "/get-reload", json=req) as res:
 
 
65
  hf_raise_for_status(res)
66
  for event in SSEClient(res.iter_bytes()).events():
67
  if event.event == "message":
68
  yield json.loads(event.data)
 
69
 
70
 
71
  def multi_replica_reload_events(
@@ -74,7 +85,10 @@ def multi_replica_reload_events(
74
  subdomain: str,
75
  replica_hashes: list[str],
76
  token: str | None,
77
- ) -> Iterator[MultiReplicaStreamEvent | MultiReplicaStreamReplicaHash | MultiReplicaStreamFullMatch]:
 
 
 
78
  clients = [
79
  ReloadClient(
80
  host=host,
@@ -89,9 +103,20 @@ def multi_replica_reload_events(
89
  for client_index, client in enumerate(clients):
90
  if len(clients) > 1:
91
  yield {"kind": "replicaHash", "hash": client.replica_hash}
 
 
 
 
 
 
 
 
 
 
 
92
  full_match = True
93
  replay: deque[ApiGetReloadEventSourceData] = deque()
94
- for event_index, event in enumerate(client.get_reload(commit_sha)):
95
  if client_index == 0:
96
  first_client_events[event_index] = event
97
  elif full_match := full_match and first_client_events.get(event_index) == event:
@@ -100,5 +125,6 @@ def multi_replica_reload_events(
100
  while replay:
101
  yield {"kind": "event", "event": replay.popleft()}
102
  yield {"kind": "event", "event": event}
 
103
  if client_index > 0 and full_match:
104
  yield {"kind": "fullMatch"}
 
14
 
15
 
16
  import json
17
+ import time
18
  from collections import deque
19
  from collections.abc import Iterator
20
  from typing import Literal, TypedDict
 
28
 
29
 
30
  HOT_RELOADING_PORT = 7887
31
+ CLIENT_TIMEOUT = 20
32
+
33
+
34
class MultiReplicaStreamWarning(TypedDict):
    """Stream item carrying a human-readable warning message.

    The ``kind`` field is always the literal ``"warning"``, which lets
    consumers tell this item apart from the other stream item types.
    """

    kind: Literal["warning"]
    message: str
37
 
38
 
39
  class MultiReplicaStreamEvent(TypedDict):
 
64
  self.client = httpx.Client(
65
  base_url=f"{base_host}/--replicas/+{replica_hash}",
66
  headers=build_hf_headers(token=token),
67
+ timeout=CLIENT_TIMEOUT,
68
  )
69
 
70
def get_reload(self, reload_id: str) -> Iterator[ApiGetReloadEventSourceData] | int:
    """Open the ``/get-reload`` event stream for ``reload_id``.

    Args:
        reload_id: Identifier sent to the server as ``reloadId``.

    Returns:
        The HTTP status code (an ``int``) when the server does not answer
        with 200. Otherwise an iterator over the JSON-decoded data of every
        server-sent event whose type is ``"message"``.

    Note:
        This is deliberately a plain function, not a generator function. A
        ``return <value>`` inside a generator only sets
        ``StopIteration.value``: the caller would always receive a generator
        object and an ``isinstance(result, int)`` check could never succeed.
        As a consequence the request is sent when this method is called
        rather than on the first iteration of the result.
    """
    req = ApiGetReloadRequest(reloadId=reload_id)

    def _stream():
        # The generator owns the response: leaving the `with` block (on
        # exhaustion, `close()` or garbage collection) releases it.
        with self.client.stream("POST", "/get-reload", json=req) as res:
            # The first item is the status code, so the outer function can
            # decide what to hand back before any event is read.
            yield res.status_code
            if res.status_code != 200:
                return
            hf_raise_for_status(res)
            for event in SSEClient(res.iter_bytes()).events():
                if event.event == "message":
                    yield json.loads(event.data)

    stream = _stream()
    status_code = next(stream)  # sends the request and reads the status line
    if status_code != 200:
        stream.close()  # exits the `with` block and closes the response
        return status_code
    # From here on the generator only yields decoded events.
    return stream
80
 
81
 
82
  def multi_replica_reload_events(
 
85
  subdomain: str,
86
  replica_hashes: list[str],
87
  token: str | None,
88
+ max_retries: int = 10,
89
+ ) -> Iterator[
90
+ MultiReplicaStreamWarning | MultiReplicaStreamEvent | MultiReplicaStreamReplicaHash | MultiReplicaStreamFullMatch
91
+ ]:
92
  clients = [
93
  ReloadClient(
94
  host=host,
 
103
  for client_index, client in enumerate(clients):
104
  if len(clients) > 1:
105
  yield {"kind": "replicaHash", "hash": client.replica_hash}
106
+
107
+ retries = 0
108
+ while isinstance((events := client.get_reload(commit_sha)), int):
109
+ if (retries := retries + 1) > max_retries:
110
+ raise Exception("Too many retries reached")
111
+ if (status_code := events) not in (200, 204):
112
+ raise Exception(f"Unexpected {status_code=} on `ReloadClient.get_reload`")
113
+ subject = "reloadId" if status_code == 204 else "replica"
114
+ yield {"kind": "warning", "message": f"Retrying on unexpected {subject} not found"}
115
+ time.sleep(2)
116
+
117
  full_match = True
118
  replay: deque[ApiGetReloadEventSourceData] = deque()
119
+ for event_index, event in enumerate(events):
120
  if client_index == 0:
121
  first_client_events[event_index] = event
122
  elif full_match := full_match and first_client_events.get(event_index) == event:
 
125
  while replay:
126
  yield {"kind": "event", "event": replay.popleft()}
127
  yield {"kind": "event", "event": event}
128
+
129
  if client_index > 0 and full_match:
130
  yield {"kind": "fullMatch"}
.venv/lib/python3.14/site-packages/huggingface_hub/_hot_reload/types.py CHANGED
@@ -56,6 +56,11 @@ class ReloadOperationUI(TypedDict):
56
  updated: bool
57
 
58
 
 
 
 
 
 
59
  class ApiCreateReloadRequest(TypedDict):
60
  filepath: str
61
  contents: str
@@ -86,6 +91,7 @@ class ApiGetReloadEventSourceData(TypedDict):
86
  | ReloadOperationObject
87
  | ReloadOperationRun
88
  | ReloadOperationUI
 
89
  )
90
 
91
 
 
56
  updated: bool
57
 
58
 
59
class ReloadOperationFile(TypedDict):
    """Reload operation entry of kind ``"file"``.

    ``created`` is a boolean flag attached to the operation.
    """

    kind: Literal["file"]
    created: bool
62
+
63
+
64
  class ApiCreateReloadRequest(TypedDict):
65
  filepath: str
66
  contents: str
 
91
  | ReloadOperationObject
92
  | ReloadOperationRun
93
  | ReloadOperationUI
94
+ | ReloadOperationFile
95
  )
96
 
97
 
.venv/lib/python3.14/site-packages/huggingface_hub/_snapshot_download.py CHANGED
@@ -17,9 +17,10 @@ from .errors import (
17
  RevisionNotFoundError,
18
  )
19
  from .file_download import REGEX_COMMIT_HASH, DryRunFileInfo, hf_hub_download, repo_folder_name
20
- from .hf_api import DatasetInfo, HfApi, ModelInfo, RepoFile, SpaceInfo
21
- from .utils import OfflineModeIsEnabled, filter_repo_objects, is_tqdm_disabled, logging, validate_hf_hub_args
22
- from .utils import tqdm as hf_tqdm
 
23
 
24
 
25
  logger = logging.get_logger(__name__)
@@ -144,7 +145,7 @@ def snapshot_download(
144
  repo_id (`str`):
145
  A user or an organization name and a repo name separated by a `/`.
146
  repo_type (`str`, *optional*):
147
- Set to `"dataset"` or `"space"` if downloading from a dataset or space,
148
  `None` or `"model"` if downloading from a model. Default is `None`.
149
  revision (`str`, *optional*):
150
  An optional Git revision id which can be a branch name, a tag, or a
@@ -218,8 +219,10 @@ def snapshot_download(
218
 
219
  if repo_type is None:
220
  repo_type = "model"
221
- if repo_type not in constants.REPO_TYPES:
222
- raise ValueError(f"Invalid repo type: {repo_type}. Accepted repo types are: {str(constants.REPO_TYPES)}")
 
 
223
 
224
  storage_folder = os.path.join(cache_dir, repo_folder_name(repo_id=repo_id, repo_type=repo_type))
225
 
@@ -232,7 +235,7 @@ def snapshot_download(
232
  token=token,
233
  )
234
 
235
- repo_info: ModelInfo | DatasetInfo | SpaceInfo | None = None
236
  api_call_error: Exception | None = None
237
  if not local_files_only:
238
  # try/except logic to handle different errors => taken from `hf_hub_download`
@@ -335,10 +338,10 @@ def snapshot_download(
335
 
336
  # Corner case: on very large repos, the siblings list in `repo_info` might not contain all files.
337
  # In that case, we need to use the `list_repo_tree` method to prevent caching issues.
338
- repo_files: Iterable[str] = [f.rfilename for f in repo_info.siblings] if repo_info.siblings is not None else []
339
- unreliable_nb_files = (
340
- repo_info.siblings is None or len(repo_info.siblings) == 0 or len(repo_info.siblings) > LARGE_REPO_THRESHOLD
341
- )
342
  if unreliable_nb_files:
343
  logger.info(
344
  "Number of files in the repo is unreliable. Using `list_repo_tree` to ensure all files are listed."
@@ -385,14 +388,15 @@ def snapshot_download(
385
  # Create a progress bar for the bytes downloaded
386
  # This progress bar is shared across threads/files and gets updated each time we fetch
387
  # metadata for a file.
388
- bytes_progress = tqdm_class(
 
 
 
389
  desc="Downloading (incomplete total...)",
390
- disable=is_tqdm_disabled(log_level=logger.getEffectiveLevel()),
391
  total=0,
392
  initial=0,
393
  unit="B",
394
  unit_scale=True,
395
- name="huggingface_hub.snapshot_download",
396
  )
397
 
398
  class _AggregatedTqdm:
 
17
  RevisionNotFoundError,
18
  )
19
  from .file_download import REGEX_COMMIT_HASH, DryRunFileInfo, hf_hub_download, repo_folder_name
20
+ from .hf_api import DatasetInfo, HfApi, KernelInfo, ModelInfo, RepoFile, SpaceInfo
21
+ from .utils import OfflineModeIsEnabled, filter_repo_objects, logging, validate_hf_hub_args
22
+ from .utils.tqdm import _create_progress_bar
23
+ from .utils.tqdm import tqdm as hf_tqdm
24
 
25
 
26
  logger = logging.get_logger(__name__)
 
145
  repo_id (`str`):
146
  A user or an organization name and a repo name separated by a `/`.
147
  repo_type (`str`, *optional*):
148
+ Set to `"dataset"`, `"space"` or `"kernel"` if downloading from a dataset, space or kernel repo,
149
  `None` or `"model"` if downloading from a model. Default is `None`.
150
  revision (`str`, *optional*):
151
  An optional Git revision id which can be a branch name, a tag, or a
 
219
 
220
  if repo_type is None:
221
  repo_type = "model"
222
+ if repo_type not in constants.REPO_TYPES_WITH_KERNEL:
223
+ raise ValueError(
224
+ f"Invalid repo type: {repo_type}. Accepted repo types are: {str(constants.REPO_TYPES_WITH_KERNEL)}"
225
+ )
226
 
227
  storage_folder = os.path.join(cache_dir, repo_folder_name(repo_id=repo_id, repo_type=repo_type))
228
 
 
235
  token=token,
236
  )
237
 
238
+ repo_info: ModelInfo | DatasetInfo | SpaceInfo | KernelInfo | None = None
239
  api_call_error: Exception | None = None
240
  if not local_files_only:
241
  # try/except logic to handle different errors => taken from `hf_hub_download`
 
338
 
339
  # Corner case: on very large repos, the siblings list in `repo_info` might not contain all files.
340
  # In that case, we need to use the `list_repo_tree` method to prevent caching issues.
341
+ # Note: kernel repos don't expose siblings in their info response, so we always fall back to `list_repo_tree`.
342
+ siblings = getattr(repo_info, "siblings", None)
343
+ repo_files: Iterable[str] = [f.rfilename for f in siblings] if siblings is not None else []
344
+ unreliable_nb_files = siblings is None or len(siblings) == 0 or len(siblings) > LARGE_REPO_THRESHOLD
345
  if unreliable_nb_files:
346
  logger.info(
347
  "Number of files in the repo is unreliable. Using `list_repo_tree` to ensure all files are listed."
 
388
  # Create a progress bar for the bytes downloaded
389
  # This progress bar is shared across threads/files and gets updated each time we fetch
390
  # metadata for a file.
391
+ bytes_progress = _create_progress_bar(
392
+ cls=tqdm_class,
393
+ log_level=logger.getEffectiveLevel(),
394
+ name="huggingface_hub.snapshot_download",
395
  desc="Downloading (incomplete total...)",
 
396
  total=0,
397
  initial=0,
398
  unit="B",
399
  unit_scale=True,
 
400
  )
401
 
402
  class _AggregatedTqdm:
.venv/lib/python3.14/site-packages/huggingface_hub/_space_api.py CHANGED
@@ -161,11 +161,18 @@ class Volume:
161
  data["path"] = self.path
162
  return data
163
 
 
 
 
 
 
 
 
164
 
165
  @dataclass
166
  class SpaceHotReloading:
167
  status: Literal["created", "canceled"]
168
- replica_statuses: list[tuple[str, str]] # See _hot_reloading_types.ApiCreateReloadResponse.res.status
169
  raw: dict
170
 
171
  def __init__(self, data: dict) -> None:
@@ -222,6 +229,34 @@ class SpaceRuntime:
222
  self.raw = data
223
 
224
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
225
  @dataclass
226
  class SpaceVariable:
227
  """
@@ -249,3 +284,69 @@ class SpaceVariable:
249
  self.description = values.get("description")
250
  updated_at = values.get("updatedAt")
251
  self.updated_at = parse_datetime(updated_at) if updated_at is not None else None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
161
  data["path"] = self.path
162
  return data
163
 
164
def to_hf_handle(self) -> str:
    """Render this volume as an ``hf://`` handle string, as expected by the CLI.

    Layout: ``hf://<type>s/<source>[@<revision>][/<path>]:<mount_path>[:ro|:rw]``.
    The revision and path segments are left out when the corresponding
    attribute is falsy. The access suffix is ``:ro`` when ``read_only`` is
    ``True``, ``:rw`` when it is ``False`` and empty otherwise.
    """
    access_suffixes = {True: ":ro", False: ":rw", None: ""}
    access_suffix = access_suffixes.get(self.read_only, "")

    handle = f"hf://{self.type}s/{self.source}"
    if self.revision:
        handle += f"@{self.revision}"
    if self.path:
        handle += f"/{self.path}"
    handle += f":{self.mount_path}"
    return handle + access_suffix
170
+
171
 
172
  @dataclass
173
  class SpaceHotReloading:
174
  status: Literal["created", "canceled"]
175
+ replica_statuses: list[tuple[str, str | None]] # See _hot_reloading_types.ApiCreateReloadResponse.res.status
176
  raw: dict
177
 
178
  def __init__(self, data: dict) -> None:
 
229
  self.raw = data
230
 
231
 
232
+ @dataclass
233
+ class SpaceSecret:
234
+ """
235
+ Contains information about a secret of a Space.
236
+
237
+ Secret values are write-only and cannot be read back. Only the key, description,
238
+ and last update time are returned by the API.
239
+
240
+ Args:
241
+ key (`str`):
242
+ Secret key. Example: `"GITHUB_API_KEY"`
243
+ description (`str` or None):
244
+ Description of the secret. Example: `"Github API key to access the Github API"`.
245
+ updated_at (`datetime` or None):
246
+ datetime of the last update of the secret (if the secret has been updated at least once).
247
+ """
248
+
249
+ key: str
250
+ description: str | None
251
+ updated_at: datetime | None
252
+
253
+ def __init__(self, key: str, values: dict) -> None:
254
+ self.key = key
255
+ self.description = values.get("description")
256
+ updated_at = values.get("updatedAt")
257
+ self.updated_at = parse_datetime(updated_at) if updated_at is not None else None
258
+
259
+
260
  @dataclass
261
  class SpaceVariable:
262
  """
 
284
  self.description = values.get("description")
285
  updated_at = values.get("updatedAt")
286
  self.updated_at = parse_datetime(updated_at) if updated_at is not None else None
287
+
288
+
289
@dataclass
class SpaceSearchResult:
    """One entry of a Spaces semantic search response.

    Returned by [`HfApi.search_spaces`]. Built from the raw dictionary of a single result;
    only `"id"` is mandatory in that dictionary, every other key falls back to a default.

    Attributes:
        id (`str`):
            ID of the Space (e.g. `"username/repo-name"`).
        author (`str`):
            Author of the Space. Empty string when absent.
        title (`str`):
            Display title of the Space. Empty string when absent.
        emoji (`str` or `None`):
            Emoji icon of the Space.
        sdk (`str` or `None`):
            SDK used by the Space (e.g. `"gradio"`, `"docker"`, `"static"`).
        likes (`int`):
            Number of likes. `0` when absent.
        private (`bool`):
            Whether the Space is private. `False` when absent.
        tags (`list[str]` or `None`):
            List of tags.
        runtime ([`SpaceRuntime`] or `None`):
            Runtime information (stage, hardware, etc.), or `None` when no runtime data is provided.
        ai_short_description (`str` or `None`):
            AI-generated short description.
        ai_category (`str` or `None`):
            AI-generated category (e.g. `"Image Generation"`).
        semantic_relevancy_score (`float` or `None`):
            Semantic relevancy score (0-1) relative to the search query.
        trending_score (`int` or `None`):
            Trending score.
    """

    id: str
    author: str
    title: str
    emoji: str | None
    sdk: str | None
    likes: int
    private: bool
    tags: list[str] | None
    runtime: SpaceRuntime | None
    ai_short_description: str | None
    ai_category: str | None
    semantic_relevancy_score: float | None
    trending_score: int | None

    def __init__(self, data: dict) -> None:
        raw_runtime = data.get("runtime")
        lookup = data.get

        # "id" is the only required key: a missing id raises `KeyError`.
        self.id = data["id"]
        self.author = lookup("author", "")
        self.title = lookup("title", "")
        self.emoji = lookup("emoji")
        self.sdk = lookup("sdk")
        self.likes = lookup("likes", 0)
        self.private = lookup("private", False)
        self.tags = lookup("tags")
        # A missing or empty runtime payload is not wrapped.
        if raw_runtime:
            self.runtime = SpaceRuntime(raw_runtime)
        else:
            self.runtime = None
        self.ai_short_description = lookup("ai_short_description")
        self.ai_category = lookup("ai_category")
        self.semantic_relevancy_score = lookup("semanticRelevancyScore")
        self.trending_score = lookup("trendingScore")