PhysShell vikp committed on
Commit d553408 · 0 Parent(s):

Duplicate from datalab-to/chandra-ocr-2

Co-authored-by: Vikas Paruchuri <vikp@users.noreply.huggingface.co>

.eval_results/olmocrbench.yaml ADDED
@@ -0,0 +1,80 @@
- dataset:
    id: allenai/olmOCR-bench
    task_id: overall
  value: 85.9
  source:
    url: https://huggingface.co/datalab-to/chandra-ocr-2
    name: Chandra OCR 2 Model Card
  user: davanstrien

- dataset:
    id: allenai/olmOCR-bench
    task_id: arxiv_math
  value: 90.2
  source:
    url: https://huggingface.co/datalab-to/chandra-ocr-2
    name: Chandra OCR 2 Model Card
  user: davanstrien

- dataset:
    id: allenai/olmOCR-bench
    task_id: old_scans_math
  value: 89.3
  source:
    url: https://huggingface.co/datalab-to/chandra-ocr-2
    name: Chandra OCR 2 Model Card
  user: davanstrien

- dataset:
    id: allenai/olmOCR-bench
    task_id: table_tests
  value: 89.9
  source:
    url: https://huggingface.co/datalab-to/chandra-ocr-2
    name: Chandra OCR 2 Model Card
  user: davanstrien

- dataset:
    id: allenai/olmOCR-bench
    task_id: old_scans
  value: 49.8
  source:
    url: https://huggingface.co/datalab-to/chandra-ocr-2
    name: Chandra OCR 2 Model Card
  user: davanstrien

- dataset:
    id: allenai/olmOCR-bench
    task_id: headers_footers
  value: 92.5
  source:
    url: https://huggingface.co/datalab-to/chandra-ocr-2
    name: Chandra OCR 2 Model Card
  user: davanstrien

- dataset:
    id: allenai/olmOCR-bench
    task_id: multi_column
  value: 83.5
  source:
    url: https://huggingface.co/datalab-to/chandra-ocr-2
    name: Chandra OCR 2 Model Card
  user: davanstrien

- dataset:
    id: allenai/olmOCR-bench
    task_id: long_tiny_text
  value: 92.1
  source:
    url: https://huggingface.co/datalab-to/chandra-ocr-2
    name: Chandra OCR 2 Model Card
  user: davanstrien

- dataset:
    id: allenai/olmOCR-bench
    task_id: baseline
  value: 99.6
  source:
    url: https://huggingface.co/datalab-to/chandra-ocr-2
    name: Chandra OCR 2 Model Card
  user: davanstrien
.gitattributes ADDED
@@ -0,0 +1,37 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
tokenizer.json filter=lfs diff=lfs merge=lfs -text
handwritten_form.png filter=lfs diff=lfs merge=lfs -text
LICENSE ADDED
@@ -0,0 +1,59 @@
AI PUBS OPEN RAIL-M LICENSE (MODIFIED)

Version 0.1, March 2, 2023 (Modified)
http://licenses.ai/

PLEASE READ THESE TERMS CAREFULLY BEFORE USING THE MODEL OR A DERIVATIVE WORK OF THE MODEL MADE AVAILABLE IN CONNECTION WITH THESE TERMS. BY DOWNLOADING, REPRODUCING, DISTRIBUTING OR USING THE MODEL OR A DERIVATIVE WORK OF THE MODEL IN ANY MANNER, YOU (“YOU”) AGREE TO BE BOUND BY THESE TERMS (THE “AGREEMENT”) TO THE EXCLUSION OF ALL OTHER TERMS. YOU REPRESENT AND WARRANT THAT YOU HAVE THE AUTHORITY TO ENTER INTO THIS AGREEMENT; IF YOU ARE ENTERING INTO THIS AGREEMENT ON BEHALF OF AN ORGANIZATION OR ENTITY, REFERENCES TO “YOU” IN THIS AGREEMENT REFER TO THAT ORGANIZATION OR ENTITY. IF YOU DO NOT AGREE TO ALL OF THE FOLLOWING, YOU MAY NOT DOWNLOAD, REPRODUCE, DISTRIBUTE OR USE THE MODEL OR A DERIVATIVE WORK OF THE MODEL IN ANY MANNER.
Section I: PREAMBLE
This OpenRAIL-M License, as modified, is generally applicable to any machine-learning Model.
The “Open” nomenclature indicates that the licensed Model is freely accessible to downstream and other users. The “RAIL” nomenclature indicates that there are use restrictions prohibiting certain uses of the Model. These restrictions are intended to avoid potential misuse. This License specifies that the use restrictions in the original License must apply to such derivatives.
NOW THEREFORE, You and Licensor agree as follows:
1. Definitions
(a) “Complementary Material” means the applicable source code and scripts used to define, run, load, benchmark or evaluate the Model, and used to prepare data for training or evaluation, if any. This includes any accompanying documentation, tutorials, examples, and any related information, if any. Complementary Material is not licensed under this License.
(b) "Contribution" means any work, including the original version of the Model and any modifications or additions to that Model or Derivatives of the Model thereof, that is intentionally submitted to Licensor for inclusion in the Model by the rights owner or by an individual or legal entity authorized to submit on behalf of the rights owner. For the purposes of this definition, “submitted” means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Model, but excluding communication that is conspicuously marked or otherwise designated in writing by the rights owner as "Not a Contribution."
(c) "Contributor" means Licensor and any individual or legal entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Model.
(d) “Data” means a collection of information and/or content extracted from the dataset used with the Model, including to train, pretrain, or otherwise evaluate the Model. The Data is not licensed under this License.
(e) “Derivatives of the Model” means all modifications to the Model, works based on the Model, or any other model which is created or initialized by transfer of patterns of the weights, parameters, activations or output of the Model, to the other model, in order to cause the other model to perform similarly to the Model, including - but not limited to - distillation methods entailing the use of intermediate data representations or methods based on the generation of synthetic data by the Model for training the other model.
(f) “Distribution” means any transmission, reproduction, publication, distribution, or other sharing of the Model or Derivatives of the Model to a third party, including providing the Model as a hosted service made available by electronic or other remote means, including but not limited to API-based or web access.
(g) “Harm” includes but is not limited to physical, mental, psychological, financial and reputational damage, pain, or loss.
(h) "License" means the terms and conditions for use, reproduction, and Distribution as defined in this document.
(i) “Licensor” means the rights owner or entity authorized by the rights owner that is granting the License, including the persons or entities that may have rights in the Model and/or distributing the Model.
(j) “Model” means any accompanying machine-learning based assemblies (including checkpoints), consisting of learnt weights, parameters (including optimizer states), corresponding to the model architecture as embodied in the Complementary Material, that have been trained or tuned, in whole or in part on the Data, using the Complementary Material.
(k) “Output” means the results of operating a Model as embodied in informational content resulting therefrom.
(l) “Third Parties” means individuals or legal entities that are not under common control with Licensor or You.
(m) "You" (or "Your") means an individual or legal entity exercising permissions granted by this License and/or making use of the Model for whichever purpose and in any field of use, including usage of the Model in an end-use application, including but not limited to a chatbot, translator, or image generator.
Section II: INTELLECTUAL PROPERTY RIGHTS
Both copyright and patent grants may apply to the Model and Derivatives of the Model. The Model and Derivatives of the Model are subject to additional terms as described in Section III, which shall govern the use of the Model and Derivatives of the Model even in the event Section II is held unenforceable.
2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare, publicly display, publicly perform, sublicense, and distribute the Model and Derivatives of the Model.
3. Grant of Patent License. Subject to the terms and conditions of this License and where and as applicable, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this paragraph) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Model and/or Derivatives of the Model where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Model or Derivatives of the Model to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Model or Derivative of the Model and/or a Contribution incorporated within the Model or Derivative of the Model constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for the Model and/or Derivative of the Model shall terminate as of the date such litigation is asserted or filed.
Section III: CONDITIONS OF USAGE, DISTRIBUTION AND REDISTRIBUTION
4. Distribution and Redistribution. You may host the Model or Derivatives of the Model for remote access by Third Parties, including but not limited to software-as-a-service, reproduce, or Distribute copies of the Model or Derivatives of the Model thereof in any medium, with or without modifications, provided that You meet the conditions in this Section III:
(a) Use-based restrictions in paragraph 5 MUST be included as an enforceable provision by You in any type of legal agreement (for example, a license) governing the use and/or distribution of the Model or Derivatives of the Model, and You shall give notice to subsequent users You Distribute to, that the Model and Derivatives of the Model are subject to paragraph 5;
(b) You must give any Third Party recipients of the Model or Derivatives of the Model a copy of this License;
(c) You must cause any modified files to carry prominent notices stating that You changed the files; and
(d) You must retain all copyright, patent, trademark, and attribution notices excluding those notices that do not pertain to any part of the Model or Derivatives of the Model.
You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions, consistent with paragraph 4.a., for use, reproduction, or Distribution of Your modifications, or for any such Derivatives of the Model as a whole, provided Your use, reproduction, and Distribution of the Model otherwise complies with the conditions stated in this License.
5. Use-based restrictions. The restrictions set forth in Attachment A are considered Use-based restrictions. Accordingly, You cannot use the Model or the Derivatives of the Model in violation of such restrictions. You may use the Model subject to this License, including only for lawful purposes and in accordance with the License. Use may include creating any content with, fine-tuning, updating, running, training, evaluating and/or re-parametrizing the Model. You shall require all of Your users who use the Model or a Derivative of the Model to comply with the terms of this paragraph 5.
6. The Output You Generate. Except as set forth herein, Licensor claims no rights in the Output You generate using the Model. You are solely responsible for the Output you generate and its subsequent uses. No use of the Output can contravene any provision as stated in the License.
7. Attribution. In connection with any Output, or use or Distribution of any Model or Derivatives of the Model, You agree to give appropriate credit and attribution to Licensor, provide a link to the original Model or Derivatives of the Model, provide a copy of this License, and identify any changes You have made to the Model or Derivatives of the Model (collectively, the “Attribution”). The Attribution must not suggest endorsement by any Licensor.
8. Share-a-Like. As a condition to the license and authorizations herein, You agree to apply this License (to the exclusion of all others) to any and all copies of the Model, Derivatives of the Model, any changes or improvements to the Model or Derivatives of the Model, and to the Output and any derivatives, changes or improvements to or of the Output.
Section IV: OTHER PROVISIONS
9. Updates and Runtime Restrictions. To the maximum extent permitted by law, Licensor reserves the right to restrict (remotely or otherwise) usage of the Model in violation of this License, update the Model through electronic means, or cause modification to the Output resulting from updates to the Model.
10. Trademarks and related. Nothing in this License permits You to make use of Licensors’ trademarks, trade names, logos or to otherwise suggest endorsement or misrepresent the relationship between the parties; and any rights not expressly granted herein are reserved by the Licensors.
11. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Model (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Model and Derivatives of the Model, and assume any risks associated with Your exercise of permissions under this License.
12. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Model (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages.
13. Accepting Warranty or Additional Liability. While Distributing the Model or Derivatives of the Model, You may choose to charge a fee in exchange for support, warranty, indemnity, or other obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor or Licensor, and only if You agree to indemnify, defend, and hold each Contributor and the Licensor harmless for any liability incurred by, or claims asserted against, such Contributor or Licensor by reason of your accepting any such warranty or additional liability.
14. If any provision of this License is held to be invalid, illegal or unenforceable, the remaining provisions shall be unaffected thereby and remain valid as if such provision had not been set forth herein.
END OF TERMS AND CONDITIONS

Attachment A
USE RESTRICTIONS
As conditions to the Licenses set forth in this Agreement, You agree not to use, reproduce, modify, create or Distribute the Model, Derivatives of the Model, or Output (collectively, “Use”) in any of the following ways:
1. Legal:
(a) In any way that violates any applicable national, federal, state, local or international law or regulation; or
(b) to directly or indirectly infringe or misappropriate any third party intellectual property rights (including those of Licensor or any Contributor).
2. Commercial:
(a) for any purpose if You (your employer, or the entity you are affiliated with) generated more than two million US Dollars ($2,000,000) in gross revenue in the prior year, except where Your Use is limited to personal use or research purposes;
(b) for any purpose if You (your employer, or the entity you are affiliated with) have raised more than two million US dollars ($2,000,000) in total equity or debt funding from any source, except where Your Use is limited to personal use or research purposes; or
(c) for any purpose if You (your employer, or the entity you are affiliated with) provide or otherwise make available any product or service that competes with any product or service offered by or made available by Licensor or any of its affiliates.
Commercial and broader use licenses may be available from Licensor at the following URL: https://www.datalab.to/
README.md ADDED
@@ -0,0 +1,225 @@
---
library_name: transformers
license: openrail
license_link: LICENSE
tags:
- ocr
- pdf
- markdown
- layout
---

<p align="center">
<img src="datalab-logo.png" alt="Datalab Logo" width="150"/>
</p>

# Chandra OCR 2

Chandra 2 is a state-of-the-art OCR model from [Datalab](https://www.datalab.to) that outputs markdown, HTML, and JSON. It is highly accurate at extracting text from images and PDFs, while preserving layout information.

Try Chandra in the [free playground](https://www.datalab.to/playground), or use the [hosted API](https://www.datalab.to/) for higher accuracy and speed.

## What's New in Chandra 2

- 85.9% olmOCR bench score (SOTA); 77.8% multilingual bench score (a 12% improvement over Chandra 1)
- Significant improvements to math, tables, and complex layouts
- Improved layout analysis, especially on wider documents
- Significantly better image captioning
- Support for 90+ languages, with major accuracy gains

## Features

- Convert documents to markdown, HTML, or JSON with detailed layout information
- Excellent handwriting support
- Reconstructs forms accurately, including checkboxes
- Strong performance on tables, math, and complex layouts
- Extracts images and diagrams, with captions and structured data
- Support for 90+ languages

<img src="handwritten_form.png" width="600px"/>

## Quickstart

```shell
pip install chandra-ocr

# With vLLM (recommended, easy install)
chandra_vllm
chandra input.pdf ./output

# With HuggingFace (requires torch)
pip install chandra-ocr[hf]
chandra input.pdf ./output --method hf
```
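
The `chandra` CLI covers most workflows. If you would rather call the server started by `chandra_vllm` yourself, the sketch below shows one way to do it through vLLM's OpenAI-compatible API. The endpoint URL, served model name, and prompt text here are illustrative assumptions, not documented defaults:

```python
# Minimal sketch: send one page image to a vLLM OpenAI-compatible server.
# Assumes chandra_vllm serves on localhost:8000 under the model name below.
import base64

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

with open("document.png", "rb") as f:
    image_b64 = base64.b64encode(f.read()).decode()

response = client.chat.completions.create(
    model="datalab-to/chandra-ocr-2",  # assumed served model name
    messages=[{
        "role": "user",
        "content": [
            {
                "type": "image_url",
                "image_url": {"url": f"data:image/png;base64,{image_b64}"},
            },
            # Hypothetical instruction; the chandra client builds the real prompt
            {"type": "text", "text": "Convert this page to markdown."},
        ],
    }],
)
print(response.choices[0].message.content)
```

For real use, prefer the `InferenceManager` API below, which constructs the correct prompts for you.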

## Usage

### With vLLM (recommended)

```python
from chandra.model import InferenceManager
from chandra.model.schema import BatchInputItem
from PIL import Image

# Start the vLLM server first with: chandra_vllm
manager = InferenceManager(method="vllm")
batch = [
    BatchInputItem(
        image=Image.open("document.png"),
        prompt_type="ocr_layout"
    )
]
result = manager.generate(batch)[0]
print(result.markdown)
```

### With HuggingFace Transformers

```python
from transformers import AutoModelForImageTextToText, AutoProcessor
from chandra.model.hf import generate_hf
from chandra.model.schema import BatchInputItem
from chandra.output import parse_markdown
from PIL import Image
import torch

model = AutoModelForImageTextToText.from_pretrained(
    "datalab-to/chandra-ocr-2",
    dtype=torch.bfloat16,
    device_map="auto",
)
model.eval()
model.processor = AutoProcessor.from_pretrained("datalab-to/chandra-ocr-2")
model.processor.tokenizer.padding_side = "left"

batch = [
    BatchInputItem(
        image=Image.open("document.png"),
        prompt_type="ocr_layout"
    )
]

result = generate_hf(batch, model)[0]
markdown = parse_markdown(result.raw)
print(markdown)
```
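
Both snippets take a page image. The CLI rasterizes PDFs for you; if you want to feed PDF pages to the Python API yourself, here is a minimal sketch using pypdfium2 (an assumed choice of rasterizer, not a chandra dependency; any library that yields a PIL image at around 200 DPI should work):

```python
# Sketch: rasterize the first page of a PDF to a PIL image for OCR.
import pypdfium2 as pdfium

pdf = pdfium.PdfDocument("input.pdf")
page = pdf[0]
# scale = 200/72 renders at roughly 200 DPI, a reasonable resolution for OCR
pil_image = page.render(scale=200 / 72).to_pil()
pil_image.save("document.png")
```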

## Benchmarks

### olmOCR Benchmark

<img src="bench.png" width="600px"/>

| **Model** | ArXiv | Old Scans Math | Tables | Old Scans | Headers and Footers | Multi Column | Long Tiny Text | Base | Overall | Source |
|:----------|:--------:|:--------------:|:--------:|:---------:|:-------------------:|:------------:|:--------------:|:----:|:--------------:|:------:|
| Datalab API | **90.4** | **90.2** | **90.7** | **54.6** | 91.6 | 83.7 | **92.3** | **99.9** | **86.7 ± 0.8** | Own benchmarks |
| Chandra 2 | 90.2 | 89.3 | 89.9 | 49.8 | 92.5 | 83.5 | 92.1 | 99.6 | 85.9 ± 0.8 | Own benchmarks |
| dots.ocr 1.5 | 85.9 | 85.5 | **90.7** | 48.2 | 94.0 | **85.3** | 81.6 | 99.7 | 83.9 | dots.ocr repo |
| Chandra 1 | 82.2 | 80.3 | 88.0 | 50.4 | 90.8 | 81.2 | **92.3** | **99.9** | 83.1 ± 0.9 | Own benchmarks |
| olmOCR 2 | 83.0 | 82.3 | 84.9 | 47.7 | **96.1** | 83.7 | 81.9 | 99.6 | 82.4 | olmocr repo |
| dots.ocr | 82.1 | 64.2 | 88.3 | 40.9 | 94.1 | 82.4 | 81.2 | 99.5 | 79.1 ± 1.0 | dots.ocr repo |
| olmOCR v0.3.0 | 78.6 | 79.9 | 72.9 | 43.9 | 95.1 | 77.3 | 81.2 | 98.9 | 78.5 ± 1.1 | olmocr repo |
| Datalab Marker v1.10.0 | 83.8 | 69.7 | 74.8 | 32.3 | 86.6 | 79.4 | 85.7 | 99.6 | 76.5 ± 1.0 | Own benchmarks |
| Deepseek OCR | 75.2 | 72.3 | 79.7 | 33.3 | **96.1** | 66.7 | 80.1 | 99.7 | 75.4 ± 1.0 | Own benchmarks |
| Mistral OCR API | 77.2 | 67.5 | 60.6 | 29.3 | 93.6 | 71.3 | 77.1 | 99.4 | 72.0 ± 1.1 | olmocr repo |
| GPT-4o (Anchored) | 53.5 | 74.5 | 70.0 | 40.7 | 93.8 | 69.3 | 60.6 | 96.8 | 69.9 ± 1.1 | olmocr repo |
| Qwen 3 VL 8B | 70.2 | 75.1 | 45.6 | 37.5 | 89.1 | 62.1 | 43.0 | 94.3 | 64.6 ± 1.1 | Own benchmarks |
| Gemini Flash 2 (Anchored) | 54.5 | 56.1 | 72.1 | 34.2 | 64.7 | 61.5 | 71.5 | 95.6 | 63.8 ± 1.2 | olmocr repo |

### Multilingual Benchmark (43 Languages)

The table below covers the 43 most common languages, benchmarked across multiple models. For a comprehensive evaluation across 90 languages (Chandra 2 vs Gemini 2.5 Flash only), see the [full 90-language benchmark](#full-90-language-benchmark).

<img src="multilingual.png" width="600px"/>

| Language | Datalab API | Chandra 2 | Chandra 1 | Gemini 2.5 Flash | GPT-5 Mini |
|---|:---:|:---:|:---:|:---:|:---:|
| ar | 67.6% | 68.4% | 34.0% | 84.4% | 55.6% |
| bn | 85.1% | 72.8% | 45.6% | 55.3% | 23.3% |
| ca | 88.7% | 85.1% | 84.2% | 88.0% | 78.5% |
| cs | 88.2% | 85.3% | 84.7% | 79.1% | 78.8% |
| da | 90.1% | 91.1% | 88.4% | 86.0% | 87.7% |
| de | 93.8% | 94.8% | 83.0% | 88.3% | 93.8% |
| el | 89.9% | 85.6% | 85.5% | 83.5% | 82.4% |
| es | 91.8% | 89.3% | 88.7% | 86.8% | 97.1% |
| fa | 82.2% | 75.1% | 69.6% | 61.8% | 56.4% |
| fi | 85.7% | 83.4% | 78.4% | 86.0% | 84.7% |
| fr | 93.3% | 93.7% | 89.6% | 86.1% | 91.1% |
| gu | 73.8% | 70.8% | 44.6% | 47.6% | 11.5% |
| he | 76.4% | 70.4% | 38.9% | 50.9% | 22.3% |
| hi | 80.5% | 78.4% | 70.2% | 82.7% | 41.0% |
| hr | 93.4% | 90.1% | 85.9% | 88.2% | 81.3% |
| hu | 88.1% | 82.1% | 82.5% | 84.5% | 84.8% |
| id | 91.3% | 91.6% | 86.7% | 88.3% | 89.7% |
| it | 94.4% | 94.1% | 89.1% | 85.7% | 91.6% |
| ja | 87.3% | 86.9% | 85.4% | 80.0% | 76.1% |
| jv | 87.5% | 73.2% | 85.1% | 80.4% | 69.6% |
| kn | 70.0% | 63.2% | 20.6% | 24.5% | 10.1% |
| ko | 89.1% | 81.5% | 82.3% | 84.8% | 78.4% |
| la | 78.0% | 73.8% | 55.9% | 70.5% | 54.6% |
| ml | 72.4% | 64.3% | 18.1% | 23.8% | 11.9% |
| mr | 80.8% | 75.0% | 57.0% | 69.7% | 20.9% |
| nl | 90.0% | 88.6% | 85.3% | 87.5% | 83.8% |
| no | 89.2% | 90.3% | 85.5% | 87.8% | 87.4% |
| pl | 93.8% | 91.5% | 83.9% | 89.7% | 90.4% |
| pt | 97.0% | 95.2% | 84.3% | 89.4% | 90.8% |
| ro | 86.2% | 84.5% | 82.1% | 76.1% | 77.3% |
| ru | 88.8% | 85.5% | 88.7% | 82.8% | 72.2% |
| sa | 57.5% | 51.1% | 33.6% | 44.6% | 12.5% |
| sr | 95.3% | 90.3% | 82.3% | 89.7% | 83.0% |
| sv | 91.9% | 92.8% | 82.1% | 91.1% | 92.1% |
| ta | 82.9% | 77.7% | 50.8% | 53.9% | 8.1% |
| te | 69.4% | 58.6% | 19.5% | 33.3% | 9.9% |
| th | 71.6% | 62.6% | 47.0% | 66.7% | 53.8% |
| tr | 88.9% | 84.1% | 68.1% | 84.1% | 78.2% |
| uk | 93.1% | 91.0% | 88.5% | 87.9% | 81.9% |
| ur | 54.1% | 43.2% | 28.1% | 57.6% | 16.9% |
| vi | 85.0% | 80.4% | 81.6% | 89.5% | 83.6% |
| zh | 87.8% | 88.7% | 88.3% | 70.0% | 70.4% |
| **Average** | **80.4%** | **77.8%** | **69.4%** | **67.6%** | **60.5%** |

### Full 90-Language Benchmark

We also have a more comprehensive evaluation covering 90 languages, comparing Chandra 2 against Gemini 2.5 Flash. The average scores are lower than in the 43-language table above because this set includes many lower-resource languages. Chandra 2 averages **72.7%** vs Gemini 2.5 Flash at **60.8%**.

See the [full 90-language results](https://github.com/datalab-to/chandra/blob/master/FULL_BENCHMARKS.md).

## Examples

| Type | Name | Link |
|------|------|------|
| Tables | Statistical Distribution | [View](https://github.com/datalab-to/chandra/blob/master/assets/examples/tables/complex_tables.png) |
| Tables | Financial Table | [View](https://github.com/datalab-to/chandra/blob/master/assets/examples/tables/financial_table.png) |
| Forms | Registration Form | [View](https://github.com/datalab-to/chandra/blob/master/assets/examples/forms/handwritten_form.png) |
| Forms | Lease Form | [View](https://github.com/datalab-to/chandra/blob/master/assets/examples/forms/lease_filled.png) |
| Math | CS229 Textbook | [View](https://github.com/datalab-to/chandra/blob/master/assets/examples/math/cs229.png) |
| Math | Handwritten Math | [View](https://github.com/datalab-to/chandra/blob/master/assets/examples/math/handwritten_math.png) |
| Math | Chinese Math | [View](https://github.com/datalab-to/chandra/blob/master/assets/examples/math/chinese_math.png) |
| Handwriting | Cursive Writing | [View](https://github.com/datalab-to/chandra/blob/master/assets/examples/handwriting/cursive_writing.png) |
| Handwriting | Handwritten Notes | [View](https://github.com/datalab-to/chandra/blob/master/assets/examples/handwriting/handwritten_notes.png) |
| Languages | Arabic | [View](https://github.com/datalab-to/chandra/blob/master/assets/examples/languages/arabic.png) |
| Languages | Japanese | [View](https://github.com/datalab-to/chandra/blob/master/assets/examples/languages/japanese.png) |
| Languages | Hindi | [View](https://github.com/datalab-to/chandra/blob/master/assets/examples/languages/hindi.png) |
| Languages | Russian | [View](https://github.com/datalab-to/chandra/blob/master/assets/examples/languages/russian.png) |
| Other | Charts | [View](https://github.com/datalab-to/chandra/blob/master/assets/examples/other/charts.png) |
| Other | Chemistry | [View](https://github.com/datalab-to/chandra/blob/master/assets/examples/other/chemistry.png) |

## Throughput

Benchmarked with vLLM on a single NVIDIA H100 80GB GPU, using a diverse mix of documents (math, tables, scans, multi-column layouts) from the olmOCR benchmark set. This set is significantly harder, and therefore slower, than typical real-world documents; we estimate about 2 pages/s in real-world usage.

| Configuration | Pages/sec | Avg Latency | P95 Latency | Failure Rate |
|---|:---:|:---:|:---:|:---:|
| vLLM, 96 concurrent sequences | 1.44 | 60s | 156s | 0% |

## Commercial Usage

The code is Apache 2.0. The model weights use a modified OpenRAIL-M license: free for research, personal use, and for startups under $2M in funding or revenue. The weights cannot be used to compete with our API. For broader commercial licensing, see [pricing](https://www.datalab.to/pricing?utm_source=gh-chandra).

## Credits

- [Hugging Face Transformers](https://github.com/huggingface/transformers)
- [vLLM](https://github.com/vllm-project/vllm)
- [olmocr](https://github.com/allenai/olmocr)
- [Qwen 3.5](https://github.com/QwenLM/Qwen3)
bench.png ADDED
chat_template.jinja ADDED
@@ -0,0 +1,150 @@
{%- set image_count = namespace(value=0) %}
{%- set video_count = namespace(value=0) %}
{%- macro render_content(content, do_vision_count, is_system_content=false) %}
    {%- if content is string %}
        {{- content }}
    {%- elif content is iterable and content is not mapping %}
        {%- for item in content %}
            {%- if 'image' in item or 'image_url' in item or item.type == 'image' %}
                {%- if is_system_content %}
                    {{- raise_exception('System message cannot contain images.') }}
                {%- endif %}
                {%- if do_vision_count %}
                    {%- set image_count.value = image_count.value + 1 %}
                {%- endif %}
                {%- if add_vision_id %}
                    {{- 'Picture ' ~ image_count.value ~ ': ' }}
                {%- endif %}
                {{- '<|vision_start|><|image_pad|><|vision_end|>' }}
            {%- elif 'video' in item or item.type == 'video' %}
                {%- if is_system_content %}
                    {{- raise_exception('System message cannot contain videos.') }}
                {%- endif %}
                {%- if do_vision_count %}
                    {%- set video_count.value = video_count.value + 1 %}
                {%- endif %}
                {%- if add_vision_id %}
                    {{- 'Video ' ~ video_count.value ~ ': ' }}
                {%- endif %}
                {{- '<|vision_start|><|video_pad|><|vision_end|>' }}
            {%- elif 'text' in item %}
                {{- item.text }}
            {%- else %}
                {{- raise_exception('Unexpected item type in content.') }}
            {%- endif %}
        {%- endfor %}
    {%- elif content is none or content is undefined %}
        {{- '' }}
    {%- else %}
        {{- raise_exception('Unexpected content type.') }}
    {%- endif %}
{%- endmacro %}
{%- if not messages %}
    {{- raise_exception('No messages provided.') }}
{%- endif %}
{%- if tools and tools is iterable and tools is not mapping %}
    {{- '<|im_start|>system\n' }}
    {{- "# Tools\n\nYou have access to the following functions:\n\n<tools>" }}
    {%- for tool in tools %}
        {{- "\n" }}
        {{- tool | tojson }}
    {%- endfor %}
    {{- "\n</tools>" }}
    {{- '\n\nIf you choose to call a function ONLY reply in the following format with NO suffix:\n\n<tool_call>\n<function=example_function_name>\n<parameter=example_parameter_1>\nvalue_1\n</parameter>\n<parameter=example_parameter_2>\nThis is the value for the second parameter\nthat can span\nmultiple lines\n</parameter>\n</function>\n</tool_call>\n\n<IMPORTANT>\nReminder:\n- Function calls MUST follow the specified format: an inner <function=...></function> block must be nested within <tool_call></tool_call> XML tags\n- Required parameters MUST be specified\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls\n</IMPORTANT>' }}
    {%- if messages[0].role == 'system' %}
        {%- set content = render_content(messages[0].content, false, true)|trim %}
        {%- if content %}
            {{- '\n\n' + content }}
        {%- endif %}
    {%- endif %}
    {{- '<|im_end|>\n' }}
{%- else %}
    {%- if messages[0].role == 'system' %}
        {%- set content = render_content(messages[0].content, false, true)|trim %}
        {{- '<|im_start|>system\n' + content + '<|im_end|>\n' }}
    {%- endif %}
{%- endif %}
{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
{%- for message in messages[::-1] %}
    {%- set index = (messages|length - 1) - loop.index0 %}
    {%- if ns.multi_step_tool and message.role == "user" %}
        {%- set content = render_content(message.content, false)|trim %}
        {%- if not(content.startswith('<tool_response>') and content.endswith('</tool_response>')) %}
            {%- set ns.multi_step_tool = false %}
            {%- set ns.last_query_index = index %}
        {%- endif %}
    {%- endif %}
{%- endfor %}
{%- if ns.multi_step_tool %}
    {{- raise_exception('No user query found in messages.') }}
{%- endif %}
{%- for message in messages %}
    {%- set content = render_content(message.content, true)|trim %}
    {%- if message.role == "system" %}
        {%- if not loop.first %}
            {{- raise_exception('System message must be at the beginning.') }}
        {%- endif %}
    {%- elif message.role == "user" %}
        {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
    {%- elif message.role == "assistant" %}
        {%- set reasoning_content = '' %}
        {%- if message.reasoning_content is string %}
            {%- set reasoning_content = message.reasoning_content %}
        {%- else %}
            {%- if '</think>' in content %}
                {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
                {%- set content = content.split('</think>')[-1].lstrip('\n') %}
            {%- endif %}
        {%- endif %}
        {%- set reasoning_content = reasoning_content|trim %}
        {%- if loop.index0 > ns.last_query_index %}
            {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content + '\n</think>\n\n' + content }}
        {%- else %}
            {{- '<|im_start|>' + message.role + '\n' + content }}
        {%- endif %}
        {%- if message.tool_calls and message.tool_calls is iterable and message.tool_calls is not mapping %}
            {%- for tool_call in message.tool_calls %}
                {%- if tool_call.function is defined %}
                    {%- set tool_call = tool_call.function %}
                {%- endif %}
                {%- if loop.first %}
                    {%- if content|trim %}
                        {{- '\n\n<tool_call>\n<function=' + tool_call.name + '>\n' }}
                    {%- else %}
                        {{- '<tool_call>\n<function=' + tool_call.name + '>\n' }}
                    {%- endif %}
                {%- else %}
                    {{- '\n<tool_call>\n<function=' + tool_call.name + '>\n' }}
                {%- endif %}
                {%- if tool_call.arguments is defined %}
                    {%- for args_name, args_value in tool_call.arguments|items %}
                        {{- '<parameter=' + args_name + '>\n' }}
                        {%- set args_value = args_value | tojson | safe if args_value is mapping or (args_value is sequence and args_value is not string) else args_value | string %}
                        {{- args_value }}
                        {{- '\n</parameter>\n' }}
                    {%- endfor %}
                {%- endif %}
                {{- '</function>\n</tool_call>' }}
            {%- endfor %}
        {%- endif %}
        {{- '<|im_end|>\n' }}
    {%- elif message.role == "tool" %}
        {%- if loop.previtem and loop.previtem.role != "tool" %}
            {{- '<|im_start|>user' }}
        {%- endif %}
        {{- '\n<tool_response>\n' }}
        {{- content }}
        {{- '\n</tool_response>' }}
        {%- if not loop.last and loop.nextitem.role != "tool" %}
            {{- '<|im_end|>\n' }}
        {%- elif loop.last %}
            {{- '<|im_end|>\n' }}
        {%- endif %}
    {%- else %}
        {{- raise_exception('Unexpected message role.') }}
    {%- endif %}
{%- endfor %}
{%- if add_generation_prompt %}
    {{- '<|im_start|>assistant\n' }}
    {{- '<think>\n\n</think>\n\n' }}
{%- endif %}
config.json ADDED
@@ -0,0 +1,107 @@
{
  "architectures": [
    "Qwen3_5ForConditionalGeneration"
  ],
  "image_token_id": 248056,
  "model_type": "qwen3_5",
  "text_config": {
    "attention_bias": false,
    "attention_dropout": 0.0,
    "attn_output_gate": true,
    "bos_token_id": null,
    "dtype": "bfloat16",
    "eos_token_id": 248044,
    "full_attention_interval": 4,
    "head_dim": 256,
    "hidden_act": "silu",
    "hidden_size": 2560,
    "initializer_range": 0.02,
    "intermediate_size": 9216,
    "layer_types": [
      "linear_attention",
      "linear_attention",
      "linear_attention",
      "full_attention",
      "linear_attention",
      "linear_attention",
      "linear_attention",
      "full_attention",
      "linear_attention",
      "linear_attention",
      "linear_attention",
      "full_attention",
      "linear_attention",
      "linear_attention",
      "linear_attention",
      "full_attention",
      "linear_attention",
      "linear_attention",
      "linear_attention",
      "full_attention",
      "linear_attention",
      "linear_attention",
      "linear_attention",
      "full_attention",
      "linear_attention",
      "linear_attention",
      "linear_attention",
      "full_attention",
      "linear_attention",
      "linear_attention",
      "linear_attention",
      "full_attention"
    ],
    "linear_conv_kernel_dim": 4,
    "linear_key_head_dim": 128,
    "linear_num_key_heads": 16,
    "linear_num_value_heads": 32,
    "linear_value_head_dim": 128,
    "mamba_ssm_dtype": "float32",
    "max_position_embeddings": 262144,
    "mlp_only_layers": [],
    "model_type": "qwen3_5_text",
    "mtp_num_hidden_layers": 1,
    "mtp_use_dedicated_embeddings": false,
    "num_attention_heads": 16,
    "num_hidden_layers": 32,
    "num_key_value_heads": 4,
    "pad_token_id": null,
    "partial_rotary_factor": 0.25,
    "rms_norm_eps": 1e-06,
    "rope_parameters": {
      "mrope_interleaved": true,
      "mrope_section": [
        11,
        11,
        10
      ],
      "partial_rotary_factor": 0.25,
      "rope_theta": 10000000,
      "rope_type": "default"
    },
    "tie_word_embeddings": true,
    "use_cache": true,
    "vocab_size": 248320
  },
  "tie_word_embeddings": true,
  "transformers_version": "5.2.0",
  "video_token_id": 248057,
  "vision_config": {
    "deepstack_visual_indexes": [],
    "depth": 24,
    "hidden_act": "gelu_pytorch_tanh",
    "hidden_size": 1024,
    "in_channels": 3,
    "initializer_range": 0.02,
    "intermediate_size": 4096,
    "model_type": "qwen3_5",
    "num_heads": 16,
    "num_position_embeddings": 2304,
    "out_hidden_size": 2560,
    "patch_size": 16,
    "spatial_merge_size": 2,
    "temporal_patch_size": 2
  },
  "vision_end_token_id": 248054,
  "vision_start_token_id": 248053
}
datalab-logo.png ADDED
generation_config.json ADDED
@@ -0,0 +1,6 @@
{
  "_from_model_config": true,
  "eos_token_id": 248044,
  "transformers_version": "5.2.0",
  "use_cache": true
}
handwritten_form.png ADDED

Git LFS Details

  • SHA256: 5915e7608cf8a857b5b846d56673e62164ba17c106917480e09703a0b5a189ce
  • Pointer size: 131 Bytes
  • Size of remote file: 524 kB
model.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0804568be9f099d6479fad9ed77a4da4611f3c1e7bc6e009af7dce45e8aa3847
size 10591220088
multilingual.png ADDED
preprocessor_config.json ADDED
@@ -0,0 +1,27 @@
{
  "data_format": "channels_first",
  "do_convert_rgb": true,
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [
    0.5,
    0.5,
    0.5
  ],
  "image_processor_type": "Qwen2VLImageProcessorFast",
  "image_std": [
    0.5,
    0.5,
    0.5
  ],
  "merge_size": 2,
  "patch_size": 16,
  "resample": 3,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "longest_edge": 16777216,
    "shortest_edge": 65536
  },
  "temporal_patch_size": 2
}
processor_config.json ADDED
@@ -0,0 +1,63 @@
{
  "image_processor": {
    "data_format": "channels_first",
    "do_convert_rgb": true,
    "do_normalize": true,
    "do_rescale": true,
    "do_resize": true,
    "image_mean": [
      0.5,
      0.5,
      0.5
    ],
    "image_processor_type": "Qwen2VLImageProcessorFast",
    "image_std": [
      0.5,
      0.5,
      0.5
    ],
    "merge_size": 2,
    "patch_size": 16,
    "resample": 3,
    "rescale_factor": 0.00392156862745098,
    "size": {
      "longest_edge": 16777216,
      "shortest_edge": 65536
    },
    "temporal_patch_size": 2
  },
  "processor_class": "Qwen3VLProcessor",
  "video_processor": {
    "data_format": "channels_first",
    "default_to_square": true,
    "do_convert_rgb": true,
    "do_normalize": true,
    "do_rescale": true,
    "do_resize": true,
    "do_sample_frames": true,
    "fps": 2,
    "image_mean": [
      0.5,
      0.5,
      0.5
    ],
    "image_std": [
      0.5,
      0.5,
      0.5
    ],
    "max_frames": 768,
    "merge_size": 2,
    "min_frames": 4,
    "patch_size": 16,
    "resample": 3,
    "rescale_factor": 0.00392156862745098,
    "return_metadata": false,
    "size": {
      "longest_edge": 25165824,
      "shortest_edge": 4096
    },
    "temporal_patch_size": 2,
    "video_processor_type": "Qwen3VLVideoProcessor"
  }
}
tokenizer.json ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:87a7830d63fcf43bf241c3c5242e96e62dd3fdc29224ca26fed8ea333db72de4
size 19989343
tokenizer_config.json ADDED
@@ -0,0 +1,305 @@
{
  "add_prefix_space": false,
  "added_tokens_decoder": {
    "248044": {
      "content": "<|endoftext|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "248045": {
      "content": "<|im_start|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "248046": {
      "content": "<|im_end|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "248047": {
      "content": "<|object_ref_start|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "248048": {
      "content": "<|object_ref_end|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "248049": {
      "content": "<|box_start|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "248050": {
      "content": "<|box_end|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "248051": {
      "content": "<|quad_start|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "248052": {
      "content": "<|quad_end|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "248053": {
      "content": "<|vision_start|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "248054": {
      "content": "<|vision_end|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "248055": {
      "content": "<|vision_pad|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "248056": {
      "content": "<|image_pad|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "248057": {
      "content": "<|video_pad|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "248058": {
      "content": "<tool_call>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "248059": {
      "content": "</tool_call>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "248060": {
      "content": "<|fim_prefix|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "248061": {
      "content": "<|fim_middle|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "248062": {
      "content": "<|fim_suffix|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "248063": {
      "content": "<|fim_pad|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "248064": {
      "content": "<|repo_name|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "248065": {
      "content": "<|file_sep|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "248066": {
      "content": "<tool_response>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "248067": {
      "content": "</tool_response>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "248068": {
      "content": "<think>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "248069": {
      "content": "</think>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "248070": {
      "content": "<|audio_start|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "248071": {
      "content": "<|audio_end|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "248072": {
      "content": "<tts_pad>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "248073": {
      "content": "<tts_text_bos>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "248074": {
      "content": "<tts_text_eod>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "248075": {
      "content": "<tts_text_bos_single>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "248076": {
      "content": "<|audio_pad|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "additional_special_tokens": [
    "<|im_start|>",
    "<|im_end|>",
    "<|object_ref_start|>",
    "<|object_ref_end|>",
    "<|box_start|>",
    "<|box_end|>",
    "<|quad_start|>",
    "<|quad_end|>",
    "<|vision_start|>",
    "<|vision_end|>",
    "<|vision_pad|>",
    "<|image_pad|>",
    "<|video_pad|>"
  ],
  "bos_token": null,
  "chat_template": "{%- set image_count = namespace(value=0) %}\n{%- set video_count = namespace(value=0) %}\n{%- macro render_content(content, do_vision_count, is_system_content=false) %}\n    {%- if content is string %}\n        {{- content }}\n    {%- elif content is iterable and content is not mapping %}\n        {%- for item in content %}\n            {%- if 'image' in item or 'image_url' in item or item.type == 'image' %}\n                {%- if is_system_content %}\n                    {{- raise_exception('System message cannot contain images.') }}\n                {%- endif %}\n                {%- if do_vision_count %}\n                    {%- set image_count.value = image_count.value + 1 %}\n                {%- endif %}\n                {%- if add_vision_id %}\n                    {{- 'Picture ' ~ image_count.value ~ ': ' }}\n                {%- endif %}\n                {{- '<|vision_start|><|image_pad|><|vision_end|>' }}\n            {%- elif 'video' in item or item.type == 'video' %}\n                {%- if is_system_content %}\n                    {{- raise_exception('System message cannot contain videos.') }}\n                {%- endif %}\n                {%- if do_vision_count %}\n                    {%- set video_count.value = video_count.value + 1 %}\n                {%- endif %}\n                {%- if add_vision_id %}\n                    {{- 'Video ' ~ video_count.value ~ ': ' }}\n                {%- endif %}\n                {{- '<|vision_start|><|video_pad|><|vision_end|>' }}\n            {%- elif 'text' in item %}\n                {{- item.text }}\n            {%- else %}\n                {{- raise_exception('Unexpected item type in content.') }}\n            {%- endif %}\n        {%- endfor %}\n    {%- elif content is none or content is undefined %}\n        {{- '' }}\n    {%- else %}\n        {{- raise_exception('Unexpected content type.') }}\n    {%- endif %}\n{%- endmacro %}\n{%- if not messages %}\n    {{- raise_exception('No messages provided.') }}\n{%- endif %}\n{%- if tools and tools is iterable and tools is not mapping %}\n    {{- '<|im_start|>system\\n' }}\n    {{- \"# Tools\\n\\nYou have access to the following functions:\\n\\n<tools>\" }}\n    {%- for tool in tools %}\n        {{- \"\\n\" }}\n        {{- tool | tojson }}\n    {%- endfor %}\n    {{- \"\\n</tools>\" }}\n    {{- '\\n\\nIf you choose to call a function ONLY reply in the following format with NO suffix:\\n\\n<tool_call>\\n<function=example_function_name>\\n<parameter=example_parameter_1>\\nvalue_1\\n</parameter>\\n<parameter=example_parameter_2>\\nThis is the value for the second parameter\\nthat can span\\nmultiple lines\\n</parameter>\\n</function>\\n</tool_call>\\n\\n<IMPORTANT>\\nReminder:\\n- Function calls MUST follow the specified format: an inner <function=...></function> block must be nested within <tool_call></tool_call> XML tags\\n- Required parameters MUST be specified\\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after\\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls\\n</IMPORTANT>' }}\n    {%- if messages[0].role == 'system' %}\n        {%- set content = render_content(messages[0].content, false, true)|trim %}\n        {%- if content %}\n            {{- '\\n\\n' + content }}\n        {%- endif %}\n    {%- endif %}\n    {{- '<|im_end|>\\n' }}\n{%- else %}\n    {%- if messages[0].role == 'system' %}\n        {%- set content = render_content(messages[0].content, false, true)|trim %}\n        {{- '<|im_start|>system\\n' + content + '<|im_end|>\\n' }}\n    {%- endif %}\n{%- endif %}\n{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n{%- for message in messages[::-1] %}\n    {%- set index = (messages|length - 1) - loop.index0 %}\n    {%- if ns.multi_step_tool and message.role == \"user\" %}\n        {%- set content = render_content(message.content, false)|trim %}\n        {%- if not(content.startswith('<tool_response>') and content.endswith('</tool_response>')) %}\n            {%- set ns.multi_step_tool = false %}\n            {%- set ns.last_query_index = index %}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{%- if ns.multi_step_tool %}\n    {{- raise_exception('No user query found in messages.') }}\n{%- endif %}\n{%- for message in messages %}\n    {%- set content = render_content(message.content, true)|trim %}\n    {%- if message.role == \"system\" %}\n        {%- if not loop.first %}\n            {{- raise_exception('System message must be at the beginning.') }}\n        {%- endif %}\n    {%- elif message.role == \"user\" %}\n        {{- '<|im_start|>' + message.role + '\\n' + content + '<|im_end|>' + '\\n' }}\n    {%- elif message.role == \"assistant\" %}\n        {%- set reasoning_content = '' %}\n        {%- if message.reasoning_content is string %}\n            {%- set reasoning_content = message.reasoning_content %}\n        {%- else %}\n            {%- if '</think>' in content %}\n                {%- set reasoning_content = content.split('</think>')[0].rstrip('\\n').split('<think>')[-1].lstrip('\\n') %}\n                {%- set content = content.split('</think>')[-1].lstrip('\\n') %}\n            {%- endif %}\n        {%- endif %}\n        {%- set reasoning_content = reasoning_content|trim %}\n        {%- if loop.index0 > ns.last_query_index %}\n            {{- '<|im_start|>' + message.role + '\\n<think>\\n' + reasoning_content + '\\n</think>\\n\\n' + content }}\n        {%- else %}\n            {{- '<|im_start|>' + message.role + '\\n' + content }}\n        {%- endif %}\n        {%- if message.tool_calls and message.tool_calls is iterable and message.tool_calls is not mapping %}\n            {%- for tool_call in message.tool_calls %}\n                {%- if tool_call.function is defined %}\n                    {%- set tool_call = tool_call.function %}\n                {%- endif %}\n                {%- if loop.first %}\n                    {%- if content|trim %}\n                        {{- '\\n\\n<tool_call>\\n<function=' + tool_call.name + '>\\n' }}\n                    {%- else %}\n                        {{- '<tool_call>\\n<function=' + tool_call.name + '>\\n' }}\n                    {%- endif %}\n                {%- else %}\n                    {{- '\\n<tool_call>\\n<function=' + tool_call.name + '>\\n' }}\n                {%- endif %}\n                {%- if tool_call.arguments is defined %}\n                    {%- for args_name, args_value in tool_call.arguments|items %}\n                        {{- '<parameter=' + args_name + '>\\n' }}\n                        {%- set args_value = args_value | tojson | safe if args_value is mapping or (args_value is sequence and args_value is not string) else args_value | string %}\n                        {{- args_value }}\n                        {{- '\\n</parameter>\\n' }}\n                    {%- endfor %}\n                {%- endif %}\n                {{- '</function>\\n</tool_call>' }}\n            {%- endfor %}\n        {%- endif %}\n        {{- '<|im_end|>\\n' }}\n    {%- elif message.role == \"tool\" %}\n        {%- if loop.previtem and loop.previtem.role != \"tool\" %}\n            {{- '<|im_start|>user' }}\n        {%- endif %}\n        {{- '\\n<tool_response>\\n' }}\n        {{- content }}\n        {{- '\\n</tool_response>' }}\n        {%- if not loop.last and loop.nextitem.role != \"tool\" %}\n            {{- '<|im_end|>\\n' }}\n        {%- elif loop.last %}\n            {{- '<|im_end|>\\n' }}\n        {%- endif %}\n    {%- else %}\n        {{- raise_exception('Unexpected message role.') }}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- '<|im_start|>assistant\\n' }}\n    {%- if enable_thinking is defined and enable_thinking is false %}\n        {{- '<think>\\n\\n</think>\\n\\n' }}\n    {%- else %}\n        {{- '<think>\\n' }}\n    {%- endif %}\n{%- endif %}",
  "clean_up_tokenization_spaces": false,
  "eos_token": "<|im_end|>",
  "errors": "replace",
  "model_max_length": 262144,
  "pad_token": "<|endoftext|>",
  "split_special_tokens": false,
  "tokenizer_class": "Qwen2Tokenizer",
  "unk_token": null,
  "add_bos_token": false,
  "pretokenize_regex": "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?[\\p{L}\\p{M}]+|\\p{N}| ?[^\\s\\p{L}\\p{M}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
  "extra_special_tokens": {
    "audio_bos_token": "<|audio_start|>",
    "audio_eos_token": "<|audio_end|>",
    "audio_token": "<|audio_pad|>",
    "image_token": "<|image_pad|>",
    "video_token": "<|video_pad|>",
    "vision_bos_token": "<|vision_start|>",
    "vision_eos_token": "<|vision_end|>"
  }
}
video_preprocessor_config.json ADDED
@@ -0,0 +1,33 @@
{
  "data_format": "channels_first",
  "default_to_square": true,
  "do_convert_rgb": true,
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "do_sample_frames": true,
  "fps": 2,
  "image_mean": [
    0.5,
    0.5,
    0.5
  ],
  "image_std": [
    0.5,
    0.5,
    0.5
  ],
  "max_frames": 768,
  "merge_size": 2,
  "min_frames": 4,
  "patch_size": 16,
  "resample": 3,
  "rescale_factor": 0.00392156862745098,
  "return_metadata": false,
  "size": {
    "longest_edge": 25165824,
    "shortest_edge": 4096
  },
  "temporal_patch_size": 2,
  "video_processor_type": "Qwen3VLVideoProcessor"
}