Alexandre committed on
Commit
8595613
·
1 Parent(s): 6c9dd13
Files changed (5) hide show
  1. DockerFile +45 -0
  2. README.md +7 -7
  3. app.py +1677 -0
  4. assets/logo_numind_picto.svg +31 -0
  5. start.sh +40 -0
DockerFile ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Image for the NuExtract demo: a vLLM OpenAI-compatible server plus the Gradio
# UI in app.py, both started through start.sh.
# NOTE(review): ":latest" is a moving tag; pin a tested version for reproducible builds.
FROM vllm/vllm-openai:latest

ENV DEBIAN_FRONTEND=noninteractive
ENV PYTHONUNBUFFERED=1

# Defaults read by app.py through os.environ (and presumably by start.sh, which
# is not visible here — confirm).
ENV MODEL_NAME=NM-dev/NuExtract3.4_4B-RL-400
ENV OPENAI_API_BASE=http://127.0.0.1:8000/v1
ENV OPENAI_API_KEY=EMPTY
ENV GRADIO_SERVER_NAME=0.0.0.0
ENV GRADIO_SERVER_PORT=7860
ENV VLLM_PORT=8000
ENV MAX_MODEL_LEN=8192
ENV GPU_MEMORY_UTILIZATION=0.90
ENV NUEXTRACT_MAX_TOKENS=5000
ENV NUEXTRACT_EXAMPLE_DIR=/home/user/app/examples
ENV NUEXTRACT_ASSETS_DIR=/home/user/app/assets

# Model / compilation caches.
ENV HF_HOME=/data/.cache/huggingface
ENV TRANSFORMERS_CACHE=/data/.cache/huggingface
ENV VLLM_CACHE_ROOT=/data/.cache/vllm


RUN pip install --no-cache-dir \
    gradio \
    openai \
    pillow

# Best-effort: the account may already exist in the base image.
RUN useradd -m -u 1000 user || true

WORKDIR /home/user/app

COPY --chown=user:user app.py /home/user/app/app.py
COPY --chown=user:user start.sh /home/user/app/start.sh
COPY --chown=user:user examples /home/user/app/examples
COPY --chown=user:user assets /home/user/app/assets

# Legacy mirror of the assets under /home/azureuser/assets.
# NOTE(review): app.py resolves its assets from NUEXTRACT_ASSETS_DIR, so this
# copy looks unnecessary — confirm nothing else reads it before removing.
# Only the chown steps are best-effort; a failing mkdir/cp/chmod must fail the
# build instead of being hidden by "|| true".
# The cache directories are created here so the non-root user can write to them
# when no volume is mounted at /data.
RUN mkdir -p /home/azureuser/assets /data/.cache/huggingface /data/.cache/vllm && \
    cp -r /home/user/app/assets/* /home/azureuser/assets/ && \
    (chown -R user:user /home/user /home/azureuser /data || true) && \
    chmod +x /home/user/app/start.sh

USER user

EXPOSE 7860

# The vllm/vllm-openai base image defines its own ENTRYPOINT (the vLLM API
# server). Without resetting it, CMD would be appended to that entrypoint as
# arguments instead of running start.sh.
ENTRYPOINT []
CMD ["/home/user/app/start.sh"]
README.md CHANGED
@@ -1,10 +1,10 @@
1
  ---
2
- title: NuExtract3 4B
3
- emoji: 📚
4
- colorFrom: indigo
5
- colorTo: pink
6
  sdk: docker
 
7
  pinned: false
8
- ---
9
-
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: NuExtract 3
3
+ emoji: 📄
4
+ colorFrom: blue
5
+ colorTo: orange
6
  sdk: docker
7
+ app_port: 7860
8
  pinned: false
9
+ license: mit
10
+ ---
 
app.py ADDED
@@ -0,0 +1,1677 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import base64
3
+ import io
4
+ import json
5
+ import os
6
+ import re
7
+ from pathlib import Path
8
+ from typing import Any, Dict, Iterator, List, Optional, Tuple
9
+
10
+ import gradio as gr
11
+ from openai import OpenAI
12
+ from PIL import Image
13
+
14
+
15
# ---------------- Paths ----------------
# Directory that contains this script; relative CLI paths are resolved against it.
APP_DIR = Path(__file__).resolve().parents[0]
17
+
18
+
19
+ # ---------------- CLI / environment configuration ----------------
20
def parse_args() -> argparse.Namespace:
    """Build the demo configuration from CLI flags, falling back to environment variables.

    Each flag defaults to the matching environment variable and, if that is
    unset, to a built-in value. Unknown command-line arguments are ignored
    (``parse_known_args``).

    Returns:
        The parsed namespace (model_name, api_base, api_key, server_name,
        server_port, share, max_tokens, example_dir, assets_dir).
    """
    env = os.environ.get

    # (flag, add_argument keyword arguments), registered in this order.
    option_specs = [
        (
            "--model-name",
            dict(
                default=env("MODEL_NAME", "NM-dev/NuExtract3.4_4B-RL-400"),
                help="Model name served by the OpenAI-compatible endpoint.",
            ),
        ),
        (
            "--api-base",
            dict(
                default=env("OPENAI_API_BASE", "http://127.0.0.1:8000/v1"),
                help="OpenAI-compatible base URL.",
            ),
        ),
        (
            "--api-key",
            dict(
                default=env("OPENAI_API_KEY", "EMPTY"),
                help="API key for the OpenAI-compatible endpoint.",
            ),
        ),
        (
            "--server-name",
            dict(
                default=env("GRADIO_SERVER_NAME", "0.0.0.0"),
                help="Gradio server host.",
            ),
        ),
        (
            "--server-port",
            dict(
                type=int,
                default=int(env("GRADIO_SERVER_PORT", "7860")),
                help="Gradio server port.",
            ),
        ),
        (
            "--share",
            dict(
                action="store_true",
                default=env("GRADIO_SHARE", "false").lower() in {"1", "true", "yes"},
                help="Create a public Gradio share link.",
            ),
        ),
        (
            "--max-tokens",
            dict(
                type=int,
                default=int(env("NUEXTRACT_MAX_TOKENS", "5000")),
                help="Maximum tokens for model generation. Hidden from the UI.",
            ),
        ),
        (
            "--example-dir",
            dict(
                default=env("NUEXTRACT_EXAMPLE_DIR", str(APP_DIR / "examples")),
                help="Directory containing image examples.",
            ),
        ),
        (
            "--assets-dir",
            dict(
                default=env("NUEXTRACT_ASSETS_DIR", str(APP_DIR / "assets")),
                help="Directory containing static assets such as the NuExtract logo.",
            ),
        ),
    ]

    cli = argparse.ArgumentParser(description="NuExtract Gradio demo")
    for flag, options in option_specs:
        cli.add_argument(flag, **options)

    known, _unknown = cli.parse_known_args()
    return known
74
+
75
+
76
def resolve_dir(path_like: str) -> Path:
    """Return ``path_like`` as a resolved path.

    ``~`` is expanded first; a relative path is anchored at ``APP_DIR``.
    """
    candidate = Path(path_like).expanduser()
    anchored = candidate if candidate.is_absolute() else APP_DIR / candidate
    return anchored.resolve()
81
+
82
+
83
# Configuration is parsed once, at import time.
ARGS = parse_args()

DEFAULT_MODEL = ARGS.model_name
DEFAULT_API_BASE = ARGS.api_base
DEFAULT_API_KEY = ARGS.api_key
DEFAULT_MAX_TOKENS = ARGS.max_tokens
EXAMPLE_DIR = resolve_dir(ARGS.example_dir)
ASSETS_DIR = resolve_dir(ARGS.assets_dir)

# The logo is optional: when the file is missing the URL stays empty and a
# message is printed instead of failing.
LOGO_PATH = ASSETS_DIR / "logo_numind_picto.svg"
if not LOGO_PATH.exists():
    print(f"[assets] Missing logo: {LOGO_PATH}", flush=True)
    LOGO_URL = ""
else:
    LOGO_URL = f"/gradio_api/file={LOGO_PATH}"
    gr.set_static_paths(paths=[ASSETS_DIR])

SYSTEM_PROMPT_DEFAULT = (
    "You are a precise information extraction assistant. Return faithful, source-grounded results only."
)
104
+
105
+
106
+ # ---------------- Structured extraction examples ----------------
107
+ # These examples populate: Image + Template + Instructions.
108
# Extraction template shown with example image "1.jpg".
_TEMPLATE_1_JPG: Dict[str, Any] = {
    "game_name": "verbatim-string",
    "game_company_issuer_name": "string",
    "currency_code_iso4217": "string",
    "game_price": "number",
    "game_maximum_possible_gain": "number",
    "matched_winning_numbers": ["integer"],
    "matched_winning_symbols": ["string"],
    "gain": "number",
}

# Extraction template shown with example image "2.png".
_TEMPLATE_2_PNG: Dict[str, Any] = {
    "number_of_bathrooms": "integer",
    "number_of_toilets": "integer",
    "number_of_fireplaces": "integer",
    "number_of_closets": "integer",
    "distance_unit": ["meter", "foot"],
    "rooms_that_are_not_bedrooms_or_corridors_or_toilets": [
        {
            "room_name": "verbatim-string",
            "surface_area": "number",
            "number_of_windows": "integer",
            "number_of_doors": "integer",
        }
    ],
    "bedrooms": [
        {
            "bedroom_name": "verbatim-string",
            "surface_area": "number",
            "number_of_windows": "integer",
            "has_closet": "boolean",
            "has_private_bathroom": "boolean",
        }
    ],
    "has_laundry_room": "boolean",
    "has_terrace": "boolean",
    "has_balcony": "boolean",
    "number_of_parking_spaces_in_garage": "integer",
    "number_of_parking_spaces_exterior": "integer",
}

# Example image filename -> extraction template (insertion order is the display order).
STRUCTURED_EXAMPLE_TEMPLATES: Dict[str, Dict[str, Any]] = {
    "1.jpg": _TEMPLATE_1_JPG,
    "2.png": _TEMPLATE_2_PNG,
}

# Example image filename -> free-text instructions (none for the bundled examples).
STRUCTURED_EXAMPLE_INSTRUCTIONS: Dict[str, str] = {"1.jpg": "", "2.png": ""}
154
+
155
+
156
+ # ---------------- Markdown/OCR examples ----------------
157
+ # Put Markdown example image paths here.
158
+ # These examples populate only the Image input and are meant for the
159
+ # “Convert to Markdown” button.
160
# Image files (relative to the example directory unless absolute) offered as
# Markdown-conversion examples.
MARKDOWN_EXAMPLE_IMAGE_PATHS: List[str] = ["3.jpg", "4.jpg", "5.jpg", "6.png", "7.jpg"]
167
+
168
+
169
def resolve_example_path(path_like: str) -> Path:
    """Return ``path_like`` as a resolved path.

    ``~`` is expanded first; a relative path is anchored at ``EXAMPLE_DIR``.
    """
    candidate = Path(path_like).expanduser()
    anchored = candidate if candidate.is_absolute() else EXAMPLE_DIR / candidate
    return anchored.resolve()
174
+
175
+
176
def build_structured_examples() -> List[List[Any]]:
    """Build the ``[image path, template JSON, instructions]`` example rows.

    One row is produced per entry of ``STRUCTURED_EXAMPLE_TEMPLATES`` whose
    image exists on disk; missing images are reported on stdout and skipped.
    """
    rows: List[List[Any]] = []

    for name in STRUCTURED_EXAMPLE_TEMPLATES:
        resolved = resolve_example_path(name)

        if resolved.exists():
            rows.append(
                [
                    str(resolved),
                    json.dumps(STRUCTURED_EXAMPLE_TEMPLATES[name], indent=4, ensure_ascii=False),
                    STRUCTURED_EXAMPLE_INSTRUCTIONS.get(name, ""),
                ]
            )
        else:
            print(f"[structured examples] Missing image: {resolved}", flush=True)

    return rows
195
+
196
+
197
def build_markdown_examples() -> List[List[Any]]:
    """Build the single-column ``[image path]`` example rows.

    One row is produced per entry of ``MARKDOWN_EXAMPLE_IMAGE_PATHS`` whose
    image exists on disk; missing images are reported on stdout and skipped.
    """
    rows: List[List[Any]] = []

    for entry in MARKDOWN_EXAMPLE_IMAGE_PATHS:
        resolved = resolve_example_path(entry)

        if resolved.exists():
            rows.append([str(resolved)])
        else:
            print(f"[markdown examples] Missing image: {resolved}", flush=True)

    return rows
210
+
211
+
212
# Example rows are computed once at import time (structured first, then Markdown).
STRUCTURED_EXAMPLES, MARKDOWN_EXAMPLES = build_structured_examples(), build_markdown_examples()
214
+
215
+
216
+ # ---------------- Utility helpers ----------------
217
def image_bytes_to_base64(b: bytes) -> str:
    """Return the standard Base64 encoding of ``b`` as text."""
    encoded = base64.b64encode(b)
    return str(encoded, "utf-8")
219
+
220
+
221
def ensure_rgb_image(image_bytes: bytes) -> Image.Image:
    """Decode ``image_bytes`` with Pillow and return it in RGB mode.

    Images already in RGB are returned as opened; any other mode is converted.
    """
    picture = Image.open(io.BytesIO(image_bytes))
    return picture if picture.mode == "RGB" else picture.convert("RGB")
226
+
227
+
228
def file_path_to_bytes(path: str) -> bytes:
    """Read the file at ``path`` in binary mode and return its full contents."""
    with open(path, mode="rb") as stream:
        data = stream.read()
    return data
231
+
232
+
233
+ # ---------------- Response parsing ----------------
234
def strip_code_fence(payload: str) -> str:
    """Remove a Markdown code fence that wraps the whole payload.

    Only an opening fence at the very start (optionally tagged ``json``,
    ``markdown`` or ``text``, case-insensitive) and a closing fence at the
    very end are removed. Fences inside the payload are left untouched.

    Args:
        payload: Raw model output.

    Returns:
        The payload without the outer fence, stripped of surrounding whitespace.
    """
    # \A / \Z anchor to the whole string. The previous ^ / $ with re.MULTILINE
    # matched at every line boundary and therefore also deleted fences of code
    # blocks embedded in the middle of the text.
    return re.sub(
        r"\A```(?:json|markdown|text)?\s*|\s*```\Z",
        "",
        payload.strip(),
        flags=re.IGNORECASE,
    ).strip()
241
+
242
+
243
def pretty_json_or_text(payload: str) -> str:
    """Pretty-print ``payload`` as JSON when possible, otherwise return it as text.

    The payload is passed through ``strip_code_fence`` first. If the result
    parses as JSON it is re-serialised with a 4-space indent and non-ASCII
    characters preserved; if not, the fence-stripped text is returned. An
    empty payload yields ``""``.
    """
    if payload:
        body = strip_code_fence(payload)
        try:
            parsed = json.loads(body)
            return json.dumps(parsed, indent=4, ensure_ascii=False)
        except Exception:
            return body

    return ""
253
+
254
+
255
def extract_answer_block(text: str) -> str:
    """Pull the answer out of raw model text.

    Tried in order:
      1. the content of the first ``<answer>...</answer>`` pair (case-insensitive);
      2. the longest ``{...}`` span found in the text;
      3. the text itself, stripped.
    Results of 1 and 2 are formatted with ``pretty_json_or_text``.
    """
    if not text:
        return ""

    try:
        tagged = re.search(
            r"<answer>\s*(.*?)\s*</answer>",
            text,
            flags=re.DOTALL | re.IGNORECASE,
        )
        if tagged is not None:
            return pretty_json_or_text(tagged.group(1).strip())
    except Exception:
        # Best-effort: fall through to the brace-based extraction below.
        pass

    brace_spans = [found.group(0) for found in re.finditer(r"\{[\s\S]*\}", text)]
    if brace_spans:
        return pretty_json_or_text(max(brace_spans, key=len))

    return text.strip()
276
+
277
+
278
def split_reasoning_and_output(text: str, reasoning_enabled: bool) -> Tuple[str, str]:
    """Split model text into ``(reasoning, output)`` at the first ``</think>`` tag.

    Args:
        text: Raw (possibly partial) model output.
        reasoning_enabled: When false, no split is attempted.

    Returns:
        ``("", "")`` for empty text; ``("", stripped text)`` when reasoning is
        disabled; ``(text before the tag, text after the tag)`` when the tag is
        present; ``(stripped text, "")`` while the tag has not appeared yet.
        All returned parts are stripped.
    """
    if not text:
        return "", ""

    if not reasoning_enabled:
        return "", text.strip()

    # Search the original string directly. The previous implementation located
    # the tag in text.lower() and reused that index on text, but lower() can
    # change the string length (e.g. "İ" becomes two code points), which
    # shifted the split point. re.ASCII keeps the matching to plain ASCII case
    # folding of the tag.
    closing = re.search(r"</think>", text, flags=re.IGNORECASE | re.ASCII)
    if closing is None:
        return text.strip(), ""

    return text[:closing.start()].strip(), text[closing.end():].strip()
295
+
296
+
297
+ # ---------------- Message building ----------------
298
def make_text_content(text: str) -> List[Dict[str, Any]]:
    """Wrap ``text`` (``""`` when falsy) in a one-element chat content list."""
    part: Dict[str, Any] = dict(type="text", text=text or "")
    return [part]
300
+
301
+
302
def make_image_content(
    image_bytes: bytes,
    extra_text: Optional[str] = None,
) -> List[Dict[str, Any]]:
    """Build chat content holding an image and, optionally, a text part.

    The image is converted to RGB, re-encoded as JPEG (quality 95) and embedded
    as a Base64 ``data:`` URL with ``detail`` set to ``"high"``. A text part
    with the stripped ``extra_text`` follows when it is non-blank.
    """
    jpeg_buffer = io.BytesIO()
    ensure_rgb_image(image_bytes).save(jpeg_buffer, format="JPEG", quality=95)
    encoded = image_bytes_to_base64(jpeg_buffer.getvalue())

    image_part: Dict[str, Any] = {
        "type": "image_url",
        "image_url": {
            "url": f"data:image/jpeg;base64,{encoded}",
            "detail": "high",
        },
    }
    parts: List[Dict[str, Any]] = [image_part]

    if extra_text and extra_text.strip():
        parts.append({"type": "text", "text": extra_text.strip()})

    return parts
326
+
327
+
328
def normalize_template(template: str) -> str:
    """Canonicalise a user-supplied extraction template.

    Blank or missing input becomes ``"{}"``. Valid JSON is re-serialised with a
    4-space indent and non-ASCII characters preserved. Anything else is
    returned stripped but otherwise unchanged.
    """
    stripped = (template or "").strip()

    if stripped == "":
        return "{}"

    try:
        parsed = json.loads(stripped)
        return json.dumps(parsed, indent=4, ensure_ascii=False)
    except Exception:
        return stripped
338
+
339
+
340
def collate_single_input(
    *,
    text_or_image: Any,
    template: str,
    system_prompt: Optional[str],
    instruction: Optional[str],
) -> Tuple[List[Dict[str, Any]], str]:
    """Build the chat messages for one extraction request.

    Args:
        text_or_image: Either a dict with a ``"bytes"`` key (image input) or a
            value that is rendered with ``str()`` (text input).
        template: Extraction template text; normalised with ``normalize_template``.
        system_prompt: Optional system message, added first when truthy.
        instruction: Optional free-text instructions.

    Returns:
        ``(messages, template_json)`` where ``template_json`` is the normalised
        template. Non-blank instructions and a non-empty template are appended
        to the user message as extra text.
    """
    template_json = normalize_template(template)

    messages: List[Dict[str, Any]] = (
        [{"role": "system", "content": system_prompt}] if system_prompt else []
    )

    # Optional sections appended to the user turn, instructions first.
    sections: List[str] = []
    if instruction and instruction.strip():
        sections.append(f"Instructions:\n{instruction.strip()}")
    if template_json and template_json.strip() not in {"{}", ""}:
        sections.append(f"Extraction template:\n```json\n{template_json}\n```")
    extra_text_for_user = "\n\n".join(sections) if sections else None

    if isinstance(text_or_image, dict) and "bytes" in text_or_image:
        user_content = make_image_content(
            image_bytes=text_or_image["bytes"],
            extra_text=extra_text_for_user,
        )
    else:
        body = str(text_or_image or "")
        if extra_text_for_user:
            body = f"{body}\n\n{extra_text_for_user}".strip()
        user_content = make_text_content(body)

    messages.append({"role": "user", "content": user_content})
    return messages, template_json
384
+
385
+
386
def collate_for_template_generation(
    *,
    context_text: str,
    context_image_path: Optional[str],
    system_prompt: Optional[str],
) -> List[Dict[str, Any]]:
    """Build the chat messages asking the model to propose an extraction template.

    When ``context_image_path`` is set, the image file is read and sent with
    the guidance text; otherwise the stripped ``context_text`` is sent,
    followed by the guidance. A system message is added first when
    ``system_prompt`` is truthy.
    """
    guidance = (
        "Generate a concise JSON extraction template for this document. "
        "Use descriptive field names and simple type hints like string, number, YYYY-MM-DD, "
        "boolean, or arrays of objects. Return only the JSON template."
    )

    messages: List[Dict[str, Any]] = (
        [{"role": "system", "content": system_prompt}] if system_prompt else []
    )

    if context_image_path:
        user_content = make_image_content(
            image_bytes=file_path_to_bytes(context_image_path),
            extra_text=guidance,
        )
    else:
        document = (context_text or "").strip()
        user_content = make_text_content(f"{document}\n\n{guidance}".strip())

    messages.append({"role": "user", "content": user_content})
    return messages
423
+
424
+
425
def collate_markdown_image_only(
    *,
    image_bytes: bytes,
    system_prompt: Optional[str],
) -> List[Dict[str, Any]]:
    """Build chat messages that contain only the image (no extra user text).

    A system message is added first when ``system_prompt`` is truthy.
    """
    user_turn: Dict[str, Any] = {
        "role": "user",
        "content": make_image_content(image_bytes=image_bytes),
    }

    if system_prompt:
        return [{"role": "system", "content": system_prompt}, user_turn]
    return [user_turn]
443
+
444
+
445
+ # ---------------- Model calls ----------------
446
def chunk_to_text(chunk: Any) -> str:
    """Extract the text delta carried by one streaming chunk.

    Reads ``chunk.choices[0].delta.content``. A string is returned as is; a
    list has the truthy ``text`` values of its items (dict key or attribute)
    concatenated. Anything else, or any error raised along the way, yields ``""``.
    """
    try:
        choices = getattr(chunk, "choices", None) if chunk else None
        if not choices:
            return ""

        delta = getattr(chunk.choices[0], "delta", None)
        content = None if delta is None else getattr(delta, "content", None)

        if isinstance(content, str):
            return content

        if isinstance(content, list):
            pieces: List[str] = []
            for entry in content:
                if isinstance(entry, dict) and entry.get("text"):
                    pieces.append(entry["text"])
                elif getattr(entry, "text", None):
                    pieces.append(entry.text)
            return "".join(pieces)

        return ""
    except Exception:
        return ""
473
+
474
+
475
def build_chat_template_kwargs(
    *,
    template_json: str,
    reasoning: bool,
    instruction: Optional[str],
    markdown_mode: bool,
) -> Dict[str, Any]:
    """Assemble the ``chat_template_kwargs`` dictionary for a request.

    Returns:
        A dict with ``mode`` and ``enable_thinking``. In Markdown mode the mode
        is ``"markdown"`` and nothing else is added. Otherwise the mode is
        ``"structured"`` when the template is non-blank and not ``"{}"`` (the
        template is then included under ``template``), else ``"content"``.
        Non-blank instructions are included, stripped, under ``instructions``.
    """
    thinking = bool(reasoning)

    if markdown_mode:
        return {"mode": "markdown", "enable_thinking": thinking}

    stripped_template = template_json.strip() if template_json else ""
    structured = stripped_template not in ("", "{}")

    kwargs: Dict[str, Any] = {
        "mode": "structured" if structured else "content",
        "enable_thinking": thinking,
    }
    if structured:
        kwargs["template"] = template_json

    cleaned_instruction = instruction.strip() if instruction else ""
    if cleaned_instruction:
        kwargs["instructions"] = cleaned_instruction

    return kwargs
506
+
507
+
508
def call_model_stream(
    *,
    api_base: str,
    api_key: str,
    model_name: str,
    messages: List[Dict[str, Any]],
    template_json: str,
    temperature: float,
    max_tokens: int,
    reasoning: bool,
    instruction: Optional[str],
    markdown_mode: bool,
) -> Iterator[str]:
    """Stream a chat completion and yield the text accumulated so far.

    A new ``OpenAI`` client is created per call. The chat-template options from
    ``build_chat_template_kwargs`` are sent in ``extra_body`` under
    ``chat_template_kwargs``.

    Yields:
        The full text received so far, once per chunk that carried text.
    """
    template_options = build_chat_template_kwargs(
        template_json=template_json,
        reasoning=reasoning,
        instruction=instruction,
        markdown_mode=markdown_mode,
    )

    response_stream = OpenAI(base_url=api_base, api_key=api_key).chat.completions.create(
        model=model_name,
        temperature=float(temperature),
        max_tokens=int(max_tokens),
        messages=messages,
        stream=True,
        extra_body={"chat_template_kwargs": template_options},
    )

    so_far = ""
    for piece in map(chunk_to_text, response_stream):
        # NOTE(review): the yield is assumed to sit inside this condition; the
        # original indentation was not available — confirm.
        if piece:
            so_far += piece
            yield so_far
546
+
547
+
548
def call_model_once(
    *,
    api_base: str,
    api_key: str,
    model_name: str,
    messages: List[Dict[str, Any]],
    mode: str,
    temperature: float,
    max_tokens: int,
) -> str:
    """Run one non-streaming chat completion and return the reply text.

    A new ``OpenAI`` client is created per call. ``mode`` is sent in
    ``extra_body`` under ``chat_template_kwargs`` with thinking disabled.

    Returns:
        The content of the first choice, or ``""`` when the response has no
        choices or the message carries no content.
    """
    client = OpenAI(base_url=api_base, api_key=api_key)

    chat = client.chat.completions.create(
        model=model_name,
        temperature=float(temperature),
        max_tokens=int(max_tokens),
        messages=messages,
        extra_body={
            "chat_template_kwargs": {
                "mode": mode,
                "enable_thinking": False,
            }
        },
    )

    if not chat.choices:
        return ""

    # message.content may be None; normalise so the declared str return type holds.
    return chat.choices[0].message.content or ""
574
+
575
+
576
+ # ---------------- Inference orchestration ----------------
577
def prepare_input(context_text: str, context_image_path: Optional[str]) -> Any:
    """Pick the request input: the image takes precedence over the text.

    Returns ``{"bytes": <file contents>}`` when an image path is given,
    otherwise ``context_text`` (``""`` when falsy).
    """
    if not context_image_path:
        return context_text or ""

    return {"bytes": file_path_to_bytes(context_image_path)}
582
+
583
+
584
def infer_stream(
    *,
    api_key: str,
    api_base: str,
    system_prompt: str,
    template: str,
    instruction: str,
    context_text: str,
    context_image_path: Optional[str],
    temperature: float,
    reasoning: bool,
    markdown_mode: bool,
):
    """Run a streaming inference and yield display-ready updates.

    Uses ``DEFAULT_MODEL`` and ``DEFAULT_MAX_TOKENS``. For each partial text
    from ``call_model_stream`` a dict is yielded with keys ``mode``
    (``"markdown"`` or ``"structured"``), ``output`` (text to show) and
    ``think`` (reasoning trace, ``""`` when reasoning is off).

    Raises:
        ValueError: In Markdown mode when no image input was provided (raised
            when the generator is first advanced).
    """
    payload = prepare_input(context_text, context_image_path)
    has_image = isinstance(payload, dict) and "bytes" in payload

    if not markdown_mode:
        messages, template_json = collate_single_input(
            text_or_image=payload,
            template=template,
            system_prompt=system_prompt,
            instruction=instruction,
        )
    elif has_image:
        messages = collate_markdown_image_only(
            image_bytes=payload["bytes"],
            system_prompt=system_prompt,
        )
        template_json = ""
    else:
        raise ValueError("Markdown conversion requires an image input.")

    partials = call_model_stream(
        api_base=api_base,
        api_key=api_key,
        model_name=DEFAULT_MODEL,
        messages=messages,
        template_json=template_json,
        temperature=temperature,
        max_tokens=DEFAULT_MAX_TOKENS,
        reasoning=reasoning,
        instruction=instruction,
        markdown_mode=markdown_mode,
    )

    for partial_text in partials:
        trace, output_text = split_reasoning_and_output(
            partial_text,
            reasoning_enabled=reasoning,
        )
        shown_trace = trace if reasoning else ""

        if markdown_mode:
            if reasoning:
                placeholder = "_(Waiting for output after `</think>`.)_"
            else:
                placeholder = "_(Empty output.)_"

            yield {
                "mode": "markdown",
                "output": output_text or placeholder,
                "think": shown_trace,
            }
            continue

        # Without reasoning the raw (unstripped) partial text is what gets parsed.
        if not reasoning:
            output_text = partial_text or ""

        if reasoning:
            placeholder = "_(Waiting for output after `</think>`.)_"
        else:
            placeholder = "_(No output found yet.)_"
        output_display = extract_answer_block(output_text) or placeholder

        if output_display.strip().startswith(("{", "[")):
            # Looks like JSON: pretty-print when it parses and show it fenced.
            formatted = pretty_json_or_text(output_display)
            output_display = f"```json\n{formatted}\n```"
        else:
            # Turn literal backslash-n sequences into real line breaks.
            output_display = output_display.replace("\\n", "\n")

        yield {
            "mode": "structured",
            "output": output_display,
            "think": shown_trace,
        }
670
+
671
+
672
def infer_template_generation(
    *,
    api_key: str,
    api_base: str,
    system_prompt: str,
    context_text: str,
    context_image_path: Optional[str],
    temperature: float,
) -> str:
    """Ask the model for an extraction template for the given document.

    Sends one non-streaming request in ``"template-generation"`` mode using
    ``DEFAULT_MODEL`` and ``DEFAULT_MAX_TOKENS``, and returns the reply
    formatted with ``pretty_json_or_text``.
    """
    request_messages = collate_for_template_generation(
        context_text=context_text,
        context_image_path=context_image_path,
        system_prompt=system_prompt,
    )

    return pretty_json_or_text(
        call_model_once(
            api_base=api_base,
            api_key=api_key,
            model_name=DEFAULT_MODEL,
            messages=request_messages,
            mode="template-generation",
            temperature=temperature,
            max_tokens=DEFAULT_MAX_TOKENS,
        )
    )
698
+
699
+
700
+ # ---------------- UI styling ----------------
701
+ CSS = """
702
+ :root {
703
+ color-scheme: light;
704
+ --bg: #f6f2eb;
705
+ --panel: #ffffff;
706
+ --panel-rgb: 255, 255, 255;
707
+ --panel-strong-rgb: 255, 252, 246;
708
+ --input-rgb: 255, 255, 255;
709
+ --border-blue: rgba(67, 111, 148, 0.30);
710
+ --border-blue-soft: rgba(67, 111, 148, 0.18);
711
+ --border-input: rgba(67, 111, 148, 0.22);
712
+ --border-orange-soft: rgba(190, 103, 36, 0.26);
713
+ --text: #23252b;
714
+ --text-strong: #101318;
715
+ --text-on-accent: #101318;
716
+ --muted: #5f6673;
717
+ --muted-2: #7d8490;
718
+ --logo-blue: #5d9bcf;
719
+ --logo-orange: #d6742f;
720
+ --green: #178f66;
721
+ --card-alpha: 0.88;
722
+ --header-alpha: 0.82;
723
+ --input-alpha: 0.94;
724
+ --shadow: rgba(54, 46, 35, 0.14);
725
+ --inset-highlight: rgba(255, 255, 255, 0.85);
726
+ --logo-opacity: 0.18;
727
+ --focus-ring: rgba(67, 111, 148, 0.26);
728
+ --code-bg: #fdfaf5;
729
+ --dropzone-bg: #fbf8f2;
730
+ }
731
+
732
+ html.dark,
733
+ body.dark,
734
+ .dark,
735
+ [data-theme="dark"] {
736
+ color-scheme: dark;
737
+ --bg: #242529;
738
+ --panel: #1d1f26;
739
+ --panel-rgb: 29, 31, 38;
740
+ --panel-strong-rgb: 21, 22, 26;
741
+ --input-rgb: 12, 14, 19;
742
+ --border-blue: rgba(135, 183, 224, 0.24);
743
+ --border-blue-soft: rgba(135, 183, 224, 0.16);
744
+ --border-input: rgba(135, 183, 224, 0.14);
745
+ --border-orange-soft: rgba(228, 132, 58, 0.22);
746
+ --text: #eef0f4;
747
+ --text-strong: #ffffff;
748
+ --text-on-accent: #101318;
749
+ --muted: #969baa;
750
+ --muted-2: #737988;
751
+ --logo-blue: #87b7e0;
752
+ --logo-orange: #e4843a;
753
+ --green: #31c48d;
754
+ --card-alpha: 0.66;
755
+ --header-alpha: 0.42;
756
+ --input-alpha: 0.78;
757
+ --shadow: rgba(0, 0, 0, 0.28);
758
+ --inset-highlight: rgba(255, 255, 255, 0.055);
759
+ --logo-opacity: 0.88;
760
+ --focus-ring: rgba(135, 183, 224, 0.32);
761
+ --code-bg: rgba(12, 14, 19, 0.78);
762
+ --dropzone-bg: rgba(12, 14, 19, 0.78);
763
+ }
764
+
765
+ @media (prefers-color-scheme: dark) {
766
+ :root:not([data-theme="light"]) {
767
+ color-scheme: dark;
768
+ --bg: #242529;
769
+ --panel: #1d1f26;
770
+ --panel-rgb: 29, 31, 38;
771
+ --panel-strong-rgb: 21, 22, 26;
772
+ --input-rgb: 12, 14, 19;
773
+ --border-blue: rgba(135, 183, 224, 0.24);
774
+ --border-blue-soft: rgba(135, 183, 224, 0.16);
775
+ --border-input: rgba(135, 183, 224, 0.14);
776
+ --border-orange-soft: rgba(228, 132, 58, 0.22);
777
+ --text: #eef0f4;
778
+ --text-strong: #ffffff;
779
+ --text-on-accent: #101318;
780
+ --muted: #969baa;
781
+ --muted-2: #737988;
782
+ --logo-blue: #87b7e0;
783
+ --logo-orange: #e4843a;
784
+ --green: #31c48d;
785
+ --card-alpha: 0.66;
786
+ --header-alpha: 0.42;
787
+ --input-alpha: 0.78;
788
+ --shadow: rgba(0, 0, 0, 0.28);
789
+ --inset-highlight: rgba(255, 255, 255, 0.055);
790
+ --logo-opacity: 0.88;
791
+ --focus-ring: rgba(135, 183, 224, 0.32);
792
+ --code-bg: rgba(12, 14, 19, 0.78);
793
+ --dropzone-bg: rgba(12, 14, 19, 0.78);
794
+ }
795
+ }
796
+
797
+ html,
798
+ body,
799
+ footer,
800
+ .gradio-container {
801
+ color: var(--text) !important;
802
+ }
803
+
804
+ body {
805
+ background: var(--bg) !important;
806
+ background-attachment: fixed !important;
807
+ }
808
+
809
+ footer {
810
+ background: transparent !important;
811
+ }
812
+
813
+ .gradio-container {
814
+ position: relative !important;
815
+ isolation: isolate !important;
816
+ max-width: 1680px !important;
817
+ padding: 10px 18px 18px 18px !important;
818
+ background: transparent !important;
819
+ }
820
+
821
+ .gradio-container::before {
822
+ content: "";
823
+ position: fixed;
824
+ inset: 0;
825
+ z-index: -2;
826
+ pointer-events: none;
827
+ background-image: url("__LOGO_URL__");
828
+ background-repeat: no-repeat;
829
+ background-size: min(86vw, 980px) min(86vw, 980px);
830
+ background-position: calc(100% + 230px) 34px;
831
+ opacity: var(--logo-opacity);
832
+ filter: saturate(1.2) drop-shadow(0 0 28px rgba(135, 183, 224, 0.14));
833
+ }
834
+
835
+ .with-gap,
836
+ .gradio-row {
837
+ gap: 18px !important;
838
+ }
839
+
840
+ .app-header {
841
+ position: relative;
842
+ display: flex;
843
+ align-items: center;
844
+ justify-content: space-between;
845
+ gap: 16px;
846
+ padding: 10px 12px 14px 12px;
847
+ margin-bottom: 10px;
848
+ border-bottom: 1px solid var(--border-blue-soft);
849
+ background: rgba(var(--panel-strong-rgb), var(--header-alpha));
850
+ border-radius: 14px;
851
+ backdrop-filter: blur(8px);
852
+ box-shadow: 0 12px 42px var(--shadow), inset 0 1px 0 var(--inset-highlight);
853
+ }
854
+
855
+ .brand {
856
+ display: flex;
857
+ align-items: center;
858
+ gap: 10px;
859
+ }
860
+
861
+ .brand-mark {
862
+ width: 28px;
863
+ height: 28px;
864
+ flex: 0 0 auto;
865
+ object-fit: contain;
866
+ }
867
+
868
+ .brand-title {
869
+ display: flex;
870
+ align-items: baseline;
871
+ gap: 8px;
872
+ }
873
+
874
+ .brand-name {
875
+ font-size: 23px;
876
+ line-height: 1;
877
+ font-weight: 750;
878
+ letter-spacing: -0.045em;
879
+ color: var(--text-strong) !important;
880
+ }
881
+
882
+ .brand-name span {
883
+ color: var(--muted) !important;
884
+ }
885
+
886
+ .model-chip {
887
+ display: inline-flex;
888
+ align-items: center;
889
+ max-width: 520px;
890
+ padding: 5px 9px;
891
+ border-radius: 999px;
892
+ background: rgba(var(--panel-rgb), 0.88);
893
+ border: 1px solid var(--border-blue-soft);
894
+ color: var(--muted) !important;
895
+ font-size: 12px;
896
+ white-space: nowrap;
897
+ overflow: hidden;
898
+ text-overflow: ellipsis;
899
+ }
900
+
901
+ .model-chip code {
902
+ color: var(--text-strong) !important;
903
+ background: transparent !important;
904
+ }
905
+
906
+ .header-actions {
907
+ display: flex;
908
+ align-items: center;
909
+ gap: 10px;
910
+ color: var(--muted) !important;
911
+ font-size: 13px;
912
+ }
913
+
914
+ .status-dot {
915
+ width: 8px;
916
+ height: 8px;
917
+ border-radius: 99px;
918
+ background: var(--green);
919
+ box-shadow: 0 0 14px rgba(49, 196, 141, 0.65);
920
+ }
921
+
922
+ .intro-card {
923
+ margin: 0 0 16px 0;
924
+ padding: 14px 16px;
925
+ border-radius: 14px;
926
+ background: rgba(var(--panel-rgb), var(--card-alpha));
927
+ border: 1px solid var(--border-blue-soft);
928
+ box-shadow: 0 12px 42px var(--shadow), inset 0 1px 0 var(--inset-highlight);
929
+ backdrop-filter: blur(8px);
930
+ }
931
+
932
+ .intro-card p {
933
+ margin: 0 0 8px 0;
934
+ line-height: 1.5;
935
+ }
936
+
937
+ .section-title {
938
+ margin: 0 0 8px 0;
939
+ color: var(--text-strong) !important;
940
+ font-size: 13px;
941
+ font-weight: 750;
942
+ letter-spacing: 0.01em;
943
+ }
944
+
945
+ .main-card,
946
+ .output-card,
947
+ .gradio-group {
948
+ background: rgba(var(--panel-rgb), var(--card-alpha)) !important;
949
+ border: 1px solid var(--border-blue) !important;
950
+ border-radius: 14px !important;
951
+ box-shadow: 0 22px 70px var(--shadow), inset 0 1px 0 var(--inset-highlight) !important;
952
+ backdrop-filter: blur(10px) saturate(1.18);
953
+ }
954
+
955
+ .output-card {
956
+ min-height: 820px !important;
957
+ border-color: var(--border-orange-soft) !important;
958
+ }
959
+
960
+ label,
961
+ .markdown,
962
+ .prose,
963
+ h1,
964
+ h2,
965
+ h3,
966
+ h4,
967
+ h5,
968
+ h6,
969
+ p,
970
+ span,
971
+ div {
972
+ color: var(--text) !important;
973
+ }
974
+
975
+ .section-title,
976
+ label > span,
977
+ .gradio-container label {
978
+ color: var(--text-strong) !important;
979
+ }
980
+
981
+ .secondary-note {
982
+ color: var(--muted) !important;
983
+ font-size: 12px;
984
+ line-height: 1.35;
985
+ }
986
+
987
+ textarea,
988
+ input[type="text"],
989
+ input[type="password"],
990
+ input[type="number"],
991
+ input[type="email"],
992
+ .cm-editor {
993
+ background: rgba(var(--input-rgb), var(--input-alpha)) !important;
994
+ color: var(--text) !important;
995
+ border-color: var(--border-input) !important;
996
+ }
997
+
998
+ textarea::placeholder,
999
+ input::placeholder {
1000
+ color: var(--muted-2) !important;
1001
+ }
1002
+
1003
+ textarea:focus,
1004
+ input:focus,
1005
+ .cm-editor.cm-focused {
1006
+ border-color: var(--logo-blue) !important;
1007
+ box-shadow: 0 0 0 3px var(--focus-ring) !important;
1008
+ }
1009
+
1010
+ input[type="checkbox"] {
1011
+ accent-color: var(--logo-blue) !important;
1012
+ }
1013
+
1014
+ #schema-box .cm-editor {
1015
+ min-height: 410px !important;
1016
+ max-height: 480px !important;
1017
+ background: var(--code-bg) !important;
1018
+ }
1019
+
1020
+ .cm-editor,
1021
+ .cm-scroller,
1022
+ .cm-content,
1023
+ .cm-line,
1024
+ .cm-gutters,
1025
+ .cm-activeLine,
1026
+ .cm-activeLineGutter {
1027
+ background: var(--code-bg) !important;
1028
+ color: var(--text) !important;
1029
+ }
1030
+
1031
+ .cm-gutters {
1032
+ border-color: var(--border-blue-soft) !important;
1033
+ color: var(--muted-2) !important;
1034
+ }
1035
+
1036
+ .cm-cursor {
1037
+ border-left-color: var(--text-strong) !important;
1038
+ }
1039
+
1040
+ #image-box {
1041
+ min-height: 335px !important;
1042
+ background: var(--dropzone-bg) !important;
1043
+ border-color: var(--border-blue-soft) !important;
1044
+ }
1045
+
1046
+ #image-box,
1047
+ #image-box *,
1048
+ .upload-container,
1049
+ .upload-container *,
1050
+ .file-preview,
1051
+ .file-preview * {
1052
+ color: var(--text) !important;
1053
+ }
1054
+
1055
+ #image-box button,
1056
+ #image-box .icon-wrap,
1057
+ #image-box .wrap {
1058
+ background: transparent !important;
1059
+ }
1060
+
1061
+ #reasoning-box {
1062
+ min-height: 250px;
1063
+ max-height: 300px;
1064
+ overflow: auto;
1065
+ padding: 8px;
1066
+ border-radius: 8px;
1067
+ background: rgba(var(--input-rgb), var(--input-alpha)) !important;
1068
+ border: 1px solid var(--border-blue-soft);
1069
+ white-space: pre-wrap !important;
1070
+ overflow-wrap: anywhere !important;
1071
+ word-break: break-word !important;
1072
+ }
1073
+
1074
+ #output-box {
1075
+ min-height: 430px;
1076
+ max-height: 520px;
1077
+ overflow: auto;
1078
+ padding: 8px;
1079
+ border-radius: 8px;
1080
+ background: rgba(var(--input-rgb), var(--input-alpha)) !important;
1081
+ border: 1px solid var(--border-blue-soft);
1082
+ white-space: pre-wrap !important;
1083
+ overflow-wrap: anywhere !important;
1084
+ word-break: break-word !important;
1085
+ }
1086
+
1087
+ #reasoning-box pre,
1088
+ #reasoning-box code,
1089
+ #output-box pre,
1090
+ #output-box code {
1091
+ white-space: pre-wrap !important;
1092
+ overflow-wrap: anywhere !important;
1093
+ word-break: break-word !important;
1094
+ color: var(--text) !important;
1095
+ background: transparent !important;
1096
+ }
1097
+
1098
+ button {
1099
+ border-radius: 9px !important;
1100
+ min-height: 34px !important;
1101
+ }
1102
+
1103
+ button.primary-button,
1104
+ .primary-button button,
1105
+ .primary-button {
1106
+ background: var(--logo-blue) !important;
1107
+ background-color: var(--logo-blue) !important;
1108
+ color: var(--text-on-accent) !important;
1109
+ border: none !important;
1110
+ font-weight: 750 !important;
1111
+ }
1112
+
1113
+ button.markdown-button,
1114
+ .markdown-button button,
1115
+ .markdown-button {
1116
+ background: var(--logo-orange) !important;
1117
+ background-color: var(--logo-orange) !important;
1118
+ color: var(--text-on-accent) !important;
1119
+ border: none !important;
1120
+ font-weight: 750 !important;
1121
+ }
1122
+
1123
+ .clear-button button,
1124
+ button.clear-button,
1125
+ .clear-button {
1126
+ background: transparent !important;
1127
+ background-color: transparent !important;
1128
+ color: var(--muted) !important;
1129
+ border: 1px solid var(--border-blue-soft) !important;
1130
+ }
1131
+
1132
+ .gradio-container .wrap,
1133
+ .gradio-container .block,
1134
+ .gradio-container .form,
1135
+ .gradio-container .panel,
1136
+ .gradio-container .tabs,
1137
+ .gradio-container .tabitem {
1138
+ background: transparent !important;
1139
+ color: var(--text) !important;
1140
+ }
1141
+
1142
+ .gradio-accordion {
1143
+ border-color: var(--border-blue-soft) !important;
1144
+ }
1145
+
1146
+ .gradio-container table,
1147
+ .gradio-container th,
1148
+ .gradio-container td {
1149
+ color: var(--text) !important;
1150
+ }
1151
+
1152
+ .gradio-container label,
1153
+ .gradio-container label span,
1154
+ .gradio-container .label-wrap,
1155
+ .gradio-container .label-wrap span {
1156
+ color: var(--text-strong) !important;
1157
+ }
1158
+
1159
+ @media (max-width: 1100px) {
1160
+ .app-header {
1161
+ align-items: flex-start;
1162
+ flex-direction: column;
1163
+ }
1164
+
1165
+ .brand-title {
1166
+ align-items: flex-start;
1167
+ flex-direction: column;
1168
+ }
1169
+
1170
+ .model-chip {
1171
+ max-width: 100%;
1172
+ }
1173
+
1174
+ .output-card {
1175
+ min-height: 520px !important;
1176
+ }
1177
+
1178
+ #reasoning-box {
1179
+ min-height: 180px;
1180
+ }
1181
+
1182
+ #output-box {
1183
+ min-height: 320px;
1184
+ }
1185
+ }
1186
+ """.replace("__LOGO_URL__", LOGO_URL or "")
1187
+
1188
+
1189
+ # ---------------- Gradio app ----------------
1190
# Build the whole Gradio UI: header, intro card, input/schema/run controls in
# the left column, streamed reasoning/output in the right column, followed by
# the callbacks and the event wiring.
# NOTE(review): indentation was not recoverable from the reviewed listing; the
# nesting of the layout containers below is reconstructed from statement order
# and should be confirmed against the rendered UI.
with gr.Blocks(
    title="NuExtract",
    css=CSS,
    theme=gr.themes.Base(
        primary_hue="blue",
        secondary_hue="orange",
        neutral_hue="slate",
    ),
) as demo:
    # ------------------------------------------------------------------
    # Header
    # ------------------------------------------------------------------
    # Show the logo image when a URL is configured; otherwise emit an empty
    # element that carries the same CSS class.
    if LOGO_URL:
        logo_html = f'<img class="brand-mark" src="{LOGO_URL}" alt="NuExtract logo" />'
    else:
        logo_html = '<div class="brand-mark"></div>'

    gr.HTML(
        f"""
        <header class="app-header">
        <div class="brand">
        {logo_html}

        <div class="brand-title">
        <div class="brand-name">Nu<span>Extract</span></div>
        <div class="model-chip">Model&nbsp;<code>{DEFAULT_MODEL}</code></div>
        </div>
        </div>

        <div class="header-actions">
        <span class="status-dot"></span>
        <span>OpenAI-compatible endpoint</span>
        </div>
        </header>
        """
    )

    gr.Markdown(
        """
        We introduce **NuExtract 3** — a 4B open-source **MIT License** VLM specialized in document extraction.
        NuExtract 3 unifies structured extraction — document to JSON — and content extraction — document to Markdown,
        a.k.a. OCR — into one model.

        NuExtract 3 has been trained via Reinforcement Learning to have extraction-specific reasoning abilities, which can
        be switched on/off on demand. We find that NuExtract 3 substantially outperforms similar-sized models for both
        structured extraction and content extraction, making it the new reference model of open-source document extraction.
        """,
        elem_classes=["intro-card"],
    )

    # ------------------------------------------------------------------
    # Workspace: two equal columns
    # ------------------------------------------------------------------
    with gr.Row(equal_height=True):
        # Left column: document input, schema/instructions, run controls.
        with gr.Column(scale=1, min_width=520):
            with gr.Group(elem_classes="main-card"):
                gr.HTML("<div class='section-title'>Input</div>")

                # type="filepath": callbacks receive a path string, not pixels.
                context_image = gr.Image(
                    label="Image",
                    type="filepath",
                    height=340,
                    sources=["upload", "clipboard"],
                    elem_id="image-box",
                )

                context_text = gr.Textbox(
                    label="Text",
                    placeholder="Optional: paste document text.",
                    lines=3,
                    max_lines=5,
                )

            with gr.Group(elem_classes="main-card"):
                gr.HTML("<div class='section-title'>Schema & instructions</div>")

                instruction = gr.Textbox(
                    label="Instructions",
                    placeholder="Optional extraction instructions.",
                    lines=2,
                    max_lines=3,
                )

                # Starter template shown in the editor until the user edits it
                # or generates one from the document.
                default_template = {
                    "title": "string",
                    "entities": ["string"],
                    "dates": ["YYYY-MM-DD"],
                    "amounts": [
                        {
                            "value": "number",
                            "currency": "string",
                        }
                    ],
                }

                with gr.Row(equal_height=True):
                    template = gr.Code(
                        label="Template",
                        language="json",
                        value=json.dumps(default_template, indent=4),
                        lines=16,
                        scale=5,
                        elem_id="schema-box",
                    )

                    with gr.Column(scale=2, min_width=150):
                        generate_template_btn = gr.Button(
                            "Generate template",
                            variant="secondary",
                        )

                        gr.HTML(
                            "<div class='secondary-note'>"
                            "Use Extract for JSON. Use Markdown to convert an image document."
                            "</div>"
                        )

            with gr.Group(elem_classes="main-card"):
                gr.HTML("<div class='section-title'>Run</div>")

                # Both run buttons use variant="secondary"; their colours come
                # from the custom CSS classes attached below.
                with gr.Row():
                    extract_btn = gr.Button(
                        "Extract JSON",
                        variant="secondary",
                        elem_classes=["primary-button"],
                    )
                    markdown_btn = gr.Button(
                        "Convert to Markdown",
                        variant="secondary",
                        elem_classes=["markdown-button"],
                    )

                with gr.Row():
                    stop_btn = gr.Button("Stop", variant="stop")
                    clear_btn = gr.Button(
                        "Clear results",
                        variant="secondary",
                        elem_classes=["clear-button"],
                    )

                reasoning_checkbox = gr.Checkbox(
                    label="Reasoning",
                    value=True,
                    interactive=True,
                    info="If enabled, reasoning is everything before </think>.",
                )

                temperature = gr.Slider(
                    0.0,
                    1.5,
                    value=0.2,
                    step=0.05,
                    label="Temperature",
                    info="Lower values are best for extraction.",
                )

            with gr.Accordion("Structured examples", open=False):
                if STRUCTURED_EXAMPLES:
                    gr.Examples(
                        examples=STRUCTURED_EXAMPLES,
                        inputs=[context_image, template, instruction],
                        label="Load structured example",
                        examples_per_page=8,
                        cache_examples=False,
                    )
                else:
                    # No examples available: explain where to put them.
                    gr.Markdown(
                        f"""
                        No structured examples found.

                        Add files referenced in `STRUCTURED_EXAMPLE_TEMPLATES`, for example:

                        ```text
                        {EXAMPLE_DIR}/1.jpg
                        {EXAMPLE_DIR}/2.png
                        ```
                        """
                    )

            with gr.Accordion("Markdown examples", open=False):
                if MARKDOWN_EXAMPLES:
                    gr.Examples(
                        examples=MARKDOWN_EXAMPLES,
                        inputs=[context_image],
                        label="Load Markdown example",
                        examples_per_page=8,
                        cache_examples=False,
                    )
                else:
                    # No examples available: explain how to configure them.
                    gr.Markdown(
                        f"""
                        No Markdown examples found.

                        Add image paths to `MARKDOWN_EXAMPLE_IMAGE_PATHS`, for example:

                        ```python
                        MARKDOWN_EXAMPLE_IMAGE_PATHS = [
                            "markdown_1.png",
                            "markdown_2.jpg",
                            "/home/user/app/examples/report.png",
                        ]
                        ```

                        Relative paths are resolved from:

                        ```text
                        {EXAMPLE_DIR}
                        ```
                        """
                    )

            # Endpoint settings are intentionally hidden from the UI: they are
            # held in State components so callbacks still receive them as inputs.
            api_base = gr.State(DEFAULT_API_BASE)
            api_key = gr.State(DEFAULT_API_KEY)
            system_prompt = gr.State(SYSTEM_PROMPT_DEFAULT)

        # Right column: streamed reasoning and final output.
        with gr.Column(scale=1, min_width=520):
            with gr.Group(elem_classes="output-card"):
                gr.HTML("<div class='section-title'>Reasoning</div>")
                reasoning_md = gr.Markdown(
                    label="Reasoning",
                    elem_id="reasoning-box",
                )

                gr.HTML("<div class='section-title' style='margin-top: 12px;'>Output</div>")
                output_md = gr.Markdown(
                    label="Output",
                    elem_id="output-box",
                )

            # Hidden until a callback reports a problem.
            error_box = gr.Markdown(visible=False)

    # ------------------------------------------------------------------
    # Callback helpers
    # ------------------------------------------------------------------
    def _fenced_text(text, info="text"):
        """Wrap ``text`` in a Markdown code fence that ``text`` cannot close early.

        The fence is one backtick longer than the longest backtick run found in
        ``text`` and never shorter than three, so text without backticks is
        wrapped exactly as a plain three-backtick fence would wrap it.

        Args:
            text: the string to display verbatim.
            info: the info string placed after the opening fence.

        Returns:
            The fenced Markdown string.
        """
        longest = current = 0
        for ch in text:
            current = current + 1 if ch == "`" else 0
            if current > longest:
                longest = current
        fence = "`" * max(3, longest + 1)
        return f"{fence}{info}\n{text}\n{fence}"

    def _error_frame(message_md):
        """Build the output triple that blanks both result panes and shows an error.

        Args:
            message_md: Markdown text to display in the error box.

        Returns:
            A 3-tuple of ``gr.update`` objects for (reasoning, output, error box).
        """
        return (
            gr.update(value=""),
            gr.update(value=""),
            gr.update(visible=True, value=message_md),
        )

    # ------------------------------------------------------------------
    # Callbacks
    # ------------------------------------------------------------------
    def run_model_click(
        api_key_val,
        api_base_val,
        system_prompt_val,
        instruction_val,
        template_val,
        context_text_val,
        context_image_val,
        temperature_val,
        reasoning_val,
        markdown_mode_val,
    ):
        """Validate the inputs, then stream model results into the result panes.

        Shared generator behind both run buttons.

        Args:
            api_key_val: forwarded to ``infer_stream`` as ``api_key``.
            api_base_val: forwarded to ``infer_stream`` as ``api_base``.
            system_prompt_val: forwarded to ``infer_stream`` as ``system_prompt``.
            instruction_val: forwarded to ``infer_stream`` as ``instruction``.
            template_val: forwarded to ``infer_stream`` as ``template``.
            context_text_val: pasted document text; may be empty or ``None``.
            context_image_val: path of the uploaded image, falsy when absent.
            temperature_val: forwarded to ``infer_stream`` as ``temperature``.
            reasoning_val: coerced to ``bool`` and forwarded as ``reasoning``.
            markdown_mode_val: truthy for Markdown conversion (an image is then
                required), falsy for structured extraction.

        Yields:
            3-tuples of ``gr.update`` objects for (reasoning pane, output pane,
            error box). Validation failures and exceptions yield a single error
            frame instead of raising; exceptions are also printed to stdout
            with their traceback.
        """
        mode_name = "Markdown" if markdown_mode_val else "Extract"
        print(f"[button] {mode_name} clicked", flush=True)
        print(f"[button] image={context_image_val}", flush=True)
        print(f"[button] text_len={len(context_text_val or '')}", flush=True)
        print(f"[button] reasoning={bool(reasoning_val)}", flush=True)

        # Markdown conversion works on an image only.
        if markdown_mode_val and not context_image_val:
            yield _error_frame("### Error\nMarkdown conversion requires a document image.")
            return

        # Extraction needs at least one of: image, non-blank text.
        if not context_image_val and not (context_text_val or "").strip():
            yield _error_frame("### Error\nPlease provide a document image or paste document text.")
            return

        try:
            yielded_anything = False

            for res in infer_stream(
                api_key=api_key_val,
                api_base=api_base_val,
                system_prompt=system_prompt_val,
                template=template_val,
                instruction=instruction_val,
                context_text=context_text_val,
                context_image_path=context_image_val,
                temperature=temperature_val,
                reasoning=bool(reasoning_val),
                markdown_mode=bool(markdown_mode_val),
            ):
                yielded_anything = True

                think = res.get("think") or ""
                output = res.get("output") or "_(Empty output.)_"

                yield (
                    # Reasoning is shown verbatim; the fence is sized so that
                    # backticks inside the reasoning cannot terminate it.
                    gr.update(value=_fenced_text(think) if think else ""),
                    gr.update(value=output),
                    gr.update(visible=False, value=""),
                )

            if not yielded_anything:
                yield _error_frame("### Error\nThe model returned no streamed output.")

        except Exception:
            import traceback

            tb = traceback.format_exc()
            print(tb, flush=True)

            yield _error_frame("### Error\n" + _fenced_text(tb))

    def on_extract_click(
        api_key_val,
        api_base_val,
        system_prompt_val,
        instruction_val,
        template_val,
        context_text_val,
        context_image_val,
        temperature_val,
        reasoning_val,
    ):
        """Stream a structured extraction: ``run_model_click`` with Markdown mode off."""
        yield from run_model_click(
            api_key_val,
            api_base_val,
            system_prompt_val,
            instruction_val,
            template_val,
            context_text_val,
            context_image_val,
            temperature_val,
            reasoning_val,
            False,
        )

    def on_markdown_click(
        api_key_val,
        api_base_val,
        system_prompt_val,
        instruction_val,
        template_val,
        context_text_val,
        context_image_val,
        temperature_val,
        reasoning_val,
    ):
        """Stream a Markdown conversion: ``run_model_click`` with Markdown mode on."""
        yield from run_model_click(
            api_key_val,
            api_base_val,
            system_prompt_val,
            instruction_val,
            template_val,
            context_text_val,
            context_image_val,
            temperature_val,
            reasoning_val,
            True,
        )

    def on_click_generate_template(
        api_key_val,
        api_base_val,
        system_prompt_val,
        context_text_val,
        context_image_val,
        temperature_val,
    ):
        """Ask the model for a template and put it in the template editor.

        Args:
            api_key_val: forwarded to ``infer_template_generation`` as ``api_key``.
            api_base_val: forwarded as ``api_base``.
            system_prompt_val: forwarded as ``system_prompt``.
            context_text_val: pasted document text; may be empty or ``None``.
            context_image_val: path of the uploaded image, falsy when absent.
            temperature_val: forwarded as ``temperature``.

        Returns:
            A 2-tuple of ``gr.update`` objects for (template editor, error box).
            On a validation failure or an exception the editor is left
            unchanged and the error box is shown; exceptions are also printed
            to stdout with their traceback.
        """
        print("[button] Generate template clicked", flush=True)

        if not context_image_val and not (context_text_val or "").strip():
            return (
                gr.update(),
                gr.update(
                    visible=True,
                    value="### Error\nPlease provide a document image or paste document text.",
                ),
            )

        try:
            template_text = infer_template_generation(
                api_key=api_key_val,
                api_base=api_base_val,
                system_prompt=system_prompt_val,
                context_text=context_text_val,
                context_image_path=context_image_val,
                temperature=temperature_val,
            )

            return gr.update(value=template_text), gr.update(visible=False, value="")

        except Exception:
            import traceback

            tb = traceback.format_exc()
            print(tb, flush=True)

            return (
                gr.update(),
                gr.update(visible=True, value="### Error\n" + _fenced_text(tb)),
            )

    def on_clear():
        """Blank the reasoning and output panes and hide the error box."""
        return (
            gr.update(value=""),
            gr.update(value=""),
            gr.update(visible=False, value=""),
        )

    # ------------------------------------------------------------------
    # Event wiring
    # ------------------------------------------------------------------
    # Order matches the positional parameters of on_extract_click and
    # on_markdown_click.
    common_inputs = [
        api_key,
        api_base,
        system_prompt,
        instruction,
        template,
        context_text,
        context_image,
        temperature,
        reasoning_checkbox,
    ]

    # Order matches the tuples yielded by run_model_click and returned by on_clear.
    common_outputs = [
        reasoning_md,
        output_md,
        error_box,
    ]

    extract_event = extract_btn.click(
        fn=on_extract_click,
        inputs=common_inputs,
        outputs=common_outputs,
        show_progress=True,
    )

    markdown_event = markdown_btn.click(
        fn=on_markdown_click,
        inputs=common_inputs,
        outputs=common_outputs,
        show_progress=True,
    )

    # Stop runs no function of its own; it only cancels the two run events.
    stop_btn.click(
        fn=None,
        inputs=None,
        outputs=None,
        cancels=[extract_event, markdown_event],
    )

    clear_btn.click(
        fn=on_clear,
        inputs=None,
        outputs=common_outputs,
    )

    generate_template_btn.click(
        fn=on_click_generate_template,
        inputs=[
            api_key,
            api_base,
            system_prompt,
            context_text,
            context_image,
            temperature,
        ],
        outputs=[
            template,
            error_box,
        ],
        show_progress=True,
    )
1662
+
1663
+
1664
if __name__ == "__main__":
    # Collect the asset and example directories that exist on disk, in that
    # order; they are handed to Gradio through `allowed_paths`.
    allowed_paths = [
        str(directory)
        for directory in (ASSETS_DIR, EXAMPLE_DIR)
        if directory.exists()
    ]

    # Enable the queue, then start the server with the command-line settings.
    # An empty list is passed as None.
    app = demo.queue()
    app.launch(
        share=ARGS.share,
        server_name=ARGS.server_name,
        server_port=ARGS.server_port,
        show_error=True,
        allowed_paths=allowed_paths or None,
    )
assets/logo_numind_picto.svg ADDED
start.sh ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# Container entrypoint: start a local vLLM OpenAI-compatible server, wait until
# it answers, then start the Gradio app. The script exits as soon as either
# process stops and terminates the other one on the way out.
set -euo pipefail

MODEL_NAME="${MODEL_NAME:-NM-dev/NuExtract3.4_4B-RL-400}"
VLLM_PORT="${VLLM_PORT:-8000}"
GRADIO_PORT="${GRADIO_SERVER_PORT:-7860}"
API_KEY="${OPENAI_API_KEY:-EMPTY}"

VLLM_PID=""
GRADIO_PID=""

# Terminate whichever child processes are still alive. Registered on EXIT so
# it runs on every exit path, including failures under `set -e`.
cleanup() {
    local pid
    for pid in "${GRADIO_PID}" "${VLLM_PID}"; do
        if [[ -n "${pid}" ]] && kill -0 "${pid}" 2>/dev/null; then
            kill "${pid}" 2>/dev/null || true
        fi
    done
    wait 2>/dev/null || true
}
trap cleanup EXIT
# Turn termination signals into a normal exit so the EXIT trap above runs.
trap 'exit 130' INT
trap 'exit 143' TERM

echo "Starting vLLM with model: ${MODEL_NAME}"

# --limit-mm-per-prompt is given as JSON, the form documented by current vLLM
# releases (the base image tracks `latest`).
python -m vllm.entrypoints.openai.api_server \
    --model "${MODEL_NAME}" \
    --served-model-name "${MODEL_NAME}" \
    --host 127.0.0.1 \
    --port "${VLLM_PORT}" \
    --trust-remote-code \
    --dtype auto \
    --max-model-len "${MAX_MODEL_LEN:-8192}" \
    --gpu-memory-utilization "${GPU_MEMORY_UTILIZATION:-0.90}" \
    --limit-mm-per-prompt '{"image": 1}' \
    --api-key "${API_KEY}" &

VLLM_PID=$!

echo "Waiting for vLLM to become ready..."
# The server is started with --api-key, so the probe sends the same key;
# an unauthenticated request to /v1/* would be rejected and never succeed.
until curl -sf -H "Authorization: Bearer ${API_KEY}" \
        "http://127.0.0.1:${VLLM_PORT}/v1/models" >/dev/null; do
    if ! kill -0 "${VLLM_PID}" 2>/dev/null; then
        echo "vLLM exited before becoming ready."
        exit 1
    fi
    sleep 2
done

echo "vLLM is ready. Starting Gradio..."

python /home/user/app/app.py \
    --model-name "${MODEL_NAME}" \
    --api-base "http://127.0.0.1:${VLLM_PORT}/v1" \
    --api-key "${API_KEY}" \
    --server-name "0.0.0.0" \
    --server-port "${GRADIO_PORT}" &

GRADIO_PID=$!

# `wait -n` (bash >= 4.3) returns when the first background job ends. Exiting
# with that status stops the remaining process through the EXIT trap instead
# of leaving a UI without a backend (or a backend without a UI) running.
status=0
wait -n || status=$?
exit "${status}"