dtufail committed on
Commit
7b615fe
·
verified ·
1 Parent(s): c4b7337

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +477 -11
app.py CHANGED
@@ -1,17 +1,483 @@
1
  """
2
- app.py — Nuremberg Scholar HuggingFace Space entry point
3
- ============================================================
4
- Thin wrapper that initialises the NurembergScholar pipeline
5
- and launches the Gradio UI. All logic lives in rag.py.
6
-
7
- This is the file referenced by `app_file: app.py` in the
8
- Space README YAML.
9
  """
10
 
11
- from rag import NurembergScholar, build_gradio_app
 
 
 
12
 
13
  scholar = NurembergScholar()
14
- app = build_gradio_app(scholar)
15
 
16
- if __name__ == "__main__":
17
- app.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  """
2
+ app.py — Nuremberg Scholar UI (HuggingFace Spaces)
3
+ ======================================================
4
+ Modern research assistant interface for the Nuremberg Trials corpus.
5
+ Designed for Gradio >=5.9.0. Pipeline logic stays in rag.py.
 
 
 
6
  """
7
 
8
+ from rag import NurembergScholar, GROQ_MODEL
9
+ import gradio as gr
10
+
11
+ # ── Initialise pipeline ───────────────────────────────────────────────────────
12
 
13
  scholar = NurembergScholar()
 
14
 
15
+ # ── Theme — override Gradio defaults at the theme level ───────────────────────
16
+
17
+ theme = gr.themes.Base(
18
+ primary_hue=gr.themes.colors.red,
19
+ secondary_hue=gr.themes.colors.gray,
20
+ neutral_hue=gr.themes.colors.gray,
21
+ font=[
22
+ gr.themes.GoogleFont("DM Sans"),
23
+ "system-ui",
24
+ "sans-serif",
25
+ ],
26
+ font_mono=[
27
+ gr.themes.GoogleFont("DM Mono"),
28
+ "Consolas",
29
+ "monospace",
30
+ ],
31
+ radius_size=gr.themes.sizes.radius_md,
32
+ ).set(
33
+ body_background_fill="#fafafa",
34
+ body_text_color="#1a1a1a",
35
+ block_background_fill="#ffffff",
36
+ block_border_width="1px",
37
+ block_border_color="#ebebeb",
38
+ block_radius="10px",
39
+ block_shadow="0 1px 3px rgba(0,0,0,0.04)",
40
+ input_background_fill="#ffffff",
41
+ input_border_color="#ddd",
42
+ input_border_width="1.5px",
43
+ input_radius="10px",
44
+ input_shadow="0 1px 2px rgba(0,0,0,0.03)",
45
+ button_primary_background_fill="#b5321f",
46
+ button_primary_background_fill_hover="#9a2a18",
47
+ button_primary_text_color="#ffffff",
48
+ button_primary_border_color="transparent",
49
+ button_primary_shadow="0 1px 3px rgba(0,0,0,0.1)",
50
+ button_border_width="0px",
51
+ button_large_radius="8px",
52
+ body_background_fill_dark="#0e0e0e",
53
+ body_text_color_dark="#e0e0e0",
54
+ block_background_fill_dark="#171717",
55
+ block_border_color_dark="#252525",
56
+ input_background_fill_dark="#171717",
57
+ input_border_color_dark="#333",
58
+ button_primary_background_fill_dark="#e05a4a",
59
+ button_primary_background_fill_hover_dark="#c94a3c",
60
+ )
61
+
62
+ # ── CSS — only what the theme can't handle ────────────────────────────────────
63
+ # Uses #elem_id selectors for Gradio components (with !important where Gradio's defaults must be overridden) and class selectors for raw HTML
64
+
65
+ CUSTOM_CSS = """
66
+ /* ── Global ───────────────────────────────────────────── */
67
+ .gradio-container {
68
+ max-width: 840px !important;
69
+ margin: 0 auto !important;
70
+ }
71
+ footer { display: none !important; }
72
+
73
+ /* ── Hero ─────────────────────────────────────────────── */
74
+ .ns-hero {
75
+ text-align: center;
76
+ padding: 2rem 1rem 0.5rem;
77
+ }
78
+ .ns-hero h1 {
79
+ font-size: 1.65rem;
80
+ font-weight: 700;
81
+ letter-spacing: -0.03em;
82
+ margin: 0 0 0.3rem 0;
83
+ }
84
+ .ns-hero .ns-sub {
85
+ font-size: 0.92rem;
86
+ color: #666;
87
+ margin: 0 0 0.65rem 0;
88
+ }
89
+ .ns-hero .ns-meta {
90
+ display: inline-flex;
91
+ align-items: center;
92
+ gap: 0.45rem;
93
+ padding: 0.28rem 0.7rem;
94
+ background: #f0f0f0;
95
+ border-radius: 100px;
96
+ font-size: 0.62rem;
97
+ font-family: var(--font-mono);
98
+ color: #999;
99
+ letter-spacing: 0.015em;
100
+ }
101
+ .ns-hero .ns-dot {
102
+ width: 3px; height: 3px;
103
+ background: #bbb;
104
+ border-radius: 50%;
105
+ display: inline-block;
106
+ }
107
+ @media (prefers-color-scheme: dark) {
108
+ .ns-hero .ns-sub { color: #aaa; }
109
+ .ns-hero .ns-meta { background: #1e1e1e; color: #666; }
110
+ .ns-hero .ns-dot { background: #555; }
111
+ }
112
+
113
+ /* ── Query box ────────────────────────────────────────── */
114
+ #ns-query textarea {
115
+ font-size: 0.95rem !important;
116
+ padding: 0.85rem 1rem !important;
117
+ line-height: 1.5 !important;
118
+ }
119
+ #ns-query textarea:focus {
120
+ border-color: #b5321f !important;
121
+ box-shadow: 0 1px 2px rgba(0,0,0,0.03), 0 0 0 3px rgba(181,50,31,0.06) !important;
122
+ }
123
+ #ns-query label { display: none !important; }
124
+
125
+ /* ── Search row — input + button side by side ─────────── */
126
+ #ns-search-row {
127
+ gap: 0.5rem !important;
128
+ align-items: stretch !important;
129
+ margin-bottom: 0.4rem !important;
130
+ }
131
+ #ns-search {
132
+ min-width: 90px !important;
133
+ max-width: 110px !important;
134
+ font-weight: 600;
135
+ height: auto !important;
136
+ border-radius: 10px !important;
137
+ }
138
+
139
+ /* ── Example chips — compact grid ─────────────────────── */
140
+ #ns-examples {
141
+ margin-top: 0 !important;
142
+ margin-bottom: 0.5rem !important;
143
+ }
144
+ #ns-examples .gallery {
145
+ gap: 0.3rem !important;
146
+ padding: 0 !important;
147
+ }
148
+ #ns-examples button {
149
+ border-radius: 100px !important;
150
+ font-size: 0.74rem !important;
151
+ padding: 0.3rem 0.7rem !important;
152
+ border: 1px solid #e0e0e0 !important;
153
+ background: #fff !important;
154
+ color: #666 !important;
155
+ transition: all 0.15s ease !important;
156
+ white-space: nowrap !important;
157
+ overflow: hidden !important;
158
+ text-overflow: ellipsis !important;
159
+ max-width: 340px !important;
160
+ }
161
+ #ns-examples button:hover {
162
+ border-color: rgba(181,50,31,0.25) !important;
163
+ background: rgba(181,50,31,0.04) !important;
164
+ color: #b5321f !important;
165
+ }
166
+ @media (prefers-color-scheme: dark) {
167
+ #ns-examples button {
168
+ border-color: #333 !important;
169
+ background: #1a1a1a !important;
170
+ color: #aaa !important;
171
+ }
172
+ #ns-examples button:hover {
173
+ border-color: rgba(224,90,74,0.3) !important;
174
+ background: rgba(224,90,74,0.08) !important;
175
+ color: #e05a4a !important;
176
+ }
177
+ }
178
+
179
+ /* ── Source cards (raw HTML) ───────────────────────────── */
180
+ .ns-sources-wrap {
181
+ display: flex;
182
+ flex-wrap: wrap;
183
+ gap: 0.3rem;
184
+ padding: 0.25rem 0;
185
+ }
186
+ .ns-src {
187
+ display: inline-flex;
188
+ align-items: center;
189
+ gap: 0.35rem;
190
+ padding: 0.3rem 0.6rem;
191
+ background: #f5f5f5;
192
+ border: 1px solid #eee;
193
+ border-radius: 8px;
194
+ font-size: 0.76rem;
195
+ color: #555;
196
+ line-height: 1.35;
197
+ transition: all 0.12s ease;
198
+ }
199
+ .ns-src:hover {
200
+ border-color: rgba(181,50,31,0.2);
201
+ background: rgba(181,50,31,0.04);
202
+ }
203
+ .ns-sn {
204
+ display: inline-flex;
205
+ align-items: center;
206
+ justify-content: center;
207
+ min-width: 17px; height: 17px;
208
+ background: #b5321f;
209
+ color: #fff;
210
+ border-radius: 4px;
211
+ font-size: 0.6rem;
212
+ font-weight: 700;
213
+ flex-shrink: 0;
214
+ }
215
+ .ns-sm {
216
+ font-family: var(--font-mono);
217
+ font-size: 0.66rem;
218
+ color: #999;
219
+ }
220
+ @media (prefers-color-scheme: dark) {
221
+ .ns-src { background: #1e1e1e; border-color: #292929; color: #aaa; }
222
+ .ns-src:hover { border-color: rgba(224,90,74,0.25); background: rgba(224,90,74,0.06); }
223
+ .ns-sn { background: #e05a4a; }
224
+ .ns-sm { color: #666; }
225
+ }
226
+
227
+ /* ── Answer ───────────────────────────────────────────── */
228
+ #ns-answer {
229
+ min-height: 50px;
230
+ padding: 1.2rem 1.3rem !important;
231
+ }
232
+ #ns-answer p, #ns-answer li {
233
+ font-size: 0.93rem !important;
234
+ line-height: 1.72 !important;
235
+ }
236
+
237
+ /* ── Citation panel ───────────────────────────────────── */
238
+ #ns-cite {
239
+ background: #f7f7f7;
240
+ border: 1px solid #eee;
241
+ border-radius: 8px;
242
+ padding: 0.75rem 1rem !important;
243
+ }
244
+ #ns-cite p {
245
+ font-size: 0.82rem !important;
246
+ color: #666 !important;
247
+ line-height: 1.5 !important;
248
+ margin: 0 !important;
249
+ }
250
+ #ns-cite code {
251
+ font-size: 0.72rem !important;
252
+ background: #efefef !important;
253
+ padding: 0.1rem 0.3rem !important;
254
+ border-radius: 3px !important;
255
+ }
256
+ #ns-cite strong { color: #333 !important; }
257
+ @media (prefers-color-scheme: dark) {
258
+ #ns-cite { background: #1a1a1a; border-color: #252525; }
259
+ #ns-cite p { color: #999 !important; }
260
+ #ns-cite code { background: #222 !important; }
261
+ #ns-cite strong { color: #ccc !important; }
262
+ }
263
+
264
+ /* ── Source detail accordion ──────────────────────────── */
265
+ #ns-detail .label-wrap {
266
+ font-size: 0.8rem !important;
267
+ color: #888 !important;
268
+ }
269
+ #ns-detail blockquote {
270
+ border-left: 2px solid rgba(181,50,31,0.2) !important;
271
+ background: rgba(181,50,31,0.03) !important;
272
+ padding: 0.45rem 0.8rem !important;
273
+ margin: 0.35rem 0 !important;
274
+ border-radius: 0 6px 6px 0 !important;
275
+ }
276
+
277
+ /* ── Footer ───────────────────────────────────────────── */
278
+ .ns-footer {
279
+ text-align: center;
280
+ padding: 1.2rem 0 0.6rem;
281
+ margin-top: 0.75rem;
282
+ border-top: 1px solid #eee;
283
+ }
284
+ .ns-footer p {
285
+ font-family: var(--font-mono);
286
+ font-size: 0.64rem;
287
+ color: #999;
288
+ letter-spacing: 0.015em;
289
+ margin: 0;
290
+ }
291
+ .ns-footer a { color: #b5321f; text-decoration: none; }
292
+ .ns-footer a:hover { text-decoration: underline; }
293
+ @media (prefers-color-scheme: dark) {
294
+ .ns-footer { border-color: #222; }
295
+ .ns-footer p { color: #555; }
296
+ .ns-footer a { color: #e05a4a; }
297
+ }
298
+
299
+ /* ── Mobile ───────────────────────────────────────────── */
300
+ @media (max-width: 640px) {
301
+ .ns-hero h1 { font-size: 1.3rem; }
302
+ .ns-hero .ns-meta { font-size: 0.56rem; }
303
+ #ns-answer { padding: 0.9rem 1rem !important; }
304
+ }
305
+ """
306
+
307
+ # ── Formatters ────────────────────────────────────────────────────────────────
308
+
309
+ def _format_source_cards(results) -> str:
310
+ """Compact inline source chips shown above the answer."""
311
+ if not results:
312
+ return ""
313
+ cards = []
314
+ for i, r in enumerate(results, 1):
315
+ speaker = r.speaker or "—"
316
+ col = r.collection or ""
317
+ date = r.date_iso or ""
318
+ meta_parts = [p for p in [col, date] if p]
319
+ meta = " · ".join(meta_parts)
320
+ cards.append(
321
+ f'<span class="ns-src">'
322
+ f'<span class="ns-sn">{i}</span>'
323
+ f'<span>{speaker}</span>'
324
+ f'<span class="ns-sm">{meta}</span>'
325
+ f'</span>'
326
+ )
327
+ return f'<div class="ns-sources-wrap">{" ".join(cards)}</div>'
328
+
329
+
330
+ def _format_sources_detail(results) -> str:
331
+ """Full passages for the expandable accordion."""
332
+ if not results:
333
+ return "No sources retrieved."
334
+ lines = []
335
+ for i, r in enumerate(results, 1):
336
+ rerank = f"{r.rerank_score:.4f}" if r.rerank_score is not None else "n/a"
337
+ lines.append(
338
+ f"**[{i}]** `{r.collection}` · {r.date_iso or '?'} · "
339
+ f"speaker: *{r.speaker or '—'}* · page {r.page_number or '?'} · "
340
+ f"rerank: `{rerank}`\n\n"
341
+ f"> {r.body[:400]}{'…' if len(r.body) > 400 else ''}"
342
+ )
343
+ return "\n\n---\n\n".join(lines)
344
+
345
+
346
+ def _format_citation_report(report: dict, cache_hit: bool = False) -> str:
347
+ if not report:
348
+ return ""
349
+ ok = report.get("clean", False)
350
+ status = "✓ All citations verified" if ok else "⚠ Issues detected"
351
+ cache_label = "HIT" if cache_hit else "MISS"
352
+ lines = [
353
+ f"**{status}** · Groq `{GROQ_MODEL}` · Cache {cache_label}",
354
+ ]
355
+ if report.get("cited"):
356
+ lines.append(f"**Cited:** [{report['cited']}]")
357
+ if report.get("hallucinated"):
358
+ lines.append(f"**Hallucinated (stripped):** {report['hallucinated']}")
359
+ if report.get("uncited_sources"):
360
+ lines.append(f"**Unused sources:** [{report['uncited_sources']}]")
361
+ if report.get("uncited_sentences"):
362
+ lines.append(f"**Uncited claims:** {len(report['uncited_sentences'])}")
363
+ stats = scholar.cache_stats
364
+ lines.append(
365
+ f"**Cache:** {stats['size']} entries · "
366
+ f"{stats['hits']}/{stats['hits']+stats['misses']} hit rate "
367
+ f"({stats['hit_rate']:.0%})"
368
+ )
369
+ return " \n".join(lines)
370
+
371
+ # ── Query handler ─────────────────────────────────────────────────────────────
372
+
373
def gradio_query(query: str):
    """Gradio event handler: answer a question and format all four outputs.

    Args:
        query: Text from the query box. May be empty or None.

    Returns:
        A 4-tuple ``(source_cards_html, answer_markdown,
        source_detail_markdown, citation_markdown)`` in the order of the
        ``outputs`` list wired to the click/submit events.
    """
    # Guard against None as well as empty/whitespace-only input; calling
    # .strip() on None would raise AttributeError.
    if not query or not query.strip():
        return "", "Please enter a question.", "", ""

    result = scholar.answer(query, top_k=5)
    sources = result["sources"]
    return (
        _format_source_cards(sources),
        result["answer"],
        _format_sources_detail(sources),
        _format_citation_report(
            result["citation_report"],
            result.get("cache_hit", False),
        ),
    )
386
+
387
+ # ── Build Gradio app ──────────────────────────────────────────────────────────
388
+
389
+ with gr.Blocks(title="Nuremberg Scholar", theme=theme, css=CUSTOM_CSS) as app:
390
+
391
+ # Header
392
+ gr.HTML("""
393
+ <div class="ns-hero">
394
+ <h1>Nuremberg Scholar</h1>
395
+ <p class="ns-sub">Research assistant for the International Military Tribunal, 1945–1946</p>
396
+ <div class="ns-meta">
397
+ <span>46,325 passages</span>
398
+ <span class="ns-dot"></span>
399
+ <span>BGE-M3 hybrid retrieval</span>
400
+ <span class="ns-dot"></span>
401
+ <span>Llama-3.1-8B via Groq</span>
402
+ </div>
403
+ </div>
404
+ """)
405
+
406
+ # Search input + button in one row
407
+ with gr.Row(elem_id="ns-search-row"):
408
+ query_box = gr.Textbox(
409
+ placeholder="Ask about the Nuremberg Trials — e.g. What did Speer claim about slave labour?",
410
+ lines=1,
411
+ show_label=False,
412
+ elem_id="ns-query",
413
+ scale=6,
414
+ )
415
+ submit_btn = gr.Button(
416
+ "Search",
417
+ variant="primary",
418
+ size="lg",
419
+ elem_id="ns-search",
420
+ scale=1,
421
+ )
422
+
423
+ # Example chips — trimmed to 4 for compactness
424
+ gr.Examples(
425
+ examples=[
426
+ ["What did Goering say in his defense about the Luftwaffe?"],
427
+ ["How did the Tribunal define crimes against humanity?"],
428
+ ["What evidence was presented about the Final Solution?"],
429
+ ["How were the defendants sentenced on 1 October 1946?"],
430
+ ],
431
+ inputs=[query_box],
432
+ label="",
433
+ examples_per_page=4,
434
+ elem_id="ns-examples",
435
+ )
436
+
437
+ # Source cards (inline, visible)
438
+ source_cards_html = gr.HTML(value="", elem_id="ns-sources")
439
+
440
+ # Answer
441
+ answer_box = gr.Markdown(
442
+ label="Answer",
443
+ elem_id="ns-answer",
444
+ show_label=False,
445
+ )
446
+
447
+ # Citation verification (visible by default)
448
+ citation_box = gr.Markdown(elem_id="ns-cite")
449
+
450
+ # Full sources (accordion — only long content hidden)
451
+ with gr.Accordion("View full source passages", open=False, elem_id="ns-detail"):
452
+ sources_detail_box = gr.Markdown()
453
+
454
+ # Footer
455
+ gr.HTML("""
456
+ <div class="ns-footer">
457
+ <p>
458
+ Source: <a href="https://avalon.law.yale.edu/subject_menus/imt.asp" target="_blank">Yale Avalon Project</a>
459
+ ·
460
+ Dataset: <a href="https://huggingface.co/datasets/dtufail/nuremberg-trials-corpus" target="_blank">dtufail/nuremberg-trials-corpus</a>
461
+ ·
462
+ CC BY 4.0
463
+ </p>
464
+ </div>
465
+ """)
466
+
467
+ # Events
468
+ outputs = [source_cards_html, answer_box, sources_detail_box, citation_box]
469
+
470
+ submit_btn.click(
471
+ fn=gradio_query,
472
+ inputs=[query_box],
473
+ outputs=outputs,
474
+ )
475
+ query_box.submit(
476
+ fn=gradio_query,
477
+ inputs=[query_box],
478
+ outputs=outputs,
479
+ )
480
+
481
+ # ── Launch ────────────────────────────────────────────────────────────────────
482
+
483
# Launch only when executed as a script, so importing this module (tests,
# tooling) does not start a server as a side effect.
if __name__ == "__main__":
    # share=True is not supported on Hugging Face Spaces (the Space already
    # provides the public URL), so launch with the defaults.
    app.launch()