Spaces:
Sleeping
Sleeping
Update rag.py
Browse files
rag.py
CHANGED
|
@@ -533,136 +533,3 @@ class NurembergScholar:
|
|
| 533 |
def clear_cache(self) -> None:
    """Drop every entry held in ``self._cache`` and print a confirmation."""
    self._cache.clear()
    print(" Cache cleared.")
|
| 536 |
-
|
| 537 |
-
# ── Gradio UI ─────────────────────────────────────────────────────────────────
|
| 538 |
-
|
| 539 |
-
def build_gradio_app(scholar: NurembergScholar):
    """Build the Gradio Blocks UI that fronts ``scholar``.

    Args:
        scholar: Object whose ``answer(query, top_k=...)`` returns a dict with
            ``"answer"``, ``"sources"`` and ``"citation_report"`` keys (plus an
            optional ``"cache_hit"``), and whose ``cache_stats`` mapping has
            ``"size"``, ``"hits"``, ``"misses"`` and ``"hit_rate"`` entries.

    Returns:
        The assembled ``gr.Blocks`` app. It is not launched here.
    """
    # Function-scope import: gradio is only needed once the UI is built.
    import gradio as gr

    # Maximum number of characters of a passage shown in the sources pane.
    excerpt_chars = 300

    def _format_sources(results) -> str:
        """Render retrieved passages as Markdown, one entry per source.

        Each item of ``results`` must expose ``collection``, ``date_iso``,
        ``speaker``, ``page_number``, ``rerank_score`` and ``body``.
        """
        if not results:
            return "No sources retrieved."
        entries = []
        for i, r in enumerate(results, 1):
            if r.rerank_score is not None:
                rerank = f"{r.rerank_score:.4f}"
            else:
                rerank = "n/a"
            body = r.body or ""
            excerpt = body[:excerpt_chars]
            # Only signal truncation when something was actually cut off.
            if len(body) > excerpt_chars:
                excerpt += "..."
            entries.append(
                f"**[SOURCE {i}]** `{r.collection}` | {r.date_iso or '?'} | "
                f"speaker: *{r.speaker or '-'}* | page {r.page_number or '?'} | "
                f"rerank: `{rerank}`\n\n"
                f"> {excerpt}"
            )
        return "\n\n---\n\n".join(entries)

    def _format_citation_report(report: dict, cache_hit: bool = False) -> str:
        """Render the citation-check dict and cache statistics as Markdown.

        Returns an empty string when ``report`` is empty or ``None``.
        """
        if not report:
            return ""
        cache_label = "HIT" if cache_hit else "MISS"
        status = "All citations valid" if report.get("clean") else "Issues found"
        lines = [
            f"**Citation check:** {status} | "
            f"**Backend:** Groq ({GROQ_MODEL}) | **Cache:** {cache_label}",
        ]
        if report.get("cited"):
            lines.append(f"- Referenced: SOURCE {report['cited']}")
        if report.get("hallucinated"):
            lines.append(f"- Hallucinated refs stripped: {report['hallucinated']}")
        if report.get("uncited_sources"):
            lines.append(
                f"- Retrieved but not cited: SOURCE {report['uncited_sources']}")
        if report.get("uncited_sentences"):
            lines.append(
                f"- Paragraphs without citation: "
                f"{len(report['uncited_sentences'])}")
        # Read at render time so the figures include the query just answered.
        stats = scholar.cache_stats
        lines.append(
            f"- Cache: {stats['size']} entries | "
            f"{stats['hits']} hits / {stats['hits']+stats['misses']} queries "
            f"({stats['hit_rate']:.0%})"
        )
        return "\n".join(lines)

    def gradio_query(query: str, top_k: int):
        """Answer one question and return the three Markdown panes.

        Returns:
            ``(answer, sources, citation_report)`` — the same order as the
            ``outputs`` list wired up below.
        """
        # Guard against None as well as blank input before calling .strip().
        if not query or not query.strip():
            return "Please enter a question.", "", ""
        # int(): presumably the slider can deliver a float — confirm.
        result = scholar.answer(query, top_k=int(top_k))
        answer = result["answer"]
        sources = _format_sources(result["sources"])
        report = _format_citation_report(
            result["citation_report"],
            result.get("cache_hit", False),
        )
        return answer, sources, report

    example_queries = [
        ["What did Goering say in his defense about the Luftwaffe?", 5],
        ["How did the Tribunal define crimes against humanity under Article 6(c)?", 5],
        ["What was Ohlendorf's confession regarding Einsatzgruppen killings?", 5],
        ["What evidence was presented about the Final Solution?", 5],
        ["What was the London Agreement and why was it significant?", 5],
        ["How were the defendants sentenced on 1 October 1946?", 5],
    ]

    # NOTE(review): the nesting of the layout below and the leading whitespace
    # inside the Markdown literals were reconstructed from a copy that had
    # lost its indentation — confirm against the rendered page.
    with gr.Blocks(title="Nuremberg Scholar", theme=gr.themes.Soft()) as app:
        gr.Markdown(
            """
            # ⚖️ Nuremberg Scholar
            **AI research assistant for the Nuremberg Trials (1945–1946)**

            Answers are grounded exclusively in primary source documents.
            Every factual claim is cited to a specific source passage.

            *Llama-3.1-8B via Groq · BGE-M3 hybrid retrieval · 46,325 indexed passages*
            """
        )
        with gr.Row():
            with gr.Column(scale=3):
                query_box = gr.Textbox(
                    label="Your question",
                    placeholder="e.g. What did Speer claim about his knowledge of the Holocaust?",
                    lines=2,
                )
                with gr.Row():
                    top_k_slider = gr.Slider(
                        minimum=1, maximum=10, value=5, step=1,
                        label="Sources to retrieve",
                    )
                    submit_btn = gr.Button("Ask", variant="primary")
                gr.Examples(
                    examples=example_queries,
                    inputs=[query_box, top_k_slider],
                    label="Example questions",
                )
            with gr.Column(scale=1):
                gr.Markdown(
                    """
                    **Corpus**
                    - 221 trial session transcripts
                    - Full Tribunal judgment (Oct 1946)
                    - Key prosecution documents
                    - 46,325 indexed passages

                    **Retrieval**
                    Dense + sparse hybrid search,
                    re-ranked by cross-encoder.

                    **Citation policy**
                    All claims must cite a SOURCE.
                    Hallucinated citations are stripped.
                    """
                )

        answer_box = gr.Markdown(label="Answer")

        with gr.Accordion("Citation verification", open=False):
            citation_box = gr.Markdown()

        with gr.Accordion("Retrieved sources", open=False):
            sources_box = gr.Markdown()

        # Clicking the button and pressing Enter in the textbox run the same query.
        for trigger in (submit_btn.click, query_box.submit):
            trigger(
                fn=gradio_query,
                inputs=[query_box, top_k_slider],
                outputs=[answer_box, sources_box, citation_box],
            )

    return app
|
|
|
|
| 533 |
def clear_cache(self) -> None:
    """Drop every entry held in ``self._cache`` and print a confirmation."""
    self._cache.clear()
    print(" Cache cleared.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|