Spaces:

Adherence
/

nuremberg-trials-ai

Sleeping

App Files Files Community

Adherence committed on Dec 15, 2025

Commit

34257f6

verified ·

1 Parent(s): 1a2c8d6

Upload app.py with huggingface_hub

Browse files

Files changed (1) hide show

app.py +228 -0

app.py ADDED Viewed

	@@ -0,0 +1,228 @@

+"""
+Nuremberg Trials AI - RAG-powered Q&A system
+Deployed on HuggingFace Spaces
+"""
+import json
+import gradio as gr
+import numpy as np
+import faiss
+from sentence_transformers import SentenceTransformer
+from huggingface_hub import hf_hub_download, InferenceClient
+from datasets import load_dataset
+# Configuration
+# Hugging Face dataset repo; supplies both the text chunks (via load_dataset)
+# and the prebuilt "faiss_index.bin" file (via hf_hub_download).
+DATASET_ID = "Adherence/nuremberg-trials-rag"
+# Model name passed to SentenceTransformer to embed incoming questions.
+# NOTE(review): presumably the same model that produced the vectors stored in
+# the FAISS index — confirm before changing.
+EMBEDDING_MODEL = "all-MiniLM-L6-v2"
+# Model id passed to InferenceClient for answer generation.
+LLM_MODEL = "mistralai/Mistral-7B-Instruct-v0.3"
+# Number of passages retrieved per question (default for search(), used by query()).
+TOP_K = 5
+class NurembergRAG:
+    def __init__(self):
+        self.index = None
+        self.chunks = None
+        self.model = None
+        self.llm_client = None
+    def load(self):
+        """Load RAG components from HuggingFace."""
+        print("Loading Nuremberg Trials RAG system...")
+        # Load embedding model
+        print("  Loading embedding model...")
+        self.model = SentenceTransformer(EMBEDDING_MODEL)
+        # Load chunks from dataset
+        print("  Loading document chunks...")
+        dataset = load_dataset(DATASET_ID, split="train")
+        self.chunks = [
+            {"text": row["text"], "source": row["source"]}
+            for row in dataset
+        ]
+        # Load FAISS index
+        print("  Loading FAISS index...")
+        index_path = hf_hub_download(
+            repo_id=DATASET_ID,
+            filename="faiss_index.bin",
+            repo_type="dataset"
+        )
+        self.index = faiss.read_index(index_path)
+        # Initialize LLM client (free inference API)
+        print("  Initializing LLM client...")
+        self.llm_client = InferenceClient(model=LLM_MODEL)
+        print(f"  Loaded {len(self.chunks)} document chunks")
+        print("Ready!")
+    def search(self, query: str, top_k: int = TOP_K):
+        """Search for relevant chunks."""
+        query_embedding = self.model.encode([query], convert_to_numpy=True)
+        distances, indices = self.index.search(
+            query_embedding.astype(np.float32), top_k
+        )
+        results = []
+        for idx, distance in zip(indices[0], distances[0]):
+            if idx < len(self.chunks):
+                chunk = self.chunks[idx]
+                similarity = 1 / (1 + distance)
+                results.append((chunk, similarity))
+        return results
+    def generate_answer(self, question: str, context: str) -> str:
+        """Generate answer using LLM with retrieved context."""
+        prompt = f"""You are an expert on the Nuremberg Trials. Answer the question based ONLY on the provided context from historical documents. If the context doesn't contain enough information, say so.
+Context from Nuremberg Trial documents:
+{context}
+Question: {question}
+Answer (be specific and cite sources when possible):"""
+        try:
+            response = self.llm_client.text_generation(
+                prompt,
+                max_new_tokens=500,
+                temperature=0.3,
+                do_sample=True,
+            )
+            return response
+        except Exception as e:
+            return f"Error generating answer: {str(e)}"
+    def query(self, question: str) -> tuple:
+        """Full RAG pipeline: retrieve + generate."""
+        if not question.strip():
+            return "Please enter a question.", ""
+        # Retrieve relevant passages
+        results = self.search(question, TOP_K)
+        if not results:
+            return "No relevant information found.", ""
+        # Format context for LLM
+        context_parts = []
+        sources_md = []
+        for i, (chunk, score) in enumerate(results, 1):
+            context_parts.append(f"[{i}] {chunk['text'][:1000]}")
+            sources_md.append(
+                f"**[{i}] {chunk['source']}** (relevance: {score:.0%})\n\n"
+                f"{chunk['text'][:500]}..."
+            )
+        context = "\n\n".join(context_parts)
+        # Generate answer
+        answer = self.generate_answer(question, context)
+        # Format sources
+        sources = "\n\n---\n\n".join(sources_md)
+        return answer, sources
+# Initialize RAG system
+# Runs at module import, before the UI below is built, so the single shared
+# `rag` instance is fully loaded by the time any Gradio callback can fire.
+print("Initializing Nuremberg Trials AI...")
+rag = NurembergRAG()
+rag.load()
+def answer_question(question: str) -> tuple:
+    """Gradio interface function."""
+    return rag.query(question)
+# Example questions
+# Shown in the UI through gr.Examples; selecting one fills the question textbox.
+examples = [
+    "How many defendants were sentenced to death at Nuremberg?",
+    "What were the four counts in the Nuremberg indictment?",
+    "Who was the chief prosecutor for the United States?",
+    "What happened to Hermann Goering?",
+    "What was the legal basis for the Nuremberg trials?",
+    "Who were the judges at Nuremberg?",
+    "What was the verdict for Albert Speer?",
+    "What were the crimes against humanity?",
+]
+# Build Gradio interface
+with gr.Blocks(
+    title="Nuremberg Trials AI",
+    theme=gr.themes.Soft(),
+) as demo:
+    gr.Markdown(
+        """
+        # Nuremberg Trials AI
+        Ask questions about the Nuremberg Trials (1945-1946). This system uses
+        **Retrieval-Augmented Generation (RAG)** to search through 12,000+ passages from:
+        - **Harvard Law School Nuremberg Trials Project** - Full IMT transcript (17,268 pages)
+        - **Yale Avalon Project** - Judgments, indictments, charter documents
+        - **Wikipedia** - Defendant biographies
+        All answers are grounded in actual historical documents with source citations.
+        """
+    )
+    with gr.Row():
+        with gr.Column(scale=2):
+            question_input = gr.Textbox(
+                label="Your Question",
+                placeholder="e.g., How many defendants were sentenced to death?",
+                lines=2,
+            )
+            submit_btn = gr.Button("Ask", variant="primary")
+        with gr.Column(scale=1):
+            gr.Examples(
+                examples=examples,
+                inputs=question_input,
+                label="Example Questions",
+            )
+    with gr.Row():
+        with gr.Column():
+            answer_output = gr.Textbox(
+                label="Answer",
+                lines=8,
+                show_copy_button=True,
+            )
+    with gr.Accordion("Source Documents", open=False):
+        sources_output = gr.Markdown(label="Retrieved Passages")
+    submit_btn.click(
+        fn=answer_question,
+        inputs=question_input,
+        outputs=[answer_output, sources_output],
+    )
+    question_input.submit(
+        fn=answer_question,
+        inputs=question_input,
+        outputs=[answer_output, sources_output],
+    )
+    gr.Markdown(
+        """
+        ---
+        **About**: This project aims to make the historical record of the Nuremberg Trials
+        accessible through AI. Built with sentence-transformers, FAISS, and Mistral-7B.
+        **Data Sources**: [Harvard Nuremberg Project](https://nuremberg.law.harvard.edu/) |
+        [Yale Avalon Project](https://avalon.law.yale.edu/subject_menus/imt.asp)
+        **Code**: [GitHub](https://github.com/your-repo) |
+        **Dataset**: [HuggingFace](https://huggingface.co/datasets/Adherence/nuremberg-trials-rag)
+        """
+    )
+# Start the Gradio server only when this file is executed as a script.
+if __name__ == "__main__":
+    demo.launch()