Spaces:

Adherence
/

nuremberg-trials-ai

Sleeping

App Files Files Community

nuremberg-trials-ai / app.py

Adherence

Upload app.py with huggingface_hub

174cad1 verified 5 months ago

raw

history blame contribute delete

7.66 kB

	"""
	Nuremberg Trials AI - RAG-powered Q&A system
	Deployed on HuggingFace Spaces
	"""

	import os
	import json
	import gradio as gr
	import numpy as np
	import faiss
	from sentence_transformers import SentenceTransformer
	from huggingface_hub import hf_hub_download, InferenceClient
	from datasets import load_dataset

	# Configuration
	# HuggingFace dataset repo that is read for both the passage rows and the
	# prebuilt "faiss_index.bin" file.
	DATASET_ID = "Adherence/nuremberg-trials-rag"
	# sentence-transformers model name used to embed incoming queries.
	# NOTE(review): presumably the same model that produced the vectors stored in
	# the FAISS index - confirm against the index build script.
	EMBEDDING_MODEL = "all-MiniLM-L6-v2"
	# Default number of nearest neighbours requested from the index per question.
	TOP_K = 5

	# Try to get HF token from environment (set in Space secrets)
	# When unset this is None and the app runs in retrieval-only mode.
	HF_TOKEN = os.environ.get("HF_TOKEN")


	class NurembergRAG:
	    """Retrieval-augmented question answering over the Nuremberg Trials corpus.

	    The instance holds four pieces of state, all populated by ``load()``:
	    the query embedding model, the list of passage dicts, the FAISS index
	    and an optional hosted-LLM client (``None`` in retrieval-only mode).
	    """

	    def __init__(self):
	        """Create an empty, not-yet-loaded instance; call ``load()`` before use."""
	        self.index = None       # FAISS index read from faiss_index.bin
	        self.chunks = None      # list of {"text": ..., "source": ...} dicts
	        self.model = None       # SentenceTransformer used to embed queries
	        self.llm_client = None  # InferenceClient, or None without HF_TOKEN

	    def load(self):
	        """Load RAG components from HuggingFace.

	        Instantiates the embedding model, reads the "train" split of
	        ``DATASET_ID`` into ``self.chunks``, downloads ``faiss_index.bin`` from
	        the same dataset repo and, when ``HF_TOKEN`` is set, creates the LLM
	        client. Progress is reported with ``print``.
	        """
	        print("Loading Nuremberg Trials RAG system...")

	        # Load embedding model
	        print(" Loading embedding model...")
	        self.model = SentenceTransformer(EMBEDDING_MODEL)

	        # Load chunks from dataset
	        # NOTE(review): search() indexes this list with FAISS labels, so row
	        # order is assumed to match the order vectors were added to the index.
	        print(" Loading document chunks...")
	        dataset = load_dataset(DATASET_ID, split="train")
	        self.chunks = [
	            {"text": row["text"], "source": row["source"]}
	            for row in dataset
	        ]

	        # Load FAISS index
	        print(" Loading FAISS index...")
	        index_path = hf_hub_download(
	            repo_id=DATASET_ID,
	            filename="faiss_index.bin",
	            repo_type="dataset"
	        )
	        self.index = faiss.read_index(index_path)

	        # Initialize LLM client if token available
	        if HF_TOKEN:
	            print(" Initializing LLM client...")
	            self.llm_client = InferenceClient(token=HF_TOKEN)
	        else:
	            print(" No HF_TOKEN - running in retrieval-only mode")
	            self.llm_client = None

	        print(f" Loaded {len(self.chunks)} document chunks")
	        print("Ready!")

	    def search(self, query: str, top_k: int = TOP_K):
	        """Search for relevant chunks.

	        Args:
	            query: Free-text question to embed and look up.
	            top_k: Maximum number of passages to return.

	        Returns:
	            A list of ``(chunk, similarity)`` tuples in the order returned by
	            the index. ``similarity`` is ``1 / (1 + distance)``. The list is
	            empty when ``top_k`` is not positive or no valid neighbour exists.

	        Raises:
	            RuntimeError: If ``load()`` has not been called yet.
	        """
	        if self.model is None or self.index is None or self.chunks is None:
	            raise RuntimeError(
	                "NurembergRAG.load() must be called before search()"
	            )
	        if top_k <= 0:
	            return []

	        query_embedding = self.model.encode([query], convert_to_numpy=True)
	        distances, indices = self.index.search(
	            query_embedding.astype(np.float32), top_k
	        )

	        chunk_count = len(self.chunks)
	        results = []
	        for idx, distance in zip(indices[0], distances[0]):
	            # FAISS pads the result with label -1 when it finds fewer than
	            # top_k neighbours. A bare "idx < len" test lets -1 through and
	            # Python would then return the *last* chunk, so check both bounds.
	            if 0 <= idx < chunk_count:
	                # Maps a non-negative distance into (0, 1].
	                # NOTE(review): assumes the index reports non-negative
	                # distances (e.g. L2) - confirm how the index was built.
	                similarity = 1 / (1 + distance)
	                results.append((self.chunks[idx], similarity))

	        return results

	    def generate_answer(self, question: str, context: str) -> str:
	        """Generate answer using LLM with retrieved context.

	        Args:
	            question: The user's question.
	            context: Retrieved passages, already formatted as prompt text.

	        Returns:
	            The generated text, or a fallback message pointing at the
	            retrieved passages when no client is configured or the call fails.
	        """
	        if not self.llm_client:
	            # No LLM available - provide retrieval-only summary
	            return "Retrieved passages below contain the answer. (LLM generation requires HF_TOKEN)"

	        prompt = f"""You are an expert on the Nuremberg Trials. Answer the question based ONLY on the provided context from historical documents. If the context doesn't contain enough information, say so. Be concise.

	Context from Nuremberg Trial documents:
	{context}

	Question: {question}

	Answer:"""

	        try:
	            response = self.llm_client.text_generation(
	                prompt,
	                model="HuggingFaceH4/zephyr-7b-beta",
	                max_new_tokens=400,
	                temperature=0.3,
	            )
	            return response
	        except Exception as e:
	            # Deliberately broad: any failure of the remote call degrades to
	            # retrieval-only output instead of breaking the UI request.
	            return f"Retrieved passages below contain the answer. (LLM error: {str(e)[:100]})"

	    def query(self, question: str) -> tuple:
	        """Full RAG pipeline: retrieve + generate.

	        Args:
	            question: The user's question; ``None``, empty and
	                whitespace-only input are all treated as "no question".

	        Returns:
	            ``(answer, sources_markdown)``. ``sources_markdown`` is an empty
	            string when there is no question or nothing was retrieved.
	        """
	        if not question or not question.strip():
	            return "Please enter a question.", ""

	        # Retrieve relevant passages
	        results = self.search(question, TOP_K)

	        if not results:
	            return "No relevant information found.", ""

	        context_chars = 1000  # per-passage budget sent to the LLM
	        preview_chars = 600   # per-passage budget shown in the UI

	        # Format context for LLM
	        context_parts = []
	        sources_md = []

	        for i, (chunk, score) in enumerate(results, 1):
	            text = chunk["text"]
	            context_parts.append(f"[{i}] {text[:context_chars]}")

	            # Only mark the preview as truncated when it really is.
	            preview = text[:preview_chars]
	            if len(text) > preview_chars:
	                preview += "..."
	            sources_md.append(
	                f"[{i}] {chunk['source']} (relevance: {score:.0%})\n\n"
	                f"{preview}"
	            )

	        context = "\n\n".join(context_parts)

	        # Generate answer
	        answer = self.generate_answer(question, context)

	        # Format sources
	        sources = "\n\n---\n\n".join(sources_md)

	        return answer, sources


	# Initialize RAG system
	# Runs at import time: a single shared NurembergRAG instance is created and
	# load() fetches the embedding model, dataset rows and FAISS index before the
	# UI below is built. answer_question() reads this module-level `rag`.
	print("Initializing Nuremberg Trials AI...")
	rag = NurembergRAG()
	rag.load()


	def answer_question(question: str) -> tuple:
	    """Gradio callback: run the shared RAG pipeline for one question.

	    Returns the ``(answer, sources)`` pair produced by ``rag.query``, which
	    the UI maps onto the answer textbox and the sources panel.
	    """
	    answer, sources = rag.query(question)
	    return answer, sources


	# Example questions
	# Passed to gr.Examples below; selecting one fills the question textbox.
	examples = [
	"How many defendants were sentenced to death at Nuremberg?",
	"What were the four counts in the Nuremberg indictment?",
	"Who was the chief prosecutor for the United States?",
	"What happened to Hermann Goering?",
	"What was the legal basis for the Nuremberg trials?",
	"What were the Nazi medical experiments?",
	"What was the Einsatzgruppen trial about?",
	"Who was prosecuted in the IG Farben trial?",
	"What was the verdict for Albert Speer?",
	"What were crimes against humanity as defined at Nuremberg?",
	]

	# Build Gradio interface
	# Layout: intro text, a row with the question box + examples, a row with the
	# answer box, a collapsible sources panel, then a footer.
	with gr.Blocks(
	    title="Nuremberg Trials AI",
	    theme=gr.themes.Soft(),
	) as demo:
	    gr.Markdown(
	        """
	# Nuremberg Trials AI

	Ask questions about the Nuremberg Trials (1945-1949). This system uses
	Retrieval-Augmented Generation (RAG) to search through 120,000+ passages from:

	- Harvard Law School - All 13 trials, 153,010 pages (IMT + 12 NMT trials)
	- Yale Avalon Project - 857 documents, 11.3M words (judgments, charter, 22 volumes of proceedings)
	- Wikipedia - 49 pages (defendants, prosecutors, organizations)

	All answers are grounded in actual historical documents with source citations.
	"""
	    )

	    with gr.Row():
	        with gr.Column(scale=2):
	            question_input = gr.Textbox(
	                label="Your Question",
	                placeholder="e.g., How many defendants were sentenced to death?",
	                lines=2,
	            )
	            submit_btn = gr.Button("Ask", variant="primary")

	        with gr.Column(scale=1):
	            gr.Examples(
	                examples=examples,
	                inputs=question_input,
	                label="Example Questions",
	            )

	    with gr.Row():
	        with gr.Column():
	            answer_output = gr.Textbox(
	                label="Answer",
	                lines=8,
	                show_copy_button=True,
	            )

	    # NOTE(review): the original nesting was lost; the accordion is placed
	    # directly under the Blocks root, below the answer row - confirm.
	    with gr.Accordion("Source Documents", open=False):
	        sources_output = gr.Markdown(label="Retrieved Passages")

	    # Clicking "Ask" and pressing Enter in the textbox run the same handler
	    # with the same inputs and outputs.
	    for register in (submit_btn.click, question_input.submit):
	        register(
	            fn=answer_question,
	            inputs=question_input,
	            outputs=[answer_output, sources_output],
	        )

	    # Plain "|" separators: "\|" in a non-raw string literal is an invalid
	    # escape sequence, and both forms render the same in Markdown.
	    # TODO(review): the GitHub link below is still a placeholder URL.
	    gr.Markdown(
	        """
	---
	About: This project aims to make the historical record of the Nuremberg Trials
	accessible through AI. Built with sentence-transformers, FAISS, and Mistral-7B.

	Data Sources: [Harvard Nuremberg Project](https://nuremberg.law.harvard.edu/) |
	[Yale Avalon Project](https://avalon.law.yale.edu/subject_menus/imt.asp)

	Code: [GitHub](https://github.com/your-repo) |
	Dataset: [HuggingFace](https://huggingface.co/datasets/Adherence/nuremberg-trials-rag)
	"""
	    )

	# Only start the server when executed as a script, not when imported.
	if __name__ == "__main__":
	    demo.launch()