martazavro committed
Commit 6b6d1fd · 1 Parent(s): a87ef04
Files changed (2)
  1. app.py +1706 -0
  2. requirements.txt +357 -0
app.py ADDED
@@ -0,0 +1,1706 @@
+ # -*- coding: utf-8 -*-
+ """Scimplify.ipynb
+
+ Automatically generated by Colab.
+
+ Original file is located at
+     https://colab.research.google.com/drive/11L85VXrmvxrfXd6A9FGuJjI53nVtM0tN
+
+ # Scimplify
+
+ A NeuroAI paper simplifier. You paste a paragraph and get a plain-language
+ explanation back, with citations to the retrieved chunks the explanation
+ came from. The system refuses to answer if it can't ground the claims.
+
+ ## 1. Setup
+ """
+
+ import os, json, re, io, textwrap, time
+ from pathlib import Path
+ from collections import Counter, defaultdict
+ from typing import List, Dict, Tuple, Optional
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+
+ import requests
+ import numpy as np
+ import pandas as pd
+ import matplotlib.pyplot as plt
+
+ import openai
+ import chromadb
+ from chromadb.utils import embedding_functions
+ import gradio as gr
+ from sentence_transformers import SentenceTransformer
+ from PyPDF2 import PdfReader
+
+
+ # Fail fast if the key is missing; openai.OpenAI() reads it from the environment.
+ assert os.environ.get("OPENAI_API_KEY"), "Set OPENAI_API_KEY env var"
+
+ client_oai = openai.OpenAI()
+
+ GENERATOR_MODEL = "gpt-4o-mini"
+ JUDGE_MODEL = "gpt-4o-mini"
+ GENERATOR_TEMPERATURE = 0.2
+ JUDGE_TEMPERATURE = 0.3
+ JUDGE_N_SAMPLES = 3
+ BOOTSTRAP_N = 2000
+ BOOTSTRAP_ALPHA = 0.05
+ _rng = np.random.default_rng(7)
+
+ RUN_EXPERIMENTS = False     # re-run experiments instead of loading cached results
+ LIVE_SEMANTIC_CHECK = True  # adds ~1 s of latency per query
+ JUDGE_PARALLELISM = 2       # cap on concurrent judge calls (rate limit)
+
+ print(f"generator: {GENERATOR_MODEL}")
+ print(f"judge: {JUDGE_MODEL}")
+ print(f"experiments: {'WILL RE-RUN' if RUN_EXPERIMENTS else 'using cached results'}")
+ print(f"live semantic check: {'on' if LIVE_SEMANTIC_CHECK else 'off'}")
+
+ """## 2. Data loading"""
+
+ REPO_RAW_BASE = "https://raw.githubusercontent.com/martazavro/scimplify_data/main"
+ LOCAL_DATA_DIR = Path("./data")
+
+ def _load_json(filename):
+     url = f"{REPO_RAW_BASE}/{filename}"
+     try:
+         r = requests.get(url, timeout=10)
+         r.raise_for_status()
+         print(f"loaded {filename} from repo")
+         return r.json()
+     except Exception as e:
+         local = LOCAL_DATA_DIR / filename
+         if local.exists():
+             print(f"repo fetch failed ({e.__class__.__name__}), loaded {filename} from local")
+             return json.loads(local.read_text())
+         raise FileNotFoundError(
+             f"Could not load {filename}. Set REPO_RAW_BASE correctly "
+             f"or place the file in ./data/{filename}"
+         )
+
+ neuroai_concepts = _load_json("concepts.json")
+ print(f"loaded {len(neuroai_concepts)} concepts")
+
+ def validate_validation_set(vs):
+     items = vs["items"]
+     ids = [x["id"] for x in items]
+     assert len(set(ids)) == len(ids), "duplicate ids"
+     required = {"id", "passage", "source", "key_terms", "category", "difficulty", "reference_explanation"}
+     valid_cats = {"concepts_only", "recent_paper", "both", "neither"}
+     for item in items:
+         missing = required - set(item.keys())
+         assert not missing, f"item {item.get('id')} missing {missing}"
+         assert item["category"] in valid_cats
+     cat_counts = Counter(x["category"] for x in items)
+     print(f"validation set: {len(items)} items")
+     print(f"  by category: {dict(cat_counts)}")
+
+ validation_set = _load_json("validation_set.json")
+ validate_validation_set(validation_set)
+
+ """## 3. PDF extraction and chunking"""
+
+ def extract_text_from_pdf(pdf_file):
+     reader = PdfReader(pdf_file)
+     text = ""
+     for page in reader.pages:
+         page_text = page.extract_text()
+         if page_text:
+             text += page_text + "\n"
+     return text.strip()
+
+ def chunk_text(text, chunk_size=300, overlap=50):
+     """Greedy paragraph packing into ~chunk_size-word chunks with word overlap."""
+     paragraphs = [p.strip() for p in re.split(r"\n\s*\n", text) if p.strip()]
+     chunks, current, current_len = [], [], 0
+     for para in paragraphs:
+         words = para.split()
+         n = len(words)
+         if n > chunk_size:
+             # flush the buffer, then split the oversized paragraph by stride
+             if current:
+                 chunks.append(" ".join(current))
+             for i in range(0, n, chunk_size - overlap):
+                 chunk = words[i:i+chunk_size]
+                 if len(chunk) > 30:
+                     chunks.append(" ".join(chunk))
+             current = []; current_len = 0
+         elif current_len + n > chunk_size:
+             chunks.append(" ".join(current))
+             tail = current[-overlap:] if len(current) > overlap else current
+             current = list(tail) + words; current_len = len(current)
+         else:
+             current.extend(words); current_len += n
+     if current and len(current) > 30:
+         chunks.append(" ".join(current))
+     return chunks
+
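+ # Quick sanity check of the chunker on synthetic text (illustrative only, not
+ # repo data): one 700-word paragraph at chunk_size=300/overlap=50 is split in
+ # strides of 250 words, so 3 chunks with the first one 300 words long.
+ _demo_chunks = chunk_text(" ".join(f"w{i}" for i in range(700)))
+ print(f"chunker demo: {len(_demo_chunks)} chunks, first has {len(_demo_chunks[0].split())} words")
+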
+ """## 4. Vector store setup"""
+
+ chroma_client = chromadb.Client()
+ ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")
+
+ def reset_concepts_collection():
+     try:
+         chroma_client.delete_collection("neuroai_concepts")
+     except Exception:
+         pass
+     coll = chroma_client.create_collection(name="neuroai_concepts", embedding_function=ef)
+     for entry in neuroai_concepts:
+         doc = (
+             f"Concept: {entry['concept']}\n"
+             f"Definition: {entry['definition']}\n"
+             f"Context: {entry['context']}\n"
+             f"Typically found in: {entry['typical_usage']}"
+         )
+         coll.add(
+             documents=[doc],
+             ids=[entry["id"]],
+             metadatas=[{"concept_name": entry["concept"], "concept_id": entry["id"]}]
+         )
+     return coll
+
+ def reset_papers_collection():
+     try:
+         chroma_client.delete_collection("neuroai_papers")
+     except Exception:
+         pass
+     return chroma_client.create_collection(name="neuroai_papers", embedding_function=ef)
+
+ concepts_collection = reset_concepts_collection()
+ papers_collection = reset_papers_collection()
+ print(f"concepts: {concepts_collection.count()}, papers: {papers_collection.count()}")
+
+ """## 5. Recent papers ingestion"""
+
+ PAPER_CHUNKS_URL = f"{REPO_RAW_BASE}/paper_chunks.json"
+
+ def load_paper_chunks():
+     r = requests.get(PAPER_CHUNKS_URL, timeout=15)
+     r.raise_for_status()
+     return r.json()
+
+ def ingest_paper_chunks_from_json():
+     chunks = load_paper_chunks()
+     if not chunks:
+         print("paper_chunks.json was empty")
+         return 0
+
+     documents = [c["text"] for c in chunks]
+     ids = [c["chunk_id"] for c in chunks]
+     metadatas = [{
+         "source_name": c["source_name"],
+         "source_type": c["source_type"],
+         "arxiv_id": c["arxiv_id"],
+         "title": c["title"],
+         "chunk_idx": c["chunk_idx"],
+         "chunk_id": c["chunk_id"],
+     } for c in chunks]
+
+     papers_collection.add(documents=documents, ids=ids, metadatas=metadatas)
+
+     by_paper = {}
+     for c in chunks:
+         by_paper[c["arxiv_id"]] = by_paper.get(c["arxiv_id"], 0) + 1
+     for aid, n in by_paper.items():
+         print(f"  {aid}: {n} chunks")
+     print(f"papers_collection now has {papers_collection.count()} total chunks")
+     return len(chunks)
+
+ ingest_paper_chunks_from_json()
+
+ """## 6. arXiv ingestion"""
+
+ import arxiv
+
+ def _existing_arxiv_ids():
+     if papers_collection.count() == 0:
+         return set()
+     metas = papers_collection.get()["metadatas"]
+     return {m.get("arxiv_id") for m in metas if m.get("arxiv_id")}
+
+
+ def ingest_from_arxiv(query="neuroAI OR (neural AND brain AND deep learning)",
+                       max_results=10,
+                       sort_by_recent=True,
+                       verbose=True):
+     """Search arXiv, download PDFs, chunk them, add to papers_collection.
+
+     Returns dict with stats: {n_papers, n_chunks, n_skipped, errors}.
+     Already-ingested papers (matched by arxiv_id) are skipped.
+     """
+     sort_by = arxiv.SortCriterion.SubmittedDate if sort_by_recent else arxiv.SortCriterion.Relevance
+     arxiv_client = arxiv.Client(page_size=20, delay_seconds=3.0, num_retries=3)
+     search = arxiv.Search(query=query, max_results=max_results, sort_by=sort_by)
+
+     existing = _existing_arxiv_ids()
+     download_dir = Path("./arxiv_papers")
+     download_dir.mkdir(exist_ok=True)
+
+     n_papers, n_chunks, n_skipped = 0, 0, 0
+     errors = []
+
+     for result in arxiv_client.results(search):
+         # arxiv.org/abs/2509.23566v1 -> "2509.23566"
+         full_id = result.entry_id.rsplit("/", 1)[-1]
+         arxiv_id = full_id.split("v")[0]
+
+         if arxiv_id in existing:
+             n_skipped += 1
+             if verbose:
+                 print(f"  skip {arxiv_id} (already ingested)")
+             continue
+
+         try:
+             if verbose:
+                 print(f"  fetching {arxiv_id}: {result.title[:60]}...")
+             pdf_path = result.download_pdf(dirpath=str(download_dir),
+                                            filename=f"{arxiv_id}.pdf")
+             text = extract_text_from_pdf(pdf_path)
+             chunks = chunk_text(text)
+             if not chunks:
+                 errors.append(f"{arxiv_id}: no chunks extracted")
+                 continue
+
+             chunk_ids = [f"arxiv_{arxiv_id.replace('.', '_')}::c{i}" for i in range(len(chunks))]
+             metadatas = [{
+                 "source_name": result.title,
+                 "source_type": "arxiv_paper",
+                 "arxiv_id": arxiv_id,
+                 "title": result.title,
+                 "chunk_idx": i,
+                 "chunk_id": chunk_ids[i],
+             } for i in range(len(chunks))]
+
+             papers_collection.add(documents=chunks, ids=chunk_ids, metadatas=metadatas)
+             existing.add(arxiv_id)  # avoid double-add within the same call
+             n_papers += 1
+             n_chunks += len(chunks)
+             if verbose:
+                 print(f"    -> added {len(chunks)} chunks")
+         except Exception as e:
+             errors.append(f"{arxiv_id}: {e.__class__.__name__}: {e}")
+             if verbose:
+                 print(f"    ERROR: {e}")
+
+     summary = {
+         "n_papers": n_papers,
+         "n_chunks": n_chunks,
+         "n_skipped": n_skipped,
+         "errors": errors,
+         "total_in_kb": papers_collection.count(),
+     }
+     if verbose:
+         print(f"\ningested {n_papers} papers ({n_chunks} chunks), skipped {n_skipped} duplicates")
+         if errors:
+             print(f"errors: {len(errors)}")
+         print(f"total in knowledge base: {summary['total_in_kb']} chunks")
+     return summary
+
+ ingest_from_arxiv(query="NeuroAI", max_results=2)   # smoke test
+ ingest_from_arxiv(query="NeuroAI", max_results=15)  # full run; already-ingested IDs are skipped
+
+ """## 7. Retrieval variants"""
+
+ def _flexible_last_word(word):
+     """Loosen the final word so close morphological variants still match."""
+     if len(word) < 4:
+         return re.escape(word)
+     stem = re.escape(word[:-2])
+     return stem + r"[a-z]{0,4}"
+
+ def build_concept_patterns(concept_entry):
+     name = concept_entry["concept"]
+     abbrev_match = re.search(r"\(([^)]+)\)", name)
+     abbrev = abbrev_match.group(1) if abbrev_match else None
+     base = re.sub(r"\s*\([^)]+\)", "", name).strip()
+
+     patterns = []
+     words = base.split()
+     if len(words) == 1:
+         long_re = r"\b" + _flexible_last_word(words[0]) + r"\b"
+     else:
+         parts = [re.escape(w) for w in words[:-1]] + [_flexible_last_word(words[-1])]
+         long_re = r"\b" + r"\s+".join(parts) + r"\b"
+     patterns.append(re.compile(long_re, re.IGNORECASE))
+
+     if abbrev:
+         patterns.append(re.compile(r"\b" + re.escape(abbrev) + r"s?\b"))
+     return patterns
+
+ CONCEPT_PATTERNS = [(entry, build_concept_patterns(entry)) for entry in neuroai_concepts]
+
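+ # Illustrative check of the pattern builder on a hypothetical entry (not from
+ # concepts.json): the loosened last word tolerates suffix variants, and the
+ # parenthesized abbreviation gets its own case-sensitive pattern.
+ _demo_pats = build_concept_patterns({"concept": "Neural Representation (NR)"})
+ print([bool(p.search("Neural representations and NRs emerge early.")) for p in _demo_pats])
+ # -> [True, True]
+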
+
+ def _concept_doc_text(entry):
+     return (
+         f"Concept: {entry['concept']}\n"
+         f"Definition: {entry['definition']}\n"
+         f"Context: {entry['context']}\n"
+         f"Typically found in: {entry['typical_usage']}"
+     )
+
+
+ def regex_retrieve(passage):
+     hits = []
+     for entry, patterns in CONCEPT_PATTERNS:
+         if any(p.search(passage) for p in patterns):
+             hits.append({
+                 "type": "regex_concept",
+                 "concept_name": entry["concept"],
+                 "concept_id": entry["id"],
+                 "chunk_id": entry["id"],
+                 "content": _concept_doc_text(entry),
+                 "distance": 0.0,
+                 "source_method": "regex",
+             })
+     return hits
+
+
+ def retrieve_concepts_embedding(passage, n_results=3):
+     results = concepts_collection.query(query_texts=[passage], n_results=n_results)
+     out = []
+     for doc, meta, dist in zip(results["documents"][0], results["metadatas"][0], results["distances"][0]):
+         out.append({
+             "type": "concept",
+             "concept_name": meta["concept_name"],
+             "concept_id": meta.get("concept_id"),
+             "chunk_id": meta.get("concept_id"),
+             "content": doc,
+             "distance": round(dist, 3),
+             "source_method": "embedding",
+         })
+     return out
+
+
+ def retrieve_paper_chunks(passage, n_results=3):
+     if papers_collection.count() == 0:
+         return []
+     results = papers_collection.query(
+         query_texts=[passage],
+         n_results=min(n_results, papers_collection.count())
+     )
+     out = []
+     for doc, meta, dist in zip(results["documents"][0], results["metadatas"][0], results["distances"][0]):
+         out.append({
+             "type": "paper_chunk",
+             "source_name": meta["source_name"],
+             "source_type": meta["source_type"],
+             "chunk_id": meta.get("chunk_id"),
+             "content": doc,
+             "distance": round(dist, 3),
+             "source_method": "embedding",
+         })
+     return out
+
+
+ def hybrid_retrieve_concepts(passage, n_embedding=3, max_total=6):
+     """Regex hits first (exact mentions, distance 0), then embedding hits to fill."""
+     rgx = regex_retrieve(passage)
+     seen_names = {h["concept_name"] for h in rgx}
+     out = list(rgx)
+
+     if len(out) < max_total:
+         emb = retrieve_concepts_embedding(passage, n_results=n_embedding)
+         for hit in emb:
+             if hit["concept_name"] not in seen_names:
+                 out.append(hit)
+                 seen_names.add(hit["concept_name"])
+             else:
+                 for r in out:
+                     if r["concept_name"] == hit["concept_name"]:
+                         r["source_method"] = "both"
+                         break
+             if len(out) >= max_total:
+                 break
+     return out
+
+
+ def retrieve_for_variant(passage, variant, n_concepts=3, n_papers=3):
+     if variant == "no_rag":
+         return [], []
+     elif variant == "embedding_only":
+         return (retrieve_concepts_embedding(passage, n_results=n_concepts),
+                 retrieve_paper_chunks(passage, n_results=n_papers))
+     elif variant == "regex_only":
+         return (regex_retrieve(passage),
+                 retrieve_paper_chunks(passage, n_results=n_papers))
+     elif variant == "hybrid":
+         return (hybrid_retrieve_concepts(passage, n_embedding=n_concepts),
+                 retrieve_paper_chunks(passage, n_results=n_papers))
+     else:
+         raise ValueError(f"unknown variant: {variant}")
+
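+ # Illustrative smoke test (left commented out: the hits depend on whatever is
+ # in the live knowledge base at this point, so the output is not reproducible):
+ # _c, _p = retrieve_for_variant("predictive coding in visual cortex", "hybrid")
+ # print(len(_c), "concept hits,", len(_p), "paper chunks")
+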
+ """## 8. Citation-enforced generation with semantic guard"""
+
+ CITED_SYSTEM_PROMPT = """You are a scientific reading assistant that helps people understand passages from NeuroAI research papers.
+
+ You have access to retrieved context. Each source has a stable ID in square brackets like [c004] (for a concept definition) or [arxiv_2511_12345::c3] (for a paper chunk).
+
+ Your job:
+ 1. Read the passage.
+ 2. Rewrite it in plain language an undergraduate could follow.
+ 3. For EVERY factual sentence in your explanation, append one or more citations in square brackets, drawn ONLY from the IDs of the retrieved sources shown to you.
+ 4. Do not invent citation IDs. Do not cite sources you were not shown.
+ 5. If the retrieved context does not contain enough information to answer faithfully, output EXACTLY this string and nothing else:
+ I don't have enough evidence in the retrieved context.
+
+ Format:
+ **Key terms:** short definitions of technical terms, each with its citation
+ **Plain-language version:** the passage rewritten clearly, with citations on every factual sentence
+ **What this means in context:** 1-2 sentences on why this matters, with citations
+ """
+
+ ABSTAIN_MESSAGE = "I don't have enough evidence in the retrieved context."
+ CITATION_PATTERN = re.compile(r"\[([a-zA-Z0-9_\-:]+)\]")
+ SEMANTIC_FAIL_THRESHOLD = 0.5
+
+
+ def _format_context_block(concept_results, paper_results):
+     lines = []
+     if concept_results:
+         lines.append("CONCEPT DEFINITIONS:")
+         for r in concept_results:
+             cid = r.get("chunk_id") or r.get("concept_id")
+             lines.append(f"\n[{cid}] {r['content']}")
+         lines.append("---")
+     if paper_results:
+         lines.append("\nPAPER/ARTICLE CONTEXT:")
+         for r in paper_results:
+             cid = r.get("chunk_id")
+             lines.append(f"\n[{cid}] (from {r['source_name']}): {r['content']}")
+         lines.append("---")
+     if not concept_results and not paper_results:
+         lines.append("(no context retrieved)")
+     return "\n".join(lines)
+
+
+ def _collect_allowed_ids(concept_results, paper_results):
+     ids = set()
+     for r in concept_results + paper_results:
+         cid = r.get("chunk_id") or r.get("concept_id")
+         if cid:
+             ids.add(cid)
+     return ids
+
+
+ def _build_chunk_lookup(concept_results, paper_results):
+     """Map citation_id -> chunk content. Used by the semantic check."""
+     lookup = {}
+     for r in concept_results + paper_results:
+         cid = r.get("chunk_id") or r.get("concept_id")
+         if cid:
+             lookup[cid] = r["content"]
+     return lookup
+
+
+ def generate_cited_explanation(passage, concept_results, paper_results, model=None):
+     model = model or GENERATOR_MODEL
+     context_block = _format_context_block(concept_results, paper_results)
+     user_msg = f"{context_block}\n\nPASSAGE TO EXPLAIN:\n{passage}"
+     resp = client_oai.chat.completions.create(
+         model=model,
+         temperature=GENERATOR_TEMPERATURE,
+         messages=[
+             {"role": "system", "content": CITED_SYSTEM_PROMPT},
+             {"role": "user", "content": user_msg},
+         ],
+     )
+     return resp.choices[0].message.content
+
+
+ def validate_citations(answer: str, allowed_ids: set) -> Tuple[bool, List[str]]:
+     """Lexical guard: every citation ID in the answer must be in the allowed set."""
+     if ABSTAIN_MESSAGE in answer:
+         return True, []
+     cited = CITATION_PATTERN.findall(answer)
+     issues = []
+     if not cited:
+         issues.append("No citations found in non-abstain answer")
+     for cid in cited:
+         if cid not in allowed_ids:
+             issues.append(f"Invalid citation: {cid}")
+     return len(issues) == 0, issues
+
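+ # Illustrative check of the lexical guard with made-up IDs: citations outside
+ # the allowed set are reported; an abstention passes by construction.
+ print(validate_citations("Neurons predict input [c001].", {"c001", "c002"}))
+ # -> (True, [])
+ print(validate_citations("Neurons predict input [c999].", {"c001", "c002"}))
+ # -> (False, ['Invalid citation: c999'])
+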
+
+ def _split_into_sentences(text):
+     """Cheap sentence splitter that keeps citation brackets attached."""
+     # split on . ! ? followed by space and a capital, keeping the punctuation
+     parts = re.split(r"(?<=[.!?])\s+(?=[A-Z*])", text.strip())
+     return [p.strip() for p in parts if p.strip()]
+
+
+ def _strip_citations(sentence):
+     return CITATION_PATTERN.sub("", sentence).strip()
+
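+ # Illustrative: the splitter keeps each citation bracket with its own sentence.
+ print(_split_into_sentences("Neurons adapt quickly [c001]. This matters for training [c002]."))
+ # -> ['Neurons adapt quickly [c001].', 'This matters for training [c002].']
+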
+
+ def check_sentence_supported(sentence_text, cited_chunks):
+     """Verify one sentence against the chunks it cites (verifier is defined in section 10;
+     the forward reference resolves because calls only happen after that cell runs)."""
+     claim = _strip_citations(sentence_text)
+     if len(claim) < 10 or not cited_chunks:
+         return {"label": "skipped", "reason": "no claim or no chunks"}
+     evidence = "\n\n".join(f"[{cid}]: {text}" for cid, text in cited_chunks)
+     return verify_claim_against_evidence(claim, [evidence])
+
+
+ def semantic_per_sentence_check(answer, chunk_lookup):
+     """Label every cited sentence as supported / contradicted / insufficient."""
+     if ABSTAIN_MESSAGE in answer:
+         return []
+     sentences = _split_into_sentences(answer)
+     findings = []
+     for sent in sentences:
+         cited_ids = CITATION_PATTERN.findall(sent)
+         if not cited_ids:
+             continue
+         cited_chunks = [(cid, chunk_lookup[cid]) for cid in cited_ids if cid in chunk_lookup]
+         if not cited_chunks:
+             continue
+         result = check_sentence_supported(sent, cited_chunks)
+         findings.append({
+             "sentence": sent,
+             "citations": cited_ids,
+             "label": result["label"],
+             "reason": result["reason"],
+         })
+     return findings
+
+
+ def annotate_unsupported_sentences(answer, findings):
+     """Mark unsupported sentences in the rendered output."""
+     for f in findings:
+         if f["label"] in ("contradicted", "insufficient"):
+             marker = "⚠️ "
+             if marker not in f["sentence"]:
+                 answer = answer.replace(f["sentence"], marker + f["sentence"], 1)
+     return answer
+
+
+ def generate_with_citation_guard(passage, concept_results, paper_results, model=None,
+                                  allow_no_context_bypass=False,
+                                  do_semantic_check=None):
+     """Generate a cited explanation, then apply the lexical and semantic guards."""
+     do_semantic_check = (do_semantic_check if do_semantic_check is not None
+                          else LIVE_SEMANTIC_CHECK)
+
+     if allow_no_context_bypass and not concept_results and not paper_results:
+         resp = client_oai.chat.completions.create(
+             model=model or GENERATOR_MODEL,
+             temperature=GENERATOR_TEMPERATURE,
+             messages=[
+                 {"role": "system", "content": "You are a scientific reading assistant. Explain the given passage in plain language that an undergraduate could follow. Be concise."},
+                 {"role": "user", "content": f"PASSAGE:\n{passage}"},
+             ],
+         )
+         return {
+             "answer": resp.choices[0].message.content,
+             "valid_citations": None,
+             "guard_triggered": False,
+             "issues": [],
+             "abstained": False,
+             "semantic_findings": [],
+             "semantic_fail_rate": np.nan,
+         }
+
+     raw = generate_cited_explanation(passage, concept_results, paper_results, model=model)
+     allowed_ids = _collect_allowed_ids(concept_results, paper_results)
+     ok, issues = validate_citations(raw, allowed_ids)
+
+     # lexical guard
+     if not ok:
+         return {
+             "answer": ABSTAIN_MESSAGE,
+             "valid_citations": False,
+             "guard_triggered": True,
+             "issues": issues,
+             "abstained": True,
+             "raw_rejected": raw,
+             "semantic_findings": [],
+             "semantic_fail_rate": np.nan,
+         }
+
+     # semantic per-sentence check
+     findings = []
+     semantic_fail_rate = np.nan
+     if do_semantic_check and ABSTAIN_MESSAGE not in raw:
+         chunk_lookup = _build_chunk_lookup(concept_results, paper_results)
+         findings = semantic_per_sentence_check(raw, chunk_lookup)
+         if findings:
+             n_failed = sum(1 for f in findings if f["label"] in ("contradicted", "insufficient"))
+             semantic_fail_rate = n_failed / len(findings)
+
+             if semantic_fail_rate > SEMANTIC_FAIL_THRESHOLD:
+                 return {
+                     "answer": ABSTAIN_MESSAGE,
+                     "valid_citations": True,
+                     "guard_triggered": True,
+                     "issues": [f"semantic check failed: {n_failed}/{len(findings)} sentences unsupported"],
+                     "abstained": True,
+                     "raw_rejected": raw,
+                     "semantic_findings": findings,
+                     "semantic_fail_rate": semantic_fail_rate,
+                 }
+
+         raw = annotate_unsupported_sentences(raw, findings)
+
+     return {
+         "answer": raw,
+         "valid_citations": True,
+         "guard_triggered": False,
+         "issues": [],
+         "abstained": ABSTAIN_MESSAGE in raw,
+         "semantic_findings": findings,
+         "semantic_fail_rate": semantic_fail_rate,
+     }
+
+ """## 9. LLM-as-judge metrics"""
+
+ def _coerce_score(x):
+     try:
+         v = int(float(x))
+     except Exception:
+         v = 0
+     return max(0, min(2, v))
+
+
+ def _single_judge_call(system_prompt, user_prompt):
+     """One judge sample; returns {"score": 0-2 or None, "reason": str}."""
+     try:
+         resp = client_oai.chat.completions.create(
+             model=JUDGE_MODEL,
+             temperature=JUDGE_TEMPERATURE,
+             response_format={"type": "json_object"},
+             messages=[
+                 {"role": "system", "content": system_prompt},
+                 {"role": "user", "content": user_prompt},
+             ],
+         )
+         data = json.loads(resp.choices[0].message.content)
+         return {
+             "score": _coerce_score(data.get("score", 0)),
+             "reason": str(data.get("reason", "")).strip(),
+         }
+     except Exception as e:
+         return {"score": None, "reason": f"ERROR: {e}"}
+
+
+ def _judge_call_parallel(system_prompt, user_prompt, n=None):
+     """Run n judge calls in parallel via ThreadPoolExecutor."""
+     n = n or JUDGE_N_SAMPLES
+     results = [None] * n
+     with ThreadPoolExecutor(max_workers=min(n, JUDGE_PARALLELISM)) as ex:
+         futures = {ex.submit(_single_judge_call, system_prompt, user_prompt): i
+                    for i in range(n)}
+         for fut in as_completed(futures):
+             i = futures[fut]
+             results[i] = fut.result()
+     return results
+
+
+ def _aggregate(runs):
+     valid = [r for r in runs if r["score"] is not None]
+     if not valid:
+         return {"score": None, "reasons": [r["reason"] for r in runs], "n_valid": 0}
+     return {
+         "score": sum(r["score"] for r in valid) / len(valid),
+         "reasons": [r["reason"] for r in valid],
+         "n_valid": len(valid),
+     }
+
+
+ CORRECTNESS_SYSTEM = """You are evaluating answer correctness for a question about a NeuroAI paper passage.
+
+ Given a passage, a reference explanation (gold-standard), and a system explanation, score the system explanation's correctness using ONLY the information in the passage and reference.
+
+ Return ONLY a JSON object:
+ {"score": <int 0/1/2>, "reason": "<one sentence>"}
+
+ Scoring scale:
+ - 0 = wrong (contradicts the passage or says something incorrect)
+ - 1 = partly correct (captures some but not all of the main idea, or adds unsupported claims)
+ - 2 = correct (faithful to what the passage actually says)
+ """
+
+ def score_correctness(passage, reference, candidate):
+     user = f"PASSAGE:\n{passage}\n\nREFERENCE:\n{reference}\n\nSYSTEM EXPLANATION:\n{candidate}"
+     runs = _judge_call_parallel(CORRECTNESS_SYSTEM, user)
+     return _aggregate(runs)
+
+
+ EVIDENCE_SYSTEM = """You are evaluating whether a system explanation's key claims are supported by retrieved context.
+
+ Given a passage, the retrieved context that was shown to the system, and the system's explanation, score whether the explanation's factual claims are well-supported by the retrieved context.
+
+ Return ONLY a JSON object:
+ {"score": <int 0/1/2>, "reason": "<one sentence>"}
+
+ Scoring scale:
+ - 0 = unsupported (most claims cannot be found in retrieved context)
+ - 1 = partly supported (some claims supported, others require outside knowledge)
+ - 2 = well supported (claims are traceable to retrieved context)
+
+ If the retrieved context is empty (no RAG baseline), score 0.
+ """
+
+ def score_evidence_support(passage, retrieved_context, candidate):
+     user = f"PASSAGE:\n{passage}\n\nRETRIEVED CONTEXT:\n{retrieved_context}\n\nSYSTEM EXPLANATION:\n{candidate}"
+     runs = _judge_call_parallel(EVIDENCE_SYSTEM, user)
+     return _aggregate(runs)
+
+
+ CITATION_SYSTEM = """You are evaluating whether citations in a system explanation are faithful.
+
+ The system was asked to cite each factual sentence with an ID from the retrieved context (like [c004] or [arxiv_2511_12345::c3]). Given the retrieved context and the system explanation with citations, score whether the citations are relevant and the cited material actually supports the adjacent claim.
+
+ Return ONLY a JSON object:
+ {"score": <int 0/1/2>, "reason": "<one sentence>"}
+
+ Scoring scale:
+ - 0 = unfaithful (citations invented, missing, or do not support adjacent claims)
+ - 1 = mixed (some citations support their claims, others do not)
+ - 2 = faithful (citations are present, relevant, and support adjacent claims)
+
+ If the answer is the abstention message ("I don't have enough evidence..."), score 2 (correctly declined).
+ """
+
+ def score_citation_faithfulness(retrieved_context, candidate):
+     user = f"RETRIEVED CONTEXT:\n{retrieved_context}\n\nSYSTEM EXPLANATION:\n{candidate}"
+     runs = _judge_call_parallel(CITATION_SYSTEM, user)
+     return _aggregate(runs)
+
+
+ def score_all_metrics(passage, reference, retrieved_context, candidate):
+     """Run all three metrics in parallel."""
+     with ThreadPoolExecutor(max_workers=3) as ex:
+         f_c = ex.submit(score_correctness, passage, reference, candidate)
+         f_e = ex.submit(score_evidence_support, passage, retrieved_context, candidate)
+         f_f = ex.submit(score_citation_faithfulness, retrieved_context, candidate)
+         return {
+             "correctness": f_c.result(),
+             "evidence_support": f_e.result(),
+             "citation_faithfulness": f_f.result(),
+         }
+
+ """## 10. Claim-based faithfulness"""
+
+ CLAIM_EXTRACTION_SYSTEM = """Extract atomic factual claims from the given answer.
+
+ Return ONLY a JSON object:
+ {"claims": ["claim 1", "claim 2", ...]}
+
+ Rules:
+ - Each claim should be a single, minimal factual assertion
+ - Ignore pure formatting, headers, or meta-commentary
+ - Skip citation markers like [c004] when extracting claims
+ - If there are no factual claims, return {"claims": []}
+ """
+
+ EVIDENCE_EXTRACTION_SYSTEM = """Extract factual assertions from the given text chunk.
+
+ Return ONLY a JSON object:
+ {"assertions": ["assertion 1", "assertion 2", ...]}
+
+ Rules:
+ - One atomic factual assertion per entry
+ - Skip anything that is a question, opinion, or example
+ - If there are no assertions, return {"assertions": []}
+ """
+
+ CLAIM_VERIFICATION_SYSTEM = """Classify if a claim is supported, contradicted, or insufficient given evidence.
+
+ Return ONLY a JSON object:
+ {"label": "supported" | "contradicted" | "insufficient", "reason": "<one short sentence>"}
+
+ Definitions:
+ - supported: the evidence directly supports the claim
+ - contradicted: the evidence contradicts the claim
+ - insufficient: the evidence is silent or unclear on the claim
+ """
+
+ def _json_call(system_prompt, user_prompt, model=None):
+     model = model or JUDGE_MODEL
+     resp = client_oai.chat.completions.create(
+         model=model,
+         temperature=JUDGE_TEMPERATURE,
+         response_format={"type": "json_object"},
+         messages=[
+             {"role": "system", "content": system_prompt},
+             {"role": "user", "content": user_prompt},
+         ],
+     )
+     try:
+         return json.loads(resp.choices[0].message.content)
+     except Exception:
+         return {}
+
+
+ def extract_claims(answer):
+     data = _json_call(CLAIM_EXTRACTION_SYSTEM, f"ANSWER:\n{answer}")
+     return [c for c in data.get("claims", []) if c and isinstance(c, str)]
+
+
+ _ASSERTION_CACHE = {}
+
+ def extract_assertions_from_chunk(chunk):
+     key = hash(chunk)
+     if key in _ASSERTION_CACHE:
+         return _ASSERTION_CACHE[key]
+     data = _json_call(EVIDENCE_EXTRACTION_SYSTEM, f"CHUNK:\n{chunk}")
+     out = [a for a in data.get("assertions", []) if a and isinstance(a, str)]
+     _ASSERTION_CACHE[key] = out
+     return out
+
+
+ def _normalize_label(label):
+     x = (label or "").strip().lower()
+     if "support" in x: return "supported"
+     if "contrad" in x: return "contradicted"
+     return "insufficient"
+
+
+ def verify_claim_against_evidence(claim, assertions):
+     evidence_blob = "\n".join(assertions) if assertions else "NO_EVIDENCE"
+     data = _json_call(
+         CLAIM_VERIFICATION_SYSTEM,
+         f"CLAIM:\n{claim}\n\nEVIDENCE:\n{evidence_blob}"
+     )
+     return {
+         "label": _normalize_label(data.get("label")),
+         "reason": str(data.get("reason", "")).strip(),
+     }
+
+
+ def claim_based_faithfulness(answer, retrieved_chunks):
+     if ABSTAIN_MESSAGE in answer:
+         return {
+             "n_claims": 0,
+             "support_rate": np.nan,
+             "contradiction_rate": np.nan,
+             "unsupported_rate": np.nan,
+             "abstained": True,
+             "details": [],
+         }
+
+     claims = extract_claims(answer)
+     if not claims:
+         return {
+             "n_claims": 0,
+             "support_rate": np.nan,
+             "contradiction_rate": np.nan,
+             "unsupported_rate": np.nan,
+             "abstained": False,
+             "details": [],
+         }
+
+     with ThreadPoolExecutor(max_workers=JUDGE_PARALLELISM) as ex:
+         all_assertions_lists = list(ex.map(extract_assertions_from_chunk, retrieved_chunks))
+     all_assertions = [a for sub in all_assertions_lists for a in sub]
+
+     with ThreadPoolExecutor(max_workers=JUDGE_PARALLELISM) as ex:
+         verify_results = list(ex.map(
+             lambda c: verify_claim_against_evidence(c, all_assertions),
+             claims
+         ))
+     labels = [r["label"] for r in verify_results]
+     details = [{"claim": c, **r} for c, r in zip(claims, verify_results)]
+
+     n = len(labels)
+     return {
+         "n_claims": n,
+         "support_rate": sum(1 for l in labels if l == "supported") / n,
+         "contradiction_rate": sum(1 for l in labels if l == "contradicted") / n,
+         "unsupported_rate": sum(1 for l in labels if l == "insufficient") / n,
+         "abstained": False,
+         "details": details,
+     }
+
+ """## 11. Retrieval precision@k / recall@k and bootstrap CIs"""
+
+ def precision_recall_at_k(retrieved_chunks, gold_facts, k=3):
+     """Lexical relevance: a chunk counts as relevant if it contains any gold fact verbatim."""
+     if not gold_facts:
+         return np.nan, np.nan
+     top_k = retrieved_chunks[:k]
+     if not top_k:
+         return 0.0, 0.0
+     rel_flags = []
+     for chunk in top_k:
+         c = chunk.lower()
+         is_rel = any(fact.lower() in c for fact in gold_facts)
+         rel_flags.append(is_rel)
+     precision = float(np.mean(rel_flags))
+     covered = 0
+     for fact in gold_facts:
+         if any(fact.lower() in chunk.lower() for chunk in top_k):
+             covered += 1
+     recall = covered / len(gold_facts)
+     return precision, recall
+
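+ # Illustrative toy check (made-up chunks and gold terms): one of two chunks
+ # mentions the gold term, so precision@2 = 0.5 while recall@2 = 1.0.
+ print(precision_recall_at_k(
+     ["predictive coding explains perception", "unrelated text"],
+     ["predictive coding"], k=2))
+ # -> (0.5, 1.0)
+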
+
+ def bootstrap_ci(values, n_boot=None, alpha=None):
+     """Percentile bootstrap CI for the mean; NaNs are dropped first."""
+     n_boot = n_boot or BOOTSTRAP_N
+     alpha = alpha or BOOTSTRAP_ALPHA
+     values = np.array(values, dtype=float)
+     values = values[~np.isnan(values)]
+     if len(values) == 0:
+         return np.nan, np.nan, np.nan
+     boots = np.empty(n_boot)
+     n = len(values)
+     for i in range(n_boot):
+         sample = _rng.choice(values, size=n, replace=True)
+         boots[i] = sample.mean()
+     lo = np.percentile(boots, 100 * (alpha / 2))
+     hi = np.percentile(boots, 100 * (1 - alpha / 2))
+     return float(values.mean()), float(lo), float(hi)
+
+
+ def format_ci(values, digits=3):
+     m, lo, hi = bootstrap_ci(values)
+     return f"{m:.{digits}f} [{lo:.{digits}f}, {hi:.{digits}f}]"
+
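+ # Illustrative sanity check on synthetic scores: the interval should bracket
+ # the sample mean of 0.7.
+ print(format_ci([0.5, 0.7, 0.9, 0.6, 0.8]))
+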
+ """## 12. Logging"""
+
+ EVAL_LOG_DIR = Path("./eval_logs")
+ EVAL_LOG_DIR.mkdir(exist_ok=True)
+
+ def log_eval_row(experiment_id, passage_id, variant, retrieved_sources,
+                  generation_result, judge_scores, extra=None):
+     row = {
+         "experiment_id": experiment_id,
+         "passage_id": passage_id,
+         "variant": variant,
+         "model": GENERATOR_MODEL,
+         "n_retrieved": len(retrieved_sources),
+         "retrieved_chunk_ids": ";".join(
+             str(r.get("chunk_id") or r.get("concept_id") or "?") for r in retrieved_sources
+         ),
+         "guard_triggered": int(generation_result.get("guard_triggered", False)),
+         "abstained": int(generation_result.get("abstained", False)),
+         "answer_chars": len(generation_result.get("answer", "")),
+         "generated_text": generation_result.get("answer", ""),
+         "correctness": judge_scores.get("correctness", {}).get("score"),
+         "evidence_support": judge_scores.get("evidence_support", {}).get("score"),
+         "citation_faithfulness": judge_scores.get("citation_faithfulness", {}).get("score"),
+         "semantic_fail_rate": generation_result.get("semantic_fail_rate", np.nan),
+     }
+     if extra:
+         row.update(extra)
+     path = EVAL_LOG_DIR / f"{experiment_id}.csv"
+     pd.DataFrame([row]).to_csv(
+         path, mode="a", header=not path.exists(), index=False
+     )
+     return row
+
+
+ def load_or_run_experiment(experiment_id, runner_fn):
+     """Re-run if RUN_EXPERIMENTS, else try the repo cache, then the local cache."""
+     local_path = EVAL_LOG_DIR / f"{experiment_id}.csv"
+
+     if RUN_EXPERIMENTS:
+         if local_path.exists():
+             local_path.unlink()
+         print(f"running {experiment_id} from scratch...")
+         return runner_fn()
+
+     url = f"{REPO_RAW_BASE}/eval_logs/{experiment_id}.csv"
+     try:
+         df = pd.read_csv(url)
+         print(f"loaded {experiment_id} from repo cache: {len(df)} rows")
+         df.to_csv(local_path, index=False)
+         return df
+     except Exception:
+         pass
+
+     if local_path.exists():
+         df = pd.read_csv(local_path)
+         print(f"loaded {experiment_id} from local cache: {len(df)} rows")
+         return df
+
+     print(f"⚠ no cached results for {experiment_id}. Set RUN_EXPERIMENTS=True to generate.")
+     return None
+
+ """## 13. Judge calibration"""
+
+ RUN_CALIBRATION = False
+
+ def calibrate_judge(n_items=5):
+     items = [x for x in validation_set["items"]]
+     sample = items[:n_items]
+     diffs = {"correctness": [], "evidence_support": [], "citation_faithfulness": []}
+
+     for item in sample:
+         c, p = retrieve_for_variant(item["passage"], "hybrid")
+         result = generate_with_citation_guard(item["passage"], c, p, do_semantic_check=False)
+         explanation = result["answer"]
+         context_text = _format_context_block(c, p)
+
+         print("=" * 70)
+         print(f"ITEM {item['id']}")
+         print(f"PASSAGE: {item['passage'][:300]}")
+         print(f"\nREFERENCE: {item['reference_explanation']}")
+         print(f"\nSYSTEM EXPLANATION:\n{explanation}")
+         print("\nScore each metric 0/1/2 (0=bad, 1=partial, 2=good):")
+         try:
+             human = {
+                 "correctness": int(input("  correctness: ")),
+                 "evidence_support": int(input("  evidence_support: ")),
+                 "citation_faithfulness": int(input("  citation_faithfulness: ")),
+             }
+         except (ValueError, EOFError):
+             print("aborted")
+             return None
+
+         all_scores = score_all_metrics(
+             item["passage"], item["reference_explanation"], context_text, explanation
+         )
+         scores_clean = {k: all_scores[k]["score"] for k in all_scores}
+         for k in diffs:
+             if scores_clean[k] is not None:
+                 diffs[k].append(abs(human[k] - scores_clean[k]))
+
+     print("\n=== CALIBRATION RESULTS ===")
+     for k, vals in diffs.items():
+         if vals:
+             mad = sum(vals) / len(vals)
+             flag = " ⚠ DISAGREES" if mad > 0.5 else " ok"
+             print(f"  {k}: mean abs diff = {mad:.2f}{flag}")
+     return diffs
+
+
+ if RUN_CALIBRATION:
+     calibrate_judge(n_items=5)
+ else:
+     print("calibration skipped (RUN_CALIBRATION=False)")
+     print("Last calibration: correctness MAD=0.60 (DISAGREES), evidence MAD=0.40, citation MAD=0.20")
+
+ """## 14. Experiment A — retrieval ablation
+
+ **Question.** Does RAG help, and does the regex tier earn its place?
+
+ **Hypothesis.** All RAG variants will beat the no-RAG baseline on claim_support_rate and evidence_support. Hybrid will beat either single-tier variant.
+
+ **Variable changed.** Retrieval method ∈ {no_rag, embedding_only, regex_only, hybrid}. Everything else held constant.
+ """
+
+ def run_experiment_A():
+     items = [x for x in validation_set["items"]]
+     variants = ["no_rag", "embedding_only", "regex_only", "hybrid"]
+     total_runs = len(items) * len(variants)
+     print(f"running experiment A: {len(items)} items × {len(variants)} variants = {total_runs} runs")
+
+     for i, item in enumerate(items):
+         for variant in variants:
+             try:
+                 c, p = retrieve_for_variant(item["passage"], variant)
+                 retrieved = c + p
+                 context_text = _format_context_block(c, p)
+
+                 result = generate_with_citation_guard(
+                     item["passage"], c, p,
+                     allow_no_context_bypass=(variant == "no_rag"),
+                     do_semantic_check=False,
+                 )
+
+                 scores = score_all_metrics(
+                     item["passage"], item["reference_explanation"],
+                     context_text, result["answer"],
+                 )
+                 cb = claim_based_faithfulness(
+                     result["answer"], [r["content"] for r in retrieved],
+                 )
+                 rp, rr = precision_recall_at_k(
+                     [r["content"] for r in retrieved], item["key_terms"], k=3,
+                 )
+
+                 log_eval_row(
+                     "experiment_A", item["id"], variant,
+                     retrieved, result, scores,
+                     extra={
+                         "category": item["category"],
+                         "claim_support_rate": cb["support_rate"],
+                         "claim_contradiction_rate": cb["contradiction_rate"],
+                         "claim_unsupported_rate": cb["unsupported_rate"],
+                         "n_claims": cb["n_claims"],
+                         "retrieval_precision_at_3": rp,
+                         "retrieval_recall_at_3": rr,
+                     }
+                 )
+             except Exception as e:
+                 print(f"  ERROR {item['id']}/{variant}: {e}")
+         print(f"  done {item['id']} ({i+1}/{len(items)})")
+
+     return pd.read_csv(EVAL_LOG_DIR / "experiment_A.csv")
+
+
+ experiment_A_df = load_or_run_experiment("experiment_A", run_experiment_A)
+
+ def analyze_experiment_A():
+     df = pd.read_csv(EVAL_LOG_DIR / "experiment_A.csv")
+     metric_cols = ["correctness", "evidence_support", "citation_faithfulness",
+                    "claim_support_rate", "retrieval_recall_at_3", "abstained"]
+
+     print("=" * 70)
+     print("OVERALL means with 95% bootstrap CIs")
+     print("=" * 70)
+     for variant in ["no_rag", "embedding_only", "regex_only", "hybrid"]:
+         sub = df[df.variant == variant]
+         print(f"\n{variant}")
+         for m in metric_cols:
+             if m in sub.columns:
+                 print(f"  {m:28s} {format_ci(sub[m].values)}")
+
+     print("\n" + "=" * 70)
+     print("HEADLINE METRIC: claim_support_rate (correctness saturates — see report)")
+     print("=" * 70)
+     for variant in ["no_rag", "embedding_only", "regex_only", "hybrid"]:
+         sub = df[df.variant == variant]
+         if "claim_support_rate" in sub.columns:
+             print(f"  {variant:18s} {format_ci(sub['claim_support_rate'].values)}")
+
+     return df
+
+
+ def plot_experiment_A():
+     df = pd.read_csv(EVAL_LOG_DIR / "experiment_A.csv")
+     variant_order = ["no_rag", "embedding_only", "regex_only", "hybrid"]
+     colors = ["#888", "#4c72b0", "#dd8452", "#55a868"]
+
+     fig, axes = plt.subplots(1, 3, figsize=(16, 5))
+
+     means, los, his = [], [], []
+     for v in variant_order:
+         sub = df[df.variant == v]
+         if "claim_support_rate" in sub.columns:
+             m, lo, hi = bootstrap_ci(sub["claim_support_rate"].values)
+         else:
+             m, lo, hi = 0, 0, 0
+         means.append(m); los.append(m - lo); his.append(hi - m)
+     axes[0].bar(variant_order, means, yerr=[los, his], color=colors, capsize=5)
+     axes[0].set_title("Claim support rate (headline)")
+     axes[0].set_ylabel("Fraction of claims supported")
+     axes[0].set_ylim(0, 1)
+     axes[0].tick_params(axis="x", rotation=20)
+
+     means, los, his = [], [], []
+     for v in variant_order:
+         sub = df[df.variant == v]
+         if "retrieval_recall_at_3" in sub.columns:
+             m, lo, hi = bootstrap_ci(sub["retrieval_recall_at_3"].values)
+         else:
+             m, lo, hi = 0, 0, 0
+         means.append(m); los.append(m - lo); his.append(hi - m)
+     axes[1].bar(variant_order, means, yerr=[los, his], color=colors, capsize=5)
+     axes[1].set_title("Retrieval recall@3")
+     axes[1].set_ylabel("Fraction of gold key_terms covered")
+     axes[1].set_ylim(0, 1)
+     axes[1].tick_params(axis="x", rotation=20)
+
+     abs_by_var = df.groupby("variant")["abstained"].mean().reindex(variant_order)
+     axes[2].bar(variant_order, abs_by_var.values, color=colors)
+     axes[2].set_title("Abstention rate")
+     axes[2].set_ylabel("Fraction of items guard triggered")
+     axes[2].set_ylim(0, 1)
+     axes[2].tick_params(axis="x", rotation=20)
+
+     plt.tight_layout()
+     plt.show()
+
+
+ if experiment_A_df is not None:
+     analyze_experiment_A()
+     plot_experiment_A()
+
+ """### Release gate"""
+
+ def release_gate_A(variant="hybrid"):
+     df = pd.read_csv(EVAL_LOG_DIR / "experiment_A.csv")
+     sub = df[df.variant == variant]
+
+     thresholds = {
+         "claim_support_rate": 0.70,  # primary
+         "evidence_support": 1.40,
+         "citation_faithfulness": 1.40,
+         "retrieval_recall_at_3": 0.60,
+         "abstained": 0.30,
+     }
+     lower_is_better = {"abstained"}
+
+     agg = {k: float(np.nanmean(sub[k].values)) for k in thresholds if k in sub.columns}
+
+     print(f"Release gate for variant: {variant}")
+     print("=" * 60)
+     all_pass = True
+     for k, t in thresholds.items():
+         if k not in agg: continue
+         v = agg[k]
+         ok = (v <= t) if k in lower_is_better else (v >= t)
+         direction = "≤" if k in lower_is_better else "≥"
+         status = "PASS" if ok else "FAIL"
+         print(f"  {k:28s} {v:.3f} (need {direction} {t}) {status}")
+         all_pass = all_pass and ok
+     print(f"\nFINAL: {'PASS' if all_pass else 'FAIL'}")
+     return all_pass
+
+
+ if experiment_A_df is not None:
+     release_gate_A(variant="hybrid")
+     print()
+     release_gate_A(variant="regex_only")
+
+ """## 15. Experiment B — top-k sweep
+
+ **Question:** How does the number of retrieved sources (top-k) affect answer correctness?
+
+ **Hypothesis:** Performance peaks somewhere in the middle. k=1 misses context; large k dilutes the prompt with irrelevant chunks.
+
+ **Variable changed:** `top_k ∈ {1, 3, 5, 7}`, applied to both retrieval tiers. Hybrid retrieval; everything else held constant.
+ """
+
+ def run_experiment_B(top_k_values=(1, 3, 5, 7)):
+     items = [x for x in validation_set["items"]]
+     print(f"running experiment B: {len(items)} items × {len(top_k_values)} top-k values "
+           f"= {len(items) * len(top_k_values)} runs")
+
+     for k in top_k_values:
+         time.sleep(1.5)
+         print(f"\n--- top_k = {k} ---")
+         for item in items:
+             try:
+                 c, p = retrieve_for_variant(
+                     item["passage"], "hybrid", n_concepts=k, n_papers=k,
+                 )
+                 retrieved = c + p
+                 context_text = _format_context_block(c, p)
+                 result = generate_with_citation_guard(
+                     item["passage"], c, p, do_semantic_check=False
+                 )
+                 scores = score_all_metrics(
+                     item["passage"], item["reference_explanation"],
+                     context_text, result["answer"]
+                 )
+                 avg_dist = (float(np.mean([r["distance"] for r in retrieved if r["distance"] > 0]))
+                             if retrieved else None)
+
+                 log_eval_row(
+                     "experiment_B", item["id"], f"topk_{k}",
+                     retrieved, result, scores,
+                     extra={
+                         "category": item["category"],
+                         "top_k": k,
+                         "n_retrieved_total": len(retrieved),
+                         "avg_distance": avg_dist,
+                     }
+                 )
+             except Exception as e:
+                 print(f"  ERROR {item['id']}: {e}")
+
+     return pd.read_csv(EVAL_LOG_DIR / "experiment_B.csv")
+
+
+ def analyze_experiment_B():
+     df = pd.read_csv(EVAL_LOG_DIR / "experiment_B.csv")
+     print("mean correctness by top-k (with 95% CI):")
+     for k in sorted(df["top_k"].unique()):
+         sub = df[df.top_k == k]
+         print(f"  top_k={k} "
+               f"correctness={format_ci(sub['correctness'].values)} "
+               f"evidence={format_ci(sub['evidence_support'].values)} "
+               f"avg_n_retrieved={sub['n_retrieved_total'].mean():.1f}")
+     return df
+
+
+ def plot_experiment_B():
+     df = pd.read_csv(EVAL_LOG_DIR / "experiment_B.csv")
+     ks = sorted(df["top_k"].unique())
+     means, los, his = [], [], []
+     for k in ks:
+         m, lo, hi = bootstrap_ci(df[df.top_k == k]["correctness"].values)
+         means.append(m); los.append(m - lo); his.append(hi - m)
+
+     fig, ax = plt.subplots(figsize=(8, 5))
+     ax.errorbar(ks, means, yerr=[los, his], marker="o", linewidth=2, capsize=5)
+     ax.set_xlabel("top-k (per retrieval tier)")
+     ax.set_ylabel("Mean correctness (0-2)")
+     ax.set_title("Experiment B — Correctness vs top-k (95% CI)")
+     ax.set_xticks(ks)
+     ax.set_ylim(0, 2)
+     ax.grid(alpha=0.3)
+     plt.tight_layout()
+     plt.show()
+
+
+ experiment_B_df = load_or_run_experiment("experiment_B", run_experiment_B)
+ if experiment_B_df is not None:
+     analyze_experiment_B()
+     plot_experiment_B()
+
1361
+ """## 17. Experiment C — confidence threshold tuning
1362
+
1363
+ **Question.** Where should the low-confidence threshold sit so that warnings correlate with wrong answers?
1364
+
1365
+ **Hypothesis.** The default 1.3 threshold was a guess. The F1-maximizing threshold is probably lower.
1366
+
1367
+ **Variable changed.** Threshold ∈ [0.6, 1.6] by 0.1.
1368
+ """
1369
+
1370
+ def run_experiment_C():
1371
+ items = [x for x in validation_set["items"]]
1372
+ for item in items:
1373
+ try:
1374
+ time.sleep(1.5)
1375
+ c, p = retrieve_for_variant(item["passage"], "hybrid")
1376
+ all_dists = [r["distance"] for r in (c + p) if r["distance"] > 0]
1377
+ best_dist = float(min(all_dists)) if all_dists else 999.0
1378
+ context_text = _format_context_block(c, p)
1379
+ result = generate_with_citation_guard(
1380
+ item["passage"], c, p, do_semantic_check=False
1381
+ )
1382
+ scores = score_all_metrics(
1383
+ item["passage"], item["reference_explanation"], context_text, result["answer"]
1384
+ )
1385
+ corr = scores["correctness"]["score"]
1386
+ log_eval_row(
1387
+ "experiment_C", item["id"], "default_system",
1388
+ c + p, result, scores,
1389
+ extra={
1390
+ "category": item["category"],
1391
+ "best_distance": best_dist,
1392
+ "correctness_raw": corr,
1393
+ }
1394
+ )
1395
+ except Exception as e:
1396
+ print(f" ERROR {item['id']}: {e}")
1397
+ return pd.read_csv(EVAL_LOG_DIR / "experiment_C.csv")
1398
+
1399
+
1400
+ def analyze_experiment_C():
1401
+ df = pd.read_csv(EVAL_LOG_DIR / "experiment_C.csv")
1402
+ if "correctness_raw" not in df.columns:
1403
+ df["correctness_raw"] = df["correctness"]
1404
+ df = df.dropna(subset=["best_distance", "correctness_raw"])
1405
+
1406
+
1407
+ strict_pos = (df["correctness_raw"] < 1.0).sum()
1408
+ if strict_pos > 0:
1409
+ df["is_wrong"] = (df["correctness_raw"] < 1.0).astype(int)
1410
+ wrongness_def = "correctness < 1.0 (strict)"
1411
+ else:
1412
+ df["is_wrong"] = (df["correctness_raw"] < 2.0).astype(int)
1413
+ wrongness_def = "correctness < 2.0 (saturation fallback)"
1414
+ print(f"using wrongness definition: {wrongness_def}")
1415
+ print(f" positive class size: {df['is_wrong'].sum()}/{len(df)}")
1416
+ if df["is_wrong"].sum() == 0:
1417
+ print("⚠ WARNING: no wrong answers in eval set. Tuning is meaningless on this data.")
1418
+ return None, None, None
1419
+
1420
+ thresholds = [round(0.6 + 0.1 * i, 2) for i in range(11)]
1421
+ rows = []
1422
+ for t in thresholds:
1423
+ warns = df["best_distance"] > t
1424
+ wrong = df["is_wrong"] == 1
1425
+ tp = int(((warns) & (wrong)).sum())
1426
+ fp = int(((warns) & (~wrong)).sum())
1427
+ fn = int(((~warns) & (wrong)).sum())
1428
+ tn = int(((~warns) & (~wrong)).sum())
1429
+ precision = tp / (tp + fp) if (tp + fp) > 0 else 1.0
1430
+ recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
1431
+ f1 = 2*precision*recall / (precision + recall) if (precision + recall) > 0 else 0.0
1432
+ tpr = recall
1433
+ fpr = fp / (fp + tn) if (fp + tn) > 0 else 0.0
1434
+ rows.append({"threshold": t, "tp": tp, "fp": fp, "fn": fn, "tn": tn,
1435
+ "refusal_precision": round(precision, 3),
1436
+ "refusal_recall": round(recall, 3),
1437
+ "f1": round(f1, 3),
1438
+ "tpr": round(tpr, 3), "fpr": round(fpr, 3)})
1439
+ sweep = pd.DataFrame(rows)
1440
+ print(sweep)
1441
+
1442
+ best = sweep.loc[sweep["f1"].idxmax()]
1443
+ print(f"\nF1-maximizing threshold: {best['threshold']} (F1={best['f1']})")
1444
+ print(f" refusal precision: {best['refusal_precision']}")
1445
+ print(f" refusal recall: {best['refusal_recall']}")
1446
+
1447
+ s = sweep.sort_values("fpr")
1448
+ auc = 0.0
1449
+ for i in range(1, len(s)):
1450
+ auc += (s.iloc[i]["fpr"] - s.iloc[i-1]["fpr"]) * (s.iloc[i]["tpr"] + s.iloc[i-1]["tpr"]) / 2
1451
+ print(f"approx ROC AUC: {auc:.3f}")
1452
+ return sweep, best, auc
1453
+
1454
+
1455
+ def plot_experiment_C():
+     out = analyze_experiment_C()
+     if out is None or out[0] is None:
+         return
+     sweep, best, auc = out
+     fig, ax = plt.subplots(figsize=(7, 7))
+     s = sweep.sort_values("fpr")
+     ax.plot(s["fpr"], s["tpr"], marker="o", linewidth=2, label=f"ROC (AUC≈{auc:.3f})")
+     ax.plot([0, 1], [0, 1], "--", color="gray", alpha=0.5, label="chance")
+     ax.scatter([best["fpr"]], [best["tpr"]], s=200, color="red", zorder=5,
+                label=f"best F1 @ threshold={best['threshold']}")
+     ax.set_xlabel("False positive rate")
+     ax.set_ylabel("True positive rate (refusal recall)")
+     ax.set_title("Experiment C — Abstention threshold ROC")
+     ax.set_xlim(-0.05, 1.05); ax.set_ylim(-0.05, 1.05)
+     ax.legend(loc="lower right")
+     ax.grid(alpha=0.3)
+     plt.tight_layout()
+     plt.show()
+
+
+ experiment_C_df = load_or_run_experiment("experiment_C", run_experiment_C)
+ if experiment_C_df is not None:
+     plot_experiment_C()
+
+ DEFAULT_CONFIDENCE_THRESHOLD = 1.3
+
+ # Prefer the F1-maximizing threshold from Experiment C; fall back to the
+ # default if the experiment didn't run or produced no usable sweep.
+ try:
+     if experiment_C_df is not None:
+         out = analyze_experiment_C()
+         if out and out[1] is not None:
+             _, best, _ = out
+             TUNED_CONFIDENCE_THRESHOLD = float(best["threshold"])
+         else:
+             TUNED_CONFIDENCE_THRESHOLD = DEFAULT_CONFIDENCE_THRESHOLD
+     else:
+         TUNED_CONFIDENCE_THRESHOLD = DEFAULT_CONFIDENCE_THRESHOLD
+ except Exception as e:
+     print(f"falling back to default threshold ({e})")
+     TUNED_CONFIDENCE_THRESHOLD = DEFAULT_CONFIDENCE_THRESHOLD
+
+ print(f"TUNED_CONFIDENCE_THRESHOLD = {TUNED_CONFIDENCE_THRESHOLD}")
+
+ """## 18. Main pipeline (with citations + tuned threshold + semantic check)"""
1499
+
1500
+ def check_input_quality(text):
1501
+ if len(text.strip()) < 20:
1502
+ return False, "That's pretty short — try pasting a full sentence or paragraph from a paper."
1503
+ if len(text.strip()) > 3000:
1504
+ return False, "That's a lot of text. Try pasting just 1-2 paragraphs at a time."
1505
+ if len(text.split()) < 5:
1506
+ return False, "Try a longer passage — at least a full sentence from a paper."
1507
+ return True, "ok"
1508
+
1509
+
1510
+ def assess_retrieval_confidence(concept_results, paper_results, threshold=None):
+     threshold = threshold if threshold is not None else TUNED_CONFIDENCE_THRESHOLD
+     # smaller distance = closer match
+     dists = [r["distance"] for r in (concept_results + paper_results) if r["distance"] > 0]
+     if not dists:
+         return "low", "I couldn't find any relevant context in my knowledge base."
+     best = min(dists)
+     if best < 0.8:
+         return "high", ""
+     elif best < threshold:
+         return "medium", ("Note: my knowledge base has some related material, but the match isn't perfect. "
+                           "Double-check against the paper's own definitions.")
+     else:
+         return "low", "Heads up: the concepts in this passage don't match well with my current knowledge base."
+
+
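+ # Worked example of the banding above (hypothetical distances, sketch only):
+ # with the default TUNED_CONFIDENCE_THRESHOLD of 1.3, best distances of
+ # 0.55, 1.1 and 1.7 land in "high", "medium" and "low" respectively, e.g.
+ #
+ #   assess_retrieval_confidence([{"distance": 1.1}], [])
+ #   # -> ("medium", "Note: my knowledge base has some related material, ...")
+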
+ SCOPE_DISCLAIMER = (
+     "---\n"
+     "*This tool helps you understand papers; it doesn't replace them. "
+     "Every factual sentence above is cited to a specific retrieved source. "
+     "⚠️ marks indicate the semantic guard flagged that sentence as not fully supported by its citation. "
+     "Always check the original paper.*"
+ )
+
+
+ def scimplify(passage, variant="hybrid"):
+     is_ok, msg = check_input_quality(passage)
+     if not is_ok:
+         return msg
+
+     c, p = retrieve_for_variant(passage, variant)
+     confidence, warning = assess_retrieval_confidence(c, p)
+     result = generate_with_citation_guard(passage, c, p)
+
+     parts = []
+     # Hard stop: one of the citation guards rejected the draft answer.
+     if result["guard_triggered"]:
+         which = "semantic" if any("semantic" in i for i in result.get("issues", [])) else "lexical"
+         parts.append(f"⚠️ The {which} citation guard triggered. Abstaining rather than returning a potentially ungrounded answer.")
+         if result.get("issues"):
+             parts.append(f"\n*Reason: {'; '.join(result['issues'])}*")
+         parts.append(f"\n{result['answer']}")
+         parts.append(f"\n{SCOPE_DISCLAIMER}")
+         return "\n".join(parts)
+
+     if result["answer"].strip() == ABSTAIN_MESSAGE or result["abstained"]:
+         parts = [result["answer"], SCOPE_DISCLAIMER]
+         return "\n".join(parts)
+
+     if confidence == "low":
+         parts.append(f"⚠️ {warning}\n")
+     elif confidence == "medium":
+         parts.append(f"ℹ️ {warning}\n")
+
+     parts.append(result["answer"])
+
+     # show retrieved sources
+     concept_names = [r["concept_name"] for r in c if "concept_name" in r]
+     if concept_names:
+         parts.append(f"\n\n**Retrieved concepts:** {', '.join(concept_names)}")
+     if p:
+         sources = sorted(set(r["source_name"] for r in p))
+         parts.append(f"**Paper sources:** {', '.join(sources)}")
+
+     # surface semantic check stats if any sentences were checked
+     findings = result.get("semantic_findings", [])
+     if findings:
+         n_total = len(findings)
+         n_unsupported = sum(1 for f in findings if f["label"] in ("contradicted", "insufficient"))
+         if n_unsupported > 0:
+             parts.append(f"\n*Semantic guard: {n_unsupported}/{n_total} cited sentences flagged as not fully supported.*")
+         else:
+             parts.append(f"\n*Semantic guard: all {n_total} cited sentences supported by their citations ✓*")
+
+     parts.append(f"\n{SCOPE_DISCLAIMER}")
+     return "\n".join(parts)
+
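+ # Smoke test sketch (commented out; the passage below is illustrative only):
+ #
+ #   print(scimplify(
+ #       "Predictive coding frames perception as hierarchical inference, with "
+ #       "top-down predictions cancelling bottom-up prediction errors."
+ #   ))
+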
+ """## 19. Gradio UI
1586
+
1587
+ """
1588
+
1589
+ def add_pdf_to_kb(pdf_file, source_name, source_type):
+     if pdf_file is None:
+         return "Please upload a PDF file."
+     if not source_name.strip():
+         return "Please provide a name for this source."
+     try:
+         text = extract_text_from_pdf(pdf_file)
+         chunks = chunk_text(text)
+         # namespace chunk IDs under the user-supplied source name
+         base = source_name.strip().replace(" ", "_")
+         ids = [f"user_{base}::c{i}" for i in range(len(chunks))]
+         metas = [{
+             "source_name": source_name.strip(),
+             "source_type": source_type,
+             "chunk_id": ids[i],
+         } for i in range(len(chunks))]
+         if chunks:
+             papers_collection.add(documents=chunks, ids=ids, metadatas=metas)
+         return f"Added {len(chunks)} chunks. Total: {papers_collection.count()}"
+     except Exception as e:
+         return f"Error: {e}"
+
+
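+ # Example of the ID scheme above: uploading a source named
+ # "Neural Population Geometry" in 3 chunks yields (hypothetically)
+ #   user_Neural_Population_Geometry::c0 ... user_Neural_Population_Geometry::c2
+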
+ def pull_from_arxiv_ui(query, max_results):
+     """Gradio handler for arXiv ingestion."""
+     try:
+         max_results = int(max_results)
+         if max_results < 1 or max_results > 25:
+             return "Please pick a max_results between 1 and 25."
+         summary = ingest_from_arxiv(query=query, max_results=max_results, verbose=False)
+         msg = (
+             f"✅ Ingested {summary['n_papers']} new paper(s), "
+             f"{summary['n_chunks']} chunks. "
+             f"Skipped {summary['n_skipped']} duplicates. "
+             f"Total in KB: {summary['total_in_kb']} chunks."
+         )
+         if summary["errors"]:
+             msg += f"\n\n⚠️ Errors: {'; '.join(summary['errors'][:3])}"
+         return msg
+     except Exception as e:
+         return f"Error: {e}"
+
+
+ def get_kb_status():
+     n_concepts = concepts_collection.count()
+     n_papers = papers_collection.count()
+     status = f"**Concept definitions:** {n_concepts}\n\n**Paper chunks:** {n_papers}\n"
+     if n_papers > 0:
+         metas = papers_collection.get()["metadatas"]
+         sources = Counter(m["source_name"] for m in metas)
+         status += "\n**Ingested sources:**\n"
+         for name, count in sources.most_common():
+             status += f"- {name} — {count} chunks\n"
+     return status
+
+
+ DEMO_CLEAN = "The Diels-Alder reaction is a [4+2] cycloaddition between a conjugated diene and a dienophile, producing a six-membered ring with up to four new stereocenters. The reaction proceeds through a concerted, suprafacial transition state and is highly stereospecific: cis-dienophiles yield cis-substituted cyclohexenes. Electron-withdrawing groups on the dienophile dramatically accelerate the reaction."
+ DEMO_PAPER = "Multi-region neural population dynamics in the brain have been studied using techniques like LFADS to model the latent factors driving observed activity across regions."
+ DEMO_ABSTAIN = "Laminated pastry dough is created by repeatedly folding butter into flour-water dough, producing alternating layers that puff up during baking as steam expands between them. Croissants are the canonical example."
+
+
+ with gr.Blocks(title="Scimplify") as app:
1651
+ gr.Markdown("# Scimplify — NeuroAI Paper Simplifier")
1652
+ gr.Markdown(
1653
+ "Paste a NeuroAI paragraph; get a plain-language explanation with citations. "
1654
+ "Every factual sentence is grounded in a retrieved source. The lexical guard rejects "
1655
+ "invented citation IDs, and the semantic guard verifies that each cited chunk actually "
1656
+ "supports the claim. If neither passes, the system abstains rather than hallucinate."
1657
+ )
1658
+
1659
+ with gr.Tab("Explain Passage"):
1660
+ with gr.Row():
1661
+ with gr.Column(scale=1):
1662
+ inp = gr.Textbox(label="Passage", lines=8,
1663
+ placeholder="Paste a paragraph from a paper...")
1664
+ btn = gr.Button("Explain", variant="primary")
1665
+ gr.Examples(
1666
+ examples=[
1667
+ [DEMO_CLEAN],
1668
+ [DEMO_PAPER],
1669
+ [DEMO_ABSTAIN],
1670
+ ],
1671
+ inputs=[inp],
1672
+ label="Demo passages (clean / paper-chunk / out-of-scope)",
1673
+ )
1674
+ with gr.Column(scale=2):
1675
+ out = gr.Markdown(label="Explanation")
1676
+ btn.click(fn=lambda x: scimplify(x), inputs=[inp], outputs=[out])
1677
+
1678
+ with gr.Tab("Add Papers (PDF)"):
1679
+ pdf_in = gr.File(label="PDF", file_types=[".pdf"])
1680
+ name_in = gr.Textbox(label="Source name")
1681
+ type_in = gr.Radio(["paper", "article", "review"], label="Type", value="paper")
1682
+ add_btn = gr.Button("Add to knowledge base")
1683
+ add_out = gr.Textbox(label="Status")
1684
+ add_btn.click(fn=add_pdf_to_kb, inputs=[pdf_in, name_in, type_in], outputs=[add_out])
1685
+
1686
+ with gr.Tab("Pull from arXiv"):
1687
+ gr.Markdown(
1688
+ "Fetch recent NeuroAI papers from arXiv directly. "
1689
+ "Skips papers already in the knowledge base (matched by arxiv_id)."
1690
+ )
1691
+ arxiv_query = gr.Textbox(
1692
+ label="arXiv query",
1693
+ value="NeuroAI",
1694
+ placeholder="e.g. NeuroAI, brain-inspired deep learning, neural population dynamics",
1695
+ )
1696
+ arxiv_n = gr.Slider(label="Max papers", minimum=1, maximum=20, value=5, step=1)
1697
+ arxiv_btn = gr.Button("Pull from arXiv", variant="primary")
1698
+ arxiv_out = gr.Markdown()
1699
+ arxiv_btn.click(fn=pull_from_arxiv_ui, inputs=[arxiv_query, arxiv_n], outputs=[arxiv_out])
1700
+
1701
+ with gr.Tab("Knowledge Base"):
1702
+ status_out = gr.Markdown(value=get_kb_status())
1703
+ refresh_btn = gr.Button("Refresh")
1704
+ refresh_btn.click(fn=get_kb_status, outputs=[status_out])
1705
+
1706
+ app.launch(share=True)
requirements.txt ADDED
@@ -0,0 +1,357 @@
+ absl-py==1.3.0
+ agate==1.6.3
+ agate-dbf==0.2.2
+ agate-excel==0.2.5
+ agate-sql==0.5.8
+ aiofiles==23.2.1
+ aiogram==2.21
+ aiohttp==3.8.1
+ aiosignal==1.2.0
+ annexremote==1.6.6
+ annotated-doc==0.0.4
+ annotated-types==0.7.0
+ ants==0.0.7
+ anyio==4.5.2
+ appdirs==1.4.4
+ appnope==0.1.2
+ argon2-cffi==21.3.0
+ argon2-cffi-bindings==21.2.0
+ arrow==1.3.0
+ arxiv==2.3.2
+ asgiref==3.8.1
+ astroid==2.4.2
+ asttokens==2.0.5
+ astunparse==1.6.3
+ async-lru==2.0.4
+ async-timeout==4.0.2
+ attrs==25.3.0
+ autopep8==1.5.4
+ babel==2.17.0
+ backcall==0.2.0
+ backoff==2.2.1
+ backports.tarfile==1.2.0
+ backports.zoneinfo==0.2.1
+ based-on-topic @ file:///Users/Marta/Desktop/miniproject/dist/based_on_topic-0.0.1-py3-none-any.whl
+ bcrypt==5.0.0
+ beautifulsoup4==4.13.5
+ black==22.1.0
+ bleach==4.1.0
+ boto3==1.37.38
+ botocore==1.37.38
+ branca==0.4.2
+ build==1.2.2.post1
+ cachetools==5.2.0
+ certifi==2022.6.15
+ cffi==1.15.0
+ chardet==4.0.0
+ charset-normalizer==2.1.0
+ chroma-hnswlib==0.7.6
+ chromadb==0.5.23
+ click==8.0.4
+ click-plugins==1.1.1
+ cligj==0.7.2
+ coloredlogs==15.0.1
+ comm==0.2.3
+ contourpy==1.0.6
+ coverage==5.5
+ csvkit==1.0.7
+ cycler==0.10.0
+ datalad==1.1.3
+ datasets==2.7.1
+ dbfread==2.0.7
+ debugpy==1.5.1
+ decorator==5.1.1
+ defusedxml==0.7.1
+ Deprecated==1.3.1
+ dill==0.3.6
+ distlib==0.3.1
+ distro==1.9.0
+ Django==4.2.29
+ durationpy==0.10
+ entrypoints==0.4
+ environs==8.0.0
+ et-xmlfile==1.1.0
+ exceptiongroup==1.3.0
+ executing==0.8.2
+ Faker==15.3.2
+ fastapi==0.124.4
+ fasteners==0.20
+ fastjsonschema==2.21.2
+ feedparser==6.0.12
+ ffmpy==0.5.0
+ filelock==3.0.12
+ Fiona==1.8.21
+ Flask==1.1.2
+ flatbuffers==22.12.6
+ folium==0.12.1
+ fonttools==4.38.0
+ fqdn==1.5.1
+ frozenlist==1.3.0
+ fsspec==2025.3.0
+ future==0.18.2
+ fuzzywuzzy==0.18.0
+ gast==0.4.0
+ geographiclib==1.50
+ geopandas==0.12.1
+ geopy==2.1.0
+ gevent==24.2.1
+ gmplot==1.4.1
+ google-auth==2.15.0
+ google-auth-oauthlib==0.4.6
+ google-pasta==0.2.0
+ googleapis-common-protos==1.73.0
+ googlemaps==4.7.3
+ gradio==4.44.1
+ gradio_client==1.3.0
+ graphlib_backport==1.1.0
+ greenlet==3.1.1
+ grpcio==1.70.0
+ h11==0.16.0
+ h5py==3.7.0
+ haversine==2.7.0
+ hf-xet==1.1.9
+ httpcore==1.0.9
+ httptools==0.6.4
+ httpx==0.28.1
+ huggingface-hub==0.34.4
+ humanfriendly==10.0
+ humanize==4.10.0
+ idna==2.10
+ importlib-resources==5.4.0
+ importlib_metadata==8.5.0
+ iniconfig==1.1.1
+ install==1.3.5
+ ipykernel==6.9.1
+ ipython==8.0.1
+ ipython-genutils==0.2.0
+ ipywidgets==8.1.7
+ iso8601==2.1.0
+ isodate==0.6.1
+ isoduration==20.11.0
+ isort==5.5.2
+ itsdangerous==1.1.0
+ jaraco.classes==3.4.0
+ jaraco.context==6.0.1
+ jaraco.functools==4.1.0
+ jedi==0.18.1
+ Jinja2==3.1.6
+ jiter==0.9.1
+ jmespath==1.0.1
+ joblib==1.1.0
+ json5==0.12.1
+ jsonpointer==3.0.0
+ jsonschema==4.23.0
+ jsonschema-specifications==2023.12.1
+ jupyter==1.0.0
+ jupyter-console==6.4.0
+ jupyter-events==0.10.0
+ jupyter-lsp==2.3.0
+ jupyter_client==7.4.9
+ jupyter_core==5.8.1
+ jupyter_server==2.14.2
+ jupyter_server_terminals==0.5.3
+ jupyterlab==4.3.8
+ jupyterlab-pygments==0.1.2
+ jupyterlab_server==2.27.3
+ jupyterlab_widgets==3.0.15
+ keyring==25.5.0
+ keyrings.alt==5.0.2
+ kiwisolver==1.3.2
+ kornia==0.7.0
+ kubernetes==35.0.0
+ lazy-object-proxy==1.4.3
+ leather==0.3.4
+ libclang==14.0.6
+ llvmlite==0.41.1
+ logging==0.4.9.6
+ looseversion==1.3.0
+ lxml==6.0.2
+ Markdown==3.4.1
+ markdown-it-py==3.0.0
+ MarkupSafe==2.1.5
+ marshmallow==3.17.0
+ matplotlib==3.6.2
+ matplotlib-inline==0.1.3
+ mccabe==0.6.1
+ mdurl==0.1.2
+ mistune==3.1.4
+ mmh3==5.0.1
+ more-itertools==10.5.0
+ mplcursors==0.5.2
+ mpmath==1.2.1
+ msgpack==1.1.1
+ multidict==6.0.2
+ multiprocess==0.70.14
+ munch==2.5.0
+ mypy-extensions==0.4.3
+ mysql-connector-python==8.0.33
+ nbclient==0.5.11
+ nbconvert==7.16.6
+ nbformat==5.10.4
+ nest-asyncio==1.5.4
+ networkx==2.8
+ nibabel==5.2.1
+ nilearn==0.10.4
+ nltk==3.6.5
+ notebook==7.3.3
+ notebook_shim==0.2.4
+ numba==0.58.1
+ numpy==1.23.5
+ oauthlib==3.2.2
+ olefile==0.46
+ onnxruntime==1.19.2
+ openai==1.109.1
+ openai-whisper==20250625
+ opencv-contrib-python==4.6.0.66
+ openpyxl==3.0.10
+ opentelemetry-api==1.33.1
+ opentelemetry-exporter-otlp-proto-common==1.33.1
+ opentelemetry-exporter-otlp-proto-grpc==1.33.1
+ opentelemetry-instrumentation==0.54b1
+ opentelemetry-instrumentation-asgi==0.54b1
+ opentelemetry-instrumentation-fastapi==0.54b1
+ opentelemetry-proto==1.33.1
+ opentelemetry-sdk==1.33.1
+ opentelemetry-semantic-conventions==0.54b1
+ opentelemetry-util-http==0.54b1
+ opt-einsum==3.3.0
+ orjson==3.10.15
+ osmnx==1.2.2
+ overrides==7.7.0
+ packaging==25.0
+ pandas==1.5.2
+ pandas-to-sql==0.0.546
+ pandasql==0.7.3
+ pandocfilters==1.5.0
+ parsedatetime==2.4
+ parso==0.8.3
+ pathspec==0.9.0
+ patool==1.12
+ pexpect==4.8.0
+ pickleshare==0.7.5
+ Pillow==8.0.1
+ pkgutil_resolve_name==1.3.10
+ platformdirs==2.5.1
+ plotly==5.11.0
+ pluggy==0.13.1
+ posthog==4.2.0
+ prettytable==2.0.0
+ prometheus-client==0.13.1
+ prompt-toolkit==3.0.28
+ protobuf==5.29.6
+ ptyprocess==0.7.0
+ PuLP==2.7.0
+ pure-eval==0.2.2
+ py==1.10.0
+ pyarrow==10.0.1
+ pyasn1==0.4.8
+ pyasn1-modules==0.2.8
+ pycodestyle==2.6.0
+ pycparser==2.21
+ pydantic==2.10.6
+ pydantic_core==2.27.2
+ pydub==0.25.1
+ Pygments==2.19.2
+ pylint==2.6.0
+ pyparsing==2.4.7
+ PyPDF2==3.0.1
+ PyPika==0.51.1
+ pyproj==3.3.1
+ pyproject_hooks==1.2.0
+ PyQt5==5.15.7
+ PyQt5-Qt5==5.15.2
+ PyQt5-sip==12.11.0
+ pyrsistent==0.18.1
+ pytest==6.2.2
+ python-dateutil==2.9.0.post0
+ python-dotenv==0.20.0
+ python-gitlab==4.13.0
+ python-json-logger==3.3.0
+ python-multipart==0.0.20
+ python-slugify==6.1.2
+ python-speech-features==0.6
+ pytimeparse==1.1.8
+ pytz==2020.4
+ PyWavelets==1.4.1
+ PyYAML==6.0
+ pyzmq==27.1.0
+ qtconsole==5.2.2
+ QtPy==2.0.1
+ rawkit==0.6.0
+ referencing==0.35.1
+ regex==2024.11.6
+ requests==2.32.4
+ requests-oauthlib==1.3.1
+ requests-toolbelt==1.0.0
+ responses==0.18.0
+ rfc3339-validator==0.1.4
+ rfc3986-validator==0.1.1
+ rich==14.3.4
+ rpds-py==0.20.1
+ rsa==4.9
+ Rtree==1.0.1
+ ruff==0.15.12
+ s3transfer==0.11.5
+ safetensors==0.5.3
+ scikit-learn==1.1.3
+ scipy==1.9.3
+ seaborn==0.12.1
+ semantic-version==2.10.0
+ Send2Trash==1.8.3
+ sentence-transformers==3.2.1
+ sgmllib3k==1.0.0
+ Shapely==1.8.2
+ shellingham==1.5.4
+ six==1.15.0
+ sklearn==0.0.post1
+ sniffio==1.3.1
+ soupsieve==2.7
+ SQLAlchemy==1.4.37
+ sqlparse==0.5.5
+ stack-data==0.2.0
+ starlette==0.44.0
+ sympy==1.11.1
+ tabulate==0.8.7
+ tenacity==9.0.0
+ termcolor==1.1.0
+ terminado==0.13.1
+ testpath==0.5.0
+ text-unidecode==1.3
+ threadpoolctl==3.1.0
+ tiktoken==0.7.0
+ tokenizers==0.20.3
+ toml==0.10.1
+ tomli==2.0.1
+ tomlkit==0.12.0
+ torch==1.13.0
+ torchvision==0.14.0
+ tornado==6.4.2
+ tqdm==4.67.3
+ traitlets==5.14.3
+ transformers==4.46.3
+ typer==0.20.1
+ types-python-dateutil==2.9.0.20241206
+ typing_extensions==4.13.2
+ uri-template==1.3.0
+ urllib3==2.2.3
+ uvicorn==0.33.0
+ uvloop==0.22.1
+ virtualenv==20.4.2
+ watchfiles==0.24.0
+ wcwidth==0.2.5
+ webcolors==24.8.0
+ webencodings==0.5.1
+ websocket-client==1.8.0
+ websockets==12.0
+ Werkzeug==1.0.1
+ widgetsnbextension==4.0.14
+ wooldridge==0.4.4
+ wordcloud==1.8.2.2
+ wrapt==1.12.1
+ xgboost==1.7.2
+ xlrd==2.0.1
+ xxhash==3.1.0
+ yarl==1.8.1
+ zipp==3.20.2
+ zope.event==5.0
+ zope.interface==7.2